Reference project, Weibo crawler:
https://github.com/Python3WebSpider/Weibo
Reference project, building a proxy pool:
https://github.com/Python3WebSpider/ProxyPool
Example 1
Build a proxy pool (or use a paid proxy service) and fetch one random proxy from proxy_url for each request; suited to large-scale crawling. A settings sketch for wiring the middlewares into a project follows the code below.
middlewares.py:
import json
import logging

import requests


class ProxyMiddleware():
    def __init__(self, proxy_url):
        self.logger = logging.getLogger(__name__)
        self.proxy_url = proxy_url

    def get_random_proxy(self):
        """Fetch one random proxy (host:port) from the proxy pool API."""
        try:
            response = requests.get(self.proxy_url)
            if response.status_code == 200:
                proxy = response.text
                return proxy
        except requests.ConnectionError:
            return False

    def process_request(self, request, spider):
        # Only switch to a proxy after the request has already been retried,
        # i.e. the direct connection (or the previous proxy) has failed.
        if request.meta.get('retry_times'):
            proxy = self.get_random_proxy()
            if proxy:
                uri = 'https://{proxy}'.format(proxy=proxy)
                self.logger.debug('Using proxy ' + proxy)
                request.meta['proxy'] = uri

    @classmethod
    def from_crawler(cls, crawler):
        settings = crawler.settings
        return cls(
            proxy_url=settings.get('PROXY_URL')
        )


class CookiesMiddleware():
    def __init__(self, cookies_url):
        self.logger = logging.getLogger(__name__)
        self.cookies_url = cookies_url

    def get_random_cookies(self):
        """Fetch one random cookie set (JSON) from the cookies pool API."""
        try:
            response = requests.get(self.cookies_url)
            if response.status_code == 200:
                cookies = json.loads(response.text)
                return cookies
        except requests.ConnectionError:
            return False

    def process_request(self, request, spider):
        self.logger.debug('Fetching cookies')
        cookies = self.get_random_cookies()
        if cookies:
            request.cookies = cookies
            self.logger.debug('Using cookies ' + json.dumps(cookies))

    @classmethod
    def from_crawler(cls, crawler):
        settings = crawler.settings
        return cls(
            cookies_url=settings.get('COOKIES_URL')
        )
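To wire these two middlewares into a project, settings.py needs entries along the following lines. This is a minimal sketch: the module path myproject.middlewares, the priority numbers, and the two endpoint URLs are assumptions about a typical setup (for example the ProxyPool project above serving its random-proxy API locally), not values taken from the original post.
settings.py:
DOWNLOADER_MIDDLEWARES = {
    'myproject.middlewares.CookiesMiddleware': 554,
    'myproject.middlewares.ProxyMiddleware': 555,
}

# Endpoint that returns one random proxy as plain text (e.g. the ProxyPool API)
PROXY_URL = 'http://localhost:5555/random'
# Endpoint that returns one random cookie set as a JSON object
COOKIES_URL = 'http://localhost:5000/weibo/random'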
Example 2
Crawl free proxies from the web in real time and validate them to build a list of usable proxies; each request then picks one from the list at random. Suited to testing. A settings sketch for registering the middleware follows the spider code below.
middlewares.py:
import logging
import random


class ProxyMiddleware(object):
    def __init__(self):
        self.logger = logging.getLogger(__name__)

    def process_request(self, request, spider):
        # To switch to a proxy only after a failed attempt, guard this block with:
        # if request.meta.get('retry_times'):
        proxy = random.choice(spider.proxies)
        # The proxies were validated over http:// in proxy.py, so use the same
        # scheme when handing them to Scrapy.
        uri = 'http://{proxy}'.format(proxy=proxy)
        self.logger.debug('Using proxy ' + proxy)
        request.meta['proxy'] = uri
Proxy module, proxy.py:
import asyncio
import sys
import time

import aiohttp
import requests
from lxml import etree

TEST_URL = 'https://www.jd.com/'


class GetProxy(object):
    def __init__(self):
        self.TEST_URL = TEST_URL
        self.VALID_STATUS_CODES = [200]
        self.usable = set()

    def crawl_66ip(self):
        """Scrape a batch of free proxies (ip:port) from 66ip.cn."""
        proxies = set()
        url = 'http://www.66ip.cn/areaindex_35/index.html'
        res = requests.get(url)
        res.encoding = 'gbk'
        html = etree.HTML(res.text)
        lst = html.xpath('//div[@id="footer"]//table//tr[position()>1]')
        for i in lst:
            ip = i.xpath('.//td/text()')[0]
            port = i.xpath('.//td/text()')[1]
            proxy = ip + ':' + port
            proxies.add(proxy)
        return list(proxies)

    async def test_single_proxy(self, proxy):
        """Request TEST_URL through the proxy and keep it if it responds with 200."""
        conn = aiohttp.TCPConnector(ssl=False)
        async with aiohttp.ClientSession(connector=conn) as session:
            try:
                if isinstance(proxy, bytes):
                    proxy = proxy.decode('utf-8')
                real_proxy = 'http://' + proxy
                async with session.get(self.TEST_URL, proxy=real_proxy,
                                       timeout=aiohttp.ClientTimeout(total=15),
                                       allow_redirects=False) as response:
                    if response.status in self.VALID_STATUS_CODES:
                        self.usable.add(proxy)
                        print('Proxy works', proxy)
                    else:
                        print('Invalid response status', response.status, 'IP', proxy)
            except Exception:
                print('Proxy request failed', proxy)

    def run(self):
        proxies = self.crawl_66ip()
        print('Number of proxies crawled: %d' % len(proxies))
        print('Starting tests:\n')

        async def test_all():
            # asyncio.wait() no longer accepts bare coroutines on newer Python,
            # so gather the checks inside one coroutine and run it with asyncio.run().
            await asyncio.gather(*(self.test_single_proxy(proxy) for proxy in proxies))

        asyncio.run(test_all())
        sys.stdout.flush()
        time.sleep(5)

    def get_proxy_list(self):
        self.run()
        print('Number of usable proxies: %d' % len(self.usable))
        print(self.usable)
        return list(self.usable)
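The proxy module can also be smoke-tested on its own before wiring it into the spider. A minimal sketch, assuming proxy.py is run directly:
if __name__ == '__main__':
    # Crawl, validate, and print the currently usable free proxies.
    usable = GetProxy().get_proxy_list()
    print('First few usable proxies:', usable[:3])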
spider.py:
import scrapy

from .proxy import GetProxy  # assumes proxy.py lives in the same package


class AitaotuSpider(scrapy.Spider):
    name = 'aitaotu'
    allowed_domains = ['aitaotu.com']
    start_urls = ['https://www.aitaotu.com/dmtp/']

    # Validated once at class definition time; ProxyMiddleware samples from this list.
    G = GetProxy()
    proxies = G.get_proxy_list()
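As with Example 1, the middleware has to be registered in the project's settings.py. A minimal sketch; the module path aitaotu.middlewares and the priority value are assumptions about the project layout, not taken from the original post.
settings.py:
DOWNLOADER_MIDDLEWARES = {
    'aitaotu.middlewares.ProxyMiddleware': 555,
}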