Setting proxies in Scrapy


Reference project, Weibo crawling:

https://github.com/Python3WebSpider/Weibo


Reference project, building a proxy pool:

https://github.com/Python3WebSpider/ProxyPool



Example 1

Build a proxy pool (or use a paid proxy service) and fetch one random proxy from proxy_url for each request that needs it. Suitable for large-scale crawling.

middlewares.py:

import json
import logging
from scrapy import signals
import requests


class ProxyMiddleware():
    def __init__(self, proxy_url):
        self.logger = logging.getLogger(__name__)
        self.proxy_url = proxy_url
    
    def get_random_proxy(self):
        # Fetch one random proxy (ip:port) from the proxy pool interface
        try:
            response = requests.get(self.proxy_url)
            if response.status_code == 200:
                proxy = response.text
                return proxy
        except requests.ConnectionError:
            return False
    
    def process_request(self, request, spider):
        # Only switch to a proxy once the request is being retried,
        # i.e. the direct request has already failed at least once
        if request.meta.get('retry_times'):
            proxy = self.get_random_proxy()
            if proxy:
                uri = 'https://{proxy}'.format(proxy=proxy)
                self.logger.debug('Using proxy ' + proxy)
                request.meta['proxy'] = uri

    @classmethod
    def from_crawler(cls, crawler):
        settings = crawler.settings
        return cls(
            proxy_url=settings.get('PROXY_URL')
        )


class CookiesMiddleware():
    def __init__(self, cookies_url):
        self.logger = logging.getLogger(__name__)
        self.cookies_url = cookies_url
    
    def get_random_cookies(self):
        # Fetch one random set of cookies from the cookies pool interface
        try:
            response = requests.get(self.cookies_url)
            if response.status_code == 200:
                cookies = json.loads(response.text)
                return cookies
        except requests.ConnectionError:
            return False
    
    def process_request(self, request, spider):
        self.logger.debug('Fetching cookies')
        cookies = self.get_random_cookies()
        if cookies:
            request.cookies = cookies
            self.logger.debug('Using cookies ' + json.dumps(cookies))
    
    @classmethod
    def from_crawler(cls, crawler):
        settings = crawler.settings
        return cls(
            cookies_url=settings.get('COOKIES_URL')
        )
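
To enable both middlewares, register them in settings.py and point them at the pool services. The sketch below is not part of the original post: the module path weibo.middlewares and both URLs are assumptions and must be adjusted to your own project and pool deployment.

# settings.py (sketch; module path and URLs are assumptions)
DOWNLOADER_MIDDLEWARES = {
    'weibo.middlewares.CookiesMiddleware': 554,
    'weibo.middlewares.ProxyMiddleware': 555,
}

# Each GET to these endpoints should return one random proxy / cookie set
PROXY_URL = 'http://127.0.0.1:5555/random'
COOKIES_URL = 'http://127.0.0.1:5000/weibo/random'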


Example 2

Crawl free proxies from the web on the fly, validate them, and return a list of working proxies; each request then picks one at random from that list. Suitable for testing.

middlewares.py:

from scrapy import signals
import logging
import random


class ProxyMiddleware(object):
    def __init__(self):
        self.logger = logging.getLogger(__name__)

    def process_request(self, request, spider):
        # Pick a random proxy from the list the spider built at startup
        # if request.meta.get('retry_times'):
        proxy = random.choice(spider.proxies)
        uri = 'https://{proxy}'.format(proxy=proxy)
        self.logger.debug('Using proxy ' + proxy)
        # logging.warning('Using proxy: ' + proxy)
        request.meta['proxy'] = uri

Proxy module proxy.py:

import asyncio
import aiohttp
import time
import sys
import requests
from lxml import etree

TEST_URL = 'https://www.jd.com/'


class GetProxy(object):
    def __init__(self):
        self.TEST_URL = TEST_URL
        self.VALID_STATUS_CODES = [200]
        self.usable = set()

    def crawl_66ip(self):
        # Crawl free proxies (ip:port) from 66ip.cn
        proxies = set()
        url = 'http://www.66ip.cn/areaindex_35/index.html'
        res = requests.get(url)
        res.encoding = "gbk"
        html = etree.HTML(res.text)
        lst = html.xpath('//div[@id="footer"]//table//tr[position()>1]')
        for i in lst:
            ip = i.xpath('.//td/text()')[0]
            port = i.xpath('.//td/text()')[1]
            proxy = ip + ':' + port
            proxies.add(proxy)
        return list(proxies)

    async def test_single_proxy(self, proxy):
        # Verify a single proxy by requesting TEST_URL through it
        conn = aiohttp.TCPConnector(ssl=False)
        async with aiohttp.ClientSession(connector=conn) as session:
            try:
                if isinstance(proxy, bytes):
                    proxy = proxy.decode('utf-8')
                real_proxy = 'http://' + proxy
                async with session.get(TEST_URL, proxy=real_proxy, timeout=15, allow_redirects=False) as response:
                    if response.status in self.VALID_STATUS_CODES:
                        self.usable.add(proxy)
                        print('Proxy works', proxy)
                    else:
                        print('Invalid response status', response.status, 'IP', proxy)
            except Exception:
                print('Proxy request failed', proxy)

    def run(self):
        proxies = self.crawl_66ip()
        print('Number of proxies crawled: %d' % len(proxies))
        print('Starting tests:\n')
        loop = asyncio.get_event_loop()
        tasks = [self.test_single_proxy(proxy) for proxy in proxies]
        # gather accepts bare coroutines; asyncio.wait no longer does on newer Python
        loop.run_until_complete(asyncio.gather(*tasks))
        sys.stdout.flush()
        time.sleep(5)

    def get_proxy_list(self):
        self.run()
        print('Number of usable proxies: %d' % len(self.usable))
        print(self.usable)
        return list(self.usable)
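
If you want to check the crawled proxies before wiring them into a spider, proxy.py can also be run on its own. A minimal usage sketch (not part of the original code):

if __name__ == '__main__':
    g = GetProxy()
    # Crawls 66ip.cn, validates each proxy against TEST_URL, prints and returns the usable ones
    proxies = g.get_proxy_list()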

spider.py:

import scrapy

from .proxy import GetProxy  # assumed import path; adjust to wherever proxy.py lives


class AitaotuSpider(scrapy.Spider):
    name = 'aitaotu'
    allowed_domains = ['aitaotu.com']
    start_urls = ['https://www.aitaotu.com/dmtp/']

    # Built once when the class is defined; ProxyMiddleware reads spider.proxies
    G = GetProxy()
    proxies = G.get_proxy_list()
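
As in Example 1, ProxyMiddleware must be registered in settings.py. A minimal sketch, assuming the project package is named aitaotu (adjust the module path to your layout):

# settings.py (sketch; module path is an assumption)
DOWNLOADER_MIDDLEWARES = {
    'aitaotu.middlewares.ProxyMiddleware': 543,
}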



Copyright notice: this is an original article by qq_34303423, released under the CC 4.0 BY-SA license. Please include a link to the original source and this notice when reposting.