Using a Selenium middleware in Scrapy to crawl 半次元 (bcy.net) images



spider.py

# -*- coding: utf-8 -*-
import scrapy
import logging, time
from sec_bcy.items import BcyItem

class BcyspiderSpider(scrapy.Spider):
    name = 'bcySpider'

    page_index = 2
    url='https://bcy.net/coser/allwork?&p={0}'
    start_urls = [url.format(str(page_index))]
    # paginate through the coser work list
    def parse(self, response):
        # parse runs on the start URL (page 2) and re-issues each listing page
        # with parse_page as the callback
        while self.page_index < 3:
            page_url = self.url.format(str(self.page_index))
            yield scrapy.Request(page_url, callback=self.parse_page)
            self.page_index += 1
    # collect links to individual coser profile pages
    def parse_page(self, response):
        corser_list = response.xpath("//a[contains(@href,'/u/')]/@href").extract()
        # de-duplicate
        corser_list = list(set(corser_list))
        for coser in corser_list:
            coser_url = "https://bcy.net" + coser
            # the JsPage meta key tells the Selenium downloader middleware to render this page
            yield scrapy.Request(coser_url, callback=self.parse_detail, meta={'JsPage': True})



    # visit a single coser's profile page
    def parse_detail(self, response):
        # Earlier approach: build one BcyItem per coser and accumulate everything into it.
        # item = BcyItem()
        # item['name'] = response.xpath("//a[@class='fz18 lh1d2 white']/text()").extract()[0]
        # item['coser_url'] = response.url
        # item['detail_urls'] = response.xpath("//a[contains(@href,'detail')]/@href").extract()
        # item['detail_urls'] = list(set(item['detail_urls']))
        # item['followers'] = response.xpath("//a[contains(@href,'follower')]/span[2]/text()").extract()[0]
        # item['summary'] = ''.join(response.xpath("//p[@class='fz12 lh1d4 mt12 maxh32 ovf']/text()").extract())
        # item['img_urls'] = []
        detail_urls = response.xpath("//a[contains(@href,'detail')]/@href").extract()
        detail_urls = list(set(detail_urls))
        for each in detail_urls:
            url = "https://bcy.net" + each
            yield scrapy.Request(url, callback=self.parse_pic)
            # The earlier approach attached the shared item plus an isEnd flag to the last
            # detail request, and only yielded the item when isEnd was True:
            #
            # if (item['detail_urls'].index(each)) == (len(item['detail_urls']) - 1):
            #     yield scrapy.Request(url, callback=self.parse_pic, meta={'item': item, 'isEnd': True})
            # else:
            #     yield scrapy.Request(url, callback=self.parse_pic, meta={'item': item, 'isEnd': False})
            #
            # That is unreliable: Scrapy schedules requests asynchronously, so the isEnd=True
            # request can easily finish before the others and the item gets yielded while it is
            # still incomplete. Yielding one item per detail page avoids the problem.





    def parse_pic(self, response):
        item = BcyItem()
        # the coser's name appears under one of two class names depending on the page layout
        name = response.xpath("//a[@class='fz14 dib maxw250 cut']/text()").extract()
        if len(name) > 0:
            item['name'] = name[0]
        else:
            item['name'] = response.xpath("//a[@class='lh24 fz14 name dib mr5']/text()").extract()[0]
        item['detail_url'] = response.url
        img_urls = response.xpath("//img[@class='detail_std detail_clickable']/@src").extract()
        # strip the '/w650' resize suffix so the full-size images are downloaded
        item['img_urls'] = [url.replace('/w650', '') for url in img_urls]
        yield item
        # Old approach, kept for reference: append into the shared item from meta and only
        # yield it on the isEnd request (unreliable because of the async scheduling above).
        # item = response.meta['item']
        # isEnd = response.meta['isEnd']
        # pic_list = response.xpath("//img[@class='detail_std detail_clickable']/@src").extract()
        # for pic in pic_list:
        #     item['img_urls'].append(pic.replace('/w650', ''))
        # if isEnd == True:
        #     yield item
        #     time.sleep(2)
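
The spider imports BcyItem from sec_bcy.items, which the post does not include. A minimal items.py sketch, assuming only the fields the final version of the spider actually sets (the commented-out single-item approach would additionally need coser_url, detail_urls, followers and summary):

# -*- coding: utf-8 -*-
# items.py -- a sketch inferred from the spider above; not shown in the original post
import scrapy

class BcyItem(scrapy.Item):
    name = scrapy.Field()        # coser name
    detail_url = scrapy.Field()  # URL of the work's detail page
    img_urls = scrapy.Field()    # full-size image URLs found on that page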

pipelines.py

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html

import json, os, requests

class SecBcyPipeline(object):
    def __init__(self):
        # each item is appended as a JSON object followed by a comma, so corser.json
        # is a log of objects rather than a strictly valid JSON document
        self.output = open('corser.json', 'w', encoding='utf-8')
        self.num = 0
        self.path = 'H:/CRAWL/pic/'

    def process_item(self, item, spider):
        jsontext = json.dumps(dict(item), ensure_ascii=False) + ',\n'
        self.output.write(jsontext)
        if len(item['img_urls']) != 0:
            # one directory per coser, named after item['name']
            dir_path = self.path + item['name']
            if not os.path.exists(dir_path):
                os.makedirs(dir_path)
            for each in item['img_urls']:
                self.num += 1
                # use the last 36 characters of the URL as the file name,
                # replacing '/' so the name is a valid path component
                with open(dir_path + '/' + each[-36:].replace('/', '-'), 'wb') as handle:
                    print(str(self.num) + ":" + dir_path + '/' + each[-36:])
                    # stream the image down in 1 KB chunks
                    response = requests.get(each, stream=True)
                    for block in response.iter_content(1024):
                        if not block:
                            break
                        handle.write(block)
        return item

    def close_spider(self, spider):
        self.output.close()
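
Not used in the post, but worth noting: Scrapy's built-in ImagesPipeline (it requires Pillow) can take over the streaming download and file naming done manually above. A minimal sketch of the settings it would need, with illustrative values; the item's img_urls field is mapped to the pipeline via IMAGES_URLS_FIELD:

# settings.py -- sketch for using the built-in ImagesPipeline instead of SecBcyPipeline
ITEM_PIPELINES = {'scrapy.pipelines.images.ImagesPipeline': 1}
IMAGES_STORE = 'H:/CRAWL/pic/'        # where downloaded images are written
IMAGES_URLS_FIELD = 'img_urls'        # the item field that holds the image URLs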

middlewares.py

# -*- coding: utf-8 -*-

# Define here the models for your spider middleware
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html

from scrapy import signals
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import logging, time
from scrapy.http import HtmlResponse


class myMiddleware(object):
    # downloader middleware: render pages flagged with meta['JsPage'] in headless Chrome
    def process_request(self, request, spider):
        if 'JsPage' in request.meta:
            chrome_options = Options()
            chrome_options.add_argument('--headless')
            chrome_options.add_argument('--disable-gpu')
            chrome_options.binary_location = r'C:\Program Files (x86)\Google\Chrome\Application\chrome.exe'
            driver = webdriver.Chrome(chrome_options=chrome_options)
            driver.get(request.url)
            # time.sleep(1)  # optionally wait for JS-rendered content to appear
            body = driver.page_source
            current_url = driver.current_url
            driver.quit()  # release the browser once the rendered HTML has been captured
            return HtmlResponse(current_url, body=body, encoding='utf-8', request=request)
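
Starting and quitting a fresh Chrome for every flagged request is slow. A common alternative, sketched below under the same Chrome/Selenium assumptions as above, is to keep one driver per middleware instance and release it when the spider_closed signal fires, which also answers the question of where to call quit():

from scrapy import signals
from scrapy.http import HtmlResponse
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

class SeleniumPerSpiderMiddleware(object):
    # sketch: one shared headless Chrome per spider, released on spider_closed
    def __init__(self):
        chrome_options = Options()
        chrome_options.add_argument('--headless')
        chrome_options.add_argument('--disable-gpu')
        # chrome_options.binary_location can be set as in myMiddleware above if needed
        self.driver = webdriver.Chrome(chrome_options=chrome_options)

    @classmethod
    def from_crawler(cls, crawler):
        s = cls()
        crawler.signals.connect(s.spider_closed, signal=signals.spider_closed)
        return s

    def process_request(self, request, spider):
        if 'JsPage' in request.meta:
            self.driver.get(request.url)
            body = self.driver.page_source
            return HtmlResponse(self.driver.current_url, body=body,
                                encoding='utf-8', request=request)
        return None  # everything else goes through the normal downloader

    def spider_closed(self, spider):
        self.driver.quit()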





class SecBcySpiderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.
    # set up headless Chrome
    # open the site
    # driver.get()
    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.

        # Should return None or raise an exception.
        # Leftover from experimentation: this renders the page again in headless Chrome
        # and discards the result (content is never used); only the warning log below runs.
        chrome_options = Options()
        chrome_options.add_argument('--headless')
        chrome_options.add_argument('--disable-gpu')
        chrome_options.binary_location = r'C:\Program Files (x86)\Google\Chrome\Application\chrome.exe'
        driver = webdriver.Chrome(chrome_options=chrome_options)
        driver.get(response.url)
        content = driver.page_source.encode('utf-8')
        driver.quit()
        logging.warning('!' * 88 + response.url)

        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.

        # Must return an iterable of Request, dict or Item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.

        # Should return either None or an iterable of Response, dict
        # or Item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn’t have a response associated.

        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)


class SecBcyDownloaderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        print('!' * 88 + "from_crawler233")
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s


    def process_request(self, request, spider):
        print('!' * 88 + "process_request")

        # Called for each request that goes through the downloader
        # middleware.

        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called

        # chrome_options = Options()
        # chrome_options.add_argument('--headless')
        # chrome_options.add_argument('--disable-gpu')
        # chrome_options.binary_location = r'C:\Program Files (x86)\Google\Chrome\Application\chrome.exe'
        # driver = webdriver.Chrome(chrome_options=chrome_options)
        # driver.get(request.url)
        # content = driver.page_source.encode('utf-8')
        # driver.quit()


        return None

    def process_response(self, request, response, spider):
        print('!' * 88 + "process_response")
        # Called with the response returned from the downloader.

        # Must either:
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        print('!' * 88 + "process_exception")
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.

        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        print('!' * 88 + "spider_opened")
        spider.logger.info('Spider opened: %s' % spider.name)
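
For the pipeline and the Selenium middleware to run, they have to be enabled in settings.py, which the post does not show. A sketch, assuming the project module is sec_bcy (matching the imports above); the priority numbers are illustrative:

# settings.py -- fragment
ITEM_PIPELINES = {
    'sec_bcy.pipelines.SecBcyPipeline': 300,
}
DOWNLOADER_MIDDLEWARES = {
    'sec_bcy.middlewares.myMiddleware': 543,
}

With these entries in place the crawl is started with scrapy crawl bcySpider (the spider's name attribute).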



Copyright notice: this is an original article by zj457840891, licensed under CC 4.0 BY-SA. Please include a link to the original source and this notice when reposting.