抓取投机岛期货论坛 并写入数据库

  • Post author:
  • Post category:其他


# -*- coding: utf-8 -*-
import scrapy
import pymysql
import datetime
import re
from scrapy.selector import Selector

class JiaoyizheSpider(scrapy.Spider):
    """Spider for the ``forum-16`` thread listing on www.jiaoyizhe.com.

    ``parse`` walks the paginated listing and schedules one request per
    thread row; ``paper_item`` receives the thread page together with the
    row metadata carried in ``response.meta``.
    """

    name = 'jiaoyizhe'
    start_urls = ['http://www.jiaoyizhe.com/forum-16-1.html']

    # Reference dates used to resolve the relative labels "昨天" (yesterday)
    # and "前天" (the day before yesterday). They are evaluated once, when
    # the class body is executed.
    today = datetime.date.today()
    oneday = datetime.timedelta(days=1)
    twoday = datetime.timedelta(days=2)
    yesterday = today - oneday
    before_yesterday = today - twoday

    # Number of the most recently scheduled list page; advanced by ``parse``.
    page = 1

    def parse(self, response):
        """Parse one list page: schedule every thread, then the next page.

        Yields one ``scrapy.Request`` per thread row (callback
        ``paper_item``) whose ``meta`` carries ``title``, ``title_link``,
        ``click``, ``reply`` and ``time``, followed by a request for the
        next list page while one remains. A field that is absent from the
        row is stored as ``None``.
        """
        for row in response.xpath(r"//table[@id='threadlisttableid']//tbody"):
            href = row.xpath(r"tr/th/a[2]/@href").extract_first()
            if not href:
                # A row without a thread link cannot be followed.
                continue

            # scrapy.Request rejects URLs without a scheme, so resolve the
            # href against the page URL; absolute hrefs come back unchanged.
            title_link = response.urljoin(href)
            raw_time = row.xpath(r"tr/td[2]//span/text()").extract_first()

            request = scrapy.Request(url=title_link, callback=self.paper_item)
            request.meta['title'] = row.xpath(
                r"tr/th/a[2]/text()").extract_first()
            request.meta['title_link'] = title_link
            request.meta['click'] = row.xpath(
                r"tr/td[3]/a/text()").extract_first()
            request.meta['reply'] = row.xpath(
                r"tr/td[3]/em/text()").extract_first()
            request.meta['time'] = self._resolve_post_time(raw_time)
            yield request

        # Strictly "<": once the last page has been parsed there is no
        # further page to request. An unreadable pager stops pagination
        # instead of aborting the whole callback.
        total_pages = self._total_pages(response)
        if total_pages is not None and self.page < total_pages:
            self.page += 1
            url = 'http://www.jiaoyizhe.com/forum-16-' + str(self.page) + '.html'
            yield scrapy.Request(url=url, callback=self.parse)

    @staticmethod
    def _total_pages(response):
        """Return the page count read from the top pager, or ``None``."""
        label = response.xpath(
            r'//span[@id="fd_page_top"]//label/span/text()').extract_first()
        match = re.search(r'\d+', label) if label else None
        return int(match.group()) if match else None

    def _resolve_post_time(self, raw_time):
        """Normalise the time label of a list row.

        Returns ``self.yesterday`` / ``self.before_yesterday`` (``date``
        objects) when the label contains "昨天" / "前天", the cleaned label
        text otherwise, and ``None`` when the row has no time label.
        """
        if raw_time is None:
            return None

        cleaned = raw_time.replace(u'\xa0 ', u' ')

        # Only "yesterday" and "the day before yesterday" need resolving:
        # when the label matches one of them, the date is obtained by
        # subtracting from today's date before it goes to the database.
        # Notes carried over from the original author: store each day's
        # posts, then tokenise them and count how often each word occurs;
        # do the same for Taoguba.
        match = re.search(r"昨天|前天", cleaned, re.S)
        if match is None:
            return cleaned
        if match.group() == "昨天":
            return self.yesterday
        return self.before_yesterday

    def paper_item(self, response):
        """Handle a thread page scheduled by ``parse``.

        Reads the first text node found under ``div.t_fsz`` table cells and
        the row metadata that ``parse`` attached to the request.

        NOTE(review): the values are gathered but neither stored nor yielded
        in the code visible here; the persistence step is presumably missing
        from this listing -- confirm before relying on this callback.
        """
        # First matching text node only; None when the page has none.
        text_paper = response.xpath(
            r'//div[@class="t_fsz"]//td//text()').extract_first()

        title = response.meta['title']
        title_link = response.meta['title_link']
        click = response.meta['click']
        reply = response.meta['reply']
        time = response.meta['time']



版权声明:本文为u011391734原创文章,遵循 CC 4.0 BY-SA 版权协议,转载请附上原文出处链接和本声明。