Scraping classical Chinese poems with 100 lines of Python

I have recently been working on an AI poetry project and needed to train a classical-poem generation model, so I wrote a small crawler to collect poem data. The crawler walks three levels of pages on gushimi.org: the poet index, each poet's poem list, and each poem's detail page, and appends every poem as a JSON record to result.json.

# -*- coding:utf-8 -*-
from bs4 import BeautifulSoup
from urllib import request
import chardet
import json
import re
import time

index = 0  # running index of collected poems
poet = 0   # running index of poets
def getSoup(url):
    """Fetch a page and return a BeautifulSoup object."""
    fp = request.urlopen(url)
    content = fp.read()
    fp.close()
    det = chardet.detect(content)  # detect the page encoding
    if det['confidence'] > 0.8:    # trust the detection only above 0.8 confidence
        html = content.decode(det['encoding'])
    else:
        html = content.decode('gbk')
    soup = BeautifulSoup(html, 'html.parser')
    return soup
# Next page
def nextUrl(soup):
    """Return the link suffix of the "next page" (下一页) button, or None on the last page."""
    a = soup.find('a', text=re.compile("^下一页"))
    if a:
        return a.attrs['href']
    else:
        return None
# Level-1 page: the list of poets
def firstPage():
    start = time.time()
    print("Start time:", start)
    page = 1  # page number
    nt = '/shiren/index.html'
    global poet
    while nt:
        print('------------------ Page ' + str(page) + ' -------------')
        soup = getSoup('https://www.gushimi.org' + nt)
        ol = soup.findAll('div', attrs={"class": 'news_title'})
        for div in ol:
            print(str(poet) + ":" + div.a.text)
            poet = poet + 1
            secondPageUrl = 'https://www.gushimi.org' + div.a.attrs['href']
            secondPage(secondPageUrl)
            print('------------------ Finished this poet -------------')
        nt = nextUrl(soup)
        page = page + 1
    end = time.time()
    print("End time:", end)
    print("Total time:", end - start)
# Level-2 page: the list of poems by one poet
def secondPage(url):
    soup = getSoup(url)
    ol = soup.findAll('div', attrs={"class": "content_box"})
    for li in ol[2].findAll("li"):  # the third content_box holds the poem list
        thirdPageUrl = 'https://www.gushimi.org' + li.select('a')[0].attrs['href']
        thirdPage(thirdPageUrl)
# Level-3 page: the detail page of a single poem
def thirdPage(url):
    global index
    soup = getSoup(url)
    str0 = soup.findAll('div', attrs={"class": "box_title"})
    str0 = str0[1].text  # title
    ol = soup.findAll('div', attrs={"class": "news_content"})
    str1 = ol[0].find_all("div", class_="old_h1")[0].select('a')[0].text  # dynasty
    str2 = ol[0].find_all("div", class_="old_h1")[0].select('a')[1].text  # author
    print("      " + str(index) + str2 + ":" + str0)
    contents = ol[0].find_all("div", class_="newstext")[0]
    str3 = []  # lines of the poem
    for li in contents.find_all("div"):
        str3.append(li.text)
    flags = ol[0].findAll("div", class_="newstext")[1]
    str4 = []  # keywords / tags
    for li in flags.find_all("a"):
        str4.append(li.text)
    jsonData = {'index': index, 'title': str0, 'dynasty': str1, 'author': str2, 'paragraphs': str3, 'key': str4}
    index = index + 1
    write_in_json_data(jsonData)
# Append one record to the output file; writing one compact JSON object per line
# (JSON Lines) keeps the appended file easy to parse later
def write_in_json_data(jsonData):
    with open('result.json', 'a', encoding='utf-8') as f:
        json.dump(jsonData, f, ensure_ascii=False)
        f.write('\n')

if __name__ == '__main__':
    firstPage()
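
For reference, here is a minimal sketch of how the crawled data could be loaded back for training, assuming the JSON Lines output produced by write_in_json_data above. The load_poems helper is only an illustration and not part of the crawler itself.

# -*- coding:utf-8 -*-
import json

def load_poems(path='result.json'):
    """Read the crawler output (one JSON object per line) into a list of dicts."""
    poems = []
    with open(path, encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if line:
                poems.append(json.loads(line))
    return poems

if __name__ == '__main__':
    poems = load_poems()
    print(len(poems), 'poems loaded')
    # each record has the fields written by thirdPage:
    # {'index': ..., 'title': ..., 'dynasty': ..., 'author': ..., 'paragraphs': [...], 'key': [...]}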




Copyright notice: This is an original article by xufankang, released under the CC 4.0 BY-SA license. Please include a link to the original article and this notice when reposting.