技术交流:(Python)对新闻的爬取

  • Post author:
  • Post category:python


今天接了一个python小脚本,发来一起分享。

要求:

要求

废话不说,直接上代码:

import os
import requests
from lxml import etree
from bs4 import BeautifulSoup
import re
import urllib.request

# 获取源码
def getHTML(url):
    a = urllib.request.urlopen(url)  # 打开指定网址
    html = a.read()  # 读取网页源码
    html = html.decode("utf-8")  # 解码为unicode码
    return str(html)

#去除HTML标签
def remoteHTMLTag(html):
    soup = BeautifulSoup(html, 'html.parser')
    return str(soup.get_text())

# 获取源码
def get_url(url):
    response = requests.get(url)
    response.encoding = 'utf-8'
    html = etree.HTML(response.text)
    return html

# 获取全部国内新闻
def allNews(url):
    html = get_url(url)
    name = html.xpath('//*[@id="rank-defList"]/div/div[2]/div/h3/a/text()')  # 新闻名称
    href = html.xpath('//*[@id="rank-defList"]/div/div[2]/div/h3/a/@href')  # 对应的新闻链接
    for num in range(len(name)):
        if(num<50):# 只要50篇
            new(num,href[num],name[num])
        else:
            break

# 单个新闻内容(地址,新闻名称)
def new(index,url, header):
    # index 文件名称
    # print(getHTML(url)) #单个新闻源码
    # print(remoteHTMLTag(getHTML(url)))#单个新闻内容(去除HTML标签)
    # writeNewSrc(index,getHTML(url))
    writeNewTxt(index, header, remoteHTMLTag(getHTML(url)))
    print("完成"+header+"的爬取")
#打印src
def writeNewSrc(index, html):
    if not os.path.exists("src"):
        os.makedirs("src")
    filename = "src" + "//" + str(index+1) + ".txt"
    if not os.path.exists(filename):

        f = open(filename, 'a', encoding='utf-8')
        f.write(html)
        f.close()
        print("成功打印源碼,目录:"+filename)
#打印txt
def writeNewTxt(index,header,content):
    if not os.path.exists("txt"):
        os.makedirs("txt")
    filename = "txt" + "//" + str(index+1) + ".txt"
    removeContent=''.join(content.split("\n"))

    if not os.path.exists(filename):
        f = open(filename, 'a', encoding='utf-8')
        f.write(str(header))
        f.write("\n\n")
        f.write(removeContent)
        f.close()
        print("成功打印文件,目录:" + filename )

#
if __name__ == '__main__':
    url = 'https://news.china.com/domestic/index.html';
    allNews(url)

特别鸣谢:https://news.china.com/



版权声明:本文为Ddsof_Cai原创文章,遵循 CC 4.0 BY-SA 版权协议,转载请附上原文出处链接和本声明。