Python Web Scraping Workflow Template



The basic workflow of a Python web scraper is the same regardless of the target site. First, send a request to the website, using the requests module or urllib.request; for dynamic sites you can drive a real browser with selenium. Once the server returns the page data, parse it to extract the information you care about: use the re module for regular-expression matching, XPath or BeautifulSoup for selecting nodes, or, if the response is JSON, simply parse it with the json module. Collect the parsed records in an iterable such as a list of tuples, and finally save the data. Common options are writing straight to a CSV file or inserting into a MySQL database with pymysql. A rough skeleton of this pipeline is sketched right below.
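As a sketch only (the URL is a placeholder, and get_page, parse_page, and save_page stand in for the functions defined in the sections that follow), the three steps chain together like this:

#overall skeleton: request -> parse -> save
def main():
    url = 'https://example.com/page1'    #placeholder URL
    html = get_page(url)                 #step 1: request the page source
    info_list = parse_page(html)         #step 2: extract the fields of interest
    save_page(info_list)                 #step 3: write to CSV or MySQL

if __name__ == '__main__':
    main()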



Requesting Data

######1. Send a Data Request to the Website######

#1. requests
import requests

headers = {'User-Agent': 'Mozilla/5.0'}    #always send a User-Agent, or many sites will block the request

def get_page(url):
    html = requests.get(url=url, headers=headers).content.decode('utf-8')
#     html = requests.get(url=url, headers=headers).text       #.text is a property, not a method
#     html = requests.get(url=url, headers=headers).json()     #for JSON responses

    return html
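A minimal usage example, assuming the placeholder URL responds with an HTML page:

html = get_page('https://example.com/')    #decoded page source as a string
print(html[:200])                          #preview the first 200 characters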

#2. selenium (drive a real browser)
from selenium import webdriver
import time

#add the headless option (no visible browser window)
options = webdriver.ChromeOptions()
options.add_argument('--headless')

#create the browser object, passing in the options defined above
browser = webdriver.Chrome(options=options)

browser.get('https://www.baidu.com')

#type into the search box (id='kw')
word = input('Enter a search term: ')
browser.find_element_by_id('kw').send_keys(word)
#click the search button (id='su')
browser.find_element_by_id('su').click()

#give the results page time to load, then click the "next page" link
time.sleep(2)
browser.find_element_by_class_name('n').click()

##scroll the page all the way to the bottom (for pages that lazy-load on scroll)
def scroll_to_bottom(driver):
    js = "return document.body.scrollHeight"
    # current scroll position, starting at the top
    height = 0
    # total height of the page as currently rendered
    new_height = driver.execute_script(js)

    while height < new_height:
        # scroll down in 100px steps until we reach the current bottom
        for i in range(height, new_height, 100):
            driver.execute_script('window.scrollTo(0, {})'.format(i))
        time.sleep(3)
        height = new_height
        time.sleep(3)
        # the page may have grown after lazy-loading; re-measure and keep going
        new_height = driver.execute_script(js)
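Once the page has loaded, a typical call sequence looks like this (page_source and quit() are standard selenium methods):

scroll_to_bottom(browser)      #trigger any lazy-loaded content
html = browser.page_source     #grab the fully rendered HTML for parsing
browser.quit()                 #always close the browser when finished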

#3. urllib
#request module
from urllib import request
#URL-encoding module
from urllib import parse
#define the variables
word = input('Enter a search term: ')
url = "http://www.baidu.com/s?"
headers = {"User-Agent":"Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 2.0.50727; SLCC2; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.3; .NET4.0C; Tablet PC 2.0; .NET4.0E)"}
#URL-encode the query string and append it to the base URL
query_string = parse.urlencode({'wd': word})
url = url + query_string
#url = "http://www.baidu.com/s?{}".format(parse.urlencode({'wd': word}))
#url = "http://www.baidu.com/s?%s" % parse.urlencode({'wd': word})
#create a Request object to wrap the URL and headers
req = request.Request(url=url, headers=headers)
#send the request and get the response object
response = request.urlopen(req)
#read and decode the body
html = response.read().decode('utf-8')
#save to a file
filename = '{}.html'.format(word)
with open(filename, 'w', encoding='utf-8') as f:
    f.write(html)



Parsing Data

######2. Parse the Data######

#1. the re module (regular expressions)
import re

def parse_page(html):
    #shop name, taken from the "favorite" link
    pattern1 = re.compile('data-fav-favortype="1" data-name="(.*?)"><i></i><span>收藏</span></a>',re.S)
    shop_list = pattern1.findall(html)
    #taste score
    pattern2 = re.compile('<span >口味<b><svgmtsi class="shopNum">(.*?)</b></span>',re.S)
    taste_list = pattern2.findall(html)
    #environment score
    pattern3 = re.compile('<span >环境<b><svgmtsi class="shopNum">(.*?)</b></span>',re.S)
    envir_list = pattern3.findall(html)
    #service score
    pattern4 = re.compile('<span >服务<b><svgmtsi class="shopNum">(.*?)</b></span>',re.S)
    serve_list = pattern4.findall(html)

    #zip the parallel lists into (shop, taste, environment, service) tuples
    info_list = list(zip(shop_list, taste_list, envir_list, serve_list))

    return info_list

#2. XPath node selection
from lxml import etree

def parse_page(html):
    parse_html = etree.HTML(html)
    #dict used to collect the fields of one listing
    house_dict = {}
    #list that accumulates every listing, ready for storage later
    house_list = []
    #the <li> nodes that each wrap one listing on the page
    dd_list = parse_html.xpath('//ul[@class="sellListContent"]/li[@class="clear LOGCLICKDATA"]|//ul[@class="sellListContent"]/li[@class="clear LOGVIEWDATA LOGCLICKDATA"]')
    for dd in dd_list:
        house_dict['address'] = dd.xpath('.//div[@class="title"]/a/text()')[0].strip()
        house_dict['introduction'] = dd.xpath('.//div[@class="houseInfo"]/text()')[0].strip()
        unit_price = dd.xpath('.//div[@class="unitPrice"]/span/text()')[0].strip()
        house_dict['unit_price'] = unit_price[2:]
        total_price = dd.xpath('.//div[@class="totalPrice totalPrice2"]/span/text()')[0].strip()
        total_price = float(total_price) * 10000
        house_dict['total_price'] = total_price
        house = tuple(house_dict.values())
        house_list.append(house)

    return house_list

#3. the json module
import json

def parse_page(html):
    #the response is JSONP: strip the fetchJSON_comment98(...) wrapper,
    #then json.loads() turns the JSON string into Python dicts and lists
    json_html = json.loads(html.split('fetchJSON_comment98(')[1][:-2])
    comments_dict = {}
    comments_list = []
    for comment in json_html['comments']:
        comments_dict['referenceName'] = comment['referenceName']
        comments_dict['creationTime'] = comment['creationTime']
        comments_dict['nickname'] = comment['nickname']
        comments_dict['content'] = comment['content']
        comments_list.append(list(comments_dict.values()))

    return comments_list
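For reference, this is the shape of JSONP response the function expects; the record below is made up purely to illustrate the parsing:

sample = 'fetchJSON_comment98({"comments": [{"referenceName": "Phone X", "creationTime": "2021-01-01 10:00:00", "nickname": "user1", "content": "Works well"}]});'
print(parse_page(sample))
#[['Phone X', '2021-01-01 10:00:00', 'user1', 'Works well']]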

#4. BeautifulSoup
from bs4 import BeautifulSoup

def parse_page(html):
    soup = BeautifulSoup(html, 'lxml')
    img_ul = soup.find_all('div', {'class': 'li_img'})   #all <div class="li_img"> nodes
    soup.select('.panel .panel-heading')   #class='panel-heading' nodes inside class='panel'
    soup.select('ul li')                   #<li> nodes inside <ul> tags
    soup.select('#list-2 .element')        #class='element' nodes inside id='list-2'
    soup.select('ul')[0]                   #the first <ul> tag
    return img_ul
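A tiny self-contained demonstration of those selectors (the HTML snippet is made up):

html = '<ul id="list-2"><li class="element">a</li><li class="element">b</li></ul>'
soup = BeautifulSoup(html, 'lxml')
print([li.text for li in soup.select('#list-2 .element')])   #['a', 'b']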



Saving Data

######3. Store the Data######

#1. save to a CSV file
import csv

def save_page(info_list):
    with open('data.csv', 'a', encoding='utf-8', newline='') as f:
        writer = csv.writer(f)
        #header row (written on every call; write it once up front if this runs per page)
        writer.writerow(['brand', 'rank', 'min_price', 'max_price', 'sales'])
        writer.writerows(info_list)
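Called like this, with made-up rows matching the header columns:

info_list = [('BrandA', 1, 999, 1999, 5000),
             ('BrandB', 2, 899, 1799, 4200)]
save_page(info_list)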

#2. save to a MySQL database
import pymysql

def save_page(info_list):
    db = pymysql.connect(host='localhost',
                         port=3306,
                         user='root',
                         passwd='your_password',
                         database='stu',
                         charset='utf8')
    #get a cursor
    cursor = db.cursor()
    #the phone table must already exist with columns matching the three placeholders
    ins = 'insert into phone values (%s,%s,%s)'
    cursor.executemany(ins, info_list)
    db.commit()
    cursor.close()
    db.close()
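The original does not show the phone table's definition, so here is a one-off setup sketch with hypothetical column names and types:

#one-off table setup; column names and types are assumptions
db = pymysql.connect(host='localhost', port=3306, user='root',
                     passwd='your_password', database='stu', charset='utf8')
cursor = db.cursor()
cursor.execute("""
    CREATE TABLE IF NOT EXISTS phone (
        brand VARCHAR(100),
        price VARCHAR(50),
        sales VARCHAR(50)
    )""")
db.commit()
cursor.close()
db.close()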
    
#storing an image (binary data) in MySQL
# #connect to the database
# db = pymysql.connect(host = 'localhost',
#                         port = 3306,
#                         user ='root',
#                         passwd = 'your_password',
#                         database = 'stu',
#                         charset = 'utf8')
# #get a cursor
# cursor = db.cursor()

# #read the file as bytes
# with open('mysql.jpg', 'rb') as fd:
#     data = fd.read()

# try:
#     #SQL statement
#     sql = "insert into images values (1,'mysql.jpg',%s);"
#     #let execute() bind the binary content as a parameter
#     cursor.execute(sql, [data])
#     db.commit()
# except Exception as e:
#     #roll back on error
#     db.rollback()
#     print(e)

# cursor.close()
# db.close()


# #reading the image back out
# sql = "select * from images where filename='mysql.jpg'"

# cursor.execute(sql)
# image = cursor.fetchone()
# with open(image[1], 'wb') as fd:
#     fd.write(image[2])

# cursor.close()
# db.close()



A Full Example

To wrap up, here is a case study that scrapes second-hand housing listings from Lianjia, reviewing the overall workflow and packaging all of the code into a single class.

######Example: Putting It All Together######

#scrape second-hand housing listings from Lianjia
import requests
import pymysql
import time
import random
from lxml import etree


class LianjiaSpider:
    def __init__(self):
        self.url = 'https://wh.lianjia.com/ershoufang/pg{}/'
        self.ua_list = [
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.163 Safari/535.1',
            'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 2.0.50727; SLCC2; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.3; .NET4.0C; Tablet PC 2.0; .NET4.0E)',
            'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0) Gecko/20100101 Firefox/6.0'
        ]
        self.page = 1
        self.db = pymysql.connect(host='localhost', user='root', passwd='your_password',
                                  database='lianjiadb', charset='utf8')
        self.cursor = self.db.cursor()
        
        
    def get_page(self,url):
        headers = {'User-Agent':random.choice(self.ua_list)}
        html = requests.get(
            url=url,
            headers=headers).content.decode('utf-8')
        
        self.parse_page(html)
    
    
    def parse_page(self,html):
        parse_html = etree.HTML(html)
        house_dict = {}
        house_list = []
        ins = 'insert into lianjia values (%s,%s,%s,%s)'
        dd_list = parse_html.xpath('//ul[@class="sellListContent"]/li[@class="clear LOGCLICKDATA"]|//ul[@class="sellListContent"]/li[@class="clear LOGVIEWDATA LOGCLICKDATA"]')
        for dd in dd_list:
            house_dict['address'] = dd.xpath('.//div[@class="title"]/a/text()')[0].strip()
            house_dict['introduction'] = dd.xpath('.//div[@class="houseInfo"]/text()')[0].strip()
            unit_price = dd.xpath('.//div[@class="unitPrice"]/span/text()')[0].strip()
            house_dict['unit_price'] = unit_price[2:]
            total_price = dd.xpath('.//div[@class="totalPrice totalPrice2"]/span/text()')[0].strip()
            total_price = float(total_price) * 10000
            house_dict['total_price'] = total_price
            house = tuple(house_dict.values())
            house_list.append(house)
        print(house_list)
        self.cursor.executemany(ins,house_list)
        self.db.commit()
        
    
    
    def main(self):
        for page in range(1, 101):
            url = self.url.format(page)
            self.get_page(url)
            time.sleep(random.randint(1,3))
            print('Finished scraping page %s' % self.page)
            self.page += 1
        self.cursor.close()
        self.db.close()
    
    
if __name__ == '__main__':
    start = time.time()
    spider = LianjiaSpider()
    spider.main()
    end = time.time()
    print('Elapsed time: %.2f s' % (end - start))
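The script assumes a lianjiadb database containing a lianjia table with four columns matching the insert statement. The original does not show its definition, so the sketch below reuses the field names collected above (address, introduction, unit_price, total_price) with assumed types:

#one-off table setup; run once before starting the spider (types are assumptions)
db = pymysql.connect(host='localhost', user='root', passwd='your_password',
                     database='lianjiadb', charset='utf8')
cursor = db.cursor()
cursor.execute("""
    CREATE TABLE IF NOT EXISTS lianjia (
        address VARCHAR(255),
        introduction VARCHAR(255),
        unit_price VARCHAR(50),
        total_price FLOAT
    )""")
db.commit()
cursor.close()
db.close()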



Copyright notice: this is an original article by weixin_50646402, released under the CC 4.0 BY-SA license. Please include a link to the original and this notice when reposting.