Web scraping with Python follows the same basic workflow regardless of the target site. First, send a request for the page data: the requests module or urllib.request works for static pages, while selenium can drive a real browser for dynamic sites that render content with JavaScript. Once the server's response arrives, parse it to extract the information you care about: the re module for regular-expression matching, XPath (via lxml) or BeautifulSoup for selecting nodes, or the json module when the response body is JSON. Collect the parsed records in an iterable such as a list of tuples, and finally persist them; the two most common options are writing a CSV file with the csv module or inserting into a MySQL database with pymysql.
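As a quick orientation before the detailed sections, here is a minimal sketch of that request → parse → save pipeline in one function. The URL, the CSS selector, and the output filename are placeholders chosen purely for illustration, not taken from any real site.
import csv
import requests
from bs4 import BeautifulSoup

def crawl(url):
    # 1. Request: fetch the raw HTML (User-Agent is a minimal placeholder)
    headers = {'User-Agent': 'Mozilla/5.0'}
    html = requests.get(url, headers=headers).text
    # 2. Parse: pull out the fields of interest (selector is illustrative)
    soup = BeautifulSoup(html, 'lxml')
    rows = [(item.get_text(strip=True),) for item in soup.select('h2.title')]
    # 3. Save: append the records to a CSV file
    with open('result.csv', 'a', encoding='utf-8', newline='') as f:
        csv.writer(f).writerows(rows)

# crawl('https://example.com/list')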
Requesting data
###### 1. Sending a request to the website ######
# 1. requests
import requests

# headers used by get_page (a minimal placeholder User-Agent)
headers = {'User-Agent': 'Mozilla/5.0'}

def get_page(url):
    html = requests.get(url=url,
                        headers=headers).content.decode('utf-8')
    # html = requests.get(url=url,
    #                     headers=headers).text    # .text is a property, not a method
    # html = requests.get(url=url,
    #                     headers=headers).json()  # when the response body is JSON
    return html
# 2. selenium: driving a real browser
from selenium import webdriver
import time
# add the headless option (run Chrome without a visible window)
options = webdriver.ChromeOptions()
options.add_argument('--headless')
# create the browser object with those options
browser = webdriver.Chrome(options=options)
browser.get('https://www.baidu.com')
# type a keyword into the search box
word = input('Enter the search keyword: ')
browser.find_element_by_id('kw').send_keys(word)
# click the "Baidu Search" button
browser.find_element_by_id('su').click()
# give the results page time to load
time.sleep(2)
# click the "next page" link
browser.find_element_by_class_name('n').click()
## simulate scrolling to the bottom of the page
def scroll_to_bottom(driver):
    js = "return document.body.scrollHeight"
    # current scroll position, starting at the top
    height = 0
    # total height of the page so far
    new_height = driver.execute_script(js)
    while height < new_height:
        # move the scrollbar down to the bottom in 100px steps
        for i in range(height, new_height, 100):
            driver.execute_script('window.scrollTo(0, {})'.format(i))
        time.sleep(3)
        height = new_height
        # wait for lazily loaded content, then re-measure the page height
        time.sleep(3)
        new_height = driver.execute_script(js)
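# A minimal usage sketch (my addition, not part of the original listing):
# once selenium has rendered and scrolled the page, the HTML can be handed
# to the parsers in section 2 below.
scroll_to_bottom(browser)
rendered_html = browser.page_source   # fully rendered HTML after scrolling
browser.quit()                        # close the browser when finished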
# 3. urllib
# request module
from urllib import request
# URL-encoding module
from urllib import parse
# define the variables
word = input('Enter the search keyword: ')
url = "http://www.baidu.com/s?"
headers = {"User-Agent":"Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 2.0.50727; SLCC2; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.3; .NET4.0C; Tablet PC 2.0; .NET4.0E)"}
# URL-encode the keyword and build the full URL
query_string = parse.urlencode({'wd': word})
url = url + query_string
# url = "http://www.baidu.com/s?{}".format(parse.urlencode({'wd': word}))
# url = "http://www.baidu.com/s?%s" % parse.urlencode({'wd': word})
# create a Request object wrapping the URL and headers
req = request.Request(url=url, headers=headers)
# get the response object
respond = request.urlopen(req)
# read the response body
html = respond.read().decode('utf-8')
# save it to a file
filename = '{}.html'.format(word)
with open(filename, 'w', encoding='utf-8') as f:
    f.write(html)
Parsing data
###### 2. Parsing the data ######
# 1. The re module
import re

def parse_page(html):
    # shop name plus taste / environment / service scores on a Dianping listing page
    pattern1 = re.compile('data-fav-favortype="1" data-name="(.*?)"><i></i><span>收藏</span></a>', re.S)
    shop_list = pattern1.findall(html)
    pattern2 = re.compile('<span >口味<b><svgmtsi class="shopNum">(.*?)</b></span>', re.S)
    taste_list = pattern2.findall(html)
    pattern3 = re.compile('<span >环境<b><svgmtsi class="shopNum">(.*?)</b></span>', re.S)
    envir_list = pattern3.findall(html)
    pattern4 = re.compile('<span >服务<b><svgmtsi class="shopNum">(.*?)</b></span>', re.S)
    serve_list = pattern4.findall(html)
    # zip the four lists into one list of (shop, taste, environment, service) tuples
    info_list = list(zip(shop_list, taste_list, envir_list, serve_list))
    return info_list
# 2. Selecting nodes with XPath
from lxml import etree

def parse_page(html):
    parse_html = etree.HTML(html)
    # dict used to collect the fields of one listing
    house_dict = {}
    # list that gathers every listing, ready for later storage
    house_list = []
    # nodes that wrap each listing on the page
    dd_list = parse_html.xpath('//ul[@class="sellListContent"]/li[@class="clear LOGCLICKDATA"]|//ul[@class="sellListContent"]/li[@class="clear LOGVIEWDATA LOGCLICKDATA"]')
    for dd in dd_list:
        house_dict['address'] = dd.xpath('.//div[@class="title"]/a/text()')[0].strip()
        house_dict['introduction'] = dd.xpath('.//div[@class="houseInfo"]/text()')[0].strip()
        unit_price = dd.xpath('.//div[@class="unitPrice"]/span/text()')[0].strip()
        house_dict['unit_price'] = unit_price[2:]
        total_price = dd.xpath('.//div[@class="totalPrice totalPrice2"]/span/text()')[0].strip()
        total_price = float(total_price) * 10000
        house_dict['total_price'] = total_price
        house = tuple(house_dict.values())
        house_list.append(house)
    return house_list
# 3. The json module
import json

def parse_page(html):
    # json.loads() turns the JSON string (the part inside the JSONP wrapper
    # fetchJSON_comment98(...)) into a Python dict
    json_html = json.loads(html.split('fetchJSON_comment98(')[1][:-2])
    comments_dict = {}
    comments_list = []
    for comment in json_html['comments']:
        comments_dict['referenceName'] = comment['referenceName']
        comments_dict['creationTime'] = comment['creationTime']
        comments_dict['nickname'] = comment['nickname']
        comments_dict['content'] = comment['content']
        # 'type' is assumed to be a module-level variable (e.g. the product
        # category being crawled) defined outside this function
        comments_dict['type'] = type
        comments_list.append(list(comments_dict.values()))
    return comments_list
# 4. BeautifulSoup
from bs4 import BeautifulSoup

def parse_page(html):
    soup = BeautifulSoup(html, 'lxml')
    img_ul = soup.find_all('div', {'class': 'li_img'})
    soup.select('.panel .panel-heading')  # nodes with class 'panel-heading' inside class 'panel'
    soup.select('ul li')                  # li nodes inside ul tags
    soup.select('#list-2 .element')       # nodes with class 'element' inside id 'list-2'
    soup.select('ul')[0]                  # the first ul tag
    return img_ul
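As a hedged usage sketch, the nodes selected above can be reduced to plain values before being handed to the storage functions in the next section. The URL below is a placeholder, and the assumption that each matched div wraps an img tag with a src attribute is mine, not taken from any particular page.
html = get_page('https://example.com/gallery')   # placeholder URL
img_divs = parse_page(html)
img_list = [(div.find('img').get('src'),) for div in img_divs if div.find('img')]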
Saving data
###### 3. Storing the data ######
# 1. Saving to a CSV file
import csv

def save_page(info_list):
    with open('data.csv', 'a', encoding='utf-8', newline='') as f:
        writer = csv.writer(f)
        # header row: brand, rank, lowest price, highest price, sales volume
        writer.writerow(['brand', 'rank', 'lowest_price', 'highest_price', 'sales'])
        writer.writerows(info_list)
# 2. Saving to a MySQL database
import pymysql

def save_page(info_list):
    db = pymysql.connect(host='localhost',
                         port=3306,
                         user='root',
                         passwd='your_password',
                         database='stu',
                         charset='utf8')
    # get a cursor
    cursor = db.cursor()
    ins = 'insert into phone values (%s,%s,%s)'
    cursor.executemany(ins, info_list)
    db.commit()
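# Note: executemany() above assumes a table whose columns match the tuples in
# info_list already exists. A hypothetical three-column schema (the column
# names here are illustrative only) could be created once beforehand:
# cursor.execute('create table if not exists phone '
#                '(brand varchar(100), price float, sales int)')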
# Storing an image (kept commented out as an optional extra)
# # connect to the database
# db = pymysql.connect(host='localhost',
#                      port=3306,
#                      user='root',
#                      passwd='your_password',
#                      database='stu',
#                      charset='utf8')
# # get a cursor
# cursor = db.cursor()
# # read the image file as bytes
# with open('mysql.jpg', 'rb') as fd:
#     data = fd.read()
# try:
#     # SQL statement; execute() passes the binary content in as a parameter
#     sql = "insert into images values (1,'mysql.jpg',%s);"
#     cursor.execute(sql, [data])
#     db.commit()
# except Exception as e:
#     # roll back on error
#     db.rollback()
#     print(e)
# cursor.close()
# db.close()
# # Retrieving the image (reopen the connection and cursor first)
# sql = "select * from images where filename='mysql.jpg'"
# cursor.execute(sql)
# image = cursor.fetchone()
# with open(image[1], 'wb') as fd:
#     fd.write(image[2])
# cursor.close()
# db.close()
Crawler example
The example below revisits the whole workflow by scraping Lianjia second-hand housing listings, wrapping all of the code into a single class.
###### Example: wrapping it all in a class ######
# Scrape Lianjia second-hand housing listings
import requests
import pymysql
import time
import random
from lxml import etree

class LianjiaSpider:
    def __init__(self):
        self.url = 'https://wh.lianjia.com/ershoufang/pg{}/'
        self.ua_list = [
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.163 Safari/535.1',
            'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 2.0.50727; SLCC2; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.3; .NET4.0C; Tablet PC 2.0; .NET4.0E)',
            'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0) Gecko/20100101 Firefox/6.0'
        ]
        self.page = 1
        self.db = pymysql.connect(host='localhost', user='root',
                                  passwd='your_password', database='lianjiadb',
                                  charset='utf8')
        self.cursor = self.db.cursor()

    def get_page(self, url):
        # pick a random User-Agent and fetch the page
        headers = {'User-Agent': random.choice(self.ua_list)}
        html = requests.get(
            url=url,
            headers=headers).content.decode('utf-8')
        self.parse_page(html)

    def parse_page(self, html):
        parse_html = etree.HTML(html)
        house_dict = {}
        house_list = []
        ins = 'insert into lianjia values (%s,%s,%s,%s)'
        # nodes that wrap each listing on the page
        dd_list = parse_html.xpath('//ul[@class="sellListContent"]/li[@class="clear LOGCLICKDATA"]|//ul[@class="sellListContent"]/li[@class="clear LOGVIEWDATA LOGCLICKDATA"]')
        for dd in dd_list:
            house_dict['address'] = dd.xpath('.//div[@class="title"]/a/text()')[0].strip()
            house_dict['introduction'] = dd.xpath('.//div[@class="houseInfo"]/text()')[0].strip()
            unit_price = dd.xpath('.//div[@class="unitPrice"]/span/text()')[0].strip()
            house_dict['unit_price'] = unit_price[2:]
            total_price = dd.xpath('.//div[@class="totalPrice totalPrice2"]/span/text()')[0].strip()
            total_price = float(total_price) * 10000
            house_dict['total_price'] = total_price
            house = tuple(house_dict.values())
            house_list.append(house)
        print(house_list)
        self.cursor.executemany(ins, house_list)
        self.db.commit()

    def main(self):
        for page in range(1, 101):
            url = self.url.format(page)
            self.get_page(url)
            time.sleep(random.randint(1, 3))
            print('Page %s finished' % self.page)
            self.page += 1
        self.cursor.close()
        self.db.close()

if __name__ == '__main__':
    start = time.time()
    spider = LianjiaSpider()
    spider.main()
    end = time.time()
    print('Elapsed time: %.2f' % (end - start))