How to use requests
1. Sending requests
requests.get(url, *, headers, params, proxies) - send a GET request
requests.post(url, *, headers, params, proxies) - send a POST request
Parameters:
url - the request address (a website URL, an API/interface address, an image address, etc.)
headers - the request headers (used when setting the cookie and User-Agent)
params - the request parameters
proxies - the proxy settings
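The proxies parameter is not demonstrated below; here is a minimal sketch of how it could be passed (the proxy address 127.0.0.1:8888 is only a placeholder assumption):
# placeholder proxy address - replace with a working proxy before running
proxies = {
    'http': 'http://127.0.0.1:8888',
    'https': 'http://127.0.0.1:8888'
}
requests.get('http://api.tianapi.com/auto/index', proxies=proxies)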
Sending a GET request with the parameters concatenated directly into the URL
requests.get('http://api.tianapi.com/auto/index?key=c9d408fefd8ed4081a9079d0d6165d43&num=10')
Sending a POST request, passing the parameters through params (requests appends them to the URL as the query string)
params = {
    'key': 'c9d408fefd8ed4081a9079d0d6165d43',
    'num': 10
}
requests.post('http://api.tianapi.com/auto/index', params=params)
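Note that params always ends up in the URL. For a POST request whose body should carry form data, the data parameter is normally used instead; a minimal sketch (whether this particular interface accepts form data is an assumption):
data = {
    'key': 'c9d408fefd8ed4081a9079d0d6165d43',
    'num': 10
}
requests.post('http://api.tianapi.com/auto/index', data=data)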
response = requests.get('http://www.yingjiesheng.com/')
2. Getting response information
Set the encoding (only needed when the text comes out garbled)
response.encoding = 'GBK'
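If the page's encoding is not known in advance, requests can also guess it from the response body; a small sketch:
# let requests detect the encoding from the content instead of hard-coding 'GBK'
response.encoding = response.apparent_encoding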
Get the response headers
print(response.headers)
Get the response body
a. Get the text value (for requesting web pages; gives the page source directly)
print(response.text)
b. Get the parsed JSON result (for data interfaces that return JSON)
print(response.json())
c. Get the content value (the raw binary data, used for downloading images, video, and audio)
print(response.content)
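Besides the body, it is often worth checking whether the request itself succeeded before parsing anything; a minimal sketch:
# 200 means the request succeeded
if response.status_code == 200:
    print(response.text)
else:
    print('request failed:', response.status_code)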
Adding request headers
1. Adding a User-Agent
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36'
}
response = requests.get('https://www.51job.com/', headers=headers)
response.encoding = 'GBK'
print(response.text)
2. Adding a cookie
headers = {
'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36',
'cookie':'_zap=579a8112-dbb8-48ea-91a9-568e904cf704; d_c0="AGDeh6I3LhOPTsi4YaLIfH4eWjH8fhbIRok=|1622299106"; _xsrf=cTHZr7llfvi8sEvOtQHGkIGUT2kfuaAU; __snaker__id=kiWhp6LWaQ80YwpC; _9755xjdesxxd_=32; YD00517437729195%3AWM_NI=TVZkK3JtaIXIXSn7lo0aCfQz8hmCWvVGU4Wj3Aecnpj640FqYrCh8lZBVfYBFLbCoWF5IOKBtm7SvKkZbsDkP%2B23fWn39853EuVyGZt%2FqtJlofn57VALxAJhQx94RUa9M2M%3D; YD00517437729195%3AWM_NIKE=9ca17ae2e6ffcda170e2e6eeb1d53af6e8a4dae450b6868bb7c44f968f9aafaa72a5e884d7ea7a83b888a3ee2af0fea7c3b92a988a968cca398aacafaadb439c8da2d8ef65f4bf8d87d472fcb8a3a4d67e96999bd5d662f19fab84ed7caaeafcd1c57b9bae8d89bb689a9e9a87dc4a8694aea4ee5d86eb87b0ca4ab8b3f9b4f541a986ae94c76194f5fed4e55daeedf7d9b83987ab9a9ac652b8f1bf95c968adbaa2d6b65e8394b8a2c96493a8f7d5d15fa3e783d3d037e2a3; YD00517437729195%3AWM_TID=M%2FAfNnIHd%2FhBABFBEBM%2Bzh8Prx%2BAkCV4; l_n_c=1; r_cap_id="NjY5NDNkZWIzY2ZlNDk0MGFkZWNiZjNmYTBjMjQ2NDQ=|1628664254|7d352e74e1eaaeba363546fff8ecc619cbad0025"; cap_id="YjdhMWNkZDM4M2M0NDdjYzg5MTcyZWQ5YzFiMjRjNTQ=|1628664254|7f024e77a0a5a4d74ae67f9e2d5f748ff7313d41"; l_cap_id="MzNmZWEzNjVkNjM0NDAyOTgzZGRjNjc3ZWU3YWQwMTc=|1628664254|e17e11ac337e33bf7a3b0de4cb3a89339470e48a"; n_c=1; atoken=285973C9FDCB4F51B823847A2E8B351B; atoken_expired_in=7776000; capsion_ticket="2|1:0|10:1628664631|14:capsion_ticket|44:ZWQwN2FhZTU1NjJjNDZiZTkwN2ViYjQ2NTNmZTc0MmU=|790f1dd0edf3248fa8f024ad2a72ad770d33aea4c797c7f4088543e19b2483f9"; auth_type=cXFjb25u|1628664645|0dc12f570f4e1b258686560feea5f591726f99e3; token="Mjg1OTczQzlGRENCNEY1MUI4MjM4NDdBMkU4QjM1MUI=|1628664645|0a27196a7650239f8e5d69361dbbb1a5755d690b"; client_id="NDdFRkMzMzJEQkY1N0RDOTU0OEQ3MEQ3ODBEODYyQ0M=|1628664645|d51e3551769c63454054d83178eacaa7631e8936"; Hm_lvt_98beee57fd2ef70ccdd5ca52b9740c49=1626697003,1626699827,1628664201,1628664688; SESSIONID=F0f1Ozjah3hpYsLm2cDYK8yEpfvlHpvGOkhdRPtH4c4; JOID=UFERBEhVLaZUYMJ9bVC3f7LA8Fl4GBDDYS6zGg88FcMnPbA6DSJQsjBqxHNqYDNNsqzLI5JWwcJyA-NlYeq4eVc=; osd=UlwRCk1XIKZaZcBwbV6yfb_A_lx6FRDNZCy-GgE5F84nM7U4ACJetzJnxH1vYj5NvKnJLpJYxMB_A-1gY-e4d1I=; Hm_lpvt_98beee57fd2ef70ccdd5ca52b9740c49=1628664745; captcha_session_v2="2|1:0|10:1628664745|18:captcha_session_v2|88:QVVOVE84Q0NzV2loUzJBd3UzREVDejJIa2k2c2tJaWU0UVRMU1FpMi9DakNqdVJEMDhFYXhudjJiTmdQbmUxTg==|576f28717ce8bf496750ab8287d49598db22086195912c76bbfa158495c750e1"; KLBRSID=cdfcc1d45d024a211bb7144f66bda2cf|1628664745|1628664200; gdxidpyhxdE=yG%2B3QjZU6Yhoiwtc0rPmfuEl4EuBMQbEpV%2BBiL1QedDhZBIUCjgJm0YBH6yWop0r2g7984gXHjCbpls2NDtb%5CbEryhX8s0BYZxdKE3UCIyGwsw77njE2ja0GzfhkgANNgXO9dK1dOQssqQO2sAngOT0EW7bu7bTBn25RDRiG%2FlAb2Sts%3A1628665942462'
}
response = requests.get('https://www.zhihu.com', headers=headers)
print(response.text)
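Instead of stuffing the whole cookie string into headers, requests also accepts a cookies dictionary of individual name/value pairs; a minimal sketch (only one value is pulled out of the string above as a placeholder, which by itself is usually not enough to stay logged in):
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36'
}
cookies = {
    '_xsrf': 'cTHZr7llfvi8sEvOtQHGkIGUT2kfuaAU'   # placeholder: one value taken from the cookie string above
}
response = requests.get('https://www.zhihu.com', headers=headers, cookies=cookies)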
JSON parsing
import requests
# Find the JSON data interface used by Toutiao (今日头条), then send the request to it
response = requests.get('https://www.toutiao.com/hot-event/hot-board/?origin=toutiao_pc&_signature=_02B4Z6wo00d01X.g2AgAAIDDl0iJmFbkIVl.xNyAAD7ve5rc90eYpUagYiMEKQrfIz8iJPKuacCxb32tQcqbwZpt0i3u2X-hae-fgV3NqtDiEbEJK7EPc235gzTPL4EhVZ7cxFeHkLUI27pv29')
all_news = response.json()['data']
for news in all_news:
    print(news['Title'])
    print(news['Image']['url'])
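If the structure of the returned data is not guaranteed, dict.get() avoids a KeyError on missing fields; a sketch that assumes the same field names as above:
all_news = response.json().get('data') or []
for news in all_news:
    print(news.get('Title'))
    image = news.get('Image') or {}
    print(image.get('url'))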
Image download
import requests
def download_image(img_url):
    # request the image data over the network
    response = requests.get(img_url)
    # save the data to a local file
    data = response.content
    f = open(f'files/{img_url.split("/")[-1]}', 'wb')
    f.write(data)
    f.close()
if __name__ == '__main__':
    download_image('https://p5.toutiaoimg.com/img/pgc-image/9f5d102756354b6db8fa9408c57d01c8~cs_noop.png')
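Both of the download examples write into a files directory and fail if it does not exist; a one-line safeguard (assuming the directory should live next to the script):
import os
os.makedirs('files', exist_ok=True)   # create the save directory if it is missing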
Downloading images from 千图网 (58pic)
import requests
from re import findall
def download_image(img_url):
    # request the image data over the network
    response = requests.get(img_url)
    # save the data to a local file
    data = response.content
    with open(f'files/{img_url.split("/")[-1].split("!")[0]}', 'wb') as fp:
        fp.write(data)
if __name__ == '__main__':
    response = requests.get('https://www.58pic.com/tupian/qixi-0-0.html')
    data = response.text
    result = findall(r'(?s)<img src="(\S+?)">', data)
    for x in result:
        download_image(f'https:{x}')
        print('下载成功')
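For large files (high-resolution images, video, audio), response.content loads everything into memory at once; requests' stream mode can write in chunks instead. A minimal sketch (the function name and chunk size are arbitrary choices):
def download_large_file(url, path):
    # stream=True keeps the body from being read all at once
    response = requests.get(url, stream=True)
    with open(path, 'wb') as fp:
        for chunk in response.iter_content(chunk_size=1024 * 64):
            fp.write(chunk)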
Using bs4
from bs4 import BeautifulSoup
1. Prepare the web page data to parse (in practice it is fetched with requests or selenium)
data = open('test2.html', encoding='utf-8').read()
2. Create a BeautifulSoup object (it can automatically repair broken HTML structure in the data)
# BeautifulSoup(data, parser)
soup = BeautifulSoup(data, 'lxml')
# print(soup)
3. Use the BeautifulSoup object to get tags and tag content
1) Getting tags
The examples below are based on the following contents of test2.html:
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<title>Title</title>
</head>
<body>
<p>我是段落1</p>
<a href="">我是超链接1</a>
<div>
<a href="https://www.baid.com">我是超链接2</a>
<p>我是段落2</p>
<span>
<p id="p1">我是超链接3</p>
</span>
</div>
<img src="http://www.cesi.jpg" alt="">
<span id="s1">我是<b>span1</b></span>
</body>
</html>
BeautifulSoup object.select(CSS selector) – gets all tags matched by the CSS selector; returns a list whose elements are the matched tag objects
BeautifulSoup object.select_one(CSS selector) – gets the first tag matched by the CSS selector; returns a tag object
result = soup.select('p')
print(result) # [<p>我是段落1</p>, <p>我是段落2</p>, <p id="p1">我是超链接3</p>]
result = soup.select_one('p')
print(result) # <p>我是段落1</p>
result = soup.select('#p1')
print(result) # [<p id="p1">我是超链接3</p>]
result = soup.select_one('#p1')
print(result) # <p id="p1">我是超链接3</p>
result = soup.select('div p')
print(result) # [<p>我是段落2</p>, <p id="p1">我是超链接3</p>]
result = soup.select('div>p')
print(result) # [<p>我是段落2</p>]
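select also understands other common CSS selector forms; a couple of sketches against the same sample page (the expected results are inferred from the HTML above and worth verifying):
result = soup.select('a[href]')    # tags that carry an href attribute
result = soup.select('span>p')     # expected: [<p id="p1">我是超链接3</p>]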
2) Getting tag content
a. tag object.string – gets the text inside the tag (only works when the tag's content is pure text; otherwise the result is None)
p2 = soup.select_one('div>p')
# print(p2) # <p>我是段落2</p>
# print(p2.string) # 我是段落2
s1 = soup.select_one('#s1')
# print(s1) # <span id="s1">我是<b>span1</b></span>
# print(s1.string) # None
b. tag object.get_text() – gets all the text contained in the tag
print(p2.get_text()) # '我是段落2'
print(s1.get_text()) # '我是span1'
c. tag object.contents – gets a list of the tag's direct children (text fragments and sub-tags)
print(p2.contents) # ['我是段落2']
# print(s1.contents) # ['我是', <b>span1</b>]
result = s1.contents
print(result) # ['我是', <b>span1</b>]
print(result[-1].get_text()) # span1
3) Getting tag attributes
a1 = soup.select_one('div>a')
print(a1) # <a href="https://www.baid.com">我是超链接2</a>
print(a1.attrs['href']) # https://www.baid.com
img1 = soup.select_one('img')
print(img1) # <img alt="" src="http://www.cesi.jpg"/>
print(img1.attrs['src']) # http://www.cesi.jpg
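Attributes can also be read with subscript syntax on the tag itself; a short sketch (subscripting raises KeyError when the attribute is missing, while attrs.get() returns None):
print(a1['href'])              # same as a1.attrs['href']
print(img1.attrs.get('alt'))   # '' here; would be None if the attribute were absent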
Additional notes:
BeautifulSoup object.select/select_one(CSS selector) – searches the whole page for tags matched by the CSS selector
tag object.select/select_one(CSS selector) – searches only inside the given tag for tags matched by the CSS selector
ps = soup.select('p')
print(ps) # [<p>我是段落1</p>, <p>我是段落2</p>, <p id="p1">我是超链接3</p>]
div1 = soup.select_one('div')
ps = div1.select('p')
print(ps) # [<p>我是段落2</p>, <p id="p1">我是超链接3</p>]
Douban movies (豆瓣电影)
import requests
from bs4 import BeautifulSoup
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36'
}
response = requests.get('https://movie.douban.com/top250', headers=headers)
soup = BeautifulSoup(response.text, 'lxml')
all_movie_li = soup.select('#content>div>div.article>ol>li')
# print(all_movie_li[0])
for li in all_movie_li:
    img_url = li.select_one('.pic>a>img').attrs['src']
    print(img_url)
    name = li.select_one('.title').get_text()
    print(name)
    des = li.select_one('.inq').get_text()
    print(des)
    score = li.select_one('.rating_num').get_text()
    print(score)
    print('-----------------------------------')
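Not every entry has an .inq tagline, in which case select_one returns None and get_text() fails; a hedged sketch of a safer loop body:
for li in all_movie_li:
    name = li.select_one('.title').get_text()
    inq = li.select_one('.inq')
    des = inq.get_text() if inq else ''   # some movies have no tagline
    score = li.select_one('.rating_num').get_text()
    print(name, score, des)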