day2 - requests and bs4





How to use requests


1. Sending requests

requests.get(url, *, headers, params, proxies) - send a GET request
requests.post(url, *, headers, params, proxies) - send a POST request

Parameters:
url - the request address (a website URL, an API endpoint, an image address, etc.)
headers - set the request headers (used to set the cookie and the User-Agent)
params - set the request parameters
proxies - set a proxy (see the sketch below)
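
A minimal sketch showing headers, params, and proxies used together; the proxy address below is a made-up placeholder, not a real proxy server:

import requests

headers = {'User-Agent': 'Mozilla/5.0'}
params = {'key': 'c9d408fefd8ed4081a9079d0d6165d43', 'num': 10}
# placeholder proxy address for illustration only
proxies = {'http': 'http://127.0.0.1:8888', 'https': 'http://127.0.0.1:8888'}

response = requests.get('http://api.tianapi.com/auto/index',
                        headers=headers, params=params, proxies=proxies)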

To send a GET request, the parameters can be spliced directly into the URL:

requests.get('http://api.tianapi.com/auto/index?key=c9d408fefd8ed4081a9079d0d6165d43&num=10')

To send a POST request, the parameters are set via params:

params = {
    'key': 'c9d408fefd8ed4081a9079d0d6165d43',
    'num': 10
}
requests.post('http://api.tianapi.com/auto/index', params=params)
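
With requests.post, the params argument still goes into the URL query string; when an API expects the parameters in the request body, data= (form-encoded) or json= are the usual alternatives. A hedged sketch with the same parameters (whether this particular endpoint accepts a request body is an assumption):

params = {
    'key': 'c9d408fefd8ed4081a9079d0d6165d43',
    'num': 10
}
# form-encoded body (Content-Type: application/x-www-form-urlencoded)
requests.post('http://api.tianapi.com/auto/index', data=params)
# JSON body (Content-Type: application/json)
requests.post('http://api.tianapi.com/auto/index', json=params)
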
# get a response object for the examples in the next section
response = requests.get('http://www.yingjiesheng.com/')


2. Getting response information

Set the encoding (only needed when the text comes back garbled)

response.encoding = 'GBK'

Get the response headers

print(response.headers)

Get the response body

a. Get the text value (used when requesting a web page; gives the page's HTML source directly)
print(response.text)

b. Get the parsed JSON result (used for API endpoints that return JSON data)
print(response.json())

c. Get the content value (the raw binary data, used for downloading images, video, and audio)
print(response.content)
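
Other commonly used attributes on the response object are status_code and apparent_encoding; a short sketch tying the pieces above together:

import requests

response = requests.get('http://www.yingjiesheng.com/')
print(response.status_code)                     # HTTP status code, e.g. 200
response.encoding = response.apparent_encoding  # let requests guess the encoding from the body
print(response.text[:200])                      # first 200 characters of the page source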



Adding request headers


1. Adding a User-Agent

headers = {
    'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36'
}

response = requests.get('https://www.51job.com/', headers=headers)

response.encoding = 'GBK'

print(response.text)


2. Adding a cookie

headers = {
    'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36',
    'cookie':'_zap=579a8112-dbb8-48ea-91a9-568e904cf704; d_c0="AGDeh6I3LhOPTsi4YaLIfH4eWjH8fhbIRok=|1622299106"; _xsrf=cTHZr7llfvi8sEvOtQHGkIGUT2kfuaAU; __snaker__id=kiWhp6LWaQ80YwpC; _9755xjdesxxd_=32; YD00517437729195%3AWM_NI=TVZkK3JtaIXIXSn7lo0aCfQz8hmCWvVGU4Wj3Aecnpj640FqYrCh8lZBVfYBFLbCoWF5IOKBtm7SvKkZbsDkP%2B23fWn39853EuVyGZt%2FqtJlofn57VALxAJhQx94RUa9M2M%3D; YD00517437729195%3AWM_NIKE=9ca17ae2e6ffcda170e2e6eeb1d53af6e8a4dae450b6868bb7c44f968f9aafaa72a5e884d7ea7a83b888a3ee2af0fea7c3b92a988a968cca398aacafaadb439c8da2d8ef65f4bf8d87d472fcb8a3a4d67e96999bd5d662f19fab84ed7caaeafcd1c57b9bae8d89bb689a9e9a87dc4a8694aea4ee5d86eb87b0ca4ab8b3f9b4f541a986ae94c76194f5fed4e55daeedf7d9b83987ab9a9ac652b8f1bf95c968adbaa2d6b65e8394b8a2c96493a8f7d5d15fa3e783d3d037e2a3; YD00517437729195%3AWM_TID=M%2FAfNnIHd%2FhBABFBEBM%2Bzh8Prx%2BAkCV4; l_n_c=1; r_cap_id="NjY5NDNkZWIzY2ZlNDk0MGFkZWNiZjNmYTBjMjQ2NDQ=|1628664254|7d352e74e1eaaeba363546fff8ecc619cbad0025"; cap_id="YjdhMWNkZDM4M2M0NDdjYzg5MTcyZWQ5YzFiMjRjNTQ=|1628664254|7f024e77a0a5a4d74ae67f9e2d5f748ff7313d41"; l_cap_id="MzNmZWEzNjVkNjM0NDAyOTgzZGRjNjc3ZWU3YWQwMTc=|1628664254|e17e11ac337e33bf7a3b0de4cb3a89339470e48a"; n_c=1; atoken=285973C9FDCB4F51B823847A2E8B351B; atoken_expired_in=7776000; capsion_ticket="2|1:0|10:1628664631|14:capsion_ticket|44:ZWQwN2FhZTU1NjJjNDZiZTkwN2ViYjQ2NTNmZTc0MmU=|790f1dd0edf3248fa8f024ad2a72ad770d33aea4c797c7f4088543e19b2483f9"; auth_type=cXFjb25u|1628664645|0dc12f570f4e1b258686560feea5f591726f99e3; token="Mjg1OTczQzlGRENCNEY1MUI4MjM4NDdBMkU4QjM1MUI=|1628664645|0a27196a7650239f8e5d69361dbbb1a5755d690b"; client_id="NDdFRkMzMzJEQkY1N0RDOTU0OEQ3MEQ3ODBEODYyQ0M=|1628664645|d51e3551769c63454054d83178eacaa7631e8936"; Hm_lvt_98beee57fd2ef70ccdd5ca52b9740c49=1626697003,1626699827,1628664201,1628664688; SESSIONID=F0f1Ozjah3hpYsLm2cDYK8yEpfvlHpvGOkhdRPtH4c4; JOID=UFERBEhVLaZUYMJ9bVC3f7LA8Fl4GBDDYS6zGg88FcMnPbA6DSJQsjBqxHNqYDNNsqzLI5JWwcJyA-NlYeq4eVc=; osd=UlwRCk1XIKZaZcBwbV6yfb_A_lx6FRDNZCy-GgE5F84nM7U4ACJetzJnxH1vYj5NvKnJLpJYxMB_A-1gY-e4d1I=; Hm_lpvt_98beee57fd2ef70ccdd5ca52b9740c49=1628664745; captcha_session_v2="2|1:0|10:1628664745|18:captcha_session_v2|88:QVVOVE84Q0NzV2loUzJBd3UzREVDejJIa2k2c2tJaWU0UVRMU1FpMi9DakNqdVJEMDhFYXhudjJiTmdQbmUxTg==|576f28717ce8bf496750ab8287d49598db22086195912c76bbfa158495c750e1"; KLBRSID=cdfcc1d45d024a211bb7144f66bda2cf|1628664745|1628664200; gdxidpyhxdE=yG%2B3QjZU6Yhoiwtc0rPmfuEl4EuBMQbEpV%2BBiL1QedDhZBIUCjgJm0YBH6yWop0r2g7984gXHjCbpls2NDtb%5CbEryhX8s0BYZxdKE3UCIyGwsw77njE2ja0GzfhkgANNgXO9dK1dOQssqQO2sAngOT0EW7bu7bTBn25RDRiG%2FlAb2Sts%3A1628665942462'
}

response = requests.get('https://www.zhihu.com', headers=headers)

print(response.text)
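
As an alternative to pasting the whole cookie string into headers, requests can also take cookies through the cookies parameter or a requests.Session object; a minimal sketch, where the cookie name and value are made-up placeholders:

import requests

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36'
}
# placeholder cookie pair for illustration only
cookies = {'session_id': 'xxxx'}

session = requests.Session()
session.headers.update(headers)
response = session.get('https://www.zhihu.com', cookies=cookies)
print(response.status_code)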



JSON parsing

import requests

# find Toutiao's hot-board JSON data API, then send a request to it
response = requests.get('https://www.toutiao.com/hot-event/hot-board/?origin=toutiao_pc&_signature=_02B4Z6wo00d01X.g2AgAAIDDl0iJmFbkIVl.xNyAAD7ve5rc90eYpUagYiMEKQrfIz8iJPKuacCxb32tQcqbwZpt0i3u2X-hae-fgV3NqtDiEbEJK7EPc235gzTPL4EhVZ7cxFeHkLUI27pv29')

all_news = response.json()['data']

for news in all_news:
    print(news['Title'])
    print(news['Image']['url'])
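
The key names used above ('data', 'Title', 'Image') depend on the current shape of Toutiao's response; a slightly more defensive version using dict.get avoids a KeyError if a field is missing:

all_news = response.json().get('data') or []
for news in all_news:
    title = news.get('Title')
    image = (news.get('Image') or {}).get('url')
    if title:
        print(title, image)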



Image download

import requests

def download_image(img_url):
    # request the image data over the network
    response = requests.get(img_url)

    # save the data to a local file
    data = response.content

    with open(f'files/{img_url.split("/")[-1]}', 'wb') as f:
        f.write(data)


if __name__ == '__main__':
    download_image('https://p5.toutiaoimg.com/img/pgc-image/9f5d102756354b6db8fa9408c57d01c8~cs_noop.png')
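
The function above assumes a files/ directory already exists; a variant that creates it first and skips failed downloads (os.makedirs with exist_ok=True is from the standard library):

import os
import requests

def download_image(img_url):
    response = requests.get(img_url)
    # skip the file if the request failed
    if response.status_code != 200:
        return
    # make sure the target folder exists
    os.makedirs('files', exist_ok=True)
    with open(f'files/{img_url.split("/")[-1]}', 'wb') as f:
        f.write(response.content)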



Downloading images from 58pic (千图网)

import requests
from re import findall


def download_image(img_url):
    # request the image data over the network
    response = requests.get(img_url)

    # save the data to a local file
    data = response.content

    with open(f'files/{img_url.split("/")[-1].split("!")[0]}', 'wb') as fp:
        fp.write(data)


if __name__ == '__main__':
    response = requests.get('https://www.58pic.com/tupian/qixi-0-0.html')
    data = response.text
    result = findall(r'(?s)<img src="(\S+?)">', data)
    for x in result:
        download_image(f'https:{x}')
        print('Download complete')



Using bs4

from bs4 import BeautifulSoup


1. Prepare the HTML data to parse (in practice this is fetched with requests or selenium)

data = open('test2.html', encoding='utf-8').read()


2. Create a BeautifulSoup object (it can automatically fix broken HTML structure in the data)

# BeautifulSoup(data, parser)
soup = BeautifulSoup(data, 'lxml')
# print(soup)
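
'lxml' is a third-party parser and has to be installed separately (pip install lxml); if it is not available, the parser that ships with the standard library can be used instead:

# fallback parser from the standard library (slower, but needs no extra install)
soup = BeautifulSoup(data, 'html.parser')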


3. Getting tags and tag content from the BeautifulSoup object


1) Getting tags

<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>Title</title>
</head>
<body>
    <p>我是段落1</p>
    <a href="">我是超链接1</a>
    <div>
         <a href="https://www.baid.com">我是超链接2</a>
        <p>我是段落2</p>
            <span>
                <p id="p1">我是超链接3</p>
            </span>
    </div>
    <img src="http://www.cesi.jpg" alt="">

    <span id="s1">我是<b>span1</b></span>
</body>
</html>

BeautifulSoup object.select(CSS selector) - gets every tag matched by the CSS selector; returns a list whose elements are the matched tag objects

BeautifulSoup object.select_one(CSS selector) - gets the first tag matched by the CSS selector; returns a single tag object

result = soup.select('p')
print(result)  # [<p>我是段落1</p>, <p>我是段落2</p>, <p id="p1">我是超链接3</p>]

result = soup.select_one('p')
print(result)  # <p>我是段落1</p>

result = soup.select('#p1')
print(result)  # [<p id="p1">我是超链接3</p>]

result = soup.select_one('#p1')
print(result)  # <p id="p1">我是超链接3</p>

result = soup.select('div p')
print(result)  # [<p>我是段落2</p>, <p id="p1">我是超链接3</p>]

result = soup.select('div>p')
print(result)  # [<p>我是段落2</p>]
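
select also understands other common CSS selector syntax, such as attribute selectors; two more examples against the same HTML:

result = soup.select('a[href]')  # all <a> tags that have an href attribute
print(result)

result = soup.select('span>p')   # <p> tags that are direct children of a <span>
print(result)  # [<p id="p1">我是超链接3</p>]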


2) Getting tag content

a. tag object.string - gets the text inside the tag (only valid when the tag's content is pure text; otherwise the result is None)

p2 = soup.select_one('div>p')
# print(p2)  # <p>我是段落2</p>

# print(p2.string)  # 我是段落2

s1 = soup.select_one('#s1')
# print(s1)  # <span id="s1">我是<b>span1</b></span>
# print(s1.string)  # None

b. tag object.get_text() - gets all of the text inside the tag's content

print(p2.get_text())  # '我是段落2'
print(s1.get_text())  # '我是span1'

c. tag object.contents - gets the tag's direct children as a list (text fragments and nested tags)

print(p2.contents)  # ['我是段落2']
# print(s1.contents)  # ['我是', <b>span1</b>]

result = s1.contents
print(result)  # ['我是', <b>span1</b>]
print(result[-1].get_text())  # span1


3) Getting tag attributes

a1 = soup.select_one('div>a')
print(a1)  # <a href="https://www.baid.com">我是超链接2</a>
print(a1.attrs['href'])  # https://www.baid.com

img1 = soup.select_one('img')
print(img1)  # <img alt="" src="http://www.cesi.jpg"/>
print(img1.attrs['src'])  # http://www.cesi.jpg

Additional notes:

BeautifulSoup object.select/select_one(CSS selector) - searches the whole page for tags matched by the CSS selector

tag object.select/select_one(CSS selector) - searches only inside that tag for tags matched by the CSS selector

ps = soup.select('p')
print(ps)  # [<p>我是段落1</p>, <p>我是段落2</p>, <p id="p1">我是超链接3</p>]

div1 = soup.select_one('div')
ps = div1.select('p')
print(ps)  # [<p>我是段落2</p>, <p id="p1">我是超链接3</p>]



Douban movies

import requests
from bs4 import BeautifulSoup

headers = {
    'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36'
}

response = requests.get('https://movie.douban.com/top250', headers=headers)


soup = BeautifulSoup(response.text, 'lxml')
all_movie_li = soup.select('#content>div>div.article>ol>li')
# print(all_movie_li[0])

for li in all_movie_li:
    img_url = li.select_one('.pic>a>img').attrs['src']
    print(img_url)

    name = li.select_one('.title').get_text()
    print(name)

    des = li.select_one('.inq').get_text()
    print(des)

    score = li.select_one('.rating_num').get_text()
    print(score)
    print('-----------------------------------')
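
The Top 250 list spans multiple pages; the offset appears to be passed through a start query parameter (0, 25, 50, ...), which is an assumption here. A sketch of a paginated crawl under that assumption; it also guards against entries that have no .inq quote, which can happen on later pages:

for start in range(0, 250, 25):
    response = requests.get('https://movie.douban.com/top250',
                            headers=headers, params={'start': start})
    soup = BeautifulSoup(response.text, 'lxml')
    for li in soup.select('#content>div>div.article>ol>li'):
        name = li.select_one('.title').get_text()
        score = li.select_one('.rating_num').get_text()
        inq = li.select_one('.inq')  # some entries have no one-line quote
        des = inq.get_text() if inq else ''
        print(name, score, des)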



Copyright notice: this is an original article by YongQiangA, released under the CC 4.0 BY-SA license. Please include a link to the original source and this notice when reposting.