# __author: han-zhang
# date: 2019/8/18 15:59
import requests
import time
import re
import json
from bs4 import BeautifulSoup

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
                  ' (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36'
}
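

# The requests.get calls below use no timeout and no retry logic, so one
# stalled connection can hang the whole crawl. A minimal hardening sketch
# (my addition, not part of the original script; the helper name `fetch`
# is hypothetical) that could stand in for the bare requests.get calls:
def fetch(url, retries=3, timeout=10):
    # Try the GET a few times, sleeping between attempts; re-raise the
    # last network error so the caller still sees the failure.
    for attempt in range(retries):
        try:
            return requests.get(url=url, headers=headers, timeout=timeout)
        except requests.RequestException:
            if attempt == retries - 1:
                raise
            time.sleep(1)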


def parse_first(url):
    # Parse the first-level page and collect every list link that starts
    # with a digit or a letter.
    r = requests.get(url=url, headers=headers)
    # Build the soup object
    soup = BeautifulSoup(r.text, 'lxml')
    # <a> elements for routes starting with a digit
    number_list = soup.select('.bus_kt_r1 > a')
    # <a> elements for routes starting with a letter
    char_list = soup.select('.bus_kt_r2 > a')
    # Merge the two lists
    all_a_list = number_list + char_list
    all_href_list = []
    # Extract every link, prepending the scheme and host
    for oa in all_a_list:
        href_url = url.rstrip('/') + oa['href']
        all_href_list.append(href_url)
    return all_href_list
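
# For reference (my annotation): parse_first returns absolute URLs, since
# each relative href from the index page gets the scheme and host prepended;
# e.g. a hypothetical href '/list1' on 'https://beijing.8684.cn/' becomes
# 'https://beijing.8684.cn/list1' (example values, not real output).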


# Collect the stop names of both directions, given the up-direction stop count
def num(up_total, soup):
    if int(up_total) < 25:
        # Fewer than 25 stops fit in a single .bus_site_layer block
        up_total_name = soup.select('.bus_site_layer')[0]
        up_total_list = up_total_name.select('div > a')
        up_name_list = []
        # Collect every stop name on the up direction
        for oa in up_total_list:
            up_name_list.append(oa.string)
        try:
            # Total number of stops on the down direction
            down_total = soup.select('.bus_line_top > span')[1].string.strip('共站').strip()
            # Stop names on the down direction
            down_total_name = soup.select('.bus_site_layer')[1]
            down_total_list = down_total_name.select('div > a')
            down_name_list = []
            for ob in down_total_list:
                down_name_list.append(ob.string)
        except Exception:
            down_total = '无下行线路'  # i.e. no down-direction data
            down_name_list = []
    else:
        # 25 or more stops: the up direction spans several blocks
        n = int(up_total) // 25 + 1
        up_total_name_list = soup.select('.bus_site_layer')[0:n]
        up_name_list = []
        # Walk every up-direction block and collect the stop names
        for up_total_name in up_total_name_list:
            up_total_list = up_total_name.select('div > a')
            for oa in up_total_list:
                up_name_list.append(oa.string)
        try:
            # Total number of stops on the down direction
            down_total = soup.select('.bus_line_top > span')[1].string.strip('共站').strip()
            # Stop names on the down direction: every block after the first n
            down_name_list = []
            down_total_name_list = soup.select('.bus_site_layer')[n:]
            for down_total_name in down_total_name_list:
                down_total_list = down_total_name.select('div > a')
                for ob in down_total_list:
                    down_name_list.append(ob.string)
        except Exception:
            down_total = '无下行线路'  # i.e. no down-direction data
            down_name_list = []
    return up_name_list, down_total, down_name_list
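
# Worked example of the paging arithmetic above (my annotation): with
# up_total == '30', n = 30 // 25 + 1 == 2, so .bus_site_layer blocks [0:2]
# are read as the up direction and blocks [2:] as the down direction.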


def parse_second_page(all_href_list, url, fp):
    # Visit each second-level page in turn
    for href in all_href_list:
        r = requests.get(url=href, headers=headers)
        soup = BeautifulSoup(r.text, 'lxml')
        # The div that holds the links to the detailed route pages
        odiv = soup.find('div', id='con_site_1')
        # Find every <a> inside that div
        oa_list = odiv.find_all('a')
        oa_href_list = []
        # Extract each href, prepending the scheme and host
        for oa in oa_list:
            route_href = url.rstrip('/') + oa['href']
            oa_href_list.append(route_href)
        # Request each third-level page
        for route_href in oa_href_list:
            r = requests.get(url=route_href, headers=headers)
            soup = BeautifulSoup(r.text, 'lxml')
            # Route name
            route_name = soup.select('.bus_i_t1 > h1')[0].string.strip()
            print('开始爬取%s公交信息' % route_name)
            # Running hours
            run_time = soup.select('.bus_i_content > .bus_i_t4')[0].string.lstrip('运行时间:')
            # Fare information
            price_info = soup.select('.bus_i_content > .bus_i_t4')[1].string.lstrip('票价信息:')
            # Operating company
            bus_company = soup.select('.bus_i_content > .bus_i_t4 > a')[0].string
            # Total number of stops on the up direction
            # (regex alternative kept from the original:
            # up_total = re.compile(r'<span class="bus_line_no">.*?(\d+).*</span>')
            # ret = up_total.findall(r.text))
            up_total = soup.select('.bus_line_top > span')[0].string.strip('共站').strip()
            # Stop names of both directions
            up_name_list, down_total, down_name_list = num(up_total, soup)
            # Collect everything into a dict
            item = {
                '获取线路名称': route_name,
                '运行时间': run_time,
                '票价信息': price_info,
                '公交公司': bus_company,
                '上线总站数': up_total,
                '上行总站牌': up_name_list,
                '下行总站数': down_total,
                '下行总站牌': down_name_list
            }
            string = json.dumps(item, ensure_ascii=False)
            fp.write(string + '\n')
            time.sleep(2)
            print('结束爬取%s公交信息' % route_name)
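
# Note (my annotation): each fp.write above emits one standalone JSON object
# per line (JSON Lines style), so the output file can be parsed line by line
# without loading it whole.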


def main():
    city = input('请输入你要爬取的城市汉语拼音:')  # pinyin of the target city
    # The trailing slash is stripped again before relative paths are appended
    url = 'https://{}.8684.cn/'.format(city)
    print(url)
    # Use a context manager so the output file is always closed
    with open(city + '.txt', 'w', encoding='utf8') as fp:
        # Parse the first-level page
        all_href_list = parse_first(url)
        # Parse the second- and third-level pages
        parse_second_page(all_href_list, url, fp)
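

# A minimal sketch (my addition; `load_routes` is a hypothetical helper, not
# part of the original script) showing how the JSON-lines file written by
# parse_second_page can be read back into Python objects:
def load_routes(path):
    # Parse one JSON object per line into a list of dicts
    routes = []
    with open(path, encoding='utf8') as f:
        for line in f:
            routes.append(json.loads(line))
    return routes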


if __name__ == '__main__':
    main()