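# Scrapes course data with XPath and stores it in MongoDB; interested readers could try turning it into a course-grabbing script. Captcha recognition uses the Yundama (云打码) platform, although typing the captcha in by hand would work too.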
# -*- coding: UTF-8 -*-
# Informal module metadata (author and creation timestamp); nothing in the
# visible script reads these names.
_author_ = 'zy'
_date_ = '2019/2/1 0001 17:50'
import requests
import re
import sys
# Simulated login, step 1: fetch the captcha image and save it to disk.
# (The bare string below is the original section label, "simulated login".)
'''模拟登录'''
CaptchaUrl = "http://210.42.121.241//servlet/GenImg"
PostUrl = "http://210.42.121.241/servlet/Login"

# A single requests session is reused for every request in this script, so any
# cookies the server sets while serving the captcha are sent back automatically
# with the login POST and the later listing requests.
s = requests.session()
c_header = {
    'Host': '210.42.121.241',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36',
}
image = s.get(CaptchaUrl, headers=c_header)

# Save the captcha locally. The context manager guarantees the file handle is
# closed even if the write raises (the original open()/close() pair leaked it
# in that case).
with open("test.jpg", 'wb') as f:
    f.write(image.content)
import ydm
# Simulated login, step 2: solve the captcha and submit the login form.

# Hand the saved captcha image to the ydm helper module and keep whatever it
# returns as the captcha answer.
SecretCode = ydm.use_ydm('test.jpg')
# Alternative call kept from the original, passing the Yundama settings explicitly:
# SecretCode = ydm_func(YUNDAMA_APP_ID,YUNDAMA_APP_KEY,YUNDAMA_USER,YUNDAMA_PASS,"test.jpg",1004)
print(SecretCode)

# Form fields posted to the login servlet. 'id' and 'pwd' are empty strings in
# this script (presumably the account id and password go here - confirm);
# 'xdvfb' carries the captcha answer.
postData = dict(id='', pwd='', xdvfb=SecretCode)

con = s.post(PostUrl, data=postData)
l_cookie = con.cookies
# Original note: the saved captcha image can also be opened and typed in by hand.
print(l_cookie)
# Request headers reused for every course-listing request below.
headers = {
    'Host': '210.42.121.241',
    'Origin': 'http://210.42.121.241',
    # 'Cookie': l_cookie,  # left disabled: the session object sends its own cookies
    'Referer': 'http://210.42.121.241/guest/guest_chooseLsn_parent.jsp',
    # Fixed: the value used to begin with a stray "User-Agent:" prefix that had
    # been pasted in together with the header name.
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.59 Safari/537.36',
}

# First hit the listing page without a query string and show the cookies the
# response carries.
con_url = 'http://210.42.121.241/guest/guest_choose_PubLsn_list.jsp'
contents = s.get(con_url, headers=headers)

# Listing URL template; the query string carries the XiaoQu, credit and keyword
# filters plus the page number, e.g.
# http://210.42.121.241/guest/guest_choose_PubLsn_list.jsp?XiaoQu=0&credit=0&keyword=&pageNum=4
st_url = 'http://210.42.121.241/guest/guest_choose_PubLsn_list.jsp?XiaoQu=0&credit=0&keyword=&pageNum={page}'
print(contents.cookies)

# Fixed: the template was previously requested verbatim, i.e. with the literal
# text "{page}" as the page number. Fill the placeholder and fetch page 1, which
# is the page later inspected for pager links.
contents = s.get(st_url.format(page=1), headers=headers)
from lxml import etree
import pymysql,pymongo,random
# Name of the MongoDB collection the scraped course documents are inserted into.
dbname = 'WHU教务系统'

# Local MongoDB instance, database "course".
# Original TODO: a cleaning step that checks whether each scraped attribute
# actually has a value is still missing.
client = pymongo.MongoClient(host='127.0.0.1', port=27017)
db = client['course']

# Dump the listing page fetched earlier to test.html, then parse that file.
with open("test.html", mode='wb') as f:
    f.write(contents.content)
html = etree.parse('test.html', parser=etree.HTMLParser())

# Count the pager links on the page. The count is only printed: the number of
# pages to crawl is hard-coded to 24 right after, overriding it.
result = html.xpath('//a[@class="navegate"]')
print(len(result))
pages = 24
print(html)
# Crawl every listing page, extract one document per course row and insert it
# into MongoDB.
start_url = 'http://210.42.121.241/guest/guest_choose_PubLsn_list.jsp?XiaoQu=0&credit=0&keyword=&pageNum='

# XPath prefix selecting the row-th <tr> of the course table; each field's
# relative path from FIELD_XPATHS is appended to it.
ROW_XPATH = '//table[@class="table listTable"]/tr[{row}]/'

# (document key, XPath relative to the row), in the order the keys are stored.
# This table replaces eleven copy-pasted XPath assignments; the unused
# "td[3]/text()" expression from the original has been dropped.
FIELD_XPATHS = (
    ('课程名', 'td[1]/text()'),                                # course name
    ('学分', 'td[2]/text()'),                                  # credits
    ('剩余人数', 'td[3]/font/text()'),                          # remaining seats (read from the <font> child)
    ('教师名', 'td[4]/text()'),                                # teacher name
    ('职称', 'td[5]/text()'),                                  # teacher title
    ('授课学院', 'td[6]/text()'),                               # teaching school
    ('教材', 'td[7]/text()'),                                  # textbook
    ('学年', 'td[8]/text()'),                                  # academic year
    ('学期', 'td[9]/text()'),                                  # semester
    ('上课时间地点', 'td[10]/div[@class="overflow"]/text()'),    # class time and place
    ('授课类型', 'td[@id="tips"]/div/text()'),                  # course type
)

for page in range(1, pages + 1):
    st_url = start_url + str(page)
    print(st_url)
    file_name = 'test' + str(page) + '.html'
    print(file_name)

    # Each page is saved as test<page>.html and then parsed from that file.
    body = s.get(st_url, headers=headers)
    with open(file_name, 'wb') as f:
        f.write(body.content)
    html = etree.parse(file_name, etree.HTMLParser())

    li = html.xpath('//table[@class="table listTable"]/tr')
    rows = len(li) - 1
    # Rows 2..rows are scraped; row 1 is skipped (presumably the header row).
    # NOTE(review): because rows is len(li) - 1, the last <tr> of the table is
    # never visited. If that row is a course rather than a footer, it is being
    # dropped - confirm against the real page before changing the bounds.
    for row in range(2, rows + 1):
        row_prefix = ROW_XPATH.format(row=row)
        data = {}
        for field, rel_path in FIELD_XPATHS:
            values = html.xpath(row_prefix + rel_path)
            # Collapse the XPath result: a single match is stored as the bare
            # string, no match (or a lone empty string) as None, and several
            # matches stay a list.
            if len(values) == 1:
                values = values[0]
            data[field] = values if len(values) else None
        print(data)
        db[dbname].insert_one(data)
        print('插入一条')
# def deal_with_list(dbname):
# cursor = db[dbname].find()
# for data in cursor:
# for key in data:
# try:
#
#
# data = {
# "text": comment
# }
#
# mongoid = data['_id']
# myquery = {"_id": mongoid}
# newvalues = {"$set": data}
#
# db[dbname].update_one(myquery, newvalues)
# 版权声明:本文为qq_34069180原创文章,遵循 CC 4.0 BY-SA 版权协议,转载请附上原文出处链接和本声明。