武汉大学教务系统爬虫

利用 XPath 进行数据抓取，并存入 MongoDB 数据库，有心的同学可以尝试做抢课脚本哦！验证码识别采用云打码平台，当然也可以手动输入。
# -*- coding: UTF-8 -*-
_author_ = 'zy'
_date_ = '2019/2/1 0001 17:50'
import requests
import re
import sys

# --- Simulated login, step 1: download the captcha image ---
# CaptchaUrl serves the captcha image; PostUrl is the login form target.
# NOTE(review): CaptchaUrl contains a double slash before "servlet"; it is
# kept as-is because the server's handling of the single-slash form is not
# verifiable from this script.
CaptchaUrl = "http://210.42.121.241//servlet/GenImg"
PostUrl = "http://210.42.121.241/servlet/Login"

# One session object is used for every request in this script, so the
# captcha request and the later login request share the same session.
s = requests.session()
c_header = {
    'Host': '210.42.121.241',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36',
}
image = s.get(CaptchaUrl, headers=c_header)

# Save the captcha to disk. The context manager guarantees the file is
# closed even if the write fails.
with open("test.jpg", 'wb') as f:
    f.write(image.content)

import ydm

# Pass the saved captcha image to the ydm helper; its return value is sent
# as the captcha answer in the login form below.
SecretCode = ydm.use_ydm('test.jpg')
# Alternative call left by the original author (not executed):
# SecretCode = ydm_func(YUNDAMA_APP_ID,YUNDAMA_APP_KEY,YUNDAMA_USER,YUNDAMA_PASS,"test.jpg",1004)
# Manual alternative: open the saved captcha image and type the code in.
print(SecretCode)

# Login form fields. 'id' and 'pwd' are empty strings in this script
# (presumably placeholders for real credentials -- confirm before running).
postData = dict(id='', pwd='', xdvfb=SecretCode)

# Submit the login form through the shared session and show the cookies
# attached to the response.
con = s.post(PostUrl, data=postData)
l_cookie = con.cookies
print(l_cookie)

# Headers sent with every course-listing request.
headers = {
    'Host': '210.42.121.241',
    'Origin': 'http://210.42.121.241',
    # 'Cookie':l_cookie,
    # (left disabled; the requests go through the session object `s`)
    'Referer': 'http://210.42.121.241/guest/guest_chooseLsn_parent.jsp',
    # Fixed: the value previously began with a duplicated "User-Agent:"
    # prefix, which was sent as part of the header value itself.
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.59 Safari/537.36',
}

# First hit the listing page without a query string and show the cookies
# on that response.
con_url = 'http://210.42.121.241/guest/guest_choose_PubLsn_list.jsp'
contents = s.get(con_url, headers=headers)

# Listing URL template; the filter values are carried in the query string
# and {page} is the page number to request, e.g.
# http://210.42.121.241/guest/guest_choose_PubLsn_list.jsp?XiaoQu=0&credit=0&keyword=&pageNum=4
st_url = 'http://210.42.121.241/guest/guest_choose_PubLsn_list.jsp?XiaoQu=0&credit=0&keyword=&pageNum={page}'
print(contents.cookies)

# Fixed: the template used to be requested verbatim, so the server received
# the literal text "pageNum={page}". Request the first page explicitly.
contents = s.get(st_url.format(page=1), headers=headers)

from lxml import etree
import pymysql,pymongo,random
# Collection name used for the scraped course records.
dbname = 'WHU教务系统'
# Local MongoDB instance; records go into the "course" database.
# TODO (from the original author): a cleaning step is still missing that
# checks whether each attribute actually has a value.
client = pymongo.MongoClient(host='127.0.0.1', port=27017)
db = client['course']

# Keep a copy of the fetched listing page on disk, then parse that copy.
with open("test.html", 'wb') as snapshot:
    snapshot.write(contents.content)

html = etree.parse('test.html', etree.HTMLParser())

# Count the <a class="navegate"> links found on the page and print the count.
result = html.xpath('//a[@class="navegate"]')
detected = len(result)
print(detected)

# NOTE(review): the detected count is only printed; the number of pages to
# crawl is fixed at 24 regardless of what the page reports.
pages = 24
print(html)
# Base URL of the course listing; the page number is appended to it.
start_url = 'http://210.42.121.241/guest/guest_choose_PubLsn_list.jsp?XiaoQu=0&credit=0&keyword=&pageNum='

# XPath selecting one row of the course table ({row} is 1-based).
ROW_XPATH = '//table[@class="table listTable"]/tr[{row}]/'

# (record key, XPath relative to the row) for every stored field, in the
# order the keys are written into each record.
# NOTE(review): the remaining-seats field reads only text inside a <font>
# element of the third cell; a cell without <font> yields None.
FIELD_XPATHS = (
    ('课程名', 'td[1]/text()'),
    ('学分', 'td[2]/text()'),
    ('剩余人数', 'td[3]/font/text()'),
    ('教师名', 'td[4]/text()'),
    ('职称', 'td[5]/text()'),
    ('授课学院', 'td[6]/text()'),
    ('教材', 'td[7]/text()'),
    ('学年', 'td[8]/text()'),
    ('学期', 'td[9]/text()'),
    ('上课时间地点', 'td[10]/div[@class="overflow"]/text()'),
    ('授课类型', 'td[@id="tips"]/div/text()'),
)

for page in range(1, pages + 1):
    st_url = start_url + str(page)
    print(st_url)

    file_name = 'test' + str(page) + '.html'
    print(file_name)

    # Download the page, keep a copy on disk, and parse that copy.
    body = s.get(st_url, headers=headers)
    with open(file_name, 'wb') as f:
        f.write(body.content)

    html = etree.parse(file_name, etree.HTMLParser())
    li = html.xpath('//table[@class="table listTable"]/tr')

    # Row 1 is skipped (treated as the header row). The upper bound matches
    # the original loop, which stops at row len(li) - 1.
    # NOTE(review): the last matched <tr> is therefore never read -- confirm
    # whether it is a footer row or a course row that is being dropped.
    for row in range(2, len(li)):
        row_prefix = ROW_XPATH.format(row=row)
        data = {}
        for field, cell_path in FIELD_XPATHS:
            values = html.xpath(row_prefix + cell_path)
            # Unwrap single matches, store None for no match, and keep the
            # full list when a cell yields several text nodes.
            if len(values) == 1:
                data[field] = values[0]
            elif not values:
                data[field] = None
            else:
                data[field] = values
        print(data)
        db[dbname].insert_one(data)
        print('插入一条')

# def deal_with_list(dbname):
#     cursor = db[dbname].find()
#     for data in cursor:
#         for key in data:
#             try:
#
#
#         data = {
#             "text": comment
#         }
#
#         mongoid = data['_id']
#         myquery = {"_id": mongoid}
#         newvalues = {"$set": data}
#
#         db[dbname].update_one(myquery, newvalues)
原文链接：https://blog.csdn.net/qq_34069180/article/details/86757056
你可能也喜欢