# -*- coding: utf-8 -*-
"""
Created on Fri Apr 27 15:12:18 2018
#python 3.6
"""
import datetime
import logging

from pymongo import MongoClient
from requests_html import HTMLSession
def createDB():
    """Connect to the local MongoDB server and return the scraper collections.

    The ``job`` database on ``localhost:27017`` is used. One collection is
    returned per scraped website.

    Returns:
        A ``(ustc, hfut, ahu)`` tuple of collections, in that order.
    """
    job_db = MongoClient('localhost', 27017).job
    return job_db.ustc, job_db.hfut, job_db.ahu
def store(collection, _id, Theme, HoldDate, VenuesName, Description):
    """Insert or overwrite one recruitment-talk record in ``collection``.

    The document is keyed by ``_id``, so storing the same talk again replaces
    the earlier document instead of creating a duplicate. Storage is
    best-effort: a failure is logged and swallowed so that one bad record
    does not abort the rest of a scraping run.

    Args:
        collection: Target collection; must provide ``replace_one``.
        _id: Identifier of the talk, used as the document ``_id``.
        Theme: Title of the talk.
        HoldDate: When the talk is held, as produced by the calling scraper.
        VenuesName: Where the talk is held.
        Description: Detail text of the talk.
    """
    data = {
        "_id": _id,
        "Theme": Theme,
        "HoldDate": HoldDate,
        "VenuesName": VenuesName,
        "Description": Description,
    }
    try:
        # Collection.save() is deprecated in PyMongo 3 and removed in
        # PyMongo 4; an upserting replace_one is its documented equivalent
        # for documents that carry an explicit _id.
        collection.replace_one({"_id": _id}, data, upsert=True)
    except Exception:
        # Keep the scrape going, but leave a trace instead of failing silently.
        logging.getLogger(__name__).exception(
            "Failed to store record %r", _id)
def isFuture(HoldDate, isFuture=False):
    """Tell whether a talk is held on a calendar day after today.

    Only the date parts are compared, so a talk scheduled for later today
    is not reported as being in the future.

    Args:
        HoldDate: ``datetime.datetime`` at which the talk is held.
        isFuture: Value returned when the talk is not on a later day.

    Returns:
        ``True`` if ``HoldDate`` falls on a later day than today, otherwise
        the ``isFuture`` argument unchanged.
    """
    today = datetime.date.today()
    if HoldDate.date() > today:
        return True
    return isFuture
def get_USTC_Info(col_ustc):
    """Scrape the USTC recruitment-talk list and store the unexpired talks.

    Requests the list API once (page 1, page size 20, as fixed in the URL)
    and, for every entry not marked as expired, fetches its detail page and
    passes the record to ``store``.

    Args:
        col_ustc: Collection that receives the records.
    """
    list_url = 'http://www.job.ustc.edu.cn/API/Web/Recruit.ashx?rand=0.08298740764954782&pagesize=20&pageindex=1&action=list&keyword='
    detail_url = 'http://www.job.ustc.edu.cn/API/Web/Recruit.ashx?action=info&rid='

    # A single session serves every request and is closed on exit, instead
    # of opening a fresh, never-closed session per request.
    with HTMLSession() as session:

        def getDescription(_id):
            """Return the text of the second <p> of the talk's detail page."""
            response_detailInfo = session.get(detail_url + str(_id))
            # The last two characters are dropped (presumably trailing
            # whitespace -- TODO confirm against a live page).
            return response_detailInfo.html.find('p')[1].full_text[:-2]

        for data in session.get(list_url).json()['data']:
            # The three characters before the final seven are compared with
            # the "expired" marker; presumably StatusName ends with a 7-char
            # closing tag such as </span> -- TODO confirm.
            if data['StatusName'][-10:-7] != '已过期':
                # First 10 and last 11 characters of HoldDateTxt are kept
                # (presumably the date and the time range -- TODO confirm).
                hold_date = data['HoldDateTxt'][:10] + ' ' + data['HoldDateTxt'][-11:]
                store(col_ustc, data['ID'], data['Theme'], hold_date,
                      data['VenuesName'], getDescription(data['ID']))
def get_HFUT_Info(collection_hfut):
    """Scrape the HFUT recruitment-talk table and store the upcoming talks.

    Every ``<tr>`` of the first ``<tbody>`` on the list page is read. Rows
    whose date is not in the future (see ``isFuture``) are skipped before
    any detail page is requested, so past talks cost no extra request.

    Args:
        collection_hfut: Collection that receives the records.

    Raises:
        IndexError: If the list page contains no ``<tbody>``.
        ValueError: If a row's date does not match ``%Y/%m/%d %H:%M``.
    """
    url = 'http://gdjy.hfut.edu.cn/products/list/1.html?list=a'

    # A single session serves every request and is closed on exit, instead
    # of opening a fresh, never-closed session per request.
    with HTMLSession() as session:

        def getDescription(href):
            """Return the full text of the detail page's div.panel-body."""
            response_detailInfo = session.get(href)
            return response_detailInfo.html.find(
                'div.panel-body', first=True).full_text

        tbody = session.get(url).html.find('tbody')[0]
        # Query the rows once and iterate them directly rather than
        # re-running the selector for every field of every row.
        for row in tbody.find('tr'):
            cells = row.find('td')
            HoldDate = datetime.datetime.strptime(
                str(cells[1].text)[:16], '%Y/%m/%d %H:%M')
            if not isFuture(HoldDate):
                continue
            _id = row.attrs['data-key']
            Theme = cells[0].text
            VenuesName = cells[2].text
            # Rows are expected to carry exactly one link (the detail page).
            # Read it from the set itself instead of slicing the set's repr;
            # sorting keeps the choice deterministic if there are several.
            links = sorted(row.absolute_links)
            Description = getDescription(links[0]) if links else ''
            store(collection_hfut, _id, Theme, HoldDate, VenuesName,
                  Description)
def get_AHU_Info(col_ahu):
    """Scrape the AHU recruitment-talk table and store every listed talk.

    Unlike the other scrapers, no date filtering is applied: each row after
    the first one of the first ``table.portlet-table`` is stored.

    Args:
        col_ahu: Collection that receives the records.
    """
    url = 'http://www.job.ahu.edu.cn/detach.portal?.p=Znxjb20ud2lzY29tLnBvcnRhbC5jb250YWluZXIuY29yZS5pbXBsLlBvcnRsZXRFbnRpdHlXaW5kb3d8cGUxOHx2aWV3fG5vcm1hbHxhY3Rpb249cXVlcnlBbGxacGhNYW5hZ2VWaWV3'
    overview_url = 'http://www.job.ahu.edu.cn/detach.portal?.pen=pe18&.pmn=view&action=oneView&zphbh='

    # A single session serves every request and is closed on exit, instead
    # of opening three fresh, never-closed sessions per table row.
    with HTMLSession() as session:

        def getDescription(_id):
            """Follow the talk's overview page to its bulletin and return the text."""
            # The overview page is slow to open.
            overview = session.get(overview_url + _id)
            # The second link inside table.w-zph-title points to the bulletin.
            url_detail = overview.html.find(
                'table.w-zph-title', first=True).find('a')[1].attrs['href']
            response_detailInfo = session.get(url_detail)
            return response_detailInfo.html.find(
                'div.bulletin-content', first=True).full_text

        table = session.get(url).html.find('table.portlet-table')[0]
        # The first row is skipped (presumably the header -- TODO confirm).
        for row in table.find('tr')[1:]:
            anchors = row.find('a')
            cells = row.find('td')
            # The id is cut out of the third link's onclick attribute by
            # fixed offsets (18 leading / 11 trailing characters).
            _id = anchors[2].attrs['onclick'][18:-11]
            Theme = anchors[0].text
            HoldDate = cells[2].text + ' ' + cells[3].text
            VenuesName = cells[1].text
            store(col_ahu, _id, Theme, HoldDate, VenuesName,
                  getDescription(_id))
if __name__ == '__main__':
    # createDB() returns the collections in (ustc, hfut, ahu) order.
    ustc_col, hfut_col, ahu_col = createDB()
    # Run the scrapers one after another: HFUT first, then USTC, then AHU.
    jobs = (
        (get_HFUT_Info, hfut_col),
        (get_USTC_Info, ustc_col),
        (get_AHU_Info, ahu_col),
    )
    for scrape, target in jobs:
        scrape(target)
1. 爬取合工大、安大、中科大就业信息网的宣讲会信息。合工大使用 datetime 模块比较日期,中科大根据接口返回的状态字段,判断宣讲会是否已经举办过;如果已经举办过,就不写入数据库。安大的数据未做该判断,全部写入。
2. requests_html 中,HTMLSession().get() 返回的响应对象的 html 属性(以及其中的元素对象)提供 find() 方法,可以直接根据选择器获取信息:当 first 为 True 时,返回查找到的第一个元素对象;first 为 False 时,返回查找到的所有匹配元素组成的列表;默认为 False。
3. div.panel-body 是一个 CSS 选择器:其中 div 是标签名,panel-body 是该 div 标签的 class。这种写法可以直接定位到 <div class="panel-body"> 这个标签并获取标签的信息。注意 "." 后面只能是相应标签的 class,不能是别的。
detailInfo = response_detailInfo.html.find('div.panel-body',first = True)
版权声明:本文为Lin9977原创文章,遵循 CC 4.0 BY-SA 版权协议,转载请附上原文出处链接和本声明。