python爬取合工大、安大、中科大就业信息网宣讲会信息——requests_html&mongoDB

  • Post author:
  • Post category:python


# -*- coding: utf-8 -*-
"""
Created on Fri Apr 27 15:12:18 2018
#python 3.6
"""

import datetime
import logging

from pymongo import MongoClient
from requests_html import HTMLSession


def createDB():
    """Open the local MongoDB ``job`` database and hand back its collections.

    Connects to ``localhost:27017`` and returns a 3-tuple
    ``(ustc, hfut, ahu)`` -- one collection per scraped site, in that order.
    """
    job_db = MongoClient('localhost', 27017)['job']
    return job_db['ustc'], job_db['hfut'], job_db['ahu']

def store(collection, _id, Theme, HoldDate, VenuesName, Description):
    """Insert or overwrite one career-talk record in ``collection``.

    The record is keyed by ``_id``: an existing document with the same
    ``_id`` is replaced, otherwise a new one is inserted (upsert).

    Args:
        collection: Target MongoDB collection (anything exposing
            ``replace_one(filter, doc, upsert=...)``).
        _id: Site-specific identifier of the talk, used as the document key.
        Theme: Title of the talk.
        HoldDate: When the talk is held (string or datetime, as scraped).
        VenuesName: Where the talk is held.
        Description: Free-text details of the talk.

    Returns:
        None. Storage is best-effort: a failure is logged with its traceback
        and does not propagate, so one bad record cannot abort a whole crawl.
    """
    data = {"_id":_id,"Theme":Theme,"HoldDate":HoldDate,"VenuesName":VenuesName,"Description":Description}
    try:
        # Collection.save() was deprecated in PyMongo 3 and removed in 4;
        # replace_one with upsert=True is its documented equivalent for a
        # document that carries an explicit _id.
        collection.replace_one({"_id": _id}, data, upsert=True)
    except Exception:
        # Keep the original best-effort contract, but leave a trace instead
        # of silently discarding the error.
        logging.getLogger(__name__).exception("Failed to store record %r", _id)

def isFuture(HoldDate,isFuture = False):
    """Tell whether a talk scheduled at ``HoldDate`` is still to come.

    The full timestamp is compared against the current time, so a talk that
    takes place later today still counts as upcoming (comparing calendar
    dates only would discard it as already held).

    Args:
        HoldDate: ``datetime.datetime`` at which the talk starts; may be
            naive (local time) or timezone-aware.
        isFuture: Fallback returned unchanged when the talk is not in the
            future. Defaults to ``False``; kept for backward compatibility.

    Returns:
        ``True`` if ``HoldDate`` is later than now, otherwise the value of
        the ``isFuture`` argument.
    """
    # Passing HoldDate's tzinfo yields a naive "now" for naive input and an
    # aware "now" for aware input, so the comparison never mixes the two.
    now = datetime.datetime.now(tz=HoldDate.tzinfo)
    if HoldDate > now:
        return True
    return isFuture

def get_USTC_Info(col_ustc):
    """Fetch the USTC career-talk list and store the talks not marked expired.

    Requests the first page (20 entries) of the site's JSON list API. For
    every entry whose status is not the "expired" marker, the detail endpoint
    is queried for a description and the record is written to ``col_ustc``
    through ``store``.

    Args:
        col_ustc: MongoDB collection receiving the USTC records.
    """
    list_url = 'http://www.job.ustc.edu.cn/API/Web/Recruit.ashx?rand=0.08298740764954782&pagesize=20&pageindex=1&action=list&keyword='
    records = HTMLSession().get(list_url).json()['data']

    def fetch_description(rid):
        # The description is taken from the second <p> of the detail
        # response, with its last two characters dropped.
        detail_url = 'http://www.job.ustc.edu.cn/API/Web/Recruit.ashx?action=info&rid='+str(rid)
        paragraphs = HTMLSession().get(detail_url).html.find('p')
        return paragraphs[1].full_text[:-2]

    for record in records:
        # The status marker is read from a fixed slice near the end of
        # StatusName (presumably just before a 7-character closing tag --
        # TODO confirm against the live API).
        if record['StatusName'][-10:-7] == '已过期':
            continue
        rid = record['ID']
        theme = record['Theme']
        when = record['HoldDateTxt']
        # First 10 characters plus the last 11 characters of the date text.
        hold_date = when[:10] + ' ' + when[-11:]
        venue = record['VenuesName']
        store(col_ustc, rid, theme, hold_date, venue, fetch_description(rid))

def get_HFUT_Info(collection_hfut):
    """Scrape the HFUT career-talk list and store the talks still to come.

    Each ``<tr>`` of the first ``<tbody>`` on the list page is one talk. The
    start time is parsed from the second cell; only talks for which
    ``isFuture`` is true get their detail page downloaded and are written to
    ``collection_hfut`` through ``store``.

    Args:
        collection_hfut: MongoDB collection receiving the HFUT records.
    """
    url = 'http://gdjy.hfut.edu.cn/products/list/1.html?list=a'
    response_hfut = HTMLSession().get(url)
    tbody = response_hfut.html.find('tbody')[0]

    def getDescription(href):
        # The talk's details are the full text of the panel body on its page.
        response_detailInfo = HTMLSession().get(href)
        return response_detailInfo.html.find('div.panel-body',first = True).full_text

    # Query the rows once instead of re-running find('tr') for every field.
    for row in tbody.find('tr'):
        cells = row.find('td')
        # Only the leading 'YYYY/MM/DD HH:MM' part of the cell is parsed.
        HoldDate = datetime.datetime.strptime(str(cells[1].text)[:16],'%Y/%m/%d %H:%M')
        if not isFuture(HoldDate):
            # Skip before touching the network: past talks are never stored,
            # so downloading their detail page is wasted work.
            continue
        _id = row.attrs['data-key']
        Theme = cells[0].text
        VenuesName = cells[2].text
        # absolute_links is a set; take its link directly rather than slicing
        # the set's string representation.
        href = next(iter(row.absolute_links))
        store(collection_hfut,_id,Theme,HoldDate,VenuesName,getDescription(href))
        
def get_AHU_Info(col_ahu):
    """Scrape the AHU career-talk table and store every listed talk.

    All rows of the first ``table.portlet-table`` except the first one are
    processed; no date filtering is applied. For each row the description is
    fetched through two further requests and the record is written to
    ``col_ahu`` through ``store``.

    Args:
        col_ahu: MongoDB collection receiving the AHU records.
    """
    list_url = 'http://www.job.ahu.edu.cn/detach.portal?.p=Znxjb20ud2lzY29tLnBvcnRhbC5jb250YWluZXIuY29yZS5pbXBsLlBvcnRsZXRFbnRpdHlXaW5kb3d8cGUxOHx2aWV3fG5vcm1hbHxhY3Rpb249cXVlcnlBbGxacGhNYW5hZ2VWaWV3'
    listing = HTMLSession().get(list_url)
    rows = listing.html.find('table.portlet-table')[0].find('tr')[1:]

    def fetch_description(zphbh):
        # Step 1: the per-talk summary page (this URL is slow to open).
        summary_url = 'http://www.job.ahu.edu.cn/detach.portal?.pen=pe18&.pmn=view&action=oneView&zphbh='+zphbh
        summary_page = HTMLSession().get(summary_url)
        title_table = summary_page.html.find('table.w-zph-title',first = True)
        # Step 2: follow the second link of the title table to the bulletin.
        bulletin_url = title_table.find('a')[1].attrs['href']
        bulletin_page = HTMLSession().get(bulletin_url)
        return bulletin_page.html.find('div.bulletin-content',first = True).full_text

    for row in rows:
        anchors = row.find('a')
        # The id is cut out of the third link's onclick attribute by fixed
        # offsets (18 leading and 11 trailing characters are dropped).
        talk_id = anchors[2].attrs['onclick'][18:-11]
        theme = anchors[0].text
        cells = row.find('td')
        hold_date = cells[2].text + ' ' + cells[3].text
        venue = cells[1].text
        store(col_ahu, talk_id, theme, hold_date, venue, fetch_description(talk_id))
       
if __name__ == '__main__':
    # Script entry point: open the three collections, then crawl the sites
    # one after another (HFUT first, then USTC, then AHU).
    ustc_col, hfut_col, ahu_col = createDB()
    get_HFUT_Info(hfut_col)
    get_USTC_Info(ustc_col)
    get_AHU_Info(ahu_col)
    

1.    本脚本爬取合工大、安大、中科大就业信息网的宣讲会信息并存入MongoDB。合工大的数据使用datetime模块判断宣讲会是否已经举办过,中科大的数据根据接口返回的状态字段判断是否“已过期”;已经举办过(已过期)的宣讲会不写入数据库。安大的数据未做过滤,全部写入。

2.    requests_html 中,HTMLSession().get() 返回的响应对象的 html 属性(以及其中的每个元素)都有 find() 方法,可以直接根据 CSS 选择器查找标签。当 first=True 时,返回查找到的第一个元素对象;当 first=False 时,返回所有匹配元素组成的列表;默认为 False。

3.    div.panel-body 是一个 CSS 选择器:其中 div 是标签名,panel-body 是该 div 标签的 class,这种写法可以直接定位到 <div class="panel-body"> 这个标签并获取标签的信息。注意 “.” 后面跟的必须是标签的 class 名;如果要按 id 定位,应改用 “#”(例如 div#main)。

detailInfo = response_detailInfo.html.find('div.panel-body',first = True)



版权声明:本文为Lin9977原创文章,遵循 CC 4.0 BY-SA 版权协议,转载请附上原文出处链接和本声明。