百度地图API爬取网吧

  • Post author:
  • Post category:其他
# coding=utf-8
import urllib
import json
import sys
import random

# Python 2 only: force the interpreter-wide default encoding to UTF-8 so that
# implicit str/unicode conversions of Chinese text do not fail.  The
# reload() is needed because sys.setdefaultencoding is removed from the sys
# namespace at start-up.  Neither name exists on Python 3 (where text is
# already Unicode), so the hack is skipped there instead of crashing.
if sys.version_info[0] < 3:
    reload(sys)  # noqa: F821 -- builtin on Python 2
    sys.setdefaultencoding("utf-8")

"""
Assignment 02 requirements:
    Find the internet cafes within 500 m (or another radius) of the middle
    (or primary) schools in Wuhan.  Deliverables: the .py source file and the
    result file (txt).  The txt file is formatted as follows:
    1,XXX primary school
    1-1,XXX internet cafe
    1-2,XXX internet cafe
    2,XXX primary school
    2-1,XXX internet cafe
Terms used in this module:
    url: web address from which a query result is fetched
    json_file: the parsed JSON of a query result
    bounds: latitude/longitude range of the query area; this class defaults
            to the bounding rectangle of Wuhan
Date:
    2019.3.23
"""


class SearchPOI:
    """Find schools in a region and the internet cafes around each school.

    All lookups go through the place-search endpoint stored in
    ``self.ipaddress``.  Schools are found with rectangular ``bounds``
    queries; a rectangle that reports ``_SPLIT_THRESHOLD`` or more results is
    split into four quadrants which are then queried individually.  Cafes are
    found with a ``location`` + ``radius`` query centred on each school.
    """

    # Results requested per page; keep in sync with the ``page_size`` value
    # embedded in ``self.page_para``.
    _PAGE_SIZE = 20
    # A region whose reported total reaches this value is split in four.
    # NOTE(review): presumably the API's per-query result cap -- confirm.
    _SPLIT_THRESHOLD = 400
    # Characters left untouched when percent-encoding a request URL.
    _URL_SAFE = "%/:=&?~#+!$,;'@()*[]|"

    def __init__(self, ak_pool=None):
        """Set up the query parameters and seed the region work list.

        :param ak_pool: optional iterable of developer-key fragments.  Each
            entry is spliced verbatim between ``output=json&`` and
            ``page_size=...`` when a URL is built, so it presumably has the
            form ``'ak=<key>&'`` -- confirm against your keys.  When omitted,
            the placeholder below is used and must be replaced by hand.
        """
        self.ipaddress = 'http://api.map.baidu.com/place/v2/search?query='
        # Keyword used for the school search.
        self.address = '中学'
        # Query area as [lat_min, lng_min, lat_max, lng_max]; per the original
        # author this is the latitude/longitude range of Wuhan.
        bounds = [29.966667, 113.683333, 31.366667, 115.083333]
        self.radius = '&radius=500'
        # Keyword used for the search around each school.
        self.filter = '网吧'
        # Regions still to be visited; split_region() appends the quadrants
        # of any region that holds too many results.
        self.total_bounds = [bounds]
        # Ask for JSON output.
        self.output = "output=" + "json"
        # Pool of developer keys; one is drawn at random per request so that
        # a single throttled key does not block the crawl.
        if ak_pool is None:
            # PLACEHOLDER left by the original author ("replace with your own
            # ak"): substitute real key strings here or pass ``ak_pool``.
            ak_pool = [换成自己的ak]
        self.ak_pool = list(ak_pool)
        self.page_para = "page_size=20&page_num="
        # coord_type=1; the original author's note reads "return data in
        # WGS84".  NOTE(review): confirm against the API documentation
        # whether this parameter describes the request coordinates or the
        # response coordinates.
        self.coord = "&coord_type=1"
        # The initial bounds joined into the string form used in URLs.
        self.str_bounds = ','.join([str(_) for _ in bounds])
        # URL for the initial region (note: it ends at ``page_num=`` with no
        # page number or coord_type; kept exactly as originally built).
        self.url = self.ipaddress + self.address + '&' + "bounds=" + self.str_bounds + '&' + self.output + '&' \
                   + random.choice(self.ak_pool) + self.page_para

    @staticmethod
    def get_json_file(url):
        """Fetch ``url`` and return the decoded JSON payload.

        :param url: full request URL.
        :return: the parsed JSON object, or ``None`` when the payload has no
            ``"total"`` field -- which the original author treats as the sign
            that the key in use is currently throttled, so that the caller
            can retry with another key.
        """
        try:
            from urllib.request import urlopen  # Python 3
        except ImportError:
            from urllib import urlopen  # Python 2
        else:
            from urllib.parse import quote
            # Python 3's urlopen rejects non-ASCII characters in a URL, and
            # the query keyword is Chinese: percent-encode it as UTF-8.
            url = quote(url, safe=SearchPOI._URL_SAFE)
        response = urlopen(url)
        try:
            raw = response.read()
        finally:
            response.close()
        json_file = json.loads(raw.decode("utf-8"))
        try:
            int(json_file["total"])
        except KeyError:
            return None
        return json_file

    @staticmethod
    def read_json_file(json_file):
        """Flatten a result page into ``[name, location, name, location, ...]``.

        :param json_file: parsed payload containing a ``"results"`` list whose
            items have ``"name"`` and ``"location"`` keys.
        :return: flat list alternating each result's name and location.
        """
        flattened = []
        for poi in json_file["results"]:
            flattened.append(poi["name"])
            flattened.append(poi["location"])
        return flattened

    @staticmethod
    def total_judge(json_file):
        """Return the ``"total"`` field of a payload as an ``int``."""
        return int(json_file["total"])

    @staticmethod
    def _page_count(total):
        """Return how many pages of ``_PAGE_SIZE`` items hold ``total`` results."""
        return (total + SearchPOI._PAGE_SIZE - 1) // SearchPOI._PAGE_SIZE

    def _fetch(self, build_url, *args):
        """Build a URL with ``build_url(*args)`` and fetch it until it succeeds.

        The URL is rebuilt on every attempt so that a new key is drawn from
        the pool.  NOTE: as in the original code the retry is unbounded and
        has no delay; if every key is rejected this never returns.
        """
        while True:
            json_file = self.get_json_file(build_url(*args))
            if json_file is not None:
                return json_file

    def create_url(self, temp_region, pagenumber):
        """Build the school-search URL for one rectangular region.

        :param temp_region: sequence of region values, joined with commas
            into the ``bounds`` parameter.
        :param pagenumber: zero-based result page to request.
        :return: the request URL, using a randomly drawn key.
        """
        sstr_bounds = ','.join([str(_) for _ in temp_region])
        return (self.ipaddress + self.address + '&' + "bounds=" + sstr_bounds
                + '&' + self.output + '&' + random.choice(self.ak_pool)
                + self.page_para + str(pagenumber) + self.coord)

    def create_buffer_url(self, lat, lon, pagenumber):
        """Build the URL that searches ``self.filter`` around one point.

        :param lat: latitude of the centre point.
        :param lon: longitude of the centre point.
        :param pagenumber: zero-based result page to request.
        :return: the request URL, using a randomly drawn key.
        """
        return (self.ipaddress + self.filter + '&' + "location=" + str(lat)
                + ',' + str(lon) + self.radius + '&' + self.output + '&'
                + random.choice(self.ak_pool)
                + self.page_para + str(pagenumber) + self.coord)

    def split_region(self, latmin, lonmin, latmax, lonmax):
        """Split a rectangle into four quadrants and queue them.

        The quadrants are appended to ``self.total_bounds`` and also returned
        (upper-left, lower-left, upper-right, lower-right) so that a caller
        working on its own region list can pick them up.

        :return: list of the four ``[lat_min, lng_min, lat_max, lng_max]``
            quadrants.
        """
        # Divide by 2.0 so that integer bounds are not truncated on Python 2.
        latmid = (latmax + latmin) / 2.0
        lonmid = (lonmin + lonmax) / 2.0
        quadrants = [
            [latmid, lonmin, latmax, lonmid],  # upper-left
            [latmin, lonmin, latmid, lonmid],  # lower-left
            [latmid, lonmid, latmax, lonmax],  # upper-right
            [latmin, lonmid, latmid, lonmax],  # lower-right
        ]
        self.total_bounds.extend(quadrants)
        return quadrants

    def get_school(self, totalbounds):
        """Collect every school found in the given regions.

        Regions reporting ``_SPLIT_THRESHOLD`` or more results are split and
        their quadrants are visited in turn; such a region gets a fifth
        element appended as a "already split" marker and is skipped if seen
        again.

        :param totalbounds: list of regions to visit.  It is not iterated
            directly: quadrants produced by a split are tracked in a local
            work list, so they are visited whether or not the caller passed
            ``self.total_bounds`` itself.
        :return: flat list ``[name, location, name, location, ...]``.
        """
        school_result = []
        pending = list(totalbounds)
        position = 0
        while position < len(pending):
            bounds = pending[position]
            position += 1
            # Five elements means the region was already split into quadrants.
            if len(bounds) == 5:
                continue
            first_page = self._fetch(self.create_url, bounds, 0)
            region_total = self.total_judge(first_page)
            if region_total >= self._SPLIT_THRESHOLD:
                pending.extend(
                    self.split_region(bounds[0], bounds[1], bounds[2], bounds[3]))
                bounds.append(1)
                continue
            for page in range(self._page_count(region_total)):
                # Page 0 was already downloaded to learn the total.
                if page == 0:
                    page_json = first_page
                else:
                    page_json = self._fetch(self.create_url, bounds, page)
                school_result.extend(self.read_json_file(page_json))
        return school_result

    def get_wangba(self, school_result):
        """Collect the internet cafes around every school.

        :param school_result: flat ``[name, location, ...]`` list as returned
            by :meth:`get_school`; each location needs ``'lat'`` and
            ``'lng'`` keys.
        :return: one list per school that has at least one cafe nearby, of
            the form ``[school_name, cafe_name, cafe_location, ...]`` with
            the cafes of all result pages gathered in that single list.
        """
        wangba_result = []
        school_names = school_result[0::2]
        school_locations = school_result[1::2]
        for school_no, (school_name, location) in enumerate(
                zip(school_names, school_locations), 1):
            # Search around the school's own coordinates.
            lat = str(location['lat'])
            lon = str(location['lng'])
            first_page = self._fetch(self.create_buffer_url, lat, lon, 0)
            cafe_total = self.total_judge(first_page)
            print('正在遍历第' + str(school_no) + '所学校')
            # Nothing nearby: move on to the next school.
            if cafe_total == 0:
                continue
            per_sch_wb = [school_name]
            for page in range(self._page_count(cafe_total)):
                # Page 0 was already downloaded to learn the total.
                if page == 0:
                    page_json = first_page
                else:
                    page_json = self._fetch(
                        self.create_buffer_url, lat, lon, page)
                per_sch_wb.extend(self.read_json_file(page_json))
            wangba_result.append(per_sch_wb)
        return wangba_result

    @staticmethod
    def write2txt(wangba_result):
        """Write the results to ``02.txt`` (UTF-8) in the current directory.

        Each school produces a blank line, then ``"<n>, <school>"``, followed
        by one ``"<n>-<k>, <cafe>"`` line per cafe.

        :param wangba_result: list of ``[school_name, cafe_name,
            cafe_location, ...]`` lists as returned by :meth:`get_wangba`.
        """
        # Local import: io.open accepts an encoding on Python 2 and 3 alike.
        import io

        def as_text(value):
            # Decode explicitly instead of relying on the default encoding.
            if isinstance(value, bytes):
                return value.decode("utf-8")
            return value

        lines = []
        for school_no, entry in enumerate(wangba_result, 1):
            lines.append(u'\n%d, %s\n' % (school_no, as_text(entry[0])))
            # Odd positions hold cafe names; even positions their locations.
            for cafe_no, cafe_name in enumerate(entry[1::2], 1):
                lines.append(
                    u'%d-%d, %s\n' % (school_no, cafe_no, as_text(cafe_name)))
        # The with-statement closes the file; no explicit close() is needed.
        with io.open(u'02.txt', 'w', encoding='utf-8') as f:
            f.write(u''.join(lines))


# Script entry point
if __name__ == "__main__":
    # Create the searcher.  The name ``searchPoi`` stays at module level
    # because code elsewhere in this file may refer to it.
    searchPoi = SearchPOI()
    # Collect the schools of the configured area.
    school_result = searchPoi.get_school(searchPoi.total_bounds)
    # school_result alternates name/location, hence the halving.  The
    # parenthesised single-argument print and ``//`` behave the same on
    # Python 2 and also parse and work on Python 3.
    print("该区域学校总数为" + str((len(school_result) + 1) // 2))
    # Collect the internet cafes around every school.
    wangba_result = searchPoi.get_wangba(school_result)
    # Write the results to the txt file.
    searchPoi.write2txt(wangba_result)
    print("txt文件已写入到当前目录下!")


版权声明:本文为wangxujin666原创文章,遵循 CC 4.0 BY-SA 版权协议,转载请附上原文出处链接和本声明。