python爬取音乐_python爬虫爬取qq音乐巅峰榜热歌歌词,jieba中文分词,词云展示…

  • Post author:
  • Post category:python


4721803fd1f8b3084a462415f5ba852d.png

1、获取列表页信息,URL 为:https://c.y.qq.com/v8/fcg-bin/fcg_v8_toplist_cp.fcg?tpl=3&page=detail&date=2019_02&topid=26&type=top&song_begin=0&song_num=30&g_tk=5381&loginUin=0&hostUin=0&format=json&inCharset=utf8&outCharset=utf-8&notice=0&platform=yqq.json&needNewCode=0

9f48b4ad8ca8fb0a68379a721af5a558.png

json样式为:

e5a2562fcf165aad9812bf9533a9fffd.png

2、获取详情页

# HTTP headers sent with the lyric request.
#
# Only real header fields belong here.  The HTTP/2 pseudo-headers that browser
# dev tools display (:authority, :method, :path, :scheme) are produced by the
# HTTP client itself and must not be passed as ordinary headers.
headers = {
    "accept": "application/json, text/javascript, */*; q=0.01",
    # "br" is deliberately not advertised: requests decodes gzip/deflate on its
    # own, but Brotli only when an extra package (brotli/brotlicffi) is present.
    "accept-encoding": "gzip, deflate",
    "accept-language": "zh-CN,zh;q=0.9",
    # NOTE(review): session cookie captured from a browser; it is probably
    # stale -- confirm whether the endpoint needs it at all.
    "cookie": "pgv_pvi=5936793600; pt2gguin=o1952436511; RK=g+4hNa7BQD; ptcz=653047c5b0174eb6b929c242110d08693b9dfcbaa701ddbf37ccc23c3366b94c; pgv_pvid=9049425500; ts_uid=9851761599; o_cookie=1952436511; tvfe_boss_uuid=5e81ff5fb8d5a1ea; yqq_stat=0; pgv_info=ssid=s484511232; ts_refer=ADTAGbaiduald; pgv_si=s21197824; yq_index=0; player_exist=1; qqmusic_fromtag=66; yplayer_open=0; ts_last=y.qq.com/n/yqq/song/002krvKI4Jgvq9.html",
    "origin": "https://y.qq.com",
    "referer": "https://y.qq.com/n/yqq/song/002krvKI4Jgvq9.html",
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36"
}

# Query-string parameters for the lyric endpoint.  "musicid" is the song id
# read from the current chart entry (`item`, one element of the chart's
# "songlist"); the remaining values are sent unchanged for every song.
jsond = {
    "nobase64": "1",
    "musicid": item['data']['songid'],
    "-": "jsonp1",
    "g_tk": "5381",
    "loginUin": "0",
    "hostUin": "0",
    "format": "json",
    "inCharset": "utf8",
    "outCharset": "utf-8",
    "notice": "0",
    "platform": "yqq.json",
    "needNewCode": "0"
}
# requests never times out by default; the explicit timeout (seconds) keeps a
# stalled connection from hanging the script forever.
r = requests.get(
    "https://c.y.qq.com/lyric/fcgi-bin/fcg_query_lyric_yqq.fcg",
    params=jsond,
    headers=headers,
    timeout=10,
)

1373ceef1e6c3ff6351f487410ba1094.png

json样式为:

ca47ac74516a5861ed14be7bcb62ae7f.png

3、将歌词存到文件 test.txt 里,供后续读取。

4、逐行读取文件、构建要处理的数据字符串

5、jieba库、词云制作。

爬虫代码:

# -*-coding:UTF-8 -*-

import json
import re
import requests

# HTTP headers sent with every lyric request.
#
# Only real header fields belong here.  The HTTP/2 pseudo-headers that browser
# dev tools display (:authority, :method, :path, :scheme) are produced by the
# HTTP client itself and must not be passed as ordinary headers.
headers = {
    "accept": "application/json, text/javascript, */*; q=0.01",
    # "br" is deliberately not advertised: requests decodes gzip/deflate on its
    # own, but Brotli only when an extra package (brotli/brotlicffi) is present.
    "accept-encoding": "gzip, deflate",
    "accept-language": "zh-CN,zh;q=0.9",
    # NOTE(review): session cookie captured from a browser; it is probably
    # stale -- confirm whether the endpoint needs it at all.
    "cookie": "pgv_pvi=5936793600; pt2gguin=o1952436511; RK=g+4hNa7BQD; ptcz=653047c5b0174eb6b929c242110d08693b9dfcbaa701ddbf37ccc23c3366b94c; pgv_pvid=9049425500; ts_uid=9851761599; o_cookie=1952436511; tvfe_boss_uuid=5e81ff5fb8d5a1ea; yqq_stat=0; pgv_info=ssid=s484511232; ts_refer=ADTAGbaiduald; pgv_si=s21197824; yq_index=0; player_exist=1; qqmusic_fromtag=66; yplayer_open=0; ts_last=y.qq.com/n/yqq/song/002krvKI4Jgvq9.html",
    "origin": "https://y.qq.com",
    "referer": "https://y.qq.com/n/yqq/song/002krvKI4Jgvq9.html",
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36"
}

# Query-string parameters sent to the chart (top list) endpoint.
jsonlist = {
    "tpl": "3",
    "page": "detail",
    "date": "2019_02",
    "topid": "26",
    "type": "top",
    "song_begin": "0",
    "song_num": "100",
    "g_tk": "5381",
    "loginUin": "0",
    "hostUin": "0",
    "format": "json",
    "inCharset": "utf8",
    "outCharset": "utf-8",
    "notice": "0",
    "platform": "yqq.json",
    "needNewCode": "0"
}

LIST_URL = "https://c.y.qq.com/v8/fcg-bin/fcg_v8_toplist_cp.fcg"
LYRIC_URL = "https://c.y.qq.com/lyric/fcgi-bin/fcg_query_lyric_yqq.fcg"
# requests never times out by default; bound every call (seconds).
REQUEST_TIMEOUT = 10

# Colon characters that mark a token as a "label:value" line.
# NOTE(review): ASCII colons seem to arrive HTML-entity-encoded (&#58;) in the
# lyric payload, so credit lines presumably use the full-width colon; both
# forms are accepted -- confirm against a live response.
_COLONS = (":", "\uff1a")
# Runs of CJK ideographs (U+4E00..U+9FA5) and colons.  Compiled once, outside
# the per-song loop.
_CH_PAT = re.compile(r'[\u4e00-\u9fa5:\uff1a]+')


def _colon_index(word):
    """Return the index of the first colon (either form) in *word*, or -1."""
    hits = [pos for pos in (word.find(mark) for mark in _COLONS) if pos >= 0]
    return min(hits) if hits else -1


def extract_lyric_text(raw):
    """Return the Chinese lyric words found in *raw*, each followed by ','.

    *raw* is the text of a lyric response.  The heuristic works on the runs
    of Chinese characters/colons found in it:

    1. Within the first half of the tokens (index 0 excluded), find the first
       token that has a colon after at least one other character --
       presumably the first credit line (lyricist, composer, ...).
    2. From there, still within the first half, find the first position where
       three consecutive tokens contain no colon; the lyric body is taken to
       start at that position.
    3. Emit every colon-free token from that position on.

    An empty string is returned when nothing matches.
    """
    words = _CH_PAT.findall(raw)
    half = len(words) // 2

    first = 0
    for i in range(1, half):
        if _colon_index(words[i]) > 0:
            first = i
            break

    flag = first
    # The look-ahead reads words[i + 2]; cap the range so very short token
    # lists cannot raise IndexError.
    for i in range(first, min(half, len(words) - 2)):
        if all(_colon_index(w) < 0 for w in words[i:i + 3]):
            flag = i
            break

    return "".join(w + "," for w in words[flag:] if _colon_index(w) < 0)


def main():
    """Fetch the chart, then append one line of lyric words per song to test.txt."""
    r1 = requests.get(LIST_URL, params=jsonlist, timeout=REQUEST_TIMEOUT)
    r1.raise_for_status()
    jlist = json.loads(r1.text)

    # The platform-default encoding is kept on purpose: the word-cloud script
    # opens test.txt the same way, and the two must agree.
    with open('test.txt', 'a+') as f:
        for item in jlist['songlist']:
            jsond = {
                "nobase64": "1",
                "musicid": item['data']['songid'],
                "-": "jsonp1",
                "g_tk": "5381",
                "loginUin": "0",
                "hostUin": "0",
                "format": "json",
                "inCharset": "utf8",
                "outCharset": "utf-8",
                "notice": "0",
                "platform": "yqq.json",
                "needNewCode": "0"
            }
            r = requests.get(LYRIC_URL, params=jsond, headers=headers,
                             timeout=REQUEST_TIMEOUT)
            r.encoding = "utf-8"
            strquqita = extract_lyric_text(r.text)
            # One song per line; the reader iterates the file line by line.
            f.write(strquqita + "\n")
            print(strquqita)


if __name__ == "__main__":
    main()

词云代码:

#-*-coding:UTF-8 -*-
import jieba
from wordcloud import WordCloud
# A font containing CJK glyphs is needed for the Chinese words to render.
# Raw string so the backslashes of the Windows path are kept literally.
FONT_PATH = r'C:\Windows\Fonts\simkai.ttf'


def read_lyrics(path='test.txt'):
    """Return the text of *path* with '。' appended after every line.

    The first line of the file is skipped.  Each remaining line keeps its
    trailing newline, so the result looks like "line\\n。line\\n。".
    """
    with open(path, 'r') as f:
        # NOTE(review): the first line is discarded without being used; the
        # reason is not visible here -- confirm it is intentional.
        f.readline()
        return "".join(line + "。" for line in f)


def main():
    """Segment the collected lyrics with jieba, then save and show a word cloud."""
    # Imported here so the plotting backend is only loaded when the script runs.
    import matplotlib.pyplot as plt

    strchuli = read_lyrics()
    wordlist = jieba.cut(strchuli, cut_all=False)
    # WordCloud splits its input on whitespace, so join the segments with spaces.
    word_string = " ".join(wordlist)
    wordcloud = WordCloud(
        font_path=FONT_PATH,
        background_color="white",
        width=1000,
        height=860,
        margin=2,
    ).generate(word_string)

    # Save before showing: the image is then written even if the preview
    # window cannot be opened or is never closed.
    wordcloud.to_file('jieguo.png')

    plt.imshow(wordcloud)
    plt.axis("off")
    plt.show()


if __name__ == "__main__":
    main()