酷狗音乐热门榜单-酷狗TOP500(网页版)链接为:
# 链接:https://www.kugou.com/yy/rank/home/1-8888.html?from=rank
# 网页版没有“下一页”按钮,只能自行构造分页链接。
# 经观察,把链接中的 1-8888 依次替换为 2-8888、3-8888…… 即可访问后续页面。
import pandas as pd import numpy as np import time import requests from bs4 import BeautifulSoup import matplotlib.pyplot as plt from PIL import Image from wordcloud import WordCloud
# Page to scrape: the ranking URL whose page index is 1.
url = r'https://www.kugou.com/yy/rank/home/1-8888.html?from=rank'

# Request headers: a desktop-browser user-agent string plus
# "Connection: close".
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36',
    'Connection': 'close',
}

# Send the request. requests applies no timeout by default, so one is set
# explicitly to keep a stalled connection from blocking the script forever.
r = requests.get(url, headers=headers, timeout=10)

# Notebook-style inspection of the HTTP status; 200 means a normal response.
r.status_code
# Translation table that deletes newlines, tabs and carriage returns.
_LINE_BREAKS = str.maketrans('', '', '\n\t\r')


def _split_title(raw):
    """Split the text of a ranking entry into ``(song, singer)``.

    Newlines, tabs and carriage returns are removed first. The text is then
    split on the first ``' - '``; if that separator is absent, on the first
    bare ``'-'``. Splitting only once keeps hyphenated names (for example a
    singer called "A-Lin") intact, and an entry with no hyphen at all yields
    an empty singer instead of raising ``IndexError``.
    """
    text = raw.translate(_LINE_BREAKS)
    song, sep, singer = text.partition(' - ')
    if not sep:
        song, sep, singer = text.partition('-')
    return song.strip(), singer.strip()


# Parse the response fetched earlier (``r``) with BeautifulSoup / lxml.
soup = BeautifulSoup(r.text, 'lxml')
# Each ``.pc_temp_songname`` element carries both the title text and the
# ``href`` of the song page, so it is selected only once.
titles = soup.select('.pc_temp_songname')
times = soup.select('.pc_temp_time')

# One dict per song.
data_all = []
for title_tag, time_tag in zip(titles, times):
    song, singer = _split_title(title_tag.get_text())
    data = {
        '歌名': song,
        '歌手': singer,
        '时长': time_tag.get_text().translate(_LINE_BREAKS).strip(),
        '链接': title_tag.get('href'),
    }
    print(data)
    data_all.append(data)

df = pd.DataFrame(data_all)

# Sample of the resulting DataFrame:
#
#      歌名              歌手    时长                                           链接
# 0   孤勇者             陈奕迅  4:16  https://www.kugou.com/mixsong/5rcb3re6.html
# 1  一路生花             温奕心  4:16  https://www.kugou.com/mixsong/592l9gb7.html
# 2     叹  黄龄、Tăng Duy Tân  4:11  https://www.kugou.com/mixsong/5w42mq78.html
# 3 好想抱住你          程jiajia  3:42  https://www.kugou.com/mixsong/5uhaec79.html
# 4    下潜     川青、Morerare  3:37  https://www.kugou.com/mixsong/5sewos85.html
# Translation table that deletes newlines, tabs and carriage returns.
_LINE_BREAKS = str.maketrans('', '', '\n\t\r')


def _split_title(raw):
    """Split the text of a ranking entry into ``(song, singer)``.

    Newlines, tabs and carriage returns are removed first. The text is then
    split on the first ``' - '``; if that separator is absent, on the first
    bare ``'-'``. Splitting only once keeps hyphenated names intact, and an
    entry with no hyphen yields an empty singer instead of raising
    ``IndexError``.
    """
    text = raw.translate(_LINE_BREAKS)
    song, sep, singer = text.partition(' - ')
    if not sep:
        song, sep, singer = text.partition('-')
    return song.strip(), singer.strip()


def get_data(pages=23):
    """Scrape the ranking pages and count how often each singer appears.

    Args:
        pages: Number of pages to fetch, starting from page 1. The default
            of 23 matches the original hard-coded ``range(1, 24)``.

    Returns:
        A ``(data_all, dic)`` tuple. ``data_all`` is a list with one dict per
        song (keys ``'歌名'``, ``'歌手'``, ``'时长'``, ``'链接'``); ``dic`` maps
        each non-empty singer string to its number of occurrences.

    Raises:
        requests.exceptions.RequestException: If a page cannot be fetched,
            including when the 10-second timeout expires.
    """
    # The headers are identical for every page, so build them once.
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36',
        'Connection': 'close',
    }
    dic = {}
    data_all = []
    for i in range(1, pages + 1):
        # The site offers no "next page" link; pages are addressed by
        # substituting the page index into the URL.
        url = f'https://www.kugou.com/yy/rank/home/{i}-8888.html?from=rank'
        # Explicit timeout so a stalled connection cannot hang the crawl.
        r = requests.get(url, headers=headers, timeout=10)

        soup = BeautifulSoup(r.text, 'lxml')
        # The song-name element also carries the link to the song page.
        title_tags = soup.select('.pc_temp_songname')
        time_tags = soup.select('.pc_temp_time')

        for title_tag, time_tag in zip(title_tags, time_tags):
            song, singer = _split_title(title_tag.get_text())
            data = {
                '歌名': song,
                '歌手': singer,
                '时长': time_tag.get_text().translate(_LINE_BREAKS).strip(),
                '链接': title_tag.get('href'),
            }
            print(data)
            data_all.append(data)
            # Entries without a recognisable singer are kept in data_all but
            # are not tallied.
            if singer:
                dic[singer] = dic.get(singer, 0) + 1

        # Pause between pages to avoid sending requests back-to-back.
        time.sleep(2)
    return data_all, dic


# Run the crawl and load the rows into a DataFrame.
data_all, dic = get_data()
df = pd.DataFrame(data_all)
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from PIL import Image
from wordcloud import WordCloud

# Translation table that deletes newlines, tabs and carriage returns.
_LINE_BREAKS = str.maketrans('', '', '\n\t\r')


def _split_title(raw):
    """Split the text of a ranking entry into ``(song, singer)``.

    Newlines, tabs and carriage returns are removed first. The text is then
    split on the first ``' - '``; if that separator is absent, on the first
    bare ``'-'``. Splitting only once keeps hyphenated names intact, and an
    entry with no hyphen yields an empty singer instead of raising
    ``IndexError``.
    """
    text = raw.translate(_LINE_BREAKS)
    song, sep, singer = text.partition(' - ')
    if not sep:
        song, sep, singer = text.partition('-')
    return song.strip(), singer.strip()


def cnt_songer(songer, dic):
    """Increment the occurrence count of ``songer`` in ``dic`` in place.

    A singer seen for the first time starts at 1.
    """
    dic[songer] = dic.get(songer, 0) + 1


def get_data(pages=23):
    """Scrape the ranking pages and count how often each singer appears.

    Args:
        pages: Number of pages to fetch, starting from page 1. The default
            of 23 matches the original hard-coded ``range(1, 24)``.

    Returns:
        A ``(data_all, dic)`` tuple. ``data_all`` is a list with one dict per
        song (keys ``'歌名'``, ``'歌手'``, ``'时长'``, ``'链接'``); ``dic`` maps
        each non-empty singer string to its number of occurrences.

    Raises:
        requests.exceptions.RequestException: If a page cannot be fetched,
            including when the 10-second timeout expires.
    """
    # The headers are identical for every page, so build them once.
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36',
        'Connection': 'close',
    }
    dic = {}
    data_all = []
    for i in range(1, pages + 1):
        # The site offers no "next page" link; pages are addressed by
        # substituting the page index into the URL.
        url = f'https://www.kugou.com/yy/rank/home/{i}-8888.html?from=rank'
        # Explicit timeout so a stalled connection cannot hang the crawl.
        r = requests.get(url, headers=headers, timeout=10)

        soup = BeautifulSoup(r.text, 'lxml')
        # The song-name element also carries the link to the song page.
        title_tags = soup.select('.pc_temp_songname')
        time_tags = soup.select('.pc_temp_time')

        for title_tag, time_tag in zip(title_tags, time_tags):
            song, singer = _split_title(title_tag.get_text())
            data = {
                '歌名': song,
                '歌手': singer,
                '时长': time_tag.get_text().translate(_LINE_BREAKS).strip(),
                '链接': title_tag.get('href'),
            }
            print(data)
            data_all.append(data)
            # Entries without a recognisable singer are kept in data_all but
            # are not tallied.
            if singer:
                cnt_songer(singer, dic)

        # Pause between pages to avoid sending requests back-to-back.
        time.sleep(2)
    return data_all, dic


def process_data(dic):
    """Return the singers that appear more than once, most frequent first.

    Args:
        dic: Mapping of singer -> occurrence count.

    Returns:
        A new dict holding only entries whose count is greater than 1,
        ordered by descending count. Ties keep their order from ``dic``
        because the sort is stable.
    """
    repeated = ((name, count) for name, count in dic.items() if count > 1)
    return dict(sorted(repeated, key=lambda pair: pair[1], reverse=True))


def main():
    """Run the crawl and summarise it.

    Prints the number of singers that appear more than once.

    Returns:
        A ``(df, items)`` tuple: the scraped rows as a ``pandas.DataFrame``
        and the filtered, sorted singer counts from ``process_data``.
    """
    data_all, dic = get_data()
    df = pd.DataFrame(data_all)
    items = process_data(dic)
    print(len(items))
    return df, items


if __name__ == '__main__':
    data, dic_result = main()
有待继续学习!
To be continued...
参考链接:华语乐坛到底姓什么?------酷狗篇