大家在使用开源项目的过程中,遇到最多的问题是什么?
这里通过收集 Gitee 项目 issue 中已完成(closed)的数据进行分析,并生成词云图。
一、页面结构分析
二、编写程序代码
三、运行程序结果
四、词云图生成
我们可以先获取网页源码,然后利用 XPath 进行解析。要让程序完整地执行下去,还需要做翻页处理。翻页的方式有很多,我采用的是判断页面中是否有“下一页”的图标:如果有,就将当前页码加 1,继续采集下一页。这样最终可以将所有需要收集的信息提取到 txt 文件中;采集完成之后,再通过该 txt 文件生成词云图。
#!/usr/bin/python
# -*- coding: UTF-8 -*-
"""
Collect the titles of closed issues of the Gitee project y_project/RuoYi
and append them to ``issus.txt`` (one title per line).

@author: Roc-xb
"""
import requests
from lxml import etree


def run(page=1):
    """Scrape closed-issue titles starting at ``page`` and follow pagination.

    Each listing page is downloaded, the issue titles are extracted with
    XPath, printed, and appended to ``issus.txt``. The loop advances to the
    next page for as long as the pager contains a ``rel="next"`` link.

    Args:
        page: 1-based number of the first listing page to fetch.

    Raises:
        requests.RequestException: if a page cannot be downloaded (including
            when the server does not answer within the timeout).
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36'
    }
    # Query parameters that stay the same for every page; only 'page' varies.
    base_params = (
        ('assignee_id', ''),
        ('author_id', ''),
        ('branch', ''),
        ('collaborator_ids', ''),
        ('issue_search', ''),
        ('label_ids', ''),
        ('label_text', ''),
        ('milestone_id', ''),
        ('priority', ''),
        ('private_issue', ''),
        ('program_id', ''),
        ('project_type', ''),
        ('scope', ''),
        ('sort', ''),
        ('state', 'closed'),
        ('target_project', ''),
    )

    # Iterate instead of recursing so that a long issue list cannot hit
    # Python's recursion limit.
    while True:
        params = base_params + (('page', page),)
        response = requests.get(
            'https://gitee.com/y_project/RuoYi/issues',
            headers=headers,
            params=params,
            timeout=10,  # never hang forever on a stalled connection
        ).text

        dom = etree.HTML(response)
        if dom is None:
            # Empty / unparseable body: nothing to extract, nothing to follow.
            break

        res = dom.xpath('//*[@id="git-issues"]/div/div/div[1]/h3/a/text()')
        titles = [title.strip() for title in res if title.strip()]
        print("\n".join(titles))

        # Write one title per line; without a separator the last word of one
        # title would be glued to the first word of the next.
        with open("issus.txt", 'a', encoding="utf-8") as f:
            f.writelines(title + "\n" for title in titles)

        # Test the XPath result list itself. Converting it to a string first
        # (as before) yields at least "[]", so a length check on that string
        # can never be false and pagination would never stop.
        next_page = dom.xpath('//*[@id="git-discover-page"]/a[@rel="next"]')
        if not next_page:
            break
        page += 1


if __name__ == '__main__':
    run()
#!/usr/bin/python
# -*- coding: UTF-8 -*-
"""
Render a word cloud from the issue titles stored in ``issus.txt``.

@author: Roc-xb
"""
from wordcloud import WordCloud
import matplotlib.pyplot as plt  # plotting module
import jieba  # Chinese word segmentation

# Read the whole corpus and close the file handle right away.
with open('issus.txt', 'r', encoding='UTF-8') as issue_file:
    text = issue_file.read()

# Segment with jieba and join the tokens with spaces; WordCloud cannot
# produce a correct Chinese word cloud from unsegmented text.
cut_text = " ".join(jieba.cut(text))

wordcloud = WordCloud(
    # Set a font explicitly, otherwise the characters are rendered as empty
    # boxes. This is the usual font location on Windows; any other font file
    # can be substituted.
    font_path="C:/Windows/Fonts/simfang.ttf",
    # Background colour and canvas size.
    background_color="white",
    width=1500,
    height=880).generate(cut_text)

plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()