At the start of 2019, The Wandering Earth hit theaters nationwide. On Douban, its opening-day score held steady above 8, riding the buzz from the advance screenings. On Weibo, Wu Jing's cameo, 31 days of shooting and a 60-million-yuan investment, climbed the trending searches. On Zhihu, the hot question "How do you rate The Wandering Earth, the film adapted from Liu Cixin's novel of the same name?" drew its top-voted answer from director Guo Fan.
This article scrapes a portion of the Douban short comments on The Wandering Earth, then analyzes and visualizes the data.
# Scrape the Douban short comments for The Wandering Earth
import requests
from lxml import etree
from tqdm import tqdm
import time
import random
import pandas as pd
import re

name_list, content_list, date_list, score_list, city_list = [], [], [], [], []
movie_name = ""

def get_city(url, i):
    time.sleep(round(random.uniform(2, 3), 2))  # random delay to avoid triggering Douban's rate limit
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'}
    cookies = {'cookie': 'bid=Ge7txCUP3v4; ll="108303"; _vwo_uuid_v2=DB48689393ACB497681C7C540C832B546|f3d53bcb0314c9a34c861e9c724fcdec; ap_v=0,6.0; dbcl2="159607750:sijMjNWV7ek"; ck=kgmP; push_doumail_num=0; push_noty_num=0; _pk_ref.100001.8cb4=%5B%22%22%2C%22%22%2C1549433417%2C%22https%3A%2F%2Fmovie.douban.com%2Fsubject%2F26266893%2Fcomments%3Fsort%3Dnew_score%26status%3DP%22%5D; _pk_ses.100001.8cb4=*; __lnkrntdmcvrd=-1; __yadk_uid=KqejvPo3L0HIkc2Zx7UXOJF6Vt9PpoJU; _pk_id.100001.8cb4=91514e1ada30bfa5.1549433417.1.1549433694.1549433417'}  # updated 2018.7.25
    res = requests.get(url, cookies=cookies, headers=headers)
    if res.status_code == 200:
        print("\n成功获取第{}个用户城市信息!".format(i))
    else:
        print("\n第{}个用户城市信息获取失败".format(i))
    pattern = re.compile('<div class="user-info">.*?<a href=".*?">(.*?)</a>', re.S)
    item = re.findall(pattern, res.text)  # re.findall returns a list
    return item[0]  # only one match, so return it directly
(1) Page analysis
Fields to extract:
commenter username
comment text
rating
comment date
commenter's city
(2) Data collection and storage
1. Getting the cookies
I use Chrome: press F12 to open Developer Tools, refresh with F5 so the requests show up, then find the cookie and request headers under the Network panel.
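The script below simply wraps the whole copied Cookie header under a single key, which Douban accepts. If you prefer to pass real per-cookie pairs, a small helper can split the string; this is my own sketch, not part of the original script, and the name cookie_str_to_dict is hypothetical:

def cookie_str_to_dict(raw):
    # "k1=v1; k2=v2" -> {"k1": "v1", "k2": "v2"}
    cookies = {}
    for pair in raw.split(';'):
        if '=' in pair:
            k, v = pair.strip().split('=', 1)
            cookies[k] = v
    return cookies

# usage: requests.get(url, headers=headers, cookies=cookie_str_to_dict(raw_cookie))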
2. Load the headers and cookies, and fetch the pages with requests
def get_content(id, page):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'}
    cookies = {'cookie': ' fill in your own cookies here, or the crawl will not work '}
    url = "https://movie.douban.com/subject/" + str(id) + "/comments?start=" + str(page * 20) + "&limit=20&sort=new_score&status=P"  # start steps by 20 to match limit=20
    res = requests.get(url, headers=headers, cookies=cookies)

    pattern = re.compile('<div id="wrapper">.*?<div id="content">.*?<h1>(.*?) 短评</h1>', re.S)
    global movie_name
    movie_name = re.findall(pattern, res.text)[0]  # re.findall returns a list

    res.encoding = "utf-8"
    if res.status_code == 200:
        print("\n第{}页短评爬取成功!".format(page + 1))
        print(url)
    else:
        print("\n第{}页爬取失败!".format(page + 1))

    with open('html.html', 'w', encoding='utf-8') as f:  # dump the raw page for debugging
        f.write(res.text)
    x = etree.HTML(res.text)
3. Parsing the required fields
I parse with XPath here. Some users wrote a comment without leaving a rating, so the XPath positions of the score and the date can shift. A check is therefore needed: if a date can be parsed out of the "score" node, that comment carries no rating.
for i in range(1, 21):  # 20 commenters per page
    name = x.xpath('//*[@id="comments"]/div[{}]/div[2]/h3/span[2]/a/text()'.format(i))
    # Pitfall: if a user commented without rating, "score" parses as the date,
    # and the date slot span[3] is empty
    score = x.xpath('//*[@id="comments"]/div[{}]/div[2]/h3/span[2]/span[2]/@title'.format(i))
    date = x.xpath('//*[@id="comments"]/div[{}]/div[2]/h3/span[2]/span[3]/@title'.format(i))
    m = r'\d{4}-\d{2}-\d{2}'
    try:
        match = re.compile(m).match(score[0])
    except IndexError:
        break
    if match is not None:  # "score" is really a date: no rating was given
        date = score
        score = ["null"]
    content = x.xpath('//*[@id="comments"]/div[{}]/div[2]/p/span/text()'.format(i))
    id = x.xpath('//*[@id="comments"]/div[{}]/div[2]/h3/span[2]/a/@href'.format(i))
    try:
        city = get_city(id[0], i)  # fetch the commenter's city from their profile page
    except IndexError:
        city = " "
    name_list.append(str(name[0]))
    score_list.append(str(score[0]).strip('[]\''))  # some users commented without rating
    date_list.append(str(date[0]).strip('[\'').split(' ')[0])
    content_list.append(str(content[0]).strip())
    city_list.append(city)
4. Extracting the movie title
pattern = re.compile('<div id="wrapper">.*?<div id="content">.*?<h1>(.*?) 短评</h1>', re.S)
global movie_name
movie_name = re.findall(pattern, res.text)[0]  # re.findall returns a list
5. Saving the data
def main(ID, pages):
    global movie_name
    for i in tqdm(range(0, pages)):  # Douban only exposes about 500 comments
        get_content(ID, i)  # args: the movie's Douban ID, and the number of comment pages to scrape
        time.sleep(round(random.uniform(3, 5), 2))
    infos = {'name': name_list, 'city': city_list, 'content': content_list, 'score': score_list, 'date': date_list}
    data = pd.DataFrame(infos, columns=['name', 'city', 'content', 'score', 'date'])
    data.to_csv(movie_name + ".csv")  # saved as <movie title>.csv
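The scraper is driven by the entry point from the complete listing at the end: pass the movie's Douban subject ID and the number of pages to crawl.

if __name__ == '__main__':
    main(26266893, 25)  # The Wandering Earth's Douban subject ID + number of comment pages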
(3) Data analysis and visualization
1. Importing the libraries and the supported-city list
# Data analysis and visualization
import os
import pandas as pd
from pandas import DataFrame
import re
from pyecharts import Line, Geo, Bar, Pie, Page, ThemeRiver
from snownlp import SnowNLP
import jieba
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

fth = open('pyecharts_citys_supported.txt', 'r', encoding='utf-8').read()  # list of city names supported by pyecharts
Filtering the city strings down to Chinese characters
# Keep only the Chinese characters in a string
def translate(text):
    line = text.strip()
    p2 = re.compile('[^\u4e00-\u9fa5]')  # the Chinese character range is \u4e00 to \u9fa5
    zh = " ".join(p2.split(line)).strip()
    zh = ",".join(zh.split())
    return re.sub("[A-Za-z0-9!!,%\[\],。]", "", zh)
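A quick sanity check of the filter (the sample string is my own):

print(translate("The Wandering Earth 流浪地球 2019!"))  # prints: 流浪地球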
Matching the extracted cities against the list supported by pyecharts
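The count_city function (it also appears in the complete listing at the end) performs this matching: it cleans the city strings with translate, strips province names with jieba plus re.sub, and keeps only cities found in the pyecharts list.

def count_city(csv_file):
    path = os.path.abspath(os.curdir)
    csv_file = path + "\\" + csv_file + ".csv"
    csv_file = csv_file.replace('\\', '\\\\')
    d = pd.read_csv(csv_file, engine='python', encoding='utf-8')
    city = [translate(n) for n in d['city'].dropna()]  # clean the city strings: keep the Chinese, drop punctuation

    # Province names (list found online)
    province = '湖南,湖北,广东,广西、河南、河北、山东、山西,江苏、浙江、江西、黑龙江、新疆,云南、贵州、福建、吉林、安徽,四川、西藏、宁夏、辽宁、青海、甘肃、陕西,内蒙古、台湾,海南'
    province = province.replace('、', ',').split(',')
    rep_province = "|".join(province)  # alternation pattern for re.sub

    # Segment so province and city names separate; names like 吉林长春 don't always
    # split cleanly, so re.sub strips the province part afterwards
    All_city = jieba.cut("".join(city))
    final_city = []
    for a_city in All_city:
        a_city_sub = re.sub(rep_province, "", a_city)  # drop any province name
        if a_city_sub == "":  # nothing left, skip
            continue
        elif a_city_sub in fth:  # keep only cities pyecharts supports
            final_city.append(a_city_sub)

    result = {}
    print("城市总数量为:", len(final_city))
    for i in set(final_city):
        result[i] = final_city.count(i)
    return result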
The sentiment statistics used in the next section are computed by reading the content column:

# Read the content column and count the frequency of each sentiment score
def count_sentiment(csv_file):
    path = os.path.abspath(os.curdir)
    csv_file = path + "\\" + csv_file + ".csv"
    csv_file = csv_file.replace('\\', '\\\\')
    d = pd.read_csv(csv_file, engine='python', encoding='utf-8')
    motion_list = []
    for i in d['content']:
        try:
            s = round(SnowNLP(i).sentiments, 2)  # sentiment score in [0, 1], rounded to 2 decimals
            motion_list.append(s)
        except TypeError:
            continue
    result = {}
    for i in set(motion_list):
        result[i] = motion_list.count(i)
    return result
2. Sentiment analysis with snownlp
snownlp's main features are Chinese word segmentation (a character-based generative model); part-of-speech tagging (TnT, 3-gram HMM, per the project page); sentiment analysis (the method is documented on the project page; accuracy is best on shopping-style reviews, because the built-in corpus comes mainly from that domain, but you can build a corpus for your own domain and swap it in with quite good accuracy); text classification (Naive Bayes); pinyin conversion; traditional-to-simplified conversion; keyword extraction (TextRank); summarization (TextRank); sentence splitting; and text similarity (BM25) [summarized from CSDN]. Before starting, it is worth reading the project page, which introduces the most basic commands: https://pypi.org/project/snownlp/
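A minimal sketch of the basic API, adapted from the project page (the sample sentence is my own):

from snownlp import SnowNLP

s = SnowNLP(u'这个电影的特效真心很赞')
print(s.words)        # word segmentation
print(s.sentiments)   # sentiment score in [0, 1]; closer to 1 means more positive
print(s.keywords(3))  # top-3 keywords (TextRank)
print(s.pinyin)       # pinyin conversion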
Note that snownlp works on unicode text, so check that your data is decoded consistently and convert everything to one encoding, rather than, say, stripping the English out of the Chinese comments just to dodge encoding problems. Also, this script only applies snownlp's stock corpus, which is tuned mainly to the shopping domain; to make the sentiment scores better fit movie reviews, you can retrain the model on your own corpus.
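Retraining follows the pattern in the snownlp docs; neg.txt and pos.txt here stand in for your own labeled movie-review corpus, one document per line:

from snownlp import sentiment

sentiment.train('neg.txt', 'pos.txt')  # train the sentiment model on the labeled corpus
sentiment.save('sentiment.marshal')    # save it, then point snownlp's sentiment data path at this file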
def draw_sentiment_pic(csv_file):
    attr, val = [], []
    info = count_sentiment(csv_file)
    info = sorted(info.items(), key=lambda x: x[0], reverse=False)  # sort by sentiment score
    for each in info[:-1]:
        attr.append(each[0])
        val.append(each[1])
    line = Line(csv_file + ":影评情感分析")
    line.add("", attr, val, is_smooth=True, is_more_utils=True)
    line.render(csv_file + "_情感分析曲线图.html")
3. Analyzing the commenters' cities
pyecharts' Page container lets you collect several charts in a single page object; you just add each chart component to it.
def draw_citys_pic(csv_file):
    page = Page(csv_file + ":评论城市分析")
    info = count_city(csv_file)
    geo = Geo("", "小本聪原创", title_pos="center", width=1200, height=600, background_color='#404a59', title_color="#fff")
    while True:  # second filter: match against pyecharts' city list; on error, drop that city's count
        try:
            attr, val = geo.cast(info)
            geo.add("", attr, val, visual_range=[0, 300], visual_text_color="#fff", is_geo_effect_show=False,
                    is_piecewise=True, visual_split_number=6, symbol_size=15, is_visualmap=True)
        except ValueError as e:
            e = str(e).split("No coordinate is specified for ")[1]  # pull out the unsupported city name
            info.pop(e)
        else:
            break
    info = sorted(info.items(), key=lambda x: x[1], reverse=False)  # sort the (city, count) pairs by count
    print(info)
    info = dict(info)
    attr, val = [], []
    for key in info:
        attr.append(key)
        val.append(info[key])

    geo1 = Geo("", "评论城市分布", title_pos="center", width=1200, height=600,
               background_color='#404a59', title_color="#fff")
    geo1.add("", attr, val, visual_range=[0, 300], visual_text_color="#fff", is_geo_effect_show=False,
             is_piecewise=True, visual_split_number=10, symbol_size=15, is_visualmap=True, is_more_utils=True)
    # geo1.render(csv_file + "_城市dotmap.html")
    page.add_chart(geo1)
    geo2 = Geo("", "评论来源热力图", title_pos="center", width=1200, height=600, background_color='#404a59', title_color="#fff")
    geo2.add("", attr, val, type="heatmap", is_visualmap=True, visual_range=[0, 50], visual_text_color='#fff', is_more_utils=True)
    # geo2.render(csv_file + "_城市heatmap.html")
    page.add_chart(geo2)
    bar = Bar("", "评论来源排行", title_pos="center", width=1200, height=600)
    bar.add("", attr, val, is_visualmap=True, visual_range=[0, 100], visual_text_color='#fff', mark_point=["average"], mark_line=["average"],
            is_more_utils=True, is_label_show=True, is_datazoom_show=True, xaxis_rotate=45)
    bar.render(csv_file + "_城市评论bar.html")
    page.add_chart(bar)
    pie = Pie("", "评论来源饼图", title_pos="right", width=1200, height=600)
    pie.add("", attr, val, radius=[20, 50], label_text_color=None, is_label_show=True, legend_orient='vertical', is_more_utils=True, legend_pos='left')
    pie.render(csv_file + "_城市评论Pie.html")
    page.add_chart(pie)
    page.render(csv_file + "_城市评论分析汇总.html")
4. Rating trend analysis
Read the CSV into a DataFrame
Iterate over the rows, collecting (score, date) tuples into a list
Count how many rows share the same date and the same rating
Convert the counts back into a DataFrame and set the column names
Sort by date
Not every date carries all five rating levels, so missing (date, rating) pairs must be filled in with zero votes; the stacked charts below need all five series per date, as the code does.
def score_draw(csv_file):
    page = Page(csv_file + ":评论等级分析")
    score, date, val, score_list = [], [], [], []
    result = {}
    path = os.path.abspath(os.curdir)
    csv_file = path + "\\" + csv_file + ".csv"
    csv_file = csv_file.replace('\\', '\\\\')
    d = pd.read_csv(csv_file, engine='python', encoding='utf-8')[['score', 'date']].dropna()  # read the CSV, dropping rows with no rating
    for idx in d.index:  # one way to iterate over df rows (a second way, iterrows, appears below)
        score_list.append(tuple(d.loc[idx].values[:]))  # convert rows to tuples so identical ones can be counted
    print("有效评分总数量为:", len(score_list), " 条")
    for i in set(score_list):
        result[i] = score_list.count(i)  # dict: (score, date) -> votes
    info = []
    for key in result:
        score = key[0]
        date = key[1]
        val = result[key]
        info.append([score, date, val])
    info_new = DataFrame(info)  # back to a dataframe
    info_new.columns = ['score', 'date', 'votes']
    info_new.sort_values('date', inplace=True)  # sort by date ascending, so the earliest and latest dates are easy to find
    print("first df", info_new)
    # Fill the gaps: every date should carry all five rating levels; insert a zero-vote
    # row for each missing (date, rating) pair
    mark = 0
    create_df = pd.DataFrame(columns=['score', 'date', 'votes'])  # empty dataframe for the fill-in rows
    for i in list(info_new['date']):
        for level in ['力荐', '推荐', '还行', '较差', '很差']:
            if info_new[(info_new.date == i) & (info_new.score == level)].empty:
                create_df.loc[mark] = [level, i, 0]
                mark += 1
    info_new = info_new.append(create_df.drop_duplicates(), ignore_index=True)
    score_list = []
    info_new.sort_values('date', inplace=True)  # re-sort by date after the fill-in
    print(info_new)
    for index, row in info_new.iterrows():  # the second way to iterate over a df
        score_list.append([row['date'], row['votes'], row['score']])
    tr = ThemeRiver()
    tr.add(['力荐', '推荐', '还行', '较差', '很差'], score_list, is_label_show=True, is_more_utils=True)
    page.add_chart(tr)

    attr, v1, v2, v3, v4, v5 = [], [], [], [], [], []
    attr = list(sorted(set(info_new['date'])))
    bar = Bar()
    for i in attr:
        v1.append(int(info_new[(info_new['date'] == i) & (info_new['score'] == "力荐")]['votes']))
        v2.append(int(info_new[(info_new['date'] == i) & (info_new['score'] == "推荐")]['votes']))
        v3.append(int(info_new[(info_new['date'] == i) & (info_new['score'] == "还行")]['votes']))
        v4.append(int(info_new[(info_new['date'] == i) & (info_new['score'] == "较差")]['votes']))
        v5.append(int(info_new[(info_new['date'] == i) & (info_new['score'] == "很差")]['votes']))
    bar.add("力荐", attr, v1, is_stack=True)
    bar.add("推荐", attr, v2, is_stack=True)
    bar.add("还行", attr, v3, is_stack=True)
    bar.add("较差", attr, v4, is_stack=True)
    bar.add("很差", attr, v5, is_stack=True, is_convert=True, mark_line=["average"], is_more_utils=True)
    page.add_chart(bar)

    line = Line()
    line.add("力荐", attr, v1, is_stack=True)
    line.add("推荐", attr, v2, is_stack=True)
    line.add("还行", attr, v3, is_stack=True)
    line.add("较差", attr, v4, is_stack=True)
    line.add("很差", attr, v5, is_stack=True, is_convert=False, mark_line=["average"], is_more_utils=True)
    page.add_chart(line)

    page.render(csv_file[:-4] + "_日投票量分析汇总.html")  # strip the ".csv" suffix for the output name
Because of the crawl limits and time constraints, parts of the data are noisy, but a few patterns still show. The first week after release is the comment peak, especially its first three days. That is typical, though there may be bias, because the crawler fetches comments in Douban's default hot-ranked order; with a larger sample, the picture should get closer to reality.
There are also some comments dated before the official release, presumably from small-scale preview screenings. The ratings from this early batch are fairly uniform and close to the film's eventual overall score. The people who get to see a film early tend to be serious film fans or industry insiders, so their reviews carry real reference value.
5. Word cloud of the comments
For the word cloud, the CSV is read into a DataFrame, the non-Chinese characters are stripped from each comment, a photo of 胡歌 serves as the mask image, and a stopword list is applied, as shown below.
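The word_cloud function (from the complete listing at the end) implements these steps:

def word_cloud(csv_file, stopwords_path, pic_path):
    pic_name = csv_file + "_词云图.png"
    path = os.path.abspath(os.curdir)
    csv_file = path + "\\" + csv_file + ".csv"
    csv_file = csv_file.replace('\\', '\\\\')
    d = pd.read_csv(csv_file, engine='python', encoding='utf-8')
    content = []
    for i in d['content']:
        try:
            i = translate(i)  # keep only the Chinese characters
        except AttributeError:
            continue
        else:
            content.append(i)
    comment_after_split = jieba.cut(str(content), cut_all=False)  # word segmentation
    wl_space_split = " ".join(comment_after_split)
    backgroud_Image = plt.imread(pic_path)  # mask image for the cloud's shape
    stopwords = STOPWORDS.copy()
    with open(stopwords_path, 'r', encoding='utf-8') as f:
        for i in f.readlines():
            stopwords.add(i.strip('\n'))
    wc = WordCloud(width=1024, height=768, background_color='white',
                   mask=backgroud_Image, font_path="C:\\simhei.ttf",
                   stopwords=stopwords, max_font_size=400,
                   random_state=50)
    wc.generate_from_text(wl_space_split)
    img_colors = ImageColorGenerator(backgroud_Image)
    wc.recolor(color_func=img_colors)  # recolor the words from the mask image
    plt.imshow(wc)
    plt.axis('off')
    plt.show()
    wc.to_file(pic_name)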
def main(csv_file, stopwords_path, pic_path):
    draw_sentiment_pic(csv_file)
    draw_citys_pic(csv_file)
    score_draw(csv_file)
    word_cloud(csv_file, stopwords_path, pic_path)


if __name__ == '__main__':
    main("流浪地球", "stopwords.txt", "胡歌.jpg")
Among the high-frequency words, "吴京" shows the audience's recognition of his appearance in and backing of the film, "特效" shows how much the effects shots matter to a sci-fi film, and "科幻电影" reflects viewers' interest in the genre.
That is the whole process of scraping and analyzing the Douban short comments on The Wandering Earth. The complete code for both scripts follows.
# Scrape the Douban short comments for The Wandering Earth
import requests
from lxml import etree
from tqdm import tqdm
import time
import random
import pandas as pd
import re

name_list, content_list, date_list, score_list, city_list = [], [], [], [], []
movie_name = ""

def get_city(url, i):
    time.sleep(round(random.uniform(2, 3), 2))  # random delay to avoid triggering Douban's rate limit
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'}
    cookies = {'cookie': 'bid=Ge7txCUP3v4; ll="108303"; _vwo_uuid_v2=DB48689393ACB497681C7C540C832B546|f3d53bcb0314c9a34c861e9c724fcdec; ap_v=0,6.0; dbcl2="159607750:sijMjNWV7ek"; ck=kgmP; push_doumail_num=0; push_noty_num=0; _pk_ref.100001.8cb4=%5B%22%22%2C%22%22%2C1549433417%2C%22https%3A%2F%2Fmovie.douban.com%2Fsubject%2F26266893%2Fcomments%3Fsort%3Dnew_score%26status%3DP%22%5D; _pk_ses.100001.8cb4=*; __lnkrntdmcvrd=-1; __yadk_uid=KqejvPo3L0HIkc2Zx7UXOJF6Vt9PpoJU; _pk_id.100001.8cb4=91514e1ada30bfa5.1549433417.1.1549433694.1549433417'}  # updated 2018.7.25
    res = requests.get(url, cookies=cookies, headers=headers)
    if res.status_code == 200:
        print("\n成功获取第{}个用户城市信息!".format(i))
    else:
        print("\n第{}个用户城市信息获取失败".format(i))
    pattern = re.compile('<div class="user-info">.*?<a href=".*?">(.*?)</a>', re.S)
    item = re.findall(pattern, res.text)  # re.findall returns a list
    return item[0]  # only one match, so return it directly

def get_content(id, page):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'}
    cookies = {'cookie': ' fill in your own cookies here, or the crawl will not work '}
    url = "https://movie.douban.com/subject/" + str(id) + "/comments?start=" + str(page * 20) + "&limit=20&sort=new_score&status=P"  # start steps by 20 to match limit=20
    res = requests.get(url, headers=headers, cookies=cookies)
    pattern = re.compile('<div id="wrapper">.*?<div id="content">.*?<h1>(.*?) 短评</h1>', re.S)
    global movie_name
    movie_name = re.findall(pattern, res.text)[0]  # re.findall returns a list
    res.encoding = "utf-8"
    if res.status_code == 200:
        print("\n第{}页短评爬取成功!".format(page + 1))
        print(url)
    else:
        print("\n第{}页爬取失败!".format(page + 1))

    with open('html.html', 'w', encoding='utf-8') as f:  # dump the raw page for debugging
        f.write(res.text)
    x = etree.HTML(res.text)
    for i in range(1, 21):  # 20 commenters per page
        name = x.xpath('//*[@id="comments"]/div[{}]/div[2]/h3/span[2]/a/text()'.format(i))
        # Pitfall: if a user commented without rating, "score" parses as the date,
        # and the date slot span[3] is empty
        score = x.xpath('//*[@id="comments"]/div[{}]/div[2]/h3/span[2]/span[2]/@title'.format(i))
        date = x.xpath('//*[@id="comments"]/div[{}]/div[2]/h3/span[2]/span[3]/@title'.format(i))
        m = r'\d{4}-\d{2}-\d{2}'
        try:
            match = re.compile(m).match(score[0])
        except IndexError:
            break
        if match is not None:  # "score" is really a date: no rating was given
            date = score
            score = ["null"]
        content = x.xpath('//*[@id="comments"]/div[{}]/div[2]/p/span/text()'.format(i))
        id = x.xpath('//*[@id="comments"]/div[{}]/div[2]/h3/span[2]/a/@href'.format(i))
        try:
            city = get_city(id[0], i)  # fetch the commenter's city from their profile page
        except IndexError:
            city = " "
        name_list.append(str(name[0]))
        score_list.append(str(score[0]).strip('[]\''))  # some users commented without rating
        date_list.append(str(date[0]).strip('[\'').split(' ')[0])
        content_list.append(str(content[0]).strip())
        city_list.append(city)

def main(ID, pages):
    global movie_name
    for i in tqdm(range(0, pages)):  # Douban only exposes about 500 comments
        get_content(ID, i)  # args: the movie's Douban ID, and the number of comment pages to scrape
        time.sleep(round(random.uniform(3, 5), 2))
    infos = {'name': name_list, 'city': city_list, 'content': content_list, 'score': score_list, 'date': date_list}
    data = pd.DataFrame(infos, columns=['name', 'city', 'content', 'score', 'date'])
    data.to_csv(movie_name + ".csv")  # saved as <movie title>.csv

if __name__ == '__main__':
    main(26266893, 25)  # the movie's Douban subject ID + the number of comment pages to scrape


# Data analysis and visualization (a separate script)
import os
import pandas as pd
from pandas import DataFrame
import re
from pyecharts import Line, Geo, Bar, Pie, Page, ThemeRiver
from snownlp import SnowNLP
import jieba
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

fth = open('pyecharts_citys_supported.txt', 'r', encoding='utf-8').read()  # list of city names supported by pyecharts

# Keep only the Chinese characters in a string
def translate(text):
    line = text.strip()
    p2 = re.compile('[^\u4e00-\u9fa5]')  # the Chinese character range is \u4e00 to \u9fa5
    zh = " ".join(p2.split(line)).strip()
    zh = ",".join(zh.split())
    return re.sub("[A-Za-z0-9!!,%\[\],。]", "", zh)

# Read the content column and count the frequency of each sentiment score
def count_sentiment(csv_file):
    path = os.path.abspath(os.curdir)
    csv_file = path + "\\" + csv_file + ".csv"
    csv_file = csv_file.replace('\\', '\\\\')
    d = pd.read_csv(csv_file, engine='python', encoding='utf-8')
    motion_list = []
    for i in d['content']:
        try:
            s = round(SnowNLP(i).sentiments, 2)  # sentiment score in [0, 1], rounded to 2 decimals
            motion_list.append(s)
        except TypeError:
            continue
    result = {}
    for i in set(motion_list):
        result[i] = motion_list.count(i)
    return result

def draw_sentiment_pic(csv_file):
    attr, val = [], []
    info = count_sentiment(csv_file)
    info = sorted(info.items(), key=lambda x: x[0], reverse=False)  # sort by sentiment score
    for each in info[:-1]:
        attr.append(each[0])
        val.append(each[1])
    line = Line(csv_file + ":影评情感分析")
    line.add("", attr, val, is_smooth=True, is_more_utils=True)
    line.render(csv_file + "_情感分析曲线图.html")

def word_cloud(csv_file, stopwords_path, pic_path):
    pic_name = csv_file + "_词云图.png"
    path = os.path.abspath(os.curdir)
    csv_file = path + "\\" + csv_file + ".csv"
    csv_file = csv_file.replace('\\', '\\\\')
    d = pd.read_csv(csv_file, engine='python', encoding='utf-8')
    content = []
    for i in d['content']:
        try:
            i = translate(i)  # keep only the Chinese characters
        except AttributeError:
            continue
        else:
            content.append(i)
    comment_after_split = jieba.cut(str(content), cut_all=False)  # word segmentation
    wl_space_split = " ".join(comment_after_split)
    backgroud_Image = plt.imread(pic_path)  # mask image for the cloud's shape
    stopwords = STOPWORDS.copy()
    with open(stopwords_path, 'r', encoding='utf-8') as f:
        for i in f.readlines():
            stopwords.add(i.strip('\n'))
    wc = WordCloud(width=1024, height=768, background_color='white',
                   mask=backgroud_Image, font_path="C:\\simhei.ttf",
                   stopwords=stopwords, max_font_size=400,
                   random_state=50)
    wc.generate_from_text(wl_space_split)
    img_colors = ImageColorGenerator(backgroud_Image)
    wc.recolor(color_func=img_colors)  # recolor the words from the mask image
    plt.imshow(wc)
    plt.axis('off')
    plt.show()
    wc.to_file(pic_name)

def count_city(csv_file):
    path = os.path.abspath(os.curdir)
    csv_file = path + "\\" + csv_file + ".csv"
    csv_file = csv_file.replace('\\', '\\\\')
    d = pd.read_csv(csv_file, engine='python', encoding='utf-8')
    city = [translate(n) for n in d['city'].dropna()]  # clean the city strings: keep the Chinese, drop punctuation

    # Province names (list found online)
    province = '湖南,湖北,广东,广西、河南、河北、山东、山西,江苏、浙江、江西、黑龙江、新疆,云南、贵州、福建、吉林、安徽,四川、西藏、宁夏、辽宁、青海、甘肃、陕西,内蒙古、台湾,海南'
    province = province.replace('、', ',').split(',')
    rep_province = "|".join(province)  # alternation pattern for re.sub

    # Segment so province and city names separate; names like 吉林长春 don't always
    # split cleanly, so re.sub strips the province part afterwards
    All_city = jieba.cut("".join(city))
    final_city = []
    for a_city in All_city:
        a_city_sub = re.sub(rep_province, "", a_city)  # drop any province name
        if a_city_sub == "":  # nothing left, skip
            continue
        elif a_city_sub in fth:  # keep only cities pyecharts supports
            final_city.append(a_city_sub)

    result = {}
    print("城市总数量为:", len(final_city))
    for i in set(final_city):
        result[i] = final_city.count(i)
    return result

def draw_citys_pic(csv_file):
    page = Page(csv_file + ":评论城市分析")
    info = count_city(csv_file)
    geo = Geo("", "小本聪原创", title_pos="center", width=1200, height=600, background_color='#404a59', title_color="#fff")
    while True:  # second filter: match against pyecharts' city list; on error, drop that city's count
        try:
            attr, val = geo.cast(info)
            geo.add("", attr, val, visual_range=[0, 300], visual_text_color="#fff", is_geo_effect_show=False,
                    is_piecewise=True, visual_split_number=6, symbol_size=15, is_visualmap=True)
        except ValueError as e:
            e = str(e).split("No coordinate is specified for ")[1]  # pull out the unsupported city name
            info.pop(e)
        else:
            break
    info = sorted(info.items(), key=lambda x: x[1], reverse=False)  # sort the (city, count) pairs by count
    print(info)
    info = dict(info)
    attr, val = [], []
    for key in info:
        attr.append(key)
        val.append(info[key])

    geo1 = Geo("", "评论城市分布", title_pos="center", width=1200, height=600,
               background_color='#404a59', title_color="#fff")
    geo1.add("", attr, val, visual_range=[0, 300], visual_text_color="#fff", is_geo_effect_show=False,
             is_piecewise=True, visual_split_number=10, symbol_size=15, is_visualmap=True, is_more_utils=True)
    # geo1.render(csv_file + "_城市dotmap.html")
    page.add_chart(geo1)
    geo2 = Geo("", "评论来源热力图", title_pos="center", width=1200, height=600, background_color='#404a59', title_color="#fff")
    geo2.add("", attr, val, type="heatmap", is_visualmap=True, visual_range=[0, 50], visual_text_color='#fff', is_more_utils=True)
    # geo2.render(csv_file + "_城市heatmap.html")
    page.add_chart(geo2)
    bar = Bar("", "评论来源排行", title_pos="center", width=1200, height=600)
    bar.add("", attr, val, is_visualmap=True, visual_range=[0, 100], visual_text_color='#fff', mark_point=["average"], mark_line=["average"],
            is_more_utils=True, is_label_show=True, is_datazoom_show=True, xaxis_rotate=45)
    bar.render(csv_file + "_城市评论bar.html")
    page.add_chart(bar)
    pie = Pie("", "评论来源饼图", title_pos="right", width=1200, height=600)
    pie.add("", attr, val, radius=[20, 50], label_text_color=None, is_label_show=True, legend_orient='vertical', is_more_utils=True, legend_pos='left')
    pie.render(csv_file + "_城市评论Pie.html")
    page.add_chart(pie)
    page.render(csv_file + "_城市评论分析汇总.html")

def score_draw(csv_file):
    page = Page(csv_file + ":评论等级分析")
    score, date, val, score_list = [], [], [], []
    result = {}
    path = os.path.abspath(os.curdir)
    csv_file = path + "\\" + csv_file + ".csv"
    csv_file = csv_file.replace('\\', '\\\\')
    d = pd.read_csv(csv_file, engine='python', encoding='utf-8')[['score', 'date']].dropna()  # read the CSV, dropping rows with no rating
    for idx in d.index:  # one way to iterate over df rows (a second way, iterrows, appears below)
        score_list.append(tuple(d.loc[idx].values[:]))  # convert rows to tuples so identical ones can be counted
    print("有效评分总数量为:", len(score_list), " 条")
    for i in set(score_list):
        result[i] = score_list.count(i)  # dict: (score, date) -> votes
    info = []
    for key in result:
        score = key[0]
        date = key[1]
        val = result[key]
        info.append([score, date, val])
    info_new = DataFrame(info)  # back to a dataframe
    info_new.columns = ['score', 'date', 'votes']
    info_new.sort_values('date', inplace=True)  # sort by date ascending, so the earliest and latest dates are easy to find
    print("first df", info_new)
    # Fill the gaps: every date should carry all five rating levels; insert a zero-vote
    # row for each missing (date, rating) pair
    mark = 0
    create_df = pd.DataFrame(columns=['score', 'date', 'votes'])  # empty dataframe for the fill-in rows
    for i in list(info_new['date']):
        for level in ['力荐', '推荐', '还行', '较差', '很差']:
            if info_new[(info_new.date == i) & (info_new.score == level)].empty:
                create_df.loc[mark] = [level, i, 0]
                mark += 1
    info_new = info_new.append(create_df.drop_duplicates(), ignore_index=True)
    score_list = []
    info_new.sort_values('date', inplace=True)  # re-sort by date after the fill-in
    print(info_new)
    for index, row in info_new.iterrows():  # the second way to iterate over a df
        score_list.append([row['date'], row['votes'], row['score']])
    tr = ThemeRiver()
    tr.add(['力荐', '推荐', '还行', '较差', '很差'], score_list, is_label_show=True, is_more_utils=True)
    page.add_chart(tr)

    attr, v1, v2, v3, v4, v5 = [], [], [], [], [], []
    attr = list(sorted(set(info_new['date'])))
    bar = Bar()
    for i in attr:
        v1.append(int(info_new[(info_new['date'] == i) & (info_new['score'] == "力荐")]['votes']))
        v2.append(int(info_new[(info_new['date'] == i) & (info_new['score'] == "推荐")]['votes']))
        v3.append(int(info_new[(info_new['date'] == i) & (info_new['score'] == "还行")]['votes']))
        v4.append(int(info_new[(info_new['date'] == i) & (info_new['score'] == "较差")]['votes']))
        v5.append(int(info_new[(info_new['date'] == i) & (info_new['score'] == "很差")]['votes']))
    bar.add("力荐", attr, v1, is_stack=True)
    bar.add("推荐", attr, v2, is_stack=True)
    bar.add("还行", attr, v3, is_stack=True)
    bar.add("较差", attr, v4, is_stack=True)
    bar.add("很差", attr, v5, is_stack=True, is_convert=True, mark_line=["average"], is_more_utils=True)
    page.add_chart(bar)

    line = Line()
    line.add("力荐", attr, v1, is_stack=True)
    line.add("推荐", attr, v2, is_stack=True)
    line.add("还行", attr, v3, is_stack=True)
    line.add("较差", attr, v4, is_stack=True)
    line.add("很差", attr, v5, is_stack=True, is_convert=False, mark_line=["average"], is_more_utils=True)
    page.add_chart(line)

    page.render(csv_file[:-4] + "_日投票量分析汇总.html")  # strip the ".csv" suffix for the output name

def main(csv_file, stopwords_path, pic_path):
    draw_sentiment_pic(csv_file)
    draw_citys_pic(csv_file)
    score_draw(csv_file)
    word_cloud(csv_file, stopwords_path, pic_path)

if __name__ == '__main__':
    main("流浪地球", "stopwords.txt", "胡歌.jpg")