
Scraping and Analyzing Douban Reviews of The Wandering Earth with Python

This article walks through scraping part of the Douban short reviews for The Wandering Earth with Python and then analyzing and visualizing the data. It should be a useful reference for anyone tackling a similar problem.

I. Background:

  In early 2019, The Wandering Earth opened in theaters nationwide. On Douban, its score held steady above 8 on opening day, carrying over the buzz from the preview screenings. On Weibo, Wu Jing's cameo, which famously grew into 31 days of shooting and a 60-million-yuan investment, trended on the hot-search list. On Zhihu, the question "How do you evaluate The Wandering Earth, the film adapted from Liu Cixin's novel of the same name?" became a hot topic, with the top-voted answer coming from director Guo Fan.

II. Data Description:

  This article scrapes a portion of the short reviews of The Wandering Earth from Douban, then analyzes and visualizes the data.

III. Implementation and Code:

# Scrape the short reviews of the movie "The Wandering Earth"
import requests
from lxml import etree
from tqdm import tqdm
import time
import random
import pandas as pd
import re

name_list, content_list, date_list, score_list, city_list = [], [], [], [], []
movie_name = ""

def get_city(url, i):
    time.sleep(round(random.uniform(2, 3), 2))  # random delay to avoid getting blocked
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'}
    cookies = {'cookie': 'bid=Ge7txCUP3v4; ll="108303"; _vwo_uuid_v2=DB48689393ACB497681C7C540C832B546|f3d53bcb0314c9a34c861e9c724fcdec; ap_v=0,6.0; dbcl2="159607750:sijMjNWV7ek"; ck=kgmP; push_doumail_num=0; push_noty_num=0; _pk_ref.100001.8cb4=%5B%22%22%2C%22%22%2C1549433417%2C%22https%3A%2F%2Fmovie.douban.com%2Fsubject%2F26266893%2Fcomments%3Fsort%3Dnew_score%26status%3DP%22%5D; _pk_ses.100001.8cb4=*; __lnkrntdmcvrd=-1; __yadk_uid=KqejvPo3L0HIkc2Zx7UXOJF6Vt9PpoJU; _pk_id.100001.8cb4=91514e1ada30bfa5.1549433417.1.1549433694.1549433417'}  # updated 2018-07-25
    res = requests.get(url, cookies=cookies, headers=headers)
    if res.status_code == 200:
        print("\nFetched city info for user {}!".format(i))
    else:
        print("\nFailed to fetch city info for user {}".format(i))
    pattern = re.compile('<div class="user-info">.*?<a href=".*?">(.*?)</a>', re.S)
    item = re.findall(pattern, res.text)  # findall returns a list
    return item[0]  # only one element, so return it directly

(1) Page analysis

Fields to extract:

  • reviewer name

  • review content

  • rating

  • review date

  • reviewer's city

 

(2) Data acquisition and storage

1. Getting the cookies

I used the Chrome browser: press F12 to open the developer tools, refresh with F5 so the requests appear, and locate the cookies and headers under the Network tab.
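The script below passes the entire header string as a single pseudo-cookie named "cookie", which Douban happens to accept. A cleaner alternative (my own sketch, not part of the original script) parses the raw Cookie header copied from DevTools into the per-key dict that requests normally expects:

# Sketch: turn a raw "Cookie:" header value into a dict for requests
def cookies_from_header(raw):
    cookies = {}
    for pair in raw.split(';'):
        if '=' in pair:
            k, v = pair.strip().split('=', 1)  # split on the first '=' only
            cookies[k] = v
    return cookies

# usage: requests.get(url, headers=headers, cookies=cookies_from_header('bid=...; ll=...'))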

2. Load the headers and cookies and fetch the page with the requests library

def get_content(id, page):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'}
    cookies = {'cookie': '  paste your own cookies here, otherwise the crawl will not work  '}
    # 20 comments per page, so start must advance by 20 (the original used page * 10, which overlapped pages)
    url = "https://movie.douban.com/subject/" + str(id) + "/comments?start=" + str(page * 20) + "&limit=20&sort=new_score&status=P"
    res = requests.get(url, headers=headers, cookies=cookies)
    res.encoding = "utf-8"

    pattern = re.compile('<div id="wrapper">.*?<div id="content">.*?<h1>(.*?) 短评</h1>', re.S)
    global movie_name
    movie_name = re.findall(pattern, res.text)[0]  # findall returns a list

    if res.status_code == 200:
        print("\nPage {} of short reviews fetched!".format(page + 1))
        print(url)
    else:
        print("\nFailed to fetch page {}!".format(page + 1))

    with open('html.html', 'w', encoding='utf-8') as f:  # dump the raw page for debugging
        f.write(res.text)
    x = etree.HTML(res.text)

3. Parsing the target fields

  I parse with XPath here. Some users wrote a comment but gave no rating, so the XPath positions of the score and the date shift. We therefore need a check: if what was parsed as the score turns out to be a date, the comment carries no rating.

    for i in range(1, 21):   # 20 reviewers per page
        name = x.xpath('//*[@id="comments"]/div[{}]/div[2]/h3/span[2]/a/text()'.format(i))
        # Pitfall: if a user commented without rating, the score position holds the date,
        # and the date position span[3] is empty
        score = x.xpath('//*[@id="comments"]/div[{}]/div[2]/h3/span[2]/span[2]/@title'.format(i))
        date = x.xpath('//*[@id="comments"]/div[{}]/div[2]/h3/span[2]/span[3]/@title'.format(i))
        m = r'\d{4}-\d{2}-\d{2}'
        try:
            match = re.compile(m).match(score[0])
        except IndexError:
            break
        if match is not None:  # the "score" is actually a date: no rating was given
            date = score
            score = ["null"]
        content = x.xpath('//*[@id="comments"]/div[{}]/div[2]/p/span/text()'.format(i))
        id = x.xpath('//*[@id="comments"]/div[{}]/div[2]/h3/span[2]/a/@href'.format(i))
        try:
            city = get_city(id[0], i)  # fetch the city from the reviewer's profile page
        except IndexError:
            city = " "
        name_list.append(str(name[0]))
        score_list.append(str(score[0]).strip('[]\''))  # some users commented without rating
        date_list.append(str(date[0]).strip('[\'').split(' ')[0])
        content_list.append(str(content[0]).strip())
        city_list.append(city)

4. Getting the movie name

pattern = re.compile('<div id="wrapper">.*?<div id="content">.*?<h1>(.*?) 短评</h1>', re.S)
global movie_name
movie_name = re.findall(pattern, res.text)[0]  # findall returns a list

5. Storing the data

def main(ID, pages):
    global movie_name
    for i in tqdm(range(0, pages)):  # Douban only exposes about 500 comments
        get_content(ID, i)  # first argument: the movie's Douban ID; second: number of comment pages to crawl
        time.sleep(round(random.uniform(3, 5), 2))
    infos = {'name': name_list, 'city': city_list, 'content': content_list, 'score': score_list, 'date': date_list}
    data = pd.DataFrame(infos, columns=['name', 'city', 'content', 'score', 'date'])
    data.to_csv(movie_name + ".csv")  # saved as <movie name>.csv

(3) Data analysis and visualization

1. Import the libraries and load the supported-city list

# Data analysis and visualization
import os
import pandas as pd
from pandas import DataFrame
import re
from pyecharts import Line, Geo, Bar, Pie, Page, ThemeRiver
from snownlp import SnowNLP
import jieba
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

with open('pyecharts_citys_supported.txt', 'r', encoding='utf-8') as f:
    fth = f.read()  # list of cities pyecharts supports

  Filtering the city strings down to Chinese characters:

# Keep only the Chinese characters in a string
def translate(text):
    line = text.strip()
    p2 = re.compile('[^\u4e00-\u9fa5]')   # the CJK range for Chinese is \u4e00 to \u9fa5
    zh = " ".join(p2.split(line)).strip()
    zh = ",".join(zh.split())
    return re.sub("[A-Za-z0-9!!,%\[\],。]", "", zh)

  Reading the CSV by column and tallying the sentiment scores:

# Read the CSV by column and count each rounded sentiment score
def count_sentiment(csv_file):
    path = os.path.abspath(os.curdir)
    csv_file = path + "\\" + csv_file + ".csv"
    csv_file = csv_file.replace('\\', '\\\\')
    d = pd.read_csv(csv_file, engine='python', encoding='utf-8')
    motion_list = []
    for i in d['content']:
        try:
            s = round(SnowNLP(i).sentiments, 2)
            motion_list.append(s)
        except TypeError:
            continue
    result = {}
    for i in set(motion_list):
        result[i] = motion_list.count(i)
    return result

2. Sentiment analysis with snownlp

  snownlp covers Chinese word segmentation (a character-based generative model), part-of-speech tagging (TnT with a 3-gram HMM, per the docs), sentiment analysis (the docs outline the approach; accuracy is highest on shopping reviews because the built-in corpus is mostly e-commerce, but you can build and swap in a corpus for your own domain and get quite good accuracy), text classification (naive Bayes), pinyin conversion, traditional-to-simplified conversion, keyword extraction (TextRank), summarization (TextRank), sentence splitting, and text similarity (BM25) [summarized from CSDN]. Before starting, it is worth skimming the official page for the basic commands: https://pypi.org/project/snownlp/

  Note that snownlp works on unicode text, so make sure the data is decoded consistently, and strip out the English mixed into the Chinese before scoring. Since we call snownlp with its stock corpus, which targets shopping reviews, the sentiment scores for movie reviews are only approximate; training on a domain corpus would improve them.
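  A quick sanity check of the basic snownlp calls (the example sentence is my own):

from snownlp import SnowNLP

s = SnowNLP(u'这部电影的特效非常震撼,剧情也很感人')
print(s.words)        # word segmentation
print(s.sentiments)   # sentiment score in [0, 1]; closer to 1 means more positive
print(s.keywords(3))  # top-3 keywords, extracted with TextRank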

def draw_sentiment_pic(csv_file):
    attr, val = [], []
    info = count_sentiment(csv_file)
    info = sorted(info.items(), key=lambda x: x[0], reverse=False)  # sort the dict items by sentiment score
    for each in info[:-1]:
        attr.append(each[0])
        val.append(each[1])
    line = Line(csv_file + ":影评情感分析")
    line.add("", attr, val, is_smooth=True, is_more_utils=True)
    line.render(csv_file + "_情感分析曲线图.html")

3. Analysis of where reviews come from

  pyecharts' Page collects several charts into a single object; you just add each chart component to it and render the page once.
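  A minimal Page example (a sketch using the same pyecharts 0.5.x calls as the script below; the data is made up):

from pyecharts import Bar, Page

page = Page("demo")
bar = Bar("示例")
bar.add("", ["a", "b", "c"], [1, 2, 3])
page.add_chart(bar)
page.render("page_demo.html")  # every chart added to the page lands in one HTML file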

def draw_citys_pic(csv_file):
    page = Page(csv_file + ":评论城市分析")
    info = count_city(csv_file)
    geo = Geo("", "小本聪原创", title_pos="center", width=1200, height=600, background_color='#404a59', title_color="#fff")
    while True:   # second pass: match against the cities pyecharts supports; on error, drop that city's count
        try:
            attr, val = geo.cast(info)
            geo.add("", attr, val, visual_range=[0, 300], visual_text_color="#fff", is_geo_effect_show=False,
                    is_piecewise=True, visual_split_number=6, symbol_size=15, is_visualmap=True)
        except ValueError as e:
            e = str(e)
            e = e.split("No coordinate is specified for ")[1]  # extract the unsupported city name
            info.pop(e)
        else:
            break
    info = sorted(info.items(), key=lambda x: x[1], reverse=False)  # sort the list of pairs by count
    print(info)
    info = dict(info)   # list back to dict
    print(info)
    attr, val = [], []
    for key in info:
        attr.append(key)
        val.append(info[key])

    geo1 = Geo("", "评论城市分布", title_pos="center", width=1200, height=600,
               background_color='#404a59', title_color="#fff")
    geo1.add("", attr, val, visual_range=[0, 300], visual_text_color="#fff", is_geo_effect_show=False,
             is_piecewise=True, visual_split_number=10, symbol_size=15, is_visualmap=True, is_more_utils=True)
    # geo1.render(csv_file + "_城市dotmap.html")
    page.add_chart(geo1)
    geo2 = Geo("", "评论来源热力图", title_pos="center", width=1200, height=600, background_color='#404a59', title_color="#fff",)
    geo2.add("", attr, val, type="heatmap", is_visualmap=True, visual_range=[0, 50], visual_text_color='#fff', is_more_utils=True)
    # geo2.render(csv_file + "_城市heatmap.html")
    page.add_chart(geo2)
    bar = Bar("", "评论来源排行", title_pos="center", width=1200, height=600)
    bar.add("", attr, val, is_visualmap=True, visual_range=[0, 100], visual_text_color='#fff', mark_point=["average"], mark_line=["average"],
            is_more_utils=True, is_label_show=True, is_datazoom_show=True, xaxis_rotate=45)
    bar.render(csv_file + "_城市评论bar.html")
    page.add_chart(bar)
    pie = Pie("", "评论来源饼图", title_pos="right", width=1200, height=600)
    pie.add("", attr, val, radius=[20, 50], label_text_color=None, is_label_show=True, legend_orient='vertical', is_more_utils=True, legend_pos='left')
    pie.render(csv_file + "_城市评论Pie.html")
    page.add_chart(pie)
    page.render(csv_file + "_城市评论分析汇总.html")

4. Rating trend analysis

  • Read the CSV and keep it as a dataframe (df)

  • Iterate over the df rows and save them to a list

  • Count the occurrences of each identical (date, rating) pair

  • Convert back to a df and set the column names

  • Sort by date

  • Not every date has all five rating levels, so fill in any missing (date, rating) pair with zero votes so that each date carries all five (a more compact pandas alternative is sketched after the code below)

def score_draw(csv_file):
    page = Page(csv_file + ":评论等级分析")
    score, date, val, score_list = [], [], [], []
    result = {}
    path = os.path.abspath(os.curdir)
    csv_file = path + "\\" + csv_file + ".csv"
    csv_file = csv_file.replace('\\', '\\\\')
    d = pd.read_csv(csv_file, engine='python', encoding='utf-8')[['score', 'date']].dropna()  # load the CSV as a dataframe, dropping records with no rating
    for indexs in d.index:  # one way to iterate df rows (a second way, iterrows, appears below)
        score_list.append(tuple(d.loc[indexs].values[:]))  # convert each row to a tuple so identical (score, date) pairs can be counted
    print("Total number of valid ratings:", len(score_list))
    for i in set(list(score_list)):
        result[i] = score_list.count(i)  # dict of (score, date) -> count
    info = []
    for key in result:
        score = key[0]
        date = key[1]
        val = result[key]
        info.append([score, date, val])
    info_new = DataFrame(info)  # turn the counts into a dataframe
    info_new.columns = ['score', 'date', 'votes']
    info_new.sort_values('date', inplace=True)    # sort ascending by date so the earliest and latest dates are easy to find for the fill-in step
    print("first df", info_new)
    # The code below fills in the gaps: each date should carry all 5 rating levels,
    # so check each level per date and insert a zero-vote row into a new df when one is missing
    mark = 0
    creat_df = pd.DataFrame(columns=['score', 'date', 'votes'])  # empty dataframe for the missing rows
    for i in list(info_new['date']):
        location = info_new[(info_new.date == i) & (info_new.score == "力荐")].index.tolist()
        if location == []:
            creat_df.loc[mark] = ["力荐", i, 0]
            mark += 1
        location = info_new[(info_new.date == i) & (info_new.score == "推荐")].index.tolist()
        if location == []:
            creat_df.loc[mark] = ["推荐", i, 0]
            mark += 1
        location = info_new[(info_new.date == i) & (info_new.score == "还行")].index.tolist()
        if location == []:
            creat_df.loc[mark] = ["还行", i, 0]
            mark += 1
        location = info_new[(info_new.date == i) & (info_new.score == "较差")].index.tolist()
        if location == []:
            creat_df.loc[mark] = ["较差", i, 0]
            mark += 1
        location = info_new[(info_new.date == i) & (info_new.score == "很差")].index.tolist()
        if location == []:
            creat_df.loc[mark] = ["很差", i, 0]
            mark += 1
    info_new = info_new.append(creat_df.drop_duplicates(), ignore_index=True)
    score_list = []
    info_new.sort_values('date', inplace=True)    # re-sort by date after the insertions
    print(info_new)
    for index, row in info_new.iterrows():   # the second way to iterate df rows
        score_list.append([row['date'], row['votes'], row['score']])
    tr = ThemeRiver()
    tr.add(['力荐', '推荐', '还行', '较差', '很差'], score_list, is_label_show=True, is_more_utils=True)
    page.add_chart(tr)

    attr, v1, v2, v3, v4, v5 = [], [], [], [], [], []
    attr = list(sorted(set(info_new['date'])))
    bar = Bar()
    for i in attr:
        v1.append(int(info_new[(info_new['date'] == i) & (info_new['score'] == "力荐")]['votes']))
        v2.append(int(info_new[(info_new['date'] == i) & (info_new['score'] == "推荐")]['votes']))
        v3.append(int(info_new[(info_new['date'] == i) & (info_new['score'] == "还行")]['votes']))
        v4.append(int(info_new[(info_new['date'] == i) & (info_new['score'] == "较差")]['votes']))
        v5.append(int(info_new[(info_new['date'] == i) & (info_new['score'] == "很差")]['votes']))
    bar.add("力荐", attr, v1, is_stack=True)
    bar.add("推荐", attr, v2, is_stack=True)
    bar.add("还行", attr, v3, is_stack=True)
    bar.add("较差", attr, v4, is_stack=True)
    bar.add("很差", attr, v5, is_stack=True, is_convert=True, mark_line=["average"], is_more_utils=True)
    page.add_chart(bar)

    line = Line()
    line.add("力荐", attr, v1, is_stack=True)
    line.add("推荐", attr, v2, is_stack=True)
    line.add("还行", attr, v3, is_stack=True)
    line.add("较差", attr, v4, is_stack=True)
    line.add("很差", attr, v5, is_stack=True, is_convert=False, mark_line=["average"], is_more_utils=True)
    page.add_chart(line)
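  The fill-in loop above is verbose; for reference, the same count-and-fill can be written more compactly with pandas (my own sketch, assuming d is the two-column dataframe loaded at the top of score_draw):

levels = ['力荐', '推荐', '还行', '较差', '很差']
counts = (d.groupby(['date', 'score']).size()       # votes per (date, score) pair
           .unstack(fill_value=0)                   # dates x rating-levels matrix
           .reindex(columns=levels, fill_value=0)   # force all five levels to exist
           .stack()                                 # back to long format
           .reset_index(name='votes'))              # columns: date, score, votes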

  Because of the crawl limit and time constraints, parts of the data are noisy, but some patterns still show. The first week after release is the comment peak, which is expected; there may also be bias, since the crawler fetches comments in Douban's default ranking order, and a larger sample would likely sit closer to the true distribution.

  There are also some reviews dated before the official release, presumably from small preview screenings. Those early ratings are fairly uniform and close to the film's eventual overall score. The people who got to see the film early were likely serious film fans or industry professionals, so their reviews carry real reference value.

5. Review word cloud

  For the word cloud, read the CSV into a dataframe, strip the non-Chinese characters out of the comments, use a photo of Hu Ge as the mask image, and load a stopword list.
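  The word_cloud implementation (it also appears in the complete listing at the end):

def word_cloud(csv_file, stopwords_path, pic_path):
    pic_name = csv_file + "_词云图.png"
    path = os.path.abspath(os.curdir)
    csv_file = path + "\\" + csv_file + ".csv"
    csv_file = csv_file.replace('\\', '\\\\')
    d = pd.read_csv(csv_file, engine='python', encoding='utf-8')
    content = []
    for i in d['content']:
        try:
            i = translate(i)  # keep only the Chinese characters
        except AttributeError:
            continue
        else:
            content.append(i)
    comment_after_split = jieba.cut(str(content), cut_all=False)
    wl_space_split = " ".join(comment_after_split)
    backgroud_Image = plt.imread(pic_path)  # mask image that shapes the cloud
    stopwords = STOPWORDS.copy()
    with open(stopwords_path, 'r', encoding='utf-8') as f:
        for i in f.readlines():
            stopwords.add(i.strip('\n'))

    wc = WordCloud(width=1024, height=768, background_color='white',
                   mask=backgroud_Image, font_path=r"C:\simhei.ttf",
                   stopwords=stopwords, max_font_size=400,
                   random_state=50)
    wc.generate_from_text(wl_space_split)
    img_colors = ImageColorGenerator(backgroud_Image)
    wc.recolor(color_func=img_colors)  # recolor the words from the mask image
    plt.imshow(wc)
    plt.axis('off')
    plt.show()
    wc.to_file(pic_name)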

Finally, the analysis entry point ties the four steps together:

def main(csv_file, stopwords_path, pic_path):
    draw_sentiment_pic(csv_file)
    draw_citys_pic(csv_file)
    score_draw(csv_file)
    word_cloud(csv_file, stopwords_path, pic_path)


if __name__ == '__main__':
    main("流浪地球", "stopwords.txt", "胡歌.jpg")

IV. Summary

  Judging from the high-frequency words in the cloud, "特效" (special effects) shows how much the effects shots carry a sci-fi production, and "科幻电影" (sci-fi film) reflects viewers' interest in the genre; overall the cloud reads as recognition of the film.

  That concludes the process of scraping and analyzing the Douban short reviews of The Wandering Earth.

Complete code:

# Scrape the short reviews of the movie "The Wandering Earth"
import requests
from lxml import etree
from tqdm import tqdm
import time
import random
import pandas as pd
import re

name_list, content_list, date_list, score_list, city_list = [], [], [], [], []
movie_name = ""

def get_city(url, i):
    time.sleep(round(random.uniform(2, 3), 2))  # random delay to avoid getting blocked
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'}
    cookies = {'cookie': 'bid=Ge7txCUP3v4; ll="108303"; _vwo_uuid_v2=DB48689393ACB497681C7C540C832B546|f3d53bcb0314c9a34c861e9c724fcdec; ap_v=0,6.0; dbcl2="159607750:sijMjNWV7ek"; ck=kgmP; push_doumail_num=0; push_noty_num=0; _pk_ref.100001.8cb4=%5B%22%22%2C%22%22%2C1549433417%2C%22https%3A%2F%2Fmovie.douban.com%2Fsubject%2F26266893%2Fcomments%3Fsort%3Dnew_score%26status%3DP%22%5D; _pk_ses.100001.8cb4=*; __lnkrntdmcvrd=-1; __yadk_uid=KqejvPo3L0HIkc2Zx7UXOJF6Vt9PpoJU; _pk_id.100001.8cb4=91514e1ada30bfa5.1549433417.1.1549433694.1549433417'}  # updated 2018-07-25
    res = requests.get(url, cookies=cookies, headers=headers)
    if res.status_code == 200:
        print("\nFetched city info for user {}!".format(i))
    else:
        print("\nFailed to fetch city info for user {}".format(i))
    pattern = re.compile('<div class="user-info">.*?<a href=".*?">(.*?)</a>', re.S)
    item = re.findall(pattern, res.text)  # findall returns a list
    return item[0]  # only one element, so return it directly

def get_content(id, page):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'}
    cookies = {'cookie': '  paste your own cookies here, otherwise the crawl will not work  '}
    # 20 comments per page, so start must advance by 20 (the original used page * 10, which overlapped pages)
    url = "https://movie.douban.com/subject/" + str(id) + "/comments?start=" + str(page * 20) + "&limit=20&sort=new_score&status=P"
    res = requests.get(url, headers=headers, cookies=cookies)
    res.encoding = "utf-8"
    pattern = re.compile('<div id="wrapper">.*?<div id="content">.*?<h1>(.*?) 短评</h1>', re.S)
    global movie_name
    movie_name = re.findall(pattern, res.text)[0]  # findall returns a list
    if res.status_code == 200:
        print("\nPage {} of short reviews fetched!".format(page + 1))
        print(url)
    else:
        print("\nFailed to fetch page {}!".format(page + 1))

    with open('html.html', 'w', encoding='utf-8') as f:  # dump the raw page for debugging
        f.write(res.text)
    x = etree.HTML(res.text)
    for i in range(1, 21):   # 20 reviewers per page
        name = x.xpath('//*[@id="comments"]/div[{}]/div[2]/h3/span[2]/a/text()'.format(i))
        # Pitfall: if a user commented without rating, the score position holds the date,
        # and the date position span[3] is empty
        score = x.xpath('//*[@id="comments"]/div[{}]/div[2]/h3/span[2]/span[2]/@title'.format(i))
        date = x.xpath('//*[@id="comments"]/div[{}]/div[2]/h3/span[2]/span[3]/@title'.format(i))
        m = r'\d{4}-\d{2}-\d{2}'
        try:
            match = re.compile(m).match(score[0])
        except IndexError:
            break
        if match is not None:  # the "score" is actually a date: no rating was given
            date = score
            score = ["null"]
        content = x.xpath('//*[@id="comments"]/div[{}]/div[2]/p/span/text()'.format(i))
        id = x.xpath('//*[@id="comments"]/div[{}]/div[2]/h3/span[2]/a/@href'.format(i))
        try:
            city = get_city(id[0], i)  # fetch the city from the reviewer's profile page
        except IndexError:
            city = " "
        name_list.append(str(name[0]))
        score_list.append(str(score[0]).strip('[]\''))  # some users commented without rating
        date_list.append(str(date[0]).strip('[\'').split(' ')[0])
        content_list.append(str(content[0]).strip())
        city_list.append(city)

def main(ID, pages):
    global movie_name
    for i in tqdm(range(0, pages)):  # Douban only exposes about 500 comments
        get_content(ID, i)  # first argument: the movie's Douban ID; second: number of comment pages to crawl
        time.sleep(round(random.uniform(3, 5), 2))
    infos = {'name': name_list, 'city': city_list, 'content': content_list, 'score': score_list, 'date': date_list}
    data = pd.DataFrame(infos, columns=['name', 'city', 'content', 'score', 'date'])
    data.to_csv(movie_name + ".csv")  # saved as <movie name>.csv

if __name__ == '__main__':
    main(26266893, 25)  # Douban movie ID + number of comment pages to crawl

# ---------- Part 2: data analysis and visualization ----------
# (run as a separate script from the crawler above; note both parts define main())
import os
import pandas as pd
from pandas import DataFrame
import re
from pyecharts import Line, Geo, Bar, Pie, Page, ThemeRiver
from snownlp import SnowNLP
import jieba
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

with open('pyecharts_citys_supported.txt', 'r', encoding='utf-8') as f:
    fth = f.read()  # list of cities pyecharts supports

# Keep only the Chinese characters in a string
def translate(text):
    line = text.strip()
    p2 = re.compile('[^\u4e00-\u9fa5]')   # the CJK range for Chinese is \u4e00 to \u9fa5
    zh = " ".join(p2.split(line)).strip()
    zh = ",".join(zh.split())
    return re.sub("[A-Za-z0-9!!,%\[\],。]", "", zh)

# Read the CSV by column and count each rounded sentiment score
def count_sentiment(csv_file):
    path = os.path.abspath(os.curdir)
    csv_file = path + "\\" + csv_file + ".csv"
    csv_file = csv_file.replace('\\', '\\\\')
    d = pd.read_csv(csv_file, engine='python', encoding='utf-8')
    motion_list = []
    for i in d['content']:
        try:
            s = round(SnowNLP(i).sentiments, 2)
            motion_list.append(s)
        except TypeError:
            continue
    result = {}
    for i in set(motion_list):
        result[i] = motion_list.count(i)
    return result

def draw_sentiment_pic(csv_file):
    attr, val = [], []
    info = count_sentiment(csv_file)
    info = sorted(info.items(), key=lambda x: x[0], reverse=False)  # sort the dict items by sentiment score
    for each in info[:-1]:
        attr.append(each[0])
        val.append(each[1])
    line = Line(csv_file + ":影评情感分析")
    line.add("", attr, val, is_smooth=True, is_more_utils=True)
    line.render(csv_file + "_情感分析曲线图.html")

def word_cloud(csv_file, stopwords_path, pic_path):
    pic_name = csv_file + "_词云图.png"
    path = os.path.abspath(os.curdir)
    csv_file = path + "\\" + csv_file + ".csv"
    csv_file = csv_file.replace('\\', '\\\\')
    d = pd.read_csv(csv_file, engine='python', encoding='utf-8')
    content = []
    for i in d['content']:
        try:
            i = translate(i)  # keep only the Chinese characters
        except AttributeError:
            continue
        else:
            content.append(i)
    comment_after_split = jieba.cut(str(content), cut_all=False)
    wl_space_split = " ".join(comment_after_split)
    backgroud_Image = plt.imread(pic_path)  # mask image that shapes the cloud
    stopwords = STOPWORDS.copy()
    with open(stopwords_path, 'r', encoding='utf-8') as f:
        for i in f.readlines():
            stopwords.add(i.strip('\n'))

    wc = WordCloud(width=1024, height=768, background_color='white',
                   mask=backgroud_Image, font_path=r"C:\simhei.ttf",
                   stopwords=stopwords, max_font_size=400,
                   random_state=50)
    wc.generate_from_text(wl_space_split)
    img_colors = ImageColorGenerator(backgroud_Image)
    wc.recolor(color_func=img_colors)  # recolor the words from the mask image
    plt.imshow(wc)
    plt.axis('off')
    plt.show()
    wc.to_file(pic_name)

def count_city(csv_file):
    path = os.path.abspath(os.curdir)
    csv_file = path + "\\" + csv_file + ".csv"
    csv_file = csv_file.replace('\\', '\\\\')

    d = pd.read_csv(csv_file, engine='python', encoding='utf-8')
    city = [translate(n) for n in d['city'].dropna()]  # clean the city field: keep the Chinese and drop punctuation

    # Province names (collected from the web), turned into a list
    province = '湖南,湖北,广东,广西、河南、河北、山东、山西,江苏、浙江、江西、黑龙江、新疆,云南、贵州、福建、吉林、安徽,四川、西藏、宁夏、辽宁、青海、甘肃、陕西,内蒙古、台湾,海南'
    province = province.replace('、', ',').split(',')
    rep_province = "|".join(province)  # alternation pattern used by re.sub to strip province names

    All_city = jieba.cut("".join(city))  # segment so province and city names separate; pairs like 吉林长春 do not always split cleanly, hence the re.sub below
    final_city = []
    for a_city in All_city:
        a_city_sub = re.sub(rep_province, "", a_city)  # remove any province name from the token, leaving "" if it was only a province
        if a_city_sub == "":  # the token was only a province name: skip
            continue
        elif a_city_sub in fth:  # provinces are gone, so membership in the supported-city list is enough
            final_city.append(a_city_sub)
        else:  # city not supported by pyecharts: skip
            continue

    result = {}
    print("Total number of cities:", len(final_city))
    for i in set(final_city):
        result[i] = final_city.count(i)
    return result

def draw_citys_pic(csv_file):
    page = Page(csv_file + ":评论城市分析")
    info = count_city(csv_file)
    geo = Geo("", "小本聪原创", title_pos="center", width=1200, height=600, background_color='#404a59', title_color="#fff")
    while True:   # second pass: match against the cities pyecharts supports; on error, drop that city's count
        try:
            attr, val = geo.cast(info)
            geo.add("", attr, val, visual_range=[0, 300], visual_text_color="#fff", is_geo_effect_show=False,
                    is_piecewise=True, visual_split_number=6, symbol_size=15, is_visualmap=True)
        except ValueError as e:
            e = str(e)
            e = e.split("No coordinate is specified for ")[1]  # extract the unsupported city name
            info.pop(e)
        else:
            break
    info = sorted(info.items(), key=lambda x: x[1], reverse=False)  # sort the list of pairs by count
    print(info)
    info = dict(info)   # list back to dict
    print(info)
    attr, val = [], []
    for key in info:
        attr.append(key)
        val.append(info[key])

    geo1 = Geo("", "评论城市分布", title_pos="center", width=1200, height=600,
               background_color='#404a59', title_color="#fff")
    geo1.add("", attr, val, visual_range=[0, 300], visual_text_color="#fff", is_geo_effect_show=False,
             is_piecewise=True, visual_split_number=10, symbol_size=15, is_visualmap=True, is_more_utils=True)
    # geo1.render(csv_file + "_城市dotmap.html")
    page.add_chart(geo1)
    geo2 = Geo("", "评论来源热力图", title_pos="center", width=1200, height=600, background_color='#404a59', title_color="#fff",)
    geo2.add("", attr, val, type="heatmap", is_visualmap=True, visual_range=[0, 50], visual_text_color='#fff', is_more_utils=True)
    # geo2.render(csv_file + "_城市heatmap.html")
    page.add_chart(geo2)
    bar = Bar("", "评论来源排行", title_pos="center", width=1200, height=600)
    bar.add("", attr, val, is_visualmap=True, visual_range=[0, 100], visual_text_color='#fff', mark_point=["average"], mark_line=["average"],
            is_more_utils=True, is_label_show=True, is_datazoom_show=True, xaxis_rotate=45)
    bar.render(csv_file + "_城市评论bar.html")
    page.add_chart(bar)
    pie = Pie("", "评论来源饼图", title_pos="right", width=1200, height=600)
    pie.add("", attr, val, radius=[20, 50], label_text_color=None, is_label_show=True, legend_orient='vertical', is_more_utils=True, legend_pos='left')
    pie.render(csv_file + "_城市评论Pie.html")
    page.add_chart(pie)
    page.render(csv_file + "_城市评论分析汇总.html")

def score_draw(csv_file):
    page = Page(csv_file + ":评论等级分析")
    score, date, val, score_list = [], [], [], []
    result = {}
    path = os.path.abspath(os.curdir)
    csv_file = path + "\\" + csv_file + ".csv"
    csv_file = csv_file.replace('\\', '\\\\')
    d = pd.read_csv(csv_file, engine='python', encoding='utf-8')[['score', 'date']].dropna()  # load the CSV as a dataframe, dropping records with no rating
    for indexs in d.index:  # one way to iterate df rows (a second way, iterrows, appears below)
        score_list.append(tuple(d.loc[indexs].values[:]))  # convert each row to a tuple so identical (score, date) pairs can be counted
    print("Total number of valid ratings:", len(score_list))
    for i in set(list(score_list)):
        result[i] = score_list.count(i)  # dict of (score, date) -> count
    info = []
    for key in result:
        score = key[0]
        date = key[1]
        val = result[key]
        info.append([score, date, val])
    info_new = DataFrame(info)  # turn the counts into a dataframe
    info_new.columns = ['score', 'date', 'votes']
    info_new.sort_values('date', inplace=True)    # sort ascending by date so the earliest and latest dates are easy to find for the fill-in step
    print("first df", info_new)
    # The code below fills in the gaps: each date should carry all 5 rating levels,
    # so check each level per date and insert a zero-vote row into a new df when one is missing
    mark = 0
    creat_df = pd.DataFrame(columns=['score', 'date', 'votes'])  # empty dataframe for the missing rows
    for i in list(info_new['date']):
        location = info_new[(info_new.date == i) & (info_new.score == "力荐")].index.tolist()
        if location == []:
            creat_df.loc[mark] = ["力荐", i, 0]
            mark += 1
        location = info_new[(info_new.date == i) & (info_new.score == "推荐")].index.tolist()
        if location == []:
            creat_df.loc[mark] = ["推荐", i, 0]
            mark += 1
        location = info_new[(info_new.date == i) & (info_new.score == "还行")].index.tolist()
        if location == []:
            creat_df.loc[mark] = ["还行", i, 0]
            mark += 1
        location = info_new[(info_new.date == i) & (info_new.score == "较差")].index.tolist()
        if location == []:
            creat_df.loc[mark] = ["较差", i, 0]
            mark += 1
        location = info_new[(info_new.date == i) & (info_new.score == "很差")].index.tolist()
        if location == []:
            creat_df.loc[mark] = ["很差", i, 0]
            mark += 1
    info_new = info_new.append(creat_df.drop_duplicates(), ignore_index=True)
    score_list = []
    info_new.sort_values('date', inplace=True)    # re-sort by date after the insertions
    print(info_new)
    for index, row in info_new.iterrows():   # the second way to iterate df rows
        score_list.append([row['date'], row['votes'], row['score']])
    tr = ThemeRiver()
    tr.add(['力荐', '推荐', '还行', '较差', '很差'], score_list, is_label_show=True, is_more_utils=True)
    page.add_chart(tr)

    attr, v1, v2, v3, v4, v5 = [], [], [], [], [], []
    attr = list(sorted(set(info_new['date'])))
    bar = Bar()
    for i in attr:
        v1.append(int(info_new[(info_new['date'] == i) & (info_new['score'] == "力荐")]['votes']))
        v2.append(int(info_new[(info_new['date'] == i) & (info_new['score'] == "推荐")]['votes']))
        v3.append(int(info_new[(info_new['date'] == i) & (info_new['score'] == "还行")]['votes']))
        v4.append(int(info_new[(info_new['date'] == i) & (info_new['score'] == "较差")]['votes']))
        v5.append(int(info_new[(info_new['date'] == i) & (info_new['score'] == "很差")]['votes']))
    bar.add("力荐", attr, v1, is_stack=True)
    bar.add("推荐", attr, v2, is_stack=True)
    bar.add("还行", attr, v3, is_stack=True)
    bar.add("较差", attr, v4, is_stack=True)
    bar.add("很差", attr, v5, is_stack=True, is_convert=True, mark_line=["average"], is_more_utils=True)
    page.add_chart(bar)

    line = Line()
    line.add("力荐", attr, v1, is_stack=True)
    line.add("推荐", attr, v2, is_stack=True)
    line.add("还行", attr, v3, is_stack=True)
    line.add("较差", attr, v4, is_stack=True)
    line.add("很差", attr, v5, is_stack=True, is_convert=False, mark_line=["average"], is_more_utils=True)
    page.add_chart(line)

    page.render(csv_file[:-4] + "_日投票量分析汇总.html")

def main(csv_file, stopwords_path, pic_path):
    draw_sentiment_pic(csv_file)
    draw_citys_pic(csv_file)
    score_draw(csv_file)
    word_cloud(csv_file, stopwords_path, pic_path)

if __name__ == '__main__':
    main("流浪地球", "stopwords.txt", "胡歌.jpg")

 
