使用到的技术模块
# Collect the text for the word cloud: concatenate every movie
# description stored in the SQLite database built by the crawler.
conn = sqlite3.connect("movie.db")
try:
    cur = conn.cursor()
    # NOTE(review): 'desc' is an SQL keyword — quote it ("desc") if SQLite rejects it.
    sql = "select desc from movie"
    data = cur.execute(sql)
    # join() avoids the quadratic cost of repeated string concatenation
    text = "".join(item[0] for item in data)
    cur.close()
finally:
    # make sure the connection is released even if the query fails
    conn.close()
# Segment the collected text into words with jieba.
words = list(jieba.cut(text))
# NOTE: this name shadows the builtin 'str'; it is kept because later
# steps of the article reference it.
str = " ".join(words)
print("分词个数:", len(words))  # report the token count, not the character count
# Chinese stopwords: read the block list from a file.
stopwords = set()
# 'with' guarantees the file handle is closed (the original leaked it)
with open(r'static/txt/stopwords.txt', 'r', encoding="utf-8") as f:
    # strip the trailing newline / surrounding whitespace from each entry
    stopwords.update(line.strip() for line in f)
# Load the mask image; its white background defines the empty area.
img = Image.open(r'./static/assets/img/tree.jpg')
# WordCloud expects the mask as a numpy array
img_arr = np.array(img)
遮罩图片可以随意,注意背景为白色即可
# Configure the word-cloud generator.
# NOTE(review): 'str' below is the segmented text produced in the previous
# step; the name shadows the builtin 'str'.
wc = WordCloud(
    background_color="white",   # canvas color
    mask=img_arr,               # overall shape taken from the mask image
    font_path="simkai.ttf",     # a CJK-capable font (e.g. from C:\Windows\Fonts)
    stopwords=stopwords         # words to filter out
)
wc.generate_from_text(str)
# Render the word cloud and write it to disk.
fig = plt.figure(1)
plt.imshow(wc)
plt.axis("off")  # hide the axes around the image
# plt.show()     # uncomment to preview the word cloud on screen
# Save the picture to a file; raise the dpi for a crisper image
plt.savefig(r'./static/assets/img/word.jpg', dpi=450)
# Full listing: generate a word cloud from the movie descriptions
# scraped in case study 1 and stored in movie.db.
import jieba                      # Chinese word segmentation
import matplotlib.pyplot as plt   # plotting / visualization
from wordcloud import WordCloud   # word-cloud rendering
from PIL import Image             # image handling
import numpy as np                # array operations
import sqlite3                    # SQLite access

# --- Gather the text: concatenate all movie descriptions from the DB ---
conn = sqlite3.connect("movie.db")
try:
    cur = conn.cursor()
    # NOTE(review): 'desc' is an SQL keyword — quote it ("desc") if SQLite rejects it.
    sql = "select desc from movie"
    data = cur.execute(sql)
    # join() avoids the quadratic cost of repeated string concatenation
    text = "".join(item[0] for item in data)
    cur.close()
finally:
    conn.close()

# --- Segment the text with jieba ---
words = list(jieba.cut(text))
word_string = " ".join(words)     # renamed: do not shadow the builtin 'str'
print("分词个数:", len(words))     # report the token count, not the character count

# --- Load the Chinese stopword list; 'with' closes the file handle ---
stopwords = set()
with open(r'static/txt/stopwords.txt', 'r', encoding="utf-8") as f:
    stopwords.update(line.strip() for line in f)

# --- Load the mask image (any picture with a white background works) ---
img = Image.open(r'./static/assets/img/tree.jpg')
img_arr = np.array(img)           # WordCloud expects the mask as an ndarray

# --- Build the word cloud ---
wc = WordCloud(
    background_color="white",
    mask=img_arr,
    font_path="simkai.ttf",       # a CJK-capable font (e.g. from C:\Windows\Fonts)
    stopwords=stopwords           # words to filter out
)
wc.generate_from_text(word_string)

# --- Render and save the image ---
fig = plt.figure(1)
plt.imshow(wc)
plt.axis("off")                   # hide the axes around the image
# plt.show()                      # uncomment to preview on screen
plt.savefig(r'./static/assets/img/word.jpg', dpi=450)
本案例数据是案例1中通过网络爬虫获取到的数据
Python案例实操1-网络爬虫