一:谭松波酒店评价语料我本地已有(下载链接见下方),读者也可以自行爬取微博数据,或者使用其他数据集。
链接:https://pan.baidu.com/s/1TrumHVMk-Kc4PJz8INMYbg
提取码:xsa7
其中包含积极评价、消极评价两个语料文件,以及一个停用词表文件。
二:对评价进行情感分析。本文所做的工作是将语料划分为积极和消极两种情感,目前使用的是 SVM 模型进行训练,但该模型的训练效果较差,后期会改用其他模型。
最终总体准确率约为76%,
其中正向评价的准确率在95%左右,
负面评价的准确率在55%左右(由于使用的数据集是不对称数据集,负面语料较少)。
# Sentiment classification of hotel reviews (positive vs. negative).
#
# Pipeline: tokenize each review with jieba -> drop stop words -> sum the
# Word2Vec vectors of the remaining tokens into one sentence vector ->
# train an RBF-kernel SVC -> report accuracy on the pos/neg files.
import glob
import os.path

import jieba
import joblib
import numpy as np
import pandas as pd
from gensim.models.word2vec import Word2Vec
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC

# Dimensionality of the word embeddings (and therefore of sentence vectors).
VECTOR_SIZE = 300

# Running evaluation counters: documents scored so far / predicted correctly.
tol_num = 0
right = 0

# Remember to change these three paths to wherever your data files live.
all_pos_files = glob.glob(os.path.join("pos.txt"))
all_neg_files = glob.glob(os.path.join("neg.txt"))
stopLists_path = "stoplist.txt"

# Load the stop-word list, one word per line; blank lines are ignored.
stopwords = []
with open(stopLists_path, 'r', encoding='utf-8') as f_stop:
    for line in f_stop:
        stop_word = line.strip()
        if stop_word:
            stopwords.append(stop_word)
# Hashed copy of the list so each per-token membership test is O(1).
stopword_set = frozenset(stopwords)


def split_stopwords(words, stoplist):
    """Return the stripped tokens of *words* that are not in *stoplist*.

    Args:
        words: iterable of token strings.
        stoplist: container of stop words; anything supporting ``in`` works.

    Returns:
        A new list with every token stripped of surrounding whitespace and
        with tokens found in *stoplist* removed.
    """
    stripped = (word.strip() for word in words)
    return [word for word in stripped if word not in stoplist]


def _read_reviews(path):
    """Read *path* as one review per line into a single-column DataFrame.

    The text lives in column ``0``. Lines that are empty after stripping are
    skipped. The file is read directly instead of via
    ``pd.read_csv(sep='\\n')`` because current pandas rejects a newline
    separator, and CSV quoting rules would otherwise apply to review text.
    """
    with open(path, 'r', encoding='utf-8') as f_in:
        rows = [line.rstrip('\r\n') for line in f_in]
    return pd.DataFrame({0: [row for row in rows if row.strip()]})


# Read the corpora and pre-process them.
neg = _read_reviews('neg.txt')
pos = _read_reviews('pos.txt')
# Tokenize every review.
# NOTE(review): lstrip('-1 ') strips any leading run of the characters
# '-', '1' and ' ' (a character set, not the literal prefix "-1 "), so a
# review that genuinely starts with "1" loses it -- confirm against the data.
neg['words'] = neg[0].apply(lambda x: jieba.lcut(str(x).lstrip('-1 ')))
pos['words'] = pos[0].apply(lambda x: jieba.lcut(str(x).lstrip('-1 ')))

pos_true = [split_stopwords(words, stopword_set) for words in pos.words]
neg_true = [split_stopwords(words, stopword_set) for words in neg.words]

# Merge into one training set. Plain list concatenation is used because the
# token lists have different lengths and NumPy refuses to build an array
# from such ragged input.
x = pos_true + neg_true
# Labels: 1 = positive, 0 = negative.
y = np.concatenate((np.ones(len(pos_true)), np.zeros(len(neg_true))))

# Reuse a previously trained embedding if one is saved, otherwise train one.
if os.path.exists("word_embedding"):
    w2v = Word2Vec.load("word_embedding")
else:
    w2v = Word2Vec(vector_size=VECTOR_SIZE, min_count=10)
    w2v.build_vocab(x)
    w2v.train(x, total_examples=w2v.corpus_count, epochs=w2v.epochs)
    w2v.save("word_embedding")


def total_vec(words):
    """Sum the embedding vectors of *words* into one sentence vector.

    Args:
        words: iterable of token strings.

    Returns:
        A ``(1, VECTOR_SIZE)`` float array. Tokens missing from the
        Word2Vec vocabulary are skipped, so the result is all zeros when no
        token is known.
    """
    vec = np.zeros((1, VECTOR_SIZE))  # start from an all-zero accumulator
    for word in words:
        try:
            vec += w2v.wv[word].reshape((1, VECTOR_SIZE))
        except KeyError:
            # Out-of-vocabulary token (e.g. rarer than min_count): skip it.
            continue
    return vec


# One sentence vector per review.
train_vec = np.concatenate([total_vec(words) for words in x])


def predict(s, stopList):
    """Classify the text *s*, print the verdict and return the prediction.

    Args:
        s: raw text to classify.
        stopList: container of stop words removed before vectorizing.

    Returns:
        The array returned by ``model.predict`` for the single sentence
        vector; element 0 is 1 for positive, otherwise negative.
    """
    s_words = split_stopwords(jieba.lcut(s), stopList)
    s_words_vec = total_vec(s_words)
    result = model.predict(s_words_vec)
    if int(result[0]) == 1:
        print(s, '[积极]')
    else:
        print(s, '[消极]')
    return result


# SVM part: reuse a saved classifier if present, otherwise train and save.
if os.path.exists("SVC_model_Emotion.m"):
    # NOTE: joblib.load unpickles the file; only load model files you trust.
    model = joblib.load("SVC_model_Emotion.m")
else:
    model = SVC(kernel='rbf', verbose=True)
    model.fit(train_vec, y)
    joblib.dump(model, "SVC_model_Emotion.m")


def _evaluate(files, expected_label):
    """Classify every file in *files* and count hits against *expected_label*.

    Files that cannot be decoded as UTF-8 are skipped and not counted.

    Returns:
        A ``(correct, total)`` tuple.
    """
    correct = 0
    total = 0
    for file in files:
        try:
            # The context manager closes the handle even if decoding fails.
            with open(file, 'r', encoding='utf-8') as f_in:
                test = f_in.read().strip()
        except UnicodeDecodeError:
            continue
        if int(predict(test, stopword_set)[0]) == expected_label:
            correct += 1
        total += 1
    return correct, total


def _ratio(numerator, denominator):
    """Return numerator / denominator, or NaN when nothing was evaluated."""
    if denominator == 0:
        return float('nan')
    return numerator / denominator


pos_right, pos_tol = _evaluate(all_pos_files, 1)
neg_right, neg_tol = _evaluate(all_neg_files, 0)
right = pos_right + neg_right
tol_num = pos_tol + neg_tol

print("pos数据正确率为" + str(_ratio(pos_right, pos_tol)) + "总数据量为" + str(pos_tol) + "正确量为" + str(pos_right))
print("neg数据正确率为" + str(_ratio(neg_right, neg_tol)) + "总数据量为" + str(neg_tol) + "正确量为" + str(neg_right))
print("正确率为" + str(_ratio(right, tol_num)))
代码中评价文件的路径记得修改,否则会报错;另外记得先安装并导入所需的库(或者把评价文件和停用词文件直接放到和你的 py 文件同一个目录下,就不用改路径了)。
附一些运行成功的图片
::侵删
::侵删
::侵删