Java教程

jieba 分词-西游记

本文介绍如何使用 jieba 对《西游记》进行分词并统计高频词语，对大家解决编程问题具有一定的参考价值，需要的程序猿们随着小编来一起学习吧！
import jieba
 
def takeSecond(elem):
    """Return the item stored at index 1 of ``elem``.

    Serves as a sort key for ``(word, count)`` pairs so that ordering is
    driven by the count.
    """
    second = elem[1]
    return second
 
def main():
    """Print the most frequent multi-character words found in 西游记.txt.

    Steps:
      1. Read ``西游记.txt`` (UTF-8) and segment it with ``jieba.lcut``.
      2. Skip single-character tokens and fold known aliases of the main
         characters into one canonical name before counting.
      3. Remove every word listed (comma-separated) in ``excludes.txt``.
      4. Print up to 20 ``word  count`` rows, highest count first.

    Raises:
        OSError: if either input file cannot be opened or read.
    """
    # Alias -> canonical name. Every alias appears exactly once, so a plain
    # lookup is equivalent to testing the groups one after another.
    aliases = {
        "大圣": "孙悟空",
        "老孙": "孙悟空",
        "行者": "孙悟空",
        "孙大圣": "孙悟空",
        "孙行者": "孙悟空",
        "猴王": "孙悟空",
        "悟空": "孙悟空",
        "齐天大圣": "孙悟空",
        "猴子": "孙悟空",
        "师父": "唐僧",
        "三藏": "唐僧",
        "圣僧": "唐僧",
        "呆子": "猪八戒",
        "八戒": "猪八戒",
        "老猪": "猪八戒",
        "沙和尚": "沙僧",
        "妖精": "妖怪",
        "妖魔": "妖怪",
        "妖道": "妖怪",
        "佛祖": "如来",
        "三太子": "白马",
    }

    # ``with`` guarantees the file is closed even if reading fails.
    with open("西游记.txt", "r", encoding="utf-8") as file:
        text = file.read()

    counts = {}
    for word in jieba.lcut(text):
        if len(word) == 1:
            # Single characters (particles, punctuation) are not counted.
            continue
        rword = aliases.get(word, word)
        counts[rword] = counts.get(rword, 0) + 1

    with open("excludes.txt", "r", encoding="utf-8") as file:
        # Strip each entry so a trailing newline or a space after a comma
        # does not stop an exclusion from matching its word.
        excludes = [entry.strip() for entry in file.read().split(",")]

    for delWord in excludes:
        # Excluded words that never occurred are simply ignored.
        counts.pop(delWord, None)

    items = sorted(counts.items(), key=takeSecond, reverse=True)

    # Slicing tolerates texts that yield fewer than 20 distinct words.
    for keyWord, count in items[:20]:
        print("{0:<10}{1:>5}".format(keyWord, count))
# Run the word-frequency report. This is an unguarded top-level call, so it
# executes whenever the file is run as a script or imported as a module.
main()

 

 

 

这篇关于 jieba 分词（《西游记》）的文章就介绍到这儿，希望我们推荐的文章对大家有所帮助，也希望大家多多支持为之网！