# Text Word Frequency Analysis

# 1. Prerequisite Libraries

jieba (for Chinese word segmentation); matplotlib is also used later to plot the results.
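If jieba is not installed yet, `pip install jieba` will fetch it. As a quick check that segmentation works, the minimal sketch below cuts a short sample sentence with `jieba.lcut`, the same call used in the analysis code in the next section (the sample sentence and its segmentation are only an illustration):

```python
import jieba

# lcut segments a string and returns a plain Python list of tokens
words = jieba.lcut("我来到北京清华大学")
print(words)  # e.g. ['我', '来到', '北京', '清华大学']
```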

# 2. Word Frequency Analysis Code and Results

import jieba

# Read the source text
with open('C:/Users/asus/Desktop/test.txt', 'r', encoding='UTF-8') as novelFile:
    novel = novelFile.read()

# Load the stopword list, one word per line
with open('C:/Users/asus/Desktop/stopword.txt', 'r', encoding='UTF-8') as stopwordFile:
    stopwords = [line.strip() for line in stopwordFile]

# Segment the text into a list of tokens
novelList = jieba.lcut(novel)
novelDict = {}

# Build the word frequency dictionary
for word in novelList:
    if word not in stopwords:
        # Skip single-character tokens
        if len(word) == 1:
            continue
        else:
            novelDict[word] = novelDict.get(word, 0) + 1

# Sort the (word, count) pairs by count, highest first
novelListSorted = list(novelDict.items())
novelListSorted.sort(key=lambda e: e[1], reverse=True)

# Print the 10 most frequent words
for topWordTup in novelListSorted[:10]:
    print(topWordTup)
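As an aside, the counting and sorting above can also be written with `collections.Counter` from the standard library; the sketch below is only an alternative that reuses the `novelList` and `stopwords` variables defined above, not a change to the method:

```python
from collections import Counter

# Count every token that is not a stopword and is longer than one character
wordCounts = Counter(
    word for word in novelList
    if word not in stopwords and len(word) > 1
)

# most_common(10) yields the same (word, count) pairs, sorted by count
for wordTup in wordCounts.most_common(10):
    print(wordTup)
```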

The output is as follows (each printed line is a (word, count) tuple):

(Figure: word frequency results)

The following code plots the word frequency chart:

from matplotlib import pyplot as plt

# Use a CJK-capable font so the Chinese words render on the x-axis (SimHei ships with Windows)
plt.rcParams['font.sans-serif'] = ['SimHei']
x = [c for c, v in novelListSorted]
y = [v for c, v in novelListSorted]
# Plot the 10 most frequent words against their counts
plt.plot(x[:10], y[:10], color='r')
plt.show()

(Figure: word frequency plot)
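Because the x values are discrete words rather than a continuous quantity, a bar chart is often easier to read than a connected line. The sketch below is an optional variant that reuses `x` and `y` from the plotting code above and only swaps the drawing call:

```python
from matplotlib import pyplot as plt

# Draw the top-10 frequencies as bars instead of a line
plt.bar(x[:10], y[:10], color='r')
plt.xlabel('word')
plt.ylabel('count')
plt.show()
```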

The article and the stopword list used are as follows:

test.txt
stopwords.txt