# Text Word Frequency Analysis
# 1. Prerequisite Libraries

This analysis uses the jieba library for Chinese word segmentation and the matplotlib library for plotting.
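Both can be installed with `pip install jieba matplotlib`. As a quick check that jieba is working, the minimal sketch below segments a short sample sentence (the sentence is an arbitrary example, not taken from test.txt):

```python
import jieba

# Segment an arbitrary sample sentence into a list of words.
print(jieba.lcut("我来到北京清华大学"))
# Prints something like: ['我', '来到', '北京', '清华大学']
```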
# 2. Word Frequency Analysis Code and Results
```python
import jieba

# Read the novel text and the stop-word list.
with open('C:/Users/asus/Desktop/test.txt', 'r', encoding='UTF-8') as novelFile:
    novel = novelFile.read()
stopwords = [line.strip() for line in open('C:/Users/asus/Desktop/stopword.txt', 'r', encoding='UTF-8').readlines()]

# Segment the text with jieba and build the word-frequency dictionary.
novelList = list(jieba.lcut(novel))
novelDict = {}
for word in novelList:
    if word not in stopwords:
        # Skip single-character words.
        if len(word) == 1:
            continue
        else:
            novelDict[word] = novelDict.get(word, 0) + 1

# Sort the (word, count) pairs by count in descending order.
novelListSorted = list(novelDict.items())
novelListSorted.sort(key=lambda e: e[1], reverse=True)

# Print the 10 most frequent words.
for topWordTup in novelListSorted[:10]:
    print(topWordTup)
```
The output is as follows:
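For comparison, the counting and sorting steps can be written more compactly with `collections.Counter` from the standard library. This is a minimal alternative sketch, not part of the original script; it reads the same two input files as above.

```python
from collections import Counter

import jieba

# Same input files as the script above.
with open('C:/Users/asus/Desktop/test.txt', 'r', encoding='UTF-8') as novelFile:
    novel = novelFile.read()
with open('C:/Users/asus/Desktop/stopword.txt', 'r', encoding='UTF-8') as stopFile:
    stopwords = set(line.strip() for line in stopFile)

# Keep words that are not stop words and are longer than one character.
words = [w for w in jieba.lcut(novel) if w not in stopwords and len(w) > 1]

# most_common() returns (word, count) pairs sorted by descending count.
for word, count in Counter(words).most_common(10):
    print(word, count)
```

Using a `set` for the stop words also makes the membership test faster than the list lookup in the original loop.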
The following code plots the word-frequency chart:
```python
from matplotlib import pyplot as plt

# Plot the counts of the ten most frequent words.
x = [c for c, v in novelListSorted]
y = [v for c, v in novelListSorted]
plt.plot(x[:10], y[:10], color='r')
plt.show()
```
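Since the top words are discrete categories, a bar chart is usually a better fit than a line plot. The sketch below is an optional alternative, not part of the original code; it assumes `novelListSorted` from the script above and sets SimHei (a Chinese font commonly available on Windows) so the Chinese word labels render correctly.

```python
from matplotlib import pyplot as plt

# Assumes novelListSorted: a list of (word, count) pairs sorted by count.
plt.rcParams['font.sans-serif'] = ['SimHei']  # Chinese-capable font; adjust if unavailable
plt.rcParams['axes.unicode_minus'] = False    # keep the minus sign rendering correctly

topWords = [w for w, _ in novelListSorted[:10]]
topCounts = [c for _, c in novelListSorted[:10]]

plt.bar(topWords, topCounts, color='r')
plt.xlabel('Word')
plt.ylabel('Frequency')
plt.title('Top 10 Word Frequencies')
plt.show()
```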
The article and stop-word files used are as follows:
test.txt
stopwords.txt