def calc_frequencies(words, words_n=50, lang='german'):
    """Return the ``words_n`` most frequent tokens in ``words``.

    Tokens of length one or less and purely numeric tokens
    (``str.isnumeric``) are discarded; the remaining tokens are lower-cased
    before they are counted. No stop-word filtering or stemming is applied.

    Args:
        words: Iterable of string tokens.
        words_n: Maximum number of entries to return. ``None`` returns every
            counted token.
        lang: Currently unused. It named the language for a Snowball stemmer,
            but stemming was disabled because it made the results worse. The
            parameter is kept so existing callers keep working.

    Returns:
        A list of ``(token, count)`` tuples ordered from most to least
        frequent.
    """
    # Function-scope import so the block does not depend on the file's
    # import section. Counter provides the same most_common() that
    # nltk.FreqDist inherits from it.
    from collections import Counter

    # Filter and normalise in a single pass instead of materialising three
    # intermediate lists.
    counts = Counter(
        word.lower()
        for word in words
        if len(word) > 1 and not word.isnumeric()
    )
    return counts.most_common(words_n)
评论列表
文章目录