def _calculate_word_scores(self, phrase_list):
    """Score words by frequency and by how often they occur in multi-word phrases.

    Each word is normalized with ``stem`` before counting, so the returned
    keys are stems rather than the original surface forms.

    Args:
        phrase_list: Iterable of phrases, where each phrase is a sized,
            re-iterable sequence of word tokens (it is traversed twice).

    Returns:
        dict mapping each stem to ``frequency * average_multiplier``, where
        the multiplier of a phrase is the number of its tokens for which
        ``is_numeric`` is false, capped at 2.
    """
    word_freq = nltk.FreqDist()
    word_multiplier = nltk.FreqDist()
    for phrase in phrase_list:
        # Give a higher score if the word appears in multi-word candidates.
        # Count with a generator expression: on Python 3 ``filter`` returns
        # an iterator, so ``len(filter(...))`` raises TypeError.
        multi_word = min(2, sum(1 for word in phrase if not is_numeric(word)))
        for word in phrase:
            # Normalize by taking the stem (computed once per token).
            stemmed = stem(word)
            word_freq[stemmed] += 1
            word_multiplier[stemmed] += multi_word

    word_scores = {}
    for word, freq in word_freq.items():
        # Every stem in word_freq was counted at least once, so freq >= 1.
        average_multiplier = word_multiplier[word] / float(freq)
        word_scores[word] = freq * average_multiplier
    return word_scores
评论列表
文章目录