def scoreFunction(wholetext, top_n=20):
    """Score how well a text matches each known language's stopwords.

    The text is split into tokens with ``word_tokenize`` and lowercased,
    its ``top_n`` most frequent tokens are selected, and each language
    earns one point for every one of those tokens that appears in its
    stopword list.

    Args:
        wholetext: The text to analyse, as a single string.
        top_n: How many of the most frequent tokens to compare against
            each stopword list. Defaults to 20 -- no specific reason,
            it just looks like a good number of words.

    Returns:
        A dictionary mapping each language name to its integer score
        (at most ``top_n``). Languages for which no stopword list was
        loaded always score 0.
    """
    # These are the available languages with stopwords from NLTK
    nltk_languages = ["dutch", "finnish", "german", "italian", "portuguese",
                      "spanish", "turkish", "danish", "english", "french",
                      "hungarian", "norwegian", "russian", "swedish"]
    # Placeholder entry for languages whose stopwords do not come from
    # NLTK; nothing is loaded for it below.
    free_languages = [""]
    languages = nltk_languages + free_languages

    # Load every stopword list once, as a set, so the membership tests in
    # the scoring loop are O(1) and no lookup function is called twice.
    stopword_sets = {lang: set(stopwords.words(lang))
                     for lang in nltk_languages}

    # Split all the text in tokens and convert to lowercase. In a
    # decent version of this, I'd also clean the unicode
    tokens = [token.lower() for token in word_tokenize(wholetext)]

    # Determine the frequency distribution of words, looking for the
    # most common words
    freq_dist = FreqDist(tokens)

    # Rank the tokens by count explicitly. Relying on keys() being sorted
    # by frequency and sliceable only holds for old FreqDist versions; on
    # Python 3 a dict view cannot be sliced at all.
    most_common = sorted(freq_dist, key=freq_dist.get, reverse=True)[:top_n]

    # For each language, count how many of the most common tokens are in
    # its stopwords. A language without a loaded stopword list (the
    # placeholder above) scores 0 instead of raising KeyError.
    scorelist = {}
    for lang in languages:
        known_stopwords = stopword_sets.get(lang, ())
        scorelist[lang] = sum(1 for word in most_common
                              if word in known_stopwords)
    return scorelist
评论列表
文章目录