def createPopularWords(combined, lowerBound, upperBound):
    """Return the dataset's words ranked by frequency, positions lowerBound..upperBound.

    Parameters:
        combined: iterable of documents, where each document is a sequence whose
            first element (document[0]) is an iterable of word tokens.
        lowerBound: number of most-frequent words to skip. Ignoring the top few
            words is a cheap stop-word filter specific to this dataset, rather
            than a generic English stop-word list.
        upperBound: rank cutoff; grabbing more words leads to more accurate
            predictions, at the cost of both memory and compute time.

    Returns:
        list of words (most frequent first), ranks [lowerBound, upperBound).
    """
    # Local import keeps this function self-contained; Counter is the stdlib
    # equivalent of nltk.FreqDist (FreqDist subclasses Counter), so
    # most_common() behaves identically here.
    from collections import Counter

    wordCounts = Counter(word for message in combined for word in message[0])
    # most_common(upperBound) already returns at most upperBound entries, so
    # only the lowerBound skip is needed on the slice.
    return [word for word, _ in wordCounts.most_common(upperBound)[lowerBound:]]
# extract features from a single document in a consistent manner for all documents in a corpus
# simply returns whether a given word in popularWords is included in the document
# 评论列表 ("comment list") / 文章目录 ("article table of contents"):
# leftover navigation text from the web page this code was copied from,
# commented out so the file parses as Python.
# 评论列表
# 文章目录