def extractFeatures(self, article, n, customStopWords=None):
    """Return the most frequent words of an article as its features.

    Args:
        article: Indexable pair ``(text, title)``. Only the text
            (``article[0]``) is used here; the title is ignored.
        n: Number of features (words) to return. A negative value means
            "no feature selection": every word in the frequency table is
            returned, ordered from most to least frequent.
        customStopWords: Passed through unchanged to
            ``self._compute_frequencies``. Presumably extra words to
            exclude from the table -- confirm against that helper.

    Returns:
        A list of words ordered by descending frequency score.

    Side effects:
        Stores the computed frequency table on ``self._freq``.
    """
    text = article[0]

    # Split the text into sentences, then each lower-cased sentence into
    # word tokens.
    sentences = sent_tokenize(text)
    word_sent = [word_tokenize(sentence.lower()) for sentence in sentences]

    # NOTE(review): the table is assumed to be a dict-like mapping of
    # word -> score, since it is queried with ``.get`` below.
    self._freq = self._compute_frequencies(word_sent, customStopWords)

    # A negative n selects every word; otherwise keep only the top n.
    count = len(self._freq) if n < 0 else n
    return nlargest(count, self._freq, key=self._freq.get)
NewsArticleClass.py 文件源码
python
阅读 26
收藏 0
点赞 0
评论 0
评论列表
文章目录