def create_lexicon(pos, neg, min_count=50, max_count=1000):
    """Build a lexicon of lemmatized words from two text files.

    Reads up to ``hm_lines`` lines from each file, tokenizes and
    lemmatizes every word, then keeps only words whose total frequency
    falls strictly between ``min_count`` and ``max_count`` — a crude
    way of dropping both very common words (stopwords) and very rare
    ones (noise).

    Args:
        pos: path to the positive-examples text file.
        neg: path to the negative-examples text file.
        min_count: keep words occurring strictly more than this many times.
        max_count: keep words occurring strictly fewer than this many times.

    Returns:
        list[str]: the filtered, lemmatized lexicon.

    Note:
        Relies on module-level ``hm_lines``, ``word_tokenize`` (NLTK) and
        ``lemmatizer`` being defined elsewhere in this file.
        TODO: consider NLTK's stopword list instead of pure frequency
        bounds for stopword removal.
    """
    lexicon = []
    for path in (pos, neg):
        # Explicit encoding avoids platform-dependent decode errors.
        with open(path, 'r', encoding='utf-8') as f:
            for line in f.readlines()[:hm_lines]:
                # word_tokenize already returns a list; extend directly.
                lexicon.extend(word_tokenize(line))

    lemmatized = [lemmatizer.lemmatize(w) for w in lexicon]
    w_counts = Counter(lemmatized)

    # Keep only mid-frequency words (strict bounds, matching the original
    # `1000 > count > 50` check).
    return [w for w in w_counts if max_count > w_counts[w] > min_count]
# --- Scraped-page residue (translated, kept as a comment so the file parses) ---
# Source: sentiment_featureset.py (file source code)
# Language: python
# Page metadata: reads 24 · favorites 0 · likes 0 · comments 0
# (Original page also contained: comment list, article table of contents)