def __init__(self, lines):
    """Build a sentiment-score lookup table from JSON word records.

    Each element of *lines* is a JSON document with at least the keys
    'word', 'pos', and 'neg'; optional 'word_ar' / 'word_ur' keys give
    Arabic and Urdu variants that share the same scores.

    Populates:
        self.lookup  -- dict mapping term -> (pos_score, neg_score)
        self.max_len -- longest term seen, measured in wordpunct tokens
    """
    self.lookup = {}
    self.max_len = 0
    ensure_package_path()
    # Imported lazily so the package path is configured first.
    from nltk.tokenize import wordpunct_tokenize as tokenize
    for line in lines:
        word_data = json.loads(line)
        # capture both positive and negative, choose one at scoring time
        pos_score, neg_score = word_data['pos'], word_data['neg']
        terms = [word_data['word']]
        # TODO: make the sentiment scorer configurable
        if 'word_ar' in word_data:
            terms.append(word_data['word_ar'])
        if 'word_ur' in word_data:
            terms.append(word_data['word_ur'])
        for term in terms:
            # If a score already exists for this term, keep the least
            # neutral (maximum) value on each axis. Single dict lookup
            # via .get instead of the membership-test-then-index pair.
            prev_pos, prev_neg = self.lookup.get(term, (0., 0.))
            self.lookup[term] = (max(pos_score, prev_pos),
                                 max(neg_score, prev_neg))
            # Track the maximum token length callers will need to scan.
            self.max_len = max(self.max_len, len(tokenize(term)))