def score(self, sentence):
    """Score the sentiment of ``sentence`` around a neutral value of 0.5.

    The sentence is lower-cased and split with NLTK's
    ``wordpunct_tokenize``.  Tokens are then scanned left to right; at each
    position the longest phrase (at most ``self.max_len`` tokens, joined
    with single spaces) found in ``self.lookup`` is consumed.  Every matched
    entry contributes its element ``[0]`` to the positive total and its
    element ``[1]`` to the negative total.

    Returns:
        ``0.5`` when the two totals are equal (this includes the case where
        nothing matched); ``0.5 + positive / matches / 2`` when the positive
        total is larger; ``0.5 - negative / matches / 2`` otherwise.
    """
    # Original author's note: assuming no contextual forms are used for
    # Arabic.
    # NOTE(review): the package-path setup is kept ahead of the nltk import;
    # presumably the import depends on it -- confirm against
    # ensure_package_path().
    ensure_package_path()
    from nltk.tokenize import wordpunct_tokenize

    words = wordpunct_tokenize(sentence.lower())
    total = len(words)

    # Running positive / negative totals and the number of matched phrases.
    positive = negative = 0.0
    hits = 0

    # The outer loop is a ``while`` because a successful match advances the
    # cursor by the whole width of the matched phrase, not by one token.
    start = 0
    while start < total:
        # Longest candidate first, never reaching past the end of the tokens.
        widest = min(self.max_len, total - start)
        for width in range(widest, 0, -1):
            phrase = ' '.join(words[start:start + width])
            if phrase in self.lookup:
                entry = self.lookup[phrase]
                positive += entry[0]
                negative += entry[1]
                hits += 1
                start += width
                break
        else:
            # No phrase starting at this token is known: skip the token.
            start += 1

    # Equal totals (including "nothing matched") give a neutral score; this
    # guard also keeps the divisions below away from a zero match count.
    if positive == negative:
        return 0.5
    # More positive than negative: shift upwards by half the mean positive
    # score per matched phrase.
    if positive > negative:
        return 0.5 + positive / hits / 2
    # Otherwise shift downwards by half the mean negative score.
    return 0.5 - negative / hits / 2
评论列表
文章目录