# bytileAggregator.py -- source file, Python code snippet

def score(self, sentence):
        """Score the sentiment of ``sentence`` against ``self.lookup``.

        The sentence is lower-cased and split with NLTK's
        ``wordpunct_tokenize``.  The tokens are then scanned left to right;
        at each position the longest phrase (at most ``self.max_len`` tokens,
        joined with single spaces) that is a key of ``self.lookup`` is
        consumed.  For every matched phrase, element 0 of its lookup entry is
        added to the positive total and element 1 to the negative total.

        As in the original implementation, no special handling of Arabic
        contextual letter forms is performed; the input is assumed not to
        use them.

        Args:
            sentence: text to score.

        Returns:
            ``0.5`` when the positive and negative totals are equal (which
            includes the case where nothing matched); otherwise
            ``0.5 + positive / matches / 2`` when the positive total is
            larger, else ``0.5 - negative / matches / 2``.
        """
        # NOTE(review): ensure_package_path is defined outside this block;
        # presumably it makes nltk importable, hence it runs before the import.
        ensure_package_path()
        from nltk.tokenize import wordpunct_tokenize as tokenize

        words = tokenize(sentence.lower())
        total = len(words)

        positive = negative = 0.
        hits = 0

        start = 0
        while start < total:
            # longest candidate phrase that still fits in the remaining tokens
            longest = min(self.max_len, total - start)
            # try the longest phrase first, shrinking one token at a time
            for width in range(longest, 0, -1):
                phrase = ' '.join(words[start:start + width])
                if phrase in self.lookup:
                    entry = self.lookup[phrase]
                    positive += entry[0]
                    negative += entry[1]
                    hits += 1
                    # jump past every token consumed by the matched phrase
                    start += width
                    break
            else:
                # nothing starting at this token matched; move on by one
                start += 1

        # equal totals (including "no match at all") are reported as neutral
        if positive == negative:
            return 0.5
        # positive dominates: shift upwards by the mean positive score
        if positive > negative:
            return 0.5 + positive / hits / 2
        # otherwise shift downwards by the mean negative score
        return 0.5 - negative / hits / 2