# TokenizedMatcher.__init__ (from preprocess_fields_v3.py)
def __init__(self, t, lexicon, maxTokens=0, scorer=tokenization_based_score, distinctCount=0, stopWords=None):
    super(TokenizedMatcher, self).__init__(t)
    currentMax = maxTokens
    self.scorer = scorer
    self.phrasesMap = validated_lexical_map(lexicon)
    self.tokenIdx = dict()
    self.distinctCount = distinctCount
    self.stopWords = stop_words_as_normalized_list(stopWords)
    # Index each lexicon phrase by its leading non-stop-word tokens.
    for np in self.phrasesMap.keys():
        tokens = [tok for tok in np.split(' ') if tok not in self.stopWords]
        if len(tokens) < 1:
            continue
        # With maxTokens unset (< 1), derive the prefix length from the data:
        # grow it to match the longest tokenized phrase seen so far.
        if maxTokens < 1 and len(tokens) > currentMax:
            currentMax = len(tokens)
            if currentMax > DTC:
                logging.warning('Full tokenization of lexicon: encountered token of length {}, above DTC!'.format(currentMax))
        matchedRefPhrase = ' '.join(tokens[:currentMax])
        # Keep the longest original phrase for each distinct token prefix.
        if matchedRefPhrase not in self.tokenIdx or len(self.tokenIdx[matchedRefPhrase]) < len(np):
            self.tokenIdx[matchedRefPhrase] = np
    self.maxTokens = currentMax
    logging.info('SET UP %d-token matcher (%s-defined length) for <%s> with lexicon of size %d, total variants %d',
                 self.maxTokens, 'user' if maxTokens > 0 else 'data', self.t, len(self.phrasesMap), len(self.tokenIdx))
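A minimal usage sketch follows. It assumes TokenizedMatcher is importable from preprocess_fields_v3, that lexicon accepts a plain list of phrase strings, and that validated_lexical_map keys the phrase map by the normalized phrase text; the field name 'city', the sample phrases, and the stop-word list are hypothetical.

from preprocess_fields_v3 import TokenizedMatcher

# Hypothetical field name and lexicon; with the default maxTokens=0 the
# prefix length is derived from the longest tokenized lexicon phrase.
matcher = TokenizedMatcher('city', ['new york city', 'san francisco'], stopWords=['the'])

# tokenIdx maps each leading-token prefix to the longest phrase sharing it,
# e.g. 'new york city' -> 'new york city' (exact keys depend on how
# validated_lexical_map normalizes the phrases).
print(matcher.maxTokens)   # expected 3, taken from 'new york city'
print(matcher.tokenIdx)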