preprocess_fields_v3.py 文件源码

python
阅读 27 收藏 0 点赞 0 评论 0

项目:the-magical-csv-merge-machine 作者: entrepreneur-interet-general 项目源码 文件源码
def __init__(self, t, lexicon, maxTokens = 0, scorer = tokenization_based_score, distinctCount = 0, stopWords = None):
    """Build a token-based matcher for field type *t* from a reference lexicon.

    Args:
        t: field-type identifier, forwarded to the base class constructor.
        lexicon: reference phrases; normalized by ``validated_lexical_map``
            into ``self.phrasesMap``.
        maxTokens: maximum number of leading tokens of each phrase used as a
            match key. 0 (default) means "data-defined": grow the key length
            to the longest phrase encountered in the lexicon.
        scorer: scoring function used when ranking candidate matches.
        distinctCount: stored on the instance for the matching logic.
        stopWords: optional stop words; normalized via
            ``stop_words_as_normalized_list`` and excluded from match keys.
    """
    super(TokenizedMatcher, self).__init__(t)
    currentMax = maxTokens
    self.scorer = scorer
    self.phrasesMap = validated_lexical_map(lexicon)
    self.tokenIdx = dict()
    self.distinctCount = distinctCount
    self.stopWords = stop_words_as_normalized_list(stopWords)
    for np in self.phrasesMap:
        # Drop stop words. Loop variable renamed from 't' to 'tok' so it no
        # longer shadows the 't' parameter.
        tokens = [tok for tok in np.split(' ') if tok not in self.stopWords]
        if not tokens:
            continue
        # Data-defined mode (maxTokens == 0): widen the key length to the
        # longest phrase seen so far; warn once it exceeds the DTC threshold.
        # NOTE(review): keys built before currentMax grows use the shorter
        # length — this order-dependence is preserved from the original.
        if maxTokens < 1 and len(tokens) > currentMax:
            currentMax = len(tokens)
            if currentMax > DTC:
                logging.warning('Full tokenization of lexicon: encountered token of length {}, above DTC!'.format(currentMax))
        matchedRefPhrase = ' '.join(tokens[:currentMax])
        # For each truncated key, keep the longest source phrase.
        if matchedRefPhrase not in self.tokenIdx or len(self.tokenIdx[matchedRefPhrase]) < len(np):
            self.tokenIdx[matchedRefPhrase] = np
    self.maxTokens = currentMax
    logging.info('SET UP %d-token matcher (%s-defined length) for <%s> with lexicon of size %d, total variants %d',
        self.maxTokens, 'user' if maxTokens > 0 else 'data', self.t, len(self.phrasesMap), len(self.tokenIdx))
评论列表
文章目录


问题


面经


文章

微信
公众号

扫码关注公众号