# TokenizedMatcher.__init__ (from preprocess_fields_v3.py)
def __init__(self, t, lexicon, maxTokens=0, scorer=tokenization_based_score, distinctCount=0, stopWords=None):
    super(TokenizedMatcher, self).__init__(t)
    currentMax = maxTokens
    self.scorer = scorer
    self.phrasesMap = validated_lexical_map(lexicon)
    self.tokenIdx = dict()
    self.distinctCount = distinctCount
    self.stopWords = stop_words_as_normalized_list(stopWords)
    # Index each lexicon phrase by its leading non-stop-word tokens.
    for np in self.phrasesMap.keys():
        tokens = [tok for tok in np.split(' ') if tok not in self.stopWords]
        if len(tokens) < 1:
            continue
        # With maxTokens unset (< 1), derive the prefix length from the data:
        # grow it to match the longest tokenized phrase seen so far.
        if maxTokens < 1 and len(tokens) > currentMax:
            currentMax = len(tokens)
            if currentMax > DTC:
                logging.warning('Full tokenization of lexicon: encountered token of length {}, above DTC!'.format(currentMax))
        matchedRefPhrase = ' '.join(tokens[:currentMax])
        # Keep the longest original phrase for each distinct token prefix.
        if matchedRefPhrase not in self.tokenIdx or len(self.tokenIdx[matchedRefPhrase]) < len(np):
            self.tokenIdx[matchedRefPhrase] = np
    self.maxTokens = currentMax
    logging.info('SET UP %d-token matcher (%s-defined length) for <%s> with lexicon of size %d, total variants %d',
                 self.maxTokens, 'user' if maxTokens > 0 else 'data', self.t, len(self.phrasesMap), len(self.tokenIdx))
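A minimal usage sketch follows. It assumes TokenizedMatcher is importable from preprocess_fields_v3, that lexicon accepts a plain list of phrase strings, and that validated_lexical_map keys the phrase map by the normalized phrase text; the field name 'city', the sample phrases, and the stop-word list are hypothetical.

from preprocess_fields_v3 import TokenizedMatcher

# Hypothetical field name and lexicon; with the default maxTokens=0 the
# prefix length is derived from the longest tokenized lexicon phrase.
matcher = TokenizedMatcher('city', ['new york city', 'san francisco'], stopWords=['the'])

# tokenIdx maps each leading-token prefix to the longest phrase sharing it,
# e.g. 'new york city' -> 'new york city' (exact keys depend on how
# validated_lexical_map normalizes the phrases).
print(matcher.maxTokens)   # expected 3, taken from 'new york city'
print(matcher.tokenIdx)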