def _char_wb_ngrams(self, text_document):
"""Whitespace sensitive char-n-gram tokenization.
Tokenize text_document into a sequence of character n-grams
excluding any whitespace (operating only inside word boundaries)"""
# normalize white spaces
text_document = self._white_spaces.sub(" ", text_document)
min_n, max_n = self.ngram_range
ngrams = []
for w in text_document.split():
w = ' ' + w + ' '
w_len = len(w)
for n in xrange(min_n, max_n + 1):
offset = 0
ngrams.append(w[offset:offset + n])
while offset + n < w_len:
offset += 1
ngrams.append(w[offset:offset + n])
if offset == 0: # count a short word (w_len < n) only once
break
return ngrams
评论列表
文章目录