def _extract_text_ngram_freqs(self, text):
    """Count the character n-grams (1 to 5 characters long) of a text.

    The text is lower-cased and split into tokens with
    ``wordpunct_tokenize``. Every contiguous run of 1 to 5 characters
    inside each token is counted. N-grams never span two tokens, and a
    token shorter than ``n`` contributes no n-grams of length ``n``.

    Args:
        text: The string to analyse.

    Returns:
        A ``defaultdict(int)`` mapping each n-gram string to the number
        of times it occurs in ``text``.

    >>> implementation = CavnarTrenkleImpl()
    >>> freqs = implementation._extract_text_ngram_freqs("HeLLo")
    >>> freqs == {'h': 1, 'e': 1, 'l': 2, 'o': 1, 'he': 1, 'el': 1,
    ...           'll': 1, 'lo': 1, 'hel': 1, 'ell': 1, 'llo': 1,
    ...           'hell': 1, 'ello': 1, 'hello': 1}
    True
    >>> freqs = implementation._extract_text_ngram_freqs("CIAO")
    >>> freqs == {'c': 1, 'i': 1, 'a': 1, 'o': 1, 'ci': 1, 'ia': 1,
    ...           'ao': 1, 'cia': 1, 'iao': 1, 'ciao': 1}
    True
    """
    tokens = wordpunct_tokenize(text.lower())  # Force lower case
    # TODO: Delete numbers and punctuation
    # TODO: Should we use nltk twitter tokenizer?
    ngram_freqs = defaultdict(int)
    for token in tokens:
        token_len = len(token)
        # Use 1-grams to 5-grams; lengths beyond the token's own length
        # cannot occur, so the upper bound is capped at the token length.
        for n in range(1, min(5, token_len) + 1):
            # Slide a window of width n over the token. Slicing yields the
            # n-gram string directly, with no tuple-then-join round trip.
            for start in range(token_len - n + 1):
                ngram_freqs[token[start:start + n]] += 1
    return ngram_freqs
# (web page residue) Comment list
# (web page residue) Table of contents