# cavnar_trenkle_impl.py -- source file, Python code snippet

def _extract_text_ngram_freqs(self, text):
        """Count the character n-grams (lengths 1 to 5) found in ``text``.

        The text is lower-cased and split into tokens with
        ``wordpunct_tokenize``. N-grams are extracted inside each token
        separately, so they never span two tokens, and a token shorter than
        a given length contributes nothing for that length.

        Returns a ``defaultdict(int)`` mapping each n-gram string to the
        number of times it occurs in the text.

        >>> implementation = CavnarTrenkleImpl()
        >>> freqs = implementation._extract_text_ngram_freqs("HeLLo")
        >>> freqs == {'h': 1, 'e': 1, 'l': 2, 'o': 1, 'he': 1, 'el': 1,
        ...           'll': 1, 'lo': 1, 'hel': 1, 'ell': 1, 'llo': 1,
        ...           'hell': 1, 'ello': 1, 'hello': 1}
        True
        >>> freqs = implementation._extract_text_ngram_freqs("CIAO")
        >>> freqs == {'c': 1, 'i': 1, 'a': 1, 'o': 1, 'ci': 1, 'ia': 1,
        ...           'ao': 1, 'cia': 1, 'iao': 1, 'ciao': 1}
        True

        """
        counts = defaultdict(int)

        # TODO: Delete numbers and punctuation
        # TODO: Should we use nltk twitter tokenizer?
        for word in wordpunct_tokenize(text.lower()):  # case-insensitive counts
            for size in range(1, 6):  # 1-grams up to 5-grams
                # Each n-gram arrives as a sequence of single characters;
                # glue it back into one string to use it as the key.
                for gram in map(''.join, ngrams(word, size)):
                    counts[gram] += 1

        return counts