language_model.py 文件源码-python代码片段

def generate_from_trigrams(lm, start_words, n_words):
    """
    Generate text with a trigram -> bigram -> unigram backoff model.

    Each new word is sampled from the most specific distribution that has
    an entry for the current context: the trigram table keyed by the last
    two words, else the bigram table keyed by the last word, else the
    unigram table over all tokens.

    start_words: list of two strings.
    n_words: integer >= 0, number of words to generate, not including start_words
    lm: object exposing lowercase_tokens; lowercase_tokens must be nonempty

    Returns the two start words followed by the generated words, joined by
    single spaces.  Raises IndexError if start_words has fewer than two
    items.
    """
    w1, w2 = start_words[0], start_words[1]
    words = [w1, w2]

    # Nothing to generate: skip building the probability tables entirely.
    if n_words <= 0:
        return ' '.join(words)

    # Create probability maps.
    # NOTE(review): the *_prob_map helpers are defined outside this block.
    # From the way their results are indexed below, trigram_prob is assumed
    # to map (w1, w2) -> {word: prob}, bigram_prob to map w -> {word: prob},
    # and unigram_prob to map word -> prob -- confirm against the helpers.
    trigram_counter = Counter(ngrams(lm.lowercase_tokens, 3))
    trigram_prob = trigram_prob_map(trigram_counter)
    bigram_cfd = nltk.ConditionalFreqDist(ngrams(lm.lowercase_tokens, 2))
    bigram_prob = bigram_prob_map(bigram_cfd)
    unigram_counter = Counter(lm.lowercase_tokens)
    unigram_prob = unigram_prob_map(unigram_counter)

    # Build sentence
    for _ in range(n_words):
        # Back off to the most specific distribution that knows the context.
        if (w1, w2) in trigram_prob:
            prob_map = trigram_prob[(w1, w2)]
        elif w2 in bigram_prob:
            prob_map = bigram_prob[w2]
        else:
            prob_map = unigram_prob

        # Materialise the keys as a list: on Python 3 a dict keys view is
        # not a sequence, and the sampler (presumably numpy.random.choice,
        # given the p= keyword) needs a 1-D array-like.  Building the
        # weights from the same list keeps candidates and weights aligned.
        candidates = list(prob_map)
        weights = [prob_map[w] for w in candidates]
        next_word = choice(candidates, p=weights)

        # Slide the two-word context window forward.
        w1 = w2
        w2 = next_word
        words.append(w2)

    return ' '.join(words)