def generate_collocations(tokens):
    """Return the top-5 bigram collocations (by likelihood ratio) in *tokens*.

    :param tokens: iterable of word tokens. NOTE(review): a raw string would
        be iterated character-by-character by ``from_words`` — tokenize first.
    :return: list of up to five ``(word1, word2)`` tuples, best first.
    """
    # Use a set: the word filter below tests membership once per word, and
    # stopwords.words() returns a list (O(n) per lookup).
    ignored_words = set(nltk.corpus.stopwords.words('english'))
    bigram_measures = nltk.collocations.BigramAssocMeasures()
    # Best results with window_size, freq_filter of: (2,1) (2,2) (5,1)
    finder = BigramCollocationFinder.from_words(tokens, window_size=2)
    # Drop short words and stopwords before scoring.
    finder.apply_word_filter(lambda w: len(w) < 3 or w.lower() in ignored_words)
    finder.apply_freq_filter(1)
    return finder.nbest(bigram_measures.likelihood_ratio, 5)
# python collocations() example source code (translated scraped-page header)
def extract_bigrams(self, text):
    """Extract high-PMI bigrams from *text*, filtered by part-of-speech tag.

    :param text: raw document text; cleaned via
        ``self.remove_return_lines_and_quotes`` before tokenization.
    :return: list of "word1 word2" strings whose POS tag is not a
        verb/adverb/pronoun/preposition/determiner/conjunction form.
    """
    text = self.remove_return_lines_and_quotes(text)
    bigrams = []
    st = PorterStemmer()
    more_stop_words = [
        '(', ')', "'s", ',', ':', '<', '>', '.', '-', '&', '*', '...']
    # BUG FIX: the stopword list was loaded twice (the first load was
    # immediately overwritten). Load once, and use a set for O(1) lookups.
    stop = set(stopwords.words('english') + more_stop_words)
    # NOTE(review): this stems the whole text as a single token before
    # word-tokenizing — presumably intentional here, but verify; a
    # per-token stem after tokenization is the usual pattern.
    tokens = st.stem(text)
    tokens = nltk.word_tokenize(tokens.lower())
    tokens = [i for i in tokens if i not in stop]
    tokens = [word for word in tokens if len(word) > 2]
    bigram_measures = nltk.collocations.BigramAssocMeasures()
    finder = BigramCollocationFinder.from_words(tokens)
    # Keep only bigrams seen at least twice.
    finder.apply_freq_filter(2)
    top_bigrams = finder.nbest(bigram_measures.pmi, 1000)
    for bg in top_bigrams:
        bg = " ".join(bg)
        # NOTE(review): tags the joined bigram as ONE token; pos_tag sees
        # "word1 word2" as a single word — confirm this is intended.
        tag = nltk.pos_tag([bg])[0]
        if tag[1] not in ['VBG', 'RB', 'VB', 'VBD', 'VBN', 'VBP', 'VBZ', 'PRP', 'IN', 'DT', 'CC', 'PRP$']:
            bigrams.append(tag[0])
    return bigrams
def get_bigram_likelihood(statements, freq_filter=3, nbest=200):
    """
    Return the top *nbest* (likelihood ratio) bi-grams from a group of documents.

    :param statements: list of strings
    :param freq_filter: minimum # of appearances for a bi-gram to be kept
    :param nbest: number of top-scoring bi-grams to return
    :return: list of ((word1, word2), score) tuples, best first
    """
    # Parenthesized form works in both Python 2 and 3.
    print('Generating word list...')
    # Tokenize each statement into words, dropping non-word characters.
    # Hoisted: one tokenizer instance instead of one per statement.
    tokenizer = RegexpTokenizer(r'\w+')
    words = []
    for statement in statements:
        words.extend(tokenizer.tokenize(statement))
    bigram_measures = nltk.collocations.BigramAssocMeasures()
    bigram_finder = BigramCollocationFinder.from_words(words)
    # only bi-grams that appear freq_filter+ times
    bigram_finder.apply_freq_filter(freq_filter)
    # TODO: use custom stop words
    # Load the stopword list ONCE as a set; the original re-loaded the
    # whole corpus list on every filter-lambda invocation (once per word).
    stop_words = set(nltk.corpus.stopwords.words('english'))
    bigram_finder.apply_word_filter(
        lambda w: len(w) < 3 or w.lower() in stop_words)
    # BUG FIX: nbest() was computed and then discarded, and ALL scored
    # n-grams were returned regardless of `nbest`. score_ngrams() returns
    # results sorted best-first, so slicing yields the top `nbest`.
    return bigram_finder.score_ngrams(bigram_measures.likelihood_ratio)[:nbest]
def display_collocations(articles):
    """Interactively print collocations for each (title, text, url) tuple.

    Shows one article per screen; ENTER advances, any other input exits.
    Prints only — returns nothing.
    """
    def clear_screen():
        # Windows consoles use 'cls'; POSIX terminals use 'clear'.
        os.system('cls' if os.name == 'nt' else 'clear')

    total = len(articles)
    for article_number, (title, text, url) in enumerate(articles, 1):
        # Guard clause: skip untitled articles.
        if title == '':
            continue
        try:
            clear_screen()
            colls = generate_collocations(text)
            print("Article {}/{}".format(article_number, total))
            print('---------------')
            print("{}\n".format(title))
            print("Link: {}\n".format(url))
            print("Topics:")
            # "; "-separated pairs, no trailing separator.
            print("; ".join("{} {}".format(w1, w2) for w1, w2 in colls))
            print('---------------\n')
            print("Press ENTER for next article or any key to exit.")
            if raw_input("> "):
                exit(0)
        except TypeError:
            # Bad article data — move on to the next one.
            continue
    clear_screen()