def generate_collocations(tokens):
    """Return the top-5 bigram collocations (by likelihood ratio) in *tokens*.

    :param tokens: iterable of word tokens. NOTE(review): a raw string would
        be iterated character-by-character by ``from_words`` — tokenize first.
    :return: list of up to five ``(word1, word2)`` tuples, best first.
    """
    # Use a set: the word filter below tests membership once per word, and
    # stopwords.words() returns a list (O(n) per lookup).
    ignored_words = set(nltk.corpus.stopwords.words('english'))
    bigram_measures = nltk.collocations.BigramAssocMeasures()
    # Best results with window_size, freq_filter of: (2,1) (2,2) (5,1)
    finder = BigramCollocationFinder.from_words(tokens, window_size=2)
    # Drop short words and stopwords before scoring.
    finder.apply_word_filter(lambda w: len(w) < 3 or w.lower() in ignored_words)
    finder.apply_freq_filter(1)
    return finder.nbest(bigram_measures.likelihood_ratio, 5)
# python collocations() example source code (translated scraped-page header)
def extract_bigrams(self, text):
    """Extract high-PMI bigrams from *text*, filtered by part-of-speech tag.

    :param text: raw document text; cleaned via
        ``self.remove_return_lines_and_quotes`` before tokenization.
    :return: list of "word1 word2" strings whose POS tag is not a
        verb/adverb/pronoun/preposition/determiner/conjunction form.
    """
    text = self.remove_return_lines_and_quotes(text)
    bigrams = []
    st = PorterStemmer()
    more_stop_words = [
        '(', ')', "'s", ',', ':', '<', '>', '.', '-', '&', '*', '...']
    # BUG FIX: the stopword list was loaded twice (the first load was
    # immediately overwritten). Load once, and use a set for O(1) lookups.
    stop = set(stopwords.words('english') + more_stop_words)
    # NOTE(review): this stems the whole text as a single token before
    # word-tokenizing — presumably intentional here, but verify; a
    # per-token stem after tokenization is the usual pattern.
    tokens = st.stem(text)
    tokens = nltk.word_tokenize(tokens.lower())
    tokens = [i for i in tokens if i not in stop]
    tokens = [word for word in tokens if len(word) > 2]
    bigram_measures = nltk.collocations.BigramAssocMeasures()
    finder = BigramCollocationFinder.from_words(tokens)
    # Keep only bigrams seen at least twice.
    finder.apply_freq_filter(2)
    top_bigrams = finder.nbest(bigram_measures.pmi, 1000)
    for bg in top_bigrams:
        bg = " ".join(bg)
        # NOTE(review): tags the joined bigram as ONE token; pos_tag sees
        # "word1 word2" as a single word — confirm this is intended.
        tag = nltk.pos_tag([bg])[0]
        if tag[1] not in ['VBG', 'RB', 'VB', 'VBD', 'VBN', 'VBP', 'VBZ', 'PRP', 'IN', 'DT', 'CC', 'PRP$']:
            bigrams.append(tag[0])
    return bigrams
def get_bigram_likelihood(statements, freq_filter=3, nbest=200):
    """
    Return the top *nbest* (likelihood ratio) bi-grams from a group of documents.

    :param statements: list of strings
    :param freq_filter: minimum # of appearances for a bi-gram to be kept
    :param nbest: number of top-scoring bi-grams to return
    :return: list of ((word1, word2), score) tuples, best first
    """
    # Parenthesized form works in both Python 2 and 3.
    print('Generating word list...')
    # Tokenize each statement into words, dropping non-word characters.
    # Hoisted: one tokenizer instance instead of one per statement.
    tokenizer = RegexpTokenizer(r'\w+')
    words = []
    for statement in statements:
        words.extend(tokenizer.tokenize(statement))
    bigram_measures = nltk.collocations.BigramAssocMeasures()
    bigram_finder = BigramCollocationFinder.from_words(words)
    # only bi-grams that appear freq_filter+ times
    bigram_finder.apply_freq_filter(freq_filter)
    # TODO: use custom stop words
    # Load the stopword list ONCE as a set; the original re-loaded the
    # whole corpus list on every filter-lambda invocation (once per word).
    stop_words = set(nltk.corpus.stopwords.words('english'))
    bigram_finder.apply_word_filter(
        lambda w: len(w) < 3 or w.lower() in stop_words)
    # BUG FIX: nbest() was computed and then discarded, and ALL scored
    # n-grams were returned regardless of `nbest`. score_ngrams() returns
    # results sorted best-first, so slicing yields the top `nbest`.
    return bigram_finder.score_ngrams(bigram_measures.likelihood_ratio)[:nbest]
def display_collocations(articles):
    """Interactively print collocations for each (title, text, url) tuple.

    Shows one article per screen; ENTER advances, any other input exits.
    Prints only — returns nothing.
    """
    def clear_screen():
        # Windows consoles use 'cls'; POSIX terminals use 'clear'.
        os.system('cls' if os.name == 'nt' else 'clear')

    total = len(articles)
    for article_number, (title, text, url) in enumerate(articles, 1):
        # Guard clause: skip untitled articles.
        if title == '':
            continue
        try:
            clear_screen()
            colls = generate_collocations(text)
            print("Article {}/{}".format(article_number, total))
            print('---------------')
            print("{}\n".format(title))
            print("Link: {}\n".format(url))
            print("Topics:")
            # "; "-separated pairs, no trailing separator.
            print("; ".join("{} {}".format(w1, w2) for w1, w2 in colls))
            print('---------------\n')
            print("Press ENTER for next article or any key to exit.")
            if raw_input("> "):
                exit(0)
        except TypeError:
            # Bad article data — move on to the next one.
            continue
    clear_screen()