def tokens2topbigrams(sep, measure, freq, scores, tokens):
    '''Rank the bi-grams of a token document by an association measure.

    The --measure argument selects, through the MEASURES table, the scoring
    function that defines what makes a bi-gram 'interesting'.

    Args:
        sep: Separator handed to write_csv after conversion with str().
        measure: Key into MEASURES naming the scoring function to use.
        freq: Minimum frequency passed to the finder's apply_freq_filter.
        scores: When truthy, every output row also carries the bi-gram's
            score, rendered with str().
        tokens: Token source handed to read_tokens.

    The result is not returned; the rows are passed to write_csv.
    '''
    words = read_tokens(tokens)
    finder = nltk.collocations.BigramCollocationFinder.from_words(words)
    finder.apply_freq_filter(freq)

    score_fn = MEASURES[measure]
    # score_ngrams yields (bigram, score) pairs.
    ranked = finder.score_ngrams(score_fn)

    if scores:
        # Append the stringified score as an extra column on each row.
        rows = [pair + (str(score),) for pair, score in ranked]
    else:
        rows = [pair for pair, _ in ranked]
    write_csv(rows, str(sep))
评论列表
文章目录