def get_counts():
    """Populate the global unigram/bigram counters and sentence list.

    Reads each file "Shakespeare_parsed/NNN" for NNN in 1..NUM_FILES
    (skipping indices listed in SKIP), lowercases the tokens of every
    line, appends each non-empty token list to `sentences`, and tallies
    unigram and adjacent-pair (bigram) frequencies into the global
    `unigrams` and `bigrams` counters.
    """
    global unigrams
    global bigrams
    global sentences
    for i in xrange(1, NUM_FILES + 1):
        if i in SKIP:
            continue
        with open("Shakespeare_parsed/%03d" % i) as f:
            for line in f:
                tokens = [t.lower() for t in get_tokens(line)]
                # Skip blank lines before doing any further work.
                # (The original also ran nltk.pos_tag(tokens) here, even
                # for empty lines, but the result was never used — the
                # expensive dead call has been removed.)
                if not tokens:
                    continue
                sentences.append(tokens)
                prev_word = ""
                for token in tokens:
                    unigrams[token] += 1
                    # No bigram for the first token of a line
                    # (prev_word is still the empty-string sentinel).
                    if prev_word:
                        bigrams[(prev_word, token)] += 1
                    prev_word = token
# Ten most frequent unigrams and bigrams.  most_common(10) returns the
# same value as most_common()[:10] but uses a heap (heapq.nlargest)
# internally instead of sorting the entire frequency table.
top10_uni = unigrams.most_common(10)
top10_bi = bigrams.most_common(10)
# (removed non-code page residue: "评论列表" / "文章目录" — i.e. "comment
# list" / "article table of contents", blog navigation labels left over
# from scraping; as bare text they made this file a syntax error)