def n_gram_analysis_simple(infile, gram, stop):
    """Print the ten most frequent n-grams found in a text file.

    Each line of ``infile`` is split on whitespace and turned into n-grams
    with ``nltk.ngrams``, so n-grams never span a line boundary. The number
    of distinct n-grams is printed first, followed by up to ten lines of the
    form ``"<tokens joined by spaces> <count>"`` in descending count order.

    Args:
        infile: Path of the text file to read.
        gram: Size of the n-grams (passed to ``nltk.ngrams`` as ``n``).
        stop: Currently unused; kept so existing callers keep working.
            Stop-word filtering is not implemented.

    Returns:
        None. Results are written to standard output.
    """
    from collections import Counter

    counts = Counter()
    # The context manager closes the file even if reading or tokenizing
    # raises; the previous version never closed the handle.
    with open(infile, "r") as handle:
        for line in handle:
            counts.update(nltk.ngrams(line.split(), gram))

    # Single-argument print(...) produces the same output on Python 2 and 3.
    print(len(counts))
    for tokens, count in counts.most_common(10):
        # Every stored count is >= 1, so no zero-count guard is needed.
        print("%s %d" % (" ".join(tokens), count))
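# Comment list        (web-page navigation residue, not part of the program)
# Table of contents   (web-page navigation residue, not part of the program)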