def generate_vocab(filename, min_fre=5, prefix=""):
    """Build a word-frequency vocabulary from a text file.

    Each line of *filename* is split into sentences and then tokens with
    NLTK.  Tokens occurring at least *min_fre* times are written to
    ``../data/<prefix>vocab_generate.txt``, one ``token count`` pair per
    line, most frequent first.

    Args:
        filename: path of the input text file (one document/line).
        min_fre: minimum frequency for a token to be kept (default 5).
        prefix: prefix for the output file name (default "").
    """
    counts = {}
    # Count token frequencies; `with` guarantees the file is closed
    # (the original used the Python-2-only file() and never closed it).
    with open(filename) as infile:
        for line in infile:
            line = line.strip()
            try:
                sentences = nltk.sent_tokenize(line)
            except Exception:
                # Skip lines NLTK cannot tokenize; narrowed from a bare
                # `except:` so KeyboardInterrupt/SystemExit still propagate.
                continue
            for sentence in sentences:
                for token in nltk.word_tokenize(sentence):
                    counts[token] = counts.get(token, 0) + 1
    # Emit tokens sorted by descending frequency; counts are sorted, so
    # the first token below min_fre ends the output.
    with open("../data/" + prefix + "vocab_generate.txt", 'w') as vf:
        for token, num in sorted(counts.items(), key=lambda x: x[1], reverse=True):
            if num < min_fre:
                break
            vf.write(token + " " + str(num) + "\n")
Comment list
Article table of contents