def corpus2dict(corpusfiles):
    """Create a gensim dictionary mapping words to ints from corpus files.

    Args:
        corpusfiles: Iterable of paths to UTF-8 encoded text files. ``None``
            entries are skipped (e.g. when no source corpus is available).

    Returns:
        The ``corpora.Dictionary`` built from the two reserved entries
        followed by the preprocessed contents of every given file.
    """
    # Seed the corpus with the reserved tokens before any real data so they
    # are the first entries the dictionary sees.
    corpus = [
        ["PADDING"],  # has word index 0
        ["UNKNOWN"],  # has word index 1
    ]
    for cf in corpusfiles:
        if cf is None:  # source can be none
            continue
        # Read inside a context manager so the file handle is closed right
        # away instead of lingering until garbage collection.
        with codecs.open(cf, "r", "utf8") as handle:
            lines = handle.readlines()
        # NOTE(review): preprocess is defined elsewhere; presumably it turns
        # the raw lines into tokenized documents -- confirm against helper.
        corpus.extend(preprocess(lines))
    return corpora.Dictionary(corpus)
评论列表
文章目录