def corpus2dict15(corpusfiles, lowercase=True):
    """Build a gensim dictionary mapping words to integer ids.

    The WMT15 data is already tokenized, so every line is only split on
    whitespace; no further tokenization is applied.

    Args:
        corpusfiles: iterable of paths to UTF-8 encoded text files. Entries
            that are None are skipped (the source file may be absent).
        lowercase: currently ignored. Lowercasing was deliberately switched
            off so that the lookup table contains all words from pretraining
            in their original casing; the parameter is kept so existing
            callers keep working.

    Returns:
        The corpora.Dictionary built from two reserved single-token
        documents followed by one document per input line.
    """
    # Reserved tokens go first so that they receive the lowest word indices.
    corpus = [
        ["PADDING"],  # has word index 0
        ["UNKNOWN"],  # has word index 1
    ]
    for cf in corpusfiles:
        if cf is None:  # source can be None
            continue
        # Use a context manager so the handle is closed deterministically
        # instead of being leaked once per corpus file.
        with codecs.open(cf, "r", "utf8") as infile:
            corpus.extend(line.split() for line in infile.readlines())
    return corpora.Dictionary(corpus)
评论列表
文章目录