ContextExtractor.py 文件源码-python代码片段

ContextExtractor.py 文件源码

python

阅读 19 收藏 0 点赞 0 评论 0

项目：quetch 作者: juliakreutzer 项目源码文件源码

def corpus2dict15(corpusfiles, lowercase=True):
    """Build a gensim dictionary mapping words to integer ids.

    Every line of every corpus file is split on whitespace only; no further
    tokenization is applied, since WMT15 data is already tokenized.

    Args:
        corpusfiles: iterable of paths to UTF-8 encoded text files. Entries
            that are None are skipped (the source side may be absent).
        lowercase: accepted for interface compatibility but currently
            ignored -- tokens are always kept in their original case so the
            lookup table can cover all words seen in pretraining.

    Returns:
        A gensim ``corpora.Dictionary`` built from the two marker documents
        ("PADDING", "UNKNOWN") followed by one document per corpus line.
    """
    # The two marker tokens go first, each in its own document, so that they
    # are the first entries registered in the dictionary.
    documents = [
        ["PADDING"],  # intended to receive word index 0
        ["UNKNOWN"],  # intended to receive word index 1
    ]

    for path in corpusfiles:
        if path is None:  # source can be None
            continue
        # Use a context manager so the file handle is closed deterministically
        # instead of being left open until garbage collection.
        with codecs.open(path, "r", "utf8") as handle:
            documents.extend(line.split() for line in handle)

    return corpora.Dictionary(documents)