W2V.py 文件源码

python
阅读 23 收藏 0 点赞 0 评论 0

项目:USTC_AILab2 作者: overflocat 项目源码 文件源码
def getCorpus():
    """Tokenize the documents under ``original/*.txt`` and persist a bag-of-words corpus.

    Only the first line of each matching file is read; that line is treated
    as one document. Each document has ASCII punctuation removed, is
    lower-cased, split on whitespace, and filtered against a small
    hard-coded stop-word list.

    Side effects:
        Writes the filtered dictionary to ``tmp/imdb.dict`` and the
        bag-of-words corpus to ``tmp/imdb.mm``. The ``tmp`` directory is not
        created here.

    Returns:
        A list with one token list per input file, in the order returned by
        ``glob.glob``. The returned tokens are NOT restricted to the words
        kept by ``filter_extremes``; only the serialized corpus is.
    """
    documents = []
    for fileName in glob.glob("original/*.txt"):
        # Close each handle as soon as its line is read; the corpus may span
        # many files and leaked descriptors can exhaust the process limit.
        with open(fileName) as fp:
            documents.append(fp.readline())

    stoplist = set('for a of the and to in at'.split())

    # Character-level filtering instead of string.maketrans/translate: the
    # two-argument str.translate form only exists on Python 2, while this
    # yields the same result for plain strings on both Python 2 and 3.
    punctuation = set(string.punctuation)
    texts = []
    for document in documents:
        cleaned = "".join(ch for ch in document if ch not in punctuation)
        texts.append([word for word in cleaned.lower().split()
                      if word not in stoplist])

    # The dictionary and corpus are not used by the return value; they are
    # built only so that they can be saved to disk.
    # NOTE(review): `corpora` is presumably gensim.corpora -- confirm against
    # the file's imports.
    dictionary = corpora.Dictionary(texts)
    dictionary.filter_extremes(no_below=10, no_above=0.7, keep_n=50000)
    dictionary.save('tmp/imdb.dict')

    corpus = [dictionary.doc2bow(text) for text in texts]
    corpora.MmCorpus.serialize('tmp/imdb.mm', corpus)

    return texts
评论列表
文章目录


问题


面经


文章

微信
公众号

扫码关注公众号