data_helpers.py 文件源码-python代码片段

data_helpers.py 文件源码

python

阅读 35 收藏 0 点赞 0 评论 0

项目：CNN-Text-Pairs-Classification 作者: RandolphVI 项目源码文件源码

def create_word2vec_model(embedding_size, input_file=TEXT_DIR):
    """
    Create the word2vec model based on the given embedding size and the corpus file.
    :param embedding_size: The embedding size
    :param input_file: The corpus file
    """
    word2vec_file = 'word2vec_' + str(embedding_size) + '.model'

    if os.path.isfile(word2vec_file):
        logging.info('? The word2vec model you want create already exists!')
    else:
        sentences = word2vec.LineSentence(input_file)
        # sg=0 means use CBOW model(default); sg=1 means use skip-gram model.
        model = gensim.models.Word2Vec(sentences, size=embedding_size, min_count=0,
                                       sg=0, workers=multiprocessing.cpu_count())
        model.save(word2vec_file)