def train_word2vec_model(df, columns):
    model_param = {
        "alpha": config.EMBEDDING_ALPHA,
        "learning_rate_decay": config.EMBEDDING_LEARNING_RATE_DECAY,
        "n_epoch": config.EMBEDDING_N_EPOCH,
        "sg": 1,
        "hs": 1,
        "min_count": config.EMBEDDING_MIN_COUNT,
        "size": config.EMBEDDING_DIM,
        "sample": 0.001,
        "window": config.EMBEDDING_WINDOW,
        "workers": config.EMBEDDING_WORKERS,
    }
    model_dir = config.WORD2VEC_MODEL_DIR
    model_name = "Homedepot-word2vec-D%d-min_count%d.model" % (
        model_param["size"], model_param["min_count"])
    word2vec = DataFrameWord2Vec(df, columns, model_param)
    word2vec.train()
    word2vec.save(model_dir, model_name)
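# A minimal sketch (not from the original repo) of loading and querying the
# saved model afterwards, assuming DataFrameWord2Vec persists a plain gensim
# Word2Vec model under model_dir/model_name; the file name and query word
# below are illustrative.
import os
from gensim.models import Word2Vec

model_path = os.path.join("models/word2vec",
                          "Homedepot-word2vec-D100-min_count3.model")
model = Word2Vec.load(model_path)
print(model.wv.most_similar("drill", topn=5))  # nearest words by cosine similarity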
#---------------------- Doc2Vec ----------------------
Example source code using the Python class Doc2Vec()
import multiprocessing

from gensim.models.doc2vec import Doc2Vec, TaggedLineDocument

def trainDoc2Vector(sentence_count, vector_dimension):
    # Train the model on the pre-split corpus and save it.
    # Note: passing `sentences` to the constructor already runs one training
    # pass; the explicit train() call below continues training (legacy API).
    sentences = TaggedLineDocument('sources/splited_words.txt')
    model = Doc2Vec(sentences, size=vector_dimension, window=8, min_count=2,
                    workers=multiprocessing.cpu_count())
    model.train(sentences, total_examples=sentence_count, epochs=model.iter)
    model.save('result/doc2vec.model')

    # Save one document vector per line, space-separated.
    with open('result/doc2vec.vector', mode='w', encoding='utf-8') as out:
        for index in range(sentence_count):
            docvec = model.docvecs[index]
            out.write(' '.join(str(f) for f in docvec) + "\n")
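# A hedged follow-up sketch (standard gensim API; the token list is
# illustrative): reload the saved model and infer a vector for an unseen,
# pre-tokenized document.
from gensim.models import Doc2Vec

model = Doc2Vec.load('result/doc2vec.model')
vector = model.infer_vector(['some', 'new', 'tokens'])
# Training documents most similar to the inferred vector (tags are line indices).
print(model.docvecs.most_similar([vector], topn=3))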
def __init__(self,
             analyzer=None, matching=None,
             name=None,
             verbose=0,
             n_epochs=10,
             alpha=0.25,
             min_alpha=0.05,
             n_jobs=4,
             **kwargs):
    self.alpha = alpha
    self.min_alpha = min_alpha
    self.verbose = verbose
    self.name = "paragraph-vectors" if name is None else name
    if matching is True:
        self._matching = Matching()
    elif matching is False or matching is None:
        self._matching = None
    else:
        self._matching = Matching(**dict(matching))
    self.analyzer = analyzer
    self.model = Doc2Vec(alpha=alpha,
                         min_alpha=alpha,  # decay is applied manually in fit()
                         size=500,
                         window=8,
                         min_count=1,
                         sample=1e-5,
                         workers=n_jobs,
                         negative=20,
                         dm=0, dbow_words=1,  # DBOW, with word vectors trained alongside
                         dm_mean=0,  # unused when in concat mode
                         dm_concat=1,
                         dm_tag_count=1
                         )
    self.n_epochs = n_epochs
    self._neighbors = NearestNeighbors(**kwargs)
def fit(self, docs, y):
    assert len(docs) == len(y)
    model = self.model
    n_epochs = self.n_epochs
    verbose = self.verbose
    decay = (self.alpha - self.min_alpha) / n_epochs
    X = [TaggedDocument(self.analyzer(doc), [label])
         for doc, label in zip(docs, y)]
    if verbose > 0:
        print("First 3 tagged documents:\n", X[:3])
        print("Training doc2vec model")
    # d2v = Doc2Vec()
    # d2v.build_vocab(X)
    # if self.intersect is not None:
    #     d2v.intersect_word2vec_format(self.intersect)
    model.build_vocab(X)
    for epoch in range(n_epochs):
        if verbose:
            print("Doc2Vec: Epoch {} of {}.".format(epoch + 1, n_epochs))
        model.train(X)  # legacy gensim train(): no total_examples/epochs needed
        model.alpha -= decay  # apply global decay
        model.min_alpha = model.alpha  # but no decay inside one epoch
    if verbose > 0:
        print("Finished.")
        print("model:", self.model)
    if self._matching:
        self._matching.fit(docs)
    else:
        # If we don't do matching, it's enough to fit a nearest-neighbors
        # index on all centroids before query time.
        dvs = np.asarray([model.docvecs[tag] for tag in y])
        self._neighbors.fit(dvs)
    self._y = y
    return self
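# Note (not from the original code): gensim >= 1.0 requires total_examples
# and epochs in train(); a hedged sketch of the same manual-decay loop under
# that API, given a fresh Doc2Vec model and a TaggedDocument list X as above:
def fit_with_manual_decay(model, X, alpha=0.25, min_alpha=0.05, n_epochs=10):
    model.build_vocab(X)
    decay = (alpha - min_alpha) / n_epochs
    for epoch in range(n_epochs):
        model.train(X, total_examples=model.corpus_count, epochs=1)
        model.alpha -= decay           # same global decay as above
        model.min_alpha = model.alpha  # no decay within an epoch
    return model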
from gensim.models import Doc2Vec

def train_and_save_doc2vec(docs, output_file, options=None):
    # `options=None` avoids the mutable-default-argument pitfall.
    print("Training model...")
    model = Doc2Vec(docs, **(options or {}))
    model.save(output_file)
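# Illustrative usage (option values are examples only; the legacy `size`
# keyword matches the other snippets on this page):
from gensim.models.doc2vec import TaggedDocument

docs = [TaggedDocument(['first', 'document'], [0]),
        TaggedDocument(['second', 'document'], [1])]
train_and_save_doc2vec(docs, 'doc2vec.model',
                       options={'size': 100, 'window': 8, 'min_count': 1})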
def trainingNet(self, window, nDimension):
    self.nDimension = nDimension
    sentences = LabeledLineSentence(self.corpus)
    self.model = Doc2Vec(min_count=1, window=window, size=nDimension,
                         sample=1e-4, negative=5, workers=4)
    corpus = sentences.to_array()
    self.model.build_vocab(corpus)
    for epoch in range(10):
        # Re-shuffle the sentences each epoch (legacy gensim train() API).
        self.model.train(sentences.sentences_perm())
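# LabeledLineSentence is not shown here; a plausible minimal implementation
# (hypothetical, matching only the to_array()/sentences_perm() interface used
# above) might look like this:
import random
from gensim.models.doc2vec import TaggedDocument

class LabeledLineSentence:
    def __init__(self, corpus):
        self.corpus = corpus  # iterable of pre-tokenized documents
        self.sentences = []

    def to_array(self):
        self.sentences = [TaggedDocument(words, ['SENT_%d' % i])
                          for i, words in enumerate(self.corpus)]
        return self.sentences

    def sentences_perm(self):
        # Re-shuffle between epochs so the model never sees a fixed order.
        random.shuffle(self.sentences)
        return self.sentences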
def extract_instances(self, train_instances):
    sentences = []
    for idx, train_instance in enumerate(train_instances):
        sa, sb = train_instance.get_word(type='lemma', lower=True)
        sentences.append(TaggedDocument(words=sa, tags=['sa_%d' % idx]))
        sentences.append(TaggedDocument(words=sb, tags=['sb_%d' % idx]))
    model = Doc2Vec(sentences, size=25, window=3, min_count=0,
                    workers=10, iter=1000)
    features = []
    infos = []
    for idx in range(len(train_instances)):
        vec_a = model.docvecs['sa_%d' % idx]
        vec_b = model.docvecs['sb_%d' % idx]
        feature, info = vk.get_all_kernel(vec_a, vec_b)
        features.append(feature)
        infos.append([])
        # infos.append([vec_a, vec_b])
    return features, infos
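# vk.get_all_kernel() is an external helper; as an illustration of the kind
# of vector-pair feature it might compute, here is a hypothetical stand-in
# returning a single cosine-similarity kernel (not the project's actual code):
import numpy as np

def cosine_kernel(vec_a, vec_b):
    # Cosine similarity between two document vectors; 0.0 for zero vectors.
    denom = np.linalg.norm(vec_a) * np.linalg.norm(vec_b)
    return float(np.dot(vec_a, vec_b) / denom) if denom else 0.0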
# def load_instances(self, train_instances):
#     """
#     Extract cosine distance from an already-trained feature file
#     without modifying the feature_file.
#     This function takes priority over extract_instances above.
#     """
#     _features, _n_dim, _n_instance = Feature.load_feature_from_file(self.feature_file)
#     features = []
#     infos = []
#     ''' get features from train instances '''
#     for _feature in _features:
#         feature = Feature._feat_string_to_list(_feature, _n_dim)
#         features.append([feature[1]])
#         infos.append(['cosine'])
#
#     features = [Feature._feat_list_to_string(feature) for feature in features]
#
#     return features, 1, _n_instance
def __init__(self, df, columns, model_param):
    super().__init__(df, columns, model_param)
    self.model = Doc2Vec(dm=self.model_param["dm"],
                         hs=self.model_param["hs"],
                         alpha=self.model_param["alpha"],
                         min_alpha=self.model_param["alpha"],
                         min_count=self.model_param["min_count"],
                         size=self.model_param["size"],
                         sample=self.model_param["sample"],
                         window=self.model_param["window"],
                         workers=self.model_param["workers"])
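# The constructor reads everything from model_param; a dictionary like the
# following would satisfy it (values are illustrative, mirroring the config
# keys used in the Word2Vec snippet above):
model_param = {
    "dm": 1,          # PV-DM training mode (0 for PV-DBOW)
    "hs": 1,          # hierarchical softmax
    "alpha": 0.025,   # initial learning rate; min_alpha is pinned to it above
    "min_count": 3,
    "size": 100,      # embedding dimensionality
    "sample": 0.001,  # downsampling threshold for frequent words
    "window": 8,
    "workers": 4,
}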
import multiprocessing

from gensim.models.doc2vec import Doc2Vec
from gensim.models.word2vec import FAST_VERSION

def train(input_jlgz, *, size, limit, min_df, max_features):
    print('FAST_VERSION', FAST_VERSION)  # > -1 means the C extension is active
    documents = Documents(input_jlgz, limit=limit)
    model = Doc2Vec(
        documents=documents,
        size=size,
        min_count=min_df,
        max_vocab_size=max_features,
        workers=multiprocessing.cpu_count(),
        sample=1e-5,
    )
    return model
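# Documents is defined elsewhere; judging from the input_jlgz name, a
# plausible (hypothetical) implementation streams TaggedDocuments from a
# gzipped JSON-lines file with a 'tokens' field per record:
import gzip
import json
from gensim.models.doc2vec import TaggedDocument

class Documents:
    def __init__(self, path, limit=None):
        self.path = path
        self.limit = limit

    def __iter__(self):
        with gzip.open(self.path, 'rt', encoding='utf-8') as f:
            for i, line in enumerate(f):
                if self.limit is not None and i >= self.limit:
                    break
                record = json.loads(line)
                # assumes each JSON record carries a pre-tokenized 'tokens' list
                yield TaggedDocument(record['tokens'], [i])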