def run():
    '''
    Train a Word2Vec model on the prepared corpus, save the model and the
    plain-text vectors, and print the most similar words for a few test terms.
    '''
    reload(sys)                      # Python 2 only: required before setdefaultencoding
    sys.setdefaultencoding('utf8')
    program = os.path.basename(sys.argv[0])
    logger = logging.getLogger(program)
    logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')
    logging.root.setLevel(level=logging.INFO)
    logger.info("running %s" % ' '.join(sys.argv))

    outp1 = r'wiki_model'
    outp2 = r'vector.txt'
    # `sentences` is assumed to be defined elsewhere as an iterable of
    # tokenized sentences (e.g. a LineSentence over the segmented corpus).
    model = Word2Vec(sentences, size=400, window=5, min_count=5,
                     workers=multiprocessing.cpu_count())
    model.save(outp1)
    model.wv.save_word2vec_format(outp2, binary=False)

    testData = ['??', '??', '??', '??']   # test words (original Chinese characters lost in encoding)
    for i in testData:
        temp = model.most_similar(i)
        for j in temp:
            print '%f %s' % (j[1], j[0])
        print ''
Python Word2Vec() usage examples (source code)
def trainWord2Vector(sentence_count, vector_dimension, train_count):
    lines, model_out, vector_out = "sources/splited_words.txt", "result/word2vec.model", "result/pre_word2vec.vector"
    logging.info("??????")   # original Chinese log message lost in encoding
    sentences = LineSentence(lines)
    # min_count: with min_count=3, words occurring fewer than 3 times would be dropped
    # and not written to word2vec.vector (here min_count=0 keeps every word)
    # workers: number of training threads, set to the CPU core count (gensim default is 3)
    # sg=1 selects the skip-gram training algorithm
    model = Word2Vec(sentences, sg=1, size=vector_dimension, window=8,
                     min_count=0, workers=multiprocessing.cpu_count())
    # run extra training passes over the corpus
    for i in range(train_count):
        model.train(sentences=sentences, total_examples=sentence_count, epochs=model.iter)
    # trim unneeded model memory = use (much) less RAM
    # model.init_sims(replace=True)
    model.save(model_out)
    model.wv.save_word2vec_format(vector_out)
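A minimal call sketch for the helper above; the argument values are illustrative assumptions, and `sentence_count` should match the number of lines in the hard-coded corpus file.

# hypothetical invocation (values are assumptions, not from the original project)
trainWord2Vector(sentence_count=50000, vector_dimension=200, train_count=2)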
def trainWord2Vector(sentence_count, vector_dimension, train_count):
    lines, model_out, vector_out = "com/com/test1/test1sources/splited_words.txt", \
                                   "com/com/test1/test1sources/word2vec.model", \
                                   "com/com/test1/test1sources/word2vec.vector"
    logging.info("??????")   # original Chinese log message lost in encoding
    sentences = LineSentence(lines)
    # min_count: with min_count=3, words occurring fewer than 3 times would be dropped
    # and not written to word2vec.vector (here min_count=0 keeps every word)
    # workers: number of training threads, set to the CPU core count (gensim default is 3)
    model = Word2Vec(sentences, sg=1, size=vector_dimension, window=8,
                     min_count=0, workers=multiprocessing.cpu_count())
    # run extra training passes over the corpus
    for i in range(train_count):
        model.train(sentences=sentences, total_examples=sentence_count, epochs=model.iter)
    # trim unneeded model memory = use (much) less RAM
    # model.init_sims(replace=True)
    model.save(model_out)
    model.wv.save_word2vec_format(vector_out)
def uptrain(corpus,
            model_path=None,
            binary=True,
            lockf=0.0,
            min_count=1,
            size=300,
            **word2vec_params):
    wv = Word2Vec(min_count=min_count, size=size, **word2vec_params)
    print("Building vocabulary...")
    wv.build_vocab(corpus)
    print("Found %d distinct words." % len(wv.index2word))
    if model_path is not None:
        print("Intersecting with", model_path, "...")
        wv.intersect_word2vec_format(model_path, binary=binary, lockf=lockf)
        print("Intersected vectors locked with", lockf)
    total_examples = len(corpus)
    print("Training on %d documents..." % total_examples)
    wv.train(corpus, total_examples=total_examples)
    return wv
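A hedged sketch of how `uptrain` might be used to continue training on top of published vectors; the corpus and the vectors file name below are placeholders, not values from the original project.

toy_corpus = [['graph', 'embedding', 'models'], ['word', 'embedding', 'models']] * 50
# `size` must match the dimensionality of the pre-trained file;
# lockf=1.0 lets the intersected pre-trained vectors keep updating during training
model = uptrain(toy_corpus,
                model_path='pretrained-vectors.bin',   # placeholder path
                binary=True, lockf=1.0, min_count=1, size=300)
print(model.most_similar('embedding'))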
def create(basedir, num_workers=12, size=320, threshold=5):
    """
    Creates a word2vec model using the Gensim word2vec implementation.
    :param basedir: the dir from which to get the documents.
    :param num_workers: the number of workers to use for training word2vec
    :param size: the size of the resulting vectors.
    :param threshold: the frequency threshold.
    :return: the model.
    """
    logging.basicConfig(level=logging.INFO)
    sentences = SentenceIter(root=basedir)
    model = Word2Vec(sentences=sentences,
                     sg=True,
                     size=size,
                     workers=num_workers,
                     min_count=threshold,
                     window=11,
                     negative=15)
    # NOTE: the "{0}-{1}" placeholders below are never filled in this snippet
    model.save_word2vec_format("{0}-{1}.wordvecs", "{0}-{1}.vocab")
    return model
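`SentenceIter` is not shown in this snippet; it is assumed to stream tokenized sentences from files under `root`. A minimal stand-in with that contract could look like this (an assumption, not the project's actual class):

import os

class SentenceIter(object):
    """Yield one whitespace-tokenized sentence per line of every file under `root`."""
    def __init__(self, root):
        self.root = root
    def __iter__(self):
        for name in os.listdir(self.root):
            with open(os.path.join(self.root, name)) as handle:
                for line in handle:
                    yield line.split()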
def train_save(self, list_csv):
    sentences = MySentences(list_csv)
    num_features = 256
    min_word_count = 1
    num_workers = 20
    context = 5
    epoch = 20
    sample = 1e-5
    model = Word2Vec(
        sentences,
        size=num_features,
        min_count=min_word_count,
        workers=num_workers,
        sample=sample,
        window=context,
        iter=epoch,
    )
    #model.save(model_fn)
    return model
def main(positive, negative, topn):
    """Train a word2vec model on tags.txt and return the most similar tags.
    Args:
        positive (list): list of positive tags
        negative (list): list of negative tags
        topn (int): number of top keywords to return
    Returns:
        list: the topn most similar (tag, similarity) pairs
    """
    with open('tags.txt') as f:
        content = f.readlines()
    sentences = [x.split() for x in content]
    model = Word2Vec(sentences, min_count=20)
    return model.most_similar(positive=positive, negative=negative, topn=topn)
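An illustrative call; the tag names are placeholders, and `tags.txt` is expected to hold whitespace-separated tags, one document per line, as the function assumes.

similar_tags = main(positive=['python', 'web'], negative=['java'], topn=5)
for tag, score in similar_tags:
    print('%s\t%.3f' % (tag, score))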
def fit_embeddings(self, documents):
    """
    Train the word embeddings of the classification model, reusing the same parameter values the classifier passes to Gensim ``Word2Vec``.
    Similar to using a pre-trained model.
    :param documents:
    """
    params = self.get_params()
    del params['pre_trained']
    del params['bucket']
    # Word2Vec has no plain softmax output, so fall back to hierarchical softmax
    if params['loss'] == 'softmax':
        params['loss'] = 'hs'
    LabeledWord2Vec.init_loss(LabeledWord2Vec(), params, params['loss'])
    del params['loss']
    w2v = Word2Vec(sentences=documents, **params)
    self._classifier = LabeledWord2Vec.load_from(w2v)
def learn_embeddings(self, output):
    """
    Learn embeddings by optimizing the Skipgram objective using SGD.
    """
    self._simulate_walks()  # simulate random walks
    model = Word2Vec(self._walks, size=self.dimensions, window=self.window_size, min_count=0,
                     workers=self.workers, iter=self.iter, negative=25, sg=1)
    print("defined model using w2v")
    model.wv.save_word2vec_format(output, binary=True)
    print("saved model in word2vec binary format")
    return
def training_word2vec():
    sentences = []
    read_dir_path = os.path.join(defaultPath.PROJECT_DIRECTORY, sogou_classfication.data_path_jieba)
    label_dir_list = os.listdir(read_dir_path)
    for label_dir in label_dir_list:
        label_dir_path = os.path.join(read_dir_path, label_dir)
        label_file_list = os.listdir(label_dir_path)
        for label_file in label_file_list:
            with open(os.path.join(label_dir_path, label_file), 'rb') as reader:
                word_list = reader.read().decode('utf-8').replace('\n', '').replace('\r', '').strip()
                # Word2Vec expects a list of tokens per sentence; the jieba output is
                # assumed to be space-separated, so split each document into tokens here
                sentences.append(word_list.split())
    model_path = os.path.join(defaultPath.PROJECT_DIRECTORY, sogou_classfication.word2Vect_path)
    if not os.path.exists(model_path):
        os.makedirs(model_path)
    model_save_path = os.path.join(model_path, sogou_classfication.model_name)
    model = Word2Vec(sentences, max_vocab_size=None, window=8, size=256, min_count=5, workers=4, iter=20)
    model.save(model_save_path)
def load_save_word2vec_model(line_words, model_filename):
    # training parameters
    feature_size = 500
    content_window = 5
    freq_min_count = 3
    # threads_num = 4
    negative = 3   # negative>0 enables negative sampling (said to suit frequent words); 0 would use hierarchical softmax (said to suit rare words)
    iter = 20
    print("word2vec...")
    tic = time.time()
    if os.path.isfile(model_filename):
        model = models.Word2Vec.load(model_filename)
        print(model.vocab)
        print("Loaded word2vec model")
    else:
        bigram_transformer = models.Phrases(line_words)
        model = models.Word2Vec(bigram_transformer[line_words], size=feature_size, window=content_window,
                                iter=iter, min_count=freq_min_count, negative=negative,
                                workers=multiprocessing.cpu_count())
        toc = time.time()
        print("Word2vec completed! Elapsed time is %s." % (toc - tic))
        model.save(model_filename)
        # model.save_word2vec_format(save_model2, binary=False)
        print("Word2vec Saved!")
    return model
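A small sketch of the bigram step used above: gensim's `Phrases` joins frequently co-occurring tokens with an underscore before the vectors are trained. The corpus and file name below are illustrative placeholders.

from gensim import models

line_words = [['new', 'york', 'is', 'big'], ['new', 'york', 'never', 'sleeps']] * 20
bigram = models.Phrases(line_words, min_count=1, threshold=1)
print(bigram[line_words[0]])   # e.g. ['new_york', 'is', 'big'] once the pair is frequent enough
model = load_save_word2vec_model(line_words, 'word2vec_demo.model')   # placeholder file name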
def learn_embeddings(self, output, output_format='binary'):
    """
    Learn embeddings by optimizing the Skipgram objective using SGD.
    """
    self._simulate_walks()  # simulate random walks
    model = Word2Vec(self._walks, size=self.dimensions, window=self.window_size, min_count=0,
                     workers=self.workers, iter=self.iter, negative=25, sg=1)
    print("defined model using w2v")
    is_binary = output_format != 'text'
    model.wv.save_word2vec_format(output, binary=is_binary)
    actual_format = 'text' if output_format == 'text' else 'binary'
    print("saved model in word2vec %s format" % actual_format)
    return
def main(lang, in_dir, out_loc, negative=5, n_workers=4, window=5, size=128, min_count=10, nr_iter=2):
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
    model = Word2Vec(
        size=size,
        window=window,
        min_count=min_count,
        workers=n_workers,
        sample=1e-5,
        negative=negative
    )
    nlp = spacy.load(lang, parser=False, tagger=False, entity=False)
    corpus = Corpus(in_dir)
    total_words = 0
    total_sents = 0
    for text_no, text_loc in enumerate(iter_dir(corpus.directory)):
        with io.open(text_loc, 'r', encoding='utf8') as file_:
            text = file_.read()
        total_sents += text.count('\n')
        doc = nlp(text)
        total_words += corpus.count_doc(doc)
        logger.info("PROGRESS: at batch #%i, processed %i words, keeping %i word types",
                    text_no, total_words, len(corpus.strings))
    model.corpus_count = total_sents
    model.raw_vocab = defaultdict(int)
    for orth, freq in corpus.counts:
        if freq >= min_count:
            model.raw_vocab[nlp.vocab.strings[orth]] = freq
    model.scale_vocab()
    model.finalize_vocab()
    model.iter = nr_iter
    model.train(corpus)
    model.save(out_loc)
def gen_embeddings(in_file, out_file, size=100):
    corpus = LineSentence(in_file)
    model = Word2Vec(
        sentences=corpus, size=size, alpha=0.025, window=5, min_count=5,
        max_vocab_size=None, sample=1e-3, seed=1, workers=3, min_alpha=0.0001,
        sg=0, hs=0, negative=5, cbow_mean=1, hashfxn=hash, iter=5, null_word=0,
        trim_rule=None, sorted_vocab=1
    )
    model.save_word2vec_format(out_file, binary=False)
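The text-format vectors written above can be reloaded without retraining; a hedged sketch with placeholder file names:

from gensim.models import KeyedVectors

gen_embeddings('corpus.txt', 'embeddings.txt', size=100)   # placeholder paths
vectors = KeyedVectors.load_word2vec_format('embeddings.txt', binary=False)
print(len(vectors.vocab), 'words of dimension', vectors.vector_size)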
def fit(self, tokens):
    # get most frequent items for plotting:
    tokens = [t.lower() for t in tokens]
    self.mfi = [t for t, _ in Counter(tokens).most_common(self.nb_mfi)]
    self.sentence_iterator = SentenceIterator(tokens=tokens)
    # train embeddings:
    self.w2v_model = Word2Vec(self.sentence_iterator,
                              window=self.window,
                              min_count=self.minimum_count,
                              size=self.size,
                              workers=self.nb_workers,
                              negative=self.nb_negative)
    self.plot_mfi()
    self.most_similar()
    # build an index of the train tokens
    # which occur at least min_count times:
    self.token_idx = {'<UNK>': 0}
    for k, v in Counter(tokens).items():
        if v >= self.minimum_count:
            self.token_idx[k] = len(self.token_idx)
    # create an ordered vocab:
    self.train_token_vocab = [k for k, v in sorted(self.token_idx.items(),
                                                   key=itemgetter(1))]
    self.pretrained_embeddings = self.get_weights(self.train_token_vocab)
    return self
def zhword2vec(ifname, fmodel):
    '''Train the word2vec model.
    more: http://radimrehurek.com/gensim/models/word2vec.html
    '''
    model = Word2Vec(LineSentence(ifname), size=400, window=5,
                     min_count=2, workers=multiprocessing.cpu_count(), negative=5)
    model.save(fmodel)
    # model.save_word2vec_format(fword2vec, binary=False)
def train_model(self, ofmodel, space=' '):
    if self.traincorpusfname == None or not os.path.exists(self.traincorpusfname):
        ifname = self.__pretrain_model(space)
    else:
        ifname = self.traincorpusfname
    self.logger.info('+++++++++++++++Train Model Start+++++++++++++++++\n')
    #
    # Calling the Gensim 3rd-party lib to train the word2vec model
    # more: http://radimrehurek.com/gensim/models/word2vec.html
    model = Word2Vec(LineSentence(ifname), size=400, window=5,
                     min_count=2, workers=multiprocessing.cpu_count(), negative=5)
    self.logger.info('+++++++++++++++Train Model Finished+++++++++++++++++\n')
    model.save(ofmodel)
    return (model, ofmodel)
# if __name__=='__main__':
# if len(sys.argv) < 3:
# print(globals()['__doc__'] %locals())
# sys.exit(1)
# inp, outp =sys.argv[1:3]
# #inp = '../../data/zhwiki-latest-pages-articles.xml.bz2','r'
# #outp = '../../model/word2vec.model'
# wiki = tWikiCorpus(inp, _lemmatize=False, _dictionary={})
# print 'wiki'
# wiki.getTexts(outp, space=' ')
def train_model(self, ofmodel, space=' '):
    if self.traincorpusfname == None or not os.path.exists(self.traincorpusfname):
        ifname = self.pretrain_model(space)
    else:
        ifname = self.traincorpusfname
    self.logger.info('+++++++++++++++Train Model Start+++++++++++++++++\n')
    #
    # Calling the Gensim 3rd-party lib to train the word2vec model
    # more: http://radimrehurek.com/gensim/models/word2vec.html
    model = Word2Vec(LineSentence(ifname), size=400, window=5,
                     min_count=2, workers=multiprocessing.cpu_count(), negative=5)
    self.logger.info('+++++++++++++++Train Model Finished+++++++++++++++++\n')
    model.save(ofmodel)
    return (model, ofmodel)
def train_word_2_vec(self, model_save_file_name='../../temp_results/word2vec_hindi.txt'):
    model = Word2Vec(LineSentence(self.raw_file_name), size=300, workers=multiprocessing.cpu_count())
    model.wv.save_word2vec_format(model_save_file_name, binary=False)
def train_and_save(sents, output_file, options={}):
    print "Training model..."
    model = Word2Vec(sents, **options)
    model.save(output_file)
def __init__(self, loss='softmax', bucket=0, **kwargs):
    """
    Exactly as the parent class `Word2Vec <https://radimrehurek.com/gensim/models/word2vec.html#gensim.models.word2vec.Word2Vec>`_.
    Some parameter values are overwritten (e.g. sg=0 because we never use skip-gram here); look at the code for details.
    Argument names must be explicit!
    `loss` is one value in {ns, hs, softmax}. With "ns", negative sampling is used as the loss function,
    together with the parameter `negative`. With "hs", hierarchical softmax is used, while "softmax"
    (the default) uses the standard softmax function (the other two are approximations of it).
    The `hs` argument does not exist anymore.
    `bucket` is the maximum number of hashed words, i.e., we limit the feature space to this number,
    ergo we use the hashing trick on the word vocabulary. Defaults to 0: no hashing trick.
    It basically builds two vocabularies, one for the sample words and one for the labels,
    so that the input layer is only made of words, while the output layer is only made of labels.
    **Parent class methods that are not overridden here are not tested and not safe to use.**
    """
    self.lvocab = {}  # Vocabulary of labels only
    self.index2label = []
    kwargs['sg'] = 0
    kwargs['window'] = sys.maxsize
    kwargs['sentences'] = None
    kwargs['hashfxn'] = custom_hash  # Force a consistent function across different Python versions
    self.softmax = self.init_loss(kwargs, loss)
    self.bucket = bucket
    super(LabeledWord2Vec, self).__init__(**kwargs)
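Following the docstring above, a hedged construction sketch (keyword arguments only, as required; the parameter values are illustrative assumptions):

# 'ns' selects negative sampling with 5 noise words; bucket=0 keeps the hashing trick disabled
clf_model = LabeledWord2Vec(loss='ns', negative=5, size=200, min_count=2, bucket=0)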
def train(self, sentences, total_words=None, word_count=0,
          total_examples=None, queue_factor=2, report_delay=1.0):
    """
    Update the model's neural weights from a sequence of sentences (can be a once-only generator stream).
    For Word2Vec, each sentence must be a list of unicode strings. (Subclasses may accept other examples.)
    To support linear learning-rate decay from (initial) alpha to min_alpha, either total_examples
    (count of sentences) or total_words (count of raw words in sentences) should be provided, unless the
    sentences are the same as those that were used to initially build the vocabulary.
    """
    if self.bucket > 0:
        sentences = HashIter(sentences, self.bucket, with_labels=True)
    if self.model_trimmed_post_training:
        raise RuntimeError("Parameters for training were discarded using model_trimmed_post_training method")
    if FAST_VERSION < 0:
        import warnings
        warnings.warn("C extension not loaded for Word2Vec, training will be slow. "
                      "Install a C compiler and reinstall gensim for fast training.")
    self.neg_labels = []
    if self.negative > 0:
        # precompute negative labels optimization for pure-python training
        self.neg_labels = zeros(self.negative + 1)
        self.neg_labels[0] = 1.
    return super(LabeledWord2Vec, self).train(sentences, total_words, word_count,
                                              total_examples, queue_factor, report_delay)
def load_from(cls, other_model):
    """
    Import data and parameter values from another model.
    :param other_model: A ``LabeledWord2Vec`` object, or a ``Word2Vec`` or ``KeyedVectors`` object of Gensim
    """
    softmax = getattr(other_model, 'softmax', False)
    if softmax:
        loss = 'softmax'
    elif not other_model.hs and other_model.negative:
        loss = 'ns'
    else:
        loss = 'hs'
    new_model = LabeledWord2Vec(
        loss=loss,
        negative=other_model.negative if loss == 'ns' else 0,
        size=other_model.vector_size,
        seed=other_model.seed
    )
    new_model.reset_from(other_model)
    for attr in vars(other_model):
        if hasattr(new_model, attr):
            if not isinstance(other_model, LabeledWord2Vec) and (attr == 'syn1' or attr == 'syn1neg'):
                continue
            value = getattr(other_model, attr, getattr(new_model, attr))
            if isinstance(value, KeyedVectors):
                new_model.wv.syn0 = value.syn0
                new_model.wv.syn0norm = value.syn0norm
            else:
                setattr(new_model, attr, value)
    return new_model
def load_w2v(corpus, dictionary):
    '''
    Return the trained Word2Vec model.
    Train a model first if it doesn't exist yet.
    :param corpus:
    :param dictionary:
    :return:
    '''
    if not os.path.isfile(W2V_MODEL_PATH):
        num_features = 300    # Word vector dimensionality
        min_word_count = 5    # Minimum word count
        num_workers = 5       # Number of threads to run in parallel
        window = 5            # Context window size
        downsampling = 1e-5   # Downsample setting for frequent words
        print("Training the word2vec model!")
        sents = get_review_sentences()
        # Initialize and train the model (this will take some time)
        model = models.Word2Vec(sents, workers=num_workers,
                                size=num_features, min_count=min_word_count,
                                window=window, sample=downsampling)
        # If you don't plan to train the model any further, calling
        # init_sims will make the model much more memory-efficient.
        model.init_sims(replace=True)
        # It can be helpful to create a meaningful model name and
        # save the model for later use. You can load it later using Word2Vec.load()
        model.save(W2V_MODEL_PATH)
        tfidf = models.Word2Vec(corpus)   # note: this extra model is built but never used
        print('Word2vec model created!')
    print('Loading word2vec model')
    w2v = models.Word2Vec.load(W2V_MODEL_PATH)
    print('Loading word2vec model completed!')
    return w2v
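A usage sketch for the cache-or-train helper above; `W2V_MODEL_PATH`, `get_review_sentences`, the corpus and the dictionary are assumed to be defined elsewhere in the original module, and the query word below is illustrative.

w2v_model = load_w2v(corpus, dictionary)
print(w2v_model.wv.most_similar('good', topn=3))   # 'good' is a placeholder query word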
Source file: frequent_pattern Item.py — project: Recommendation-based-on-sequence- — author: Bereket123
def main():
    load_sequence('/home/beki/Documents/2nd Year/BD & DM Project/retail_dataset.csv')
    # split patterns into train_patterns and test_patterns
    train_patterns = np.random.choice(patterns, int(np.floor(len(patterns) * 0.8)))
    test_patterns = np.random.choice(patterns, int(np.floor(len(patterns) * 0.2)))
    # Word vector representation learning
    model = Word2Vec(train_patterns, size=15, window=3, min_count=1, workers=1, iter=3, sample=1e-4, negative=20)
    # Test
    test_size = float(len(test_patterns))
    hit = 0.0
    for current_pattern in test_patterns:
        if len(current_pattern) < 2:
            test_size -= 1.0
            continue
        # Reduce the current pattern in the test set by removing the last item
        last_item = current_pattern.pop()
        # Keep those items in the reduced current pattern that are also in the model's vocabulary
        items = [it for it in current_pattern if it in model.vocab]
        if len(items) <= 2:
            test_size -= 1.0
            continue
        # Predict the most similar items to `items`
        prediction = model.most_similar(positive=items)
        # Check if the item that we removed from the test pattern, last_item, is among
        # the predicted ones.
        for predicted_item, score in prediction:
            if predicted_item == last_item:
                hit += 1.0
        #print last_item
        #print prediction
    print 'Accuracy-like measure: {}'.format(hit / test_size)
def learn_embeddings():
    '''
    Learn embeddings by optimizing the Skipgram objective using SGD.
    '''
    logging.info("Initializing creation of the representations...")
    walks = LineSentence('random_walks.txt')
    model = Word2Vec(walks, size=args.dimensions, window=args.window_size, min_count=0, hs=1, sg=1,
                     workers=args.workers, iter=args.iter)
    model.wv.save_word2vec_format(args.output)
    logging.info("Representations created.")
    return
def make_word2vec():
    data_path = tv_classfication.tv_data_path
    sentence = data_work(data_path)
    model = Word2Vec(sentence, size=256, workers=4, window=10, iter=30)
    model.save(tv_classfication.word2vec_path)
Source file: __main__.py — project: GraphEmbeddingsRecommenderSystems — author: himangshunits
def process(args):
    # Create a graph from the training set
    nodedict = graph.records_to_graph()
    # Build the model using DeepWalk and Word2Vec
    G = graph.load_adjacencylist("out.adj", undirected=True)
    ###########################################################################
    # Code Written for BI Project : Author : Himangshu Ranjan Borah(hborah)
    ###########################################################################
    # Call the build_deepwalk_corpus function.
    # Take and populate the arguments from the command line.
    generated_walks = graph.build_deepwalk_corpus(G=G, num_paths=args.number_walks,
                                                  path_length=args.walk_length, alpha=0, rand=random.Random(0))
    # Call word2vec to build the model.
    # print generated_walks
    # The structure looks like ['32173', '32168'], ['124010', '22676'], ['17792', '72925'], ...
    model = Word2Vec(generated_walks, size=args.representation_size, window=args.window_size, min_count=0, workers=args.workers)
    ###########################################################################
    # Code Written for BI Project : Author : Himangshu Ranjan Borah(hborah)
    ###########################################################################
    # Perform some evaluation of the model on the test dataset
    with open("./data/test_user_ratings.dat") as fin:
        fin.next()
        groundtruth = [line.strip().split("\t")[:3] for line in fin]  # (user, movie, rating)
    tr = [int(round(float(g[2]))) for g in groundtruth]
    pr = [predict_rating(model, nodedict, "u" + g[0], "m" + g[1]) for g in groundtruth]
    print "MSE = %f" % mean_squared_error(tr, pr)
    print "accuracy = %f" % accuracy_score(tr, pr)
    cm = confusion_matrix(tr, pr, labels=range(1, 6))
    print cm
def word2vec_train(input_file, output_file):
    sentences = word2vec.LineSentence(input_file)
    model = Word2Vec(sentences, size=300, min_count=10, sg=0, workers=multiprocessing.cpu_count())
    model.save(output_file)
    model.save_word2vec_format(output_file + '.vector', binary=True)
def train():
    extract_sentece()
    in_path = './Data/corpus/sentence.txt'
    out_path = './Data/embedding/word2vec.bin'
    # train the word2vec model
    model = Word2Vec(
        sg=1, sentences=LineSentence(in_path),
        size=256, window=5, min_count=3, workers=4, iter=40)
    model.wv.save_word2vec_format(out_path, binary=True)