def train_word2vec_model(df, columns):
    model_param = {
        "alpha": config.EMBEDDING_ALPHA,
        "learning_rate_decay": config.EMBEDDING_LEARNING_RATE_DECAY,
        "n_epoch": config.EMBEDDING_N_EPOCH,
        "sg": 1,
        "hs": 1,
        "min_count": config.EMBEDDING_MIN_COUNT,
        "size": config.EMBEDDING_DIM,
        "sample": 0.001,
        "window": config.EMBEDDING_WINDOW,
        "workers": config.EMBEDDING_WORKERS,
    }
    model_dir = config.WORD2VEC_MODEL_DIR
    model_name = "Homedepot-word2vec-D%d-min_count%d.model" % (
        model_param["size"], model_param["min_count"])
    word2vec = DataFrameWord2Vec(df, columns, model_param)
    word2vec.train()
    word2vec.save(model_dir, model_name)
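# A minimal sketch (not from the original repo) of loading and querying the
# saved model afterwards, assuming DataFrameWord2Vec persists a plain gensim
# Word2Vec model under model_dir/model_name; the file name and query word
# below are illustrative.
import os
from gensim.models import Word2Vec

model_path = os.path.join("models/word2vec",
                          "Homedepot-word2vec-D100-min_count3.model")
model = Word2Vec.load(model_path)
print(model.wv.most_similar("drill", topn=5))  # nearest words by cosine similarity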
#---------------------- Doc2Vec ----------------------
Example source code using the Python class Doc2Vec()
import multiprocessing

from gensim.models.doc2vec import Doc2Vec, TaggedLineDocument

def trainDoc2Vector(sentence_count, vector_dimension):
    # Train the model on the pre-split corpus and save it.
    # Note: passing `sentences` to the constructor already runs one training
    # pass; the explicit train() call below continues training (legacy API).
    sentences = TaggedLineDocument('sources/splited_words.txt')
    model = Doc2Vec(sentences, size=vector_dimension, window=8, min_count=2,
                    workers=multiprocessing.cpu_count())
    model.train(sentences, total_examples=sentence_count, epochs=model.iter)
    model.save('result/doc2vec.model')

    # Save one document vector per line, space-separated.
    with open('result/doc2vec.vector', mode='w', encoding='utf-8') as out:
        for index in range(sentence_count):
            docvec = model.docvecs[index]
            out.write(' '.join(str(f) for f in docvec) + "\n")
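# A hedged follow-up sketch (standard gensim API; the token list is
# illustrative): reload the saved model and infer a vector for an unseen,
# pre-tokenized document.
from gensim.models import Doc2Vec

model = Doc2Vec.load('result/doc2vec.model')
vector = model.infer_vector(['some', 'new', 'tokens'])
# Training documents most similar to the inferred vector (tags are line indices).
print(model.docvecs.most_similar([vector], topn=3))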
def __init__(self,
             analyzer=None, matching=None,
             name=None,
             verbose=0,
             n_epochs=10,
             alpha=0.25,
             min_alpha=0.05,
             n_jobs=4,
             **kwargs):
    self.alpha = alpha
    self.min_alpha = min_alpha
    self.verbose = verbose
    self.name = "paragraph-vectors" if name is None else name
    if matching is True:
        self._matching = Matching()
    elif matching is False or matching is None:
        self._matching = None
    else:
        self._matching = Matching(**dict(matching))
    self.analyzer = analyzer
    self.model = Doc2Vec(alpha=alpha,
                         min_alpha=alpha,  # decay is applied manually in fit()
                         size=500,
                         window=8,
                         min_count=1,
                         sample=1e-5,
                         workers=n_jobs,
                         negative=20,
                         dm=0, dbow_words=1,  # DBOW, with word vectors trained alongside
                         dm_mean=0,  # unused when in concat mode
                         dm_concat=1,
                         dm_tag_count=1
                         )
    self.n_epochs = n_epochs
    self._neighbors = NearestNeighbors(**kwargs)
def fit(self, docs, y):
    assert len(docs) == len(y)
    model = self.model
    n_epochs = self.n_epochs
    verbose = self.verbose
    decay = (self.alpha - self.min_alpha) / n_epochs
    X = [TaggedDocument(self.analyzer(doc), [label])
         for doc, label in zip(docs, y)]
    if verbose > 0:
        print("First 3 tagged documents:\n", X[:3])
        print("Training doc2vec model")
    # d2v = Doc2Vec()
    # d2v.build_vocab(X)
    # if self.intersect is not None:
    #     d2v.intersect_word2vec_format(self.intersect)
    model.build_vocab(X)
    for epoch in range(n_epochs):
        if verbose:
            print("Doc2Vec: Epoch {} of {}.".format(epoch + 1, n_epochs))
        model.train(X)  # legacy gensim train(): no total_examples/epochs needed
        model.alpha -= decay  # apply global decay
        model.min_alpha = model.alpha  # but no decay inside one epoch
    if verbose > 0:
        print("Finished.")
        print("model:", self.model)
    if self._matching:
        self._matching.fit(docs)
    else:
        # If we don't do matching, it's enough to fit a nearest-neighbors
        # index on all centroids before query time.
        dvs = np.asarray([model.docvecs[tag] for tag in y])
        self._neighbors.fit(dvs)
    self._y = y
    return self
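# Note (not from the original code): gensim >= 1.0 requires total_examples
# and epochs in train(); a hedged sketch of the same manual-decay loop under
# that API, given a fresh Doc2Vec model and a TaggedDocument list X as above:
def fit_with_manual_decay(model, X, alpha=0.25, min_alpha=0.05, n_epochs=10):
    model.build_vocab(X)
    decay = (alpha - min_alpha) / n_epochs
    for epoch in range(n_epochs):
        model.train(X, total_examples=model.corpus_count, epochs=1)
        model.alpha -= decay           # same global decay as above
        model.min_alpha = model.alpha  # no decay within an epoch
    return model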
from gensim.models import Doc2Vec

def train_and_save_doc2vec(docs, output_file, options=None):
    # `options=None` avoids the mutable-default-argument pitfall.
    print("Training model...")
    model = Doc2Vec(docs, **(options or {}))
    model.save(output_file)
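# Illustrative usage (option values are examples only; the legacy `size`
# keyword matches the other snippets on this page):
from gensim.models.doc2vec import TaggedDocument

docs = [TaggedDocument(['first', 'document'], [0]),
        TaggedDocument(['second', 'document'], [1])]
train_and_save_doc2vec(docs, 'doc2vec.model',
                       options={'size': 100, 'window': 8, 'min_count': 1})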
def trainingNet(self, window, nDimension):
    self.nDimension = nDimension
    sentences = LabeledLineSentence(self.corpus)
    self.model = Doc2Vec(min_count=1, window=window, size=nDimension,
                         sample=1e-4, negative=5, workers=4)
    corpus = sentences.to_array()
    self.model.build_vocab(corpus)
    for epoch in range(10):
        # Re-shuffle the sentences each epoch (legacy gensim train() API).
        self.model.train(sentences.sentences_perm())
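# LabeledLineSentence is not shown here; a plausible minimal implementation
# (hypothetical, matching only the to_array()/sentences_perm() interface used
# above) might look like this:
import random
from gensim.models.doc2vec import TaggedDocument

class LabeledLineSentence:
    def __init__(self, corpus):
        self.corpus = corpus  # iterable of pre-tokenized documents
        self.sentences = []

    def to_array(self):
        self.sentences = [TaggedDocument(words, ['SENT_%d' % i])
                          for i, words in enumerate(self.corpus)]
        return self.sentences

    def sentences_perm(self):
        # Re-shuffle between epochs so the model never sees a fixed order.
        random.shuffle(self.sentences)
        return self.sentences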
def extract_instances(self, train_instances):
    sentences = []
    for idx, train_instance in enumerate(train_instances):
        sa, sb = train_instance.get_word(type='lemma', lower=True)
        sentences.append(TaggedDocument(words=sa, tags=['sa_%d' % idx]))
        sentences.append(TaggedDocument(words=sb, tags=['sb_%d' % idx]))
    model = Doc2Vec(sentences, size=25, window=3, min_count=0,
                    workers=10, iter=1000)
    features = []
    infos = []
    for idx in range(len(train_instances)):
        vec_a = model.docvecs['sa_%d' % idx]
        vec_b = model.docvecs['sb_%d' % idx]
        feature, info = vk.get_all_kernel(vec_a, vec_b)
        features.append(feature)
        infos.append([])
        # infos.append([vec_a, vec_b])
    return features, infos
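# vk.get_all_kernel() is an external helper; as an illustration of the kind
# of vector-pair feature it might compute, here is a hypothetical stand-in
# returning a single cosine-similarity kernel (not the project's actual code):
import numpy as np

def cosine_kernel(vec_a, vec_b):
    # Cosine similarity between two document vectors; 0.0 for zero vectors.
    denom = np.linalg.norm(vec_a) * np.linalg.norm(vec_b)
    return float(np.dot(vec_a, vec_b) / denom) if denom else 0.0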
# def load_instances(self, train_instances):
#     """
#     Extract cosine distance from an already-trained feature file
#     without modifying the feature_file.
#     This function takes priority over extract_instances above.
#     """
#     _features, _n_dim, _n_instance = Feature.load_feature_from_file(self.feature_file)
#     features = []
#     infos = []
#     ''' get features from train instances '''
#     for _feature in _features:
#         feature = Feature._feat_string_to_list(_feature, _n_dim)
#         features.append([feature[1]])
#         infos.append(['cosine'])
#
#     features = [Feature._feat_list_to_string(feature) for feature in features]
#
#     return features, 1, _n_instance
def __init__(self, df, columns, model_param):
    super().__init__(df, columns, model_param)
    self.model = Doc2Vec(dm=self.model_param["dm"],
                         hs=self.model_param["hs"],
                         alpha=self.model_param["alpha"],
                         min_alpha=self.model_param["alpha"],
                         min_count=self.model_param["min_count"],
                         size=self.model_param["size"],
                         sample=self.model_param["sample"],
                         window=self.model_param["window"],
                         workers=self.model_param["workers"])
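# The constructor reads everything from model_param; a dictionary like the
# following would satisfy it (values are illustrative, mirroring the config
# keys used in the Word2Vec snippet above):
model_param = {
    "dm": 1,          # PV-DM training mode (0 for PV-DBOW)
    "hs": 1,          # hierarchical softmax
    "alpha": 0.025,   # initial learning rate; min_alpha is pinned to it above
    "min_count": 3,
    "size": 100,      # embedding dimensionality
    "sample": 0.001,  # downsampling threshold for frequent words
    "window": 8,
    "workers": 4,
}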
import multiprocessing

from gensim.models.doc2vec import Doc2Vec
from gensim.models.word2vec import FAST_VERSION

def train(input_jlgz, *, size, limit, min_df, max_features):
    print('FAST_VERSION', FAST_VERSION)  # > -1 means the C extension is active
    documents = Documents(input_jlgz, limit=limit)
    model = Doc2Vec(
        documents=documents,
        size=size,
        min_count=min_df,
        max_vocab_size=max_features,
        workers=multiprocessing.cpu_count(),
        sample=1e-5,
    )
    return model
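# Documents is defined elsewhere; judging from the input_jlgz name, a
# plausible (hypothetical) implementation streams TaggedDocuments from a
# gzipped JSON-lines file with a 'tokens' field per record:
import gzip
import json
from gensim.models.doc2vec import TaggedDocument

class Documents:
    def __init__(self, path, limit=None):
        self.path = path
        self.limit = limit

    def __iter__(self):
        with gzip.open(self.path, 'rt', encoding='utf-8') as f:
            for i, line in enumerate(f):
                if self.limit is not None and i >= self.limit:
                    break
                record = json.loads(line)
                # assumes each JSON record carries a pre-tokenized 'tokens' list
                yield TaggedDocument(record['tokens'], [i])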