from types import SimpleNamespace

import joblib
import numpy as np
from gensim.models import Word2Vec
from sklearn.pipeline import Pipeline

# MapCorporas, MergeSliceCorporas, CachedFitTransform, Embeddings and get_rng
# are project-local helpers assumed importable from the surrounding package.
# Note: this code targets the pre-1.0 gensim API (size=/iter=/.syn0/.vocab).


def _fit_embedding_word(self, embedding_type, construct_docs, tokenize_, d=None):
    if embedding_type == 'google':
        # Load the pre-trained GoogleNews vectors and re-wrap them as a plain
        # (matrix, word -> row index) pair.
        embeddings_ = joblib.load('data/google/GoogleNews-vectors-negative300.pickle')
        embeddings_ = SimpleNamespace(X=embeddings_.syn0, vocab={w: v.index for w, v in embeddings_.vocab.items()})
    elif embedding_type == 'twitter':
        # Train skip-gram word2vec on the tokenized train/unsupervised/val/test
        # corpora; CachedFitTransform memoizes the fit via self.memory.
        estimator = Pipeline([
            ('tokenize', MapCorporas(tokenize_)),
            ('word2vec', MergeSliceCorporas(CachedFitTransform(Word2Vec(
                sg=1, size=d, window=10, hs=0, negative=5, sample=1e-3, min_count=1, iter=20, workers=16
            ), self.memory))),
        ]).fit([self.train_docs, self.unsup_docs[:10**6], self.val_docs, self.test_docs])
        embeddings_ = estimator.named_steps['word2vec'].estimator
        embeddings_ = SimpleNamespace(X=embeddings_.syn0, vocab={w: v.index for w, v in embeddings_.vocab.items()})
    else:
        # No pre-trained vectors: start from an empty vocabulary so every word
        # gets randomly initialised below.
        embeddings_ = SimpleNamespace(X=np.empty((0, d)), vocab={})
    estimator = Pipeline([
        ('tokenize', MapCorporas(tokenize_)),
        # 0.25 is chosen so the unknown vectors have approximately the same
        # variance as the google pre-trained ones
        ('embeddings', MapCorporas(Embeddings(
            embeddings_, rand=lambda shape: get_rng().uniform(-0.25, 0.25, shape).astype('float32'),
            include_zero=True
        ))),
    ])
    estimator.fit(construct_docs)
    return estimator.named_steps['embeddings'].estimator
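
# Sanity check for the 0.25 bound above (a minimal sketch, not part of the
# original code): a U(-0.25, 0.25) component has variance (b - a)^2 / 12
# = 0.5^2 / 12 ≈ 0.0208, i.e. a standard deviation of ≈ 0.144, which per the
# comment above is roughly the per-component spread of the GoogleNews vectors,
# so randomly initialised unknown words live on the same scale as pre-trained ones.
a, b = -0.25, 0.25
print((b - a) ** 2 / 12)     # variance ≈ 0.0208
print((b - a) / 12 ** 0.5)   # std ≈ 0.1443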
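
# A hedged usage sketch (hypothetical call site; `self` stands for the
# surrounding model object, and `d` only matters when no pre-trained vectors
# are loaded, since 'google' fixes d=300 and the else-branch needs it to size
# the random matrix):
#
#     embeddings = self._fit_embedding_word(
#         'twitter', construct_docs=self.train_docs, tokenize_=tokenize, d=100
#     )
#
# The return value is the fitted Embeddings estimator, holding a vector for
# every token seen in construct_docs (presumably plus a reserved zero row,
# given include_zero=True), with unseen words drawn from U(-0.25, 0.25).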