def fit_logreg(self):
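    """Fit a logistic regression classifier over word-embedding features.

    Returns a ``(name, estimator)`` pair, where ``name`` records which
    feature blocks of the FeatureUnion below are active.
    """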
    tokenize_sense = CachedFitTransform(Pipeline([
        ('tokenize', Map(compose(tokenize, normalize_special, unescape))),
        ('normalize', MapTokens(normalize_elongations)),
    ]), self.memory)
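    # The commented-out entries below are alternative embedding features (doc2vec,
    # word2vec average/max-pooling variants, word2vec-inverse) kept for experimentation.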
    features = FeatureUnion([
        # ('w2v_doc', ToCorporas(Pipeline([
        #     ('tokenize', MapCorporas(tokenize_sense)),
        #     ('feature', MergeSliceCorporas(Doc2VecTransform(CachedFitTransform(Doc2Vec(
        #         dm=0, dbow_words=1, size=100, window=10, hs=0, negative=5, sample=1e-3, min_count=1, iter=20,
        #         workers=16
        #     ), self.memory)))),
        # ]).fit([self.train_docs, self.unsup_docs[:10**6], self.val_docs, self.test_docs]))),
        # ('w2v_word_avg', Pipeline([
        #     ('tokenize', tokenize_sense),
        #     ('feature', Word2VecAverage(CachedFitTransform(Word2Vec(
        #         sg=1, size=100, window=10, hs=0, negative=5, sample=1e-3, min_count=1, iter=20, workers=16
        #     ), self.memory))),
        # ]).fit(self.unsup_docs[:10**6])),
        # ('w2v_word_avg_google', Pipeline([
        #     ('tokenize', tokenize_sense),
        #     ('feature', Word2VecAverage(joblib.load('data/google/GoogleNews-vectors-negative300.pickle'))),
        # ])),
        # ('w2v_word_norm_avg', Pipeline([
        #     ('tokenize', tokenize_sense),
        #     ('feature', Word2VecNormAverage(CachedFitTransform(Word2Vec(
        #         sg=1, size=100, window=10, hs=0, negative=5, sample=1e-3, min_count=1, iter=20, workers=16
        #     ), self.memory))),
        # ]).fit(self.unsup_docs[:10**6])),
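        # Active feature: mean of L2-normalized pretrained GoogleNews word vectors.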
        ('w2v_word_norm_avg_google', Pipeline([
            ('tokenize', tokenize_sense),
            ('feature', Word2VecNormAverage(joblib.load('data/google/GoogleNews-vectors-negative300.pickle'))),
        ])),
        # ('w2v_word_max', Pipeline([
        #     ('tokenize', tokenize_sense),
        #     ('feature', Word2VecMax(CachedFitTransform(Word2Vec(
        #         sg=1, size=100, window=10, hs=0, negative=5, sample=1e-3, min_count=1, iter=20, workers=16
        #     ), self.memory))),
        # ]).fit(self.unsup_docs[:10**6])),
        # ('w2v_word_max_google', Pipeline([
        #     ('tokenize', tokenize_sense),
        #     ('feature', Word2VecMax(joblib.load('data/google/GoogleNews-vectors-negative300.pickle'))),
        # ])),
        # ('w2v_word_inv', ToCorporas(Pipeline([
        #     ('tokenize', MapCorporas(tokenize_sense)),
        #     ('feature', MergeSliceCorporas(Word2VecInverse(CachedFitTransform(Word2Vec(
        #         sg=1, size=100, window=10, hs=0, negative=5, sample=0, min_count=1, iter=20, workers=16
        #     ), self.memory)))),
        # ]).fit([self.train_docs, self.unsup_docs[:10**5], self.val_docs, self.test_docs]))),
    ])
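    # A minimal sketch of the normalized-averaging idea behind Word2VecNormAverage
    # (hypothetical re-implementation, assuming a gensim-style token -> vector lookup
    # supporting ``in``; the project's actual transformer may differ):
    #
    #     import numpy as np
    #
    #     def norm_average(docs, vecs, dim):
    #         out = np.zeros((len(docs), dim))
    #         for i, tokens in enumerate(docs):
    #             found = [vecs[t] / np.linalg.norm(vecs[t]) for t in tokens if t in vecs]
    #             if found:
    #                 out[i] = np.mean(found, axis=0)  # average of unit vectors
    #         return out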
    classifier = LogisticRegression()
    # The active feature block uses pretrained vectors, so only transform (not
    # fit_transform) is applied; gensim logging is raised to INFO to surface progress.
    with temp_log_level({'gensim.models.word2vec': logging.INFO}):
        classifier.fit(features.transform(self.train_docs), self.train_labels())
    estimator = Pipeline([('features', features), ('classifier', classifier)])
    return 'logreg({})'.format(','.join(name for name, _ in features.transformer_list)), estimator
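# Usage sketch (hypothetical; assumes the enclosing experiment class is constructed
# with the train/val/test/unsup corpora referenced above):
#
#     name, estimator = experiment.fit_logreg()
#     predictions = estimator.predict(test_docs)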