# scikit-learn components used to assemble the feature union.
from sklearn.pipeline import FeatureUnion, make_pipeline
from sklearn.preprocessing import MinMaxScaler

# stats_train, make_union_link, FilteredDictVectorizer and add_pmi_features
# are project-local helpers defined elsewhere in this codebase.


def link_vectorizer(train_docs, stats=None, n_most_common=1000,
                    return_transf=False):
    # One pass to compute training corpus statistics.
    train_docs = list(train_docs)
    if stats is None:
        stats = stats_train(train_docs)
    lemma_freqs, prod_freqs, _, pmi_incoming, pmi_outgoing = stats

    # Vectorize BOW-style features: fix the vocabularies to the
    # n_most_common lemmas and grammar productions seen in training.
    lemma_vocab = [w for w, _ in lemma_freqs[:n_most_common]]
    prod_vocab = [p for p, _ in prod_freqs[:n_most_common]]
    vects = dict(lemmas=dict(vocabulary=lemma_vocab, lowercase=True),
                 productions=dict(vocabulary=prod_vocab), pos={}, discourse={},
                 indicators={}, indicator_preceding_in_para={},
                 indicator_following_in_para={})

    # Binary and ratio-valued features passed through as-is.
    raw_keys = ['src__is_first_in_para', 'src__is_last_in_para',
                'trg__is_first_in_para', 'trg__is_last_in_para',
                'same_sentence', 'src_precedes_trg', 'trg_precedes_src',
                'any_shared_nouns', 'src__pmi_pos_ratio', 'src__pmi_neg_ratio',
                'trg__pmi_pos_ratio', 'trg__pmi_neg_ratio', 'src__pmi_pos_any',
                'src__pmi_neg_any', 'trg__pmi_pos_any', 'trg__pmi_neg_any']

    # Count-valued features, rescaled to [0, 1].
    nrm_keys = ['src__n_tokens', 'trg__n_tokens', 'props_between', 'n_props',
                'n_shared_nouns']

    vect_list = list(make_union_link(vects)) + [
        ('raw', FilteredDictVectorizer(raw_keys)),
        ('nrm', make_pipeline(FilteredDictVectorizer(nrm_keys, sparse=False),
                              MinMaxScaler((0, 1))))]
    vect = FeatureUnion(vect_list)

    # Collect one feature dict per candidate link and enrich each with the
    # PMI-based features computed from the training statistics.
    train_feats = [f for doc in train_docs for f in doc.features]
    for f in train_feats:
        add_pmi_features(f, pmi_incoming, pmi_outgoing)

    if return_transf:
        X_tr = vect.fit_transform(train_feats)
        return vect, X_tr
    else:
        return vect.fit(train_feats)
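

# A minimal usage sketch, assuming `train_docs` and `test_docs` are iterables
# of this project's document objects, each exposing per-link feature dicts
# through a `.features` attribute; the variable names are illustrative only.
vect, X_tr = link_vectorizer(train_docs, n_most_common=500, return_transf=True)

# Reuse the fitted FeatureUnion on held-out links; the PMI features would need
# to be added to the test feature dicts in the same way before transforming.
X_te = vect.transform([f for doc in test_docs for f in doc.features])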