# Assumes the scikit-learn imports below, plus project-local helpers
# (stats_train, load_embeds, make_union_prop, FilteredDictVectorizer,
# EmbeddingVectorizer, PrecedingStats) defined or imported elsewhere in this module.
from sklearn.pipeline import FeatureUnion, make_pipeline
from sklearn.preprocessing import MinMaxScaler


def prop_vectorizer(train_docs, which, stats=None, n_most_common_tok=1000,
                    n_most_common_dep=1000, return_transf=False):
    train_docs = list(train_docs)

    # One pass to compute training corpus statistics.
    if stats is None:
        stats = stats_train(train_docs)
    lemma_freqs, _, dep_freqs, _, _ = stats

    # Vectorize BOW-style features: restrict the lemma and dependency-tuple
    # vocabularies to the most frequent entries in the training corpus.
    lemma_vocab = [w for w, _ in lemma_freqs[:n_most_common_tok]]
    dep_vocab = [p for p, _ in dep_freqs[:n_most_common_dep]]
    vects = dict(lemmas=dict(vocabulary=lemma_vocab, lowercase=True),
                 dependency_tuples=dict(vocabulary=dep_vocab), pos={},
                 discourse={}, indicators={}, indicator_preceding_in_para={},
                 indicator_following_in_para={})

    # Scalar feature keys used without rescaling ...
    raw_keys = ['is_first_in_para', 'is_last_in_para', 'toks_to_sent_ratio',
                'relative_in_para', 'first_person_any', 'root_vb_modal',
                'root_vb_tense']
    # ... and count-valued keys that are min-max scaled to [0, 1].
    nrm_keys = ['n_tokens', 'n_toks_in_sent', 'n_toks_in_para',
                'n_toks_preceding_in_sent', 'n_toks_following_in_sent',
                'preceding_props_in_para', 'following_props_in_para',
                'parse_tree_height', 'n_subordinate_clauses']

    # The UKP essays have introduction/conclusion sections, so they get
    # extra features relating each proposition to those sections.
    if which == 'ukp':
        raw_keys += ['is_in_intro', 'is_in_conclusion',
                     'has_shared_np_intro', 'has_shared_vp_intro',
                     'has_shared_np_conclusion', 'has_shared_vp_conclusion']
        nrm_keys += ['n_shared_np_intro', 'n_shared_vp_intro',
                     'n_shared_np_conclusion', 'n_shared_vp_conclusion']

    # Load pretrained word embeddings for the chosen dataset.
    embed_vocab, embeds = load_embeds(which)

    vect_list = list(make_union_prop(vects)) + [
        ('raw', FilteredDictVectorizer(raw_keys)),
        ('nrm', make_pipeline(FilteredDictVectorizer(nrm_keys, sparse=False),
                              MinMaxScaler((0, 1)))),
        ('embeds', EmbeddingVectorizer(embeds, embed_vocab))]
    if which == 'ukp':
        vect_list.append(('proba', PrecedingStats()))
    vect = FeatureUnion(vect_list)

    # Fit on every proposition feature dict across the training documents.
    train_feats = [f for doc in train_docs for f in doc.prop_features]
    if return_transf:
        X_tr = vect.fit_transform(train_feats)
        return vect, X_tr
    else:
        return vect.fit(train_feats)
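
As a rough usage sketch (assuming `docs` and `test_docs` are iterables of parsed documents exposing a `prop_features` list of per-proposition feature dicts; these names are illustrative and not defined in the snippet above), the vectorizer can be fit on the training set and reused to transform held-out propositions:

# Hypothetical usage: fit on training documents, get the training matrix back,
# then apply the same fitted FeatureUnion to test propositions.
vect, X_tr = prop_vectorizer(docs, which='ukp', return_transf=True)
test_feats = [f for doc in test_docs for f in doc.prop_features]
X_te = vect.transform(test_feats)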