def get_feature_transformer(parser, run_grammar=True, run_tfidf=True):
    '''
    Build a FeatureUnion that turns a text series into TFIDF counts and/or
    frequencies of syntactical structures.

    Suitable for use as a step in a SKLearn Pipeline.

    inputs:
        parser: a Spacy pipeline object, handed to PreTokenizer and
            GrammarTransformer.
        run_grammar: include the grammar-counting sub-pipeline.
        run_tfidf: include the TFIDF sub-pipeline.
    returns:
        feature transformer: FeatureUnion holding the requested
            sub-pipelines under the names "tfidf" and "grammar_counter".
    raises:
        ValueError: if both run_grammar and run_tfidf are false, since there
            would be no feature set left to compute.
    '''
    # Previously this combination silently fell into the TFIDF-only branch,
    # ignoring run_tfidf=False. Fail loudly instead.
    if not (run_grammar or run_tfidf):
        raise ValueError(
            'At least one of run_grammar and run_tfidf must be enabled.')

    transformers = []

    if run_tfidf:
        tfidf = Pipeline([
            ('cln', CleanTextTransformer()),
            ('pre', PreTokenizer(parser=parser)),
            ('vect', TfidfVectorizer(
                max_features=3000, decode_error='replace')),
            # Placeholder final step kept from the original; presumably
            # treated as a no-op by Pipeline -- confirm against the installed
            # scikit-learn version.
            ('clf', None),
        ])
        transformers.append(("tfidf", tfidf))

    if run_grammar:
        grammar_counter = Pipeline([
            ('cln', CleanTextTransformer()),
            ('grm', GrammarTransformer(parser=parser)),
            ('to_dict', DictVectorizer()),
            # Same placeholder final step as in the TFIDF pipeline.
            ('clf', None),
        ])
        transformers.append(('grammar_counter', grammar_counter))

    if run_grammar and run_tfidf:
        print('Running both feature sets.')
    elif run_tfidf:
        print('Running only TFIDF.')
    else:
        print('Running only PCFGs.')

    return FeatureUnion(transformers)
transform_features.py 文件源码
python
阅读 21
收藏 0
点赞 0
评论 0
评论列表
文章目录