def ngrams_selection(train_data, train_labels, ind, model_file,
                     ngram_range_=(1, 1), max_num_features=100,
                     analyzer_type='word'):
    """Fit a TF-IDF vectorizer (and, if needed, a chi2 selector) and pickle them.

    A ``TfidfVectorizer`` (with ``sublinear_tf=True``) is fitted on
    ``train_data``.  When the fitted vocabulary has more than
    ``max_num_features`` columns, a ``SelectKBest(chi2)`` selector keeping
    ``max_num_features`` features is fitted on the vectorized data and the
    labels.  The fitted objects are pickled as a dict to
    ``<model_file>_ngrams_vect_<ind>.bin``.

    Args:
        train_data: list of train text samples
        train_labels: list of train labels
        ind: index of the vectorizer/selector, used in the output file name;
            converted with ``str()`` so integers are accepted as well as
            strings
        model_file: model filename prefix for the output file
        ngram_range_: range of n-grams passed to ``TfidfVectorizer``
        max_num_features: maximum number of features to select
        analyzer_type: analyzer type for TfidfVectorizer, 'word' or 'char'

    Returns:
        None.  The pickled dict always has the key ``'vectorizer'`` and has
        the key ``'selector'`` only when feature selection was performed.
    """
    vectorizer = TfidfVectorizer(ngram_range=ngram_range_, sublinear_tf=True,
                                 analyzer=analyzer_type)
    X_train = vectorizer.fit_transform(train_data)

    # Build the path once; str() lets callers pass a numeric index.
    out_path = model_file + '_ngrams_vect_' + str(ind) + '.bin'

    if max_num_features < X_train.shape[1]:
        # More features than allowed: keep the best-scoring ones by chi2.
        ch2 = SelectKBest(chi2, k=max_num_features)
        ch2.fit(X_train, train_labels)
        data_struct = {'vectorizer': vectorizer, 'selector': ch2}
        print('creating ', out_path)
    else:
        # Vocabulary already fits the limit: no selector is stored.
        data_struct = {'vectorizer': vectorizer}
        print('creating', out_path)

    with open(out_path, 'wb') as f:
        pickle.dump(data_struct, f)
评论列表
文章目录