utils.py 文件源码-python代码片段

def ngrams_selection(train_data, train_labels, ind, model_file,
                     ngram_range_=(1, 1), max_num_features=100,
                     analyzer_type='word'):
    """Create and save vectorizers and feature selectors on given train data.

    Args:
        train_data: list of train text samples
        train_labels: list of train labels
        ind: index of vectorizer/selector to save file
        model_file: model filename
        ngram_range_: range of n-grams
        max_num_features: maximum number of features to select
        analyzer_type: analyzer type for TfidfVectorizer 'word' or 'char'

    Returns:
        nothing
    """
    vectorizer = TfidfVectorizer(ngram_range=ngram_range_, sublinear_tf=True, analyzer=analyzer_type)

    X_train = vectorizer.fit_transform(train_data)

    if max_num_features < X_train.shape[1]:
        ch2 = SelectKBest(chi2, k=max_num_features)
        ch2.fit(X_train, train_labels)
        data_struct = {'vectorizer': vectorizer, 'selector': ch2}
        print ('creating ', model_file + '_ngrams_vect_' + ind + '.bin')
        with open(model_file + '_ngrams_vect_' + ind + '.bin', 'wb') as f:
            pickle.dump(data_struct, f)
    else:
        data_struct = {'vectorizer': vectorizer}
        print ('creating', model_file + '_ngrams_vect_' + ind + '.bin')
        with open(model_file + '_ngrams_vect_' + ind + '.bin', 'wb') as f:
            pickle.dump(data_struct, f)
    return