preprocess.py 文件源码-python代码片段

preprocess.py 文件源码
python
阅读 20 收藏 0 点赞 0 评论 0
def get_local_words(word_count, threshold, y_train, train_seq, num_words):
    """Return the ``num_words`` words with the highest chi-squared scores.

    The vocabulary is whatever ``delete_low_freq_words(word_count, threshold)``
    returns; it is used here as a mapping from word to column index
    (presumably the words that survive a frequency cut-off -- confirm against
    that helper). A document/word count matrix is built from ``train_seq``,
    each column is scored against ``y_train`` with ``chi2``, and the
    best-scoring words are returned.

    Args:
        word_count: Passed through unchanged to ``delete_low_freq_words``.
        threshold: Passed through unchanged to ``delete_low_freq_words``.
        y_train: Target labels, one per entry of ``train_seq``.
        train_seq: Sequence of documents, each an iterable of words. Words
            missing from the vocabulary are ignored.
        num_words: Maximum number of words to return.

    Returns:
        A list of at most ``num_words`` words ordered by descending
        chi-squared score. Words with equal scores keep the iteration order
        of the vocabulary mapping.
    """
    feature_index = delete_low_freq_words(word_count, threshold)
    num_docs = len(train_seq)
    num_features = len(feature_index)
    print(num_docs, num_features)

    # Record one (row, col) pair per in-vocabulary token instead of filling a
    # dense docs x vocabulary array: the counts are overwhelmingly zero and
    # the dense form can exhaust memory on realistic corpora.
    rows = []
    cols = []
    for seq_idx, seq in enumerate(train_seq):
        for word in seq:
            word_idx = feature_index.get(word)
            if word_idx is not None:
                rows.append(seq_idx)
                cols.append(word_idx)

    # csr_matrix sums duplicate (row, col) entries, so a column of ones
    # yields the per-document occurrence counts.
    word_freq_matrix = csr_matrix(
        (
            np.ones(len(rows), dtype=np.float64),
            (np.asarray(rows, dtype=np.intp), np.asarray(cols, dtype=np.intp)),
        ),
        shape=(num_docs, num_features),
    )

    # Only the per-feature scores are needed, so fit() is enough.
    selector = SelectKBest(chi2, k="all")
    selector.fit(word_freq_matrix, y_train)
    scores = np.asarray(selector.scores_, dtype=np.float64)

    # chi2 can yield NaN for a column that is all zeros (0/0). NaN breaks the
    # ordering used by sorted(), so rank such words last.
    scores = np.where(np.isnan(scores), -np.inf, scores)

    ranked_words = sorted(
        feature_index,
        key=lambda word: scores[feature_index[word]],
        reverse=True,
    )
    return ranked_words[:num_words]