def get_local_words(word_count, threshold, y_train, train_seq, num_words):
    """Return the ``num_words`` features with the highest chi-squared score.

    A per-sequence word-frequency matrix is built over the features kept by
    ``delete_low_freq_words(word_count, threshold)``, each feature is scored
    against ``y_train`` with ``chi2`` (via ``SelectKBest``), and the features
    are returned in descending score order.

    Args:
        word_count: Passed straight to ``delete_low_freq_words``.
        threshold: Passed straight to ``delete_low_freq_words``.
        y_train: Targets, one per entry of ``train_seq``, handed to
            ``SelectKBest.fit``.
        train_seq: Iterable of sequences of words; must support ``len()``.
            Words absent from the feature index are ignored.
        num_words: Upper bound on the number of features returned (used as
            a slice end, so ``None`` returns every feature).

    Returns:
        A list of feature keys ordered from highest to lowest score; ties
        keep the iteration order of the feature index.
    """
    # This code uses the result as a mapping of word -> column index.
    # Assumes the indices cover range(len(feature_index)) — TODO confirm
    # against delete_low_freq_words.
    feature_index = delete_low_freq_words(word_count, threshold)
    num_seqs = len(train_seq)
    num_features = len(feature_index)
    print(len(train_seq), len(feature_index))

    # Record one (row, col) pair per in-vocabulary occurrence instead of
    # filling a dense num_seqs x num_features array; the sparse constructor
    # sums duplicate pairs, which yields the per-sequence counts.
    rows = []
    cols = []
    for seq_idx, seq in enumerate(train_seq):
        for word in seq:
            word_idx = feature_index.get(word)
            if word_idx is not None:
                rows.append(seq_idx)
                cols.append(word_idx)

    word_freq_matrix = csr_matrix(
        (
            np.ones(len(rows)),
            (np.array(rows, dtype=np.int64), np.array(cols, dtype=np.int64)),
        ),
        shape=(num_seqs, num_features),
    )

    # Only the scores are needed, so fit() suffices (no transform output).
    sk = SelectKBest(chi2, k="all")
    sk.fit(word_freq_matrix, y_train)
    score_list = sk.scores_

    # NOTE(review): a feature that never occurs in train_seq may get a NaN
    # score, which would make this ordering unreliable — confirm that the
    # feature index is always derived from the same sequences.
    ranked = sorted(
        feature_index.items(),
        key=lambda item: score_list[item[1]],
        reverse=True,
    )
    return [word for word, _ in ranked[:num_words]]
评论列表
文章目录