def get_pairwise_overlap_features(sent_list_1, sent_list_2, word_to_doc_cnt):
    """Compute four token-overlap features for each aligned sentence pair.

    Sentences are paired positionally (``zip``), so any surplus sentences in
    the longer list are ignored. For each pair the features are, in order:

    1. overlap: ``|A & B| / (|A| + |B|)`` over the unique tokens of each
       sentence,
    2. IDF-weighted overlap: the sum of ``log(num_docs / word_to_doc_cnt[w])``
       over the shared tokens, divided by the same denominator,
    3. overlap computed after removing English stopwords,
    4. IDF-weighted overlap computed after removing English stopwords.

    ``num_docs`` is taken to be ``len(sent_list_1)``.

    Args:
        sent_list_1: Sequence of tokenized sentences (iterables of hashable
            tokens).
        sent_list_2: Sequence of tokenized sentences, aligned with
            ``sent_list_1``.
        word_to_doc_cnt: Mapping from token to its document count. Every
            token shared by a sentence pair must be present with a non-zero
            count.

    Returns:
        A list with one ``[overlap, idf_weighted_overlap, overlap_no_stop,
        idf_weighted_overlap_no_stop]`` entry per sentence pair. A feature
        group whose two token sets are both empty is reported as ``0.0``.

    Raises:
        KeyError: If a shared token is missing from ``word_to_doc_cnt``.
        ZeroDivisionError: If a shared token has a document count of 0.
    """
    # Function-scope import: ``np.math`` was only an accidental alias of the
    # stdlib ``math`` module and no longer exists in current NumPy releases.
    import math

    stoplist = set(stopwords.words('english'))
    num_docs = len(sent_list_1)

    def _overlap_and_idf(tokens_a, tokens_b):
        """Return (overlap, idf_weighted_overlap) for two token sets."""
        total = len(tokens_a) + len(tokens_b)
        if not total:
            # Both sets are empty (e.g. a pair consisting only of stopwords):
            # nothing can be shared, so score 0 instead of dividing by zero.
            return 0.0, 0.0
        shared = tokens_a & tokens_b
        idf_sum = sum(
            math.log(num_docs / word_to_doc_cnt[w]) for w in shared
        )
        return len(shared) / total, idf_sum / total

    overlap_feats = []
    for s1, s2 in zip(sent_list_1, sent_list_2):
        tokens_a, tokens_b = set(s1), set(s2)
        overlap, idf_weighted_overlap = _overlap_and_idf(tokens_a, tokens_b)
        # Removing stopwords from the unique-token sets is equivalent to
        # filtering the raw token sequences before building the sets.
        overlap_no_stop, idf_weighted_overlap_no_stop = _overlap_and_idf(
            tokens_a - stoplist, tokens_b - stoplist
        )
        overlap_feats.append([
            overlap,
            idf_weighted_overlap,
            overlap_no_stop,
            idf_weighted_overlap_no_stop,
        ])
    return overlap_feats
评论列表
文章目录