def _gen_jaccard_sims(self, bodies_dict, stances):
    """Score each stance's headline against the sentences of its body.

    Every body is split into sentences, stripped of punctuation and
    word-tokenized once up front. Each headline is then compared with every
    non-empty sentence of its own body via ``jaccard_similarity_score``;
    the mean and the maximum of those scores are each passed through
    ``self._threshold_parser`` with the threshold list ``[0.2]``.

    A stance whose headline has no tokens, or whose body has no non-empty
    sentence, is scored as 0.0 before thresholding instead of raising.

    Args:
        bodies_dict: Mapping of body id to raw body text.
        stances: Iterable of mappings, each holding a ``'Headline'`` string
            and a ``'Body ID'`` that is a key of ``bodies_dict``.

    Returns:
        A pair ``(avg_sims, max_sims)`` of lists with one entry per stance,
        in the same order as ``stances``.

    Raises:
        KeyError: If a stance refers to a body id that is not present in
            ``bodies_dict``.
    """
    # Parse every body once so stances that share a body reuse the result.
    parsed_bodies_dict = {}
    for body_id, body in bodies_dict.items():
        sents = nltk.sent_tokenize(body)
        sents = self._remove_punctuation(sents)
        parsed_bodies_dict[body_id] = self._word_tokenize(sents)

    avg_sims = []
    max_sims = []
    for st in stances:
        headline = st['Headline'].translate(self.REMOVE_PUNC_MAP)
        headline = nltk.word_tokenize(headline)

        # Use the sentences of THIS stance's body. Previously the loop
        # reused whatever `sents` was left over from the parsing loop, so
        # every headline was compared against the same (arbitrary) body.
        # NOTE(review): the 'Body ID' key is assumed from the stance schema
        # that 'Headline' belongs to -- confirm the key name and that its
        # type matches the keys of bodies_dict.
        sents = parsed_bodies_dict[st['Body ID']]

        jacc_sims = []
        if headline:  # an empty headline has no last token to pad with
            for sent in sents:
                if not sent:
                    continue
                # Extend the shorter word list by repeating its last token
                # so that both sequences have the same length before they
                # are handed to jaccard_similarity_score.
                len_diff = len(headline) - len(sent)
                headline_cpy = headline
                sent_cpy = sent
                if len_diff < 0:  # sentence longer than headline
                    headline_cpy = headline + [headline[-1]] * (-len_diff)
                elif len_diff > 0:  # headline longer than sentence
                    sent_cpy = sent + [sent[-1]] * len_diff
                jacc_sims.append(
                    jaccard_similarity_score(headline_cpy, sent_cpy))

        if jacc_sims:
            avg_raw = sum(jacc_sims) / len(jacc_sims)
            max_raw = max(jacc_sims)
        else:
            # Nothing comparable: avoid ZeroDivisionError / max([]) errors.
            avg_raw = max_raw = 0.0

        avg_sims.append(self._threshold_parser(avg_raw, [0.2]))
        max_sims.append(self._threshold_parser(max_raw, [0.2]))
    return avg_sims, max_sims
stance_detection.py 文件源码
python
阅读 21
收藏 0
点赞 0
评论 0
评论列表
文章目录