def score_paragraphs(self, question, paragraphs: List[ExtractedParagraphWithAnswers]):
    """Compute one score per paragraph for the given question.

    The score is a weighted sum of five features:

    * cosine distance between the question and the paragraph, both
      vectorized with ``self._tfidf`` (which is re-fit on ``paragraphs``
      on every call);
    * ``log(start / 400 + 1)`` of the paragraph's ``start`` attribute;
    * whether the paragraph's ``start`` is exactly 0;
    * the number of distinct non-stop question words found verbatim;
    * the number of distinct non-stop question words found only after
      lower-casing.

    The weights are read from ``self.TFIDF_W``, ``self.LOG_WORD_START_W``,
    ``self.FIRST_W``, ``self.WORD_W`` and ``self.LOWER_WORD_W``.

    Args:
        question: Iterable of question tokens (joined with spaces for
            vectorization, so presumably strings).
        paragraphs: Paragraphs to score. Each one exposes ``text`` (iterated
            as sentences, each a sequence of word strings) and ``start``.

    Returns:
        A 1-D numpy array with one score per paragraph, or an empty list
        when the vectorizer raises ``ValueError`` while fitting or
        transforming.
    """
    vectorizer = self._tfidf

    # Flatten every paragraph (sentences -> words) into one space-separated string.
    flattened = [
        " ".join(" ".join(sentence) for sentence in paragraph.text)
        for paragraph in paragraphs
    ]
    try:
        paragraph_vectors = vectorizer.fit_transform(flattened)
        question_vector = vectorizer.transform([" ".join(question)])
    except ValueError:
        # The vectorizer could not fit/transform this input; nothing to score.
        return []

    # Question words that survive stop-word filtering, cased and lower-cased.
    content_words = {token for token in question if token.lower() not in self._stop}
    content_words_lower = {token.lower() for token in content_words}

    # Column 0: exact-case matches, column 1: matches only after lower-casing.
    match_counts = np.zeros((len(paragraphs), 2))
    for row, paragraph in enumerate(paragraphs):
        exact_hits = set()
        lowered_hits = set()
        for sentence in paragraph.text:
            for token in sentence:
                if token in content_words:
                    exact_hits.add(token)
                    continue
                # Only fall back to the case-insensitive check when the exact one missed.
                lowered = token.lower()
                if lowered in content_words_lower:
                    lowered_hits.add(lowered)
        match_counts[row, 0] = len(exact_hits)
        match_counts[row, 1] = len(lowered_hits)

    cosine_dist = pairwise_distances(question_vector, paragraph_vectors, "cosine").ravel()

    offsets = np.array([paragraph.start for paragraph in paragraphs])
    log_offset = np.log(offsets / 400.0 + 1)
    is_first = offsets == 0

    return (
        cosine_dist * self.TFIDF_W
        + self.LOG_WORD_START_W * log_offset
        + self.FIRST_W * is_first
        + self.LOWER_WORD_W * match_counts[:, 1]
        + self.WORD_W * match_counts[:, 0]
    )
# Comment list
# Article table of contents