def reverseLinking(sent, text_candidate):
    """Label the tokens of ``sent`` that correspond to one of the candidates.

    ``sent`` is split on whitespace and every token starts with the label
    ``"O"``. Candidates are tried in descending length for an exact,
    whitespace-delimited occurrence in ``sent``; for the first one found,
    ``get_indices`` supplies the token positions to label ``"I"``. If no
    candidate occurs verbatim (or ``get_indices`` returns ``None``), the
    candidate picked by ``process.extractOne`` with ``fuzz.partial_ratio``
    is compared against every entry of ``get_ngram(tokens)`` using
    ``fuzz.ratio`` and the best-scoring entry's span is labelled instead.

    Args:
        sent: Sentence string to label.
        text_candidate: Sized collection of candidate strings; may be
            ``None`` or empty.

    Returns:
        A tuple ``(entity_text, label, exact_match)``:

        * ``entity_text`` -- the ``"I"``-labelled tokens joined by spaces,
          or ``'<UNK>'`` on a fallback path.
        * ``label`` -- the per-token labels joined by spaces.
          NOTE: on the ``'<UNK>'`` fallback paths the labels are returned
          as a list of ``"O"`` strings rather than a joined string; this
          is kept as-is for backward compatibility with existing callers.
        * ``exact_match`` -- ``True`` only when a candidate was found
          verbatim in ``sent``.
    """
    tokens = sent.split()
    label = ["O"] * len(tokens)
    exact_match = False

    if text_candidate is None or len(text_candidate) == 0:
        return '<UNK>', label, exact_match

    text_attention_indices = None
    # Try candidates in descending length; stop at the first verbatim hit.
    for text in sorted(text_candidate, key=len, reverse=True):
        # The candidate must be delimited by whitespace or the string ends.
        pattern = r'(^|\s)(%s)($|\s)' % (re.escape(text))
        if re.search(pattern, sent):
            text_attention_indices = get_indices(tokens, text.split())
            break

    if text_attention_indices is not None:
        exact_match = True
        for i in text_attention_indices:
            label[i] = 'I'
    else:
        try:
            v, _score = process.extractOne(sent, text_candidate, scorer=fuzz.partial_ratio)
        except Exception:
            # Best-effort: report the failure and fall back to '<UNK>'.
            # Catching Exception (not a bare except) lets KeyboardInterrupt
            # and SystemExit propagate.
            print("Extraction Error with FuzzyWuzzy : {} || {}".format(sent, text_candidate))
            return '<UNK>', label, exact_match
        v = v.split()
        # NOTE(review): v is a token list here while x[0] comes from
        # get_ngram -- confirm fuzz.ratio compares them as intended.
        # max() returns the first best-scoring entry, the same element a
        # descending stable sort would place at index 0.
        top = max(get_ngram(tokens), key=lambda x: fuzz.ratio(x[0], v), default=None)
        if top is None:
            # No n-gram to choose from: nothing can be labelled.
            return '<UNK>', label, exact_match
        # top[1] and top[2] are used as a half-open token range.
        for i in range(top[1], top[2]):
            label[i] = 'I'

    entity_text = " ".join(tok for tag, tok in zip(label, tokens) if tag == 'I')
    label = " ".join(label)
    return entity_text, label, exact_match
# Comment list
# Article table of contents