def rm_doubled_senses(entry):
    """Remove senses of ``entry`` that exactly duplicate an earlier sense.

    Two senses are considered identical when the stripped texts of their
    ``<quote>`` elements (quotes without text are skipped) form the same
    sequence.  The first sense of each group of identical senses is kept,
    every later one is removed from ``entry``.

    NOTE(review): senses that yield no quote text at all compare equal to
    each other (both sequences are empty), so all but the first of them are
    removed as well -- confirm that this is intended.

    Returns True if at least one sense has been removed from ``entry``,
    False otherwise.
    """
    senses = list(findall(entry, 'sense'))
    if len(senses) < 2:
        # nothing to compare against, so nothing can have been altered
        return False
    # obtain a mapping from XML node -> tuple of words within `<quote>…</quote>`
    quotes_by_sense = {
        sense: tuple(q.text.strip() for q in tei_iter(sense, 'quote')
                     if q.text)
        for sense in senses
    }
    seen = set()  # quote tuples of the senses which are kept
    changed = False
    for sense, quotes in quotes_by_sense.items():
        if quotes not in seen:
            # first sense with this content: keep it as the reference
            seen.add(quotes)
            continue
        try:
            entry.remove(sense)
        except ValueError:
            # presumably the sense is not a direct child of `entry`
            # (e.g. a nested sense); leave it untouched
            continue
        changed = True
    return changed
评论列表
文章目录