def load_annotations(self, ann_dir, etype, ptype):
    """Tag the ``<prot>`` mentions of each annotation file as "protein" entities.

    Every file is read, wrapped in an ``<Article>`` root element and parsed
    as XML.  The ``<prot>`` elements that are direct children of
    ``ArticleTitle`` and ``AbstractText`` are located in the corresponding
    plain text and tagged on the sentence of ``self.documents[<file path>]``
    that contains them.

    :param ann_dir: directory prefix used to build the path of each file;
        the resulting path is also the key used to look up the document.
    :param etype: unused here; kept so the signature stays unchanged.
    :param ptype: unused here; kept so the signature stays unchanged.
    :return: None.  Mentions that could not be tagged are reported on stdout.
    """

    def tag_mentions(did, text, mentions, base_offset):
        """Find each mention in *text*, in order, and tag it on its sentence.

        *base_offset* is added to the positions found in *text* to obtain
        document-level offsets.
        """
        if not mentions:
            return
        document = self.documents[did]
        search_from = 0
        for mention in mentions:
            mention_text = mention.text
            start = text.find(mention_text, search_from)
            if start == -1:
                # Not found after the previous mention: report it instead of
                # tagging a bogus span, and leave the search cursor alone.
                print("did not add this entity: {}".format(mention_text))
                continue
            end = start + len(mention_text)
            # Resume after this match so that a repeated mention text
            # resolves to its next occurrence rather than the same one.
            search_from = end
            doc_start = start + base_offset
            doc_end = end + base_offset
            sentence = document.find_sentence_containing(doc_start, doc_end, chemdner=False)
            if sentence is None:
                # NOTE(review): guard in case no sentence spans the mention --
                # confirm what find_sentence_containing returns in that case.
                print("did not add this entity: {}".format(mention_text))
                continue
            # tag_entity is given sentence-relative offsets, for title and
            # abstract alike.
            eid = sentence.tag_entity(doc_start - sentence.offset, doc_end - sentence.offset,
                                      "protein", text=mention_text)
            if eid is None:
                print("did not add this entity: {}".format(mention_text))

    # NOTE(review): file names are listed from self.path but opened under
    # ann_dir -- confirm both are expected to hold the same file names.
    # The path is built by plain concatenation because it doubles as the
    # key into self.documents.
    trainfiles = [ann_dir + '/' + f for f in os.listdir(self.path)]
    total = len(trainfiles)
    widgets = [pb.Percentage(), ' ', pb.Bar(), ' ', pb.AdaptiveETA(), ' ', pb.Timer()]
    pbar = pb.ProgressBar(widgets=widgets, maxval=total, redirect_stdout=True).start()
    try:
        for current, path in enumerate(trainfiles):
            print('{}:{}/{}'.format(path, current + 1, total))
            did = path
            with open(path, 'r') as handle:
                article = "<Article>" + handle.read() + "</Article>"
            soup = BeautifulSoup(article, 'xml')
            title = soup.ArticleTitle
            abstract = soup.AbstractText
            title_text = title.get_text()
            abstract_text = abstract.get_text()
            # Abstract positions are shifted by the title length plus one;
            # presumably the document text is title, one separator character,
            # then the abstract -- TODO confirm against the corpus loader.
            abs_offset = len(title_text) + 1
            tag_mentions(did, title_text, title.find_all("prot", recursive=False), 0)
            tag_mentions(did, abstract_text, abstract.find_all("prot", recursive=False), abs_offset)
            pbar.update(current + 1)
    finally:
        # The bar was started with redirect_stdout=True; finishing it ends
        # the bar so stdout is not left redirected after this method.
        pbar.finish()
评论列表
文章目录