aimed_corpus.py 文件源码

python
阅读 17 收藏 0 点赞 0 评论 0

项目:IBRel 作者: lasigeBioTM 项目源码 文件源码
def load_annotations(self, ann_dir, etype, ptype):
        """Tag the protein mentions of every annotation file as entities.

        Each file is wrapped in an ``<Article>`` root element and parsed as
        XML. The direct ``<prot>`` children of its ``ArticleTitle`` and
        ``AbstractText`` elements are located in the plain text of those
        elements, and each one is tagged as a "protein" entity on the
        sentence of ``self.documents[did]`` that contains it, where ``did``
        is the path of the file. Mentions that cannot be located or tagged
        are reported on stdout and skipped.

        :param ann_dir: directory prefix joined to every file name. Note that
            the file names themselves are listed from ``self.path``.
        :param etype: not used by this method.
        :param ptype: not used by this method.
        """
        trainfiles = [ann_dir + '/' + f for f in os.listdir(self.path)]
        total = len(trainfiles)
        widgets = [pb.Percentage(), ' ', pb.Bar(), ' ', pb.AdaptiveETA(), ' ', pb.Timer()]
        pbar = pb.ProgressBar(widgets=widgets, maxval=total, redirect_stdout=True).start()
        try:
            for current, f in enumerate(trainfiles):
                # Single-argument print(...) behaves the same on Python 2 and 3.
                print('{}:{}/{}'.format(f, current + 1, total))
                did = f
                # Use a distinct name for the handle so the path in `f` is not shadowed.
                with open(f, 'r') as article_file:
                    article = "<Article>" + article_file.read() + "</Article>"
                soup = BeautifulSoup(article, 'xml')
                title = soup.ArticleTitle
                abstract = soup.AbstractText
                title_text = title.get_text()
                abstract_text = abstract.get_text()
                # Abstract offsets are shifted past the title plus one character;
                # presumably the document text joins title and abstract with a
                # single separator -- confirm against the corpus loader.
                abs_offset = len(title_text) + 1
                sections = (
                    (title_text, title.find_all("prot", recursive=False), 0),
                    (abstract_text, abstract.find_all("prot", recursive=False), abs_offset),
                )
                for section_text, entities, base_offset in sections:
                    lastindex = 0
                    for e in entities:
                        estart = section_text.find(e.text, lastindex)
                        if estart == -1:
                            # Not found after the previous mention: skip it rather
                            # than tagging offsets derived from -1.
                            print("did not add this entity: {}".format(e.text))
                            continue
                        eend = estart + len(e.text)
                        # Resume after this mention so a repeated mention maps to
                        # its next occurrence instead of the same one again.
                        lastindex = eend
                        doc_start = estart + base_offset
                        doc_end = eend + base_offset
                        this_sentence = self.documents[did].find_sentence_containing(
                            doc_start, doc_end, chemdner=False)
                        if this_sentence is None:
                            print("did not add this entity: {}".format(e.text))
                            continue
                        # tag_entity is given sentence-relative offsets in both
                        # sections (the title previously passed document offsets).
                        eid = this_sentence.tag_entity(doc_start - this_sentence.offset,
                                                       doc_end - this_sentence.offset,
                                                       "protein", text=e.text)
                        if eid is None:
                            print("did not add this entity: {}".format(e.text))
                pbar.update(current + 1)
        finally:
            # Always close the bar so the stdout redirection set up by
            # redirect_stdout=True is undone, even if a file fails to load.
            pbar.finish()
评论列表
文章目录


问题


面经


文章

微信
公众号

扫码关注公众号