def load_corpus(self, corenlpserver, process=True):
    """Load the GENIA corpus from the XML file at ``self.path``.

    Parses each ``<article>`` element, builds a Document whose text is the
    article title followed by its abstract sentences, processes it through
    the CoreNLP server, and stores it in ``self.documents`` keyed by doc ID.

    :param corenlpserver: handle to the CoreNLP server used by
        ``Document.process_document``.
    :param process: kept for interface compatibility; not consulted here
        (documents are always processed via ``process_document``).
    """
    # codecs.open handles the UTF-8 decode; html.parser tolerates the
    # GENIA SGML-ish markup.
    soup = BeautifulSoup(codecs.open(self.path, 'r', "utf-8"), 'html.parser')
    docs = soup.find_all("article")
    widgets = [pb.Percentage(), ' ', pb.Bar(), ' ', pb.ETA(), ' ', pb.Timer()]
    pbar = pb.ProgressBar(widgets=widgets, maxval=len(docs)).start()
    n_lines = 1
    time_per_abs = []
    for doc in docs:
        # FIX: start the timer once per abstract. Previously t was assigned
        # inside the sentence loop, so abs_time only measured from the start
        # of the LAST sentence instead of the whole abstract.
        t = time.time()
        # Doc ID, e.g. "GENIA95369245" from "MEDLINE:95369245".
        did = "GENIA" + doc.articleinfo.bibliomisc.text.split(":")[1]
        title = doc.title.sentence.get_text()
        sentences = doc.abstract.find_all("sentence")
        doc_sentences = []
        doc_text = title + " "
        doc_offset = 0
        for si, s in enumerate(sentences):
            stext = s.get_text()
            sid = did + ".s" + str(si)
            doc_text += stext + " "
            # NOTE(review): the first sentence gets offset 0 even though the
            # title precedes it in doc_text — preserved as-is; confirm that
            # Sentence offsets are meant to exclude the title.
            this_sentence = Sentence(stext, offset=doc_offset, sid=sid, did=did)
            doc_offset = len(doc_text)
            doc_sentences.append(this_sentence)
        # Build the document unprocessed, attach the pre-split sentences,
        # then run CoreNLP processing over it.
        newdoc = Document(doc_text, process=False, did=did)
        newdoc.sentences = doc_sentences[:]
        newdoc.process_document(corenlpserver, "biomedical")
        self.documents[newdoc.did] = newdoc
        abs_time = time.time() - t
        time_per_abs.append(abs_time)
        # Lazy %-args: formatting only happens if DEBUG is enabled.
        logging.debug("%s sentences, %ss processing time",
                      len(newdoc.sentences), abs_time)
        pbar.update(n_lines)
        n_lines += 1
    pbar.finish()
    # FIX: guard against an empty corpus (no articles) which previously
    # raised ZeroDivisionError when computing the average.
    if time_per_abs:
        abs_avg = sum(time_per_abs) * 1.0 / len(time_per_abs)
        logging.info("average time per abstract: %ss", abs_avg)
# (page-scrape residue removed from code path: "评论列表" = "comment list",
#  "文章目录" = "article table of contents" — leftover blog-page navigation text)