def load_corpus(self, corenlpserver, process=True):
    """Load the GENIA corpus from the XML file at ``self.path``.

    Parses each ``<article>`` element, builds a Document whose text is the
    article title followed by its abstract sentences, processes it through
    the CoreNLP server, and stores it in ``self.documents`` keyed by doc ID.

    :param corenlpserver: handle to the CoreNLP server used by
        ``Document.process_document``.
    :param process: kept for interface compatibility; not consulted here
        (documents are always processed via ``process_document``).
    """
    # codecs.open handles the UTF-8 decode; html.parser tolerates the
    # GENIA SGML-ish markup.
    soup = BeautifulSoup(codecs.open(self.path, 'r', "utf-8"), 'html.parser')
    docs = soup.find_all("article")
    widgets = [pb.Percentage(), ' ', pb.Bar(), ' ', pb.ETA(), ' ', pb.Timer()]
    pbar = pb.ProgressBar(widgets=widgets, maxval=len(docs)).start()
    n_lines = 1
    time_per_abs = []
    for doc in docs:
        # FIX: start the timer once per abstract. Previously t was assigned
        # inside the sentence loop, so abs_time only measured from the start
        # of the LAST sentence instead of the whole abstract.
        t = time.time()
        # Doc ID, e.g. "GENIA95369245" from "MEDLINE:95369245".
        did = "GENIA" + doc.articleinfo.bibliomisc.text.split(":")[1]
        title = doc.title.sentence.get_text()
        sentences = doc.abstract.find_all("sentence")
        doc_sentences = []
        doc_text = title + " "
        doc_offset = 0
        for si, s in enumerate(sentences):
            stext = s.get_text()
            sid = did + ".s" + str(si)
            doc_text += stext + " "
            # NOTE(review): the first sentence gets offset 0 even though the
            # title precedes it in doc_text — preserved as-is; confirm that
            # Sentence offsets are meant to exclude the title.
            this_sentence = Sentence(stext, offset=doc_offset, sid=sid, did=did)
            doc_offset = len(doc_text)
            doc_sentences.append(this_sentence)
        # Build the document unprocessed, attach the pre-split sentences,
        # then run CoreNLP processing over it.
        newdoc = Document(doc_text, process=False, did=did)
        newdoc.sentences = doc_sentences[:]
        newdoc.process_document(corenlpserver, "biomedical")
        self.documents[newdoc.did] = newdoc
        abs_time = time.time() - t
        time_per_abs.append(abs_time)
        # Lazy %-args: formatting only happens if DEBUG is enabled.
        logging.debug("%s sentences, %ss processing time",
                      len(newdoc.sentences), abs_time)
        pbar.update(n_lines)
        n_lines += 1
    pbar.finish()
    # FIX: guard against an empty corpus (no articles) which previously
    # raised ZeroDivisionError when computing the average.
    if time_per_abs:
        abs_avg = sum(time_per_abs) * 1.0 / len(time_per_abs)
        logging.info("average time per abstract: %ss", abs_avg)
# (page-scrape residue removed from code path: "评论列表" = "comment list",
#  "文章目录" = "article table of contents" — leftover blog-page navigation text)