def load_corpus(self, corenlpserver, process=True):
    """Load every document of the corpus from the XML file at ``self.path``.

    The file is parsed as a whole; each ``document`` element found directly
    under the root becomes a ``Document`` whose text is the concatenation of
    its ``sentence`` elements' ``text`` attributes (each preceded by a single
    space).  Every document is then processed through
    ``process_document(corenlpserver, "biomedical")`` and stored in
    ``self.documents`` keyed by its ``did``.

    Args:
        corenlpserver: Passed through unchanged to
            ``Document.process_document``.
        process: Not read by this method; kept so existing callers that
            pass it keep working.

    Returns:
        None.  Results are stored in ``self.documents``; the average
        processing time per document is logged at INFO level.
    """
    # self.path is just one file with every document
    # Only the read needs the file handle, so close it before the slow part.
    with open(self.path, 'r') as xml:
        root = ET.fromstring(xml.read())

    all_docs = root.findall("document")
    if not all_docs:
        # Nothing to process; also avoids dividing by zero for the average.
        logging.info("no documents found in %s", self.path)
        return

    time_per_abs = []
    widgets = [pb.Percentage(), ' ', pb.Bar(), ' ', pb.AdaptiveETA(), ' ', pb.Timer()]
    pbar = pb.ProgressBar(widgets=widgets, maxval=len(all_docs)).start()
    for i, doc in enumerate(all_docs):
        # Restart the clock for each document so the recorded value is the
        # time spent on this document alone, not the time since the start.
        t = time.time()
        doctext = ""
        did = doc.get('id')
        doc_sentences = []  # the sentences of this document
        doc_offset = 0  # offset of the current sentence relative to the document
        for sentence in doc.findall('sentence'):
            sid = sentence.get('id')
            text = sentence.get('text')
            # NOTE(review): the offset recorded for a sentence is the length
            # of doctext *before* the separating space is appended, so it
            # points at that space rather than at the sentence's first
            # character.  Left unchanged because downstream code may rely on
            # it -- confirm.
            doc_sentences.append(Sentence(text, offset=doc_offset, sid=sid, did=did))
            doctext += " " + text  # generate the full text of this document
            doc_offset = len(doctext)
        newdoc = Document(doctext, process=False, did=did)
        newdoc.sentences = doc_sentences[:]
        newdoc.process_document(corenlpserver, "biomedical")
        self.documents[newdoc.did] = newdoc
        time_per_abs.append(time.time() - t)
        pbar.update(i + 1)
    pbar.finish()

    abs_avg = sum(time_per_abs) * 1.0 / len(time_per_abs)
    logging.info("average time per abstract: %ss", abs_avg)
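# (Residue from the web page this snippet was copied from, not part of the
# code: "Comment list" / "Table of contents".)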