def load_corpus(self, corenlpserver, process=True):
    """Load one document per line of ``self.path`` into ``self.documents``.

    Each line is stripped and split on single spaces: the first token is
    used as the document id (``did``) and the remaining tokens, re-joined
    with single spaces, form the document text. The whole text is attached
    to the document as one sentence with id ``<did>.s0`` and offset 0.
    Every document is stored in ``self.documents`` under ``newdoc.did``.

    :param corenlpserver: handed unchanged to ``Document.process_document``.
    :param process: when true, ``process_document(corenlpserver, "biomedical")``
        is called on each document before it is stored.
    """
    # Count lines with the same UTF-8 reader that is used for loading below,
    # so the progress bar maximum matches the number of iterations, and close
    # the handle as soon as counting is done.
    with codecs.open(self.path, 'r', "utf-8") as countfile:
        total_lines = sum(1 for _ in countfile)

    widgets = [pb.Percentage(), ' ', pb.Bar(), ' ', pb.AdaptiveETA(), ' ', pb.Timer()]
    pbar = pb.ProgressBar(widgets=widgets, maxval=total_lines, redirect_stdout=True).start()
    try:
        with codecs.open(self.path, 'r', "utf-8") as trainfile:
            for current, line in enumerate(trainfile, 1):
                # NOTE(review): a blank line yields a document whose id and
                # text are both empty strings - confirm whether such lines
                # should be skipped instead.
                fields = line.strip().split(" ")
                did = fields[0]
                doctext = " ".join(fields[1:])
                newdoc = Document(doctext, process=False, did=did)
                # The full text is registered as a single sentence rather
                # than being sentence-tokenized.
                sid = did + ".s0"
                newdoc.sentences.append(Sentence(doctext, offset=0, sid=sid, did=did))
                if process:
                    newdoc.process_document(corenlpserver, "biomedical")
                self.documents[newdoc.did] = newdoc
                pbar.update(current)
    finally:
        # Always close the bar, even when loading fails part-way through.
        pbar.finish()
评论列表
文章目录