# Assumes module-level imports in the surrounding file: codecs, logging,
# progressbar as pb, and the Document and Sentence classes used below.
def load_corpus(self, corenlpserver, process=True):
    widgets = [pb.Percentage(), ' ', pb.Bar(), ' ', pb.ETA(), ' ', pb.Timer()]
    # determine the index of the last line, used for the progress bar size
    # and for the end-of-file check inside the main loop
    nlines = 0
    with open(self.path) as f:
        for nlines, l in enumerate(f):
            pass
    logging.info("%s lines in %s", nlines, self.path)
    pbar = pb.ProgressBar(widgets=widgets, maxval=nlines).start()
    with codecs.open(self.path, 'r', "utf-8") as corpusfile:
        doc_text = ""
        sentences = []
        for i, l in enumerate(corpusfile):
            if l.startswith("###"):  # header line: a new document starts here
                if doc_text != "":
                    # close the previous document before starting the new one
                    logging.debug("creating document: {}".format(doc_text))
                    newdoc = Document(doc_text, process=False, did=did)
                    newdoc.sentences = sentences[:]
                    newdoc.process_document(corenlpserver, "biomedical")
                    # logging.info(len(newdoc.sentences))
                    self.documents[newdoc.did] = newdoc
                    doc_text = ""
                # the document ID is the last colon-separated field of the header
                did = "JNLPBA" + l.strip().split(":")[-1]
                logging.debug("starting new document:" + did)
                sentence_text = ""
                doc_offset = 0
                sentences = []
            elif l.strip() == "" and sentence_text != "":  # blank line: the current sentence ends
                # logging.debug("creating new sentence: {}".format(sentence_text))
                sid = did + ".s" + str(len(sentences))
                this_sentence = Sentence(sentence_text, offset=doc_offset, sid=sid, did=did)
                doc_offset += len(sentence_text) + 1
                doc_text += sentence_text + " "
                sentences.append(this_sentence)
                if i == nlines:
                    # last line of the file: close the final document
                    # (assumes the corpus file ends with a blank line)
                    logging.debug("creating document: {}".format(doc_text))
                    newdoc = Document(doc_text, process=False, did=did)
                    newdoc.sentences = sentences[:]
                    newdoc.process_document(corenlpserver, "biomedical")
                    # logging.info(len(newdoc.sentences))
                    self.documents[newdoc.did] = newdoc
                    doc_text = ""
                # start a new sentence
                sentence_text = ""
            else:
                # token line: the first tab-separated column is the token text
                # logging.debug(str(i) + "/" + str(l))
                t = l.strip().split("\t")
                if sentence_text != "":
                    sentence_text += " "
                # if t[1] == "B-protein"
                sentence_text += t[0]
            pbar.update(i)
    pbar.finish()
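
For reference, the loader above expects the JNLPBA IOB layout: a header line starting with "###" (whose last colon-separated field becomes the document ID, prefixed with "JNLPBA") opens each abstract, each token sits on its own line with its tag in a tab-separated second column, and a blank line closes a sentence. The sketch below only illustrates how the method might be called; the corpus class name JNLPBACorpus, the file path, and the corenlp_client variable are placeholders, not part of the original code.

# Illustrative input fragment (columns are tab-separated; a blank line ends a sentence):
#   ###MEDLINE:95338244
#   IL-2        B-DNA
#   gene        I-DNA
#   expression  O
#
# Hypothetical invocation, assuming the method belongs to a corpus class:
corpus = JNLPBACorpus("data/jnlpba/train.iob2")
corpus.load_corpus(corenlp_client)  # corenlp_client: whatever handle Document.process_document expects
print len(corpus.documents), "documents loaded"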