# Assumes module-level imports in the surrounding file: codecs, logging,
# progressbar as pb, and the Document and Sentence classes used below.
def load_corpus(self, corenlpserver, process=True):
    widgets = [pb.Percentage(), ' ', pb.Bar(), ' ', pb.ETA(), ' ', pb.Timer()]
    # determine the index of the last line, used for the progress bar size
    # and for the end-of-file check inside the main loop
    nlines = 0
    with open(self.path) as f:
        for nlines, l in enumerate(f):
            pass
    logging.info("%s lines in %s", nlines, self.path)
    pbar = pb.ProgressBar(widgets=widgets, maxval=nlines).start()
    with codecs.open(self.path, 'r', "utf-8") as corpusfile:
        doc_text = ""
        sentences = []
        for i, l in enumerate(corpusfile):
            if l.startswith("###"):  # header line: a new document starts here
                if doc_text != "":
                    # close the previous document before starting the new one
                    logging.debug("creating document: {}".format(doc_text))
                    newdoc = Document(doc_text, process=False, did=did)
                    newdoc.sentences = sentences[:]
                    newdoc.process_document(corenlpserver, "biomedical")
                    # logging.info(len(newdoc.sentences))
                    self.documents[newdoc.did] = newdoc
                    doc_text = ""
                # the document ID is the last colon-separated field of the header
                did = "JNLPBA" + l.strip().split(":")[-1]
                logging.debug("starting new document:" + did)
                sentence_text = ""
                doc_offset = 0
                sentences = []
            elif l.strip() == "" and sentence_text != "":  # blank line: the current sentence ends
                # logging.debug("creating new sentence: {}".format(sentence_text))
                sid = did + ".s" + str(len(sentences))
                this_sentence = Sentence(sentence_text, offset=doc_offset, sid=sid, did=did)
                doc_offset += len(sentence_text) + 1
                doc_text += sentence_text + " "
                sentences.append(this_sentence)
                if i == nlines:
                    # last line of the file: close the final document
                    # (assumes the corpus file ends with a blank line)
                    logging.debug("creating document: {}".format(doc_text))
                    newdoc = Document(doc_text, process=False, did=did)
                    newdoc.sentences = sentences[:]
                    newdoc.process_document(corenlpserver, "biomedical")
                    # logging.info(len(newdoc.sentences))
                    self.documents[newdoc.did] = newdoc
                    doc_text = ""
                # start a new sentence
                sentence_text = ""
            else:
                # token line: the first tab-separated column is the token text
                # logging.debug(str(i) + "/" + str(l))
                t = l.strip().split("\t")
                if sentence_text != "":
                    sentence_text += " "
                # if t[1] == "B-protein"
                sentence_text += t[0]
            pbar.update(i)
    pbar.finish()
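
For reference, the loader above expects the JNLPBA IOB layout: a header line starting with "###" (whose last colon-separated field becomes the document ID, prefixed with "JNLPBA") opens each abstract, each token sits on its own line with its tag in a tab-separated second column, and a blank line closes a sentence. The sketch below only illustrates how the method might be called; the corpus class name JNLPBACorpus, the file path, and the corenlp_client variable are placeholders, not part of the original code.

# Illustrative input fragment (columns are tab-separated; a blank line ends a sentence):
#   ###MEDLINE:95338244
#   IL-2        B-DNA
#   gene        I-DNA
#   expression  O
#
# Hypothetical invocation, assuming the method belongs to a corpus class:
corpus = JNLPBACorpus("data/jnlpba/train.iob2")
corpus.load_corpus(corenlp_client)  # corenlp_client: whatever handle Document.process_document expects
print len(corpus.documents), "documents loaded"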