def progress_bar(n):
    import progressbar
    return progressbar.ProgressBar(
        max_value=n,
        widgets=[
            progressbar.Percentage(),
            ' ',
            '(',
            progressbar.SimpleProgress(),
            ')',
            ' ',
            progressbar.Bar(),
            ' ',
            progressbar.AdaptiveETA(),
        ])
# http://code.activestate.com/recipes/577058/
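A minimal usage sketch for progress_bar() above, assuming the progressbar2 package (its ProgressBar takes max_value and is advanced with update()):

import time

bar = progress_bar(100)
for i in range(100):
    time.sleep(0.01)   # stand-in for the real work
    bar.update(i + 1)  # advance the bar to the current count
bar.finish()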
Python AdaptiveETA() class: example source code
def __init__(self, name, max_value=100, history_len=5, display=True,
             display_data={'train': ['loss', 'accuracy'], 'test': ['loss', 'accuracy']},
             level=logging.INFO, train_log_mode='TRAIN_PROGRESS', test_log_mode='TEST_PROGRESS'):
    super(ProgressbarLogger, self).__init__(
        name, level=level, display=display, logfile=None,
        train_log_mode=train_log_mode, test_log_mode=test_log_mode)
    self.train_log_data = {}
    self.test_log_data = {}
    self.max_value = max_value
    self.history_len = history_len
    self.display_data = display_data
    self.mode['TRAIN_PROGRESS'] = self.log_train_progress
    self.mode['TEST_PROGRESS'] = self.log_test_progress
    # create the logging format: overall counter, percentage and bar first
    self.widgets = [progressbar.FormatLabel('(%(value)d of %(max)s)'),
                    ' ', progressbar.Percentage(),
                    ' ', progressbar.Bar()]
    # one dynamic value per (phase, metric) pair, plus a 'diff_' entry for the change
    self.dynamic_data = {k + '_' + kk: 0.0 for k in display_data.keys() for kk in display_data[k]}
    diff_data = {'diff_' + k + '_' + kk: 0.0 for k in display_data.keys() for kk in display_data[k]}
    self.dynamic_data.update(diff_data)
    for t in display_data.keys():
        ddstr = ' [' + t + ']'
        for s in display_data[t]:
            value_name = t + '_' + s
            ddstr = ddstr + ' ' + s + ':' + '%(' + value_name + ').3f (%(diff_' + value_name + ').3f)'
        self.widgets.append(progressbar.FormatLabel(ddstr))
    self.widgets.extend(['|', progressbar.FormatLabel('Time: %(elapsed)s'), '|', progressbar.AdaptiveETA()])
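The FormatLabel widgets built in the loop interpolate named values with printf-style placeholders. For the default display_data, the label constructed above looks like this (plain-string sketch, no progressbar calls):

display_data = {'train': ['loss', 'accuracy']}
ddstr = ' [train]'
for s in display_data['train']:
    ddstr += ' ' + s + ':%(train_' + s + ').3f (%(diff_train_' + s + ').3f)'
print(ddstr)
# -> [train] loss:%(train_loss).3f (%(diff_train_loss).3f) accuracy:%(train_accuracy).3f (%(diff_train_accuracy).3f)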
def load_corpus(self, corenlpserver, process=True):
    """
    Use the PubMed web services to retrieve the title and abstract of each PMID
    :param corenlpserver:
    :param process:
    :return:
    """
    time_per_abs = []
    widgets = [pb.Percentage(), ' ', pb.Bar(), ' ', pb.AdaptiveETA(), ' ', pb.Timer()]
    pbar = pb.ProgressBar(widgets=widgets, maxval=len(self.pmids), redirect_stdout=True).start()
    for i, pmid in enumerate(self.pmids):
        t = time.time()
        newdoc = PubmedDocument(pmid)
        if newdoc.abstract == "":
            logging.info("ignored {}: no abstract found".format(pmid))
            continue
        newdoc.process_document(corenlpserver, "biomedical")
        self.documents["PMID" + pmid] = newdoc
        abs_time = time.time() - t
        time_per_abs.append(abs_time)
        pbar.update(i + 1)
    pbar.finish()
    abs_avg = sum(time_per_abs) * 1.0 / len(time_per_abs)
    logging.info("average time per abstract: %ss" % abs_avg)
def create_progress_bar(message):
    widgets = [
        message,
        progressbar.Counter(),
        ' ',
        progressbar.Percentage(),
        ' ',
        progressbar.Bar(),
        progressbar.AdaptiveETA()
    ]
    pbar = progressbar.ProgressBar(widgets=widgets)
    return pbar
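Since no maximum is passed to ProgressBar here, a convenient way to drive the returned bar is to wrap an iterable and let the length be inferred (a sketch assuming the progressbar2 call-an-iterable API):

pbar = create_progress_bar('processing: ')
for item in pbar(range(200)):  # length inferred from the iterable
    pass  # per-item work goes here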
def load_corpus(self, corenlpserver, process=True):
    total_lines = sum(1 for line in open(self.path))
    widgets = [pb.Percentage(), ' ', pb.Bar(), ' ', pb.AdaptiveETA(), ' ', pb.Timer()]
    pbar = pb.ProgressBar(widgets=widgets, maxval=total_lines, redirect_stdout=True).start()
    time_per_abs = []
    with codecs.open(self.path, 'r', "utf-8") as trainfile:
        current = 0
        for line in trainfile:
            #logging.debug('%s:%s/%s', f, current + 1, total)
            x = line.strip().split(" ")
            did = x[0]
            doctext = " ".join(x[1:])
            newdoc = Document(doctext, process=False, did=did)
            #newdoc.sentence_tokenize("biomedical")
            sid = did + ".s0"
            newdoc.sentences.append(Sentence(doctext, offset=0, sid=sid, did=did))
            if process:
                newdoc.process_document(corenlpserver, "biomedical")
            self.documents[newdoc.did] = newdoc
            # abs_time = time.time() - t
            # time_per_abs.append(abs_time)
            #logging.info("%s sentences, %ss processing time" % (len(newdoc.sentences), abs_time))
            pbar.update(current + 1)
            current += 1
    pbar.finish()
    # abs_avg = sum(time_per_abs)*1.0/len(time_per_abs)
    # logging.info("average time per abstract: %ss" % abs_avg)
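The loader above expects one document per line, with the document ID as the first whitespace-separated token and the text as the remainder (hypothetical line shown):

line = "PMID123 Protein kinase C is activated by calcium ."
x = line.strip().split(" ")
did, doctext = x[0], " ".join(x[1:])
assert did == "PMID123"
assert doctext.startswith("Protein kinase")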
def load_corpus(self, corenlpserver, process=True):
    # self.path is the base directory of the files of this corpus
    trainfiles = [self.path + '/' + f for f in os.listdir(self.path) if f.endswith('.txt')]
    total = len(trainfiles)
    widgets = [pb.Percentage(), ' ', pb.Bar(), ' ', pb.AdaptiveETA(), ' ', pb.Timer()]
    pbar = pb.ProgressBar(widgets=widgets, maxval=total, redirect_stdout=True).start()
    time_per_abs = []
    for current, f in enumerate(trainfiles):
        #logging.debug('%s:%s/%s', f, current + 1, total)
        print('{}:{}/{}'.format(f, current + 1, total))
        did = f.split(".")[0].split("/")[-1]
        t = time.time()
        with codecs.open(f, 'r', 'utf-8') as txt:
            doctext = txt.read()
        doctext = doctext.replace("\n", " ")
        newdoc = Document(doctext, process=False, did=did)
        newdoc.sentence_tokenize("biomedical")
        if process:
            newdoc.process_document(corenlpserver, "biomedical")
        self.documents[newdoc.did] = newdoc
        abs_time = time.time() - t
        time_per_abs.append(abs_time)
        #logging.info("%s sentences, %ss processing time" % (len(newdoc.sentences), abs_time))
        pbar.update(current + 1)
    pbar.finish()
    abs_avg = sum(time_per_abs) * 1.0 / len(time_per_abs)
    logging.info("average time per abstract: %ss" % abs_avg)
def load_corpus(self, corenlpserver, process=True):
    # self.path is just one file with every document
    time_per_abs = []
    with open(self.path, 'r') as xml:
        root = ET.fromstring(xml.read())
    all_docs = root.findall("document")
    widgets = [pb.Percentage(), ' ', pb.Bar(), ' ', pb.AdaptiveETA(), ' ', pb.Timer()]
    pbar = pb.ProgressBar(widgets=widgets, maxval=len(all_docs)).start()
    for i, doc in enumerate(all_docs):
        t = time.time()  # time each document individually
        doctext = ""
        did = doc.get('id')
        doc_sentences = []  # get the sentences of this document
        doc_offset = 0  # offset of the current sentence relative to the document
        for sentence in doc.findall('sentence'):
            sid = sentence.get('id')
            #logging.info(sid)
            text = sentence.get('text')
            #text = text.replace('\r\n', ' ')
            doctext += " " + text  # generate the full text of this document
            this_sentence = Sentence(text, offset=doc_offset, sid=sid, did=did)
            doc_offset = len(doctext)
            doc_sentences.append(this_sentence)
        newdoc = Document(doctext, process=False, did=did)
        newdoc.sentences = doc_sentences[:]
        newdoc.process_document(corenlpserver, "biomedical")
        self.documents[newdoc.did] = newdoc
        abs_time = time.time() - t
        time_per_abs.append(abs_time)
        pbar.update(i + 1)
    pbar.finish()
    abs_avg = sum(time_per_abs) * 1.0 / len(time_per_abs)
    logging.info("average time per abstract: %ss" % abs_avg)
def load_corpus(self, corenlpserver, process=True):
    # self.path is the base directory of the files of this corpus
    trainfiles = [self.path + '/' + f for f in os.listdir(self.path) if f.endswith('.txt')]
    total = len(trainfiles)
    widgets = [pb.Percentage(), ' ', pb.Bar(), ' ', pb.AdaptiveETA(), ' ', pb.Timer()]
    pbar = pb.ProgressBar(widgets=widgets, maxval=total, redirect_stdout=True).start()
    time_per_abs = []
    for current, f in enumerate(trainfiles):
        #logging.debug('%s:%s/%s', f, current + 1, total)
        print('{}:{}/{}'.format(f, current + 1, total))
        did = f.split(".")[0].split("/")[-1]
        t = time.time()
        with io.open(f, 'r', encoding='utf8') as txt:
            doctext = txt.read()
        newdoc = Document(doctext, process=False, did=did)
        newdoc.sentence_tokenize("biomedical")
        if process:
            newdoc.process_document(corenlpserver, "biomedical")
        self.documents[newdoc.did] = newdoc
        abs_time = time.time() - t
        time_per_abs.append(abs_time)
        #logging.info("%s sentences, %ss processing time" % (len(newdoc.sentences), abs_time))
        pbar.update(current + 1)
    pbar.finish()
    abs_avg = sum(time_per_abs) * 1.0 / len(time_per_abs)
    logging.info("average time per abstract: %ss" % abs_avg)
def load_corpus(self, corenlpserver, process=True):
    trainfiles = [self.path + '/' + f for f in os.listdir(self.path)]
    total = len(trainfiles)
    widgets = [pb.Percentage(), ' ', pb.Bar(), ' ', pb.AdaptiveETA(), ' ', pb.Timer()]
    pbar = pb.ProgressBar(widgets=widgets, maxval=total, redirect_stdout=True).start()
    time_per_abs = []
    for current, f in enumerate(trainfiles):
        #logging.debug('%s:%s/%s', f, current + 1, total)
        print('{}:{}/{}'.format(f, current + 1, total))
        did = f
        t = time.time()
        with open(f, 'r') as txt:  # do not shadow the loop variable f
            article = "<Article>" + txt.read() + "</Article>"
        soup = BeautifulSoup(article, 'xml')
        #doc = soup.find_all("article")
        title = soup.ArticleTitle.get_text()
        abstract = soup.AbstractText.get_text()
        doc_text = title + " " + abstract
        newdoc = Document(doc_text, process=False, did=did)
        newdoc.sentence_tokenize("biomedical")
        newdoc.process_document(corenlpserver, "biomedical")
        #logging.info(len(newdoc.sentences))
        self.documents[newdoc.did] = newdoc
        abs_time = time.time() - t
        time_per_abs.append(abs_time)
        logging.debug("%s sentences, %ss processing time" % (len(newdoc.sentences), abs_time))
        pbar.update(current + 1)  # update is 1-based, matching the other loaders
    pbar.finish()
    abs_avg = sum(time_per_abs) * 1.0 / len(time_per_abs)
    logging.info("average time per abstract: %ss" % abs_avg)
def make_progress_bar(name, size):
    widgets = [
        '%s: ' % name[:8],
        progressbar.Percentage(),
        ' ',
        progressbar.Bar(),
        ' ',
        progressbar.AdaptiveETA(),
        ' ',
        progressbar.DataSize(),
        ' ',
        progressbar.AdaptiveTransferSpeed(),
    ]
    return progressbar.ProgressBar(widgets=widgets, max_value=size)
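A usage sketch for a byte-counted transfer, assuming progressbar2 (its DataSize and AdaptiveTransferSpeed widgets render the current value as bytes and a rate):

import time

total = 1024 * 1024
bar = make_progress_bar('download', total)
done = 0
while done < total:
    done = min(done + 64 * 1024, total)  # simulate a 64 KiB chunk arriving
    bar.update(done)
    time.sleep(0.01)
bar.finish()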
def load_annotations(self, ann_dir, etype, ptype):
    # assumes the files in ann_dir mirror the filenames found in self.path
    trainfiles = [ann_dir + '/' + f for f in os.listdir(self.path)]
    total = len(trainfiles)
    widgets = [pb.Percentage(), ' ', pb.Bar(), ' ', pb.AdaptiveETA(), ' ', pb.Timer()]
    pbar = pb.ProgressBar(widgets=widgets, maxval=total, redirect_stdout=True).start()
    time_per_abs = []
    for current, f in enumerate(trainfiles):
        # logging.debug('%s:%s/%s', f, current + 1, total)
        print('{}:{}/{}'.format(f, current + 1, total))
        did = f
        with open(f, 'r') as txt:  # do not shadow the loop variable f
            article = "<Article>" + txt.read() + "</Article>"
        soup = BeautifulSoup(article, 'xml')
        title = soup.ArticleTitle
        abstract = soup.AbstractText
        title_text = title.get_text()
        abstract_text = abstract.get_text()
        # the document text was built as title + " " + abstract, so abstract
        # offsets are shifted by the title length plus one space
        abs_offset = len(title.get_text()) + 1
        title_entities = title.find_all("prot", recursive=False)
        abs_entities = abstract.find_all("prot", recursive=False)
        lastindex = 0
        for ei, e in enumerate(title_entities):
            estart = title_text.find(e.text, lastindex)
            eend = estart + len(e.text)
            etext = title_text[estart:eend]
            #print etext, estart, eend, self.documents[did].text
            this_sentence = self.documents[did].find_sentence_containing(estart, eend, chemdner=False)
            eid = this_sentence.tag_entity(estart, eend, "protein", text=e.text)
            if eid is None:
                print("did not add this entity: {}".format(e.text))
            # print e.text
            lastindex = estart
        lastindex = 0
        for ei, e in enumerate(abs_entities):
            estart = abstract_text.find(e.text, lastindex)
            eend = estart + len(e.text)
            etext = self.documents[did].text[estart:eend]
            # logging.info("{} - {}".format(lastindex, e.text))
            #logging.info(estart)
            #logging.info("{} + {} {}: {}-{}: {}".format(abstract_text.find(e.text, lastindex), abs_offset, e.text, estart,
            #             eend, "-".join([str(s.offset) for s in self.documents[did].sentences])))
            #logging.info(abstract_text)
            this_sentence = self.documents[did].find_sentence_containing(estart + abs_offset, eend + abs_offset, chemdner=False)
            eid = this_sentence.tag_entity(estart + abs_offset - this_sentence.offset, eend + abs_offset - this_sentence.offset,
                                           "protein", text=e.text)
            if eid is None:
                print("did not add this entity: {}".format(e.text))
            # print e.text
            lastindex = estart
        pbar.update(current + 1)  # advance the bar once per file, as in the loaders above
    pbar.finish()
    #for s in all_entities:
    #    print s, len(all_entities[s])
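The abs_offset arithmetic follows from how load_corpus builds the text as title + " " + abstract: an abstract-relative offset maps into the document by adding len(title) + 1 (a sanity check with hypothetical strings):

title = "BRCA1 binds RAD51."
abstract = "We show that BRCA1 interacts with RAD51."
doc_text = title + " " + abstract
abs_offset = len(title) + 1
estart = abstract.find("RAD51")
assert doc_text[abs_offset + estart:abs_offset + estart + len("RAD51")] == "RAD51"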