def preprocess(self, questions: List[QASetting],
answers: Optional[List[List[Answer]]] = None,
is_eval: bool = False) -> List[XQAAnnotation]:
if answers is None:
answers = [None] * len(questions)
preprocessed = []
if len(questions) > 1000:
bar = progressbar.ProgressBar(
max_value=len(questions),
widgets=[' [', progressbar.Timer(), '] ', progressbar.Bar(), ' (', progressbar.ETA(), ') '])
for q, a in bar(zip(questions, answers)):
preprocessed.append(self.preprocess_instance(q, a))
else:
for q, a in zip(questions, answers):
preprocessed.append(self.preprocess_instance(q, a))
return preprocessed
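# For context: a minimal, self-contained sketch of the same widget layout
# (Timer, Bar, ETA) driving a plain loop with progressbar2; the per-item work
# is simulated with a sleep.
import time
import progressbar

items = list(range(100))  # placeholder workload
bar = progressbar.ProgressBar(
    max_value=len(items),
    widgets=[' [', progressbar.Timer(), '] ', progressbar.Bar(), ' (', progressbar.ETA(), ') '])
for item in bar(items):
    time.sleep(0.01)  # stand-in for preprocess_instance(q, a)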
def preprocess(self, questions: List[QASetting],
answers: Optional[List[List[Answer]]] = None,
is_eval: bool = False) -> List[MCAnnotation]:
if answers is None:
answers = [None] * len(questions)
preprocessed = []
if len(questions) > 1000:
bar = progressbar.ProgressBar(
max_value=len(questions),
widgets=[' [', progressbar.Timer(), '] ', progressbar.Bar(), ' (', progressbar.ETA(), ') '])
for i, (q, a) in bar(enumerate(zip(questions, answers))):
preprocessed.append(self.preprocess_instance(i, q, a))
else:
for i, (q, a) in enumerate(zip(questions, answers)):
preprocessed.append(self.preprocess_instance(i, q, a))
return preprocessed
def progressbarize(iterable, progress=False):
"""Construct progressbar for loops if progressbar requested, otherwise return directly iterable.
:param iterable: iterable to use
:param progress: True if print progressbar
"""
if progress:
# The casting to list is due to possibly yielded value that prevents
# ProgressBar to compute overall ETA
return progressbar.ProgressBar(widgets=[
progressbar.Timer(), ', ',
progressbar.Percentage(), ', ',
progressbar.SimpleProgress(), ', ',
progressbar.ETA()
])(list(iterable))
return iterable
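# Usage sketch for progressbarize as defined above, assuming the progressbar
# module is imported as in the snippet; the loop body is a placeholder.
import time

for item in progressbarize(range(50), progress=True):
    time.sleep(0.05)  # stand-in for per-item work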
def deleteHostsByHostgroup(groupname):
hostgroup = zapi.hostgroup.get(output=['groupid'],filter={'name': groupname})
    if hostgroup.__len__() != 1:
        logger.error('Hostgroup not found: %s\n\tFound this: %s' % (groupname, hostgroup))
        return
    groupid = int(hostgroup[0]['groupid'])
hosts = zapi.host.get(output=['name','hostid'],groupids=groupid)
total = len(hosts)
logger.info('Hosts found: %d' % (total))
if ( args.run ):
x = 0
bar = ProgressBar(maxval=total,widgets=[Percentage(), ReverseBar(), ETA(), RotatingMarker(), Timer()]).start()
logger.echo = False
for host in hosts:
x = x + 1
bar.update(x)
logger.debug('(%d/%d) >> Removing >> %s' % (x, total, host))
out = zapi.globo.deleteMonitors(host['name'])
bar.finish()
logger.echo = True
else:
logger.info('No host removed due to --no-run arg. Full list of hosts:')
for host in hosts:
logger.info('%s' % host['name'])
return
def hosts_disable_all():
"""
status de host 0 = enabled
status de host 1 = disabled
"""
logger.info('Disabling all hosts, in blocks of 1000')
hosts = zapi.host.get(output=[ 'hostid' ], search={ 'status': 0 })
    maxval = int(ceil(hosts.__len__() / 1000.0))
    bar = ProgressBar(maxval=maxval, widgets=[Percentage(), ReverseBar(), ETA(), RotatingMarker(), Timer()]).start()
    for i in xrange(maxval):
        block = hosts[:1000]
        del hosts[:1000]
        result = zapi.host.massupdate(hosts=[x for x in block], status=1)
        bar.update(i + 1)
bar.finish()
logger.info('Done')
return
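# Simplified standalone sketch of the same block-of-1000 update loop, with the
# Zabbix call left as a comment and the block count computed directly; the host
# list is a placeholder.
from math import ceil
from progressbar import ProgressBar, Percentage, Bar, ETA, Timer

hosts = [{'hostid': i} for i in range(2500)]  # placeholder for zapi.host.get(...)
blocks = int(ceil(len(hosts) / 1000.0))       # number of 1000-host blocks
bar = ProgressBar(maxval=blocks, widgets=[Percentage(), ' ', Bar(), ' ', ETA(), ' ', Timer()]).start()
for i in range(blocks):
    block = hosts[i * 1000:(i + 1) * 1000]
    # zapi.host.massupdate(hosts=block, status=1) would go here
    bar.update(i + 1)
bar.finish()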
def proxy_passive_to_active():
"""
status de prxy 5 = active
status de prxy 6 = passive
"""
logger.info('Change all proxys to active')
    proxys = zapi.proxy.get(output=['proxyid', 'host'],
                            filter={'status': 6})
if ( proxys.__len__() == 0 ):
logger.info('Done')
return
bar = ProgressBar(maxval=proxys.__len__(),widgets=[Percentage(), ReverseBar(), ETA(), RotatingMarker(), Timer()]).start()
i = 0
for x in proxys:
i += 1
proxyid = x['proxyid']
result = zapi.proxy.update(proxyid=proxyid, status=5)
logger.echo = False
logger.debug('Changed from passive to active proxy: %s' % (x['host']))
bar.update(i)
bar.finish()
logger.echo = True
logger.info('Done')
return
def load_corpus(self, corenlpserver, process=True):
"""
Use the PubMed web services to retrieve the title and abstract of each PMID
:param corenlpserver:
:param process:
:return:
"""
time_per_abs = []
widgets = [pb.Percentage(), ' ', pb.Bar(), ' ', pb.AdaptiveETA(), ' ', pb.Timer()]
pbar = pb.ProgressBar(widgets=widgets, maxval=len(self.pmids), redirect_stdout=True).start()
for i, pmid in enumerate(self.pmids):
t = time.time()
newdoc = PubmedDocument(pmid)
if newdoc.abstract == "":
logging.info("ignored {} due to the fact that no abstract was found".format(pmid))
continue
newdoc.process_document(corenlpserver, "biomedical")
self.documents["PMID" + pmid] = newdoc
abs_time = time.time() - t
time_per_abs.append(abs_time)
pbar.update(i+1)
pbar.finish()
abs_avg = sum(time_per_abs)*1.0/len(time_per_abs)
logging.info("average time per abstract: %ss" % abs_avg)
def _setup_progress(self, options):
if options.progress:
if self.beanstalk:
# With Beanstalk C&C we don't know how many...
self.progress = progressbar.ProgressBar(
redirect_stdout=True,
redirect_stderr=True,
widgets=[
'Total: ',
progressbar.Counter(),
', ',
progressbar.Timer()
])
else:
self.progress = progressbar.ProgressBar(
redirect_stdout=True,
redirect_stderr=True,
widgets=[
progressbar.Percentage(),
progressbar.Bar(),
' (', progressbar.ETA(), ') ',
])
else:
self.progress = None
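# For the Beanstalk branch, where the total is unknown: a minimal sketch with
# Counter and Timer against progressbar.UnknownLength (progressbar2 API); the
# work loop is simulated.
import time
import progressbar

bar = progressbar.ProgressBar(
    max_value=progressbar.UnknownLength,
    widgets=['Total: ', progressbar.Counter(), ', ', progressbar.Timer()])
for i in range(25):
    time.sleep(0.02)  # stand-in for handling one incoming job
    bar.update(i + 1)
bar.finish()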
def bar_update(self, epoch, logs):
ologs = {}
for k in self.custom_log_functions:
ologs[k] = self.custom_log_functions[k]()
for k in logs:
if len(k) > 5:
ologs[k[-5:]] = logs[k]
else:
ologs[k] = logs[k]
if not hasattr(self,'bar'):
import progressbar
widgets = [
progressbar.Timer(format='%(elapsed)s'),
' ', progressbar.Counter(),
progressbar.Bar(),
progressbar.AbsoluteETA(format='%(eta)s'), ' ',
]
keys = []
for k in ologs:
keys.append(k)
keys.sort()
for k in keys:
widgets.append(progressbar.DynamicMessage(k))
widgets.append(' ')
self.bar = progressbar.ProgressBar(max_value=self.max_epoch, widgets=widgets)
self.bar.update(epoch+1, **ologs)
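# Reduced sketch of the DynamicMessage idea outside the training callback: one
# metric widget updated by keyword on every epoch; the loss values are synthetic.
import time
import progressbar

max_epoch = 5
widgets = [
    progressbar.Timer(format='%(elapsed)s'), ' ',
    progressbar.Counter(), progressbar.Bar(),
    progressbar.DynamicMessage('loss'), ' ',
]
bar = progressbar.ProgressBar(max_value=max_epoch, widgets=widgets)
for epoch in range(max_epoch):
    time.sleep(0.1)  # stand-in for one training epoch
    bar.update(epoch + 1, loss=1.0 / (epoch + 1))  # synthetic loss value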
def __call__(self, epoch):
if self._batches is None:
logger.info("Preparing evaluation data...")
self._batches = self.reader.input_module.batch_generator(self._dataset, self._batch_size, is_eval=True)
logger.info("Started evaluation %s" % self._info)
metrics = defaultdict(lambda: list())
bar = progressbar.ProgressBar(
max_value=len(self._dataset) // self._batch_size + 1,
widgets=[' [', progressbar.Timer(), '] ', progressbar.Bar(), ' (', progressbar.ETA(), ') '])
for i, batch in bar(enumerate(self._batches)):
inputs = self._dataset[i * self._batch_size:(i + 1) * self._batch_size]
predictions = self.reader.model_module(batch, self._ports)
m = self.apply_metrics(inputs, predictions)
for k in self._metrics:
metrics[k].append(m[k])
metrics = self.combine_metrics(metrics)
super().add_to_history(metrics, self._iter, epoch)
printmetrics = sorted(metrics.keys())
res = "Epoch %d\tIter %d\ttotal %d" % (epoch, self._iter, self._total)
for m in printmetrics:
res += '\t%s: %.3f' % (m, metrics[m])
self.update_summary(self._iter, self._info + '_' + m, metrics[m])
if self._write_metrics_to is not None:
with open(self._write_metrics_to, 'a') as f:
f.write("{0} {1} {2:.5}\n".format(datetime.now(), self._info + '_' + m,
np.round(metrics[m], 5)))
res += '\t' + self._info
logger.info(res)
if self._side_effect is not None:
self._side_effect_state = self._side_effect(metrics, self._side_effect_state)
def __init__(self, nbytes, nfiles):
self._total_bytes = nbytes
self._pending_files = nfiles
self._transferring_files = 0
self._complete_files = 0
self._lock = Lock()
self._data = {}
widgets = ['Progress: ', Percentage(), ' ', Bar(left='[',right=']'),
' ', Timer(format='Time: %s'), ' ', FileTransferSpeed()]
if self._total_bytes > 0:
self.pbar = ProgressBar(widgets=widgets, maxval=self._total_bytes).start()
else:
self.pbar = ProgressBar(widgets=widgets, maxval=nfiles).start()
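# Standalone sketch of the byte-based widget layout above with the legacy
# maxval/start/update/finish calls; the transfer is simulated in fixed chunks.
import time
from progressbar import ProgressBar, Percentage, Bar, Timer, FileTransferSpeed

total_bytes = 10 * 1024 * 1024  # assumed total size for the sketch
widgets = ['Progress: ', Percentage(), ' ', Bar(), ' ', Timer(), ' ', FileTransferSpeed()]
pbar = ProgressBar(widgets=widgets, maxval=total_bytes).start()
done = 0
while done < total_bytes:
    done = min(done + 512 * 1024, total_bytes)  # simulate one 512 KiB chunk
    pbar.update(done)
    time.sleep(0.01)
pbar.finish()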
def create_app(load_db=True, populate_qr_cache=True, progressbar=False):
# Set up logging
log_level = os.environ.get('AF_LOGGING_LEVEL', None)
if log_level is not None:
log_levels = ('DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL')
if log_level.upper() in log_levels:
log_level = getattr(log, log_level)
log.basicConfig(level=log_level)
else:
log.warning('Invalid log level: {}'.format(log_level.upper()))
else:
log.warning('No log level set, using default level.')
log.info('Creating Flask application')
app = Flask(__name__)
app.register_blueprint(root)
# Now load the database if requested
if load_db:
from . import database_handler as dh
log.info('Loading database.')
dh.get_database() # This loads the database into memory.
log.info('Database loaded.')
if populate_qr_cache:
if progressbar:
from progressbar import ProgressBar, Bar, Timer, ETA
pbar = ProgressBar(widgets=['Populating QR cache: ', Bar(),
' ', Timer(), ' ', ETA()])
kwargs = {'pbar': pbar}
else:
log.info('Populating QR cache.')
kwargs = {}
from .cache_utils import populate_qr_cache
populate_qr_cache(**kwargs)
return app
def createSQL(table,values,name='insert'):
    '''
    Generate the SQL insert lines, breaking each INSERT into up to ~1k values
    and each file into up to ~1k INSERTs (~1M values total per SQL file)
    '''
logger.info('Generating SQL file')
queryInsert='INSERT INTO %s (itemid,clock,num,value_min,value_avg,value_max) VALUES' % table
i=0 # Controls the progress bar
x=0 # Controls number of inserts in one line
y=0 # Controls number of lines in one file
z=0 # Controls number of file name
valuesLen=values.__len__()
sqlFile='%s.sql.%d' % (name,z)
    logger.debug('Total items for %s: %d' % (name,valuesLen))
if valuesLen > 0:
bar=ProgressBar(maxval=valuesLen,widgets=[Percentage(), ReverseBar(), ETA(), RotatingMarker(), Timer()]).start()
for value in values:
i+=1
x+=1
if x != 1: # First line only
sqlInsert='%s,%s' % (sqlInsert,value)
else:
sqlInsert=value
if y >= 1000: # If there is more than 1k lines, write to new file
z+=1
y=0
if x >= 1000 or i == valuesLen: # If there is more than 1k values or we finished our list, write to file
sqlFile='%s.sql.%d' % (name,z)
fileAppend(f=sqlFile,content='%s %s;\n' % (queryInsert,sqlInsert))
x=0
y+=1
sqlInsert=''
            if args.loglevel.upper() != 'DEBUG':  # Don't print the progress bar when in debug mode
bar.update(i)
bar.finish()
else:
logger.warning('No values received')
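# The same chunking rule (at most ~1k values per INSERT, at most ~1k INSERTs per
# file) expressed with slicing; a rough sketch where the value tuples and file
# name are purely illustrative.
def chunk(seq, size=1000):
    """Yield consecutive slices of at most `size` elements."""
    for start in range(0, len(seq), size):
        yield seq[start:start + size]

values = ['(1,2,3,4,5,6)', '(7,8,9,10,11,12)']  # illustrative, already rendered value tuples
lines = list(chunk(values, 1000))                # each line holds up to 1000 value tuples
for file_no, file_block in enumerate(chunk(lines, 1000)):  # each file holds up to 1000 lines
    with open('insert.sql.%d' % file_no, 'a') as out:
        for line_values in file_block:
            out.write('INSERT INTO trends (itemid,clock,num,value_min,value_avg,value_max) VALUES %s;\n'
                      % ','.join(line_values))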
def discovery_disable_all(status=0):
"""
Alterar status de todos os discoveries *auto*
Status 0 = enable
Status 1 = disable
"""
logger.info('Disabling all network discoveries')
druleids = zapi.drule.get(output=[ 'druleid', 'iprange', 'name', 'proxy_hostid', 'status' ],
selectDChecks='extend', filter={ 'status': 0 })
if ( druleids.__len__() == 0 ):
logger.info('Done')
return
bar = ProgressBar(maxval=druleids.__len__(),widgets=[Percentage(), ReverseBar(), ETA(), RotatingMarker(), Timer()]).start()
i = 0
for x in druleids:
params_disable = {
'druleid': x['druleid'],
'iprange': x['iprange'],
'name': x['name'],
'dchecks': x['dchecks'],
'status': 1
}
out = zapi.drule.update(**params_disable)
logger.echo = False
        if out:
            logger.debug('\tNew status: %s (%s) --> %d' % (x['name'], out['druleids'], status))
        else:
            logger.warning('\tFAILED to change status: %s --> %d' % (x['name'], status))
i += 1
bar.update(i)
logger.echo = True
bar.finish()
logger.info('Done')
return
def desabilitaItensNaoSuportados():
query = {
"output": "extend",
"filter": {
"state": 1
},
"monitored": True
}
    filtro = raw_input('Search filter for key_? [ENTER = none] ')
    if filtro.__len__() > 0:
        query['search'] = {'key_': filtro}
    limite = raw_input('Item limit? [ENTER = none] ')
    if limite.__len__() > 0:
        try:
            query['limit'] = int(limite)
        except ValueError:
            print 'Invalid limit'
            raw_input("Press ENTER to go back")
            main()
    opcao = raw_input("Confirm operation? [y/n] ")
    if opcao == 'y' or opcao == 'Y':
        itens = zapi.item.get(query)
        print 'Found {} items'.format(itens.__len__())
        bar = ProgressBar(maxval=itens.__len__(), widgets=[Percentage(), ReverseBar(), ETA(), RotatingMarker(), Timer()]).start()
        i = 0
        for x in itens:
            result = zapi.item.update({"itemid": x['itemid'], "status": 1})
            i += 1
            bar.update(i)
        bar.finish()
        print "Items disabled!!!"
    raw_input("Press ENTER to continue")
main()
def find_samples_bounding_rect(path):
min_w = 0
min_h = 0
print ('finding bounding box:')
bar = progressbar.ProgressBar(maxval=num_classes*num_samples,
widgets=[
' [', progressbar.Timer(), '] ',
progressbar.Bar(),
' (', progressbar.ETA(), ') ',
])
bar.start()
counter = 0
for i in range(1, num_classes + 1):
for j in range(1, num_samples + 1):
filename = '{0}/Sample{1:03d}/img{1:03d}-{2:03d}.png'.format(path, i, j)
# opencv read -> Gray Image -> Bounding Rect
im = cv2.imread(filename)
imgray = cv2.cvtColor(im, cv2.COLOR_BGR2GRAY)
imgray = cv2.bitwise_not(imgray)
_, contours, _ = cv2.findContours(imgray, cv2.RETR_LIST,cv2.CHAIN_APPROX_SIMPLE)
_, _, w, h = cv2.boundingRect(contours[len(contours) - 1])
# find maximum resolution
min_w = max(min_w, w)
min_h = max(min_h, h)
# update progress bar
counter = counter + 1
bar.update(counter)
bar.finish()
return min_w, min_h
def _init_pbar(self, ini_val, max_val, label):
self._pbar = progressbar.ProgressBar(
min_value=0,
max_value=max_val,
initial_value=ini_val,
widgets=[
label,
progressbar.Percentage(),
'(', progressbar.SimpleProgress(), ')',
progressbar.Bar(),
progressbar.Timer(), ' ',
'|', progressbar.ETA(),
]
)
self._pbar.start()
def load_corpus(self, corenlpserver, process=True):
"""Load the CHEMDNER corpus file on the dir element"""
# open filename and parse lines
total_lines = sum(1 for line in open(self.path))
widgets = [pb.Percentage(), ' ', pb.Bar(), ' ', pb.ETA(), ' ', pb.Timer()]
pbar = pb.ProgressBar(widgets=widgets, maxval=total_lines).start()
n_lines = 1
time_per_abs = []
with io.open(self.path, 'r', encoding="utf-8") as inputfile:
for line in inputfile:
t = time.time()
# each line is PMID title abs
tsv = line.split('\t')
doctext = tsv[1].strip().replace("<", "(").replace(">", ")").replace(". ", ", ") + ". "
doctext += tsv[2].strip().replace("<", "(").replace(">", ")")
newdoc = Document(doctext, process=False,
did=tsv[0], title=tsv[1].strip() + ".")
newdoc.sentence_tokenize("biomedical")
if process:
newdoc.process_document(corenlpserver, "biomedical")
self.documents[newdoc.did] = newdoc
abs_time = time.time() - t
time_per_abs.append(abs_time)
pbar.update(n_lines)
n_lines += 1
pbar.finish()
abs_avg = sum(time_per_abs)*1.0/len(time_per_abs)
logging.info("average time per abstract: %ss" % abs_avg)
def load_corpus(self, corenlpserver, process=True):
total_lines = sum(1 for line in open(self.path))
widgets = [pb.Percentage(), ' ', pb.Bar(), ' ', pb.AdaptiveETA(), ' ', pb.Timer()]
pbar = pb.ProgressBar(widgets=widgets, maxval=total_lines, redirect_stdout=True).start()
time_per_abs = []
with codecs.open(self.path, 'r', "utf-8") as trainfile:
current = 0
for line in trainfile:
#logging.debug('%s:%s/%s', f, current + 1, total)
x = line.strip().split(" ")
did = x[0]
doctext = " ".join(x[1:])
newdoc = Document(doctext, process=False, did=did)
#newdoc.sentence_tokenize("biomedical")
sid = did + ".s0"
newdoc.sentences.append(Sentence(doctext, offset=0, sid=sid, did=did))
if process:
newdoc.process_document(corenlpserver, "biomedical")
self.documents[newdoc.did] = newdoc
# abs_time = time.time() - t
# time_per_abs.append(abs_time)
#logging.info("%s sentences, %ss processing time" % (len(newdoc.sentences), abs_time))
pbar.update(current+1)
current += 1
pbar.finish()
# abs_avg = sum(time_per_abs)*1.0/len(time_per_abs)
# logging.info("average time per abstract: %ss" % abs_avg)
def load_corpus(self, corenlpserver, process=True):
soup = BeautifulSoup(codecs.open(self.path, 'r', "utf-8"), 'html.parser')
docs = soup.find_all("article")
widgets = [pb.Percentage(), ' ', pb.Bar(), ' ', pb.ETA(), ' ', pb.Timer()]
pbar = pb.ProgressBar(widgets=widgets, maxval=len(docs)).start()
n_lines = 1
time_per_abs = []
for doc in docs:
did = "GENIA" + doc.articleinfo.bibliomisc.text.split(":")[1]
title = doc.title.sentence.get_text()
sentences = doc.abstract.find_all("sentence")
doc_sentences = []
doc_text = title + " "
        doc_offset = 0
        t = time.time()  # time the whole document, not just the last sentence
        for si, s in enumerate(sentences):
            stext = s.get_text()
sid = did + ".s" + str(si)
doc_text += stext + " "
this_sentence = Sentence(stext, offset=doc_offset, sid=sid, did=did)
doc_offset = len(doc_text)
doc_sentences.append(this_sentence)
newdoc = Document(doc_text, process=False, did=did)
newdoc.sentences = doc_sentences[:]
newdoc.process_document(corenlpserver, "biomedical")
#logging.info(len(newdoc.sentences))
self.documents[newdoc.did] = newdoc
abs_time = time.time() - t
time_per_abs.append(abs_time)
logging.debug("%s sentences, %ss processing time" % (len(newdoc.sentences), abs_time))
pbar.update(n_lines)
n_lines += 1
pbar.finish()
abs_avg = sum(time_per_abs)*1.0/len(time_per_abs)
logging.info("average time per abstract: %ss" % abs_avg)
def load_corpus(self, corenlpserver, process=True):
# self.path is just one file with every document
time_per_abs = []
    with open(self.path, 'r') as xml:
        root = ET.fromstring(xml.read())
        all_docs = root.findall("document")
        widgets = [pb.Percentage(), ' ', pb.Bar(), ' ', pb.AdaptiveETA(), ' ', pb.Timer()]
        pbar = pb.ProgressBar(widgets=widgets, maxval=len(all_docs)).start()
        for i, doc in enumerate(all_docs):
            t = time.time()  # time each document separately
            doctext = ""
did = doc.get('id')
doc_sentences = [] # get the sentences of this document
doc_offset = 0 # offset of the current sentence relative to the document
for sentence in doc.findall('sentence'):
sid = sentence.get('id')
#logging.info(sid)
text = sentence.get('text')
#text = text.replace('\r\n', ' ')
doctext += " " + text # generate the full text of this document
this_sentence = Sentence(text, offset=doc_offset, sid=sid, did=did)
doc_offset = len(doctext)
doc_sentences.append(this_sentence)
newdoc = Document(doctext, process=False, did=did)
newdoc.sentences = doc_sentences[:]
newdoc.process_document(corenlpserver, "biomedical")
self.documents[newdoc.did] = newdoc
abs_time = time.time() - t
time_per_abs.append(abs_time)
pbar.update(i+1)
pbar.finish()
abs_avg = sum(time_per_abs)*1.0/len(time_per_abs)
logging.info("average time per abstract: %ss" % abs_avg)
def load_corpus(self, corenlpserver, process=True):
# self.path is the base directory of the files of this corpus
trainfiles = [self.path + '/' + f for f in os.listdir(self.path) if f.endswith('.txt')]
total = len(trainfiles)
widgets = [pb.Percentage(), ' ', pb.Bar(), ' ', pb.AdaptiveETA(), ' ', pb.Timer()]
pbar = pb.ProgressBar(widgets=widgets, maxval=total, redirect_stdout=True).start()
time_per_abs = []
for current, f in enumerate(trainfiles):
#logging.debug('%s:%s/%s', f, current + 1, total)
print '{}:{}/{}'.format(f, current + 1, total)
did = f.split(".")[0].split("/")[-1]
t = time.time()
with io.open(f, 'r', encoding='utf8') as txt:
doctext = txt.read()
newdoc = Document(doctext, process=False, did=did)
newdoc.sentence_tokenize("biomedical")
if process:
newdoc.process_document(corenlpserver, "biomedical")
self.documents[newdoc.did] = newdoc
abs_time = time.time() - t
time_per_abs.append(abs_time)
#logging.info("%s sentences, %ss processing time" % (len(newdoc.sentences), abs_time))
pbar.update(current+1)
pbar.finish()
abs_avg = sum(time_per_abs)*1.0/len(time_per_abs)
logging.info("average time per abstract: %ss" % abs_avg)
def load_corpus(self, corenlpserver, process=True):
# self.path is the base directory of the files of this corpus
trainfiles = [self.path + '/' + f for f in os.listdir(self.path) if f.endswith('.txt')]
total = len(trainfiles)
widgets = [pb.Percentage(), ' ', pb.Bar(), ' ', pb.AdaptiveETA(), ' ', pb.Timer()]
pbar = pb.ProgressBar(widgets=widgets, maxval=total, redirect_stdout=True).start()
time_per_abs = []
for current, f in enumerate(trainfiles):
#logging.debug('%s:%s/%s', f, current + 1, total)
print '{}:{}/{}'.format(f, current + 1, total)
did = f.split(".")[0]
t = time.time()
with io.open(f, 'r', encoding='utf8') as txt:
doctext = txt.read()
newdoc = Document(doctext, process=False, did=did)
newdoc.sentence_tokenize("biomedical")
if process:
newdoc.process_document(corenlpserver, "biomedical")
self.documents[newdoc.did] = newdoc
abs_time = time.time() - t
time_per_abs.append(abs_time)
#logging.info("%s sentences, %ss processing time" % (len(newdoc.sentences), abs_time))
pbar.update(current+1)
pbar.finish()
abs_avg = sum(time_per_abs)*1.0/len(time_per_abs)
logging.info("average time per abstract: %ss" % abs_avg)
def load_corpus(self, corenlpserver, process=True):
trainfiles = [self.path + '/' + f for f in os.listdir(self.path)]
total = len(trainfiles)
widgets = [pb.Percentage(), ' ', pb.Bar(), ' ', pb.AdaptiveETA(), ' ', pb.Timer()]
pbar = pb.ProgressBar(widgets=widgets, maxval=total, redirect_stdout=True).start()
time_per_abs = []
for current, f in enumerate(trainfiles):
#logging.debug('%s:%s/%s', f, current + 1, total)
print '{}:{}/{}'.format(f, current + 1, total)
did = f
t = time.time()
with open(f, 'r') as f:
article = "<Article>" + f.read() + "</Article>"
soup = BeautifulSoup(article, 'xml')
#doc = soup.find_all("article")
title = soup.ArticleTitle.get_text()
abstract = soup.AbstractText.get_text()
doc_text = title + " " + abstract
newdoc = Document(doc_text, process=False, did=did)
newdoc.sentence_tokenize("biomedical")
newdoc.process_document(corenlpserver, "biomedical")
#logging.info(len(newdoc.sentences))
self.documents[newdoc.did] = newdoc
abs_time = time.time() - t
time_per_abs.append(abs_time)
logging.debug("%s sentences, %ss processing time" % (len(newdoc.sentences), abs_time))
        pbar.update(current + 1)
pbar.finish()
abs_avg = sum(time_per_abs)*1.0/len(time_per_abs)
logging.info("average time per abstract: %ss" % abs_avg)
def __init__(self):
self.lines = []
self.line_acc = []
self.duration = None
self.source = None
self.started = False
self.pbar = progressbar.ProgressBar(widgets=[
lambda w, d: self.source, ' ',
progressbar.AnimatedMarker(
markers="—/|\\",
),
progressbar.Bar(
marker='?',
fill='?',
left=' ?',
right='? ',
),
            progressbar.Percentage(
                '%(percentage)3d%% '
            ),
progressbar.Timer(
format='(%(elapsed)s / '
),
AbsoluteTimeETA(
format='%(eta)s)',
format_finished='%(elapsed)s',
format_not_started='--:--:--'
),
])
def update(content_type, reload_metadata, path):
"""
Add a specific path to the databases, loading all content and updating the
database where necessary.
The path at PATH will be recursively searched for data.
"""
import os
from . import database_handler as dh
from .resolver import Resolver
from progressbar import ProgressBar, Bar, Timer, ETA
def pbar(msg):
return ProgressBar(widgets=[msg, ' ',
Bar(), ' ', Timer(), ' ', ETA()])
# If this is a relative path, interpret it as relative to the base
# media path, not the cwd.
path = Resolver().resolve_media(path).path
if content_type in ('b', 'books'):
updater = dh.BookDatabaseUpdater(path)
else:
        raise ValueError('Unknown type {}'.format(content_type))
print('Loading database')
db = dh.load_database()
print('Loading all new entries.')
updater.update_db_entries(db)
dh.save_database(db) # Save as we progress
print('Loading books associated with entries.')
updater.assign_books_to_entries(db)
dh.save_database(db)
updater.update_book_metadata(db, pbar=pbar('Loading book metadata:'),
reload_metadata=reload_metadata)
dh.save_database(db)
print('Updating author database')
updater.update_author_db(db)
dh.save_database(db)
print('Updating book covers')
updater.update_cover_images(db)
dh.save_database(db)
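# The pbar factory above can be exercised on its own; a minimal sketch with the
# same widget list and a synthetic iterable (the label is just an example).
import time
from progressbar import ProgressBar, Bar, Timer, ETA

def pbar(msg):
    return ProgressBar(widgets=[msg, ' ', Bar(), ' ', Timer(), ' ', ETA()])

for _ in pbar('Loading book metadata:')(range(20)):
    time.sleep(0.05)  # stand-in for fetching one book's metadata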
def __init__(self, present, past=None, max_value=1, vars=None,
**kwargs):
self.present = present
self.sub_bar = None
self.finished = None
if past is None:
past = present
self.msg_bar = MessageBar(
msg=present, finish_msg="%s finished in" % past)
widgets = [self.msg_bar, " "]
if max_value is None:
widgets.append(progressbar.Timer(format="%(elapsed)s"))
else:
widgets.append(progressbar.ETA(
format="ETA: %(eta)s",
format_finished="%(elapsed)s"))
if vars is not None:
self.var_vals = progressbar.FormatCustomText(
" (" + ", ".join("%s: %%(%s)s" % (v, v) for v in vars) + ")",
{v: "---" for v in vars})
widgets.append(self.var_vals)
else:
self.var_vals = None
def update_thread():
while not self.finished:
if self.sub_bar is None or self.sub_bar.finished:
self.update()
time.sleep(0.001)
self.thread = threading.Thread(target=update_thread)
self.thread.daemon = True
if max_value is None:
max_value = progressbar.UnknownLength
super(ProgressBar, self).__init__(
poll_interval=0.1, widgets=widgets, fd=sys.stdout,
max_value=max_value, **kwargs)
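# Small standalone sketch of the FormatCustomText piece used above, refreshed
# through update_mapping on every step; the metric name and values are illustrative.
import time
import progressbar

var_vals = progressbar.FormatCustomText(' (loss: %(loss)s)', {'loss': '---'})
bar = progressbar.ProgressBar(
    max_value=10,
    widgets=[progressbar.Bar(), ' ', progressbar.Timer(format='%(elapsed)s'), var_vals])
for i in range(10):
    var_vals.update_mapping(loss='%.3f' % (1.0 / (i + 1)))  # push the latest value
    bar.update(i + 1)
    time.sleep(0.05)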
def main():
'''
Controls general flow of operations
'''
# If it exists, use the cached data of hosts and items
if (os.path.isfile(move_items_file)):
with open(move_items_file) as infile:
hosts=json.load(infile)
logger.info('Cache loaded from file (%s)' % move_items_file)
else:
hosts=getItems()
with open(move_items_file, 'w') as outfile:
json.dump(hosts, outfile)
logger.info('Cache written to file (%s)' % move_items_file)
for host in hosts:
        logger.info('Getting trends data of host: %s' % host['name'])
host['trends']=list()
host['trends_uint']=list()
if host['itens'].__len__() > 0:
bar=ProgressBar(maxval=host['itens'].__len__(),widgets=[Percentage(), ReverseBar(), ETA(), RotatingMarker(), Timer()]).start()
i=0
for item in host['itens']:
temp=getTrends(hostname=host['name'],item=item)
i+=1
if args.loglevel.upper() != 'DEBUG':
bar.update(i)
if temp['table'] == 'trends':
for value in temp['values']:
host['trends'].append('(%d, %d, %d, %d, %d, %d)' % (int(item['itemid']), int(value[1]), int(value[2]), int(value[3]), int(value[4]), int(value[5])))
elif temp['table'] == 'trends_uint':
for value in temp['values']:
host['trends_uint'].append('(%d, %d, %d, %d, %d, %d)' % (int(item['itemid']), int(value[1]), int(value[2]), int(value[3]), int(value[4]), int(value[5])))
else:
logger.warning('Unknown value type: %s' % temp['table'])
bar.finish()
'''
Now, we send in blocks of up to ~1M values to generate the SQL files
'''
if host['trends'].__len__() > 0:
createSQL(table='trends',values=host['trends'],name=host['name'])
elif host['trends_uint'].__len__() > 0:
createSQL(table='trends_uint',values=host['trends_uint'],name=host['name'])
else:
logger.warning('No data from %s found to be sent.' % host['name'])
# Start DB connection
def load_corpus(self, corenlpserver, process=True):
widgets = [pb.Percentage(), ' ', pb.Bar(), ' ', pb.ETA(), ' ', pb.Timer()]
nlines = 0
with open(self.path) as f:
for nlines, l in enumerate(f):
pass
print nlines
pbar = pb.ProgressBar(widgets=widgets, maxval=nlines).start()
with codecs.open(self.path, 'r', "utf-8") as corpusfile:
doc_text = ""
sentences = []
for i,l in enumerate(corpusfile):
if l.startswith("###"): # new doc
if doc_text != "":
logging.debug("creating document: {}".format(doc_text))
newdoc = Document(doc_text, process=False, did=did)
newdoc.sentences = sentences[:]
newdoc.process_document(corenlpserver, "biomedical")
# logging.info(len(newdoc.sentences))
self.documents[newdoc.did] = newdoc
doc_text = ""
did = "JNLPBA" + l.strip().split(":")[-1]
logging.debug("starting new document:" + did)
sentence_text = ""
doc_offset = 0
sentences = []
elif l.strip() == "" and sentence_text != "": # new sentence
#logging.debug("creating mew sentence: {}".format(sentence_text))
sid = did + ".s" + str(len(sentences))
this_sentence = Sentence(sentence_text, offset=doc_offset, sid=sid, did=did)
doc_offset += len(sentence_text) + 1
doc_text += sentence_text + " "
sentences.append(this_sentence)
if i == nlines:
logging.debug("creating document: {}".format(doc_text))
newdoc = Document(doc_text, process=False, did=did)
newdoc.sentences = sentences[:]
newdoc.process_document(corenlpserver, "biomedical")
# logging.info(len(newdoc.sentences))
self.documents[newdoc.did] = newdoc
doc_text = ""
# start new sentence
sentence_text = ""
else:
#logging.debug(str(i) + "/" + str(l))
t = l.strip().split("\t")
if sentence_text != "":
sentence_text += " "
#if t[1] == "B-protein"
sentence_text += t[0]
pbar.update(i)
pbar.finish()
def load_annotations(self, ann_dir, etype, ptype):
    trainfiles = [ann_dir + '/' + f for f in os.listdir(ann_dir)]
total = len(trainfiles)
widgets = [pb.Percentage(), ' ', pb.Bar(), ' ', pb.AdaptiveETA(), ' ', pb.Timer()]
pbar = pb.ProgressBar(widgets=widgets, maxval=total, redirect_stdout=True).start()
time_per_abs = []
for current, f in enumerate(trainfiles):
# logging.debug('%s:%s/%s', f, current + 1, total)
print '{}:{}/{}'.format(f, current + 1, total)
did = f
with open(f, 'r') as f:
article = "<Article>" + f.read() + "</Article>"
soup = BeautifulSoup(article, 'xml')
title = soup.ArticleTitle
abstract = soup.AbstractText
title_text = title.get_text()
abstract_text = abstract.get_text()
abs_offset = len(title.get_text()) + 1
title_entities = title.find_all("prot", recursive=False)
abs_entities = abstract.find_all("prot", recursive=False)
lastindex = 0
for ei, e in enumerate(title_entities):
estart = title_text.find(e.text, lastindex)
eend = estart + len(e.text)
etext = title_text[estart:eend]
#print etext, estart, eend, self.documents[did].text
this_sentence = self.documents[did].find_sentence_containing(estart, eend, chemdner=False)
eid = this_sentence.tag_entity(estart, eend, "protein", text=e.text)
if eid is None:
print "did not add this entity: {}".format(e.text)
# print e.text
lastindex = estart
lastindex = 0
for ei, e in enumerate(abs_entities):
estart = abstract_text.find(e.text, lastindex)
eend = estart + len(e.text)
etext = self.documents[did].text[estart:eend]
# logging.info("{} - {}".format(lastindex, e.text))
#logging.info(estart)
#logging.info("{} + {} {}: {}-{}: {}".format(abstract_text.find(e.text, lastindex), abs_offset, e.text, estart,
# eend, "-".join([str(s.offset) for s in self.documents[did].sentences])))
#logging.info(abstract_text)
this_sentence = self.documents[did].find_sentence_containing(estart + abs_offset, eend + abs_offset, chemdner=False)
eid = this_sentence.tag_entity(estart + abs_offset - this_sentence.offset , eend + abs_offset - this_sentence.offset,
"protein", text=e.text)
if eid is None:
print "did not add this entity: {}".format(e.text)
# print e.text
lastindex = estart
#for s in all_entities:
# print s, len(all_entities[s])