def __init__(self, options, columns):
start_time = time.time()
if (("keyspace" not in options) or ("columnfamily" not in options)) and ("query" not in options):
logger.log("Either query or columnfamily and keyspace parameter is required.", ERROR)
self.columnfamily = options.get("columnfamily", None)
self.keyspace = options.get("keyspace", None)
self.query = options.get("query", None)
self.prepare_select_stmt = options.get("prepare_selects", properties.PREPARE_SELECTS_DEFAULT) == 'True'
self.init_connection(options, columns)
start_time1 = time.time()
self.describe_db()
end_time = time.time()
self.insert_stmt = None
self.delete_stmt = None
self.prepared_select_stmts = {}
if ISDEBUG:
logger.log("DB described in {0} ms".format(int((end_time - start_time1) * 1000)))
logger.log("initialized in {0} ms".format(int((end_time - start_time) * 1000)))
def update_timeline(self, task, new_user):
uid = task['uid']
user_info = weibo_util.user_info(uid)
num_weibos = int(user_info['statuses_count'])
old_weibos = self.get_old_weibos(new_user, uid)
since_id = 0 if old_weibos == {} else min(old_weibos.keys())
all_weibos = self.get_all_weibos(uid, num_weibos, max_id=0, since_id=since_id)
new_weibos = {wid : all_weibos[wid] for wid in all_weibos if wid not in old_weibos}
deleted_weibos = {wid : old_weibos[wid] for wid in old_weibos if wid not in all_weibos}
if len(all_weibos) != num_weibos:
deleted_weibos = self.check_deleted_weibos(deleted_weibos)
self.update_all(task, user_info, all_weibos, new_weibos, deleted_weibos)
#logger.log('[x] Crawl finished for user %s.' % (uid))
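# --- illustrative diff example (hypothetical weibo ids, not from the original source) ---
# The two dict comprehensions in update_timeline above separate newly fetched weibos
# from ones that have disappeared since the last crawl.
old = {101: 'a', 102: 'b'}
fetched = {102: 'b', 103: 'c'}
new_weibos = {wid: fetched[wid] for wid in fetched if wid not in old}
deleted_weibos = {wid: old[wid] for wid in old if wid not in fetched}
assert new_weibos == {103: 'c'} and deleted_weibos == {101: 'a'}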
def validity_check(self, entry):
out_folder = entry['screenshots_path']
if out_folder[-1] != '/':
out_folder += '/'
temp_out_folder=entry['temp_screenshots_path']
if temp_out_folder[-1] != '/':
temp_out_folder += '/'
phantomjs = self.conf['phantomjs']
if not os.path.exists(out_folder):
os.mkdir(entry['screenshots_path'])
if not os.path.exists(temp_out_folder):
os.mkdir(entry['temp_screenshots_path'])
if not os.path.isfile(phantomjs):
logger.log('[x] You need to download PhantomJS!', 'red')
return False
self.out_folder = out_folder
self.temp_out_folder = temp_out_folder
self.phantomjs = phantomjs
def clip(self):
"""
This function clips the mosaicked, projected images to the size of the
referenceImage
"""
subprocess.call(['gdaltindex', self.extent, self.referenceImagePath])
dataNames = sorted(glob.glob(self.fullPath + '/full*.tif'))
splitAt = len(self.fullPath) + 1
for i in range(len(dataNames)):
x = dataNames[i]
y = dataNames[i][:splitAt] + dataNames[i][splitAt+4:]
subprocess.call(['gdalwarp', '-r', 'near', '-cutline', self.extent, '-crop_to_cutline', x, y, '-dstnodata', '9999'])
for n in dataNames:
os.remove(n)
dataNames = sorted(glob.glob(self.fullPath + '/*.tif'))
test = gdal.Open(dataNames[0]).ReadAsArray()
logger.log('SUCCESS', 'Clipping complete! %d %s files were successfully clipped to the size of %s with dimensions %d rows by %d columns' % (len(dataNames), str(self.outformat), str(self.referenceImagePath), test.shape[0], test.shape[1]))
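# --- illustrative filename example (hypothetical paths, not from the original source) ---
# How clip() derives the output name y from the input x: splitAt points just past the
# directory separator, and skipping 4 more characters drops the "full" prefix.
full_path = "/data/run"
split_at = len(full_path) + 1
x = full_path + "/full2001.tif"
y = x[:split_at] + x[split_at + 4:]
assert y == "/data/run/2001.tif"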
def init_connection(self, options, columns):
start_time = time.time()
if "hosts" not in options:
logger.log("The hosts parameter is needed, setting to localhost.", WARNING)
hosts = options.get("hosts", "localhost").split(",")
if "port" not in options:
logger.log("The port parameter is needed, setting to {0}.".format(properties.DEFAULT_CASSANDRA_PORT), WARNING)
self.port = options.get("port", properties.DEFAULT_CASSANDRA_PORT)
self.limit = options.get("limit", None)
self.allow_filtering = options.get("allow_filtering", properties.ALLOW_FILTERING_DEFAULT) == 'True'
self.enable_trace = options.get("trace", properties.ENABLE_TRACE_STATEMENTS_DEFAULT) == 'True'
self.ttl = int(options.get("ttl", properties.DEFAULT_TTL))
timeout = options.get("timeout", None)
username = options.get("username", None)
password = options.get("password", None)
self.cluster = Cluster(hosts)
if username is not None:
self.cluster.auth_provider = PlainTextAuthProvider(username=username, password=password)
# Cassandra connection init
self.cluster.executor_threads = 4
self.cluster.connect_timeout = int(options.get("connection_timeout", properties.DEFAULT_CONNECTION_TIMEOUT))
self.session = self.cluster.connect()
end_time = time.time()
if ISDEBUG:
logger.log("connected in {0} ms".format(int((end_time - start_time) * 1000)))
if timeout is not None:
self.session.default_timeout = float(timeout)
def prepare_insert_stmt(self):
insert_stmt_str = u"INSERT INTO {0}.{1} ({2}) VALUES ({3})".format(
self.keyspace, self.columnfamily, u",".join(self.queryableColumns), u",".join([u"?"] * len(self.queryableColumns)))
if self.ttl != 0:
insert_stmt_str += " USING TTL {0}".format(self.ttl)
if ISDEBUG:
logger.log("insert statement: {0}".format(insert_stmt_str))
logger.log("preparing insert statement")
st = time.time()
self.insert_stmt = self.session.prepare(insert_stmt_str)
if ISDEBUG:
logger.log("insert statement prepared in {0} ms".format((time.time() - st) * 1000))
def prepare_delete_stmt(self):
delete_stmt_str = u"DELETE FROM {0}.{1} WHERE {2};".format(self.keyspace, self.columnfamily, u" AND ".join(map(lambda str: str + u" = ?", self.rowIdColumns)))
if ISDEBUG:
logger.log("preparing delete statement")
st = time.time()
self.delete_stmt = self.session.prepare(delete_stmt_str)
if ISDEBUG:
logger.log("delete statement prepared in {0} ms".format((time.time() - st) * 1000))
def insert(self, new_values):
if self.insert_stmt is None:
self.prepare_insert_stmt()
args = self.get_insert_args(new_values)
if ISDEBUG:
logger.log("requested insert {0}".format(args))
st = time.time()
self.session.execute(self.insert_stmt, args)
if ISDEBUG:
et = time.time()
logger.log("insert completed in {0} ms".format((et - st) * 1000))
return new_values
def delete(self, rowid):
if self.delete_stmt is None:
self.prepare_delete_stmt()
if ISDEBUG:
logger.log(u"requested delete for id: {0}".format(rowid))
values = self.get_delete_args(rowid)
if ISDEBUG:
st = time.time()
self.session.execute(self.delete_stmt, values)
if ISDEBUG:
et = time.time()
logger.log("delete completed in {0} ms".format((et - st) * 1000))
return {}
def get_row_id_column(self):
if ISDEBUG:
logger.log(u"rowid requested")
return self.ROWIDCOLUMN
def _log_rc(cmd_output, funct_name, **kvarg):
"""Generic logger that picks correct log type based on return code"""
rc = cmd_output['rc'] if 'rc' in cmd_output else cmd_output
logger.log(logger.decide_level(rc),
funct_name,
**kvarg
)
def parse_and_bind(self, string):
'''Parse and execute single line of a readline init file.'''
try:
log('parse_and_bind("%s")' % string)
if string.startswith('#'):
return
if string.startswith('set'):
m = re.compile(r'set\s+([-a-zA-Z0-9]+)\s+(.+)\s*$').match(string)
if m:
var_name = m.group(1)
val = m.group(2)
try:
setattr(self, var_name.replace('-','_'), val)
except AttributeError:
log('unknown var="%s" val="%s"' % (var_name, val))
else:
log('bad set "%s"' % string)
return
m = re.compile(r'\s*(.+)\s*:\s*([-a-zA-Z]+)\s*$').match(string)
if m:
key = m.group(1)
func_name = m.group(2)
py_name = func_name.replace('-', '_')
try:
func = getattr(self.mode, py_name)
except AttributeError:
log('unknown func key="%s" func="%s"' % (key, func_name))
if self.debug:
print 'pyreadline parse_and_bind error, unknown function to bind: "%s"' % func_name
return
self.mode._bind_key(key, func)
except:
log('error')
raise
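# --- illustrative input lines (assumption, not part of pyreadline itself) ---
# The kinds of readline-init lines parse_and_bind above understands: comments are
# skipped, "set" lines become attributes, and key bindings go through self.mode._bind_key.
example_init_lines = [
    '# comments are ignored',
    'set show-all-if-ambiguous on',
    '"\\C-x": complete',
]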
def read_init_file(self, filename=None):
'''Parse a readline initialization file. The default filename is the last filename used.'''
log('read_init_file("%s")' % filename)
#History file book keeping methods (non-bindable)
def set_completer(self, function=None):
'''Set or remove the completer function.
If function is specified, it will be used as the new completer
function; if omitted or None, any completer function already
installed is removed. The completer function is called as
function(text, state), for state in 0, 1, 2, ..., until it returns a
non-string value. It should return the next possible completion
starting with text.
'''
log('set_completer')
self.completer = function
def get_completer(self):
'''Get the completer function.
'''
log('get_completer')
return self.completer
def create_tables(self):
for table in WeiboCrawler.TABLES:
domains = WeiboCrawler.get_table_domains(table)
sql = db_util.generate_create_sql(table, domains)
try:
db_util.execute(self.conf['db'], sql)
logger.log('[x] Table `%s` created!' % (table), 'green')
except:
logger.log('[x] Table `%s` already exists' % (table))
def update(self):
logger.log('[x] Crawling...')
for task in self.conf['tasks']:
# logger.log('[x] Update user %s, name %s' % (task['uid'], task['name']))
sql = 'SELECT statuses_count FROM user_info WHERE id=? ORDER BY update_time desc LIMIT 1'
basic_info = db_util.execute(self.conf['db'], sql, [task['uid']])
self.update_timeline(task, len(basic_info) == 0)
def get_all_weibos(self, uid, num_weibos, max_id, since_id):
all_weibos = {}
all_weibos = weibo_util.user_timeline_all(uid, num_weibos, all_weibos, weibo_util.user_timeline_public)
times = 0
while len(all_weibos) != num_weibos:
logger.log('[x] Crawling... %d/%d' % (len(all_weibos), num_weibos))
all_weibos = weibo_util.user_timeline_all(uid, num_weibos, all_weibos, weibo_util.user_timeline_public, count=200)
times += 1
if times == 2:
break
times = 0
while len(all_weibos) != num_weibos:
logger.log('[x] Crawling... %d/%d' % (len(all_weibos), num_weibos))
all_weibos = weibo_util.user_timeline_all(uid, num_weibos, all_weibos, weibo_util.user_timeline_private)
times += 1
if times == 1:
break
times = 0
while len(all_weibos) != num_weibos:
logger.log('[x] Crawling... %d/%d' % (len(all_weibos), num_weibos))
all_weibos = weibo_util.user_timeline_all_since(uid, since_id, num_weibos, all_weibos, weibo_util.user_timeline_public, count=200)
times += 1
if times == 2:
break
# this part is commented out because the `since_id` argument does not work in the private API
#times = 0
#while len(all_weibos) != num_weibos:
# logger.log('[x] Crawling... %d/%d' % (len(all_weibos), num_weibos))
# all_weibos = weibo_util.user_timeline_all_since(uid, since_id, num_weibos, all_weibos, weibo_util.user_timeline_private)
# times += 1
# if times == 1:
# break
logger.log('[x] Crawling... %d/%d' % (len(all_weibos), num_weibos))
return all_weibos
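# --- generic sketch of the retry pattern above (assumption, simplified) ---
# get_all_weibos cycles through public/private/since_id timeline calls; each pass keeps
# fetching until the collected count matches the expected total or its pass budget runs out.
def fetch_until_complete(fetch, expected, collected, max_passes):
    passes = 0
    while len(collected) != expected and passes < max_passes:
        collected = fetch(collected)
        passes += 1
    return collected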
def update_all(self, task, user_info, all_weibos, new_weibos, deleted_weibos):
deleted_time = self.update_deletion(deleted_weibos.values())
if deleted_time != -1:
out = task['name']+'_del_' + time.strftime("%Y-%m-%d_%H.%M.%S") + '.csv'
weibo_writer.last_deleted_weibos_to_csv(self.conf['db'], task['uid'], \
deleted_time, out)
logger.log('[x] %s\'s deleted weibos are exported to %s' % (task['uid'], out), 'green')
self.show_user_info(user_info)
self.show_user_weibos(new_weibos.values(), deleted=False)
self.show_user_weibos(deleted_weibos.values(), deleted=True)
self.insert_table(table='user_info', entries=[user_info])
self.update_table(table='status', entries=all_weibos.values(), delete=False)
# helper functions
def show_user_weibos(self, weibos, deleted=False):
if deleted is False:
logger.log('[x] %d weibos are created!' % len(weibos), 'green')
else:
logger.log('[x] %d weibos are deleted!' % len(weibos), 'red')
id = 1
for weibo in weibos:
logger.log('[x] Weibo %d:' % id)
logger.log('[x] URL: http://www.weibo.com/%s/%s' % (weibo['uid'], weibo['mid']))
logger.log('[x] Content: %s' % (weibo['text']))
logger.log('[x] Created at: %s' % (weibo['created_at']))
logger.log('[x] Repost count: %s' % (weibo['reposts_count']))
logger.log('[x] Comments count: %s' % (weibo['comments_count']))
logger.log('[x] Like count: %s' % (weibo['attitudes_count']))
id += 1
def capture_screenshot(self, url, destination, cookie='', timeout=30000):
logger.log("[#] Capture screenshot %s, and put it in %s" % (url, destination))
#subprocess.call([self.phantomjs_folder + 'phantomjs.exe', self.phantomjs_folder + 'rasterize6_cookies.js', url, destination, str(timeout), cookie])
cmd = list2cmdline([path_to_unix(self.phantomjs), \
JS_SCRIPT, \
url, path_to_unix(destination), str(timeout), cookie])
r = envoy.run(str(cmd), timeout=60+timeout/1000)
def retrive(self):
for entry in self.conf['tasks']:
if self.validity_check(entry) is False:
continue
logger.log('[x] Retrieve user %s\'s screenshots.' % (entry['uid']))
self.retrive_weibo(entry)
#self.retrive_hot_comments(entry)
self.retrive_weibo(entry, hot=True)
def export_all(config):
db = config['db']
for entry in config['tasks']:
uid = entry['uid']
uname = entry['name']
out = uname + '_all_' + time.strftime("%Y-%m-%d") + '.csv'
all_weibos_to_csv(db, uid, out)
logger.log('[x] Export %s\'s weibo to %s' % (uid, out), 'green')
def parse_and_bind(self, string):
'''Parse and execute single line of a readline init file.'''
try:
log('parse_and_bind("%s")' % string)
if string.startswith('#'):
return
if string.startswith('set'):
m = re.compile(r'set\s+([-a-zA-Z0-9]+)\s+(.+)\s*$').match(string)
if m:
var_name = m.group(1)
val = m.group(2)
try:
setattr(self, var_name.replace('-','_'), val)
except AttributeError:
log('unknown var="%s" val="%s"' % (var_name, val))
else:
log('bad set "%s"' % string)
return
m = re.compile(r'\s*(.+)\s*:\s*([-a-zA-Z]+)\s*$').match(string)
if m:
key = m.group(1)
func_name = m.group(2)
py_name = func_name.replace('-', '_')
try:
func = getattr(self.mode, py_name)
except AttributeError:
log('unknown func key="%s" func="%s"' % (key, func_name))
print 'unknown function to bind: "%s"' % func_name
self.mode._bind_key(key, func)
except:
log('error')
raise
def read_init_file(self, filename=None):
'''Parse a readline initialization file. The default filename is the last filename used.'''
log('read_init_file("%s")' % filename)
#History file book keeping methods (non-bindable)
def set_completer(self, function=None):
'''Set or remove the completer function.
If function is specified, it will be used as the new completer
function; if omitted or None, any completer function already
installed is removed. The completer function is called as
function(text, state), for state in 0, 1, 2, ..., until it returns a
non-string value. It should return the next possible completion
starting with text.
'''
log('set_completer')
self.completer = function
def get_completer(self):
'''Get the completer function.
'''
log('get_completer')
return self.completer
def download(self):
"""
Download images for specified tiles and time period.
Sets self.filelist (listing the data downloaded) and self.observations (the total
number of days' worth of data downloaded, e.g. 1 year of data = 23 observations).
"""
if not os.path.exists(self.directory):
os.mkdir(self.directory)
if not os.path.exists(self.fullPath):
os.mkdir(self.fullPath)
dm = pymodis.downmodis.downModis(self.fullPath, self.password, self.username, self.url, self.tiles, self.path, self.dataset,
self.today, self.enddate, jpg = False, debug = True, timeout = 30)
dm.connect()
self.filelist = dm.getListDays()
self.observations = len(dm.getListDays())
if self.dataset != 'MOD13Q1.005':
if self.observations % 2 != 0:
raise IOError("The total number of observations through time must be an even number. Please add or remove an observation before or after %s" % str(self.filelist[0]))
dm.downloadsAllDay()
logger.log('SUCCESS', 'Downloading is complete! %d HDF files of %s data for tiles %s were downloaded for the following days: %s' % (self.observations*len(self.tiles), str(self.dataset), str(self.tiles), str(self.filelist)))
def mosaic(self):
"""
If more than two tiles are input by the user, this function mosaics the tiles
together.
"""
if len(self.tiles) > 1:
hdflist = sorted(glob.glob(self.fullPath + '/*.hdf'))
for i in range(0,len(hdflist),2):
ms = pymodis.convertmodis_gdal.createMosaicGDAL(hdfnames = [hdflist[i], hdflist[i+1]], subset = self.subset, outformat = 'GTiff')
ms.run(str(hdflist[i].split('.h')[0]) + 'mos.tif')
ms.write_vrt(output = str(hdflist[i].split('.h')[0]), separate = True)
mosaicCount = len(glob.glob(self.fullPath + '/*mos.tif'))
logger.log('SUCCESS', 'Mosaic complete! MODIS tiles %s were successfully mosaicked into %d mosaic images.' % (str(self.tiles), mosaicCount))
def qualityCheck(self):
d = self.fullPath
dataset = self.dataset
outfile = d + '/qualityCheck' + dataset + '.txt'
sys.stdout = open(outfile, 'w')
array = np.load(d + '.npy')
t = d + '.txt'
with open(t, 'r') as tfile:
text = tfile.read().split('\n')
text = text[0:-1]
nv = array.max()
#entire dataset
print 'Data for entire array of %s data:' % (dataset)
print '\t>>> included datasets: %s' % (str(text))
print '\t>>> shape:', array.shape
print '\t>>> max w nv:', array.max()
print '\t>>> max wo nv:', array[array<nv].max()
print '\t>>> min wo nv:', array[array<nv].min()
print '\t>>> mean wo nv:', array[array<nv].mean()
#each band
for b in range(array.shape[1]):
print 'Data for column %d, %s:'% (b, text[b])
print '\t>>> max w nv:', array[:,b].max()
print '\t>>> max wo nv:', array[:,b][array[:,b]<nv].max()
print '\t>>> min wo nv:', array[:,b][array[:,b]<nv].min()
print '\t>>> mean wo nv:', array[:,b][array[:,b]<nv].mean()
sys.stdout = sys.__stdout__
logger.log('SUCCESS', 'See file qualityCheck%s.txt for detailed information about the final matrix.' % (self.dataset))
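# --- illustrative no-data filtering (hypothetical array, not from the original source) ---
# qualityCheck above treats array.max() as the no-data value (nv) and reports statistics
# both with and without it, using boolean masking.
import numpy as np
arr = np.array([1.0, 2.0, 3.0, 9999.0])
nv = arr.max()
assert arr[arr < nv].max() == 3.0 and arr[arr < nv].mean() == 2.0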