# Assumes: import time, sqlite3; from scrapy import log; self.cu is a sqlite3 cursor.
def get_last_time(self):
    try:
        # First run: create the checkpoint table and fall back to a fixed start time.
        self.cu.execute('CREATE TABLE history (time TEXT, result TEXT, spider_name TEXT PRIMARY KEY)')
        last_time = "2015-1-1 00:00:00"
    except sqlite3.OperationalError:
        # Table already exists: read the last recorded time for this spider.
        try:
            self.cu.execute('SELECT time FROM history WHERE spider_name=?', (self.spider_name,))
            last_time = self.cu.fetchone()[0]
            log.msg('************* ' + last_time, level=log.WARNING)
        except TypeError:
            # fetchone() returned None: no row recorded for this spider yet.
            last_time = "2015-5-1 00:00:00"
            log.msg('************* ' + last_time, level=log.WARNING)
    # Convert the text timestamp to a Unix epoch for easy comparison.
    last_time = time.strptime(last_time, '%Y-%m-%d %H:%M:%S')
    last_time = time.mktime(last_time)
    return last_time
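# A minimal standalone sketch of the same checkpoint lookup using plain
# sqlite3; the in-memory database and 'demo_spider' name are assumptions
# for the demo, while the table layout and fallback dates come from the
# snippet above.
import sqlite3
import time

conn = sqlite3.connect(':memory:')
cu = conn.cursor()
try:
    cu.execute('CREATE TABLE history (time TEXT, result TEXT, spider_name TEXT PRIMARY KEY)')
    last_time = '2015-1-1 00:00:00'
except sqlite3.OperationalError:
    row = cu.execute('SELECT time FROM history WHERE spider_name=?', ('demo_spider',)).fetchone()
    last_time = row[0] if row else '2015-5-1 00:00:00'
print(time.mktime(time.strptime(last_time, '%Y-%m-%d %H:%M:%S')))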
def insert_new_time(self):
    # Only record the checkpoint if the newest item time is not in the future.
    if time.mktime(time.strptime(self.item_max_time, '%Y-%m-%d %H:%M:%S')) < time.time():
        if self.sqlite_flag:
            try:
                log.msg('delete from history where spider_name=' + self.spider_name, level=log.WARNING)
                self.cu.execute('DELETE FROM history WHERE spider_name=?', (self.spider_name,))
                self.sx.commit()
            except sqlite3.OperationalError as e:
                log.msg('__________ ' + str(e), level=log.WARNING)
            sql = "insert into history values(?,?,?)"
            params = (self.item_max_time, self.item_max_id, self.spider_name)
            self.cu.execute(sql, params)
            self.sx.commit()
            self.close_sqlite()
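# Since spider_name is the PRIMARY KEY of history, the DELETE + INSERT pair
# above can be collapsed into one statement. A hypothetical helper (the
# function name and arguments are assumptions, not part of the original
# pipeline):
import sqlite3

def record_checkpoint(conn, spider_name, max_time, max_id):
    # INSERT OR REPLACE overwrites any existing row with the same primary key.
    conn.execute('INSERT OR REPLACE INTO history VALUES (?, ?, ?)',
                 (max_time, max_id, spider_name))
    conn.commit()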
def parse_datetime(value):
    try:
        d = parse(value)
    except ValueError:
        log.msg('Unable to parse %s' % value, level=log.WARNING)
        return value
    else:
        return d.isoformat()
def parse_date(value):
    try:
        d = parse(value)
    except ValueError:
        log.msg('Unable to parse %s' % value, level=log.WARNING)
        return value
    else:
        return d.strftime("%Y-%m-%d")
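# Usage sketch for the two helpers above, assuming the bare `parse` they
# call is dateutil.parser.parse:
from dateutil.parser import parse

print(parse('March 5, 2015 14:30').isoformat())   # 2015-03-05T14:30:00
print(parse('2015-03-05').strftime('%Y-%m-%d'))   # 2015-03-05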
# Assumes: from scrapy.exceptions import DropItem; from tld import get_tld;
# from tld.exceptions import TldIOError, TldDomainNotFound, TldBadUrl.
def process_item(self, item, spider):
    if not isinstance(item, Alert):
        return item
    uri = item['uri']
    if not uri:
        raise DropItem("Not a valid alert URI: ", uri)
    if spider.custom_whitelist:
        for pattern in spider.custom_whitelist:
            if pattern[0] in uri:
                raise DropItem("Whitelisted domain found in Alert: ", uri)
    if spider.alexa_whitelist:
        try:
            domain = get_tld(uri)
            for alexa_domain in spider.alexa_whitelist:
                if domain.endswith(alexa_domain):
                    raise DropItem("Alert domain found in Alexa Whitelist: ", domain)
        except (TldIOError, TldDomainNotFound, TldBadUrl):
            log.msg("Error parsing TLD. Still allowing alert for " + uri, level=log.WARNING)
    return item
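# Note that the bare endswith() above lets 'notgoogle.com' match a whitelist
# entry of 'google.com'. A boundary-safe variant (hypothetical helper, not
# part of the original pipeline):
def in_whitelist(domain, whitelist):
    # Match only an exact domain or a true subdomain (dot boundary).
    return any(domain == w or domain.endswith('.' + w) for w in whitelist)

print(in_whitelist('maps.google.com', ['google.com']))  # True
print(in_whitelist('notgoogle.com', ['google.com']))    # False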
def warn(msg):
    log.msg(str(msg), level=log.WARNING)
def _conditional_insert(self, tx, item):
    # ori_html_path = self.save_html(item)
    # item['repost_post_id'] = ori_html_path
    query = u"insert ignore into post (url, topic_id, topic_kws, site_id, site_name, title, content, pt_time, st_time) values (%s, %s, %s, %s, %s, %s, %s, %s, %s)"
    param = (item['topic_url'], item['topic_id'], item['topic_kw'], item['topic_site_id'], item['topic_site_name'], item['topic_title'], item['topic_content'], item['topic_pt_time'], item['topic_st_time'])
    tx.execute(query, param)
    log.msg('insert one', level=log.WARNING)
    print('---- insert one ----')
    # sql = 'insert into '+ item['table_name'] +' (id ,url,board, site_id, data_type , title , content, post_time, scratch_time , poster_name,language_type,repost_post_id) values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s) ON DUPLICATE KEY UPDATE post_time=%s'
    # param = (item['topic_url'],item['topic_url'],item['topic_board'], item['site_id'],item['data_type'],item['topic_title'], item['topic_content'], item['topic_post_time'],item['scratch_time'], item['topic_author'],0,item['repost_post_id'],item['topic_post_time'])
    # tx.execute(sql,param)
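# The `tx` argument above is the transaction cursor supplied by Twisted's
# adbapi when the method is run through runInteraction. A hypothetical
# sketch of that wiring (class name and connection parameters are made-up
# examples):
from twisted.enterprise import adbapi
from scrapy import log

class MySQLPipeline(object):
    def __init__(self):
        self.dbpool = adbapi.ConnectionPool('MySQLdb', db='crawl', user='root',
                                            passwd='', charset='utf8')

    def process_item(self, item, spider):
        # runInteraction passes a transaction cursor as the first argument
        # to _conditional_insert and runs it in a thread pool.
        d = self.dbpool.runInteraction(self._conditional_insert, item)
        d.addErrback(lambda failure: log.msg(str(failure), level=log.WARNING))
        return item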
def open(self, spider):
    super(RecorderScheduler, self).open(spider)
    self.stats_manager = StatsManager(spider.crawler.stats)
    settings = spider.crawler.settings
    self.recorder_enabled = settings.get('RECORDER_ENABLED', DEFAULT_RECORDER_ENABLED)
    if not self.recorder_enabled:
        log.msg('Recorder disabled!', log.WARNING)
        return
    log.msg('Starting recorder', log.INFO)
    recorder_storage = settings.get('RECORDER_STORAGE_ENGINE', None)
    if not recorder_storage:
        self.recorder_enabled = False
        log.msg('Missing Recorder storage! Recorder disabled...', log.WARNING)
        return
    self.graph = graphs.Manager(
        engine=recorder_storage,
        drop_all_tables=settings.getbool('RECORDER_STORAGE_DROP_ALL_TABLES',
                                         DEFAULT_RECORDER_STORAGE_DROP_ALL_TABLES),
        clear_content=settings.getbool('RECORDER_STORAGE_CLEAR_CONTENT',
                                       DEFAULT_RECORDER_STORAGE_CLEAR_CONTENT))
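# A hypothetical settings fragment that would enable the recorder above;
# the setting names come from the settings.get()/getbool() calls in the
# snippet, while the SQLAlchemy engine URL is a made-up example.
RECORDER_ENABLED = True
RECORDER_STORAGE_ENGINE = 'sqlite:///record.db'
RECORDER_STORAGE_DROP_ALL_TABLES = True
RECORDER_STORAGE_CLEAR_CONTENT = True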