def process_one(url, s3, expiration_rules, headers):
    """Crawl *url* and store the result in S3, unless a fresh copy already exists.

    The S3 key is derived from the URL's registered domain (subdomain kept
    unless it is empty or 'www') plus the remainder of the URL. If the stored
    object is newer than the expiration window computed from
    ``expiration_rules``, nothing is done.

    Args:
        url: The URL to crawl.
        s3: A boto3 S3 service resource.
        expiration_rules: Rules passed to ``get_expiration`` / ``crawl_one``.
        headers: HTTP headers forwarded to ``crawl_one``.

    Returns:
        True if the URL was crawled and uploaded, False if a fresh copy
        already existed in S3.

    Note:
        Relies on module-level ``bucket``, ``logger``, ``get_expiration``,
        ``crawl_one`` and ``trackStats``.
    """
    ext = tldextract.extract(url)
    # Keep a meaningful subdomain (e.g. 'blog.example.com'), but collapse
    # bare and 'www.' hosts to the registered domain.
    if ext.subdomain != '' and ext.subdomain != 'www':
        site = ext.subdomain + '.' + ext.domain + '.' + ext.suffix
    else:
        site = ext.domain + '.' + ext.suffix
    # Key = host plus everything after the host in the original URL
    # (path, query, ...), so scheme and any leading 'www.' are dropped.
    i = url.find(site)
    s3key = site + url[i + len(site):]

    exp = get_expiration(url, expiration_rules)
    try:
        obj = s3.ObjectSummary(bucket, s3key)
        last_modified = obj.last_modified
        now = datetime.datetime.utcnow()
        # exp is the expiry datetime for this URL; (exp - now) is the
        # remaining time-to-live, so any object last modified earlier than
        # now - TTL is considered stale and gets re-crawled.
        expires_on = now - (exp - now)
        exists = last_modified.replace(tzinfo=None) >= expires_on
    except botocore.exceptions.ClientError:
        # NOTE(review): every ClientError (including AccessDenied or
        # throttling, not only a 404/NoSuchKey) is treated as "object
        # missing" and triggers a re-crawl. Consider inspecting
        # e.response['Error']['Code'] to avoid masking real failures.
        exists = False

    if exists:
        return False

    logger.info('Processing: %s', url)
    crawl = crawl_one(url, expiration_rules, headers)
    contents = json.dumps(crawl, default=json_util.default)
    fake_handle = StringIO(contents)
    # create_bucket is effectively idempotent for an already-owned bucket
    # in the default region; it returns the Bucket resource either way.
    b = s3.create_bucket(Bucket=bucket)
    res = b.put_object(Key=s3key, Body=fake_handle)
    # TODO: check res for errors before recording stats

    dt = datetime.datetime.today().strftime('%Y-%m-%d')
    # Record three stat rows: per-site, per-day, and per-site-per-day.
    trackStats(site, dt, True)
    trackStats(dt, dt, True)
    trackStats(site + "|" + dt, dt, True)
    return True
# NOTE(review): removed stray web-scrape artifacts that trailed this file
# ("评论列表" = comment list, "文章目录" = table of contents — blog page
# navigation text, not code; they were a Python syntax error).