crawl.py file source

python

Project: open-house-crawler · Author: data-skeptic
import datetime
import json
import logging

import botocore.exceptions
import tldextract
from bson import json_util
from io import BytesIO

logger = logging.getLogger(__name__)

# `bucket`, `get_expiration`, `crawl_one`, and `trackStats` are assumed to be
# defined elsewhere in this module.

def process_one(url, s3, expiration_rules, headers):
    # Build a registered-domain string, keeping any subdomain other than "www".
    tld = tldextract.extract(url)
    if tld.subdomain != '' and tld.subdomain != 'www':
        tld = tld.subdomain + '.' + tld.domain + '.' + tld.suffix
    else:
        tld = tld.domain + '.' + tld.suffix
    # The S3 key is the domain plus everything that follows it in the URL.
    i = url.find(tld)
    s3key = tld + url[i + len(tld):]
    exp = get_expiration(url, expiration_rules)
    try:
        # Treat a cached copy as usable only if it is newer than the cutoff
        # implied by the expiration rules.
        o = s3.ObjectSummary(bucket, s3key)
        lm = o.last_modified
        now = datetime.datetime.utcnow()
        diff = exp - now
        expires_on = now - diff
        exists = lm.replace(tzinfo=None) >= expires_on
    except botocore.exceptions.ClientError:
        # No such object (or it is inaccessible), so re-crawl it.
        exists = False
    if not exists:
        logger.info('Processing: ' + url)
        crawl = crawl_one(url, expiration_rules, headers)
        contents = json.dumps(crawl, default=json_util.default)
        # Encode to bytes so the upload also works under Python 3.
        fake_handle = BytesIO(contents.encode('utf-8'))
        b = s3.create_bucket(Bucket=bucket)
        res = b.put_object(Key=s3key, Body=fake_handle)
        # TODO: check for errors
        # Record per-domain, per-day, and combined crawl statistics.
        dt = datetime.datetime.today().strftime('%Y-%m-%d')
        trackStats(tld, dt, True)
        summaryKey = dt
        trackStats(summaryKey, dt, True)
        summaryKey = tld + "|" + dt
        trackStats(summaryKey, dt, True)
        return True
    return False
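
A minimal sketch of how process_one might be driven, assuming boto3 is configured with valid AWS credentials. The bucket name, expiration rules, and headers below are illustrative placeholders, not values taken from the project:

# Hypothetical driver for process_one; expiration_rules and headers here
# are stand-ins for whatever the real module supplies.
import boto3

s3 = boto3.resource('s3')
expiration_rules = {}  # placeholder: rules consumed by get_expiration()
headers = {'User-Agent': 'open-house-crawler'}  # placeholder request headers

for url in ['https://example.com/listings/123']:
    if process_one(url, s3, expiration_rules, headers):
        print('crawled and uploaded:', url)
    else:
        print('cached copy still fresh, skipped:', url)

Note that process_one returns True only when it actually re-crawled and uploaded the page, so the return value doubles as a per-URL activity flag.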