webcrawler.py source code

python

Project: simple-web-crawler    Author: fikander
import urlparse  # Python 2; on Python 3 use: from urllib import parse as urlparse

from flask import request, jsonify

# Crawler is defined elsewhere in the simple-web-crawler project.


def crawl():
    # Depth defaults to 1 when the 'depth' parameter is absent;
    # a non-numeric value is rejected explicitly.
    try:
        depth_limit = int(request.values['depth'])
    except ValueError:
        return "Depth parameter must be a number", 400
    except KeyError:
        depth_limit = 1

    if 'url' in request.values:
        url = request.values['url']
        parsed_url = urlparse.urlsplit(url)
        # Only absolute http(s) URLs are accepted.
        if parsed_url.scheme not in ['http', 'https']:
            return "Only http and https protocols are supported", 400
        if parsed_url.netloc == '':
            return "Missing domain", 400
        # Restrict crawling to the domain of the requested URL.
        allowed_domains = [parsed_url.netloc]
        crawler = Crawler(allowed_domains, depth_limit)
        crawler.crawl(url)
        return jsonify(**crawler.crawled)
    else:
        return "Missing url parameter", 400