def __init__(self, allowed_domains, depth_limit=1):
    """Set up the domain filter, MIME helper and crawl bookkeeping.

    Args:
        allowed_domains: Iterable of domain strings to accept
            (presumably hostnames -- verify against callers). It is
            copied, so the caller's object is never modified.
        depth_limit: Stored as ``self.depth_limit``; presumably the
            maximum link depth to follow -- the enforcement is not
            visible in this method.
    """
    super().__init__()
    # Copy instead of aliasing: appending to the caller's list would leak
    # the '' entry back to the caller and add duplicates each time the
    # same list is reused for another instance.
    # The trailing '' entry allows local links.
    self.allowed_domains = [*allowed_domains, '']
    self.mime = MimeTypes()
    # Queue implemented as dict as we need to store depth data alongside url
    self.queue = {}  # { url: depth, .. }
    # TODO: make crawled a class
    self.crawled = {}  # { url: {type: type, outlinks: {link: count, ..}}, .. }
    self.depth_limit = depth_limit
评论列表
文章目录