def __init__(self, domains, directory, allow=(), deny=(), unix=False):
    """Configure the spider from the sites it should start crawling.

    Each entry of ``domains`` is turned into one start URL and one
    allowed-domain entry.

    Args:
        domains: Iterable of strings. Each is either a host, optionally
            followed by a path (``example.com/docs``), or a full URL with
            a scheme (``https://example.com/docs``). Entries without a
            scheme get ``http``.
        directory: Stored unchanged as ``self.directory``.
        allow: Forwarded to ``LinkExtractor`` as ``allow``.
        deny: Forwarded to ``LinkExtractor`` as ``deny``.
        unix: Stored unchanged as ``self.unix``.
    """
    # Imported locally so the method does not depend on the module's
    # import block already providing it.
    from urllib.parse import urlsplit

    self.directory = directory
    self.unix = unix
    # Assigned before super().__init__() -- presumably the base class is a
    # Scrapy CrawlSpider, whose constructor reads ``self.rules``.
    # NOTE(review): Scrapy's Rule defaults ``follow`` to False when a
    # callback is given; confirm that non-recursive following is intended.
    self.rules = (
        Rule(LinkExtractor(allow=allow, deny=deny), callback='save_page'),
    )

    # Parse the allowed domains and start URLs.
    self.allowed_domains = []
    self.start_urls = []
    for domain in domains:
        # Split on the FIRST '://' only. Splitting on every occurrence
        # picked up URLs embedded later in the entry (for example in a
        # query string) and crawled those instead of the intended site.
        prefix, separator, remainder = domain.partition('://')
        # A '://' that appears after a '/', '?' or '#' belongs to the
        # path/query of a scheme-less entry, not to a leading scheme.
        has_scheme = bool(separator) and not any(
            char in prefix for char in '/?#'
        )
        if has_scheme:
            url_scheme = prefix
        else:
            url_scheme, remainder = 'http', domain
        full_url = '{0}://{1}'.format(url_scheme, remainder)

        # The allowed-domain entry must be the bare host: no userinfo, no
        # port, no query, lower-cased -- the form in which request hosts
        # are compared. urlsplit handles all of these cases.
        try:
            bare_domain = urlsplit(full_url).hostname
        except ValueError:
            # Malformed netloc (e.g. unbalanced IPv6 brackets).
            bare_domain = None
        if not bare_domain:
            # Keep the previous best-effort result for inputs that have
            # no parsable host instead of failing here.
            bare_domain = remainder.split('/')[0]

        self.allowed_domains.append(bare_domain)
        self.start_urls.append(full_url)
    super().__init__()
mirror_spider.py 文件源码
python
阅读 20
收藏 0
点赞 0
评论 0
评论列表
文章目录