from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

# SOURCE_NEWSPAPERS and TOPIC_TO_SOURCES are configuration constants
# defined elsewhere in this project.


class NewspaperCrawler(CrawlSpider):

    def __init__(self, topic=None, newspaper=None, term='', *args, **kwargs):
        self.term = term
        if newspaper:
            # A specific paper was requested: crawl only that source.
            sources = [source for source in SOURCE_NEWSPAPERS
                       if newspaper == source['name']]
        else:
            # Otherwise use the sources for the topic, falling back to all.
            sources = TOPIC_TO_SOURCES.get(topic, SOURCE_NEWSPAPERS)

        self.allowed_domains = [source['allowed_domains'] for source in sources]
        self.start_urls = [source['url'] for source in sources]

        # Build one crawl rule per source, restricted to the relevant
        # subdomain regexes.
        self.rules = []
        for source in sources:
            if topic:
                allowed_domain_regex = (source['allowed_subdomains_regex'][topic],)
            else:
                allowed_domain_regex = tuple(
                    source['allowed_subdomains_regex'].values())
            rule = Rule(link_extractor=LinkExtractor(allow=allowed_domain_regex),
                        callback='parse_with_term',
                        cb_kwargs={
                            'term': self.term,
                            'newspaper': newspaper,
                        },
                        follow=True)
            self.rules.append(rule)

        # self.rules must be populated before CrawlSpider.__init__ runs,
        # since it compiles the rules.
        super(NewspaperCrawler, self).__init__(*args, **kwargs)
Source code from newspaper_crawler.py (Python)
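The constants SOURCE_NEWSPAPERS and TOPIC_TO_SOURCES come from elsewhere in the project and are not shown on this page. As a minimal sketch of the shapes the constructor assumes: the field names (name, url, allowed_domains, allowed_subdomains_regex) are taken from the code above, while every concrete value below is hypothetical.

# Hypothetical example data; only the field names are implied by __init__.
SOURCE_NEWSPAPERS = [
    {
        'name': 'example-times',
        'url': 'http://www.example-times.com',
        'allowed_domains': 'example-times.com',
        # One URL regex per topic, fed to LinkExtractor(allow=...).
        'allowed_subdomains_regex': {
            'politics': r'example-times\.com/politics/.+',
            'sports': r'example-times\.com/sports/.+',
        },
    },
]

# Maps a topic to the subset of sources covering it; __init__ falls back
# to SOURCE_NEWSPAPERS when the topic is not in this mapping.
TOPIC_TO_SOURCES = {
    'politics': SOURCE_NEWSPAPERS,
    'sports': SOURCE_NEWSPAPERS,
}

With data of this shape in place, the spider could be launched with spider arguments, e.g. scrapy crawl <spider-name> -a topic=politics -a term=election, which Scrapy forwards to __init__ as keyword arguments (the spider's name attribute is not shown on this page).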
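The parse_with_term callback referenced by the rules is likewise not shown here. A hypothetical sketch of what such a callback might look like, given that each rule passes term and newspaper through cb_kwargs; the real project's callback presumably extracts proper article fields.

    def parse_with_term(self, response, term, newspaper):
        # Hypothetical callback: yield only pages that mention the search term.
        if term and term.lower() in response.text.lower():
            yield {
                'url': response.url,
                'newspaper': newspaper,
                'term': term,
            }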