newspaper_crawler.py file source code

python

Project: newspaper-scraper-couchbase · Author: aleonsan
# Requires (from the surrounding file): from scrapy.spiders import Rule
# and from scrapy.linkextractors import LinkExtractor.
def __init__(self, topic=None, newspaper=None, term='', *args, **kwargs):
    self.term = term
    # Pick sources: an explicit newspaper name wins; otherwise fall back to
    # the sources mapped to the topic, defaulting to all sources.
    if newspaper:
        sources = [source for source in SOURCE_NEWSPAPERS if newspaper == source['name']]
    else:
        sources = TOPIC_TO_SOURCES.get(topic, SOURCE_NEWSPAPERS)
    self.allowed_domains = [source['allowed_domains'] for source in sources]
    self.start_urls = [source['url'] for source in sources]
    self.rules = []
    for source in sources:
        if topic:
            allowed_domain_regex = (source['allowed_subdomains_regex'][topic],)
        else:
            # Use a list rather than a one-shot generator so the patterns can
            # be reused, and avoid shadowing the `topic` argument.
            allowed_domain_regex = list(source['allowed_subdomains_regex'].values())
        rule = Rule(
            link_extractor=LinkExtractor(allow=allowed_domain_regex),
            callback='parse_with_term',
            cb_kwargs={
                'term': self.term,
                'newspaper': newspaper,
            },
            follow=True,
        )
        self.rules.append(rule)

    # __init__ should not return a value; just chain to the parent class.
    super(NewspaperCrawler, self).__init__(*args, **kwargs)
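The constructor above reads `SOURCE_NEWSPAPERS` and `TOPIC_TO_SOURCES` but the snippet does not show how they are defined. The sketch below infers their likely shape from how the fields are accessed; the concrete names, URLs, and regexes are illustrative assumptions, not values from the project. The `select_sources` helper mirrors only the source-selection branch so the logic can be exercised without Scrapy installed.

```python
# Assumed shapes, inferred from the attribute access in __init__
# ('name', 'url', 'allowed_domains', 'allowed_subdomains_regex').
# All concrete values here are hypothetical placeholders.
SOURCE_NEWSPAPERS = [
    {
        'name': 'example-times',
        'url': 'https://www.example-times.com',
        'allowed_domains': 'example-times.com',
        'allowed_subdomains_regex': {
            'politics': r'example-times\.com/politics/.+',
            'sports': r'example-times\.com/sports/.+',
        },
    },
]

# Maps a topic to the subset of sources that cover it.
TOPIC_TO_SOURCES = {
    'politics': [SOURCE_NEWSPAPERS[0]],
}

def select_sources(topic=None, newspaper=None):
    """Mirror the selection logic in __init__: an explicit newspaper
    name takes priority; otherwise the topic mapping is used, falling
    back to every known source."""
    if newspaper:
        return [s for s in SOURCE_NEWSPAPERS if newspaper == s['name']]
    return TOPIC_TO_SOURCES.get(topic, SOURCE_NEWSPAPERS)
```

With this shape, `select_sources(newspaper='example-times')` returns the single matching source, while an unknown topic falls back to the full `SOURCE_NEWSPAPERS` list, matching the `dict.get` default in the constructor.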