crawl.py 文件源码-python代码片段

crawl.py 文件源码

python

阅读 33 收藏 0 点赞 0 评论 0

项目：hoaxy-backend 作者: IUNetSci 项目源码文件源码

def fetch_url(cls, session, msites, platform_id, purpose):
        """Actual method to do fetch url action.

        Parameters
        ----------
            msites : list
                a list of Site model class, contains info to build spiders.
            platform_id : int
                id of platform, bind fetched url with this id.
            purpose : {'update', 'archive'}
                indicate which url to fetch.
        """
        settings = Settings(cls.conf['crawl']['scrapy'])
        settings.set('ITEM_PIPELINES',
                     {'hoaxy.crawl.pipelines.UrlPipeline': 300})
        process = CrawlerProcess(settings)
        sll = cls.conf['logging']['loggers']['scrapy']['level']
        logging.getLogger('scrapy').setLevel(logging.getLevelName(sll))
        for ms in msites:
            for sm in build_spiders_iter(ms, purpose):
                sm['kwargs']['session'] = session
                sm['kwargs']['platform_id'] = platform_id
                process.crawl(sm['cls'], *sm['args'], **sm['kwargs'])
        process.start()