# Imports and logger assumed at module level (the spider's exact import
# path is an assumption based on hoaxy's package layout).
import logging

from scrapy.crawler import CrawlerProcess
from scrapy.settings import Settings

from hoaxy.crawl.spiders import ArticleParserSpider

logger = logging.getLogger(__name__)


# In hoaxy this is defined as a classmethod on a command class, hence `cls`.
def parse_article(cls, session, url_tuples):
"""Actual method to do parse to article action.
Parameters
----------
session : object
a SQLAlchemy session object.
url_tuples : list
a list of url tuple (id, created_at, date_published,
canonical, site_id)
"""
settings = Settings(cls.conf['crawl']['scrapy'])
settings.set('ITEM_PIPELINES',
{'hoaxy.crawl.pipelines.ArticlePipeline': 300})
process = CrawlerProcess(settings)
    # Match Scrapy's own log level to the level configured for it in hoaxy.
    scrapy_log_level = cls.conf['logging']['loggers']['scrapy']['level']
    logging.getLogger('scrapy').setLevel(
        logging.getLevelName(scrapy_log_level))
    logger.info('Number of URLs to parse: %s', len(url_tuples))
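    # Schedule the parser spider; the API key is passed through to the
    # external web-parser service used to extract article content.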
process.crawl(
ArticleParserSpider,
session=session,
url_tuples=url_tuples,
api_key=cls.conf['crawl']['article_parser']['webparser_api_key'],)
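    # Start the Twisted reactor; this call blocks until the crawl finishes.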
process.start()
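

# ---------------------------------------------------------------------------
# Usage sketch (illustration only, not from the hoaxy source). The engine
# URL, the tuple values, and the `cmd_cls` stand-in are assumptions; in
# hoaxy, parse_article lives on a command class that carries the `conf`
# dictionary the method reads.
# ---------------------------------------------------------------------------
def example_parse_run(cmd_cls):
    """Drive parse_article with a throwaway SQLAlchemy session."""
    from sqlalchemy import create_engine
    from sqlalchemy.orm import sessionmaker

    engine = create_engine('sqlite:///:memory:')  # placeholder engine URL
    session = sessionmaker(bind=engine)()
    # Tuples follow (id, created_at, date_published, canonical, site_id).
    url_tuples = [
        (1, '2017-06-01 12:00:00', None,
         'http://example.com/some-article', 3),
    ]
    cmd_cls.parse_article(session, url_tuples)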