from scrapy.crawler import CrawlerProcess

def index_job(link):
    """
    Index a single page.
    """
    print("index page: %s" % link)

    # Get the final URL after possible redirections
    try:
        link = url.crawl(link).url
    except Exception:
        return 0

    process = CrawlerProcess({
        'USER_AGENT': "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.75 Safari/537.36",
        'DOWNLOAD_TIMEOUT': 100,
        'REDIRECT_ENABLED': False,
        'SPIDER_MIDDLEWARES': {
            # 50 is the default priority of this built-in middleware
            'scrapy.spidermiddlewares.httperror.HttpErrorMiddleware': 50
        }
    })
    process.crawl(crawler.SingleSpider, start_urls=[link], es_client=client, redis_conn=redis_conn)
    process.start()  # block until the crawl finishes
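
# Usage sketch (an assumption, not part of the original module): the Twisted
# reactor behind CrawlerProcess.start() cannot be restarted once it has
# stopped, so indexing several links from one parent process is easiest by
# running each index_job call in its own subprocess.
from multiprocessing import Process

def index_link_in_subprocess(link):
    # Each subprocess gets a fresh interpreter, and therefore a fresh reactor.
    p = Process(target=index_job, args=(link,))
    p.start()
    p.join()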