from datetime import datetime
from urllib.parse import urlparse
from scrapy.crawler import CrawlerProcess

# `url`, `crawler`, `client` (Elasticsearch) and `redis_conn` (Redis) are
# assumed to be defined elsewhere in the surrounding module


def explore_job(link):
    """
    Explore a website and index all URLs (redis-rq process).
    """
    print("exploring website at: %s" % link)
    # get the final URL after possible redirections
    try:
        link = url.crawl(link).url
    except Exception:
        return 0
    # create or update the domain entry
    # (doc_type works with the Elasticsearch client of this era; it is
    # deprecated in Elasticsearch 7+)
    domain = url.domain(link)
    res = client.index(index="web", doc_type='domain', id=domain, body={
        "homepage": link,
        "domain": domain,
        "last_crawl": datetime.now()
    })
    # start the crawler (RobotsTxt, HttpError, HttpCache and CloseSpider are
    # enabled by default in Scrapy; they are listed here explicitly, in their
    # proper settings keys, with their default priorities)
    process = CrawlerProcess({
        'USER_AGENT': "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
                      "(KHTML, like Gecko) Chrome/55.0.2883.75 Safari/537.36",
        'DOWNLOAD_TIMEOUT': 100,
        'DOWNLOAD_DELAY': 0.25,
        'ROBOTSTXT_OBEY': True,
        'HTTPCACHE_ENABLED': False,
        'REDIRECT_ENABLED': False,
        'DOWNLOADER_MIDDLEWARES': {
            'scrapy.downloadermiddlewares.robotstxt.RobotsTxtMiddleware': 100,
            'scrapy.downloadermiddlewares.httpcache.HttpCacheMiddleware': 900,
        },
        'SPIDER_MIDDLEWARES': {
            'scrapy.spidermiddlewares.httperror.HttpErrorMiddleware': 50,
        },
        'EXTENSIONS': {
            'scrapy.extensions.closespider.CloseSpider': 0,
        },
        'CLOSESPIDER_PAGECOUNT': 500  # only for debugging
    })
    process.crawl(crawler.Crawler,
                  allowed_domains=[urlparse(link).netloc],
                  start_urls=[link],
                  es_client=client,
                  redis_conn=redis_conn)
    process.start()  # blocks until the crawl is finished
    return 1
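

# A minimal sketch of how this job might be enqueued with RQ, since the
# docstring describes it as a "redis-rq" job. The Redis connection details
# and the example URL are assumptions for illustration, not taken from the
# original code.
if __name__ == '__main__':
    from redis import Redis
    from rq import Queue

    # enqueue explore_job on the default queue; a worker started with
    # "rq worker" against the same Redis instance will pick it up
    q = Queue(connection=Redis())
    q.enqueue(explore_job, "https://www.example.com")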