Example source code for the Python class CrawlerProcess()
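All of the snippets below follow the same basic pattern: build a CrawlerProcess from some settings, queue one or more spiders with crawl(), and block on start(). A minimal self-contained sketch of that pattern (the ExampleSpider class, its selector, and the example.com URL are placeholders for illustration, not taken from any of the projects quoted below):

import scrapy
from scrapy.crawler import CrawlerProcess

class ExampleSpider(scrapy.Spider):
    name = 'example'
    start_urls = ['https://example.com']

    def parse(self, response):
        # collect the page title as a single item
        yield {'title': response.css('title::text').get()}

process = CrawlerProcess(settings={'USER_AGENT': 'example-bot (+https://example.com)'})
process.crawl(ExampleSpider)
process.start()  # blocks until the crawl is finished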
def start_scrapy(self):
    self.process = CrawlerProcess(self.setting)
    self.crawl()    # expected to queue spiders on self.process
    reactor.run()   # start the Twisted reactor manually
def getReviewCount(url):
    # Get the number of reviews
    process = CrawlerProcess(get_project_settings())
    process.crawl(review_count_spider, start_url=url)
    process.start()
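Keyword arguments given to process.crawl() are forwarded to the spider's constructor. A sketch of a spider that picks up the start_url argument used above (the class name and the selector are assumptions for illustration, not the actual review_count_spider):

import scrapy

class ReviewCountSpider(scrapy.Spider):
    name = 'review_count'

    def __init__(self, start_url=None, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # process.crawl(..., start_url=url) lands here
        self.start_urls = [start_url] if start_url else []

    def parse(self, response):
        # hypothetical selector; the real spider defines its own parsing
        yield {'review_count': response.css('.review-count::text').get()}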
def crawl_naver_blog():
    process = CrawlerProcess({
        'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'
    })
    process.crawl(NaverBlogSpider)
    process.start()
def main():
    # configure the settings for the crawler and spider
    args = parse_args()
    config = {
        'domains': args.domains,
        'directory': args.output,
        'allow': args.allow,
        'deny': args.deny,
        'unix': args.unix,
    }
    settings = Settings({
        'USER_AGENT': (
            'Wayback Machine Scraper/{0} '
            '(+https://github.com/sangaline/scrapy-wayback-machine)'
        ).format(get_distribution('wayback-machine-scraper').version),
        'LOG_LEVEL': 'DEBUG' if args.verbose else 'INFO',
        'DOWNLOADER_MIDDLEWARES': {
            'scrapy_wayback_machine.WaybackMachineMiddleware': 5,
        },
        'AUTOTHROTTLE_ENABLED': True,
        'AUTOTHROTTLE_DEBUG': args.verbose,
        'AUTOTHROTTLE_START_DELAY': 1,
        'AUTOTHROTTLE_TARGET_CONCURRENCY': args.concurrency,
        # getattr() is needed because "from" is a reserved word in Python
        'WAYBACK_MACHINE_TIME_RANGE': (getattr(args, 'from'), args.to),
    })

    # start the crawler
    process = CrawlerProcess(settings)
    process.crawl(MirrorSpider, **config)
    process.start()
def get_top_blogs():
    """
    Get URLs of most popular blog posts for most popular programming languages
    on GitHub.
    """
    languages = get_top_languages(30, settings['LANGUAGES_DATA'])
    process = CrawlerProcess(settings)
    process.crawl(BlogsSpider, languages)
    process.start()  # the script will block here until the crawling is done
def estimate_traffic():
    """
    Analyze traffic of the scraped blogs.
    """
    process = CrawlerProcess(settings)
    blogs_file = get_latest_file(settings['BLOGS_FEED_DIR'])
    with open(blogs_file) as f:
        blogs = json.load(f)
    process.crawl(TrafficSpider, blogs)
    process.start()  # the script will block here until the crawling is done
def run_crawler(self):
    process = CrawlerProcess(self.settings)
    if self.args.anime:
        if self.args.skip is None or 'nyaa' not in self.args.skip:
            process.crawl(Nyaa, title=self.search, season=self.args.season, file=self.args.file)
        if self.args.skip is None or 'shanaproject' not in self.args.skip:
            process.crawl(Shanaproject, title=self.search, season=self.args.season, file=self.args.file)
    else:
        if self.args.skip is None or 'zooqle' not in self.args.skip:
            process.crawl(Zooqle, title=self.search, season=self.args.season, file=self.args.file)
        if self.args.skip is None or '1337x' not in self.args.skip:
            process.crawl(_1337x, title=self.search, season=self.args.season, file=self.args.file)
        if self.args.skip is None or 'eztv' not in self.args.skip:
            process.crawl(Eztv, title=self.search, season=self.args.season, file=self.args.file)
        if self.args.skip is None or 'rarbg' not in self.args.skip:
            process.crawl(Rarbg, title=self.search, season=self.args.season, file=self.args.file)
        if self.args.skip is None or 'torrentdownloads' not in self.args.skip:
            process.crawl(Torrentdownloads, title=self.search, season=self.args.season, file=self.args.file)
        if self.args.skip is None or 'limetorrents' not in self.args.skip:
            process.crawl(Limetorrents, title=self.search, season=self.args.season, file=self.args.file)
        if self.args.skip is None or 'thepiratebay' not in self.args.skip:
            process.crawl(Thepiratebay, title=self.search, season=self.args.season, file=self.args.file)
    process.start()
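Every branch above queues a spider with the same keyword arguments, so the same dispatch can be written as a loop over a name-to-class mapping. A sketch under the assumption that the spider classes and the args object are the same as above:

def run_crawler(self):
    process = CrawlerProcess(self.settings)
    spiders = (
        {'nyaa': Nyaa, 'shanaproject': Shanaproject}
        if self.args.anime else
        {'zooqle': Zooqle, '1337x': _1337x, 'eztv': Eztv, 'rarbg': Rarbg,
         'torrentdownloads': Torrentdownloads, 'limetorrents': Limetorrents,
         'thepiratebay': Thepiratebay}
    )
    skip = self.args.skip or []
    for name, spider_cls in spiders.items():
        if name not in skip:
            # every spider gets the same search parameters
            process.crawl(spider_cls, title=self.search,
                          season=self.args.season, file=self.args.file)
    process.start()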
def explore_job(link):
    """
    Explore a website and index all URLs (redis-rq process).
    """
    print("explore website at: %s" % link)

    # get the final url after possible redirections
    try:
        link = url.crawl(link).url
    except Exception:
        return 0

    # create or update domain data
    domain = url.domain(link)
    res = client.index(index="web", doc_type='domain', id=domain, body={
        "homepage": link,
        "domain": domain,
        "last_crawl": datetime.now()
    })

    # start crawler
    process = CrawlerProcess({
        'USER_AGENT': "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.75 Safari/537.36",
        'DOWNLOAD_TIMEOUT': 100,
        'DOWNLOAD_DELAY': 0.25,
        'ROBOTSTXT_OBEY': True,
        'HTTPCACHE_ENABLED': False,
        'REDIRECT_ENABLED': False,
        'SPIDER_MIDDLEWARES': {
            'scrapy.downloadermiddlewares.robotstxt.RobotsTxtMiddleware': True,
            'scrapy.spidermiddlewares.httperror.HttpErrorMiddleware': True,
            'scrapy.downloadermiddlewares.httpcache.HttpCacheMiddleware': True,
            'scrapy.extensions.closespider.CloseSpider': True
        },
        'CLOSESPIDER_PAGECOUNT': 500  # only for debug
    })
    process.crawl(crawler.Crawler, allowed_domains=[urlparse(link).netloc], start_urls=[link], es_client=client, redis_conn=redis_conn)
    process.start()
    return 1
def _crawl(deck_ids):
    logging.info('Crawling deck data from HearthStats ...')
    decks = list()
    cp = CrawlerProcess({'ITEM_PIPELINES': {'hsdata.hearthstats.HearthStatsScrapyPipeline': 1}})
    cp.crawl(HearthStatsScrapySpider, deck_ids=deck_ids, decks=decks)
    cp.start()
    return decks
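The decks list (and the results list in the next snippet) is handed to the spider as a keyword argument, so the configured item pipeline can append scraped items to it and _crawl() can return them once cp.start() unblocks. A minimal sketch of such a pipeline (the class name is an assumption for illustration, not the actual hsdata implementation):

class CollectToListPipeline:
    """Append every scraped item to the list the spider was constructed with."""

    def process_item(self, item, spider):
        spider.decks.append(item)  # the same list object passed to cp.crawl()
        return item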
def _crawl(deck_ids):
    logging.info('Crawling win-rate data from HSBox ...')
    results = list()
    cp = CrawlerProcess({'ITEM_PIPELINES': {'hsdata.hsbox.HSBoxScrapyPipeline': 1}})
    cp.crawl(HSBoxScrapySpider, deck_ids=deck_ids, results=results)
    cp.start()
    logging.info('Fetched data for {} decks'.format(len(results)))
    return results
def crawl(url):
    """Initialize crawling sequence."""
    settings = get_project_settings()
    settings.url = url
    settings["CLOSESPIDER_PAGECOUNT"] = CRAWL_COUNT
    settings["DEPTH_LEVEL"] = DEPTH_LEVEL
    process = CrawlerProcess(settings)

    class ThisSpider(CrawlingSpider):
        """Create a spider to crawl with."""
        start_urls = [url]

    process.crawl(ThisSpider)
    process.start()
def harvest(url):
    """Initialize harvest sequence."""
    settings = get_project_settings()
    settings.url = url
    process = CrawlerProcess(settings)
    process.crawl(HarvestSpider, url=url)
    process.start()
def test_waizard_spider():
    crawler = CrawlerProcess(get_project_settings())
    crawler.crawl(WizardSpider)
    crawler.start()
def run():
    # load the configuration from settings.py
    settings = get_project_settings()
    process = CrawlerProcess(settings=settings)
    # multiple spiders can be queued before starting, e.g.:
    # process.crawl(Spider1)
    # process.crawl(Spider2)
    process.crawl(GuaziSaleSpider)
    # start crawling; this blocks until all spiders have finished
    process.start()
def collect(conf, conn):
    process = CrawlerProcess(conf['SCRAPY_SETTINGS'])
    process.crawl(Spider, conn=conn)
    process.start()
def collect(conf, conn, page_from=None, page_to=None):
    process = CrawlerProcess(conf['SCRAPY_SETTINGS'])
    process.crawl(Spider, conn=conn, page_from=page_from, page_to=page_to)
    process.start()
def collect(conf, conn):
    process = CrawlerProcess(conf['SCRAPY_SETTINGS'])
    process.crawl(Spider, conn=conn)
    process.start()
def collect(conf, conn, date_from=None, date_to=None):
    process = CrawlerProcess(conf['SCRAPY_SETTINGS'])
    process.crawl(Spider, conn=conn, date_from=date_from, date_to=date_to)
    process.start()
def collect(conf, conn, date_from=None, date_to=None):
    process = CrawlerProcess(conf['SCRAPY_SETTINGS'])
    process.crawl(Spider, conn=conn, date_from=date_from, date_to=date_to)
    process.start()
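Note that CrawlerProcess.start() runs the Twisted reactor, which cannot be restarted, so only one of these collect() calls can run per Python process. If several crawls need to share one program, CrawlerRunner is the usual alternative; a minimal sketch (the collect_many name and the spiders argument are assumptions for illustration):

from twisted.internet import reactor
from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging

def collect_many(conf, conn, spiders):
    configure_logging(conf['SCRAPY_SETTINGS'])
    runner = CrawlerRunner(conf['SCRAPY_SETTINGS'])
    for spider_cls in spiders:
        runner.crawl(spider_cls, conn=conn)   # queue each crawl on the same reactor
    deferred = runner.join()                  # fires when every scheduled crawl has finished
    deferred.addBoth(lambda _: reactor.stop())
    reactor.run()                             # blocks here until reactor.stop() is called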