def __init__(self, feed_file=None, feed_title=None, feed_link=None, feed_description=None,
             crawler_settings=None):
    settings = crawler_settings if crawler_settings else dict(self.default_settings)
    if feed_file:
        settings['FEED_FILE'] = feed_file
    if feed_title:
        settings['FEED_TITLE'] = feed_title
    if feed_link:
        settings['FEED_LINK'] = feed_link
    if feed_description:
        settings['FEED_DESCRIPTION'] = feed_description
    self.crawler = get_crawler(settings_dict=settings)
    self.spider = scrapy.Spider.from_crawler(self.crawler, 'example.com')
    self.spider.parse = lambda response: ()
    item_processor = settings.get('ITEM_PROCESSOR')
    if not item_processor:
        item_processor = RaisedItemPipelineManager
    elif isinstance(item_processor, six.string_types):
        item_processor = load_object(item_processor)
    self.ipm = item_processor.from_crawler(self.crawler)
def test_autologin_request():
    crawler = make_crawler(
        base_settings(), SPLASH_URL='http://192.168.99.100:8050')
    mw = AutologinMiddleware('http://127.0.0.1:8089', crawler)
    al_request = mw._login_request(scrapy.Request('http://example.com'))
    data = json.loads(al_request.body.decode('utf-8'))
    assert al_request.dont_filter
    assert al_request.meta['proxy'] is None
    assert data['url'] == 'http://example.com'
    assert data['settings']['USER_AGENT'] == crawler.settings.get('USER_AGENT')
    assert data['settings'].get('SPLASH_URL') is None
    al_request = mw._login_request(SplashRequest('http://example.com'))
    data = json.loads(al_request.body.decode('utf-8'))
    assert data['url'] == 'http://example.com'
    assert data['settings']['SPLASH_URL'] == crawler.settings.get('SPLASH_URL')
def main():
    """Main routine for running the spider."""
    from scrapy.xlib.pydispatch import dispatcher

    # set up a signal handler to catch scraped items
    def catch_item(sender, item, **kwargs):
        print("Item extracted:", item)
    dispatcher.connect(catch_item, signal=signals.item_passed)

    settings = Settings()
    settings.set("USER_AGENT", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.134 Safari/537.36")
    settings.set("LOG_ENABLED", False)

    # set up the crawler
    from scrapy.crawler import CrawlerProcess
    crawler = CrawlerProcess(settings)

    # register the spider with the crawler
    crawler.crawl(EuropythonSpyder())

    # start scrapy
    print("STARTING ENGINE")
    crawler.start()  # start the crawler by invoking the spider defined above
    print("ENGINE STOPPED")
def main():
    """Main routine for running the spider."""
    from scrapy.xlib.pydispatch import dispatcher

    # set up a signal handler to catch scraped items
    def catch_item(sender, item, **kwargs):
        print("Item extracted:", item)
    dispatcher.connect(catch_item, signal=signals.item_passed)

    settings = Settings()
    settings.set("USER_AGENT", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.134 Safari/537.36")
    settings.set("LOG_ENABLED", False)

    # set up the crawler
    from scrapy.crawler import CrawlerProcess
    crawler = CrawlerProcess(settings)

    # define the spider for the crawler
    crawler.crawl(PydataSpiderDetails())

    print("STARTING ENGINE")
    crawler.start()  # start the crawler
    print("ENGINE STOPPED")
def test_media_pipeline(tmpdir, max_cache):
    crawler = make_crawler(FILES_STORE='file://{}'.format(tmpdir),
                           FILES_MAX_CACHE=max_cache)
    with MockServer(WithFile) as s:
        root_url = s.root_url
        yield crawler.crawl(url=root_url)
    spider = crawler.spider
    assert len(spider.collected_items) == 3

    root_item = find_item('/', spider.collected_items)
    assert len(root_item['objects']) == 2
    file_item = find_item(
        '/file.pdf', root_item['objects'], 'obj_original_url')
    assert file_item['obj_original_url'] == root_url + '/file.pdf'
    assert not file_item['obj_stored_url'].endswith('.pdf')
    with tmpdir.join(file_item['obj_stored_url']).open('rb') as f:
        assert f.read() == FILE_CONTENTS
    assert file_item['content_type'] == 'application/pdf'
    headers = dict(file_item['response_headers'])
    headers.pop('date')
    headers.pop('server')
    assert headers == {'content-type': 'application/pdf',
                       'content-hype': 'very/high'}

    forbidden_item = find_item(
        '/forbidden.pdf', root_item['objects'], 'obj_original_url')
    with tmpdir.join(forbidden_item['obj_stored_url']).open('rb') as f:
        assert f.read() == FILE_CONTENTS * 2

    page_item = find_item('/page?b=2&a=1', spider.collected_items)
    file_item_q = find_item(
        '/file.pdf?allow=true', page_item['objects'], 'obj_original_url')
    assert file_item_q['obj_stored_url'] == file_item['obj_stored_url']

    another_page_item = find_item('/another-page', spider.collected_items)
    file_item_q = find_item(
        '/file.pdf', another_page_item['objects'], 'obj_original_url')
    assert file_item_q['obj_stored_url'] == file_item['obj_stored_url']
    assert file_item_q['obj_original_url'] == file_item['obj_original_url']
def startCrawler():
    """Initiates the web crawler process defined above.
    Arguments: None
    Return: None
    """
    # CrawlerProcess starts a Twisted reactor for us, configures logging
    # and sets up shutdown handlers
    process = CrawlerProcess({
        'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'
    })
    process.crawl(TwitterSpider)
    process.start()
def main():
    from scrapy.crawler import CrawlerProcess
    process = CrawlerProcess()
    process.crawl(EducatieSpider)
    process.start()
def main():
    from scrapy.crawler import CrawlerProcess
    process = CrawlerProcess()
    process.crawl(DialogSpider)
    process.start()
def __enter__(self):
    responses = self.crawler.signals.send_catch_log(signal=signals.spider_opened,
                                                    spider=self.spider)
    for _, failure in responses:
        if failure:
            failure.raiseException()
    return self
def __exit__(self, exc_type, exc_val, exc_tb):
    responses = self.crawler.signals.send_catch_log(signal=signals.spider_closed,
                                                    spider=self.spider, reason=None)
    for _, failure in responses:
        if failure:
            failure.raiseException()
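Together with the __init__ shown at the top of this page, these __enter__/__exit__ methods turn the helper into a context manager: entering the block fires spider_opened, leaving it fires spider_closed, and any exception raised by a signal handler is re-raised. A minimal usage sketch, assuming the enclosing class is called FeedHelper (a hypothetical name) and that its default_settings attribute is defined elsewhere:

helper = FeedHelper(feed_file='feed.rss',
                    feed_title='Example feed',
                    feed_link='http://example.com/',
                    feed_description='Context-manager demo')
with helper:
    # __enter__ has sent spider_opened; helper.ipm is the item pipeline
    # manager built in __init__, and process_item() returns a Deferred,
    # as with any Scrapy ItemPipelineManager.
    helper.ipm.process_item({'title': 'hello'}, helper.spider)
# __exit__ sends spider_closed and re-raises any handler failure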
def test_skip(settings):
    crawler = make_crawler(settings, _AUTOLOGIN_FORCE_SKIP=True)
    with MockServer(Login) as s:
        yield crawler.crawl(url=s.root_url)
    spider = crawler.spider
    assert set(spider.visited_urls) == {'/', '/login'}
    assert all(not r.meta['autologin_active'] for r in spider.responses)
def test_login(settings, extra_settings=None):
    """ No logout links, just one page after login.
    """
    crawler = make_crawler(settings, **AL_SETTINGS)
    with MockServer(Login) as s:
        yield crawler.crawl(url=s.root_url)
    spider = crawler.spider
    assert len(spider.visited_urls) == 2
    assert set(spider.visited_urls) == {'/', '/hidden'}
    response = spider.responses[0]
    assert urlsplit(response.url).path.rstrip('/') == ''
    assert response.meta['autologin_active']
    assert response.meta['autologin_response']['status'] == 'solved'
def test_login_error(settings, extra_settings=None):
    """ Trying to log in with wrong credentials.
    """
    al_settings = dict(AL_SETTINGS)
    al_settings['AUTOLOGIN_PASSWORD'] = 'wrong'
    crawler = make_crawler(settings, **al_settings)
    with MockServer(Login) as s:
        yield crawler.crawl(url=s.root_url)
    spider = crawler.spider
    assert len(spider.visited_urls) == 2
    assert set(spider.visited_urls) == {'/', '/login'}
    response = spider.responses[0]
    assert urlsplit(response.url).path.rstrip('/') == ''
    assert not response.meta['autologin_active']
    assert response.meta['autologin_response']['status'] == 'error'
def test_pass_via_meta(settings):
    crawler = make_crawler(settings, spider_cls=PassMetaSpider,
                           AUTOLOGIN_DOWNLOAD_DELAY=0.01)
    with MockServer(Login) as s:
        yield crawler.crawl(url=s.root_url)
    spider = crawler.spider
    assert len(spider.visited_urls) == 2
    assert set(spider.visited_urls) == {'/', '/hidden'}
def test_login_with_logout(settings, spider_cls=TestSpider):
    """ Login with logout.
    """
    crawler = make_crawler(settings, spider_cls=spider_cls, **AL_SETTINGS)
    with MockServer(LoginWithLogout) as s:
        yield crawler.crawl(url=s.root_url)
    spider = crawler.spider
    mandatory_urls = {'/', '/hidden', '/one', '/two', '/three', '/slow'}
    spider_urls = set(spider.visited_urls)
    assert mandatory_urls.difference(spider_urls) == set()
    assert spider_urls.difference(
        mandatory_urls | {'/l0gout1', '/l0gout2'}) == set()
def test_custom_headers(settings):
    crawler = make_crawler(settings, USER_AGENT='MyCustomAgent', **AL_SETTINGS)
    with MockServer(LoginIfUserAgentOk) as s:
        yield crawler.crawl(url=s.root_url)
    spider = crawler.spider
    assert len(spider.visited_urls) == 2
    assert spider.visited_urls[1] == '/hidden'
def parse(self, response):
    for item in super(StoppingSpider, self).parse(response):
        yield item
    if not self.state.get('was_stopped'):
        self.state['was_stopped'] = True
        self.crawler.stop()
def test_resume(settings):
    crawler = make_crawler(
        settings, spider_cls=StoppingSpider,
        JOBDIR=tempfile.mkdtemp(),
        SCHEDULER_DISK_QUEUE='scrapy.squeues.PickleFifoDiskQueue',
        SCHEDULER_MEMORY_QUEUE='scrapy.squeues.FifoMemoryQueue',
        LOG_UNSERIALIZABLE_REQUESTS=True,
        **AL_SETTINGS)
    with MockServer(Login) as s:
        yield crawler.crawl(url=s.root_url)
        # resuming crawl
        yield crawler.crawl(url=s.root_url)
    spider = crawler.spider
    assert len(spider.visited_urls) == 1
    assert set(spider.visited_urls) == {'/hidden'}
def test_disable_logout(settings):
    crawler = make_crawler(settings, **AL_SETTINGS)
    with MockServer(LoginWithContentAfterLogout) as s:
        yield crawler.crawl(url=s.root_url)
    spider = crawler.spider
    assert set(spider.visited_urls) == {'/', '/hidden'}
    crawler = make_crawler(
        settings, AUTOLOGIN_CHECK_LOGOUT=False, **AL_SETTINGS)
    with MockServer(LoginWithContentAfterLogout) as s:
        yield crawler.crawl(url=s.root_url)
    spider = crawler.spider
    spider_urls = set(spider.visited_urls)
    assert spider_urls == {'/', '/hidden', '/target'}