Python crawler() class: example source code
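The snippets below are collected from open-source projects and revolve around the same pattern: obtain a Crawler or CrawlerProcess, attach a spider, and run it. As a minimal sketch of that pattern (MySpider is a placeholder, not taken from any of the projects below):

import scrapy
from scrapy.crawler import CrawlerProcess

class MySpider(scrapy.Spider):
    name = 'example'
    start_urls = ['http://example.com']

    def parse(self, response):
        yield {'title': response.css('title::text').get()}

process = CrawlerProcess({'USER_AGENT': 'Mozilla/5.0 (compatible; example)'})
process.crawl(MySpider)  # pass the spider class, not an instance
process.start()          # blocks until the crawl finishes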

test_exporter.py (project: scrapy_rss, author: woxcab)
def __init__(self, feed_file=None, feed_title=None, feed_link=None, feed_description=None,
                 crawler_settings=None):
        settings = crawler_settings if crawler_settings else dict(self.default_settings)
        if feed_file:
            settings['FEED_FILE'] = feed_file
        if feed_title:
            settings['FEED_TITLE'] = feed_title
        if feed_link:
            settings['FEED_LINK'] = feed_link
        if feed_description:
            settings['FEED_DESCRIPTION'] = feed_description
        self.crawler = get_crawler(settings_dict=settings)
        self.spider = scrapy.Spider.from_crawler(self.crawler, 'example.com')
        self.spider.parse = lambda response: ()
        item_processor = settings.get('ITEM_PROCESSOR')
        if not item_processor:
            item_processor = RaisedItemPipelineManager
        elif isinstance(item_processor, six.string_types):
            item_processor = load_object(item_processor)

        self.ipm = item_processor.from_crawler(self.crawler)
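The helper above builds a throwaway crawler/spider pair for tests via scrapy.utils.test.get_crawler. A minimal sketch of the same wiring (the FEED_FILE value is illustrative, following the custom setting name used above):

import scrapy
from scrapy.utils.test import get_crawler

crawler = get_crawler(settings_dict={'FEED_FILE': 'feed.rss'})
spider = scrapy.Spider.from_crawler(crawler, 'example.com')
assert spider.crawler is crawler  # from_crawler binds the spider to its crawler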
test_integration.py (project: autologin-middleware, author: TeamHG-Memex)
def test_autologin_request():
    crawler = make_crawler(
        base_settings(), SPLASH_URL='http://192.168.99.100:8050')
    mw = AutologinMiddleware('http://127.0.0.1:8089', crawler)
    al_request = mw._login_request(scrapy.Request('http://example.com'))
    data = json.loads(al_request.body.decode('utf-8'))
    assert al_request.dont_filter
    assert al_request.meta['proxy'] is None
    assert data['url'] == 'http://example.com'
    assert data['settings']['USER_AGENT'] == crawler.settings.get('USER_AGENT')
    assert data['settings'].get('SPLASH_URL') is None

    al_request = mw._login_request(SplashRequest('http://example.com'))
    data = json.loads(al_request.body.decode('utf-8'))
    assert data['url'] == 'http://example.com'
    assert data['settings']['SPLASH_URL'] == crawler.settings.get('SPLASH_URL')
EuropythonSpyder.py (project: pydata_webscraping, author: jmortega)
def main():
    """Main routine for running the Spider."""
    # set up a signal handler to catch scraped items
    def catch_item(sender, item, **kwargs):
        print("Item extracted:", item)
    dispatcher.connect(catch_item, signal=signals.item_passed)

    settings = Settings()
    settings.set("USER_AGENT", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.134 Safari/537.36")
    settings.set("LOG_ENABLED",False)   

    # setup crawler
    from scrapy.crawler import CrawlerProcess

    crawler = CrawlerProcess(settings)

    # define the spider for the crawler
    crawler.crawl(EuropythonSpyder)  # pass the spider class; recent Scrapy versions reject spider instances

    # start scrapy
    print("STARTING ENGINE")
    crawler.start()  # start the crawler by running the spider defined above
    print("ENGINE STOPPED")
PydataSpiderDetails.py (project: pydata_webscraping, author: jmortega)
def main():
    """Main routine for running the Spider."""
    from scrapy.xlib.pydispatch import dispatcher  # legacy module, removed in Scrapy 2.0

    # set up a signal handler to catch scraped items
    def catch_item(sender, item, **kwargs):
        print("Item extracted:", item)
    dispatcher.connect(catch_item, signal=signals.item_passed)

    settings = Settings()
    settings.set("USER_AGENT", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.134 Safari/537.36")
    settings.set("LOG_ENABLED",False)   

    # setup crawler
    from scrapy.crawler import CrawlerProcess

    crawler = CrawlerProcess(settings)

    # define the spider for the crawler
    crawler.crawl(PydataSpiderDetails)  # pass the spider class; recent Scrapy versions reject spider instances

    print("STARTING ENGINE")
    crawler.start()  # start the crawler
    print("ENGINE STOPPED")
test_media_pipeline.py (project: scrapy-cdr, author: TeamHG-Memex)
def test_media_pipeline(tmpdir, max_cache):
    crawler = make_crawler(FILES_STORE='file://{}'.format(tmpdir),
                           FILES_MAX_CACHE=max_cache)
    with MockServer(WithFile) as s:
        root_url = s.root_url
        yield crawler.crawl(url=root_url)
    spider = crawler.spider
    assert len(spider.collected_items) == 3

    root_item = find_item('/', spider.collected_items)
    assert len(root_item['objects']) == 2
    file_item = find_item(
        '/file.pdf', root_item['objects'], 'obj_original_url')
    assert file_item['obj_original_url'] == root_url + '/file.pdf'
    assert not file_item['obj_stored_url'].endswith('.pdf')
    with tmpdir.join(file_item['obj_stored_url']).open('rb') as f:
        assert f.read() == FILE_CONTENTS
    assert file_item['content_type'] == 'application/pdf'
    headers = dict(file_item['response_headers'])
    headers.pop('date')
    headers.pop('server')
    assert headers == {'content-type': 'application/pdf',
                       'content-hype': 'very/high'}

    forbidden_item = find_item(
        '/forbidden.pdf', root_item['objects'], 'obj_original_url')
    with tmpdir.join(forbidden_item['obj_stored_url']).open('rb') as f:
        assert f.read() == FILE_CONTENTS * 2

    page_item = find_item('/page?b=2&a=1', spider.collected_items)
    file_item_q = find_item(
        '/file.pdf?allow=true', page_item['objects'], 'obj_original_url')
    assert file_item_q['obj_stored_url'] == file_item['obj_stored_url']

    another_page_item = find_item('/another-page', spider.collected_items)
    file_item_q = find_item(
        '/file.pdf', another_page_item['objects'], 'obj_original_url')
    assert file_item_q['obj_stored_url'] == file_item['obj_stored_url']
    assert file_item_q['obj_original_url'] == file_item['obj_original_url']
searchMiner.py (project: twitter-sentiment, author: words-sdsc)
def startCrawler():
    """ Initiates process of the web crawler above.

    Arguments: None

    Return: None
    """

    # CrawlerProcess configures logging and installs shutdown handlers; start() runs the Twisted reactor
    process = CrawlerProcess({
        'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'
    })
    process.crawl(TwitterSpider)
    process.start()
scraper.py (project: czl-scrape, author: code4romania)
def main():
    from scrapy.crawler import CrawlerProcess
    process = CrawlerProcess()
    process.crawl(EducatieSpider)
    process.start()
dialog.py (project: czl-scrape, author: code4romania)
def main():
    from scrapy.crawler import CrawlerProcess
    process = CrawlerProcess()
    process.crawl(DialogSpider)
    process.start()
test_exporter.py (project: scrapy_rss, author: woxcab)
def __enter__(self):
        responses = self.crawler.signals.send_catch_log(signal=signals.spider_opened,
                                                        spider=self.spider)
        for _, failure in responses:
            if failure:
                failure.raiseException()

        return self
test_exporter.py (project: scrapy_rss, author: woxcab)
def __exit__(self, exc_type, exc_val, exc_tb):
        responses = self.crawler.signals.send_catch_log(signal=signals.spider_closed,
                                                        spider=self.spider, reason=None)
        for _, failure in responses:
            if failure:
                failure.raiseException()
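Together, __enter__ and __exit__ let the test helper open the spider before a block of assertions and close it afterwards, re-raising any failure captured by send_catch_log. A hypothetical usage sketch (the enclosing class name is not shown in the snippet, so CrawlerContext is a placeholder):

with CrawlerContext(feed_file='feed.rss') as ctx:
    # spider_opened has been fired on every connected handler at this point;
    # exporting/pipeline assertions would go here
    pass
# on exit, spider_closed is fired and any handler failure is re-raised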
test_integration.py (project: autologin-middleware, author: TeamHG-Memex)
def test_skip(settings):
    crawler = make_crawler(settings, _AUTOLOGIN_FORCE_SKIP=True)
    with MockServer(Login) as s:
        yield crawler.crawl(url=s.root_url)
    spider = crawler.spider
    assert set(spider.visited_urls) == {'/', '/login'}
    assert all(not r.meta['autologin_active'] for r in spider.responses)
test_integration.py (project: autologin-middleware, author: TeamHG-Memex)
def test_login(settings, extra_settings=None):
    """ No logout links, just one page after login.
    """
    crawler = make_crawler(settings, **AL_SETTINGS)
    with MockServer(Login) as s:
        yield crawler.crawl(url=s.root_url)
    spider = crawler.spider
    assert len(spider.visited_urls) == 2
    assert set(spider.visited_urls) == {'/', '/hidden'}
    response = spider.responses[0]
    assert urlsplit(response.url).path.rstrip('/') == ''
    assert response.meta['autologin_active']
    assert response.meta['autologin_response']['status'] == 'solved'
test_integration.py (project: autologin-middleware, author: TeamHG-Memex)
def test_login_error(settings, extra_settings=None):
    """ Trying to login with wrong credentials
    """
    al_settings = dict(AL_SETTINGS)
    al_settings['AUTOLOGIN_PASSWORD'] = 'wrong'
    crawler = make_crawler(settings, **al_settings)
    with MockServer(Login) as s:
        yield crawler.crawl(url=s.root_url)
    spider = crawler.spider
    assert len(spider.visited_urls) == 2
    assert set(spider.visited_urls) == {'/', '/login'}
    response = spider.responses[0]
    assert urlsplit(response.url).path.rstrip('/') == ''
    assert not response.meta['autologin_active']
    assert response.meta['autologin_response']['status'] == 'error'
test_integration.py (project: autologin-middleware, author: TeamHG-Memex)
def test_pass_via_meta(settings):
    crawler = make_crawler(settings, spider_cls=PassMetaSpider,
                           AUTOLOGIN_DOWNLOAD_DELAY=0.01)
    with MockServer(Login) as s:
        yield crawler.crawl(url=s.root_url)
    spider = crawler.spider
    assert len(spider.visited_urls) == 2
    assert set(spider.visited_urls) == {'/', '/hidden'}
test_integration.py (project: autologin-middleware, author: TeamHG-Memex)
def test_login_with_logout(settings, spider_cls=TestSpider):
    """ Login with logout.
    """
    crawler = make_crawler(settings, spider_cls=spider_cls, **AL_SETTINGS)
    with MockServer(LoginWithLogout) as s:
        yield crawler.crawl(url=s.root_url)
    spider = crawler.spider
    mandatory_urls = {'/', '/hidden', '/one', '/two', '/three', '/slow'}
    spider_urls = set(spider.visited_urls)
    assert mandatory_urls.difference(spider_urls) == set()
    assert spider_urls.difference(
        mandatory_urls | {'/l0gout1', '/l0gout2'}) == set()
test_integration.py (project: autologin-middleware, author: TeamHG-Memex)
def test_custom_headers(settings):
    crawler = make_crawler(settings, USER_AGENT='MyCustomAgent', **AL_SETTINGS)
    with MockServer(LoginIfUserAgentOk) as s:
        yield crawler.crawl(url=s.root_url)
    spider = crawler.spider
    assert len(spider.visited_urls) == 2
    assert spider.visited_urls[1] == '/hidden'
test_integration.py (project: autologin-middleware, author: TeamHG-Memex)
def parse(self, response):
        for item in super(StoppingSpider, self).parse(response):
            yield item
        if not self.state.get('was_stopped'):
            self.state['was_stopped'] = True
            self.crawler.stop()
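self.state here is the dictionary that Scrapy's SpiderState extension pickles into JOBDIR between runs, which is what lets test_resume below restart the same crawl. A minimal sketch of that mechanism, assuming the crawl is launched with JOBDIR set (the extension is inactive otherwise; ResumableSpider is an illustrative name):

import scrapy

class ResumableSpider(scrapy.Spider):
    name = 'resumable'
    start_urls = ['http://example.com']

    def parse(self, response):
        # the counter survives a stop/resume cycle because it lives in self.state,
        # which the SpiderState extension persists into JOBDIR
        self.state['pages_seen'] = self.state.get('pages_seen', 0) + 1
        self.logger.info('pages seen so far: %d', self.state['pages_seen'])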
test_integration.py (project: autologin-middleware, author: TeamHG-Memex)
def test_resume(settings):
    crawler = make_crawler(
        settings, spider_cls=StoppingSpider,
        JOBDIR=tempfile.mkdtemp(),
        SCHEDULER_DISK_QUEUE='scrapy.squeues.PickleFifoDiskQueue',
        SCHEDULER_MEMORY_QUEUE='scrapy.squeues.FifoMemoryQueue',
        LOG_UNSERIALIZABLE_REQUESTS=True,
        **AL_SETTINGS)
    with MockServer(Login) as s:
        yield crawler.crawl(url=s.root_url)
        # resuming crawl
        yield crawler.crawl(url=s.root_url)
    spider = crawler.spider
    assert len(spider.visited_urls) == 1
    assert set(spider.visited_urls) == {'/hidden'}
test_integration.py (project: autologin-middleware, author: TeamHG-Memex)
def test_disable_logout(settings):
    crawler = make_crawler(settings, **AL_SETTINGS)
    with MockServer(LoginWithContentAfterLogout) as s:
        yield crawler.crawl(url=s.root_url)
    spider = crawler.spider
    assert set(spider.visited_urls) == {'/', '/hidden'}
    crawler = make_crawler(
        settings, AUTOLOGIN_CHECK_LOGOUT=False, **AL_SETTINGS)
    with MockServer(LoginWithContentAfterLogout) as s:
        yield crawler.crawl(url=s.root_url)
    spider = crawler.spider
    spider_urls = set(spider.visited_urls)
    assert spider_urls == {'/', '/hidden', '/target'}

