test_media_pipeline.py source code


Project: scrapy-cdr · Author: TeamHG-Memex · Language: Python
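This page shows only the test function itself. The yield-based body implies
the full module drives it through Twisted; a plausible scaffold (decorator
choice, plugin, and parameter values are assumptions here, not the project's
confirmed code) looks like:

import pytest
from twisted.internet.defer import inlineCallbacks

@pytest.mark.parametrize('max_cache', [None, 1])  # assumed values
@inlineCallbacks  # Deferred-returning test, awaited by e.g. pytest-twisted
def test_media_pipeline(tmpdir, max_cache):
    ...  # body as listed below
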
def test_media_pipeline(tmpdir, max_cache):
    crawler = make_crawler(FILES_STORE='file://{}'.format(tmpdir),
                           FILES_MAX_CACHE=max_cache)
    with MockServer(WithFile) as s:
        root_url = s.root_url
        yield crawler.crawl(url=root_url)
    spider = crawler.spider
    assert len(spider.collected_items) == 3

    # The root page links to two downloadable objects.
    root_item = find_item('/', spider.collected_items)
    assert len(root_item['objects']) == 2
    file_item = find_item(
        '/file.pdf', root_item['objects'], 'obj_original_url')
    assert file_item['obj_original_url'] == root_url + '/file.pdf'
    # Stored names are not derived from the original file name, so the
    # .pdf extension is not preserved.
    assert not file_item['obj_stored_url'].endswith('.pdf')
    with tmpdir.join(file_item['obj_stored_url']).open('rb') as f:
        assert f.read() == FILE_CONTENTS
    assert file_item['content_type'] == 'application/pdf'
    # Response headers are recorded on the item; drop the volatile ones
    # before comparing. 'content-hype' is presumably a nonsense header set
    # by the mock server to check that arbitrary headers are passed through.
    headers = dict(file_item['response_headers'])
    headers.pop('date')
    headers.pop('server')
    assert headers == {'content-type': 'application/pdf',
                       'content-hype': 'very/high'}

    # forbidden.pdf is served with different contents (FILE_CONTENTS twice).
    forbidden_item = find_item(
        '/forbidden.pdf', root_item['objects'], 'obj_original_url')
    with tmpdir.join(forbidden_item['obj_stored_url']).open('rb') as f:
        assert f.read() == FILE_CONTENTS * 2

    # The same file fetched under a different URL (extra query string) is
    # deduplicated: both items point at the same stored copy.
    page_item = find_item('/page?b=2&a=1', spider.collected_items)
    file_item_q = find_item(
        '/file.pdf?allow=true', page_item['objects'], 'obj_original_url')
    assert file_item_q['obj_stored_url'] == file_item['obj_stored_url']

    # The same URL reached from another page also reuses the stored copy.
    another_page_item = find_item('/another-page', spider.collected_items)
    file_item_q = find_item(
        '/file.pdf', another_page_item['objects'], 'obj_original_url')
    assert file_item_q['obj_stored_url'] == file_item['obj_stored_url']
    assert file_item_q['obj_original_url'] == file_item['obj_original_url']
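
Helpers such as make_crawler, MockServer, WithFile, FILE_CONTENTS and
find_item are defined elsewhere in the project's test suite and are not
shown on this page. For orientation, a minimal sketch of find_item that is
consistent with the calls above (the real implementation may differ):

def find_item(path, items, key='url'):
    # Return the single item whose URL (stored under ``key``) ends with
    # ``path``; fail loudly if zero or several items match.
    matching = [item for item in items if item[key].endswith(path)]
    assert len(matching) == 1, (path, matching)
    return matching[0]

make_crawler presumably wraps a Scrapy Crawler around the module's test
spider with the CDR media pipeline enabled, forwarding keyword settings
such as FILES_STORE and FILES_MAX_CACHE; MockServer serves the WithFile
resource over local HTTP for the duration of the with block.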