# Assumes an inlineCallbacks-style decorator (e.g. pytest-twisted's) and a
# parametrized ``max_cache`` fixture, both defined elsewhere in the suite;
# the ``yield crawler.crawl(...)`` below needs the Twisted reactor running.
def test_media_pipeline(tmpdir, max_cache):
crawler = make_crawler(FILES_STORE='file://{}'.format(tmpdir),
FILES_MAX_CACHE=max_cache)
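    # Run the crawl against a local mock server that serves the test site.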
with MockServer(WithFile) as s:
root_url = s.root_url
yield crawler.crawl(url=root_url)
spider = crawler.spider
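    # Three pages are expected: the root, '/page?b=2&a=1' and
    # '/another-page' (each looked up via find_item() below).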
assert len(spider.collected_items) == 3
root_item = find_item('/', spider.collected_items)
assert len(root_item['objects']) == 2
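    # The root page exposes two downloadable objects: /file.pdf and
    # /forbidden.pdf. The stored name is presumably hash-based rather than
    # the original filename, hence no '.pdf' suffix on obj_stored_url.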
file_item = find_item(
'/file.pdf', root_item['objects'], 'obj_original_url')
assert file_item['obj_original_url'] == root_url + '/file.pdf'
assert not file_item['obj_stored_url'].endswith('.pdf')
with tmpdir.join(file_item['obj_stored_url']).open('rb') as f:
assert f.read() == FILE_CONTENTS
assert file_item['content_type'] == 'application/pdf'
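    # 'date' and 'server' vary per response, so drop them before comparing.
    # 'content-hype' is presumably a custom header set by the mock to check
    # that arbitrary response headers are recorded.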
headers = dict(file_item['response_headers'])
headers.pop('date')
headers.pop('server')
assert headers == {'content-type': 'application/pdf',
'content-hype': 'very/high'}
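    # The 'forbidden' PDF is downloaded and stored as well; the mock
    # presumably serves doubled contents so the two files are distinguishable.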
forbidden_item = find_item(
'/forbidden.pdf', root_item['objects'], 'obj_original_url')
with tmpdir.join(forbidden_item['obj_stored_url']).open('rb') as f:
assert f.read() == FILE_CONTENTS * 2
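    # The same file requested with a query string should be deduplicated:
    # it must map to the stored copy downloaded earlier.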
page_item = find_item('/page?b=2&a=1', spider.collected_items)
file_item_q = find_item(
'/file.pdf?allow=true', page_item['objects'], 'obj_original_url')
assert file_item_q['obj_stored_url'] == file_item['obj_stored_url']
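    # A repeat request from a different page should likewise reuse the
    # cached stored file while keeping its own original URL.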
another_page_item = find_item('/another-page', spider.collected_items)
file_item_q = find_item(
'/file.pdf', another_page_item['objects'], 'obj_original_url')
assert file_item_q['obj_stored_url'] == file_item['obj_stored_url']
assert file_item_q['obj_original_url'] == file_item['obj_original_url']