def load_response(self, case_id):
    """Create a Scrapy TextResponse from a previously saved HTML file.

    Args:
        case_id: Identifier used to locate ``<case_id>.html`` inside
            ``self._destdir``.

    Returns:
        TextResponse: Response whose body is the stored page, bound to
        the URL recorded in ``self.raw_item['url']``.
    """
    url = self.raw_item['url']
    request = Request(url=url)
    page_path = os.path.join(self._destdir, case_id + '.html')
    # Context manager guarantees the handle is closed even if decoding
    # raises (the original leaked it on error and rebound the variable
    # to close()'s None return).
    with open(page_path, 'rb') as page:
        body = page.read().decode('utf-8', 'ignore')
    return TextResponse(url=url,
                        request=request,
                        body=body,
                        encoding='utf-8')
# Example source code for the Python class TextResponse()
def test_magic_response2():
    """With magic_response=True the middleware should decode the JSON
    payload: the base64 'body' becomes the response body and the dict
    'headers' replaces the Splash headers."""
    mw = _get_mw()
    request = SplashRequest('http://example.com/', magic_response=True,
                            headers={'foo': 'bar'}, dont_send_headers=True)
    request = mw.process_request(request, None)
    # dont_send_headers=True must keep 'headers' out of the Splash args.
    assert 'headers' not in request.meta['splash']['args']

    payload = {
        'body': base64.b64encode(b"binary data").decode('ascii'),
        'headers': {'Content-Type': 'text/plain'},
    }
    raw_response = TextResponse(
        "http://mysplash.example.com/execute",
        headers={b'Content-Type': b'application/json'},
        body=json.dumps(payload).encode('utf8'))
    processed = mw.process_response(request, raw_response, None)

    assert processed.data == payload
    assert processed.body == b'binary data'
    assert processed.headers == {b'Content-Type': [b'text/plain']}
    assert processed.status == 200
    assert processed.url == "http://example.com/"
def test_magic_response_http_error():
    """A Splash ScriptError wrapping an http404 should surface as a 404
    response with the originally requested URL."""
    mw = _get_mw()
    request = mw.process_request(SplashRequest('http://example.com/foo'), None)

    error_payload = {
        "info": {
            "error": "http404",
            "message": "Lua error: [string \"function main(splash)\r...\"]:3: http404",
            "line_number": 3,
            "type": "LUA_ERROR",
            "source": "[string \"function main(splash)\r...\"]"
        },
        "description": "Error happened while executing Lua script",
        "error": 400,
        "type": "ScriptError"
    }
    splash_response = TextResponse(
        "http://mysplash.example.com/execute",
        headers={b'Content-Type': b'application/json'},
        body=json.dumps(error_payload).encode('utf8'))
    result = mw.process_response(request, splash_response, None)

    assert result.data == error_payload
    assert result.status == 404
    assert result.url == "http://example.com/foo"
def test_magic_response2():
    """Magic response: base64 'body' in the JSON payload becomes the
    response body, and the 'headers' dict replaces the headers."""
    # check 'body' handling and another 'headers' format
    mw = _get_mw()
    req = SplashRequest('http://example.com/', magic_response=True,
                        headers={'foo': 'bar'}, dont_send_headers=True)
    req = mw.process_request(req, None)
    # dont_send_headers=True must keep 'headers' out of the Splash args
    assert 'headers' not in req.meta['splash']['args']
    resp_data = {
        'body': base64.b64encode(b"binary data").decode('ascii'),
        'headers': {'Content-Type': 'text/plain'},
    }
    resp = TextResponse("http://mysplash.example.com/execute",
                        headers={b'Content-Type': b'application/json'},
                        body=json.dumps(resp_data).encode('utf8'))
    resp2 = mw.process_response(req, resp, None)
    assert resp2.data == resp_data
    assert resp2.body == b'binary data'
    assert resp2.headers == {b'Content-Type': [b'text/plain']}
    assert resp2.status == 200
    assert resp2.url == "http://example.com/"
def test_magic_response_http_error():
    """A ScriptError payload reporting http404 should produce a 404
    response whose URL is the original request URL."""
    mw = _get_mw()
    req = SplashRequest('http://example.com/foo')
    req = mw.process_request(req, None)
    # JSON body Splash returns for a Lua script that hit an HTTP 404
    resp_data = {
        "info": {
            "error": "http404",
            "message": "Lua error: [string \"function main(splash)\r...\"]:3: http404",
            "line_number": 3,
            "type": "LUA_ERROR",
            "source": "[string \"function main(splash)\r...\"]"
        },
        "description": "Error happened while executing Lua script",
        "error": 400,
        "type": "ScriptError"
    }
    resp = TextResponse("http://mysplash.example.com/execute",
                        headers={b'Content-Type': b'application/json'},
                        body=json.dumps(resp_data).encode('utf8'))
    resp = mw.process_response(req, resp, None)
    assert resp.data == resp_data
    assert resp.status == 404
    assert resp.url == "http://example.com/foo"
def test_magic_response2():
    """Magic response handling: decode base64 'body' and apply the
    dict-form 'headers' from the Splash JSON payload."""
    # check 'body' handling and another 'headers' format
    mw = _get_mw()
    req = SplashRequest('http://example.com/', magic_response=True,
                        headers={'foo': 'bar'}, dont_send_headers=True)
    req = mw.process_request(req, None)
    # 'headers' must not be forwarded when dont_send_headers=True
    assert 'headers' not in req.meta['splash']['args']
    resp_data = {
        'body': base64.b64encode(b"binary data").decode('ascii'),
        'headers': {'Content-Type': 'text/plain'},
    }
    resp = TextResponse("http://mysplash.example.com/execute",
                        headers={b'Content-Type': b'application/json'},
                        body=json.dumps(resp_data).encode('utf8'))
    resp2 = mw.process_response(req, resp, None)
    assert resp2.data == resp_data
    assert resp2.body == b'binary data'
    assert resp2.headers == {b'Content-Type': [b'text/plain']}
    assert resp2.status == 200
    assert resp2.url == "http://example.com/"
def test_magic_response_http_error():
    """http404 inside a Splash ScriptError maps to response status 404
    and the original request URL."""
    mw = _get_mw()
    req = SplashRequest('http://example.com/foo')
    req = mw.process_request(req, None)
    # Error document as Splash emits it for a failed Lua script
    resp_data = {
        "info": {
            "error": "http404",
            "message": "Lua error: [string \"function main(splash)\r...\"]:3: http404",
            "line_number": 3,
            "type": "LUA_ERROR",
            "source": "[string \"function main(splash)\r...\"]"
        },
        "description": "Error happened while executing Lua script",
        "error": 400,
        "type": "ScriptError"
    }
    resp = TextResponse("http://mysplash.example.com/execute",
                        headers={b'Content-Type': b'application/json'},
                        body=json.dumps(resp_data).encode('utf8'))
    resp = mw.process_response(req, resp, None)
    assert resp.data == resp_data
    assert resp.status == 404
    assert resp.url == "http://example.com/foo"
def run_crawler(base_url, ua, start_date, end_date,
                google_username, google_password):
    """Scrape daily transfer tables between two dates and merge in
    Google Trends data for each player.

    Args:
        base_url: Root URL of the transfer site.
        ua: User-Agent header value sent with each request.
        start_date, end_date: Bounds forwarded to ``date_range``.
        google_username, google_password: Credentials forwarded to
            ``get_trends_data``.

    Returns:
        pd.DataFrame: One row per player per day; transfer columns
        left-merged with trends columns on 'player'. Empty frame if
        the date range is empty.
    """
    frames = []
    for day in date_range(start_date, end_date):
        url = '{0}//transfers/transfertagedetail/statistik/top/land_id_zu/0/land_id_ab/0/leihe//datum/{1}/plus/1'.format(
            base_url, day)
        rqst = requests.get(url, headers={"User-Agent": ua})
        resp = TextResponse(url, body=rqst.content)
        (players, nat, ages, positions, prev_clubs,
         next_clubs, mkt_values, trans_prices) = get_data_lists(resp)
        day_df = get_df(players,
                        nat,
                        ages,
                        positions,
                        prev_clubs,
                        next_clubs,
                        mkt_values,
                        trans_prices,
                        day)
        trends_df = get_trends_data(google_username, google_password,
                                    players, day)
        frames.append(pd.merge(day_df, trends_df, how='left', on='player'))
    # Concatenate once at the end: pd.concat inside the loop recopies the
    # accumulated frame every iteration (quadratic in number of days).
    return pd.concat(frames) if frames else pd.DataFrame()
def process_exception(self, request, exception, spider):
    """For known non-retryable errors, return a stub response for the
    failing proxy instead of letting the retry machinery run."""
    if not isinstance(exception, self.DONT_RETRY_ERRORS):
        return None
    return TextResponse(url=request.meta['proxy'])
def setUp(self):
    """Build the spider plus index/item responses from HTML fixtures."""
    self.spider = MinutesSpider()
    # Read fixtures via context managers: the original open(...).read()
    # calls leaked both file handles.
    with open('./tests/samples/minute_index.html') as index_file:
        index_html = index_file.read()
    with open('./tests/samples/minute_item.html') as item_file:
        item_html = item_file.read()
    self.index = TextResponse(
        url=self.spider.start_urls[0],
        body=index_html,
        encoding='utf-8'
    )
    self.response = TextResponse(
        url="http://mail.camara.rj.gov.br/APL/Legislativos/atas.nsf/" +
        "3f8037c08c436684032577040057cb8c/54ab5cc388ffcda5832580830059b178?OpenDocument",
        body=item_html,
        encoding='utf-8'
    )  # 105ª Sessão Ordinária
def setUp(self):
    """Build the spider plus index/item responses from HTML fixtures."""
    self.spider = AldermanSpider()
    # Read the windows-1252 fixtures via context managers: the original
    # open(...).read() calls leaked both file handles.
    with open('./tests/samples/alderman_index.html', encoding='windows-1252') as index_file:
        index_html = index_file.read()
    with open('./tests/samples/alderman_item.html', encoding='windows-1252') as item_file:
        item_html = item_file.read()
    self.index = TextResponse(
        url=self.spider.start_urls[0],
        body=index_html,
        encoding='utf-8'
    )
    self.item = TextResponse(
        url="http://www.camara.rj.gov.br/vereador_informacoes.php?m1=inform&cvd=24",
        body=item_html,
        encoding='utf-8'
    )  # Carlos Bolsonaro
def dummy_response():
    """Dummy response fixture."""
    from scrapy.http import TextResponse, Request
    target = 'http://www.example.com'
    return TextResponse(
        url=target,
        request=Request(url=target),
        body=TEST_FILE_2,
        encoding='utf-8',
    )
def test_unicode_url():
    """The middleware must round-trip a unicode request URL correctly."""
    mw = _get_mw()
    # note unicode URL
    original = SplashRequest(u"http://example.com/", endpoint='execute')
    splash_req = mw.process_request(original, None)

    payload = json.dumps({'html': '<html><body>Hello</body></html>'})
    # Scrapy doesn't pass the request to the response constructor,
    # so none is supplied here either.
    splash_resp = TextResponse("http://mysplash.example.com/execute",
                               headers={b'Content-Type': b'application/json'},
                               body=payload.encode('utf8'))
    final = mw.process_response(splash_req, splash_resp, None)
    assert final.url == "http://example.com/"
def test_unicode_url():
    """Processing a unicode request URL should restore it on the final
    response after the Splash round trip."""
    mw = _get_mw()
    req = SplashRequest(
        # note unicode URL
        u"http://example.com/", endpoint='execute')
    req2 = mw.process_request(req, None)
    res = {'html': '<html><body>Hello</body></html>'}
    res_body = json.dumps(res)
    response = TextResponse("http://mysplash.example.com/execute",
                            # Scrapy doesn't pass request to constructor
                            # request=req2,
                            headers={b'Content-Type': b'application/json'},
                            body=res_body.encode('utf8'))
    response2 = mw.process_response(req2, response, None)
    assert response2.url == "http://example.com/"
def test_unicode_url():
    """Unicode request URLs must survive the request/response cycle:
    the post-processed response reports the original URL."""
    mw = _get_mw()
    req = SplashRequest(
        # note unicode URL
        u"http://example.com/", endpoint='execute')
    req2 = mw.process_request(req, None)
    res = {'html': '<html><body>Hello</body></html>'}
    res_body = json.dumps(res)
    response = TextResponse("http://mysplash.example.com/execute",
                            # Scrapy doesn't pass request to constructor
                            # request=req2,
                            headers={b'Content-Type': b'application/json'},
                            body=res_body.encode('utf8'))
    response2 = mw.process_response(req2, response, None)
    assert response2.url == "http://example.com/"
def collect(conf, conn):
    """Collect ICD-XX-CM conditions.

    Downloads the 2016 ICD-10-CM code tables, parses ``Tabular.xml`` and
    writes one Record per qualifying ``<diag>`` element.

    Args:
        conf: Configuration passed through to ``record.write``.
        conn: Connection passed through to ``record.write``.
    """
    # For more information see:
    # https://www.cms.gov/Medicare/Coding/ICD10/2016-ICD-10-CM-and-GEMs.html
    URL = 'https://www.cms.gov/Medicare/Coding/ICD10/Downloads/2016-CM-Code-Tables-and-Index.zip'
    FILE = 'Tabular.xml'
    VERSION = 'ICD-10-CM'
    LAST_UPDATED = '2015-10-01'
    # Prepare xml. Avoid shadowing the builtin `zip`, and close both the
    # archive and the member file (the originals were never closed).
    archive_bytes = requests.get(URL).content
    with zipfile.ZipFile(io.BytesIO(archive_bytes)) as archive:
        with archive.open(FILE) as member:
            xml = member.read()
    res = TextResponse(url=URL, body=xml, encoding='utf-8')
    count = 0
    for diag in res.xpath('//diag'):
        # NOTE(review): the original comment said "We need only leafs",
        # but this skips nodes WITHOUT child <diag> elements, i.e. it
        # keeps only non-leaf nodes — confirm which was intended.
        # Behavior preserved as-is.
        childs = diag.xpath('./diag')
        if not childs:
            continue
        # Get data
        data = {
            'name': diag.xpath('./name/text()').extract_first(),
            'desc': diag.xpath('./desc/text()').extract_first(),
            'terms': diag.xpath('.//note/text()').extract(),
            'version': VERSION,
            'last_updated': LAST_UPDATED,
        }
        # Create record
        record = Record.create(URL, data)
        # Write record
        record.write(conf, conn)
        # Log progress every 100 records
        count += 1
        if not count % 100:
            logger.info('Collected %s "%s" conditions', count, record.table)
def test_splash_request():
    """End-to-end check: request preprocessing, response post-processing,
    and .replace() on the resulting SplashTextResponse."""
    mw = _get_mw()
    cookie_mw = _get_cookie_mw()
    original = SplashRequest("http://example.com?foo=bar&url=1&wait=100")
    assert repr(original) == "<GET http://example.com?foo=bar&url=1&wait=100>"

    # --- request preprocessing ---
    processed = cookie_mw.process_request(original, None) or original
    processed = mw.process_request(processed, None) or processed
    assert processed is not None
    assert processed is not original
    assert processed.url == "http://127.0.0.1:8050/render.html"
    assert processed.headers == {b'Content-Type': [b'application/json']}
    assert processed.method == 'POST'
    assert isinstance(processed, SplashRequest)
    assert repr(processed) == "<GET http://example.com?foo=bar&url=1&wait=100 via http://127.0.0.1:8050/render.html>"
    assert json.loads(to_native_str(processed.body)) == {'url': original.url}

    # --- response post-processing ---
    # Scrapy doesn't pass the request to the response constructor.
    raw = TextResponse("http://127.0.0.1:8050/render.html",
                       headers={b'Content-Type': b'text/html'},
                       body=b"<html><body>Hello</body></html>")
    final = mw.process_response(processed, raw, None)
    final = cookie_mw.process_response(processed, final, None)
    assert isinstance(final, scrapy_splash.SplashTextResponse)
    assert final is not raw
    assert final.real_url == processed.url
    assert final.url == original.url
    assert final.body == b"<html><body>Hello</body></html>"
    assert final.css("body").extract_first() == "<body>Hello</body>"
    assert final.headers == {b'Content-Type': [b'text/html']}

    # --- .replace() must keep the subclass and untouched attributes ---
    replaced = final.replace(status=404)
    assert replaced.status == 404
    assert isinstance(replaced, scrapy_splash.SplashTextResponse)
    for attr in ('url', 'real_url', 'headers', 'body'):
        assert getattr(replaced, attr) == getattr(final, attr)
def test_cookies():
    """Cookies reported by concurrent Splash responses must merge into a
    single shared cookiejar, and cookies missing from a later response
    must be dropped from the jar."""
    mw = _get_mw()
    cookie_mw = _get_cookie_mw()

    def request_with_cookies(cookies):
        # Build a SplashRequest carrying `cookies` and run it through
        # both middlewares, mirroring the downloader pipeline order.
        req = SplashRequest(
            'http://example.com/foo',
            endpoint='execute',
            args={'lua_source': 'function main() end'},
            magic_response=True,
            cookies=cookies)
        req = cookie_mw.process_request(req, None) or req
        req = mw.process_request(req, None) or req
        return req

    def response_with_cookies(req, cookies):
        # Simulate a Splash JSON response reporting `cookies` as the
        # browser-side cookie state after the request.
        resp_data = {
            'html': '<html><body>Hello</body></html>',
            'headers': [],
            'cookies': cookies,
        }
        resp = TextResponse(
            'http://mysplash.example.com/execute',
            headers={b'Content-Type': b'application/json'},
            body=json.dumps(resp_data).encode('utf8'))
        resp = mw.process_response(req, resp, None)
        resp = cookie_mw.process_response(req, resp, None)
        return resp

    # Concurent requests
    req1 = request_with_cookies({'spam': 'ham'})
    req2 = request_with_cookies({'bom': 'bam'})
    resp1 = response_with_cookies(req1, [
        {'name': 'spam', 'value': 'ham'},
        {'name': 'spam_x', 'value': 'ham_x'},
    ])
    resp2 = response_with_cookies(req2, [
        {'name': 'spam', 'value': 'ham'},  # because req2 was made after req1
        {'name': 'bom_x', 'value': 'bam_x'},
    ])
    # Both responses must share one jar holding the union of cookies.
    assert resp1.cookiejar is resp2.cookiejar
    cookies = {c.name: c.value for c in resp1.cookiejar}
    assert cookies == {'spam': 'ham', 'spam_x': 'ham_x', 'bom_x': 'bam_x'}

    # Removing already removed
    req1 = request_with_cookies({'spam': 'ham'})
    req2 = request_with_cookies({'spam': 'ham', 'pom': 'pam'})
    resp2 = response_with_cookies(req2, [
        {'name': 'pom', 'value': 'pam'},
    ])
    resp1 = response_with_cookies(req1, [])
    assert resp1.cookiejar is resp2.cookiejar
    cookies = {c.name: c.value for c in resp1.cookiejar}
    assert cookies == {'pom': 'pam'}
def test_cookies():
    """Shared-cookiejar semantics: concurrent responses merge their
    cookies into one jar; a later response omitting a cookie removes it."""
    mw = _get_mw()
    cookie_mw = _get_cookie_mw()

    def request_with_cookies(cookies):
        # SplashRequest with the given cookies, processed by both
        # middlewares as the downloader would.
        req = SplashRequest(
            'http://example.com/foo',
            endpoint='execute',
            args={'lua_source': 'function main() end'},
            magic_response=True,
            cookies=cookies)
        req = cookie_mw.process_request(req, None) or req
        req = mw.process_request(req, None) or req
        return req

    def response_with_cookies(req, cookies):
        # Fake Splash JSON response advertising `cookies` as the current
        # browser cookie state.
        resp_data = {
            'html': '<html><body>Hello</body></html>',
            'headers': [],
            'cookies': cookies,
        }
        resp = TextResponse(
            'http://mysplash.example.com/execute',
            headers={b'Content-Type': b'application/json'},
            body=json.dumps(resp_data).encode('utf8'))
        resp = mw.process_response(req, resp, None)
        resp = cookie_mw.process_response(req, resp, None)
        return resp

    # Concurent requests
    req1 = request_with_cookies({'spam': 'ham'})
    req2 = request_with_cookies({'bom': 'bam'})
    resp1 = response_with_cookies(req1, [
        {'name': 'spam', 'value': 'ham'},
        {'name': 'spam_x', 'value': 'ham_x'},
    ])
    resp2 = response_with_cookies(req2, [
        {'name': 'spam', 'value': 'ham'},  # because req2 was made after req1
        {'name': 'bom_x', 'value': 'bam_x'},
    ])
    # One shared jar containing the union of both responses' cookies.
    assert resp1.cookiejar is resp2.cookiejar
    cookies = {c.name: c.value for c in resp1.cookiejar}
    assert cookies == {'spam': 'ham', 'spam_x': 'ham_x', 'bom_x': 'bam_x'}

    # Removing already removed
    req1 = request_with_cookies({'spam': 'ham'})
    req2 = request_with_cookies({'spam': 'ham', 'pom': 'pam'})
    resp2 = response_with_cookies(req2, [
        {'name': 'pom', 'value': 'pam'},
    ])
    resp1 = response_with_cookies(req1, [])
    assert resp1.cookiejar is resp2.cookiejar
    cookies = {c.name: c.value for c in resp1.cookiejar}
    assert cookies == {'pom': 'pam'}
def test_splash_request():
    """Full round trip: a SplashRequest is rewritten into a POST to the
    Splash endpoint, the Splash response is converted back into a
    SplashTextResponse for the original URL, and .replace() preserves
    the subclass."""
    mw = _get_mw()
    cookie_mw = _get_cookie_mw()
    req = SplashRequest("http://example.com?foo=bar&url=1&wait=100")
    assert repr(req) == "<GET http://example.com?foo=bar&url=1&wait=100>"
    # check request preprocessing
    req2 = cookie_mw.process_request(req, None) or req
    req2 = mw.process_request(req2, None) or req2
    assert req2 is not None
    assert req2 is not req
    assert req2.url == "http://127.0.0.1:8050/render.html"
    assert req2.headers == {b'Content-Type': [b'application/json']}
    assert req2.method == 'POST'
    assert isinstance(req2, SplashRequest)
    assert repr(req2) == "<GET http://example.com?foo=bar&url=1&wait=100 via http://127.0.0.1:8050/render.html>"
    # the original URL travels in the JSON POST body
    expected_body = {'url': req.url}
    assert json.loads(to_native_str(req2.body)) == expected_body
    # check response post-processing
    response = TextResponse("http://127.0.0.1:8050/render.html",
                            # Scrapy doesn't pass request to constructor
                            # request=req2,
                            headers={b'Content-Type': b'text/html'},
                            body=b"<html><body>Hello</body></html>")
    response2 = mw.process_response(req2, response, None)
    response2 = cookie_mw.process_response(req2, response2, None)
    assert isinstance(response2, scrapy_splash.SplashTextResponse)
    assert response2 is not response
    assert response2.real_url == req2.url
    assert response2.url == req.url
    assert response2.body == b"<html><body>Hello</body></html>"
    assert response2.css("body").extract_first() == "<body>Hello</body>"
    assert response2.headers == {b'Content-Type': [b'text/html']}
    # check .replace method
    response3 = response2.replace(status=404)
    assert response3.status == 404
    assert isinstance(response3, scrapy_splash.SplashTextResponse)
    for attr in ['url', 'real_url', 'headers', 'body']:
        assert getattr(response3, attr) == getattr(response2, attr)