import re

from bs4 import BeautifulSoup, UnicodeDammit


def pycurl_get_resp(data_buf, headers, payload, resp):
    # Try to read the charset from the Content-Type header first.
    charset = None
    if 'content-type' in headers:
        content_type = headers['content-type'].lower()
        match = re.search(r'charset=(\S+)', content_type)
        if match:
            charset = match.group(1)
            print('Decoding using %s' % charset)
    body = data_buf.getvalue()
    if len(body) == 0:
        data = ''
        charset = 'utf-8'
    else:
        if charset is None:
            # No declared charset: let UnicodeDammit sniff it, trying
            # UTF-8 and the common Chinese encodings first.
            dammit = UnicodeDammit(
                body,
                ["utf-8", "gb2312", "gbk", "big5", "gb18030"],
                smart_quotes_to="html",
            )
            data = dammit.unicode_markup
            charset = dammit.original_encoding
        else:
            data = body.decode(charset, 'ignore')
    # Drop the empty lines the header callback collected.
    headers['content'] = [h for h in headers['content'] if len(h) > 0]
    soup_lxml = BeautifulSoup(data, 'lxml')
    soup_html = BeautifulSoup(data, 'html.parser')
    resp.update({
        'url': payload.get('url'),
        'title': get_title(soup_lxml),
        'links': get_links(soup_lxml),
        'links2': get_links2(soup_lxml),
        'metas': get_metas(soup_lxml),
        'images': get_images(soup_lxml),
        'scripts': get_scripts(soup_lxml),
        'text': get_text(soup_html),
        'data': data,
        'headers': headers,
        'charset': charset,
        'spider': 'pycurl',
        'payload': payload,
    })
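

# A minimal usage sketch (not from the original): drive pycurl_get_resp with
# pycurl, collecting the body into a BytesIO and raw header lines into
# headers['content']. The header-parsing callback below is an assumption about
# how `headers` is populated, inferred from the function's use of
# headers['content-type'] and headers['content'].
import pycurl
from io import BytesIO


def fetch(url):
    data_buf = BytesIO()
    headers = {'content': []}

    def header_function(raw_line):
        # pycurl hands header lines to this callback as bytes; HTTP headers
        # are Latin-1 on the wire.
        line = raw_line.decode('iso-8859-1').strip()
        headers['content'].append(line)
        if ':' in line:
            name, _, value = line.partition(':')
            headers[name.strip().lower()] = value.strip()

    c = pycurl.Curl()
    c.setopt(pycurl.URL, url)
    c.setopt(pycurl.WRITEFUNCTION, data_buf.write)
    c.setopt(pycurl.HEADERFUNCTION, header_function)
    c.setopt(pycurl.FOLLOWLOCATION, True)
    c.perform()
    c.close()

    resp = {}
    pycurl_get_resp(data_buf, headers, {'url': url}, resp)
    return resp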