def parse_details_page(url, log, timeout, browser):
    """Fetch a details page and parse it into an lxml tree.

    The response body is decoded as gb18030, normalised with calibre's
    ``xml_to_unicode`` and parsed with html5lib using the lxml tree builder.

    :param url: Address of the details page to fetch.
    :param log: Logger-like object; ``error()`` and ``exception()`` are used.
    :param timeout: Timeout passed through to ``browser.open_novisit``.
    :param browser: Object exposing ``open_novisit(url, timeout=...)`` whose
        result has a ``read()`` method returning bytes.
    :return: ``(oraw, root, selector)`` on success, where ``oraw`` is the
        decoded and stripped page text before normalisation, ``root`` is the
        parsed tree and ``selector`` is a ``css_selectors.Select`` built on
        it. ``None`` is returned (after logging) on any failure.
    """
    # Imported locally so the timeout check below never depends on a
    # module-level import being present.
    import socket

    try:
        raw = browser.open_novisit(url, timeout=timeout).read().decode('gb18030').strip()
    except Exception as e:
        # HTTP errors expose getcode(); a 404 means the URL itself is bad.
        if callable(getattr(e, 'getcode', None)) and e.getcode() == 404:
            log.error('URL malformed: %r'%url)
            return
        # A timeout arrives wrapped as the first argument of the raised
        # exception; fall back to [None] when args is missing or empty.
        args = getattr(e, 'args', None) or [None]
        if isinstance(args[0], socket.timeout):
            log.error('Amazon timed out. Try again later.')
        else:
            log.exception('Failed to make details query: %r'%url)
        return

    # Parser dependencies are only needed once there is a page to parse.
    import html5lib
    from calibre.ebooks.chardet import xml_to_unicode
    from lxml.html import tostring

    oraw = raw
    raw = xml_to_unicode(raw, strip_encoding_pats=True, resolve_entities=True)[0]
    if '<title>404 - ' in raw:
        log.error('URL malformed: %r'%url)
        return

    try:
        root = html5lib.parse(raw, treebuilder='lxml',
                              namespaceHTMLElements=False)
    except Exception:
        # Not a bare except: KeyboardInterrupt/SystemExit must propagate.
        log.exception('Failed to parse amazon details page: %r'%url)
        return

    errmsg = root.xpath('//*[@id="errorMessage"]')
    if errmsg:
        msg = 'Failed to parse amazon details page: %r'%url
        # xpath() returns a list; serialise the first matching element.
        # encoding='unicode' yields text on both Python 2 and Python 3.
        msg += tostring(errmsg[0], method='text', encoding='unicode').strip()
        log.error(msg)
        return

    from css_selectors import Select
    selector = Select(root)
    return oraw, root, selector
评论列表
文章目录