__init__.py 文件源码

python
阅读 16 收藏 0 点赞 0 评论 0

项目:calibre_dangdang 作者: qunxyz 项目源码 文件源码
def parse_details_page(url, log, timeout, browser):
    """Download a details page and parse it into an lxml tree.

    The response body is decoded as GB18030, normalised with calibre's
    ``xml_to_unicode`` and parsed with html5lib using the lxml tree builder.

    Args:
        url: Address of the details page to fetch.
        log: Logger-like object; ``error`` and ``exception`` are called on it.
        timeout: Timeout forwarded to ``browser.open_novisit``.
        browser: Object exposing ``open_novisit(url, timeout=...)`` whose
            result has a ``read()`` method returning bytes.

    Returns:
        A ``(original_text, root, selector)`` tuple on success, where
        ``original_text`` is the decoded page before ``xml_to_unicode``,
        ``root`` is the parsed tree and ``selector`` is a ``Select`` built on
        it. ``None`` is returned (after logging) when the download fails, the
        page is a 404, parsing fails, or the page carries an element with
        ``id="errorMessage"``.
    """
    # Imported locally so the timeout check below never depends on the
    # enclosing module having imported ``socket``.
    import socket

    try:
        raw = browser.open_novisit(url, timeout=timeout).read().decode('gb18030').strip()
    except Exception as e:
        if callable(getattr(e, 'getcode', None)) and e.getcode() == 404:
            log.error('URL malformed: %r'%url)
            return
        # Some network errors wrap the underlying cause as their first
        # argument; look there for a socket timeout.
        attr = getattr(e, 'args', [None])
        attr = attr if attr else [None]
        if isinstance(attr[0], socket.timeout):
            msg = 'Amazon timed out. Try again later.'
            log.error(msg)
        else:
            msg = 'Failed to make details query: %r'%url
            log.exception(msg)
        return

    # Parsing dependencies are only needed once the download has succeeded.
    from calibre.ebooks.chardet import xml_to_unicode
    import html5lib
    from lxml.html import tostring

    oraw = raw
    raw = xml_to_unicode(raw, strip_encoding_pats=True, resolve_entities=True)[0]
    if '<title>404 - ' in raw:
        log.error('URL malformed: %r'%url)
        return

    try:
        root = html5lib.parse(raw, treebuilder='lxml',
                              namespaceHTMLElements=False)
    except Exception:
        msg = 'Failed to parse amazon details page: %r'%url
        log.exception(msg)
        return

    errmsg = root.xpath('//*[@id="errorMessage"]')
    if errmsg:
        # ``unicode`` only exists on Python 2; fall back to ``str`` so the
        # text serialisation works on either interpreter.
        try:
            text_type = unicode
        except NameError:
            text_type = str
        msg = 'Failed to parse amazon details page: %r'%url
        # xpath() returns a list; serialise the first matching element.
        msg += tostring(errmsg[0], method='text', encoding=text_type).strip()
        log.error(msg)
        return

    from css_selectors import Select
    selector = Select(root)
    return oraw, root, selector
评论列表
文章目录


问题


面经


文章

微信
公众号

扫码关注公众号