__init__.py 文件源码

python
阅读 22 收藏 0 点赞 0 评论 0

项目:calibre_dangdang 作者: qunxyz 项目源码 文件源码
def parse_results_page(self, root):  # {{{
        """Collect product page URLs from a parsed search-results page.

        Every anchor that carries an ``href`` and whose ``name`` contains
        ``itemlist-picture``, located under an ``<li>`` whose class starts
        with ``line``, is treated as a link to one search hit.

        :param root: Parsed results document. It must provide an ``xpath()``
            method whose results support ``get('href')`` (presumably an
            lxml HTML tree -- confirm against the caller).
        :return: A list of at most five absolute product URLs, in the order
            the anchors appear in the document.
        """
        site = 'http://product.dangdang.com'
        anchors = root.xpath(r'//li[starts-with(@class, "line")]//a[@href and contains(@name, "itemlist-picture")]')

        matches = []
        for a in anchors:
            url = a.get('href')
            if url.startswith('//'):
                # Protocol-relative link: the host is already part of the
                # href, so only the scheme is missing. Prefixing the site
                # root here would yield a URL with two hosts in it.
                url = 'http:' + url
            elif url.startswith('/'):
                # Site-relative path: the href already brings its leading
                # slash, so the site root is joined without adding another.
                url = site + url
            matches.append(url)

        # Keep only the first five links. This assumes the results page
        # lists hits by decreasing relevance, so later ones are unlikely
        # to be useful.
        return matches[:5]
    # }}}
评论列表
文章目录


问题


面经


文章

微信
公众号

扫码关注公众号