def parse_results_page(self, root): # {{{
    """Collect product page URLs from a parsed search-results page.

    ``root`` is the parsed results document. Only its ``xpath()`` method is
    used, and each matched anchor only needs ``get('href')``.

    Returns a list of at most five absolute product URLs, in the order the
    anchors appear in the xpath result. No title-based filtering is applied.
    """
    site = 'http://product.dangdang.com'
    matches = []
    # The xpath already requires @href, so a.get('href') is never None here.
    for a in root.xpath(r'//li[starts-with(@class, "line")]//a[@href and contains(@name, "itemlist-picture")]'):
        url = a.get('href')
        if url.startswith('//'):
            # Protocol-relative link ("//host/path"): it already names its
            # host, so only the scheme is missing. This must be checked
            # before the single-slash case, which it would also satisfy.
            url = 'http:' + url
        elif url.startswith('/'):
            # Root-relative link: the href carries its own leading slash,
            # so append it to the bare host to avoid a "//" in the path.
            url = site + url
        matches.append(url)
    # Keep only the top 5 matches; per the original author's note the results
    # are sorted by relevance, so lower matches are unlikely to be relevant.
    return matches[:5]
# }}}
评论列表
文章目录