def select_url(url, html, fruitline_spider_variable):
    """Collect the not-yet-crawled links found in an HTML page.

    The page is URL-unquoted and parsed, its links are made absolute
    against ``url``, de-duplicated, passed through ``repair_url`` and then
    filtered.

    Args:
        url: Address of the page; used as the base for relative links.
        html: Raw page content. Missing content or content shorter than
            10 characters is treated as having no links.
        fruitline_spider_variable: Spider state object. This function reads
            its ``filter_rule`` (a regular expression string, ``""`` meaning
            "no filter") and ``crawled_url_queue`` (a container supporting
            ``in``) and forwards the object to ``repair_url``.

    Returns:
        A list of ``{'method': "get", 'url': <full url>}`` dicts, one per
        accepted link. An empty list is returned when the page is too
        short or cannot be parsed (the parse error is logged, not raised).
    """
    # The length must be compared, not the string itself: ``html < 10``
    # is never true for a str on Python 2 and raises TypeError on Python 3.
    # ``not html`` keeps the old "None yields []" outcome.
    if not html or len(html) < 10:
        return []

    try:
        html_element = document_fromstring(urllib2.unquote(html))
        html_element.make_links_absolute(url)
        # Index 2 of each iterlinks() item is taken as the link target
        # (presumably lxml's (element, attribute, link, pos) tuples --
        # confirm against the parser actually imported by this module).
        links = [item[2] for item in html_element.iterlinks()]
    except Exception as e:
        # Best effort: a page that cannot be parsed contributes no links.
        spider_logger.error("Function: select_url, Info: %s" % str(e))
        return []

    unique_links = set(links)
    if not unique_links:
        # Nothing to filter; also avoids touching the filter rule at all.
        return []

    # Compile the filter once instead of once per link.
    filter_rule = fruitline_spider_variable.filter_rule
    pattern = re.compile(filter_rule) if filter_rule != "" else None

    final_links = []
    for link in unique_links:
        full_url = repair_url(link, fruitline_spider_variable)
        # With a filter configured, the URL must match from its start.
        if pattern is not None and not pattern.match(full_url):
            continue
        if full_url in fruitline_spider_variable.crawled_url_queue:
            continue
        final_links.append({'method': "get", 'url': full_url})
    return final_links
评论列表
文章目录