def extract_links(self, response):
    """Generate (url, source_anchor) tuples extracted from the page.

    Two kinds of elements are inspected, in this order:

    1. ``<a>`` elements: the ``href`` is resolved against the response and
       the anchor is the whitespace-normalised text of the link and all of
       its descendants (an empty string when the link has no text).
    2. ``<frame>`` elements followed by ``<iframe>`` elements: the ``src``
       is resolved against the response and the anchor is the element's
       ``name`` attribute (``None`` when the attribute is absent).

    Elements that lack the ``href``/``src`` attribute are skipped, and only
    absolute ``http://`` / ``https://`` URLs are yielded.

    Args:
        response: Page object exposing ``css()`` and ``urljoin()``; the
            selectors returned by ``css()`` must expose ``xpath()``,
            ``css()``, ``extract()`` and ``extract_first()``.

    Yields:
        ``(url, anchor)`` tuples, one per followable link or frame.
    """
    # A tuple prefix keeps look-alike schemes such as ``httpx:`` out, as well
    # as ``javascript:`` and ``mailto:``.
    http_prefixes = ('http://', 'https://')

    for link in response.css('a'):
        href = link.xpath('@href').extract_first()
        # Without an href (e.g. a named anchor) there is nothing to follow;
        # joining a missing value would merely echo the page's own URL.
        if href is None:
            continue
        url = response.urljoin(href)
        if url.startswith(http_prefixes):
            # Merge the text content of all child nodes of the link,
            # dropping whitespace-only fragments.
            fragments = (s.strip() for s in link.css('*::text').extract())
            anchor = " ".join(text for text in fragments if text)
            yield (url, anchor)

    for frame in (response.css("frame") + response.css("iframe")):
        relative_url = frame.css("::attr(src)").extract_first()
        # Frames without a src do not point anywhere; skip them for the
        # same reason as href-less anchors above.
        if relative_url is None:
            continue
        url = response.urljoin(relative_url)
        if url.startswith(http_prefixes):
            # Frames have no link text; their name is the closest analogue.
            anchor = frame.css("::attr(name)").extract_first()
            yield (url, anchor)
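# Scraped page residue, not code: "评论列表" (comment list)
# Scraped page residue, not code: "文章目录" (article table of contents)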