__init__.py 文件源码-python代码片段

__init__.py 文件源码

python

阅读 25 收藏 0 点赞 0 评论 0

项目：osp-scraper 作者: opensyllabus 项目源码文件源码

def extract_links(self, response):
        """Generate (url, source_anchor) tuples extracted from the page.

        Two kinds of elements are scanned, in this order:

        * ``<a>`` elements: the ``href`` is resolved against the response
          and the anchor is the whitespace-stripped text of the link and
          all of its descendants, joined with single spaces (``""`` when
          the link contains no text).
        * ``<frame>`` / ``<iframe>`` elements (all frames first, then all
          iframes): the ``src`` is resolved against the response and the
          anchor is the element's ``name`` attribute, or ``None`` when the
          element has no ``name``.

        Elements that lack the ``href`` / ``src`` attribute altogether are
        skipped, and only resolved URLs starting with ``http`` are yielded
        (so no ``javascript:`` or ``mailto:`` targets).

        Args:
            response: the page response; must provide ``css()`` and
                ``urljoin()``.

        Yields:
            ``(url, anchor)`` tuples, one per qualifying element.
        """

        for link in response.css('a'):
            href = link.xpath('@href').extract_first()

            # An <a> without href (e.g. a named anchor) is not a link.
            # Joining None would resolve to the page's own base URL and
            # produce a bogus self-link, so skip it up front.
            if href is None:
                continue

            # resolve the href against the current response
            url = response.urljoin(href)

            # Only follow http(s) URLs (i.e., no `javascript:` or `mailto:`).
            if not url.startswith('http'):
                continue

            # merge text content of all child nodes of the link
            anchor = " ".join(
                s.strip() for s in link.css('*::text').extract() if s.strip()
            )

            yield (url, anchor)

        for frame in (response.css("frame") + response.css("iframe")):
            src = frame.css("::attr(src)").extract_first()

            # Same reasoning as above: a frame without src has no target.
            if src is None:
                continue

            url = response.urljoin(src)

            if not url.startswith("http"):
                continue

            # Frames have no link text; the name attribute (possibly None)
            # serves as the anchor.
            anchor = frame.css("::attr(name)").extract_first()

            yield (url, anchor)