crawlpy_spider.py source code

python

Project: crawlpy    Author: cytopia
# Imports this excerpt relies on (added here; in the repository they live at module top level)
import os
import logging

from scrapy import Request
from scrapy.linkextractors import LinkExtractor

from crawlpy.items import CrawlpyItem  # module path assumed from the standard Scrapy project layout


def parse(self, response):
        """
        Scrapy parse callback
        """

        # Get current nesting level
        curr_depth = response.meta.get('depth', 1)
        if self.config['login']['enabled']:
            curr_depth = curr_depth - 1 # Do not count the login page as nesting depth

        # Store to disk?
        if self.config['store']['enabled']:
            path = response.url.replace('/', '--')  # Replace URL separators so the URL is safe as a file name
            path = os.path.join(self.config['store']['path'], path)
            with open(path, 'wb') as fpointer:
                fpointer.write(response.body)

        # Yield current url item
        item = CrawlpyItem()
        item['url'] = response.url
        item['status'] = response.status
        item['depth'] = curr_depth
        item['referer'] = response.meta.get('referer', '')
        yield item

        # Get all links from the current page
        links = LinkExtractor().extract_links(response)

        # Iterate all found links and crawl them
        for link in links:
            deny = False

            # Check requests to be ignored
            for ignore in self.config['ignores']:
                # The case-insensitive check subsumes the exact match
                if ignore.lower() in link.url.lower():
                    # Ignore pattern found, stop looking into other patterns
                    deny = True
                    break


            # [NO] Max depth exceeded
            if curr_depth >= self.max_depth:
                logging.info('[Not Crawling] Current depth (' + str(curr_depth) + ') exceeds max depth (' + str(self.max_depth) + ')')
            # [NO] Duplicate URL
            elif link.url in self.duplicates:
                logging.info('[Not Crawling] Url already crawled: ' + link.url)
            # [NO] URL denied ('ignore' still holds the matching pattern from the loop above)
            elif deny:
                logging.info('[Not Crawling] Url denied (pattern: "' + ignore + '"): ' + link.url)
            # [OK] Crawl!
            else:
                self.duplicates.append(link.url)
                yield Request(link.url, meta={'depth': curr_depth+1, 'referer': response.url})
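
For context, here is a minimal sketch of the pieces this callback assumes, reconstructed from the attribute and key names used above. The CrawlpyItem fields and the config layout are inferred from the code; the actual definitions in the crawlpy repository may differ.

python
import scrapy


class CrawlpyItem(scrapy.Item):
    """Item with the four fields populated in parse() above (inferred)."""
    url = scrapy.Field()
    status = scrapy.Field()
    depth = scrapy.Field()
    referer = scrapy.Field()


# Hypothetical config dict matching the keys the callback reads (example values):
config = {
    'login': {'enabled': False},   # whether a login step preceded the crawl
    'store': {
        'enabled': True,           # dump each response body to disk
        'path': '/tmp/crawlpy',    # target directory (example value)
    },
    'ignores': ['logout'],         # substrings that exclude a matching link
}

# The spider instance is also expected to carry:
#   self.max_depth  -- maximum nesting level before links stop being followed
#   self.duplicates -- list of already-queued URLs, used for de-duplication

One design note: keeping self.duplicates as a list makes each membership check O(n); a set would be the usual choice for this kind of seen-URL bookkeeping.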