# studios.py -- Python source code snippet

def _parse_contents(self, response):
        """Re-queue Wix pages through their ``_escaped_fragment_`` URL.

        Wix pages aren't really parseable as served, so whenever a response
        references Wix static assets (and its URL does not already contain
        ``_escaped_fragment_``), a request for the same URL plus an empty
        ``_escaped_fragment_`` query parameter is returned to get the real
        HTML source.

        Args:
            response: The crawled response. Its ``body``, ``url`` and ``meta``
                are read; ``meta['depth']`` is decremented when a re-request
                is issued.

        Returns:
            A one-element list holding the follow-up ``scrapy.Request``
            (callback ``self.parse``) for Wix pages, otherwise ``None``.
        """
        # The marker is a bytes literal: Scrapy exposes ``response.body`` as
        # bytes, and on Python 2 a bytes literal is a plain ``str``, so this
        # membership test works on both major versions.
        if b'https://static.wixstatic.com/' in response.body and '_escaped_fragment_' not in response.url:
            parsed_url = urlparse(response.url)
            # keep_blank_values so parameters such as ``?flag=`` survive the
            # round trip instead of being silently dropped.
            qs = parse_qs(parsed_url.query, keep_blank_values=True)
            qs['_escaped_fragment_'] = ''
            wix_scrapeable_url = urlunparse((
                parsed_url.scheme,
                parsed_url.netloc,
                parsed_url.path,
                parsed_url.params,
                # parse_qs() maps each key to a *list* of values; doseq=True
                # expands those lists back into ``key=value`` pairs instead of
                # URL-encoding the list's repr.
                urlencode(qs, doseq=True),
                parsed_url.fragment,
            ))
            # Re-run at "depth - 1" (see docstring). Default to 0 so a response
            # whose depth has not been recorded yet does not raise KeyError.
            # NOTE(review): presumably this offsets the depth increment applied
            # to follow-up requests -- confirm against the depth middleware.
            response.meta['depth'] = response.meta.get('depth', 0) - 1
            return [scrapy.Request(wix_scrapeable_url, self.parse)]

        # NOTE(review): this early return makes the keyword scan below
        # unreachable. It looks like debugging code that was switched off
        # (its only effect would be the log line); confirm before removing
        # either the return or the dead code.
        return
        if not hasattr(response, 'selector'):
            logging.info('Skipping unknown file from: %s', response.url)
            return
        # Get all text contents of tags (unless they are script or style tags)
        text_contents = ' '.join(response.selector.xpath('//*[not(self::script|self::style)]/text()').extract()).lower()

        processed_text = event_classifier.StringProcessor(text_contents, regex_keywords.WORD_BOUNDARIES)
        wrong = processed_text.get_tokens(keywords.DANCE_WRONG_STYLE)
        good = processed_text.get_tokens(rules.STREET_STYLE)
        if (wrong or good):
            #print response.url, set(wrong), set(good)
            pass