def _parse_contents(self, response):
    """Re-queue Wix pages through their escaped-fragment URL.

    Wix pages aren't really parseable as fetched, so whenever the body
    references the Wix static host (and the URL has not already been
    rewritten) the same URL is requested again with an empty
    ``_escaped_fragment_`` query parameter to get the real HTML source.

    Args:
        response: The response being processed. Its ``body``, ``url`` and
            ``meta['depth']`` are read; ``meta['depth']`` is decremented
            when the page is re-queued.

    Returns:
        A one-element list holding the follow-up ``scrapy.Request``
        (with ``self.parse`` as its callback) for a Wix page, otherwise
        ``None``.
    """
    # Response.body is raw bytes, so search it with a bytes literal (on
    # Python 2 this is identical to the plain str literal).
    is_wix_page = b'https://static.wixstatic.com/' in response.body
    if is_wix_page and '_escaped_fragment_' not in response.url:
        parsed_url = urlparse(response.url)
        # keep_blank_values: parameters such as "?foo=" must survive the
        # round trip instead of being silently dropped.
        qs = parse_qs(parsed_url.query, keep_blank_values=True)
        qs['_escaped_fragment_'] = ''
        # parse_qs maps each key to a *list* of values; doseq=True makes
        # urlencode emit one key=value pair per item rather than the quoted
        # repr of the list (e.g. "a=%5B%271%27%5D").
        wix_scrapeable_url = urlunparse((
            parsed_url.scheme,
            parsed_url.netloc,
            parsed_url.path,
            parsed_url.params,
            urlencode(qs, doseq=True),
            parsed_url.fragment,
        ))
        # Re-run at depth-1: step the recorded depth back before re-queueing.
        response.meta['depth'] -= 1
        return [scrapy.Request(wix_scrapeable_url, self.parse)]

    # NOTE(review): this unconditional return short-circuits the keyword
    # analysis below, which looks like disabled debugging code. Confirm the
    # intent before removing either the return or the code after it.
    return

    if not hasattr(response, 'selector'):
        logging.info('Skipping unknown file from: %s', response.url)
        return
    # Get all text contents of tags (unless they are script or style tags)
    text_contents = ' '.join(response.selector.xpath('//*[not(self::script|self::style)]/text()').extract()).lower()
    processed_text = event_classifier.StringProcessor(text_contents, regex_keywords.WORD_BOUNDARIES)
    wrong = processed_text.get_tokens(keywords.DANCE_WRONG_STYLE)
    good = processed_text.get_tokens(rules.STREET_STYLE)
    if (wrong or good):
        #print response.url, set(wrong), set(good)
        pass
评论列表
文章目录