def parse(self, response):
    """Build the item for an ergeng video page and request its player data.

    Page-level metadata (link, title, intro, creation date, author) is
    collected into ``item['info']``; any field that cannot be found on the
    page is stored as an empty string. The item is then attached to the
    ``meta`` of a form request aimed at the player chosen by
    ``__get_player``.

    Args:
        response: The downloaded video page.

    Yields:
        A single ``scrapy.FormRequest`` to ``player.url`` whose callback is
        ``player.parse_video``. Nothing is yielded (and an error is logged)
        when no player matches the page URL.
    """
    item = MultimediaCrawlerItem()
    item['host'] = 'ergeng'
    item['media_type'] = 'video'
    item['stack'] = []
    item['download'] = 0
    item['extract'] = 0
    item['file_dir'] = os.path.join(settings['FILES_STORE'], item['media_type'], self.name)
    item['url'] = response.url

    # Search the decoded text, not response.body: matching a str pattern
    # against bytes raises TypeError on Python 3, and the decoded text keeps
    # the author consistent with the XPath-extracted (unicode) fields.
    page_text = response.text
    created_match = re.search(r'"create_at"\s*:\s*(\d+),', page_text)
    author_match = re.search(r'"user_nickname"\s*:\s*"(.*?)"', page_text)

    if created_match is not None:
        # NOTE(review): "create_at" is presumably epoch seconds — confirm.
        # The date is rendered in the machine's local timezone.
        created_on = time.strftime('%Y-%m-%d', time.localtime(int(created_match.group(1))))
    else:
        created_on = ''

    title = response.xpath(r'//div[contains(@class, "new-video-info")]/h3/text()').extract_first(default='')
    intro = response.xpath(r'//div[contains(@class, "tj")]/text()').extract_first(default='')

    item['info'] = {
        'link': item['url'],
        'title': title.strip(),
        'intro': intro.strip(),
        'date': created_on,
        # Fall back to '' (not None) so every missing field looks the same.
        'author': author_match.group(1) if author_match is not None else '',
    }

    # NOTE(review): the player is expected to expose url, method, params and
    # parse_video, as used below; its definition is outside this block.
    player = self.__get_player(item['url'], response)
    if player is None:
        self.logger.error('url: {}, error: does not match any player'.format(item['url']))
        return

    yield scrapy.FormRequest(
        url=player.url,
        method=player.method,
        meta={'item': item},
        formdata=player.params,
        callback=player.parse_video,
    )
评论列表
文章目录