def parse_article(self, response):
    """Parse a Douban movie page and schedule requests for its reviews.

    Extracts the movie name, starring roles and genres from the page, stores
    them (together with the search keyword carried in ``response.meta``) in a
    ``DoubanItem``, and yields one ``Request`` per review link found in the
    review section. The item travels with each request in ``meta['item']``
    and ``self.parse_item`` is used as the callback.

    Args:
        response: The downloaded movie page. ``response.meta`` must contain
            a ``'keyword'`` entry; a missing key raises ``KeyError``.

    Yields:
        A ``Request`` for every review-section link whose URL contains
        ``https://movie.douban.com/review/``.
    """

    def _clean(text, drop_spaces=False):
        # Shared sanitiser for all text fields: trims the ends, turns ',' and
        # ':' into ';' and backslash-escapes both quote characters.
        # NOTE(review): presumably this protects a downstream CSV/SQL writer;
        # confirm against the pipeline.
        cleaned = (
            text.strip()
            .replace(',', ';')
            .replace("'", "\\'")
            .replace('"', '\\"')
            .replace(':', ';')
        )
        if drop_spaces:
            cleaned = cleaned.replace(' ', '')
        return cleaned

    hxs = Selector(response)
    keyword = response.meta['keyword']

    movie_name = hxs.xpath('//*[@id="content"]/h1/span[1]/text()').extract()

    # Calling .xpath() on the selector list runs the relative query against
    # every matched node and flattens the results, so roles from all matched
    # containers are kept; with no match the result is an empty list.
    movie_roles = (
        hxs.xpath('//*[@id="info"]/span[3]/span[2]')
        .xpath('.//*[@rel="v:starring"]/text()')
        .extract()
    )
    movie_classification = hxs.xpath(
        '//span[@property="v:genre"]/text()'
    ).extract()

    douban_item = DoubanItem()
    douban_item['movie_keyword'] = keyword
    # The name fragments are concatenated directly and also lose all spaces;
    # the list-valued fields are joined with ';'.
    douban_item['movie_name'] = _clean(''.join(movie_name), drop_spaces=True)
    douban_item['movie_roles'] = _clean(';'.join(movie_roles))
    douban_item['movie_classification'] = _clean(';'.join(movie_classification))

    article_links = hxs.xpath(
        '//*[@id="review_section"]/div/div/div/h3/a/@href'
    ).extract()
    review_url_prefix = "https://movie.douban.com/review/"
    for link in article_links:
        # Only follow links that point at review pages.
        if review_url_prefix in link:
            # NOTE(review): every request shares the same douban_item
            # instance; confirm parse_item does not mutate it per review.
            yield Request(
                link,
                meta={'item': douban_item},
                callback=self.parse_item,
                cookies=[
                    {
                        'name': 'COOKIE_NAME',
                        'value': 'VALUE',
                        'domain': '.douban.com',
                        'path': '/',
                    },
                ],
            )
评论列表
文章目录