def detail_appreciation(self, all_url, itemi):
    """Download each appreciation page and store its cleaned text on ``itemi``.

    Every entry of ``all_url`` is appended to ``self.site_domain``, fetched
    with ``requests`` and parsed with a ``Selector``.  The ``<p>`` elements
    selected under the ``shangxicont`` block are joined and reduced to text:
    ``<p>`` / ``</p>`` / ``<a href=...>`` / ``</a>`` markup is removed, each
    ``<img>`` tag is replaced by its own ``alt`` text, and straight double
    quotes are replaced by a left curly quote.

    Args:
        all_url: Iterable of URL suffixes appended to ``self.site_domain``.
        itemi: Item supporting key assignment; ``'detail_appreciation_text'``
            is set to the list of cleaned texts, one per URL, in input order.

    Returns:
        None. The result is written into ``itemi``.
    """
    # Compiled once, outside the per-URL loop.
    anchor_open = re.compile(r'(<a\s+href=\s*\".*?\">)')
    # ``[^>]*`` stops at the end of the tag; a greedy ``.*`` would swallow
    # everything up to the last ``>`` on the line, text included.
    img_tag = re.compile(r'<img[^>]*>')
    # No whitespace is required after the closing quote, so ``alt="x">`` and
    # ``alt="x"/>`` are recognised as well.
    alt_attr = re.compile(r'\s+alt=\s*\"(.*?)\"')

    def img_to_alt(match):
        # Replace one <img> tag by its own alt text; a tag without an alt
        # attribute is left untouched.  Returning the text from a function
        # keeps backslashes in the alt text from being read as regex escapes.
        tag = match.group(0)
        found = alt_attr.search(tag)
        return tag if found is None else found.group(1)

    detail_appreciation_container = []
    for url in all_url:
        url = self.site_domain + url
        print('detail_appreciation url : %s' % url)
        # NOTE(review): no timeout is passed, so a stalled server blocks this
        # call indefinitely -- consider adding one.
        html_requests = requests.get(url).text.encode('utf-8')
        html_response = HtmlResponse(url=url, body=html_requests, headers={'Connection': 'close'})
        html_all = Selector(html_response)
        temp = ''.join(html_all.xpath(
            u'//div[@class="main3"]/div[@class="shileft"]/div[@class="shangxicont"]/p[not(@style or contains(text(),"?????"))]').extract())
        # Stay in the interpreter's native ``str`` type: Python 2 needs UTF-8
        # bytes to combine with the byte-string literals below, whereas
        # encoding on Python 3 would make every str-pattern ``re`` call raise
        # TypeError.
        if not isinstance(temp, str):
            temp = temp.encode('utf-8')
        temp = temp.replace('<p>', '').replace('</p>', '').replace('</a>', '')
        temp = anchor_open.sub('', temp)
        if alt_attr.search(temp) is None:
            print('%s have a none img in appricate' % url)
        else:
            temp = img_tag.sub(img_to_alt, temp)
        temp = temp.replace('"', "“")
        detail_appreciation_container.append(temp)
    itemi['detail_appreciation_text'] = detail_appreciation_container
评论列表
文章目录