# Requires `from bs4 import BeautifulSoup`; `BroadItem` and `obtain_d`
# are the project's own item class and date-extraction helper.
def parse_page(self, response):
    item = BroadItem()
    soup = BeautifulSoup(response.text, "lxml")

    # Clean the <title>: normalize quotes and brackets, then strip
    # all whitespace by splitting and re-joining.
    title = response.xpath('//title/text()').extract()
    if len(title) > 0:
        item['title'] = ''.join(
            title[0].replace('|', ',')
                    .replace('"', '')
                    .replace("'", '')
                    .replace('(', '[')
                    .replace(')', ']')
                    .replace('#', '')
                    .split())
    else:
        item['title'] = ''
    print(item['title'])
    print(response.url)

    item['url'] = response.url
    item['date'] = obtain_d(response)
    print(item['date'])

    # Index each <div> by how many <p> tags it contains; the <div> with
    # the most paragraphs is assumed to hold the article body. Divs with
    # equal paragraph counts overwrite each other, keeping the last seen.
    div_dic = {}
    for div in soup.find_all('div'):
        div_dic[len(div.find_all('p'))] = div

    if len(div_dic) == 0:
        item['content'] = "none"
    else:
        # Sort by paragraph count, descending, and take the richest <div>.
        div_dic = sorted(div_dic.items(), key=lambda d: d[0], reverse=True)
        ps = div_dic[0][1].find_all('p')
        images = div_dic[0][1].find_all('img')

        # Collect absolute image URLs, newline-separated.
        item['image_urls'] = ''
        for img in images:
            try:
                if 'http' in img['src']:
                    item['image_urls'] += img['src'] + '\n'
            except KeyError:
                pass  # <img> with no src attribute

        # Concatenate paragraph text; swap double quotes for two single
        # quotes so the content embeds safely in quoted strings.
        text = ""
        for p in ps:
            text += p.text
        item['content'] = text.replace('"', "''")
    return item
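
# A minimal, self-contained sketch of the "div with the most <p> tags"
# heuristic used above, runnable outside Scrapy. The sample HTML is
# hypothetical; only BeautifulSoup with the lxml parser is required.
from bs4 import BeautifulSoup

html = """
<div class="nav"><p>nav link</p></div>
<div id="article">
  <p>First paragraph of the story.</p>
  <p>Second paragraph of the story.</p>
</div>
"""
soup = BeautifulSoup(html, "lxml")
# Pick the <div> containing the largest number of <p> descendants.
body = max(soup.find_all("div"), key=lambda d: len(d.find_all("p")))
print(body.get("id"))                                # -> article
print(" ".join(p.text for p in body.find_all("p")))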