def parse_news(self, response):
    """Parse a news detail page and yield the populated item.

    The item is taken from ``response.meta["item"]`` when present,
    otherwise a fresh ``NewsItem`` is used. The fields ``pic``,
    ``referer_web``, ``author`` and ``content`` are extracted from the
    page markup; each is set to ``None`` when the corresponding element
    is missing. ``crawl_date`` is set to ``NOW``, a name defined outside
    this block.

    Args:
        response: The downloaded page. Its ``body`` must be UTF-8 encoded
            bytes and its ``meta`` a mapping that may carry a partially
            filled item under the key ``"item"``.

    Yields:
        The item with the extracted fields filled in.

    Raises:
        UnicodeDecodeError: If ``response.body`` is not valid UTF-8.
    """
    item = response.meta.get("item", NewsItem())

    # Decode once and hand BeautifulSoup text, so it does not have to
    # re-detect the encoding from re-encoded bytes.
    soup = BeautifulSoup(response.body.decode("utf-8"), "lxml")

    # Picture: <p class="detailPic"><img src="..."></p>. Either level may
    # be absent, so guard both before reading the attribute.
    pic = None
    pic_holder = soup.find("p", class_="detailPic")
    if pic_holder is not None:
        img = pic_holder.find("img")
        if img is not None:
            pic = img.get("src")

    # Source label: <span class="ss03">...</span>.
    referer_tag = soup.find("span", class_="ss03")
    referer_web = referer_tag.text if referer_tag is not None else None

    # Author: <span itemprop="author"><span>name</span></span>.
    author = None
    author_holder = soup.find("span", itemprop="author")
    if author_holder is not None:
        author_tag = author_holder.find("span")
        if author_tag is not None:
            author = author_tag.text

    # Body text: paragraphs inside <div id="main_content">, separated by
    # blank lines. A container without any <p> yields None instead of
    # failing while iterating over a missing paragraph list.
    content = None
    main = soup.find("div", id="main_content")
    if main is not None:
        paragraphs = main.find_all("p")
        if paragraphs:
            content = "\n\n".join(p.text.strip() for p in paragraphs)

    item['pic'] = pic
    item['referer_web'] = referer_web
    item['author'] = author
    item['content'] = content
    item['crawl_date'] = NOW
    yield item
评论列表
文章目录