def parse_news(self, response):
    """Parse a GBK-encoded news article page into a NewsItem.

    Extracts the lead image, source name/link, editor, read/comment
    counts, and body text, then yields the populated item.

    Parameters:
        response: the Scrapy response; ``response.meta['item']`` may
            carry a partially-filled NewsItem from the listing page.

    Yields:
        NewsItem with keys: referer_web, content, referer_url, author,
        crawl_date, pic, comment_num, read_num.
    """
    item = response.meta.get("item", NewsItem())
    # Page is served in GBK; decode before parsing.
    soup = BeautifulSoup(response.body.decode('gbk'))

    # Lead image: first <img> inside <p class="f_center">, if any.
    center = soup.find('p', class_='f_center')
    img = center.find('img') if center else None
    pic = img.get('src') if img else None

    # Original news source (name + link) share one anchor tag.
    source = soup.find('a', id='ne_article_source')
    referer_web = source.text if source else None
    referer_url = source.get('href') if source else None

    editor = soup.find('span', class_='ep-editor')
    author = editor.text if editor else None
    # Editor text sometimes carries a mojibake prefix separated by '?';
    # keep only the trailing part. Guard: author may be None (fixes a
    # TypeError in the original when the ep-editor span is missing).
    if author and u"?" in author:
        author = author.split(u"?")[-1]

    crawl_date = NOW

    # Fix: the original guarded the joincount read with a check on
    # *tiecount*, which crashed (or wrongly returned 0) whenever the
    # two divs did not appear together. Each count now guards itself.
    join_div = soup.find('div', class_='post_comment_joincount')
    read_num = join_div.find('a').text if join_div else 0
    tie_div = soup.find('div', class_='post_comment_tiecount')
    comment_num = tie_div.find('a').text if tie_div else 0

    body_div = soup.find('div', class_='post_text')
    content = body_div.get_text(strip=True) if body_div else None

    item['referer_web'] = referer_web
    item['content'] = content
    item['referer_url'] = referer_url
    item['author'] = author
    item['crawl_date'] = crawl_date
    item['pic'] = pic
    item['comment_num'] = int(comment_num)
    item['read_num'] = int(read_num)
    yield item
# 评论列表 (comment list)
# 文章目录 (article table of contents)