def parse_news(self, response):
    """Fill in the article-page fields of a news item and yield it.

    The item is taken from ``response.meta["item"]`` when present;
    otherwise a fresh ``NewsItem`` is used.  The page body is parsed with
    BeautifulSoup and the following keys are set on the item:

    * ``referer_web`` -- text of the ``<a id="ne_article_source">`` link
    * ``referer_url`` -- ``href`` of that same link
    * ``comment_num`` -- text of the ``<a class="post_cnum_tie">`` link
      (stored as the raw string, not converted to a number)
    * ``content``     -- stripped text of ``<div class="post_text">``
    * ``crawl_date``  -- the module-level ``NOW`` value

    Any field whose element is absent from the page is set to ``None``.

    Args:
        response: object exposing ``.meta`` (a mapping) and ``.body``
            (the page markup) -- presumably a Scrapy response; confirm
            against the spider that schedules this callback.

    Yields:
        The populated item (exactly one per call).
    """
    item = response.meta.get("item", NewsItem())
    # NOTE(review): no parser is named, so bs4 chooses whichever is
    # installed; consider pinning one if results differ between machines.
    soup = BeautifulSoup(response.body)

    # Look each element up once and guard every field with *its own*
    # element.  Previously comment_num was guarded by the source link,
    # which raised AttributeError when only the source link was present
    # and dropped the count when only the comment link was present.
    source_link = soup.find("a", id="ne_article_source")
    comment_link = soup.find("a", class_="post_cnum_tie")
    body_div = soup.find("div", class_="post_text")

    # TODO: author extraction is not implemented.  The earlier draft read
    # the text of <span class="left"> and applied a regex to it, but the
    # regex's literal prefix is unreadable in this file (encoding damage),
    # so it has to be re-derived from a live page before item["author"]
    # can be populated.

    item["referer_web"] = source_link.text if source_link is not None else None
    item["referer_url"] = source_link.get("href") if source_link is not None else None
    item["comment_num"] = comment_link.text if comment_link is not None else None
    item["content"] = body_div.text.strip() if body_div is not None else None
    item["crawl_date"] = NOW
    yield item
评论列表
文章目录