def parse_news(self, response):
    """Parse a news detail page and yield a populated NewsItem.

    Fills in ``news_date``, ``content``, ``referer_web``, ``crawl_date``
    and ``news_no``. If ``judge_news_crawl`` rejects the item (e.g. it is
    too old), store the current page index in ``self.flag`` so pagination
    can stop. Pages with no parsable date are logged and skipped.
    """
    item = response.meta.get("item", NewsItem())
    pageindex = response.meta.get("pageindex", 1)
    soup = BeautifulSoup(response.body)

    # Publication date lives in the "arial" span; it is missing on some
    # pages, e.g. http://info.meadin.com/PictureNews/2938_1.shtml
    date_tag = soup.find("span", class_="arial")
    news_date = date_tag.text if date_tag else None
    if not news_date:
        logger.warning("can't find news_date.the url is %s" % response.url)
        return

    # Original source site name is the last text fragment of the source <p>.
    source_tag = soup.find("p", class_="source")
    referer_web = list(source_tag.strings)[-1] if source_tag else None

    content = None
    art = soup.find("div", class_="article js-article")
    if art:
        # Drop the intro/summary block so only the article body remains.
        # Guarded: some pages have no intro div, which previously raised
        # AttributeError on .replace_with().
        intro = art.find("div", class_="intro")
        if intro:
            intro.replace_with("")
        content = art.text.strip()

    # e.g. ".../2938_1.shtml" -> "2938"
    news_no = response.url.split("/")[-1].split("_")[0]

    item["news_date"] = news_date
    item["content"] = content
    item["referer_web"] = referer_web
    item["crawl_date"] = NOW
    item["news_no"] = news_no

    item = judge_news_crawl(item)
    if item:
        yield item
    else:
        # Item rejected (e.g. too old): remember where to stop paginating.
        self.flag = pageindex
# 评论列表 (comment list) — stray scraped page text, not code
# 文章目录 (article table of contents) — stray scraped page text, not code