def parse_news(self, response):
    """Parse one news-article page and yield a NewsItem.

    Stop-marker logic (reconstructed from the code; original docstring was
    mojibake -- TODO confirm against the spider's start_requests):
      - ``self.flag[flag_id]`` is falsy (0/None) while topic ``flag_id`` is
        still being crawled.
      - When an article older than ``self.end_day`` days is seen, the current
        page number is stored in ``self.flag[flag_id]`` as a stop marker;
        afterwards only responses from that same page number are still
        processed, everything else is dropped.

    :param response: scrapy Response; ``meta`` carries ``topic_id`` and
        ``PageNumber`` set by the requesting callback.
    :yields: NewsItem for articles newer than the cutoff.
    """
    page_key = response.meta.get("topic_id")
    page_number = response.meta.get("PageNumber")
    # Topic ids are offset by a site-specific base; the remainder indexes
    # self.flag. NOTE(review): magic base 40037910 inherited from the site's
    # URL scheme -- confirm if the site changes its id space.
    flag_id = str(int(page_key) - 40037910)
    soup = BeautifulSoup(response.body, "lxml")

    # Article date, e.g. "2016-07-13"; pages without a <time> tag yield None.
    time_tag = soup.find("time")
    news_date = time_tag.text if time_tag else None

    # Drop the response when a stop marker exists for this topic and this
    # response belongs to a different page than the one that set the marker.
    if self.flag[flag_id] and int(page_number) != self.flag[flag_id]:
        return

    if news_date is None:
        # Original code crashed here (strptime(None, ...) -> TypeError).
        # Without a date we cannot apply the cutoff, so skip this article.
        return

    struct_date = datetime.datetime.strptime(news_date, "%Y-%m-%d")
    delta = self.end_now - struct_date
    if delta.days > self.end_day:
        # Too old: record the stop marker (flag_id is already a str,
        # the redundant str() of the original is dropped).
        self.flag[flag_id] = int(page_number)
        # raise CloseSpider('today scrapy end')
    else:
        head = soup.find("div", class_="post-head")
        topic, title, abstract = None, None, None
        if head:
            category = head.find("span", class_="category")
            topic = category.text if category else None
            h1 = head.find("h1", class_="h1")
            title = h1.text if h1 else None
            kicker = head.find("span", class_="kicker")
            abstract = kicker.text if kicker else None
        body_div = soup.find("div", class_="post-body clearfix")
        content = body_div.text if body_div else None
        # Article number is the last path segment, query string stripped.
        news_no = response.url.split("/")[-1].split("?")[0]
        # TODO: some content is rendered by JS and not captured here
        #       (translated from the original mojibake TODO).
        # NOTE(review): catalogue value below is mis-encoded in the source
        # ('????'); the original Chinese label is unrecoverable from here,
        # kept byte-for-byte -- TODO restore from VCS history.
        item = NewsItem(title=title, topic=topic,
                        abstract=abstract, news_date=news_date,
                        content=content, news_no=news_no,
                        crawl_date=NOW, news_url=response.url,
                        catalogue='????')
        yield item
# Stray page text from the original paste (not code):
# 评论列表 = "comment list", 文章目录 = "article table of contents"