def parse(self, response):
    """Parse a thepaper.cn news-list page.

    Yields one ``scrapy.Request`` (callback ``parse_news``) per article
    found on the page, then one request for the AJAX "load more" page
    (callback ``next_page_parse``) when its URL template can be located.

    :param response: the list-page HTTP response.
    """
    # NOTE(review): was ``response.body`` (bytes). In Python 3 the
    # ``re.search`` below with a str pattern would raise TypeError on
    # bytes, so the decoded text is used instead.
    html = response.text
    soup = BeautifulSoup(html, "lxml")

    # Crawl every article currently listed on the page.
    for item in self.fetch_newslist(soup):
        request = scrapy.Request(item['news_url'], callback=self.parse_news)
        request.meta['item'] = item
        request.meta['pageindex'] = 1
        yield request

    # Find the timestamp of the last listed article; the site's paging
    # endpoint needs it as ``lastTime``. Falls back to "nothing".
    lasttime = "nothing"
    for node in soup.select('div[class="news_li"]'):
        # BUGFIX: dict.has_key() was removed in Python 3 -- use ``in``.
        if "lasttime" in node.attrs:
            lasttime = node["lasttime"]
            break

    # Extract the AJAX "load more" URL parameters embedded in the page's
    # inline JavaScript, e.g.:
    # load_chosen.jsp?nodeids=25949&topCids=1495258,...,&pageidx=
    load_chosen = re.search(r'data.:."(.*)".+.masonry', html)
    page = 2
    if load_chosen:
        tp_url = "http://www.thepaper.cn/load_chosen.jsp?%s%s&lastTime=%s" % (
            load_chosen.group(1), page, lasttime)
        yield scrapy.Request(tp_url, callback=self.next_page_parse)
# Comment list
# Table of contents