def next_page_parse(self, response):
    """Parse one page of a paginated news list.

    Yields a Request per news item (item stashed in ``meta``) and, when a
    pagination marker is present and crawling hasn't been stopped, a Request
    for the next page back into this same callback.
    """
    html = response.body
    url = response.url
    np_soup = BeautifulSoup(html, "lxml")
    # Pagination marker embedded in the page, e.g.:
    # <div id="last2" lastTime="1467972702826" pageIndex="2" style="display:none;"></div>
    # (lxml lowercases attribute names, hence "lasttime"/"pageindex")
    res = np_soup.find(name="div", attrs={"lasttime": True})
    lasttime = res.get("lasttime", None) if res else None
    pageindex = res.get("pageindex", None) if res else None
    for item in self.fetch_newslist(np_soup):
        request = scrapy.Request(item['news_url'], callback=self.parse_news)
        request.meta['item'] = item
        # BUG FIX: original stored the whole item dict here instead of the
        # page index extracted above.
        request.meta["pageindex"] = pageindex
        yield request
    # Follow the next page only if the stop flag is clear and BOTH marker
    # attributes were found (guarding pageindex too avoids a TypeError in
    # int(None) when the div carries lastTime but no pageIndex).
    if not self.flag and lasttime and pageindex:
        pageindex = str(int(pageindex) + 1)
        # Rewrite the pagination query parameters in place (first match only).
        new_url = re.sub(r'pageidx=.*?&lastTime=.*',
                         "pageidx=%s&lastTime=%s" % (pageindex, lasttime),
                         url, 1)
        yield scrapy.Request(new_url, callback=self.next_page_parse)
    # else:
    #     log.msg("can't find lasttime or pageindex", level=log.INFO)
# Comment list
# Article table of contents