thepaper_spider.py 文件源码

python
阅读 23 收藏 0 点赞 0 评论 0

项目:NewsScrapy 作者: yinzishao 项目源码 文件源码
def next_page_parse(self,response):
        """Parse one page of the news list and schedule follow-up requests.

        Yields one ``scrapy.Request`` per entry returned by
        ``self.fetch_newslist`` (handled by ``self.parse_news``), then, if
        ``self.flag`` is not set and the page carries a pagination marker,
        one more request for the next list page (handled by this method).

        :param response: the downloaded list page.
        :return: generator of ``scrapy.Request`` objects.
        """
        html = response.body
        url = response.url
        np_soup = BeautifulSoup(html,"lxml")
        # Pagination marker embedded in the page, e.g.
        # <div id="last2" lastTime="1467972702826" pageIndex="2" style="display:none;"></div>
        # The lookups below use the lower-cased attribute names.
        res = np_soup.find(name="div",attrs={"lasttime":True})

        lasttime = res.get("lasttime",None) if res else None
        pageindex = res.get("pageindex",None) if res else None
        for item in self.fetch_newslist(np_soup):
            request = scrapy.Request(item['news_url'],callback=self.parse_news)
            request.meta['item'] = item
            # Forward the page index of this list page (may be None when the
            # marker is missing); the item itself already travels in
            # meta['item'].
            request.meta["pageindex"] = pageindex
            yield request
        # Follow the next list page. Both marker attributes are required:
        # without pageindex, int() below would raise TypeError.
        if not self.flag and lasttime and pageindex:
            next_index = str(int(pageindex)+1)
            new_url = re.sub(
                r'pageidx=.*?&lastTime=.*',
                "pageidx=%s&lastTime=%s" % (next_index,lasttime),
                url,
                count=1,
            )
            yield scrapy.Request(new_url, callback=self.next_page_parse)
评论列表
文章目录


问题


面经


文章

微信
公众号

扫码关注公众号