dmoz_spider.py 文件源码-python代码片段

dmoz_spider.py 文件源码

python

阅读 20 收藏 0 点赞 0 评论 0

项目：Crawling-SinaWeibo 作者: Uzumaki-C 项目源码文件源码

def parse3(self, response):
        """Choose the next user ID to crawl from the links on this page.

        Collects every ``uid=<digits>`` value found in the ``href`` of anchors
        inside table cells of the response body, replaces the last entry of
        ``self.next_ID`` with a randomly chosen one, remembers the first
        candidate in ``self.temp`` as a fallback, and yields a request for
        the chosen user's page with ``self.parse`` as the callback.

        Args:
            response: The response object handed to this callback; it is
                wrapped in a ``Selector`` for XPath extraction.

        Yields:
            A single ``Request`` for ``http://weibo.cn/u/<id>``, or nothing
            when the page contains no ``uid=`` links.
        """
        selector = Selector(response)
        hrefs = selector.xpath('body//table/tr/td/a/@href').extract()

        next_urls = []
        for href in hrefs:
            # Raw string: '\d' in a plain literal is an invalid escape sequence.
            match = re.search(r'uid=(\d+)', href)
            if match:
                next_urls.append(int(match.group(1)))

        if not next_urls:
            # No candidate IDs on this page. Bail out *before* touching
            # self.next_ID so the spider's state is not left half-updated
            # (the old code popped the current ID and then crashed).
            return

        self.next_ID.pop()
        self.next_ID.append(random.choice(next_urls))
        self.temp = next_urls[0]

        # Build the request inside the try, but yield outside of it: a bare
        # ``except`` around a ``yield`` also traps GeneratorExit and then
        # yields again, which makes closing the generator raise RuntimeError.
        try:
            next_url = "http://weibo.cn/u/%s" % self.next_ID[-1]
            request = Request(url=next_url, dont_filter=True, callback=self.parse)
        except Exception:
            # Fall back to the first candidate found on the page.
            self.next_ID.pop()
            self.next_ID.append(self.temp)
            next_url = "http://weibo.cn/u/%s" % self.temp
            request = Request(url=next_url, dont_filter=True, callback=self.parse)

        yield request