imgspider.py 文件源码-python代码片段

def parse_page(self, response):
        """Build a PageItem from *response* and follow the next-page link.

        Yields the loaded item first. If the spider config defines
        ``xpathNextImageUrl`` and that XPath matches, a request for the first
        matched URL is yielded as well, handled by this same callback.

        The page title is read from ``response.request.cookies['title']`` and
        handed to the follow-up request the same way, so a missing ``title``
        cookie raises ``KeyError``.
        """
        # Function-scope import keeps the block self-contained and works on
        # both Python 2 and Python 3.
        try:
            from urllib.parse import urljoin
        except ImportError:
            from urlparse import urljoin

        config = self.config
        title = response.request.cookies['title']

        loader = ItemLoader(item=PageItem(), response=response)
        loader.add_value('title', title)
        loader.add_value('name', config["id"])
        loader.add_value('url', response.url)
        # The remaining fields are optional: only populate what is configured.
        if "imageUrlReplacement" in config:
            loader.add_value('replace', config["imageUrlReplacement"])
        if "xpathImagesPath" in config:
            loader.add_xpath('image_urls', config["xpathImagesPath"])
        if "xpathFilesPath" in config:
            loader.add_xpath('file_urls', config["xpathFilesPath"])
        yield loader.load_item()

        # Pagination: follow the first "next" link, if one is configured and
        # present on this page.
        if "xpathNextImageUrl" not in config:
            return
        next_urls = response.xpath(config["xpathNextImageUrl"]).extract()
        if not next_urls:
            return
        next_page = next_urls[0].strip()
        if not next_page:
            return

        # urljoin resolves root-relative, directory-relative, protocol-relative
        # and already-absolute links against the current URL. Manual slicing
        # produced "http://host//path" for links starting with "/" and picked
        # the wrong base when the URL carried a query string containing "/".
        next_page = urljoin(response.url, next_page)

        # Links such as "javascript:..." or "mailto:..." are not crawlable.
        if not next_page.startswith(("http://", "https://")):
            return

        yield scrapy.Request(
            next_page,
            callback=self.parse_page,
            cookies={'title': title},
        )