pictureSpider_demo.py 文件源码-python代码片段

def parse(self, response):
        """Extract wallpaper images from a listing page and follow listing links.

        Only responses whose URL matches the
        ``http://desk.zol.com.cn/fengjing/<digits>x<digits>/<digits>.html``
        pattern are processed; any other response yields nothing.

        For every ``<li>`` of the ``pic-list2  clearfix`` list that carries
        both an image ``src`` and an image ``title``:

        * the ``t_s208x130c5`` token in the image URL is replaced with
          ``t_s2560x1600c5``;
        * a ``WebcrawlerScrapyItem`` with ``name`` (``<title>.jpg``) and
          ``url`` (the rewritten URL) is yielded;
        * the image is then downloaded into the hard-coded target directory.

        Afterwards every ``href`` on the page that starts with
        ``/fengjing/1920x1080/`` is requested with this method as callback.

        :param response: the downloaded page; its ``url`` is checked against
            the listing pattern and its content is queried with XPath.
        :returns: a generator of ``WebcrawlerScrapyItem`` and ``Request``
            objects.
        """
        # Check the URL first so non-listing pages are rejected before a
        # selector is built. Raw string + escaped dots: match literal dots.
        if not re.match(r"http://desk\.zol\.com\.cn/fengjing/\d+x\d+/\d+\.html",
                        response.url):
            return

        se = Selector(response)
        # Loop-invariant, so look it up once (and do not shadow ``type``).
        fs_encoding = sys.getfilesystemencoding()

        # Iterate the <li> selectors directly and query relative to each one.
        # XPath positions are 1-based, so building ``li[%d]`` from
        # ``range(len(...))`` never matched index 0 and never reached the
        # last entry.
        for li in se.xpath("//ul[@class='pic-list2  clearfix']/li"):
            img_urls = li.xpath("a/img/@src").extract()
            titles = li.xpath("a/img/@title").extract()
            if not img_urls or not titles:
                # Without both a source URL and a title there is nothing to
                # name or download; skip rather than raise IndexError.
                continue

            # Swap the thumbnail size token for the large-size one.
            real_url = img_urls[0].replace("t_s208x130c5", "t_s2560x1600c5")
            file_name = u"%s.jpg" % titles[0]
            path = os.path.join(r"D:\pics", file_name)

            print(file_name.encode(fs_encoding))

            item = WebcrawlerScrapyItem()
            item['name'] = file_name
            item['url'] = real_url
            print(u"%s %s" % (item["name"], item["url"]))

            yield item

            # NOTE(review): this is a blocking download inside the callback
            # and it runs only when the consumer resumes the generator after
            # the item above; an error here aborts the rest of this page.
            urllib.urlretrieve(real_url, path)

        # Follow links to further 1920x1080 listing pages.
        for url in se.xpath("//a/@href").extract():
            if url.startswith("/fengjing/1920x1080/"):
                yield Request("http://desk.zol.com.cn" + url, callback=self.parse)