def parse(self, response):
    """Scrape wallpaper entries from a listing page and follow listing links.

    When ``response.url`` matches the wallpaper listing pattern, every
    ``<li>`` of the ``pic-list2 clearfix`` list is inspected.  For each entry
    that has both an image ``src`` and a ``title``, the thumbnail URL is
    rewritten to its 2560x1600 variant, a ``WebcrawlerScrapyItem`` carrying
    the file name and image URL is yielded, and the image is then downloaded
    to ``D:\\pics``.  Afterwards every link on the page that starts with
    ``/fengjing/1920x1080/`` is scheduled with this method as its callback.

    NOTE(review): the file this was recovered from had lost its indentation.
    The nesting here (link following inside the URL check) is a
    reconstruction -- confirm against the intended crawl behaviour.

    :param response: the downloaded page handed to this callback.
    :return: generator yielding ``WebcrawlerScrapyItem`` objects and
        follow-up ``Request`` objects.
    """
    se = Selector(response)

    # Same pattern value as before, now a raw string so "\d" is not treated
    # as a (deprecated) string escape.
    if re.match(r"http://desk.zol.com.cn/fengjing/\d+x\d+/\d+.html", response.url):
        fs_encoding = sys.getfilesystemencoding()

        # Iterate the <li> nodes themselves.  The previous code formatted
        # "li[%d]" with a 0-based counter, but XPath positions start at 1, so
        # li[0] matched nothing and the last entry was never visited.
        for entry in se.xpath("//ul[@class='pic-list2 clearfix']/li"):
            img_urls = entry.xpath("a/img/@src").extract()
            titles = entry.xpath("a/img/@title").extract()

            # An entry without an image or without a title cannot be named
            # or fetched; skip it instead of failing on titles[0].
            if not img_urls or not titles:
                continue

            # Swap the thumbnail size marker for the large-resolution one.
            real_url = img_urls[0].replace("t_s208x130c5", "t_s2560x1600c5")
            file_name = u"%s.jpg" % titles[0]
            path = os.path.join(r"D:\pics", file_name)

            # Encode explicitly so a non-ASCII title can be printed.
            print(file_name.encode(fs_encoding))

            item = WebcrawlerScrapyItem()
            item['name'] = file_name
            item['url'] = real_url
            print(u"%s %s" % (item["name"], item["url"]))

            yield item

            # NOTE(review): blocking download that runs only once the
            # consumer resumes this generator after the yield above.
            urllib.urlretrieve(real_url, path)

        # Queue every further 1920x1080 listing link for the same callback.
        for url in se.xpath("//a/@href").extract():
            if url.startswith("/fengjing/1920x1080/"):
                yield Request("http://desk.zol.com.cn" + url, callback=self.parse)
pictureSpider_demo.py 文件源码
python
阅读 27
收藏 0
点赞 0
评论 0
评论列表
文章目录