google_image_crawler.py 文件源码-python代码片段

def download_image(url, save_dir, loaded_urls=None):
    """Fetch the image at `url` and store it under `save_dir`.

    The request is sent with a browser-like User-Agent header. The URL
    reported by ``response.geturl()`` is used both for the duplicate check
    against `loaded_urls` and to derive the file name (the MD5 hex digest of
    that URL, followed by '.' and ``CONFIGS[u'search_file_type']``).

    Args:
        url: Address of the image to request.
        save_dir: Directory the image file is written into; joined to the
            file name with '/'.
        loaded_urls: Optional container of URLs fetched earlier. If it is
            non-empty and contains the resolved URL, nothing is saved.

    Returns:
        A ``(real_url, save_image_name)`` tuple. Both items are None when
        the URL was already in `loaded_urls` or when the download failed
        with an HTTPError, URLError or IOError (the error is printed, not
        raised). Otherwise they are the resolved URL and the path of the
        file that was written.
    """
    real_url = None
    save_image_name = None
    response = None
    try:
        req = Request(url, headers={"User-Agent": "Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.27 Safari/537.17"})
        response = urlopen(req)

        fetched_url = response.geturl()

        if loaded_urls and fetched_url in loaded_urls:
            print('URL had been downloaded in previous searching')
        else:
            # md5 needs bytes; encode text URLs explicitly so non-ASCII
            # URLs hash instead of raising (ASCII URLs hash as before).
            if isinstance(fetched_url, bytes):
                url_bytes = fetched_url
            else:
                url_bytes = fetched_url.encode('utf-8')
            img_name = hashlib.md5(url_bytes).hexdigest()
            target_path = save_dir + '/' + img_name + '.' + CONFIGS[u'search_file_type']
            print('Try to save image ' + fetched_url + ' into file: ' + target_path)

            # Read the whole body before touching the disk so a failed
            # transfer does not leave an empty file behind.
            data = response.read()
            with open(target_path, 'wb') as output_file:
                output_file.write(data)

            # Only report success once the file has really been written.
            real_url = fetched_url
            save_image_name = target_path
    # HTTPError derives from URLError, which derives from IOError, so the
    # handlers must go from most to least specific to be reachable.
    except HTTPError as e:
        print("HTTPError on url " + str(url))
        print(e)
    except URLError as e:
        print("URLError on url " + str(url))
        print(e)
    except IOError as e:
        print("IOError on url " + str(url))
        print(e)
    finally:
        # Release the connection even if an unexpected exception propagates.
        if response is not None:
            response.close()

    return real_url, save_image_name
############## End of Functions to get real urls and download images ############         

############## Main Program ############