# Shared imports assumed by the Python 2 snippets below.
import csv
import os
from os import path
import urllib
from collections import defaultdict


def csvComment():
    'Fetch each review history and parse it into CSV.'
    print 'started'
    with open(sortdata, 'r') as f:  # sortdata is assumed to be defined elsewhere (path to the list of review IDs)
        for line in f:
            line = line.strip('\n')
            durl = 'http://fisheye.cuc.com/cru/' + line + '/reviewHistory.csv'
            print durl
            testfile = urllib.URLopener()
            testfile.retrieve(durl, line + '.csv')
            with open(line + '.csv') as csvfile:
                columns = defaultdict(list)  # each value in each column is appended to a list
                reader = csv.DictReader(csvfile)  # read rows into a dictionary format
                for row in reader:  # read a row as {column1: value1, column2: value2, ...}
                    for (k, v) in row.items():  # go over each column name and value
                        columns[k].append(v)  # append the value into the appropriate list
                d = dict(zip(zip(columns['Date'], columns['User'], columns['New value']), columns['Action']))
                print d
                ## print rkdict
                ## for key, value in d.iteritems():
                ##     if value == 'COMMENT_CHANGED' or value == 'COMMENT_ADDED':
                ##         writer = csv.writer(open('final.csv', 'ab'))
                ##         for (key, value) in zip(d.items()):
                ##             writer.writerow([line, key, value])
                ##     else:
                ##         print 'No Comments found for ' + line
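# The commented-out block above is buggy: zip(d.items()) zips a single
# iterable and yields 1-tuples, so the (key, value) unpacking would fail.
# A minimal corrected sketch of the intended filtering (write_comment_rows
# is a hypothetical helper name; d is the {(Date, User, New value): Action}
# dict built above):
def write_comment_rows(review_id, d, out_path='final.csv'):
    'Append one row per comment action for review_id to out_path.'
    found = False
    with open(out_path, 'ab') as out:  # binary append mode for the csv module on Python 2
        writer = csv.writer(out)
        for key, value in d.iteritems():
            if value in ('COMMENT_ADDED', 'COMMENT_CHANGED'):
                writer.writerow([review_id, key, value])
                found = True
    if not found:
        print 'No Comments found for ' + review_id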
def dictcsvFinalReview():
    print 'started'
    with open(sortdata, 'r') as f:
        for line in f:
            line = line.strip('\n')
            durl = 'http://fisheye.cuc.com/cru/' + line + '/reviewHistory.csv'
            print durl
            testfile = urllib.URLopener()
            os.chdir(r'C:\Users\radhakrishnanr\Desktop\filescsv')
            testfile.retrieve(durl, line + '.csv')
            columns = defaultdict(list)  # each value in each column is appended to a list
            with open(line + '.csv') as csvfile:
                reader = csv.DictReader(csvfile)  # read rows into a dictionary format
                for row in reader:  # read a row as {column1: value1, column2: value2, ...}
                    for (k, v) in row.items():  # go over each column name and value
                        columns[k].append(v)  # append the value into the appropriate list based on column name k
            d = dict(zip(zip(columns['Date'], columns['User'], columns['New value']), columns['Action']))
            print d
            ## for key, value in d.iteritems():
            ##     if value == 'COMMENT_CHANGED' or value == 'COMMENT_ADDED':
            ##         writer = csv.writer(open('final.csv', 'ab'))
            ##         for (key, value) in zip(d, line):
            ##             writer.writerow([line, key])
            ##     else:
            ##         print 'No Comments found for ' + line
def csvComment():
    'Fetch each review history and parse it into CSV.'
    print 'started'
    with open('sorted.txt', 'r') as f:
        for line in f:
            line = line.strip('\n')
            durl = 'http://fisheye.com/cru/' + line + '/reviewHistory.csv'
            print durl
            testfile = urllib.URLopener()
            testfile.retrieve(durl, line + '.csv')
def csvFinalReview():
    print 'started'
    with open('sorted.txt', 'r') as f:
        for line in f:
            line = line.strip('\n')
            durl = 'http://fisheye.com/cru/' + line + '/reviewHistory.csv'
            print durl
            testfile = urllib.URLopener()
            testfile.retrieve(durl, line + '.csv')
            columns = defaultdict(list)  # each value in each column is appended to a list
            with open(line + '.csv') as csvfile:
                reader = csv.DictReader(csvfile)  # read rows into a dictionary format
                for row in reader:  # read a row as {column1: value1, column2: value2, ...}
                    for (k, v) in row.items():  # go over each column name and value
                        columns[k].append(v)  # append the value into the appropriate list based on column name k
            d = dict(zip(zip(columns['Date'], columns['User'], columns['New value']), columns['Action']))
            for key, value in d.iteritems():
                if value == 'COMMENT_CHANGED' or value == 'COMMENT_ADDED':
                    print line, key
            try:
                os.remove(line + '.csv')
            except OSError:  # os.remove raises OSError if the file is missing
                pass

#csvComment()
#csvReview()
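# urllib.URLopener was deprecated in Python 2.6 and removed in Python 3.
# A minimal Python 3 sketch of the same fetch step, assuming the same
# sorted.txt file and Fisheye URL layout used above:
import urllib.request

with open('sorted.txt') as f:
    for line in f:
        line = line.strip()
        url = 'http://fisheye.com/cru/' + line + '/reviewHistory.csv'
        urllib.request.urlretrieve(url, line + '.csv')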
def get_gimdata(self):
    import urllib, os
    if self.file_exist():
        print "No need to download GIM data..."
        return
    print "Start to download GIM data..."
    weblink = "ftp://ftp.unibe.ch/aiub/CODE/{0}/".format(self.year)
    if not os.path.isfile(self.sourcefn[:-2]):
        if not os.path.isfile(self.sourcefn):
            download = urllib.URLopener()
            download.retrieve(weblink + self.sourcefn, self.sourcefn)
        os.system("gzip -fd {0}".format(self.sourcefn))
def get_navidata(self):
    import urllib, os
    if self.file_exist():
        print "No need to download Navigation data..."
        return
    print "Start to download Navigation data..."
    if self.types in 'igslocal':  # substring check: matches e.g. 'igs' or 'local'
        weblink = "ftp://igscb.jpl.nasa.gov/pub/product/"
        if not (os.path.isfile(self.sourcefn_igs1) or os.path.isfile(self.sourcefn_igr1)):
            try:
                download = urllib.URLopener()
                download.retrieve("{0}{1:04}/{2}".format(weblink, self.dweeks1, self.sourcefn_igs1), self.sourcefn_igs1)
                self.sourcefn1 = self.sourcefn_igs1[:-2]
            except IOError:
                download = urllib.URLopener()
                download.retrieve("{0}{1:04}/{2}".format(weblink, self.dweeks1, self.sourcefn_igr1), self.sourcefn_igr1)
                self.sourcefn1 = self.sourcefn_igr1[:-2]
        if not (os.path.isfile(self.sourcefn_igs2) or os.path.isfile(self.sourcefn_igr2)):
            try:
                download = urllib.URLopener()
                download.retrieve("{0}{1:04}/{2}".format(weblink, self.dweeks2, self.sourcefn_igs2), self.sourcefn_igs2)
                self.sourcefn2 = self.sourcefn_igs2[:-2]
            except IOError:
                download = urllib.URLopener()
                download.retrieve("{0}{1:04}/{2}".format(weblink, self.dweeks2, self.sourcefn_igr2), self.sourcefn_igr2)
                self.sourcefn2 = self.sourcefn_igr2[:-2]
    elif self.types == 'igsrt':
        weblink = "ftp://cddis.gsfc.nasa.gov/pub/gps/products/{0}/".format(self.sourcefn_igu[3:7])
        download = urllib.URLopener()
        download.retrieve(weblink + self.sourcefn_igu, self.sourcefn_igu)
        self.sourcefn = self.sourcefn_igu[:-2]
    os.system("gzip -fd *sp3.Z")
def save_image(self, url, path):
    """
    :param url: source URL of the image to download
    :param path: local file path to save the image under
    :return: nothing
    """
    image = urllib.URLopener()
    image.retrieve(url, path)
def download_file(url, local_path):
    dir_path = path.dirname(local_path)
    if not path.exists(dir_path):
        print("Creating the directory '%s' ..." % dir_path)
        os.makedirs(dir_path)
    urllib.URLopener().retrieve(url, local_path)
def DownHTTP(url, fileName):
    fileHTTP = urllib.URLopener()
    if fileName == "":
        fileHTTP.retrieve(url, url.split("/")[-1])
    else:
        fileHTTP.retrieve(url, fileName)
###### setup EDIT
def DownHTTP(url, fileName):
    fileHTTP = urllib.URLopener()
    if fileName == "":
        baseName = url.split("/")[-1]
        if os.path.isfile(baseName):
            # a file with that name already exists: insert "_" before the extension
            newName = baseName.split(".")[0] + "_." + baseName.split(".")[1]
            fileHTTP.retrieve(url, newName)
            return " saved the file with the original name + \"_\""
        else:
            fileHTTP.retrieve(url, baseName)
            return " saved the file with the original name"
    else:
        fileHTTP.retrieve(url, fileName)
        return " saved the file with the given name"
def downloadSource(self):
    download_file = URLopener()
    download_file.retrieve(self.__url, self.__filename)
    self.__sourceAvailable = True
def get_caltech101(save_dir=None, root_path=None):
    ''' If root_path is None, download the data set from the internet.
    Exactly one of save_dir and root_path must be given.
    Returns Xtr, Ytr, Xval, Yval as numpy arrays.
    '''
    assert((save_dir is not None and root_path is None) or (save_dir is None and root_path is not None))
    if root_path is None:
        ctx = ssl.create_default_context()
        ctx.check_hostname = False
        ctx.verify_mode = ssl.CERT_NONE
        print 'Downloading Caltech101 dataset...'
        tar_path = os.path.join(save_dir, "101_ObjectCategories.tar.gz")
        url = urllib.URLopener(context=ctx)
        url.retrieve("https://www.vision.caltech.edu/Image_Datasets/Caltech101/101_ObjectCategories.tar.gz", tar_path)
        print 'Download Done, Extracting...'
        tar = tarfile.open(tar_path)
        tar.extractall(save_dir)
        tar.close()
    root = os.path.join(save_dir, "101_ObjectCategories") if not root_path else root_path
    train_x = []
    train_y = []
    val_x = []
    val_y = []
    label = 0
    for cls_folder in os.listdir(root):
        cls_root = os.path.join(root, cls_folder)
        if not os.path.isdir(cls_root):
            continue
        cls_images = [misc.imread(os.path.join(cls_root, img_name)) for img_name in os.listdir(cls_root)]
        # replicate grayscale images across three channels
        cls_images = [np.repeat(np.expand_dims(img, 2), 3, axis=2) if len(img.shape) == 2 else img for img in cls_images]
        cls_images = np.array([np.reshape(misc.imresize(img, (224, 224, 3)), (3, 224, 224)) for img in cls_images])
        new_index = np.random.permutation(np.arange(cls_images.shape[0]))
        cls_images = cls_images[new_index, :, :, :]
        train_x.append(cls_images[:30])
        train_y.append(np.array([label] * 30))
        if len(cls_images) <= 80:
            val_x.append(cls_images[30:])
            val_y.append(np.array([label] * (len(cls_images) - 30)))
        else:
            val_x.append(cls_images[30:80])
            val_y.append(np.array([label] * 50))
        label += 1
    Xtr = np.concatenate(train_x)
    Ytr = np.concatenate(train_y)
    Xval = np.concatenate(val_x)
    Yval = np.concatenate(val_y)
    print 'Xtr shape ', Xtr.shape
    print 'Ytr shape ', Ytr.shape
    print 'Xval shape ', Xval.shape
    print 'Yval shape ', Yval.shape
    return Xtr, Ytr, Xval, Yval
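# Example calls (hypothetical paths):
# Xtr, Ytr, Xval, Yval = get_caltech101(save_dir='/tmp/caltech101')  # download and extract
# Xtr, Ytr, Xval, Yval = get_caltech101(root_path='/tmp/caltech101/101_ObjectCategories')  # reuse an extracted copy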
def get_cifar10(save_dir=None, root_path=None):
    ''' If root_path is None, download the data set from the internet.
    Exactly one of save_dir and root_path must be given.
    Returns Xtr, Ytr, Xte, Yte as numpy arrays.
    '''
    assert((save_dir is not None and root_path is None) or (save_dir is None and root_path is not None))
    if root_path is None:
        print 'Downloading CIFAR10 dataset...'
        tar_path = os.path.join(save_dir, "cifar-10-python.tar.gz")
        url = urllib.URLopener()
        url.retrieve("https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz", tar_path)
        print 'Download Done, Extracting...'
        tar = tarfile.open(tar_path)
        tar.extractall(save_dir)
        tar.close()
    root = os.path.join(save_dir, "cifar-10-batches-py") if not root_path else root_path
    # Training data
    xs = []
    ys = []
    for b in range(1, 6):
        f = os.path.join(root, 'data_batch_%d' % (b, ))
        X, Y = load_CIFAR_batch(f)
        xs.append(X)
        ys.append(Y)
    Xtr = np.concatenate(xs)
    Ytr = np.concatenate(ys)
    print 'Xtrain shape', Xtr.shape
    print 'Ytrain shape', Ytr.shape
    # Testing data
    Xte, Yte = load_CIFAR_batch(os.path.join(root, 'test_batch'))
    print 'Xtest shape', Xte.shape
    print 'Ytest shape', Yte.shape
    return Xtr, Ytr, Xte, Yte
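# load_CIFAR_batch is assumed to be defined elsewhere; a conventional sketch
# for the CIFAR-10 python batch format (a pickled dict with 'data' as a
# uint8 array of shape (10000, 3072) and 'labels' as a list of ints):
import cPickle  # pickle in Python 3
import numpy as np

def load_CIFAR_batch(filename):
    'Load one CIFAR-10 batch as (X, Y); X is (N, 3, 32, 32) float32.'
    with open(filename, 'rb') as f:
        batch = cPickle.load(f)
    X = batch['data'].reshape(-1, 3, 32, 32).astype('float32')
    Y = np.array(batch['labels'])
    return X, Y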
def get_svhn(save_dir=None, root_path=None):
    ''' If root_path is None, download the data set from the internet.
    Exactly one of save_dir and root_path must be given.
    Returns Xtr, Ytr, Xte, Yte as numpy arrays.
    '''
    assert((save_dir is not None and root_path is None) or (save_dir is None and root_path is not None))
    if root_path is None:
        new_save_dir = os.path.join(save_dir, 'og_data')
        if not os.path.isdir(new_save_dir):
            os.mkdir(new_save_dir)
        train_mat = os.path.join(new_save_dir, "train_32x32.mat")
        test_mat = os.path.join(new_save_dir, "test_32x32.mat")
        url = urllib.URLopener()
        print 'Downloading Svhn Train...'
        url.retrieve("http://ufldl.stanford.edu/housenumbers/train_32x32.mat", train_mat)
        print 'Downloading Svhn Test...'
        url.retrieve("http://ufldl.stanford.edu/housenumbers/test_32x32.mat", test_mat)
    root = new_save_dir if not root_path else root_path
    train = io.loadmat(os.path.join(root, 'train_32x32.mat'))
    Xtr = train['X']
    Ytr = train['y']
    del train
    test = io.loadmat(os.path.join(root, 'test_32x32.mat'))
    Xte = test['X']
    Yte = test['y']
    del test
    # the .mat files store images as (H, W, C, N); convert to (N, C, H, W)
    Xtr = np.transpose(Xtr, (3, 2, 0, 1))
    Xte = np.transpose(Xte, (3, 2, 0, 1))
    Ytr = Ytr.reshape(Ytr.shape[:1]) - 1
    Yte = Yte.reshape(Yte.shape[:1]) - 1
    print 'Xtrain shape', Xtr.shape
    print 'Ytrain shape', Ytr.shape
    print 'Xtest shape', Xte.shape
    print 'Ytest shape', Yte.shape
    return Xtr, Ytr, Xte, Yte
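# Note: SVHN labels the digit 0 as class 10, so the "- 1" above shifts
# labels 1..10 to 0..9 (the digit 0 ends up as class 9; it is not remapped to 0).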
def get_svhn_full(save_dir=None, root_path=None):
    ''' If root_path is None, download the data set from the internet.
    Exactly one of save_dir and root_path must be given.
    Returns Xtr, Ytr, Xval, Yval, Xte, Yte as numpy arrays.
    '''
    assert((save_dir is not None and root_path is None) or (save_dir is None and root_path is not None))
    Xtr_small, Ytr_small, Xte, Yte = get_svhn(save_dir, root_path)
    if root_path is None:
        new_save_dir = os.path.join(save_dir, 'og_data')
        if not os.path.isdir(new_save_dir):
            os.mkdir(new_save_dir)
        extra_mat = os.path.join(new_save_dir, "extra_32x32.mat")
        url = urllib.URLopener()
        print 'Downloading Svhn Extra...'
        url.retrieve("http://ufldl.stanford.edu/housenumbers/extra_32x32.mat", extra_mat)
    root = new_save_dir if not root_path else root_path
    extra = io.loadmat(os.path.join(root, 'extra_32x32.mat'))
    Xtr_extra = extra['X']
    Ytr_extra = extra['y']
    Xtr_extra = np.transpose(Xtr_extra, (3, 2, 0, 1))
    Ytr_extra = Ytr_extra.reshape(Ytr_extra.shape[:1]) - 1
    print 'Xextra shape', Xtr_extra.shape
    print 'Yextra shape', Ytr_extra.shape
    val_x = []
    val_y = []
    train_x = []
    train_y = []
    for i in np.unique(Ytr_small):
        # take 400 validation images per class from the small training set
        X_small_label = Xtr_small[Ytr_small == i]
        val_x.append(X_small_label[:400])
        val_y.append([i] * 400)
        train_x.append(X_small_label[400:])
        train_y.append([i] * (X_small_label.shape[0] - 400))
        # take 200 validation images per class from the extra set
        X_extra_label = Xtr_extra[Ytr_extra == i]
        val_x.append(X_extra_label[:200])
        val_y.append([i] * 200)
        train_x.append(X_extra_label[200:])
        train_y.append([i] * (X_extra_label.shape[0] - 200))
    Xtr = np.concatenate(train_x)
    Ytr = np.concatenate(train_y)
    Xval = np.concatenate(val_x)
    Yval = np.concatenate(val_y)
    return Xtr, Ytr, Xval, Yval, Xte, Yte
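# Example call (hypothetical path); fetches the train/test/extra .mat files on first use:
# Xtr, Ytr, Xval, Yval, Xte, Yte = get_svhn_full(save_dir='/tmp/svhn')
# The per-class split above (400 validation images from train, 200 from extra)
# matches the commonly used SVHN validation protocol.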
def download():
    choice = raw_input('Type "tag" or "album" for corresponding choice. \nDo you want to download images by tag or specific album: ')
    # counter is used to label the images as they are downloaded
    counter = 0
    if choice == 'album':
        albumID = raw_input('Enter the ID of the album you wish to download: ')
        name = raw_input('Enter the username whose pictures you want: ')
        # create the folder if it does not exist, then move into it
        if not os.path.exists(name + '/' + albumID):
            os.makedirs(name + '/' + albumID)
        os.chdir(name + '/' + albumID)
        print('Downloading...')
        # walk_set loops through the pictures of a specific album
        for photo in flickr.walk_set(albumID):
            # build the direct link to the picture from the author's username and
            # the photo id; sizes/k requests the highest quality available on flickr
            url = 'https://www.flickr.com/photos/' + name + '/' + photo.get('id') + '/sizes/k/'
            webpage = requests.get(url)
            soup = BeautifulSoup(webpage.text, 'html.parser')
            # parse the html and extract the src link of each img tag
            for link in soup.find_all('img'):
                new = link.get('src')
                if new.count(".jpg") == 1:
                    # download the link with URLopener() and save it as 'photo<counter>.jpg'
                    testfile = urllib.URLopener()
                    testfile.retrieve(new, 'photo' + str(counter) + '.jpg')
                    counter = counter + 1
    elif choice == 'tag':
        tag = raw_input('Enter the tags (in the format tagName1,tagName2,tagName3 etc.): ')
        # create the folder if it does not exist, then move into it
        if not os.path.exists(tag):
            os.makedirs(tag)
        os.chdir(tag)
        # check the total number of available pictures with the given tags
        total = int(flickr.photos.search(tags=tag).find('photos').attrib['total'])
        print('There are ' + str(total) + ' pictures found \nDownloading...')
        # walk loops through the pictures with the tag; see the flickrapi documentation
        for photo in flickr.walk(tag_mode='all', tags=tag):
            author = photo.get('owner')  # the owner of the picture
            # build the direct link to the picture from the author's id and the
            # photo id; sizes/k requests the highest quality available on flickr
            url = 'https://www.flickr.com/photos/' + author + '/' + photo.get('id') + '/sizes/k/'
            webpage = requests.get(url)
            soup = BeautifulSoup(webpage.text, 'html.parser')
            # parse the html and extract the src link of each img tag
            for link in soup.find_all('img'):
                new = link.get('src')
                if new.count(".jpg") == 1:
                    # download the link with URLopener() and save it as 'photo<counter>.jpg'
                    testfile = urllib.URLopener()
                    testfile.retrieve(new, 'photo' + str(counter) + '.jpg')
                    counter = counter + 1
    else:
        print('An Error appeared in your input. ')
        download()
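# Since this snippet already uses requests, the URLopener call could be
# replaced by a streaming download; a sketch (hypothetical helper):
def save_image_via_requests(src_url, dest_path):
    'Stream an image URL to disk with requests.'
    resp = requests.get(src_url, stream=True)
    resp.raise_for_status()
    with open(dest_path, 'wb') as out:
        for chunk in resp.iter_content(8192):
            out.write(chunk)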