def process_word2vec(word2vec_dir, vocab, save_path, random_init=True):
    # read pre-trained word embeddings from the binary file
    print('Loading google word2vec...')
    word2vec_path = word2vec_dir + '/GoogleNews-vectors-negative300.bin.gz'
    word_vectors = KeyedVectors.load_word2vec_format(word2vec_path, binary=True)
    print('Word2vec loaded!')
    if random_init:
        word2vec = np.random.uniform(-0.25, 0.25, (len(vocab), 300))
    else:
        word2vec = np.zeros((len(vocab), 300))
    found = 0
    for idx, token in enumerate(vocab):
        try:
            vec = word_vectors[token]
        except KeyError:
            # token is not in the pre-trained vocabulary; keep its init value
            pass
        else:
            word2vec[idx, :] = vec
            found += 1
    del word_vectors
    print("{}/{} of word vocab have corresponding vectors in {}".format(found, len(vocab), word2vec_path))
    np.savez_compressed(save_path, word2vec=word2vec)
    print("saved trimmed word2vec matrix at: {}".format(save_path))
# construct embedding vectors according to the GloVe word vectors and vocabulary
def process_glove(glove_dir, glove_dim, vocab_dir, save_path, random_init=True):
    """
    :param glove_dir: directory containing the GloVe text files
    :param glove_dim: dimensionality of the GloVe vectors to use
    :param vocab_dir: directory containing vocabulary.pickle
    :param save_path: output path for the trimmed embedding matrix
    :param random_init: initialize missing words randomly instead of with zeros
    """
    save_path = save_path + '.{}'.format(glove_dim)
    if not os.path.isfile(save_path + ".npz"):
        # read vocabulary
        with open(vocab_dir + '/vocabulary.pickle', 'rb') as f:
            vocab_map = cPickle.load(f)
        vocab_list = [item[0] for item in vocab_map]  # first element of each (word, ...) entry
        glove_path = os.path.join(glove_dir, "glove.6B.{}d.txt".format(glove_dim))
        if random_init:
            glove = np.random.uniform(-0.25, 0.25, (len(vocab_list), glove_dim))
        else:
            glove = np.zeros((len(vocab_list), glove_dim))
        found = 0
        with open(glove_path, 'r') as fh:
            for line in fh:
                array = line.strip().split(" ")
                word = array[0]
                vector = list(map(float, array[1:]))
                if word in vocab_list:
                    idx = vocab_list.index(word)
                    glove[idx, :] = vector
                    found += 1
                if word.capitalize() in vocab_list:
                    idx = vocab_list.index(word.capitalize())
                    glove[idx, :] = vector
                    found += 1
                if word.upper() in vocab_list:
                    idx = vocab_list.index(word.upper())
                    glove[idx, :] = vector
                    found += 1
        print("{}/{} of word vocab have corresponding vectors in {}".format(found, len(vocab_list), glove_path))
        np.savez_compressed(save_path, glove=glove)
        print("saved trimmed glove matrix at: {}".format(save_path))
def save_estimates(self, fname='', notes='', force=False):
    """
    Saves the JIVE estimates

    U, D, V, full, rank for block specific joint/individual spaces
    U, D, V, rank for common joint space
    some metadata (when saved, any notes)

    Parameters
    ----------
    fname: name of the file
    notes: any notes you want to include
    force: whether or not to overwrite a file with the same name
    """
    if os.path.exists(fname) and (not force):
        raise ValueError('%s already exists' % fname)
    kwargs = {}
    svd_dat = ['scores', 'sing_vals', 'loadings', 'rank']
    kwargs['K'] = self.K
    block_estimates = self.get_block_specific_estimates()
    for k in range(self.K):
        for mode in ['joint', 'individual']:
            for dat in svd_dat + ['full']:
                label = '%d_%s_%s' % (k, mode, dat)
                kwargs[label] = block_estimates[k][mode][dat]
    common_joint = self.get_common_joint_space_estimate()
    for dat in svd_dat:
        kwargs['common_%s' % dat] = common_joint[dat]
    current_time = time.strftime("%m/%d/%Y %H:%M:%S")
    kwargs['metadata'] = [current_time, notes]
    np.savez_compressed(fname, **kwargs)
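# A sketch of reading the estimates back; key names follow the '%d_%s_%s'
# labels built above (load_estimates is a hypothetical helper, and
# allow_pickle covers any entries stored as object arrays):
def load_estimates(fname):
    with np.load(fname, allow_pickle=True) as data:
        K = int(data['K'])
        block0_joint_scores = data['0_joint_scores']
        metadata = data['metadata']  # [save time, notes]
    return K, block0_joint_scores, metadata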
def save_init_svd(self, fname='', notes='', force=False):
    """
    Saves the initial SVD so it can be loaded later without recomputing

    Parameters
    ----------
    fname: name of the file
    notes: any notes you want to include
    force: whether or not to overwrite a file with the same name
    """
    if not hasattr(self.blocks[0], 'scores'):
        raise ValueError('initial svd has not yet been computed')
    if os.path.exists(fname) and (not force):
        raise ValueError('%s already exists' % fname)
    kwargs = {}
    kwargs['K'] = self.K
    for k in range(self.K):
        kwargs['%d_scores' % k] = self.blocks[k].scores
        kwargs['%d_sv' % k] = self.blocks[k].sv
        kwargs['%d_loadings' % k] = self.blocks[k].loadings
        kwargs['%d_init_svd_rank' % k] = self.blocks[k].init_svd_rank
    np.savez_compressed(fname, **kwargs)
def save(self, out_file):
    """
    Save the current memory into a file in Numpy format

    :param out_file: File storage path
    :return:
    """
    np.savez_compressed(out_file, states=self._states, actions=self._actions,
                        rewards=self._rewards, terminals=self._terminals)
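# A sketch of a matching load method, assuming the same keyword names used
# in save() above:
def load(self, in_file):
    with np.load(in_file) as data:
        self._states = data['states']
        self._actions = data['actions']
        self._rewards = data['rewards']
        self._terminals = data['terminals']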
def _save_np_compressed_data(file_name, *args):
    mkdirs_if_not_exist(dirname(file_name))
    np.savez_compressed(file_name, *args)
def save_frame_data(archive, path, videos, object_point_set, verbose=True):
    if verbose:
        print("Saving corners to {0:s}".format(path))
    for video in videos:
        archive[IMAGE_POINTS + str(video.name)] = video.image_points
        archive[FRAME_NUMBERS + str(video.name)] = list(video.usable_frames.keys())
        if len(video.poses) > 0:
            archive[POSES + str(video.name)] = np.array([pose.T for pose in video.poses])
    archive[OBJECT_POINT_SET] = object_point_set
    np.savez_compressed(path, **archive)
def save_calibration_intervals(archive, path, videos, verbose=True):
    if verbose:
        print("Saving calibration intervals to {0:s}".format(path))
    ranges = []
    for video in videos:
        if video.calibration_interval is None:
            raise ValueError("Expecting all cameras to have valid calibration frame ranges. Got: None")
        ranges.append(video.calibration_interval)
    ranges = np.array(ranges)
    archive[CALIBRATION_INTERVALS] = ranges
    np.savez_compressed(path, **archive)
def save_model(self):
    logging.info("Saving model")
    save_filename = os.path.join(self.model_folder, '{}_epoch{}.npz'.format(self.model_name, self.epoch))
    np.savez_compressed(save_filename, *lasagne.layers.get_all_param_values(self.network))
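# Positional arrays passed to np.savez_compressed are stored under the keys
# arr_0, arr_1, ...; a sketch of restoring them in order (load_model is a
# hypothetical method):
def load_model(self, save_filename):
    with np.load(save_filename) as data:
        values = [data['arr_%d' % i] for i in range(len(data.files))]
    lasagne.layers.set_all_param_values(self.network, values)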
def _preprocess(self, input_file, tensor_file):
    if input_file.endswith(".bz2"):
        file_reference = BZ2File(input_file, "r")
    elif input_file.endswith(".txt"):
        file_reference = io.open(input_file, "r")
    else:
        raise ValueError("Unsupported input file type: {}".format(input_file))
    raw_data = file_reference.read()
    file_reference.close()
    data = raw_data.encode(encoding=self.encoding)
    # Convert the entirety of the data file from characters to indices via the vocab dictionary.
    # How? map(function, iterable) returns the output of the function
    # executed on each member of the iterable. E.g.:
    # [14, 2, 9, 2, 0, 6, 7, 0, ...]
    # np.array converts the list into a numpy array.
    self.tensor = np.array(list(map(self.vocab.get, data)))
    # Compress and save the numpy tensor array to data.npz.
    np.savez_compressed(tensor_file, tensor_data=self.tensor)
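# A sketch of the matching load step for the saved tensor (_load_preprocessed
# is a hypothetical method; the key matches the tensor_data keyword above):
def _load_preprocessed(self, tensor_file):
    self.tensor = np.load(tensor_file)['tensor_data']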
def save_vocab(self, path_count, path_vocab, word_limit=100000):
    """ Saves the master vocabulary into a file.
    """
    # reserve space for 10 special tokens
    words = OrderedDict()
    for token in SPECIAL_TOKENS:
        # store -1 instead of np.inf, which can overflow
        words[token] = -1
    # sort words by frequency
    desc_order = OrderedDict(sorted(self.master_vocab.items(),
                                    key=lambda kv: kv[1], reverse=True))
    words.update(desc_order)
    # use encoding of up to 30 characters (no token conversions)
    # use float to store large numbers (we don't care about precision loss)
    np_vocab = np.array(list(words.items()),
                        dtype=([('word', '|S30'), ('count', 'float')]))
    # output count for debugging
    counts = np_vocab[:word_limit]
    np.savez_compressed(path_count, counts=counts)
    # output the index of each word for easy lookup
    final_words = OrderedDict()
    for i, w in enumerate(list(words.keys())[:word_limit]):
        final_words.update({w: i})
    with open(path_vocab, 'w') as f:
        f.write(json.dumps(final_words, indent=4, separators=(',', ': ')))
def test_compressed_roundtrip():
    arr = np.random.rand(200, 200)
    npz_file = os.path.join(tempdir, 'compressed.npz')
    np.savez_compressed(npz_file, arr=arr)
    arr1 = np.load(npz_file)['arr']
    assert_array_equal(arr, arr1)
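# np.load on an .npz archive returns an NpzFile that keeps the underlying zip
# file open and decompresses members lazily on access; a variant of the load
# above that closes the archive deterministically:
with np.load(npz_file) as data:
    arr1 = data['arr']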
def save_weights(self, weightspath=None):
    weightspath = super(LasagneNetwork, self)._weightspath(weightspath)
    weights = {name: p.get_value() for name, p in
               LasagneNetwork._get_named_params(self.out_layer)}
    np.savez_compressed(weightspath, **weights)
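# A hedged sketch of the inverse, restoring parameter values by name
# (load_weights is hypothetical; the saved keys are the parameter names
# used above):
def load_weights(self, weightspath):
    with np.load(weightspath) as data:
        for name, p in LasagneNetwork._get_named_params(self.out_layer):
            p.set_value(data[name])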
def _preprocess(self, input_file, tensor_file):
    if input_file.endswith(".bz2"):
        file_reference = BZ2File(input_file, "r")
    elif input_file.endswith(".txt"):
        file_reference = io.open(input_file, "r")
    else:
        raise ValueError("Unsupported input file type: {}".format(input_file))
    raw_data = file_reference.read()
    file_reference.close()
    data = raw_data.encode(encoding=self.encoding)
    # Convert the entirety of the data file from characters to indices via the vocab dictionary.
    # How? map(function, iterable) returns the output of the function
    # executed on each member of the iterable. E.g.:
    # [14, 2, 9, 2, 0, 6, 7, 0, ...]
    # np.array converts the list into a numpy array.
    self.tensor = np.array(list(map(self.vocab.get, data)))
    # Compress and save the numpy tensor array to data.npz.
    np.savez_compressed(tensor_file, tensor_data=self.tensor)
def main(em_file, em_result):
    '''
    embedding -> numpy
    '''
    em = word2vec.load(em_file)
    vec = em.vectors
    word2id = em.vocab_hash
    # d = dict(vector=vec, word2id=word2id)
    # t.save(d, em_result)
    np.savez_compressed(em_result, vector=vec, word2id=word2id)
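# word2id is a plain dict, so np.savez_compressed stores it as a 0-d object
# array; reading it back needs allow_pickle and .item() (a sketch, assuming
# em_result carries the .npz extension):
data = np.load(em_result, allow_pickle=True)
vec = data['vector']
word2id = data['word2id'].item()  # recover the original dict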
def savelogs(self, ts=None, saveres=True, filename=None):
    # FIXME: consider HDF5
    if ts is None:
        ts = time.strftime("%Y%m%d-%H%M%S")
    # np.save("%s/log-x-%s" % (self.cfgprefix, ts), self.iosm.x_)
    # np.save("%s/log-x_raw-%s" % (self.cfgprefix, ts), self.iosm.x_raw_)
    # np.save("%s/log-z-%s" % (self.cfgprefix, ts), self.iosm.z_)
    # np.save("%s/log-zn-%s" % (self.cfgprefix, ts), self.iosm.zn_)
    # np.save("%s/log-zn_lp-%s" % (self.cfgprefix, ts), self.iosm.zn_lp_)
    # np.save("%s/log-r-%s" % (self.cfgprefix, ts), self.iosm.r_)
    # np.save("%s/log-w-%s" % (self.cfgprefix, ts), self.iosm.w_)
    # network data, pickling reservoir, input weights, output weights
    # self.res.save("%s/log-%s-res-%s.bin" % (self.cfgprefix, self.cfgprefix, ts))
    if filename is None:
        logfile = "%s/log-learner-%s" % (self.cfgprefix, ts)
    else:
        logfile = filename
    if saveres:
        np.savez_compressed(logfile, x=self.iosm.x_, x_raw=self.iosm.x_raw_,
                            z=self.iosm.z_, zn=self.iosm.zn_, zn_lp=self.iosm.zn_lp_,
                            r=self.iosm.r_, w=self.iosm.w_, e=self.iosm.e_,
                            t=self.iosm.t_, mse=self.iosm.mse_)
    else:
        np.savez_compressed(logfile, x=self.iosm.x_, x_raw=self.iosm.x_raw_,
                            z=self.iosm.z_, zn=self.iosm.zn_, zn_lp=self.iosm.zn_lp_,
                            w=self.iosm.w_, e=self.iosm.e_,
                            t=self.iosm.t_, mse=self.iosm.mse_)
    print("logs saved to %s" % logfile)
    return logfile
def save_matrix(self, path):
    # np.savez_compressed writes a binary zip archive, so an already-open
    # file handle must be in binary mode (and no .npz suffix is appended
    # when a handle, rather than a path string, is passed)
    with open(path, 'wb') as f:
        np.savez_compressed(f,
                            data=self.__data,
                            rowlabels=self.__rowlabels,
                            columnlabels=self.__columnlabels)
def save_scores(model_options, name_scores):
    if not os.path.isdir('scores/'):
        os.mkdir('scores/')
    save_name = 'scores/scores_' + model_options['name'].split('/')[-1]
    print('Dumping scores to: ' + save_name)
    np.savez_compressed(save_name, name_scores)  # stored under the default key 'arr_0'
def savez_compressed(file, *args, **kwds):
    """Saves one or more arrays into a file in compressed ``.npz`` format.

    It is equivalent to :func:`cupy.savez` function except the output file is
    compressed.

    .. seealso::
        :func:`cupy.savez` for more detail,
        :func:`numpy.savez_compressed`
    """
    # move every array to host memory before delegating to NumPy
    args = map(cupy.asnumpy, args)
    for key in kwds:
        kwds[key] = cupy.asnumpy(kwds[key])
    numpy.savez_compressed(file, *args, **kwds)
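# A small usage sketch: device arrays are converted with cupy.asnumpy and
# written through numpy.savez_compressed, so they load back as numpy arrays:
x = cupy.arange(10)
savez_compressed('archive.npz', x=x)
restored = numpy.load('archive.npz')['x']  # plain numpy.ndarray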