def create_agents(self, generator):
"""
Given information on a set of countries and a generator function,
generate the agents and assign the results to ``self.agents``.
:type generator: callable
:param generator: A function which generates the agents.
"""
self.generator = generator
country_array = pd.concat([pd.Series([c] * k["Population"]) for c, k in self.df.iterrows()])
country_array.index = range(len(country_array))
# Garbage collect before creating new processes.
gc.collect()
self.agents = pd.concat(
self.pool.imap(self._gen_agents,
np.array_split(country_array, self.processes * self.splits))
)
self.agents.index = range(len(self.agents))
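As a rough, self-contained sketch of the same split-and-pool pattern (the worker make_agents and the toy labels Series below are stand-ins, not part of the original project):

import multiprocessing as mp
import numpy as np
import pandas as pd

def make_agents(chunk):
    # Worker: build one Series of agents from a chunk of country labels.
    return pd.Series(["agent-for-" + str(c) for c in chunk])

if __name__ == "__main__":
    labels = pd.Series(["FR", "DE", "IT"] * 4)
    with mp.Pool(2) as pool:
        chunks = np.array_split(labels.to_numpy(), 2)   # two roughly equal chunks
        agents = pd.concat(pool.map(make_agents, chunks))
    agents.index = range(len(agents))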
def test_latlon2pix_internals(pix_size_single, origin_point, is_flipped,
num_chunks, chunk_position):
img = make_image(pix_size_single, origin_point, is_flipped,
num_chunks, chunk_position)
chunk_idx = img.chunk_idx
res_x = img._full_res[0]
res_y = img._full_res[1]
pix_size = (img.pixsize_x, img.pixsize_y)
origin = (img._start_lon, img._start_lat)
# +0.5 for centre of pixels
lons = (np.arange(res_x) + 0.5) * pix_size[0] + origin[0]
all_lats = (np.arange(res_y) + 0.5) * pix_size[1] + origin[1]
lats = np.array_split(all_lats, num_chunks)[chunk_idx]
pix_x = np.arange(res_x)
pix_y = np.arange(lats.shape[0])
d = np.array([[a, b] for a in lons for b in lats])
xy = img.lonlat2pix(d)
true_xy = np.array([[a, b] for a in pix_x for b in pix_y])
assert np.all(xy == true_xy)
def test_pix2latlong(pix_size_single, origin_point, is_flipped,
num_chunks, chunk_position):
img = make_image(pix_size_single, origin_point, is_flipped,
num_chunks, chunk_position)
chunk_idx = img.chunk_idx
res_x = img._full_res[0]
res_y = img._full_res[1]
pix_size = (img.pixsize_x, img.pixsize_y)
origin = (img._start_lon, img._start_lat)
true_lons = np.arange(res_x) * pix_size[0] + origin[0]
all_lats = np.arange(res_y) * pix_size[1] + origin[1]
true_lats = np.array_split(all_lats, num_chunks)[chunk_idx]
true_d = np.array([[a, b] for a in true_lons for b in true_lats])
pix_x = np.arange(res_x)
pix_y = np.arange(img.resolution[1]) # chunk resolution
xy = np.array([[a, b] for a in pix_x for b in pix_y])
lonlats = img.pix2lonlat(xy)
assert np.all(lonlats == true_d)
def transform(self, X):
if self.tagger is None:
raise ValueError("Must find_motifs before you can tag anything")
logging.info("Tagging %s data with motifs using %d workers..." % (
str(X.shape), self.n_jobs))
if self.n_jobs > 1:
pool = mp.ProcessingPool(self.n_jobs)
splits = np.array_split(X, self.n_jobs)
tag_lists = pool.map(self._tag_motifs, splits)
tags = list(itertools.chain.from_iterable(tag_lists))
else:
tags = self._tag_motifs(X)
logging.info("All motifs have been tagged")
return self._sparsify_tags(tags)
def subset_iterator(X, m, repeats=1):
'''
Iterate over array X in chunks of m rows, repeated `repeats` times.
The row order is reshuffled at the start of each repeat.
'''
N, dim = X.shape
progress = tqdm(total=repeats * int(N / m))
for i in range(repeats):
indices = np.random.permutation(N)
for idx in np.array_split(indices, N // m):
yield X[idx][:]
progress.update()
progress.close()
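A small driver for the generator above (the 10x3 matrix is made up), showing that each repeat yields N // m reshuffled row chunks:

import numpy as np

X_demo = np.random.rand(10, 3)
for batch in subset_iterator(X_demo, m=2, repeats=2):
    print(batch.shape)    # roughly (2, 3) per yield; row order reshuffled each repeat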
def _split_into_groups(y, num_groups):
groups = [[] for _ in range(num_groups)]
group_index = 0
for cls in set(y):
this_cls_indices = np.where(y == cls)[0]
num_cls_samples = len(this_cls_indices)
num_cls_split_groups = ceil(num_cls_samples / 500)
split = np.array_split(this_cls_indices, num_cls_split_groups)
for cls_group in split:
groups[group_index] = np.hstack((groups[group_index], cls_group))
group_index = (group_index + 1) % num_groups
return groups
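A hedged illustration of the round-robin grouping (labels invented; assumes `from math import ceil` and `import numpy as np`, as the helper above does):

y_demo = np.array([0] * 4 + [1] * 4 + [2] * 4)
groups = _split_into_groups(y_demo, num_groups=2)
# With fewer than 500 samples per class, each class contributes a single chunk,
# and the chunks are dealt out alternately between the two groups.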
def get_embedding_X(img):
'''
Args : Numpy Images vector
Returns : Embedded Matrix of length Samples, 4096
'''
img = img.reshape((img.shape[0], img.shape[1], img.shape[2], 1))
sess = tf.Session()
imgs = tf.placeholder(tf.float32, [None, None, None, None])
vgg = vgg16(imgs, '/tmp/vgg16_weights.npz', sess)
embs = []
cnt = 0
for img_batch in np.array_split(img, img.shape[0] / 1000):
emb = sess.run(vgg.emb, feed_dict={vgg.imgs: img_batch})
embs.extend(emb)
cnt += 1
progress = round(100 * (cnt * 1000 / img.shape[0]),2)
if(progress%10 == 0):
print progress
embs = np.array(embs)
print embs.shape
embs = np.reshape(embs,(embs.shape[0],embs.shape[1] * embs.shape[2] * embs.shape[3]))
return embs
def __init__(self, pobj, just_list = False, attr='_grids',
round_robin=False):
ObjectIterator.__init__(self, pobj, just_list, attr=attr)
# pobj has to be a ParallelAnalysisInterface, so it must have a .comm
# object.
self._offset = pobj.comm.rank
self._skip = pobj.comm.size
# Note that we're doing this in advance, and with a simple means
# of choosing them; more advanced methods will be explored later.
if self._use_all:
self.my_obj_ids = np.arange(len(self._objs))
else:
if not round_robin:
self.my_obj_ids = np.array_split(
np.arange(len(self._objs)), self._skip)[self._offset]
else:
self.my_obj_ids = np.arange(len(self._objs))[self._offset::self._skip]
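For comparison, a standalone look at the two decompositions used above, contiguous blocks via np.array_split versus round-robin striding, for 10 objects over 3 ranks:

import numpy as np

n_objs, size = 10, 3
for rank in range(size):
    contiguous = np.array_split(np.arange(n_objs), size)[rank]
    round_robin = np.arange(n_objs)[rank::size]
    print(rank, contiguous, round_robin)
# rank 0: [0 1 2 3] vs [0 3 6 9]; rank 1: [4 5 6] vs [1 4 7]; rank 2: [7 8 9] vs [2 5 8]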
def iter_combinatorial_pairs(queue, num_examples, batch_size, interval,
num_classes, augment_positive=False):
num_examples_per_class = num_examples // num_classes
pairs = np.array(list(itertools.combinations(range(num_examples), 2)))
if augment_positive:
additional_positive_pairs = make_positive_pairs(
num_classes, num_examples_per_class, num_classes - 1)
pairs = np.concatenate((pairs, additional_positive_pairs))
num_pairs = len(pairs)
num_batches = num_pairs // batch_size
perm = np.random.permutation(num_pairs)
for i, batch_indexes in enumerate(np.array_split(perm, num_batches)):
if i % interval == 0:
x, c = queue.get()
x = x.astype(np.float32) / 255.0
c = c.ravel()
indexes0, indexes1 = pairs[batch_indexes].T
x0, x1, c0, c1 = x[indexes0], x[indexes1], c[indexes0], c[indexes1]
t = np.int32(c0 == c1) # 1 if x0 and x1 are same class, 0 otherwise
yield x0, x1, t
def get_epoch_indexes(self):
B = self.batch_size
K = self.num_classes
M = self.num_per_class
N = K * M # number of total examples
num_batches = M * int(K // B) # number of batches per epoch
indexes = np.arange(N, dtype=np.int32).reshape(K, M)
epoch_indexes = []
for m in range(M):
perm = np.random.permutation(K)
c_batches = np.array_split(perm, num_batches // M)
for c_batch in c_batches:
b = len(c_batch) # actual number of examples of this batch
indexes_anchor = M * c_batch + m
positive_candidates = np.delete(indexes[c_batch], m, axis=1)
indexes_positive = positive_candidates[
range(b), np.random.choice(M - 1, size=b)]
epoch_indexes.append((indexes_anchor, indexes_positive))
return epoch_indexes
def pre_processing(self):
"""Provide same API as Model, we split data to K folds here.
"""
if self.random:
mask = np.random.permutation(self.train_x.shape[0])
train_x = self.train_x[mask]
train_y = self.train_y[mask]
else:
train_x = self.train_x[:]
train_y = self.train_y[:]
if self.select_train_method == 'step':
self.x_folds = [train_x[i::self.k_folds] for i in range(0, self.k_folds)]
self.y_folds = [train_y[i::self.k_folds] for i in range(0, self.k_folds)]
else:
self.x_folds = np.array_split(train_x, self.k_folds)
self.y_folds = np.array_split(train_y, self.k_folds)
# for i in range(self.k_folds):
# self.x_folds[i] = self.train_x[0] + self.x_folds[i] + self.train_x[-1]
# self.y_folds[i] = self.train_y[0] + self.y_folds[i] + self.train_y[-1]
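The two fold layouts above, shown on a toy array of 10 samples with k_folds = 3 (data invented):

import numpy as np

train_x = np.arange(10)
k_folds = 3
step_folds = [train_x[i::k_folds] for i in range(k_folds)]   # strided: [0 3 6 9], [1 4 7], [2 5 8]
block_folds = np.array_split(train_x, k_folds)               # contiguous: [0 1 2 3], [4 5 6], [7 8 9]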
def Train(self, C, A, Y, SF):
'''
Train the classifier using the sample matrix A and target matrix Y
'''
C.fit(A, Y)
YH = np.zeros(Y.shape, dtype = np.object)
for i in np.array_split(np.arange(A.shape[0]), 32): #Split up verification into chunks to prevent out of memory
YH[i] = C.predict(A[i])
s1 = SF(Y, YH)
print('All:{:8.6f}'.format(s1))
'''
ss = ShuffleSplit(random_state = 1151) #Use fixed state for so training can be repeated later
trn, tst = next(ss.split(A, Y)) #Make train/test split
mi = [8] * 1 #Maximum number of iterations at each iter
YH = np.zeros((A.shape[0]), dtype = np.object)
for mic in mi: #Chunk size to split dataset for CV results
#C.SetMaxIter(mic) #Set the maximum number of iterations to run
#C.fit(A[trn], Y[trn]) #Perform training iterations
'''
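A minimal sketch of the chunked-verification idea, assuming a scikit-learn classifier in place of the original C (toy data, not the project's):

import numpy as np
from sklearn.linear_model import LogisticRegression

A = np.random.rand(200, 5)
Y = (A[:, 0] > 0.5).astype(int)
clf = LogisticRegression().fit(A, Y)

YH = np.zeros(Y.shape, dtype=object)
for idx in np.array_split(np.arange(A.shape[0]), 32):   # predict in 32 small chunks
    YH[idx] = clf.predict(A[idx])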
def add_point(self, t, alt, az):
self.window.append((t, alt, az))
if self._current_window_size() < self.window_duration:
return
points = np.array(self.window)
steady, current = np.array_split(points, 2)
_, steady_cube = self.create_cube(steady)
timestamps, current_cube = self.create_cube(current)
t = self.denoise_and_compare_cubes(steady_cube, current_cube)
self.trigger_criterion.append(list(t))
self.trigger_criterion_timestamps.append(list(timestamps))
has_triggered = self.check_trigger(t)
new_duration = self.window_duration - self.step
self._reduce_to_duration(new_duration)
StandaloneSimilarity.py (project: job-salary-prediction, author: soton-data-mining)
def predict(self):
if os.path.exists(DATA_QUERIES_VECTOR_NPZ) and not FORCE_LOAD:
print('{}: loading precomputed data'.format(self.__class__.__name__))
self.load_precomputed_data()
else:
self.precomputed_similarity()
batch_size = 100
batch_elements = math.ceil(self.queries_vector.shape[0] / batch_size)
batch_queue = np.array_split(self.queries_vector.A, batch_elements)
print("starting batch computation of Similarity and KNN calculation")
# # multiple versions of calculating the prediction, some faster, some use more mem
# prediction = self.multiprocessor_batch_calc(batch_queue)
prediction = self.batch_calculation(batch_queue)
# prediction = self.individual_calculation()
# prediction = self.cosine_knn_calc()
# prediction = self.custom_knn_calculation(prediction)
train_avg_salary = sum(self.y_train) / len(self.y_train)
cleaned_predictions = [x if str(x) != 'nan' else train_avg_salary for x in prediction]
return self.y_train, cleaned_predictions
def load_test_data(self):
# Keep only .npz files, and sort them in ascending order
allfiles = os.listdir(self.data_dir)
npzfiles = []
for idx, f in enumerate(allfiles):
if ".npz" in f:
npzfiles.append(os.path.join(self.data_dir, f))
npzfiles.sort()
# Files for validation sets
val_files = np.array_split(npzfiles, self.n_folds)
val_files = val_files[self.fold_idx]
print "\n========== [Fold-{}] ==========\n".format(self.fold_idx)
print "Load validation set:"
data_val, label_val = self._load_npz_list_files(val_files)
return data_val, label_val
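The fold selection alone, on invented file names: 10 recordings dealt into 5 cross-validation folds, with one fold held out for validation:

import numpy as np

npz_files = ["subject{:02d}.npz".format(i) for i in range(10)]
folds = np.array_split(npz_files, 5)     # 5 folds of 2 files each
val_files = folds[0]                     # validation files for fold 0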
def __init__(self, X, kern, Xm):
super(PITC, self).__init__("PITC")
M = np.shape(Xm)[0]
self.M = M
start = time.time()
X_split = np.array_split(X, M)
self.kern = kern
kern_blocks = np.zeros((M),dtype=object)
for t in xrange(M):
nyst = Nystrom(X_split[t], kern, Xm, False)
size = np.shape(X_split[t])[0]
kern_blocks[t] = kern.K(X_split[t], X_split[t]) - nyst.precon + (kern.noise)*np.identity(size)
self.blocks = kern_blocks
blocked = block_diag(*kern_blocks)
self.nyst = Nystrom(X, kern, Xm, False)
self.precon = self.nyst.precon + blocked
self.duration = time.time() - start
def _read_image_as_array(path, dtype, load_size, crop_size, flip):
f = Image.open(path)
A, B = numpy.array_split(numpy.asarray(f), 2, axis=1)
if hasattr(f, 'close'):
f.close()
A = _resize(A, load_size, Image.BILINEAR, dtype)
B = _resize(B, load_size, Image.NEAREST, dtype)
sx, sy = numpy.random.randint(0, load_size-crop_size, 2)
A = _crop(A, sx, sy, crop_size)
B = _crop(B, sx, sy, crop_size)
if flip and numpy.random.rand() > 0.5:
A = numpy.fliplr(A)
B = numpy.fliplr(B)
return A.transpose(2, 0, 1), B.transpose(2, 0, 1)
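The A|B split on a synthetic side-by-side pair (the real function reads a pix2pix-style image from disk):

import numpy as np

pair = np.zeros((256, 512, 3), dtype=np.uint8)   # input A and target B stacked side by side
A, B = np.array_split(pair, 2, axis=1)           # each half is (256, 256, 3)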
def setup_figure():
f = plt.figure(figsize=(7, 5))
mat_grid = plt.GridSpec(2, 6, .07, .52, .98, .95, .15, .20)
mat_axes = [f.add_subplot(spec) for spec in mat_grid]
sticks_axes, rest_axes = np.array_split(mat_axes, 2)
scatter_grid = plt.GridSpec(1, 6, .07, .30, .98, .49, .15, .05)
scatter_axes = [f.add_subplot(spec) for spec in scatter_grid]
kde_grid = plt.GridSpec(1, 6, .07, .07, .98, .21, .15, .05)
kde_axes = [f.add_subplot(spec) for spec in kde_grid]
cbar_ax = f.add_axes([.04, .62, .015, .26])
return f, sticks_axes, rest_axes, scatter_axes, kde_axes, cbar_ax
def partitions(min_val, max_val, n):
"""
Get start/stop boundaries for N partitions.
Args:
min_val (int): The starting value.
max_val (int): The last value.
n (int): The number of partitions.
"""
pts = np.array_split(np.arange(min_val, max_val+1), n)
bounds = []
for pt in pts:
bounds.append((int(pt[0]), int(pt[-1])))
return bounds
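A quick usage check of the helper above:

print(partitions(0, 9, 3))    # [(0, 3), (4, 6), (7, 9)]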
search_light.py (project: decoding_challenge_cortana_2016_3rd, author: kingjr)
def fit(self, X, y):
"""Fit a series of independent estimators to the dataset.
Parameters
----------
X : array, shape (n_samples, n_features, n_estimators)
The training input samples. For each data slice, a clone estimator
is fitted independently.
y : array, shape (n_samples,)
The target values.
Returns
-------
self : object
Return self.
"""
self._check_Xy(X, y)
self.estimators_ = list()
# For fitting, the parallelization is across estimators.
parallel, p_func, n_jobs = parallel_func(_sl_fit, self.n_jobs)
estimators = parallel(
p_func(self.base_estimator, split, y)
for split in np.array_split(X, n_jobs, axis=-1))
self.estimators_ = np.concatenate(estimators, 0)
return self
search_light.py (project: decoding_challenge_cortana_2016_3rd, author: kingjr)
def _transform(self, X, method):
"""Aux. function to make parallel predictions/transformation."""
self._check_Xy(X)
method = _check_method(self.base_estimator, method)
if X.shape[-1] != len(self.estimators_):
raise ValueError('The number of estimators does not match '
'X.shape[2]')
# For predictions/transforms the parallelization is across the data and
# not across the estimators to avoid memory load.
parallel, p_func, n_jobs = parallel_func(_sl_transform, self.n_jobs)
X_splits = np.array_split(X, n_jobs, axis=-1)
est_splits = np.array_split(self.estimators_, n_jobs)
y_pred = parallel(p_func(est, x, method)
for (est, x) in zip(est_splits, X_splits))
if n_jobs > 1:
y_pred = np.concatenate(y_pred, axis=1)
else:
y_pred = y_pred[0]
return y_pred
SoftmaxRegressionNNImpactDetection.py (project: FootballPredictors, author: NickSadler2018)
def _yield_minibatches_idx(self, n_batches, data_ary, shuffle=True):
indices = np.arange(data_ary.shape[0])
if shuffle:
indices = np.random.permutation(indices)
if n_batches > 1:
remainder = data_ary.shape[0] % n_batches
if remainder:
minis = np.array_split(indices[:-remainder], n_batches)
minis[-1] = np.concatenate((minis[-1],
indices[-remainder:]),
axis=0)
else:
minis = np.array_split(indices, n_batches)
else:
minis = (indices,)
for idx_batch in minis:
yield idx_batch
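The remainder handling above, shown standalone on 10 indices and 3 batches (values invented):

import numpy as np

indices = np.arange(10)
n_batches = 3
remainder = len(indices) % n_batches                  # 1 leftover index
minis = np.array_split(indices[:-remainder], n_batches)
minis[-1] = np.concatenate((minis[-1], indices[-remainder:]))
# -> [0 1 2], [3 4 5], [6 7 8 9]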
def test_mini_batch_k_means_random_init_partial_fit():
km = MiniBatchKMeans(n_clusters=n_clusters, init="random", random_state=42)
# use the partial_fit API for online learning
for X_minibatch in np.array_split(X, 10):
km.partial_fit(X_minibatch)
# compute the labeling on the complete dataset
labels = km.predict(X)
assert_equal(v_measure_score(true_labels, labels), 1.0)
def binned_batch_stream(target_statistics, batch_size, n_batches, n_bins=64):
hist, bins = np.histogram(target_statistics, bins=n_bins)
indx = np.argsort(target_statistics)
indicies_categories = np.array_split(indx, np.cumsum(hist)[:-1])
per_category = batch_size / n_bins
weight_correction = (np.float64(hist) / per_category).astype('float32')
wc = np.repeat(weight_correction, per_category)
for i in xrange(n_batches):
sample = [
np.random.choice(ind, size=per_category, replace=True)
for ind in indicies_categories
]
yield np.hstack(sample), wc
def binned_batch_stream(target_statistics, batch_size, n_batches, n_bins=64):
hist, bins = np.histogram(target_statistics, bins=n_bins)
indx = np.argsort(target_statistics)
indicies_categories = np.array_split(indx, np.cumsum(hist)[:-1])
n_samples = target_statistics.shape[0]
per_category = batch_size / n_bins
weight_correction = (n_bins * np.float64(hist) / n_samples).astype('float32')
wc = np.repeat(weight_correction, per_category)
for i in xrange(n_batches):
sample = [
np.random.choice(ind, size=per_category, replace=True)
for ind in indicies_categories
]
yield np.hstack(sample), wc
def test_shape_factors(self):
"""
Tests for :func:`array_split.split.shape_factors`.
"""
f = shape_factors(4, 2)
self.assertTrue(_np.all(f == 2))
f = shape_factors(4, 1)
self.assertTrue(_np.all(f == 4))
f = shape_factors(5, 2)
self.assertTrue(_np.all(f == [1, 5]))
f = shape_factors(6, 2)
self.assertTrue(_np.all(f == [2, 3]))
f = shape_factors(6, 3)
self.assertTrue(_np.all(f == [1, 2, 3]))
def scale(boxlist, y_scale, x_scale):
"""Scale box coordinates in x and y dimensions.
Args:
boxlist: BoxList holding N boxes
y_scale: float
x_scale: float
Returns:
boxlist: BoxList holding N boxes
"""
y_min, x_min, y_max, x_max = np.array_split(boxlist.get(), 4, axis=1)
y_min = y_scale * y_min
y_max = y_scale * y_max
x_min = x_scale * x_min
x_max = x_scale * x_max
scaled_boxlist = np_box_list.BoxList(np.hstack([y_min, x_min, y_max, x_max]))
fields = boxlist.get_extra_fields()
for field in fields:
extra_field_data = boxlist.get_field(field)
scaled_boxlist.add_field(field, extra_field_data)
return scaled_boxlist
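The column split at the heart of scale(), without the BoxList wrapper (coordinates invented):

import numpy as np

boxes = np.array([[0.0, 0.0, 1.0, 2.0],
                  [1.0, 1.0, 3.0, 4.0]])                         # [y_min, x_min, y_max, x_max]
y_min, x_min, y_max, x_max = np.array_split(boxes, 4, axis=1)    # four (N, 1) columns
scaled = np.hstack([2.0 * y_min, 0.5 * x_min, 2.0 * y_max, 0.5 * x_max])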
def iterbatches(arrays, num_batches=None, batch_size=None, shuffle=True, include_final_partial_batch=True):
assert (num_batches is None) != (batch_size is None), 'Provide num_batches or batch_size, but not both'
arrays = tuple(map(np.asarray, arrays))
n = arrays[0].shape[0]
assert all(a.shape[0] == n for a in arrays[1:])
inds = np.arange(n)
if shuffle: np.random.shuffle(inds)
sections = np.arange(0, n, batch_size)[1:] if num_batches is None else num_batches
for batch_inds in np.array_split(inds, sections):
if include_final_partial_batch or len(batch_inds) == batch_size:
yield tuple(a[batch_inds] for a in arrays)
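A usage sketch for iterbatches (array contents illustrative): two aligned arrays iterated in order with batch_size 4, ending in a partial batch:

import numpy as np

xs = np.arange(10)
ys = np.arange(10) * 10
for x_batch, y_batch in iterbatches((xs, ys), batch_size=4, shuffle=False):
    print(x_batch, y_batch)    # batches of 4, 4, then a final partial batch of 2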