def test_partition_cdtype(self):
    d = np.array([('Galahad', 1.7, 38), ('Arthur', 1.8, 41),
                  ('Lancelot', 1.9, 38)],
                 dtype=[('name', '|S10'), ('height', '<f8'), ('age', '<i4')])
    tgt = np.sort(d, order=['age', 'height'])
    assert_array_equal(np.partition(d, range(d.size),
                                    order=['age', 'height']),
                       tgt)
    assert_array_equal(d[np.argpartition(d, range(d.size),
                                         order=['age', 'height'])],
                       tgt)
    for k in range(d.size):
        assert_equal(np.partition(d, k, order=['age', 'height'])[k],
                     tgt[k])
        assert_equal(d[np.argpartition(d, k, order=['age', 'height'])][k],
                     tgt[k])
    d = np.array(['Galahad', 'Arthur', 'zebra', 'Lancelot'])
    tgt = np.sort(d)
    assert_array_equal(np.partition(d, range(d.size)), tgt)
    for k in range(d.size):
        assert_equal(np.partition(d, k)[k], tgt[k])
        assert_equal(d[np.argpartition(d, k)][k], tgt[k])
Python sort() example source code
def plot_histogram_metric(chart, sample_properties, sample_data, **kwargs):
""" Plot a HistogramMetric from the summary json """
summary_data = sample_data.summary
items = summary_data.get(kwargs['metric_name'], {}).items()
if len(items) < 1:
return None
ordering = kwargs.get('order_by', shared_constants.HISTOGRAM_METRIC_DEFAULT_ORDERING)
if ordering == shared_constants.HISTOGRAM_METRIC_ORDER_INTEGER_BIN:
items.sort(key=lambda x: convert_to_int_gracefully(x[0]))
elif ordering == shared_constants.HISTOGRAM_METRIC_ORDER_DECREASING_FREQUENCY:
items.sort(key=lambda x: -convert_to_int_gracefully(x[1]))
elif ordering == shared_constants.HISTOGRAM_METRIC_ORDER_DECREASING_PROPORTION:
items.sort(key=lambda x: -convert_to_float_gracefully(x[1]))
x, y = zip(*items)
chart['data'][0].update({'x': x, 'y': y})
return chart
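The three orderings are just different sort keys over the (bin, count) pairs; below is a minimal standalone illustration with a toy dict and plain sorted(). Note that the snippet above relies on Python 2 semantics, where dict.items() returns a list that can be sorted in place; on Python 3 it would need items = list(...).

hist = {'10': 1, '2': 4, '30': 9}
items = list(hist.items())
# order by integer bin value of the key:
sorted(items, key=lambda kv: int(kv[0]))     # [('2', 4), ('10', 1), ('30', 9)]
# order by decreasing frequency (integer counts):
sorted(items, key=lambda kv: -int(kv[1]))    # [('30', 9), ('2', 4), ('10', 1)]
# order by decreasing proportion works the same, just with float():
sorted(items, key=lambda kv: -float(kv[1]))  # [('30', 9), ('2', 4), ('10', 1)]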
def preprocess_matrix(matrix, num_bcs=None, use_bcs=None, use_genes=None, force_cells=None):
    if force_cells is not None:
        bc_counts = matrix.get_reads_per_bc()
        bc_indices, _, _ = cr_stats.filter_cellular_barcodes_fixed_cutoff(bc_counts, force_cells)
        matrix = matrix.select_barcodes(bc_indices)
    elif use_bcs is not None:
        bc_seqs = cr_utils.load_csv_rownames(use_bcs)
        bc_indices = matrix.bcs_to_ints(bc_seqs)
        matrix = matrix.select_barcodes(bc_indices)
    elif num_bcs is not None and num_bcs < matrix.bcs_dim:
        bc_indices = np.sort(np.random.choice(np.arange(matrix.bcs_dim), size=num_bcs, replace=False))
        matrix = matrix.select_barcodes(bc_indices)
    if use_genes is not None:
        gene_ids = cr_utils.load_csv_rownames(use_genes)
        gene_indices = matrix.gene_ids_to_ints(gene_ids)
        matrix = matrix.select_genes(gene_indices)
    matrix, _, _ = matrix.select_nonzero_axes()
    return matrix
def create_training_test_sets(self):
    # training set
    scale = self.data_interval_right - self.data_interval_left
    train_x = sp.stats.truncnorm.rvs(-2, 2, scale=0.25 * scale, size=self.data_size).astype(np.float32)
    train_x = np.sort(train_x)
    train_y = self.true_f(train_x) + 0.2 * np.random.randn(self.data_size)
    self.train_x = [train_x.reshape((train_x.shape[0], 1))]
    self.train_y = [train_y.reshape((train_y.shape[0], 1))]
    # test set
    # scale = self.test_data_interval_right - self.test_data_interval_left
    # test_x = sp.stats.truncnorm.rvs(-2, 2, scale=0.25 * scale, size=self.test_data_size).astype(np.float32)
    # test_x = np.sort(test_x)
    # test_y = self.true_f(test_x)
    self.test_x = np.arange(self.view_xrange[0], self.view_xrange[1], 0.01, dtype=np.float32)
    self.test_y = self.true_f(self.test_x)
    self.test_x = [self.test_x.reshape((self.test_x.shape[0], 1))]
    self.test_y = [self.test_y.reshape((self.test_y.shape[0], 1))]
def create_training_test_sets(self):
    # training set
    train_x = np.random.uniform(self.data_interval_left, self.data_interval_right, size=self.data_size)
    train_x = np.sort(train_x)
    train_y = self.true_f(train_x) + 3. * np.random.randn(self.data_size)
    self.train_x = [train_x.reshape((train_x.shape[0], 1))]
    self.train_y = [train_y.reshape((train_y.shape[0], 1))]
    # test set for visualisation
    self.test_x = np.arange(self.view_xrange[0], self.view_xrange[1], 0.01, dtype=np.float32)
    self.test_x = np.reshape(self.test_x, (self.test_x.shape[0], 1))
    self.test_y = self.true_f(self.test_x)
    self.test_y = np.reshape(self.test_y, (self.test_y.shape[0], 1))
    self.test_x = [self.test_x]
    self.test_y = [self.test_y]
def iter_keys_values(self, keys, inds=None, verbose=False):
    for key in keys:
        if key not in self.keys_:
            raise RuntimeError('Key %s not found in dataset. keys: %s' % (key, self.keys_))
    idx, ii = 0, 0
    total_chunks = len(self.meta_file_.chunks)
    inds = np.sort(inds) if inds is not None else None
    for chunk_idx, chunk in enumerate(progressbar(self.meta_file_.chunks, size=total_chunks, verbose=verbose)):
        data = AttrDict.load(self.get_chunk_filename(chunk_idx))
        # if inds is None:
        items = (data[key] for key in keys)
        for item in izip(*items):
            yield item
        # else:
        #     for i, item in enumerate(data[key]):
        #         if inds[ii] == idx + i:
        #             yield item
        #             ii += 1
        #             if ii >= len(inds): break
        #     idx += len(data[key])
decision_tree_regression.py (project: Python-Machine-Learning-By-Example, author: PacktPublishing)
def get_best_split(X, y):
""" Obtain the best splitting point and resulting children for the data set X, y
Args:
X, y (numpy.ndarray, data set)
criterion (gini or entropy)
Returns:
dict {index: index of the feature, value: feature value, children: left and right children}
"""
best_index, best_value, best_score, children = None, None, 1e10, None
for index in range(len(X[0])):
for value in np.sort(np.unique(X[:, index])):
groups = split_node(X, y, index, value)
impurity = weighted_mse([groups[0][1], groups[1][1]])
if impurity < best_score:
best_index, best_value, best_score, children = index, value, impurity, groups
return {'index': best_index, 'value': best_value, 'children': children}
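split_node and weighted_mse are helpers defined elsewhere in the same chapter; the following is only a plausible sketch of what this regression version expects from them (a numerical-or-categorical split plus a size-weighted MSE impurity), not necessarily the book's exact code:

import numpy as np

def split_node(X, y, index, value):
    # Partition samples into (left, right) groups, each stored as [X_subset, y_subset].
    x_index = X[:, index]
    if X[0, index].dtype.kind in ['i', 'f']:
        mask = x_index >= value      # numerical feature: threshold split
    else:
        mask = x_index == value      # categorical feature: equality split
    left = [X[~mask, :], y[~mask]]
    right = [X[mask, :], y[mask]]
    return left, right

def weighted_mse(groups):
    # Weighted average of each group's MSE around its own mean (regression impurity).
    total = sum(len(group) for group in groups)
    weighted = 0.0
    for group in groups:
        if len(group) == 0:
            continue
        weighted += len(group) / float(total) * np.mean((group - np.mean(group)) ** 2)
    return weighted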
decision_tree_submit.py (project: Python-Machine-Learning-By-Example, author: PacktPublishing)
def get_best_split(X, y, criterion):
""" Obtain the best splitting point and resulting children for the data set X, y
Args:
X, y (numpy.ndarray, data set)
criterion (gini or entropy)
Returns:
dict {index: index of the feature, value: feature value, children: left and right children}
"""
best_index, best_value, best_score, children = None, None, 1, None
for index in range(len(X[0])):
for value in np.sort(np.unique(X[:, index])):
groups = split_node(X, y, index, value)
impurity = weighted_impurity([groups[0][1], groups[1][1]], criterion)
if impurity < best_score:
best_index, best_value, best_score, children = index, value, impurity, groups
return {'index': best_index, 'value': best_value, 'children': children}
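weighted_impurity is the classification counterpart of weighted_mse; a plausible sketch of it for the gini and entropy criteria (illustrative, not necessarily the book's exact code):

import numpy as np

def gini_impurity(labels):
    if len(labels) == 0:
        return 0
    fractions = np.unique(labels, return_counts=True)[1] / float(len(labels))
    return 1 - np.sum(fractions ** 2)

def entropy(labels):
    if len(labels) == 0:
        return 0
    fractions = np.unique(labels, return_counts=True)[1] / float(len(labels))
    return -np.sum(fractions * np.log2(fractions))

criterion_function = {'gini': gini_impurity, 'entropy': entropy}

def weighted_impurity(groups, criterion='gini'):
    # Impurity of each child weighted by its share of the samples.
    total = sum(len(group) for group in groups)
    return sum(len(group) / float(total) * criterion_function[criterion](group)
               for group in groups)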
def update_image_property(self, property_name, property_data, erase_property=False):
    if isinstance(property_data, list) or isinstance(property_data, np.ndarray):
        assert len(property_data) == len(self._labels)
        property_keys = self._labels
    elif isinstance(property_data, dict) or isinstance(property_data, array_dict):
        property_keys = np.sort(property_data.keys())
        property_data = [property_data[l] for l in property_keys]
    if property_name in self._properties.keys():
        if erase_property:
            self._properties[property_name] = array_dict(property_data, keys=property_keys)
        else:
            for l, v in zip(property_keys, property_data):
                self._properties[property_name][l] = v
    else:
        print "Creating property ", property_name, " on image"
        self._properties[property_name] = array_dict(property_data, keys=property_keys)
def testBsearch(self, dtype=dtype):
    testarray = range(1, 101)
    random.shuffle(testarray)
    a = numpy.array(testarray[:50], dtype)
    b = numpy.array([0] + testarray[50:] + range(101, 103), dtype)
    a = numpy.sort(a)
    self.assertEqual(mapped_struct.bsearch(a, 0), 0)
    self.assertEqual(mapped_struct.bsearch(a, 101), len(a))
    self.assertEqual(mapped_struct.bsearch(a, 102), len(a))
    for x in a:
        ix = mapped_struct.bsearch(a, x)
        self.assertLess(ix, len(a))
        self.assertEqual(a[ix], x)
        self.assertTrue(mapped_struct.sorted_contains(a, x))
    for x in b:
        ix = mapped_struct.bsearch(a, x)
        self.assertTrue(ix >= len(a) or a[ix] != x)
        self.assertFalse(mapped_struct.sorted_contains(a, x))
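Judging from these assertions, bsearch returns the left insertion index in a sorted array and sorted_contains checks membership via the value at that index; an equivalent check with plain numpy.searchsorted (for illustration only) would be:

a = numpy.sort(numpy.array([7, 3, 11, 5], dtype=numpy.uint32))  # [ 3  5  7 11]
ix = numpy.searchsorted(a, 7)           # 2, the insertion index of 7
contains = ix < len(a) and a[ix] == 7   # True; False for values not present in a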
def get_score_bounds_from_range(Z_min, Z_max, rho_lb, rho_ub, L0_max = None):
"global variables: L0_reg_ind"
edge_values = np.vstack([Z_min * rho_lb,
Z_max * rho_lb,
Z_min * rho_ub,
Z_max * rho_ub])
if L0_max is None or L0_max == Z_min.shape[0]:
s_min = np.sum(np.min(edge_values, axis = 0))
s_max = np.sum(np.max(edge_values, axis = 0))
else:
min_values = np.min(edge_values, axis = 0)
s_min_reg = np.sum(np.sort(min_values[L0_reg_ind])[0:L0_max])
s_min_no_reg = np.sum(min_values[~L0_reg_ind])
s_min = s_min_reg + s_min_no_reg
max_values = np.max(edge_values, axis = 0)
s_max_reg = np.sum(-np.sort(-max_values[L0_reg_ind])[0:L0_max])
s_max_no_reg = np.sum(max_values[~L0_reg_ind])
s_max = s_max_reg + s_max_no_reg
return s_min, s_max
# setup weights
def get_score_bounds(Z_min, Z_max, rho_lb, rho_ub, L0_reg_ind = None, L0_max = None):
    edge_values = np.vstack([Z_min * rho_lb,
                             Z_max * rho_lb,
                             Z_min * rho_ub,
                             Z_max * rho_ub])
    if (L0_max is None) or (L0_reg_ind is None) or (L0_max == Z_min.shape[0]):
        s_min = np.sum(np.min(edge_values, axis=0))
        s_max = np.sum(np.max(edge_values, axis=0))
    else:
        min_values = np.min(edge_values, axis=0)
        s_min_reg = np.sum(np.sort(min_values[L0_reg_ind])[0:L0_max])
        s_min_no_reg = np.sum(min_values[~L0_reg_ind])
        s_min = s_min_reg + s_min_no_reg
        max_values = np.max(edge_values, axis=0)
        s_max_reg = np.sum(-np.sort(-max_values[L0_reg_ind])[0:L0_max])
        s_max_no_reg = np.sum(max_values[~L0_reg_ind])
        s_max = s_max_reg + s_max_no_reg
    return s_min, s_max
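A small worked example of the bound computation (assumes numpy as np; toy values chosen for illustration):

Z_min = np.array([-1.0, 0.0, -2.0])
Z_max = np.array([ 1.0, 3.0,  2.0])
rho_lb = np.array([-5.0, -5.0, -5.0])
rho_ub = np.array([ 5.0,  5.0,  5.0])
L0_reg_ind = np.array([True, True, False])

# per-feature edge minima are [-5, -15, -10] and maxima [5, 15, 10]
s_min, s_max = get_score_bounds(Z_min, Z_max, rho_lb, rho_ub)
# s_min == -30.0, s_max == 30.0 (every feature contributes)
s_min, s_max = get_score_bounds(Z_min, Z_max, rho_lb, rho_ub, L0_reg_ind, L0_max=1)
# s_min == -25.0, s_max == 25.0 (only the single most extreme regularized term is kept)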
def round_solution_pool(pool, constraints):
    pool.distinct().sort()
    P = pool.P
    L0_reg_ind = np.isnan(constraints['coef_set'].C_0j)
    L0_max = constraints['L0_max']
    rounded_pool = SolutionPool(P)
    for solution in pool.solutions:
        # sort from largest to smallest coefficients
        feature_order = np.argsort([-abs(x) for x in solution])
        rounded_solution = np.zeros(shape=(1, P))
        l0_norm_count = 0
        for k in range(0, P):
            j = feature_order[k]
            if not L0_reg_ind[j]:
                rounded_solution[0, j] = np.round(solution[j], 0)
            elif l0_norm_count < L0_max:
                rounded_solution[0, j] = np.round(solution[j], 0)
                l0_norm_count += L0_reg_ind[j]
        rounded_pool.add(objvals=np.nan, solutions=rounded_solution)
    rounded_pool.distinct().sort()
    return rounded_pool
def top_uncer_items(adata, pp, n, flag = None):
"""
Return top a flag list of top n uncertain item that not flag
"""
uncertain = np.abs(pp[:,0] - 0.5)
if flag != None:
addition = np.asarray(flag, dtype = int)*10# flagged items are not consider, increase their value
uncertain = uncertain + addition
if len(uncertain) <= n:
return np.nonzero(uncertain <= 10000000)[0]
sorted_uncertain = np.sort(uncertain)
thresh = sorted_uncertain[n]
return np.nonzero(uncertain <= thresh)[0]
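A toy illustration of the selection (adata is not used inside the body; pp holds per-item class probabilities):

pp = np.array([[0.50, 0.50],
               [0.90, 0.10],
               [0.55, 0.45],
               [0.20, 0.80]])
idx = top_uncer_items(None, pp, n=2)
# closeness to 0.5: |pp[:, 0] - 0.5| = [0.0, 0.4, 0.05, 0.3]
# thresh is the value at sorted position n (0.3 here), and every item at or
# below it is returned, so idx is array([0, 2, 3])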
def items_for_expert(adata, pp, n, flag):
"""
take n items for expert to consider
"""
combined_prob = 0.8*np.asarray(adata.taken_crowd_prob) + 0.2*pp[:,1]
uncertain = np.abs(combined_prob - 0.5)
if flag != None:
addition = np.asarray(flag, dtype = int)*10# flagged items are not consider, increase their value
uncertain = uncertain + addition
if len(uncertain) <= n:
return np.nonzero(uncertain <= 10000000)[0]
sorted_uncertain = np.sort(uncertain)
thresh = sorted_uncertain[n]
return np.nonzero(uncertain <= thresh)[0]
def flush():
    prints = []
    for name, vals in _since_last_flush.items():
        prints.append("{}\t{}".format(name, np.mean(list(vals.values()))))
        _since_beginning[name].update(vals)
        x_vals = np.sort(list(_since_beginning[name].keys()))
        y_vals = [_since_beginning[name][x] for x in x_vals]
        plt.clf()
        plt.plot(x_vals, y_vals)
        plt.xlabel('iteration')
        plt.ylabel(name)
        plt.savefig('generated/' + name.replace(' ', '_') + '.jpg')
    print("iter {}\t{}".format(_iter[0], "\t".join(prints)))
    _since_last_flush.clear()
    with open('log.pkl', 'wb') as f:
        pickle.dump(dict(_since_beginning), f, 4)
def plot_feature_importances(feature_names, feature_importances, N=30):
    importances = list(zip(feature_names, list(feature_importances)))
    importances = pd.DataFrame(importances, columns=["Feature", "Importance"])
    importances = importances.set_index("Feature")
    # Sort by the absolute value of the importance of the feature
    importances["sort"] = abs(importances["Importance"])
    importances = importances.sort(columns="sort", ascending=False).drop("sort", axis=1)
    importances = importances[0:N]
    # Show the most important positive feature at the top of the graph
    importances = importances.sort(columns="Importance", ascending=True)
    with plt.style.context(('ggplot')):
        fig, ax = plt.subplots(figsize=(16, 12))
        ax.tick_params(labelsize=16)
        importances.plot(kind="barh", legend=False, ax=ax)
        ax.set_frame_on(False)
        ax.set_xlabel("Relative importance", fontsize=20)
        ax.set_ylabel("Feature name", fontsize=20)
        plt.tight_layout()
        plt.title("Most important features for attack", fontsize=20).set_position([.5, 0.99])
    return fig
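Note that DataFrame.sort was deprecated in pandas 0.17 and removed in 0.20, so this snippet only runs on old pandas; on current versions the same ordering is expressed with sort_values, roughly:

importances = importances.sort_values(by="sort", ascending=False).drop("sort", axis=1)
importances = importances[0:N]
importances = importances.sort_values(by="Importance", ascending=True)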
def test_swap_random(data, seed):
    a, b = data
    np.random.seed(seed)
    a_orig, b_orig = original.swap_random(a, b)
    dcst_private._seed_numba(seed)
    a_out, b_out = dcst.swap_random(a, b)
    assert len(a_out) == len(b_out) == len(a) == len(b)
    # Each entry should be present same number of times
    ab = np.sort(np.concatenate((a, b)))
    ab_out = np.sort(np.concatenate((a_out, b_out)))
    assert np.allclose(ab, ab_out, atol=atol, equal_nan=True)
    # Check for swaps matching
    for i in range(len(a)):
        ab = np.array([a[i], b[i]])
        ab_out = np.array([a_out[i], b_out[i]])
        assert ab[0] in ab_out
        assert ab[1] in ab_out
def _hpd_interval(self, x, width):
"""
Code adapted from pymc3.stats.calc_min_interval:
https://github.com/pymc-devs/pymc3/blob/master/pymc3/stats.py
"""
x = np.sort(x)
n = len(x)
interval_idx_inc = int(np.floor(width * n))
n_intervals = n - interval_idx_inc
interval_width = x[interval_idx_inc:] - x[:n_intervals]
if len(interval_width) == 0:
raise ValueError('Too few elements for interval calculation')
min_idx = np.argmin(interval_width)
hdi_min = x[min_idx]
hdi_max = x[min_idx + interval_idx_inc]
index = ['hpd{}_{}'.format(width, x) for x in ['lower', 'upper']]
return pd.Series([hdi_min, hdi_max], index=index)
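The method is tied to its class, so here is the core computation restated as a standalone sketch and exercised on synthetic samples (assumes numpy as np):

def hpd_interval(samples, width=0.95):
    # Narrowest interval containing `width` of the sorted samples.
    x = np.sort(samples)
    n = len(x)
    interval_idx_inc = int(np.floor(width * n))
    interval_width = x[interval_idx_inc:] - x[:n - interval_idx_inc]
    min_idx = np.argmin(interval_width)
    return x[min_idx], x[min_idx + interval_idx_inc]

samples = np.random.randn(10000)
lo, hi = hpd_interval(samples)  # roughly (-1.96, 1.96) for a standard normal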
def _random_curve(self, nr_curves):
    curves = []
    for i in range(nr_curves - 1):
        curve = [(0, 0)]
        # exclude the 0 and 255
        _x = numpy.sort(random.sample(range(1, 255), 32))
        _y = numpy.sort(random.sample(range(1, 255), 32))
        # _x = numpy.sort(numpy.random.randint(1, 255, 2))
        # _y = numpy.sort(numpy.random.randint(1, 255, 2))
        # _x[0] and _x[1] can't be the same
        curve.append((_x[0], _y[0]))
        curve.append((_x[1], _y[1]))
        curve.append((255, 255))
        curves.append(curve)
    curves.append([(255, 255)])
    return curves
def test_randomized_svd(rows, cols, rank, dtype, transpose, n_iter, target_gen,
                        rgen):
    rank = min(rows, cols) - 2 if rank == 'fullrank' else rank
    A = target_gen(rows, cols, rank=rank, randstate=rgen, dtype=dtype)
    U_ref, s_ref, V_ref = utils.truncated_svd(A, k=rank)
    U, s, V = em.randomized_svd(A, rank, transpose=transpose, randstate=rgen,
                                n_iter=n_iter)
    error_U = np.abs(U.conj().T.dot(U_ref)) - np.eye(rank)
    assert_allclose(np.linalg.norm(error_U), 0, atol=1e-3)
    error_V = np.abs(V.dot(V_ref.conj().T)) - np.eye(rank)
    assert_allclose(np.linalg.norm(error_V), 0, atol=1e-3)
    assert_allclose(s.ravel() - s_ref, 0, atol=1e-3)
    # Check that singular values are returned in descending order
    assert_array_equal(s, np.sort(s)[::-1])
def ecdf(x):
    ''' Computes the empirical cumulative distribution function of a dataset.
    Args:
        x (`iterable`): Data.
    Returns:
        tuple containing:
            `numpy.ndarray`: sorted data.
            `numpy.ndarray`: cumulative distribution function of the data.
    '''
    xs = np.sort(x)
    ys = np.arange(1, len(xs) + 1) / float(len(xs))
    return xs, ys
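A quick usage sketch (assumes numpy as np):

data = np.random.exponential(scale=2.0, size=1000)
xs, ys = ecdf(data)
# ys rises from 1/len(data) to 1.0; the empirical median is roughly
# xs[np.searchsorted(ys, 0.5)], close to the true median 2*ln(2) ~ 1.39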
def sort_xy(x, y):
    ''' Sorts a pair of x and y iterables, returning arrays in order of
        ascending x.
    Args:
        x (`iterable`): a list, numpy ndarray, or other iterable to sort by.
        y (`iterable`): a list, numpy ndarray, or other iterable that is y=f(x).
    Returns:
        tuple containing:
            `iterable`: an iterable containing the sorted x elements.
            `iterable`: an iterable containing the sorted y elements.
    '''
    # zip x and y, sort by the 0th element (x) of each tuple in zip()
    _ = sorted(zip(x, y), key=itemgetter(0))
    sorted_x, sorted_y = zip(*_)
    return sorted_x, sorted_y
def compute_group(cls, data, scales, **params):
    data = data.sort_values('x')
    n = params['n']
    x_unique = data['x'].unique()
    if len(x_unique) < 2:
        # Not enough data to fit
        return pd.DataFrame()
    if data['x'].dtype.kind == 'i':
        if params['fullrange']:
            xseq = scales.x.dimension()
        else:
            xseq = np.sort(x_unique)
    else:
        if params['fullrange']:
            rangee = scales.x.dimension()
        else:
            rangee = [data['x'].min(), data['x'].max()]
        xseq = np.linspace(rangee[0], rangee[1], n)
    return predictdf(data, xseq, **params)
def bootstrap_statistics(series, statistic, n_samples=1000,
                         confidence_interval=0.95, random_state=None):
    """
    Default parameters taken from
    R's Hmisc smean.cl.boot
    """
    if random_state is None:
        random_state = np.random
    alpha = 1 - confidence_interval
    size = (n_samples, len(series))
    inds = random_state.randint(0, len(series), size=size)
    samples = series.values[inds]
    means = np.sort(statistic(samples, axis=1))
    return pd.DataFrame({'ymin': means[int((alpha / 2) * n_samples)],
                         'ymax': means[int((1 - alpha / 2) * n_samples)],
                         'y': [statistic(series)]})
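A short usage sketch (assumes pandas as pd and numpy as np; the statistic must accept an axis keyword, e.g. np.mean):

s = pd.Series(np.random.randn(200) + 5.0)
ci = bootstrap_statistics(s, np.mean,
                          random_state=np.random.RandomState(0))
# ci is a one-row DataFrame with the point estimate y and the bootstrap
# limits ymin/ymax, all close to 5.0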
def sort_base_rules(self):
""" Sort the population lexicographically by truth vector.
This should help speed up likelihood calculations.
Note, resets the filter.
"""
# np.lexsort will sort columns by rows, with the last
# row as the primary sort key, etc; so we rotate the
# truth array by 90 degrees to get it to do what we want.
new_order = np.lexsort(np.rot90(self.base_flat_truth))
self._reordering_cache = new_order
self.base_flat_durations = self.base_flat_durations[new_order]
self.base_flat_variable_weights = self.base_flat_variable_weights[new_order]
new_flat_rules = [self.base_flat_rules[i] for i in new_order]
self.base_flat_rules = new_flat_rules
self.base_flat_truth = self.base_flat_truth[new_order]
self.base_primitive_index = {
t:i for i,t in enumerate(new_flat_rules)
}
self.reset_filter()
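A tiny standalone demonstration of the np.lexsort plus np.rot90 trick used above:

import numpy as np

truth = np.array([[1, 0, 1],
                  [0, 1, 1],
                  [0, 1, 0]], dtype=bool)
# rot90 turns each original row into a lexsort key, with the first column
# acting as the primary key, so the rows come out in lexicographic order.
order = np.lexsort(np.rot90(truth))
print(truth[order].astype(int))
# [[0 1 0]
#  [0 1 1]
#  [1 0 1]]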
def number_classes(Yin, omitLabels=[]):
"""Remaps class labels to contiguous natural numbers starting at 0.
In many frameworks (e.g. caffe) class labels are mapped to indices at
the output of the CNN; hence this remapping.
Any pixels that should be ignored will have class label of -1.
"""
if Yin is None: return None
yAll = np.sort(np.unique(Yin))
yAll = [y for y in yAll if y not in omitLabels]
Yout = -1*np.ones(Yin.shape, dtype=Yin.dtype)
for yIdx, y in enumerate(yAll):
Yout[Yin==y] = yIdx
return Yout
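A quick usage sketch with a hypothetical label array (assumes numpy as np):

Y = np.array([[0, 3, 3],
              [7, 0, 9]])
print(number_classes(Y, omitLabels=[9]))
# [[ 0  1  1]
#  [ 2  0 -1]]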
def test_sort_flexible(self):
    # Test sort on flexible dtype.
    a = array(
        data=[(3, 3), (3, 2), (2, 2), (2, 1), (1, 0), (1, 1), (1, 2)],
        mask=[(0, 0), (0, 1), (0, 0), (0, 0), (1, 0), (0, 0), (0, 0)],
        dtype=[('A', int), ('B', int)])
    test = sort(a)
    b = array(
        data=[(1, 1), (1, 2), (2, 1), (2, 2), (3, 3), (3, 2), (1, 0)],
        mask=[(0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 1), (1, 0)],
        dtype=[('A', int), ('B', int)])
    assert_equal(test, b)
    assert_equal(test.mask, b.mask)
    test = sort(a, endwith=False)
    b = array(
        data=[(1, 0), (1, 1), (1, 2), (2, 1), (2, 2), (3, 2), (3, 3), ],
        mask=[(1, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 1), (0, 0), ],
        dtype=[('A', int), ('B', int)])
    assert_equal(test, b)
    assert_equal(test.mask, b.mask)
def compute_precision_mapping(pt):
    thresh_all = []
    prec_all = []
    for jj in xrange(1000):
        thresh = pt['details']['score'][:, jj]
        prec = pt['details']['precision'][:, jj]
        ind = np.argsort(thresh)  # thresh, ind = torch.sort(thresh)
        thresh = thresh[ind]
        indexes = np.unique(thresh, return_index=True)[1]
        indexes = np.sort(indexes)
        thresh = thresh[indexes]
        thresh = np.vstack((min(-1000, min(thresh) - 1), thresh[:, np.newaxis], max(1000, max(thresh) + 1)))
        prec = prec[ind]
        for i in xrange(1, len(prec)):
            prec[i] = max(prec[i], prec[i - 1])
        prec = prec[indexes]
        prec = np.vstack((prec[0], prec[:, np.newaxis], prec[-1]))
        thresh_all.append(thresh)
        prec_all.append(prec)
    precision_score = {'thresh': thresh_all, 'prec': prec_all}
    return precision_score