def encode_edges(edges, nodes):
"""Encode data with dictionary
Args:
edges (np.ndarray): np array of the form [node1, node2].
nodes (np.array): list of unique nodes
Returns:
np.ndarray: relabeled edges
Examples:
>>> import numpy as np
>>> edges = np.array([['A', 'B'], ['A', 'C']])
>>> nodes = np.array(['C', 'B', 'A'])
>>> print(encode_edges(edges, nodes))
[[2 1]
[2 0]]
"""
sidx = nodes.argsort()
relabeled_edges = sidx[np.searchsorted(nodes, edges, sorter=sidx)]
return relabeled_edges
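Since the returned codes are indices into nodes, decoding is plain fancy indexing. A minimal round-trip check, assuming encode_edges as defined above and numpy imported as np:

import numpy as np

edges = np.array([['A', 'B'], ['A', 'C']])
nodes = np.array(['C', 'B', 'A'])
codes = encode_edges(edges, nodes)
decoded = nodes[codes]              # fancy indexing inverts the encoding
assert (decoded == edges).all()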
Python searchsorted() usage examples
def get_spectrum(self, kT):
"""
Get the thermal emission spectrum given a temperature *kT* in keV.
"""
cspec_l = np.zeros(self.nchan)
mspec_l = np.zeros(self.nchan)
cspec_r = np.zeros(self.nchan)
mspec_r = np.zeros(self.nchan)
tindex = np.searchsorted(self.Tvals, kT)-1
if tindex >= self.Tvals.shape[0]-1 or tindex < 0:
return YTArray(cspec_l, "cm**3/s"), YTArray(mspec_l, "cm**3/s")
dT = (kT-self.Tvals[tindex])/self.dTvals[tindex]
# First do H,He, and trace elements
for elem in self.cosmic_elem:
cspec_l += self._make_spectrum(kT, elem, tindex+2)
cspec_r += self._make_spectrum(kT, elem, tindex+3)
# Next do the metals
for elem in self.metal_elem:
mspec_l += self._make_spectrum(kT, elem, tindex+2)
mspec_r += self._make_spectrum(kT, elem, tindex+3)
cosmic_spec = YTArray(cspec_l*(1.-dT)+cspec_r*dT, "cm**3/s")
metal_spec = YTArray(mspec_l*(1.-dT)+mspec_r*dT, "cm**3/s")
return cosmic_spec, metal_spec
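The bracket-and-blend pattern above (searchsorted to find the enclosing table rows, then a linear mix weighted by dT) works for any tabulated quantity. A minimal standalone sketch, where interp_table, Tvals, and table are hypothetical names rather than part of this class:

import numpy as np

def interp_table(Tvals, table, kT):
    # locate the bracketing row so that Tvals[i] <= kT < Tvals[i+1]
    i = np.searchsorted(Tvals, kT) - 1
    if i < 0 or i >= len(Tvals) - 1:
        return np.zeros(table.shape[1])        # outside the tabulated range
    dT = (kT - Tvals[i]) / (Tvals[i+1] - Tvals[i])
    return table[i]*(1.0 - dT) + table[i+1]*dT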
def stateindex(state, hashes, sorter):
"""
Converts state to hash and searches for the hash among hashes,
which are sorted by the sorter list.
Parameters
----------
state : ndarray
An array of one or more states
hashes : ndarray
        List of hashes to search among.
    sorter : ndarray
        Sorting indices that sort ``hashes``
        (generated from Basis.argsort).
"""
key = Basis.hash(state)
return sorter[np.searchsorted(hashes, key, sorter=sorter)]
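The sorter argument lets searchsorted work on an array that is not itself sorted. A standalone sketch of the same lookup, with toy integers standing in for Basis.hash output:

import numpy as np

hashes = np.array([42, 7, 19])                  # not sorted
sorter = np.argsort(hashes)                     # plays the role of Basis.argsort
keys = np.array([19, 42])
idx = sorter[np.searchsorted(hashes, keys, sorter=sorter)]
assert (hashes[idx] == keys).all()              # idx points into the original order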
def getNextIndex(self):
"""
Returns how many batches/sequences to load from each .data file
"""
target_value = (self.scratch_index+1)*(self.batch_memory*self.batch_size)
idx_target = np.searchsorted(self.num_points,target_value, side='right')
if target_value>self.num_points[-1] or idx_target>=len(self.num_points):
idx_target = idx_target - 2
target_value = self.num_points[idx_target]
self.idxend = self.num_points[idx_target] - self.num_points[idx_target-1]
self.nindex = idx_target
else:
while target_value<=self.num_points[idx_target]:
idx_target = idx_target - 1
self.idxend = target_value - self.num_points[idx_target]
self.nindex = idx_target
def random_pick(p, word, sampling_type):
    def weighted_pick(weights):
        # inverse-CDF draw: find where a uniform sample lands in the cumsum
        t = np.cumsum(weights)
        s = np.sum(weights)
        return int(np.searchsorted(t, np.random.rand(1)*s))
if sampling_type == 'argmax':
sample = np.argmax(p)
elif sampling_type == 'weighted':
sample = weighted_pick(p)
elif sampling_type == 'combined':
if word == ' ':
sample = weighted_pick(p)
else:
sample = np.argmax(p)
return sample
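weighted_pick is inverse-CDF sampling: the cumulative sum is an unnormalized CDF, and searchsorted finds where a uniform draw lands. A sketch of the same idea vectorized over many draws, with weighted_picks a hypothetical helper:

import numpy as np

def weighted_picks(weights, n):
    t = np.cumsum(weights)                      # unnormalized CDF
    return np.searchsorted(t, np.random.rand(n) * t[-1])

print(weighted_picks(np.array([0.1, 0.6, 0.3]), 5))   # e.g. [1 1 2 1 0]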
# test code
def reldist_linpol(tx_soa, beacon_soa):
# Interpolate between two nearest beacon samples
beacon_rx0, beacon_rx1 = beacon_soa[:, 0], beacon_soa[:, 1]
tx_rx0, tx_rx1 = tx_soa[:, 0], tx_soa[:, 1]
high_idx = np.searchsorted(beacon_rx0, tx_rx0)
low_idx = high_idx - 1
length = len(beacon_soa[:, 0])
if high_idx[-1] >= length:
high_idx[-1] = length - 1
    if low_idx[0] < 0:
        low_idx[0] = 0
weight = ((tx_rx0 - beacon_rx0[low_idx]) /
(beacon_rx0[high_idx] - beacon_rx0[low_idx]))
    weight[~np.isfinite(weight)] = 1  # guard against 0/0 and x/0 at the edges
# Reldist in samples
reldist = (tx_rx1 - (beacon_rx1[low_idx] * (1-weight) +
beacon_rx1[high_idx] * weight)) # / 2.0
return reldist
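For a strictly increasing beacon_rx0, the bracket-and-blend above is the same linear interpolation that np.interp performs on interior points:

import numpy as np

beacon_rx0 = np.array([0.0, 1.0, 2.0, 3.0])
beacon_rx1 = np.array([0.0, 10.0, 20.0, 30.0])
tx_rx0 = np.array([0.5, 1.5, 2.25])
print(np.interp(tx_rx0, beacon_rx0, beacon_rx1))   # [ 5.  15.  22.5]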
def weighted_quantile(x, weights, quantile):
I = np.argsort(x)
sort_x = x[I]
sort_w = weights[I]
acum_w = np.add.accumulate(sort_w)
norm_w = (acum_w - 0.5*sort_w)/acum_w[-1]
interpq = np.searchsorted(norm_w, [quantile])[0]
if interpq == 0:
return sort_x[0]
elif interpq == len(x):
return sort_x[-1]
else:
tmp1 = (norm_w[interpq] - quantile)/(norm_w[interpq] - norm_w[interpq-1])
tmp2 = (quantile - norm_w[interpq-1])/(norm_w[interpq] - norm_w[interpq-1])
assert tmp1>=0 and tmp2>=0 and tmp1<=1 and tmp2<=1
return sort_x[interpq-1]*tmp1 + sort_x[interpq]*tmp2
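A usage sketch, assuming weighted_quantile as defined above: with a heavy weight on the largest sample, the weighted median moves above the unweighted one.

import numpy as np

x = np.array([1.0, 2.0, 3.0, 4.0])
w = np.array([1.0, 1.0, 1.0, 5.0])
print(weighted_quantile(x, w, 0.5))   # 3.5, vs. 2.5 for the unweighted median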
def transform(self, y):
"""Transform labels to normalized encoding.
Parameters
----------
y : array-like of shape [n_samples]
Target values.
Returns
-------
y : array-like of shape [n_samples]
"""
y = column_or_1d(y, warn=True)
classes = np.unique(y)
    if len(np.intersect1d(classes, self.classes_)) < len(classes):
        diff = np.setdiff1d(classes, self.classes_)
        # np.searchsorted needs self.classes_ sorted; re-sort after appending
        # the unseen labels (note: this can renumber existing classes).
        self.classes_ = np.sort(np.hstack((self.classes_, diff)))
    return np.searchsorted(self.classes_, y)
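A standalone sketch of the core idea, using plain arrays in place of the (unnamed) encoder class: append unseen labels, keep the class list sorted, then encode by position.

import numpy as np

classes_ = np.array(['a', 'c'])                   # as if fit() saw only 'a' and 'c'
y = np.array(['a', 'b', 'c'])
diff = np.setdiff1d(np.unique(y), classes_)       # ['b'] is previously unseen
classes_ = np.sort(np.hstack((classes_, diff)))   # searchsorted needs sorted classes
print(np.searchsorted(classes_, y))               # [0 1 2]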
def get_spectrum(self, kT):
"""
Get the thermal emission spectrum given a temperature *kT* in keV.
"""
tindex = np.searchsorted(self.Tvals, kT)-1
if tindex >= self.Tvals.shape[0]-1 or tindex < 0:
        # match the three-value return below: no var_spec outside the table range
        return (YTArray(np.zeros(self.nchan), "cm**3/s"),)*2 + (None,)
dT = (kT-self.Tvals[tindex])/self.dTvals[tindex]
cspec_l = self.cosmic_spec[tindex, :]
mspec_l = self.metal_spec[tindex, :]
cspec_r = self.cosmic_spec[tindex+1, :]
mspec_r = self.metal_spec[tindex+1, :]
cosmic_spec = cspec_l*(1.-dT)+cspec_r*dT
metal_spec = mspec_l*(1.-dT)+mspec_r*dT
var_spec = None
if self.var_spec is not None:
vspec_l = self.var_spec[:, tindex, :]
vspec_r = self.var_spec[:, tindex+1, :]
var_spec = vspec_l*(1.-dT) + vspec_r*dT
return cosmic_spec, metal_spec, var_spec
# From test_resample.py, project PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda (author: SignalMedia)
def test_resample_group_info(self): # GH10914
for n, k in product((10000, 100000), (10, 100, 1000)):
dr = date_range(start='2015-08-27', periods=n // 10, freq='T')
ts = Series(np.random.randint(0, n // k, n).astype('int64'),
index=np.random.choice(dr, n))
left = ts.resample('30T').nunique()
ix = date_range(start=ts.index.min(), end=ts.index.max(),
freq='30T')
vals = ts.values
bins = np.searchsorted(ix.values, ts.index, side='right')
sorter = np.lexsort((vals, bins))
vals, bins = vals[sorter], bins[sorter]
mask = np.r_[True, vals[1:] != vals[:-1]]
mask |= np.r_[True, bins[1:] != bins[:-1]]
arr = np.bincount(bins[mask] - 1,
minlength=len(ix)).astype('int64', copy=False)
right = Series(arr, index=ix)
assert_series_equal(left, right)
def find_cutoff_rule(self, J):
"""
This function takes a value function and returns the corresponding
cutoffs of where you transition between continue and choosing a
specific model
"""
payoff_choose_f0 = self.payoff_choose_f0
payoff_choose_f1 = self.payoff_choose_f1
m, pgrid = self.m, self.pgrid
# Evaluate cost at all points on grid for choosing a model
p_c_0 = payoff_choose_f0(pgrid)
p_c_1 = payoff_choose_f1(pgrid)
# The cutoff points can be found by differencing these costs with
# the Bellman equation (J is always less than or equal to p_c_i)
lb = pgrid[np.searchsorted(p_c_1 - J, 1e-10) - 1]
ub = pgrid[np.searchsorted(J - p_c_0, -1e-10)]
return (lb, ub)
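Both cutoffs use searchsorted as a threshold finder: given a difference array that is (assumed) nondecreasing over the grid, it returns the first index where the values cross zero within tolerance. A toy sketch:

import numpy as np

diffs = np.array([-0.3, -0.1, 0.0, 0.2, 0.5])   # assumed nondecreasing
print(np.searchsorted(diffs, 1e-10))            # 3, the first strictly positive entry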
def _get_streams_index_by_time(self, local_time):
if self.is_stream:
cidx = numpy.searchsorted(self._times, local_time, 'right') - 1
return cidx
def is_first_chunk(self, idx, nb_chunks):
if self.is_stream:
cidx = numpy.searchsorted(self._chunks_in_sources, idx, 'right') - 1
idx -= self._chunks_in_sources[cidx]
if idx == 0:
return True
else:
if idx == 0:
return True
return False
def get_data(self, idx, chunk_size, padding=(0, 0), nodes=None):
if self.is_stream:
cidx = numpy.searchsorted(self._chunks_in_sources, idx, 'right') - 1
idx -= self._chunks_in_sources[cidx]
return self._sources[cidx].read_chunk(idx, chunk_size, padding, nodes), self._sources[cidx].t_start + idx*chunk_size
else:
return self.read_chunk(idx, chunk_size, padding, nodes), self.t_start + idx*chunk_size
def find_within_ordmag(x, baseline_idx):
x_ascending = np.sort(x)
baseline = x_ascending[-baseline_idx]
cutoff = max(1, round(0.1*baseline))
# Return the index corresponding to the cutoff in descending order
return len(x) - np.searchsorted(x_ascending, cutoff)
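A usage sketch, assuming the function above: count how many values fall within an order of magnitude (at least 10%) of the second-largest value.

import numpy as np

x = np.array([1, 5, 8, 40, 90, 100])
print(find_within_ordmag(x, 2))   # 3: baseline 90, cutoff 9, counts 40, 90, 100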
def _common_vocabulary_batch(self, words1, weights1, i2):
words2, weights2 = self._get_vocabulary(i2)
joint, index = numpy.unique(numpy.concatenate((words1, words2)),
return_index=True)
nw1 = numpy.zeros(len(joint), dtype=numpy.float32)
cmp = index < len(words1)
nw1[numpy.nonzero(cmp)] = weights1[index[cmp]]
nw2 = numpy.zeros(len(joint), dtype=numpy.float32)
nw2[numpy.searchsorted(joint, words2)] = weights2
return joint, nw1, nw2
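The scatter on the last line relies on numpy.unique returning a sorted array, so searchsorted gives each word's position in the joint vocabulary. A standalone sketch:

import numpy as np

joint = np.array(['a', 'b', 'c', 'd'])        # unique() output is sorted
words2 = np.array(['b', 'd'])
weights2 = np.array([2.0, 4.0])
nw2 = np.zeros(len(joint))
nw2[np.searchsorted(joint, words2)] = weights2
print(nw2)                                    # [0. 2. 0. 4.]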
def _get_selected_ids(self, gid, id_column, time_column, t_start, t_stop,
time_unit, data):
"""
Calculates the data range to load depending on the selected gid
and the provided time range (t_start, t_stop)
gid: int, gid to be loaded.
id_column: int, id of the column containing gids.
time_column: int, id of the column containing times.
t_start: pq.quantity.Quantity, start of the time range to load.
t_stop: pq.quantity.Quantity, stop of the time range to load.
time_unit: pq.quantity.Quantity, time unit of the data to load.
data: numpy array, data to load.
    Returns
        np.array of start and stop row indices into ``data`` for the
        selected gid and time range.
"""
gid_ids = np.array([0, data.shape[0]])
if id_column is not None:
gid_ids = np.array([np.searchsorted(data[:, 0], gid, side='left'),
np.searchsorted(data[:, 0], gid, side='right')])
gid_data = data[gid_ids[0]:gid_ids[1], :]
# select only requested time range
id_shifts = np.array([0, 0])
if time_column is not None:
id_shifts[0] = np.searchsorted(gid_data[:, 1],
t_start.rescale(
time_unit).magnitude,
side='left')
id_shifts[1] = (np.searchsorted(gid_data[:, 1],
t_stop.rescale(
time_unit).magnitude,
side='left') - gid_data.shape[0])
selected_ids = gid_ids + id_shifts
return selected_ids
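The paired left/right searches are the standard way to slice out all rows for one id, assuming (as this method does) that the id column is sorted. A standalone sketch:

import numpy as np

data = np.array([[1, 0.1], [1, 0.4], [2, 0.2], [3, 0.9]])   # sorted by gid
gid = 1
lo = np.searchsorted(data[:, 0], gid, side='left')
hi = np.searchsorted(data[:, 0], gid, side='right')
print(data[lo:hi])                                          # both rows with gid 1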
def test_ecdf_formal(x, data):
correct = np.searchsorted(np.sort(data), x, side='right') / len(data)
assert np.allclose(dcst.ecdf_formal(x, data), correct, atol=atol,
equal_nan=True)
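The reference value here is the empirical CDF itself: with side='right', searchsorted counts how many samples are less than or equal to each query point. A standalone sketch:

import numpy as np

data = np.array([1.0, 2.0, 2.0, 3.0])
x = np.array([0.5, 2.0, 3.0])
print(np.searchsorted(np.sort(data), x, side='right') / len(data))   # [0.   0.75 1.  ]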