import numpy as np

def concatenate_sort(out_filename, in_filenames, sort_cols, metrics=None):
    in_mcs = [MoleculeCounter.open(f, 'r') for f in in_filenames]
    out_mc = MoleculeCounter.open(out_filename, mode='w')
    if metrics is None:
        metrics = in_mcs[0].get_all_metrics()
    out_mc.set_all_metrics(metrics)
    for col, array in in_mcs[0].ref_columns.items():
        out_mc.set_ref_column(col, array[:])
    sort_array = []
    # np.lexsort treats its *last* key as the primary sort key, so the
    # sort columns are reversed to put the first column last.
    for col in reversed(sort_cols):
        sort_array.append(np.concatenate([mc.get_column(col) for mc in in_mcs]))
    sort_index = np.lexsort(sort_array)
    for col in MOLECULE_INFO_COLUMNS:
        col_sorted = np.concatenate([mc.get_column(col) for mc in in_mcs])[sort_index]
        out_mc.add_many(col, col_sorted)
    for mc in in_mcs:
        mc.close()
    out_mc.save()
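# Example (added, not from the original project): np.lexsort sorts by its
# *last* key first, which is why the sort columns above are reversed.
import numpy as np

barcode = np.array([2, 1, 1, 2])
umi = np.array([9, 5, 3, 1])
order = np.lexsort((umi, barcode))  # barcode-major, umi-minor
print(order)  # [2 1 3 0]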
def sort_base_rules(self):
    """ Sort the population lexicographically by truth vector.
    This should help speed up likelihood calculations.
    Note, resets the filter.
    """
    # np.lexsort will sort columns by rows, with the last row as the
    # primary sort key, etc; so we rotate the truth array by 90 degrees
    # to get it to do what we want.
    new_order = np.lexsort(np.rot90(self.base_flat_truth))
    self._reordering_cache = new_order
    self.base_flat_durations = self.base_flat_durations[new_order]
    self.base_flat_variable_weights = self.base_flat_variable_weights[new_order]
    new_flat_rules = [self.base_flat_rules[i] for i in new_order]
    self.base_flat_rules = new_flat_rules
    self.base_flat_truth = self.base_flat_truth[new_order]
    self.base_primitive_index = {
        t: i for i, t in enumerate(new_flat_rules)
    }
    self.reset_filter()
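# Illustration (added): rotating the truth array by 90 degrees makes its
# first column the last lexsort key, so rows come out in left-to-right
# lexicographic order.
import numpy as np

truth = np.array([[1, 1, 0],
                  [0, 1, 1],
                  [1, 0, 0]])
order = np.lexsort(np.rot90(truth))
print(truth[order])
# [[0 1 1]
#  [1 0 0]
#  [1 1 0]]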
def symmetry_normalised_reflections(self, hkl):
    """Returns an array of same size as *hkl*, containing the
    corresponding symmetry-equivalent reflections of lowest
    indices.

    Example:

    >>> from ase.lattice.spacegroup import Spacegroup
    >>> sg = Spacegroup(225)  # fcc
    >>> sg.symmetry_normalised_reflections([[2, 0, 0], [0, 2, 0]])
    array([[ 0,  0, -2],
           [ 0,  0, -2]])
    """
    hkl = np.array(hkl, dtype=int, ndmin=2)
    normalised = np.empty(hkl.shape, int)
    R = self.get_rotations().transpose(0, 2, 1)
    for i, g in enumerate(hkl):
        gsym = np.dot(R, g)
        # lexsort orders the equivalent reflections; [0] picks the lowest
        j = np.lexsort(gsym.T)[0]
        normalised[i, :] = gsym[j]
    return normalised
def unique_reflections(self, hkl):
    """Returns a subset *hkl* containing only the symmetry-unique
    reflections.

    Example:

    >>> from ase.lattice.spacegroup import Spacegroup
    >>> sg = Spacegroup(225)  # fcc
    >>> sg.unique_reflections([[ 2,  0,  0],
    ...                        [ 0, -2,  0],
    ...                        [ 2,  2,  0],
    ...                        [ 0, -2, -2]])
    array([[2, 0, 0],
           [2, 2, 0]])
    """
    hkl = np.array(hkl, dtype=int, ndmin=2)
    hklnorm = self.symmetry_normalised_reflections(hkl)
    perm = np.lexsort(hklnorm.T)
    iperm = perm.argsort()
    xmask = np.abs(np.diff(hklnorm[perm], axis=0)).any(axis=1)
    mask = np.concatenate(([True], xmask))
    imask = mask[iperm]
    return hkl[imask]
def _get_new_id_seq(pos, numbers):
    """
    A helper function to produce the new sequence of the transformed
    structure. The algorithm sorts the positions back to the initial
    order and uses that index to sort the atomic numbers.
    """
    # map the atom positions into the interval [0, 1)
    pos = np.around(pos, decimals=3)
    func_tofrac = np.vectorize(lambda x: round(x % 1, 3))
    o_pos = func_tofrac(pos)
    z, y, x = o_pos[:, 2], o_pos[:, 1], o_pos[:, 0]
    inds = np.lexsort((z, y, x))
    return inds
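# Illustration (added): np.lexsort((z, y, x)) uses x as the primary key,
# then y, then z to break ties.
import numpy as np

x = np.array([0.1, 0.1, 0.0])
y = np.array([0.2, 0.1, 0.9])
z = np.array([0.5, 0.5, 0.5])
print(np.lexsort((z, y, x)))  # [2 1 0]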
def _get_new_id_seq(pos, numbers):
    """
    A helper function to produce the new sequence of the transformed
    structure. The algorithm sorts the positions back to the initial
    order and uses that index to sort the atomic numbers.
    """
    # map the atom positions into the interval [0, 1)
    pos = np.around(pos, decimals=5)
    func_tofrac = np.vectorize(lambda x: round(x % 1, 3))
    o_pos = func_tofrac(pos)
    z, y, x = o_pos[:, 2], o_pos[:, 1], o_pos[:, 0]
    inds = np.lexsort((z, y, x))
    return inds
def paretoSorting(x0, x1):
    """Split points into successive Pareto fronts (both objectives minimized)."""
    fronts = list()
    idx = np.lexsort((x1, x0))  # sort by x0, ties broken by x1
    fronts.append(list())
    fronts[-1].append(idx[0])
    for i0 in idx[1:]:
        if x1[i0] >= x1[fronts[-1][-1]]:
            # dominated by the tail of the newest front: open a new front
            fronts.append(list())
            fronts[-1].append(i0)
        else:
            # append to the first front whose tail does not dominate it
            for i1 in range(0, len(fronts)):
                if x1[i0] < x1[fronts[i1][-1]]:
                    fronts[i1].append(i0)
                    break
    return (fronts, idx)
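# Usage sketch (added): minimizing both objectives, (1, 3) and (2, 1) are
# mutually non-dominating and form the first front; (3, 2) is dominated
# by (2, 1).
import numpy as np

x0 = np.array([1.0, 2.0, 3.0])
x1 = np.array([3.0, 1.0, 2.0])
fronts, idx = paretoSorting(x0, x1)
print(fronts)  # [[0, 1], [2]]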
def _set_sparse_diagonal(rows, cols, data, preferences):
    idx = np.where(rows == cols)
    data[idx] = preferences[rows[idx]]
    mask = np.ones(preferences.shape, dtype=bool)
    mask[rows[idx]] = False
    diag_other = np.argwhere(mask).T[0]
    rows = np.concatenate((rows, diag_other))
    cols = np.concatenate((cols, diag_other))
    data = np.concatenate((data, preferences[mask]))
    # return data sorted by row, with column as the tie-breaker
    idx_sorted_left_ori = np.lexsort((cols, rows))
    rows = rows[idx_sorted_left_ori]
    cols = cols[idx_sorted_left_ori]
    data = data[idx_sorted_left_ori]
    return rows, cols, data
# test_resample.py, from project PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda (author: SignalMedia)
def test_resample_group_info(self):  # GH10914
    for n, k in product((10000, 100000), (10, 100, 1000)):
        dr = date_range(start='2015-08-27', periods=n // 10, freq='T')
        ts = Series(np.random.randint(0, n // k, n).astype('int64'),
                    index=np.random.choice(dr, n))

        left = ts.resample('30T').nunique()
        ix = date_range(start=ts.index.min(), end=ts.index.max(),
                        freq='30T')

        vals = ts.values
        bins = np.searchsorted(ix.values, ts.index, side='right')

        sorter = np.lexsort((vals, bins))
        vals, bins = vals[sorter], bins[sorter]

        mask = np.r_[True, vals[1:] != vals[:-1]]
        mask |= np.r_[True, bins[1:] != bins[:-1]]

        arr = np.bincount(bins[mask] - 1,
                          minlength=len(ix)).astype('int64', copy=False)
        right = Series(arr, index=ix)

        assert_series_equal(left, right)
def unique(a):
    """ Returns unique 2D array entries of a given array """
    order = np.lexsort(a.T)
    a = a[order]
    diff = np.diff(a, axis=0)
    ui = np.ones(len(a), dtype=bool)  # np.bool is deprecated; use the builtin
    ui[1:] = (diff != 0).any(axis=1)
    # Return value(s)
    return a[ui]
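# Illustration (added): the lexsort-then-diff idiom deduplicates rows by
# sorting them so duplicates become adjacent, then keeping each row that
# differs from its predecessor.
import numpy as np

rows = np.array([[1, 2], [0, 1], [1, 2]])
print(unique(rows))
# [[0 1]
#  [1 2]]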
###############################################################################
# FUNCTIONS FOR MOLECULAR PROPERTIES
###############################################################################
def _sort(group_idx, a, size, fill_value, dtype=None, reversed_=False):
    if np.iscomplexobj(a):
        raise NotImplementedError("a must be real, could use np.lexsort or "
                                  "sort with recarray for complex.")
    if not (np.isscalar(fill_value) or len(fill_value) == 0):
        raise ValueError("fill_value must be scalar or an empty sequence")
    # Complex-key trick: numpy sorts complex values by real part, then
    # imaginary part, so group_idx + 1j*a sorts by group first, then by a
    # (and -1j*a sorts a descending within each group).
    if reversed_:
        order_group_idx = np.argsort(group_idx + -1j * a, kind='mergesort')
    else:
        order_group_idx = np.argsort(group_idx + 1j * a, kind='mergesort')
    counts = np.bincount(group_idx, minlength=size)
    if np.ndim(a) == 0:
        a = np.full(size, a, dtype=type(a))
    ret = np.split(a[order_group_idx], np.cumsum(counts)[:-1])
    ret = np.asarray(ret, dtype=object)
    if np.isscalar(fill_value):
        fill_untouched(group_idx, ret, fill_value)
    return ret
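# Check (added): the complex-key argsort is equivalent to a two-key
# np.lexsort, since numpy orders complex values by real part, then
# imaginary part.
import numpy as np

group_idx = np.array([1, 0, 1, 0])
a = np.array([3.0, 5.0, 1.0, 2.0])
by_complex = np.argsort(group_idx + 1j * a, kind='mergesort')
by_lexsort = np.lexsort((a, group_idx))  # group_idx primary, a secondary
print(by_complex, by_lexsort)  # both [3 1 2 0]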
def prune(self, question, paragraphs: List[ExtractedParagraph]):
    if not self.filter_dist_one and len(paragraphs) == 1:
        return paragraphs

    tfidf = TfidfVectorizer(strip_accents="unicode", stop_words=self.stop.words)
    text = []
    for para in paragraphs:
        text.append(" ".join(" ".join(s) for s in para.text))
    try:
        para_features = tfidf.fit_transform(text)
        q_features = tfidf.transform([" ".join(question)])
    except ValueError:
        return []

    dists = pairwise_distances(q_features, para_features, "cosine").ravel()
    # in case of ties, use the earlier paragraph
    sorted_ix = np.lexsort(([x.start for x in paragraphs], dists))

    if self.filter_dist_one:
        return [paragraphs[i] for i in sorted_ix[:self.n_to_select] if dists[i] < 1.0]
    else:
        return [paragraphs[i] for i in sorted_ix[:self.n_to_select]]
def dists(self, question, paragraphs: List[ExtractedParagraph]):
    tfidf = TfidfVectorizer(strip_accents="unicode", stop_words=self.stop.words)
    text = []
    for para in paragraphs:
        text.append(" ".join(" ".join(s) for s in para.text))
    try:
        para_features = tfidf.fit_transform(text)
        q_features = tfidf.transform([" ".join(question)])
    except ValueError:
        return []

    dists = pairwise_distances(q_features, para_features, "cosine").ravel()
    # in case of ties, use the earlier paragraph
    sorted_ix = np.lexsort(([x.start for x in paragraphs], dists))

    if self.filter_dist_one:
        return [(paragraphs[i], dists[i]) for i in sorted_ix[:self.n_to_select] if dists[i] < 1.0]
    else:
        return [(paragraphs[i], dists[i]) for i in sorted_ix[:self.n_to_select]]
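# Illustration (added): dists is the last key and therefore primary;
# paragraph start position only matters between equal distances.
import numpy as np

dists = np.array([0.3, 0.1, 0.3])
starts = [50, 10, 5]
print(np.lexsort((starts, dists)))  # [1 2 0]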
def unique_rows(a):
    """
    Return a boolean mask selecting the unique rows of a 2-D array
    (helper for the sklearn GP fitting code).

    Parameters:
        a: the input 2-D array.

    Returns:
        mask of unique rows
    """
    order = np.lexsort(a.T)
    reorder = np.argsort(order)
    a = a[order]
    diff = np.diff(a, axis=0)
    ui = np.ones(len(a), 'bool')
    ui[1:] = (diff != 0).any(axis=1)
    return ui[reorder]
def pack_distribution(self, p_sparse, p_dense=None):
    """
    convenience routine to translate a distribution from a dictionary to
    a dense array, using this state enumeration
    """
    if p_dense is None:
        p_dense = numpy.zeros((self.size, ), dtype=float)  # numpy.float is deprecated
    # guard against case where p_sparse is empty
    if len(p_sparse) == 0:
        return p_dense
    p_states, p_values = domain.from_mapping(p_sparse)
    # now sort the states, keeping them synchronised with the
    # ordering of the values
    order = numpy.lexsort(p_states)
    p_states = p_states[:, order]
    p_values = p_values[order]
    p_indices = self.indices(p_states)
    p_dense[p_indices] = p_values
    return p_dense
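# Illustration (added), assuming p_states is an (ndim, nstates) array as
# implied by the column indexing above: lexsort orders the state columns
# with the last coordinate row as the primary key.
import numpy as np

states = np.array([[0, 1, 0, 1],   # coordinate 0 of four states
                   [2, 0, 1, 0]])  # coordinate 1: primary sort key
order = np.lexsort(states)
print(states[:, order])
# [[1 1 0 0]
#  [0 0 1 2]]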
def _make_feed_dict(self, X, y):
    # Make the dictionary mapping tensor placeholders to input data.
    if self.is_sparse_:
        x_inds = np.vstack(X.nonzero())
        x_srt = np.lexsort(x_inds[::-1, :])
        x_inds = x_inds[:, x_srt].T.astype(np.int64)
        x_vals = np.squeeze(np.array(
            X[x_inds[:, 0], x_inds[:, 1]])).astype(np.float32)
        x_shape = np.array(X.shape).astype(np.int64)
        feed_dict = {self._x_inds: x_inds,
                     self._x_vals: x_vals,
                     self._x_shape: x_shape}
    else:
        feed_dict = {self._x: X.astype(np.float32)}

    if self._output_size == 1:
        feed_dict[self._y] = y.astype(np.float32)
    else:
        feed_dict[self._y] = y.astype(np.int32)

    return feed_dict
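# Illustration (added): reversing the key rows puts the row index last,
# so lexsort yields row-major ordering of the sparse (row, col) pairs,
# which is what a sparse tensor feed typically expects.
import numpy as np

rows = np.array([1, 0, 1, 0])
cols = np.array([0, 1, 1, 0])
inds = np.vstack((rows, cols))
order = np.lexsort(inds[::-1, :])
print(inds[:, order])
# [[0 0 1 1]
#  [0 1 0 1]]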
def multiarray_sort(arr, srt=[0]):
    '''
    Sort the columns of a two-dimensional array by the values in a
    given hierarchy of rows.

    Parameters
    ----------
    arr : array
        A two-dimensional numpy array.
    srt : list
        Row indices specifying the sort hierarchy; the first entry is
        the primary key.

    Returns
    -------
    array
        A sorted array.
    '''
    ind = np.lexsort([arr[i] for i in reversed(srt)])
    return (arr.T[ind]).T
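# Usage sketch (added): sort columns primarily by row 0, breaking ties
# with row 1.
import numpy as np

arr = np.array([[2, 1, 2],
                [0, 5, 3]])
print(multiarray_sort(arr, srt=[0, 1]))
# [[1 2 2]
#  [5 0 3]]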
def main(args, outs):
    with cr_mol_counter.MoleculeCounter.open(args.molecule_h5, 'r') as in_mc:
        with cr_mol_counter.MoleculeCounter.open(outs.merged_molecules, 'w') as out_mc:
            remapped_gem_groups = remap_gems(in_mc.get_column('gem_group'), args.gem_group_index, args.library_id)
            sort_index = np.lexsort([remapped_gem_groups])

            for col in cr_mol_counter.MOLECULE_INFO_COLUMNS:
                if col == 'gem_group':
                    arr = remapped_gem_groups
                else:
                    arr = in_mc.get_column(col)
                out_mc.add_many(col, arr[sort_index])

            for col in cr_mol_counter.MOLECULE_REF_COLUMNS:
                array = in_mc.get_ref_column(col)
                out_mc.set_ref_column(col, array)

            out_metrics = in_mc.get_all_metrics()
            gg_metrics = {}
            for gg, metrics in in_mc.get_metric(cr_mol_counter.GEM_GROUPS_METRIC).items():
                for ng, (sid, og) in args.gem_group_index.items():
                    if sid == args.library_id and og == gg:
                        gg_metrics[int(ng)] = metrics
            out_metrics[cr_mol_counter.GEM_GROUPS_METRIC] = gg_metrics
            out_mc.set_all_metrics(out_metrics)
def gini(actual, pred, cmpcol=0, sortcol=1):
    assert len(actual) == len(pred)
    # np.float is deprecated; 'all' also shadows the builtin, hence 'arr'
    arr = np.asarray(np.c_[actual, pred, np.arange(len(actual))], dtype=float)
    # sort by predicted value descending, breaking ties by original order
    arr = arr[np.lexsort((arr[:, 2], -1 * arr[:, 1]))]
    totalLosses = arr[:, 0].sum()
    giniSum = arr[:, 0].cumsum().sum() / totalLosses
    giniSum -= (len(actual) + 1) / 2.
    return giniSum / len(actual)
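# Sanity check (added): a ranking that puts all positives ahead of all
# negatives scores the same as the actuals against themselves, i.e. the
# maximum for this label vector.
actual = [1, 0, 1, 0, 1]
pred = [0.9, 0.3, 0.8, 0.1, 0.6]
print(gini(actual, pred))    # 0.2
print(gini(actual, actual))  # 0.2 -> normalized Gini of 1.0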
def test_lexsort(self, level=rlevel):
    # Lexsort memory error
    v = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
    assert_equal(np.lexsort(v), 0)
def test_lexsort_invalid_sequence(self):
    # Issue gh-4123
    class BuggySequence(object):
        def __len__(self):
            return 4

        def __getitem__(self, key):
            raise KeyError

    assert_raises(KeyError, np.lexsort, BuggySequence())
def test_mem_lexsort_strings(self, level=rlevel):
    # Ticket #298
    lst = ['abc', 'cde', 'fgh']
    np.lexsort((lst,))
def test_lexsort_buffer_length(self):
    # Ticket #1217, don't segfault.
    a = np.ones(100, dtype=np.int8)
    b = np.ones(100, dtype=np.int32)
    i = np.lexsort((a[::-1], b))
    assert_equal(i, np.arange(100, dtype=int))  # np.int is deprecated
def test_basic(self):
    a = [1, 2, 1, 3, 1, 5]
    b = [0, 4, 5, 6, 2, 3]
    idx = np.lexsort((b, a))
    expected_idx = np.array([0, 4, 2, 1, 3, 5])
    assert_array_equal(idx, expected_idx)

    x = np.vstack((b, a))
    idx = np.lexsort(x)
    assert_array_equal(idx, expected_idx)
    assert_array_equal(x[1][idx], np.sort(x[1]))
def test_object(self):  # gh-6312
    a = np.random.choice(10, 1000)
    b = np.random.choice(['abc', 'xy', 'wz', 'efghi', 'qwst', 'x'], 1000)

    for u in a, b:
        left = np.lexsort((u.astype('O'),))
        right = np.argsort(u, kind='mergesort')
        assert_array_equal(left, right)

    for u, v in (a, b), (b, a):
        idx = np.lexsort((u, v))
        assert_array_equal(idx, np.lexsort((u.astype('O'), v)))
        assert_array_equal(idx, np.lexsort((u, v.astype('O'))))

        u, v = np.array(u, dtype='object'), np.array(v, dtype='object')
        assert_array_equal(idx, np.lexsort((u, v)))
def preCompute(rowBased_row_array, rowBased_col_array, S_rowBased_data_array):
    """
    format affinity/similarity matrix
    """
    # Get parameters
    data_len = len(S_rowBased_data_array)
    row_indptr = sparseAP_cy.getIndptr(rowBased_row_array)
    if row_indptr[-1] != data_len:
        row_indptr = np.concatenate((row_indptr, np.array([data_len])))
    row_to_col_ind_arr = np.lexsort((rowBased_row_array, rowBased_col_array))
    colBased_row_array = sparseAP_cy.npArrRearrange_int_para(rowBased_row_array, row_to_col_ind_arr)
    colBased_col_array = sparseAP_cy.npArrRearrange_int_para(rowBased_col_array, row_to_col_ind_arr)
    col_to_row_ind_arr = np.lexsort((colBased_col_array, colBased_row_array))
    col_indptr = sparseAP_cy.getIndptr(colBased_col_array)
    if col_indptr[-1] != data_len:
        col_indptr = np.concatenate((col_indptr, np.array([data_len])))
    kk_col_index = sparseAP_cy.getKKIndex(colBased_row_array, colBased_col_array)

    # Initialize matrices A, R
    A_rowbased_data_array = np.zeros(data_len)
    R_rowbased_data_array = np.zeros(data_len)

    # Add a small random value to remove degeneracies
    random_state = np.random.RandomState(0)
    S_rowBased_data_array += 1e-12 * random_state.randn(data_len) * (np.amax(S_rowBased_data_array) - np.amin(S_rowBased_data_array))

    # Cast index arrays to the int type expected by the cython code
    # (np.int was an alias of the builtin int and is deprecated)
    row_to_col_ind_arr = row_to_col_ind_arr.astype(int)
    col_to_row_ind_arr = col_to_row_ind_arr.astype(int)

    return S_rowBased_data_array, A_rowbased_data_array, R_rowbased_data_array, col_indptr, row_indptr, row_to_col_ind_arr, col_to_row_ind_arr, kk_col_index
def sort_by_tfidf(question, paragraphs):
    tfidf = TfidfVectorizer(strip_accents="unicode", stop_words=spacy.en.STOP_WORDS,
                            decode_error='replace')
    try:
        para_features = tfidf.fit_transform(paragraphs)
        q_features = tfidf.transform([question])
    except ValueError:
        return [(i, 0.0) for i in range(len(paragraphs))]

    dists = pairwise_distances(q_features, para_features, "cosine").ravel()
    # in case of ties, fall back to the paragraph text as a secondary key
    sorted_ix = np.lexsort((paragraphs, dists))
    return [(i, 1.0 - dists[i]) for i in sorted_ix]
def equivalent_reflections(self, hkl):
    """Return all equivalent reflections to the list of Miller indices
    in hkl.

    Example:

    >>> from ase.lattice.spacegroup import Spacegroup
    >>> sg = Spacegroup(225)  # fcc
    >>> sg.equivalent_reflections([[0, 0, 2]])
    array([[ 0,  0, -2],
           [ 0, -2,  0],
           [-2,  0,  0],
           [ 2,  0,  0],
           [ 0,  2,  0],
           [ 0,  0,  2]])
    """
    hkl = np.array(hkl, dtype='int', ndmin=2)
    rot = self.get_rotations()
    n, nrot = len(hkl), len(rot)
    R = rot.transpose(0, 2, 1).reshape((3 * nrot, 3)).T
    refl = np.dot(hkl, R).reshape((n * nrot, 3))
    ind = np.lexsort(refl.T)
    refl = refl[ind]
    diff = np.diff(refl, axis=0)
    mask = np.any(diff, axis=1)
    return np.vstack((refl[mask], refl[-1, :]))