import numpy as np

def PCA(data, num_components=None):
    # mean-center a copy so the caller's array is not modified in place
    data = data - data.mean(axis=0)
    # calculate the covariance matrix (rows are observations)
    R = np.cov(data, rowvar=False)
    # calculate eigenvectors & eigenvalues of the covariance matrix;
    # use 'eigh' rather than 'eig' since R is symmetric --
    # the performance gain is substantial
    V, E = np.linalg.eigh(R)
    # sort eigenvalues in decreasing order
    idx = np.argsort(V)[::-1]
    V = V[idx]
    # sort eigenvectors according to the same index
    E = E[:, idx]
    # select the first num_components eigenvectors (the desired
    # dimensionality of the reduced data; None keeps all components)
    E = E[:, :num_components]
    # project the data onto the eigenvectors and return the
    # reduced data, eigenvalues, and eigenvectors
    return np.dot(E.T, data.T).T, V, E
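# Minimal usage sketch (assumption: random data is enough to exercise the
# helper): project 200 five-dimensional points onto the top two components.
rng = np.random.RandomState(0)
sample = rng.randn(200, 5)
reduced, eigvals, eigvecs = PCA(sample, num_components=2)
print(reduced.shape)   # (200, 2)
print(eigvals)         # eigenvalues, largest first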
def __SubDoWavelets(self, waveforms):
    scales = 4
    dimensions = 10
    nspk, ls = waveforms.shape
    # 4-level haar decomposition; concatenate all coefficients per spike
    cc = pywt.wavedec(waveforms, "haar", mode="symmetric", level=scales, axis=-1)
    cc = np.hstack(cc)
    sd = list()
    # score each coefficient column with a KS test on the values that survive
    # a 3-sigma outlier cut (multimodal coefficients separate clusters best);
    # iterate over all columns, since symmetric padding can yield more than ls
    for i in range(cc.shape[1]):
        test_data = cc[:, i]
        thr_dist = np.std(test_data, ddof=1) * 3
        thr_dist_min = np.mean(test_data) - thr_dist
        thr_dist_max = np.mean(test_data) + thr_dist
        aux = test_data[(test_data > thr_dist_min) & (test_data < thr_dist_max)]
        if aux.size > 10:
            sd.append(self.__test_ks(aux))
        else:
            sd.append(0)
    # keep the `dimensions` highest-scoring coefficients
    ind = np.argsort(sd)[::-1]
    coeff = ind[:dimensions]
    waveletspk = cc[:, coeff]
    return waveletspk
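# Standalone sketch of the same coefficient selection outside the class
# (assumptions: scipy available; since __test_ks is not shown, a plain
# KS statistic against a fitted normal stands in for it).
import numpy as np
import pywt
from scipy import stats

spikes = np.random.randn(50, 64)                      # 50 fake spike waveforms
coeffs = np.hstack(pywt.wavedec(spikes, 'haar', level=4, axis=-1))
scores = [stats.kstest(stats.zscore(coeffs[:, i]), 'norm').statistic
          for i in range(coeffs.shape[1])]
top = np.argsort(scores)[::-1][:10]                   # most non-normal coefficients
features = coeffs[:, top]
print(features.shape)                                 # (50, 10)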
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor

def get_feature_importance(list_of_features):
    n_estimators = 10000
    random_state = 0
    n_jobs = 4
    # data_frame is assumed to be a module-level pandas DataFrame whose
    # last column holds the target variable
    x_train = data_frame[list_of_features]
    y_train = data_frame.iloc[:, -1]
    feat_labels = np.array(list_of_features)
    # a RandomForestRegressor is used because scikit-learn's BaggingRegressor
    # does not expose a feature_importances_ attribute
    forest = RandomForestRegressor(n_estimators=n_estimators, random_state=random_state, n_jobs=n_jobs)
    forest.fit(x_train, y_train)
    importances = forest.feature_importances_
    # rank features from most to least important
    indices = np.argsort(importances)[::-1]
    for f in range(x_train.shape[1]):
        print("%2d) %-*s %f" % (f + 1, 30, feat_labels[indices[f]],
                                importances[indices[f]]))
    plt.title("Feature Importance")
    plt.bar(range(x_train.shape[1]), importances[indices], color='lightblue', align='center')
    plt.xticks(range(x_train.shape[1]), feat_labels[indices], rotation=90)
    plt.xlim([-1, x_train.shape[1]])
    plt.tight_layout()
    plt.show()
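# Quick usage sketch on synthetic data (assumption: a small forest is enough
# for a smoke test; feature 0 is constructed to dominate the target).
import numpy as np
from sklearn.ensemble import RandomForestRegressor

X = np.random.rand(100, 4)
y = 3 * X[:, 0] + 0.1 * np.random.rand(100)
forest = RandomForestRegressor(n_estimators=50, random_state=0).fit(X, y)
print(np.argsort(forest.feature_importances_)[::-1])  # feature 0 ranked first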
def update_sort_idcs(self):
# The selected points are sorted before all the other points -- an easy
# way to achieve this is to add the maximum score to their score
if self.current_order == 0:
score = self.score_x
elif self.current_order == 1:
score = self.score_y
elif self.current_order == 2:
score = self.score_z
else:
raise AssertionError(self.current_order)
score = score.copy()
if len(self.selected_points):
score[np.array(sorted(self.selected_points))] += score.max()
self.sort_idcs = np.argsort(score)
def removeTopPCs(X, numRemovePCs):
    X_mean = X.mean(axis=0)
    # center a copy so the caller's array is left untouched
    X = X - X_mean
XXT = symmetrize(blas.dsyrk(1.0, X, lower=0))
s,U = la.eigh(XXT)
if (np.min(s) < -1e-4): raise Exception('Negative eigenvalues found')
s[s<0]=0
ind = np.argsort(s)[::-1]
U = U[:, ind]
s = s[ind]
s = np.sqrt(s)
#remove null PCs
ind = (s>1e-6)
U = U[:, ind]
s = s[ind]
V = X.T.dot(U/s)
    #print('max diff:', np.max(((U*s).dot(V.T) - X)**2))
X = (U[:, numRemovePCs:]*s[numRemovePCs:]).dot((V.T)[numRemovePCs:, :])
X += X_mean
return X
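# A numerically equivalent sketch using the SVD instead of an explicit
# eigendecomposition (assumption: numpy only; remove_top_pcs_svd is a
# hypothetical name). Removing the top k PCs is the same as zeroing the
# k largest singular values of the centered matrix.
import numpy as np

def remove_top_pcs_svd(X, k):
    mu = X.mean(axis=0)
    U, s, Vt = np.linalg.svd(X - mu, full_matrices=False)
    s[:k] = 0.0                      # drop the k strongest directions
    return U.dot(s[:, None] * Vt) + mu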
def _translate(seq, f_init, f_next, trg_eos_idx, src_sel, trg_sel,
k, cond_init_trg, normalize, n_best, **kwargs):
sample, score = gen_sample(
f_init, f_next, x=numpy.array(seq).reshape([len(seq), 1]),
eos_idx=trg_eos_idx, src_selector=src_sel, trg_selector=trg_sel,
k=k, maxlen=3*len(seq), stochastic=False, argmax=False,
cond_init_trg=cond_init_trg, **kwargs)
if normalize:
lengths = numpy.array([len(s) for s in sample])
score = score / lengths
if n_best == 1:
sidx = numpy.argmin(score)
elif n_best > 1:
sidx = numpy.argsort(score)[:n_best]
else:
        raise ValueError('n_best must be a positive integer!')
return sample[sidx], score[sidx]
def closestNeighbor(query, embedding_array, normed=False, top_k=1):
    '''Gets the indices of the top_k closest neighbors in embedding_array
    to the query point. Similarity metric is cosine.
SLOW. DO NOT USE THIS FOR RAPID COMPUTATION.
'''
embedding_array = numpy.array(embedding_array)
if not normed:
embedding_array = numpy.array([
(embedding_array[i] / numpy.linalg.norm(embedding_array[i]))
for i in range(embedding_array.shape[0])
])
## assuming embeddings are unit-normed by this point;
## norm(query) is a constant factor, so we can ignore it
dists = numpy.array([
numpy.dot(query, embedding_array[i])
for i in range(embedding_array.shape[0])
])
sorted_ixes = numpy.argsort(-1 * dists)
return sorted_ixes[:top_k]
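# Vectorized sketch of the same lookup (assumption: rows of embedding_array
# are already unit-normed; closest_neighbor_fast is a hypothetical helper).
# One matrix-vector product replaces the per-row Python loop.
def closest_neighbor_fast(query, embedding_array, top_k=1):
    sims = embedding_array.dot(query)     # cosine similarities in one shot
    return numpy.argsort(-sims)[:top_k]   # indices of the top_k most similar rows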
def testMerge(self, dtype=dtype):
a = numpy.empty((100,2), dtype=dtype)
b = numpy.empty((100,2), dtype=dtype)
merged = numpy.empty((200,2), dtype=dtype)
incompatible1 = numpy.empty((200,3), dtype=dtype)
incompatible2 = numpy.empty(200, dtype=dtype)
a[:,0] = numpy.arange(1,101)
a[:,1] = numpy.arange(2,102)
b[:,0] = numpy.arange(5,105)
b[:,1] = numpy.arange(6,106)
ref = numpy.concatenate([a,b])
ref = ref[numpy.argsort(ref[:,0])]
self.assertEqual(mapped_struct.index_merge(a, b, merged), 200)
self.assertTrue((merged == ref).all())
self.assertRaises(ValueError, mapped_struct.index_merge, a, b, incompatible1)
self.assertRaises(ValueError, mapped_struct.index_merge, a, incompatible1, merged)
self.assertRaises(ValueError, mapped_struct.index_merge, a, b, incompatible2)
self.assertRaises(ValueError, mapped_struct.index_merge, a, incompatible2, merged)
def __init__(self, pos, color, mode=None):
"""
=============== ==============================================================
**Arguments:**
pos Array of positions where each color is defined
color Array of RGBA colors.
Integer data types are interpreted as 0-255; float data types
are interpreted as 0.0-1.0
mode Array of color modes (ColorMap.RGB, HSV_POS, or HSV_NEG)
indicating the color space that should be used when
interpolating between stops. Note that the last mode value is
ignored. By default, the mode is entirely RGB.
=============== ==============================================================
"""
self.pos = np.array(pos)
order = np.argsort(self.pos)
self.pos = self.pos[order]
self.color = np.array(color)[order]
if mode is None:
mode = np.ones(len(pos))
self.mode = mode
self.stopsCache = {}
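# The stop-ordering step in isolation (assumption: numpy as np): argsort
# yields the permutation that sorts pos, and applying the same permutation
# to color keeps every stop paired with its colour.
pos_demo = np.array([0.5, 0.0, 1.0])
order_demo = np.argsort(pos_demo)   # array([1, 0, 2])
print(pos_demo[order_demo])         # [0.  0.5 1. ]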
def __loadChnTimeWave(self,f,selectChan):
times = list()
waveforms = list()
spk_startswith = "spike_{0}".format(selectChan)
for chn_unit in f["spikes"].keys():
if chn_unit.startswith(spk_startswith):
time = f["spikes"][chn_unit]["times"].value
waveform = f["spikes"][chn_unit]["waveforms"].value
times.append(time)
waveforms.append(waveform)
if times:
times = np.hstack(times)
waveforms = np.vstack(waveforms)
sort_index = np.argsort(times)
waveforms = waveforms[sort_index]
times = times[sort_index]
return times,waveforms
else:
return None,None
def __load_waveforms(self,selectChan,file_name):
spk_startswith = "spike_{0}".format(selectChan)
with hp.File(file_name,"r") as f:
times = list()
waveforms = list()
for chn_unit in f["spikes"].keys():
if chn_unit.startswith(spk_startswith):
tep_time = f["spikes"][chn_unit]["times"].value
waveform = f["spikes"][chn_unit]["waveforms"].value
times.append(tep_time)
waveforms.append(waveform)
if times:
times = np.hstack(times)
waveforms = np.vstack(waveforms)
sort_index = np.argsort(times)
waveforms = waveforms[sort_index]
return waveforms
else:
return None
def process_each_row_get_lable(row,vocabulary_index2word_label,vocabulary_word2index_label,result_list):
"""
    :param row: a list whose length is the number of labels, e.g. 2002
    :param vocabulary_index2word_label
    :param result_list
    :return: a label
    """
    label_list = list(np.argsort(row))
    label_list.reverse()
    #print("label_list:", label_list)  # a list whose length is the number of labels.
    # return the first label that is not already in result_list and is
    # neither _PAD nor _END
    for i, index in enumerate(label_list):
        #print(i, "index:", index)
        flag1 = vocabulary_index2word_label[index] not in result_list
        flag2 = index != vocabulary_word2index_label[_PAD]
        flag3 = index != vocabulary_word2index_label[_END]
        if flag1 and flag2 and flag3:
            #print("going to return ")
            return vocabulary_index2word_label[index]
# write question id and labels to file system.
def get_label_using_logits_batch(question_id_sublist, logits_batch, vocabulary_index2word_label, f, top_number=5):
print("get_label_using_logits.shape:", np.array(logits_batch).shape) # (1, 128, 2002))
for i, logits in enumerate(logits_batch):
index_list = np.argsort(logits)[-top_number:]
#print("index_list:",index_list)
index_list = index_list[::-1]
label_list = []
for index in index_list:
#print("index:",index)
label = vocabulary_index2word_label[index]
label_list.append(
label) # ('get_label_using_logits.label_list:', [u'-3423450385060590478', u'2838091149470021485', u'-3174907002942471215', u'-1812694399780494968', u'6815248286057533876'])
# print("get_label_using_logits.label_list",label_list)
write_question_id_with_labels(question_id_sublist[i], label_list, f)
f.flush()
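# The top-k idiom used above, in isolation (assumption: numpy as np):
# argsort is ascending, so take the last k indices and reverse them.
logits_demo = np.array([0.1, 0.7, 0.2, 0.9, 0.05])
print(np.argsort(logits_demo)[-3:][::-1])   # [3 1 2] -- three largest logits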
def filter_sort_unique(self, max_objval=float('Inf')):
    # filter out solutions whose objective value is too large
    if max_objval < float('inf'):
        good_idx = self.objvals <= max_objval
        self.objvals = self.objvals[good_idx]
        self.solutions = self.solutions[good_idx]
    # sort by objective value
    if len(self.objvals) > 0:
        sort_idx = np.argsort(self.objvals)
        self.objvals = self.objvals[sort_idx]
        self.solutions = self.solutions[sort_idx]
    # unique: view each row as a single void scalar so np.unique can
    # deduplicate whole solutions at once
    b = np.ascontiguousarray(self.solutions).view(
        np.dtype((np.void, self.solutions.dtype.itemsize * self.P)))
    _, unique_idx = np.unique(b, return_index=True)
    # np.unique orders its result by value, so re-sort the indices to
    # keep the solutions in objective-value order
    unique_idx = np.sort(unique_idx)
    self.objvals = self.objvals[unique_idx]
    self.solutions = self.solutions[unique_idx]
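# The row-deduplication trick above, standalone (assumption: numpy as np,
# C-contiguous 2-D array). Viewing each row as one opaque void scalar lets
# np.unique compare whole rows at once.
rows = np.array([[1, 2], [3, 4], [1, 2]])
v = np.ascontiguousarray(rows).view(np.dtype((np.void, rows.dtype.itemsize * rows.shape[1])))
_, first_idx = np.unique(v, return_index=True)
print(rows[np.sort(first_idx)])   # [[1 2] [3 4]] -- first occurrences, original order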
def round_solution_pool(pool, constraints):
pool.distinct().sort()
P = pool.P
L0_reg_ind = np.isnan(constraints['coef_set'].C_0j)
L0_max = constraints['L0_max']
rounded_pool = SolutionPool(P)
for solution in pool.solutions:
# sort from largest to smallest coefficients
feature_order = np.argsort([-abs(x) for x in solution])
rounded_solution = np.zeros(shape=(1, P))
l0_norm_count = 0
for k in range(0, P):
j = feature_order[k]
if not L0_reg_ind[j]:
rounded_solution[0, j] = np.round(solution[j], 0)
elif l0_norm_count < L0_max:
rounded_solution[0, j] = np.round(solution[j], 0)
l0_norm_count += L0_reg_ind[j]
rounded_pool.add(objvals=np.nan, solutions=rounded_solution)
rounded_pool.distinct().sort()
return rounded_pool
def listen(self, results):
score_out = results['score_out']
y_gt = results['y_gt']
sort_idx = np.argsort(score_out, axis=-1)
idx_gt = np.argmax(y_gt, axis=-1)
correct = 0
count = 0
for kk, ii in enumerate(idx_gt):
sort_idx_ = sort_idx[kk][::-1]
for jj in sort_idx_[:self.top_k]:
if ii == jj:
correct += 1
break
count += 1
# self.log.info('Correct {}/{}'.format(correct, count))
self.correct += correct
self.count += count
self.step = int(results['step'])
# self.log.info('Step {}'.format(self.step))
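# Vectorized sketch of the same top-k check (assumption: numpy as np;
# top_k_accuracy is a hypothetical helper). A row counts as correct when
# the true class index appears among its top_k highest scores.
def top_k_accuracy(score_out, y_gt, top_k):
    topk_idx = np.argsort(score_out, axis=-1)[:, -top_k:]   # k best classes per row
    gt_idx = np.argmax(y_gt, axis=-1)
    hits = (topk_idx == gt_idx[:, None]).any(axis=-1)
    return hits.sum(), len(hits)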
def nn(model, text, vectors, query, k=5):
"""
Return the nearest neighbour sentences to query
text: list of sentences
vectors: the corresponding representations for text
query: a string to search
"""
qf = encode(model, [query])
qf /= norm(qf)
scores = numpy.dot(qf, vectors.T).flatten()
sorted_args = numpy.argsort(scores)[::-1]
sentences = [text[a] for a in sorted_args[:k]]
print('QUERY: ' + query)
print('NEAREST: ')
for i, s in enumerate(sentences):
print(s, sorted_args[i])
def _spatial_sort(glyph):
from scipy.spatial.distance import cdist
from numpy import argsort
from numpy import argmin
curr = argmin(glyph[:,0])
visited = set([curr])
order = [curr]
dd = cdist(glyph, glyph)
    # greedy nearest-neighbour chain: repeatedly hop to the closest
    # unvisited point; curr must advance to the new point, otherwise every
    # step would be measured from the start instead of the end of the chain
    while len(visited)<len(glyph):
        row = dd[curr,:]
        for i in argsort(row):
            if i in visited:
                continue
            order.append(i)
            visited.add(i)
            curr = i
            break
glyph[:,:] = glyph[order,:]
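# Usage sketch (assumption: glyph is an (n, 2) float array; it is reordered
# in place so consecutive rows are spatial neighbours).
import numpy as np
pts = np.random.rand(20, 2)
_spatial_sort(pts)
print(np.linalg.norm(np.diff(pts, axis=0), axis=1).mean())   # short average hop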
def label_ranking_reciprocal_rank(label, # [sent_num]
preds): # [sent_num]
""" Calcualting the reciprocal rank according to definition,
"""
rank = np.argsort(preds)[::-1]
#pos_rank = np.take(rank, np.where(label == 1)[0])
#return np.mean(1.0 / pos_rank)
if_find = False
pos = 0
for r in rank:
pos += 1
if label[r] == 1:
first_pos_r = pos
if_find = True
break
assert(if_find)
return 1.0 / first_pos_r
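# Quick check of the reciprocal-rank helper (assumption: numpy as np).
# The best-scoring positive label sits at rank 2 here, so RR = 1/2.
label_demo = np.array([0, 1, 0, 1])
preds_demo = np.array([0.9, 0.8, 0.1, 0.3])
print(label_ranking_reciprocal_rank(label_demo, preds_demo))   # 0.5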
def _matrix_inverse(self, matrix):
"""
Computes inverse of a matrix.
"""
matrix = np.array(matrix)
n_features = matrix.shape[0]
rank = np.linalg.matrix_rank(matrix)
if rank == n_features:
return np.linalg.inv(matrix)
else:
# Matrix is not full rank, so use Hadi's technique to compute inverse
        # Reference: Ali S. Hadi (1992), "Identifying Multiple Outliers in Multivariate Data", eqs. 2.3, 2.4
eigenValues, eigenVectors = np.linalg.eig(matrix)
eigenValues = np.abs(eigenValues) # to deal with -0 values
idx = eigenValues.argsort()[::-1]
eigenValues = eigenValues[idx]
eigenVectors = eigenVectors[:, idx]
s = eigenValues[eigenValues != 0].min()
w = [1 / max(e, s) for e in eigenValues]
W = w * np.eye(n_features)
return eigenVectors.dot(W).dot(eigenVectors.T)
def get_1000G_snps(sumstats, out_file):
sf = np.loadtxt(sumstats,dtype=str,skiprows=1)
h5f = h5py.File('ref/Misc/1000G_SNP_info.h5','r')
rf = h5f['snp_chr'][:]
h5f.close()
ind1 = np.in1d(sf[:,1],rf[:,2])
ind2 = np.in1d(rf[:,2],sf[:,1])
sf1 = sf[ind1]
rf1 = rf[ind2]
### check order ###
if sum(sf1[:,1]==rf1[:,2])==len(rf1[:,2]):
        print('Good!')
    else:
        print('Order mismatch: sorting sf1 to have the same order as rf1')
O1 = np.argsort(sf1[:,1])
O2 = np.argsort(rf1[:,2])
O3 = np.argsort(O2)
sf1 = sf1[O1][O3]
out = ['hg19chrc snpid a1 a2 bp or p'+'\n']
for i in range(len(sf1[:,1])):
out.append(sf1[:,0][i]+' '+sf1[:,1][i]+' '+sf1[:,2][i]+' '+sf1[:,3][i]+' '+rf1[:,1][i]+' '+sf1[:,5][i]+' '+sf1[:,6][i]+'\n')
ff = open(out_file,"w")
ff.writelines(out)
ff.close()
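# The double-argsort alignment trick above, standalone (assumption: numpy as
# np; both key arrays hold the same items). a_keys[O1] is in sorted order,
# and composing with argsort(argsort(b_keys)) re-orders it to match b_keys.
a_keys = np.array(['s3', 's1', 's2'])
b_keys = np.array(['s2', 's3', 's1'])
O1 = np.argsort(a_keys)
O3 = np.argsort(np.argsort(b_keys))
print(a_keys[O1][O3])   # ['s2' 's3' 's1'] -- aligned with b_keys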
def plot_heatmaps(data, mis, column_label, cont, topk=30, prefix=''):
cmap = sns.cubehelix_palette(as_cmap=True, light=.9)
m, nv = mis.shape
for j in range(m):
inds = np.argsort(- mis[j, :])[:topk]
if len(inds) >= 2:
plt.clf()
order = np.argsort(cont[:,j])
subdata = data[:, inds][order].T
subdata -= np.nanmean(subdata, axis=1, keepdims=True)
subdata /= np.nanstd(subdata, axis=1, keepdims=True)
columns = [column_label[i] for i in inds]
sns.heatmap(subdata, vmin=-3, vmax=3, cmap=cmap, yticklabels=columns, xticklabels=False, mask=np.isnan(subdata))
filename = '{}/heatmaps/group_num={}.png'.format(prefix, j)
if not os.path.exists(os.path.dirname(filename)):
os.makedirs(os.path.dirname(filename))
plt.title("Latent factor {}".format(j))
plt.yticks(rotation=0)
plt.savefig(filename, bbox_inches='tight')
plt.close('all')
#plot_rels(data[:, inds], map(lambda q: column_label[q], inds), colors=cont[:, j],
# outfile=prefix + '/relationships/group_num=' + str(j), latent=labels[:, j], alpha=0.1)
def plot_top_relationships(data, corex, labels, column_label, topk=5, prefix=''):
dual = (corex.moments['X_i Y_j'] * corex.moments['X_i Z_j']).T
alpha = dual > 0.04
cy = corex.moments['ry']
m, nv = alpha.shape
for j in range(m):
inds = np.where(alpha[j] > 0)[0]
inds = inds[np.argsort(- dual[j][inds])][:topk]
if len(inds) >= 2:
if dual[j, inds[0]] > 0.1:
factor = labels[:, j]
title = '$Y_{%d}$' % j
else:
k = np.argmax(np.abs(cy[j]))
if k == j:
k = np.argsort(-np.abs(cy[j]))[1]
factor = corex.moments['X_i Z_j'][inds[0], j] * labels[:, j] + corex.moments['X_i Z_j'][inds[0], k] * labels[:, k]
title = '$Y_{%d} + Y_{%d}$' % (j, k)
plot_rels(data[:, inds], map(lambda q: column_label[q], inds), colors=factor,
outfile=prefix + '/relationships/group_num=' + str(j), title=title)
def trim(g, max_parents=False, max_children=False):
for node in g:
if max_parents:
parents = list(g.successors(node))
weights = [g.edge[node][parent]['weight'] for parent in parents]
for weak_parent in np.argsort(weights)[:-max_parents]:
g.remove_edge(node, parents[weak_parent])
if max_children:
            children = list(g.predecessors(node))
weights = [g.edge[child][node]['weight'] for child in children]
for weak_child in np.argsort(weights)[:-max_children]:
g.remove_edge(children[weak_child], node)
return g
# Misc. utilities