def test_partition_cdtype(self):
d = np.array([('Galahad', 1.7, 38), ('Arthur', 1.8, 41),
('Lancelot', 1.9, 38)],
dtype=[('name', '|S10'), ('height', '<f8'), ('age', '<i4')])
tgt = np.sort(d, order=['age', 'height'])
assert_array_equal(np.partition(d, range(d.size),
order=['age', 'height']),
tgt)
assert_array_equal(d[np.argpartition(d, range(d.size),
order=['age', 'height'])],
tgt)
for k in range(d.size):
assert_equal(np.partition(d, k, order=['age', 'height'])[k],
tgt[k])
assert_equal(d[np.argpartition(d, k, order=['age', 'height'])][k],
tgt[k])
d = np.array(['Galahad', 'Arthur', 'zebra', 'Lancelot'])
tgt = np.sort(d)
assert_array_equal(np.partition(d, range(d.size)), tgt)
for k in range(d.size):
assert_equal(np.partition(d, k)[k], tgt[k])
assert_equal(d[np.argpartition(d, k)][k], tgt[k])
Python argpartition() example source code
def format_lines(video_ids, predictions, labels, top_k):
batch_size = len(video_ids)
for video_index in range(batch_size):
n_recall = max(int(numpy.sum(labels[video_index])), 1)
# labels
label_indices = numpy.argpartition(labels[video_index], -n_recall)[-n_recall:]
label_predictions = [(class_index, predictions[video_index][class_index])
for class_index in label_indices]
label_predictions = sorted(label_predictions, key=lambda p: -p[1])
label_str = "\t".join(["%d\t%f"%(x,y) for x,y in label_predictions])
# predictions
top_k_indices = numpy.argpartition(predictions[video_index], -top_k)[-top_k:]
top_k_predictions = [(class_index, predictions[video_index][class_index])
for class_index in top_k_indices]
top_k_predictions = sorted(top_k_predictions, key=lambda p: -p[1])
top_k_str = "\t".join(["%d\t%f"%(x,y) for x,y in top_k_predictions])
# compute PERR
top_n_indices = numpy.argpartition(predictions[video_index], -n_recall)[-n_recall:]
positives = [labels[video_index][class_index]
for class_index in top_n_indices]
perr = sum(positives) / float(n_recall)
# URL
url = "https://www.youtube.com/watch?v=" + video_ids[video_index].decode('utf-8')
yield url + "\t" + str(1-perr) + "\t" + top_k_str + "\t" + label_str + "\n"
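The argpartition calls above use the standard top-k idiom: partition on -k so the k largest scores occupy the last k slots, then fully sort only that small slice. A minimal sketch of the pattern on a toy score vector (names here are illustrative, not from the project):

import numpy as np

scores = np.array([0.1, 0.9, 0.3, 0.7, 0.05, 0.6])
k = 3
top_k = np.argpartition(scores, -k)[-k:]      # indices of the k largest scores, unordered
top_k = top_k[np.argsort(-scores[top_k])]     # order those k indices by descending score
print(top_k, scores[top_k])                   # [1 3 5] [0.9 0.7 0.6]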
def argpartition(a, kth, axis=-1):
"""Returns the indices that would partially sort an array.
Args:
a (cupy.ndarray): Array to be sorted.
kth (int or sequence of ints): Element index to partition by. If
supplied with a sequence of k-th values, all of the indexed elements
are partitioned into their sorted positions at once.
axis (int or None): Axis along which to sort. Default is -1, which
means sort along the last axis. If None is supplied, the array is
flattened before sorting.
Returns:
cupy.ndarray: Array of the same type and shape as ``a``.
.. note::
Due to its implementation, ``cupy.argpartition`` fully sorts the
given array, just as ``cupy.argsort`` does. It also does not support the
``kind`` and ``order`` parameters that ``numpy.argpartition`` supports.
.. seealso:: :func:`numpy.argpartition`
"""
return a.argpartition(kth, axis=axis)
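A minimal usage sketch, assuming CuPy is installed and a GPU is available:

import cupy

a = cupy.array([3.0, 1.0, 4.0, 1.5, 9.0, 2.6])
idx = cupy.argpartition(a, 3)        # the 3 smallest values end up in the first 3 positions
print(cupy.asnumpy(a[idx][:3]))      # the three smallest values (order not guaranteed by the API)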
def CSMToBinary(D, Kappa):
"""
Turn a cross-similarity matrix into a binary cross-similarity matrix
If Kappa = 0, take all neighbors
If Kappa < 1 it is the fraction of mutual neighbors to consider
Otherwise Kappa is the number of mutual neighbors to consider
"""
N = D.shape[0]
M = D.shape[1]
if Kappa == 0:
return np.ones((N, M))
elif Kappa < 1:
NNeighbs = int(np.round(Kappa*M))
else:
NNeighbs = Kappa
J = np.argpartition(D, NNeighbs, 1)[:, 0:NNeighbs]
I = np.tile(np.arange(N)[:, None], (1, NNeighbs))
V = np.ones(I.size)
[I, J] = [I.flatten(), J.flatten()]
ret = sparse.coo_matrix((V, (I, J)), shape=(N, M))
return ret.toarray()
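A small usage sketch of CSMToBinary on a random distance matrix (illustrative values; assumes the numpy/scipy imports used by the snippet):

import numpy as np
from scipy import sparse

np.random.seed(0)
D = np.random.rand(5, 8)            # 5 x 8 cross-similarity (distance) matrix
B = CSMToBinary(D, Kappa=0.25)      # keep the nearest 25% of columns in each row
print(B.shape, B.sum(axis=1))       # (5, 8), and two ones per row since round(0.25 * 8) = 2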
def closest_docs(self, query, k=1):
"""Closest docs by dot product between query and documents
in tfidf weighted word vector space.
"""
spvec = self.text2spvec(query)
res = spvec * self.doc_mat
if len(res.data) <= k:
o_sort = np.argsort(-res.data)
else:
o = np.argpartition(-res.data, k)[0:k]
o_sort = o[np.argsort(-res.data[o])]
doc_scores = res.data[o_sort]
doc_ids = [self.get_doc_id(i) for i in res.indices[o_sort]]
return doc_ids, doc_scores
def bottom_top_k_along_row(arr, k, ordered=True):
""" bottom and top k of a 2d np.array, along the rows
http://stackoverflow.com/questions/6910641/how-to-get-indices-of-n-maximum-values-in-a-numpy-array/18691983
"""
assert k>0, "bottom_top_k_along_row/column() requires k>0."
rows = arr.shape[0]
if ordered:
tmp = np.argsort(arr, axis=1)
idx_bot = tmp[:, :k]
idx_top = tmp[:,-k:]
else:
idx_bot = np.argpartition(arr, k, axis=1)[:,:k]
idx_top = np.argpartition(arr, -k, axis=1)[:,-k:]
indices = np.concatenate((idx_bot, idx_top), axis=1)
vals = arr[np.repeat(np.arange(rows), 2*k), indices.ravel()].reshape(rows,2*k)
return vals, indices
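A short sketch of bottom_top_k_along_row on a small matrix (illustrative values):

import numpy as np

arr = np.array([[5., 1., 9., 3., 7.],
                [2., 8., 0., 6., 4.]])
vals, idx = bottom_top_k_along_row(arr, k=2, ordered=True)
print(idx)    # [[1 3 4 2], [2 0 3 1]]: columns of the 2 smallest then the 2 largest per row
print(vals)   # [[1. 3. 7. 9.], [0. 2. 6. 8.]]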
def top_k_recommendations(self, sequence, k=10, exclude=None, **kwargs):
if exclude is None:
exclude = []
last_item = int(sequence[-1][0])
if last_item not in self.previous_recommendations:
self.get_all_recommendations(last_item)
all_recommendations = deepcopy(self.previous_recommendations[last_item])
for s in sequence:
all_recommendations[int(s[0])] = 0
for i in exclude:
all_recommendations[i] = 0
ranking = np.zeros(self.n_items)
for i, x in enumerate(all_recommendations.most_common(k)):
ranking[x[0]] = k-i
return np.argpartition(-ranking, range(k))[:k]
def top_k_recommendations(self, sequence, user_id=None, k=10, exclude=None):
''' Receives a sequence of (id, rating) pairs and produces k recommendations (as a list of ids)
'''
if exclude is None:
exclude = []
last_item = sequence[-1][0]
output = np.dot(self.V_user_item[user_id, :], self.V_item_user.T) + np.dot(self.V_prev_next[last_item, :], self.V_next_prev.T)
# Give already-seen items a score of -inf so they are excluded from the recommendations
output[[i[0] for i in sequence]] = -np.inf
output[exclude] = -np.inf
# find top k according to output
return list(np.argpartition(-output, range(k))[:k])
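The np.argpartition(-output, range(k))[:k] call used here (and in the variants below) relies on the fact that passing a sequence of kth values puts each of those positions into its sorted place, so the first k indices come back already ordered by descending score. A minimal sketch (illustrative values):

import numpy as np

output = np.array([0.2, 0.9, 0.1, 0.5, 0.7])
k = 3
top = np.argpartition(-output, range(k))[:k]
print(top)    # [1 4 3]: the k highest-scoring item ids, best first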
def top_k_recommendations(self, sequence, user_id=None, k=10, exclude=None):
''' Receives a sequence of (id, rating) pairs and produces k recommendations (as a list of ids)
'''
if exclude is None:
exclude = []
user_items = [i[0] for i in sequence]
output = self.item_score(user_id, user_items)
# Give already-seen items a score of -inf so they are excluded from the recommendations
output[[i[0] for i in sequence]] = -np.inf
output[exclude] = -np.inf
# find top k according to output
return list(np.argpartition(-output, range(k))[:k])
def top_k_recommendations(self, sequence, user_id=None, k=10, exclude=None):
''' Receives a sequence of (id, rating) pairs and produces k recommendations (as a list of ids)
'''
if exclude is None:
exclude = []
last_item = sequence[-1][0]
output = self.bias + np.dot(self.V[user_id, :], self.H.T)
# Give already-seen items a score of -inf so they are excluded from the recommendations
output[[i[0] for i in sequence]] = -np.inf
output[exclude] = -np.inf
# find top k according to output
return list(np.argpartition(-output, range(k))[:k])
From stacked_denoising_autoencoder.py in the sequence-based-recommendations project (author: rdevooght):
def top_k_recommendations(self, sequence, user_id=None, k=10, exclude=None, **kwargs):
''' Receives a sequence of (id, rating) pairs and produces k recommendations (as a list of ids)
'''
if exclude is None:
exclude = []
# Compile network if needed
if not hasattr(self, 'predict_function'):
self._compile_predict_function()
# Prepare RNN input
X = np.zeros((1, self._input_size())) # input of the RNN
X[0, :] = self._one_hot_encoding([i[0] for i in sequence])
# Run RNN
output = self.predict_function(X.astype(theano.config.floatX))[0]
# Give already-seen items a score of -inf so they are excluded from the recommendations
output[[i[0] for i in sequence]] = -np.inf
output[exclude] = -np.inf
# find top k according to output
return list(np.argpartition(-output, range(k))[:k])
def _compile_test_function(self):
''' Differs from base test function because of the added softmax operation
'''
print("Compiling test...")
deterministic_output = T.nnet.softmax(lasagne.layers.get_output(self.l_out, deterministic=True))
if self.interactions_are_unique:
deterministic_output *= (1 - self.exclude)
theano_test_function = theano.function(self.theano_inputs, deterministic_output, allow_input_downcast=True, name="Test_function", on_unused_input='ignore')
def precision_test_function(theano_inputs, k=10):
output = theano_test_function(*theano_inputs)
ids = np.argpartition(-output, range(k), axis=-1)[0, :k]
return ids
self.test_function = precision_test_function
print("Compilation done.")
def smallest_k(matrix: np.ndarray, k: int,
only_first_row: bool = False) -> Tuple[Tuple[np.ndarray, np.ndarray], np.ndarray]:
"""
Find the smallest elements in a numpy matrix.
:param matrix: Any matrix.
:param k: The number of smallest elements to return.
:param only_first_row: If true the search is constrained to the first row of the matrix.
:return: The row indices, column indices and values of the k smallest items in matrix.
"""
if only_first_row:
flatten = matrix[:1, :].flatten()
else:
flatten = matrix.flatten()
# args are the indices in flatten of the k smallest elements
args = np.argpartition(flatten, k)[:k]
# args are the indices in flatten of the sorted k smallest elements
args = args[np.argsort(flatten[args])]
# flatten[args] are the values for args
return np.unravel_index(args, matrix.shape), flatten[args]
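A brief usage sketch of smallest_k (illustrative values):

import numpy as np

scores = np.array([[0.7, 0.1, 0.4],
                   [0.3, 0.9, 0.2]])
(rows, cols), values = smallest_k(scores, k=3)
print(rows, cols, values)    # [0 1 1] [1 2 0] [0.1 0.2 0.3]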
def probs(self, x):
dists = np.hstack([self.distFunc(x, cls) for cls in self.trainData])
indices = np.argpartition(dists, self.k, axis=1)[:,:self.k]
#start = 0
#votes = list()
#for cls in self.trainData:
# end = start + cls.shape[0]
# votes.append(np.sum(np.logical_and(start <= indices, indices < end), axis=1))
# start = end
ends = np.cumsum([len(cls) for cls in self.trainData])
starts = ends - np.array([len(cls) for cls in self.trainData])
votes = [np.sum(np.logical_and(start <= indices, indices < end), axis=1)
for start, end in zip(starts, ends)]
votes = np.vstack(votes).T
#probs = np.zeros((x.shape[0], self.nCls))
#probs[np.arange(probs.shape[0]), np.argmax(votes, axis=1)] = 1.0
##probs = util.softmax(votes / float(self.k))
probs = votes / float(self.k)
return probs
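The vectorized voting above counts, for each query, how many of its k nearest neighbours fall inside each class block of the concatenated training data. A standalone sketch of that counting step (names and data are illustrative):

import numpy as np

# distances from 2 queries to 5 training points; points 0-2 are class 0, points 3-4 are class 1
dists = np.array([[0.1, 0.5, 0.4, 0.9, 0.8],
                  [0.7, 0.6, 0.9, 0.2, 0.1]])
k = 3
indices = np.argpartition(dists, k, axis=1)[:, :k]   # k nearest training points per query
class_sizes = [3, 2]
ends = np.cumsum(class_sizes)                        # [3 5]
starts = ends - np.array(class_sizes)                # [0 3]
votes = np.vstack([np.sum((start <= indices) & (indices < end), axis=1)
                   for start, end in zip(starts, ends)]).T
print(votes / float(k))    # roughly [[1.0, 0.0], [0.33, 0.67]]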
def argmaxk_rows_opt1(arr, k=10, sort=False):
"""
Optimized implementation. When sort=False it is equal to argmaxk_rows_basic. When sort=True and k << arr.shape[1],
it should be faster, because we argsort only the subarray of the k max elements from each row (arr.shape[0] x k) instead of
the whole array arr (arr.shape[0] x arr.shape[1]).
"""
best_inds = np.argpartition(arr, kth=-k, axis=1)[:, -k:] # column indices of k max elements in each row (m x k)
if not sort:
return best_inds
# generate row indices corresponding to best_ids (just current row id in each row) (m x k)
rows = np.arange(best_inds.shape[0], dtype=np.intp)[:, np.newaxis].repeat(best_inds.shape[1], axis=1)
best_elems = arr[rows, best_inds] # select k max elements from each row using advanced indexing (m x k)
# indices which sort each row of best_elems in descending order (m x k)
best_elems_inds = np.argsort(best_elems, axis=1)[:, ::-1]
# reorder best_indices so that arr[i, sorted_best_inds[i,:]] will be sorted in descending order
sorted_best_inds = best_inds[rows, best_elems_inds]
return sorted_best_inds
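A quick sketch of the two modes of argmaxk_rows_opt1 (illustrative data):

import numpy as np

arr = np.array([[1., 9., 3., 7., 5.],
                [8., 2., 6., 0., 4.]])
print(argmaxk_rows_opt1(arr, k=2, sort=False))   # top-2 columns per row, in no particular order
print(argmaxk_rows_opt1(arr, k=2, sort=True))    # [[1 3], [0 2]]: top-2 columns, best first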
def generateCosineNeighborGraph(hin,kNeighbors=10,tf_param={'word':True, 'entity':False, 'we_weight':1}):
X, newIds, entIds = GraphGenerator.getTFVectorX(hin,param=tf_param)
cosX = cosine_similarity(X)
#return sparse.csc_matrix(X.dot(X.transpose())),newIds
n = cosX.shape[0]
graph = np.zeros((n,n))
tic = time.time()
for i in range(n):
for j in np.argpartition(-cosX[i],kNeighbors)[:kNeighbors]:
if j == i:
continue
#graph[i, j] += cosX[i, j]
#graph[j, i] += cosX[i, j]
graph[i, j] += 1
graph[j, i] += 1
toc = time.time() - tic
return sparse.csc_matrix(graph), newIds
def generateCosineNeighborGraphfromX(X, kNeighbors=10):
cosX = cosine_similarity(X)
# return sparse.csc_matrix(X.dot(X.transpose())),newIds
#print cosX.shape
n = cosX.shape[0]
graph = np.zeros((n, n))
tic = time.time()
for i in range(n):
for j in np.argpartition(-cosX[i], kNeighbors)[:kNeighbors]:
if j == i:
continue
# graph[i, j] += cosX[i, j]
# graph[j, i] += cosX[i, j]
graph[i, j] += 1
graph[j, i] += 1
toc = time.time() - tic
#print 'graph generation done in %f seconds.' % toc
return sparse.csc_matrix(graph)
def generate_laplacian_score_scalar(X_ent, X_word, kNeighbors):
# Generate cosine similarity graph
n = X_ent.shape[0]
cosX = cosine_similarity(X_word)
graph = np.zeros((n, n))
for i in range(n):
for j in np.argpartition(cosX[i], -kNeighbors)[-kNeighbors:]:
if j == i:
continue
graph[i, j] = cosX[i, j]
graph[j, i] = cosX[i, j]
D = sparse.diags([graph.sum(axis=0)], [0])
L = D - graph
f_tilde = X_ent - (float(X_ent.transpose() * D * np.ones((n, 1))) / D.sum().sum()) * np.ones((n, 1))
score = float(f_tilde.transpose() * L * f_tilde) / float(f_tilde.transpose() * D * f_tilde + 1e-10)
laplacian_score = score
return laplacian_score
def compute_nearest_neighbors(self, num_neighbors):
result_list = []
for key, value in self.im2index.items():
neighbor_list = [key]
similarity_scores = self.similarity_mat[value]
# drop the best match, which is the query image itself
ind = np.argpartition(similarity_scores, -(num_neighbors + 1))[-(num_neighbors + 1):-1]
ind = ind[np.argsort(similarity_scores[ind])]
neighbors = [self.index2im[x] for x in ind]
neighbor_list.extend(neighbors)
result_list.append(neighbor_list)
# compute neighbor statistics
NearestNeighbour.compute_neighbor_stats(result_list, num_neighbors)
# plot the TSNE plot
self.plot_tsne()
return result_list
def _calculate_topk_ndces(self, k):
"""
Calculate the indices of the k specialists with highest b-value,
including the base classifier regardless of its b-value.
Args:
k: int >= 0, approximately specifying the number of derived specialists to select.
Precisely, the best k (by Wilson error bound) are taken, along with the
base classifier if it is not already one of the best k.
Returns:
A list containing the indices of the top k classifiers.
The list always at least contains the base classifier's index (i.e. 0).
Therefore, the list is of length k if the base classifier is one of the top k,
and length k+1 otherwise. If k is greater than the total number of derived
specialists, returns all of them.
"""
assert self.label_corrs is not None, "Label correlations must be calculated before top k indices."
if k < len(self.label_corrs):
topk_ndces = set(np.argpartition(-self.label_corrs, k)[:k]) #Only does a partial sort of b!
else:
topk_ndces = set(range(len(self.label_corrs)))
topk_ndces.add(0)
return list(topk_ndces & set(self._relevant_ndces))
def argsort(x, topn=None, reverse=False):
"""
Return indices of the `topn` smallest elements in array `x`, in ascending order.
If reverse is True, return the greatest elements instead, in descending order.
"""
x = np.asarray(x) # unify code path for when `x` is not a np array (list, tuple...)
if topn is None:
topn = x.size
if topn <= 0:
return []
if reverse:
x = -x
if topn >= x.size or not hasattr(np, 'argpartition'):
return np.argsort(x)[:topn]
# np >= 1.8 has a fast partial argsort, use that!
most_extreme = np.argpartition(x, topn)[:topn]
return most_extreme.take(np.argsort(x.take(most_extreme))) # resort topn into order
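A quick usage sketch of this argsort helper (illustrative values):

import numpy as np

x = np.array([4.0, 1.0, 3.0, 2.0])
print(argsort(x, topn=2))                  # [1 3]: indices of the 2 smallest values, ascending
print(argsort(x, topn=2, reverse=True))    # [0 2]: indices of the 2 largest values, descending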
def _select_target_neighbors(self):
"""Find the target neighbors of each sample, that stay fixed during training.
Returns
-------
array_like
An array of neighbors indices for each sample with shape (n_samples, n_neighbors).
"""
self.logger.info('Finding target neighbors...')
target_neighbors = np.empty((self.X_.shape[0], self.n_neighbors_), dtype=int)
for class_ in self.classes_:
class_ind, = np.where(np.equal(self.y_, class_))
dist = euclidean_distances(self.X_[class_ind], squared=True)
np.fill_diagonal(dist, np.inf)
neigh_ind = np.argpartition(dist, self.n_neighbors_ - 1, axis=1)
neigh_ind = neigh_ind[:, :self.n_neighbors_]
# argpartition doesn't guarantee sorted order, so we sort again but only the k neighbors
row_ind = np.arange(len(class_ind))[:, None]
neigh_ind = neigh_ind[row_ind, np.argsort(dist[row_ind, neigh_ind])]
target_neighbors[class_ind] = class_ind[neigh_ind]
return target_neighbors
def select_next_words(self, next_costs, next_probs, step_num, how_many):
# Pick only on the first line (for the beginning of sampling)
# This will avoid duplicate <q> token.
if step_num == 0:
flat_next_costs = next_costs[:1, :].flatten()
else:
# Set the next cost to infinity for finished utterances (they will be
# replaced by other utterances in the beam)
flat_next_costs = next_costs.flatten()
voc_size = next_costs.shape[1]
args = numpy.argpartition(flat_next_costs, how_many)[:how_many]
args = args[numpy.argsort(flat_next_costs[args])]
return numpy.unravel_index(args, next_costs.shape), flat_next_costs[args]
def find_nbest(score, n, threshold=None):
num_vars = score.shape[1]
score = score.flatten()
nbest = np.argpartition(score, n)[:n]
beam_indices = nbest // num_vars
var_indices = nbest % num_vars
nbest_score = score[nbest]
if threshold:
best = np.max(nbest_score)
cond = nbest_score > best + threshold
nbest_score = nbest_score[cond]
beam_indices = beam_indices[cond]
var_indices = var_indices[cond]
return nbest_score, beam_indices, var_indices
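A short sketch of find_nbest on a small beam-by-vocabulary score matrix, where lower scores are better (illustrative values):

import numpy as np

score = np.array([[3.0, 0.5, 2.0, 4.0],     # 2 beam hypotheses x 4 vocabulary symbols
                  [1.0, 5.0, 0.2, 6.0]])
nbest_score, beam_indices, var_indices = find_nbest(score, n=3)
# nbest_score holds the 3 lowest scores {0.2, 0.5, 1.0} in arbitrary order;
# beam_indices and var_indices give the row and column of each within `score`
print(nbest_score, beam_indices, var_indices)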
def tfidf_retrieval(tfidf_vec, train_contexts_txt, train_responses_txt, output_file):
print(type(tfidf_vec))
tfidf_vec = tfidf_vec.toarray()
print(tfidf_vec.shape)
prod_mat = np.dot(tfidf_vec, tfidf_vec.T)
print(prod_mat.shape)
prod_mat = prod_mat / mat_vector_2norm_squared(tfidf_vec)
print(prod_mat.shape)
response_list = []
for i in range(len(prod_mat)):
row = prod_mat[i]
# Take the indices of the two largest similarities; the largest is the row
# itself, so keep the second largest as the retrieved response. See:
# stackoverflow.com/questions/6910641/how-to-get-indices-of-n-maximum-values-in-a-numpy-array
ind = np.argpartition(row, -2)[-2:]
ind = ind[np.argsort(row[ind])][0]
response_list.append(train_responses_txt[ind])
print(train_contexts_txt[i])
print(response_list[i])
with open(output_file, 'w') as f1:
for response in response_list:
f1.write(response)
def visualize_frequent_words(vectors_2d: np.ndarray, dataset: DataSet, k: int, ax: plt.Axes = None) -> None:
word_ids, counts = np.unique(dataset.data, return_counts=True)
indices = np.argpartition(-counts, k)[:k]
frequent_word_ids = word_ids[indices]
if ax is None:
fig, ax = plt.subplots(figsize=(13, 13))
else:
fig = None
vectors_2d = vectors_2d[frequent_word_ids]
ax.scatter(vectors_2d[:, 0], vectors_2d[:, 1], s=2, alpha=0.25)
for i, id in enumerate(frequent_word_ids):
ax.annotate(dataset.vocabulary.to_word(id), (vectors_2d[i, 0], vectors_2d[i, 1]))
if fig is not None:
fig.tight_layout()
fig.show()