def decision_function(self, X):
    """Return the distance from each sample in X to its nearest centroid.

    Parameters
    ----------
    X : array-like, shape = [n_samples, n_features]
        Test vectors.

    Returns
    -------
    C : array, shape = [n_samples]
        Distance of every sample to the closest fitted centroid.
    """
    from sklearn.metrics.pairwise import pairwise_distances
    from sklearn.utils.validation import check_array, check_is_fitted

    check_is_fitted(self, 'centroids_')
    X = check_array(X, accept_sparse='csr')
    all_dists = pairwise_distances(X, self.centroids_, metric=self.metric)
    return all_dists.min(axis=1)
# Python pairwise_distances() usage examples
def test_similarity_calculations():
    """
    Tests the fast PyTorch heat-kernel similarity against a NumPy reference.

    For several kernel widths ``sigma``, ``fast_heat_similarity_matrix``
    must agree element-wise with ``exp(-d(x, y)**2 / sigma**2)`` computed
    from sklearn's pairwise distances.
    """
    np.random.seed(1)
    # Create random data vectors
    for sigma in [0.01, 0.1, 0.5, 1]:
        A = np.random.randn(10, 23)
        sef_sim = fast_heat_similarity_matrix(A, sigma)
        assert sef_sim.shape[0] == 10
        assert sef_sim.shape[1] == 10
        sim = np.exp(-pairwise_distances(A, A)**2/sigma**2)
        # BUG FIX: compare via the *squared* error. The original
        # `(sef_sim-sim)*2` merely doubled the signed differences, which
        # can cancel out even when the matrices disagree.
        assert np.sum((sef_sim - sim)**2) < 1e-3
def test_cosine2jaccard():
    """Cosine<->Jaccard similarity conversions round-trip and match sklearn."""
    from sklearn.metrics.pairwise import pairwise_distances
    from freediscovery.metrics import (cosine2jaccard_similarity,
                                       jaccard2cosine_similarity)

    x = np.array([[0, 0, 1., 1.]])
    y = np.array([[0, 1., 1., 0]])
    S_cos = 1 - pairwise_distances(x, y, metric='cosine')
    # The forward conversion must agree with a direct boolean Jaccard
    # similarity computed by sklearn.
    S_jac = cosine2jaccard_similarity(S_cos)
    expected = 1 - pairwise_distances(x.astype('bool'), y.astype('bool'),
                                      metric='jaccard')
    assert_allclose(S_jac, expected)
    # Converting back must recover the original cosine similarity.
    assert_allclose(jaccard2cosine_similarity(S_jac), S_cos)
def centroid_similarity(X, internal_ids, nn_metric='cosine'):
    """Compute similarities of cluster members to the cluster centroid.

    Parameters
    ----------
    X : array-like, shape (n_documents, n_features)
        full document-feature matrix
    internal_ids : list
        internal ids of the documents belonging to the cluster
    nn_metric : str
        a rescaling of the cosine metric, if needed

    Returns
    -------
    tuple
        (mean similarity to the centroid, per-document similarity vector)
    """
    from ..metrics import _scale_cosine_similarity
    from sklearn.metrics.pairwise import pairwise_distances

    cluster_docs = X[internal_ids, :]
    centroid = cluster_docs.mean(axis=0)
    if centroid.ndim == 1:
        # pairwise_distances needs a 2D row vector.
        centroid = centroid[None, :]
    S_cos = 1 - pairwise_distances(cluster_docs, centroid, metric='cosine')
    S_sim = _scale_cosine_similarity(S_cos, metric=nn_metric)
    return float(np.mean(S_sim)), S_sim[:, 0]
def query(vec, model, k, max_search_radius):
    """Return the k LSH candidates nearest to ``vec`` by cosine distance.

    Bins within ``max_search_radius`` bit flips of the query's own bin
    are scanned for candidates, which are then ranked by true distance.
    Also returns the total number of candidates inspected.
    """
    data = model['data']
    table = model['table']
    random_vectors = model['random_vectors']
    num_vector = random_vectors.shape[1]
    # Bit signature of the query: sign pattern of its random projections.
    bin_index_bits = (vec.dot(random_vectors) >= 0).flatten()
    # Accumulate candidates from every bin within the search radius.
    candidate_set = set()
    for radius in range(max_search_radius + 1):
        candidate_set = search_nearby_bins(bin_index_bits, table, radius,
                                           initial_candidates=candidate_set)
    # Rank candidates by their true cosine distance from the query.
    nearest_neighbors = pd.DataFrame({'id': list(candidate_set)})
    candidates = data[np.array(list(candidate_set)), :]
    nearest_neighbors['distance'] = pairwise_distances(
        candidates, vec, metric='cosine').flatten()
    return nearest_neighbors.sort_values(by='distance').head(k), len(candidate_set)
def pre_train(train_df, test_df, train_add, test_add):
    """Select the training rows closest to the test set.

    The feature blocks of the main and additional frames are stacked
    column-wise, distances between every test row and every training row
    are computed, and the indices of each test row's 10 nearest training
    rows are collected.

    The first column of every frame is assumed to be an id and the main
    training frame's last column a label; both are stripped.

    Returns
    -------
    list
        de-duplicated indices of the selected training rows
    """
    train = train_df.values[:, 1:-1]
    train = np.hstack((train, train_add.values[:, 1:-1]))
    dtest = test_df.values[:, 1:]
    dtest = np.hstack((dtest, test_add.values[:, 1:]))
    cor_distance = pairwise.pairwise_distances(dtest, train)
    # Union of the 10 nearest training-row indices over all test rows.
    # (set.update replaces the original element-by-element add loop and
    # the manual set-to-list copy loop, and no longer indexes past the
    # end when fewer than 10 training rows exist.)
    resultset = set()
    for row_dists in cor_distance:
        resultset.update(np.argsort(row_dists)[:10])
    return list(resultset)
def predict(self, X):
    """
    Classify the input data assigning the label of the nearest prototype.

    Keyword arguments:
    X -- The feature vectors

    Returns an array of length len(X) with the predicted outcome labels.
    """
    # Every supported metric is forwarded verbatim to pairwise_distances;
    # anything unrecognized falls back to plain euclidean distance.
    # (The original code computed the euclidean case twice: once in a
    # stand-alone `if`, then again in the trailing `else` branch.)
    if self.distance_metric in ("euclidean", "minkowski",
                                "manhattan", "mahalanobis"):
        metric = self.distance_metric
    else:
        metric = "euclidean"
    # Template matching: distance from every sample to every prototype.
    distances = pairwise_distances(X, self.M_, metric)
    classification = np.zeros(len(X))
    for i in range(len(X)):  # range() instead of Python-2-only xrange()
        # The label belonging to the nearest prototype wins; np.argmin
        # keeps the original first-minimum tie-breaking.
        classification[i] = self.outcomes[np.argmin(distances[i])]
    return classification
def test_distance_calculations():
    """
    Tests the fast PyTorch distance matrix against sklearn's reference.
    """
    np.random.seed(1)
    # Create random data vectors
    A = np.random.randn(10, 23)
    B = np.random.randn(5, 23)
    sef_dists = fast_distance_matrix(A, B)
    assert sef_dists.shape[0] == 10
    assert sef_dists.shape[1] == 5
    dists = pairwise_distances(A, B)
    # BUG FIX: compare via the *squared* error. The original
    # `(sef_dists-dists)*2` only doubled the signed differences, which
    # can cancel out even when entries disagree.
    assert np.sum((sef_dists - dists)**2) < 1e-3
def mean_data_distance(data):
    """Return the average pairwise distance over all points in ``data``.

    :param data: array-like, shape (n_samples, n_features)
    :return: mean over the full (square) pairwise-distance matrix,
             including the zero diagonal
    """
    return np.mean(pairwise_distances(data))
def compare_pic(self, feature1, feature2):
    """Return the cosine-distance matrix between rows of feature2 and feature1."""
    # pw.cosine_similarity would yield similarities rather than distances.
    return pw.pairwise_distances(feature2, feature1, 'cosine')
def compare_pic(self, feature1, feature2):
    """Return the cosine-distance matrix between rows of feature2 and feature1."""
    # pw.cosine_similarity would yield similarities rather than distances.
    return pw.pairwise_distances(feature2, feature1, 'cosine')
def compare_pic(self, feature1, feature2):
    """Return the cosine-distance matrix between rows of feature2 and feature1."""
    # pw.cosine_similarity would yield similarities rather than distances.
    return pw.pairwise_distances(feature2, feature1, 'cosine')
def compare_pic(self, feature1, feature2):
    """Return the cosine-distance matrix between rows of feature2 and feature1."""
    # pw.cosine_similarity would yield similarities rather than distances.
    return pw.pairwise_distances(feature2, feature1, 'cosine')
def compare_pic(self, feature1, feature2):
    """Return the cosine-distance matrix between rows of feature2 and feature1."""
    # pw.cosine_similarity would yield similarities rather than distances.
    return pw.pairwise_distances(feature2, feature1, 'cosine')
def compare_pic(self, feature1, feature2):
    """Return the cosine-distance matrix between rows of feature2 and feature1."""
    # pw.cosine_similarity would yield similarities rather than distances.
    return pw.pairwise_distances(feature2, feature1, 'cosine')
def compare_pic(self, feature1, feature2):
    """Return the cosine-distance matrix between rows of feature2 and feature1."""
    # pw.cosine_similarity would yield similarities rather than distances.
    return pw.pairwise_distances(feature2, feature1, 'cosine')
def compare_pic(self, feature1, feature2):
    """Return the cosine-distance matrix between rows of feature2 and feature1."""
    # pw.cosine_similarity would yield similarities rather than distances.
    return pw.pairwise_distances(feature2, feature1, 'cosine')
def compare_pic(self, feature1, feature2):
    """Return the cosine-distance matrix between rows of feature2 and feature1."""
    # pw.cosine_similarity would yield similarities rather than distances.
    return pw.pairwise_distances(feature2, feature1, 'cosine')
def compare_pic(self, feature1, feature2):
    """Return the cosine-distance matrix between rows of feature2 and feature1."""
    # pw.cosine_similarity would yield similarities rather than distances.
    return pw.pairwise_distances(feature2, feature1, 'cosine')
def compare_pic(self, feature1, feature2):
    """Return the cosine-distance matrix between rows of feature2 and feature1."""
    # pw.cosine_similarity would yield similarities rather than distances.
    return pw.pairwise_distances(feature2, feature1, 'cosine')
def initialize_layer(self, data, n_samples=10000):
    """
    Initializes the layer using k-means (gamma is scaled by the mean
    pairwise distance between the learned codewords).

    :param data: data to sample feature vectors from
    :param n_samples: approximate number of feature vectors to keep
    :raises ValueError: if no feature-extraction function has been set
    :return:
    """
    if self.features_fn is None:
        # `assert False` is silently stripped under `python -O`; raise
        # an explicit error instead so misconfiguration always surfaces.
        raise ValueError("features_fn must be set before initializing the layer")
    # Collect feature vectors from a shuffled subset of the data until
    # at least n_samples vectors have been gathered.
    idx = np.arange(data.shape[0])
    np.random.shuffle(idx)
    features = []
    for i in range(idx.shape[0]):
        feats = self.features_fn([data[idx[i]]])
        # (N, C, H, W) -> (N, H, W, C) -> flat list of C-dim vectors
        feats = feats.transpose((0, 2, 3, 1))
        feats = feats.reshape((-1, feats.shape[-1]))
        features.extend(feats)
        if len(features) > n_samples:
            break
    features = np.asarray(features)
    kmeans = KMeans(n_clusters=self.n_codewords, n_jobs=4, n_init=5)
    kmeans.fit(features)
    V = kmeans.cluster_centers_.copy()
    # Initialize gamma from the mean pairwise distance between codewords
    # (sum of the distance matrix / number of ordered off-diagonal pairs).
    mean_distance = np.sum(pairwise_distances(V)) / (self.n_codewords * (self.n_codewords - 1))
    self.gamma.set_value(self.gamma.get_value() * np.float32(mean_distance))
    # Initialize codebook as 1x1 convolutional filters.
    V = V.reshape((V.shape[0], V.shape[1], 1, 1))
    self.V.set_value(np.float32(V))
def delta(X, Y, n_jobs=-1, a=1, c=0):
    """Pairwise delta function: sigmoid of scaled cosine distances.

    Computes ``expit(a * (cosine_distance(X, Y) - c))`` for every
    row/column pair; the shift and scale are skipped when they are the
    identity (c == 0, a == 1).

    :X: first sample matrix
    :Y: second sample matrix
    :returns: matrix of squashed distances, shape (len(X), len(Y))
    """
    dists = pairwise_distances(X, Y, metric="cosine", n_jobs=n_jobs)
    if c != 0:
        dists = dists - c
    if a != 1:
        dists = a * dists
    return expit(dists)
def test_euclidean2cosine():
    """Squared euclidean distance of unit vectors converts to cosine similarity."""
    from sklearn.metrics.pairwise import pairwise_distances

    x = normalize([[0, 2, 3, 5]])
    y = normalize([[1, 3, 6, 7]])
    cos_sim = 1 - pairwise_distances(x, y, metric='cosine')[0, 0]
    sq_euc = pairwise_distances(x, y, metric='euclidean', squared=True)[0, 0]
    assert_allclose(cos_sim, seuclidean_dist2cosine_sim(sq_euc))
def get_distances(self):
    """Return the query-to-database distance matrix (metric: self.dist_type)."""
    return pairwise_distances(self.query_feats, self.db_feats,
                              self.dist_type, n_jobs=-1)
def fit(self, data):
    """Fit a merge-neural-gas (context) model to ``data``.

    Each iteration draws a random sample, blends the previous winner's
    weight/context (``wr``/``cr``) into a context vector ``ct``, ranks
    all prototypes by a convex combination of sample distance and
    context distance, and pulls every prototype (and its context) toward
    the sample with a rank-decayed neighborhood kernel.

    :param data: array, shape (n_samples, n_obs)
    :return: self
    """
    [n_samples, n_obs] = data.shape
    self.protos = data[self.rng.choice(n_samples, self.n_protos),]  # w
    self.context = np.zeros(self.protos.shape)  # c
    ct = np.zeros((1, n_obs))
    wr = ct
    cr = wr
    for iteration in range(self.iterations):
        sample = data[self.rng.choice(n_samples, 1),]
        # Temporal context: blend of the previous winner's weight/context.
        ct = (1 - self.a) * wr + self.b * cr
        t = iteration / float(self.iterations)
        # Exponentially annealed learning rate and neighborhood width.
        lrate = self.lrate_i * (self.lrate_f / float(self.lrate_i)) ** t
        epsilon = self.epsilon_i * (self.lrate_f / float(self.lrate_i)) ** t
        d = (1 - self.a) * pairwise_distances(sample, self.protos) + self.a * pairwise_distances(ct, self.context)
        I = np.argsort(np.argsort(d))  # rank of every prototype (0 = winner)
        # BUG FIX: d and I have shape (1, n_protos), so np.where(I == 0)[0]
        # returned the ROW index (always 0) and prototype 0 was always
        # treated as the winner. The winner is the COLUMN whose rank is 0.
        min_id = np.where(I == 0)[1]
        H = np.exp(-I / epsilon).ravel()  # rank-based neighborhood kernel
        diff_w = sample - self.protos
        diff_c = ct - self.context
        for i in range(self.n_protos):
            self.protos[i, :] += lrate * H[i] * diff_w[i, :]
            self.context[i, :] += lrate * H[i] * diff_c[i, :]
        # Remember the winner's weight/context for the next iteration.
        wr = self.protos[min_id]
        cr = self.context[min_id]
    return self
def encode(self, data, metric='euclidean'):
    """ Represent ``data`` by its nearest codebook prototypes.

    Parameters
    ----------
    data : real array-like, shape(n_samples, n_features)
        Data matrix, each row represents a sample.
    metric : string
        Any distance metric accepted by
        sklearn.metrics.pairwise.pairwise_distances, e.g. one of
        euclidean, cityblock, l1, cosine.

    Returns
    -------
    encoded_data : real array-like, shape(n_samples, n_features)
        ``data``, as represented by the prototypes in codebook.
    ts_symbols : list, shape(n_samples, 1)
        A discrete symbolic time series
    """
    finder = NearestNeighbors(n_neighbors=1, algorithm='auto',
                              metric=metric).fit(self.protos)
    _, self.__symbols = finder.kneighbors(data)
    self.__encoding = self.protos[self.__symbols]
    return (self.__encoding, self.__symbols)
def fit(self, data):
    """ Learn data, and construct a vector codebook.

    Neural-gas style scheme: each iteration a random sample pulls every
    prototype toward it, weighted by exp(-rank/epsilon), while the
    learning rate and the neighborhood width decay exponentially.

    Parameters
    ----------
    data : real array-like, shape(n_samples, n_features)
        Data matrix, each row represents a sample.

    Returns
    -------
    self : object
        The instance itself
    """
    [n_samples, _] = data.shape
    # Seed the codebook with randomly chosen samples.
    self.protos = data[self.rng.choice(n_samples, self.n_protos), ]
    for iteration in range(self.iterations):
        sample = data[self.rng.choice(n_samples, 1), ]
        t = iteration / float(self.iterations)
        # Exponentially annealed learning rate and neighborhood width.
        lrate = self.lrate_i * (self.lrate_f / float(self.lrate_i)) ** t
        epsilon = self.epsilon_i * (self.epsilon_f / float(self.epsilon_i)) ** t
        D = pairwise_distances(sample, self.protos, metric='euclidean',
                               n_jobs=self.n_jobs)
        ranks = np.argsort(np.argsort(D))  # 0 = closest prototype
        H = np.exp(-ranks / epsilon).ravel()
        moves = sample - self.protos
        for proto_id in range(self.n_protos):
            self.protos[proto_id, :] += lrate * H[proto_id] * moves[proto_id, :]
    return self
def encode(self, data, metric='euclidean'):
    """ Employ a nearest-neighbor rule to encode ``data`` using the codebook.

    The prototypes are first ordered along a one-dimensional MDS
    embedding (the data-mining procedure described in [Laskaris2004]) so
    that the emitted symbols carry a meaningful ordering.

    Parameters
    ----------
    data : real array-like, shape(n_samples, n_features)
        Data matrix, each row represents a sample.
    metric : string
        Any distance metric accepted by
        sklearn.metrics.pairwise.pairwise_distances, e.g. one of
        euclidean, cityblock, l1, cosine.

    Returns
    -------
    encoded_data : real array-like, shape(n_samples, n_features)
        ``data``, as represented by the prototypes in codebook.
    ts_symbols : list, shape(n_samples, 1)
        A discrete symbolic time series
    """
    embedding = MDS(1, random_state=self.rng)
    protos_1d = embedding.fit_transform(self.protos).ravel()
    order = np.argsort(protos_1d)
    sprotos = self.protos[order]
    finder = NearestNeighbors(n_neighbors=1, algorithm='auto',
                              metric=metric).fit(sprotos)
    _, self.__symbols = finder.kneighbors(data)
    self.__encoding = sprotos[self.__symbols]
    return (self.__encoding, self.__symbols)
def grab_articles(self, ids):
    """Task body: find stored articles similar to a query article.

    Loads tf-idf vectors for the given article ids from the database,
    appends the query article's tf-idf dict (held in the redis entry for
    this task id), ranks everything by cosine distance to the query, and
    returns the matches closer than 0.75 together with the query headline.
    Status strings are written back to redis as the task progresses.
    """
    task_id = self.request.id
    # NOTE(review): ids appears to arrive wrapped in an outer list — confirm with caller.
    ids = ids[0]
    print("Entering Grab Articles Task: ", len(ids))
    print("Task id from self: ", task_id)
    # Pull id + tfidf for the requested articles, in chunks to bound memory.
    s = select([articles_db.c.id, articles_db.c.tfidf]).where(articles_db.c.id.in_(ids))
    all_articles = pd.read_sql(s, con=connection, chunksize=350)
    all_articles = pd.concat(all_articles, ignore_index=True)
    # Progress state for this task lives in redis under the task id.
    stored_data = json.loads(r.get(task_id))
    stored_data['status'] = "creating article matrix"
    r.set(task_id, json.dumps(stored_data))
    tfidf_dict = stored_data['tfidf_dict']
    # Append the query article as the last row so it shares the matrix vocabulary.
    all_articles = all_articles.append({'id': 1, 'tfidf': tfidf_dict}, ignore_index=True)
    corpus = helpers.generate_sparse_matrix(all_articles)
    # Last row of the sparse matrix is the query vector appended above.
    query_article_vector = corpus.getrow(-1)
    all_articles['distance'] = pairwise_distances(corpus, query_article_vector, metric='cosine').flatten()
    stored_data['status'] = "computing best matches"
    r.set(task_id, json.dumps(stored_data))
    max_distance_from_query = 0.75  # on a scale of 0 (exact match) to 1.0 (not even close)
    all_articles = all_articles[all_articles['distance'] < max_distance_from_query]
    print("Done computing matrix and distances")
    # Fetch display fields for the surviving matches and join on their distances.
    s = select([articles_db.c.id, articles_db.c.headline, articles_db.c.url, articles_db.c.date]).where(
        articles_db.c.id.in_(all_articles['id'].tolist()))
    all_articles = pd.read_sql(s, connection).set_index('id').join(all_articles.set_index('id')).sort_values(by='date')
    # The query article itself is reported with distance 0.
    query_article = {'headline': stored_data['headline'], 'date': datetime.strptime(stored_data['date'], "%d-%b-%Y"),
                     'distance': 0, 'url': stored_data['url']}
    articles = helpers.make_article_array(all_articles, query_article)
    return articles, query_article['headline']
def pairwise_distances(self, X, Y=None, metric='cosine',
                       n_jobs=1, **kwds):
    """Distance matrix between X and Y, honoring pre-normalized vectors.

    When the stored vectors were normalized ahead of time, only the
    cheap dot-product-based cosine path is valid and any other metric is
    refused; otherwise the call is delegated to sklearn.
    """
    if not self.prenorm:
        return smp.pairwise_distances(X, Y, metric=metric,
                                      n_jobs=n_jobs, **kwds)
    if metric != 'cosine':
        raise Exception(
            'Vectors are normalized and will work only with cosine.')
    return self._cosine_distances_prenorm(X, Y)