def prune(self, question, paragraphs: List[ExtractedParagraph]):
if not self.filter_dist_one and len(paragraphs) == 1:
return paragraphs
tfidf = TfidfVectorizer(strip_accents="unicode", stop_words=self.stop.words)
text = []
for para in paragraphs:
text.append(" ".join(" ".join(s) for s in para.text))
try:
para_features = tfidf.fit_transform(text)
q_features = tfidf.transform([" ".join(question)])
except ValueError:
return []
dists = pairwise_distances(q_features, para_features, "cosine").ravel()
sorted_ix = np.lexsort(([x.start for x in paragraphs], dists)) # in case of ties, use the earlier paragraph
if self.filter_dist_one:
return [paragraphs[i] for i in sorted_ix[:self.n_to_select] if dists[i] < 1.0]
else:
return [paragraphs[i] for i in sorted_ix[:self.n_to_select]]
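The same TF-IDF cosine ranking idea as a minimal standalone sketch; rank_paragraphs, the plain-string paragraphs, and the built-in 'english' stop-word list are my own simplifications, not part of the class above.

import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import pairwise_distances

def rank_paragraphs(question, paragraphs, n_to_select=2):
    # Fit TF-IDF on the paragraphs, embed the question in the same space,
    # and sort by cosine distance (smallest distance = most similar).
    tfidf = TfidfVectorizer(strip_accents="unicode", stop_words="english")
    try:
        para_features = tfidf.fit_transform(paragraphs)
        q_features = tfidf.transform([question])
    except ValueError:  # e.g. empty vocabulary after stop-word removal
        return []
    dists = pairwise_distances(q_features, para_features, "cosine").ravel()
    order = np.argsort(dists)
    return [(paragraphs[i], dists[i]) for i in order[:n_to_select]]

print(rank_paragraphs("who wrote hamlet",
                      ["Hamlet is a tragedy written by William Shakespeare.",
                       "The Eiffel Tower is in Paris.",
                       "Shakespeare wrote many plays."]))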
def dists(self, question, paragraphs: List[ExtractedParagraph]):
tfidf = TfidfVectorizer(strip_accents="unicode", stop_words=self.stop.words)
text = []
for para in paragraphs:
text.append(" ".join(" ".join(s) for s in para.text))
try:
para_features = tfidf.fit_transform(text)
q_features = tfidf.transform([" ".join(question)])
except ValueError:
return []
dists = pairwise_distances(q_features, para_features, "cosine").ravel()
sorted_ix = np.lexsort(([x.start for x in paragraphs], dists)) # in case of ties, use the earlier paragraph
if self.filter_dist_one:
return [(paragraphs[i], dists[i]) for i in sorted_ix[:self.n_to_select] if dists[i] < 1.0]
else:
return [(paragraphs[i], dists[i]) for i in sorted_ix[:self.n_to_select]]
def find_distance_matrix(self, vector, metric='cosine'):
    '''
    Compute the distance matrix between topics using cosine or Euclidean
    distance (default: cosine distance).
    '''
if metric == 'cosine':
distance_matrix = pairwise_distances(vector,
metric='cosine')
# diagonals should be exactly zero, so remove rounding errors
numpy.fill_diagonal(distance_matrix, 0)
if metric == 'euclidean':
distance_matrix = pairwise_distances(vector,
metric='euclidean')
return distance_matrix
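A quick illustration of the helper above on a made-up topic matrix; the fill_diagonal call zeroes out floating-point noise on the self-distances.

import numpy
from sklearn.metrics import pairwise_distances

vectors = numpy.array([[1.0, 0.0], [0.5, 0.5], [0.0, 1.0]])
cos_dist = pairwise_distances(vectors, metric='cosine')
numpy.fill_diagonal(cos_dist, 0)   # self-distances should be exactly zero
euc_dist = pairwise_distances(vectors, metric='euclidean')
print(cos_dist.round(3))
print(euc_dist.round(3))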
def find_similar_words(wordvecs):
""" Use loaded word embeddings to find out the most similar words in the
embedded vector space.
"""
from sklearn.metrics import pairwise_distances
from scipy.spatial.distance import cosine
pairwise_sim_mat = 1 - pairwise_distances(wordvecs.W[1:],
metric='cosine',
# metric='euclidean',
)
    id2word = {}
    for key, value in wordvecs.word_idx_map.items():
        assert value not in id2word
        id2word[value] = key
    while True:
        word = input("Enter a word ('STOP' to quit): ")
        if word == 'STOP':
            break
        try:
            w_id = wordvecs.word_idx_map[word]
        except KeyError:
            print('%s not in the vocabulary.' % word)
            continue  # skip unknown words
        sim_w_id = pairwise_sim_mat[w_id - 1].argsort()[-10:][::-1]
        print(' '.join(id2word[i + 1] for i in sim_w_id))
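A simplified, self-contained sketch of the same lookup with a tiny made-up embedding matrix (no padding row at index 0, unlike the wordvecs object used above).

import numpy as np
from sklearn.metrics import pairwise_distances

vocab = ['king', 'queen', 'apple']
W = np.array([[0.9, 0.1], [0.85, 0.2], [0.1, 0.95]])
sim = 1 - pairwise_distances(W, metric='cosine')   # cosine similarity matrix
query = vocab.index('king')
ranked = sim[query].argsort()[::-1]                # most similar first (includes the query itself)
print([vocab[i] for i in ranked])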
def sort_by_tfidf(question, paragraphs):
tfidf = TfidfVectorizer(strip_accents="unicode", stop_words=spacy.en.STOP_WORDS, decode_error='replace')
try:
para_features = tfidf.fit_transform(paragraphs)
q_features = tfidf.transform([question])
except ValueError:
return [(i, 0.0) for i in range(len(paragraphs))]
dists = pairwise_distances(q_features, para_features, "cosine").ravel()
sorted_ix = np.lexsort((paragraphs, dists)) # in case of ties, use the earlier paragraph
return [(i, 1.0 - dists[i]) for i in sorted_ix]
def test_pairwise_distances(X_blobs):
centers = X_blobs[::100].compute()
result = dm.pairwise_distances(X_blobs, centers)
expected = sm.pairwise_distances(X_blobs.compute(), centers)
assert_eq(result, expected, atol=1e-4)
def pairwise_distances(X, Y, metric='euclidean', n_jobs=None, **kwargs):
if isinstance(Y, da.Array):
raise TypeError("`Y` must be a numpy array")
chunks = (X.chunks[0], (len(Y),))
return X.map_blocks(metrics.pairwise_distances, Y,
dtype=float, chunks=chunks,
metric=metric, **kwargs)
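A hedged usage sketch for the dask wrapper above, assuming it is in scope together with dask.array as da and sklearn's metrics module; X is a chunked dask array while Y must be an in-memory numpy array, as the TypeError check enforces.

import numpy as np
import dask.array as da

X = da.random.random((1000, 4), chunks=(250, 4))
Y = np.random.random((5, 4))
D = pairwise_distances(X, Y, metric='euclidean')   # lazy (1000, 5) dask array
print(D.shape)
print(D[:2].compute())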
def transform(self, X):
"""Compute the LLC representation of the provided data.
Parameters
----------
X : array_like or list
The local features to aggregate. They must be either nd arrays or
a list of nd arrays. In case of a list each item is aggregated
separately.
"""
# Get the local features and the number of local features per document
X, lengths = self._reshape_local_features(X)
# Preprocess the lengths list into indexes in the local feature array
starts = np.cumsum([0] + lengths).astype(int)
ends = np.cumsum(lengths).astype(int)
# Calculate the nearest neighbors
centroids = self._clusterer.cluster_centers_
distances = pairwise_distances(X, centroids)
K = self.neighbors
neighbors = np.argpartition(distances, K)[:, :K]
# Compute the llc representation
llc = np.zeros((len(lengths), self.n_codewords))
L2 = self.beta * np.eye(X.shape[1])
for i, (s, e) in enumerate(zip(starts, ends)):
for j in range(s, e):
# a = argmin_{1^T a = 1} ||x - Ca||_2^2 + \beta ||a||_2^2
C = centroids[neighbors[j]]
a = C.dot(np.linalg.inv(C.T.dot(C) + L2)).dot(X[j])
llc[i, neighbors[j]] = np.maximum(
llc[i, neighbors[j]],
a / a.sum()
)
return llc
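The closed-form solve in the inner loop appears to be the standard ridge/least-squares solution of the reconstruction problem in the comment, without the sum-to-one constraint, which is only imposed afterwards by the a / a.sum() normalisation. A small standalone check with made-up numbers:

import numpy as np

beta = 1e-3
C = np.array([[1.0, 0.0, 0.0],     # K = 2 nearest codewords, d = 3 dimensions
              [0.0, 1.0, 0.0]])
x = np.array([0.8, 0.3, 0.1])      # one local descriptor
a = C.dot(np.linalg.inv(C.T.dot(C) + beta * np.eye(3))).dot(x)
print(a, a / a.sum())              # raw code and its sum-to-one normalisation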
def computePerformance(self, instances):
X = instances.features
labels = instances.true_labels
# For unsupervised projection methods, the performance is always computed with the labels (not the families).
if hasattr(self.projection.conf, 'families_supervision'):
if self.projection.conf.families_supervision:
labels = instances.true_families
unique_labels, label_inds = np.unique(labels, return_inverse = True)
ratio = 0
for li in xrange(len(unique_labels)):
Xc = X[label_inds == li]
Xnc = X[label_inds != li]
ratio += pairwise_distances(Xc).mean() / pairwise_distances(Xc, Xnc).mean()
self.class_separation = ratio / len(unique_labels)
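A tiny numeric check of the class-separation ratio above on made-up two-class data; a small value means intra-class distances are small relative to inter-class distances.

import numpy as np
from sklearn.metrics import pairwise_distances

X = np.array([[0.0, 0.0], [0.1, 0.0], [5.0, 5.0], [5.1, 5.0]])
labels = np.array([0, 0, 1, 1])
ratio = 0.0
for li in np.unique(labels):
    Xc, Xnc = X[labels == li], X[labels != li]
    ratio += pairwise_distances(Xc).mean() / pairwise_distances(Xc, Xnc).mean()
print(ratio / len(np.unique(labels)))   # ~0.007 for these well-separated clusters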
def _compute_score(q, X, metric):
""" Internal method to compute the scores """
from .metrics import _scale_cosine_similarity
dist = pairwise_distances(q, X, 'cosine')
dist = dist[0]
scores = 1 - dist
scores = _scale_cosine_similarity(scores, metric=metric)
return scores
def draw_features_and_similarity(mm, words_of_interest):
rows, cols, xlabels = mm.filter_submatrix(words_of_interest, 25)
ax = plt.subplot(1, 2, 1)
plot_heat(ax, cols, xlabels, words_of_interest)
# plot_heat(ax,abs(m),numbered)
ax = plt.subplot(1, 2, 2)
t = 1 - pairwise_distances(rows, metric="cosine")
np.fill_diagonal(t, 0)
plot_heat(ax, t, words_of_interest, words_of_interest)
# plt.savefig("m1.pdf")
def score_paragraphs(self, question, paragraphs: List[ExtractedParagraphWithAnswers]):
tfidf = self._tfidf
text = []
for para in paragraphs:
text.append(" ".join(" ".join(s) for s in para.text))
try:
para_features = tfidf.fit_transform(text)
q_features = tfidf.transform([" ".join(question)])
except ValueError:
return []
q_words = {x for x in question if x.lower() not in self._stop}
q_words_lower = {x.lower() for x in q_words}
word_matches_features = np.zeros((len(paragraphs), 2))
for para_ix, para in enumerate(paragraphs):
found = set()
found_lower = set()
for sent in para.text:
for word in sent:
if word in q_words:
found.add(word)
elif word.lower() in q_words_lower:
found_lower.add(word.lower())
word_matches_features[para_ix, 0] = len(found)
word_matches_features[para_ix, 1] = len(found_lower)
tfidf = pairwise_distances(q_features, para_features, "cosine").ravel()
starts = np.array([p.start for p in paragraphs])
log_word_start = np.log(starts/400.0 + 1)
first = starts == 0
scores = tfidf * self.TFIDF_W + self.LOG_WORD_START_W * log_word_start + self.FIRST_W * first +\
self.LOWER_WORD_W * word_matches_features[:, 1] + self.WORD_W * word_matches_features[:, 0]
return scores
def rank(self, questions: List[List[str]], paragraphs: List[List[List[str]]]):
tfidf = self._tfidf
para_features = tfidf.fit_transform([" ".join(" ".join(s) for s in x) for x in paragraphs])
q_features = tfidf.transform([" ".join(q) for q in questions])
scores = pairwise_distances(q_features, para_features, "cosine")
return scores
def centroid_pairwise_dist(X,centroids):
return pairwise_distances(X,centroids,metric='euclidean')
def compute_heterogeneity(data, k, centroids, cluster_assignment):
heterogeneity = 0.0
for i in range(k):
        # Select all data points that belong to cluster i
member_data_points = data[cluster_assignment==i, :]
if member_data_points.shape[0] > 0: # check if i-th cluster is non-empty
            # Compute distances from the cluster's centroid to its member data points
distances = pairwise_distances(member_data_points, [centroids[i]], metric='euclidean')
squared_distances = distances**2
heterogeneity += np.sum(squared_distances)
return heterogeneity
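A hedged usage sketch for the two k-means helpers above, with made-up points and centroids; the assignment is computed by nearest centroid, mirroring centroid_pairwise_dist.

import numpy as np
from sklearn.metrics import pairwise_distances

data = np.array([[0.0, 0.0], [0.2, 0.0], [4.0, 4.0], [4.1, 3.9]])
centroids = np.array([[0.1, 0.0], [4.05, 3.95]])
assignment = pairwise_distances(data, centroids, metric='euclidean').argmin(axis=1)
print(compute_heterogeneity(data, k=2, centroids=centroids, cluster_assignment=assignment))  # ~0.03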
def _compute_radii(self):
"""Generate RBF radii"""
# use supplied radii if present
radii = self._get_user_components('radii')
# compute radii
if (radii is None):
centers = self.components_['centers']
n_centers = centers.shape[0]
max_dist = np.max(pairwise_distances(centers))
radii = np.ones(n_centers) * max_dist/sqrt(2.0 * n_centers)
self.components_['radii'] = radii
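A standalone sketch of the radius heuristic above with toy centers (variable names are mine): every RBF gets the same radius, the maximum pairwise distance between centers divided by sqrt(2 * n_centers).

import numpy as np
from math import sqrt
from sklearn.metrics import pairwise_distances

centers = np.array([[0.0, 0.0], [1.0, 0.0], [0.0, 2.0]])
max_dist = np.max(pairwise_distances(centers))
radii = np.ones(len(centers)) * max_dist / sqrt(2.0 * len(centers))
print(radii)   # three identical radii, here ~0.91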
def parameter_distance(params, dist_metric='canberra', scale='minmax', return_scaled=False):
"""
Computes distances between subjects' respective parameter estimates
Parameters
----------
    params : ndarray(shape=(nsubjects, nparams))
Array of parameter estimates
dist_metric : str (default='canberra')
Distance metric to be used. Can take any value acceptable by ``sklearn.metrics.pairwise_distances``.
scale : {'minmax', 'standard', 'none'}
How to scale the parameters for distance computation
return_scaled : bool
Whether to return scaled parameters
"""
if scale != 'none':
if scale == 'minmax':
scaler = MinMaxScaler()
if scale == 'standard':
scaler = StandardScaler()
nparams = np.shape(params)[1]
for j in range(nparams):
scaledparam = scaler.fit_transform(params[:, j].reshape(-1, 1))
params[:, j] = scaledparam.flatten()
if return_scaled is True:
D = (pairwise_distances(params, metric=dist_metric), params)
else:
D = pairwise_distances(params, metric=dist_metric)
return D
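A hedged usage example for parameter_distance with toy values; it assumes MinMaxScaler, StandardScaler and pairwise_distances are already imported in the module, as the function body implies. Note that scaling modifies params in place when scale != 'none'.

import numpy as np

params = np.array([[0.1, 5.0],
                   [0.2, 4.0],
                   [0.9, 1.0]])                       # 3 subjects, 2 parameters
D = parameter_distance(params, dist_metric='canberra', scale='minmax')
print(D.shape)                                        # (3, 3) symmetric distance matrix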
def construct_k_nearest_matrix(self, dt_matrix, k):
tmp = np.array(1 - pairwise_distances(dt_matrix[dt_matrix.columns[1:]], metric = "cosine"))
similarity_matrix = pd.DataFrame(tmp, index = dt_matrix.index.tolist(), columns = dt_matrix.index.tolist())
for i in similarity_matrix.index:
tmp = [int(i),[]]
j = 0
while j < k:
max_col = similarity_matrix.loc[i].idxmax(axis = 1)
similarity_matrix.loc[i][max_col] = -1
if max_col != i:
tmp[1].append(int(max_col)) #max column name
j += 1
self.k_nearest.append(tmp)
def test_precomputed_cross_validation():
# Ensure array is split correctly
rng = np.random.RandomState(0)
X = rng.rand(20, 2)
D = pairwise_distances(X, metric='euclidean')
y = rng.randint(3, size=20)
for Est in (neighbors.KNeighborsClassifier,
neighbors.RadiusNeighborsClassifier,
neighbors.KNeighborsRegressor,
neighbors.RadiusNeighborsRegressor):
metric_score = cross_val_score(Est(), X, y)
precomp_score = cross_val_score(Est(metric='precomputed'), D, y)
assert_array_equal(metric_score, precomp_score)
def test_non_euclidean_kneighbors():
rng = np.random.RandomState(0)
X = rng.rand(5, 5)
# Find a reasonable radius.
    dist_array = pairwise_distances(X).flatten()
    dist_array = np.sort(dist_array)  # np.sort returns a copy; keep the sorted result
    radius = dist_array[15]
# Test kneighbors_graph
for metric in ['manhattan', 'chebyshev']:
nbrs_graph = neighbors.kneighbors_graph(
X, 3, metric=metric, mode='connectivity',
include_self=True).toarray()
nbrs1 = neighbors.NearestNeighbors(3, metric=metric).fit(X)
assert_array_equal(nbrs_graph, nbrs1.kneighbors_graph(X).toarray())
# Test radiusneighbors_graph
for metric in ['manhattan', 'chebyshev']:
nbrs_graph = neighbors.radius_neighbors_graph(
X, radius, metric=metric, mode='connectivity',
include_self=True).toarray()
nbrs1 = neighbors.NearestNeighbors(metric=metric, radius=radius).fit(X)
assert_array_equal(nbrs_graph, nbrs1.radius_neighbors_graph(X).A)
# Raise error when wrong parameters are supplied,
X_nbrs = neighbors.NearestNeighbors(3, metric='manhattan')
X_nbrs.fit(X)
assert_raises(ValueError, neighbors.kneighbors_graph, X_nbrs, 3,
metric='euclidean')
X_nbrs = neighbors.NearestNeighbors(radius=radius, metric='manhattan')
X_nbrs.fit(X)
assert_raises(ValueError, neighbors.radius_neighbors_graph, X_nbrs,
radius, metric='euclidean')
def test_silhouette():
# Tests the Silhouette Coefficient.
dataset = datasets.load_iris()
X = dataset.data
y = dataset.target
D = pairwise_distances(X, metric='euclidean')
# Given that the actual labels are used, we can assume that S would be
# positive.
silhouette = silhouette_score(D, y, metric='precomputed')
assert(silhouette > 0)
# Test without calculating D
silhouette_metric = silhouette_score(X, y, metric='euclidean')
assert_almost_equal(silhouette, silhouette_metric)
# Test with sampling
silhouette = silhouette_score(D, y, metric='precomputed',
sample_size=int(X.shape[0] / 2),
random_state=0)
silhouette_metric = silhouette_score(X, y, metric='euclidean',
sample_size=int(X.shape[0] / 2),
random_state=0)
assert(silhouette > 0)
assert(silhouette_metric > 0)
assert_almost_equal(silhouette_metric, silhouette)
# Test with sparse X
X_sparse = csr_matrix(X)
D = pairwise_distances(X_sparse, metric='euclidean')
silhouette = silhouette_score(D, y, metric='precomputed')
assert(silhouette > 0)
def test_spectral_amg_mode():
# Test the amg mode of SpectralClustering
centers = np.array([
[0., 0., 0.],
[10., 10., 10.],
[20., 20., 20.],
])
X, true_labels = make_blobs(n_samples=100, centers=centers,
cluster_std=1., random_state=42)
D = pairwise_distances(X) # Distance matrix
S = np.max(D) - D # Similarity matrix
S = sparse.coo_matrix(S)
try:
from pyamg import smoothed_aggregation_solver
amg_loaded = True
except ImportError:
amg_loaded = False
if amg_loaded:
labels = spectral_clustering(S, n_clusters=len(centers),
random_state=0, eigen_solver="amg")
# We don't care too much that it's good, just that it *worked*.
# There does have to be some lower limit on the performance though.
assert_greater(np.mean(labels == true_labels), .3)
else:
assert_raises(ValueError, spectral_embedding, S,
n_components=len(centers),
random_state=0, eigen_solver="amg")
def test_spectral_unknown_assign_labels():
# Test that SpectralClustering fails with an unknown assign_labels set.
centers = np.array([
[0., 0., 0.],
[10., 10., 10.],
[20., 20., 20.],
])
X, true_labels = make_blobs(n_samples=100, centers=centers,
cluster_std=1., random_state=42)
D = pairwise_distances(X) # Distance matrix
S = np.max(D) - D # Similarity matrix
S = sparse.coo_matrix(S)
assert_raises(ValueError, spectral_clustering, S, n_clusters=2,
random_state=0, assign_labels="<unknown>")
def train_wordfilter_coefficient(self, seed_words, wordfilters):
mined_words = defaultdict(lambda: defaultdict(lambda: 0))
filter_set = {wordfilter for (rng, wordfilter) in wordfilters}
ranges = {rng for (rng, wordfilter) in wordfilters}
for num_doc, doc in enumerate(Word2vecCorpus(self.corpus_file)):
len_doc = len(doc)
for rng in ranges:
(fb, fe) = rng
if len_doc < (fe - fb + 1):
continue
words = doc[-fb:-fe]
contexts = []
for i, word in enumerate(doc):
if (i + fb < 0) or (i + fe >= len_doc):
continue
contexts.append(tuple([doc[i+r] for r in range(fb, fe+1) if r != 0]))
for i, context in enumerate(contexts):
if context in filter_set:
mined_words[(rng, context)][words[i]] += 1
result = []
seeds_idx = sorted([self.word2index[seed] for seed in seed_words])
seeds_vec = [self.word2vec_model.syn0[idx] for idx in seeds_idx]
for ((rng, context), word2freq) in sorted(mined_words.items(), key=lambda x:sum(x[1].values()), reverse=True):
word_freq = [(self.word2index[word], freq) for (word, freq) in word2freq.items()]
word_freq = [v for v in word_freq if v[0] != -1]
word_freq = sorted(word_freq)
idx = [pair[0] for pair in word_freq]
word_vec = self.word2vec_model.syn0[idx]
sum_freq = sum([v[1] for v in word_freq])
score = 0
for seed_vec in seeds_vec:
sim = 1 + -1 * pairwise_distances(word_vec, seed_vec, metric='cosine')
score += sum([wf[1] * s for wf, s in zip(word_freq, sim)]) / sum_freq
score /= len(seed_words)
result.append((context, rng, score, sum_freq))
return result
def likelihood_distance(loglik_func, data, params, diff_metric='sq', dist_metric='cosine', verbose=False):
"""
Estimates the likelihood of the data from the i'th subject using the parameter estimates of the j'th subject, for all i and j, then computes the distance between subjects' likelihood difference vectors
Parameters
----------
loglik_func : function
The log-likelihood function to be used
data : dict
Data formatted for input into the log-likelihood function
params : ndarray(shape=(nsubjects, nparams))
Array of parameter estimates
diff_metric : {'sq', 'diff', 'abs'}
Which type of difference measure to compute, 'diff' is simple subtractive difference, whereas 'sq' and 'abs' are the squared and absolute differences, respectively
dist_metric : str (default='cosine')
The pairwise distance metric to use. Any option that can be passed into ``sklearn.metrics.pairwise_distances`` can work.
verbose : bool
Whether to print out progress
Returns
-------
ndarray(shape=(nsubjects, nsubjects))
"""
nsubjects = np.shape(params)[0]
D = np.zeros([nsubjects, nsubjects])
for i in range(nsubjects):
S = data[i]['S']
A = data[i]['A']
R = data[i]['R']
if verbose is True:
print('Likelihood Differences: Subject ' + str(i))
# Compute loglikelihood for subject i with own data
LL0 = loglik_func(params=params[i, :],
states=S,
actions=A,
rewards=R)
for j in range(nsubjects):
if i !=j:
LL1 = loglik_func(params=params[j, :],
states=S,
actions=A,
rewards=R)
if diff_metric == 'diff':
D[i, j] = LL1 - LL0
elif diff_metric == 'sq':
D[i, j] = (LL1 - LL0)**2
elif diff_metric == 'abs':
D[i, j] = np.abs(LL1 - LL0)
return pairwise_distances(D, metric=dist_metric)
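A hedged toy usage of likelihood_distance: a trivial Bernoulli-style log-likelihood and made-up data, only to show the expected input and output shapes (the real function expects fitted model parameters and task data).

import numpy as np

np.random.seed(0)

def toy_loglik(params, states, actions, rewards):
    # log-likelihood of binary actions under a single choice-probability parameter
    p = params[0]
    return np.sum(actions * np.log(p) + (1 - actions) * np.log(1 - p))

data = {i: {'S': np.zeros(10), 'A': np.random.binomial(1, 0.6, 10), 'R': np.zeros(10)}
        for i in range(3)}
params = np.array([[0.4], [0.5], [0.7]])
D = likelihood_distance(toy_loglik, data, params, diff_metric='sq', dist_metric='cosine')
print(D.shape)   # (3, 3) distance matrix between subjects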
def test_precomputed(random_state=42):
"""Tests unsupervised NearestNeighbors with a distance matrix."""
# Note: smaller samples may result in spurious test success
rng = np.random.RandomState(random_state)
X = rng.random_sample((10, 4))
Y = rng.random_sample((3, 4))
DXX = metrics.pairwise_distances(X, metric='euclidean')
DYX = metrics.pairwise_distances(Y, X, metric='euclidean')
for method in ['kneighbors']:
# TODO: also test radius_neighbors, but requires different assertion
# As a feature matrix (n_samples by n_features)
nbrs_X = neighbors.NearestNeighbors(n_neighbors=3)
nbrs_X.fit(X)
dist_X, ind_X = getattr(nbrs_X, method)(Y)
# As a dense distance matrix (n_samples by n_samples)
nbrs_D = neighbors.NearestNeighbors(n_neighbors=3, algorithm='brute',
metric='precomputed')
nbrs_D.fit(DXX)
dist_D, ind_D = getattr(nbrs_D, method)(DYX)
assert_array_almost_equal(dist_X, dist_D)
assert_array_almost_equal(ind_X, ind_D)
# Check auto works too
nbrs_D = neighbors.NearestNeighbors(n_neighbors=3, algorithm='auto',
metric='precomputed')
nbrs_D.fit(DXX)
dist_D, ind_D = getattr(nbrs_D, method)(DYX)
assert_array_almost_equal(dist_X, dist_D)
assert_array_almost_equal(ind_X, ind_D)
# Check X=None in prediction
dist_X, ind_X = getattr(nbrs_X, method)(None)
dist_D, ind_D = getattr(nbrs_D, method)(None)
assert_array_almost_equal(dist_X, dist_D)
assert_array_almost_equal(ind_X, ind_D)
# Must raise a ValueError if the matrix is not of correct shape
assert_raises(ValueError, getattr(nbrs_D, method), X)
target = np.arange(X.shape[0])
for Est in (neighbors.KNeighborsClassifier,
neighbors.RadiusNeighborsClassifier,
neighbors.KNeighborsRegressor,
neighbors.RadiusNeighborsRegressor):
print(Est)
est = Est(metric='euclidean')
est.radius = est.n_neighbors = 1
pred_X = est.fit(X, target).predict(Y)
est.metric = 'precomputed'
pred_D = est.fit(DXX, target).predict(DYX)
assert_array_almost_equal(pred_X, pred_D)