def get_nearby_words(main_words):
main_inds = {}
all_words = []
all_vecs = []
    with open(OPTS.wordvec_file, encoding='ISO-8859-1') as f:
        for i, line in tqdm(enumerate(f)):
            toks = line.rstrip().split(' ')
            word = toks[0]
vec = np.array([float(x) for x in toks[1:]])
all_words.append(word)
all_vecs.append(vec)
if word in main_words:
main_inds[word] = i
    print('Found vectors for %d/%d words = %.2f%%' % (
        len(main_inds), len(main_words),
        100.0 * len(main_inds) / len(main_words)), file=sys.stderr)
tree = KDTree(all_vecs)
nearby_words = {}
for word in tqdm(main_inds):
dists, inds = tree.query([all_vecs[main_inds[word]]],
k=OPTS.num_neighbors + 1)
nearby_words[word] = [
{'word': all_words[i], 'dist': d} for d, i in zip(dists[0], inds[0])]
return nearby_words
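A minimal driver sketch for the function above, assuming the module-level imports it relies on (numpy, tqdm, sys, and sklearn's KDTree) and that OPTS carries the two attributes used in the body; the argparse wiring and file name are illustrative:

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--wordvec_file', default='glove.6B.50d.txt')  # hypothetical default
parser.add_argument('--num_neighbors', type=int, default=5)
OPTS = parser.parse_args()

nearby = get_nearby_words({'king', 'queen'})
# each list holds num_neighbors + 1 hits; the first is the query word itself at dist 0.0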
def _nearest_neighbor(df1, df2):
"""
For a DataFrame of xy coordinates find the nearest xy
coordinates in a subsequent DataFrame
Parameters
----------
df1 : pandas.DataFrame
DataFrame of records to return as the nearest record to records in df2
df2 : pandas.DataFrame
DataFrame of records with xy coordinates for which to find the
        nearest record in df1
Returns
-------
    df1.index.values[indexes] : numpy.ndarray
        index values of records in df1 that are nearest to the coordinates in df2
"""
    kdt = KDTree(df1.to_numpy())
    indexes = kdt.query(df2.to_numpy(), k=1, return_distance=False)
return df1.index.values[indexes]
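A runnable usage sketch for _nearest_neighbor; the frames and column names are invented:

import pandas as pd
from sklearn.neighbors import KDTree

stops = pd.DataFrame({'x': [0.0, 10.0], 'y': [0.0, 10.0]}, index=['a', 'b'])
homes = pd.DataFrame({'x': [1.0, 9.0], 'y': [1.0, 8.0]})
print(_nearest_neighbor(stops, homes))  # [['a'] ['b']] -- nearest stop per home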
def _add(self, state, r):
if self._current_capacity >= self._capacity:
# find the LRU entry
            old_index = np.argmin(self._lru)
self._states[old_index] = state
self._q_values[old_index] = r
self._lru[old_index] = self._time
else:
self._states[self._current_capacity] = state
self._q_values[self._current_capacity] = r
self._lru[self._current_capacity] = self._time
self._current_capacity += 1
self._time += 0.01
        # rebuild the KD-tree over the currently stored states
        self._tree = KDTree(self._states[:self._current_capacity])
        # TODO: _time grows without bound
def test_unsupervised_inputs():
# test the types of valid input into NearestNeighbors
X = rng.random_sample((10, 3))
nbrs_fid = neighbors.NearestNeighbors(n_neighbors=1)
nbrs_fid.fit(X)
dist1, ind1 = nbrs_fid.kneighbors(X)
nbrs = neighbors.NearestNeighbors(n_neighbors=1)
    for data in (nbrs_fid, neighbors.BallTree(X), neighbors.KDTree(X)):
        nbrs.fit(data)
dist2, ind2 = nbrs.kneighbors(X)
assert_array_almost_equal(dist1, dist2)
assert_array_almost_equal(ind1, ind2)
def retrieve_image(target_image, model_file, deploy_file, imagemean_file,
threshold=1):
model_dir = os.path.dirname(model_file)
image_files = np.load(os.path.join(model_dir, 'image_files.npy'))
fc7_feature_mat = np.load(os.path.join(model_dir, 'fc7_features.npy'))
latent_feature_file = os.path.join(model_dir, 'latent_features.npy')
latent_feature_mat = np.load(latent_feature_file)
candidates = []
dist = 0
for layer, mat in layer_features(['latent', 'fc7'], model_file,
deploy_file, imagemean_file,
[target_image], show_pred=True):
if layer == 'latent':
# coarse-level search
mat = binary_hash_codes(mat)
            # tile the 1-row query code to match the database matrix shape
            mat = mat * np.ones((latent_feature_mat.shape[0], 1))
dis_mat = np.abs(mat - latent_feature_mat)
hamming_dis = np.sum(dis_mat, axis=1)
distance_file = os.path.join(model_dir, 'hamming_dis.npy')
np.save(distance_file, hamming_dis)
candidates = np.where(hamming_dis < threshold)[0]
if layer == 'fc7':
# fine-level search
kdt = KDTree(fc7_feature_mat[candidates], metric='euclidean')
            # never request more neighbors than there are candidates
            k = min(6, candidates.shape[0])
            dist, idxs = kdt.query(mat, k=k)
candidates = candidates[idxs]
print(dist)
return image_files[candidates][0], dist[0]
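The coarse stage above filters the database by Hamming distance on binary latent codes before the fine fc7 search; a tiny self-contained illustration of that filtering step (the 4-bit codes are invented):

import numpy as np

db_codes = np.array([[1, 0, 1, 1],
                     [0, 0, 1, 1],
                     [1, 1, 0, 0]])                    # database latent codes
query = np.array([[1, 0, 1, 1]])                       # query latent code
hamming_dis = np.abs(query - db_codes).sum(axis=1)     # -> [0, 1, 3]
candidates = np.where(hamming_dis < 2)[0]              # -> [0, 1]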
def _find_neighborhoods(self, targets, constraints):
if not constraints:
raise ValueError('No constraints in neighbor search.')
if any(np.isnan(v) for v in constraints.values()):
raise ValueError('Nan constraints in neighbor search.')
# Extract the targets, constraints from the dataset.
lookup = list(targets) + list(constraints)
D = self._dataset(lookup)
# Not enough neighbors: crash for now. Workarounds include:
# (i) reduce K, (ii) randomly drop constraints, (iii) impute dataset.
if len(D) < self.K:
raise ValueError('Not enough neighbors: %s'
% ((targets, constraints),))
# Code the dataset with Euclidean embedding.
N = len(targets)
D_qr_code = self._dummy_code(D[:,:N], lookup[:N])
D_ev_code = self._dummy_code(D[:,N:], lookup[N:])
D_code = np.column_stack((D_qr_code, D_ev_code))
# Run nearest neighbor search on the constraints only.
    constraints_code = self._dummy_code(
        [list(constraints.values())], list(constraints.keys()))
dist, neighbors = KDTree(D_ev_code).query(constraints_code, k=len(D))
# Check for equidistant neighbors and possibly extend the search.
valid = [i for i, d in enumerate(dist[0]) if d <= dist[0][self.K-1]]
if self.K < len(valid):
neighbors = self.rng.choice(neighbors[0][valid],
replace=False, size=self.K)
else:
neighbors = neighbors[0][:self.K]
# For each neighbor, find its nearest M on the full lookup set.
_, ex = KDTree(D_code).query(D_code[neighbors], k=min(self.M, self.K))
# Return the dataset and the list of neighborhoods.
return D[:,:len(targets)], ex
def add(self, keys, values):
skip_indices = []
if self.curr_capacity >= 1:
dist, ind = self.tree.query(keys, k=1)
for i, d in enumerate(dist):
if d[0] < 0.001:
new_value = values[i]
index = ind[i][0]
self.q_values[index] = self.q_values[index]*0.9 + new_value*0.1
skip_indices.append(i)
for i, _ in enumerate(keys):
if i in skip_indices: continue
if self.curr_capacity >= self.capacity:
# find the LRU entry
old_index = np.argmin(self.lru)
self.states[old_index] = keys[i]
self.q_values[old_index] = values[i]
self.lru[old_index] = self.tm
else:
self.states[self.curr_capacity] = keys[i]
self.q_values[self.curr_capacity] = values[i]
self.lru[self.curr_capacity] = self.tm
            self.curr_capacity += 1
self.tm += 0.01
self.tree = KDTree(self.states[:self.curr_capacity])
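A hedged sketch of the read path that typically pairs with an add() like the one above, reusing the same attribute names (tree, q_values, curr_capacity); the default k is arbitrary:

def lookup(self, keys, k=5):
    # query the KD-tree for the k nearest stored states
    k = min(k, self.curr_capacity)
    dist, ind = self.tree.query(keys, k=k)
    # return the cached Q-values of those neighbors, one row per key
    return self.q_values[ind], dist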
def add(self, keys, values):
if self.curr_capacity >= 5:
        # the tree stores [state, weight] rows; pad queries with weight=1.0 to match
        dist, ind = self.tree.query(
            np.pad(keys, ((0, 0), (0, 1)), 'constant', constant_values=1.0), k=5)
#dist, ind = self.tree.query(keys, k=50)
        for ind_ in ind:
            decay = 1 - self.alpha  # down-weight previously matched neighbors
            self.weights[ind_] = self.weights[ind_] * decay
for i, _ in enumerate(keys):
low_w = 1.0
if self.curr_capacity >= self.capacity:
# find the LRU entry
old_index = np.argmin(self.weights)
low_w = min(low_w, self.weights[old_index])
index = old_index
else:
index = self.curr_capacity
                self.curr_capacity += 1
self.states[index] = keys[i]
self.q_values[index] = values[i]
self.weights[index] = 1.0
self.tree = KDTree(np.concatenate((self.states[:self.curr_capacity], np.expand_dims(self.weights[:self.curr_capacity], axis=1)),axis=1))
#self.tree = KDTree(self.states[:self.curr_capacity])
def plot_decision_boundry(data, pipe, reducer=PCA):
fig, ax = plt.subplots(figsize=(16, 12))
if callable(reducer):
reducer = reducer(n_components=2)
# else assume it's already been instantiated...
if isinstance(pipe, Pipeline) and len(pipe.steps) > 1:
prepipe = Pipeline(pipe.steps[:-1])
km = pipe.steps[-1][1]
data_ = prepipe.transform(data)
elif isinstance(pipe, Pipeline):
prepipe = None
km = pipe.steps[0][1]
data_ = data
else:
prepipe = None
km = pipe
data_ = data
X_reduced = reducer.fit_transform(data_)
    cluster_centers = getattr(km, 'cluster_centers_', None)
    if cluster_centers is None:
        cluster_centers = compute_centers(km, data_)
mu_reduced = reducer.transform(cluster_centers)
n_clusters = len(np.unique(km.labels_))
tree = KDTree(mu_reduced)
cmap = rediscretize_cmap(n_clusters, 'Set1')
ax.scatter(mu_reduced[:, 0], mu_reduced[:, 1],
c=np.arange(n_clusters), cmap=cmap,
s=300)
colorbar_index(ncolors=n_clusters, cmap=cmap)
ax.scatter(X_reduced[:, 0], X_reduced[:, 1], c=km.labels_,
cmap=cmap, alpha=.95)
xmin, xmax = ax.get_xlim()
ymin, ymax = ax.get_ylim()
xx, yy = np.meshgrid(np.linspace(xmin, xmax, 100),
np.linspace(ymin, ymax, 100))
T = np.c_[xx.ravel(), yy.ravel()]
_, group = tree.query(T)
Z = group.ravel().reshape(xx.shape)
ax.pcolormesh(xx, yy, Z, alpha=.25, cmap=cmap)
ax.set_xlim(xmin, xmax)
ax.set_ylim(ymin, ymax)
for label, xy in enumerate(mu_reduced[:, :2]):
ax.annotate(label, xy, fontsize=28, fontweight="bold")
return ax
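A usage sketch for the plotting helper, assuming its module-level dependencies (matplotlib's plt, PCA, Pipeline, and the compute_centers / rediscretize_cmap / colorbar_index helpers referenced above) are available:

from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs

X, _ = make_blobs(n_samples=300, centers=4, random_state=0)
km = KMeans(n_clusters=4, random_state=0).fit(X)
ax = plot_decision_boundry(X, km)  # shades each PCA-reduced Voronoi cell by cluster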
def estimatenormals(points, npoints=40, method='pca'):
    """
    estimate the normals of points
    :param points: an array of [x, y, z] points
    :param npoints: number of neighboring points used to estimate each normal
    :param method: 'pca' or 'ransac'; in theory ransac is more precise when there are more points
    :return: a list of normal vectors
    author: weiwei
    date: 20170714
    """
pointsnormals = []
camerapos = np.array([0.0,0.0,0.0])
kdt = KDTree(points)
if method == 'pca':
regionpntidlist = kdt.query(points, k=npoints, return_distance=False)
for i, pntidlist in enumerate(regionpntidlist):
regionpnts = points[pntidlist]
covmat = np.cov(regionpnts.T)
eigvalues, eigmat = np.linalg.eig(covmat)
idx = np.argmin(eigvalues)
eigvec = eigmat[:, idx]
if np.dot(eigvec, camerapos-points[i]) < 0:
eigvec = -eigvec
pointsnormals.append(eigvec)
elif method == 'ransac':
# NOTE: this part is not usable due to small npoints
ransacer = linear_model.RANSACRegressor(linear_model.LinearRegression())
regionpntidlist = kdt.query(points, k=npoints, return_distance=False)
for i, pntidlist in enumerate(regionpntidlist):
XYZ = points[pntidlist]
ransacer.fit(XYZ[:, 0:2], XYZ[:, 2])
inlier_mask = ransacer.inlier_mask_
regionpnts = XYZ[inlier_mask]
covmat = np.cov(regionpnts.T)
eigvalues, eigmat = np.linalg.eig(covmat)
idx = np.argmin(eigvalues)
eigvec = eigmat[:, idx]
if np.dot(eigvec, camerapos-points[i]) < 0:
eigvec = -eigvec
pointsnormals.append(eigvec)
return pointsnormals
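A quick sanity check one might run on estimatenormals: points sampled on a unit sphere should yield (anti)radial normals, oriented toward the camera at the origin by the sign flip above:

import numpy as np

pts = np.random.randn(500, 3)
pts /= np.linalg.norm(pts, axis=1, keepdims=True)       # unit-sphere samples
normals = estimatenormals(pts, npoints=20, method='pca')
# each normal should be roughly -pts[i], i.e. flipped toward the origin camera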
def test_neighbors_metrics(n_samples=20, n_features=3,
n_query_pts=2, n_neighbors=5):
# Test computing the neighbors for various metrics
# create a symmetric matrix
V = rng.rand(n_features, n_features)
VI = np.dot(V, V.T)
metrics = [('euclidean', {}),
('manhattan', {}),
('minkowski', dict(p=1)),
('minkowski', dict(p=2)),
('minkowski', dict(p=3)),
('minkowski', dict(p=np.inf)),
('chebyshev', {}),
('seuclidean', dict(V=rng.rand(n_features))),
('wminkowski', dict(p=3, w=rng.rand(n_features))),
('mahalanobis', dict(VI=VI))]
algorithms = ['brute', 'ball_tree', 'kd_tree']
X = rng.rand(n_samples, n_features)
test = rng.rand(n_query_pts, n_features)
for metric, metric_params in metrics:
results = []
p = metric_params.pop('p', 2)
for algorithm in algorithms:
# KD tree doesn't support all metrics
if (algorithm == 'kd_tree' and
metric not in neighbors.KDTree.valid_metrics):
assert_raises(ValueError,
neighbors.NearestNeighbors,
algorithm=algorithm,
metric=metric, metric_params=metric_params)
continue
neigh = neighbors.NearestNeighbors(n_neighbors=n_neighbors,
algorithm=algorithm,
metric=metric, p=p,
metric_params=metric_params)
neigh.fit(X)
results.append(neigh.kneighbors(test, return_distance=True))
assert_array_almost_equal(results[0][0], results[1][0])
assert_array_almost_equal(results[0][1], results[1][1])
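For reference, the metric list consulted by the kd_tree guard above is exposed as a class attribute:

from sklearn.neighbors import KDTree
print(KDTree.valid_metrics)  # e.g. ['euclidean', 'l2', 'minkowski', ...]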