def __knn_sklearn(X, k, n_jobs=-1, verbose=False, **kwargs):
    """Query the ``k + 1`` nearest neighbors of every row of ``X`` within ``X``.

    A ball-tree ``NearestNeighbors`` index is fitted on ``X`` and then
    queried with ``X`` itself. The first column of both results is dropped
    before returning (presumably the self-match of each point).

    Parameters
    ----------
    X : array exposing a ``dtype`` attribute
        Data that is both indexed and queried.
    k : int
        Number of neighbors kept per row once the first column is removed.
    n_jobs : int
        Forwarded to ``NearestNeighbors``.
    verbose : bool
        When true, print a message after indexing and after querying.
    **kwargs
        Additional keyword arguments forwarded to ``NearestNeighbors``.

    Returns
    -------
    tuple
        ``(distances, indices)`` without their first column; the distances
        are cast to ``X.dtype``.
    """
    n_query = k + 1
    index = NearestNeighbors(n_neighbors=n_query, n_jobs=n_jobs,
                             algorithm='ball_tree', **kwargs)
    index.fit(X)
    if verbose:
        print('Indexing done.')

    distances, indices = index.kneighbors(X, n_query, return_distance=True)
    if verbose:
        print('Query done.')

    # Column 0 is discarded from both arrays.
    trimmed_distances = distances[:, 1:].astype(X.dtype)
    trimmed_indices = indices[:, 1:]
    return trimmed_distances, trimmed_indices
# Example source code for the Python class NearestNeighbors()
def encode(self, data, metric = 'euclidean'):
    """Encode ``data`` with the codebook using a nearest-neighbor rule.

    Each sample is replaced by its single nearest prototype in
    ``self.protos``. Both results are also stored on the instance.

    Parameters
    ----------
    data : real array-like, shape(n_samples, n_features)
        Data matrix, each row represents a sample.
    metric : string
        Distance metric forwarded to ``NearestNeighbors``. See
        http://scikit-learn.org/stable/modules/generated/sklearn.metrics.pairwise.pairwise_distances.html
        Valid options include:
        - euclidean
        - cityblock
        - l1
        - cosine

    Returns
    -------
    encoded_data : array
        ``self.protos`` indexed by ``ts_symbols``. Assuming ``protos`` is a
        2-D NumPy array, this indexing yields shape
        (n_samples, 1, n_features).
    ts_symbols : array, shape(n_samples, 1)
        Index of the nearest prototype for each sample, i.e. a discrete
        symbolic time series.
    """
    codebook = self.protos
    index = NearestNeighbors(n_neighbors=1, algorithm='auto', metric=metric)
    index = index.fit(codebook)

    # kneighbors returns (distances, indices); only the indices are kept.
    nearest = index.kneighbors(data)[1]
    self.__symbols = nearest
    self.__encoding = self.protos[self.__symbols]
    return (self.__encoding, self.__symbols)
def encode(self, data, metric = 'euclidean'):
    """Map every sample of ``data`` to its closest codebook prototype.

    A one-nearest-neighbor search is run against ``self.protos``; the
    resulting indices and the matching prototypes are stored on the
    instance and returned.

    Parameters
    ----------
    data : real array-like, shape(n_samples, n_features)
        Data matrix, each row represents a sample.
    metric : string
        Distance metric forwarded to ``NearestNeighbors``. See
        `http://scikit-learn.org/stable/modules/generated/sklearn.metrics.pairwise.pairwise_distances.html`.
        Valid options include:
        - euclidean
        - cityblock
        - l1
        - cosine

    Returns
    -------
    encoded_data : array
        Prototypes selected by ``ts_symbols``. Assuming ``protos`` is a
        2-D NumPy array, the result has shape (n_samples, 1, n_features).
    ts_symbols : array, shape(n_samples, 1)
        Nearest-prototype index per sample (a discrete symbolic time
        series).
    """
    searcher = NearestNeighbors(
        n_neighbors=1,
        algorithm='auto',
        metric=metric,
    ).fit(self.protos)

    _distances, symbols = searcher.kneighbors(data)
    self.__symbols = symbols
    self.__encoding = self.protos[self.__symbols]
    return (self.__encoding, self.__symbols)
def fit(self, data):
    """Learn data, and construct a vector codebook.

    ``self.n_protos`` prototypes are initialised from randomly chosen rows
    of ``data``. For ``self.iterations`` steps one random sample is drawn,
    the prototypes are ranked by Euclidean distance to it, and every
    prototype is moved towards the sample with a weight that decays
    exponentially with its rank.

    Parameters
    ----------
    data : real array-like, shape(n_samples, n_features)
        Data matrix, each row represents a sample.

    Returns
    -------
    self : object
        The instance itself
    """
    n_samples, _n_features = data.shape
    initial_rows = self.rng.choice(n_samples, self.n_protos)
    self.protos = data[initial_rows, ]

    for step in range(self.iterations):
        picked_row = self.rng.choice(n_samples, 1)
        sample = data[picked_row, ]

        # Fraction of training completed; drives both decay schedules.
        t = step / float(self.iterations)
        lrate = self.lrate_i * (self.lrate_f / float(self.lrate_i)) ** t
        epsilon = self.epsilon_i * (self.epsilon_f / float(self.epsilon_i)) ** t

        dists = pairwise_distances(sample, self.protos, metric='euclidean', n_jobs=self.n_jobs)
        # Double argsort turns distances into ranks (0 = closest prototype).
        ranks = np.argsort(np.argsort(dists))
        weights = np.exp(-ranks / epsilon).ravel()

        # Offsets are computed once, before any prototype is moved.
        offsets = sample - self.protos
        for proto_id, weight in enumerate(weights):
            self.protos[proto_id, :] += lrate * weight * offsets[proto_id, :]
    return self
def encode(self, data, metric='euclidean'):
    """Encode ``data`` with a codebook that is first ordered along one dimension.

    The prototypes are embedded into a single dimension with MDS and
    sorted by that coordinate. A one-nearest-neighbor search against the
    sorted prototypes then yields the symbols. Both results are also
    stored on the instance.

    Parameters
    ----------
    data : real array-like, shape(n_samples, n_features)
        Data matrix, each row represents a sample.
    metric : string
        Distance metric forwarded to ``NearestNeighbors``. See
        http://scikit-learn.org/stable/modules/generated/sklearn.metrics.pairwise.pairwise_distances.html
        Valid options include:
        - euclidean
        - cityblock
        - l1
        - cosine

    Returns
    -------
    encoded_data : array
        Sorted prototypes indexed by ``ts_symbols``. Assuming ``protos``
        is a 2-D NumPy array, the shape is (n_samples, 1, n_features).
    ts_symbols : array, shape(n_samples, 1)
        Index into the *sorted* prototypes for each sample (a discrete
        symbolic time series).
    """
    # Perform a proposed data mining procedure as described in [Laskaris2004].
    embedder = MDS(1, random_state=self.rng)
    coords = embedder.fit_transform(self.protos).ravel()
    order = np.argsort(coords)
    ordered_protos = self.protos[order]

    searcher = NearestNeighbors(n_neighbors=1, algorithm='auto', metric=metric)
    searcher = searcher.fit(ordered_protos)
    _distances, symbols = searcher.kneighbors(data)

    self.__symbols = symbols
    self.__encoding = ordered_protos[self.__symbols]
    return (self.__encoding, self.__symbols)
def __init__(self,n_neighbors=5,loss='L2'):
    """Set up the neighbor model, the loss and the solver.

    Parameters
    ----------
    n_neighbors : int
        Number of neighbors, stored and forwarded to ``NearestNeighbors``.
    loss : str or object
        One of the names ``'L1'``, ``'L2'`` or ``'SMAPE'``, which is
        replaced by the module-level object of the same name; any other
        value is used unchanged.
    """
    if loss in ['L1','L2','SMAPE']:
        loss = {'L1':L1,'L2':L2,'SMAPE':SMAPE}[loss]
    self.loss = loss
    self.n_neighbors = n_neighbors
    # n_neighbors is passed by keyword: NearestNeighbors' constructor
    # parameters are keyword-only in current scikit-learn releases, so the
    # positional form raises TypeError there.
    self.model = NearestNeighbors(n_neighbors=n_neighbors,algorithm='auto',n_jobs=-1)
    self.solver = lambda x:solver(x,loss)
def compute_distances(cls, inst_id):
    """Store on ``distto`` the distance from one instance to its nearest indexed features.

    The neighbor index is built lazily, once, and cached in the
    module-level globals ``feat_nn`` / ``feat_ids``. Every fifth record
    (ordered by id) whose face bounding-box height is at least 0.1 is
    indexed. All previously stored ``distto`` values are cleared, then the
    features belonging to ``inst_id`` are queried against the index and
    each neighbor's distance is saved on the matching record.

    Parameters
    ----------
    inst_id
        Value matched against the ``face`` field to select the query record.
    """
    global feat_nn
    global feat_ids

    it = cls.objects.annotate(height=F('face__bbox_y2') - F('face__bbox_y1')).filter(
        height__gte=0.1).order_by('id')
    if feat_nn is None:
        _print('Loading features...')
        feats = list(it[::5])
        feat_ids = np.array([f.id for f in feats])
        feat_vectors = [f.load_features() for f in feats]
        X = np.vstack(feat_vectors)
        _print('Constructing KNN tree...')
        feat_nn = NearestNeighbors().fit(X)
        _print('Done!')

    # Erase distances from previous computation
    prev = list(cls.objects.filter(distto__isnull=False))
    for feat in prev:
        feat.distto = None
    cls.objects.bulk_update(prev)

    query_vector = cls.objects.get(face=inst_id).load_features()
    # kneighbors rejects a request for more neighbors than indexed samples,
    # so the fixed limit of 1000 is clamped to the index size.
    n_query = min(1000, len(feat_ids))
    dists, indices = feat_nn.kneighbors([query_vector], n_query)
    for dist, feat_id in zip(dists[0], feat_ids[indices[0]]):
        feat = cls.objects.get(id=feat_id)
        feat.distto = dist
        feat.save()
def identity_detect(videos, exemplar, features):
    """Collect, per video, the features that lie close to an exemplar face.

    All feature vectors are indexed together. The exemplar's 'facenet'
    feature vector is queried against the index, and neighbors are
    accepted in the order returned until the first one whose distance
    exceeds ``FEATURE_DISTANCE_THRESHOLD``.

    Parameters
    ----------
    videos : sequence
        Objects with an ``id`` attribute; ``videos[i]`` corresponds to
        ``features[i]``.
    exemplar
        Value matched against the ``face`` field of ``FaceFeatures``.
    features : sequence of sequences
        ``features[i][j]`` is the j-th feature object of video ``i``; each
        provides ``load_features()``.

    Returns
    -------
    list of list
        For every video, in input order, the accepted feature objects
        (an empty list when none were accepted).
    """
    log.debug('Loading features')
    loaded = [((i, j), f.load_features())
              for i, vid_features in enumerate(features)
              for j, f in enumerate(vid_features)]
    if not loaded:
        # Nothing to index: unpacking zip(*[]) would raise ValueError, and
        # the answer is simply "no matches" for every video.
        return [[] for _ in videos]
    ids, vectors = zip(*loaded)

    log.debug('Building k-nn tree')
    feat_nn = NearestNeighbors().fit(np.vstack(vectors))

    log.debug('Doing look-up')
    exemplar_vector = FaceFeatures.objects.get(
        face=exemplar, labeler__name='facenet').load_features()
    # Never request more neighbors than there are indexed vectors.
    dists, id_indices = feat_nn.kneighbors([exemplar_vector], min(10000, len(vectors)))

    face_map = defaultdict(list)
    for (dist, k) in zip(dists[0], id_indices[0]):
        (i, j) = ids[k]
        if dist > FEATURE_DISTANCE_THRESHOLD:
            break
        face_map[videos[i].id].append(features[i][j])
    return [face_map[video.id] for video in videos]
# Remove faces with negative coords and small height
def __init__(self,
             MMDLayer,
             MMDTargetTrain,
             MMDTargetValidation_split=0.1,
             MMDTargetSampleSize=1000,
             n_neighbors = 25,
             scales = None,
             weights = None):
    """Store the MMD target data and the kernel scales and weights.

    Parameters
    ----------
    MMDLayer
        Stored unchanged on the instance.
    MMDTargetTrain : 2-D array
        Target samples, one per row. Split into a training and a
        validation part, each stored as a backend variable.
    MMDTargetValidation_split : float
        ``test_size`` handed to ``train_test_split``.
    MMDTargetSampleSize : int
        Number of rows drawn per repetition when estimating scales; also
        stored on the instance.
    n_neighbors : int
        Number of neighbors used when estimating scales.
    scales : array-like, optional
        Kernel scales. When ``None`` they are estimated from
        nearest-neighbor distances within random subsamples of
        ``MMDTargetTrain``.
    weights : array-like, optional
        Scale weights. When ``None`` a default is derived from the number
        of scales.
    """
    # Identity test on purpose: ``scales == None`` is evaluated element-wise
    # for NumPy arrays, which makes the ``if`` raise on a user-supplied array.
    if scales is None:
        print("setting scales using KNN")
        med = np.zeros(20)
        # NOTE(review): the loop fills indices 1..19 only, so med[0] stays 0
        # and takes part in the median below. Kept as-is so existing results
        # do not change; confirm whether that is intended.
        for ii in range(1,20):
            sample = MMDTargetTrain[np.random.randint(MMDTargetTrain.shape[0], size=MMDTargetSampleSize),:]
            nbrs = NearestNeighbors(n_neighbors=n_neighbors).fit(sample)
            distances,dummy = nbrs.kneighbors(sample)
            #nearest neighbor is the point so we need to exclude it
            med[ii]=np.median(distances[:,1:n_neighbors])
        med = np.median(med)
        scales = [med/2, med, med*2] # CyTOF
        print(scales)
    scales = K.variable(value=np.asarray(scales))
    if weights is None:
        print("setting all scale weights to 1")
        # NOTE(review): this evaluates to the *number* of scales, not to a
        # vector of ones as the message suggests; verify against the kernel.
        weights = K.eval(K.shape(scales)[0])
    weights = K.variable(value=np.asarray(weights))
    self.MMDLayer = MMDLayer
    MMDTargetTrain, MMDTargetValidation = train_test_split(MMDTargetTrain, test_size=MMDTargetValidation_split, random_state=42)
    self.MMDTargetTrain = K.variable(value=MMDTargetTrain)
    self.MMDTargetTrainSize = K.eval(K.shape(self.MMDTargetTrain)[0])
    self.MMDTargetValidation = K.variable(value=MMDTargetValidation)
    self.MMDTargetValidationSize = K.eval(K.shape(self.MMDTargetValidation)[0])
    self.MMDTargetSampleSize = MMDTargetSampleSize
    self.kernel = self.RaphyKernel
    self.scales = scales
    self.weights = weights
#calculate the raphy kernel applied to all entries in a pairwise distance matrix
def __init__(self):
    """Initialise the base classifier and attach a 2-nearest-neighbor model."""
    SingleClassifier.SingleClassifier.__init__(self)
    # weak classifier
    # Algorithms listed by the original author: 'brute', 'ball_tree',
    # 'kd_tree'; 'ball_tree' is the one used.
    weak_learner = NearestNeighbors(algorithm='ball_tree', n_neighbors=2)
    self.clf = weak_learner
def fit(self, X, y=None):
    """Fit the model according to the given training data.

    A ``NearestNeighbors`` index configured from the instance's
    hyper-parameters is fitted on ``X``. The anomaly scores are then
    computed, and the threshold is set to their
    ``100 * (1 - fpr)``-th percentile.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Samples.
    y : ignored
        Not used by this method.

    Returns
    -------
    self : detector
        Return self.
    """
    X = check_array(X)

    knn_params = {
        'metric': self.metric,
        'metric_params': self.metric_params,
        'n_jobs': self.n_jobs,
        'n_neighbors': self.n_neighbors,
        'p': self.p,
    }
    self._knn = NearestNeighbors(**knn_params).fit(X)

    self.y_score_ = self.anomaly_score()
    percentile = 100.0 * (1.0 - self.fpr)
    self.threshold_ = np.percentile(self.y_score_, percentile)
    return self
def fit(self, X, y=None):
    """Fit the model according to the given training data.

    Validates ``X``, fits a ``NearestNeighbors`` index on it, computes the
    anomaly scores and derives the decision threshold as the
    ``100 * (1 - fpr)``-th percentile of those scores.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Samples.
    y : ignored
        Not used by this method.

    Returns
    -------
    self : detector
        Return self.
    """
    X = check_array(X)

    searcher = NearestNeighbors(
        metric=self.metric,
        metric_params=self.metric_params,
        n_jobs=self.n_jobs,
        n_neighbors=self.n_neighbors,
        p=self.p,
    )
    self._knn = searcher.fit(X)

    scores = self.anomaly_score()
    self.y_score_ = scores
    self.threshold_ = np.percentile(self.y_score_, 100.0 * (1.0 - self.fpr))
    return self
def calc_mahalanobis(x, y, n_neighbors):
    """Find the nearest rows of ``x`` for each row of ``y`` under Mahalanobis distance.

    Parameters
    ----------
    x : array-like of shape (n_samples, n_features)
        Reference samples. They are indexed, and their feature covariance
        defines the metric.
    y : array-like of shape (n_queries, n_features)
        Query samples.
    n_neighbors : int
        Number of neighbors returned per query.

    Returns
    -------
    tuple
        The result of ``NearestNeighbors.kneighbors(y)``, i.e.
        ``(distances, indices)``.
    """
    from sklearn.neighbors import NearestNeighbors

    # np.cov treats each ROW as a variable by default. Samples are rows here
    # (that is how fit() reads x), so rowvar=False is needed to obtain the
    # (n_features, n_features) covariance the metric expects.
    # atleast_2d covers the single-feature case, where np.cov returns a scalar.
    covariance = np.atleast_2d(np.cov(x, rowvar=False))
    nn = NearestNeighbors(n_neighbors=n_neighbors,
                          algorithm='brute',
                          metric='mahalanobis',
                          metric_params={'V': covariance})
    return nn.fit(x).kneighbors(y)
def train_and_score(metric, training, testing, ks):
    """Score a brute-force k-NN recommender by ROC AUC for several values of k.

    For every ``k`` the neighbors of each test row are looked up in
    ``training``, and the mean of the neighbor rows is used as the
    predicted score per column. For each test row an equal number of
    interacted (non-zero) and non-interacted columns is sampled at
    random, and the AUC is computed over all sampled entries.

    Parameters
    ----------
    metric : str or callable
        Metric forwarded to ``NearestNeighbors``.
    training : 2-D matrix
        Rows that are indexed and averaged. Presumably a SciPy sparse
        matrix; confirm against the caller.
    testing : 2-D matrix
        Query rows; ``row.nonzero()`` must return a pair of index arrays.
    ks : iterable of int
        Neighbor counts to evaluate.

    Returns
    -------
    list of tuple
        ``(k, auc)`` for each evaluated ``k``, in input order.
    """
    print("Training and scoring")
    scores = []
    knn = NearestNeighbors(metric=metric, algorithm="brute")
    knn.fit(training)

    n_users, n_items = testing.shape[0], testing.shape[1]
    all_items = frozenset(range(n_items))

    for k in ks:
        print("Evaluating for {} neighbors".format(k))
        neighbor_indices = knn.kneighbors(testing,
                                          n_neighbors=k,
                                          return_distance=False)
        all_predicted_scores = []
        all_labels = []
        for user_id in range(n_users):
            user_row = testing[user_id, :]
            _, interaction_indices = user_row.nonzero()
            interacted = set(interaction_indices)
            non_interacted = all_items - interacted

            # Balanced sample: as many negatives as positives.
            n_samples = min(len(non_interacted), len(interacted))
            # random.sample needs a sequence; sets are rejected on Python 3.11+.
            sampled_interacted = random.sample(sorted(interacted), n_samples)
            sampled_non_interacted = random.sample(sorted(non_interacted), n_samples)
            indices = sampled_interacted + sampled_non_interacted
            labels = [1] * n_samples + [0] * n_samples

            neighbors = training[neighbor_indices[user_id, :], :]
            predicted_scores = neighbors.mean(axis=0)
            all_predicted_scores.extend(predicted_scores[0, idx] for idx in indices)
            all_labels.extend(labels)

        print("{} {}".format(len(all_labels), len(all_predicted_scores)))
        auc = roc_auc_score(all_labels, all_predicted_scores)
        print("k {} AUC {}".format(k, auc))
        scores.append((k, auc))
    return scores
def __init__(self):
    """Create a module-named logger and a default 15-neighbor model."""
    self.log = logging.getLogger(__name__)
    default_model = NearestNeighbors(n_neighbors=15)
    self.knnModel = default_model
def train(self, userFeatureTable, ratingsMat):
    """Fit a 10-neighbor ball-tree model on the user feature table.

    Parameters
    ----------
    userFeatureTable : table supporting ``.loc`` and ``.index``
        User features with an ``"age"`` column (presumably a pandas
        DataFrame). NOTE(review): the ``"age"`` column is divided by 10
        in place, so the caller's table is modified and calling ``train``
        twice on the same table rescales it twice.
    ratingsMat
        The rating matrix; stored unchanged.
    """
    # ad hoc fix, make sure feature's range is similar
    scaled_age = userFeatureTable.loc[:, "age"] / 10.
    userFeatureTable.loc[:, "age"] = scaled_age

    model = NearestNeighbors(n_neighbors=10, algorithm='ball_tree')
    self.knnModel = model.fit(userFeatureTable)

    # ratingMat is the rating matrix
    self.ratingsMat = ratingsMat
    self.userFeatureTable = userFeatureTable
    # the actual order seen by the knnmodel
    self.userIds = self.userFeatureTable.index
def find_knn(self, target_matrix, target_features):
    """Return the neighbors of ``target_features`` among the rows of ``target_matrix``.

    Parameters
    ----------
    target_matrix : object with a ``values`` attribute
        Data to index; ``target_matrix.values`` is passed to ``fit``.
    target_features : array-like
        Query samples passed to ``kneighbors``.

    Returns
    -------
    tuple
        ``(distances, indexes)`` as returned by ``kneighbors``.
    """
    searcher = NearestNeighbors(
        n_neighbors=self.__args.n_neighbors,
        algorithm=self.__args.alg,
    )
    searcher = searcher.fit(target_matrix.values)
    distances, indexes = searcher.kneighbors(target_features)
    return distances, indexes
def fit(atributos):
    """Build a Euclidean ``NearestNeighbors`` model fitted on ``atributos``.

    Parameters
    ----------
    atributos : array-like
        Samples handed to ``NearestNeighbors.fit``.

    Returns
    -------
    NearestNeighbors
        The model object after ``fit`` has been called on it.
    """
    model = NearestNeighbors(metric='euclidean')
    model.fit(atributos)
    return model
def index(self, metric='cosine'):
    """Build a nearest-neighbor index over ``self.W``.

    Parameters
    ----------
    metric : str
        Metric forwarded to ``NearestNeighbors``. ``'cosine'`` selects the
        ``'brute'`` algorithm; any other value selects ``'auto'``.

    Returns
    -------
    self
        The instance, with the fitted index stored in ``self._nn``.

    Raises
    ------
    WordVectorBoxException
        If the module-level ``SKLEARN`` flag is falsy.
    """
    if not SKLEARN:
        raise WordVectorBoxException("Needs sklearn to work")

    if metric == 'cosine':
        algorithm = 'brute'
    else:
        algorithm = 'auto'

    self._nn = NearestNeighbors(metric=metric, algorithm=algorithm)
    self._nn.fit(self.W)
    return self