def computeNeighboursScores(self):
    all_instances = self.iteration.datasets.instances
    # Connectivity matrix
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('model', NearestNeighbors(n_neighbors=self.num_neighbours, n_jobs=-1))])
    pipeline.fit(all_instances.getFeatures())
    # Labels
    labels = np.array([generateLabel(x) for x in all_instances.getLabels()])
    # Compute neighbour scores
    scores = []
    all_neighbours = pipeline.named_steps['model'].kneighbors(return_distance=False)
    for i, label in enumerate(labels):
        if label != 0:
            continue
        neighbours = all_neighbours[i]
        score = sum(labels[neighbours] + 1) / (2.0 * self.num_neighbours)
        scores.append(score)
    return np.array(scores)
Example source code for the Python NearestNeighbors() class
def getpossibleedges(datapointwts, seeds):
    # datapointwts = densify(datapointwts);
    X = [(xx[0], xx[1]) for xx in datapointwts]
    S = [(xx[0], xx[1]) for xx in seeds]
    cluster = {}; p2cluster = []; gedges = {}; gedges1 = {}; nedges = {}
    nbrs = NearestNeighbors(n_neighbors=5, algorithm='ball_tree').fit(S)
    distances, indices = nbrs.kneighbors(X)
    for cd in range(len(seeds)):
        cluster[cd] = []
    for ii, ll in enumerate(indices):
        dd = [taxidist(seeds[xx], datapointwts[ii][:-1], theta) for xx in ll]
        cd = ll[dd.index(min(dd))]
        cluster[cd].append(datapointwts[ii])
        p2cluster.append(cd)
    for ii, xx in enumerate(datapointwts):
        if ii > 1:
            if datapointwts[ii-1][-1] < datapointwts[ii][-1] and datapointwts[ii-1][-1] > datapointwts[ii][-1] - 11:
                cd1 = p2cluster[ii-1]
                cd2 = p2cluster[ii]
                if not cd1 == cd2:
                    gedges1[(cd1, cd2)] = gedges1.get((cd1, cd2), 0) + 1
    return gedges1
def point2cluster(datapointwts, seeds, theta):
    cluster = {}; p2cluster = []; gedges = {}; gedges1 = {}; nedges = {}; std = {}; seeds1 = []; seedweight = []
    X = [(lonconst * xx[0], latconst * xx[1], theta / 180 * xx[2]) for xx in datapointwts]
    S = [(lonconst * xx[0], latconst * xx[1], theta / 180 * xx[2]) for xx in seeds]
    Xrot = [(lonconst * xx[0], latconst * xx[1], theta / 180 * (xx[2] % 360)) for xx in datapointwts]
    Srot = [(lonconst * xx[0], latconst * xx[1], theta / 180 * (xx[2] % 360)) for xx in seeds]
    for cd in range(len(seeds)):
        cluster[cd] = []
    nbrs = NearestNeighbors(n_neighbors=1, algorithm='ball_tree').fit(S)
    distances, indices = nbrs.kneighbors(X)
    nbrsrot = NearestNeighbors(n_neighbors=1, algorithm='ball_tree').fit(Srot)
    distancesrot, indicesrot = nbrsrot.kneighbors(Xrot)
    for ii, ll in enumerate(indices):
        # print(distances[ii], distancesrot[ii], ll, indices[ii], indicesrot[ii])
        cd = indicesrot[ii][0]
        if distances[ii][0] < distancesrot[ii][0]:
            cd = indices[ii][0]
        # print(cd, distances[ii], distancesrot[ii], ll, indices[ii], indicesrot[ii])
        cluster[cd].append(datapointwts[ii])
        p2cluster.append(cd)
    return cluster, p2cluster
def splitclustersparallel(datapointwts, seeds):
    X = [(xx[0], xx[1]) for xx in datapointwts]
    S = [(xx[0], xx[1]) for xx in seeds]
    cluster = {}; p2cluster = []; gedges = {}; gedges1 = {}; nedges = {}; std = {}; seeds1 = []; seedweight = []; roadwidth = []
    nbrs = NearestNeighbors(n_neighbors=20, algorithm='ball_tree').fit(S)
    distances, indices = nbrs.kneighbors(X)
    for cd in range(len(seeds)):
        cluster[cd] = []
        roadwidth.append(0)
    for ii, ll in enumerate(indices):
        dd = [taxidist(seeds[xx], datapointwts[ii][:-1], theta) for xx in ll]
        cd = ll[dd.index(min(dd))]
        cluster[cd].append(datapointwts[ii])
        p2cluster.append(cd)
    for cl in cluster:
        mang = seeds[cl][-1]
        scl = seeds[cl]
        if len(cluster[cl]) > 10:
            std[cl] = np.percentile([angledist(xx[2], mang) for xx in cluster[cl]], 90)
            roadwidth[cl] = 1 + 5 * np.std([geodist(scl, xx) * np.sin(anglebetweentwopoints(scl, xx) - scl[-1]) for xx in cluster[cl]])
            print(cl, scl, [(anglebetweentwopoints(scl, xx), scl[-1]) for xx in cluster[cl]])
def median_kneighbour_distance(X, k=5):
    """
    Calculate the median k-neighbour distance.

    Find the distance between a set of random datapoints and
    their kth nearest neighbours. This is a heuristic for setting the
    kernel length scale.
    """
    N_all = X.shape[0]
    k = min(k, N_all)
    N_subset = min(N_all, 2000)
    sample_idx_train = np.random.permutation(N_all)[:N_subset]
    nn = neighbors.NearestNeighbors(n_neighbors=k)
    nn.fit(X[sample_idx_train, :])
    d, idx = nn.kneighbors(X[sample_idx_train, :])
    return np.median(d[:, -1])
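A minimal usage sketch of the heuristic above, assuming the function is in scope with its own imports (numpy and sklearn.neighbors); the random matrix and the RBF gamma conversion are illustrative, not from the original project.

# Sketch only: use the median k-neighbour distance as an RBF length scale.
import numpy as np
from sklearn.metrics.pairwise import rbf_kernel

X = np.random.rand(500, 10)                       # hypothetical feature matrix
length_scale = median_kneighbour_distance(X, k=5)
gamma = 1.0 / (2.0 * length_scale ** 2)           # usual exp(-gamma * ||x - x'||^2) form
K = rbf_kernel(X, gamma=gamma)                    # kernel matrix with the heuristic scale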
job_description_feature_extraction.py (project: job-salary-prediction, author: soton-data-mining)
def cosine_knn(corpus_vector, queries_vector, k=10):
    """
    :param corpus_vector: vectorized document text
    :param queries_vector: vectorized query text
    :param k: number of neighbours
    :return: (distances, indices) of knn
    """
    # based on
    # http://scikit-learn.org/stable/modules/neighbors.html
    # http://scikit-learn.org/stable/modules/generated/sklearn.neighbors.NearestNeighbors.html
    # since we want to use cosine similarity to account for document length
    # we have to use brute-force search
    # parallelize over all cores with n_jobs=-1
    nbrs = NearestNeighbors(n_neighbors=k, algorithm='brute', metric='cosine', n_jobs=-1)
    nbrs.fit(corpus_vector)
    distances, indices = nbrs.kneighbors(queries_vector)
    return distances, indices
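A hedged end-to-end sketch for the function above; the TfidfVectorizer setup, the sample texts, and the variable names are assumptions for illustration, not taken from the original project. The corpus and the queries must be vectorized with the same fitted vectorizer.

# Illustrative only: assumes cosine_knn (above) and its sklearn imports are in scope.
from sklearn.feature_extraction.text import TfidfVectorizer

corpus_texts = ["data scientist role in london", "junior java developer", "nurse position"]
query_texts = ["senior python data engineer"]
vectorizer = TfidfVectorizer()
corpus_vec = vectorizer.fit_transform(corpus_texts)   # fit vocabulary on the corpus
query_vec = vectorizer.transform(query_texts)         # reuse the same vocabulary
distances, indices = cosine_knn(corpus_vec, query_vec, k=2)
print(indices[0])                                     # the 2 most similar corpus documents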
def index(self, metric='cosine'):
    """ Build a nearest neighbor retrieval index to perform similarity
    lookups and analogies

    Arguments:
        metric: string, or sklearn compatible callable

    Returns:
        self

    Raises:
        TokenContainerException if no pretrained vectors have been loaded
    """
    if self.W is not None:
        alg = 'brute' if (metric == 'cosine') else 'auto'
        from sklearn.neighbors import NearestNeighbors
        self._nn = NearestNeighbors(metric=metric, algorithm=alg)
        self._nn.fit(self.W)
    else:
        raise TokenContainerException(
            'cannot build similarity on vectorless structure'
        )
    return self
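A sketch of the retrieval pattern the method above enables, with a plain NumPy matrix standing in for self.W; the surrounding container class is not shown here, so treat the setup as an assumption.

# Sketch only: similarity lookup over a vector table, as built by index() above.
import numpy as np
from sklearn.neighbors import NearestNeighbors

W = np.random.rand(1000, 100)                        # stand-in for pretrained token vectors
nn = NearestNeighbors(metric='cosine', algorithm='brute').fit(W)
query = W[42].reshape(1, -1)                         # vector of some token
dist, ind = nn.kneighbors(query, n_neighbors=5)
print(ind[0])                                        # row indices of the 5 most similar vectors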
pixel_sampling.py (project: kaggle-yelp-restaurant-photo-classification, author: u1234x1234)
def extract_lab_histogram(mode, clusters):
    nn = neighbors.NearestNeighbors(n_neighbors=1)
    nn.fit(clusters)
    out_filename = mode + '_color'
    try:
        os.remove(out_filename)
    except OSError:
        pass
    out = open(out_filename, 'ab')
    cnt = 0
    with open(mode + '_list') as f:
        for line in f:
            line = line[:-1]
            image = cv2.imread(line)
            image = cv2.resize(image, (100, 100))
            image = cv2.cvtColor(image, cv2.COLOR_BGR2Lab)
            points = image.reshape((-1, 3))
            cn = nn.kneighbors(points)
            hist = np.histogram(cn[1], bins=50, range=(1, 50))[0]
            hist.tofile(out)
            cnt = cnt + 1
            if cnt % 1000 == 0:
                print(cnt)
def buildNNDataStructure(self):
    """Builds a nearest neighbor data structure. User doesn't need to
    call this unless the self.problems attribute was changed manually."""
    if len(self.problemFeatures) == 0 or len(self.featureNames) == 0:
        return
    try:
        from sklearn.neighbors import NearestNeighbors, BallTree
        from scipy.spatial import KDTree
        with self.lock:
            try:
                farray = self.problemFeatures.array
            except AttributeError:
                farray = np.array(self.problemFeatures.items)
            if self.metricTransform is not None:
                farray = np.dot(farray, self.metricTransform)
            #self.nn = NearestNeighbors(n_neighbors=1, algorithm="auto").fit(farray)
            self.nn = BallTree(farray)
            #self.nn = KDTree(farray)
            self.nnBuildSize = len(self.problemFeatures)
    except ImportError:
        print("IKDatabase: Warning, scikit-learn is not installed, queries will be much slower")
        with self.lock:
            self.nn = None
            self.nnBuildSize = 0
    return
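A hedged query sketch for the structure built above; the feature array here is a random stand-in, since the surrounding IKDatabase class is not shown.

# Sketch only: querying a BallTree like the one buildNNDataStructure creates.
import numpy as np
from sklearn.neighbors import BallTree

farray = np.random.rand(200, 6)            # stand-in for the problem feature array
tree = BallTree(farray)
feature_vec = farray[0].reshape(1, -1)     # a query point in the same feature space
dist, ind = tree.query(feature_vec, k=3)   # indices of the 3 closest stored problems
print(ind[0])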
def __init__(self, x, ys):
    import numpy as np
    from sklearn.neighbors import NearestNeighbors
    #print x, ys
    CI = np.array([x.checksum.get_signature_entropy(), x.checksum.get_entropy()])
    #print CI, x.get_info()
    #print
    for i in ys:
        CI = np.vstack((CI, [i.checksum.get_signature_entropy(), i.checksum.get_entropy()]))
    #idx = 0
    #for i in np.array(CI)[1:]:
    #    print idx+1, i, ys[idx].get_info()
    #    idx += 1
    self.neigh = NearestNeighbors(n_neighbors=2, radius=0.4)
    self.neigh.fit(np.array(CI))
    #print self.neigh.kneighbors( CI[0], len(CI) )
    self.CI = CI
    self.ys = ys
def build_search_tree(datadir, featurename='vgg16_block5_conv3-vlad-64.h5'):
    ndim = 64
    features_file = os.path.join(datadir, featurename)
    print(features_file)
    global keys, features
    keys, features = load_features(features_file)
    print('reducing features')
    pca = PCA(n_components=ndim)
    features = pca.fit_transform(features)
    print('ready')
    print('building search tree')
    nn = NearestNeighbors()
    global nneighs
    nneighs = nn.fit(features)
    print('ready')
def test_unsupervised_kneighbors(n_samples=20, n_features=5,
                                 n_query_pts=2, n_neighbors=5):
    # Test unsupervised neighbors methods
    X = rng.rand(n_samples, n_features)
    test = rng.rand(n_query_pts, n_features)
    for p in P:
        results_nodist = []
        results = []
        for algorithm in ALGORITHMS:
            neigh = neighbors.NearestNeighbors(n_neighbors=n_neighbors,
                                               algorithm=algorithm,
                                               p=p)
            neigh.fit(X)
            results_nodist.append(neigh.kneighbors(test,
                                                   return_distance=False))
            results.append(neigh.kneighbors(test, return_distance=True))
        for i in range(len(results) - 1):
            assert_array_almost_equal(results_nodist[i], results[i][1])
            assert_array_almost_equal(results[i][0], results[i + 1][0])
            assert_array_almost_equal(results[i][1], results[i + 1][1])
def test_unsupervised_inputs():
    # test the types of valid input into NearestNeighbors
    X = rng.random_sample((10, 3))
    nbrs_fid = neighbors.NearestNeighbors(n_neighbors=1)
    nbrs_fid.fit(X)
    dist1, ind1 = nbrs_fid.kneighbors(X)
    nbrs = neighbors.NearestNeighbors(n_neighbors=1)
    for input in (nbrs_fid, neighbors.BallTree(X), neighbors.KDTree(X)):
        nbrs.fit(input)
        dist2, ind2 = nbrs.kneighbors(X)
        assert_array_almost_equal(dist1, dist2)
        assert_array_almost_equal(ind1, ind2)
def test_radius_neighbors_boundary_handling():
    """Test whether points lying on boundary are handled consistently

    Also ensures that even with only one query point, an object array
    is returned rather than a 2d array.
    """
    X = np.array([[1.5], [3.0], [3.01]])
    radius = 3.0
    for algorithm in ALGORITHMS:
        nbrs = neighbors.NearestNeighbors(radius=radius,
                                          algorithm=algorithm).fit(X)
        results = nbrs.radius_neighbors([[0.0]], return_distance=False)
        assert_equal(results.shape, (1,))
        assert_equal(results.dtype, object)
        assert_array_equal(results[0], [0, 1])
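A standalone sketch of the same behaviour, outside the test harness: radius_neighbors returns an object array whose entries are variable-length index arrays, one per query point, and points lying exactly on the radius are included.

# Minimal sketch with the same toy data as the test above.
import numpy as np
from sklearn.neighbors import NearestNeighbors

X = np.array([[1.5], [3.0], [3.01]])
nbrs = NearestNeighbors(radius=3.0).fit(X)
res = nbrs.radius_neighbors([[0.0]], return_distance=False)
print(res.dtype)   # object: each entry is an index array of varying length
print(res[0])      # [0 1] -- 1.5 and 3.0 lie within (or on) radius 3.0 of the query 0.0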
def test_callable_metric():
    def custom_metric(x1, x2):
        return np.sqrt(np.sum(x1 ** 2 + x2 ** 2))

    X = np.random.RandomState(42).rand(20, 2)
    nbrs1 = neighbors.NearestNeighbors(n_neighbors=3, algorithm='auto',
                                       metric=custom_metric)
    nbrs2 = neighbors.NearestNeighbors(n_neighbors=3, algorithm='brute',
                                       metric=custom_metric)
    nbrs1.fit(X)
    nbrs2.fit(X)
    dist1, ind1 = nbrs1.kneighbors(X)
    dist2, ind2 = nbrs2.kneighbors(X)
    assert_array_almost_equal(dist1, dist2)
def __init__(self, is_multiclass=True, K_CLOSEST_NEIGHBORS=2):
    # Constants
    self.K_RECO = 5.0  # Num of neighbors for weight learning
    self.K_CLOSEST_NEIGHBORS = K_CLOSEST_NEIGHBORS
    self.weights = None
    self.kNN_finder = NearestNeighbors(
        n_neighbors=K_CLOSEST_NEIGHBORS,
        metric=self._calculate_dist,
        metric_params=None,  # Dict otherwise
        n_jobs=-1
    )
def assignClasses(self):
    clusterer = kdtree.KDTreeClustering(bucket_size=self.bucket_size)
    train_locs = self.df_train[['lat', 'lon']].values
    clusterer.fit(train_locs)
    clusters = clusterer.get_clusters()
    cluster_points = dd(list)
    for i, cluster in enumerate(clusters):
        cluster_points[cluster].append(train_locs[i])
    logging.info('#labels: %d' % len(cluster_points))
    self.cluster_median = OrderedDict()
    for cluster in sorted(cluster_points):
        points = cluster_points[cluster]
        median_lat = np.median([p[0] for p in points])
        median_lon = np.median([p[1] for p in points])
        self.cluster_median[cluster] = (median_lat, median_lon)
    dev_locs = self.df_dev[['lat', 'lon']].values
    test_locs = self.df_test[['lat', 'lon']].values
    nnbr = NearestNeighbors(n_neighbors=1, algorithm='brute', leaf_size=1, metric=haversine, n_jobs=4)
    nnbr.fit(np.array(list(self.cluster_median.values())))
    self.dev_classes = nnbr.kneighbors(dev_locs, n_neighbors=1, return_distance=False)[:, 0]
    self.test_classes = nnbr.kneighbors(test_locs, n_neighbors=1, return_distance=False)[:, 0]
    self.train_classes = clusters
    if self.one_hot_labels:
        num_labels = np.max(self.train_classes) + 1
        y_train = np.zeros((len(self.train_classes), num_labels), dtype=np.float32)
        y_train[np.arange(len(self.train_classes)), self.train_classes] = 1
        y_dev = np.zeros((len(self.dev_classes), num_labels), dtype=np.float32)
        y_dev[np.arange(len(self.dev_classes)), self.dev_classes] = 1
        y_test = np.zeros((len(self.test_classes), num_labels), dtype=np.float32)
        y_test[np.arange(len(self.test_classes)), self.test_classes] = 1
        self.train_classes = y_train
        self.dev_classes = y_dev
        self.test_classes = y_test
def network_layout(matrix, k=30):
    nbrs = NearestNeighbors(n_neighbors=k, algorithm='brute', metric='cosine').fit(matrix)
    G = networkx.from_scipy_sparse_matrix(nbrs.kneighbors_graph(matrix))
    node_labels = label_propagation(G, verbose=True)
    communities_labelprop = np.array([node_labels[i] for i in range(matrix.shape[0])])
    pos = graphviz_layout(G, prog="sfdp")
    coords = np.array([pos[i] for i in range(len(pos))])
    print(coords.shape)
    return coords, communities_labelprop
def __init__(self,
             MMDLayer,
             MMDTargetTrain,
             MMDTargetValidation_split=0.1,
             MMDTargetSampleSize=1000,
             n_neighbors=25,
             scales=None,
             weights=None):
    if scales is None:
        print("setting scales using KNN")
        med = np.zeros(20)
        for ii in range(1, 20):
            sample = MMDTargetTrain[np.random.randint(MMDTargetTrain.shape[0], size=MMDTargetSampleSize), :]
            nbrs = NearestNeighbors(n_neighbors=n_neighbors).fit(sample)
            distances, dummy = nbrs.kneighbors(sample)
            # the nearest neighbour is the point itself, so we exclude it
            med[ii] = np.median(distances[:, 1:n_neighbors])
        med = np.median(med)
        scales = [med / 2, med, med * 2]  # CyTOF
        print(scales)
    scales = K.variable(value=np.asarray(scales))
    if weights is None:
        print("setting all scale weights to 1")
        weights = K.eval(K.shape(scales)[0])
    weights = K.variable(value=np.asarray(weights))
    self.MMDLayer = MMDLayer
    MMDTargetTrain, MMDTargetValidation = train_test_split(MMDTargetTrain, test_size=MMDTargetValidation_split, random_state=42)
    self.MMDTargetTrain = K.variable(value=MMDTargetTrain)
    self.MMDTargetTrainSize = K.eval(K.shape(self.MMDTargetTrain)[0])
    self.MMDTargetValidation = K.variable(value=MMDTargetValidation)
    self.MMDTargetValidationSize = K.eval(K.shape(self.MMDTargetValidation)[0])
    self.MMDTargetSampleSize = MMDTargetSampleSize
    self.kernel = self.RaphyKernel
    self.scales = scales
    self.weights = weights
# calculate the raphy kernel applied to all entries in a pairwise distance matrix
def get_chunk_nns(self, X, q_centroids, question_details, chunk):
    nbrs = NearestNeighbors(algorithm='brute', metric='cosine', n_neighbors=1000).fit(X)
    dist, nns = nbrs.kneighbors(q_centroids, return_distance=True)
    q_array = []
    for q_point in range(nns.shape[0]):
        doc_nns = []
        for n_point in range(nns.shape[1]):
            doc_nns.append(self.idmap[chunk[0] + nns[q_point, n_point]])
        q = Question(question_details[q_point][0], question_details[q_point][1], doc_nns, list(dist[q_point, :]))
        q_array.append(q)
    return q_array
# Dataset indices are split into N chunks. The nearest top-(N*k) neighbors are extracted from each chunk, and then
# the final top-k neighbors are extracted from those (a merge sketch follows below).
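A hedged sketch of that merge step; the helper name, the (distance, doc_id) tuple layout, and the sample values are assumptions for illustration, not taken from the original class.

# Illustrative only: merging per-chunk candidate lists into a final top-k.
import heapq

def merge_chunk_candidates(per_chunk_results, k):
    # per_chunk_results: iterable of lists of (distance, doc_id) tuples,
    # e.g. one list per chunk produced by a routine like get_chunk_nns above
    all_candidates = [cand for chunk in per_chunk_results for cand in chunk]
    return heapq.nsmallest(k, all_candidates, key=lambda c: c[0])

chunk_a = [(0.12, 'doc3'), (0.40, 'doc7')]
chunk_b = [(0.05, 'doc9'), (0.33, 'doc1')]
print(merge_chunk_candidates([chunk_a, chunk_b], k=2))  # [(0.05, 'doc9'), (0.12, 'doc3')]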
def getseeds(datapoint, radius, theta):
    chosen = []
    seeds = []
    # random.shuffle(datapoint)
    periodsampl = 500000
    for p in datapoint:
        chosen.append(p)
    for j, p in enumerate(chosen):
        ok = -1
        if j < periodsampl:
            for q in seeds:
                if taxidist(p, q, theta) < radius:
                    ok = 1
                    break
            if ok < 1:
                seeds.append(p)
        else:
            if j % periodsampl == 0:  # and (is_power2(int(j/1000))):
                # print(j, time.time()-start)
                S = [(lonconst * xx[0], latconst * xx[1], theta / 180 * (xx[2] + 45)) for xx in seeds]
                nbrs = NearestNeighbors(n_neighbors=1, algorithm='ball_tree').fit(S)
                X = [(lonconst * xx[0], latconst * xx[1], theta / 180 * (xx[2] + 45)) for xx in chosen[j:min(len(chosen), j + periodsampl)]]
                distances, indices = nbrs.kneighbors(X)
            if distances[j % periodsampl][0] > radius:
                seeds.append(p)
    print('seeds: ', len(seeds))
    return seeds
def __init__(self,
             analyzer=None, matching=None,
             name=None,
             verbose=0,
             n_epochs=10,
             alpha=0.25,
             min_alpha=0.05,
             n_jobs=4,
             **kwargs):
    # self.model = model
    self.alpha = alpha
    self.min_alpha = min_alpha
    self.verbose = verbose
    self.name = "paragraph-vectors" if name is None else name
    if matching is True:
        self._matching = Matching()
    elif matching is False or matching is None:
        self._matching = None
    else:
        self._matching = Matching(**dict(matching))
    self.analyzer = analyzer
    self.model = Doc2Vec(alpha=alpha,
                         min_alpha=alpha,
                         size=500,
                         window=8,
                         min_count=1,
                         sample=1e-5,
                         workers=n_jobs,
                         negative=20,
                         dm=0, dbow_words=1,  # words only with dm!=0?
                         dm_mean=0,  # unused when in concat mode
                         dm_concat=1,
                         dm_tag_count=1
                         )
    self.n_epochs = n_epochs
    self._neighbors = NearestNeighbors(**kwargs)
def query(self, query, k=None):
    model, matching = self.model, self._matching
    nn, analyze = self._neighbors, self.analyzer
    verbose = self.verbose
    if k is None:
        k = len(self._centroids)
    if matching:
        matched = matching.predict(query)
        print("Matched:", matched)
        tags = self._y[matched]
        dvs = np.asarray([model.docvecs[tag] for tag in tags])
        n_ret = min(k, len(matched))
        if n_ret == 0:
            return []
        nn.fit(dvs)
    else:
        tags = self._y
        n_ret = k
        # NearestNeighbors are already fit
    if verbose > 0:
        print(len(tags), "documents matched.")
    q = analyze(query)
    qv = model.infer_vector(q).reshape(1, -1)
    ind = nn.kneighbors(qv, n_neighbors=n_ret, return_distance=False)[0]
    y = tags[ind]
    return y
def query(self, query, k=None, matched_indices=None):
    # matching step
    matching_ind = self._matching(query)
    # print(matching_ind, file=sys.stderr)
    Xm, matched_doc_ids = self._X[matching_ind], self._y[matching_ind]
    # matching_docs, matching_doc_ids = self._matching(query)
    # calculate elements to retrieve
    n_ret = len(matching_ind)
    if n_ret == 0:
        return []
    if self.verbose > 0:
        print("Found {} matches:".format(n_ret))
    # n_ret = min(n_ret, k) if k > 0 else n_ret
    # model dependent transformation
    xq = self._cv.transform([query])
    q = self.tfidf.transform(xq)
    # Xm = self.vectorizer.transform(matching_docs)
    # model dependent nearest neighbor search or scoring or whatever
    nn = NearestNeighbors(metric='cosine', algorithm='brute').fit(Xm)
    # abuse kneighbors in this case:
    # as q only contains one element, we only need its results
    if k is not None and k < n_ret:
        n_ret = k
    ind = nn.kneighbors(q,  # q contains a single element
                        n_neighbors=n_ret,  # limit to k neighbors
                        return_distance=False)[0]  # so we only need one result
    # don't forget to convert the indices to document ids of matching
    labels = matched_doc_ids[ind]
    return labels
def __init__(self,
             embedding,
             analyzer,
             name="WCD",
             n_jobs=1,
             normalize=True,
             verbose=0,
             oov=None,
             matching=True,
             **kwargs):
    self.name = name
    self._embedding = embedding
    self._normalize = normalize
    self._oov = oov
    self.verbose = verbose
    self.n_jobs = n_jobs
    self._neighbors = NearestNeighbors(**kwargs)
    self._analyzer = analyzer
    if matching is True:
        self._matching = Matching()
    elif matching is False or matching is None:
        self._matching = None
    else:
        self._matching = Matching(**dict(matching))
def __init__(self, embedding, analyzer='word', matching=None, name="FWCD",
             n_jobs=1, use_idf=True):
    """TODO: to be defined1. """
    self.name = name
    self.matching = Matching(**dict(matching)) if matching else None
    self.vect = EmbeddedVectorizer(embedding, analyzer=analyzer, norm='l2',
                                   use_idf=use_idf)
    self.nn = NearestNeighbors(n_jobs=n_jobs, metric='cosine',
                               algorithm='brute')
def test_nearest_centroid_ranker():
    # in the case where there is a single point per centroid,
    # nearest centroid should reduce to nearest neighbor
    from sklearn.neighbors import NearestNeighbors
    np.random.seed(0)

    n_samples = 100
    n_features = 120
    X = np.random.rand(n_samples, n_features)
    normalize(X, copy=False)
    index = np.arange(n_samples, dtype='int')
    y = np.arange(n_samples, dtype='int')
    index_train, index_test, y_train, y_test = train_test_split(index, y)
    X_train = X[index_train]
    X_test = X[index_test]

    nn = NearestNeighbors(n_neighbors=1, algorithm='brute')
    nn.fit(X_train)
    dist_ref, idx_ref = nn.kneighbors(X_test)

    nc = NearestCentroidRanker()
    nc.fit(X_train, y_train)
    dist_pred = nc.decision_function(X_test)
    y_pred = nc.predict(X_test)

    # ensure that we have the same number of unique output points
    # (even if absolute labels are not preserved)
    assert np.unique(idx_ref[:, 0]).shape == np.unique(y_pred).shape
    assert_allclose(dist_pred, dist_ref[:, 0])
def fit(self, X, y):
    """Fit the model using X as training data

    Parameters
    ----------
    X : {array-like, sparse matrix, BallTree, KDTree}
        Training data, shape [n_samples, n_features]
    """
    X = check_array(X, accept_sparse='csr')
    y = np.asarray(y, dtype='int')
    y_unique = np.unique(y)
    index = np.arange(len(y), dtype='int')
    if len(y_unique) == 0:
        raise ValueError('The training set must have at least '
                         'one document category!')
    # define nearest neighbors search objects for each category
    self._mod = [NearestNeighbors(n_neighbors=1,
                                  leaf_size=self.leaf_size,
                                  algorithm=self.algorithm,
                                  n_jobs=self.n_jobs,
                                  # cosine metric (sklearn's default would be euclidean)
                                  metric='cosine',
                                  ) for el in range(len(y_unique))]
    index_mapping = []
    for imod, y_val in enumerate(y_unique):
        mask = (y == y_val)
        index_mapping.append(index[mask])
        self._mod[imod].fit(X[mask])
    self.index_mapping = index_mapping