def k_nearest_approx(self, vec, k):
    """Get the k nearest neighbors of a vector (in terms of cosine similarity).

    :param (np.array) vec: query vector
    :param (int) k: number of top neighbors to return

    :return (list[tuple[str, float]]): a list of (word, cosine similarity) pairs, in descending order
    """
    if not hasattr(self, 'lshf'):
        self.lshf = self._init_lsh_forest()
    # TODO(kelvin): make this inner product score, to be consistent with k_nearest
    distances, neighbors = self.lshf.kneighbors(vec, n_neighbors=k, return_distance=True)
    scores = np.subtract(1, distances)
    nbr_score_pairs = self.score_map(np.squeeze(neighbors), np.squeeze(scores))
    return sorted(nbr_score_pairs.items(), key=lambda x: x[1], reverse=True)
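
A minimal usage sketch of the approximate query above. The construction of the embeddings object and the row lookup are assumptions for illustration; only k_nearest_approx itself comes from the source.

# Hypothetical setup: `embeddings` wraps a (vocab_size, dim) word-vector matrix
# in self.array, and score_map() translates row indices back to words.
query = embeddings.array[42]                      # vector for some in-vocabulary word
top5 = embeddings.k_nearest_approx(query, k=5)    # [(word, cosine similarity), ...]
for word, sim in top5:
    print(word, sim)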
def k_nearest(self, vec, k):
    """Get the k nearest neighbors of a vector (in terms of highest inner products).

    :param (np.array) vec: query vector
    :param (int) k: number of top neighbors to return

    :return (list[tuple[str, float]]): a list of (word, score) pairs, in descending order
    """
    nbr_score_pairs = self.inner_products(vec)
    return sorted(nbr_score_pairs.items(), key=lambda x: x[1], reverse=True)[:k]

def _init_lsh_forest(self):
    """Construct an LSH forest for nearest neighbor search."""
    import sklearn.neighbors
    lshf = sklearn.neighbors.LSHForest()
    lshf.fit(self.array)
    return lshf
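
Note that sklearn.neighbors.LSHForest was deprecated in scikit-learn 0.19 and removed in 0.21, so _init_lsh_forest only runs on older releases. Below is a sketch of a replacement index for current scikit-learn, using exact brute-force cosine search so the `1 - distance` scoring in k_nearest_approx still holds; the method name _init_nn_index is an assumption, not from the source.

def _init_nn_index(self):
    """Construct an exact cosine-distance index (LSHForest no longer exists in sklearn >= 0.21)."""
    import sklearn.neighbors
    # kneighbors() on this index returns cosine distances, i.e. 1 - cosine similarity,
    # so scores can still be recovered with np.subtract(1, distances).
    nn = sklearn.neighbors.NearestNeighbors(metric='cosine', algorithm='brute')
    nn.fit(self.array)
    return nn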
import sklearn.ensemble
import sklearn.neighbors

def choose_classifier(classifier,   # which classifier to use
                      # parameters for the tree based classifiers
                      trees_n_estimators=None, trees_criterion=None,
                      trees_max_features=None, trees_max_depth=None,
                      # the ones for k-nearest-neighbors
                      knn_n_neighbors=None, knn_weights=None):
    # note that possibly inactive variables have to be optional,
    # as pysmac does not assign a value to inactive variables
    # during the minimization phase
    if classifier == 'random_forest':
        predictor = sklearn.ensemble.RandomForestClassifier(
            n_estimators=trees_n_estimators, criterion=trees_criterion,
            max_features=trees_max_features, max_depth=trees_max_depth)
    elif classifier == 'extra_trees':
        predictor = sklearn.ensemble.ExtraTreesClassifier(
            n_estimators=trees_n_estimators, criterion=trees_criterion,
            max_features=trees_max_features, max_depth=trees_max_depth)
    elif classifier == 'k_nearest_neighbors':
        predictor = sklearn.neighbors.KNeighborsClassifier(
            n_neighbors=knn_n_neighbors, weights=knn_weights)

    # X_train, Y_train, X_test and Y_test are assumed to be defined in the
    # enclosing module; SMAC minimizes, so return the negated test accuracy.
    predictor.fit(X_train, Y_train)
    return -predictor.score(X_test, Y_test)
# defining all the parameters with respective defaults.
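
The parameter definitions announced by the comment above were cut off in the source. Purely as an illustration of the shape such a setup could take, assuming pysmac's SMAC_optimizer API: every range, default, and clause below is a placeholder, not the original code.

import pysmac

# Placeholder search space for choose_classifier(); values are illustrative only.
parameter_definition = dict(
    classifier=('categorical', ['random_forest', 'extra_trees', 'k_nearest_neighbors'], 'random_forest'),
    trees_n_estimators=('integer', [1, 100], 10),
    trees_criterion=('categorical', ['gini', 'entropy'], 'gini'),
    trees_max_features=('integer', [1, 20], 10),
    trees_max_depth=('integer', [1, 10], 4),
    knn_n_neighbors=('integer', [1, 100], 10),
    knn_weights=('categorical', ['uniform', 'distance'], 'uniform'),
)

# Conditional clauses mark which variables are active for each classifier choice,
# which is why the inactive ones must default to None above.
conditionals = [
    'trees_n_estimators | classifier in {random_forest, extra_trees}',
    'trees_criterion | classifier in {random_forest, extra_trees}',
    'trees_max_features | classifier in {random_forest, extra_trees}',
    'trees_max_depth | classifier in {random_forest, extra_trees}',
    'knn_n_neighbors | classifier in {k_nearest_neighbors}',
    'knn_weights | classifier in {k_nearest_neighbors}',
]

opt = pysmac.SMAC_optimizer()
value, best_parameters = opt.minimize(choose_classifier, 100, parameter_definition,
                                      conditional_clauses=conditionals)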
Source file: advanced_supvervised_model_trainer.py (project: healthcareai-py, author: HealthCatalyst)
def knn(self,
        scoring_metric='roc_auc',
        hyperparameter_grid=None,
        randomized_search=True,
        number_iteration_samples=10):
    """
    A light wrapper for Sklearn's knn classifier that performs randomized search over an overridable default
    hyperparameter grid.

    Args:
        scoring_metric (str): Any sklearn scoring metric appropriate for classification
        hyperparameter_grid (dict): hyperparameters by name
        randomized_search (bool): True for randomized search (default)
        number_iteration_samples (int): Number of models to train during the randomized search for exploring the
            hyperparameter space. More may lead to a better model, but will take longer.

    Returns:
        TrainedSupervisedModel:
    """
    self.validate_classification('KNN')
    if hyperparameter_grid is None:
        neighbors = list(range(5, 26))
        hyperparameter_grid = {'n_neighbors': neighbors, 'weights': ['uniform', 'distance']}
        number_iteration_samples = 10

    print('KNN Grid: {}'.format(hyperparameter_grid))
    algorithm = get_algorithm(KNeighborsClassifier,
                              scoring_metric,
                              hyperparameter_grid,
                              randomized_search,
                              number_iteration_samples=number_iteration_samples)

    trained_supervised_model = self._create_trained_supervised_model(algorithm)
    return trained_supervised_model
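
A hedged usage sketch of the method above; constructing the trainer and loading its training data are not shown in the source, so `trainer` below is an assumption.

# Assuming `trainer` is an already-configured AdvancedSupervisedModelTrainer
# with a classification dataframe loaded:
trained_knn = trainer.knn(scoring_metric='roc_auc', randomized_search=True)

# Or override the default grid (n_neighbors 5-25, uniform/distance weights):
trained_knn = trainer.knn(
    hyperparameter_grid={'n_neighbors': [3, 7, 11], 'weights': ['distance']},
    number_iteration_samples=3)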
import time

from sklearn.neighbors import NearestNeighbors
import sklearn.metrics.pairwise


def findEps(ssearch):
    """
    Find a good epsilon value to use.
    """
    ###########################################################################
    # Calculate nearest neighbors
    ###########################################################################

    # Create a nearest neighbors model--we need 2 nearest neighbors since the
    # nearest neighbor to a point is going to be itself.
    nbrs_model = NearestNeighbors(n_neighbors=2, algorithm='brute', metric='cosine').fit(ssearch.index.index)

    t0 = time.time()

    # Find nearest neighbors.
    distances, indices = nbrs_model.kneighbors(ssearch.index.index)

    elapsed = time.time() - t0
    print('Took %.2f seconds' % elapsed)

    # Keep only the second-closest entry for each row, i.e. the nearest
    # neighbor other than the point itself.
    distances = [d[1] for d in distances]
    nn_indices = [ind[1] for ind in indices]  # index of that neighbor (not used below)

    ###########################################################################
    # Histogram the nearest neighbor distances.
    ###########################################################################

    import matplotlib.pyplot as plt
    counts, bins, patches = plt.hist(distances, bins=16)
    plt.title("Nearest neighbor distances")
    plt.xlabel("Distance")
    plt.ylabel("Frequency")

    print('\n%d bins:' % len(counts))
    countAcc = 0
    num_points = len(ssearch.index.index)

    for i in range(0, len(counts)):
        countAcc += counts[i]

        # Calculate the percentage of values which fall below the upper limit
        # of this bin.
        prcnt = float(countAcc) / float(num_points) * 100.0
        print('  %.2f%% < %.2f' % (prcnt, bins[i + 1]))
def findMinPts(ssearch, eps):
    """
    Find a good value for MinPts.
    """
    ###########################################################################
    # Count neighbors within threshold
    ###########################################################################

    print('Calculating pair-wise distances...')

    # Calculate pair-wise cosine distance for all documents.
    t0 = time.time()
    DD = sklearn.metrics.pairwise.cosine_distances(ssearch.index.index)
    elapsed = time.time() - t0
    print('  Took %.2f seconds' % elapsed)

    print('Counting number of neighbors...')
    t0 = time.time()

    # Create a list to hold the number of neighbors for each point.
    numNeighbors = [0] * len(DD)

    for i in range(0, len(DD)):
        dists = DD[i]

        count = 0
        for j in range(0, len(DD)):
            if (dists[j] < eps):
                count += 1

        numNeighbors[i] = count

    elapsed = time.time() - t0
    print('  Took %.2f seconds' % elapsed)

    ###########################################################################
    # Histogram the neighbor counts.
    ###########################################################################

    import matplotlib.pyplot as plt
    counts, bins, patches = plt.hist(numNeighbors, bins=60)
    plt.title("Number of neighbors")
    plt.xlabel("Number of neighbors")
    plt.ylabel("Frequency")

    print('\n%d bins:' % (len(bins) - 1))
    binsStr = ''
    for b in bins:
        binsStr += '  %0.2f' % b

    print(binsStr)
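
The eps and MinPts names suggest these two helpers are tuning parameters for DBSCAN-style density clustering of the document vectors. A sketch of how the values read off the two histograms might then be used; the concrete numbers and the clustering call below are assumptions, not from the source.

# Illustrative only: replace the eps / min_samples placeholders with whatever
# the histograms above suggest for the data at hand.
from sklearn.cluster import DBSCAN

db = DBSCAN(eps=0.5, min_samples=8, metric='cosine', algorithm='brute')
labels = db.fit_predict(ssearch.index.index)
print('Found %d clusters (label -1 marks noise points)' % len(set(labels) - {-1}))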