def cluster_points(coordinates, eps, min_samples, n_jobs=1):
"""Given coordinates, function returns the number of clusters in the
set of coordinates and a list of integer labels corresponding to
the input coordinate list
Arguments:
coordinates: a sequence of (lat, lon) tuples
eps: the cluster size in radial degrees
min_samples: the size of the smallest cluster
n_jobs: number of CPUs to use to compute the clusters
Returns:
n_clusters: number of clusters
labels: the labels of the clusters
"""
db = DBSCAN(eps=eps,
min_samples=min_samples,
n_jobs=n_jobs).fit(coordinates)
return db
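# Example usage (a minimal sketch, not part of the original module; assumes the
# function above is in scope together with `from sklearn.cluster import DBSCAN`,
# and the coordinates below are made-up (lat, lon) pairs):
coords = [(52.51, 13.40), (52.52, 13.41), (52.50, 13.39), (40.71, -74.00)]
n_found, point_labels = cluster_points(coords, eps=0.05, min_samples=2)
print(n_found)   # one cluster around the first three points; the last point is noise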
def dbFun(_x,_original_vals, f):
db = DBSCAN(eps=0.3, min_samples=20).fit(_x)
core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
core_samples_mask[db.core_sample_indices_] = True
labels = db.labels_
#print(labels)
    n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
#gettingCharacteristics(_x, core_samples_mask, labels, n_clusters_,
#_original_vals)
print("Wait plotting clusters.....")
plotCluster(_x, labels, core_samples_mask, n_clusters_, f)
return
##############################################################################################
# Plotting the cluster after the result of DBSCAN
def dbscan(fig):
global X_iris, geo
ax = fig.add_subplot(geo + 5, projection='3d', title='dbscan')
dbscan = cluster.DBSCAN()
dbscan.fit(X_iris)
res = dbscan.labels_
core = dbscan.core_sample_indices_
print repr(core)
size = [5 if i not in core else 40 for i in range(len(X_iris))]
print repr(size)
for n, i in enumerate(X_iris):
ax.scatter(*i[: 3], s=size[n], c='bgrcmyk'[res[n] % 7],
alpha=0.8, marker='o')
ax.set_xlabel('X Label')
ax.set_ylabel('Y Label')
ax.set_zlabel('Z Label')
return res
def cluster_dbscan(self, image_cols):
print 'DBSCAN'
# TODO handle outliers/noise
# Look at different metrics?
db = DBSCAN(eps=self.params.epsilon, min_samples=10, metric='euclidean')
db.fit(image_cols)
# from IPython import embed; embed(); import ipdb; ipdb.set_trace()
self.number_of_clusters = np.max(db.labels_) + 1
# Ignore -1 cluster, it's noise
print 'number of clusters', self.number_of_clusters
# Clusters
centers = np.zeros((self.number_of_clusters, 3))
for i in range(0, self.number_of_clusters):
cluster_points = image_cols[db.labels_ == i]
cluster_mean = np.mean(cluster_points, axis=0)
centers[i, :] = cluster_mean
return centers
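# A standalone sketch of the same idea outside the class (an assumption, not
# the project's API): reshape an RGB image into an (N, 3) column array and take
# the mean colour of each DBSCAN cluster; eps/min_samples are illustrative.
import numpy as np
from sklearn.cluster import DBSCAN

def dbscan_colour_centres(img, eps=25.5, min_samples=10):
    image_cols = img.reshape(-1, 3).astype(float)      # one row per pixel
    db = DBSCAN(eps=eps, min_samples=min_samples, metric='euclidean').fit(image_cols)
    n_clusters = db.labels_.max() + 1                   # label -1 is noise
    return np.array([image_cols[db.labels_ == i].mean(axis=0)
                     for i in range(n_clusters)])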
def train(self, data, sample_weight=None):
"""
:type data: pyspark.RDD
:param data: (key, k-dim vector like)
Train the model using a (key, vector) RDD
"""
parts = KDPartitioner(data, self.max_partitions)
self.data = data
self.bounding_boxes = parts.bounding_boxes
self.expanded_boxes = {}
self._create_neighborhoods()
# repartition data set on the partition label
self.data = self.data.map(lambda ((k, p), v): (p, (k, v))) \
.partitionBy(len(parts.partitions)) \
.map(lambda (p, (k, v)): ((k, p), v))
# create parameters for sklearn DBSCAN
params = self.dbscan_params or {
'eps': self.eps,
'min_samples': self.min_samples,
'metric': self.metric}
# perform dbscan on each part
self.data = self.data.mapPartitions(
lambda iterable: dbscan_partition(iterable, params, sample_weight))
self.data.cache()
self._remap_cluster_ids()
def __init__(self, ompath, density = 4.0):
"""
:param ompath: path of the mesh template
author: weiwei
date: 20170711
"""
cadtemp = CADTemp.CADTemp(ompath = ompath, density = density)
self.objnp = pg.packpandanp(cadtemp.objtrimesh.vertices,
cadtemp.objtrimesh.face_normals,
cadtemp.objtrimesh.faces,
name='')
self.temppnt = cadtemp.pcdtemp
self.kinect = PyKinectRuntime.PyKinectRuntime(PyKinectV2.FrameSourceTypes_Depth)
self.dbscan = DBSCAN(eps=50, min_samples=100, n_jobs=-1)
self.randsac = linear_model.RANSACRegressor(linear_model.LinearRegression(), residual_threshold = 15)
self.tablepnt = []
self.objectpnt = []
def process(self, obj_data):
'''
Run DBScan on data. Stores result in data wrapper
@param obj_data: Data wrapper to be processed
'''
epsilon = self.ap_paramList[0]()
min_points = self.ap_paramList[1]()
results = dict()
for label, data in obj_data.getIterator():
results[label] = DBSCAN(eps=epsilon, min_samples = min_points).fit_predict(data.loc[:,self.column_names])
obj_data.addResult(self.str_description, results)
def test_clusterer_enforcement(self):
"""
Assert that only clustering estimators can be passed to cluster viz
"""
nomodels = [
SVC, SVR, Ridge, RidgeCV, LinearRegression, RandomForestClassifier
]
for nomodel in nomodels:
with self.assertRaises(YellowbrickTypeError):
visualizer = ClusteringScoreVisualizer(nomodel())
models = [
KMeans, MiniBatchKMeans, AffinityPropagation, MeanShift, DBSCAN, Birch
]
for model in models:
try:
visualizer = ClusteringScoreVisualizer(model())
except YellowbrickTypeError:
self.fail("could not pass clustering estimator to visualizer")
def fit(self, model, n_clusters=5):
"""
Fits clusters to the feature set using a Kmeans model.
Input: n_clusters (int) number of clusters to use during clustering
Output: None
"""
self.n_clusters = n_clusters
scaler = StandardScaler()
self.features = scaler.fit_transform(self.features)
if model == 'kmeans':
self.model = KMeans(self.n_clusters)
elif model == 'DBSCAN':
self.model = DBSCAN(eps=0.3, min_samples = 3)
self.cluster_fit = self.model.fit(self.features)
print ('-- Running clustering on {} piece collection --'
.format(self.n_artworks))
def newDBSCANModel(vectorFile, outputFile):
model = Doc2Vec.load("Models\\" + vectorFile)
vecs = []
for doc in range(0, len(model.docvecs)):
doc_vec = model.docvecs[doc]
# print doc_vec
vecs.append(doc_vec.reshape((1, 300)))
doc_vecs = np.array(vecs, dtype='float') # TSNE expects float type values
# print doc_vecs
docs = []
for i in doc_vecs:
docs.append(i[0])
db = DBSCAN(eps=0.03, algorithm="brute", metric='cosine').fit(docs)
joblib.dump(db, outputFile)
core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
core_samples_mask[db.core_sample_indices_] = True
labels = db.labels_
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
clusters = db.labels_.tolist()
cluster_info = {'labels': model.docvecs.offset2doctag,
"index, wordcount and repeated words": [model.docvecs.doctags[x] for x in
model.docvecs.offset2doctag],
'clusters': clusters}
sentenceDF = pd.DataFrame(cluster_info, index=[clusters],
columns=['labels', "index, wordcount and repeated words", 'clusters'])
print(sentenceDF)
sentenceDF.to_csv("DBSCAN.csv")
print('Estimated number of clusters: %d' % n_clusters_)
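# The cosine-metric pattern above can be exercised without a trained Doc2Vec
# model; a minimal sketch with random stand-in vectors (the 300-dim shape and
# eps value simply mirror the function above):
import numpy as np
from sklearn.cluster import DBSCAN

rng = np.random.default_rng(0)
doc_vecs = rng.normal(size=(100, 300))                  # stand-in for model.docvecs
db = DBSCAN(eps=0.03, algorithm="brute", metric="cosine").fit(doc_vecs)
n_clusters = len(set(db.labels_)) - (1 if -1 in db.labels_ else 0)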
def dbscan_partition(iterable, params):
"""
:type iterable: iter
:param iterable: iterator yielding ((key, partition), vector)
:type params: dict
:param params: dictionary containing sklearn DBSCAN parameters
:rtype: iter
:return: ((key, cluster_id), v)
Performs a DBSCAN on a given partition of the data
"""
# read iterable into local memory
data = list(iterable)
(key, part), vector = data[0]
x = np.array([v for (_, __), v in data])
y = np.array([k for (k, _), __ in data])
# perform DBSCAN
model = skc.DBSCAN(**params)
c = model.fit_predict(x)
cores = set(model.core_sample_indices_)
# yield (key, cluster_id), non-core samples labeled with *
for i in xrange(len(c)):
flag = '' if i in cores else '*'
yield (y[i], '%i:%i%s' % (part, c[i], flag))
def DBSCAN_cluster(psi_matrix, eventid_lst, dist, minpts, metric):
# Setting logging preferences
logger = logging.getLogger(__name__)
# The metric is "cosine" works only with the algorithm "brute"
if metric == "cosine":
alg = 'brute'
else:
alg = 'auto'
try:
db = DBSCAN(eps=dist, min_samples=minpts, metric=metric, algorithm=alg).fit(psi_matrix)
labels = db.labels_
except:
logger.error("Unknown error: {}".format(sys.exc_info()))
sys.exit(1)
eventid_labels_dict = {k: v for k, v in zip(eventid_lst, labels)}
return eventid_labels_dict, labels
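# Example call (a sketch; the PSI matrix and event ids below are synthetic
# stand-ins for the real inputs):
import numpy as np
psi = np.random.rand(50, 10)                            # 50 events x 10 samples
event_ids = ["event_%d" % i for i in range(50)]
id_to_label, cluster_labels = DBSCAN_cluster(psi, event_ids, dist=0.3,
                                             minpts=5, metric="euclidean")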
def cluster_analysis(dpsi, psivec, sig_threshold, dpsi_threshold, eps, minpts, metric, indexes, clustering,
separation, output):
path = os.path.dirname(os.path.realpath(dpsi))
os.chdir(path)
psi_matrix, eventid_lst = process_cluster_input(dpsi, psivec, sig_threshold, dpsi_threshold, indexes)
if(clustering=="DBSCAN"):
eventid_labels_dict, labels = DBSCAN_cluster(psi_matrix, eventid_lst, eps, minpts, metric)
        #eventid_labels_dict holds the clustering label for each event
write_averaged_cluster_output(psi_matrix, eventid_lst, eventid_labels_dict, output)
calculate_cluster_scores(psi_matrix, labels, output)
else:
#OPTICS
points_list = create_points_list(psi_matrix, eventid_lst) #Transform the points on psi_matrix to Points from optics.py
optics = Optics(points_list, eps, minpts) # Maximum radius to be considered, cluster size >= 2 points
optics.run() # run the algorithm
clusters = optics.cluster(separation) # minimum threshold for clustering (upper limit to separate the clusters)
eventid_labels_dict, labels = generate_labels(clusters, eventid_lst)
write_averaged_cluster_output(psi_matrix, eventid_lst, eventid_labels_dict, output)
calculate_cluster_scores(psi_matrix, labels, output)
def makeDBScan(X=None, k=-1):
return cluster.DBSCAN(eps=.2)
def sts_matrix_generator(ind, slope_matrix):
"""Work-horse function. Computes the short time-series (STS) distance for
an index, ind of the slope matrix.
Parameters
----------
ind: int
The index of the slope matrix that is being computed.
slope_matrix: np.matrix
The slope matrix.
Returns
-------
(ind, dists): ind is the index and dists is a np.matrix containing the
STS distances
"""
mx = slope_matrix[ind, :]
mv = slope_matrix[ind:, :]
mx_rep = np.vstack((mx,)*mv.shape[0])
diff = mx_rep - mv
diff = np.square(diff)
sts_squared = diff.sum(axis=1)
dists = np.sqrt(sts_squared)
return (ind, dists)
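# A minimal sketch (an addition, not part of the original module) showing how
# the per-index results can be assembled into the full symmetric STS distance
# matrix; slope_matrix is assumed to be an (n_genes x n_intervals) array:
import numpy as np

def build_sts_matrix(slope_matrix):
    n = slope_matrix.shape[0]
    sts = np.zeros((n, n))
    for ind in range(n):
        _, dists = sts_matrix_generator(ind, slope_matrix)
        dists = np.asarray(dists).ravel()               # distances to rows ind..n-1
        sts[ind, ind:] = dists
        sts[ind:, ind] = dists                          # mirror into the lower triangle
    return sts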
# DBSCAN from scikit learn
def cluster_dbscan(matrix, distance_measure="sts", eps=1):
"""Clusters the distance matrix for a given epsilon value, if distance
    measure is sts. Other distance measures are: ['cityblock', 'cosine',
    'euclidean', 'l1', 'l2', 'manhattan', 'braycurtis', 'canberra',
    'chebyshev', 'correlation', 'dice', 'hamming', 'jaccard', 'kulsinski',
    'mahalanobis', 'matching', 'minkowski', 'rogerstanimoto', 'russellrao',
    'seuclidean', 'sokalmichener', 'sokalsneath', 'sqeuclidean', 'yule']
Parameters
----------
matrix: np.matrix
The input matrix. If distance measure is sts, this should be the sts
distance matrix. If other distance, this should be the time-series
matrix of size ngenes x nsamples.
distance_measure: str
The distance measure, default is sts, short time-series distance.
Any distance measure available in scikit-learn is available here.
Note: multiple time-series is NOT supported for distances other than
"sts".
Returns
-------
cluster_labels: list of int
A list of size ngenes that defines cluster membership.
"""
if (distance_measure == "sts"):
dbs = DBSCAN(eps=eps, metric='precomputed', min_samples=2)
else:
dbs = DBSCAN(eps=eps, metric=distance_measure, min_samples=2)
cluster_labels = dbs.fit_predict(matrix)
return cluster_labels
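# Example (a sketch): cluster genes by short time-series distance, reusing the
# build_sts_matrix helper sketched above (an assumption, not part of the
# original module):
import numpy as np
rng = np.random.default_rng(1)
timeseries = rng.random((20, 8))                        # 20 genes, 8 time points
slopes = np.diff(timeseries, axis=1)                    # per-interval slopes
sts_dists = build_sts_matrix(slopes)
gene_labels = cluster_dbscan(sts_dists, distance_measure="sts", eps=1)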
def demo_printing_picture(anomaly_file, prefix, rgb_directory, pre_prefix, dir, file_name):
#clusters = webDemo.main(anomaly_file,
#"D:\\ifruitly_junk\\results\\result.jpg")
clusters = v_demo(anomaly_file, prefix, pre_prefix, file_name, dir)
return
##############################################################################################
# Running the DBSCAN for output
def db_scan(data, eps, min_samples, metric):
dbscan = DBSCAN(eps=eps, min_samples=min_samples, metric=metric).fit(data)
print 'DBSCAN'
print metrics.silhouette_score(data, dbscan.labels_)
print collections.Counter(dbscan.labels_)
reduced_data = reduce_with_pca(data)
plot_2d_data(reduced_data, dbscan.labels_)
def sdbscanTrain(self, settings, mname, data):
'''
:param data: -> dataframe with data
:param settings: -> settings dictionary
:param mname: -> name of serialized clusterer
:return: -> clusterer
    :example settings: -> {eps: 0.9, min_samples: 10, metric: 'euclidean',
                           algorithm: 'auto', leaf_size: 30, p: 0.2, n_jobs: 1}
'''
for k, v in settings.iteritems():
logger.info('[%s] : [INFO] SDBSCAN %s set to %s',
datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), k, v)
print "SDBSCAN %s set to %s" % (k, v)
sdata = StandardScaler().fit_transform(data)
try:
db = DBSCAN(eps=float(settings['eps']), min_samples=int(settings['min_samples']), metric=settings['metric'],
algorithm=settings['algorithm'], leaf_size=int(settings['leaf_size']), p=float(settings['p']),
n_jobs=int(settings['n_jobs'])).fit(sdata)
except Exception as inst:
        logger.error('[%s] : [ERROR] Cannot instantiate sDBSCAN with %s and %s',
                     datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), type(inst), inst.args)
        print "Error while instantiating sDBSCAN with %s and %s" % (type(inst), inst.args)
sys.exit(1)
labels = db.labels_
print labels
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
    print 'Estimated number of clusters: %d' % n_clusters_
self.__serializemodel(db, 'sdbscan', mname)
return db
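# Example settings dictionary (a sketch mirroring the docstring above; the
# trailing call is illustrative and assumes an instance of the surrounding
# class plus a dataframe of query data):
example_settings = {'eps': 0.9, 'min_samples': 10, 'metric': 'euclidean',
                    'algorithm': 'auto', 'leaf_size': 30, 'p': 0.2, 'n_jobs': 1}
# clusterer = detector.sdbscanTrain(example_settings, 'sdbscan_v1', dataframe)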
def __init__(self, image, colour_space='hsv', cluster_method='ward', scale=None, num_clusters=None, quantile=None):
self.image = image
self.colour_space = colour_space
self.cluster_method = cluster_method
self.params = Parameters()
# Scaling colour space
if scale is None:
self.params.scale = (1, 1, 1)
else:
# TODO validate 3 float tuple
self.params.scale = scale
# K-means param
if num_clusters is None:
self.params.num_clusters = 8
else:
# TODO validate
self.params.num_clusters = int(num_clusters)
# Mean-shift param
if quantile is None:
self.params.quantile = 0.1
else:
self.params.quantile = float(quantile)
# DBSCAN param
# if epsilon is None:
self.params.epsilon = 255*0.1
# Log
h, w = self.image.shape[:2]
msg = 'Clustering a {}x{} image: cluster_method={} colour_space={} num_clusters={} quantile={}'.format(
w, h, cluster_method, colour_space, num_clusters, quantile
)
print msg
def dbscan(self, n_clusters=None, eps=0.5, min_samples=10,
algorithm='auto', leaf_size=30):
"""
    Perform DBSCAN clustering.
    This can also be used for duplicate detection (when eps is set to a
    sufficiently small distance).
Parameters
----------
n_clusters : int
number of clusters # not used just present for compatibility
lsi_components : int
apply LSA before the clustering algorithm
eps : float
The maximum distance between two samples for them to be considered
as in the same neighborhood.
min_samples : int
The number of samples (or total weight) in a neighborhood
for a point to be considered as a core point.
This includes the point itself.
"""
from sklearn.cluster import DBSCAN
pars = {'is_hierarchical': False, "metric": self.metric}
km = DBSCAN(eps=eps, min_samples=min_samples, algorithm=algorithm,
leaf_size=leaf_size)
return self._cluster_func(n_clusters, km, pars)
def main():
centers = get_list('out_center.txt')
labels = get_list('142-label.txt')
judge(centers, labels)
n_class = int(len(centers) * 0.18)
est = KMeans(n_clusters=n_class, max_iter=1000)
est.fit(centers)
new_list = []
for x, y in est.cluster_centers_:
min_num = 10000
min_x = -1
min_y = -1
for x_, y_ in centers:
dist = distance(x, y, x_, y_)
if (dist < min_num) or (min_x == -1):
min_num = dist
min_x = x_
min_y = y_
new_list.append([min_x, min_y])
judge(new_list, labels)
judge(est.cluster_centers_, labels)
# db = DBSCAN(eps=0.3, min_samples=180).fit(centers)
# print(db.core_sample_indices_)
# judge(new_list, labels)
# print(est.cluster_centers_)
# save_list('result.txt', est.cluster_centers_)
# af = AffinityPropagation(preference=180).fit(centers)
# judge(af.cluster_centers_, labels)
def dbscan_partition(iterable, params, sample_weight=None):
"""
:type iterable: iter
:param iterable: iterator yielding ((key, partition), vector)
:type params: dict
:param params: dictionary containing sklearn DBSCAN parameters
:rtype: iter
:return: ((key, cluster_id), v)
Performs a DBSCAN on a given partition of the data
"""
# read iterable into local memory
data = list(iterable)
(key, part), vector = data[0]
x = np.array([v for (_, __), v in data])
y = np.array([k for (k, _), __ in data])
# perform DBSCAN
model = skc.DBSCAN(**params)
# import sys
# print(model, file=sys.stderr)
    # guard the default case; otherwise keep the original per-point weight lookup
    weights = None if sample_weight is None else [sample_weight[k[0]] for k in x]
    c = model.fit_predict(x, sample_weight=weights)
cores = set(model.core_sample_indices_)
# yield (key, cluster_id), non-core samples labeled with *
for i in xrange(len(c)):
flag = '' if i in cores else '*'
yield (y[i], '%i:%i%s' % (part, c[i], flag))
# car_recognizer.py (project: Vision-based-parking-lot-availability-OpenCV, author: Saar1312)
def dbscan(points,eps,min_samples):
db = DBSCAN(eps=eps, min_samples=min_samples).fit(points) # eps=5 min_samples = 80
# Labeling pixels by cluster
core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
core_samples_mask[db.core_sample_indices_] = True
labels = db.labels_
# Number of clusters in labels, ignoring noise if present.
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
# Creating list of clusters
    return [points[labels == i] for i in range(n_clusters_)]
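# Example usage (a sketch; the synthetic points below stand in for the (x, y)
# pixel coordinates the parking-lot code extracts, and eps/min_samples follow
# the values noted in the comment above):
import numpy as np
rng = np.random.default_rng(2)
pts = np.vstack([rng.normal((50.0, 50.0), 3.0, (100, 2)),
                 rng.normal((200.0, 120.0), 3.0, (100, 2))])
spot_clusters = dbscan(pts, eps=5, min_samples=80)      # list of per-cluster point arrays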
def cluster_texts(textdict, eps=0.45, min_samples=3):
"""
cluster the given texts
Input:
textdict: dictionary with {docid: text}
Returns:
doccats: dictionary with {docid: cluster_id}
"""
doc_ids = list(textdict.keys())
# transform texts into length normalized kpca features
ft = FeatureTransform(norm='max', weight=True, renorm='length', norm_num=False)
docfeats = ft.texts2features(textdict)
X, featurenames = features2mat(docfeats, doc_ids)
e_lkpca = KernelPCA(n_components=250, kernel='linear')
X = e_lkpca.fit_transform(X)
xnorm = np.linalg.norm(X, axis=1)
X = X/xnorm.reshape(X.shape[0], 1)
# compute cosine similarity
D = 1. - linear_kernel(X)
# and cluster with dbscan
clst = DBSCAN(eps=eps, metric='precomputed', min_samples=min_samples)
y_pred = clst.fit_predict(D)
return {did: y_pred[i] for i, did in enumerate(doc_ids)}
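# Example call (a sketch; FeatureTransform/features2mat come from the
# surrounding project, and a corpus of at least a few hundred documents is
# assumed since the KernelPCA step keeps 250 components):
# textdict = {doc_id: open(path).read() for doc_id, path in corpus_paths.items()}
# doccats = cluster_texts(textdict, eps=0.45, min_samples=3)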
def getRotMat(verts):
"""
find the table and do calibration
:param verts: see depthToXYZ
    :return: a list of vertex lists, one per DBSCAN label (core samples only)
author: weiwei
date: 20170711
"""
cutverts = []
for vert in verts:
if vert[0] < 700.0 and vert[0] > -700.0:
if vert[1] < 200.0 and vert[1] > -600.0:
if vert[2] < -1000.0 and vert[2] > -1500.0:
cutverts.append([vert[0], vert[1], vert[2]])
# clustering using DBSCAN
X = np.array(cutverts)
db = DBSCAN(eps=20, min_samples = 100, n_jobs = -1).fit(X)
print db.labels_
core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
core_samples_mask[db.core_sample_indices_] = True
labels = db.labels_
returnvertslist = []
unique_labels = set(labels)
for k in unique_labels:
class_member_mask = (labels == k)
print class_member_mask, core_samples_mask
xyzlist = X[class_member_mask & core_samples_mask]
print xyzlist
returnvertslist.append(xyzlist.tolist())
return returnvertslist
# return verts
def __init__(self):
"""
Kinect interface
author: weiwei
date: 20170715
"""
self.kinect = PyKinectRuntime.PyKinectRuntime(PyKinectV2.FrameSourceTypes_Depth)
self.dbscan = DBSCAN(eps=50, min_samples=100, n_jobs=-1)
self.randsac = linear_model.RANSACRegressor(linear_model.LinearRegression(), residual_threshold = 15)
def _get_dbscan(parameters):
    if parameters is None:
        parameters = {}
    return DBSCAN(**parameters)
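# Example usage (a sketch; the keys are standard sklearn DBSCAN keyword
# arguments, and passing None falls back to the library defaults):
db = _get_dbscan({'eps': 0.5, 'min_samples': 5, 'metric': 'euclidean'})
db_default = _get_dbscan(None)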