def kmeans(encoder_val_clean, y, nClusters, y_pred_prev=None, weight_initilization='k-means++', seed=42, n_init=40,
           max_iter=300):
    """Cluster ``encoder_val_clean`` with k-means, report quality metrics, and return unit-norm centroids.

    Args:
        encoder_val_clean: Data matrix handed to scikit-learn's ``fit_predict``; its first axis is
            treated as the number of data points (presumably shape ``(n_samples, n_features)``).
        y: Reference labels, used only for the printed NMI / adjusted-Rand / ``bestMap`` scores.
        nClusters: Number of clusters (and number of PCA components for ``'kmeans-pca'``).
        y_pred_prev: Optional earlier assignment for the same points. When given, a second line
            comparing it with the new assignment is printed.
        weight_initilization: Centre initialisation strategy:
            ``'kmeans-pca'`` starts from the first ``nClusters`` principal components (single run);
            ``'k-means++'`` and ``'random'`` are passed straight through as ``KMeans``'s ``init``.
        seed: ``random_state`` given to ``KMeans``.
        n_init: Number of restarts for the ``'k-means++'`` / ``'random'`` strategies.
        max_iter: Maximum number of k-means iterations per run.

    Returns:
        A tuple ``(centroids, inertia, y_pred)`` where ``centroids`` is the transposed
        ``cluster_centers_`` with every column scaled to unit Euclidean norm, ``inertia`` is the
        fitted model's ``inertia_``, and ``y_pred`` holds the cluster index of each data point.

    Raises:
        ValueError: If ``weight_initilization`` is not one of the supported strategies.
    """
    if weight_initilization == 'kmeans-pca':
        use_pca_init = True
    elif weight_initilization in ('k-means++', 'random'):
        use_pca_init = False
    else:
        # Previously an unknown strategy fell through and crashed later with an UnboundLocalError.
        raise ValueError(
            "Unsupported weight_initilization {!r}; expected 'kmeans-pca', 'k-means++' or 'random'".format(
                weight_initilization))

    start_time = timeit.default_timer()
    if use_pca_init:
        pca = PCA(n_components=nClusters).fit(encoder_val_clean)
        # An explicit array of initial centres makes restarts pointless, hence n_init=1.
        kmeans_model = KMeans(init=pca.components_, n_clusters=nClusters, n_init=1, max_iter=max_iter,
                              random_state=seed)
    else:
        # n_jobs is deliberately not passed: scikit-learn removed that KMeans argument in 1.0.
        kmeans_model = KMeans(init=weight_initilization, n_clusters=nClusters, n_init=n_init, max_iter=max_iter,
                              random_state=seed)
    y_pred = kmeans_model.fit_predict(encoder_val_clean)

    # One centroid per column, each rescaled to unit L2 norm.
    centroids = kmeans_model.cluster_centers_.T
    centroids = centroids / np.linalg.norm(centroids, axis=0)
    end_time = timeit.default_timer()

    print('k-means: \t nmi =', normalized_mutual_info_score(y, y_pred), '\t arc =', adjusted_rand_score(y, y_pred),
          '\t acc = {:.4f} '.format(bestMap(y, y_pred)),
          'K-means objective = {:.1f} '.format(kmeans_model.inertia_), '\t runtime =', end_time - start_time)

    if y_pred_prev is not None:
        n_points = encoder_val_clean.shape[0]
        # bestMap presumably returns the matched-label agreement as a fraction of points -- TODO confirm.
        agreement = bestMap(y_pred, y_pred_prev)
        # Count labels that actually changed (the old code counted the unchanged ones).
        n_changed = np.count_nonzero(y_pred != np.asarray(y_pred_prev))
        print('Different Assignments: ', n_changed, '\tbestMap: ', agreement,
              '\tdatapoints-bestMap*datapoints: ',
              n_points - agreement * n_points)
    return centroids, kmeans_model.inertia_, y_pred
评论列表
文章目录