def bic(X, k, random_state=None):
    """
    Fit a MiniBatchKMeans model with ``k`` clusters on ``X`` and return its
    BIC score.

    Implementation based on the X-means paper:
    http://www.aladdin.cs.cmu.edu/papers/pdfs/y2000/xmeans.pdf
    with corrections from here:
    https://stats.stackexchange.com/questions/90769/using-bic-to-estimate-the-number-of-k-in-kmeans

    The score is the log-likelihood of the data under identical spherical
    Gaussians (one pooled variance shared by every cluster) minus a
    complexity penalty of ``0.5 * k * (M + 1) * log(R)``, so higher values
    indicate a better fit/complexity trade-off.

    Params:
    --------
    X: 2-D numpy array of shape (R observations, M features) to be clustered
    k: number of clusters
    random_state: optional seed forwarded to MiniBatchKMeans so that scores
        are reproducible between calls; the default (None) keeps the
        previous unseeded behaviour

    Returns:
    --------
    BIC: bic score (numpy float). The value is non-finite when the pooled
        variance is zero, i.e. every observation coincides with its
        cluster centre.
    """
    model = MiniBatchKMeans(init='k-means++', n_clusters=k, batch_size=50,
                            n_init=3, max_no_improvement=10, verbose=0,
                            random_state=random_state)
    model.fit(X)
    centers = model.cluster_centers_
    labels = model.labels_
    R, M = X.shape

    # minlength=k guarantees exactly one count per cluster, even when the
    # highest-numbered clusters ended up without any assigned observation.
    N_C = np.bincount(labels, minlength=k)

    # Within-cluster sum of squares: squared Euclidean distance from each
    # observation to the centre of the cluster it was assigned to.
    wcss = np.sum((X - centers[labels]) ** 2)

    # Pooled maximum-likelihood variance estimate shared by all clusters.
    var = (1.0 / (R - k) / M) * wcss
    const_term = 0.5 * k * np.log(R) * (M + 1)

    # Rn * log(Rn) with the convention 0 * log(0) == 0: clamping the log
    # argument to 1 yields 0 * log(1) == 0 for an empty cluster instead of
    # 0 * -inf == nan, which would otherwise poison the whole score.
    n_log_n = N_C * np.log(np.maximum(N_C, 1))

    per_cluster = (n_log_n
                   - N_C * np.log(R)
                   - (N_C * M / 2.0) * np.log(2 * np.pi * var)
                   - (N_C - 1) * M / 2.0)
    BIC = np.sum(per_cluster) - const_term
    return BIC
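# Comment list (web-page navigation text, not part of the code)
# Table of contents (web-page navigation text, not part of the code)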