def minibatch_kmedians(X, M=None, n_components=10, n_iter=100,
                       minibatch_size=100, random_state=None):
    """Cluster rows of X with minibatch k-medians (L1/Manhattan distance).

    Parameters
    ----------
    X : ndarray, shape (n_samples, n_features)
        Data to cluster.
    M : ndarray, shape (n_components, n_features), optional
        Initial cluster centers. If None, centers are initialized from
        randomly chosen rows of X. A passed-in array is copied and never
        modified.
    n_components : int
        Number of clusters.
    n_iter : int
        Number of full passes over the minibatches.
    minibatch_size : int
        Number of samples per minibatch.
    random_state : None, int, or np.random.RandomState
        Randomness source for the center initialization shuffle.

    Returns
    -------
    M : ndarray, shape (n_components, n_features)
        Final centers; each is snapped to its nearest datapoint in X.
    """
    n_clusters = n_components
    if M is not None:
        assert M.shape[0] == n_components
        assert M.shape[1] == X.shape[1]
        # Work on a copy: the in-place updates below must not clobber
        # the caller's initialization array.
        M = np.array(M)
    if random_state is None:
        random_state = np.random.RandomState(random_state)
    elif not hasattr(random_state, 'shuffle'):
        # Assume integer passed
        random_state = np.random.RandomState(int(random_state))
    if M is None:
        # Initialize centers with a random sample of the data.
        ind = np.arange(len(X)).astype('int32')
        random_state.shuffle(ind)
        M = X[ind[:n_clusters]]
    center_counts = np.zeros(n_clusters)
    # Minibatch boundaries. BUG FIX: the original used
    # np.arange(len(X), minibatch_size) (start=len(X), stop=minibatch_size),
    # which is empty or garbage; step from 0 by minibatch_size instead.
    pts = list(np.arange(0, len(X), minibatch_size)) + [len(X)]
    if len(pts) == 1:
        # minibatch size > dataset size case
        pts = [0, None]
    # BUG FIX: materialize — a bare zip() iterator is exhausted after the
    # first of the n_iter passes, silently skipping all later iterations.
    minibatch_indices = list(zip(pts[:-1], pts[1:]))
    for i in range(n_iter):
        for mb_s, mb_e in minibatch_indices:
            Xi = X[mb_s:mb_e]
            # Broadcasted Manhattan distance; argmin over the center axis
            # gives each sample's nearest center.
            # Could be made faster with einsum perhaps.
            centers = np.abs(Xi[:, None, :] - M[None]).sum(
                axis=-1).argmin(axis=1)
            # Per-center usage counts drive a 1/count learning rate
            # (Sculley-style minibatch update). All counts are accumulated
            # before the rate is read, matching the original ordering.
            center_counts += np.bincount(centers, minlength=n_clusters)
            scaled_lr = 1. / center_counts[centers]
            Mi = M[centers]
            scaled_lr = scaled_lr[:, None]
            # Smoothed sign(Xi - Mi) is the *negated* gradient of |Xi - Mi|
            # w.r.t. Mi, so descent moves centers TOWARD their samples:
            # Mi + lr * sign(Xi - Mi). BUG FIX: the original subtracted
            # this term, pushing centers away (gradient ascent).
            Mi = Mi + scaled_lr * ((Xi - Mi) / np.sqrt((Xi - Mi) ** 2 + 1E-9))
            # Duplicate center indices: last write wins (as in the original).
            M[centers] = Mi
    # Reassign centers to nearest datapoint
    mem, _ = vq(M, X)
    M = X[mem]
    return M