def minibatch_kmedians(X, M=None, n_components=10, n_iter=100,
                       minibatch_size=100, random_state=None):
    """Cluster rows of X with minibatch k-medians (L1/Manhattan distance).

    Parameters
    ----------
    X : ndarray, shape (n_samples, n_features)
        Data to cluster.
    M : ndarray, shape (n_components, n_features), optional
        Initial cluster centers. If None, centers are initialized from
        randomly chosen rows of X. A passed-in array is copied and never
        modified.
    n_components : int
        Number of clusters.
    n_iter : int
        Number of full passes over the minibatches.
    minibatch_size : int
        Number of samples per minibatch.
    random_state : None, int, or np.random.RandomState
        Randomness source for the center initialization shuffle.

    Returns
    -------
    M : ndarray, shape (n_components, n_features)
        Final centers; each is snapped to its nearest datapoint in X.
    """
    n_clusters = n_components
    if M is not None:
        assert M.shape[0] == n_components
        assert M.shape[1] == X.shape[1]
        # Work on a copy: the in-place updates below must not clobber
        # the caller's initialization array.
        M = np.array(M)
    if random_state is None:
        random_state = np.random.RandomState(random_state)
    elif not hasattr(random_state, 'shuffle'):
        # Assume integer passed
        random_state = np.random.RandomState(int(random_state))
    if M is None:
        # Initialize centers with a random sample of the data.
        ind = np.arange(len(X)).astype('int32')
        random_state.shuffle(ind)
        M = X[ind[:n_clusters]]
    center_counts = np.zeros(n_clusters)
    # Minibatch boundaries. BUG FIX: the original used
    # np.arange(len(X), minibatch_size) (start=len(X), stop=minibatch_size),
    # which is empty or garbage; step from 0 by minibatch_size instead.
    pts = list(np.arange(0, len(X), minibatch_size)) + [len(X)]
    if len(pts) == 1:
        # minibatch size > dataset size case
        pts = [0, None]
    # BUG FIX: materialize — a bare zip() iterator is exhausted after the
    # first of the n_iter passes, silently skipping all later iterations.
    minibatch_indices = list(zip(pts[:-1], pts[1:]))
    for i in range(n_iter):
        for mb_s, mb_e in minibatch_indices:
            Xi = X[mb_s:mb_e]
            # Broadcasted Manhattan distance; argmin over the center axis
            # gives each sample's nearest center.
            # Could be made faster with einsum perhaps.
            centers = np.abs(Xi[:, None, :] - M[None]).sum(
                axis=-1).argmin(axis=1)
            # Per-center usage counts drive a 1/count learning rate
            # (Sculley-style minibatch update). All counts are accumulated
            # before the rate is read, matching the original ordering.
            center_counts += np.bincount(centers, minlength=n_clusters)
            scaled_lr = 1. / center_counts[centers]
            Mi = M[centers]
            scaled_lr = scaled_lr[:, None]
            # Smoothed sign(Xi - Mi) is the *negated* gradient of |Xi - Mi|
            # w.r.t. Mi, so descent moves centers TOWARD their samples:
            # Mi + lr * sign(Xi - Mi). BUG FIX: the original subtracted
            # this term, pushing centers away (gradient ascent).
            Mi = Mi + scaled_lr * ((Xi - Mi) / np.sqrt((Xi - Mi) ** 2 + 1E-9))
            # Duplicate center indices: last write wins (as in the original).
            M[centers] = Mi
    # Reassign centers to nearest datapoint
    mem, _ = vq(M, X)
    M = X[mem]
    return M