def set_preference(data, chunk_size, n_rounds=15):
    """Estimate a default preference value for Affinity Propagation clustering.

    For each of ``n_rounds`` rounds, ``chunk_size`` rows of ``data`` are drawn
    at random without replacement and the similarity matrix between those
    rows and *every* row of ``data`` is computed, the similarity being the
    negative squared Euclidean distance. The median of each such matrix is
    recorded, and the median of those per-round medians is returned.

    Parameters
    ----------
    data : array of shape (N_samples, N_features)
        The data-set submitted for Affinity Propagation clustering.
    chunk_size : int
        Number of rows drawn in each round of random subsampling. Because
        the rows are drawn without replacement, it must not exceed
        N_samples.
    n_rounds : int, optional (default = 15)
        Number of rounds of random subsampling.

    Returns
    -------
    preference : float
        Median, over all rounds, of the per-round median similarity
        (negative squared Euclidean distance).

    Raises
    ------
    ValueError
        If ``n_rounds`` is smaller than 1, if ``data`` is not 2-dimensional,
        or (raised by ``np.random.choice``) if ``chunk_size`` exceeds
        N_samples.

    Notes
    -----
    Every entry of each (chunk_size, N_samples) similarity matrix enters the
    median, including the similarity of each drawn sample with itself.
    ``euclidean_distances`` is expected to be in scope at module level;
    presumably it is scikit-learn's
    ``sklearn.metrics.pairwise.euclidean_distances`` -- confirm against the
    file's imports.
    """
    if n_rounds < 1:
        raise ValueError("n_rounds must be a positive integer")

    # Unpacking (rather than indexing shape[0]) rejects non 2-D input early.
    N_samples, _ = data.shape

    medians = []
    for round_index in range(n_rounds):
        selected_samples = np.random.choice(
            N_samples, size=chunk_size, replace=False)
        samples = data[selected_samples, :]

        # Similarities are negative squared distances, so the resulting
        # preference is non-positive.
        S = -euclidean_distances(samples, data, squared=True)

        # S is discarded right after, so let NumPy partition it in place
        # instead of working on a copy.
        medians.append(np.median(S, overwrite_input=True))

        # Drop the (chunk_size, N_samples) matrix before the next round
        # allocates a new one, keeping peak memory to a single matrix.
        del S

        # Force a collection every fourth round. A dedicated loop variable is
        # used so that nothing inside the loop body can clobber the counter.
        if round_index % 4 == 3:
            gc.collect()

    return np.median(medians)
评论列表
文章目录