def mean_shift(df, l1, l2, c1name, qt, cluster_all, bin_seeding):
    """Re-cluster the rows labelled ``l1`` or ``l2`` with mean shift.

    The rows of ``df`` whose ``c1name`` value is ``l1`` or ``l2`` are
    selected, columns ``0`` .. ``49`` of those rows are used as the feature
    matrix, and a ``MeanShift`` model is fitted on it. The selected rows are
    then relabelled **in place** in ``df`` as ``old_label * 1000 + cluster``.

    Args:
        df: DataFrame holding the label column ``c1name`` and the feature
            columns addressed by the integer labels ``0`` .. ``49``.
            It is modified in place.
        l1: First label to select.
        l2: Second label to select; may equal ``l1``.
        c1name: Name of the label column in ``df``.
        qt: ``quantile`` forwarded to ``estimate_bandwidth``.
        cluster_all: Forwarded to ``MeanShift``.
        bin_seeding: Forwarded to ``MeanShift``.

    Returns:
        A tuple ``(nc, nl, bandwidth)``: the number of distinct values in the
        fitted labels, the per-row label array ``ms.labels_`` (in the row
        order of the selection), and the bandwidth that was used.
    """
    # Build the row mask once; it is reused for the in-place relabelling.
    selected = df[c1name].isin([l1, l2])
    df1 = df.loc[selected]
    pccols = list(range(50))

    # `.values` instead of `.as_matrix()`: the latter was removed from
    # pandas, while `.values` exists in both old and current releases.
    xp = df1[pccols].values

    if l1 == l2:
        bandwidth = estimate_bandwidth(xp, quantile=qt)
    else:
        # Estimate a bandwidth per label and keep the larger of the two.
        xp1 = df1.loc[df1[c1name] == l1, pccols].values
        xp2 = df1.loc[df1[c1name] == l2, pccols].values
        bandwidth = max(
            estimate_bandwidth(xp1, quantile=qt),
            estimate_bandwidth(xp2, quantile=qt),
        )
    logging.info("compare (%d, %d) with width=%f", l1, l2, bandwidth)

    ms = MeanShift(bandwidth=bandwidth, cluster_all=cluster_all,
                   bin_seeding=bin_seeding)
    ms.fit(xp)
    nl = ms.labels_
    nc = len(np.unique(nl))

    # NOTE(review): per the scikit-learn docs, cluster_all=False labels
    # orphan points -1; such rows would become `old_label * 1000 - 1` and
    # -1 would be counted in `nc` -- confirm this is intended.
    df.loc[selected, c1name] = df.loc[selected, c1name] * 1000 + nl
    return nc, nl, bandwidth
评论列表
文章目录