def do_kshape(name_prefix, df, cluster_size, initial_clustering=None):
    """Cluster the columns of ``df`` with ``kshape`` and write one file set per cluster.

    Each column of ``df`` is passed through ``zscore``; the resulting list of
    series is handed to ``kshape`` together with ``cluster_size`` and
    ``initial_clustering``. The clustering is then scored with
    ``silhouette_score``, which also yields one label per column.

    For every ``(centroid, assigned_series)`` pair returned by ``kshape`` a
    data frame is built that holds the transformed member series plus a
    ``"centroid"`` column, all indexed by ``df.index``. That frame is plotted,
    written to ``<name_prefix>_<n>.tsv.gz`` (tab separated, gzip compressed)
    and passed to ``graphs.write`` with the target ``<name_prefix>_<n>.png``,
    where ``n`` is the 1-based position of the cluster in the result.

    Args:
        name_prefix: Prefix used for every output file name.
        df: Frame whose columns are the series to cluster.
        cluster_size: Forwarded unchanged to ``kshape``.
        initial_clustering: Forwarded unchanged to ``kshape``; defaults to
            ``None``.

    Returns:
        A tuple ``(cluster_metrics, score, filenames)``:
        ``cluster_metrics`` is a ``defaultdict(list)`` mapping each integer
        label to the column names carrying that label, ``score`` is the
        second value returned by ``silhouette_score``, and ``filenames`` is
        the list of base names of the ``.tsv.gz`` files written.
    """
    columns = df.columns
    normalized = [zscore(df[column]) for column in columns]

    clusters = kshape(normalized, cluster_size, initial_clustering)
    labels, score = silhouette_score(np.array(normalized), clusters)

    # Group the column names by the label each one received:
    # cluster_metrics[<cluster_nr>] -> [<metric_a>, <metric_b>, ...]
    cluster_metrics = defaultdict(list)
    for position, column in enumerate(columns):
        cluster_metrics[int(labels[position])].append(column)

    filenames = []
    for cluster_nr, (centroid, members) in enumerate(clusters, start=1):
        # `members` are used as positions into both `columns` and `normalized`.
        series_by_name = {
            columns[member]: pd.Series(normalized[member], index=df.index)
            for member in members
        }
        series_by_name["centroid"] = pd.Series(centroid, index=df.index)
        cluster_df = pd.DataFrame(series_by_name)

        # NOTE(review): the plot created here is neither saved nor closed in
        # this function; presumably graphs.write picks it up or draws its
        # own -- confirm before removing or closing it.
        axes = cluster_df.plot()
        axes.legend(loc='center left', bbox_to_anchor=(1, 0.5))

        name = "%s_%d" % (name_prefix, cluster_nr)
        filename = name + ".tsv.gz"
        print(filename)
        cluster_df.to_csv(filename, sep="\t", compression='gzip')
        # Only the base name is reported back, even if the prefix has a path.
        filenames.append(os.path.basename(filename))
        graphs.write(cluster_df, name + ".png")

    return cluster_metrics, score, filenames
评论列表
文章目录