def plot_feature_overlap(df, cmap='binary', method='cluster'):
    """Plot feature-feature presence overlap of a pandas dataframe.

    A cell counts as "present" when it is non-null. Entry (i, j) of the
    overlap matrix is the number of rows in which both column i and column j
    are present. Columns are reordered so that similar columns sit next to
    each other, and the square root of the reordered matrix is drawn with
    ``pyplot.imshow`` with the axes turned off.

    Args:
        df: A pandas dataframe.
        cmap: A matplotlib colormap.
        method: Method of clustering, one of 'cluster' or 'tree'.

    Raises:
        ValueError: If ``method`` is neither 'cluster' nor 'tree'.
    """
    V = len(df.columns)
    # notnull() marks present cells. The former `(df == df).as_matrix()`
    # relied on DataFrame.as_matrix(), which was removed in pandas 1.0.
    present = df.notnull().values.astype(np.float32)
    overlap = np.dot(present.T, present)
    assert overlap.shape == (V, V)

    # Sort features to make blocks contiguous.
    if method == 'tree':
        # TODO(fritzo) Fix this to not look awful.
        grid = make_complete_graph(V)
        weights = np.empty(grid.shape[1], dtype=np.float32)
        # Each column of grid is (edge id, vertex 1, vertex 2); the edge
        # weight is the overlap count between its two endpoint features.
        for k, v1, v2 in grid.T:
            weights[k] = overlap[v1, v2]
        edges = estimate_tree(grid, weights)
        _, order_inv = order_vertices(edges)
    elif method == 'cluster':
        if V < 2:
            # Hierarchical clustering needs at least two observations;
            # with zero or one feature there is nothing to reorder.
            order_inv = np.arange(V)
        else:
            distance = scipy.spatial.distance.pdist(overlap)
            clustering = scipy.cluster.hierarchy.complete(distance)
            order_inv = scipy.cluster.hierarchy.leaves_list(clustering)
    else:
        raise ValueError(method)

    # Apply the same permutation to rows and columns so the matrix stays
    # symmetric and the diagonal still holds per-feature presence counts.
    overlap = overlap[order_inv, :][:, order_inv]
    assert overlap.shape == (V, V)

    # The square root compresses the dynamic range of the counts.
    pyplot.imshow(overlap**0.5, cmap=cmap)
    pyplot.axis('off')
评论列表
文章目录