def plot_clusters(num_clusters, feature_matrix,
cluster_data, movie_data,
plot_size=(16,8)):
# generate random color for clusters
def generate_random_color():
color = '#%06x' % random.randint(0, 0xFFFFFF)
return color
# define markers for clusters
markers = ['o', 'v', '^', '<', '>', '8', 's', 'p', '*', 'h', 'H', 'D', 'd']
# build cosine distance matrix
cosine_distance = 1 - cosine_similarity(feature_matrix)
# dimensionality reduction using MDS
mds = MDS(n_components=2, dissimilarity="precomputed",
random_state=1)
# get coordinates of clusters in new low-dimensional space
plot_positions = mds.fit_transform(cosine_distance)
x_pos, y_pos = plot_positions[:, 0], plot_positions[:, 1]
# build cluster plotting data
cluster_color_map = {}
cluster_name_map = {}
for cluster_num, cluster_details in cluster_data.items():
# assign cluster features to unique label
cluster_color_map[cluster_num] = generate_random_color()
cluster_name_map[cluster_num] = ', '.join(cluster_details['key_features'][:5]).strip()
# map each unique cluster label with its coordinates and movies
cluster_plot_frame = pd.DataFrame({'x': x_pos,
'y': y_pos,
'label': movie_data['Cluster'].values.tolist(),
'title': movie_data['Title'].values.tolist()
})
grouped_plot_frame = cluster_plot_frame.groupby('label')
# set plot figure size and axes
fig, ax = plt.subplots(figsize=plot_size)
ax.margins(0.05)
# plot each cluster using co-ordinates and movie titles
for cluster_num, cluster_frame in grouped_plot_frame:
marker = markers[cluster_num] if cluster_num < len(markers) \
else np.random.choice(markers, size=1)[0]
ax.plot(cluster_frame['x'], cluster_frame['y'],
marker=marker, linestyle='', ms=12,
label=cluster_name_map[cluster_num],
color=cluster_color_map[cluster_num], mec='none')
ax.set_aspect('auto')
ax.tick_params(axis= 'x', which='both', bottom='off', top='off',
labelbottom='off')
ax.tick_params(axis= 'y', which='both', left='off', top='off',
labelleft='off')
fontP = FontProperties()
fontP.set_size('small')
ax.legend(loc='upper center', bbox_to_anchor=(0.5, -0.01), fancybox=True,
shadow=True, ncol=5, numpoints=1, prop=fontP)
#add labels as the film titles
for index in range(len(cluster_plot_frame)):
ax.text(cluster_plot_frame.ix[index]['x'],
cluster_plot_frame.ix[index]['y'],
cluster_plot_frame.ix[index]['title'], size=8)
# show the plot
plt.show()
document_clustering.py 文件源码
python
阅读 18
收藏 0
点赞 0
评论 0
评论列表
文章目录