def get_cluster_assignments(sim_matrix, parameters):
    """
    Cluster the exemplars described by ``sim_matrix``.

    sim_matrix: list of list of float (or np.array) -- similarity matrix
        between exemplars; it is handed unchanged to the selected
        clustering routine.
    parameters: list of str in the format ["method:method_name",
        "algo:algo_name", "k:num_clusters", "damping:damping"], in any
        order. Entries with any other prefix are ignored, and the first
        entry for a given key wins.
            algo    -- 'hierarchical', 'kmeans', 'ap' or 'ward'
                       (default 'ap')
            method  -- linkage method such as 'single', 'complete' or
                       'average'; only used by 'hierarchical'
                       (default 'single')
            k       -- number of clusters, an integer <= the data length;
                       only used by 'kmeans' (default 8)
            damping -- damping factor; only used by 'ap' (default 0.5,
                       scikit-learn documents the valid range as [0.5, 1))

    Returns the cluster labels produced by the underlying library call:
    the value at each index is the cluster number of the exemplar at the
    same index in sim_matrix.

    Raises ValueError if ``algo`` names an unsupported algorithm, or if
    ``k`` (or, for 'ap', ``damping``) is not numeric.
    """
    algorithm = _get_option(parameters, 'algo', 'ap')
    method = _get_option(parameters, 'method', 'single')
    num_clusters = int(_get_option(parameters, 'k', 8))

    if algorithm in ('hierarchical', 'ward'):
        if algorithm == 'ward':
            clustering = hierarchy.ward(sim_matrix)
        else:
            clustering = hierarchy.linkage(sim_matrix, method)
        # get_k is defined elsewhere in the project; presumably it picks the
        # number of flat clusters from the linkage (20 is passed through
        # unchanged) -- confirm against its definition.
        k = get_k(clustering, 20)
        # fcluster numbers clusters from 1; shift down so labels start at 0
        # like the scikit-learn estimators below.
        return hierarchy.fcluster(clustering, k, criterion='maxclust') - 1

    if algorithm == 'kmeans':
        return KMeans(n_clusters=num_clusters).fit_predict(sim_matrix)

    if algorithm == 'ap':
        # Parsed only here so that a malformed damping entry cannot break
        # the algorithms that never use it.
        damping = float(_get_option(parameters, 'damping', 0.5))
        return AffinityPropagation(damping=damping).fit_predict(sim_matrix)

    raise ValueError(
        "unknown clustering algorithm %r; expected one of "
        "'hierarchical', 'kmeans', 'ap', 'ward'" % (algorithm,)
    )


def _get_option(parameters, key, default):
    """Return the value of the first "key:value" entry, or ``default``."""
    prefix = key + ':'
    for entry in parameters:
        if entry.startswith(prefix):
            # Same field the original extracted: the text between the first
            # and second colon.
            return entry.split(':')[1]
    return default
python类ward()的实例源码
def comparative_exp():
    """
    Run every clustering configuration on each ontological category.

    Reads the data set path from sys.argv[1] and the stemming dictionary
    path from sys.argv[2]. For each category ('thing', 'body') and each
    configuration it prints the parameter list, the set of assigned
    cluster labels and the result of evaluate_clustering, then calls
    print_confusion_matrix.
    """
    data_path = sys.argv[1]
    stem_dict_path = sys.argv[2]
    parameters = ['SPLIT', 'noUF']
    d = data(data_path, stem_dict_path, parameters)

    # (algorithm, linkage method, k) triples, in the order they are run:
    # hierarchical takes a linkage method only, kmeans takes k only, and
    # ward / ap take neither.
    clustering_algos = (
        [('hierarchical', linkage, None)
         for linkage in ('complete', 'average', 'single')]
        + [('ward', None, None), ('ap', None, None)]
        + [('kmeans', None, n_clusters) for n_clusters in range(2, 11)]
    )

    for onto_cat in ('thing', 'body'):
        parameters_i = parameters + ['onto:%s' % onto_cat]
        # Sorted, de-duplicated indices of the exemplars in this category.
        oix = sorted(set(np.where(d.ontological == onto_cat)[0]))
        similarity_matrix = get_similarity_matrix(
            d, parameters_i, oix, association='associated')

        for algo, linkage, n_clusters in clustering_algos:
            parameters_j = parameters_i + ['algo:%s' % algo]
            if linkage is not None:
                parameters_j.append('method:%s' % linkage)
            if n_clusters is not None:
                parameters_j.append('k:%r' % n_clusters)
            print(parameters_j)

            cluster_assignments = get_cluster_assignments(
                similarity_matrix, parameters_j)
            print(set(cluster_assignments))
            print(evaluate_clustering(cluster_assignments, d.annotation[oix]))
            print_confusion_matrix(cluster_assignments, d.annotation[oix])
def main():
    """Build a Ward linkage of the country data and show its dendrogram."""
    countries = dictdata(getCountrydict())
    linkage_matrix = ward(countries.getData())
    # Leaves are labelled with the country names; the tree is drawn
    # horizontally (orientation='left').
    dendrogram(
        linkage_matrix,
        labels=countries.getName(),
        orientation='left',
        leaf_font_size=10,
    )
    show()
document_clustering.py 文件源码
项目:text-analytics-with-python
作者: dipanjanS
项目源码
文件源码
阅读 22
收藏 0
点赞 0
评论 0
def ward_hierarchical_clustering(feature_matrix):
    """Return the Ward linkage computed from pairwise cosine distances.

    The cosine similarity between the rows of ``feature_matrix`` is turned
    into a distance (1 - similarity) and passed to ``ward``.
    """
    similarity = cosine_similarity(feature_matrix)
    distance = 1 - similarity
    # NOTE(review): the full square distance matrix is handed to ward()
    # as-is; confirm this is the input form ward() is meant to receive.
    return ward(distance)
document_clustering.py 文件源码
项目:text-analytics-with-python
作者: dipanjanS
项目源码
文件源码
阅读 20
收藏 0
点赞 0
评论 0
def plot_hierarchical_clusters(linkage_matrix, movie_data, figure_size=(8,12)):
    """Draw the dendrogram for ``linkage_matrix`` and save it as a PNG.

    linkage_matrix: linkage passed straight to ``dendrogram``.
    movie_data: object indexed with ['Title'] whose ``.values.tolist()``
        gives the leaf labels (presumably a pandas DataFrame -- confirm
        against the caller).
    figure_size: (width, height) forwarded to ``plt.subplots`` as figsize.

    Returns None. Writes 'ward_hierachical_clusters.png' (dpi=200) relative
    to the current working directory.
    """
    # Create the figure at the requested size; dendrogram draws on the
    # current axes, so the returned handles are not needed.
    plt.subplots(figsize=figure_size)
    movie_titles = movie_data['Title'].values.tolist()

    # Plot the dendrogram with the movie titles as leaf labels.
    dendrogram(linkage_matrix, orientation="left", labels=movie_titles)

    # Hide the x-axis ticks and labels. Booleans are required here: the
    # string 'off' is truthy in current matplotlib and would leave the
    # ticks visible.
    plt.tick_params(axis='x',
                    which='both',
                    bottom=False,
                    top=False,
                    labelbottom=False)
    plt.tight_layout()
    plt.savefig('ward_hierachical_clusters.png', dpi=200)
# build ward's linkage matrix