from sklearn.cluster import KMeans, MiniBatchKMeans


def train(X, y, true_k=50, minibatch=False, showLable=True):
    # Train k-means on the document vectors, using either MiniBatchKMeans
    # (mini-batch/sampled training) or the standard full-batch KMeans.
    fout = open('pro1_cluster.txt', 'w+')
    if minibatch:
        km = MiniBatchKMeans(n_clusters=true_k, init='k-means++', n_init=1,
                             init_size=1000, batch_size=1000, verbose=False)
    else:
        km = KMeans(n_clusters=true_k, init='k-means++', max_iter=300, n_init=1,
                    verbose=False)
    km.fit(X)
    print(y.dtype)
    if showLable:
        print("Top terms per cluster:")
        # Feature indices sorted by weight for each cluster centre.
        order_centroids = km.cluster_centers_.argsort()[:, ::-1]
        terms = y
        # Cluster assignment for every sample.
        result = list(km.predict(X))
        print('Cluster distribution:')
        print(dict([(i, result.count(i)) for i in result]))
        # Group each label together with its vector by cluster id.
        cluster_list = {}
        for i in range(true_k):
            cluster_list[i] = []
        for j in range(len(result)):
            cluster_list[result[j]].append([terms[j], X[j]])
        # Write the members of every non-empty cluster to the output file,
        # one label per line, with a dashed separator between clusters.
        for i in cluster_list.keys():
            cluster = cluster_list[i]
            if len(cluster) > 0:
                for bet in cluster:
                    vec = bet[1].tolist()
                    # fout.write(bet[0] + str(vec) + '\n')
                    fout.write(bet[0] + '\n')
                fout.write('-------------------\n')
    fout.close()
    return -km.score(X)
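A minimal usage sketch (hypothetical data, not from the original post): it assumes X is a dense TF-IDF matrix and y holds one string label per document, so that the labels written to pro1_cluster.txt are readable. The document texts in docs are made up for illustration.

import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

# Hypothetical toy corpus; in practice X/y would come from the real dataset.
docs = ["cats purr and sleep", "dogs bark loudly", "dogs and cats play"]
X = TfidfVectorizer().fit_transform(docs).toarray()  # dense so X[j].tolist() works
y = np.array(docs)                                   # per-document labels

score = train(X, y, true_k=2, minibatch=False, showLable=True)
print('k-means inertia:', score)

Note that KMeans.score(X) returns the negative of the sum of squared distances to the nearest centroid, so the returned value -km.score(X) is the inertia, which can be collected for different true_k values when picking k with the elbow method.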