import numpy as np
import sklearn.cluster
import sklearn.preprocessing


def cluster(data):
    """
    Run k-means clustering on the training data to find profitable
    price patterns that we can exploit.
    """
    num_clusters = 100
    num_selected_clusters = 20
    # Split the data into 30-, 60-, and 120-minute intervals and cluster
    # each set. k_means returns (centroids, labels, inertia), so the
    # centroid matrix of each run is at index 0.
    split = lambda n: split_into_intervals(data, n)
    kmeans30 = sklearn.cluster.k_means(split(30), num_clusters)
    kmeans60 = sklearn.cluster.k_means(split(60), num_clusters)
    kmeans120 = sklearn.cluster.k_means(split(120), num_clusters)
    # Rank the clusters by performance (the last element of each centroid).
    hp30, hp60, hp120 = [], [], []
    for i in range(num_clusters):
        hp30.append((i, kmeans30[0][i, -1]))
        hp60.append((i, kmeans60[0][i, -1]))
        hp120.append((i, kmeans120[0][i, -1]))
    hp30 = sorted(hp30, reverse=True, key=lambda x: x[1])[:num_selected_clusters]
    hp60 = sorted(hp60, reverse=True, key=lambda x: x[1])[:num_selected_clusters]
    hp120 = sorted(hp120, reverse=True, key=lambda x: x[1])[:num_selected_clusters]
    # Copy the highest-performing centroids into fixed-size arrays.
    top30 = np.zeros((num_selected_clusters, 181))
    top60 = np.zeros((num_selected_clusters, 361))
    top120 = np.zeros((num_selected_clusters, 721))
    for i in range(num_selected_clusters):
        top30[i, :] = kmeans30[0][hp30[i][0], :]
        top60[i, :] = kmeans60[0][hp60[i][0], :]
        top120[i, :] = kmeans120[0][hp120[i][0], :]
    # Standardize everything except the trailing performance value so we can
    # use the faster similarity function from S&Z for comparison instead of
    # the L2 norm. StandardScaler expects 2-D input, so reshape each row to a
    # column vector and flatten it back afterwards.
    scaler = sklearn.preprocessing.StandardScaler()
    for i in range(num_selected_clusters):
        top30[i, 0:180] = scaler.fit_transform(top30[i, 0:180].reshape(-1, 1)).ravel()
        top60[i, 0:360] = scaler.fit_transform(top60[i, 0:360].reshape(-1, 1)).ravel()
        top120[i, 0:720] = scaler.fit_transform(top120[i, 0:720].reshape(-1, 1)).ravel()
    return [top30, top60, top120]
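

# The split_into_intervals helper is defined elsewhere; the sketch below is a
# hypothetical reconstruction, not the original. It assumes `data` is a 1-D
# numpy array of prices sampled every 10 seconds (so an n-minute window spans
# 6 * n samples) and that each row ends with the price move observed right
# after the window, which matches the 181/361/721 row widths used above.
def split_into_intervals(data, n):
    width = 6 * n  # 10-second samples per n-minute window
    rows = []
    for start in range(len(data) - width):
        window = data[start:start + width]
        # Trailing element: the price change immediately after the window.
        future_move = data[start + width] - data[start + width - 1]
        rows.append(np.append(window, future_move))
    return np.array(rows)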
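

# A note on the "faster similarity function": once two vectors are
# standardized to zero mean and unit variance, their empirical correlation
# reduces to a plain dot product, which is cheaper to evaluate than a full
# Euclidean distance. A minimal sketch, assuming S&Z's similarity is that
# correlation-style measure applied to the standardized patterns above:
def similarity(a, b):
    # Pearson correlation of two z-scored vectors of equal length.
    return np.dot(a, b) / len(a)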