def _step5(arr):
    # load the trained k-means model
    kmeans = pickle.loads(open("kmeans.model", "rb").read())
    key, lines, tipe = arr
    print(key)
    # dump the batch of lines to a temp file and run fastText to get sentence vectors
    open("./tmp/tmp.{tipe}.{key}.txt".format(tipe=tipe, key=key), "w").write("\n".join(lines))
    res = os.popen("./fasttext print-sentence-vectors ./models/model.bin < tmp/tmp.{tipe}.{key}.txt".format(tipe=tipe, key=key)).read()
    w = open("tmp/tmp.{tipe}.{key}.json".format(tipe=tipe, key=key), "w")
    for line in res.split("\n"):
        # the last 100 fields are the sentence vector, the rest is the tokenized text
        try:
            vec = list(map(float, line.split()[-100:]))
        except:
            print(line)
            print(res)
            continue
        x = np.array(vec)
        if np.isnan(x).any():
            continue
        cluster = kmeans.predict([vec])
        txt = line.split()[:-100]
        obj = {"txt": txt, "cluster": cluster.tolist()}
        data = json.dumps(obj, ensure_ascii=False)
        w.write(data + "\n")
def step6():
    for tipe in ["news", "nocturne"]:
        names = [name for name in reversed(sorted(glob.glob("./tmp/tmp.{tipe}.*.json".format(tipe=tipe))))]
        size = len(names)
        for en, name in enumerate(names):
            term_clus = {}
            oss = []
            with open(name) as f:
                for line in f:
                    line = line.strip()
                    oss.append(json.loads(line))
            # for each term, count the cluster ids of the 3 sentences before and after it
            for i in range(3, len(oss) - 3):
                terms = set(oss[i]["txt"])
                for term in terms:
                    if term_clus.get(term) is None:
                        term_clus[term] = [0.0] * 128
                    cd = [oss[i + d]["cluster"][0] for d in [-3, -2, -1, 1, 2, 3]]
                    for c in cd:
                        term_clus[term][c] += 1.0
            print("{}/{} finished {}".format(en, size, name))
        open("{tipe}.term_clus.pkl".format(tipe=tipe), "wb").write(pickle.dumps(term_clus))
def do_kmeans(data, k):
    km = sklearn.cluster.KMeans(n_clusters=k)
    km.fit(data)
    means = km.cluster_centers_.reshape((-1,))
    # initialize standard deviations with distances between random cluster centers
    sds = []
    for i in range(means.shape[0]):
        # choose any 2 means and take half the distance between them
        x, y = np.random.choice(means, 2, replace=False)
        sds.append((x - y) / 2)
    sds = np.abs(np.array(sds))
    return (means, sds)
# expectation maximization for gmm
# use_kmeans: whether to initialize using kmeans or randomly
# use_priors: whether to model the prior distribution;
# this attaches a weight to each distribution that tells us
# the percentage of points generated from that distribution
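# NOTE: the EM routine described by the comments above is not part of this snippet, so
# what follows is only a hedged, minimal 1-D sketch of what such a function could look
# like. The name `em_gmm` and every implementation detail below are assumptions, not the
# original code; it merely reuses `do_kmeans` above for the use_kmeans initialization path.
import numpy as np
import scipy.stats

def em_gmm(data, k, n_iter=100, use_kmeans=True, use_priors=True):
    data = np.asarray(data, dtype=float).reshape(-1)
    if use_kmeans:
        # initialize means/sds from k-means (see do_kmeans above)
        means, sds = do_kmeans(data.reshape(-1, 1), k)
    else:
        # random initialization
        means = np.random.choice(data, k, replace=False)
        sds = np.full(k, data.std())
    priors = np.full(k, 1.0 / k)
    for _ in range(n_iter):
        # E-step: responsibility of each component for each point
        dens = np.stack([scipy.stats.norm.pdf(data, means[j], sds[j]) for j in range(k)], axis=1)
        if use_priors:
            dens = dens * priors
        resp = dens / dens.sum(axis=1, keepdims=True)
        # M-step: re-estimate means, standard deviations and (optionally) priors
        nk = resp.sum(axis=0)
        means = (resp * data[:, None]).sum(axis=0) / nk
        sds = np.sqrt((resp * (data[:, None] - means) ** 2).sum(axis=0) / nk)
        if use_priors:
            priors = nk / len(data)
    return means, sds, priors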
def computeF1_macro(confusion_matrix, matching, num_clusters):
    """
    computes the macro F1 score
    confusion matrix : requires permutation
    matching according to which matrix must be permuted
    """
    ##Permute the matrix columns
    permuted_confusion_matrix = np.zeros([num_clusters, num_clusters])
    for cluster in xrange(num_clusters):
        matched_cluster = matching[cluster]
        permuted_confusion_matrix[:, cluster] = confusion_matrix[:, matched_cluster]
    ##Compute the F1 score for every cluster
    F1_score = 0
    for cluster in xrange(num_clusters):
        TP = permuted_confusion_matrix[cluster, cluster]
        FP = np.sum(permuted_confusion_matrix[:, cluster]) - TP
        FN = np.sum(permuted_confusion_matrix[cluster, :]) - TP
        precision = TP / (TP + FP)
        recall = TP / (TP + FN)
        f1 = stats.hmean([precision, recall])
        F1_score += f1
    F1_score /= num_clusters
    return F1_score
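# Hypothetical usage sketch for computeF1_macro above (this snippet assumes Python 2 and
# that `stats` is scipy.stats); the confusion-matrix values are made up for illustration.
# Rows are true segments, columns are predicted clusters; matching=[0, 1] means no permutation.
example_cm = np.array([[8., 2.],
                       [1., 9.]])
example_macro_f1 = computeF1_macro(example_cm, [0, 1], 2)   # roughly 0.85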
def process_options(args):
    options = argparser().parse_args(args)
    if options.max_rank is not None and options.max_rank < 1:
        raise ValueError('max-rank must be >= 1')
    if options.eps <= 0.0:
        raise ValueError('eps must be > 0')
    wv = wvlib.load(options.vectors[0], max_rank=options.max_rank)
    if options.normalize:
        logging.info('normalize vectors to unit length')
        wv.normalize()
    words, vectors = wv.words(), wv.vectors()
    if options.whiten:
        logging.info('normalize features to unit variance')
        vectors = scipy.cluster.vq.whiten(vectors)
    return words, vectors, options
def split_into_intervals(data, n):
    """
    Split time series into n minute intervals
    """
    # Throw away time, bid/ask numbers
    prices = [x[1] for x in data]
    # create a len n-1 array of price differences (10 second increments)
    price_diffs = np.diff(prices)
    # m = interval length in terms of data points (6*~10sec = 1 minute)
    m = n * 6
    # each datapoint we're trying to cluster will be of the form:
    # (xi,yi) = (time series of prices, price change after series)
    intervals = np.zeros((len(prices) - 1, m + 1))
    for i in range(0, len(prices) - m - 1):
        intervals[i, 0:m] = prices[i:i + m]
        intervals[i, m] = price_diffs[i + m]
    return intervals
def plot_data(*data):
    '''
    graph the dataset
    :param data: data, target
    :return: None
    '''
    X, labels_true = data
    labels = np.unique(labels_true)
    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    colors = 'rgbyckm'
    for i, label in enumerate(labels):
        position = labels_true == label
        ax.scatter(X[position, 0], X[position, 1], label="cluster {0}".format(label),
                   color=colors[i % len(colors)])
    ax.legend(loc="best", framealpha=0.5)
    ax.set_xlabel("X[0]")
    ax.set_ylabel("X[1]")
    ax.set_title("data")
    plt.show()
def spectral_clustering(messages, dist_func=combined, num_clusters=3):
    '''
    takes a list of conversation messages and returns `num_clusters` threads.
    '''
    m = len(messages)
    affinity = np.zeros((m, m))
    # extract message features.
    for (mi, message) in enumerate(messages):
        if type(message) != dict:
            message = {
                'text': message
            }
        if 'feat' not in message:  # extract on the fly.
            message['feat'] = extract_all(parse_body(message['text']))
        messages[mi] = message  # write back.
    # build affinity matrix.
    for mi in range(m):
        for mj in range(m):
            affinity[mi, mj] = np.exp(-1.0 * keywords_l0(
                messages[mi]['feat'],
                messages[mj]['feat']
            ))
    # run clustering.
    print affinity
    labels = sklearn.cluster.spectral_clustering(affinity, n_clusters=num_clusters, eigen_solver='arpack')
    return labels
def adhoc_clustering(messages, dist_func=combined):
    ''' an adhoc method for clustering messages '''
    m = len(messages)
    # extract message features.
    for (mi, message) in enumerate(messages):
        if type(message) != dict:
            message = {
                'text': message
            }
        message.update(extract_all(parse_body(message['text'])))
    # run clustering (ad hoc).
    max_label = 0
    bias = 600
    labels = []
    for (mi, message) in enumerate(messages):
        min_mj = -1
        min_dist = float('inf')
        for mj in range(mi - 1, -1, -1):
            dist = dist_func(messages[mi], messages[mj])
            if dist < min_dist:
                min_dist = dist
                min_mj = mj
        if (bias - 100 * worth(messages[mi])) < min_dist:  # create new cluster.
            labels.append(max_label)
            max_label += 1
        else:  # assign to an old cluster.
            labels.append(labels[min_mj])
    return labels
def updateClusters(LLE_node_vals, switch_penalty=1):
    """
    Takes in LLE_node_vals matrix and computes the path that minimizes
    the total cost over the path
    Note the LLE's are negative of the true LLE's actually!!!!!
    Note: switch penalty > 0
    """
    (T, num_clusters) = LLE_node_vals.shape
    future_cost_vals = np.zeros(LLE_node_vals.shape)
    ##compute future costs
    for i in xrange(T - 2, -1, -1):
        j = i + 1
        indicator = np.zeros(num_clusters)
        future_costs = future_cost_vals[j, :]
        lle_vals = LLE_node_vals[j, :]
        for cluster in xrange(num_clusters):
            total_vals = future_costs + lle_vals + switch_penalty
            total_vals[cluster] -= switch_penalty
            future_cost_vals[i, cluster] = np.min(total_vals)
    ##compute the best path
    path = np.zeros(T)
    ##the first location
    curr_location = np.argmin(future_cost_vals[0, :] + LLE_node_vals[0, :])
    path[0] = curr_location
    DP_start2 = time.time()
    ##compute the path
    for i in xrange(T - 1):
        j = i + 1
        future_costs = future_cost_vals[j, :]
        lle_vals = LLE_node_vals[j, :]
        total_vals = future_costs + lle_vals + switch_penalty
        total_vals[int(path[i])] -= switch_penalty
        path[i + 1] = np.argmin(total_vals)
    ##return the computed path
    return path
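# Hypothetical usage sketch for updateClusters above (Python 2 / xrange assumed);
# the LLE values are made up. Lower LLE_node_vals means a better fit for that cluster.
example_lle = np.array([[0.1, 5.0],
                        [4.0, 0.2],
                        [0.3, 6.0]])
# With switch_penalty=1 the middle step keeps cluster 1, because its fit advantage
# outweighs the two switch penalties, giving the path [0, 1, 0].
example_path = updateClusters(example_lle, switch_penalty=1)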
def computeF1Score_delete(num_cluster, matching_algo, actual_clusters, threshold_algo, save_matrix=False):
    """
    computes the F1 scores and returns a list of values
    """
    F1_score = np.zeros(num_cluster)
    for cluster in xrange(num_cluster):
        matched_cluster = matching_algo[cluster]
        true_matrix = actual_clusters[cluster]
        estimated_matrix = threshold_algo[matched_cluster]
        TP = 0
        TN = 0
        FP = 0
        FN = 0
        for i in xrange(num_stacked * n):
            for j in xrange(num_stacked * n):
                if estimated_matrix[i, j] == 1 and true_matrix[i, j] != 0:
                    TP += 1.0
                elif estimated_matrix[i, j] == 0 and true_matrix[i, j] == 0:
                    TN += 1.0
                elif estimated_matrix[i, j] == 1 and true_matrix[i, j] == 0:
                    FP += 1.0
                else:
                    FN += 1.0
        precision = (TP) / (TP + FP)
        print "cluster #", cluster
        print "TP,TN,FP,FN---------->", (TP, TN, FP, FN)
        recall = TP / (TP + FN)
        f1 = (2 * precision * recall) / (precision + recall)
        F1_score[cluster] = f1
    return F1_score
def compute_confusion_matrix(num_clusters, clustered_points_algo, sorted_indices_algo):
    """
    computes a confusion matrix and returns it
    """
    seg_len = 50
    true_confusion_matrix = np.zeros([num_clusters, num_clusters])
    for point in xrange(len(clustered_points_algo)):
        cluster = clustered_points_algo[point]
        #CASE E : ABCABC
        num = (int(sorted_indices_algo[point] / seg_len) % num_clusters)
        true_confusion_matrix[num, cluster] += 1
    return true_confusion_matrix
############
##The basic folder to be created
def computeNetworkAccuracy(matching, train_cluster_inverse, num_clusters):
    """
    Takes in the matching for the clusters
    takes the computed clusters
    computes the average F1 score over the network
    """
    threshold = 1e-2
    f1 = 0
    for cluster in xrange(num_clusters):
        true_cluster_cov = np.loadtxt("Inverse Covariance cluster =" + str(cluster) + ".csv", delimiter=",")
        matched_cluster = matching[cluster]
        matched_cluster_cov = train_cluster_inverse[matched_cluster]
        (nrow, ncol) = true_cluster_cov.shape
        out_true = np.zeros([nrow, ncol], dtype=int)
        for i in xrange(nrow):
            for j in xrange(ncol):
                if np.abs(true_cluster_cov[i, j]) > threshold:
                    out_true[i, j] = 1
        out_matched = np.zeros([nrow, ncol], dtype=int)
        for i in xrange(nrow):
            for j in xrange(ncol):
                if np.abs(matched_cluster_cov[i, j]) > threshold:
                    out_matched[i, j] = 1
        np.savetxt("Network_true_cluster=" + str(cluster) + ".csv", true_cluster_cov, delimiter=",")
        np.savetxt("Network_matched_cluster=" + str(matched_cluster) + ".csv", matched_cluster_cov, delimiter=",")
        ##compute the confusion matrix
        confusion_matrix = np.zeros([2, 2])
        for i in xrange(nrow):
            for j in xrange(ncol):
                confusion_matrix[out_true[i, j], out_matched[i, j]] += 1
        f1 += computeF1_macro(confusion_matrix, [0, 1], 2)
    return f1 / num_clusters
############
def computeF1Score_delete(num_cluster, matching_algo, actual_clusters, threshold_algo, save_matrix=False):
    """
    computes the F1 scores and returns a list of values
    """
    F1_score = np.zeros(num_cluster)
    for cluster in xrange(num_cluster):
        matched_cluster = matching_algo[cluster]
        true_matrix = actual_clusters[cluster]
        estimated_matrix = threshold_algo[matched_cluster]
        if save_matrix:
            np.savetxt("estimated_matrix_cluster=" + str(cluster) + ".csv", estimated_matrix, delimiter=",", fmt="%1.4f")
        TP = 0
        TN = 0
        FP = 0
        FN = 0
        for i in xrange(num_stacked * n):
            for j in xrange(num_stacked * n):
                if estimated_matrix[i, j] == 1 and true_matrix[i, j] != 0:
                    TP += 1.0
                elif estimated_matrix[i, j] == 0 and true_matrix[i, j] == 0:
                    TN += 1.0
                elif estimated_matrix[i, j] == 1 and true_matrix[i, j] == 0:
                    FP += 1.0
                else:
                    FN += 1.0
        precision = (TP) / (TP + FP)
        recall = TP / (TP + FN)
        f1 = (2 * precision * recall) / (precision + recall)
        F1_score[cluster] = f1
    return F1_score
def write_cluster_ids(words, cluster_ids, out=None):
    """Write given list of words and their corresponding cluster ids to out."""
    assert len(words) == len(cluster_ids), 'word/cluster ids number mismatch'
    if out is None:
        out = sys.stdout
    for word, cid in izip(words, cluster_ids):
        print >> out, '%s\t%d' % (word, cid)
def main(argv=None):
    if argv is None:
        argv = sys.argv
    try:
        words, vectors, options = process_options(argv[1:])
    except Exception, e:
        if str(e):
            print >> sys.stderr, 'Error: %s' % str(e)
            return 1
        else:
            raise
    dbscan = sklearn.cluster.DBSCAN(eps=options.eps, metric=options.metric)
    dbscan.fit(numpy.array(vectors))
    noisy = sum(1 for l in dbscan.labels_ if l == -1)
    unique = len(set(dbscan.labels_))
    logging.info('%d clusters, %d noisy, %d vectors' % (unique, noisy,
                                                        len(vectors)))
    if noisy >= len(vectors) / 4:
        logging.warning('%d/%d noisy (-1) labels (try higher eps?)' %
                        (noisy, len(vectors)))
    elif unique < (len(vectors) / 2) ** 0.5:
        logging.warning('only %d clusters (try lower eps?)' % unique)
    write_cluster_ids(words, dbscan.labels_)
    return 0
def process_options(args):
    options = argparser().parse_args(args)
    if options.max_rank is not None and options.max_rank < 1:
        raise ValueError('max-rank must be >= 1')
    if options.k is not None and options.k < 2:
        raise ValueError('cluster number must be >= 2')
    if options.method == MINIBATCH_KMEANS and not with_sklearn:
        logging.warning('minibatch kmeans not available, using kmeans (slow)')
        options.method = KMEANS
    if options.jobs != 1 and (options.method != KMEANS or not with_sklearn):
        logging.warning('jobs > 1 only supported with scikit-learn %s' % KMEANS)
        options.jobs = 1
    wv = wvlib.load(options.vectors[0], max_rank=options.max_rank)
    if options.k is None:
        options.k = int(math.ceil((len(wv.words()) / 2) ** 0.5))
        logging.info('set k=%d (%d words)' % (options.k, len(wv.words())))
    if options.normalize:
        logging.info('normalize vectors to unit length')
        wv.normalize()
    words, vectors = wv.words(), wv.vectors()
    if options.whiten:
        logging.info('normalize features to unit variance')
        vectors = scipy.cluster.vq.whiten(vectors)
    return words, vectors, options
def minibatch_kmeans(vectors, k):
    if not with_sklearn:
        raise NotImplementedError
    # Sculley (http://www.eecs.tufts.edu/~dsculley/papers/fastkmeans.pdf)
    # uses batch size 1000. sklearn KMeans defaults to n_init 10
    kmeans = sklearn.cluster.MiniBatchKMeans(k, batch_size=1000, n_init=10)
    kmeans.fit(vectors)
    return kmeans.labels_
def create_data(centers, num=100, std=0.7):
    '''
    generate data
    :param centers: cluster centers (array of coordinates, or the number of centers)
    :param num: number of samples
    :param std: std of each cluster
    :return: data, target
    '''
    X, labels_true = make_blobs(n_samples=num, centers=centers, cluster_std=std)
    return X, labels_true
def runClustering(ssearch, eps, min_samples):
    """
    Run DBSCAN with the determined eps and MinPts values.
    """
    print('Clustering all documents with DBSCAN, eps=%0.2f min_samples=%d' % (eps, min_samples))
    # Initialize DBSCAN with parameters.
    # I forgot to use cosine at first!
    db = DBSCAN(eps=eps, min_samples=min_samples, metric='cosine', algorithm='brute')
    # Time this step.
    t0 = time.time()
    # Cluster the LSI vectors.
    db.fit(ssearch.index.index)
    # Calculate the elapsed time (in seconds)
    elapsed = (time.time() - t0)
    print("  done in %.3fsec" % elapsed)
    # Get the set of unique IDs.
    cluster_ids = set(db.labels_)
    # Show the number of clusters (don't include noise label)
    print('Number of clusters (excluding "noise"): %d' % (len(cluster_ids) - 1))
    # For each of the clusters...
    for cluster_id in cluster_ids:
        # Get the list of all doc IDs belonging to this cluster.
        cluster_doc_ids = []
        for doc_id in range(0, len(db.labels_)):
            if db.labels_[doc_id] == cluster_id:
                cluster_doc_ids.append(doc_id)
        # Get the top words in this cluster
        top_words = ssearch.getTopWordsInCluster(cluster_doc_ids)
        print('  Cluster %d: (%d docs) %s' % (cluster_id, len(cluster_doc_ids), " ".join(top_words)))
def updateClusters(LLE_node_vals, switch_penalty=1):
    """
    Uses the Viterbi path dynamic programming algorithm
    to compute the optimal cluster assignments
    Takes in LLE_node_vals matrix and computes the path that minimizes
    the total cost over the path
    Note the LLE's are negative of the true LLE's actually!!!!!
    Note: switch penalty > 0
    """
    (T, num_clusters) = LLE_node_vals.shape
    future_cost_vals = np.zeros(LLE_node_vals.shape)
    ##compute future costs
    for i in xrange(T - 2, -1, -1):
        j = i + 1
        indicator = np.zeros(num_clusters)
        future_costs = future_cost_vals[j, :]
        lle_vals = LLE_node_vals[j, :]
        for cluster in xrange(num_clusters):
            total_vals = future_costs + lle_vals + switch_penalty
            total_vals[cluster] -= switch_penalty
            future_cost_vals[i, cluster] = np.min(total_vals)
    ##compute the best path
    path = np.zeros(T)
    ##the first location
    curr_location = np.argmin(future_cost_vals[0, :] + LLE_node_vals[0, :])
    path[0] = curr_location
    ##compute the path
    for i in xrange(T - 1):
        j = i + 1
        future_costs = future_cost_vals[j, :]
        lle_vals = LLE_node_vals[j, :]
        total_vals = future_costs + lle_vals + switch_penalty
        total_vals[int(path[i])] -= switch_penalty
        path[i + 1] = np.argmin(total_vals)
    ##return the computed path
    return path
def compute_confusion_matrix(num_clusters, clustered_points_algo, sorted_indices_algo):
    """
    computes a confusion matrix and returns it
    """
    seg_len = 200
    true_confusion_matrix = np.zeros([num_clusters, num_clusters])
    for point in xrange(len(clustered_points_algo)):
        cluster = int(clustered_points_algo[point])

        ##CASE G: ABBACCCA
        # num = (int(sorted_indices_algo[point]/seg_len))
        # if num in [0,3,7]:
        #     true_confusion_matrix[0,cluster] += 1
        # elif num in [1,2]:
        #     true_confusion_matrix[1,cluster] += 1
        # else:
        #     true_confusion_matrix[2,cluster] += 1

        ##CASE F: ABCBA
        # num = (int(sorted_indices_algo[point]/seg_len))
        # num = min(num, 4-num)
        # true_confusion_matrix[num,cluster] += 1

        #CASE E : ABCABC
        num = (int(sorted_indices_algo[point] / seg_len) % num_clusters)
        true_confusion_matrix[num, cluster] += 1

        ##CASE D : ABABABAB
        # num = (int(sorted_indices_algo[point]/seg_len) %2)
        # true_confusion_matrix[num,cluster] += 1

        ##CASE C:
        # num = (sorted_indices_algo[point]/seg_len)
        # if num < 15:
        #     true_confusion_matrix[0,cluster] += 1
        # elif num < 20:
        #     true_confusion_matrix[1,cluster] += 1
        # else:
        #     true_confusion_matrix[0,cluster] += 1

        ##CASE B :
        # if num > 4:
        #     num = 9 - num
        # true_confusion_matrix[num,cluster] += 1

        ##CASE A : ABA
        # if sorted_indices_algo[point] < seg_len:
        #     true_confusion_matrix[0,cluster] += 1
        # elif sorted_indices_algo[point] < 3*seg_len:
        #     true_confusion_matrix[1,cluster] += 1
        # else:
        #     true_confusion_matrix[0,cluster] += 1
    return true_confusion_matrix
def cluster(data):
    """
    Use k-means clustering on training data to find profitable patterns
    we can exploit
    """
    num_clusters = 100
    num_selected_clusters = 20
    # Split into 30, 60, and 120 min time intervals, cluster each
    split = lambda n: split_into_intervals(data, n)
    kmeans30 = sklearn.cluster.k_means(split(30), num_clusters)
    kmeans60 = sklearn.cluster.k_means(split(60), num_clusters)
    kmeans120 = sklearn.cluster.k_means(split(120), num_clusters)
    # Sort the clusters by performance (the last centroid column is the follow-up price move)
    hp30, hp60, hp120 = [], [], []
    for i in range(0, num_clusters):
        hp30.append((i, kmeans30[0][i, -1]))
        hp60.append((i, kmeans60[0][i, -1]))
        hp120.append((i, kmeans120[0][i, -1]))
    hp30 = sorted(hp30, reverse=True, key=lambda x: x[1])[0:num_selected_clusters]
    hp60 = sorted(hp60, reverse=True, key=lambda x: x[1])[0:num_selected_clusters]
    hp120 = sorted(hp120, reverse=True, key=lambda x: x[1])[0:num_selected_clusters]
    # Select the highest performing clusters
    top30 = np.zeros((num_selected_clusters, 181))
    top60 = np.zeros((num_selected_clusters, 361))
    top120 = np.zeros((num_selected_clusters, 721))
    for i in range(0, num_selected_clusters):
        top30[i, 0:181] = kmeans30[0][hp30[i][0], 0:181]
        top60[i, 0:361] = kmeans60[0][hp60[i][0], 0:361]
        top120[i, 0:721] = kmeans120[0][hp120[i][0], 0:721]
    # Then normalize the clusters so we can use the faster similarity function
    # from S&Z to compare instead of L2 norm
    # (each row is reshaped to a column for current scikit-learn, then flattened back)
    scaler = sklearn.preprocessing.StandardScaler()
    for i in range(0, num_selected_clusters):
        top30[i, 0:180] = scaler.fit_transform(top30[i, 0:180].reshape(-1, 1)).ravel()
        top60[i, 0:360] = scaler.fit_transform(top60[i, 0:360].reshape(-1, 1)).ravel()
        top120[i, 0:720] = scaler.fit_transform(top120[i, 0:720].reshape(-1, 1)).ravel()
    return [top30, top60, top120]
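# Hypothetical end-to-end sketch for split_into_intervals and cluster above.
# The synthetic ~10-second (timestamp, price) feed below is an assumption, purely for illustration.
if __name__ == "__main__":
    rng = np.random.RandomState(0)
    n_ticks = 5000                                        # roughly 14 hours of 10-second ticks
    prices = 100 + np.cumsum(rng.randn(n_ticks) * 0.05)
    ticks = [(i * 10, p) for i, p in enumerate(prices)]   # (seconds, price) pairs
    top30, top60, top120 = cluster(ticks)
    # Each row holds a normalized price pattern; the untouched last column is the
    # historical price move that followed that pattern.
    print(top30.shape, top60.shape, top120.shape)         # (20, 181) (20, 361) (20, 721)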