def runClustering(cluster_df, max_clusters=20, max_levels=20, random_state=None):
    """Cluster the rows of ``cluster_df`` with KMeans and label each row.

    Feature columns are prepared by ``_prepare_cluster_features``. KMeans is
    then fitted for every cluster count from 2 up to ``max_clusters`` and the
    fit with the highest average silhouette score is kept (the smallest
    cluster count wins ties).

    Args:
        cluster_df: pandas DataFrame to cluster. Columns whose name contains
            ``NOTMODEL`` (case-insensitive) are not used as features. Every
            other column is, including a pre-existing ``cluster`` column.
        max_clusters: Largest number of clusters to try. The effective upper
            bound is also capped at ``len(cluster_df) - 1`` because the
            silhouette score is undefined beyond that.
        max_levels: Non-numeric columns with more distinct values than this
            are dropped instead of being dummy-encoded.
        random_state: Passed to ``KMeans`` to make runs reproducible.

    Returns:
        The same ``cluster_df`` object, modified in place: a ``cluster``
        column holding the winning labels is added (or overwritten).

    Raises:
        ValueError: If there are too few rows to evaluate any cluster count,
            no usable feature column remains, or no fit yields at least two
            distinct clusters.
    """
    from sklearn.cluster import KMeans
    from sklearn.metrics import silhouette_score

    features = _prepare_cluster_features(cluster_df, max_levels)
    n_samples = features.shape[0]

    # silhouette_score is only defined for 2 <= n_clusters <= n_samples - 1,
    # and KMeans itself rejects n_clusters > n_samples.
    largest_k = min(max_clusters, n_samples - 1)
    if largest_k < 2:
        raise ValueError(
            'Need at least 3 rows and max_clusters >= 2 to select a '
            'clustering; got %d rows and max_clusters=%r'
            % (n_samples, max_clusters)
        )

    best_score = None
    best_labels = None
    for n_clusters in range(2, largest_k + 1):
        model = KMeans(n_clusters=n_clusters, random_state=random_state)
        labels = model.fit_predict(features)
        # With many duplicate rows KMeans can collapse to a single cluster,
        # for which the silhouette score is undefined.
        if labels.min() == labels.max():
            continue
        score = silhouette_score(features, labels)
        if best_score is None or score > best_score:
            best_score = score
            best_labels = labels

    if best_labels is None:
        raise ValueError(
            'KMeans did not produce at least two distinct clusters for any '
            'candidate cluster count'
        )

    # Keep the labels of the fit that was actually scored rather than
    # refitting, which could give a different partition for the same k.
    cluster_df['cluster'] = best_labels
    return cluster_df


def _prepare_cluster_features(cluster_df, max_levels=20):
    """Return a float feature matrix built from the model columns.

    Non-numeric columns with at most ``max_levels`` distinct values become
    dummy variables (prefix ``dm<column>``); those with more are dropped.
    Columns with more than two distinct values are z-score standardised
    (population standard deviation) and renamed ``z_<column>``. Remaining
    missing values are set to 0, which is the mean of a standardised column.
    The caller's DataFrame is left untouched.
    """
    import pandas as pd
    from pandas.api.types import is_numeric_dtype

    model_cols = [col for col in cluster_df.columns
                  if 'NOTMODEL' not in str(col).upper()]
    # reset_index gives an independent frame with a unique positional index,
    # so the edits below neither touch the caller's data nor depend on its
    # index labels being unique.
    X = cluster_df[model_cols].reset_index(drop=True)

    # Convert character columns to dummy variables, in a stable column order.
    char_cols = [col for col, dtype in X.dtypes.items()
                 if not is_numeric_dtype(dtype)]
    for col in char_cols:
        if len(X[col].unique()) <= max_levels:
            dummies = pd.get_dummies(X[col], prefix='dm' + str(col))
            X = pd.concat([X.drop(columns=col), dummies.astype(float)], axis=1)
        else:
            # Too many distinct values to encode usefully.
            X = X.drop(columns=col)

    # Standardise every non-dummy column. mean()/std() skip missing values,
    # so one NaN does not turn the whole column into NaN.
    for col in list(X.columns):
        values = X[col]
        if len(values.unique()) > 2:
            X['z_' + str(col)] = (values - values.mean()) / values.std(ddof=0)
            del X[col]

    if X.shape[1] == 0:
        raise ValueError('No usable feature columns left for clustering')

    # Missing values can be handled in many ways; here they become 0.
    return X.fillna(0).astype(float).values
# Comment list
# Table of contents