# ml_algorithms.py -- source file (Python code snippet)

def runClustering(cluster_df):
    from sklearn.cluster import KMeans
    from sklearn.metrics import silhouette_score as silhouette_score

    Xcols = [col for col in cluster_df.columns if 'NOTMODEL' not in col.upper()]

    # Convert character columns to dummy variables
    X = cluster_df[Xcols]
    cols = X.columns
    num_cols = X._get_numeric_data().columns
    char_cols = list(set(cols) - set(num_cols))
    for col in char_cols:
        if len(X[col].unique()) <= 20:
            dummy = pd.get_dummies(X[col], prefix='dm' + col)
            column_name = X.columns.values.tolist()
            column_name.remove(col)
            X = X[column_name].join(dummy)
        else:
            if col in X.columns:    # If more than 20 distinct values then delete
                del X[col]

    # Standardize (Z-score normalize) all continuous variables
    from scipy.stats import zscore
    for col in X:
        if len(X[col].unique()) > 2:    # Standardize non-dummy variables
            col_zscore = 'z_' + col
            X[col_zscore] = zscore(X[col])
            del X[col]

    # Fill missing values with 0 = the mean in the z-normalize data
    # Obviously missing values can be handled in many different ways
    X.fillna(0, inplace=True)

    # convert to matrix/numpy array to use in KMeans clustering class
    data_for_clustering_matrix = X.as_matrix()

    number_of_Clusters = []
    silhouette_value = []
    # Loop through 2 and 20 clusters and identify which has the highest silhouette score
    k = range(2, 21)
    for i in k:
        clustering_method = KMeans(n_clusters=i)
        clustering_method.fit(data_for_clustering_matrix)
        labels = clustering_method.predict(data_for_clustering_matrix)
        silhouette_average = silhouette_score(data_for_clustering_matrix, labels)
        silhouette_value.append(silhouette_average)
        number_of_Clusters.append(int(i))

        # maxind = np.argmax(silhouette_value)
        max_value = max(silhouette_value)
        indexMaxValue = silhouette_value.index(max_value)

        # FIT KMEANS CLUSTER MODEL WITH NUMBER OF CLUSTERS WITH HIGHEST SILHOUETTE SCORE
        clustering_method = KMeans(n_clusters=number_of_Clusters[indexMaxValue])
        clustering_method.fit(data_for_clustering_matrix)
        labels = clustering_method.predict(data_for_clustering_matrix)

        # SCORE THE DATAFRAME  score_df
        cluster_df['cluster'] = labels
        return cluster_df