# clustering_model_kmeans_external.py -- source file (Python code snippet)

def process_clustering(self):
        """Cluster the numeric columns of the prediction data with K-Means.

        Reads the active dataset entry from ``self.prediction_config`` and
        returns immediately (doing nothing else) when that entry has no
        non-empty ``"affiliation_column"``.

        Otherwise the method:

        * fits a ``KMeans`` model with ``CENTROIDS_QUANTITY`` clusters on the
          numeric columns of ``self.prediction_data``;
        * prints diagnostics (dtypes, NaN checks, cluster counts and a
          cross-tabulation of cluster labels against the affiliation column);
        * optionally plots outliers via ``self.example_plot_outliers`` when
          ``PLOT_KMEANS_OUTLIERS`` is set;
        * adds an ``"extremism"`` column (sum of the cubed distances of each
          row to every centroid) to the DataFrame **in place**, sorts the
          DataFrame by it in descending order **in place**, and stores the
          result as ``self.prediction_data.df_listings``.

        Returns:
            None
        """
        print("K-Means Clustering in progress...")

        dataset_choice = self.prediction_config.DATASET_LOCATION[self.prediction_config.DATASET_CHOICE]

        # Without an affiliation column there is nothing to cross-tabulate the
        # clusters against, so skip clustering entirely.
        if "affiliation_column" not in dataset_choice or not dataset_choice["affiliation_column"]:
            return

        # Explore loaded data
        df = self.prediction_data
        affiliation_column = dataset_choice["affiliation_column"]

        centroids_quantity = self.prediction_config.CENTROIDS_QUANTITY
        # Initialise K-Means Clustering Model using the specified quantity of clusters (centroids)
        # for training the model using the whole dataset. A fixed random_state keeps runs repeatable.
        kmeans_model = KMeans(n_clusters=centroids_quantity, random_state=1)

        # K-Means works on numeric features only, so object (string) columns are dropped.
        df_numeric = df.select_dtypes(include=['int', 'int64', 'float64', 'floating'], exclude=['O'])
        print("Excluding non-numeric columns from K-Means Clustering: ", df.select_dtypes(include=['O']).columns.tolist())

        print("All dtypes: ", dict(df.dtypes))
        print("Any rows null?: ", df.isnull().values.any())
        print("Columns/rows with NaN values: ", df[df.isnull().any(axis=1)])

        # Fit the K-Means Model to the DataFrame and transform it into cluster-distance space:
        # the result is a Numpy array with one column per cluster (centroid), holding the
        # distance of each row to that centroid.
        # Important Note: Pass only numeric dataframe columns
        clustered_row_distances = kmeans_model.fit_transform(df_numeric)

        # Explore clusters by computing a cross-tabulation of the quantity of rows assigned to each
        # cluster and checking how they correspond to the unique values of the Affiliation column
        # (i.e. 'party')
        labels = kmeans_model.labels_
        # `labels` holds one entry per row, so its length is the row count; the
        # number of distinct values is the number of clusters actually used.
        print("Clusters total count: %r" % (len(labels)))
        print("Clusters unique count: %r" % (len(set(labels.tolist()))))
        # Name every cluster, including cluster 0. (A plain truthiness test on the
        # label would treat the valid label 0 as "missing" and yield None for it.)
        cluster_names = ["Cluster " + str(label) for label in labels]

        print("Cross Tabulation between Clustered Labels and Affiliation i.e. 'party' column: \n%r" % (pd.crosstab(index=labels, columns=df[affiliation_column])))

        if self.prediction_config.PLOT_KMEANS_OUTLIERS:
            self.example_plot_outliers(df, affiliation_column, labels, cluster_names, clustered_row_distances)

        # Generate new DataFrame column to be used as Target Column for Prediction Algorithms
        # (i.e. to detect which roll call votes were most likely to cause extremism such
        # that Senators would not vote along their own party lines).
        # Cubing the distances before summing exaggerates the contribution of large distances.
        extremism = (clustered_row_distances ** 3).sum(axis=1)
        df["extremism"] = extremism
        df.sort_values("extremism", inplace=True, ascending=False)
        print("Top 10 observations ranked in order of 'extremism': %r" % (df.head(10)))
        # NOTE(review): `df` is the same object as `self.prediction_data`, so this stores
        # the DataFrame as an attribute on itself -- confirm this is what callers expect.
        self.prediction_data.df_listings = df