def process_clustering(self):
    """Cluster the prediction data with K-Means and derive an 'extremism' score.

    Looks up the active dataset entry in ``self.prediction_config`` and
    returns early (after printing the progress banner) when that entry has
    no non-empty ``"affiliation_column"``. Otherwise it:

    1. fits ``KMeans`` with ``CENTROIDS_QUANTITY`` clusters on the numeric
       columns of ``self.prediction_data``;
    2. prints diagnostics and a cross-tabulation of cluster label against
       the affiliation column;
    3. calls ``self.example_plot_outliers`` when ``PLOT_KMEANS_OUTLIERS``
       is set;
    4. adds an ``"extremism"`` column (sum over all centroids of the cubed
       distance to that centroid) and sorts the frame by it, descending.

    Side effects:
        ``self.prediction_data`` is modified in place (new column, re-sorted)
        and the same frame is assigned to ``self.prediction_data.df_listings``.

    Returns:
        None.
    """
    print("K-Means Clustering in progress...")
    config = self.prediction_config
    dataset_choice = config.DATASET_LOCATION[config.DATASET_CHOICE]
    # Clustering is only meaningful when there is an affiliation column to
    # compare the clusters against; skip silently otherwise.
    if "affiliation_column" not in dataset_choice or not dataset_choice["affiliation_column"]:
        return
    affiliation_column = dataset_choice["affiliation_column"]

    df = self.prediction_data

    # Initialise the K-Means model with the configured number of clusters
    # (centroids); it is trained on the whole dataset. The fixed random_state
    # keeps the centroid initialisation reproducible between runs.
    kmeans_model = KMeans(n_clusters=config.CENTROIDS_QUANTITY, random_state=1)

    # K-Means needs numeric input, so object (string) columns are left out.
    df_numeric = df.select_dtypes(include=['int', 'int64', 'float64', 'floating'], exclude=['O'])
    print("Excluding non-numeric columns from K-Means Clustering: ", df.select_dtypes(include=['O']).columns.tolist())
    print("All dtypes: ", dict(df.dtypes))
    print("Any rows null?: ", df.isnull().values.any())
    print("Columns/rows with NaN values: ", df[df.isnull().any(axis=1)])

    # fit_transform returns an array with one row per observation and one
    # column per centroid; each value is the distance from that row to that
    # centroid.
    clustered_row_distances = kmeans_model.fit_transform(df_numeric)

    # labels_ holds the index of the cluster each row was assigned to.
    labels = kmeans_model.labels_
    print("Clusters total count: %r" % (len(labels)))
    print("Clusters unique count: %r" % (len(set(labels.tolist()))))

    # Name every cluster, including cluster 0 (a truthiness test on the
    # label would wrongly turn label 0 into None).
    cluster_names = ["Cluster " + str(label) for label in labels]

    # Cross-tabulate cluster membership against the affiliation column to see
    # how well the clusters line up with the known groups.
    print("Cross Tabulation between Clustered Labels and Affiliation i.e. 'party' column: \n%r" % (pd.crosstab(index=labels, columns=df[affiliation_column])))

    if config.PLOT_KMEANS_OUTLIERS:
        self.example_plot_outliers(df, affiliation_column, labels, cluster_names, clustered_row_distances)

    # Generate a new column to be used as the target for prediction
    # algorithms. Cubing the distances before summing makes rows that are far
    # from the centroids stand out much more than rows that are close.
    extremism = (clustered_row_distances ** 3).sum(axis=1)
    df["extremism"] = extremism
    df.sort_values("extremism", inplace=True, ascending=False)
    print("Top 10 observations ranked in order of 'extremism': %r" % (df.head(10)))

    # NOTE(review): df is self.prediction_data itself, so this stores the
    # frame as an attribute on itself - confirm this is the intended target.
    self.prediction_data.df_listings = df
clustering_model_kmeans_external.py 文件源码
python
阅读 18
收藏 0
点赞 0
评论 0
评论列表
文章目录