def feature_agglomeration(df):
    """Reduce the columns of ``df`` by hierarchical feature clustering.

    Fits scikit-learn's ``FeatureAgglomeration`` with
    ``int(n_columns / 1.2)`` clusters, prints the fitted labels, children
    and leaf count, draws a dendrogram through ``HousePrices.dendrogram``
    and returns the transformed data.

    Args:
        df: DataFrame whose columns are clustered. A copy is used
            internally. When a ``SalePrice`` column is present, rows
            containing missing values are dropped before fitting;
            otherwise all rows are passed to the estimator as they are.

    Returns:
        A new DataFrame wrapping the transformed array, with default
        integer row and column labels (the original labels are not kept).
    """
    # Local import keeps scikit-learn out of module import time.
    from sklearn.cluster import FeatureAgglomeration

    frame = df.copy()

    # TODO: find the optimal number of clusters for the feature clustering.
    number_of_clusters = int(frame.shape[1] / 1.2)
    agglomerated_features = FeatureAgglomeration(n_clusters=number_of_clusters)

    if 'SalePrice' in frame.columns:
        # Drop incomplete rows once and take the target from those same
        # rows, so X and y always have the same number of samples.
        complete_rows = frame.dropna()
        res = agglomerated_features.fit_transform(
            complete_rows.values, y=complete_rows['SalePrice'].values)
    else:
        res = agglomerated_features.fit_transform(frame.values)

    # TODO: in case of adding values using df.loc[], remember a mask is only
    # possible for a single feature at a time.
    print(f"labels:{agglomerated_features.labels_}")
    print(f"Children:{agglomerated_features.children_}")
    print(f"number of leaves in the hierarchical tree:{agglomerated_features.n_leaves_}")

    # The dendrogram receives the full copy, not the NaN-filtered rows.
    HousePrices.dendrogram(frame, number_of_clusters, agglomerated_features.labels_)

    return pd.DataFrame(data=res)
# Comment list
# Table of contents