stratified_rand.py 文件源码-python代码片段

def create_stratum(self, column_names, **kwargs):
        '''
        Use affinity propagation to find number of strata for each column. 
        column_names is a list of the covariates to be split into strata and 
        used for classification. This funciton adds a column to the data frame
        for each column as column_name_strata that gives the strata designation
        for that variable.  The whole data frame is returned.
        '''

        for colname in column_names:
            X = self.data[colname].reshape(-1, 1)

            if np.isnan(X).any():
                raise ValueError("There are NaN values in self.data[%s] that the \
                                  clustering algorithm can't handle" % colname)

            elif np.unique(self.data[colname]).shape[0] <=2:
                string_name = colname+'_strata'
                self.data[string_name] = self.data[colname].astype(int)

            else:
                af_model = AP(damping = 0.9)
                strata_groups = af_model.fit(X)

                #cluster_centers_indices = af.cluster_centers_indices_
                #n_clusters_ = len(cluster_centers_indices)

                string_name = colname+'_strata'
                self.data[string_name] = strata_groups.labels_

        return self.data


    #In the main function, you need to call create_stratum before create_unique_strata