def _create_significance_table(self, data):
    """
    Create a table containing p values for significance tests. Add features of
    the distributions and the p values to the dataframe.

    Parameters
    ----------
    data : pandas.DataFrame
        Dataset holding the ``self.groupby`` column plus one column for every
        variable named in ``self.continuous`` and ``self.categorical``.

    Returns
    -------
    pandas.DataFrame
        One row per variable (index named ``'variable'``) with the columns
        ``continuous``, ``nonnormal``, ``min_observed``, ``pval`` and
        ``ptest``. ``pval`` and ``ptest`` hold whatever ``self._p_test``
        returns for the variable.
    """
    # list features of the variable e.g. matched, paired, n_expected
    df = pd.DataFrame(
        index=self.continuous + self.categorical,
        columns=['continuous', 'nonnormal', 'min_observed', 'pval', 'ptest'])
    df.index.name = 'variable'
    # Index.isin already yields a boolean mask, one entry per variable.
    df['continuous'] = df.index.isin(self.continuous)
    df['nonnormal'] = df.index.isin(self.nonnormal)

    # list values for each variable, grouped by groupby levels
    for v in df.index:
        # Coerce to plain bool and negate with `not`: bitwise `~` is only a
        # logical negation for numpy booleans; on a Python bool it yields
        # -1 / -2, which are both truthy.
        is_continuous = bool(df.loc[v, 'continuous'])
        is_categorical = not is_continuous
        is_normal = not bool(df.loc[v, 'nonnormal'])

        if is_continuous:
            # group the non-missing observations into one array per level
            catlevels = None
            grouped_data = [
                data[data[self.groupby] == s].dropna(subset=[v])[v].values
                for s in self.groupbylvls
            ]
            min_observed = min(len(values) for values in grouped_data)
        else:
            # categorical: build a contingency table (groupby level x category)
            catlevels = sorted(data[v].astype('category').cat.categories)
            grouped_data = pd.crosstab(data[self.groupby], data[v])
            min_observed = grouped_data.sum(axis=1).min()

        # minimum number of observations across all levels
        df.loc[v, 'min_observed'] = min_observed

        # compute p value and the name of the test that produced it
        df.loc[v, 'pval'], df.loc[v, 'ptest'] = self._p_test(
            v, grouped_data, is_continuous, is_categorical,
            is_normal, min_observed, catlevels)

    return df
# (Web-page residue, not part of the code: "Comment list" / "Table of contents".)