def tidyfy_df(df):
    """
    Return a tidy (long-format) copy of the aggregate ubiquity and abundance
    columns of df, melted around the 'otu' column.

    Input df should have an 'otu' column plus measure columns labeled like
    "ubiquity_calc_type_patients" or "abundance_calc_type_patients", where
    the first underscore-delimited value is "abundance" or "ubiquity" and the
    last one is "dis", "h", or "total" (or some other patient type
    indicator). The middle values are the type of calculation used
    (e.g. "from_pooled_calc", "mean_of_datasets").

    Column selection:
      - only string labels starting with 'ubiquity' or 'abundance' are
        melted; every other column (including non-string labels) is ignored
      - columns whose name contains 'in_one_dataset' are discarded

    The returned dataframe has the columns 'otu', 'variable' and 'value'
    produced by pd.melt (with duplicated rows dropped), plus:
      - 'metric': first underscore-delimited token of 'variable'
      - 'calculation': the tokens between the first and the last one,
        re-joined with '_'; empty string if the name has no middle part
      - 'patient': last underscore-delimited token of 'variable'
    """
    id_vars = ['otu']
    # The isinstance guard keeps non-string labels (e.g. integer column
    # names) from raising AttributeError on .startswith.
    value_vars = [
        col for col in df.columns
        if isinstance(col, str)
        and col.startswith(('ubiquity', 'abundance'))
        and 'in_one_dataset' not in col
    ]

    tidydf = pd.melt(df, id_vars=id_vars, value_vars=value_vars).drop_duplicates()

    variable = tidydf['variable']
    tidydf['metric'] = variable.apply(lambda name: name.split('_')[0])
    # Take only the tokens strictly between metric and patient, so a name
    # without a middle part (e.g. "abundance_total") gives '' rather than
    # repeating the patient token, and a name without any underscore does
    # not raise IndexError.
    tidydf['calculation'] = variable.apply(
        lambda name: '_'.join(name.split('_')[1:-1]))
    tidydf['patient'] = variable.apply(lambda name: name.split('_')[-1])
    return tidydf
# Web-page residue (not code): "Comment list"
# Web-page residue (not code): "Table of contents"