# ubiquity_abundance.py -- file source (Python code snippet)

def _split_measure_name(name):
    """Split a measure column name into (metric, calculation, patient)."""
    tokens = name.split('_')
    metric = tokens[0]
    # With a single token there is no patient suffix to report.
    patient = tokens[-1] if len(tokens) > 1 else ''
    # Everything strictly between the first and last token; empty when the
    # name has fewer than three tokens (so it never duplicates `patient`).
    calculation = '_'.join(tokens[1:-1])
    return metric, calculation, patient


def tidyfy_df(df):
    """
    Return a tidy (long-format) df keyed on 'otu' for the aggregate ubiquity
    and abundance measures.

    Input df should have an 'otu' column plus columns labeled like
    "ubiquity_calc_type_patients" or "abundance_calc_type_patients", where the
    first underscore-delimited value is "abundance" or "ubiquity" and the last
    one is "dis", "h", or "total" (or some other patient type indicator). The
    middle values are the type of calculation used (e.g. "from_pooled_calc",
    "mean_of_datasets").

    Note that columns with 'in_one_dataset' are discarded, as are columns
    whose label is not a string or does not start with "ubiquity" or
    "abundance".

    Parameters
    ----------
    df : pandas.DataFrame
        Wide table with one column per measure.

    Returns
    -------
    pandas.DataFrame
        Columns 'otu', 'variable', 'value', 'metric', 'calculation' and
        'patient'. Exact duplicate (otu, variable, value) rows are dropped.
        For a column name with fewer than three underscore-delimited tokens,
        'calculation' is the empty string; for a name with no underscore,
        'patient' is the empty string as well.
    """
    id_vars = ['otu']
    value_vars = [
        col for col in df.columns
        # Non-string labels (e.g. ints) cannot be measure columns.
        if isinstance(col, str)
        and col.startswith(('ubiquity', 'abundance'))
        and 'in_one_dataset' not in col
    ]

    # Name the melted columns explicitly: by default pd.melt uses
    # df.columns.name (when set) instead of 'variable'.
    tidydf = pd.melt(
        df,
        id_vars=id_vars,
        value_vars=value_vars,
        var_name='variable',
        value_name='value',
    ).drop_duplicates()

    # Parse each distinct column name once rather than once per row.
    parsed = {name: _split_measure_name(name) for name in value_vars}
    variable = tidydf['variable']
    for position, field in enumerate(('metric', 'calculation', 'patient')):
        lookup = {name: parts[position] for name, parts in parsed.items()}
        tidydf[field] = variable.map(lookup)

    return tidydf