Python melt() example source code

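The snippets below, collected from a number of open-source projects, show `pandas.melt` reshaping wide DataFrames into long (tidy) form. As a baseline for the project code that follows, here is a minimal, self-contained sketch of the transformation (the column names are illustrative):

import pandas as pd

# One row per id, one column per measurement (wide format).
df = pd.DataFrame({'id': [1, 2], 'A': [10, 20], 'B': [30, 40]})

# Keep `id` as an identifier; stack A and B into variable/value pairs (long format).
long_df = pd.melt(df, id_vars=['id'], value_vars=['A', 'B'],
                  var_name='measure', value_name='reading')
print(long_df)
#    id measure  reading
# 0   1       A       10
# 1   2       A       20
# 2   1       B       30
# 3   2       B       40
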
glm.py (project: diamond, author: stitchfix)
def _create_main_design(self, **kwargs):
        r"""
        Create design matrix for main effects
        Keyword Args:
            * *df* (``DataFrame``): specify a new dataframe to create the
                design matrix from
        Returns:
            array_like: design matrix in sparse CSR format

        """
        df = kwargs.get('df', self.train_df)
        df.reset_index(drop=True, inplace=True)
        df['row_index'] = df.index
        df['intercept'] = 1.0  # assume intercept is always included

        id_cols = ['row_index']

        melted_df = pd.melt(df[id_cols + self.main_effects], id_cols)
        melted_df = melted_df.merge(self.main_map, on='variable')
        melted_df['col_index'] = melted_df['main_idx']
        row = melted_df.row_index
        col = melted_df.col_index
        data = melted_df.value
        return sparse.coo_matrix((data, (row, col)),
                                 shape=(max(row) + 1, max(col) + 1)).tocsr()
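
The pattern above (melt, then map each melted row to a matrix coordinate) is a compact way to assemble a sparse design matrix: every melted row contributes one nonzero entry at (row_index, col_index). A minimal sketch of the same idea, with a hypothetical two-feature frame standing in for diamond's main_map lookup:

import pandas as pd
from scipy import sparse

df = pd.DataFrame({'row_index': [0, 1], 'x1': [1.0, 2.0], 'x2': [3.0, 4.0]})
melted = pd.melt(df, id_vars=['row_index'])
# Assign each melted variable a column position in the design matrix.
melted['col_index'] = melted['variable'].map({'x1': 0, 'x2': 1})
X = sparse.coo_matrix(
    (melted['value'], (melted['row_index'], melted['col_index'])),
    shape=(melted['row_index'].max() + 1, melted['col_index'].max() + 1)).tocsr()
print(X.toarray())  # [[1. 3.] [2. 4.]]
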
plots.py (project: Comparative-Annotation-Toolkit, author: ComparativeGenomicsToolkit)
def missing_rate_plot(consensus_data, ordered_genomes, biotypes, missing_plot_tgt):
    """Missing genes/transcripts"""
    base_title = 'Number of missing orthologs in consensus set'
    gene_missing_df = json_biotype_counter_to_df(consensus_data, 'Gene Missing')
    gene_missing_df.columns = ['biotype', 'Genes', 'genome']
    transcript_missing_df = json_biotype_counter_to_df(consensus_data, 'Transcript Missing')
    transcript_missing_df.columns = ['biotype', 'Transcripts', 'genome']
    df = transcript_missing_df.merge(gene_missing_df, on=['genome', 'biotype'])
    df = pd.melt(df, id_vars=['biotype', 'genome'])
    ylabel = 'Number of genes or transcripts'
    with missing_plot_tgt.open('w') as outf, PdfPages(outf) as pdf:
        tot_df = df.groupby(['genome', 'biotype', 'variable']).aggregate(sum).reset_index()
        generic_barplot(tot_df, pdf, '', ylabel, base_title, x='genome', y='value',
                        col='variable', row_order=ordered_genomes)
        for biotype in biotypes:
            biotype_df = biotype_filter(df, biotype)
            if biotype_df is None:
                continue
            biotype_df = biotype_df.groupby(['genome', 'variable']).aggregate(sum).reset_index()
            title = base_title + ' for biotype {}'.format(biotype)
            generic_barplot(biotype_df, pdf, '', ylabel, title, x='genome', y='value',
                            col='variable', row_order=ordered_genomes)
test_reshape.py (project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda, author: SignalMedia)
def test_custom_var_name(self):
        result5 = melt(self.df, var_name=self.var_name)
        self.assertEqual(result5.columns.tolist(), ['var', 'value'])

        result6 = melt(self.df, id_vars=['id1'], var_name=self.var_name)
        self.assertEqual(result6.columns.tolist(), ['id1', 'var', 'value'])

        result7 = melt(self.df, id_vars=['id1', 'id2'], var_name=self.var_name)
        self.assertEqual(result7.columns.tolist(), ['id1', 'id2', 'var',
                                                    'value'])

        result8 = melt(self.df, id_vars=['id1', 'id2'], value_vars='A',
                       var_name=self.var_name)
        self.assertEqual(result8.columns.tolist(), ['id1', 'id2', 'var',
                                                    'value'])

        result9 = melt(self.df, id_vars=['id1', 'id2'], value_vars=['A', 'B'],
                       var_name=self.var_name)
        expected9 = DataFrame({'id1': self.df['id1'].tolist() * 2,
                               'id2': self.df['id2'].tolist() * 2,
                               self.var_name: ['A'] * 10 + ['B'] * 10,
                               'value': (self.df['A'].tolist() +
                                         self.df['B'].tolist())},
                              columns=['id1', 'id2', self.var_name, 'value'])
        tm.assert_frame_equal(result9, expected9)
ubiquity_abundance.py (project: microbiomeHD, author: cduvallet)
def tidyfy_df(df):
    """
    Return a tidy df melted around 'otu', with the aggregate ubiquity and
    abundance measures.

    Input df should have columns labeled like "ubiquity_calc_type_patients" or
    "abundance_calc_type_patients", where the first underscore-delimited value
    is "abundance" or "ubiquity" and the last one is "dis", "h", or "total"
    (or some other patient type indicator). The middle values name the type of
    calculation used (e.g. "from_pooled_calc", "mean_of_datasets").

    Note that columns with 'in_one_dataset' are discarded.
    """

    id_vars = ['otu']
    value_vars = [i for i in df.columns if i.startswith('ubiquity') or i.startswith('abundance')]
    value_vars = [i for i in value_vars if 'in_one_dataset' not in i]

    tidydf = pd.melt(df, id_vars=id_vars, value_vars=value_vars).drop_duplicates()

    tidydf['metric'] = tidydf['variable'].apply(lambda x: x.split('_')[0])
    tidydf['calculation'] = tidydf['variable'].apply(lambda x: x.split('_',1)[1].rsplit('_',1)[0])
    tidydf['patient'] = tidydf['variable'].apply(lambda x: x.split('_')[-1])

    return tidydf
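
For a column named like "ubiquity_from_pooled_calc_dis", the three split expressions above recover the metric, calculation, and patient parts:

name = 'ubiquity_from_pooled_calc_dis'
name.split('_')[0]                        # 'ubiquity' (metric)
name.split('_', 1)[1].rsplit('_', 1)[0]   # 'from_pooled_calc' (calculation)
name.split('_')[-1]                       # 'dis' (patient)
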
process.py (project: gini-index, author: datasets)
def main():
    giniIndex = pd.read_csv(source)
    giniIndex.to_csv('archive/gini-index.csv', sep=",", index=False)  # index_col is a read_csv option, not valid in to_csv
    print("Saved archive CSV file.")
    print (giniIndex)

    # Processing the data
    df = pd.read_csv('archive/gini-index.csv')      # Reading the source csv
    """
    Python is printing "Country Name" with quotes in data frame and does not
    work for the remaining code
    """
    df.columns.values[0] = 'Country Name'

    df = pd.melt(df, id_vars=['Country Name', 'Country Code'], var_name="Year", value_name="Value")     # Unpivoting
    df = df.sort_values(by=['Country Name', 'Year'], ascending=[True, True]) # Sorting by country

    df.dropna().to_csv('data/gini-index.csv', sep=",", index=False)   # Saving CSV
    print ("File has been saved and it is ready for data packaging.")
bubble_plot.py (project: bubble_plot, author: shirmeir)
def plot_with_z(df, x, y, z_boolean, bins_x, bins_y, x_is_numeric, y_is_numeric, ordered_x_values, ordered_y_values, maximal_bubble_size=4000, normalization_by_all=False):
    count_table = pd.concat([pd.cut(df[x], bins=bins_x) if x_is_numeric else df[x],
                         pd.cut(df[y], bins=bins_y) if y_is_numeric else df[y], df[z_boolean]], axis=1)
    count_table = count_table.groupby([x,z_boolean])[y].value_counts().unstack().fillna(0)
    count_table = count_table.unstack()
    count_table_long = pd.melt(count_table.reset_index(), id_vars=x)
    z_boolean_values = count_table_long[z_boolean].unique()
    ratio = pd.DataFrame({'ratio':count_table_long.set_index([x,y,z_boolean]).unstack()['value'][z_boolean_values[1]] / (
    count_table_long.set_index([x,y,z_boolean]).unstack()['value'].sum(axis=1) )})
    count_table_long = count_table_long.set_index([x, y ])[['value']].merge(ratio, left_index=True, right_index=True).reset_index()
    size_factor = maximal_bubble_size/count_table_long['value'].max()
    x_values_dict = {x:i for i, x in enumerate(ordered_x_values)} \
        if not x_is_numeric else {xx:get_point(xx) for xx in ordered_x_values}
    y_values_dict = {x:i for i, x in enumerate(ordered_y_values)} \
        if not y_is_numeric else {xx: get_point(xx) for xx in ordered_y_values}
    xticks = np.arange(len(ordered_x_values)) if not x_is_numeric else [get_point(xx) for xx in ordered_x_values]
    yticks = np.arange(len(ordered_y_values)) if not y_is_numeric else [get_point(xx) for xx in ordered_y_values]
    xticklabels = ordered_x_values if not x_is_numeric else [get_point(xx) for xx in ordered_x_values]
    yticklabels = ordered_y_values if not y_is_numeric else [get_point(xx) for xx in ordered_y_values]
    count_table_long[x] = count_table_long[x].map(x_values_dict)
    count_table_long[y] = count_table_long[y].map(y_values_dict)
    plt.scatter(count_table_long[x], count_table_long[y], s=size_factor*count_table_long['value'],
                c=count_table_long['ratio'],  alpha=0.5,
                cmap='cool')
    return count_table_long, xticks, yticks, xticklabels, yticklabels
plotting.py (project: fitbit-analyzer, author: 5agado)
def _plotWeekdayStats(stats, columns, groupBy=True):
    dataToPlot = stats.copy()
    # Group by weekday and rename date column
    if groupBy:
        dataToPlot = dataToPlot.groupby(stats['date'].dt.weekday).mean()
        dataToPlot = dataToPlot.reset_index().rename(columns={'date':'weekday'})

    # change stats from columns to row attribute
    dataToPlot = pd.melt(dataToPlot, id_vars=['weekday'], value_vars=columns,
                         var_name='stats', value_name='val')
    # Rename stats and weekdays
    dataToPlot['stats'].replace(NAMES, inplace=True)
    dataToPlot['weekday'].replace(dayOfWeek, inplace=True)
    # Plot
    g = sns.factorplot(data=dataToPlot, x="weekday", y="val", col="stats",
                       order=dayOfWeekOrder, kind="point", sharey=False, col_wrap=3)
    g.set_xticklabels(rotation=45)
    g.set(xlabel='')
    return g
    #sns.plt.show()
helpers.py (project: quail, author: ContextLab)
def format2tidy(df, subjname, listname, subjgroup, **attrs):

    melted_df = pd.melt(df.T)
    melted_df[subjname]=""
    for idx,sub in enumerate(melted_df['Subject'].unique()):
        melted_df.loc[melted_df['Subject']==sub,subjname]=subjgroup[idx]
    if attrs['analysis_type'] in ['spc']:
        base = list(df.columns)
        melted_df['Position'] = base * int(melted_df.shape[0] / len(base))
        melted_df.columns = ['Subject', listname, 'Proportion Recalled', subjname, 'Position']
    elif attrs['analysis_type'] in ['pfr', 'pnr']:
        base = list(df.columns)
        melted_df['Position'] = base * int(melted_df.shape[0] / len(base))
        melted_df.columns = ['Subject', listname, 'Probability of Recall: Position ' + str(attrs['n']), subjname, 'Position']
    elif attrs['analysis_type'] == 'lagcrp':  # '==' not 'is': 'is' tests identity, not string equality
        base = list(range(int(-len(df.columns.values)/2), int(len(df.columns.values)/2)+1))
        melted_df['Position'] = base * int(melted_df.shape[0] / len(base))
        melted_df.columns = ['Subject', listname, 'Conditional Response Probability', subjname, 'Position']
    elif attrs['analysis_type'] in ('fingerprint', 'fingerprint_temporal'):
        base = list(df.columns.values)
        melted_df['Feature'] = base * int(melted_df.shape[0] / len(base))
        melted_df.columns = ['Subject', listname, 'Clustering Score', subjname, 'Feature']
    elif attrs['analysis_type'] == 'accuracy':
        melted_df.columns = ['Subject', listname, 'Accuracy', subjname]
    elif attrs['analysis_type'] == 'temporal':
        melted_df.columns = ['Subject', listname, 'Temporal Clustering Score', subjname]


    return melted_df
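
Note that `pd.melt(df.T)` melts the transpose, so each original row (one subject) becomes a block of long-format rows. A minimal sketch, assuming the input's index is named 'Subject' so that melt adopts it as the variable column name:

import pandas as pd

df = pd.DataFrame([[0.1, 0.2], [0.3, 0.4]],
                  index=pd.Index(['s1', 's2'], name='Subject'))
print(pd.melt(df.T))
#   Subject  value
# 0      s1    0.1
# 1      s1    0.2
# 2      s2    0.3
# 3      s2    0.4
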
utils.py (project: kmeans-service, author: MAYHEM-Lab)
def plot_aic_bic_fig(tasks):
    """
    Creates AIC-BIC plot, as a 2-row x 3-col grid of point plots with 95% confidence intervals.

    Parameters
    ----------
    tasks: list(dict)

    Returns
    -------
    Matplotlib Figure object
    """
    sns.set(context='talk', style='whitegrid')
    # Filter list of dicts to reduce the size of Pandas DataFrame
    df = pd.DataFrame(filter_dict_list_by_keys(tasks, ['k', 'covar_type', 'covar_tied', 'bic', 'aic']))
    df['covar_type'] = [x.capitalize() for x in df['covar_type']]
    df['covar_tied'] = [['Untied', 'Tied'][x] for x in df['covar_tied']]
    df['aic'] = df['aic'].astype('float')
    df['bic'] = df['bic'].astype('float')
    df = pd.melt(df, id_vars=['k', 'covar_type', 'covar_tied'], value_vars=['aic', 'bic'], var_name='metric')
    f = sns.factorplot(x='k', y='value', col='covar_type', row='covar_tied', hue='metric', data=df,
                       row_order=['Tied', 'Untied'], col_order=['Full', 'Diag', 'Spher'], legend=True, legend_out=True,
                       ci=95, n_boot=100)
    f.set_titles("{col_name}-{row_name}")
    f.set_xlabels("Num. of Clusters (K)")
    return f.fig
eval_models.py (project: johnson-county-ddj-public, author: dssg)
def generate_metrics(self):
        """Given a model id and a set of thresholds, obtain the y values (true
        class and predicted probability) and calculate metrics for the
        model at each threshold.

        :returns: long-format table of metrics at each threshold, plus the
            overall AUC
        :rtype: pandas.DataFrame
        """
        # get the y-values
        y_values = self.get_y_values()

        # generate metrics at thresholds
        eval_metrics_pct = self.threshold_pct.apply(self.evaluate_model_at_threshold,
                                               args = (y_values['scores'],
                                                       y_values['y_true'],
                                                       True))
        eval_metrics_abs = self.threshold_abs.apply(self.evaluate_model_at_threshold,
                                               args = (y_values['scores'],
                                                       y_values['y_true'],
                                                       False))

        # build table of metrics
        eval_metrics = pd.concat([eval_metrics_pct, eval_metrics_abs])
        eval_metrics_long = pd.melt(eval_metrics, id_vars = ['parameter'],
                                    var_name = 'metric')
        eval_metrics_long['unique_timestamp'] = self.model_id
        auc = self.compute_AUC(y_values['y_true'], y_values['scores'])
        final_metrics = eval_metrics_long.append({'parameter': 'roc',
            'metric': 'auc',
            'value': auc,
            'unique_timestamp': self.model_id},
            ignore_index = True)
        metrics_cols = ['parameter', 'metric', 'value', 'unique_timestamp']
        final_metrics = final_metrics[metrics_cols]

        return(final_metrics)
test_logistic_simulated.py (project: diamond, author: stitchfix)
def test_setUp(self, tol=0.02):
        # assumes working directory is diamond/
        folder = "diamond/integration_tests/logistic"

        simulated_data_loc = "%s/simulated_logistic_df.csv" % folder
        estimated_covariance_loc = "%s/simulated_logistic_covariance.csv" % folder
        resources_exist = os.path.exists(simulated_data_loc) and os.path.exists(estimated_covariance_loc)
        if not resources_exist:
            logging.info("Simulating data and estimating covariances in R")
            os.system("/usr/local/bin/Rscript %s/logistic_generate_and_fit.R" % folder)
        logging.info("Reading in training data and R::lme4-estimated covariance matrix")
        df_train = pd.read_csv(simulated_data_loc)
        df_estimated_covariance = pd.read_csv(estimated_covariance_loc)

        self.model = LogisticRegression(train_df=df_train,
                                        priors_df=df_estimated_covariance,
                                        copy=True,
                                        test_df=None)
        logging.info("Fitting model in diamond")
        self.formula = "y ~ 1 + x + (1 + x | level)"
        results = self.model.fit(self.formula, tol=1e-4, verbose=True)

        # the format of the coefficient vector is:
        # fixed effects, then [random intercept, random slope] for each level
        beta_hat = np.append(results["fixed_effects"].value.values,
                             pd.melt(results["level"], "level").sort_values(["level", "variable"]).value.values)

        beta_true = pd.read_csv("%s/simulated_logistic_true_parameters.csv" % folder)["x"].values
        rel_error = np.mean((beta_hat - beta_true) ** 2) / np.mean(abs(beta_true))
        if rel_error > tol:
            logging.warn("relative error = %f > tolerance = %f" % (rel_error, tol))
        else:
            logging.info("relative error = %f < tolerance = %f" % (rel_error, tol))
        # make sure the coefficients are very close
        self.assertTrue(rel_error < tol)
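
The melt-and-sort step above flattens a per-level coefficient table into the documented order (for each level: random intercept, then random slope). A toy illustration with hypothetical level names:

import pandas as pd

coefs = pd.DataFrame({'level': ['a', 'b'],
                      'intercept': [0.1, 0.3],
                      'x': [0.2, 0.4]})
flat = pd.melt(coefs, 'level').sort_values(['level', 'variable'])
print(flat.value.values)  # [0.1 0.2 0.3 0.4]: a-intercept, a-x, b-intercept, b-x
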
VehicleScraper.py (project: statscraper, author: jplusplus)
def _clean_data(self, df, year, month):
        df = df.dropna(how='all', axis=1)
        df = df.dropna(how='all', axis=0)
        df = df.drop('Totalsumma', axis=1)
        df = df.rename(columns={'Unnamed: 1': 'vehicle_type'})
        df = df[df['vehicle_type'] != 'Totalsumma']
        df.loc[:, 'year'] = year
        df.loc[:, 'month'] = month
        df = pd.melt(df,
                     id_vars=['vehicle_type', 'month', 'year'],
                     value_vars=['AVREGISTRERAD', 'AVSTÄLLD', 'ITRAFIK'],
                     var_name='status')
        return df
plots.py (project: Comparative-Annotation-Toolkit, author: ComparativeGenomicsToolkit)
def tm_metrics_plot(tm_metrics, ordered_genomes, biotypes, transcript_biotype_map, tm_coverage_tgt, tm_identity_tgt):
    """plots for transMap coverage, identity"""
    tm_iter = zip(*[['transMap Coverage', 'transMap Identity'],
                    [tm_coverage_tgt, tm_identity_tgt]])
    for mode, tgt in tm_iter:
        df = dict_to_df_with_biotype(tm_metrics[mode], transcript_biotype_map)
        df = pd.melt(df, id_vars='biotype', value_vars=ordered_genomes).dropna()
        df.columns = ['biotype', 'genome', mode]
        cov_ident_plot(biotypes, ordered_genomes, mode, tgt, df, x=mode, y='genome')
plots.py (project: Comparative-Annotation-Toolkit, author: ComparativeGenomicsToolkit)
def consensus_support_plot(consensus_data, ordered_genomes, biotypes, modes, title, tgt):
    """grouped violin plots of original intron / intron annotation / exon annotation support"""
    def adjust_plot(g, this_title):
        g.set_xticklabels(rotation=90)
        g.fig.suptitle(this_title)
        g.fig.subplots_adjust(top=0.9)
        for ax in g.axes.flat:
            ax.set_ylabel('Percent supported')
            ax.set_ylim(-1, 101)

    dfs = []
    for i, mode in enumerate(modes):
        df = json_to_df_with_biotype(consensus_data, mode)
        if i > 0:
            df = df[mode]
        dfs.append(df)
    df = pd.concat(dfs, axis=1)
    df = pd.melt(df, value_vars=modes, id_vars=['genome', 'biotype'])
    with tgt.open('w') as outf, PdfPages(outf) as pdf:
        if len(ordered_genomes) > 1:
            g = sns.factorplot(data=df, y='value', x='genome', col='variable', col_wrap=2, kind='violin', sharex=True,
                               sharey=True, row_order=ordered_genomes, cut=0)
        else:
            g = sns.factorplot(data=df, y='value', x='variable', kind='violin', sharex=True,
                               sharey=True, row_order=ordered_genomes, cut=0)
        adjust_plot(g, title)
        multipage_close(pdf, tight_layout=False)
        title += ' for {}'
        for biotype in biotypes:
            this_title = title.format(biotype)
            biotype_df = biotype_filter(df, biotype)
            if biotype_df is not None:
                if len(ordered_genomes) > 1:
                    g = sns.factorplot(data=biotype_df, y='value', x='genome', col='variable', col_wrap=2,
                                       kind='violin', sharex=True, sharey=True, row_order=ordered_genomes, cut=0)
                else:
                    g = sns.factorplot(data=df, y='value', x='variable', kind='violin', sharex=True,
                                       sharey=True, row_order=ordered_genomes, cut=0)
                adjust_plot(g, this_title)
                multipage_close(pdf, tight_layout=False)
plots.py (project: Comparative-Annotation-Toolkit, author: ComparativeGenomicsToolkit)
def tx_modes_plot(consensus_data, ordered_genomes, tx_mode_plot_tgt):
    ordered_groups = ['transMap', 'transMap+TM', 'transMap+TMR', 'transMap+TM+TMR', 'TM', 'TMR', 'TM+TMR', 'CGP', 'PB',
                      'Other']
    ordered_groups = OrderedDict([[frozenset(x.split('+')), x] for x in ordered_groups])

    def split_fn(s):
        return ordered_groups.get(frozenset(s['Transcript Modes'].replace('aug', '').split(',')), 'Other')

    modes_df = json_biotype_counter_to_df(consensus_data, 'Transcript Modes')
    df = modes_df.pivot(index='genome', columns='Transcript Modes').transpose().reset_index()
    df['Modes'] = df.apply(split_fn, axis=1)
    df = df[['Modes'] + ordered_genomes]
    ordered_values = [x for x in ordered_groups.values() if x in set(df['Modes'])]  # values(), not py2-only itervalues()
    with tx_mode_plot_tgt.open('w') as outf, PdfPages(outf) as pdf:
        title_string = 'Transcript modes in protein coding consensus gene set'
        ylabel = 'Number of transcripts'
        if len(ordered_genomes) > 1:
            df['Ordered Modes'] = pd.Categorical(df['Modes'], ordered_values, ordered=True)
            df = df.sort_values('Ordered Modes')
            df = df[['Ordered Modes'] + ordered_genomes].set_index('Ordered Modes')
            df = df.fillna(0)
            generic_stacked_barplot(df, pdf, title_string, df.index, ylabel, ordered_genomes, 'Transcript mode(s)',
                                    bbox_to_anchor=(1.25, 0.7))

        else:
            generic_barplot(pd.melt(df, id_vars='Modes'), pdf, 'Transcript mode(s)', ylabel, title_string, x='Modes',
                            y='value', order=ordered_values)
plots.py (project: Comparative-Annotation-Toolkit, author: ComparativeGenomicsToolkit)
def indel_plot(consensus_data, ordered_genomes, indel_plot_tgt):
    with indel_plot_tgt.open('w') as outf, PdfPages(outf) as pdf:
        tm_df = pd.concat([pd.DataFrame.from_dict(consensus_data[genome]['transMap Indels'], orient='index').T
                           for genome in ordered_genomes])
        tm_df['genome'] = ordered_genomes
        tm_df['transcript set'] = ['transMap'] * len(tm_df)
        consensus_df = pd.concat([pd.DataFrame.from_dict(consensus_data[genome]['Consensus Indels'], orient='index').T
                                  for genome in ordered_genomes])
        consensus_df['genome'] = ordered_genomes
        consensus_df['transcript set'] = ['Consensus'] * len(consensus_df)
        df = pd.concat([consensus_df, tm_df])
        df = pd.melt(df, id_vars=['genome', 'transcript set'],
                     value_vars=['CodingDeletion', 'CodingInsertion', 'CodingMult3Indel'])
        df.columns = ['Genome', 'Transcript set', 'Type', 'Percent of transcripts']
        g = sns.factorplot(data=df, x='Genome', y='Percent of transcripts', col='Transcript set',
                           hue='Type', kind='bar', row_order=ordered_genomes,
                           col_order=['transMap', 'Consensus'])
        g.set_xticklabels(rotation=90)
        g.fig.subplots_adjust(top=.8)
        g.fig.suptitle('Coding indels')
        multipage_close(pdf, tight_layout=False)


###
# shared plotting functions
###
dcpg_train_viz.py (project: deepcpg, author: cangermueller)
def plot_lc(lc, metrics=None, outputs=False):
    lc = pd.melt(lc, id_vars=['split', 'epoch'], var_name='output')
    if metrics:
        if not isinstance(metrics, list):
            metrics = [metrics]
        tmp = '(%s)' % ('|'.join(metrics))
        lc = lc.loc[lc.output.str.contains(tmp)]
    metrics = lc.output[~lc.output.str.contains('_')].unique()
    lc['metric'] = ''

    for metric in metrics:
        lc.loc[lc.output.str.contains(metric), 'metric'] = metric
        lc.loc[lc.output == metric, 'output'] = 'mean'
        lc.output = lc.output.str.replace('_%s' % metric, '')
        lc.output = lc.output.str.replace('cpg_', '')

    if outputs:
        lc = lc.loc[lc.output != 'mean']
    else:
        lc = lc.loc[lc.output == 'mean']

    grid = sns.FacetGrid(lc, col='split', row='metric', hue='output',
                         sharey=False, size=3, aspect=1.2, legend_out=True)
    grid.map(mpl.pyplot.plot, 'epoch', 'value', linewidth=2)
    grid.set(ylabel='')
    grid.add_legend()
    return grid
dcpg_data_stats.py (project: deepcpg, author: cangermueller)
def plot_stats(stats):
    stats = stats.sort_values('frac_obs', ascending=False)
    stats = pd.melt(stats, id_vars=['output'], var_name='metric')
    #  stats = stats.loc[stats.metric.isin(['frac_obs', 'frac_one'])]
    #  stats.metric = stats.metric.str.replace('frac_obs', 'cov')
    #  stats.metric = stats.metric.str.replace('frac_one', 'met')
    grid = sns.FacetGrid(data=stats, col='metric', sharex=False)
    grid.map(sns.barplot, 'value', 'output')
    for ax in grid.axes.ravel():
        ax.set(xlabel='', ylabel='')
    return grid
test_reshape.py (project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda, author: SignalMedia)
def test_default_col_names(self):
        result = melt(self.df)
        self.assertEqual(result.columns.tolist(), ['variable', 'value'])

        result1 = melt(self.df, id_vars=['id1'])
        self.assertEqual(result1.columns.tolist(), ['id1', 'variable', 'value'
                                                    ])

        result2 = melt(self.df, id_vars=['id1', 'id2'])
        self.assertEqual(result2.columns.tolist(), ['id1', 'id2', 'variable',
                                                    'value'])
test_reshape.py (project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda, author: SignalMedia)
def test_value_vars(self):
        result3 = melt(self.df, id_vars=['id1', 'id2'], value_vars='A')
        self.assertEqual(len(result3), 10)

        result4 = melt(self.df, id_vars=['id1', 'id2'], value_vars=['A', 'B'])
        expected4 = DataFrame({'id1': self.df['id1'].tolist() * 2,
                               'id2': self.df['id2'].tolist() * 2,
                               'variable': ['A'] * 10 + ['B'] * 10,
                               'value': (self.df['A'].tolist() +
                                         self.df['B'].tolist())},
                              columns=['id1', 'id2', 'variable', 'value'])
        tm.assert_frame_equal(result4, expected4)
test_reshape.py (project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda, author: SignalMedia)
def test_custom_value_name(self):
        result10 = melt(self.df, value_name=self.value_name)
        self.assertEqual(result10.columns.tolist(), ['variable', 'val'])

        result11 = melt(self.df, id_vars=['id1'], value_name=self.value_name)
        self.assertEqual(result11.columns.tolist(), ['id1', 'variable', 'val'])

        result12 = melt(self.df, id_vars=['id1', 'id2'],
                        value_name=self.value_name)
        self.assertEqual(result12.columns.tolist(), ['id1', 'id2', 'variable',
                                                     'val'])

        result13 = melt(self.df, id_vars=['id1', 'id2'], value_vars='A',
                        value_name=self.value_name)
        self.assertEqual(result13.columns.tolist(), ['id1', 'id2', 'variable',
                                                     'val'])

        result14 = melt(self.df, id_vars=['id1', 'id2'], value_vars=['A', 'B'],
                        value_name=self.value_name)
        expected14 = DataFrame({'id1': self.df['id1'].tolist() * 2,
                                'id2': self.df['id2'].tolist() * 2,
                                'variable': ['A'] * 10 + ['B'] * 10,
                                self.value_name: (self.df['A'].tolist() +
                                                  self.df['B'].tolist())},
                               columns=['id1', 'id2', 'variable',
                                        self.value_name])
        tm.assert_frame_equal(result14, expected14)
test_reshape.py (project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda, author: SignalMedia)
def test_custom_var_and_value_name(self):

        result15 = melt(self.df, var_name=self.var_name,
                        value_name=self.value_name)
        self.assertEqual(result15.columns.tolist(), ['var', 'val'])

        result16 = melt(self.df, id_vars=['id1'], var_name=self.var_name,
                        value_name=self.value_name)
        self.assertEqual(result16.columns.tolist(), ['id1', 'var', 'val'])

        result17 = melt(self.df, id_vars=['id1', 'id2'],
                        var_name=self.var_name, value_name=self.value_name)
        self.assertEqual(result17.columns.tolist(), ['id1', 'id2', 'var', 'val'
                                                     ])

        result18 = melt(self.df, id_vars=['id1', 'id2'], value_vars='A',
                        var_name=self.var_name, value_name=self.value_name)
        self.assertEqual(result18.columns.tolist(), ['id1', 'id2', 'var', 'val'
                                                     ])

        result19 = melt(self.df, id_vars=['id1', 'id2'], value_vars=['A', 'B'],
                        var_name=self.var_name, value_name=self.value_name)
        expected19 = DataFrame({'id1': self.df['id1'].tolist() * 2,
                                'id2': self.df['id2'].tolist() * 2,
                                self.var_name: ['A'] * 10 + ['B'] * 10,
                                self.value_name: (self.df['A'].tolist() +
                                                  self.df['B'].tolist())},
                               columns=['id1', 'id2', self.var_name,
                                        self.value_name])
        tm.assert_frame_equal(result19, expected19)

        df20 = self.df.copy()
        df20.columns.name = 'foo'
        result20 = melt(df20)
        self.assertEqual(result20.columns.tolist(), ['foo', 'value'])
test_reshape.py (project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda, author: SignalMedia)
def test_multiindex(self):
        res = pd.melt(self.df1)
        self.assertEqual(res.columns.tolist(), ['CAP', 'low', 'value'])
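
With named MultiIndex columns, melt emits one variable column per level, which is what 'CAP' and 'low' refer to above. A minimal sketch of a frame that would satisfy this test:

import pandas as pd

cols = pd.MultiIndex.from_tuples([('A', 'a'), ('B', 'b')], names=['CAP', 'low'])
df1 = pd.DataFrame([[1, 2], [3, 4]], columns=cols)
pd.melt(df1).columns.tolist()  # ['CAP', 'low', 'value']
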
meta_analyze_stouffer.py (project: microbiomeHD, author: cduvallet)
def pvals_to_long(pvals):
    """
    Given dataframe with signed p-values, convert to longform
    with columns: otu, study, direction, pval, sample_size.

    Parameters
    ----------
    pvals : pandas DataFrame
        Genera in rows, studies in columns, signed pvalues in values.
        Positive indicates higher in disease, negative indicates higher in healthy.

    Returns
    -------
    longpvals : pandas DataFrame
        Tidy dataframe with columns otu, study, direction, and pval (for that
        direction)
    """
    pvals.index.name = 'otu'
    pvals = pvals.reset_index()
    longpvals = pd.melt(pvals, id_vars='otu', var_name='dataset',
                        value_name='signed_qvalue').dropna()

    # Convert all p-values to health-associated pvalue
    # Original p-values were calculated from KW test, making them two-sided.
    # If the pvalue is negative, then abs(p)/2 is the health-associated pval.
    # If the pvalue is positive, then 1 - abs(p)/2 is the health-associated
    # pvalue.
    p_to_healthy = lambda x: abs(x)/2.0 if x <= 0  else 1-abs(x)/2.0
    longpvals['q'] = longpvals['signed_qvalue'].map(p_to_healthy)
    longpvals['direction'] = 'healthy'

    # Now add the disease-associated qvalues
    disqs = copy.deepcopy(longpvals)
    disqs['direction'] = 'disease'
    disqs['q'] = 1 - disqs['q']

    longpvals = pd.concat((longpvals, disqs))

    return longpvals
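
As a quick check of the sign convention above: a signed value of -0.04 (higher in healthy) maps to a health-associated one-sided value of 0.02, while +0.04 maps to 0.98:

p_to_healthy = lambda x: abs(x)/2.0 if x <= 0 else 1 - abs(x)/2.0
p_to_healthy(-0.04)  # 0.02
p_to_healthy(0.04)   # 0.98
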
bubble_plot.py (project: bubble_plot, author: shirmeir)
def plot_without_z(df, x, y, z, count_table, bins_x, bins_y, x_is_numeric, y_is_numeric, ordered_x_values, ordered_y_values, normalization_by_all=False, log=False, maximal_bubble_size=4000):
    if normalization_by_all:
        count_table /= count_table.sum().sum()
    else:
        count_table = count_table.transpose()
        for col in count_table.columns:
            count_table[col] /= count_table[col].sum()
        count_table = count_table.transpose()
    if log:
        count_table = np.log(count_table)
        maximal_bubble_size /= 2
    size_factor = maximal_bubble_size/count_table.max().max()
    count_table_long = pd.melt(count_table.reset_index(), id_vars=x)
    x_values_dict = {x:i for i, x in enumerate(ordered_x_values)} \
        if not x_is_numeric else {xx:get_point(xx) for xx in ordered_x_values}
    y_values_dict = {x:i for i, x in enumerate(ordered_y_values)} \
        if not y_is_numeric else {xx: get_point(xx) for xx in ordered_y_values}
    xticks = np.arange(count_table.shape[0]) if not x_is_numeric else [get_point(xx) for xx in ordered_x_values]
    yticks = np.arange(count_table.shape[1]) if not y_is_numeric else [get_point(xx) for xx in ordered_y_values]
    xticklabels = ordered_x_values if not x_is_numeric else [get_point(xx) for xx in ordered_x_values]
    yticklabels = ordered_y_values if not y_is_numeric else [get_point(xx) for xx in ordered_y_values]
    count_table_long[x] = count_table_long[x].map(x_values_dict)
    count_table_long[y] = count_table_long[y].map(y_values_dict) 
    plt.scatter(count_table_long[x], count_table_long[y], s=size_factor*count_table_long['value'],
                c=count_table_long['value'], cmap='cool')

    return count_table_long, xticks, yticks, xticklabels, yticklabels
figure_3.py (project: Waskom_PNAS_2017, author: WagnerLabPapers)
def plot_points(df, axes):

    for exp, ax in zip(["dots", "sticks", "rest"], axes):

        exp_df = pd.melt(df.query("exp == @exp"),
                         "subj", ["within", "between"], "test", "corr")

        sns.pointplot(x="test", y="corr", hue="test", data=exp_df,
                      dodge=.5, join=False, ci=95,
                      palette=[".15", ".5"], ax=ax)
        plt.setp(ax.lines, linewidth=2)

        sns.pointplot(x="test", y="corr", hue="subj", data=exp_df,
                      palette=[".75"], scale=.75, ax=ax)
        plt.setp(ax.collections[:], facecolor="w", zorder=20)

        ax.legend_ = None
        ax.set(ylabel="",
               xlabel="",
               xticks=[-.1, 1.1],
               xticklabels=["Same\ncontext", "Different\ncontext"])

    axes[0].set(ylim=(0, .105), ylabel="Timeseries correlation (r)")
    axes[1].set(ylim=(0, .0525))
    axes[2].set(ylim=(0, .0525))

    for ax in axes:
        sns.despine(ax=ax, trim=True)
util.py (project: kotori, author: daq-tools)
def dataframe_wide_to_long_indexed(df, column):
    """
    Convert DataFrame from wide to long format using specified column as index column,
    followed by indexing the DataFrame on the very same column and finally sorting it.

    See also:

    - http://pandas.pydata.org/pandas-docs/stable/reshaping.html#reshaping-by-melt
    - http://stackoverflow.com/questions/17688155/complicated-for-me-reshaping-from-wide-to-long-in-pandas
    """
    df = pandas.melt(df, id_vars=column).dropna()
    df = dataframe_index_and_sort(df, column)
    return df
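
A hedged usage sketch (dataframe_index_and_sort is internal to kotori; an equivalent set_index/sort_index is inlined here for illustration):

import pandas

readings = pandas.DataFrame({'time': [2, 1], 'temp': [21.0, 20.5], 'hum': [42, 40]})
long_df = pandas.melt(readings, id_vars='time').dropna()
long_df = long_df.set_index('time').sort_index()  # stand-in for dataframe_index_and_sort
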
sgcount.py (project: sgrsea, author: bchen4)
def generatefinaltable(resultdic, totaldic, lib, dfile):
  '''
    Merge per-sublibrary count tables into the final count and summary tables.
  '''
  #Generate count table
  count_df = pd.DataFrame()
  for sublibname, c_df in resultdic.items():
    sublib = lib[sublibname]
    df = sublib.merge(c_df,on='Sequence',how='left')
    df = df.fillna(0)
    count_df = count_df.append(df)
  #Generate summary table
  count_columns = count_df.columns.tolist()
  count_columns.insert(0,count_columns.pop(count_columns.index('sgRNA')))
  count_columns.insert(1,count_columns.pop(count_columns.index('Gene')))
  count_columns.insert(2,count_columns.pop(count_columns.index('Sequence')))
  count_columns.insert(3,count_columns.pop(count_columns.index('sublib')))
  count_df = count_df.loc[:,count_columns]
  mapped_total = count_df.iloc[:,3:].groupby("sublib").sum().reset_index()
  mapped_total_df = pd.melt(mapped_total, id_vars=['sublib'], var_name='sample', value_name='mapped_reads')
  totalread_df = pd.DataFrame(totaldic.items(),columns=["filepath","total_reads"])
  if isinstance(dfile,pd.DataFrame):
    summary_df = dfile.merge(totalread_df,on="filepath")
    summary_df = summary_df.merge(mapped_total_df,on=['sublib','sample'])
  else:#single file
    summary_df = totalread_df
    summary_df = summary_df.join(mapped_total_df)
  summary_df['mapping_ratio'] = summary_df['mapped_reads']/summary_df['total_reads']
  summary_df = summary_df.loc[:,['filepath','sample','sublib','total_reads','mapped_reads','mapping_ratio']]
  return (count_df, summary_df)
plotting.py (project: fitbit-analyzer, author: 5agado)
def _prepareYearAndMonthStats(stats, columns):
    # Group by month and change stats from columns to row attribute
    dataToPlot = stats.groupby(stats['date'].dt.to_period("M")).mean()
    dataToPlot = pd.melt(dataToPlot.reset_index(), id_vars=['date'], value_vars=columns,
                         var_name='stats', value_name='val')
    # Rename stats
    dataToPlot['stats'].replace(NAMES, inplace=True)
    return dataToPlot

