utils_data.py 文件源码

python
阅读 27 收藏 0 点赞 0 评论 0

项目:MultiMAuS 作者: lmzintgraf 项目源码 文件源码
def get_data_stats(datasets):

    data_stats_cols = ['all', 'non-fraud', 'fraud']
    data_stats = pd.DataFrame(columns=data_stats_cols)

    data_stats.loc['transactions'] = [d.shape[0] for d in datasets]

    data_stats.loc['transactions/hour'] = [round(d['Local_Date'].apply(lambda x: x.hour).value_counts().sum()/24/366, 2) for d in datasets]
    data_stats.loc['transactions/day'] = [round(d['Local_Date'].apply(lambda x: x.day).value_counts().sum() / 366, 2) for d in datasets]
    data_stats.loc['transactions/week'] = [round(d['Local_Date'].apply(lambda x: x.week).value_counts().sum() / 52, 2) for d in datasets]
    data_stats.loc['transactions/month'] = [round(d['Local_Date'].apply(lambda x: x.month).value_counts().sum() / 12, 2) for d in datasets]

    data_stats.loc['cards'] = [len(d["CardID"].unique()) for d in datasets]
    data_stats.loc['cards, single use'] = [sum(d["CardID"].value_counts() == 1) for d in datasets]
    data_stats.loc['cards, multi use'] = [sum(d["CardID"].value_counts() > 1) for d in datasets]

    cards_genuine = datasets[1]['CardID'].unique()
    cards_fraud = datasets[2]['CardID'].unique()
    data_stats.loc['fraud cards in genuine'] = ['-', '-', len(np.intersect1d(cards_genuine, cards_fraud)) / len(cards_fraud)]

    data_stats.loc['first transaction'] = [min(d["Global_Date"]).date() for d in datasets]
    data_stats.loc['last transaction'] = [max(d["Global_Date"]).date() for d in datasets]

    data_stats.loc['min amount'] = [min(d["Amount"]) for d in datasets]
    data_stats.loc['max amount'] = [max(d["Amount"]) for d in datasets]
    data_stats.loc['avg amount'] = [np.average(d["Amount"]) for d in datasets]

    data_stats.loc['num merchants'] = [len(d["MerchantID"].unique()) for d in datasets]

    data_stats.loc['countries'] = [len(d["Country"].unique()) for d in datasets]
    data_stats.loc['currencies'] = [len(d["Currency"].unique()) for d in datasets]

    data_stats.loc['min trans/card'] = [min(d["CardID"].value_counts()) for d in datasets]
    data_stats.loc['max trans/card'] = [max(d["CardID"].value_counts()) for d in datasets]
    data_stats.loc['avg trans/card'] = [np.average(d["CardID"].value_counts()) for d in datasets]

    return data_stats
评论列表
文章目录


问题


面经


文章

微信
公众号

扫码关注公众号