def get_data_stats(datasets):
    """Build a summary table of transaction statistics.

    Args:
        datasets: Sequence of three pandas DataFrames, in the order matching
            the output columns: all transactions, non-fraud transactions,
            fraud transactions. Each frame must provide the columns
            ``Local_Date``, ``Global_Date`` (timestamps), ``CardID``,
            ``Amount``, ``MerchantID``, ``Country`` and ``Currency``.

    Returns:
        A DataFrame with columns ``'all'``, ``'non-fraud'`` and ``'fraud'``
        and one row per statistic, in the order the rows are added below.
        The ``'fraud cards in genuine'`` row holds ``'-'`` in the first two
        columns and, in the third, the fraction of fraud card IDs that also
        occur in the non-fraud dataset.

    Missing values are ignored by the min/max/avg rows. For an empty dataset
    those rows (and the fraud-card fraction) are NaN/NaT instead of raising.
    """
    # Fixed normalisation constants for the rate rows.
    # NOTE(review): presumably the data spans exactly one leap year
    # (366 days / 52 weeks / 12 months) -- confirm against the data source.
    hours_per_day, days, weeks, months = 24, 366, 52, 12

    data_stats_cols = ['all', 'non-fraud', 'fraud']
    data_stats = pd.DataFrame(columns=data_stats_cols)

    # Per-dataset intermediates reused by several rows below.
    # count() is the number of rows with a non-null Local_Date, which is what
    # extracting hour/day/week/month and summing value_counts() amounts to.
    dated_rows = [d['Local_Date'].count() for d in datasets]
    card_usage = [d['CardID'].value_counts() for d in datasets]

    def as_date(timestamp):
        # An empty dataset has no earliest/latest timestamp to convert.
        return pd.NaT if pd.isna(timestamp) else timestamp.date()

    data_stats.loc['transactions'] = [d.shape[0] for d in datasets]
    data_stats.loc['transactions/hour'] = [
        round(n / hours_per_day / days, 2) for n in dated_rows
    ]
    data_stats.loc['transactions/day'] = [round(n / days, 2) for n in dated_rows]
    data_stats.loc['transactions/week'] = [round(n / weeks, 2) for n in dated_rows]
    data_stats.loc['transactions/month'] = [round(n / months, 2) for n in dated_rows]

    # dropna=False keeps parity with len(Series.unique()), which counts NaN.
    data_stats.loc['cards'] = [d['CardID'].nunique(dropna=False) for d in datasets]
    data_stats.loc['cards, single use'] = [(usage == 1).sum() for usage in card_usage]
    data_stats.loc['cards, multi use'] = [(usage > 1).sum() for usage in card_usage]

    cards_genuine = datasets[1]['CardID'].unique()
    cards_fraud = datasets[2]['CardID'].unique()
    if len(cards_fraud):
        fraud_in_genuine = len(np.intersect1d(cards_genuine, cards_fraud)) / len(cards_fraud)
    else:
        # No fraud cards at all: the fraction is undefined, not an error.
        fraud_in_genuine = float('nan')
    data_stats.loc['fraud cards in genuine'] = ['-', '-', fraud_in_genuine]

    data_stats.loc['first transaction'] = [as_date(d['Global_Date'].min()) for d in datasets]
    data_stats.loc['last transaction'] = [as_date(d['Global_Date'].max()) for d in datasets]

    data_stats.loc['min amount'] = [d['Amount'].min() for d in datasets]
    data_stats.loc['max amount'] = [d['Amount'].max() for d in datasets]
    data_stats.loc['avg amount'] = [d['Amount'].mean() for d in datasets]

    data_stats.loc['num merchants'] = [d['MerchantID'].nunique(dropna=False) for d in datasets]
    data_stats.loc['countries'] = [d['Country'].nunique(dropna=False) for d in datasets]
    data_stats.loc['currencies'] = [d['Currency'].nunique(dropna=False) for d in datasets]

    data_stats.loc['min trans/card'] = [usage.min() for usage in card_usage]
    data_stats.loc['max trans/card'] = [usage.max() for usage in card_usage]
    data_stats.loc['avg trans/card'] = [usage.mean() for usage in card_usage]

    return data_stats
# Web-page residue (not code): "Comment list"
# Web-page residue (not code): "Article table of contents"