240314 Data정리

Author

김보람

Published

March 14, 2024

import pandas as pd
import numpy as np
import sklearn
import pickle 
import time 
import datetime
import warnings
warnings.filterwarnings('ignore')
# import graft
# Load the pickled training table from the parent directory. The code below
# uses it as a pandas DataFrame (boolean indexing, groupby).
# NOTE(review): pickle.load can execute arbitrary code while unpickling, so
# this should only ever be pointed at a trusted file.
with open('../fraudTrain.pkl', 'rb') as file:
    fraudTrain = pickle.load(file)    
def compute_time_difference(group):
    """Return the absolute time gap for every ordered pair of rows in ``group``.

    Parameters
    ----------
    group : pandas.DataFrame
        Must have a ``trans_date_trans_time`` column whose elements expose a
        ``.value`` attribute (assumed to be pandas Timestamps, for which
        ``.value`` is an integer nanosecond count -- confirm against the
        loader).

    Returns
    -------
    list[list]
        One ``[label_i, label_j, abs(value_i - value_j)]`` entry per ordered
        pair of rows, where the labels are the rows' index labels. Self-pairs
        and both orderings of each pair are included, so the list has
        ``len(group) ** 2`` entries, ordered by ``i`` first and then ``j``.
    """
    n = len(group)
    if n == 0:
        # Nothing to pair up; return before touching the timestamp column.
        return []
    # Read the labels and timestamp values once up front instead of
    # materialising a row Series with ``iloc`` several times per pair.
    labels = [group.index[i] for i in range(n)]
    stamps = [ts.value for ts in group['trans_date_trans_time']]
    return [
        [labels[i], labels[j], abs(stamps[i] - stamps[j])]
        for i in range(n)
        for j in range(n)
    ]
# Rows flagged as fraud, and the card numbers that appear among them.
df = fraudTrain.loc[fraudTrain['is_fraud'] == 1]
fraud1_cc_num = df['cc_num'].unique()

# Every transaction (fraudulent or not) made with one of those cards.
# reset_index() keeps the original row labels in an 'index' column.
df_fraud = fraudTrain.loc[fraudTrain['cc_num'].isin(fraud1_cc_num)].reset_index()

# Per-card totals: number of transactions and number of fraudulent ones.
by_card = df_fraud.groupby("cc_num")
df_group = by_card["is_fraud"].agg(
    total_transactions='count',
    fraud_transactions='sum',
).reset_index()

# Card-holder name, built from the first recorded first/last name per card.
# Both lookups are grouped by cc_num, so their rows line up with df_group.
first = by_card['first'].first().reset_index()
last = by_card['last'].first().reset_index()
name = first['first'] + ' ' + last['last']
df_group.insert(1, 'name', name)
df_group
cc_num name total_transactions fraud_transactions
0 6.041621e+10 Mary Diaz 1242 9
1 6.042293e+10 Jeffrey Powers 1261 12
2 6.042310e+10 Jason Gray 423 10
3 6.042785e+10 Bradley Martinez 432 14
4 6.049060e+10 Karen Johnson 827 10
... ... ... ... ...
591 4.906630e+18 Charles Moreno 2103 11
592 4.911820e+18 Jeremy Chavez 9 9
593 4.956830e+18 Kenneth Robinson 2090 10
594 4.973530e+18 Mary Rodriguez 860 10
595 4.992350e+18 Benjamin Kim 1689 8

596 rows × 4 columns

# How many cards have exactly one fraudulent transaction.
df_group['fraud_transactions'].eq(1).sum()
0
# Summary row(s) for the card whose number equals 1.80068e14.
df_group.loc[df_group['cc_num'].eq(1.80068e14)]
cc_num name total_transactions fraud_transactions
160 1.800680e+14 Mary Juarez 2093 22
# Write the per-card summary to disk without the row index
# (the target file name has no extension).
df_group.to_csv('df_group_ccnum', index=False)
# Mean of the is_fraud flag over all transactions of the selected cards.
df_fraud['is_fraud'].mean()
0.009219716623428457
# Total number of fraudulent transactions summed over all cards in df_group.
df_group['fraud_transactions'].sum()
6006

# Per-card share of transactions that were fraudulent.
df_group['ratio of fraudulent transactions per cc_num'] = (
    df_group['fraud_transactions'].div(df_group['total_transactions'])
)
df_group
cc_num name total_transactions fraud_transactions ratio of fraudulent transactions per cc_num
0 6.041621e+10 Mary Diaz 1242 9 0.007246
1 6.042293e+10 Jeffrey Powers 1261 12 0.009516
2 6.042310e+10 Jason Gray 423 10 0.023641
3 6.042785e+10 Bradley Martinez 432 14 0.032407
4 6.049060e+10 Karen Johnson 827 10 0.012092
... ... ... ... ... ...
591 4.906630e+18 Charles Moreno 2103 11 0.005231
592 4.911820e+18 Jeremy Chavez 9 9 1.000000
593 4.956830e+18 Kenneth Robinson 2090 10 0.004785
594 4.973530e+18 Mary Rodriguez 860 10 0.011628
595 4.992350e+18 Benjamin Kim 1689 8 0.004737

596 rows × 5 columns

# Summary statistics of df_group via pandas' describe().
df_group.describe()
cc_num total_transactions fraud_transactions ratio of fraudulent transactions per cc_num
count 5.960000e+02 596.000000 596.000000 596.000000
mean 4.235681e+17 1093.003356 10.077181 0.102714
std 1.311573e+18 701.358772 3.254589 0.286436
min 6.041621e+10 7.000000 2.000000 0.000792
25% 1.800348e+14 429.000000 8.000000 0.006349
50% 3.525630e+15 867.500000 10.000000 0.009679
75% 4.741782e+15 1645.250000 12.000000 0.018946
max 4.992350e+18 4173.000000 24.000000 1.000000
# Cards ordered from most to fewest fraudulent transactions; reset_index()
# keeps each row's previous position in df_group as an 'index' column.
(
    df_group
    .sort_values(by='fraud_transactions', ascending=False)
    .reset_index()
)
index cc_num name total_transactions fraud_transactions ratio of fraudulent transactions per cc_num
0 179 2.131740e+14 Joseph Wagner 2474 24 0.009701
1 278 3.506040e+15 Christine Burns 2072 23 0.011100
2 160 1.800680e+14 Mary Juarez 2093 22 0.010511
3 493 6.011380e+15 Martin Duarte 1639 22 0.013423
4 163 1.800850e+14 David Kirby 1657 21 0.012674
... ... ... ... ... ... ...
591 573 4.503100e+18 Katherine Tucker 1250 2 0.001600
592 327 3.546670e+15 Jordan May 824 2 0.002427
593 485 6.011110e+15 Rebecca Erickson 2525 2 0.000792
594 360 3.576430e+15 Jessica Ward 2478 2 0.000807
595 88 4.809700e+12 Sabrina Nolan 1624 2 0.001232

596 rows × 6 columns