import pandas as pd
import numpy as np
import sklearn
import pickle
import time
import datetime
import warnings
# Silence all warnings for the rest of the session.
warnings.filterwarnings('ignore')
# import graft

# Load the pickled training data (presumably a pandas DataFrame, given how it
# is indexed further down - confirm against the producer of the file).
# NOTE(review): unpickling can execute arbitrary code; only load this file
# from a trusted source.
with open('../fraudTrain.pkl', 'rb') as file:
    fraudTrain = pickle.load(file)
def compute_time_difference(group):
    """Return the absolute time difference for every ordered pair of rows.

    Each row is paired with every row of *group*, itself included, in
    row-major order, so a group of ``n`` rows yields ``n * n`` entries.

    Args:
        group: DataFrame with a ``trans_date_trans_time`` column whose
            elements expose a ``.value`` attribute (presumably
            ``pandas.Timestamp``, where ``.value`` is nanoseconds since the
            epoch - confirm against the caller).

    Returns:
        A list of ``[label_i, label_j, difference]`` lists, where the labels
        are the index labels of the two rows and ``difference`` is the
        absolute difference of their ``.value`` attributes.
    """
    # Read labels and timestamp values once up front: a per-pair
    # ``group.iloc[...]`` lookup would materialise a full row every time.
    labels = list(group.index)
    stamps = [ts.value for ts in group["trans_date_trans_time"]]
    rows = list(zip(labels, stamps))
    return [
        [label_i, label_j, abs(value_i - value_j)]
        for label_i, value_i in rows
        for label_j, value_j in rows
    ]
# Transactions flagged as fraudulent.
df = fraudTrain[fraudTrain.is_fraud == 1]
# Card numbers that appear in at least one of those transactions.
fraud1_cc_num = df['cc_num'].unique()
# Every transaction (flagged or not) made with one of those cards.
df_fraud = fraudTrain[fraudTrain['cc_num'].isin(fraud1_cc_num)].reset_index()

# Per-card aggregates: 'count' is the number of rows for the card, 'sum' adds
# up is_fraud (the number of fraudulent rows, assuming is_fraud is a 0/1 flag).
df_group = df_fraud.groupby("cc_num")["is_fraud"].agg(['count', 'sum']).reset_index()
df_group.columns = ['cc_num', 'total_transactions', 'fraud_transactions']

# Card-holder name, built from the first 'first' and 'last' value seen for
# each card. Both frames come from the same groupby key as df_group and are
# reset to a default index, so the rows line up positionally.
first = df_fraud.groupby('cc_num')['first'].first().reset_index()
last = df_fraud.groupby('cc_num')['last'].first().reset_index()
name = first['first'] + ' ' + last['last']
# Place the name right after cc_num.
df_group.insert(1, 'name', name)
df_group
| | cc_num | name | total_transactions | fraud_transactions |
---|---|---|---|---|
0 | 6.041621e+10 | Mary Diaz | 1242 | 9 |
1 | 6.042293e+10 | Jeffrey Powers | 1261 | 12 |
2 | 6.042310e+10 | Jason Gray | 423 | 10 |
3 | 6.042785e+10 | Bradley Martinez | 432 | 14 |
4 | 6.049060e+10 | Karen Johnson | 827 | 10 |
... | ... | ... | ... | ... |
591 | 4.906630e+18 | Charles Moreno | 2103 | 11 |
592 | 4.911820e+18 | Jeremy Chavez | 9 | 9 |
593 | 4.956830e+18 | Kenneth Robinson | 2090 | 10 |
594 | 4.973530e+18 | Mary Rodriguez | 860 | 10 |
595 | 4.992350e+18 | Benjamin Kim | 1689 | 8 |
596 rows × 4 columns
# Number of cards with exactly one fraudulent transaction.
(df_group['fraud_transactions'] == 1).sum()
0
# Show the row(s) of one specific card number (180068e+9 == 1.80068e+14).
df_group[df_group['cc_num'] == 180068e+9]
| | cc_num | name | total_transactions | fraud_transactions |
---|---|---|---|---|
160 | 1.800680e+14 | Mary Juarez | 2093 | 22 |
# Write the per-card summary as CSV without the index column.
# NOTE(review): the target path has no '.csv' extension - confirm that is intended.
df_group.to_csv('df_group_ccnum', index=False)
# Mean of is_fraud over all transactions of the affected cards (the share of
# fraudulent transactions, assuming is_fraud is a 0/1 flag).
df_fraud.is_fraud.mean()
0.009219716623428457
# Total number of fraudulent transactions across all cards in df_group.
df_group['fraud_transactions'].sum()
6006
# Per-card share of fraudulent transactions. total_transactions is a group row
# count, so it is at least 1 and the division cannot hit zero.
df_group['ratio of fraudulent transactions per cc_num'] = df_group['fraud_transactions'] / df_group['total_transactions']
df_group
| | cc_num | name | total_transactions | fraud_transactions | ratio of fraudulent transactions per cc_num |
---|---|---|---|---|---|
0 | 6.041621e+10 | Mary Diaz | 1242 | 9 | 0.007246 |
1 | 6.042293e+10 | Jeffrey Powers | 1261 | 12 | 0.009516 |
2 | 6.042310e+10 | Jason Gray | 423 | 10 | 0.023641 |
3 | 6.042785e+10 | Bradley Martinez | 432 | 14 | 0.032407 |
4 | 6.049060e+10 | Karen Johnson | 827 | 10 | 0.012092 |
... | ... | ... | ... | ... | ... |
591 | 4.906630e+18 | Charles Moreno | 2103 | 11 | 0.005231 |
592 | 4.911820e+18 | Jeremy Chavez | 9 | 9 | 1.000000 |
593 | 4.956830e+18 | Kenneth Robinson | 2090 | 10 | 0.004785 |
594 | 4.973530e+18 | Mary Rodriguez | 860 | 10 | 0.011628 |
595 | 4.992350e+18 | Benjamin Kim | 1689 | 8 | 0.004737 |
596 rows × 5 columns
# Summary statistics (count, mean, std, min, quartiles, max) of df_group's numeric columns.
df_group.describe()
| | cc_num | total_transactions | fraud_transactions | ratio of fraudulent transactions per cc_num |
---|---|---|---|---|
count | 5.960000e+02 | 596.000000 | 596.000000 | 596.000000 |
mean | 4.235681e+17 | 1093.003356 | 10.077181 | 0.102714 |
std | 1.311573e+18 | 701.358772 | 3.254589 | 0.286436 |
min | 6.041621e+10 | 7.000000 | 2.000000 | 0.000792 |
25% | 1.800348e+14 | 429.000000 | 8.000000 | 0.006349 |
50% | 3.525630e+15 | 867.500000 | 10.000000 | 0.009679 |
75% | 4.741782e+15 | 1645.250000 | 12.000000 | 0.018946 |
max | 4.992350e+18 | 4173.000000 | 24.000000 | 1.000000 |
# Cards ordered by number of fraudulent transactions, highest first. The result
# is only displayed (df_group itself is not modified); reset_index() keeps the
# previous row labels in an 'index' column.
df_group.sort_values(by='fraud_transactions', ascending=False).reset_index()
| | index | cc_num | name | total_transactions | fraud_transactions | ratio of fraudulent transactions per cc_num |
---|---|---|---|---|---|---|
0 | 179 | 2.131740e+14 | Joseph Wagner | 2474 | 24 | 0.009701 |
1 | 278 | 3.506040e+15 | Christine Burns | 2072 | 23 | 0.011100 |
2 | 160 | 1.800680e+14 | Mary Juarez | 2093 | 22 | 0.010511 |
3 | 493 | 6.011380e+15 | Martin Duarte | 1639 | 22 | 0.013423 |
4 | 163 | 1.800850e+14 | David Kirby | 1657 | 21 | 0.012674 |
... | ... | ... | ... | ... | ... | ... |
591 | 573 | 4.503100e+18 | Katherine Tucker | 1250 | 2 | 0.001600 |
592 | 327 | 3.546670e+15 | Jordan May | 824 | 2 | 0.002427 |
593 | 485 | 6.011110e+15 | Rebecca Erickson | 2525 | 2 | 0.000792 |
594 | 360 | 3.576430e+15 | Jessica Ward | 2478 | 2 | 0.000807 |
595 | 88 | 4.809700e+12 | Sabrina Nolan | 1624 | 2 | 0.001232 |
596 rows × 6 columns