import pandas as pd
import numpy as np
import sklearn
import pickle
import time
import datetime
import warnings
warnings.filterwarnings('ignore')
# import graft

# NOTE(review): pickle.load can execute arbitrary code while unpickling;
# only load this file if it comes from a trusted source.
with open('../fraudTrain.pkl', 'rb') as file:
    fraudTrain = pickle.load(file)


def compute_time_difference(group):
    """Return the absolute time gap between every ordered pair of rows.

    Parameters
    ----------
    group : pandas.DataFrame
        Frame with a ``trans_date_trans_time`` column whose entries expose a
        ``.value`` attribute (presumably ``pandas.Timestamp``, whose
        ``.value`` is nanoseconds since the epoch -- confirm against the
        loaded data).

    Returns
    -------
    list
        One ``[label_i, label_j, difference]`` entry per ordered pair of
        rows, self-pairs included, so ``len(group) ** 2`` entries in total.
        ``label_*`` are the rows' index labels and ``difference`` is the
        absolute difference of the two ``.value`` numbers. Pairs are
        emitted in row-major order (all ``j`` for the first ``i``, then the
        next ``i``, ...).
    """
    n = len(group)
    if n == 0:
        # Nothing to pair; also avoids touching the column on an empty frame.
        return []

    # Extract labels and timestamps once instead of rebuilding two row
    # Series with .iloc on every one of the n * n inner iterations.
    labels = [group.index[k] for k in range(n)]
    stamps = [ts.value for ts in group['trans_date_trans_time']]

    return [
        [label_i, label_j, abs(stamp_i - stamp_j)]
        for label_i, stamp_i in zip(labels, stamps)
        for label_j, stamp_j in zip(labels, stamps)
    ]


# Keep only the transactions flagged as fraud.
df = fraudTrain[fraudTrain.is_fraud == 1]
# Card numbers that appear in at least one row of `df`.
fraud1_cc_num = df['cc_num'].unique()

# Every transaction in fraudTrain made with one of those cards; the old
# index is kept as a column by reset_index().
uses_listed_card = fraudTrain['cc_num'].isin(fraud1_cc_num)
df_fraud = fraudTrain[uses_listed_card].reset_index()

# Per-card summary: row count and sum of the is_fraud column.
per_card = df_fraud.groupby('cc_num')
df_group = per_card['is_fraud'].agg(['count', 'sum']).reset_index()
df_group.columns = ['cc_num', 'total_transactions', 'fraud_transactions']

# First recorded first/last name for each card, joined into one string.
first = per_card['first'].first().reset_index()
last = per_card['last'].first().reset_index()
name = first['first'] + ' ' + last['last']

# Place the name column directly after cc_num.
df_group.insert(1, 'name', name)
df_group

| | cc_num | name | total_transactions | fraud_transactions |
|---|---|---|---|---|
| 0 | 6.041621e+10 | Mary Diaz | 1242 | 9 |
| 1 | 6.042293e+10 | Jeffrey Powers | 1261 | 12 |
| 2 | 6.042310e+10 | Jason Gray | 423 | 10 |
| 3 | 6.042785e+10 | Bradley Martinez | 432 | 14 |
| 4 | 6.049060e+10 | Karen Johnson | 827 | 10 |
| ... | ... | ... | ... | ... |
| 591 | 4.906630e+18 | Charles Moreno | 2103 | 11 |
| 592 | 4.911820e+18 | Jeremy Chavez | 9 | 9 |
| 593 | 4.956830e+18 | Kenneth Robinson | 2090 | 10 |
| 594 | 4.973530e+18 | Mary Rodriguez | 860 | 10 |
| 595 | 4.992350e+18 | Benjamin Kim | 1689 | 8 |
596 rows × 4 columns
(df_group['fraud_transactions'] == 1).sum()
0
df_group[df_group['cc_num'] == 180068e+9]

| | cc_num | name | total_transactions | fraud_transactions |
|---|---|---|---|---|
| 160 | 1.800680e+14 | Mary Juarez | 2093 | 22 |
df_group.to_csv('df_group_ccnum', index=False)
df_fraud.is_fraud.mean()
0.009219716623428457
df_group['fraud_transactions'].sum()
6006
df_group['ratio of fraudulent transactions per cc_num'] = df_group['fraud_transactions'] / df_group['total_transactions']
df_group

| | cc_num | name | total_transactions | fraud_transactions | ratio of fraudulent transactions per cc_num |
|---|---|---|---|---|---|
| 0 | 6.041621e+10 | Mary Diaz | 1242 | 9 | 0.007246 |
| 1 | 6.042293e+10 | Jeffrey Powers | 1261 | 12 | 0.009516 |
| 2 | 6.042310e+10 | Jason Gray | 423 | 10 | 0.023641 |
| 3 | 6.042785e+10 | Bradley Martinez | 432 | 14 | 0.032407 |
| 4 | 6.049060e+10 | Karen Johnson | 827 | 10 | 0.012092 |
| ... | ... | ... | ... | ... | ... |
| 591 | 4.906630e+18 | Charles Moreno | 2103 | 11 | 0.005231 |
| 592 | 4.911820e+18 | Jeremy Chavez | 9 | 9 | 1.000000 |
| 593 | 4.956830e+18 | Kenneth Robinson | 2090 | 10 | 0.004785 |
| 594 | 4.973530e+18 | Mary Rodriguez | 860 | 10 | 0.011628 |
| 595 | 4.992350e+18 | Benjamin Kim | 1689 | 8 | 0.004737 |
596 rows × 5 columns
df_group.describe()

| | cc_num | total_transactions | fraud_transactions | ratio of fraudulent transactions per cc_num |
|---|---|---|---|---|
| count | 5.960000e+02 | 596.000000 | 596.000000 | 596.000000 |
| mean | 4.235681e+17 | 1093.003356 | 10.077181 | 0.102714 |
| std | 1.311573e+18 | 701.358772 | 3.254589 | 0.286436 |
| min | 6.041621e+10 | 7.000000 | 2.000000 | 0.000792 |
| 25% | 1.800348e+14 | 429.000000 | 8.000000 | 0.006349 |
| 50% | 3.525630e+15 | 867.500000 | 10.000000 | 0.009679 |
| 75% | 4.741782e+15 | 1645.250000 | 12.000000 | 0.018946 |
| max | 4.992350e+18 | 4173.000000 | 24.000000 | 1.000000 |
df_group.sort_values(by='fraud_transactions', ascending=False).reset_index()

| | index | cc_num | name | total_transactions | fraud_transactions | ratio of fraudulent transactions per cc_num |
|---|---|---|---|---|---|---|
| 0 | 179 | 2.131740e+14 | Joseph Wagner | 2474 | 24 | 0.009701 |
| 1 | 278 | 3.506040e+15 | Christine Burns | 2072 | 23 | 0.011100 |
| 2 | 160 | 1.800680e+14 | Mary Juarez | 2093 | 22 | 0.010511 |
| 3 | 493 | 6.011380e+15 | Martin Duarte | 1639 | 22 | 0.013423 |
| 4 | 163 | 1.800850e+14 | David Kirby | 1657 | 21 | 0.012674 |
| ... | ... | ... | ... | ... | ... | ... |
| 591 | 573 | 4.503100e+18 | Katherine Tucker | 1250 | 2 | 0.001600 |
| 592 | 327 | 3.546670e+15 | Jordan May | 824 | 2 | 0.002427 |
| 593 | 485 | 6.011110e+15 | Rebecca Erickson | 2525 | 2 | 0.000792 |
| 594 | 360 | 3.576430e+15 | Jessica Ward | 2478 | 2 | 0.000807 |
| 595 | 88 | 4.809700e+12 | Sabrina Nolan | 1624 | 2 | 0.001232 |
596 rows × 6 columns