imports

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import networkx as nx
import sklearn
import xgboost as xgb

# sklearn
from sklearn import model_selection # split함수이용
from sklearn import ensemble # RF,GBM
from sklearn import metrics
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB

# gnn
import torch
import torch.nn.functional as F
import torch_geometric
from torch_geometric.nn import GCNConv

# autogluon
from autogluon.tabular import TabularDataset, TabularPredictor

def down_sample_textbook(df):
    """Balance the two classes by under-sampling the non-fraud rows.

    Every row with ``is_fraud == 1`` is kept. Rows with ``is_fraud == 0`` are
    sampled without replacement, with a fixed seed of 42, down to the number
    of fraud rows.

    Args:
        df: DataFrame with an ``is_fraud`` column holding 0/1 labels.

    Returns:
        DataFrame made of all fraud rows followed by the sampled non-fraud
        rows.
    """
    fraud_rows = df.loc[df["is_fraud"] == 1].copy()
    legit_rows = df.loc[df["is_fraud"] == 0].copy()
    legit_sample = sklearn.utils.resample(
        legit_rows,
        replace=False,
        n_samples=len(fraud_rows),
        random_state=42,
    )
    return pd.concat([fraud_rows, legit_sample])

def compute_time_difference(group):
    """Return the absolute time gap for every ordered pair of rows in *group*.

    Args:
        group: DataFrame whose ``trans_date_trans_time`` column holds pandas
            ``Timestamp`` values.

    Returns:
        A list of ``[label_i, label_j, gap]`` entries, one per ordered pair of
        rows (self-pairs included), in row-major order. ``label_i`` and
        ``label_j`` are index labels of *group*; ``gap`` is the absolute
        difference between the two ``Timestamp.value`` integers.
    """
    # Read the labels and timestamps once. Calling ``group.iloc[...]`` inside
    # the double loop builds a fresh row Series on every access, which
    # dominates the runtime for large groups.
    labels = list(group.index)
    stamps = [ts.value for ts in group["trans_date_trans_time"]]
    return [
        [label_i, label_j, abs(stamp_i - stamp_j)]
        for label_i, stamp_i in zip(labels, stamps)
        for label_j, stamp_j in zip(labels, stamps)
    ]


class GCN(torch.nn.Module):
    """Two-layer graph convolutional network with two output channels.

    The first layer maps 1 input channel per node to 16 hidden channels; the
    second maps those 16 channels to 2 output channels.
    """

    def __init__(self):
        super().__init__()
        self.conv1 = GCNConv(in_channels=1, out_channels=16)
        self.conv2 = GCNConv(in_channels=16, out_channels=2)

    def forward(self, data):
        """Return per-node log-softmax scores.

        Args:
            data: Object exposing ``x`` and ``edge_index`` attributes, which
                are passed to both convolution layers.

        Returns:
            Output of the second convolution after ``log_softmax`` along
            dimension 1.
        """
        hidden = F.relu(self.conv1(data.x, data.edge_index))
        # Dropout is applied only while the module is in training mode.
        hidden = F.dropout(hidden, training=self.training)
        logits = self.conv2(hidden, data.edge_index)
        return F.log_softmax(logits, dim=1)

# Load the training CSV; the first column is dropped by position.
fraudTrain = pd.read_csv("~/Desktop/fraudTrain.csv").iloc[:,1:]

# Parse the timestamp column with one vectorised call instead of invoking
# pd.to_datetime once per row through map/lambda.
fraudTrain = fraudTrain.assign(trans_date_trans_time=pd.to_datetime(fraudTrain["trans_date_trans_time"]))
fraudTrain

	trans_date_trans_time	cc_num	merchant	category	amt	first	last	gender	street	city	...	lat	long	city_pop	job	dob	trans_num	unix_time	merch_lat	merch_long	is_fraud
0	2019-01-01 00:00:00	2.703190e+15	fraud_Rippin, Kub and Mann	misc_net	4.97	Jennifer	Banks	F	561 Perry Cove	Moravian Falls	...	36.0788	-81.1781	3495	Psychologist, counselling	1988-03-09	0b242abb623afc578575680df30655b9	1325376018	36.011293	-82.048315	0
1	2019-01-01 00:00:00	6.304230e+11	fraud_Heller, Gutmann and Zieme	grocery_pos	107.23	Stephanie	Gill	F	43039 Riley Greens Suite 393	Orient	...	48.8878	-118.2105	149	Special educational needs teacher	1978-06-21	1f76529f8574734946361c461b024d99	1325376044	49.159047	-118.186462	0
2	2019-01-01 00:00:00	3.885950e+13	fraud_Lind-Buckridge	entertainment	220.11	Edward	Sanchez	M	594 White Dale Suite 530	Malad City	...	42.1808	-112.2620	4154	Nature conservation officer	1962-01-19	a1a22d70485983eac12b5b88dad1cf95	1325376051	43.150704	-112.154481	0
3	2019-01-01 00:01:00	3.534090e+15	fraud_Kutch, Hermiston and Farrell	gas_transport	45.00	Jeremy	White	M	9443 Cynthia Court Apt. 038	Boulder	...	46.2306	-112.1138	1939	Patent attorney	1967-01-12	6b849c168bdad6f867558c3793159a81	1325376076	47.034331	-112.561071	0
4	2019-01-01 00:03:00	3.755340e+14	fraud_Keeling-Crist	misc_pos	41.96	Tyler	Garcia	M	408 Bradley Rest	Doe Hill	...	38.4207	-79.4629	99	Dance movement psychotherapist	1986-03-28	a41d7549acf90789359a9aa5346dcb46	1325376186	38.674999	-78.632459	0
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
1048570	2020-03-10 16:07:00	6.011980e+15	fraud_Fadel Inc	health_fitness	77.00	Haley	Wagner	F	05561 Farrell Crescent	Annapolis	...	39.0305	-76.5515	92106	Accountant, chartered certified	1943-05-28	45ecd198c65e81e597db22e8d2ef7361	1362931649	38.779464	-76.317042	0
1048571	2020-03-10 16:07:00	4.839040e+15	fraud_Cremin, Hamill and Reichel	misc_pos	116.94	Meredith	Campbell	F	043 Hanson Turnpike	Hedrick	...	41.1826	-92.3097	1583	Geochemist	1999-06-28	c00ce51c6ebb7657474a77b9e0b51f34	1362931670	41.400318	-92.726724	0
1048572	2020-03-10 16:08:00	5.718440e+11	fraud_O'Connell, Botsford and Hand	home	21.27	Susan	Mills	F	005 Cody Estates	Louisville	...	38.2507	-85.7476	736284	Engineering geologist	1952-04-02	17c9dc8b2a6449ca2473726346e58e6c	1362931711	37.293339	-84.798122	0
1048573	2020-03-10 16:08:00	4.646850e+18	fraud_Thompson-Gleason	health_fitness	9.52	Julia	Bell	F	576 House Crossroad	West Sayville	...	40.7320	-73.1000	4056	Film/video editor	1990-06-25	5ca650881b48a6a38754f841c23b77ab	1362931718	39.773077	-72.213209	0
1048574	2020-03-10 16:08:00	2.283740e+15	fraud_Buckridge PLC	misc_pos	6.81	Shannon	Williams	F	9345 Spencer Junctions Suite 183	Alpharetta	...	34.0770	-84.3033	165556	Prison officer	1997-12-27	8d0a575fe635bbde12f1a2bffc126731	1362931730	33.601468	-83.891921	0

1048575 rows × 22 columns

데이터정리

# Keep a 20% random sample (seed 42) of the rows with is_fraud == 0 ...
_df1 = fraudTrain[fraudTrain["is_fraud"] == 0].sample(frac=0.20, random_state=42)
# ... and every row with is_fraud == 1.
_df2 = fraudTrain[fraudTrain["is_fraud"] == 1]
# Stack the two parts; original index labels are kept.
df02 = pd.concat([_df1,_df2])
df02.shape

(214520, 22)

# Under-sample the non-fraud rows so both classes have the same number of rows.
df50 = down_sample_textbook(df02)
df50.shape

(12012, 22)

# Renumber the rows 0..len-1; the previous index labels move into a new column.
df50 = df50.reset_index()

# Row count, used further down as the length of the train/test masks.
N = len(df50)

# Keep only the single feature (amt) and the label (is_fraud).
df50 = df50[["amt","is_fraud"]]

# Mean of amt over the balanced sample.
df50["amt"].mean()

297.4638911088911

# Summary statistics of the amt column in the balanced sample.
df50["amt"].describe()

count    12012.000000
mean       297.463891
std        384.130842
min          1.010000
25%         19.917500
50%         84.680000
75%        468.295000
max      12025.300000
Name: amt, dtype: float64

tr/test

# Random train/test split with a fixed seed and the default split ratio.
# Both parts keep df50's index labels, which the mask construction relies on.
df50_tr,df50_test = sklearn.model_selection.train_test_split(df50, random_state=42)

df50_tr.shape, df50_test.shape

((9009, 2), (3003, 2))

# Boolean masks of length N: position i is True when row label i belongs to
# the corresponding split. np.isin does the membership test for all positions
# at once and returns a NumPy bool array directly, replacing the per-element
# Python loop and the list -> array conversion.
node_ids = np.arange(N)
train_mask = np.isin(node_ids, df50_tr.index)
test_mask = np.isin(node_ids, df50_test.index)

train_mask.sum(), test_mask.sum()

(9009, 3003)

# Both masks should have one entry per row of df50.
train_mask.shape, test_mask.shape

((12012,), (12012,))

edge_index 설정

# groups = df50.groupby('cc_num')
# edge_index_list_plus = [compute_time_difference(group) for _, group in groups]
# edge_index_list_plus_flat = [item for sublist in edge_index_list_plus for item in sublist]
# edge_index_list_plus_nparr = np.array(edge_index_list_plus_flat)
# np.save('edge_index_list_plus50.npy', edge_index_list_plus_nparr)

# Load the saved edge table once. Column 2 is the value that gets rescaled and
# thresholded; columns 0 and 1 are the node ids of each edge.
# theta is taken from the array as stored, before the float64 cast.
edge_index = np.load('edge_index_list_plus50.npy')
theta = edge_index[:,2].mean()
edge_index = edge_index.astype(np.float64)

# Edge weight exp(-value/theta), computed once. Weights equal to exactly 1
# (value == 0) are set to 0.
weight = np.exp(-edge_index[:,2]/theta)
edge_index[:,2] = (weight != 1)*weight

# Keep only the edges whose weight is above the mean weight.
# NOTE: edge_index and selected_edges stay NumPy arrays here instead of being
# converted to Python lists; the selection is done with a boolean mask.
mean_ = edge_index[:,2].mean()
selected_edges = edge_index[edge_index[:,2] > mean_, :2].astype(np.int64)
# Transpose to shape (2, E); this stays (2, 0) when no edge passes the threshold.
edge_index_selected = torch.tensor(selected_edges, dtype=torch.long).t()

data설정(x, edge_index, y)

# Node features: the amt column as a float tensor reshaped to one column.
# Convert through NumPy first so torch receives a plain array rather than a
# pandas Series; passing the Series relies on label-based element access,
# which is slow and depends on the index being 0..N-1.
x = torch.tensor(df50['amt'].to_numpy(), dtype=torch.float).reshape(-1,1)
# Node labels as int64.
y = torch.tensor(df50['is_fraud'].to_numpy(), dtype=torch.int64)
data = torch_geometric.data.Data(x=x, edge_index = edge_index_selected, y=y, train_mask = train_mask, test_mask = test_mask)
data

Data(x=[12012, 1], edge_index=[2, 93730], y=[12012], train_mask=[12012], test_mask=[12012])

autogluon

A. 데이터

# Wrap the train and test splits as AutoGluon TabularDataset objects.
tr = TabularDataset(df50_tr)
tst = TabularDataset(df50_test)

B. predictor 생성

# Predictor with is_fraud as the label column; no save path is specified.
predictr = TabularPredictor("is_fraud")

No path specified. Models will be saved in: "AutogluonModels/ag-20230930_045601/"

C.적합(fit)

# Fit on the training split using the 'best_quality' preset.
predictr.fit(tr, presets='best_quality')

Presets specified: ['best_quality']
Stack configuration (auto_stack=True): num_stack_levels=0, num_bag_folds=8, num_bag_sets=1
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels/ag-20230930_045601/"
AutoGluon Version:  0.8.2
Python Version:     3.8.18
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #26~22.04.1-Ubuntu SMP PREEMPT_DYNAMIC Thu Jul 13 16:27:29 UTC 2
Disk Space Avail:   749.06 GB / 982.82 GB (76.2%)
Train Data Rows:    9009
Train Data Columns: 1
Label Column: is_fraud
Preprocessing data ...
AutoGluon infers your prediction problem is: 'binary' (because only two unique label-values observed).
    2 unique label values:  [1, 0]
    If 'binary' is not the correct problem_type, please manually specify the problem_type parameter during predictor init (You may specify problem_type as one of: ['binary', 'multiclass', 'regression'])
Selected class <--> label mapping:  class 1 = 1, class 0 = 0
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
    Available Memory:                    31104.83 MB
    Train Data (Original)  Memory Usage: 0.07 MB (0.0% of available memory)
    Inferring data type of each feature based on column values. Set feature_metadata_in to manually specify special dtypes of the features.
    Stage 1 Generators:
        Fitting AsTypeFeatureGenerator...
    Stage 2 Generators:
        Fitting FillNaFeatureGenerator...
    Stage 3 Generators:
        Fitting IdentityFeatureGenerator...
    Stage 4 Generators:
        Fitting DropUniqueFeatureGenerator...
    Stage 5 Generators:
        Fitting DropDuplicatesFeatureGenerator...
    Types of features in original data (raw dtype, special dtypes):
        ('float', []) : 1 | ['amt']
    Types of features in processed data (raw dtype, special dtypes):
        ('float', []) : 1 | ['amt']
    0.0s = Fit runtime
    1 features in original data used to generate 1 features in processed data.
    Train Data (Processed) Memory Usage: 0.07 MB (0.0% of available memory)
Data preprocessing and feature engineering runtime = 0.04s ...
AutoGluon will gauge predictive performance using evaluation metric: 'accuracy'
    To change this, specify the eval_metric parameter of Predictor()
User-specified model hyperparameters to be fit:
{
    'NN_TORCH': {},
    'GBM': [{'extra_trees': True, 'ag_args': {'name_suffix': 'XT'}}, {}, 'GBMLarge'],
    'CAT': {},
    'XGB': {},
    'FASTAI': {},
    'RF': [{'criterion': 'gini', 'ag_args': {'name_suffix': 'Gini', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'entropy', 'ag_args': {'name_suffix': 'Entr', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'squared_error', 'ag_args': {'name_suffix': 'MSE', 'problem_types': ['regression', 'quantile']}}],
    'XT': [{'criterion': 'gini', 'ag_args': {'name_suffix': 'Gini', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'entropy', 'ag_args': {'name_suffix': 'Entr', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'squared_error', 'ag_args': {'name_suffix': 'MSE', 'problem_types': ['regression', 'quantile']}}],
    'KNN': [{'weights': 'uniform', 'ag_args': {'name_suffix': 'Unif'}}, {'weights': 'distance', 'ag_args': {'name_suffix': 'Dist'}}],
}
Fitting 13 L1 models ...
Fitting model: KNeighborsUnif_BAG_L1 ...
    0.8782   = Validation score   (accuracy)
    0.0s     = Training   runtime
    0.01s    = Validation runtime
Fitting model: KNeighborsDist_BAG_L1 ...
    0.8641   = Validation score   (accuracy)
    0.0s     = Training   runtime
    0.01s    = Validation runtime
Fitting model: LightGBMXT_BAG_L1 ...
    Fitting 8 child models (S1F1 - S1F8) | Fitting with ParallelLocalFoldFittingStrategy
    0.885    = Validation score   (accuracy)
    0.49s    = Training   runtime
    0.03s    = Validation runtime
Fitting model: LightGBM_BAG_L1 ...
    Fitting 8 child models (S1F1 - S1F8) | Fitting with ParallelLocalFoldFittingStrategy
    0.894    = Validation score   (accuracy)
    0.57s    = Training   runtime
    0.01s    = Validation runtime
Fitting model: RandomForestGini_BAG_L1 ...
    0.856    = Validation score   (accuracy)
    0.34s    = Training   runtime
    0.19s    = Validation runtime
Fitting model: RandomForestEntr_BAG_L1 ...
    0.856    = Validation score   (accuracy)
    0.44s    = Training   runtime
    0.21s    = Validation runtime
Fitting model: CatBoost_BAG_L1 ...
    Fitting 8 child models (S1F1 - S1F8) | Fitting with ParallelLocalFoldFittingStrategy
    0.8947   = Validation score   (accuracy)
    1.49s    = Training   runtime
    0.0s     = Validation runtime
Fitting model: ExtraTreesGini_BAG_L1 ...
    0.8622   = Validation score   (accuracy)
    0.33s    = Training   runtime
    0.2s     = Validation runtime
Fitting model: ExtraTreesEntr_BAG_L1 ...
    0.8626   = Validation score   (accuracy)
    0.31s    = Training   runtime
    0.2s     = Validation runtime
Fitting model: NeuralNetFastAI_BAG_L1 ...
    Fitting 8 child models (S1F1 - S1F8) | Fitting with ParallelLocalFoldFittingStrategy
    0.867    = Validation score   (accuracy)
    7.47s    = Training   runtime
    0.09s    = Validation runtime
Fitting model: XGBoost_BAG_L1 ...
    Fitting 8 child models (S1F1 - S1F8) | Fitting with ParallelLocalFoldFittingStrategy
    0.8944   = Validation score   (accuracy)
    0.5s     = Training   runtime
    0.02s    = Validation runtime
Fitting model: NeuralNetTorch_BAG_L1 ...
    Fitting 8 child models (S1F1 - S1F8) | Fitting with ParallelLocalFoldFittingStrategy
    0.8888   = Validation score   (accuracy)
    14.26s   = Training   runtime
    0.05s    = Validation runtime
Fitting model: LightGBMLarge_BAG_L1 ...
    Fitting 8 child models (S1F1 - S1F8) | Fitting with ParallelLocalFoldFittingStrategy
    0.8941   = Validation score   (accuracy)
    0.85s    = Training   runtime
    0.01s    = Validation runtime
Fitting model: WeightedEnsemble_L2 ...
    0.8948   = Validation score   (accuracy)
    2.14s    = Training   runtime
    0.01s    = Validation runtime
AutoGluon training complete, total runtime = 38.84s ... Best model: "WeightedEnsemble_L2"
TabularPredictor saved. To load, use: predictor = TabularPredictor.load("AutogluonModels/ag-20230930_045601/")

<autogluon.tabular.predictor.predictor.TabularPredictor at 0x7f3fabaaf250>

# Per-model validation scores and timings from the fit above.
predictr.leaderboard()

                      model  score_val  pred_time_val   fit_time  pred_time_val_marginal  fit_time_marginal  stack_level  can_infer  fit_order
0       WeightedEnsemble_L2   0.894772       0.019776   4.481569                0.009500           2.139285            2       True         14
1           CatBoost_BAG_L1   0.894661       0.004185   1.490714                0.004185           1.490714            1       True          7
2            XGBoost_BAG_L1   0.894439       0.021677   0.498835                0.021677           0.498835            1       True         11
3      LightGBMLarge_BAG_L1   0.894106       0.006091   0.851570                0.006091           0.851570            1       True         13
4           LightGBM_BAG_L1   0.893995       0.013288   0.571850                0.013288           0.571850            1       True          4
5     NeuralNetTorch_BAG_L1   0.888778       0.050316  14.255491                0.050316          14.255491            1       True         12
6         LightGBMXT_BAG_L1   0.885004       0.034434   0.492139                0.034434           0.492139            1       True          3
7     KNeighborsUnif_BAG_L1   0.878233       0.010086   0.003595                0.010086           0.003595            1       True          1
8    NeuralNetFastAI_BAG_L1   0.867022       0.086901   7.465914                0.086901           7.465914            1       True         10
9     KNeighborsDist_BAG_L1   0.864136       0.008126   0.002526                0.008126           0.002526            1       True          2
10    ExtraTreesEntr_BAG_L1   0.862582       0.202717   0.313634                0.202717           0.313634            1       True          9
11    ExtraTreesGini_BAG_L1   0.862249       0.202180   0.328506                0.202180           0.328506            1       True          8
12  RandomForestGini_BAG_L1   0.856033       0.187426   0.337518                0.187426           0.337518            1       True          5
13  RandomForestEntr_BAG_L1   0.856033       0.210448   0.443938                0.210448           0.443938            1       True          6

	model	score_val	pred_time_val	fit_time	pred_time_val_marginal	fit_time_marginal	stack_level	can_infer	fit_order
0	WeightedEnsemble_L2	0.894772	0.019776	4.481569	0.009500	2.139285	2	True	14
1	CatBoost_BAG_L1	0.894661	0.004185	1.490714	0.004185	1.490714	1	True	7
2	XGBoost_BAG_L1	0.894439	0.021677	0.498835	0.021677	0.498835	1	True	11
3	LightGBMLarge_BAG_L1	0.894106	0.006091	0.851570	0.006091	0.851570	1	True	13
4	LightGBM_BAG_L1	0.893995	0.013288	0.571850	0.013288	0.571850	1	True	4
5	NeuralNetTorch_BAG_L1	0.888778	0.050316	14.255491	0.050316	14.255491	1	True	12
6	LightGBMXT_BAG_L1	0.885004	0.034434	0.492139	0.034434	0.492139	1	True	3
7	KNeighborsUnif_BAG_L1	0.878233	0.010086	0.003595	0.010086	0.003595	1	True	1
8	NeuralNetFastAI_BAG_L1	0.867022	0.086901	7.465914	0.086901	7.465914	1	True	10
9	KNeighborsDist_BAG_L1	0.864136	0.008126	0.002526	0.008126	0.002526	1	True	2
10	ExtraTreesEntr_BAG_L1	0.862582	0.202717	0.313634	0.202717	0.313634	1	True	9
11	ExtraTreesGini_BAG_L1	0.862249	0.202180	0.328506	0.202180	0.328506	1	True	8
12	RandomForestGini_BAG_L1	0.856033	0.187426	0.337518	0.187426	0.337518	1	True	5
13	RandomForestEntr_BAG_L1	0.856033	0.210448	0.443938	0.210448	0.443938	1	True	6

D. 예측(predict)

# Training accuracy: fraction of training rows whose prediction equals the label.
(tr.is_fraud == predictr.predict(tr)).mean()

0.8967698967698968

# Test accuracy: fraction of held-out rows whose prediction equals the label.
(tst.is_fraud == predictr.predict(tst)).mean()

0.8877788877788878

뭐지? `best_quality` 프리셋을 줬는데 정확도가 오히려 더 낮아졌다.