[Autogluon] tripartite

Author

김보람

Published

January 27, 2024

import

import pandas as pd
import os
import math
import numpy as np
import torch
import networkx as nx
import matplotlib.pyplot as plt
from autogluon.tabular import TabularDataset, TabularPredictor
from sklearn.model_selection import train_test_split
from node2vec import Node2Vec
from node2vec.edges import HadamardEmbedder, AverageEmbedder, WeightedL1Embedder, WeightedL2Embedder

# sklearn
import sklearn
from sklearn import metrics
from sklearn.metrics import precision_score, recall_score, f1_score
    
    def throw(df, fraud_rate):  # downsample the normal class until the frame has the requested fraud rate
        df1 = df[df['is_fraud'] == 1].copy()
        df0 = df[df['is_fraud'] == 0].copy()
        df0_downsample = (len(df1) * (1 - fraud_rate)) / (len(df0) * fraud_rate)
        df0_down = df0.sample(frac=df0_downsample, random_state=42)
        df_p = pd.concat([df1, df0_down])
        return df_p
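    # A quick sanity check (illustrative, not part of the original pipeline): on a toy
    # frame with 20% fraud, requesting fraud_rate=0.5 should leave roughly half of the
    # remaining rows fraudulent.
    _toy = pd.DataFrame({'is_fraud': [1, 1, 0, 0, 0, 0, 0, 0, 0, 0]})
    print(throw(_toy, 0.5)['is_fraud'].mean())  # ~0.5 after the normal rows are downsampled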
    
    def split_dataframe(data_frame, test_fraud_rate, test_rate=0.3):
        n = len(data_frame)

        # Separate fraud and normal transactions
        fraud_data = data_frame[data_frame['is_fraud'] == 1]
        normal_data = data_frame[data_frame['is_fraud'] == 0]

        # Size of the test split, and how many of its rows should be fraud
        test_samples = int(test_fraud_rate * (n * test_rate))
        remaining_test_samples = int(n * test_rate) - test_samples

        # Randomly draw the test rows from the fraud and normal pools
        test_fraud_data = fraud_data.sample(n=test_samples, replace=False)
        test_normal_data = normal_data.sample(n=remaining_test_samples, replace=False)

        # Assemble the test split
        test_data = pd.concat([test_normal_data, test_fraud_data])

        # Everything else becomes the training split
        train_data = data_frame[~data_frame.index.isin(test_data.index)]

        return train_data, test_data
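    # Illustrative reading of the parameters: with test_fraud_rate=0.05 and test_rate=0.3
    # (the values used below), 30% of the rows go to the test split and only 5% of that
    # split is fraudulent, so most fraud rows remain available for training.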
    
    def concat(df_tr, df_tst):
        df = pd.concat([df_tr, df_tst])
        train_mask = np.concatenate((np.full(len(df_tr), True), np.full(len(df_tst), False)))   # boolean masks record which rows came from train/test after concatenation
        test_mask = np.concatenate((np.full(len(df_tr), False), np.full(len(df_tst), True)))
        mask = (train_mask, test_mask)
        return df, mask
        
    def evaluation(y, yhat):
        metrics = [sklearn.metrics.accuracy_score,
                   sklearn.metrics.precision_score,
                   sklearn.metrics.recall_score,
                   sklearn.metrics.f1_score,
                   sklearn.metrics.roc_auc_score]
        return pd.DataFrame({m.__name__:[m(y,yhat).round(6)] for m in metrics})
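    # Illustrative usage: evaluation() expects hard 0/1 labels and predictions, e.g.
    # evaluation(np.array([1, 0, 1, 0]), np.array([1, 0, 0, 0]))
    # returns a one-row frame with accuracy, precision, recall, f1 and ROC-AUC.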
        
    def compute_time_difference(group):
        n = len(group)
        result = []
        for i in range(n):
            for j in range(n):
                time_difference = abs((group.iloc[i].trans_date_trans_time - group.iloc[j].trans_date_trans_time).total_seconds())
                result.append([group.iloc[i].name, group.iloc[j].name, time_difference])
        return result
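    # For a group with n rows this yields all n*n ordered pairs
    # (index_i, index_j, |t_i - t_j| in seconds), including the i == j pairs whose
    # difference is 0; edge_index() below drops those via the weight threshold
    # (for any non-negative gamma).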

    def edge_index_save(df, unique_col, theta, gamma, save_attempt=0):
        groups = df.groupby(unique_col)
        edge_index = np.array([item for sublist in (compute_time_difference(group) for _, group in groups) for item in sublist])
        edge_index = edge_index.astype(np.float64)
        filename = f"edge_index_attempt{save_attempt}_{str(unique_col).replace(' ', '').replace('_', '')}.npy"

        # Bump the attempt counter until an unused filename is found, then save the raw pairs.
        while os.path.exists(filename):
            save_attempt += 1
            filename = f"edge_index_attempt{save_attempt}_{str(unique_col).replace(' ', '').replace('_', '')}.npy"
        np.save(filename, edge_index)

        # Turn time differences into weights w = exp(-t/theta), zero out the w == 1 self-pairs,
        # and keep only the pairs whose weight exceeds gamma as graph edges.
        edge_index[:, 2] = (np.exp(-edge_index[:, 2] / theta) != 1) * np.exp(-edge_index[:, 2] / theta)
        edge_index = torch.tensor([(int(row[0]), int(row[1])) for row in edge_index if row[2] > gamma], dtype=torch.long).t()
        return edge_index
    
    def edge_index(df, unique_col, theta, gamma):
        groups = df.groupby(unique_col)
        edge_index = np.array([item for sublist in (compute_time_difference(group) for _, group in groups) for item in sublist])
        edge_index = edge_index.astype(np.float64)

        # Same weighting as edge_index_save(), but without writing the pairs to disk.
        edge_index[:, 2] = (np.exp(-edge_index[:, 2] / theta) != 1) * np.exp(-edge_index[:, 2] / theta)
        edge_index = torch.tensor([(int(row[0]), int(row[1])) for row in edge_index if row[2] > gamma], dtype=torch.long).t()
        return edge_index
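    # In formula form: two transactions that share the same unique_col value and are
    # t seconds apart get weight w = exp(-t / theta); self-pairs (t = 0, w = 1) are
    # zeroed out, and only pairs with w > gamma become edges. Illustrative numbers:
    # with theta = 3600, transactions one hour apart get w = exp(-1) ≈ 0.37, so they
    # are linked whenever gamma < 0.37.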
def build_graph_tripartite(df_input, graph_type=nx.Graph()):
    df = df_input.copy()
    # Map every credit card number, merchant, and transaction index to its own node id.
    mapping = {x: node_id for node_id, x in enumerate(set(df.index.values.tolist() +
                                                          df["cc_num"].values.tolist() +
                                                          df["merchant"].values.tolist()))}
    df["in_node"] = df["cc_num"].apply(lambda x: mapping[x])
    df["out_node"] = df["merchant"].apply(lambda x: mapping[x])

    # Each transaction node is connected to its card node and to its merchant node.
    G = nx.from_edgelist([(x["in_node"], mapping[idx]) for idx, x in df.iterrows()] +
                         [(x["out_node"], mapping[idx]) for idx, x in df.iterrows()], create_using=graph_type)

    # Both edges of a transaction carry its fraud flag as the label and its amount as the weight.
    nx.set_edge_attributes(G, {(x["in_node"], mapping[idx]): x["is_fraud"] for idx, x in df.iterrows()}, "label")
    nx.set_edge_attributes(G, {(x["out_node"], mapping[idx]): x["is_fraud"] for idx, x in df.iterrows()}, "label")
    nx.set_edge_attributes(G, {(x["in_node"], mapping[idx]): x["amt"] for idx, x in df.iterrows()}, "weight")
    nx.set_edge_attributes(G, {(x["out_node"], mapping[idx]): x["amt"] for idx, x in df.iterrows()}, "weight")

    return G
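
A minimal sketch of what build_graph_tripartite produces, on a hypothetical two-transaction frame: every card number, merchant, and transaction index becomes its own node, and each transaction node is wired to its card and its merchant, with is_fraud as the edge label and amt as the edge weight.

toy = pd.DataFrame({'cc_num': [111, 222], 'merchant': ['m_a', 'm_a'],
                    'amt': [10.0, 25.0], 'is_fraud': [0, 1]})
G_toy = build_graph_tripartite(toy)
G_toy.number_of_nodes(), G_toy.number_of_edges()  # (5, 4): 2 cards + 1 merchant + 2 transactions, 2 edges per transaction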
    
fraudTrain = pd.read_csv("~/Desktop/fraudTrain.csv").iloc[:, 1:]
fraudTrain = fraudTrain.assign(trans_date_trans_time=pd.to_datetime(fraudTrain.trans_date_trans_time))

throw 0.3 / split 0.05

df = throw(fraudTrain, 0.3)
df_tr, df_tst = split_dataframe(df, 0.05)
df2, mask = concat(df_tr, df_tst)
df2['index'] = df2.index
df3 = df2.reset_index()
G_down = build_graph_tripartite(df3)
range(len(G_down.edges))
range(0, 40040)
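Every transaction contributes exactly two edges (card-to-transaction and transaction-to-merchant), so the 40,040 edges correspond to the 20,020 rows kept after downsampling.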
train_edges, test_edges, train_labels, test_labels = train_test_split(list(range(len(G_down.edges))), 
                                                                      list(nx.get_edge_attributes(G_down, "label").values()), 
                                                                      test_size=0.30, 
                                                                      random_state=42)
edgs = list(G_down.edges)
train_graph = G_down.edge_subgraph([edgs[x] for x in train_edges]).copy()
train_graph.add_nodes_from(list(set(G_down.nodes) - set(train_graph.nodes)))
node2vec_train = Node2Vec(train_graph, weight_key='weight')
model_train = node2vec_train.fit(window=10)
Generating walks (CPU: 1): 100%|██████████| 10/10 [01:02<00:00,  6.20s/it]

embeddings_train = AverageEmbedder(keyed_vectors=model_train.wv) 
train_embeddings = [embeddings_train[str(edgs[x][0]), str(edgs[x][1])] for x in train_edges]
test_embeddings = [embeddings_train[str(edgs[x][0]), str(edgs[x][1])] for x in test_edges]
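
AverageEmbedder builds an edge feature by element-wise averaging the two endpoint vectors, so each edge embedding keeps the node embedding dimension (128 by default in Node2Vec). A quick check of that assumption on the first training edge:

u, v = edgs[train_edges[0]]
np.allclose(embeddings_train[str(u), str(v)], (model_train.wv[str(u)] + model_train.wv[str(v)]) / 2)  # expected True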
np.array(train_embeddings).shape
(28028, 128)
np.array(edgs).shape
(40040, 2)
np.array(train_labels).shape
(28028,)
test_df
X_0 X_1 X_2 X_3 X_4 X_5 X_6 X_7 X_8 X_9 ... X_118 X_119 X_120 X_121 X_122 X_123 X_124 X_125 X_126 X_127
0 0.294095 -0.182587 0.247678 0.590049 0.006470 -0.142112 0.089056 -0.001260 0.094326 0.135021 ... 0.069856 0.066224 0.115723 0.147979 -0.197021 -0.095803 -0.337333 -0.125295 -0.009825 0.151540
1 -0.272624 0.151767 -0.154251 0.323210 0.926657 -0.067704 0.017721 -0.107441 0.145443 0.302530 ... 0.003853 -0.062410 0.533643 -0.343930 -0.080773 0.252463 -0.586127 -0.611270 0.173824 -0.255820
2 -0.285497 -0.727879 0.088415 0.351048 0.179241 -0.211548 0.396390 0.184747 0.019989 0.056321 ... 0.329557 -0.137155 -0.492804 -0.395840 -0.124599 0.071007 0.092745 0.113933 0.531759 0.185540
3 0.101396 -0.118513 0.023900 0.479202 0.010183 0.010264 0.090974 -0.374058 -0.035802 0.005411 ... 0.445272 0.161917 0.132266 -0.353145 0.408945 0.278389 -0.149341 -0.426027 0.157640 0.036185
4 0.335867 0.193802 0.017849 -0.094391 0.406467 -0.252247 0.253800 0.060329 -0.227593 0.060363 ... -0.042921 -0.296897 -0.108547 -0.168377 0.239888 0.338059 -0.371277 0.179102 -0.186001 0.128560
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
12007 0.042202 -0.018433 0.708589 -0.087046 0.239773 -0.049197 0.437209 0.417002 -0.341246 0.192022 ... 0.255594 0.440218 -0.155139 -0.615026 0.221697 0.187364 0.244558 -0.492887 0.498050 0.081703
12008 -0.265624 -0.221091 0.016436 -0.082620 0.160059 -0.097688 0.372506 0.080932 -0.232221 0.103701 ... 0.106539 0.294270 -0.140846 0.088624 -0.184099 0.296281 -0.292350 0.041315 0.421299 -0.014916
12009 -0.040139 0.002283 -0.052671 0.268232 0.253181 0.208248 0.081583 0.019004 -0.129444 0.090210 ... -0.112681 -0.216101 -0.273943 -0.158223 0.288049 0.343018 0.022950 -0.497030 0.158982 -0.103655
12010 0.108170 -0.334442 -0.155009 -0.043705 0.576253 -0.157641 -0.352357 -0.275126 -0.084098 -0.305660 ... 0.417730 -0.285950 -0.008567 -0.083101 0.029371 0.271623 -0.458983 0.137005 -0.205865 0.410773
12011 -0.267466 0.094266 0.158908 0.479838 0.401144 -0.120019 0.683475 0.051481 -0.131274 0.665234 ... 0.423447 0.033082 -0.014823 -0.390924 0.166589 -0.030113 -0.244755 0.128242 -0.013727 0.101052

12012 rows × 128 columns

y.mean()
0.30128205128205127
yhat.mean()
0.02355977355977356
df
X_0 X_1 X_2 X_3 X_4 X_5 X_6 X_7 X_8 X_9 ... X_119 X_120 X_121 X_122 X_123 X_124 X_125 X_126 X_127 label
0 -0.018232 -0.461368 0.364313 -0.003355 0.447681 0.124173 -0.003152 -0.540774 -0.572652 0.302768 ... -0.409031 0.276566 -0.532460 0.310427 -0.138094 -0.232335 -0.699818 0.409554 0.513786 1
1 -0.034503 -0.591312 0.221111 0.221223 0.456697 0.206631 0.071338 -0.419212 -0.801116 0.093788 ... -0.397874 -0.322119 -0.167821 -0.249518 0.307504 -0.631245 -0.300779 0.297532 -0.434563 0
2 -0.310014 -0.523434 -0.253751 0.291488 -0.216207 -0.602615 0.851062 0.502921 -0.013592 -0.049582 ... -0.690101 0.033092 -0.025023 0.303007 -0.769665 0.134720 0.556271 0.101577 0.475444 0
3 -0.491736 -0.378155 -0.069874 -0.418389 0.708743 -0.091546 0.106801 -0.590884 0.115419 0.375492 ... 0.555193 -0.788328 -0.293919 0.354652 -0.004965 -0.360878 -0.673302 0.952318 -0.829513 0
4 -0.509173 -0.160158 0.732356 0.799263 0.406524 0.346506 0.278970 0.135962 -0.022850 0.728933 ... -0.419749 -0.317320 -0.089520 0.018165 0.111126 0.156540 -0.480898 0.906297 0.375273 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
28023 -1.031186 -0.789458 -0.395662 0.672432 0.366671 -0.042877 0.584821 -1.050437 -0.178776 1.148389 ... 0.035637 -0.534154 -0.465015 -0.292141 -0.529951 0.083408 -0.534623 -0.033971 -0.392283 0
28024 0.582753 -0.374445 0.245931 -0.342081 0.360757 -0.232758 0.543620 0.188705 0.140474 0.003210 ... 0.469309 -0.733129 -0.328261 0.185925 0.491741 -0.889476 -0.515644 0.718754 -0.470877 1
28025 -0.073720 -0.696957 0.398181 0.148028 0.527938 0.282207 0.083161 0.223829 0.746264 -0.078440 ... 0.403734 -0.565365 -0.134230 0.076866 -0.147996 -0.263811 -0.342502 0.232086 0.396120 0
28026 0.198808 -0.342205 -0.171712 -0.427701 0.376993 -0.368098 0.268516 -0.081111 -0.124265 0.539024 ... 0.235571 0.433356 -0.304768 -0.276453 0.025376 -0.091774 -1.139718 -0.064087 -0.382336 0
28027 -0.071535 -0.478938 0.958107 -0.103668 0.735266 0.264131 0.526417 -0.046119 0.337727 0.551102 ... -0.100332 -0.451198 -0.238998 -0.315951 -0.040852 -0.560907 0.301089 -0.071838 0.057509 1

28028 rows × 129 columns


# Build the training DataFrame: one column per embedding dimension
columns = [f'X_{i}' for i in range(np.array(train_embeddings).shape[1])]
df_data = pd.DataFrame(data=train_embeddings, columns=columns)

df_labels = pd.DataFrame(data=train_labels, columns=['label'])

# Combine features and labels
df = pd.concat([df_data, df_labels], axis=1)


label = np.array(train_labels)

predictr = TabularPredictor(label='label')

predictr.fit(df) 

test = np.array(test_embeddings)

columns = [f'X_{i}' for i in range(test.shape[1])]

# Build the test DataFrame from the test-edge embeddings
test_df = pd.DataFrame(data=test, columns=columns)

y = np.array(test_labels)

yhat = predictr.predict(test_df)

evaluation(y,yhat)
No path specified. Models will be saved in: "AutogluonModels/ag-20240129_072006/"
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels/ag-20240129_072006/"
AutoGluon Version:  0.8.2
Python Version:     3.8.18
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #38~22.04.1-Ubuntu SMP PREEMPT_DYNAMIC Thu Nov  2 18:01:13 UTC 2
Disk Space Avail:   597.99 GB / 982.82 GB (60.8%)
Train Data Rows:    28028
Train Data Columns: 128
Label Column: label
Preprocessing data ...
AutoGluon infers your prediction problem is: 'binary' (because only two unique label-values observed).
    2 unique label values:  [1, 0]
    If 'binary' is not the correct problem_type, please manually specify the problem_type parameter during predictor init (You may specify problem_type as one of: ['binary', 'multiclass', 'regression'])
Selected class <--> label mapping:  class 1 = 1, class 0 = 0
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
    Available Memory:                    5155.63 MB
    Train Data (Original)  Memory Usage: 14.35 MB (0.3% of available memory)
    Inferring data type of each feature based on column values. Set feature_metadata_in to manually specify special dtypes of the features.
    Stage 1 Generators:
        Fitting AsTypeFeatureGenerator...
    Stage 2 Generators:
        Fitting FillNaFeatureGenerator...
    Stage 3 Generators:
        Fitting IdentityFeatureGenerator...
    Stage 4 Generators:
        Fitting DropUniqueFeatureGenerator...
    Stage 5 Generators:
        Fitting DropDuplicatesFeatureGenerator...
    Types of features in original data (raw dtype, special dtypes):
        ('float', []) : 128 | ['X_0', 'X_1', 'X_2', 'X_3', 'X_4', ...]
    Types of features in processed data (raw dtype, special dtypes):
        ('float', []) : 128 | ['X_0', 'X_1', 'X_2', 'X_3', 'X_4', ...]
    0.3s = Fit runtime
    128 features in original data used to generate 128 features in processed data.
    Train Data (Processed) Memory Usage: 14.35 MB (0.2% of available memory)
Data preprocessing and feature engineering runtime = 0.31s ...
AutoGluon will gauge predictive performance using evaluation metric: 'accuracy'
    To change this, specify the eval_metric parameter of Predictor()
Automatically generating train/validation split with holdout_frac=0.08919651776794633, Train Rows: 25528, Val Rows: 2500
User-specified model hyperparameters to be fit:
{
    'NN_TORCH': {},
    'GBM': [{'extra_trees': True, 'ag_args': {'name_suffix': 'XT'}}, {}, 'GBMLarge'],
    'CAT': {},
    'XGB': {},
    'FASTAI': {},
    'RF': [{'criterion': 'gini', 'ag_args': {'name_suffix': 'Gini', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'entropy', 'ag_args': {'name_suffix': 'Entr', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'squared_error', 'ag_args': {'name_suffix': 'MSE', 'problem_types': ['regression', 'quantile']}}],
    'XT': [{'criterion': 'gini', 'ag_args': {'name_suffix': 'Gini', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'entropy', 'ag_args': {'name_suffix': 'Entr', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'squared_error', 'ag_args': {'name_suffix': 'MSE', 'problem_types': ['regression', 'quantile']}}],
    'KNN': [{'weights': 'uniform', 'ag_args': {'name_suffix': 'Unif'}}, {'weights': 'distance', 'ag_args': {'name_suffix': 'Dist'}}],
}
Fitting 13 L1 models ...
Fitting model: KNeighborsUnif ...
Exception ignored on calling ctypes callback function: <function _ThreadpoolInfo._find_modules_with_dl_iterate_phdr.<locals>.match_module_callback at 0x7f77c8288f70>
Traceback (most recent call last):
  File "/home/coco/anaconda3/envs/py38/lib/python3.8/site-packages/threadpoolctl.py", line 400, in match_module_callback
    self._make_module_from_path(filepath)
  File "/home/coco/anaconda3/envs/py38/lib/python3.8/site-packages/threadpoolctl.py", line 515, in _make_module_from_path
    module = module_class(filepath, prefix, user_api, internal_api)
  File "/home/coco/anaconda3/envs/py38/lib/python3.8/site-packages/threadpoolctl.py", line 606, in __init__
    self.version = self.get_version()
  File "/home/coco/anaconda3/envs/py38/lib/python3.8/site-packages/threadpoolctl.py", line 646, in get_version
    config = get_config().split()
AttributeError: 'NoneType' object has no attribute 'split'
    0.7884   = Validation score   (accuracy)
    0.08s    = Training   runtime
    0.72s    = Validation runtime
Fitting model: KNeighborsDist ...
Exception ignored on calling ctypes callback function: <function _ThreadpoolInfo._find_modules_with_dl_iterate_phdr.<locals>.match_module_callback at 0x7f77c8288f70>
Traceback (most recent call last):
  File "/home/coco/anaconda3/envs/py38/lib/python3.8/site-packages/threadpoolctl.py", line 400, in match_module_callback
    self._make_module_from_path(filepath)
  File "/home/coco/anaconda3/envs/py38/lib/python3.8/site-packages/threadpoolctl.py", line 515, in _make_module_from_path
    module = module_class(filepath, prefix, user_api, internal_api)
  File "/home/coco/anaconda3/envs/py38/lib/python3.8/site-packages/threadpoolctl.py", line 606, in __init__
    self.version = self.get_version()
  File "/home/coco/anaconda3/envs/py38/lib/python3.8/site-packages/threadpoolctl.py", line 646, in get_version
    config = get_config().split()
AttributeError: 'NoneType' object has no attribute 'split'
    0.7908   = Validation score   (accuracy)
    0.08s    = Training   runtime
    0.53s    = Validation runtime
Fitting model: LightGBMXT ...
    0.8216   = Validation score   (accuracy)
    5.64s    = Training   runtime
    0.02s    = Validation runtime
Fitting model: LightGBM ...
    0.8376   = Validation score   (accuracy)
    35.19s   = Training   runtime
    0.1s     = Validation runtime
Fitting model: RandomForestGini ...
    0.8076   = Validation score   (accuracy)
    13.37s   = Training   runtime
    0.08s    = Validation runtime
Fitting model: RandomForestEntr ...
    0.808    = Validation score   (accuracy)
    17.13s   = Training   runtime
    0.07s    = Validation runtime
Fitting model: CatBoost ...
    0.8228   = Validation score   (accuracy)
    21.58s   = Training   runtime
    0.01s    = Validation runtime
Fitting model: ExtraTreesGini ...
    0.8024   = Validation score   (accuracy)
    2.14s    = Training   runtime
    0.09s    = Validation runtime
Fitting model: ExtraTreesEntr ...
    0.8032   = Validation score   (accuracy)
    2.4s     = Training   runtime
    0.09s    = Validation runtime
Fitting model: NeuralNetFastAI ...
    0.8508   = Validation score   (accuracy)
    33.83s   = Training   runtime
    0.09s    = Validation runtime
Fitting model: XGBoost ...
    0.83     = Validation score   (accuracy)
    16.11s   = Training   runtime
    0.04s    = Validation runtime
Fitting model: NeuralNetTorch ...
    0.8516   = Validation score   (accuracy)
    53.0s    = Training   runtime
    0.02s    = Validation runtime
Fitting model: LightGBMLarge ...
    0.8308   = Validation score   (accuracy)
    29.28s   = Training   runtime
    0.05s    = Validation runtime
Fitting model: WeightedEnsemble_L2 ...
    0.8704   = Validation score   (accuracy)
    0.99s    = Training   runtime
    0.0s     = Validation runtime
AutoGluon training complete, total runtime = 234.95s ... Best model: "WeightedEnsemble_L2"
TabularPredictor saved. To load, use: predictor = TabularPredictor.load("AutogluonModels/ag-20240129_072006/")
[1000]  valid_set's binary_error: 0.1784
[2000]  valid_set's binary_error: 0.1736
[3000]  valid_set's binary_error: 0.1736
[4000]  valid_set's binary_error: 0.168
[5000]  valid_set's binary_error: 0.1628
[6000]  valid_set's binary_error: 0.1644
[1000]  valid_set's binary_error: 0.1732
   accuracy_score  precision_score  recall_score  f1_score  roc_auc_score
0        0.711788         0.777385       0.06079  0.112763       0.526642
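
To see how the individual models behind WeightedEnsemble_L2 compare on the held-out edges, AutoGluon's leaderboard can be scored against the test embeddings (a sketch using the objects defined above; the extra label column is what leaderboard() expects):

test_scored = test_df.copy()
test_scored['label'] = y
predictr.leaderboard(test_scored, silent=True)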