[Proposed] Split

Author

김보람

Published

April 2, 2024

imports

import pandas as pd
import numpy as np
import sklearn
import pickle 
import time 
import datetime
import warnings
warnings.filterwarnings('ignore')
# Execute the shared helper script inside this notebook's namespace.
# NOTE(review): presumably this is where throw, concat, compute_time_difference,
# GCN1, train_and_evaluate_model, evaluation, torch and torch_geometric come
# from, since none of them is imported or defined in this notebook -- confirm.
%run ../function_proposed_gcn.py
# Load the pickled dataset; it is used below like a pandas DataFrame
# (sliced with [::n], accessed through .index).
# NOTE: pickle.load can execute arbitrary code, so only open trusted files.
with open('../fraudTrain.pkl', 'rb') as file:
    fraudTrain = pickle.load(file)    
def try_7(fraudTrain, ratio, n, theta, gamma, prev_results=None):
    """Train and evaluate the proposed GCN on index-disjoint train/test rows.

    Variant of ``try_4`` changed so that the training rows and the test rows
    do not share original indices.

    Steps:
      1. Build the training set from a random split of
         ``throw(fraudTrain, ratio)``.
      2. Build the test set from every ``n``-th row of ``fraudTrain``, after
         dropping rows whose index already appears in the training set.
      3. Connect transactions of the same ``cc_num`` with weight
         ``exp(-dt / theta)`` and keep edges whose weight exceeds ``gamma``.
      4. Train ``GCN1`` on the single feature ``amt`` and score the test mask.

    Parameters
    ----------
    fraudTrain : pandas.DataFrame
        Transaction table. The columns ``cc_num``, ``amt`` and ``is_fraud``
        are read here; the helpers may require more.
    ratio : float
        Forwarded to ``throw``. Presumably the target fraud rate of the
        down-sampled frame -- confirm in ``function_proposed_gcn.py``.
    n : int
        Stride used to subsample ``fraudTrain`` for the test pool.
    theta : float
        Scale of the exponential kernel applied to the third column of the
        rows returned by ``compute_time_difference``.
    gamma : float
        Weight threshold; only edges with weight strictly greater are kept.
    prev_results : pandas.DataFrame, optional
        Results of earlier runs. When given, the new row is appended to a
        copy of it; the argument itself is not modified.

    Returns
    -------
    pandas.DataFrame
        ``prev_results`` (if any) followed by one row describing this run.
    """
    # Imported explicitly: a bare ``import sklearn`` does not guarantee that
    # the ``model_selection`` submodule has been loaded.
    from sklearn.model_selection import train_test_split

    # Training rows: random split of the down-sampled frame.
    df50 = throw(fraudTrain, ratio)
    df_tr, _ = train_test_split(df50)

    # Test rows: every n-th transaction, excluding anything used for training.
    dfn = fraudTrain[::n]
    dfnn = dfn[~dfn.index.isin(df_tr.index)].reset_index(drop=True)
    _, df_tstn = train_test_split(dfnn)

    # ``concat`` returns the combined frame and a (train_mask, test_mask) pair.
    df2, mask = concat(df_tr, df_tstn)
    df2['index'] = df2.index
    df = df2.reset_index()

    # Candidate edges per card; rows are used as (source, target, time diff).
    groups = df.groupby('cc_num')
    edge_index = np.array(
        [item for _, group in groups for item in compute_time_difference(group)]
    ).astype(np.float64)

    # Exponential kernel weight; a weight of exactly 1 (zero time difference)
    # is zeroed so it can never pass the ``gamma`` threshold.
    weights = np.exp(-edge_index[:, 2] / theta)
    edge_index[:, 2] = np.where(weights != 1, weights, 0.0)

    # Keep strong edges only. The vectorised filter yields a (2, 0) tensor
    # when nothing survives, instead of a 1-D empty tensor.
    kept = edge_index[edge_index[:, 2] > gamma, :2].astype(np.int64)
    edge_index = torch.tensor(kept, dtype=torch.long).t()

    x = torch.tensor(df['amt'].values, dtype=torch.float).reshape(-1, 1)
    y = torch.tensor(df['is_fraud'].values, dtype=torch.int64)
    data = torch_geometric.data.Data(
        x=x, edge_index=edge_index, y=y,
        train_mask=mask[0], test_mask=mask[1],
    )

    model = GCN1()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)
    yy = (data.y[data.test_mask]).numpy()
    yyhat, yyhat_ = train_and_evaluate_model(data, model, optimizer)
    yyhat_ = yyhat_.detach().numpy()
    scores = evaluation(yy, yyhat, yyhat_)  # renamed: ``eval`` shadows the builtin

    result = {
        'model': 'GCN',
        'time': None,  # elapsed time is not measured in this variant
        'acc': scores['acc'],
        'pre': scores['pre'],
        'rec': scores['rec'],
        'f1': scores['f1'],
        'auc': scores['auc'],
        'graph_based': True,
        'method': 'Proposed',
        'throw_rate': df.is_fraud.mean(),  # fraud rate of the combined frame
        'train_size': len(df_tr),
        'train_cols': 'amt',
        'train_frate': df_tr.is_fraud.mean(),
        'test_size': len(df_tstn),
        'test_frate': df_tstn.is_fraud.mean(),
        'hyper_params': None,
        'theta': theta,
        'gamma': gamma,
    }

    # ``DataFrame.append`` was removed in pandas 2.0; build a one-row frame
    # and concatenate instead. The dict keys give the column order.
    new_row = pd.DataFrame([result])
    if prev_results is None:
        return new_row
    return pd.concat([prev_results, new_row], ignore_index=True)
# Run a single experiment with the chosen sampling and graph hyper-parameters.
df_results = try_7(fraudTrain, ratio=0.2, n=10, theta=1e+7, gamma=0.8)

# Timestamp (local time) used to give the results file a unique name.
ymdhms = datetime.datetime.now().strftime('%Y%m%d-%H%M%S')
df_results.to_csv(f'../results/{ymdhms}-proposed.csv', index=False)

# Bare expression so the notebook displays the results table.
df_results
model time acc pre rec f1 auc graph_based method throw_rate train_size train_cols train_frate test_size test_frate hyper_params theta gamma
0 GCN None 0.956907 0.030035 0.829268 0.057971 0.972998 True Proposed 0.093805 22522 amt 0.198783 25642 0.001599 None 10000000.0 0.8