[Autogluon] df_train1, df_test

Author

김보람

Published

May 3, 2024

imports

import datetime
import pickle
import time
import warnings

import numpy as np
import pandas as pd
import sklearn
import sklearn.metrics  # explicit: bare `import sklearn` does not guarantee the metrics submodule
from autogluon.tabular import TabularDataset, TabularPredictor

warnings.filterwarnings('ignore')

# Load the eight training splits and the shared test split from Dropbox.
_DATA_DIR = '~/Dropbox/Data'
(df_train1, df_train2, df_train3, df_train4,
 df_train5, df_train6, df_train7, df_train8) = (
    pd.read_csv(f'{_DATA_DIR}/df_train{i}.csv') for i in range(1, 9)
)
df_test = pd.read_csv(f'{_DATA_DIR}/df_test.csv')

# Shape and fraud rate of the primary train split vs. the test split.
(df_train1.shape, df_train1.is_fraud.mean()), (df_test.shape, df_test.is_fraud.mean())
(((734003, 22), 0.005728859418830713), ((314572, 22), 0.005725239372862174))
# Each training split stacked with the full test frame, plus the overall
# fraud rate of each combined frame.
_train_splits = (df_train1, df_train2, df_train3, df_train4,
                 df_train5, df_train6, df_train7, df_train8)
_df1, _df2, _df3, _df4, _df5, _df6, _df7, _df8 = (
    pd.concat([split, df_test]) for split in _train_splits
)
(_df1_mean, _df2_mean, _df3_mean, _df4_mean,
 _df5_mean, _df6_mean, _df7_mean, _df8_mean) = (
    frame.is_fraud.mean()
    for frame in (_df1, _df2, _df3, _df4, _df5, _df6, _df7, _df8)
)
# Train an AutoGluon binary classifier on `amt` alone; evaluate against the
# small-amount slice of the test set.
df_tr = df_train1[["amt","is_fraud"]]
# BUG FIX: this originally read `df_80[...]`, but `df_80` is only defined
# much further down the file, so running top-to-bottom raised NameError.
# Build the same amt <= 80 subset of df_test inline instead.
df_tst = df_test[df_test['amt'] <= 80][["amt","is_fraud"]]
tr = TabularDataset(df_tr)
tst = TabularDataset(df_tst)
predictr = TabularPredictor(label="is_fraud", verbosity=1)
t1 = time.time()
predictr.fit(tr)
No path specified. Models will be saved in: "AutogluonModels/ag-20240520_095957/"
AutoGluon infers your prediction problem is: 'binary' (because only two unique label-values observed).
    If 'binary' is not the correct problem_type, please manually specify the problem_type parameter during predictor init (You may specify problem_type as one of: ['binary', 'multiclass', 'regression'])
<autogluon.tabular.predictor.predictor.TabularPredictor at 0x7f8518c9c430>
# Hard 0/1 class labels for df_tst (AutoGluon inferred a binary problem
# above); returned as an int Series indexed like df_tst.
predictr.predict(df_tst)
0         0
1         0
2         0
3         0
5         0
         ..
314538    0
314558    0
314563    0
314566    0
314571    0
Name: is_fraud, Length: 231011, dtype: int64
# Score the whole test set once, restrict to small transactions
# (amt < 150), then compute the AUC on that slice.
_scored = df_test.assign(prob_hat=predictr.predict_proba(df_test).iloc[:, -1])
_df = _scored[_scored.amt < 150]
sklearn.metrics.roc_auc_score(_df.is_fraud, _df.prob_hat)
0.8655555630724932
def f(thresh):
    """Return the ROC-AUC over test rows with amt <= thresh.

    PERF FIX: the original recomputed ``predictr.predict_proba(df_test)``
    on every call, even though the scores never change — the 1000-point
    threshold sweep below paid the full scoring cost 1000 times.  Cache
    the scored frame on the function object so scoring happens once.
    """
    if not hasattr(f, "_scored"):
        f._scored = df_test.assign(
            prob_hat=predictr.predict_proba(df_test).iloc[:, -1]
        )
    _df = f._scored[f._scored.amt <= thresh]
    return sklearn.metrics.roc_auc_score(_df.is_fraud, _df.prob_hat)
import numpy as np  # NOTE(review): redundant — numpy is already imported at the top; harmless
# Sweep 1000 thresholds from amt=2 up to the largest test amount, recording
# the AUC restricted to transactions at or below each threshold.
_thresholds = np.linspace(2, df_test.amt.max(), 1000)
_df = pd.DataFrame([[th, f(th)] for th in _thresholds])
_df
0 1
0 2.000000 0.192184
1 29.415536 0.788045
2 56.831071 0.831348
3 84.246607 0.872586
4 111.662142 0.875747
... ... ...
995 27280.457858 0.956493
996 27307.873393 0.956493
997 27335.288929 0.956493
998 27362.704464 0.956493
999 27390.120000 0.956493

1000 rows × 2 columns

# BUG FIX: `s` was never defined anywhere in this file, so `s.plot()`
# raised NameError.  Plot the threshold-vs-AUC sweep computed above:
# column 0 is the amt threshold, column 1 the AUC at that threshold.
_df.plot(x=0, y=1)
#sklearn.metrics.recall_score(_df.is_fraud , _df.prob_hat>0.5)

#sklearn.metrics.f1_score(_df.is_fraud , _df.prob_hat>0.5)

#sklearn.metrics.precision_score(_df.is_fraud , _df.prob_hat>0.5)

#sklearn.metrics.accuracy_score(_df.is_fraud , _df.prob_hat>0.5)
def auto_amt_ver0503(df_tr, df_tst, _df_mean):
    """Fit an AutoGluon predictor on `amt` alone and score every trained model.

    Parameters
    ----------
    df_tr : pd.DataFrame
        Training frame; only the ``amt`` and ``is_fraud`` columns are used.
    df_tst : pd.DataFrame
        Evaluation frame; only the ``amt`` and ``is_fraud`` columns are used.
    _df_mean : float
        Fraud rate of the combined frame; recorded in the ``throw_rate`` column.

    Returns
    -------
    pd.DataFrame
        One row per trained model (accuracy/precision/recall/f1/auc plus
        bookkeeping columns).  Also written to
        ``../results2/<timestamp>-Autogluon.csv``.
    """
    df_tr = df_tr[["amt","is_fraud"]]
    df_tst = df_tst[["amt","is_fraud"]]
    tr = TabularDataset(df_tr)
    tst = TabularDataset(df_tst)
    predictr = TabularPredictor(label="is_fraud", verbosity=1)

    t1 = time.time()
    predictr.fit(tr)
    # Total fit wall time; AutoGluon does not expose per-model train time here.
    fit_time = time.time() - t1

    # NOTE(review): ``_trainer`` is a private AutoGluon API; the public
    # ``predictr.leaderboard()`` lists models too — confirm before upgrading.
    model_names = predictr._trainer.model_graph.nodes
    feature_cols = [col for col in tr.columns if col != 'is_fraud']

    # BUG FIX: the original computed the fit time, then clobbered it with an
    # empty list and appended None for every model, discarding the
    # measurement.  It also rebuilt 15 parallel lists in a second loop; build
    # the result rows directly instead.
    rows = []
    for model_name in model_names:
        eval_result = predictr.evaluate(tst, model=model_name)
        rows.append({
            'model': model_name,
            'time': fit_time,
            'acc': eval_result['accuracy'],
            'pre': eval_result['precision'],
            'rec': eval_result['recall'],
            'f1': eval_result['f1'],
            'auc': eval_result['roc_auc'],
            'graph_based': False,
            'method': 'Autogluon',
            'throw_rate': _df_mean,
            'train_size': len(tr),
            'train_cols': feature_cols,
            'train_frate': tr.is_fraud.mean(),
            'test_size': len(tst),
            'test_frate': tst.is_fraud.mean(),
            'hyper_params': None,
        })

    df_results = pd.DataFrame(rows)
    ymdhms = datetime.datetime.fromtimestamp(time.time()).strftime('%Y%m%d-%H%M%S')
    df_results.to_csv(f'../results2/{ymdhms}-Autogluon.csv',index=False)
    return df_results

Sanity check: does the model detect fraud well among transactions with amt at or below 80?

# Test subset restricted to small transactions (amt <= 80), with its shape
# and fraud rate.
df_80 = df_test.query("amt <= 80")
df_80.shape, df_80.is_fraud.mean()
((231011, 22), 0.0016665873053664112)
# Each training split stacked with the small-amount test subset, plus the
# overall fraud rate of each combined frame.
_train_splits_80 = (df_train1, df_train2, df_train3, df_train4,
                    df_train5, df_train6, df_train7, df_train8)
_df1_, _df2_, _df3_, _df4_, _df5_, _df6_, _df7_, _df8_ = (
    pd.concat([split, df_80]) for split in _train_splits_80
)
(_df1_mean_, _df2_mean_, _df3_mean_, _df4_mean_,
 _df5_mean_, _df6_mean_, _df7_mean_, _df8_mean_) = (
    frame.is_fraud.mean()
    for frame in (_df1_, _df2_, _df3_, _df4_, _df5_, _df6_, _df7_, _df8_)
)