import pandas as pd
import numpy as np
import sklearn
import pickle
import time
import datetime
import warnings
from autogluon.tabular import TabularDataset, TabularPredictor
# Silence all Python warnings for the rest of the session.
warnings.filterwarnings('ignore')
imports
# Load the eight training CSVs and the single shared test CSV.
df_train1 = pd.read_csv('~/Dropbox/Data/df_train1.csv')
df_train2 = pd.read_csv('~/Dropbox/Data/df_train2.csv')
df_train3 = pd.read_csv('~/Dropbox/Data/df_train3.csv')
df_train4 = pd.read_csv('~/Dropbox/Data/df_train4.csv')
df_train5 = pd.read_csv('~/Dropbox/Data/df_train5.csv')
df_train6 = pd.read_csv('~/Dropbox/Data/df_train6.csv')
df_train7 = pd.read_csv('~/Dropbox/Data/df_train7.csv')
df_train8 = pd.read_csv('~/Dropbox/Data/df_train8.csv')
df_test = pd.read_csv('~/Dropbox/Data/df_test.csv')
# Shape and mean of `is_fraud` for the first training set and the test set.
(df_train1.shape, df_train1.is_fraud.mean()), (df_test.shape, df_test.is_fraud.mean())
(((734003, 22), 0.005728859418830713), ((314572, 22), 0.005725239372862174))
# Stack each training set on top of the full test set ...
_df1 = pd.concat([df_train1, df_test])
_df2 = pd.concat([df_train2, df_test])
_df3 = pd.concat([df_train3, df_test])
_df4 = pd.concat([df_train4, df_test])
_df5 = pd.concat([df_train5, df_test])
_df6 = pd.concat([df_train6, df_test])
_df7 = pd.concat([df_train7, df_test])
_df8 = pd.concat([df_train8, df_test])
# ... and record the mean of `is_fraud` over each combined frame.
_df1_mean = _df1.is_fraud.mean()
_df2_mean = _df2.is_fraud.mean()
_df3_mean = _df3.is_fraud.mean()
_df4_mean = _df4.is_fraud.mean()
_df5_mean = _df5.is_fraud.mean()
_df6_mean = _df6.is_fraud.mean()
_df7_mean = _df7.is_fraud.mean()
_df8_mean = _df8.is_fraud.mean()
# `df_80` (test rows with amt <= 80) is otherwise only defined further down in
# the notebook; define it here so this cell runs in top-to-bottom order.
df_80 = df_test[df_test['amt'] <= 80]

# Train on `amt` alone, using the first training set; evaluate on `df_80`.
df_tr = df_train1[["amt", "is_fraud"]]
df_tst = df_80[["amt", "is_fraud"]]
tr = TabularDataset(df_tr)
tst = TabularDataset(df_tst)
predictr = TabularPredictor(label="is_fraud", verbosity=1)
t1 = time.time()  # wall-clock timestamp taken just before fitting
predictr.fit(tr)
No path specified. Models will be saved in: "AutogluonModels/ag-20240520_095957/"
AutoGluon infers your prediction problem is: 'binary' (because only two unique label-values observed).
If 'binary' is not the correct problem_type, please manually specify the problem_type parameter during predictor init (You may specify problem_type as one of: ['binary', 'multiclass', 'regression'])
<autogluon.tabular.predictor.predictor.TabularPredictor at 0x7f8518c9c430>
# Predict labels for `df_tst` with the fitted predictor; as the last
# expression of the cell, the result is what the notebook displays.
predictr.predict(df_tst)
0 0
1 0
2 0
3 0
5 0
..
314538 0
314558 0
314563 0
314566 0
314571 0
Name: is_fraud, Length: 231011, dtype: int64
# Attach the last predict_proba column to the test set as `prob_hat`,
# keep only rows with amt < 150, and compute ROC AUC on that subset.
_df = df_test.assign(
    prob_hat=predictr.predict_proba(df_test).iloc[:, -1]
).query(f"amt <{150}")
sklearn.metrics.roc_auc_score(_df.is_fraud, _df.prob_hat)
0.8655555630724932
def f(thresh, scored=None):
    """Return the ROC AUC over test rows whose ``amt`` is at most ``thresh``.

    Parameters
    ----------
    thresh
        Upper bound (inclusive) on ``amt`` for the rows that are scored.
    scored
        Optional copy of ``df_test`` that already carries a ``prob_hat``
        column. When omitted, ``df_test`` is scored with ``predictr`` on
        every call, which is what the one-argument form has always done.

    Returns
    -------
    float
        ``sklearn.metrics.roc_auc_score`` of ``is_fraud`` against
        ``prob_hat`` on the filtered rows.
    """
    if scored is None:
        scored = df_test.assign(
            prob_hat=predictr.predict_proba(df_test).iloc[:, -1]
        )
    subset = scored[scored.amt <= thresh]
    return sklearn.metrics.roc_auc_score(subset.is_fraud, subset.prob_hat)


import numpy as np

# The predicted probabilities do not depend on the threshold, so score the
# test set once and reuse it for all 1000 thresholds instead of calling
# predict_proba inside every f() call.
_scored = df_test.assign(prob_hat=predictr.predict_proba(df_test).iloc[:, -1])
_df = pd.DataFrame(
    [[th, f(th, _scored)] for th in np.linspace(2, df_test.amt.max(), 1000)]
)
_df
0 | 1 | |
---|---|---|
0 | 2.000000 | 0.192184 |
1 | 29.415536 | 0.788045 |
2 | 56.831071 | 0.831348 |
3 | 84.246607 | 0.872586 |
4 | 111.662142 | 0.875747 |
... | ... | ... |
995 | 27280.457858 | 0.956493 |
996 | 27307.873393 | 0.956493 |
997 | 27335.288929 | 0.956493 |
998 | 27362.704464 | 0.956493 |
999 | 27390.120000 | 0.956493 |
1000 rows × 2 columns
# NOTE(review): `s` is not defined anywhere in the visible source; presumably
# it is a Series derived from the threshold/AUC table above — confirm.
s.plot()
# Metrics at a 0.5 probability cutoff, left disabled by the author.
#sklearn.metrics.recall_score(_df.is_fraud , _df.prob_hat>0.5)
#sklearn.metrics.f1_score(_df.is_fraud , _df.prob_hat>0.5)
#sklearn.metrics.precision_score(_df.is_fraud , _df.prob_hat>0.5)
#sklearn.metrics.accuracy_score(_df.is_fraud , _df.prob_hat>0.5)
def auto_amt_ver0503(df_tr, df_tst, _df_mean):
    """Fit AutoGluon on ``amt`` alone and report test metrics for every model.

    Only the ``amt`` and ``is_fraud`` columns of both frames are used. A
    ``TabularPredictor`` is fitted on the training frame, each trained model
    is evaluated on the test frame, and one result row per model is written
    to ``../results2/<YYYYmmdd-HHMMSS>-Autogluon.csv``.

    Parameters
    ----------
    df_tr
        Training frame; must contain ``amt`` and ``is_fraud``.
    df_tst
        Test frame; must contain ``amt`` and ``is_fraud``.
    _df_mean
        Value copied unchanged into the ``throw_rate`` column of every row.

    Returns
    -------
    pandas.DataFrame
        One row per model with the columns ``model, time, acc, pre, rec, f1,
        auc, graph_based, method, throw_rate, train_size, train_cols,
        train_frate, test_size, test_frate, hyper_params``. ``time`` is the
        model's ``fit_time`` from the AutoGluon leaderboard (``None``/NaN if
        the leaderboard has no entry for that model).
    """
    import os  # local import: only needed to prepare the output directory

    df_tr = df_tr[["amt", "is_fraud"]]
    df_tst = df_tst[["amt", "is_fraud"]]
    tr = TabularDataset(df_tr)
    tst = TabularDataset(df_tst)

    predictr = TabularPredictor(label="is_fraud", verbosity=1)
    predictr.fit(tr)

    # Per-model training time. The earlier version measured one wall-clock
    # duration for the whole fit, discarded it, and stored None for every
    # model; the leaderboard already tracks fit time per model.
    leaderboard = predictr.leaderboard()
    fit_times = dict(zip(leaderboard["model"], leaderboard["fit_time"]))

    # Values that are identical for every model row.
    shared = {
        "graph_based": False,
        "method": "Autogluon",
        "throw_rate": _df_mean,
        "train_size": len(tr),
        "train_cols": [col for col in tr.columns if col != "is_fraud"],
        "train_frate": tr.is_fraud.mean(),
        "test_size": len(tst),
        "test_frate": tst.is_fraud.mean(),
        "hyper_params": None,
    }

    rows = []
    # NOTE: `_trainer.model_graph` is a private AutoGluon attribute; it is
    # kept so the set and order of reported models stay as before.
    for model_name in predictr._trainer.model_graph.nodes:
        # Evaluate this specific model on the test data.
        eval_result = predictr.evaluate(tst, model=model_name)
        rows.append({
            "model": model_name,
            "time": fit_times.get(model_name),
            "acc": eval_result["accuracy"],
            "pre": eval_result["precision"],
            "rec": eval_result["recall"],
            "f1": eval_result["f1"],
            "auc": eval_result["roc_auc"],
            **shared,
        })

    # Explicit column list keeps the CSV layout stable even with zero rows.
    columns = [
        "model", "time", "acc", "pre", "rec", "f1", "auc",
        "graph_based", "method", "throw_rate",
        "train_size", "train_cols", "train_frate",
        "test_size", "test_frate", "hyper_params",
    ]
    df_results = pd.DataFrame(rows, columns=columns)

    ymdhms = datetime.datetime.now().strftime('%Y%m%d-%H%M%S')
    # Create the output directory up front so a finished (possibly long)
    # training run is not lost to a missing-directory error on save.
    os.makedirs('../results2', exist_ok=True)
    df_results.to_csv(f'../results2/{ymdhms}-Autogluon.csv', index=False)
    return df_results
amt 80 미만 잘 잡는지 확인용..
# Subset of the test set with amt <= 80, followed by its shape and the mean
# of `is_fraud` on that subset.
df_80 = df_test[df_test['amt'] <= 80]
df_80.shape, df_80.is_fraud.mean()
((231011, 22), 0.0016665873053664112)
# Stack each training set on top of the amt <= 80 test subset ...
_df1_ = pd.concat([df_train1, df_80])
_df2_ = pd.concat([df_train2, df_80])
_df3_ = pd.concat([df_train3, df_80])
_df4_ = pd.concat([df_train4, df_80])
_df5_ = pd.concat([df_train5, df_80])
_df6_ = pd.concat([df_train6, df_80])
_df7_ = pd.concat([df_train7, df_80])
_df8_ = pd.concat([df_train8, df_80])
# ... and record the mean of `is_fraud` over each combined frame.
_df1_mean_ = _df1_.is_fraud.mean()
_df2_mean_ = _df2_.is_fraud.mean()
_df3_mean_ = _df3_.is_fraud.mean()
_df4_mean_ = _df4_.is_fraud.mean()
_df5_mean_ = _df5_.is_fraud.mean()
_df6_mean_ = _df6_.is_fraud.mean()
_df7_mean_ = _df7_.is_fraud.mean()
_df8_mean_ = _df8_.is_fraud.mean()