import pandas as pd
import numpy as np
import sklearn
import pickle
import time
import datetime
import warnings
from autogluon.tabular import TabularDataset, TabularPredictor
warnings.filterwarnings('ignore')

# imports
# Load the eight training splits and the single test split from CSV.
df_train1 = pd.read_csv('~/Dropbox/Data/df_train1.csv')
df_train2 = pd.read_csv('~/Dropbox/Data/df_train2.csv')
df_train3 = pd.read_csv('~/Dropbox/Data/df_train3.csv')
df_train4 = pd.read_csv('~/Dropbox/Data/df_train4.csv')
df_train5 = pd.read_csv('~/Dropbox/Data/df_train5.csv')
df_train6 = pd.read_csv('~/Dropbox/Data/df_train6.csv')
df_train7 = pd.read_csv('~/Dropbox/Data/df_train7.csv')
df_train8 = pd.read_csv('~/Dropbox/Data/df_train8.csv')
df_test = pd.read_csv('~/Dropbox/Data/df_test.csv')

# Shape and fraud rate of the first training split and of the test split.
(df_train1.shape, df_train1.is_fraud.mean()), (df_test.shape, df_test.is_fraud.mean())
# Recorded output:
# (((734003, 22), 0.005728859418830713), ((314572, 22), 0.005725239372862174))
# Stack each training split on top of the test split.
_df1 = pd.concat([df_train1, df_test])
_df2 = pd.concat([df_train2, df_test])
_df3 = pd.concat([df_train3, df_test])
_df4 = pd.concat([df_train4, df_test])
_df5 = pd.concat([df_train5, df_test])
_df6 = pd.concat([df_train6, df_test])
_df7 = pd.concat([df_train7, df_test])
_df8 = pd.concat([df_train8, df_test])

# Fraud rate of each combined train+test frame.
# NOTE(review): presumably these are the values passed as `_df_mean` to
# auto_amt_ver0503 (stored there as `throw_rate`) — confirm against callers.
_df1_mean = _df1.is_fraud.mean()
_df2_mean = _df2.is_fraud.mean()
_df3_mean = _df3.is_fraud.mean()
_df4_mean = _df4.is_fraud.mean()
_df5_mean = _df5.is_fraud.mean()
_df6_mean = _df6.is_fraud.mean()
_df7_mean = _df7.is_fraud.mean()
_df8_mean = _df8.is_fraud.mean()

# Exploratory training frame: the transaction amount and the label only.
df_tr = df_train1[["amt","is_fraud"]]
# Exploratory test frame: test transactions with amt <= 80, amount and label
# only.  The subset is filtered from df_test right here because `df_80` is
# assigned only further down in this file, so referencing it at this point
# fails with a NameError when the file is run top to bottom.
df_tst = df_test.loc[df_test['amt'] <= 80, ["amt","is_fraud"]]
tr = TabularDataset(df_tr)
tst = TabularDataset(df_tst)
predictr = TabularPredictor(label="is_fraud", verbosity=1)
t1 = time.time()
predictr.fit(tr)
# Recorded output:
# No path specified. Models will be saved in: "AutogluonModels/ag-20240520_095957/"
# AutoGluon infers your prediction problem is: 'binary' (because only two unique label-values observed).
# If 'binary' is not the correct problem_type, please manually specify the problem_type parameter during predictor init (You may specify problem_type as one of: ['binary', 'multiclass', 'regression'])
# <autogluon.tabular.predictor.predictor.TabularPredictor at 0x7f8518c9c430>

# Hard class predictions on the amt <= 80 subset.
predictr.predict(df_tst)
# Recorded output:
# 0 0
# 1 0
# 2 0
# 3 0
# 5 0
# ..
# 314538 0
# 314558 0
# 314563 0
# 314566 0
# 314571 0
# Name: is_fraud, Length: 231011, dtype: int64

# ROC AUC restricted to test transactions with amt < 150.  The score is the
# last column returned by predict_proba.
_df = df_test.assign(
    prob_hat = predictr.predict_proba(df_test).iloc[:,-1]
).query(
    f"amt <{150}"
)
sklearn.metrics.roc_auc_score(_df.is_fraud , _df.prob_hat)
# Recorded output: 0.8655555630724932
def f(thresh, scored=None):
    """Return the ROC AUC on test transactions with ``amt <= thresh``.

    Args:
        thresh: Upper bound (inclusive) on ``amt`` for the rows that are kept.
        scored: Optional copy of ``df_test`` that already carries a
            ``prob_hat`` column.  When omitted, ``df_test`` is scored with
            ``predictr.predict_proba`` on every call, as before.  Passing a
            pre-scored frame avoids repeating that prediction when many
            thresholds are evaluated.

    Returns:
        The value of ``sklearn.metrics.roc_auc_score`` for ``is_fraud``
        against ``prob_hat`` on the filtered rows.
    """
    if scored is None:
        scored = df_test.assign(
            prob_hat = predictr.predict_proba(df_test).iloc[:,-1]
        )
    subset = scored[scored.amt <= thresh]
    return sklearn.metrics.roc_auc_score(subset.is_fraud , subset.prob_hat)


import numpy as np

# The predictions do not depend on the threshold, so score the test set once
# and reuse the result for all 1000 thresholds.
_scored = df_test.assign(
    prob_hat = predictr.predict_proba(df_test).iloc[:,-1]
)
# Column 0 holds the threshold, column 1 the AUC at that threshold.
_df = pd.DataFrame([[th,f(th, _scored)] for th in np.linspace(2,df_test.amt.max(),1000)])
_df
# Recorded output:
# |     | 0            | 1        |
# |-----|--------------|----------|
# | 0   | 2.000000     | 0.192184 |
# | 1   | 29.415536    | 0.788045 |
# | 2   | 56.831071    | 0.831348 |
# | 3   | 84.246607    | 0.872586 |
# | 4   | 111.662142   | 0.875747 |
# | ... | ...          | ...      |
# | 995 | 27280.457858 | 0.956493 |
# | 996 | 27307.873393 | 0.956493 |
# | 997 | 27335.288929 | 0.956493 |
# | 998 | 27362.704464 | 0.956493 |
# | 999 | 27390.120000 | 0.956493 |
# 1000 rows × 2 columns
# NOTE(review): `s` is not assigned anywhere in the visible source —
# presumably a Series derived from the threshold sweep; confirm before running.
# The commented-out calls are threshold-0.5 metrics on `_df`.
s.plot()#sklearn.metrics.recall_score(_df.is_fraud , _df.prob_hat>0.5)
#sklearn.metrics.f1_score(_df.is_fraud , _df.prob_hat>0.5)
#sklearn.metrics.precision_score(_df.is_fraud , _df.prob_hat>0.5)
#sklearn.metrics.accuracy_score(_df.is_fraud , _df.prob_hat>0.5)


def auto_amt_ver0503(df_tr, df_tst, _df_mean):
    """Fit AutoGluon on ``amt`` alone and evaluate every trained model.

    Both frames are reduced to the ``amt`` and ``is_fraud`` columns, a
    ``TabularPredictor`` is fitted on the training frame, and every model in
    the trainer's model graph is evaluated on the test frame.  One result row
    per model is collected, written to a timestamped CSV under
    ``../results2/`` and returned.

    Args:
        df_tr: Training frame; must contain ``amt`` and ``is_fraud``.
        df_tst: Test frame; must contain ``amt`` and ``is_fraud``.
        _df_mean: Value stored unchanged in the ``throw_rate`` column of
            every row.

    Returns:
        A ``pandas.DataFrame`` with one row per model and the columns
        ``model, time, acc, pre, rec, f1, auc, graph_based, method,
        throw_rate, train_size, train_cols, train_frate, test_size,
        test_frate, hyper_params``.
    """
    # Fixed column order so the CSV layout stays the same even with no rows.
    columns = [
        'model', 'time', 'acc', 'pre', 'rec', 'f1', 'auc',
        'graph_based', 'method', 'throw_rate',
        'train_size', 'train_cols', 'train_frate',
        'test_size', 'test_frate', 'hyper_params',
    ]

    df_tr = df_tr[["amt","is_fraud"]]
    df_tst = df_tst[["amt","is_fraud"]]
    tr = TabularDataset(df_tr)
    tst = TabularDataset(df_tst)
    predictr = TabularPredictor(label="is_fraud", verbosity=1)
    predictr.fit(tr)

    # Per-model training time.  The earlier version measured the total fit
    # time and then overwrote it with a list of None; the leaderboard already
    # reports a `fit_time` for each model.
    fit_times = predictr.leaderboard().set_index('model')['fit_time'].to_dict()

    # Values that are identical for every row are computed once.
    feature_cols = [col for col in tr.columns if col != 'is_fraud']
    train_size = len(tr)
    train_frate = tr.is_fraud.mean()
    test_size = len(tst)
    test_frate = tst.is_fraud.mean()

    rows = []
    # NOTE: `_trainer` is a private attribute of TabularPredictor; it is kept
    # so the set and order of evaluated models stay as before.
    for model_name in predictr._trainer.model_graph.nodes:
        # Evaluate this model on the test frame.
        eval_result = predictr.evaluate(tst, model=model_name)
        rows.append({
            'model': model_name,
            'time': fit_times.get(model_name),
            'acc': eval_result['accuracy'],
            'pre': eval_result['precision'],
            'rec': eval_result['recall'],
            'f1': eval_result['f1'],
            'auc': eval_result['roc_auc'],
            'graph_based': False,
            'method': 'Autogluon',
            'throw_rate': _df_mean,
            'train_size': train_size,
            'train_cols': list(feature_cols),
            'train_frate': train_frate,
            'test_size': test_size,
            'test_frate': test_frate,
            'hyper_params': None,
        })

    df_results = pd.DataFrame(rows, columns=columns)
    ymdhms = datetime.datetime.fromtimestamp(time.time()).strftime('%Y%m%d-%H%M%S')
    df_results.to_csv(f'../results2/{ymdhms}-Autogluon.csv',index=False)
    return df_results


# For checking whether transactions with amt under 80 are caught well.
# Test transactions whose amt is at most 80.
df_80 = df_test[df_test['amt'] <= 80]

# Shape and fraud rate of that subset.
df_80.shape, df_80.is_fraud.mean()
# Recorded output: ((231011, 22), 0.0016665873053664112)

# Stack each training split on top of the amt <= 80 test subset.
_df1_ = pd.concat([df_train1, df_80])
_df2_ = pd.concat([df_train2, df_80])
_df3_ = pd.concat([df_train3, df_80])
_df4_ = pd.concat([df_train4, df_80])
_df5_ = pd.concat([df_train5, df_80])
_df6_ = pd.concat([df_train6, df_80])
_df7_ = pd.concat([df_train7, df_80])
_df8_ = pd.concat([df_train8, df_80])

# Fraud rate of each combined frame.
_df1_mean_ = _df1_.is_fraud.mean()
_df2_mean_ = _df2_.is_fraud.mean()
_df3_mean_ = _df3_.is_fraud.mean()
_df4_mean_ = _df4_.is_fraud.mean()
_df5_mean_ = _df5_.is_fraud.mean()
_df6_mean_ = _df6_.is_fraud.mean()
_df7_mean_ = _df7_.is_fraud.mean()
_df8_mean_ = _df8_.is_fraud.mean()