import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Load the raw transaction table from the training CSV.
_df = pd.read_csv("fraudTrain.csv")

# Card numbers that appear in at least one row flagged is_fraud == 1.
cus_list = set(_df.query('is_fraud==1').cc_num.tolist())

# Keep every transaction (fraudulent or not) whose card number is in that set.
_df2 = _df.query("cc_num in @ cus_list")

# Add a `time` column: the integer found before the first ':' in the last
# space-separated token of each timestamp string.
# NOTE(review): presumably this is the hour of a 'YYYY-MM-DD HH:MM:SS' style
# timestamp -- confirm against the CSV.
_df2 = _df2.assign(
    time=[
        int(stamp.split(' ')[-1].split(':')[0])
        for stamp in _df2['trans_date_trans_time']
    ]
)
(651430, 24)
Index(['Unnamed: 0', 'trans_date_trans_time', 'cc_num', 'merchant', 'category',
'amt', 'first', 'last', 'gender', 'street', 'city', 'state', 'zip',
'lat', 'long', 'city_pop', 'job', 'dob', 'trans_num', 'unix_time',
'merch_lat', 'merch_long', 'is_fraud', 'time'],
<class 'pandas.core.frame.DataFrame'>
Int64Index: 651430 entries, 3 to 1048574
Data columns (total 24 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Unnamed: 0 651430 non-null int64
1 trans_date_trans_time 651430 non-null object
2 cc_num 651430 non-null float64
3 merchant 651430 non-null object
4 category 651430 non-null object
5 amt 651430 non-null float64
6 first 651430 non-null object
7 last 651430 non-null object
8 gender 651430 non-null object
9 street 651430 non-null object
10 city 651430 non-null object
11 state 651430 non-null object
12 zip 651430 non-null int64
13 lat 651430 non-null float64
14 long 651430 non-null float64
15 city_pop 651430 non-null int64
16 job 651430 non-null object
17 dob 651430 non-null object
18 trans_num 651430 non-null object
19 unix_time 651430 non-null int64
20 merch_lat 651430 non-null float64
21 merch_long 651430 non-null float64
22 is_fraud 651430 non-null int64
23 time 651430 non-null int64
dtypes: float64(6), int64(6), object(12)
memory usage: 124.3+ MB
merch_lat과 merch_long은 상점의 위도와 경도이다. 위의 lat과 long은 고객의 위도와 경도인 것으로 보인다(확인 필요).
dob는 생년월일(date of birth)을 나타내는 변수이다.
unix_time은 1970년 1월 1일 0시 0분 0초(UTC)부터 경과된 시간을 초(second) 단위로 표현한 값이다.
# Count the rows in each is_fraud class of the filtered frame.
_df2["is_fraud"].value_counts()
0 645424
1 6006
Name: is_fraud, dtype: int64
# Share of rows in each is_fraud class: class counts divided by the total row count.
_df2["is_fraud"].value_counts() / len(_df2)
0 0.99078
1 0.00922
Name: is_fraud, dtype: float64
# Per-class means of city_pop, amt and time, grouped by the is_fraud label.
# The aggregations are named with the string 'mean' rather than np.mean:
# recent pandas versions warn when a NumPy callable is passed to agg and
# recommend the string form, which yields the same result.
_df2.groupby(by=['is_fraud']).agg(
    {'city_pop': 'mean', 'amt': 'mean', 'time': 'mean'}
)
city_pop | amt | time | |
is_fraud | |||
0 | 83870.443845 | 67.743047 | 12.813152 |
1 | 96323.951715 | 530.573492 | 13.915917 |
# Narrow the frame to three numeric feature columns plus the is_fraud label.
# The label is deliberately kept as the last column.
_df3 = _df2[['amt', 'time', 'city_pop', 'is_fraud']]
<class 'pandas.core.frame.DataFrame'>
Int64Index: 651430 entries, 3 to 1048574
Data columns (total 4 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 amt 651430 non-null float64
1 time 651430 non-null int64
2 city_pop 651430 non-null int64
3 is_fraud 651430 non-null int64
dtypes: float64(1), int64(3)
memory usage: 24.9 MB
# Convert the selected columns to a 2-D NumPy array.
# np.hstack receives a single-element list here, so no columns are added:
# the result has the same shape and values as _df3.values.
data = np.hstack([_df3.values[:, :]])
array([[4.50000e+01, 0.00000e+00, 1.93900e+03, 0.00000e+00],
[9.46300e+01, 0.00000e+00, 2.15800e+03, 0.00000e+00],
[4.45400e+01, 0.00000e+00, 2.69100e+03, 0.00000e+00],
[6.03000e+00, 1.60000e+01, 5.20000e+02, 0.00000e+00],
[1.16940e+02, 1.60000e+01, 1.58300e+03, 0.00000e+00],
[6.81000e+00, 1.60000e+01, 1.65556e+05, 0.00000e+00]])
# Features are every column except the last; the target is the last column.
X = data[:, :-1]
y = data[:, -1]
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
# Hold out 20% of the rows for evaluation. No random_state is passed, so the
# split (and every metric computed from it) differs from run to run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Fit a logistic regression with default settings on the training portion.
lr = LogisticRegression()
lr.fit(X_train, y_train)

# Predicted class labels for the held-out rows.
y_pred = lr.predict(X_test)
# Evaluate the held-out predictions. average='weighted' averages the
# per-class scores weighted by each class's number of true instances.
acc = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1score = f1_score(y_test, y_pred, average='weighted')

print("Accuracy: {}".format(acc))
print("Precision: {}".format(precision))
print("Recall: {}".format(recall))
print("F1 score: {}".format(f1score))
Accuracy: 0.9902215126721213
Precision: 0.9831134944972946
Recall: 0.9902215126721213
F1 score: 0.9866547019260462
# Re-evaluate the same predictions with average='macro', the unweighted mean
# of the per-class scores, so each class counts equally regardless of size.
acc = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='macro')
recall = recall_score(y_test, y_pred, average='macro')
f1score = f1_score(y_test, y_pred, average='macro')

# Precision is computed above but, as in the original cell, is not printed.
print("Accuracy: {}".format(acc))
print("Recall: {}".format(recall))
print("F1 score: {}".format(f1score))
Accuracy: 0.9902215126721213
Recall: 0.49934201359322505
F1 score: 0.49754336709114605
f1 score가 엄청 커졌다. 이유가 뭘까? 처음에 city_pop에 대한 걸 생각했을때는 사기거래=0과 사기거래=1의 큰 차이가 없어보였는데 갑자기…