import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Load the raw transaction table.
_df = pd.read_csv("fraudTrain.csv")

# Card numbers that appear in at least one fraudulent transaction.
cus_list = set(_df.query('is_fraud==1').cc_num.tolist())

# Keep every transaction (fraudulent or not) made with one of those cards.
_df2 = _df.query("cc_num in @ cus_list")

# Add a `time` column: the integer before the first ':' in the text that
# follows the last space of 'trans_date_trans_time'
# (presumably the hour of day; confirm against the raw timestamp format).
_df2 = _df2.assign(time= list(map(lambda x: int(x.split(' ')[-1].split(':')[0]), _df2['trans_date_trans_time'])))

_df2.shape
(651430, 24)
# Show the column labels of the filtered frame, including the added `time`.
_df2.columns
Index(['Unnamed: 0', 'trans_date_trans_time', 'cc_num', 'merchant', 'category',
'amt', 'first', 'last', 'gender', 'street', 'city', 'state', 'zip',
'lat', 'long', 'city_pop', 'job', 'dob', 'trans_num', 'unix_time',
'merch_lat', 'merch_long', 'is_fraud', 'time'],
dtype='object')
# Print per-column non-null counts and dtypes for the filtered frame.
_df2.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 651430 entries, 3 to 1048574
Data columns (total 24 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Unnamed: 0 651430 non-null int64
1 trans_date_trans_time 651430 non-null object
2 cc_num 651430 non-null float64
3 merchant 651430 non-null object
4 category 651430 non-null object
5 amt 651430 non-null float64
6 first 651430 non-null object
7 last 651430 non-null object
8 gender 651430 non-null object
9 street 651430 non-null object
10 city 651430 non-null object
11 state 651430 non-null object
12 zip 651430 non-null int64
13 lat 651430 non-null float64
14 long 651430 non-null float64
15 city_pop 651430 non-null int64
16 job 651430 non-null object
17 dob 651430 non-null object
18 trans_num 651430 non-null object
19 unix_time 651430 non-null int64
20 merch_lat 651430 non-null float64
21 merch_long 651430 non-null float64
22 is_fraud 651430 non-null int64
23 time 651430 non-null int64
dtypes: float64(6), int64(6), object(12)
memory usage: 124.3+ MB
- `merch_lat`, `merch_long`: 상점(가맹점)의 위도와 경도. 위의 `lat`, `long`은 고객(카드 소지자) 쪽의 위도와 경도로 보인다(확인 필요).
- `dob`: 생년월일(date of birth)을 나타내는 변수.
- `unix_time`: 1970년 1월 1일 0시 0분 0초(UTC)부터 경과한 시간을 초(second) 단위로 나타낸 값.
- `zip`: 우편번호.
# Count the distinct values in the 'city' column.
len(set(_df2['city']))
576
# Number of rows per class label (0 / 1) in the filtered frame.
_df2["is_fraud"].value_counts()
0 645424
1 6006
Name: is_fraud, dtype: int64
# Share of rows per class label: class counts divided by the total row count.
_df2["is_fraud"].value_counts()/len(_df2)
0 0.99078
1 0.00922
Name: is_fraud, dtype: float64
# Per-class means of city population, amount and the `time` column.
# The string "mean" selects the same groupby mean that np.mean resolved to,
# without the deprecation warning newer pandas emits for NumPy callables.
_df2.groupby(by=['is_fraud']).agg({'city_pop': 'mean', 'amt': 'mean', 'time': 'mean'})
city_pop | amt | time | |
---|---|---|---|
is_fraud | |||
0 | 83870.443845 | 67.743047 | 12.813152 |
1 | 96323.951715 | 530.573492 | 13.915917 |
# Narrow the frame to the three candidate predictors plus the target label.
_df3 = _df2[['amt','time','city_pop','is_fraud']]
_df3.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 651430 entries, 3 to 1048574
Data columns (total 4 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 amt 651430 non-null float64
1 time 651430 non-null int64
2 city_pop 651430 non-null int64
3 is_fraud 651430 non-null int64
dtypes: float64(1), int64(3)
memory usage: 24.9 MB
# Materialise the selected columns as one NumPy array; the column order
# follows _df3, so the label 'is_fraud' is the last column.
data = np.hstack([_df3.values[:,:]])
data
array([[4.50000e+01, 0.00000e+00, 1.93900e+03, 0.00000e+00],
[9.46300e+01, 0.00000e+00, 2.15800e+03, 0.00000e+00],
[4.45400e+01, 0.00000e+00, 2.69100e+03, 0.00000e+00],
...,
[6.03000e+00, 1.60000e+01, 5.20000e+02, 0.00000e+00],
[1.16940e+02, 1.60000e+01, 1.58300e+03, 0.00000e+00],
[6.81000e+00, 1.60000e+01, 1.65556e+05, 0.00000e+00]])
# Features are every column except the last; the last column is the target.
X = data[:,:-1]
y = data[:,-1]
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
# Hold out 20% of the rows for evaluation. No random_state is passed, so the
# split (and therefore the scores below) changes from run to run.
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2)

# Fit a logistic regression with the library's default settings.
lr = LogisticRegression()
lr.fit(X_train, y_train)
LogisticRegression()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
LogisticRegression()
# Predict labels for the held-out rows.
y_pred = lr.predict(X_test)

# Score the predictions; average='weighted' averages the per-class scores
# weighted by each class's support.
acc = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1score = f1_score(y_test, y_pred, average='weighted')
print("Accuracy: {}".format(acc))
print("Precision: {}".format(precision))
print("Recall: {}".format(recall))
print("F1 score: {}".format(f1score))
Accuracy: 0.9902215126721213
Precision: 0.9831134944972946
Recall: 0.9902215126721213
F1 score: 0.9866547019260462
# Re-score the same predictions with average='macro': the unweighted mean of
# the per-class scores, so both classes count equally.
acc = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='macro')
recall = recall_score(y_test, y_pred, average='macro')
f1score = f1_score(y_test, y_pred, average='macro')
print("Accuracy: {}".format(acc))
print("Precision:{}".format(precision))
print("Recall: {}".format(recall))
print("F1 score: {}".format(f1score))
Accuracy: 0.9902215126721213
Precision:0.4957576316517569
Recall: 0.49934201359322505
F1 score: 0.49754336709114605
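f1 score가 엄청 커졌다. 이유가 뭘까? 처음에 city_pop에 대한 걸 생각했을 때는 사기거래=0과 사기거래=1 사이에 큰 차이가 없어 보였는데 갑자기… 다만 위에서 높게 나온 것은 weighted 평균 F1(약 0.987)이다. weighted 평균은 전체의 약 99%를 차지하는 정상거래(0) 클래스에 크게 좌우되므로, 모델이 거의 모든 거래를 정상으로 예측해도 높게 나온다. 두 클래스를 동일하게 취급하는 macro 평균 F1은 약 0.498에 그치므로, 사기거래(1)는 거의 잡아내지 못하고 있다고 보아야 한다.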