import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Load the training transactions from the CSV in the working directory.
_df = pd.read_csv("fraudTrain.csv")

# Card numbers that appear on at least one row flagged is_fraud == 1.
cus_list = set(_df.loc[_df["is_fraud"] == 1, "cc_num"])

# Keep every transaction (flagged or not) made with one of those cards.
_df2 = _df[_df["cc_num"].isin(cus_list)]

# `time` is the integer found between the last space and the first colon of
# `trans_date_trans_time` -- presumably the hour of day; confirm against the
# timestamp format in the CSV.  Vectorised string ops replace the per-row
# lambda and yield the same values.
_df2 = _df2.assign(
    time=_df2["trans_date_trans_time"]
    .str.split(" ").str[-1]
    .str.split(":").str[0]
    .astype("int64")
)
_df2.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 651430 entries, 3 to 1048574
Data columns (total 24 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Unnamed: 0 651430 non-null int64
1 trans_date_trans_time 651430 non-null object
2 cc_num 651430 non-null float64
3 merchant 651430 non-null object
4 category 651430 non-null object
5 amt 651430 non-null float64
6 first 651430 non-null object
7 last 651430 non-null object
8 gender 651430 non-null object
9 street 651430 non-null object
10 city 651430 non-null object
11 state 651430 non-null object
12 zip 651430 non-null int64
13 lat 651430 non-null float64
14 long 651430 non-null float64
15 city_pop 651430 non-null int64
16 job 651430 non-null object
17 dob 651430 non-null object
18 trans_num 651430 non-null object
19 unix_time 651430 non-null int64
20 merch_lat 651430 non-null float64
21 merch_long 651430 non-null float64
22 is_fraud 651430 non-null int64
23 time 651430 non-null int64
dtypes: float64(6), int64(6), object(12)
memory usage: 124.3+ MB
merch_lat과 merch_long은 상점의 위도·경도이고, 위의 lat과 long은 고객의 위도·경도로 보인다(확인 필요).
dob는 생년월일(date of birth)을 나타내는 변수이다.
unix_time은 1970년 1월 1일 0시 0분 0초(UTC)부터 경과한 시간을 초(second) 단위로 나타낸 값이다.
zip은 우편번호이다.
# Number of rows per is_fraud value.
_df2["is_fraud"].value_counts()
0 645424
1 6006
Name: is_fraud, dtype: int64
# Share of rows per is_fraud value (counts divided by the total row count).
_df2["is_fraud"].value_counts() / len(_df2)
0 0.99078
1 0.00922
Name: is_fraud, dtype: float64
# Show the Python type of _df2.
type(_df2)
pandas.core.frame.DataFrame
# Row-wise difference between the `lat` and `merch_lat` columns, its absolute
# value, and the mean of that absolute value.
diff = _df2["lat"] - _df2["merch_lat"]
latabs = abs(diff)
print("lat:", latabs.mean())

# Same computation for the `long` / `merch_long` pair.
diff2 = _df2["long"] - _df2["merch_long"]
longabs = abs(diff2)
print("long:", longabs.mean())
lat: 0.5002190204058765
long: 0.5004574515650185
# Store the absolute lat/long differences as columns so they can be aggregated.
_df2 = _df2.assign(
    latabs=(_df2["lat"] - _df2["merch_lat"]).abs(),
    longabs=(_df2["long"] - _df2["merch_long"]).abs(),
)

# Per-class means.  The "mean" string alias selects pandas' own reduction;
# passing the np.mean callable computes the same thing but is deprecated in
# recent pandas releases.
_df2.groupby(by=["is_fraud"]).agg(
    {
        "city_pop": "mean",
        "amt": "mean",
        "time": "mean",
        "latabs": "mean",
        "longabs": "mean",
    }
)
city_pop | amt | time | latabs | longabs | |
---|---|---|---|---|---|
is_fraud | |||||
0 | 83870.443845 | 67.743047 | 12.813152 | 0.500202 | 0.500468 |
1 | 96323.951715 | 530.573492 | 13.915917 | 0.502055 | 0.499343 |
# Columns used for the model; `is_fraud` is kept last so it can be sliced off
# as the target afterwards.
_df3 = _df2[["amt", "time", "lat", "merch_lat", "is_fraud"]]

# Convert to one 2-D NumPy array.  np.hstack over a single-element list was a
# no-op wrapper around the same values.
data = _df3.to_numpy()
data
array([[ 45. , 0. , 46.2306 , 47.034331, 0. ],
[ 94.63 , 0. , 40.375 , 40.653382, 0. ],
[ 44.54 , 0. , 37.9931 , 37.162705, 0. ],
...,
[ 6.03 , 16. , 42.1939 , 42.633354, 0. ],
[116.94 , 16. , 41.1826 , 41.400318, 0. ],
[ 6.81 , 16. , 34.077 , 33.601468, 0. ]])
# Features are every column except the last; the target is the last column.
X = data[:, :-1]
y = data[:, -1]
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
# Hold out 20% of the rows for evaluation.
# NOTE(review): no random_state is set, so the split (and every score below)
# changes between runs; with a target this imbalanced, stratify=y is also
# worth considering.  Left as-is to keep the recorded outputs comparable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Fit a logistic regression with scikit-learn's default settings.
lr = LogisticRegression()
lr.fit(X_train, y_train)
LogisticRegression()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
LogisticRegression()
# Predict on the held-out rows.
y_pred = lr.predict(X_test)

# average='weighted' weights each class's score by its number of true
# instances (support).
acc = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1score = f1_score(y_test, y_pred, average='weighted')

print(f"Accuracy: {acc}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 score: {f1score}")
Accuracy: 0.9891776553121594
Precision: 0.9810129371194015
Recall: 0.9891776553121594
F1 score: 0.9850783784050309
# Same metrics with average='macro': the unweighted mean of the per-class
# scores, so each class counts equally regardless of how many rows it has.
# Uses y_test / y_pred produced by the earlier prediction cell.
acc = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='macro')
recall = recall_score(y_test, y_pred, average='macro')
f1score = f1_score(y_test, y_pred, average='macro')

print(f"Accuracy: {acc}")
print(f"Precision:{precision}")
print(f"Recall: {recall}")
print(f"F1 score: {f1score}")
Accuracy: 0.9891776553121594
Precision:0.4952274089672451
Recall: 0.49934905923560957
F1 score: 0.4972796937822675
간단하게 생각해 보면, 고객의 위도(lat)와 상점의 위도(merch_lat)의 차이가 클수록 사기 거래일 가능성이 높을 것 같다. 다만 위의 is_fraud별 평균에서는 latabs가 정상 거래 약 0.500, 사기 거래 약 0.502로 거의 차이가 없었다.