import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Load the raw transaction table.
_df = pd.read_csv("fraudTrain.csv")
# Card numbers that appear in at least one row with is_fraud == 1.
cus_list = set(_df.query('is_fraud==1').cc_num.tolist())
# Keep every transaction (fraud or not) made with one of those cards.
_df2 = _df.query("cc_num in @cus_list")
# `time` = integer before the first ':' of the last space-separated token of
# the timestamp string (presumably the hour of day; confirm the date format).
_df2 = _df2.assign(
    time=_df2['trans_date_trans_time']
    .str.split(' ').str[-1]
    .str.split(':').str[0]
    .astype(int)
)
_df2.shape
(651430, 24)
_df2.columnsIndex(['Unnamed: 0', 'trans_date_trans_time', 'cc_num', 'merchant', 'category',
       'amt', 'first', 'last', 'gender', 'street', 'city', 'state', 'zip',
       'lat', 'long', 'city_pop', 'job', 'dob', 'trans_num', 'unix_time',
       'merch_lat', 'merch_long', 'is_fraud', 'time'],
      dtype='object')
_df2.info()<class 'pandas.core.frame.DataFrame'>
Int64Index: 651430 entries, 3 to 1048574
Data columns (total 24 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   Unnamed: 0             651430 non-null  int64  
 1   trans_date_trans_time  651430 non-null  object 
 2   cc_num                 651430 non-null  float64
 3   merchant               651430 non-null  object 
 4   category               651430 non-null  object 
 5   amt                    651430 non-null  float64
 6   first                  651430 non-null  object 
 7   last                   651430 non-null  object 
 8   gender                 651430 non-null  object 
 9   street                 651430 non-null  object 
 10  city                   651430 non-null  object 
 11  state                  651430 non-null  object 
 12  zip                    651430 non-null  int64  
 13  lat                    651430 non-null  float64
 14  long                   651430 non-null  float64
 15  city_pop               651430 non-null  int64  
 16  job                    651430 non-null  object 
 17  dob                    651430 non-null  object 
 18  trans_num              651430 non-null  object 
 19  unix_time              651430 non-null  int64  
 20  merch_lat              651430 non-null  float64
 21  merch_long             651430 non-null  float64
 22  is_fraud               651430 non-null  int64  
 23  time                   651430 non-null  int64  
dtypes: float64(6), int64(6), object(12)
memory usage: 124.3+ MB
_df2["is_fraud"].value_counts()0    645424
1      6006
Name: is_fraud, dtype: int64
_df2["is_fraud"].value_counts()/len(_df2)0    0.99078
1    0.00922
Name: is_fraud, dtype: float64
_df2.groupby(by=['is_fraud']).agg({'city_pop':np.mean,'amt':np.mean,'time':np.mean})| city_pop | amt | time | |
|---|---|---|---|
| is_fraud | |||
| 0 | 83870.443845 | 67.743047 | 12.813152 | 
| 1 | 96323.951715 | 530.573492 | 13.915917 | 
_df2.groupby(by=['category']).agg({'is_fraud':np.mean})| is_fraud | |
|---|---|
| category | |
| entertainment | 0.003907 | 
| food_dining | 0.002628 | 
| gas_transport | 0.007570 | 
| grocery_net | 0.004802 | 
| grocery_pos | 0.022539 | 
| health_fitness | 0.002408 | 
| home | 0.002488 | 
| kids_pets | 0.003440 | 
| misc_net | 0.023023 | 
| misc_pos | 0.004859 | 
| personal_care | 0.003774 | 
| shopping_net | 0.027628 | 
| shopping_pos | 0.011342 | 
| travel | 0.004886 | 
_df2.groupby(by=['time']).agg({'is_fraud':np.mean}).plot()_files/figure-html/cell-13-output-1.png)
그래프를 보면 시간대(time)를 3구간 또는 2구간으로 나누어 적합시키면 좋을 것 같다.
- 3등분: 20 ~ 04, 04 ~ 12, 12 ~ 20
 
- 2등분: 06 ~ 18, 18 ~ 06
 
사기 거래 그룹과 정상 거래 그룹 사이에서 값의 범위가 차이 나는 변수를 보면
금액(amt)과 시간(time)이 있다.
_df3=_df2[['amt','time','category','is_fraud']]
_df3| amt | time | category | is_fraud | |
|---|---|---|---|---|
| 3 | 45.00 | 0 | gas_transport | 0 | 
| 5 | 94.63 | 0 | gas_transport | 0 | 
| 6 | 44.54 | 0 | grocery_net | 0 | 
| 7 | 71.65 | 0 | gas_transport | 0 | 
| 8 | 4.27 | 0 | misc_pos | 0 | 
| ... | ... | ... | ... | ... | 
| 1048567 | 39.96 | 16 | kids_pets | 0 | 
| 1048568 | 20.67 | 16 | entertainment | 0 | 
| 1048569 | 6.03 | 16 | food_dining | 0 | 
| 1048571 | 116.94 | 16 | misc_pos | 0 | 
| 1048574 | 6.81 | 16 | misc_pos | 0 | 
651430 rows × 4 columns
_df3.info()<class 'pandas.core.frame.DataFrame'>
Int64Index: 651430 entries, 3 to 1048574
Data columns (total 4 columns):
 #   Column    Non-Null Count   Dtype  
---  ------    --------------   -----  
 0   amt       651430 non-null  float64
 1   time      651430 non-null  int64  
 2   category  651430 non-null  object 
 3   is_fraud  651430 non-null  int64  
dtypes: float64(1), int64(2), object(1)
memory usage: 24.9+ MB
_df4=_df2[['amt','time','is_fraud']]
_df4| amt | time | is_fraud | |
|---|---|---|---|
| 3 | 45.00 | 0 | 0 | 
| 5 | 94.63 | 0 | 0 | 
| 6 | 44.54 | 0 | 0 | 
| 7 | 71.65 | 0 | 0 | 
| 8 | 4.27 | 0 | 0 | 
| ... | ... | ... | ... | 
| 1048567 | 39.96 | 16 | 0 | 
| 1048568 | 20.67 | 16 | 0 | 
| 1048569 | 6.03 | 16 | 0 | 
| 1048571 | 116.94 | 16 | 0 | 
| 1048574 | 6.81 | 16 | 0 | 
651430 rows × 3 columns
data=np.hstack([_df4.values[:,:]])dataarray([[ 45.  ,   0.  ,   0.  ],
       [ 94.63,   0.  ,   0.  ],
       [ 44.54,   0.  ,   0.  ],
       ...,
       [  6.03,  16.  ,   0.  ],
       [116.94,  16.  ,   0.  ],
       [  6.81,  16.  ,   0.  ]])
X = data[:,:-1]
y = data[:,-1]from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_scoreX_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2)lr = LogisticRegression()lr.fit(X_train, y_train)LogisticRegression()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
LogisticRegression()
# Predict on the held-out split and report scores with average='weighted'.
y_pred = lr.predict(X_test)
acc = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1score = f1_score(y_test, y_pred, average='weighted')
print(f"Accuracy: {acc:.6f}")
print(f"Precision: {precision:.6f}")
print(f"Recall: {recall:.6f}")
print(f"F1 score: {f1score:.6f}")
Accuracy: 0.051349
Precision: 0.005162
Recall: 0.051349
F1 score: 0.009370
/home/coco/anaconda3/envs/py38/lib/python3.8/site-packages/sklearn/metrics/_classification.py:1344: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
# Same predictions, scored with average='macro'.
acc = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='macro')
recall = recall_score(y_test, y_pred, average='macro')
f1score = f1_score(y_test, y_pred, average='macro')
print(f"Accuracy: {acc}")
print(f"Precision:{precision}")
print(f"Recall: {recall}")
print(f"F1 score: {f1score}")
Accuracy: 0.051348571604009643
Precision:0.004232713983377072
Recall: 0.04223025149824091
F1 score: 0.007685871263604649
/home/coco/anaconda3/envs/py38/lib/python3.8/site-packages/sklearn/metrics/_classification.py:1344: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
# Same predictions, scored with average='micro'. In the recorded output all
# four numbers are identical, i.e. micro precision/recall/F1 equal accuracy.
acc = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='micro')
recall = recall_score(y_test, y_pred, average='micro')
f1score = f1_score(y_test, y_pred, average='micro')
print(f"Accuracy: {acc}")
print(f"Precision:{precision}")
print(f"Recall: {recall}")
print(f"F1 score: {f1score}")
Accuracy: 0.051348571604009643
Precision:0.051348571604009643
Recall: 0.051348571604009643
F1 score: 0.051348571604009643
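- average 매개변수
  - None: 평균을 내지 않고 클래스별 metric 값을 그대로 반환
  - micro: 모든 클래스의 TP, FP, FN을 합산하여 전체 기준으로 metric 값을 계산 (단일 라벨 분류에서는 accuracy와 같은 값이 됨)
  - macro: 클래스별 metric 값을 동일한 가중치로 더하여 산술평균을 계산 (클래스 불균형을 고려하지 않음)
  - weighted: 클래스별 metric 값을 각 클래스의 실제 샘플 수(support)로 가중 평균하여 계산
  - samples: 샘플마다 metric 값을 계산한 뒤 평균 (다중 라벨 분류에서만 의미가 있음)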