import pandas as pd
import numpy as np
import sklearn
import pickle
import time
import datetime
import warnings
warnings.filterwarnings('ignore')
with open('fraudTrain.pkl', 'rb') as file:
    fraudTrain = pickle.load(file)
fraudTrain['is_fraud'].sum()
6006
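For context, 6006 fraud rows against the roughly 1.05 million transactions in fraudTrain (the strided view below shows 104,858 rows at a step of 10) is a base fraud rate of only about 0.6%, which is why the experiments below downsample the normal class. A quick check, assuming the frame is loaded as above:

fraudTrain['is_fraud'].mean()  # ≈ 0.0057, given 6006 fraud rows in ~1.05M transactions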
fraudTrain[::10]
| | trans_date_trans_time | cc_num | merchant | category | amt | first | last | gender | street | city | ... | lat | long | city_pop | job | dob | trans_num | unix_time | merch_lat | merch_long | is_fraud |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2019-01-01 00:00:00 | 2.703190e+15 | fraud_Rippin, Kub and Mann | misc_net | 4.97 | Jennifer | Banks | F | 561 Perry Cove | Moravian Falls | ... | 36.0788 | -81.1781 | 3495 | Psychologist, counselling | 1988-03-09 | 0b242abb623afc578575680df30655b9 | 1325376018 | 36.011293 | -82.048315 | 0 |
10 | 2019-01-01 00:06:00 | 4.642890e+12 | fraud_Rutherford-Mertz | grocery_pos | 24.74 | Eddie | Mendez | M | 1831 Faith View Suite 653 | Clarinda | ... | 40.7491 | -95.0380 | 7297 | IT trainer | 1990-07-13 | d71c95ab6b7356dd74389d41df429c87 | 1325376383 | 40.275891 | -96.011548 | 0 |
20 | 2019-01-01 00:13:00 | 4.469780e+18 | fraud_Bauch-Raynor | grocery_pos | 57.34 | Gregory | Graham | M | 4005 Dana Glens | Methuen | ... | 42.7280 | -71.1810 | 47249 | Market researcher | 1980-11-22 | 139a1bee15be607d79fe173bfcb2502a | 1325376788 | 42.268760 | -71.217297 | 0 |
30 | 2019-01-01 00:22:00 | 1.800950e+14 | fraud_Kihn, Abernathy and Douglas | shopping_net | 3.66 | Tammie | Harper | F | 57887 Gutierrez Harbor | Westfir | ... | 43.7575 | -122.4810 | 597 | Forensic psychologist | 1961-05-19 | 870c92b288a974a2faf1f24b05c27e33 | 1325377356 | 44.278191 | -121.815161 | 0 |
40 | 2019-01-01 00:30:00 | 6.763730e+11 | fraud_Christiansen, Goyette and Schamberger | gas_transport | 53.60 | Vicki | Mendoza | F | 3645 Atkins Island Apt. 238 | Esbon | ... | 39.7562 | -98.4462 | 242 | Tourism officer | 1987-07-18 | f1566e9623814dd277dfa2a9bf83ea36 | 1325377849 | 40.332519 | -99.062962 | 0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
1048530 | 2020-03-10 15:53:00 | 4.746000e+12 | fraud_Langworth LLC | personal_care | 6.18 | Carrie | Washington | F | 6114 Adams Harbor Suite 096 | Kingsford Heights | ... | 41.4802 | -86.6919 | 1423 | Psychologist, forensic | 1998-10-07 | a8119f2c9856637af2a6d3a92ae41892 | 1362930809 | 41.945985 | -86.088059 | 0 |
1048540 | 2020-03-10 15:58:00 | 6.304520e+11 | fraud_Prosacco LLC | personal_care | 232.82 | Rachel | Daniels | F | 561 Little Plain Apt. 738 | Wetmore | ... | 46.3535 | -86.6345 | 765 | Immunologist | 1972-06-12 | 19f0f3a0a194a3747561e41273f69fd6 | 1362931131 | 46.119515 | -87.542503 | 0 |
1048550 | 2020-03-10 16:02:00 | 4.378990e+15 | fraud_Abernathy and Sons | food_dining | 229.60 | Travis | Hayes | M | 1561 Chase Grove | Conway | ... | 43.9742 | -71.1503 | 3807 | Surgeon | 1999-10-25 | a25384c63d760aecb7fbae0dd3aa4df2 | 1362931360 | 44.256850 | -71.940706 | 0 |
1048560 | 2020-03-10 16:04:00 | 3.009300e+13 | fraud_Sawayn PLC | shopping_pos | 16.29 | Jessica | Terry | F | 6412 Elizabeth Gardens Suite 633 | Maysville | ... | 38.6207 | -83.8067 | 14228 | Advertising account executive | 1971-03-26 | 14dfd9d63388207ab0fba67e4272f6af | 1362931476 | 39.221625 | -83.713083 | 0 |
1048570 | 2020-03-10 16:07:00 | 6.011980e+15 | fraud_Fadel Inc | health_fitness | 77.00 | Haley | Wagner | F | 05561 Farrell Crescent | Annapolis | ... | 39.0305 | -76.5515 | 92106 | Accountant, chartered certified | 1943-05-28 | 45ecd198c65e81e597db22e8d2ef7361 | 1362931649 | 38.779464 | -76.317042 | 0 |
104858 rows × 22 columns
Plan: keep the fraud ratio the same in train and test, and try rates of 0.002 ~ 0.05 … 0.5.
def throw(df, fraud_rate, random_state=42):  # downsample normal transactions to hit the target fraud rate
    df1 = df[df['is_fraud'] == 1].copy()
    df0 = df[df['is_fraud'] == 0].copy()
    df0_downsample = (len(df1) * (1 - fraud_rate)) / (len(df0) * fraud_rate)
    df0_down = df0.sample(frac=df0_downsample, random_state=random_state)
    df_p = pd.concat([df1, df0_down])
    return df_p
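As a sanity check on the downsampling fraction: with the 6006 fraud rows counted above and fraud_rate=0.3, throw keeps frac = 6006·(1−0.3)/(0.3·n_normal) of the normal rows, i.e. about 6006·7/3 = 14014 normal rows, so the resulting frame is roughly 30% fraud by construction. A minimal sketch (the variable names here are only for illustration):

n_fraud = (fraudTrain['is_fraud'] == 1).sum()     # 6006
n_normal = (fraudTrain['is_fraud'] == 0).sum()
frac = n_fraud * (1 - 0.3) / (n_normal * 0.3)     # share of normal rows that survive
n_kept = round(frac * n_normal)                   # ≈ 14014, independent of n_normal
n_fraud / (n_fraud + n_kept)                      # ≈ 0.3 by construction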
def split_dataframe(data_frame, test_fraud_rate, test_rate=0.3, random_state=42):
    n = len(data_frame)

    # separate fraud and normal transactions
    fraud_data = data_frame[data_frame['is_fraud'] == 1]
    normal_data = data_frame[data_frame['is_fraud'] == 0]

    # compute the test set sizes
    test_samples = int(test_fraud_rate * (n * test_rate))
    remaining_test_samples = int(n * test_rate) - test_samples

    # randomly draw test rows from the fraud and normal transactions
    test_fraud_data = fraud_data.sample(n=test_samples, replace=False, random_state=random_state)
    test_normal_data = normal_data.sample(n=remaining_test_samples, replace=False, random_state=random_state)

    # combine into the test set
    test_data = pd.concat([test_normal_data, test_fraud_data])

    # everything not in the test set becomes the training set
    train_data = data_frame[~data_frame.index.isin(test_data.index)]

    return train_data, test_data
df = throw(fraudTrain, 0.3)
df.is_fraud.mean()
0.3
train_data, test_data = split_dataframe(df, 0.3)
train_data.is_fraud.mean()
0.30005708577137147
test_data.is_fraud.mean()
0.29986679986679987
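These two values are exactly what the integer arithmetic inside split_dataframe predicts: df has 6006 fraud + 14014 normal = 20020 rows, the test set receives int(0.3·20020) = 6006 rows of which int(0.3·6006.0) = 1801 are fraud, leaving 4205 fraud rows among the 14014 training rows. Re-deriving the printed means:

n = 20020                                # 6006 fraud + 14014 normal after throw(fraudTrain, 0.3)
test_total = int(n * 0.3)                # 6006 rows in the test set
test_fraud = int(0.3 * (n * 0.3))        # 1801 of them are fraud
(6006 - test_fraud) / (n - test_total)   # 0.30005708577137147 (train)
test_fraud / test_total                  # 0.29986679986679987 (test)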
df = throw(fraudTrain, 0.05)  # re-sample at a 5% fraud rate before the 0.05 split
df.is_fraud.mean()
train_data, test_data = split_dataframe(df, 0.05)
train_data.is_fraud.mean()
0.05000951429522858
test_data.is_fraud.mean()
0.049977799977799976
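Following the plan noted above, the same two steps can be looped over a list of target rates. A minimal sketch (the rate list is illustrative; a rate below the data's own ≈0.6% fraud rate would make throw call sample with frac > 1, which raises an error unless replace=True is added):

for rate in [0.01, 0.05, 0.1, 0.3, 0.5]:
    df = throw(fraudTrain, rate)
    train_data, test_data = split_dataframe(df, rate)
    print(rate, train_data.is_fraud.mean(), test_data.is_fraud.mean())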