Data 1/10

Author

김보람

Published

February 8, 2024

import pandas as pd
import numpy as np
import sklearn
import pickle 
import time 
import datetime
import warnings
warnings.filterwarnings('ignore')
with open('fraudTrain.pkl', 'rb') as file:
    fraudTrain = pickle.load(file)    
fraudTrain['is_fraud'].sum()
6006
fraudTrain[::10]
| | trans_date_trans_time | cc_num | merchant | category | amt | first | last | gender | street | city | ... | lat | long | city_pop | job | dob | trans_num | unix_time | merch_lat | merch_long | is_fraud |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2019-01-01 00:00:00 | 2.703190e+15 | fraud_Rippin, Kub and Mann | misc_net | 4.97 | Jennifer | Banks | F | 561 Perry Cove | Moravian Falls | ... | 36.0788 | -81.1781 | 3495 | Psychologist, counselling | 1988-03-09 | 0b242abb623afc578575680df30655b9 | 1325376018 | 36.011293 | -82.048315 | 0 |
| 10 | 2019-01-01 00:06:00 | 4.642890e+12 | fraud_Rutherford-Mertz | grocery_pos | 24.74 | Eddie | Mendez | M | 1831 Faith View Suite 653 | Clarinda | ... | 40.7491 | -95.0380 | 7297 | IT trainer | 1990-07-13 | d71c95ab6b7356dd74389d41df429c87 | 1325376383 | 40.275891 | -96.011548 | 0 |
| 20 | 2019-01-01 00:13:00 | 4.469780e+18 | fraud_Bauch-Raynor | grocery_pos | 57.34 | Gregory | Graham | M | 4005 Dana Glens | Methuen | ... | 42.7280 | -71.1810 | 47249 | Market researcher | 1980-11-22 | 139a1bee15be607d79fe173bfcb2502a | 1325376788 | 42.268760 | -71.217297 | 0 |
| 30 | 2019-01-01 00:22:00 | 1.800950e+14 | fraud_Kihn, Abernathy and Douglas | shopping_net | 3.66 | Tammie | Harper | F | 57887 Gutierrez Harbor | Westfir | ... | 43.7575 | -122.4810 | 597 | Forensic psychologist | 1961-05-19 | 870c92b288a974a2faf1f24b05c27e33 | 1325377356 | 44.278191 | -121.815161 | 0 |
| 40 | 2019-01-01 00:30:00 | 6.763730e+11 | fraud_Christiansen, Goyette and Schamberger | gas_transport | 53.60 | Vicki | Mendoza | F | 3645 Atkins Island Apt. 238 | Esbon | ... | 39.7562 | -98.4462 | 242 | Tourism officer | 1987-07-18 | f1566e9623814dd277dfa2a9bf83ea36 | 1325377849 | 40.332519 | -99.062962 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1048530 | 2020-03-10 15:53:00 | 4.746000e+12 | fraud_Langworth LLC | personal_care | 6.18 | Carrie | Washington | F | 6114 Adams Harbor Suite 096 | Kingsford Heights | ... | 41.4802 | -86.6919 | 1423 | Psychologist, forensic | 1998-10-07 | a8119f2c9856637af2a6d3a92ae41892 | 1362930809 | 41.945985 | -86.088059 | 0 |
| 1048540 | 2020-03-10 15:58:00 | 6.304520e+11 | fraud_Prosacco LLC | personal_care | 232.82 | Rachel | Daniels | F | 561 Little Plain Apt. 738 | Wetmore | ... | 46.3535 | -86.6345 | 765 | Immunologist | 1972-06-12 | 19f0f3a0a194a3747561e41273f69fd6 | 1362931131 | 46.119515 | -87.542503 | 0 |
| 1048550 | 2020-03-10 16:02:00 | 4.378990e+15 | fraud_Abernathy and Sons | food_dining | 229.60 | Travis | Hayes | M | 1561 Chase Grove | Conway | ... | 43.9742 | -71.1503 | 3807 | Surgeon | 1999-10-25 | a25384c63d760aecb7fbae0dd3aa4df2 | 1362931360 | 44.256850 | -71.940706 | 0 |
| 1048560 | 2020-03-10 16:04:00 | 3.009300e+13 | fraud_Sawayn PLC | shopping_pos | 16.29 | Jessica | Terry | F | 6412 Elizabeth Gardens Suite 633 | Maysville | ... | 38.6207 | -83.8067 | 14228 | Advertising account executive | 1971-03-26 | 14dfd9d63388207ab0fba67e4272f6af | 1362931476 | 39.221625 | -83.713083 | 0 |
| 1048570 | 2020-03-10 16:07:00 | 6.011980e+15 | fraud_Fadel Inc | health_fitness | 77.00 | Haley | Wagner | F | 05561 Farrell Crescent | Annapolis | ... | 39.0305 | -76.5515 | 92106 | Accountant, chartered certified | 1943-05-28 | 45ecd198c65e81e597db22e8d2ef7361 | 1362931649 | 38.779464 | -76.317042 | 0 |

104858 rows × 22 columns
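
The slice above implies the full frame has about 1,048,575 rows, so the 6006 frauds amount to a base fraud rate near 0.6%. A one-line check of that imbalance (a minimal sketch; the exact value depends on the full row count):

fraudTrain['is_fraud'].mean()  # base fraud rate; ≈ 0.0057 given 6006 frauds in ~1,048,575 rows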

Make the train and test fraud ratios match, then try rates from 0.002 through 0.05 … up to 0.5 (see the sweep sketch at the end of this section).

def throw(df, fraud_rate, random_state=42):  # throws away normal rows to hit the target fraud rate!
    df1 = df[df['is_fraud'] == 1].copy()
    df0 = df[df['is_fraud'] == 0].copy()
    # fraction of normal rows to keep so that len(df1) / (len(df1) + kept) == fraud_rate
    df0_downsample = (len(df1) * (1 - fraud_rate)) / (len(df0) * fraud_rate)
    df0_down = df0.sample(frac=df0_downsample, random_state=random_state)
    df_p = pd.concat([df1, df0_down])
    return df_p
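
Why that fraction works: keeping all len(df1) frauds plus k normals gives a fraud share of len(df1) / (len(df1) + k), and solving len(df1) / (len(df1) + k) = fraud_rate gives k = len(df1) * (1 - fraud_rate) / fraud_rate; dividing by len(df0) turns that count into the frac passed to sample. Note this requires frac ≤ 1, i.e. a target rate no smaller than the base fraud rate. A minimal sanity check (the variable name check and the 0.05 target are just illustrations):

check = throw(fraudTrain, 0.05)   # 0.05 is above the ~0.006 base rate, so frac < 1
check['is_fraud'].mean()          # should land on ≈ 0.05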

def split_dataframe(data_frame, test_fraud_rate, test_rate=0.3, random_state=42):
    n = len(data_frame)

    # separate fraud and normal transactions
    fraud_data = data_frame[data_frame['is_fraud'] == 1]
    normal_data = data_frame[data_frame['is_fraud'] == 0]

    # number of fraud / normal rows the test set needs to hit test_fraud_rate
    test_samples = int(test_fraud_rate * (n * test_rate))
    remaining_test_samples = int(n * test_rate) - test_samples

    # draw the test rows from each class without replacement
    test_fraud_data = fraud_data.sample(n=test_samples, replace=False, random_state=random_state)
    test_normal_data = normal_data.sample(n=remaining_test_samples, replace=False, random_state=random_state)

    # assemble the test set
    test_data = pd.concat([test_normal_data, test_fraud_data])

    # everything not in the test set becomes the training set
    train_data = data_frame[~data_frame.index.isin(test_data.index)]

    return train_data, test_data
df = throw(fraudTrain, 0.3)
df.is_fraud.mean()
0.3
train_data, test_data = split_dataframe(df, 0.3)
train_data.is_fraud.mean()
0.30005708577137147
test_data.is_fraud.mean()
0.29986679986679987
train_data, test_data = split_dataframe(df, 0.05)
train_data.is_fraud.mean()
0.05000951429522858
test_data.is_fraud.mean()
0.049977799977799976
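
Following the note above, a minimal sketch that sweeps a grid of fraud rates and checks that train and test keep the same fraud share. The grid values are an assumption drawn from the 0.002 … 0.5 range mentioned earlier; rates below the ~0.006 base rate are skipped, since throw would need frac > 1 to reach them by downsampling alone:

# sweep candidate fraud rates; rates under ~0.006 would require upsampling in throw()
for rate in [0.01, 0.05, 0.1, 0.3, 0.5]:
    df_rate = throw(fraudTrain, rate)
    tr, te = split_dataframe(df_rate, rate)
    print(f"rate={rate}: train={tr.is_fraud.mean():.4f}, test={te.is_fraud.mean():.4f}")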