Data 1/10

Author

김보람

Published

February 8, 2024

import pandas as pd
import numpy as np
import sklearn
import pickle 
import time 
import datetime
import warnings
warnings.filterwarnings('ignore')
with open('fraudTrain.pkl', 'rb') as file:
    fraudTrain = pickle.load(file)    
fraudTrain['is_fraud'].sum()
6006
fraudTrain[::10]
| | trans_date_trans_time | cc_num | merchant | category | amt | first | last | gender | street | city | ... | lat | long | city_pop | job | dob | trans_num | unix_time | merch_lat | merch_long | is_fraud |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2019-01-01 00:00:00 | 2.703190e+15 | fraud_Rippin, Kub and Mann | misc_net | 4.97 | Jennifer | Banks | F | 561 Perry Cove | Moravian Falls | ... | 36.0788 | -81.1781 | 3495 | Psychologist, counselling | 1988-03-09 | 0b242abb623afc578575680df30655b9 | 1325376018 | 36.011293 | -82.048315 | 0 |
| 10 | 2019-01-01 00:06:00 | 4.642890e+12 | fraud_Rutherford-Mertz | grocery_pos | 24.74 | Eddie | Mendez | M | 1831 Faith View Suite 653 | Clarinda | ... | 40.7491 | -95.0380 | 7297 | IT trainer | 1990-07-13 | d71c95ab6b7356dd74389d41df429c87 | 1325376383 | 40.275891 | -96.011548 | 0 |
| 20 | 2019-01-01 00:13:00 | 4.469780e+18 | fraud_Bauch-Raynor | grocery_pos | 57.34 | Gregory | Graham | M | 4005 Dana Glens | Methuen | ... | 42.7280 | -71.1810 | 47249 | Market researcher | 1980-11-22 | 139a1bee15be607d79fe173bfcb2502a | 1325376788 | 42.268760 | -71.217297 | 0 |
| 30 | 2019-01-01 00:22:00 | 1.800950e+14 | fraud_Kihn, Abernathy and Douglas | shopping_net | 3.66 | Tammie | Harper | F | 57887 Gutierrez Harbor | Westfir | ... | 43.7575 | -122.4810 | 597 | Forensic psychologist | 1961-05-19 | 870c92b288a974a2faf1f24b05c27e33 | 1325377356 | 44.278191 | -121.815161 | 0 |
| 40 | 2019-01-01 00:30:00 | 6.763730e+11 | fraud_Christiansen, Goyette and Schamberger | gas_transport | 53.60 | Vicki | Mendoza | F | 3645 Atkins Island Apt. 238 | Esbon | ... | 39.7562 | -98.4462 | 242 | Tourism officer | 1987-07-18 | f1566e9623814dd277dfa2a9bf83ea36 | 1325377849 | 40.332519 | -99.062962 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1048530 | 2020-03-10 15:53:00 | 4.746000e+12 | fraud_Langworth LLC | personal_care | 6.18 | Carrie | Washington | F | 6114 Adams Harbor Suite 096 | Kingsford Heights | ... | 41.4802 | -86.6919 | 1423 | Psychologist, forensic | 1998-10-07 | a8119f2c9856637af2a6d3a92ae41892 | 1362930809 | 41.945985 | -86.088059 | 0 |
| 1048540 | 2020-03-10 15:58:00 | 6.304520e+11 | fraud_Prosacco LLC | personal_care | 232.82 | Rachel | Daniels | F | 561 Little Plain Apt. 738 | Wetmore | ... | 46.3535 | -86.6345 | 765 | Immunologist | 1972-06-12 | 19f0f3a0a194a3747561e41273f69fd6 | 1362931131 | 46.119515 | -87.542503 | 0 |
| 1048550 | 2020-03-10 16:02:00 | 4.378990e+15 | fraud_Abernathy and Sons | food_dining | 229.60 | Travis | Hayes | M | 1561 Chase Grove | Conway | ... | 43.9742 | -71.1503 | 3807 | Surgeon | 1999-10-25 | a25384c63d760aecb7fbae0dd3aa4df2 | 1362931360 | 44.256850 | -71.940706 | 0 |
| 1048560 | 2020-03-10 16:04:00 | 3.009300e+13 | fraud_Sawayn PLC | shopping_pos | 16.29 | Jessica | Terry | F | 6412 Elizabeth Gardens Suite 633 | Maysville | ... | 38.6207 | -83.8067 | 14228 | Advertising account executive | 1971-03-26 | 14dfd9d63388207ab0fba67e4272f6af | 1362931476 | 39.221625 | -83.713083 | 0 |
| 1048570 | 2020-03-10 16:07:00 | 6.011980e+15 | fraud_Fadel Inc | health_fitness | 77.00 | Haley | Wagner | F | 05561 Farrell Crescent | Annapolis | ... | 39.0305 | -76.5515 | 92106 | Accountant, chartered certified | 1943-05-28 | 45ecd198c65e81e597db22e8d2ef7361 | 1362931649 | 38.779464 | -76.317042 | 0 |

104858 rows × 22 columns
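
The slice above implies the full frame has about 1,048,575 rows, so the 6006 frauds amount to a base fraud rate near 0.6%. A one-line check of that imbalance (a minimal sketch; the exact value depends on the full row count):

fraudTrain['is_fraud'].mean()  # base fraud rate; ≈ 0.0057 given 6006 frauds in ~1,048,575 rows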

Make the train and test fraud ratios match, then try rates from 0.002 through 0.05 … up to 0.5 (see the sweep sketch at the end of this section).

def throw(df, fraud_rate, random_state=42):  # throws away normal rows to hit the target fraud rate!
    df1 = df[df['is_fraud'] == 1].copy()
    df0 = df[df['is_fraud'] == 0].copy()
    # fraction of normal rows to keep so that len(df1) / (len(df1) + kept) == fraud_rate
    df0_downsample = (len(df1) * (1 - fraud_rate)) / (len(df0) * fraud_rate)
    df0_down = df0.sample(frac=df0_downsample, random_state=random_state)
    df_p = pd.concat([df1, df0_down])
    return df_p
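
Why that fraction works: keeping all len(df1) frauds plus k normals gives a fraud share of len(df1) / (len(df1) + k), and solving len(df1) / (len(df1) + k) = fraud_rate gives k = len(df1) * (1 - fraud_rate) / fraud_rate; dividing by len(df0) turns that count into the frac passed to sample. Note this requires frac ≤ 1, i.e. a target rate no smaller than the base fraud rate. A minimal sanity check (the variable name check and the 0.05 target are just illustrations):

check = throw(fraudTrain, 0.05)   # 0.05 is above the ~0.006 base rate, so frac < 1
check['is_fraud'].mean()          # should land on ≈ 0.05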

def split_dataframe(data_frame, test_fraud_rate, test_rate=0.3, random_state=42):
    n = len(data_frame)

    # separate fraud and normal transactions
    fraud_data = data_frame[data_frame['is_fraud'] == 1]
    normal_data = data_frame[data_frame['is_fraud'] == 0]

    # number of fraud / normal rows the test set needs to hit test_fraud_rate
    test_samples = int(test_fraud_rate * (n * test_rate))
    remaining_test_samples = int(n * test_rate) - test_samples

    # draw the test rows from each class without replacement
    test_fraud_data = fraud_data.sample(n=test_samples, replace=False, random_state=random_state)
    test_normal_data = normal_data.sample(n=remaining_test_samples, replace=False, random_state=random_state)

    # assemble the test set
    test_data = pd.concat([test_normal_data, test_fraud_data])

    # everything not in the test set becomes the training set
    train_data = data_frame[~data_frame.index.isin(test_data.index)]

    return train_data, test_data
df = throw(fraudTrain, 0.3)
df.is_fraud.mean()
0.3
train_data, test_data = split_dataframe(df, 0.3)
train_data.is_fraud.mean()
0.30005708577137147
test_data.is_fraud.mean()
0.29986679986679987
train_data, test_data = split_dataframe(df, 0.05)
train_data.is_fraud.mean()
0.05000951429522858
test_data.is_fraud.mean()
0.049977799977799976
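
Following the note above, a minimal sketch that sweeps a grid of fraud rates and checks that train and test keep the same fraud share. The grid values are an assumption drawn from the 0.002 … 0.5 range mentioned earlier; rates below the ~0.006 base rate are skipped, since throw would need frac > 1 to reach them by downsampling alone:

# sweep candidate fraud rates; rates under ~0.006 would require upsampling in throw()
for rate in [0.01, 0.05, 0.1, 0.3, 0.5]:
    df_rate = throw(fraudTrain, rate)
    tr, te = split_dataframe(df_rate, rate)
    print(f"rate={rate}: train={tr.is_fraud.mean():.4f}, test={te.is_fraud.mean():.4f}")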