파이썬:사이킷런

sklearn
Author

김보람

Published

April 14, 2023

ref:https://losskatsu.github.io/machine-learning/sklearn/#train-test-%EB%8D%B0%EC%9D%B4%ED%84%B0-%EB%B6%84%ED%95%A0%ED%95%98%EA%B8%B0

import numpy as np
import pandas as pd
from sklearn import datasets
# 참고: 분류용 가상 데이터 만들기
from sklearn.datasets import make_classification

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB #나이브 베이즈
from sklearn import svm
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier

import matplotlib.pyplot as plt
# Generate a synthetic data set for classification.
from sklearn.datasets import make_classification

X, Y = make_classification(
    n_samples=1000,
    n_features=4,
    n_informative=2,  # number of features correlated with the label (default=2)
    n_redundant=0,    # number of features that depend on other features
    shuffle=False,
    random_state=0,
)
raw = datasets.load_breast_cancer()         ## load the raw breast-cancer data set bundled with sklearn
print(raw.feature_names)                    ## check the column (feature) names
['mean radius' 'mean texture' 'mean perimeter' 'mean area'
 'mean smoothness' 'mean compactness' 'mean concavity'
 'mean concave points' 'mean symmetry' 'mean fractal dimension'
 'radius error' 'texture error' 'perimeter error' 'area error'
 'smoothness error' 'compactness error' 'concavity error'
 'concave points error' 'symmetry error' 'fractal dimension error'
 'worst radius' 'worst texture' 'worst perimeter' 'worst area'
 'worst smoothness' 'worst compactness' 'worst concavity'
 'worst concave points' 'worst symmetry' 'worst fractal dimension']

# Assemble a single table: the feature columns followed by the label column.
data = pd.DataFrame(raw.data)               # features (independent variables)
target = pd.DataFrame(raw.target)           # label (dependent variable)
rawData = pd.concat([data, target], axis=1) # column-wise concatenation

# Take the column labels from the data set's own metadata rather than from a
# hand-typed copy of the 30 feature names, so they cannot drift from the data.
# The label column is appended last under the name 'cancer'.
rawData.columns = [*map(str, raw.feature_names), 'cancer']

rawData.head(10)                            # preview the first ten rows
['mean radius' 'mean texture' 'mean perimeter' 'mean area'
 'mean smoothness' 'mean compactness' 'mean concavity'
 'mean concave points' 'mean symmetry' 'mean fractal dimension'
 'radius error' 'texture error' 'perimeter error' 'area error'
 'smoothness error' 'compactness error' 'concavity error'
 'concave points error' 'symmetry error' 'fractal dimension error'
 'worst radius' 'worst texture' 'worst perimeter' 'worst area'
 'worst smoothness' 'worst compactness' 'worst concavity'
 'worst concave points' 'worst symmetry' 'worst fractal dimension']
mean radius mean texture mean perimeter mean area mean smoothness mean compactness mean concavity mean concave points mean symmetry mean fractal dimension ... worst texture worst perimeter worst area worst smoothness worst compactness worst concavity worst concave points worst symmetry worst fractal dimension cancer
0 17.99 10.38 122.80 1001.0 0.11840 0.27760 0.30010 0.14710 0.2419 0.07871 ... 17.33 184.60 2019.0 0.1622 0.6656 0.7119 0.2654 0.4601 0.11890 0
1 20.57 17.77 132.90 1326.0 0.08474 0.07864 0.08690 0.07017 0.1812 0.05667 ... 23.41 158.80 1956.0 0.1238 0.1866 0.2416 0.1860 0.2750 0.08902 0
2 19.69 21.25 130.00 1203.0 0.10960 0.15990 0.19740 0.12790 0.2069 0.05999 ... 25.53 152.50 1709.0 0.1444 0.4245 0.4504 0.2430 0.3613 0.08758 0
3 11.42 20.38 77.58 386.1 0.14250 0.28390 0.24140 0.10520 0.2597 0.09744 ... 26.50 98.87 567.7 0.2098 0.8663 0.6869 0.2575 0.6638 0.17300 0
4 20.29 14.34 135.10 1297.0 0.10030 0.13280 0.19800 0.10430 0.1809 0.05883 ... 16.67 152.20 1575.0 0.1374 0.2050 0.4000 0.1625 0.2364 0.07678 0
5 12.45 15.70 82.57 477.1 0.12780 0.17000 0.15780 0.08089 0.2087 0.07613 ... 23.75 103.40 741.6 0.1791 0.5249 0.5355 0.1741 0.3985 0.12440 0
6 18.25 19.98 119.60 1040.0 0.09463 0.10900 0.11270 0.07400 0.1794 0.05742 ... 27.66 153.20 1606.0 0.1442 0.2576 0.3784 0.1932 0.3063 0.08368 0
7 13.71 20.83 90.20 577.9 0.11890 0.16450 0.09366 0.05985 0.2196 0.07451 ... 28.14 110.60 897.0 0.1654 0.3682 0.2678 0.1556 0.3196 0.11510 0
8 13.00 21.82 87.50 519.8 0.12730 0.19320 0.18590 0.09353 0.2350 0.07389 ... 30.73 106.20 739.3 0.1703 0.5401 0.5390 0.2060 0.4378 0.10720 0
9 12.46 24.04 83.97 475.9 0.11860 0.23960 0.22730 0.08543 0.2030 0.08243 ... 40.68 97.65 711.4 0.1853 1.0580 1.1050 0.2210 0.4366 0.20750 0

10 rows × 31 columns

# Predictors: two of the feature columns; label: the 'cancer' column.
x = rawData.loc[:, ['mean radius', 'mean texture']]
y = rawData.loc[:, 'cancer']
# Hold out 25% of the rows for testing; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, random_state=0, test_size=0.25
)
# Overview of the methods shared by scikit-learn classifiers. The calls are
# bound to a concrete estimator so that the listing actually runs.
model = LogisticRegression(solver='lbfgs')
model.fit(x_train, y_train)       ## estimate the model parameters from the training data
model.get_params()                ## constructor (hyper)parameters of the estimator, not the fitted coefficients
model.predict(x_test)             ## predict labels for x_test
model.predict_log_proba(x_test)   ## log of the predicted class probabilities
model.predict_proba(x_test)       ## probability of each label
model.score(x_test, y_test)       ## mean accuracy, used to evaluate the model

- 선형회귀

# Linear regression on the two selected features.
clf = LinearRegression().fit(x_train, y_train)  # estimate the parameters
clf.coef_                   # estimated coefficients (intercept excluded)
clf.intercept_              # estimated intercept
clf.predict(x_test)         # predictions for the held-out rows
clf.score(x_test, y_test)   # evaluate model performance on the test split
0.6092200214592733

- 로지스틱

# Logistic regression: construct the estimator first, then fit it.
clf = LogisticRegression(solver='lbfgs')
clf.fit(x_train, y_train)
clf.predict(x_test)         # predicted labels
clf.predict_proba(x_test)   # predicted probability of each label
clf.score(x_test, y_test)   # score on the test split
0.9020979020979021

- 나이브베이즈

# Gaussian naive Bayes: build and fit in one expression.
gnb = GaussianNB().fit(x_train, y_train)
gnb.predict(x_test)         # predicted labels
gnb.score(x_test, y_test)   # score on the test split
0.8951048951048951

- 의사결정나무

# Decision tree classifier: build and fit in one expression.
clf = tree.DecisionTreeClassifier().fit(x_train, y_train)
clf.predict(x_test)         # predicted labels
clf.predict_proba(x_test)   # predicted probability of each label
clf.score(x_test, y_test)   # score on the test split
0.8601398601398601

- svm

# Support vector classifier with a linear kernel.
clf = svm.SVC(kernel='linear').fit(x_train, y_train)
clf.predict(x_test)         # predicted labels
clf.score(x_test, y_test)   # score on the test split
0.9020979020979021

- 랜덤포레스트

# Random forest with depth-2 trees; the fixed seed keeps the fit repeatable.
clf = RandomForestClassifier(max_depth=2, random_state=0).fit(x_train, y_train)
clf.feature_importances_    # importance of each input feature
clf.predict(x_test)         # predicted labels
array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0,
       1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1,
       1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1,
       0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1,
       0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0,
       1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1,
       1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0])