import numpy as np
import pandas as pd
from sklearn import datasets
# 참고: 분류용 가상 데이터 만들기
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB #나이브 베이즈
from sklearn import svm
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
파이썬:사이킷런
sklearn
ref:https://losskatsu.github.io/machine-learning/sklearn/#train-test-%EB%8D%B0%EC%9D%B4%ED%84%B0-%EB%B6%84%ED%95%A0%ED%95%98%EA%B8%B0
# 분류용 가상 데이터 만들기
from sklearn.datasets import make_classification
X, Y = make_classification(n_samples=1000, n_features=4,
                           n_informative=2, n_redundant=0,
                           random_state=0, shuffle=False)
# n_informative: 종속변수와 상관관계가 존재하는 독립변수 수(default=2)
# n_redundant: 독립변수끼리 종속관계에 있는 독립변수 수
raw = datasets.load_breast_cancer() ## sklearn에 내장된 원본 데이터 불러오기
print(raw.feature_names) ## 열(column) 이름 확인
['mean radius' 'mean texture' 'mean perimeter' 'mean area'
'mean smoothness' 'mean compactness' 'mean concavity'
'mean concave points' 'mean symmetry' 'mean fractal dimension'
'radius error' 'texture error' 'perimeter error' 'area error'
'smoothness error' 'compactness error' 'concavity error'
'concave points error' 'symmetry error' 'fractal dimension error'
'worst radius' 'worst texture' 'worst perimeter' 'worst area'
'worst smoothness' 'worst compactness' 'worst concavity'
'worst concave points' 'worst symmetry' 'worst fractal dimension']
data = pd.DataFrame(raw.data) ## 독립변수 데이터 모음
target = pd.DataFrame(raw.target) ## 종속변수 데이터 모음
rawData = pd.concat([data,target], axis=1) ## 독립변수 + 종속변수 열 결합
## 열(column)이름 설정
rawData.columns=['mean radius', 'mean texture', 'mean perimeter', 'mean area',
'mean smoothness', 'mean compactness', 'mean concavity',
'mean concave points', 'mean symmetry', 'mean fractal dimension',
'radius error', 'texture error', 'perimeter error', 'area error',
'smoothness error', 'compactness error', 'concavity error',
'concave points error', 'symmetry error', 'fractal dimension error',
'worst radius', 'worst texture', 'worst perimeter', 'worst area',
'worst smoothness', 'worst compactness', 'worst concavity',
'worst concave points', 'worst symmetry', 'worst fractal dimension',
'cancer']
rawData.head(10) ## 데이터 확인
['mean radius' 'mean texture' 'mean perimeter' 'mean area'
'mean smoothness' 'mean compactness' 'mean concavity'
'mean concave points' 'mean symmetry' 'mean fractal dimension'
'radius error' 'texture error' 'perimeter error' 'area error'
'smoothness error' 'compactness error' 'concavity error'
'concave points error' 'symmetry error' 'fractal dimension error'
'worst radius' 'worst texture' 'worst perimeter' 'worst area'
'worst smoothness' 'worst compactness' 'worst concavity'
'worst concave points' 'worst symmetry' 'worst fractal dimension']
| | mean radius | mean texture | mean perimeter | mean area | mean smoothness | mean compactness | mean concavity | mean concave points | mean symmetry | mean fractal dimension | ... | worst texture | worst perimeter | worst area | worst smoothness | worst compactness | worst concavity | worst concave points | worst symmetry | worst fractal dimension | cancer |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 17.99 | 10.38 | 122.80 | 1001.0 | 0.11840 | 0.27760 | 0.30010 | 0.14710 | 0.2419 | 0.07871 | ... | 17.33 | 184.60 | 2019.0 | 0.1622 | 0.6656 | 0.7119 | 0.2654 | 0.4601 | 0.11890 | 0 |
1 | 20.57 | 17.77 | 132.90 | 1326.0 | 0.08474 | 0.07864 | 0.08690 | 0.07017 | 0.1812 | 0.05667 | ... | 23.41 | 158.80 | 1956.0 | 0.1238 | 0.1866 | 0.2416 | 0.1860 | 0.2750 | 0.08902 | 0 |
2 | 19.69 | 21.25 | 130.00 | 1203.0 | 0.10960 | 0.15990 | 0.19740 | 0.12790 | 0.2069 | 0.05999 | ... | 25.53 | 152.50 | 1709.0 | 0.1444 | 0.4245 | 0.4504 | 0.2430 | 0.3613 | 0.08758 | 0 |
3 | 11.42 | 20.38 | 77.58 | 386.1 | 0.14250 | 0.28390 | 0.24140 | 0.10520 | 0.2597 | 0.09744 | ... | 26.50 | 98.87 | 567.7 | 0.2098 | 0.8663 | 0.6869 | 0.2575 | 0.6638 | 0.17300 | 0 |
4 | 20.29 | 14.34 | 135.10 | 1297.0 | 0.10030 | 0.13280 | 0.19800 | 0.10430 | 0.1809 | 0.05883 | ... | 16.67 | 152.20 | 1575.0 | 0.1374 | 0.2050 | 0.4000 | 0.1625 | 0.2364 | 0.07678 | 0 |
5 | 12.45 | 15.70 | 82.57 | 477.1 | 0.12780 | 0.17000 | 0.15780 | 0.08089 | 0.2087 | 0.07613 | ... | 23.75 | 103.40 | 741.6 | 0.1791 | 0.5249 | 0.5355 | 0.1741 | 0.3985 | 0.12440 | 0 |
6 | 18.25 | 19.98 | 119.60 | 1040.0 | 0.09463 | 0.10900 | 0.11270 | 0.07400 | 0.1794 | 0.05742 | ... | 27.66 | 153.20 | 1606.0 | 0.1442 | 0.2576 | 0.3784 | 0.1932 | 0.3063 | 0.08368 | 0 |
7 | 13.71 | 20.83 | 90.20 | 577.9 | 0.11890 | 0.16450 | 0.09366 | 0.05985 | 0.2196 | 0.07451 | ... | 28.14 | 110.60 | 897.0 | 0.1654 | 0.3682 | 0.2678 | 0.1556 | 0.3196 | 0.11510 | 0 |
8 | 13.00 | 21.82 | 87.50 | 519.8 | 0.12730 | 0.19320 | 0.18590 | 0.09353 | 0.2350 | 0.07389 | ... | 30.73 | 106.20 | 739.3 | 0.1703 | 0.5401 | 0.5390 | 0.2060 | 0.4378 | 0.10720 | 0 |
9 | 12.46 | 24.04 | 83.97 | 475.9 | 0.11860 | 0.23960 | 0.22730 | 0.08543 | 0.2030 | 0.08243 | ... | 40.68 | 97.65 | 711.4 | 0.1853 | 1.0580 | 1.1050 | 0.2210 | 0.4366 | 0.20750 | 0 |
10 rows × 31 columns
x = rawData[['mean radius', 'mean texture']]
y = rawData['cancer']
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.25, random_state=0)
## 모수 추정(estimate)
fit(x_train, y_train)
## 추정된 모수 확인
get_params()
## x_test로부터 라벨 예측
predict(x_test)
## 로그 취한 확률 예측
predict_log_proba(x_test)
## 각 라벨로 예측될 확률
predict_proba(x_test)
## 모델 정확도 평가를 위한 mean accuracy
score(x_test, y_test)
-
선형회귀
clf = LinearRegression()
# 모수 추정
clf.fit(x_train,y_train)
# 추정 된 모수 확인(상수항 제외)
clf.coef_
# 추정 된 상수항 확인
clf.intercept_
clf.predict(x_test)#clf.predic(x_test) # 예측
# 모형 성능 평가
clf.score(x_test, y_test)
0.6092200214592733
-
로지스틱
clf = LogisticRegression(solver='lbfgs').fit(x_train,y_train)
clf.predict(x_test)
clf.predict_proba(x_test)
clf.score(x_test,y_test)
0.9020979020979021
-
나이브베이즈
gnb = GaussianNB()
gnb.fit(x_train, y_train)
gnb.predict(x_test)
gnb.score(x_test, y_test)
0.8951048951048951
-
의사결정나무
clf = tree.DecisionTreeClassifier()
clf.fit(x_train, y_train)
clf.predict(x_test)
clf.predict_proba(x_test)
clf.score(x_test, y_test)
0.8601398601398601
-
svm
clf = svm.SVC(kernel='linear')
clf.fit(x_train, y_train)
clf.predict(x_test)
clf.score(x_test, y_test)
0.9020979020979021
-
랜덤포레스트
clf = RandomForestClassifier(max_depth=2, random_state=0)
clf.fit(x_train, y_train)
clf.feature_importances_
clf.predict(x_test)
array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0,
1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1,
1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1,
0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1,
0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0,
1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1,
1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0])