from pyod.models.knn import KNN # kNN detector
from pyod.utils.data import generate_data
from pyod.utils.example import visualize
[GNN] pyod examples
pyod examples
-
ref: https://pyod.readthedocs.io/en/latest/example.html
knn_example
# Basic kNN outlier-detection walkthrough on synthetic data.
contamination = 0.1  # percentage of outliers
n_train = 200  # number of training points
n_test = 100  # number of testing points

# Generate a synthetic train/test split with the requested outlier share.
X_train, X_test, y_train, y_test = generate_data(
    n_train=n_train, n_test=n_test, contamination=contamination)

# train kNN detector
clf_name = 'KNN'
clf = KNN()
clf.fit(X_train)

# get the prediction labels and outlier scores of the training data
y_train_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
y_train_scores = clf.decision_scores_  # raw outlier scores

# get the prediction on the test data
y_test_pred = clf.predict(X_test)  # outlier labels (0 or 1)
y_test_scores = clf.decision_function(X_test)  # outlier scores

# it is possible to get the prediction confidence as well
# outlier labels (0 or 1) and confidence in the range of [0,1]
y_test_pred, y_test_pred_confidence = clf.predict(
    X_test, return_confidence=True)
from pyod.utils.data import evaluate_print
# evaluate and print the results
# Each call reports the detector's scores against the true labels for one
# split; the two statements must be on separate lines to be valid Python.
print("\nOn Training Data:")
evaluate_print(clf_name, y_train, y_train_scores)
print("\nOn Test Data:")
evaluate_print(clf_name, y_test, y_test_scores)
On Training Data:
KNN ROC:0.8753, precision @ rank n:0.65
On Test Data:
KNN ROC:0.9811, precision @ rank n:0.9
# Plot ground truth against predictions for both splits; show the figure
# interactively without writing it to disk.
visualize(clf_name, X_train, y_train, X_test, y_test, y_train_pred,
          y_test_pred, show_figure=True, save_figure=False)
Model Combination Example
Example of combining multiple base outlier scores. Four combination frameworks are demonstrated:
- Average: take the average of all base detectors
- Maximization: take the maximum score across all detectors as the score
- Average of Maximum (AOM)
- Maximum of Average (MOA)
위 4개의 프레임워크를 사용하여 KNN 이상치 감지 모델 평가
import os

import numpy as np
from sklearn.model_selection import train_test_split
from scipy.io import loadmat
from pyod.models.knn import KNN
from pyod.models.combination import aom, moa, average, maximization, median
from pyod.utils.utility import standardizer
from pyod.utils.data import generate_data
from pyod.utils.data import evaluate_print
if __name__ == "__main__":
    # Combine the scores of several kNN detectors (different k) with the
    # average / maximization / median / AOM / MOA frameworks and report
    # each combination's performance on the held-out test split.

    # Define data file and read X and y
    # Generate some data if the source data is missing
    mat_file = 'cardio.mat'
    try:
        mat = loadmat(os.path.join('data', mat_file))
    except (TypeError, IOError):
        # The data file does not exist (or cannot be read): fall back to
        # synthetic data. Both error types were handled identically.
        print('{data_file} does not exist. Use generated data'.format(
            data_file=mat_file))
        X, y = generate_data(train_only=True)  # load data
    else:
        X = mat['X']
        y = mat['y'].ravel()

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4)

    # standardizing data for processing
    X_train_norm, X_test_norm = standardizer(X_train, X_test)

    # Initialize 20 base detectors for combination
    k_list = [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140,
              150, 160, 170, 180, 190, 200]
    # number of base detectors; derived from k_list so the score matrices
    # below always have one column per detector
    n_clf = len(k_list)

    # One row per sample (row count of the train/test data), one column
    # per base detector.
    train_scores = np.zeros([X_train.shape[0], n_clf])
    test_scores = np.zeros([X_test.shape[0], n_clf])

    print('Combining {n_clf} kNN detectors'.format(n_clf=n_clf))

    for i, k in enumerate(k_list):
        clf = KNN(n_neighbors=k, method='largest')
        clf.fit(X_train_norm)

        train_scores[:, i] = clf.decision_scores_
        test_scores[:, i] = clf.decision_function(X_test_norm)

    # Decision scores have to be normalized before combination
    train_scores_norm, test_scores_norm = standardizer(train_scores,
                                                       test_scores)

    # NOTE(review): the run recorded below this cell reports ROC close to 0
    # on the generated fallback data - confirm the data/labels before
    # relying on those numbers.

    # Combination by average
    y_by_average = average(test_scores_norm)
    evaluate_print('Combination by Average', y_test, y_by_average)

    # Combination by max
    y_by_maximization = maximization(test_scores_norm)
    evaluate_print('Combination by Maximization', y_test, y_by_maximization)

    # Combination by median
    y_by_median = median(test_scores_norm)
    evaluate_print('Combination by Median', y_test, y_by_median)

    # Combination by aom
    y_by_aom = aom(test_scores_norm, n_buckets=5)
    evaluate_print('Combination by AOM', y_test, y_by_aom)

    # Combination by moa
    y_by_moa = moa(test_scores_norm, n_buckets=5)
    evaluate_print('Combination by MOA', y_test, y_by_moa)
cardio.mat does not exist. Use generated data
Combining 20 kNN detectors
Combination by Average ROC:0.0, precision @ rank n:0.0
Combination by Maximization ROC:0.0055, precision @ rank n:0.0
Combination by Median ROC:0.0, precision @ rank n:0.0
Combination by AOM ROC:0.0055, precision @ rank n:0.0
Combination by MOA ROC:0.0083, precision @ rank n:0.0
-
참고(홈페이지에서 나온 결과값)
Thresholding Example
Example of using the kNN detector together with a thresholder (FILTER) for outlier detection
from pyod.models.knn import KNN # kNN detector
from pyod.models.thresholds import FILTER # Filter thresholder
# kNN outlier detection where the inlier/outlier cut-off is chosen by a
# FILTER thresholder instead of a fixed contamination rate.
contamination = 0.1  # percentage of outliers
n_train = 200  # number of training points
n_test = 100  # number of testing points

X_train, X_test, y_train, y_test = generate_data(
    n_train=n_train, n_test=n_test, contamination=contamination)

# train kNN detector and apply FILTER thresholding
# NOTE: FILTER is imported from pyod.models.thresholds, which needs the
# separate `pythresh` package; the recorded run of this example failed with
# "ModuleNotFoundError: No module named 'pythresh'" because it was missing.
clf_name = 'KNN'
clf = KNN(contamination=FILTER())
clf.fit(X_train)

# get the prediction labels and outlier scores of the training data
y_train_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
y_train_scores = clf.decision_scores_  # raw outlier scores
ModuleNotFoundError: No module named 'pythresh'