Bootstrap

python离群点检测_Python机器学习(1)——异常点检测

本文主要介绍4种异常点检测方法:One-Class SVM(单类支持向量机,适用于非高斯分布的数据)、EllipticEnvelope(基于高斯概率密度的异常点检测)、Isolation Forest(基于集成学习的异常点检测)、LocalOutlierFactor(基于密度的局部异常因子),并基于同一数据集,对比不同检测方法的效果。

实现代码如下所示:

'''

Goal: compare the outlier-detection results of four algorithms -- One-Class SVM,

EllipticEnvelope, Isolation Forest and LocalOutlierFactor -- on the same dataset.

'''

import numpy as np

from scipy import stats

import matplotlib.pyplot as plt

import matplotlib.font_manager

from sklearn import svm

from sklearn.covariance import EllipticEnvelope

from sklearn.ensemble import IsolationForest

from sklearn.neighbors import LocalOutlierFactor

# Shared random state handed to the Isolation Forest.
rng = np.random.RandomState(42)

# Number of samples, fraction of them that are outliers, and the offsets
# (distance of each inlier cluster from the origin) to experiment with.
n_samples = 256
outliers_fraction = 0.25
clusters_separation = [0, 1, 2]

# The outlier detectors under comparison, keyed by the name shown on the plot.
classifiers = {
    "One-Class SVM": svm.OneClassSVM(
        nu=0.95 * outliers_fraction + 0.05, kernel='rbf', gamma=0.1),
    "Robust covariance": EllipticEnvelope(contamination=outliers_fraction),
    "Isolation Forest": IsolationForest(
        max_samples=n_samples,
        contamination=outliers_fraction,
        random_state=rng),
    "Local Outlier Factor": LocalOutlierFactor(
        n_neighbors=35,
        contamination=outliers_fraction),
}

# Evaluation grid for the decision surface, flattened once into an
# (n_points, 2) matrix because it is the same for every model and figure.
xx, yy = np.meshgrid(np.linspace(-7, 7, 100), np.linspace(-7, 7, 100))
grid_points = np.c_[xx.ravel(), yy.ravel()]

# Ground-truth labels: +1 for inliers, -1 for the trailing n_outliers samples.
n_inliers = int((1 - outliers_fraction) * n_samples)
n_outliers = int(outliers_fraction * n_samples)
ground_truth = np.ones(n_samples, dtype=int)
ground_truth[-n_outliers:] = -1

# One figure per cluster separation, one subplot per detector.
for i, offset in enumerate(clusters_separation):
    # Re-seed so that every separation setting draws the same noise.
    np.random.seed(42)

    # Two Gaussian inlier clusters at -offset / +offset, followed by
    # uniformly distributed outliers (kept last to match ground_truth).
    X1 = 0.3 * np.random.randn(n_inliers // 2, 2) - offset
    X2 = 0.3 * np.random.randn(n_inliers // 2, 2) + offset
    X = np.r_[X1, X2]
    X = np.r_[X, np.random.uniform(low=-6, high=6, size=(n_outliers, 2))]

    plt.figure(figsize=(9, 7))
    for j, (clf_name, clf) in enumerate(classifiers.items()):
        # Fit the model, score the training data and the evaluation grid.
        if isinstance(clf, LocalOutlierFactor):
            y_pred = clf.fit_predict(X)
            scores_pred = clf.negative_outlier_factor_
            # In outlier-detection mode LOF cannot score unseen points, and
            # the private `_decision_function` helper is gone from current
            # scikit-learn. Fit a novelty-mode copy with the same parameters
            # and use `score_samples`, which is on the same scale as
            # `negative_outlier_factor_` and therefore as `threshold`.
            grid_lof = LocalOutlierFactor(
                **{**clf.get_params(), "novelty": True}).fit(X)
            Z = grid_lof.score_samples(grid_points)
        else:
            clf.fit(X)
            scores_pred = clf.decision_function(X)
            y_pred = clf.predict(X)
            Z = clf.decision_function(grid_points)

        # Score below which the expected fraction of outliers falls.
        threshold = np.percentile(scores_pred, 100 * outliers_fraction)
        n_errors = (y_pred != ground_truth).sum()

        # Plot the decision surface and the samples.
        Z = Z.reshape(xx.shape)
        subplot = plt.subplot(2, 2, j + 1)
        subplot.contourf(xx, yy, Z, levels=np.linspace(Z.min(), threshold, 7),
                         cmap=plt.cm.Blues_r)
        a = subplot.contour(xx, yy, Z, levels=[threshold], linewidths=2,
                            colors='red')
        subplot.contourf(xx, yy, Z, levels=[threshold, Z.max()],
                         colors='orange')
        # True inliers
        b = subplot.scatter(X[:-n_outliers, 0], X[:-n_outliers, 1],
                            c='white', s=20, edgecolor='k')
        # True outliers
        c = subplot.scatter(X[-n_outliers:, 0], X[-n_outliers:, 1],
                            c='black', s=20, edgecolor='k')
        subplot.axis('tight')
        # `ContourSet.collections` was deprecated in Matplotlib 3.8;
        # `legend_elements()` supplies the legend handle for the contour.
        boundary_handle = a.legend_elements()[0][0]
        subplot.legend(
            [boundary_handle, b, c],
            ['learned decision function', 'true inliers', 'true outliers'],
            prop=matplotlib.font_manager.FontProperties(size=10),
            loc='lower right')
        subplot.set_xlabel("{}.{}({})".format(j + 1, clf_name, n_errors))
        subplot.set_xlim((-7, 7))
        subplot.set_ylim((-7, 7))

    plt.subplots_adjust(left=0.04, bottom=0.1, right=0.96, top=0.94,
                        wspace=0.1, hspace=0.20)
    plt.suptitle("Outlier detection{}".format(i + 1))

# Display all figures once every separation setting has been drawn.
plt.show()

对比结果如下图所示:

20180531170404646

20180531170419355

2018053117042825

;