Linear

Method Accuracy Precision Recall F1 Score
Proposed Method 0.992 0.992 1.000 0.996
LOF (Breunig et al., 2000) 0.871 0.962 0.900 0.930
OCSVM (Schölkopf et al., 2001) 0.940 0.994 0.942 0.968
ABOD (Kriegel et al., 2008) 0.988 0.994 0.994 0.994
Isolation Forest (Liu et al., 2008) 0.881 1.000 0.875 0.933
signal = sample(c(runif(25,-7,-2.5), runif(25,2.5,7), rep(0,950)))
Method Accuracy Precision Recall F1 Score
Proposed Method 0.970 0.969 1.000 0.984
LOF (Breunig et al., 2000) 0.879 0.969 0.901 0.934
OCSVM (Schölkopf et al., 2001) 0.938 0.992 0.942 0.967
ABOD (Kriegel et al., 2008) 0.976 0.987 0.987 0.987
Isolation Forest (Liu et al., 2008) 0.864 0.998 0.859 0.923
signal = sample(c(runif(25,-7,-1), runif(25,1,7), rep(0,950)))
Method Accuracy Precision Recall F1 Score
Proposed Method 0.969 0.968 1.000 0.984
LOF (Breunig et al., 2000) 0.879 0.976 0.906 0.940
OCSVM (Schölkopf et al., 2001) 0.923 0.986 0.933 0.958
ABOD (Kriegel et al., 2008) 0.968 0.983 0.983 0.983
Isolation Forest (Liu et al., 2008) 0.841 0.991 0.840 0.909

one dimensional manifold

Method Accuracy Precision Recall F1 Score
Proposed Method 0.997 0.997 1.000 0.998
LOF (Breunig et al., 2000) 0.886 0.987 0.892 0.937
OCSVM (Schölkopf et al., 2001) 0.923 0.988 0.931 0.958
ABOD (Kriegel et al., 2008) 0.988 0.994 0.994 0.994
Isolation Forest (Liu et al., 2008) 0.477 0.989 0.455 0.623

Bunny

Method Accuracy Precision Recall F1 Score
Proposed Method 0.991 0.993 0.997 0.995
LOF (Breunig et al., 2000) 0.918 0.954 0.959 0.957
OCSVM (Schölkopf et al., 2001) 0.865 0.955 0.901 0.927
ABOD (Kriegel et al., 2008) 0.905 0.951 0.949 0.950
Isolation Forest (Liu et al., 2008) 0.761 0.953 0.789 0.863

Circle (n=5000, 음수쪽 3%, 양수쪽 2%)

Method Accuracy Precision Recall F1 Score
Proposed Method 0.972 0.989 0.982 0.985
LOF (Breunig et al., 2000) 0.865 0.950 0.905 0.927
OCSVM (Schölkopf et al., 2001) 0.921 0.985 0.932 0.957
ABOD (Kriegel et al., 2008) 0.970 0.984 $\underline{0.984}$ 0.984
Isolation Forest (Liu et al., 2008) 0.855 $\underline{0.993}$ 0.853 0.918

latex

% Please add the following required packages to your document preamble:
% \usepackage[normalem]{ulem}
% \useunder{\uline}{\ul}{}
\begin{table}[]
\begin{tabular}{|lcccc}
\hline
\multicolumn{5}{|c|}{Linear} \\ \hline
\multicolumn{1}{|l|}{Method} & \multicolumn{1}{c|}{Accuracy} & \multicolumn{1}{c|}{Precision} & \multicolumn{1}{c|}{Recall} & \multicolumn{1}{c|}{F1 Score} \\ \hline
\multicolumn{1}{|l|}{Proposed Method} & \textbf{0.939} & {\ul \textbf{1.000}} & \textbf{0.936} & \textbf{0.967} \\ \cline{1-1}
\multicolumn{1}{|l|}{LOF (Breunig et al., 2000)} & 0.871 & 0.962 & 0.900 & 0.930 \\ \cline{1-1}
\multicolumn{1}{|l|}{OCSVM (Sch\"olkopf et al., 2001)} & 0.940 & 0.994 & 0.942 & 0.968 \\ \cline{1-1}
\multicolumn{1}{|l|}{ABOD (Kriegel et al., 2008)} & {\ul 0.988} & 0.994 & {\ul 0.994} & {\ul 0.994} \\ \cline{1-1}
\multicolumn{1}{|l|}{Isolation Forest (Liu et al., 2008)} & 0.881 & {\ul 1.000} & 0.875 & 0.933 \\ \hline
\multicolumn{5}{|c|}{One Dimensional Manifold} \\ \hline
\multicolumn{1}{|l|}{Method} & \multicolumn{1}{c|}{Accuracy} & \multicolumn{1}{c|}{Precision} & \multicolumn{1}{c|}{Recall} & \multicolumn{1}{c|}{F1 Score} \\ \hline
\multicolumn{1}{|l|}{Proposed Method} & \textbf{0.997} & \textbf{0.997} & {\ul \textbf{1.000}} & \textbf{0.998} \\ \cline{1-1}
\multicolumn{1}{|l|}{LOF (Breunig et al., 2000)} & 0.866 & 0.987 & 0.892 & 0.937 \\ \cline{1-1}
\multicolumn{1}{|l|}{OCSVM (Sch\"olkopf et al., 2001)} & 0.923 & 0.988 & 0.931 & 0.958 \\ \cline{1-1}
\multicolumn{1}{|l|}{ABOD (Kriegel et al., 2008)} & {\ul 0.988} & {\ul 0.994} & 0.994 & {\ul 0.994} \\ \cline{1-1}
\multicolumn{1}{|l|}{Isolation Forest (Liu et al., 2008)} & 0.477 & 0.989 & 0.455 & 0.623 \\ \hline
\multicolumn{5}{|c|}{Two Dimensional Manifold} \\ \hline
\multicolumn{1}{|l|}{Method} & \multicolumn{1}{c|}{Accuracy} & \multicolumn{1}{c|}{Precision} & \multicolumn{1}{c|}{Recall} & \multicolumn{1}{c|}{F1 Score} \\ \hline
\multicolumn{1}{|l|}{Proposed Method} & {\ul \textbf{0.960}} & {\ul \textbf{0.969}} & {\ul \textbf{0.990}} & {\ul \textbf{0.979}} \\ \cline{1-1}
\multicolumn{1}{|l|}{LOF (Breunig et al., 2000)} & 0.918 & 0.954 & 0.959 & 0.957 \\ \cline{1-1}
\multicolumn{1}{|l|}{OCSVM (Sch\"olkopf et al., 2001)} & 0.865 & 0.955 & 0.901 & 0.953 \\ \cline{1-1}
\multicolumn{1}{|l|}{ABOD (Kriegel et al., 2008)} & 0.905 & 0.951 & 0.949 & 0.950 \\ \cline{1-1}
\multicolumn{1}{|l|}{Isolation Forest (Liu et al., 2008)} & 0.761 & 0.953 & 0.789 & 0.863 \\ \cline{1-1}
\end{tabular}
\end{table}

Inliers are labeled 1, while outliers are labeled -1

import numpy as np
import matplotlib.pyplot as plt
import matplotlib
from sklearn.svm import OneClassSVM
from sklearn.linear_model import SGDOneClassSVM
from sklearn.kernel_approximation import Nystroem
from sklearn.pipeline import make_pipeline

import pandas as pd
from sklearn.neighbors import LocalOutlierFactor

import rpy2
import rpy2.robjects as ro 
from rpy2.robjects.vectors import FloatVector 
from rpy2.robjects.packages import importr

from sklearn.datasets import fetch_kddcup99, fetch_covtype, fetch_openml
from sklearn.preprocessing import LabelBinarizer

import tqdm

from pygsp import graphs, filters, plotting, utils

from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

import plotly.graph_objects as go
from IPython.display import HTML

import plotly.express as px

from sklearn.covariance import EmpiricalCovariance, MinCovDet

from alibi_detect.od import IForest

from pyod.models.abod import ABOD
from pyod.models.cblof import CBLOF
import seaborn as sns

from PyNomaly import loop

from sklearn import svm

EbayesThresh

%load_ext rpy2.ipython
The rpy2.ipython extension is already loaded. To reload it, use:
  %reload_ext rpy2.ipython
%%R
# Simulate the 1000-point noise-plus-outlier series shared by every experiment below.
library(EbayesThresh)
set.seed(1)
# Standard-normal noise for each of the 1000 observations.
epsilon = rnorm(1000)
# 50 injected outliers (25 drawn from [-7,-5], 25 from [5,7]) shuffled among 950 zeros.
signal = sample(c(runif(25,-7,-5), runif(25,5,7), rep(0,950)))
# Positions of the injected outliers; which() returns 1-based R indices.
index_of_trueoutlier = which(signal!=0)
index_of_trueoutlier
# Observed series: injected outliers plus noise.
x=signal+epsilon
# Plot the series and mark the true outliers with large coloured circles.
plot(1:1000,x)
points(index_of_trueoutlier,x[index_of_trueoutlier],col=2,cex=4)

#plot(x,type='l')
#mu <- EbayesThresh::ebayesthresh(x,sdev=2)
#lines(mu,col=2,lty=2,lwd=2)
%R -o x
%R -o index_of_trueoutlier
%R -o signal
# Handle to R's EbayesThresh::ebayesthresh so it can be called on Python data via rpy2.
ebayesthresh = importr('EbayesThresh').ebayesthresh
# Empirical-Bayes thresholded version of the raw series exported from the R cell.
xhat = np.array(ebayesthresh(FloatVector(x)))
# plt.plot(xhat)
# index_of_trueoutlier comes from R's which(), which is 1-based; shift it to
# 0-based before indexing numpy arrays, otherwise every value is off by one
# (and position 1000 would raise IndexError).
outlier_true_index = np.array(index_of_trueoutlier, dtype=int) - 1
outlier_true_value = x[outlier_true_index]

package와 비교를 위해 outlier는 -1, inlier는 1로 표시

# Ground-truth labels in the detector convention used throughout:
# -1 where an outlier was injected (non-zero signal), 1 everywhere else.
outlier_true_one = [1 if value == 0 else -1 for value in signal]

Linear

# Linear scenario: trend y = 5x sampled at 1000 evenly spaced points on [0, 2].
_x = np.linspace(0,2,1000)
_y1 = 5*_x
_y = _y1 + x # x is the series from the R cell: signal + epsilon (injected outliers plus noise)
_df=pd.DataFrame({'x':_x, 'y':_y})
# Two-column (x, y) feature matrix fed to the baseline detectors.
X = np.array(_df)

1. Proposed Method

# LOF labels for the linear data. The detector is created here because this
# cell runs before the LOF section that otherwise defines `clf`; the settings
# match that later definition.
clf = LocalOutlierFactor(n_neighbors=2)
# fit_predict refits on every call, so fit once and reuse the labels.
_lof_labels = clf.fit_predict(X)
# Row positions flagged as outliers (label -1), kept as the tuple np.where returns.
outlier_first_index = np.where(_lof_labels == -1)
outlier_first_value = _lof_labels[_lof_labels == -1]
class SIMUL:
    """Graph-Fourier smoother for a signal ``y`` observed at locations ``x``.

    The graph weight matrix is taken from the module-level ``w`` at
    construction time. ``fit`` stores the smoothed signal and the residuals
    as the ``yHat`` and ``Residual`` columns of ``self.df``.
    """

    def __init__(self,df):
        """Keep ``df`` and cache its ``x`` / ``y`` columns as numpy arrays."""
        self.df = df
        self.x = df.x.to_numpy()
        self.y = df.y.to_numpy()
        #self.y1 = df.y1.to_numpy()
        self.n = len(self.y)
        self.W = w

    def _eigen(self):
        """Eigendecompose the symmetric normalized Laplacian built from ``self.W``."""
        degree = self.W.sum(axis=1)
        inv_sqrt_degree = np.diag(1/np.sqrt(degree))
        laplacian = np.diag(degree) - self.W
        # L = D^{-1/2} (D - W) D^{-1/2}
        self.L = inv_sqrt_degree @ laplacian @ inv_sqrt_degree
        self.lamb, self.Psi = np.linalg.eigh(self.L)
        self.Lamb = np.diag(self.lamb)

    def fit(self,sd=5):
        """Threshold the spectrum with R's ebayesthresh and rebuild the signal.

        ``sd`` is forwarded to ``ebayesthresh`` as its ``sd`` keyword.
        """
        self._eigen()
        # Graph Fourier transform of y and its per-frequency power.
        self.ybar = self.Psi.T @ self.y
        self.power = self.ybar**2
        ebayesthresh = importr('EbayesThresh').ebayesthresh
        self.power_threshed = np.array(ebayesthresh(FloatVector(self.power), sd=sd))
        # Keep only the coefficients whose thresholded power is still positive.
        surviving = self.power_threshed > 0
        self.ybar_threshed = np.where(surviving, self.ybar, 0)
        # Inverse transform gives the smoothed signal.
        self.yhat = self.Psi @ self.ybar_threshed
        self.df = self.df.assign(yHat=self.yhat)
        self.df = self.df.assign(Residual=self.df.y - self.df.yHat)
# Adjacency matrix of a 1000-node path graph: consecutive observations are
# linked with weight 1 and everything else (including the diagonal) is 0.
# Two shifted identity matrices give the same matrix as the element-wise
# double loop without a million Python-level iterations.
w = np.eye(1000, k=1) + np.eye(1000, k=-1)
# Proposed method on the linear data: smooth, then flag every point whose
# squared residual exceeds 25.
_simul = SIMUL(_df)
_simul.fit(sd=20)
_flagged = _simul.df.query('Residual**2>25')
outlier_simul_first_index = np.array(_flagged.index)
outlier_simul_first_value = np.array(_flagged['y'])
# Same rule expressed as -1 (outlier) / 1 (inlier) labels for every row.
outlier_simul_one = [
    -1 if squared > 25 else 1
    for squared in (_simul.df['Residual']**2).tolist()
]
conf_matrix = confusion_matrix(outlier_true_one, outlier_simul_one)
fig, ax = plt.subplots(figsize=(5, 5))
ax.matshow(conf_matrix, cmap=plt.cm.Oranges, alpha=0.3)
# Write each cell's count on top of the heat map.
for row_idx, row_counts in enumerate(conf_matrix):
    for col_idx, count in enumerate(row_counts):
        ax.text(x=col_idx, y=row_idx, s=count, va='center', ha='center', size='xx-large')
plt.xlabel('Predictions', fontsize=18)
plt.ylabel('Actuals', fontsize=18)
plt.title('Confusion Matrix', fontsize=18)
plt.show()
for template, scorer in (
    ('Accuracy: %.3f', accuracy_score),
    ('Precision: %.3f', precision_score),
    ('Recall: %.3f', recall_score),
    ('F1 Score: %.3f', f1_score),
):
    print(template % scorer(outlier_true_one, outlier_simul_one))
Accuracy: 0.958
Precision: 0.958
Recall: 1.000
F1 Score: 0.978

2. LOF

# LOF baseline on the linear data.
clf = LocalOutlierFactor(n_neighbors=2)
# fit_predict refits the model on every call; fit once and reuse the labels
# for the confusion matrix and all four scores.
lof_pred = clf.fit_predict(X)
conf_matrix = confusion_matrix(outlier_true_one, lof_pred)
fig, ax = plt.subplots(figsize=(5, 5))
ax.matshow(conf_matrix, cmap=plt.cm.Oranges, alpha=0.3)
for i in range(conf_matrix.shape[0]):
    for j in range(conf_matrix.shape[1]):
        ax.text(x=j, y=i,s=conf_matrix[i, j], va='center', ha='center', size='xx-large')
plt.xlabel('Predictions', fontsize=18)
plt.ylabel('Actuals', fontsize=18)
plt.title('Confusion Matrix', fontsize=18)
plt.show()
# NOTE: scikit-learn's default pos_label=1 means precision/recall/F1 below
# are computed for the inlier class (label 1), not for the outliers.
print('Accuracy: %.3f' % accuracy_score(outlier_true_one, lof_pred))
print('Precision: %.3f' % precision_score(outlier_true_one, lof_pred))
print('Recall: %.3f' % recall_score(outlier_true_one, lof_pred))
print('F1 Score: %.3f' % f1_score(outlier_true_one, lof_pred))
Accuracy: 0.884
Precision: 0.970
Recall: 0.906
F1 Score: 0.937
# Visualise the LOF scores on the linear data: every point gets a red circle
# whose size grows with its (min-max normalised) outlier score.
ground_truth=outlier_true_one
y_pred = clf.fit_predict(X)
n_errors = (y_pred != ground_truth).sum()
X_scores = clf.negative_outlier_factor_
plt.figure(figsize=(10,6))
plt.title("Local Outlier Factor (LOF)")
plt.scatter(X[:, 0], X[:, 1], color="k", s=3.0, label="Data points")
# Rescale so the most outlying point has radius 1 and the least outlying 0.
radius = (X_scores.max() - X_scores) / (X_scores.max() - X_scores.min())
plt.scatter(
    X[:, 0],
    X[:, 1],
    s=1000 * radius,
    edgecolors="r",
    facecolors="none",
    label="Outlier scores",
)
plt.axis("tight")
# plt.xlim((-5, 5))
# plt.ylim((-5, 5))
plt.xlabel("prediction errors: %d" % (n_errors))
legend = plt.legend(loc="upper left")
# Matplotlib 3.7 renamed Legend.legendHandles to legend_handles and later
# removed the old name; support both so the cell runs on either version.
_legend_handles = getattr(legend, "legend_handles", None)
if _legend_handles is None:
    _legend_handles = legend.legendHandles
_legend_handles[0]._sizes = [10]
_legend_handles[1]._sizes = [20]
plt.show()

3. SVM

X = np.array(_df).reshape(-1,2)
clf = svm.OneClassSVM(nu=0.1, kernel="rbf", gamma=0.1)
clf.fit(X)
OneClassSVM(gamma=0.1, nu=0.1)
# One-class SVM labels on the linear data (1 = inlier, -1 = outlier).
y_pred = clf.predict(X)
# Reuse the prediction instead of calling predict a second time.
outlier_OSVM_one = list(y_pred)
conf_matrix = confusion_matrix(outlier_true_one, outlier_OSVM_one)
fig, ax = plt.subplots(figsize=(5, 5))
ax.matshow(conf_matrix, cmap=plt.cm.Oranges, alpha=0.3)
for i in range(conf_matrix.shape[0]):
    for j in range(conf_matrix.shape[1]):
        ax.text(x=j, y=i,s=conf_matrix[i, j], va='center', ha='center', size='xx-large')
plt.xlabel('Predictions', fontsize=18)
plt.ylabel('Actuals', fontsize=18)
plt.title('Confusion Matrix', fontsize=18)
plt.show()
# NOTE: scikit-learn's default pos_label=1 means precision/recall/F1 below
# are computed for the inlier class (label 1), not for the outliers.
print('Accuracy: %.3f' % accuracy_score(outlier_true_one, outlier_OSVM_one))
print('Precision: %.3f' % precision_score(outlier_true_one, outlier_OSVM_one))
print('Recall: %.3f' % recall_score(outlier_true_one, outlier_OSVM_one))
print('F1 Score: %.3f' % f1_score(outlier_true_one, outlier_OSVM_one))
Accuracy: 0.924
Precision: 0.987
Recall: 0.933
F1 Score: 0.959

4. ABOD

abod_clf = ABOD(contamination=0.05)
abod_clf.fit(_df[['x', 'y']])
ABOD(contamination=0.05, method='fast', n_neighbors=5)
# Fit a CBLOF detector on the same (x, y) features as ABOD.
cblof_clf = CBLOF(contamination=0.05,check_estimator=False, random_state=77)
cblof_clf.fit(_df[['x', 'y']])
# Store both detectors' labels as new columns; note this widens _df in place,
# so later cells select the feature columns explicitly.
_df['ABOD_Clf'] = abod_clf.labels_
_df['CBLOF_Clf'] = cblof_clf.labels_
# Scatter of the data coloured by the ABOD label.
sns.scatterplot(data = _df, x = 'x', y = 'y', hue = 'ABOD_Clf')
<AxesSubplot:xlabel='x', ylabel='y'>
# Convert the ABOD labels to the notebook convention: 0 -> 1 (inlier),
# anything else -> -1 (outlier).
outlier_ABOD_one = [1 if label == 0 else -1 for label in abod_clf.labels_]
conf_matrix = confusion_matrix(outlier_true_one, outlier_ABOD_one)
fig, ax = plt.subplots(figsize=(5, 5))
ax.matshow(conf_matrix, cmap=plt.cm.Oranges, alpha=0.3)
# Write each cell's count on top of the heat map.
for row_idx, row_counts in enumerate(conf_matrix):
    for col_idx, count in enumerate(row_counts):
        ax.text(x=col_idx, y=row_idx, s=count, va='center', ha='center', size='xx-large')
plt.xlabel('Predictions', fontsize=18)
plt.ylabel('Actuals', fontsize=18)
plt.title('Confusion Matrix', fontsize=18)
plt.show()
for template, scorer in (
    ('Accuracy: %.3f', accuracy_score),
    ('Precision: %.3f', precision_score),
    ('Recall: %.3f', recall_score),
    ('F1 Score: %.3f', f1_score),
):
    print(template % scorer(outlier_true_one, outlier_ABOD_one))
Accuracy: 0.966
Precision: 0.982
Recall: 0.982
F1 Score: 0.982

5. IForest

# Isolation Forest (alibi-detect) with 100 trees and a score threshold of 0.
od = IForest(
    threshold=0.,
    n_estimators=100
)
# Fit and score on the (x, y) feature columns only; _df also holds label columns by now.
od.fit(_df[['x', 'y']])
preds = od.predict(
    _df[['x', 'y']],
    return_instance_score=True
)
# Keep the per-row outlier flag next to the data and plot it.
_df['IF_alibi'] = preds['data']['is_outlier']
sns.scatterplot(data = _df, x = 'x', y = 'y', hue = 'IF_alibi')
<AxesSubplot:xlabel='x', ylabel='y'>
# Convert the Isolation Forest flags to the notebook convention:
# 0 -> 1 (inlier), anything else -> -1 (outlier).
outlier_alibi_one = [1 if flag == 0 else -1 for flag in _df['IF_alibi']]
conf_matrix = confusion_matrix(outlier_true_one, outlier_alibi_one)
fig, ax = plt.subplots(figsize=(5, 5))
ax.matshow(conf_matrix, cmap=plt.cm.Oranges, alpha=0.3)
# Write each cell's count on top of the heat map.
for row_idx, row_counts in enumerate(conf_matrix):
    for col_idx, count in enumerate(row_counts):
        ax.text(x=col_idx, y=row_idx, s=count, va='center', ha='center', size='xx-large')
plt.xlabel('Predictions', fontsize=18)
plt.ylabel('Actuals', fontsize=18)
plt.title('Confusion Matrix', fontsize=18)
plt.show()
for template, scorer in (
    ('Accuracy: %.3f', accuracy_score),
    ('Precision: %.3f', precision_score),
    ('Recall: %.3f', recall_score),
    ('F1 Score: %.3f', f1_score),
):
    print(template % scorer(outlier_true_one, outlier_alibi_one))
Accuracy: 0.820
Precision: 0.995
Recall: 0.815
F1 Score: 0.896

Orbit

# Orbit scenario: 1000 points on a closed curve in the plane carrying a
# sinusoidal signal contaminated by the shared noise/outlier series.
# NOTE(review): no numpy random call is visible in this cell; the randomness
# comes from the R-generated series `x` — confirm the seed is still needed.
np.random.seed(777)
pi=np.pi
n=1000
# Angles covering one full turn without repeating the end point.
ang=np.linspace(-pi,pi-2*pi/n,n)
# Radius oscillates between 4 and 6, six times around the turn.
r=5+np.cos(np.linspace(0,12*pi,n))
vx=r*np.cos(ang)
vy=r*np.sin(ang)
# Clean signal: three sine periods of amplitude 10 along the curve.
f1=10*np.sin(np.linspace(0,6*pi,n))
# Observed signal; x is signal + epsilon from the R cell.
f = f1 + x
_df = pd.DataFrame({'x' : vx, 'y' : vy, 'f' : f})

1. Proposed Method

X = np.array(_df)
class SIMUL:
    """Graph-Fourier smoother for a signal ``f`` observed at planar points ``(x, y)``.

    Typical use: ``get_distance()`` -> ``get_weightmatrix(...)`` -> ``fit(...)``.
    ``fit`` stores the smoothed signal and the residuals as the ``fHat`` and
    ``Residual`` columns of ``self.df``.
    """

    def __init__(self,df):
        """Keep ``df`` and cache its ``f`` / ``x`` / ``y`` columns as numpy arrays."""
        self.df = df 
        self.f = df.f.to_numpy()
        self.x = df.x.to_numpy()
        self.y = df.y.to_numpy()
        self.n = len(self.f)
        self.theta= None

    def get_distance(self):
        """Fill ``self.D`` with pairwise Euclidean distances between the ``(x, y)`` points."""
        self.D = np.zeros([self.n,self.n])
        locations = np.stack([self.x, self.y],axis=1)
        # Only the upper triangle is computed; the matrix is mirrored afterwards
        # (the diagonal is zero, so adding the transpose does not double it).
        for i in tqdm.tqdm(range(self.n)):
            for j in range(i,self.n):
                self.D[i,j]=np.linalg.norm(locations[i]-locations[j])
        self.D = self.D + self.D.T

    def get_weightmatrix(self,theta=1,beta=0.5,kappa=4000):
        """Build the thresholded Gaussian weight matrix ``self.W`` from ``self.D``.

        ``W[i, j] = exp(-(D[i, j] / theta) ** 2)`` when ``D[i, j] < kappa`` and
        0 otherwise. ``beta`` is accepted for interface compatibility and is
        not used.
        """
        self.theta = theta
        kernel = np.exp(-(self.D/self.theta)**2)
        # Pairs at or beyond kappa must get weight 0. Zeroing the *distance*
        # before applying the kernel (as was done previously) gave those
        # pairs exp(0) = 1, i.e. the strongest possible connection.
        self.W = np.where(self.D < kappa, kernel, 0)

    def _eigen(self):
        """Eigendecompose the symmetric normalized Laplacian built from ``self.W``."""
        d= self.W.sum(axis=1)
        D= np.diag(d)
        # L = D^{-1/2} (D - W) D^{-1/2}
        self.L = np.diag(1/np.sqrt(d)) @ (D-self.W) @ np.diag(1/np.sqrt(d))
        self.lamb, self.Psi = np.linalg.eigh(self.L)
        self.Lamb = np.diag(self.lamb)

    def fit(self,sd=5,ref=20): # fit with ebayesthresh
        """Threshold the spectrum with R's ebayesthresh and rebuild the signal.

        ``sd`` is forwarded to ``ebayesthresh`` as its ``sd`` keyword; ``ref``
        is accepted for interface compatibility and is not used.
        """
        self._eigen()
        self.fbar = self.Psi.T @ self.f # fbar := graph fourier transform of f
        self.power = self.fbar**2 
        ebayesthresh = importr('EbayesThresh').ebayesthresh
        self.power_threshed=np.array(ebayesthresh(FloatVector(self.fbar**2),sd=sd))
        # Keep only the coefficients whose thresholded power is still positive.
        self.fbar_threshed = np.where(self.power_threshed>0,self.fbar,0)
        # Inverse transform gives the smoothed signal.
        self.fhat = self.Psi@self.fbar_threshed
        self.df = self.df.assign(fHat = self.fhat)
        self.df = self.df.assign(Residual = self.df.f- self.df.fHat)
        # Plot geometry attributes set alongside the fit results.
        self.bottom = np.zeros_like(self.f)
        self.width=0.05
        self.depth=0.05
_simul = SIMUL(_df)
_simul.get_distance()
100%|██████████| 1000/1000 [00:02<00:00, 436.00it/s]
# Gaussian weights scaled by the mean non-zero pairwise distance, then fit.
_simul.get_weightmatrix(theta=(_simul.D[_simul.D>0].mean()),kappa=2500)
_simul.fit(sd=15,ref=20)
# Flag every point whose squared residual exceeds 20.
_flagged = _simul.df.query('Residual**2>20')
outlier_simul_first_index = np.array(_flagged.reset_index()['index'])
# The observed signal in this scenario is column 'f'; 'y' is a plane
# coordinate here, so reading 'y' (as in the linear section) returned
# positions rather than the flagged signal values.
outlier_simul_first_value = np.array(_flagged.reset_index()['f'])
outlier_simul_one = (_simul.df['Residual']**2).tolist()
outlier_simul_one = list(map(lambda x: -1 if x > 20 else 1,outlier_simul_one))
# 3-D view of the orbit signal with the points flagged by the proposed
# method drawn in red over the full data set in green.
p=plt.figure(figsize=(12,4), dpi=200)  # Make figure object
ax=p.add_subplot(1,1,1, projection='3d')
ax.grid(False)
for _axis_name in ('x', 'y', 'z'):
    ax.ticklabel_format(style='sci', axis=_axis_name, scilimits=(0,0))
top = f
bottom = np.zeros_like(top)
width=depth=0.05
ax.scatter3D(vx,vy,f,zdir='z',s=10,marker='.',c='green',alpha=0.2)
_flagged_rows = _simul.df.loc[outlier_simul_first_index]
ax.scatter3D(_flagged_rows['x'], _flagged_rows['y'], _flagged_rows['f'], zdir='z',s=10,marker='.',c='red',alpha=0.5)
# ax.bar3d(vx, vy, bottom, width, depth, 0, color='Black',shade=False)
<mpl_toolkits.mplot3d.art3d.Path3DCollection at 0x7f0640a05550>
# Confusion matrix and scores for the proposed method on the orbit data.
conf_matrix = confusion_matrix(outlier_true_one, outlier_simul_one)
fig, ax = plt.subplots(figsize=(5, 5))
ax.matshow(conf_matrix, cmap=plt.cm.Oranges, alpha=0.3)
# Write each cell's count on top of the heat map.
for row_idx, row_counts in enumerate(conf_matrix):
    for col_idx, count in enumerate(row_counts):
        ax.text(x=col_idx, y=row_idx, s=count, va='center', ha='center', size='xx-large')
plt.xlabel('Predictions', fontsize=18)
plt.ylabel('Actuals', fontsize=18)
plt.title('Confusion Matrix', fontsize=18)
plt.show()
for template, scorer in (
    ('Accuracy: %.3f', accuracy_score),
    ('Precision: %.3f', precision_score),
    ('Recall: %.3f', recall_score),
    ('F1 Score: %.3f', f1_score),
):
    print(template % scorer(outlier_true_one, outlier_simul_one))
Accuracy: 0.997
Precision: 0.997
Recall: 1.000
F1 Score: 0.998

2. LOF

# LOF baseline on the orbit data, with a 3-D view of the scores: every point
# gets a red circle whose size grows with its (min-max normalised) score.
clf = LocalOutlierFactor(n_neighbors=2)
ground_truth=outlier_true_one
y_pred = clf.fit_predict(X)
n_errors = (y_pred != ground_truth).sum()
X_scores = clf.negative_outlier_factor_
# A single figure is created here; an extra plt.figure(figsize=(10,6)) call
# used to open an empty figure that was never drawn on.
p=plt.figure(figsize=(12,4), dpi=200)  # Make figure object
ax=p.add_subplot(1,1,1, projection='3d')
ax.grid(False)
ax.ticklabel_format(style='sci', axis='x',scilimits=(0,0))
ax.ticklabel_format(style='sci', axis='y',scilimits=(0,0))
ax.ticklabel_format(style='sci', axis='z',scilimits=(0,0))
# ax.title("Local Outlier Factor (LOF)")
ax.scatter3D(X[:, 0], X[:, 1],X[:, 2], color="k", s=3.0, label="Data points")
# Rescale so the most outlying point has radius 1 and the least outlying 0.
radius = (X_scores.max() - X_scores) / (X_scores.max() - X_scores.min())
ax.scatter3D(
    X[:, 0],
    X[:, 1],
    X[:, 2],
    s=1000 * radius,
    edgecolors="r",
    facecolors="none",
    label="Outlier scores",
)
ax.axis("tight")
# plt.xlim((-5, 5))
# plt.ylim((-5, 5))
# ax.xlabel("prediction errors: %d" % (n_errors))
legend = ax.legend(loc="upper left")
# Matplotlib 3.7 renamed Legend.legendHandles to legend_handles and later
# removed the old name; support both so the cell runs on either version.
_legend_handles = getattr(legend, "legend_handles", None)
if _legend_handles is None:
    _legend_handles = legend.legendHandles
_legend_handles[0]._sizes = [10]
_legend_handles[1]._sizes = [20]
# ax.show()
<Figure size 720x432 with 0 Axes>
# 3-D view of the orbit signal with the points flagged by LOF drawn in blue.
# Compute the flagged rows from the current (orbit) data first: without this
# the plot would reuse outlier_first_index left over from the previous data set.
outlier_first_index = np.where(clf.fit_predict(X)==-1)
p=plt.figure(figsize=(12,4), dpi=200)  # Make figure object
ax=p.add_subplot(1,1,1, projection='3d')
ax.grid(False)
ax.ticklabel_format(style='sci', axis='x',scilimits=(0,0))
ax.ticklabel_format(style='sci', axis='y',scilimits=(0,0))
ax.ticklabel_format(style='sci', axis='z',scilimits=(0,0))
top = f
bottom = np.zeros_like(top)
width=depth=0.05
ax.scatter3D(vx,vy,f,zdir='z',s=10,marker='.',c='green',alpha=0.2)
# np.where returns a 1-tuple; its first element holds the row positions.
_lof_rows = _df.loc[outlier_first_index[0]]
ax.scatter3D(_lof_rows['x'],_lof_rows['y'],_lof_rows['f'], zdir='z',s=10,marker='.',c='blue',alpha=0.5)
# ax.bar3d(vx, vy, bottom, width, depth, 0, color='Black',shade=False)
<mpl_toolkits.mplot3d.art3d.Path3DCollection at 0x7f0640a7b5b0>

outlier

# LOF labels on the orbit data. fit_predict refits the model on every call,
# so fit once and reuse the labels everywhere below.
lof_pred = clf.fit_predict(X)
outlier_first_index = np.where(lof_pred==-1)
outlier_first_value = lof_pred[lof_pred==-1]
conf_matrix = confusion_matrix(outlier_true_one, lof_pred)
fig, ax = plt.subplots(figsize=(5, 5))
ax.matshow(conf_matrix, cmap=plt.cm.Oranges, alpha=0.3)
for i in range(conf_matrix.shape[0]):
    for j in range(conf_matrix.shape[1]):
        ax.text(x=j, y=i,s=conf_matrix[i, j], va='center', ha='center', size='xx-large')
plt.xlabel('Predictions', fontsize=18)
plt.ylabel('Actuals', fontsize=18)
plt.title('Confusion Matrix', fontsize=18)
plt.show()
# NOTE: scikit-learn's default pos_label=1 means precision/recall/F1 below
# are computed for the inlier class (label 1), not for the outliers.
print('Accuracy: %.3f' % accuracy_score(outlier_true_one, lof_pred))
print('Precision: %.3f' % precision_score(outlier_true_one, lof_pred))
print('Recall: %.3f' % recall_score(outlier_true_one, lof_pred))
print('F1 Score: %.3f' % f1_score(outlier_true_one, lof_pred))
Accuracy: 0.886
Precision: 0.987
Recall: 0.892
F1 Score: 0.937

3. SVM

X = np.array(_df).reshape(-1,3)
clf = svm.OneClassSVM(nu=0.1, kernel="rbf", gamma=0.1)
clf.fit(X)
OneClassSVM(gamma=0.1, nu=0.1)
# One-class SVM labels on the orbit data (1 = inlier, -1 = outlier).
y_pred = clf.predict(X)
# Reuse the prediction instead of calling predict a second time.
outlier_OSVM_one = list(y_pred)
conf_matrix = confusion_matrix(outlier_true_one, outlier_OSVM_one)
fig, ax = plt.subplots(figsize=(5, 5))
ax.matshow(conf_matrix, cmap=plt.cm.Oranges, alpha=0.3)
for i in range(conf_matrix.shape[0]):
    for j in range(conf_matrix.shape[1]):
        ax.text(x=j, y=i,s=conf_matrix[i, j], va='center', ha='center', size='xx-large')
plt.xlabel('Predictions', fontsize=18)
plt.ylabel('Actuals', fontsize=18)
plt.title('Confusion Matrix', fontsize=18)
plt.show()
# NOTE: scikit-learn's default pos_label=1 means precision/recall/F1 below
# are computed for the inlier class (label 1), not for the outliers.
print('Accuracy: %.3f' % accuracy_score(outlier_true_one, outlier_OSVM_one))
print('Precision: %.3f' % precision_score(outlier_true_one, outlier_OSVM_one))
print('Recall: %.3f' % recall_score(outlier_true_one, outlier_OSVM_one))
print('F1 Score: %.3f' % f1_score(outlier_true_one, outlier_OSVM_one))
Accuracy: 0.923
Precision: 0.988
Recall: 0.931
F1 Score: 0.958

4. ABOD

abod_clf = ABOD(contamination=0.05)
abod_clf.fit(_df[['x', 'y','f']])
ABOD(contamination=0.05, method='fast', n_neighbors=5)
# Fit CBLOF on the orbit features and keep both detectors' labels next to the data.
cblof_clf = CBLOF(contamination=0.05, check_estimator=False, random_state=77)
cblof_clf.fit(_df[['x', 'y', 'f']])
_df['ABOD_Clf'] = abod_clf.labels_
_df['CBLOF_Clf'] = cblof_clf.labels_
# Convert the ABOD labels to the notebook convention: 0 -> 1 (inlier),
# anything else -> -1 (outlier).
outlier_ABOD_one = [1 if label == 0 else -1 for label in abod_clf.labels_]
conf_matrix = confusion_matrix(outlier_true_one, outlier_ABOD_one)
fig, ax = plt.subplots(figsize=(5, 5))
ax.matshow(conf_matrix, cmap=plt.cm.Oranges, alpha=0.3)
# Write each cell's count on top of the heat map.
for row_idx, row_counts in enumerate(conf_matrix):
    for col_idx, count in enumerate(row_counts):
        ax.text(x=col_idx, y=row_idx, s=count, va='center', ha='center', size='xx-large')
plt.xlabel('Predictions', fontsize=18)
plt.ylabel('Actuals', fontsize=18)
plt.title('Confusion Matrix', fontsize=18)
plt.show()
for template, scorer in (
    ('Accuracy: %.3f', accuracy_score),
    ('Precision: %.3f', precision_score),
    ('Recall: %.3f', recall_score),
    ('F1 Score: %.3f', f1_score),
):
    print(template % scorer(outlier_true_one, outlier_ABOD_one))
Accuracy: 0.988
Precision: 0.994
Recall: 0.994
F1 Score: 0.994

5. IForest

# Refit the Isolation Forest object `od` created in the linear section, now
# on the three orbit feature columns.
od.fit(_df[['x', 'y','f']])
preds = od.predict(
    _df[['x', 'y','f']],
    return_instance_score=True
)
# Keep the per-row outlier flag next to the data and plot it in the (x, y) plane.
_df['IF_alibi'] = preds['data']['is_outlier']
sns.scatterplot(data = _df, x = 'x', y = 'y', hue = 'IF_alibi')
<AxesSubplot:xlabel='x', ylabel='y'>
# Convert the Isolation Forest flags to the notebook convention:
# 0 -> 1 (inlier), anything else -> -1 (outlier).
outlier_alibi_one = [1 if flag == 0 else -1 for flag in _df['IF_alibi']]
conf_matrix = confusion_matrix(outlier_true_one, outlier_alibi_one)
fig, ax = plt.subplots(figsize=(5, 5))
ax.matshow(conf_matrix, cmap=plt.cm.Oranges, alpha=0.3)
# Write each cell's count on top of the heat map.
for row_idx, row_counts in enumerate(conf_matrix):
    for col_idx, count in enumerate(row_counts):
        ax.text(x=col_idx, y=row_idx, s=count, va='center', ha='center', size='xx-large')
plt.xlabel('Predictions', fontsize=18)
plt.ylabel('Actuals', fontsize=18)
plt.title('Confusion Matrix', fontsize=18)
plt.show()
for template, scorer in (
    ('Accuracy: %.3f', accuracy_score),
    ('Precision: %.3f', precision_score),
    ('Recall: %.3f', recall_score),
    ('F1 Score: %.3f', f1_score),
):
    print(template % scorer(outlier_true_one, outlier_alibi_one))
Accuracy: 0.477
Precision: 0.989
Recall: 0.455
F1 Score: 0.623

sklearn.neighbors.LocalOutlierFactor

2. linear(2)

# Second 1-D scenario: quadratic trend y = 5x^2 sampled at 1000 points on [0, 2].
_x = np.linspace(0,2,1000)
_y1 = 5*_x**2
_y = _y1 + x # x is the series from the R cell: signal + epsilon (injected outliers plus noise)
_df=pd.DataFrame({'x':_x, 'y':_y})
X = np.array(_df)
# LOF baseline and a plot of its scores: every point gets a red circle whose
# size grows with its (min-max normalised) outlier score.
clf = LocalOutlierFactor(n_neighbors=2)
ground_truth=outlier_true_one
y_pred = clf.fit_predict(X)
n_errors = (y_pred != ground_truth).sum()
X_scores = clf.negative_outlier_factor_
plt.figure(figsize=(10,6))
plt.title("Local Outlier Factor (LOF)")
plt.scatter(X[:, 0], X[:, 1], color="k", s=3.0, label="Data points")
radius = (X_scores.max() - X_scores) / (X_scores.max() - X_scores.min())
plt.scatter(
    X[:, 0],
    X[:, 1],
    s=1000 * radius,
    edgecolors="r",
    facecolors="none",
    label="Outlier scores",
)
plt.axis("tight")
# plt.xlim((-5, 5))
# plt.ylim((-5, 5))
plt.xlabel("prediction errors: %d" % (n_errors))
legend = plt.legend(loc="upper left")
# Matplotlib 3.7 renamed Legend.legendHandles to legend_handles and later
# removed the old name; support both so the cell runs on either version.
_legend_handles = getattr(legend, "legend_handles", None)
if _legend_handles is None:
    _legend_handles = legend.legendHandles
_legend_handles[0]._sizes = [10]
_legend_handles[1]._sizes = [20]
plt.show()
# LOF labels for the current data; fit_predict refits on every call, so fit
# once and derive both the flagged row positions and the flagged labels from it.
_lof_labels = clf.fit_predict(X)
outlier_first_index = np.where(_lof_labels==-1)
outlier_first_value = _lof_labels[_lof_labels==-1]
class SIMUL:
    """Graph-Fourier smoother for a signal ``y`` observed at locations ``x``.

    The graph weight matrix is taken from the module-level ``w`` at
    construction time. ``fit`` stores the smoothed signal and the residuals
    as the ``yHat`` and ``Residual`` columns of ``self.df``.
    """

    def __init__(self,df):
        """Keep ``df`` and cache its ``x`` / ``y`` columns as numpy arrays."""
        self.df = df
        self.x = df.x.to_numpy()
        self.y = df.y.to_numpy()
        #self.y1 = df.y1.to_numpy()
        self.n = len(self.y)
        self.W = w

    def _eigen(self):
        """Eigendecompose the symmetric normalized Laplacian built from ``self.W``."""
        node_degree = self.W.sum(axis=1)
        scaling = np.diag(1/np.sqrt(node_degree))
        unnormalized = np.diag(node_degree) - self.W
        # L = D^{-1/2} (D - W) D^{-1/2}
        self.L = scaling @ unnormalized @ scaling
        self.lamb, self.Psi = np.linalg.eigh(self.L)
        self.Lamb = np.diag(self.lamb)

    def fit(self,sd=5):
        """Threshold the spectrum with R's ebayesthresh and rebuild the signal.

        ``sd`` is forwarded to ``ebayesthresh`` as its ``sd`` keyword.
        """
        self._eigen()
        # Graph Fourier transform of y and its per-frequency power.
        self.ybar = self.Psi.T @ self.y
        self.power = self.ybar**2
        ebayesthresh = importr('EbayesThresh').ebayesthresh
        self.power_threshed = np.array(ebayesthresh(FloatVector(self.power), sd=sd))
        # Keep only the coefficients whose thresholded power is still positive.
        retained = self.power_threshed > 0
        self.ybar_threshed = np.where(retained, self.ybar, 0)
        # Inverse transform gives the smoothed signal.
        self.yhat = self.Psi @ self.ybar_threshed
        self.df = self.df.assign(yHat=self.yhat)
        self.df = self.df.assign(Residual=self.df.y - self.df.yHat)
# Adjacency matrix of a 1000-node path graph: consecutive observations are
# linked with weight 1 and everything else (including the diagonal) is 0.
# Two shifted identity matrices give the same matrix as the element-wise
# double loop without a million Python-level iterations.
w = np.eye(1000, k=1) + np.eye(1000, k=-1)
# Proposed method on the current data: smooth with the default sd, then flag
# every point whose squared residual exceeds 4.5.
_simul = SIMUL(_df)
_simul.fit()
_flagged = _simul.df.query('Residual**2>4.5')
outlier_simul_first_index = np.array(_flagged.index)
outlier_simul_first_value = np.array(_flagged['y'])
outlier_simul_one = [
    -1 if squared > 4.5 else 1
    for squared in (_simul.df['Residual']**2).tolist()
]
# Points flagged by LOF (red) over the clean trend plus injected outliers (green).
plt.figure(figsize=(10,6))
_lof_rows = _df.loc[outlier_first_index].reset_index()
plt.plot(_lof_rows['index'], _lof_rows['y'], 'r.')
plt.plot(_y1+signal,'go',alpha=0.3)
[<matplotlib.lines.Line2D at 0x7f0691f4be50>]
# Points flagged by the proposed method (blue) over the clean trend plus
# injected outliers (green).
plt.figure(figsize=(10,6))
plt.plot(outlier_simul_first_index,outlier_simul_first_value,'b.')
plt.plot(_y1+signal,'go',alpha=0.3)
[<matplotlib.lines.Line2D at 0x7f069288d7c0>]
# LOF metrics on the current data. fit_predict refits the model on every
# call, so fit once and reuse the labels for the matrix and all four scores.
lof_pred = clf.fit_predict(X)
conf_matrix = confusion_matrix(outlier_true_one, lof_pred)
fig, ax = plt.subplots(figsize=(5, 5))
ax.matshow(conf_matrix, cmap=plt.cm.Oranges, alpha=0.3)
for i in range(conf_matrix.shape[0]):
    for j in range(conf_matrix.shape[1]):
        ax.text(x=j, y=i,s=conf_matrix[i, j], va='center', ha='center', size='xx-large')
plt.xlabel('Predictions', fontsize=18)
plt.ylabel('Actuals', fontsize=18)
plt.title('Confusion Matrix', fontsize=18)
plt.show()
# NOTE: scikit-learn's default pos_label=1 means precision/recall/F1 below
# are computed for the inlier class (label 1), not for the outliers.
print('Accuracy: %.3f' % accuracy_score(outlier_true_one, lof_pred))
print('Precision: %.3f' % precision_score(outlier_true_one, lof_pred))
print('Recall: %.3f' % recall_score(outlier_true_one, lof_pred))
print('F1 Score: %.3f' % f1_score(outlier_true_one, lof_pred))
Accuracy: 0.888
Precision: 0.978
Recall: 0.902
F1 Score: 0.939
# Confusion matrix and scores for the proposed method on the current data.
conf_matrix = confusion_matrix(outlier_true_one, outlier_simul_one)
fig, ax = plt.subplots(figsize=(5, 5))
ax.matshow(conf_matrix, cmap=plt.cm.Oranges, alpha=0.3)
# Write each cell's count on top of the heat map.
for row_idx, row_counts in enumerate(conf_matrix):
    for col_idx, count in enumerate(row_counts):
        ax.text(x=col_idx, y=row_idx, s=count, va='center', ha='center', size='xx-large')
plt.xlabel('Predictions', fontsize=18)
plt.ylabel('Actuals', fontsize=18)
plt.title('Confusion Matrix', fontsize=18)
plt.show()
for template, scorer in (
    ('Accuracy: %.3f', accuracy_score),
    ('Precision: %.3f', precision_score),
    ('Recall: %.3f', recall_score),
    ('F1 Score: %.3f', f1_score),
):
    print(template % scorer(outlier_true_one, outlier_simul_one))
Accuracy: 0.942
Precision: 0.999
Recall: 0.940
F1 Score: 0.969

3. sin

# Third 1-D scenario: a sum of three sinusoids sampled at 1000 points on [0, 2].
_x = np.linspace(0,2,1000)
_y1 =  3*np.sin(_x) + 1*np.sin(_x**2) + 5*np.sin(5*_x)
_y = _y1 + x # x is the series from the R cell: signal + epsilon (injected outliers plus noise)
_df=pd.DataFrame({'x':_x, 'y':_y})
X = np.array(_df)
# LOF baseline and a plot of its scores: every point gets a red circle whose
# size grows with its (min-max normalised) outlier score.
clf = LocalOutlierFactor(n_neighbors=2)
ground_truth=outlier_true_one
y_pred = clf.fit_predict(X)
n_errors = (y_pred != ground_truth).sum()
X_scores = clf.negative_outlier_factor_
plt.figure(figsize=(10,6))
plt.title("Local Outlier Factor (LOF)")
plt.scatter(X[:, 0], X[:, 1], color="k", s=3.0, label="Data points")
radius = (X_scores.max() - X_scores) / (X_scores.max() - X_scores.min())
plt.scatter(
    X[:, 0],
    X[:, 1],
    s=1000 * radius,
    edgecolors="r",
    facecolors="none",
    label="Outlier scores",
)
plt.axis("tight")
# plt.xlim((-5, 5))
# plt.ylim((-5, 5))
plt.xlabel("prediction errors: %d" % (n_errors))
legend = plt.legend(loc="upper left")
# Matplotlib 3.7 renamed Legend.legendHandles to legend_handles and later
# removed the old name; support both so the cell runs on either version.
_legend_handles = getattr(legend, "legend_handles", None)
if _legend_handles is None:
    _legend_handles = legend.legendHandles
_legend_handles[0]._sizes = [10]
_legend_handles[1]._sizes = [20]
plt.show()

outlier

# LOF labels for the current data; fit_predict refits on every call, so fit
# once and derive both the flagged row positions and the flagged labels from it.
_lof_labels = clf.fit_predict(X)
outlier_first_index = np.where(_lof_labels==-1)
outlier_first_value = _lof_labels[_lof_labels==-1]
class SIMUL:
    """Smooth a signal ``y`` on a weighted graph via its graph Fourier transform.

    The signal is projected onto the eigenvectors of the symmetric normalized
    Laplacian of ``W``; coefficients whose power is set to zero by the R
    function ``EbayesThresh::ebayesthresh`` are dropped and the remainder is
    transformed back.  ``fit`` stores the smoothed signal (``yHat``) and the
    residual (``Residual``) as new columns of ``self.df``.
    """

    def __init__(self, df, W=None):
        """Store the data and the graph weights.

        Parameters
        ----------
        df : pandas.DataFrame
            Must provide the columns ``x`` and ``y``.
        W : numpy.ndarray, optional
            Square weight matrix of the graph.  When omitted, the
            module-level ``w`` is used.
        """
        self.df = df
        self.y = df.y.to_numpy()
        self.x = df.x.to_numpy()
        self.n = len(self.y)
        self.W = w if W is None else W

    def _eigen(self):
        """Eigendecompose the symmetric normalized Laplacian of ``self.W``.

        Sets ``self.L`` (the Laplacian), ``self.lamb`` (eigenvalues),
        ``self.Psi`` (eigenvectors as columns) and ``self.Lamb`` (diagonal
        matrix of the eigenvalues).

        Raises
        ------
        ValueError
            If a row of ``self.W`` sums to zero or less, because the
            ``1/sqrt(degree)`` scaling is undefined for such a node.
        """
        d = self.W.sum(axis=1)
        if np.any(d <= 0):
            raise ValueError("every node needs a positive degree to normalize the Laplacian")
        inv_sqrt_d = 1 / np.sqrt(d)
        # Scaling row i and column j by 1/sqrt(d) equals D^{-1/2} (D - W) D^{-1/2}
        # but avoids two dense diagonal matrices and two n x n matrix products.
        self.L = inv_sqrt_d[:, None] * (np.diag(d) - self.W) * inv_sqrt_d[None, :]
        self.lamb, self.Psi = np.linalg.eigh(self.L)
        self.Lamb = np.diag(self.lamb)

    def fit(self, sd=5):
        """Threshold the graph Fourier coefficients of ``y`` and reconstruct.

        Parameters
        ----------
        sd : float, optional
            Passed on as the ``sd`` argument of ``ebayesthresh``.
        """
        self._eigen()
        self.ybar = self.Psi.T @ self.y  # graph Fourier transform of y
        self.power = self.ybar**2
        ebayesthresh = importr('EbayesThresh').ebayesthresh
        self.power_threshed = np.array(ebayesthresh(FloatVector(self.power), sd=sd))
        # Keep a coefficient only where its thresholded power is still positive.
        self.ybar_threshed = np.where(self.power_threshed > 0, self.ybar, 0)
        self.yhat = self.Psi @ self.ybar_threshed
        self.df = self.df.assign(yHat=self.yhat)
        self.df = self.df.assign(Residual=self.df.y - self.df.yHat)
# Weight matrix of a 1000-node path graph: node i is linked to i-1 and i+1
# with weight 1, and the diagonal stays 0.  The two off-diagonals are filled
# with vectorized assignments rather than a Python loop over every cell.
w = np.zeros((1000, 1000))
_i = np.arange(999)
w[_i, _i + 1] = 1
w[_i + 1, _i] = 1
# Fit the smoother, then flag every point whose squared residual exceeds 4.
_simul = SIMUL(_df)
_simul.fit()
_flagged = _simul.df.query('Residual**2>4').reset_index()
outlier_simul_first_index = np.array(_flagged['index'])
outlier_simul_first_value = np.array(_flagged['y'])
# -1 marks a flagged point, 1 everything else.
outlier_simul_one = [-1 if r2 > 4 else 1 for r2 in (_simul.df['Residual']**2).tolist()]
plt.figure(figsize=(10,6))
plt.plot(_df.loc[outlier_first_index].reset_index()['index'],_df.loc[outlier_first_index].reset_index()['y'],'r.')
plt.plot(_y1+signal,'go',alpha=0.3)
[<matplotlib.lines.Line2D at 0x7f0692810760>]
plt.figure(figsize=(10,6))
plt.plot(outlier_simul_first_index,outlier_simul_first_value,'b.')
plt.plot(_y1+signal,'go',alpha=0.3)
[<matplotlib.lines.Line2D at 0x7f069269a0d0>]
# Confusion matrix and summary scores for LOF.  fit_predict refits the
# detector on every call, so the labels are computed once and shared by the
# plot and all four scores.
lof_pred = clf.fit_predict(X)
conf_matrix = confusion_matrix(outlier_true_one, lof_pred)
fig, ax = plt.subplots(figsize=(5, 5))
ax.matshow(conf_matrix, cmap=plt.cm.Oranges, alpha=0.3)
# Print every count in the centre of its cell (row = actual, column = predicted).
for i in range(conf_matrix.shape[0]):
    for j in range(conf_matrix.shape[1]):
        ax.text(x=j, y=i, s=conf_matrix[i, j], va='center', ha='center', size='xx-large')
plt.xlabel('Predictions', fontsize=18)
plt.ylabel('Actuals', fontsize=18)
plt.title('Confusion Matrix', fontsize=18)
plt.show()
print('Accuracy: %.3f' % accuracy_score(outlier_true_one, lof_pred))
print('Precision: %.3f' % precision_score(outlier_true_one, lof_pred))
print('Recall: %.3f' % recall_score(outlier_true_one, lof_pred))
print('F1 Score: %.3f' % f1_score(outlier_true_one, lof_pred))
Accuracy: 0.882
Precision: 0.975
Recall: 0.899
F1 Score: 0.935
# Confusion matrix and summary scores of the proposed method's labels
# against the true outlier labels.
conf_matrix = confusion_matrix(outlier_true_one, outlier_simul_one)
fig, ax = plt.subplots(figsize=(5, 5))
ax.matshow(conf_matrix, cmap=plt.cm.Oranges, alpha=0.3)
# Print every count in the centre of its cell (row = actual, column = predicted).
for (i, j), count in np.ndenumerate(conf_matrix):
    ax.text(x=j, y=i, s=count, va='center', ha='center', size='xx-large')
ax.set_xlabel('Predictions', fontsize=18)
ax.set_ylabel('Actuals', fontsize=18)
ax.set_title('Confusion Matrix', fontsize=18)
plt.show()
for fmt, score in (
    ('Accuracy: %.3f', accuracy_score),
    ('Precision: %.3f', precision_score),
    ('Recall: %.3f', recall_score),
    ('F1 Score: %.3f', f1_score),
):
    print(fmt % score(outlier_true_one, outlier_simul_one))
Accuracy: 0.934
Precision: 1.000
Recall: 0.931
F1 Score: 0.964

4. cosine

_x = np.linspace(0,2,1000)
_y1 = -2+ 3*np.cos(_x) + 1*np.cos(2*_x) + 5*np.cos(5*_x)
_y = _y1 + x
_df=pd.DataFrame({'x':_x, 'y':_y})
X = np.array(_df)
clf = LocalOutlierFactor(n_neighbors=2)
ground_truth=outlier_true_one
y_pred = clf.fit_predict(X)
n_errors = (y_pred != ground_truth).sum()
X_scores = clf.negative_outlier_factor_
# Scatter the data and overlay one circle per point whose size grows with
# its LOF outlier score.
plt.figure(figsize=(10,6))
plt.title("Local Outlier Factor (LOF)")
plt.scatter(X[:, 0], X[:, 1], color="k", s=3.0, label="Data points")
# Rescale negative_outlier_factor_ to [0, 1]; the lowest score maps to 1 and
# therefore to the largest circle.
radius = (X_scores.max() - X_scores) / (X_scores.max() - X_scores.min())
plt.scatter(
    X[:, 0],
    X[:, 1],
    s=1000 * radius,
    edgecolors="r",
    facecolors="none",
    label="Outlier scores",
)
plt.axis("tight")
plt.xlabel("prediction errors: %d" % (n_errors))
legend = plt.legend(loc="upper left")
# Matplotlib 3.7 renamed Legend.legendHandles to legend_handles and newer
# releases no longer provide the old name, so use whichever one exists.
_handles = getattr(legend, "legend_handles", None)
if _handles is None:
    _handles = legend.legendHandles
_handles[0]._sizes = [10]
_handles[1]._sizes = [20]
plt.show()

outlier

# fit_predict refits the detector on every call, so compute the labels once
# and reuse them (-1 marks a point LOF flags as an outlier).
_lof_labels = clf.fit_predict(X)
outlier_first_index = np.where(_lof_labels == -1)
outlier_first_value = _lof_labels[_lof_labels == -1]
class SIMUL:
    """Smooth a signal ``y`` on a weighted graph via its graph Fourier transform.

    The signal is projected onto the eigenvectors of the symmetric normalized
    Laplacian of ``W``; coefficients whose power is set to zero by the R
    function ``EbayesThresh::ebayesthresh`` are dropped and the remainder is
    transformed back.  ``fit`` stores the smoothed signal (``yHat``) and the
    residual (``Residual``) as new columns of ``self.df``.
    """

    def __init__(self, df, W=None):
        """Store the data and the graph weights.

        Parameters
        ----------
        df : pandas.DataFrame
            Must provide the columns ``x`` and ``y``.
        W : numpy.ndarray, optional
            Square weight matrix of the graph.  When omitted, the
            module-level ``w`` is used.
        """
        self.df = df
        self.y = df.y.to_numpy()
        self.x = df.x.to_numpy()
        self.n = len(self.y)
        self.W = w if W is None else W

    def _eigen(self):
        """Eigendecompose the symmetric normalized Laplacian of ``self.W``.

        Sets ``self.L`` (the Laplacian), ``self.lamb`` (eigenvalues),
        ``self.Psi`` (eigenvectors as columns) and ``self.Lamb`` (diagonal
        matrix of the eigenvalues).

        Raises
        ------
        ValueError
            If a row of ``self.W`` sums to zero or less, because the
            ``1/sqrt(degree)`` scaling is undefined for such a node.
        """
        d = self.W.sum(axis=1)
        if np.any(d <= 0):
            raise ValueError("every node needs a positive degree to normalize the Laplacian")
        inv_sqrt_d = 1 / np.sqrt(d)
        # Scaling row i and column j by 1/sqrt(d) equals D^{-1/2} (D - W) D^{-1/2}
        # but avoids two dense diagonal matrices and two n x n matrix products.
        self.L = inv_sqrt_d[:, None] * (np.diag(d) - self.W) * inv_sqrt_d[None, :]
        self.lamb, self.Psi = np.linalg.eigh(self.L)
        self.Lamb = np.diag(self.lamb)

    def fit(self, sd=5):
        """Threshold the graph Fourier coefficients of ``y`` and reconstruct.

        Parameters
        ----------
        sd : float, optional
            Passed on as the ``sd`` argument of ``ebayesthresh``.
        """
        self._eigen()
        self.ybar = self.Psi.T @ self.y  # graph Fourier transform of y
        self.power = self.ybar**2
        ebayesthresh = importr('EbayesThresh').ebayesthresh
        self.power_threshed = np.array(ebayesthresh(FloatVector(self.power), sd=sd))
        # Keep a coefficient only where its thresholded power is still positive.
        self.ybar_threshed = np.where(self.power_threshed > 0, self.ybar, 0)
        self.yhat = self.Psi @ self.ybar_threshed
        self.df = self.df.assign(yHat=self.yhat)
        self.df = self.df.assign(Residual=self.df.y - self.df.yHat)
# Weight matrix of a 1000-node path graph: node i is linked to i-1 and i+1
# with weight 1, and the diagonal stays 0.  The two off-diagonals are filled
# with vectorized assignments rather than a Python loop over every cell.
w = np.zeros((1000, 1000))
_i = np.arange(999)
w[_i, _i + 1] = 1
w[_i + 1, _i] = 1
# Fit the smoother, then flag every point whose squared residual exceeds 4.
_simul = SIMUL(_df)
_simul.fit()
_flagged = _simul.df.query('Residual**2>4').reset_index()
outlier_simul_first_index = np.array(_flagged['index'])
outlier_simul_first_value = np.array(_flagged['y'])
# -1 marks a flagged point, 1 everything else.
outlier_simul_one = [-1 if r2 > 4 else 1 for r2 in (_simul.df['Residual']**2).tolist()]
plt.figure(figsize=(10,6))
plt.plot(_df.loc[outlier_first_index].reset_index()['index'],_df.loc[outlier_first_index].reset_index()['y'],'r.')
plt.plot(_y1+signal,'go',alpha=0.3)
[<matplotlib.lines.Line2D at 0x7f069256d160>]
plt.figure(figsize=(10,6))
plt.plot(outlier_simul_first_index,outlier_simul_first_value,'b.')
plt.plot(_y1+signal,'go',alpha=0.3)
[<matplotlib.lines.Line2D at 0x7f069248c6a0>]
# Confusion matrix and summary scores for LOF.  fit_predict refits the
# detector on every call, so the labels are computed once and shared by the
# plot and all four scores.
lof_pred = clf.fit_predict(X)
conf_matrix = confusion_matrix(outlier_true_one, lof_pred)
fig, ax = plt.subplots(figsize=(5, 5))
ax.matshow(conf_matrix, cmap=plt.cm.Oranges, alpha=0.3)
# Print every count in the centre of its cell (row = actual, column = predicted).
for i in range(conf_matrix.shape[0]):
    for j in range(conf_matrix.shape[1]):
        ax.text(x=j, y=i, s=conf_matrix[i, j], va='center', ha='center', size='xx-large')
plt.xlabel('Predictions', fontsize=18)
plt.ylabel('Actuals', fontsize=18)
plt.title('Confusion Matrix', fontsize=18)
plt.show()
print('Accuracy: %.3f' % accuracy_score(outlier_true_one, lof_pred))
print('Precision: %.3f' % precision_score(outlier_true_one, lof_pred))
print('Recall: %.3f' % recall_score(outlier_true_one, lof_pred))
print('F1 Score: %.3f' % f1_score(outlier_true_one, lof_pred))
Accuracy: 0.854
Precision: 0.981
Recall: 0.863
F1 Score: 0.918
# Confusion matrix and summary scores of the proposed method's labels
# against the true outlier labels.
conf_matrix = confusion_matrix(outlier_true_one, outlier_simul_one)
fig, ax = plt.subplots(figsize=(5, 5))
ax.matshow(conf_matrix, cmap=plt.cm.Oranges, alpha=0.3)
# Print every count in the centre of its cell (row = actual, column = predicted).
for (i, j), count in np.ndenumerate(conf_matrix):
    ax.text(x=j, y=i, s=count, va='center', ha='center', size='xx-large')
ax.set_xlabel('Predictions', fontsize=18)
ax.set_ylabel('Actuals', fontsize=18)
ax.set_title('Confusion Matrix', fontsize=18)
plt.show()
for fmt, score in (
    ('Accuracy: %.3f', accuracy_score),
    ('Precision: %.3f', precision_score),
    ('Recall: %.3f', recall_score),
    ('F1 Score: %.3f', f1_score),
):
    print(fmt % score(outlier_true_one, outlier_simul_one))
Accuracy: 0.942
Precision: 1.000
Recall: 0.939
F1 Score: 0.969

6. Bunny (two-dimensional manifold)

G = graphs.Bunny()
n = G.N
g = filters.Heat(G, tau=75) 
normal = np.random.randn(n)
unif = np.concatenate([np.random.uniform(low=3,high=7,size=60), np.random.uniform(low=-7,high=-3,size=60),np.zeros(n-120)]); np.random.shuffle(unif)
noise = normal + unif
index_of_trueoutlier2 = np.where(unif!=0)
f = np.zeros(n)
f[1000] = -3234
f = g.filter(f, method='chebyshev') 
2022-11-10 21:13:20,395:[WARNING](pygsp.graphs.graph.lmax): The largest eigenvalue G.lmax is not available, we need to estimate it. Explicitly call G.estimate_lmax() or G.compute_fourier_basis() once beforehand to suppress the warning.
_W = G.W.toarray()
_x = G.coords[:,0]
_y = G.coords[:,1]
_z = -G.coords[:,2]
_df = pd.DataFrame({'x' : _x, 'y' : _y, 'z' : _z, 'fnoise':f+noise,'f' : f, 'noise': noise})
outlier_true_index_2 = np.where(unif!=0)
outlier_true_value_2 = unif[unif!=0]
outlier_true_one_2 = unif.copy()
outlier_true_one_2 = list(map(lambda x: -1 if x !=0  else 1,outlier_true_one_2))
X = np.array(_df)
clf = LocalOutlierFactor(n_neighbors=2)
clf.fit_predict(X[:,:4])
array([1, 1, 1, ..., 1, 1, 1])

outlier

# LOF is run on the first four columns only (x, y, z, fnoise); the remaining
# columns hold the clean signal and the injected noise.  Compute the labels
# once so the mask and the selected values come from the same fit
# (-1 marks a point LOF flags as an outlier).
_lof_labels = clf.fit_predict(X[:, :4])
outlier_first_index = np.where(_lof_labels == -1)
outlier_first_value = _lof_labels[_lof_labels == -1]
class SIMUL:
    """Smooth a noisy graph signal via its graph Fourier transform.

    The observed signal ``f + noise`` is projected onto the eigenvectors of
    the symmetric normalized Laplacian of ``W``; coefficients whose power is
    set to zero by the R function ``EbayesThresh::ebayesthresh`` are dropped
    and the remainder is transformed back.  ``fit`` stores the observed
    signal (``fnoise``), the smoothed signal (``fHat``) and the residual
    (``Residual``) as columns of ``self.df``.
    """

    def __init__(self, df, W=None):
        """Store the data and the graph weights.

        Parameters
        ----------
        df : pandas.DataFrame
            Must provide the columns ``x``, ``y``, ``z``, ``f`` and ``noise``.
        W : numpy.ndarray, optional
            Square weight matrix of the graph.  When omitted, the
            module-level ``_W`` is used.
        """
        self.df = df
        self.f = df.f.to_numpy()
        self.z = df.z.to_numpy()
        self.x = df.x.to_numpy()
        self.y = df.y.to_numpy()
        self.noise = df.noise.to_numpy()
        self.fnoise = self.f + self.noise
        self.W = _W if W is None else W
        self.n = len(self.f)
        self.theta = None

    def _eigen(self):
        """Eigendecompose the symmetric normalized Laplacian of ``self.W``.

        Sets ``self.L`` (the Laplacian), ``self.lamb`` (eigenvalues),
        ``self.Psi`` (eigenvectors as columns) and ``self.Lamb`` (diagonal
        matrix of the eigenvalues).

        Raises
        ------
        ValueError
            If a row of ``self.W`` sums to zero or less, because the
            ``1/sqrt(degree)`` scaling is undefined for such a node.
        """
        d = self.W.sum(axis=1)
        if np.any(d <= 0):
            raise ValueError("every node needs a positive degree to normalize the Laplacian")
        inv_sqrt_d = 1 / np.sqrt(d)
        # Scaling row i and column j by 1/sqrt(d) equals D^{-1/2} (D - W) D^{-1/2}
        # but avoids two dense diagonal matrices and two n x n matrix products.
        self.L = inv_sqrt_d[:, None] * (np.diag(d) - self.W) * inv_sqrt_d[None, :]
        self.lamb, self.Psi = np.linalg.eigh(self.L)
        self.Lamb = np.diag(self.lamb)

    def fit(self, sd=5, ref=6):
        """Threshold the graph Fourier coefficients of ``fnoise`` and reconstruct.

        Parameters
        ----------
        sd : float, optional
            Passed on as the ``sd`` argument of ``ebayesthresh``.
        ref : optional
            Accepted so existing calls keep working; this method does not
            read it.
        """
        self._eigen()
        self.fbar = self.Psi.T @ self.fnoise  # graph Fourier transform of f + noise
        self.power = self.fbar**2
        ebayesthresh = importr('EbayesThresh').ebayesthresh
        self.power_threshed = np.array(ebayesthresh(FloatVector(self.power), sd=sd))
        # Keep a coefficient only where its thresholded power is still positive.
        self.fbar_threshed = np.where(self.power_threshed > 0, self.fbar, 0)
        self.fhat = self.Psi @ self.fbar_threshed
        self.df = self.df.assign(fnoise=self.fnoise)
        self.df = self.df.assign(fHat=self.fhat)
        self.df = self.df.assign(Residual=self.df.f + self.df.noise - self.df.fHat)
        self.bottom = np.zeros_like(self.f)
        self.width = 0.05
        self.depth = 0.05
# Fit the smoother, then flag every node whose squared residual exceeds 10.
_simul = SIMUL(_df)
_simul.fit(sd=20,ref=10)
_flagged = _simul.df.query('Residual**2>10').reset_index()
outlier_simul_first_index = np.array(_flagged['index'])
outlier_simul_first_value = np.array(_flagged['y'])
# -1 marks a flagged node, 1 everything else.
outlier_simul_one = [-1 if r2 > 10 else 1 for r2 in (_simul.df['Residual']**2).tolist()]
# Confusion matrix and summary scores for LOF on the first four columns.
# fit_predict refits the detector on every call, so the labels are computed
# once and shared by the plot and all four scores.
lof_pred = clf.fit_predict(X[:,:4])
conf_matrix = confusion_matrix(outlier_true_one_2, lof_pred)
fig, ax = plt.subplots(figsize=(5, 5))
ax.matshow(conf_matrix, cmap=plt.cm.Oranges, alpha=0.3)
# Print every count in the centre of its cell (row = actual, column = predicted).
for i in range(conf_matrix.shape[0]):
    for j in range(conf_matrix.shape[1]):
        ax.text(x=j, y=i, s=conf_matrix[i, j], va='center', ha='center', size='xx-large')
plt.xlabel('Predictions', fontsize=18)
plt.ylabel('Actuals', fontsize=18)
plt.title('Confusion Matrix', fontsize=18)
plt.show()
print('Accuracy: %.3f' % accuracy_score(outlier_true_one_2, lof_pred))
print('Precision: %.3f' % precision_score(outlier_true_one_2, lof_pred))
print('Recall: %.3f' % recall_score(outlier_true_one_2, lof_pred))
print('F1 Score: %.3f' % f1_score(outlier_true_one_2, lof_pred))
Accuracy: 0.923
Precision: 0.958
Recall: 0.962
F1 Score: 0.960
# Confusion matrix and summary scores of the proposed method's labels
# against the true outlier labels of the Bunny graph.
conf_matrix = confusion_matrix(outlier_true_one_2, outlier_simul_one)
fig, ax = plt.subplots(figsize=(5, 5))
ax.matshow(conf_matrix, cmap=plt.cm.Oranges, alpha=0.3)
# Print every count in the centre of its cell (row = actual, column = predicted).
for (i, j), count in np.ndenumerate(conf_matrix):
    ax.text(x=j, y=i, s=count, va='center', ha='center', size='xx-large')
ax.set_xlabel('Predictions', fontsize=18)
ax.set_ylabel('Actuals', fontsize=18)
ax.set_title('Confusion Matrix', fontsize=18)
plt.show()
for fmt, score in (
    ('Accuracy: %.3f', accuracy_score),
    ('Precision: %.3f', precision_score),
    ('Recall: %.3f', recall_score),
    ('F1 Score: %.3f', f1_score),
):
    print(fmt % score(outlier_true_one_2, outlier_simul_one))
Accuracy: 0.991
Precision: 0.993
Recall: 0.997
F1 Score: 0.995

ABOD Angle Based Outlier Detection

contamination =0.05

  • 5%의 이상치 감지

2.

_x = np.linspace(0,2,1000)
_y1 = 5*_x**2
_y = _y1 + x # x is epsilon
_df=pd.DataFrame({'x':_x, 'y':_y})
abod_clf = ABOD(contamination=0.05)
abod_clf.fit(_df[['x', 'y']])
ABOD(contamination=0.05, method='fast', n_neighbors=5)
cblof_clf = CBLOF(contamination=0.05,check_estimator=False, random_state=77)
cblof_clf.fit(_df[['x', 'y']])
_df['ABOD_Clf'] = abod_clf.labels_
_df['CBLOF_Clf'] = cblof_clf.labels_
sns.scatterplot(data = _df, x = 'x', y = 'y', hue = 'ABOD_Clf')
<AxesSubplot:xlabel='x', ylabel='y'>
# Recode the ABOD labels (0 -> 1, anything else -> -1) so they use the same
# coding as the true labels, then report the confusion matrix and scores.
outlier_ABOD_one = [1 if label == 0 else -1 for label in abod_clf.labels_]
conf_matrix = confusion_matrix(outlier_true_one, outlier_ABOD_one)
fig, ax = plt.subplots(figsize=(5, 5))
ax.matshow(conf_matrix, cmap=plt.cm.Oranges, alpha=0.3)
# Print every count in the centre of its cell (row = actual, column = predicted).
for (i, j), count in np.ndenumerate(conf_matrix):
    ax.text(x=j, y=i, s=count, va='center', ha='center', size='xx-large')
ax.set_xlabel('Predictions', fontsize=18)
ax.set_ylabel('Actuals', fontsize=18)
ax.set_title('Confusion Matrix', fontsize=18)
plt.show()
for fmt, score in (
    ('Accuracy: %.3f', accuracy_score),
    ('Precision: %.3f', precision_score),
    ('Recall: %.3f', recall_score),
    ('F1 Score: %.3f', f1_score),
):
    print(fmt % score(outlier_true_one, outlier_ABOD_one))
Accuracy: 0.986
Precision: 0.993
Recall: 0.993
F1 Score: 0.993

3.

_x = np.linspace(0,2,1000)
_y1 =  3*np.sin(_x) + 1*np.sin(_x**2) + 5*np.sin(5*_x) 
_y = _y1 + x # x is epsilon
_df=pd.DataFrame({'x':_x, 'y':_y})
abod_clf = ABOD(contamination=0.05)
abod_clf.fit(_df[['x', 'y']])
ABOD(contamination=0.05, method='fast', n_neighbors=5)
cblof_clf = CBLOF(contamination=0.05,check_estimator=False, random_state=77)
cblof_clf.fit(_df[['x', 'y']])
_df['ABOD_Clf'] = abod_clf.labels_
_df['CBLOF_Clf'] = cblof_clf.labels_
sns.scatterplot(data = _df, x = 'x', y = 'y', hue = 'ABOD_Clf')
<AxesSubplot:xlabel='x', ylabel='y'>
# Recode the ABOD labels (0 -> 1, anything else -> -1) so they use the same
# coding as the true labels, then report the confusion matrix and scores.
outlier_ABOD_one = [1 if label == 0 else -1 for label in abod_clf.labels_]
conf_matrix = confusion_matrix(outlier_true_one, outlier_ABOD_one)
fig, ax = plt.subplots(figsize=(5, 5))
ax.matshow(conf_matrix, cmap=plt.cm.Oranges, alpha=0.3)
# Print every count in the centre of its cell (row = actual, column = predicted).
for (i, j), count in np.ndenumerate(conf_matrix):
    ax.text(x=j, y=i, s=count, va='center', ha='center', size='xx-large')
ax.set_xlabel('Predictions', fontsize=18)
ax.set_ylabel('Actuals', fontsize=18)
ax.set_title('Confusion Matrix', fontsize=18)
plt.show()
for fmt, score in (
    ('Accuracy: %.3f', accuracy_score),
    ('Precision: %.3f', precision_score),
    ('Recall: %.3f', recall_score),
    ('F1 Score: %.3f', f1_score),
):
    print(fmt % score(outlier_true_one, outlier_ABOD_one))
Accuracy: 0.984
Precision: 0.992
Recall: 0.992
F1 Score: 0.992

4.

_x = np.linspace(0,2,1000)
_y1 = -2+ 3*np.cos(_x) + 1*np.cos(2*_x) + 5*np.cos(5*_x)
_y = _y1 + x
_df=pd.DataFrame({'x':_x, 'y':_y})
abod_clf = ABOD(contamination=0.05)
abod_clf.fit(_df[['x', 'y']])
ABOD(contamination=0.05, method='fast', n_neighbors=5)
cblof_clf = CBLOF(contamination=0.05,check_estimator=False, random_state=77)
cblof_clf.fit(_df[['x', 'y']])
_df['ABOD_Clf'] = abod_clf.labels_
_df['CBLOF_Clf'] = cblof_clf.labels_
sns.scatterplot(data = _df, x = 'x', y = 'y', hue = 'ABOD_Clf')
<AxesSubplot:xlabel='x', ylabel='y'>
# Recode the ABOD labels (0 -> 1, anything else -> -1) so they use the same
# coding as the true labels, then report the confusion matrix and scores.
outlier_ABOD_one = [1 if label == 0 else -1 for label in abod_clf.labels_]
conf_matrix = confusion_matrix(outlier_true_one, outlier_ABOD_one)
fig, ax = plt.subplots(figsize=(5, 5))
ax.matshow(conf_matrix, cmap=plt.cm.Oranges, alpha=0.3)
# Print every count in the centre of its cell (row = actual, column = predicted).
for (i, j), count in np.ndenumerate(conf_matrix):
    ax.text(x=j, y=i, s=count, va='center', ha='center', size='xx-large')
ax.set_xlabel('Predictions', fontsize=18)
ax.set_ylabel('Actuals', fontsize=18)
ax.set_title('Confusion Matrix', fontsize=18)
plt.show()
for fmt, score in (
    ('Accuracy: %.3f', accuracy_score),
    ('Precision: %.3f', precision_score),
    ('Recall: %.3f', recall_score),
    ('F1 Score: %.3f', f1_score),
):
    print(fmt % score(outlier_true_one, outlier_ABOD_one))
Accuracy: 0.984
Precision: 0.992
Recall: 0.992
F1 Score: 0.992

6.

# Simulated signal on the Bunny graph: Gaussian noise plus 120 uniform
# outliers (60 in [3, 7], 60 in [-7, -3]) placed at random nodes.
G = graphs.Bunny()
n = G.N
g = filters.Heat(G, tau=75)
normal = np.random.randn(n)
unif = np.concatenate([
    np.random.uniform(low=3,high=7,size=60),
    np.random.uniform(low=-7,high=-3,size=60),
    np.zeros(n-120),
])
np.random.shuffle(unif)
noise = normal + unif
index_of_trueoutlier2 = np.where(unif!=0)
# The outlier positions were just redrawn, so the ground-truth labels used by
# the evaluation below must be rebuilt from this `unif` (-1 = outlier, 1 = not).
outlier_true_one_2 = np.where(unif != 0, -1, 1).tolist()
# Underlying signal: an impulse at node 1000 passed through the heat filter.
f = np.zeros(n)
f[1000] = -3234
f = g.filter(f, method='chebyshev')
2022-11-10 06:59:54,683:[WARNING](pygsp.graphs.graph.lmax): The largest eigenvalue G.lmax is not available, we need to estimate it. Explicitly call G.estimate_lmax() or G.compute_fourier_basis() once beforehand to suppress the warning.
_W = G.W.toarray()
_x = G.coords[:,0]
_y = G.coords[:,1]
_z = -G.coords[:,2]
_df = pd.DataFrame({'x' : _x, 'y' : _y, 'z' : _z, 'fnoise':f+noise,'f' : f, 'noise': noise})
abod_clf = ABOD(contamination=0.05)
abod_clf.fit(_df[['x', 'y', 'z','fnoise']])
ABOD(contamination=0.05, method='fast', n_neighbors=5)
cblof_clf = CBLOF(contamination=0.05,check_estimator=False, random_state=77)
cblof_clf.fit(_df[['x', 'y','z','fnoise']])
_df['ABOD_Clf'] = abod_clf.labels_
_df['CBLOF_Clf'] = cblof_clf.labels_
# Recode the ABOD labels (0 -> 1, anything else -> -1) so they use the same
# coding as the true labels, then report the confusion matrix and scores.
outlier_ABOD_one = [1 if label == 0 else -1 for label in abod_clf.labels_]
conf_matrix = confusion_matrix(outlier_true_one_2, outlier_ABOD_one)
fig, ax = plt.subplots(figsize=(5, 5))
ax.matshow(conf_matrix, cmap=plt.cm.Oranges, alpha=0.3)
# Print every count in the centre of its cell (row = actual, column = predicted).
for (i, j), count in np.ndenumerate(conf_matrix):
    ax.text(x=j, y=i, s=count, va='center', ha='center', size='xx-large')
ax.set_xlabel('Predictions', fontsize=18)
ax.set_ylabel('Actuals', fontsize=18)
ax.set_title('Confusion Matrix', fontsize=18)
plt.show()
for fmt, score in (
    ('Accuracy: %.3f', accuracy_score),
    ('Precision: %.3f', precision_score),
    ('Recall: %.3f', recall_score),
    ('F1 Score: %.3f', f1_score),
):
    print(fmt % score(outlier_true_one_2, outlier_ABOD_one))
Accuracy: 0.905
Precision: 0.951
Recall: 0.949
F1 Score: 0.950

Isolation Forest

2.

_x = np.linspace(0,2,1000)
_y1 = 5*_x**2
_y = _y1 + x # x is epsilon
_df=pd.DataFrame({'x':_x, 'y':_y})
od.fit(_df[['x', 'y']])
preds = od.predict(
    _df[['x', 'y']],
    return_instance_score=True
)
_df['IF_alibi'] = preds['data']['is_outlier']
sns.scatterplot(data = _df, x = 'x', y = 'y', hue = 'IF_alibi')
<AxesSubplot:xlabel='x', ylabel='y'>
# Recode the Isolation Forest flags (0 -> 1, anything else -> -1) so they use
# the same coding as the true labels, then report the confusion matrix and scores.
outlier_alibi_one = [1 if flag == 0 else -1 for flag in _df['IF_alibi']]
conf_matrix = confusion_matrix(outlier_true_one, outlier_alibi_one)
fig, ax = plt.subplots(figsize=(5, 5))
ax.matshow(conf_matrix, cmap=plt.cm.Oranges, alpha=0.3)
# Print every count in the centre of its cell (row = actual, column = predicted).
for (i, j), count in np.ndenumerate(conf_matrix):
    ax.text(x=j, y=i, s=count, va='center', ha='center', size='xx-large')
ax.set_xlabel('Predictions', fontsize=18)
ax.set_ylabel('Actuals', fontsize=18)
ax.set_title('Confusion Matrix', fontsize=18)
plt.show()
for fmt, score in (
    ('Accuracy: %.3f', accuracy_score),
    ('Precision: %.3f', precision_score),
    ('Recall: %.3f', recall_score),
    ('F1 Score: %.3f', f1_score),
):
    print(fmt % score(outlier_true_one, outlier_alibi_one))
Accuracy: 0.810
Precision: 1.000
Recall: 0.800
F1 Score: 0.889

3.

_x = np.linspace(0,2,1000)
_y1 =  3*np.sin(_x) + 1*np.sin(_x**2) + 5*np.sin(5*_x) 
_y = _y1 + x # x is epsilon
_df=pd.DataFrame({'x':_x, 'y':_y})
od.fit(_df[['x', 'y']])
preds = od.predict(
    _df[['x', 'y']],
    return_instance_score=True
)
_df['IF_alibi'] = preds['data']['is_outlier']
sns.scatterplot(data = _df, x = 'x', y = 'y', hue = 'IF_alibi')
<AxesSubplot:xlabel='x', ylabel='y'>
# Recode the Isolation Forest flags (0 -> 1, anything else -> -1) so they use
# the same coding as the true labels, then report the confusion matrix and scores.
outlier_alibi_one = [1 if flag == 0 else -1 for flag in _df['IF_alibi']]
conf_matrix = confusion_matrix(outlier_true_one, outlier_alibi_one)
fig, ax = plt.subplots(figsize=(5, 5))
ax.matshow(conf_matrix, cmap=plt.cm.Oranges, alpha=0.3)
# Print every count in the centre of its cell (row = actual, column = predicted).
for (i, j), count in np.ndenumerate(conf_matrix):
    ax.text(x=j, y=i, s=count, va='center', ha='center', size='xx-large')
ax.set_xlabel('Predictions', fontsize=18)
ax.set_ylabel('Actuals', fontsize=18)
ax.set_title('Confusion Matrix', fontsize=18)
plt.show()
for fmt, score in (
    ('Accuracy: %.3f', accuracy_score),
    ('Precision: %.3f', precision_score),
    ('Recall: %.3f', recall_score),
    ('F1 Score: %.3f', f1_score),
):
    print(fmt % score(outlier_true_one, outlier_alibi_one))
Accuracy: 0.773
Precision: 0.992
Recall: 0.767
F1 Score: 0.865

4.

_x = np.linspace(0,2,1000)
_y1 = -2+ 3*np.cos(_x) + 1*np.cos(2*_x) + 5*np.cos(5*_x)
_y = _y1 + x
_df=pd.DataFrame({'x':_x, 'y':_y})
od.fit(_df[['x', 'y']])
preds = od.predict(
    _df[['x', 'y']],
    return_instance_score=True
)
_df['IF_alibi'] = preds['data']['is_outlier']
sns.scatterplot(data = _df, x = 'x', y = 'y', hue = 'IF_alibi')
<AxesSubplot:xlabel='x', ylabel='y'>
# Recode the Isolation Forest flags (0 -> 1, anything else -> -1) so they use
# the same coding as the true labels, then report the confusion matrix and scores.
outlier_alibi_one = [1 if flag == 0 else -1 for flag in _df['IF_alibi']]
conf_matrix = confusion_matrix(outlier_true_one, outlier_alibi_one)
fig, ax = plt.subplots(figsize=(5, 5))
ax.matshow(conf_matrix, cmap=plt.cm.Oranges, alpha=0.3)
# Print every count in the centre of its cell (row = actual, column = predicted).
for (i, j), count in np.ndenumerate(conf_matrix):
    ax.text(x=j, y=i, s=count, va='center', ha='center', size='xx-large')
ax.set_xlabel('Predictions', fontsize=18)
ax.set_ylabel('Actuals', fontsize=18)
ax.set_title('Confusion Matrix', fontsize=18)
plt.show()
for fmt, score in (
    ('Accuracy: %.3f', accuracy_score),
    ('Precision: %.3f', precision_score),
    ('Recall: %.3f', recall_score),
    ('F1 Score: %.3f', f1_score),
):
    print(fmt % score(outlier_true_one, outlier_alibi_one))
Accuracy: 0.788
Precision: 0.996
Recall: 0.780
F1 Score: 0.875

6.

# Simulated signal on the Bunny graph: Gaussian noise plus 120 uniform
# outliers (60 in [3, 7], 60 in [-7, -3]) placed at random nodes.
G = graphs.Bunny()
n = G.N
g = filters.Heat(G, tau=75)
normal = np.random.randn(n)
unif = np.concatenate([
    np.random.uniform(low=3,high=7,size=60),
    np.random.uniform(low=-7,high=-3,size=60),
    np.zeros(n-120),
])
np.random.shuffle(unif)
noise = normal + unif
index_of_trueoutlier2 = np.where(unif!=0)
# The outlier positions were just redrawn, so the ground-truth labels used by
# the evaluation below must be rebuilt from this `unif` (-1 = outlier, 1 = not).
outlier_true_one_2 = np.where(unif != 0, -1, 1).tolist()
# Underlying signal: an impulse at node 1000 passed through the heat filter.
f = np.zeros(n)
f[1000] = -3234
f = g.filter(f, method='chebyshev')
2022-11-10 00:48:04,538:[WARNING](pygsp.graphs.graph.lmax): The largest eigenvalue G.lmax is not available, we need to estimate it. Explicitly call G.estimate_lmax() or G.compute_fourier_basis() once beforehand to suppress the warning.
_W = G.W.toarray()
_x = G.coords[:,0]
_y = G.coords[:,1]
_z = -G.coords[:,2]
_df = pd.DataFrame({'x' : _x, 'y' : _y, 'z' : _z, 'fnoise':f+noise,'f' : f, 'noise': noise})
od.fit(_df[['x', 'y','z','fnoise']])
preds = od.predict(
    _df[['x', 'y','z','fnoise']],
    return_instance_score=True
)
_df['IF_alibi'] = preds['data']['is_outlier']
sns.scatterplot(data = _df, x = 'x', y = 'y', hue = 'IF_alibi')
<AxesSubplot:xlabel='x', ylabel='y'>
# Recode the Isolation Forest flags (0 -> 1, anything else -> -1) so they use
# the same coding as the true labels, then report the confusion matrix and scores.
outlier_alibi_one = [1 if flag == 0 else -1 for flag in _df['IF_alibi']]
conf_matrix = confusion_matrix(outlier_true_one_2, outlier_alibi_one)
fig, ax = plt.subplots(figsize=(5, 5))
ax.matshow(conf_matrix, cmap=plt.cm.Oranges, alpha=0.3)
# Print every count in the centre of its cell (row = actual, column = predicted).
for (i, j), count in np.ndenumerate(conf_matrix):
    ax.text(x=j, y=i, s=count, va='center', ha='center', size='xx-large')
ax.set_xlabel('Predictions', fontsize=18)
ax.set_ylabel('Actuals', fontsize=18)
ax.set_title('Confusion Matrix', fontsize=18)
plt.show()
for fmt, score in (
    ('Accuracy: %.3f', accuracy_score),
    ('Precision: %.3f', precision_score),
    ('Recall: %.3f', recall_score),
    ('F1 Score: %.3f', f1_score),
):
    print(fmt % score(outlier_true_one_2, outlier_alibi_one))
Accuracy: 0.761
Precision: 0.953
Recall: 0.789
F1 Score: 0.863

One-Class SVM

2.

_x = np.linspace(0,2,1000)
_y1 = 5*_x**2
_y = _y1 + x # x is epsilon
_df=pd.DataFrame({'x':_x, 'y':_y})
_df = np.array(_df).reshape(-1,2)
clf = svm.OneClassSVM(nu=0.1, kernel="rbf", gamma=0.1)
clf.fit(_df)
OneClassSVM(gamma=0.1, nu=0.1)
# Predict with the fitted one-class SVM, then report the confusion matrix
# and summary scores against the true labels.
y_pred = clf.predict(_df)
outlier_OSVM_one = list(y_pred)
conf_matrix = confusion_matrix(outlier_true_one, outlier_OSVM_one)
fig, ax = plt.subplots(figsize=(5, 5))
ax.matshow(conf_matrix, cmap=plt.cm.Oranges, alpha=0.3)
# Print every count in the centre of its cell (row = actual, column = predicted).
for (i, j), count in np.ndenumerate(conf_matrix):
    ax.text(x=j, y=i, s=count, va='center', ha='center', size='xx-large')
ax.set_xlabel('Predictions', fontsize=18)
ax.set_ylabel('Actuals', fontsize=18)
ax.set_title('Confusion Matrix', fontsize=18)
plt.show()
for fmt, score in (
    ('Accuracy: %.3f', accuracy_score),
    ('Precision: %.3f', precision_score),
    ('Recall: %.3f', recall_score),
    ('F1 Score: %.3f', f1_score),
):
    print(fmt % score(outlier_true_one, outlier_OSVM_one))
Accuracy: 0.941
Precision: 0.993
Recall: 0.944
F1 Score: 0.968

3.

_x = np.linspace(0,2,1000)
_y1 =  3*np.sin(_x) + 1*np.sin(_x**2) + 5*np.sin(5*_x) 
_y = _y1 + x # x is epsilon
_df=pd.DataFrame({'x':_x, 'y':_y})
_df = np.array(_df).reshape(-1,2)
clf = svm.OneClassSVM(nu=0.1, kernel="rbf", gamma=0.1)
clf.fit(_df)
OneClassSVM(gamma=0.1, nu=0.1)
# Predict with the fitted one-class SVM, then report the confusion matrix
# and summary scores against the true labels.
y_pred = clf.predict(_df)
outlier_OSVM_one = list(y_pred)
conf_matrix = confusion_matrix(outlier_true_one, outlier_OSVM_one)
fig, ax = plt.subplots(figsize=(5, 5))
ax.matshow(conf_matrix, cmap=plt.cm.Oranges, alpha=0.3)
# Print every count in the centre of its cell (row = actual, column = predicted).
for (i, j), count in np.ndenumerate(conf_matrix):
    ax.text(x=j, y=i, s=count, va='center', ha='center', size='xx-large')
ax.set_xlabel('Predictions', fontsize=18)
ax.set_ylabel('Actuals', fontsize=18)
ax.set_title('Confusion Matrix', fontsize=18)
plt.show()
for fmt, score in (
    ('Accuracy: %.3f', accuracy_score),
    ('Precision: %.3f', precision_score),
    ('Recall: %.3f', recall_score),
    ('F1 Score: %.3f', f1_score),
):
    print(fmt % score(outlier_true_one, outlier_OSVM_one))
Accuracy: 0.903
Precision: 0.972
Recall: 0.924
F1 Score: 0.948

4.

_x = np.linspace(0,2,1000)
_y1 = -2+ 3*np.cos(_x) + 1*np.cos(2*_x) + 5*np.cos(5*_x)
_y = _y1 + x
_df=pd.DataFrame({'x':_x, 'y':_y})
_df = np.array(_df).reshape(-1,2)
clf = svm.OneClassSVM(nu=0.1, kernel="rbf", gamma=0.1)
clf.fit(_df)
OneClassSVM(gamma=0.1, nu=0.1)
# Predict with the fitted one-class SVM, then report the confusion matrix
# and summary scores against the true labels.
y_pred = clf.predict(_df)
outlier_OSVM_one = list(y_pred)
conf_matrix = confusion_matrix(outlier_true_one, outlier_OSVM_one)
fig, ax = plt.subplots(figsize=(5, 5))
ax.matshow(conf_matrix, cmap=plt.cm.Oranges, alpha=0.3)
# Print every count in the centre of its cell (row = actual, column = predicted).
for (i, j), count in np.ndenumerate(conf_matrix):
    ax.text(x=j, y=i, s=count, va='center', ha='center', size='xx-large')
ax.set_xlabel('Predictions', fontsize=18)
ax.set_ylabel('Actuals', fontsize=18)
ax.set_title('Confusion Matrix', fontsize=18)
plt.show()
for fmt, score in (
    ('Accuracy: %.3f', accuracy_score),
    ('Precision: %.3f', precision_score),
    ('Recall: %.3f', recall_score),
    ('F1 Score: %.3f', f1_score),
):
    print(fmt % score(outlier_true_one, outlier_OSVM_one))
Accuracy: 0.909
Precision: 0.977
Recall: 0.926
F1 Score: 0.951

6.

# Simulated signal on the Bunny graph: Gaussian noise plus 120 uniform
# outliers (60 in [3, 7], 60 in [-7, -3]) placed at random nodes.
G = graphs.Bunny()
n = G.N
g = filters.Heat(G, tau=75)
normal = np.random.randn(n)
unif = np.concatenate([
    np.random.uniform(low=3,high=7,size=60),
    np.random.uniform(low=-7,high=-3,size=60),
    np.zeros(n-120),
])
np.random.shuffle(unif)
noise = normal + unif
index_of_trueoutlier2 = np.where(unif!=0)
# The outlier positions were just redrawn, so the ground-truth labels used by
# the evaluation below must be rebuilt from this `unif` (-1 = outlier, 1 = not).
outlier_true_one_2 = np.where(unif != 0, -1, 1).tolist()
# Underlying signal: an impulse at node 1000 passed through the heat filter.
f = np.zeros(n)
f[1000] = -3234
f = g.filter(f, method='chebyshev')
2022-11-10 00:49:07,162:[WARNING](pygsp.graphs.graph.lmax): The largest eigenvalue G.lmax is not available, we need to estimate it. Explicitly call G.estimate_lmax() or G.compute_fourier_basis() once beforehand to suppress the warning.
_W = G.W.toarray()
_x = G.coords[:,0]
_y = G.coords[:,1]
_z = -G.coords[:,2]
_df = pd.DataFrame({'x' : _x, 'y' : _y, 'z' : _z, 'fnoise':f+noise})
_df = np.array(_df).reshape(-1,4)
clf = svm.OneClassSVM(nu=0.1, kernel="rbf", gamma=0.1)
clf.fit(_df)
OneClassSVM(gamma=0.1, nu=0.1)
# Predict with the fitted one-class SVM, then report the confusion matrix
# and summary scores against the true labels of the Bunny graph.
y_pred = clf.predict(_df)
outlier_OSVM_one = list(y_pred)
conf_matrix = confusion_matrix(outlier_true_one_2, outlier_OSVM_one)
fig, ax = plt.subplots(figsize=(5, 5))
ax.matshow(conf_matrix, cmap=plt.cm.Oranges, alpha=0.3)
# Print every count in the centre of its cell (row = actual, column = predicted).
for (i, j), count in np.ndenumerate(conf_matrix):
    ax.text(x=j, y=i, s=count, va='center', ha='center', size='xx-large')
ax.set_xlabel('Predictions', fontsize=18)
ax.set_ylabel('Actuals', fontsize=18)
ax.set_title('Confusion Matrix', fontsize=18)
plt.show()
for fmt, score in (
    ('Accuracy: %.3f', accuracy_score),
    ('Precision: %.3f', precision_score),
    ('Recall: %.3f', recall_score),
    ('F1 Score: %.3f', f1_score),
):
    print(fmt % score(outlier_true_one_2, outlier_OSVM_one))
Accuracy: 0.865
Precision: 0.955
Recall: 0.901
F1 Score: 0.927

Circle

1. Simul, ABOD

%%R
library(EbayesThresh)
set.seed(1)
n=5000
epsilon = rnorm(n)
signal = sample(c(runif(n*0.07,-8,-4), runif(n*0.03,6,7), rep(0,n*0.90)))
index_of_trueoutlier = which(signal!=0)
index_of_trueoutlier
x=signal+epsilon
%R -o x
%R -o index_of_trueoutlier
%R -o signal
n=5000
ebayesthresh = importr('EbayesThresh').ebayesthresh
outlier_true_index = index_of_trueoutlier

outlier_true_value = x[index_of_trueoutlier]

outlier_true_one = signal.copy()
outlier_true_one = list(map(lambda x: -1 if x!=0 else 1,outlier_true_one))
r=x+10
θ = np.linspace(-3.14,3.14,len(x))
_x = r*np.cos(θ)
_y = r*np.sin(θ)
_df=pd.DataFrame({'x':_x, 'y':_y})
abod_clf = ABOD(contamination=0.05)
abod_clf.fit(_df[['x', 'y']])
ABOD(contamination=0.05, method='fast', n_neighbors=5)
_df['ABOD_Clf'] = abod_clf.labels_
sns.scatterplot(data = _df, x = 'x', y = 'y', hue = 'ABOD_Clf')
<AxesSubplot:xlabel='x', ylabel='y'>
outlier_ABOD_one = list(abod_clf.labels_)
outlier_ABOD_one = list(map(lambda x: 1 if x==0  else -1,outlier_ABOD_one))
class SIMUL:
    """Smooth ``df.y`` in the eigenbasis of a normalized graph Laplacian.

    The Laplacian is built from a weight matrix, ``y`` is projected onto its
    eigenvectors, the squared coefficients are thresholded with
    ``EbayesThresh::ebayesthresh`` and the surviving coefficients are mapped
    back to give ``yHat``; ``Residual`` is ``y - yHat``.

    Parameters
    ----------
    df : DataFrame with columns ``x`` and ``y``.
    W : square weight matrix, optional. When omitted, the module-level ``w``
        is used (the original behaviour), so ``w`` must exist at construction
        time in that case.
    """
    def __init__(self,df,W=None):
        self.df = df
        self.y = df.y.to_numpy()
        self.x = df.x.to_numpy()
        self.n = len(self.y)
        # Fall back to the global weight matrix only when none is supplied.
        self.W = w if W is None else W
    def _eigen(self):
        """Compute the normalized Laplacian and its eigendecomposition.

        Sets ``L``, ``lamb`` (eigenvalues), ``Psi`` (eigenvectors as columns)
        and ``Lamb`` (diagonal matrix of eigenvalues).

        Raises
        ------
        ValueError
            If any row of the weight matrix sums to zero or less, because the
            normalization divides by the square root of that sum.
        """
        W = np.asarray(self.W)
        d = W.sum(axis=1)
        if np.any(d <= 0):
            raise ValueError("every node needs a positive total weight to normalize the Laplacian")
        inv_sqrt_d = 1/np.sqrt(d)
        # D^{-1/2} (D - W) D^{-1/2}, scaling rows and columns by broadcasting
        # rather than multiplying by dense diagonal matrices.
        self.L = (np.diag(d)-W) * inv_sqrt_d[:,None] * inv_sqrt_d[None,:]
        self.lamb, self.Psi = np.linalg.eigh(self.L)
        self.Lamb = np.diag(self.lamb)
    def fit(self,sd=5): # fit with ebayesthresh
        """Threshold the spectral coefficients of ``y`` and store the fit.

        Parameters
        ----------
        sd : value forwarded to ``ebayesthresh`` as its ``sd`` argument.

        After the call ``self.df`` has two extra columns, ``yHat`` and
        ``Residual``.
        """
        self._eigen()
        self.ybar = self.Psi.T @ self.y # ybar := graph fourier transform of y
        self.power = self.ybar**2
        ebayesthresh = importr('EbayesThresh').ebayesthresh
        self.power_threshed=np.array(ebayesthresh(FloatVector(self.power),sd=sd))
        # Keep a coefficient only where its thresholded power stays positive.
        self.ybar_threshed = np.where(self.power_threshed>0,self.ybar,0)
        self.yhat = self.Psi@self.ybar_threshed
        self.df = self.df.assign(yHat = self.yhat)
        self.df = self.df.assign(Residual = self.df.y- self.df.yHat)
# Weight matrix of a path graph on n nodes: w[i,j] = 1 exactly when |i-j| == 1
# and 0 otherwise (including the diagonal). Summing the first super- and
# sub-diagonal replaces an n*n Python double loop with two NumPy calls.
w = np.eye(n, k=1) + np.eye(n, k=-1)
# Fit the graph-based smoother on the circle data.
_simul = SIMUL(_df)
_simul.fit(sd=20)

# Rows whose squared residual exceeds 2 are treated as detected outliers.
_flagged = _simul.df.query('Residual**2>2').reset_index()
outlier_simul_first_index = np.array(_flagged['index'])
outlier_simul_first_value = np.array(_flagged['y'])

# Label every row with the same rule: -1 when flagged, +1 otherwise.
_squared_residuals = (_simul.df['Residual']**2).tolist()
outlier_simul_one = [-1 if sq > 2 else 1 for sq in _squared_residuals]
_simul.df=_simul.df.assign(one = outlier_simul_one)

# All points in black, detected outliers overlaid in translucent blue.
plt.figure(figsize=(10,6))
plt.plot(_simul.df['x'],_simul.df['y'],'k.')
_hits = _simul.df.iloc[outlier_simul_first_index]
plt.plot(_hits['x'],_hits['y'],'b.',alpha=0.5)
[<matplotlib.lines.Line2D at 0x7f063ca963a0>]
# Confusion matrix: ground-truth labels versus the proposed method's labels.
conf_matrix = confusion_matrix(outlier_true_one, outlier_simul_one)
fig, ax = plt.subplots(figsize=(5, 5))
ax.matshow(conf_matrix, cmap=plt.cm.Oranges, alpha=0.3)
# Annotate every tile with its count (row -> y position, column -> x position).
for (row, col), count in np.ndenumerate(conf_matrix):
    ax.text(x=col, y=row, s=count, va='center', ha='center', size='xx-large')
plt.xlabel('Predictions', fontsize=18)
plt.ylabel('Actuals', fontsize=18)
plt.title('Confusion Matrix', fontsize=18)
plt.show()

# Print the four summary scores to three decimals, in a fixed order.
for template, scorer in (
    ('Accuracy: %.3f', accuracy_score),
    ('Precision: %.3f', precision_score),
    ('Recall: %.3f', recall_score),
    ('F1 Score: %.3f', f1_score),
):
    print(template % scorer(outlier_true_one, outlier_simul_one))
Accuracy: 0.914
Precision: 0.978
Recall: 0.925
F1 Score: 0.951
# Confusion matrix: ground-truth labels versus the recoded ABOD labels.
conf_matrix = confusion_matrix(outlier_true_one, outlier_ABOD_one)
fig, ax = plt.subplots(figsize=(5, 5))
ax.matshow(conf_matrix, cmap=plt.cm.Oranges, alpha=0.3)
# Put each cell's count at the centre of its tile.
for (row, col), count in np.ndenumerate(conf_matrix):
    ax.text(x=col, y=row, s=count, va='center', ha='center', size='xx-large')
plt.xlabel('Predictions', fontsize=18)
plt.ylabel('Actuals', fontsize=18)
plt.title('Confusion Matrix', fontsize=18)
plt.show()

# Accuracy, precision, recall and F1, each printed to three decimals.
_abod_scores = (
    ('Accuracy: %.3f', accuracy_score),
    ('Precision: %.3f', precision_score),
    ('Recall: %.3f', recall_score),
    ('F1 Score: %.3f', f1_score),
)
for template, scorer in _abod_scores:
    print(template % scorer(outlier_true_one, outlier_ABOD_one))
Accuracy: 0.926
Precision: 0.935
Recall: 0.987
F1 Score: 0.960

2. LOF

# Use only the coordinates as LOF features. By this point _df also holds the
# 'ABOD_Clf' label column, and converting the whole frame would hand another
# detector's output to LOF as a third feature.
X = np.array(_df[['x', 'y']])
clf = LocalOutlierFactor(n_neighbors=2)

outlier

# fit_predict() refits the model on every call; run it once and reuse the
# labels (-1 marks the points LOF flags) for indexing, plotting and scoring.
lof_labels = clf.fit_predict(X)
outlier_first_index = np.where(lof_labels==-1)
outlier_first_value = lof_labels[lof_labels==-1]

# Confusion matrix: ground-truth labels versus the LOF labels.
conf_matrix = confusion_matrix(outlier_true_one, lof_labels)
fig, ax = plt.subplots(figsize=(5, 5))
ax.matshow(conf_matrix, cmap=plt.cm.Oranges, alpha=0.3)
# Write each count on top of its tile (row index -> y, column index -> x).
for i in range(conf_matrix.shape[0]):
    for j in range(conf_matrix.shape[1]):
        ax.text(x=j, y=i,s=conf_matrix[i, j], va='center', ha='center', size='xx-large')
plt.xlabel('Predictions', fontsize=18)
plt.ylabel('Actuals', fontsize=18)
plt.title('Confusion Matrix', fontsize=18)
plt.show()

# Summary scores, printed to three decimals.
print('Accuracy: %.3f' % accuracy_score(outlier_true_one, lof_labels))
print('Precision: %.3f' % precision_score(outlier_true_one, lof_labels))
print('Recall: %.3f' % recall_score(outlier_true_one, lof_labels))
print('F1 Score: %.3f' % f1_score(outlier_true_one, lof_labels))
Accuracy: 0.828
Precision: 0.901
Recall: 0.908
F1 Score: 0.905

3. forest

# Isolation-forest detector configured with threshold=0. and n_estimators=100.
od = IForest(threshold=0., n_estimators=100)

# Fit and score on the same (x, y) coordinates.
_coords = _df[['x', 'y']]
od.fit(_coords)
preds = od.predict(_coords, return_instance_score=True)

# Keep the per-row outlier flag in _df and colour the scatter plot by it.
_df['IF_alibi'] = preds['data']['is_outlier']
sns.scatterplot(data = _df, x = 'x', y = 'y', hue = 'IF_alibi')
<AxesSubplot:xlabel='x', ylabel='y'>
# Recode the isolation-forest flags for scoring: 0 becomes +1, anything else -1.
outlier_alibi_one = [1 if flag == 0 else -1 for flag in _df['IF_alibi']]

# Confusion matrix: ground-truth labels versus the isolation-forest labels.
conf_matrix = confusion_matrix(outlier_true_one, outlier_alibi_one)
fig, ax = plt.subplots(figsize=(5, 5))
ax.matshow(conf_matrix, cmap=plt.cm.Oranges, alpha=0.3)
# Annotate every tile with its count.
for (row, col), count in np.ndenumerate(conf_matrix):
    ax.text(x=col, y=row, s=count, va='center', ha='center', size='xx-large')
plt.xlabel('Predictions', fontsize=18)
plt.ylabel('Actuals', fontsize=18)
plt.title('Confusion Matrix', fontsize=18)
plt.show()

# Print the four summary scores to three decimals.
for template, scorer in (
    ('Accuracy: %.3f', accuracy_score),
    ('Precision: %.3f', precision_score),
    ('Recall: %.3f', recall_score),
    ('F1 Score: %.3f', f1_score),
):
    print(template % scorer(outlier_true_one, outlier_alibi_one))
Accuracy: 0.828
Precision: 0.926
Recall: 0.879
F1 Score: 0.902

4. SVM

# Rebuild _df from the raw coordinates as a plain (rows, 2) array; unlike the
# earlier frame it contains only x and y, with no detector label columns.
_df = pd.DataFrame({'x':_x, 'y':_y}).to_numpy().reshape(-1,2)

# One-Class SVM with an RBF kernel, fitted on that array.
clf = svm.OneClassSVM(nu=0.1, kernel="rbf", gamma=0.1)
clf.fit(_df)
OneClassSVM(gamma=0.1, nu=0.1)
# Predict on the same array the model was fitted on, then display its shape.
y_pred = clf.predict(_df)
_df.shape
(5000, 2)
# One-Class SVM labels for every row, as a plain list.
outlier_OSVM_one = list(clf.predict(_df))

# Confusion matrix: ground-truth labels versus the One-Class SVM labels.
conf_matrix = confusion_matrix(outlier_true_one, outlier_OSVM_one)
fig, ax = plt.subplots(figsize=(5, 5))
ax.matshow(conf_matrix, cmap=plt.cm.Oranges, alpha=0.3)
# Put each cell's count at the centre of its tile.
for (row, col), count in np.ndenumerate(conf_matrix):
    ax.text(x=col, y=row, s=count, va='center', ha='center', size='xx-large')
plt.xlabel('Predictions', fontsize=18)
plt.ylabel('Actuals', fontsize=18)
plt.title('Confusion Matrix', fontsize=18)
plt.show()

# Accuracy, precision, recall and F1, each printed to three decimals.
for template, scorer in (
    ('Accuracy: %.3f', accuracy_score),
    ('Precision: %.3f', precision_score),
    ('Recall: %.3f', recall_score),
    ('F1 Score: %.3f', f1_score),
):
    print(template % scorer(outlier_true_one, outlier_OSVM_one))
Accuracy: 0.886
Precision: 0.939
Recall: 0.935
F1 Score: 0.937