Comparison Study of Outlier Detection
- EbayesThresh
- Linear
- Orbit
- sklearn.neighbors.LocalOutlierFactor
- ABOD Angle Based Outlier Detection
- Isolation Forest
- One-Class SVM
- Circle
Linear
Method | Accuracy | Precision | Recall | F1 Score |
---|---|---|---|---|
Proposed Method | 0.992 | 0.992 | 1.000 | 0.996 |
LOF (Breunig et al., 2000) | 0.871 | 0.962 | 0.900 | 0.930 |
OCSVM (Schölkopf et al., 2001) | 0.940 | 0.994 | 0.942 | 0.968 |
ABOD (Kriegel et al., 2008) | 0.988 | 0.994 | 0.994 | 0.994 |
Isolation Forest (Liu et al., 2008) | 0.881 | 1.000 | 0.875 | 0.933 |
signal = sample(c(runif(25,-7,-2.5), runif(25,2.5,7), rep(0,950)))
Method | Accuracy | Precision | Recall | F1 Score |
---|---|---|---|---|
Proposed Method | 0.970 | 0.969 | 1.000 | 0.984 |
LOF (Breunig et al., 2000) | 0.879 | 0.969 | 0.901 | 0.934 |
OCSVM (Schölkopf et al., 2001) | 0.938 | 0.992 | 0.942 | 0.967 |
ABOD (Kriegel et al., 2008) | 0.976 | 0.987 | 0.987 | 0.987 |
Isolation Forest (Liu et al., 2008) | 0.864 | 0.998 | 0.859 | 0.923 |
signal = sample(c(runif(25,-7,-1), runif(25,1,7), rep(0,950)))
Method | Accuracy | Precision | Recall | F1 Score |
---|---|---|---|---|
Proposed Method | 0.969 | 0.968 | 1.000 | 0.984 |
LOF (Breunig et al., 2000) | 0.879 | 0.976 | 0.906 | 0.940 |
OCSVM (Schölkopf et al., 2001) | 0.923 | 0.986 | 0.933 | 0.958 |
ABOD (Kriegel et al., 2008) | 0.968 | 0.983 | 0.983 | 0.983 |
Isolation Forest (Liu et al., 2008) | 0.841 | 0.991 | 0.840 | 0.909 |
one dimensional manifold
Method | Accuracy | Precision | Recall | F1 Score |
---|---|---|---|---|
Proposed Method | 0.997 | 0.997 | 1.000 | 0.998 |
LOF (Breunig et al., 2000) | 0.886 | 0.987 | 0.892 | 0.937 |
OCSVM (Schölkopf et al., 2001) | 0.923 | 0.988 | 0.931 | 0.958 |
ABOD (Kriegel et al., 2008) | 0.988 | 0.994 | 0.994 | 0.994 |
Isolation Forest (Liu et al., 2008) | 0.477 | 0.989 | 0.455 | 0.623 |
Bunny
Method | Accuracy | Precision | Recall | F1 Score |
---|---|---|---|---|
Proposed Method | 0.991 | 0.993 | 0.997 | 0.995 |
LOF (Breunig et al., 2000) | 0.918 | 0.954 | 0.959 | 0.957 |
OCSVM (Schölkopf et al., 2001) | 0.865 | 0.955 | 0.901 | 0.927 |
ABOD (Kriegel et al., 2008) | 0.905 | 0.951 | 0.949 | 0.950 |
Isolation Forest (Liu et al., 2008) | 0.761 | 0.953 | 0.789 | 0.863 |
Circle(n=5000, 음수쪽 3%. 양수쪽 2%)
Method | Accuracy | Precision | Recall | F1 Score |
---|---|---|---|---|
Proposed Method | 0.972 | 0.989 | 0.982 | 0.985 |
LOF (Breunig et al., 2000) | 0.865 | 0.950 | 0.905 | 0.927 |
OCSVM (Schölkopf et al., 2001) | 0.921 | 0.985 | 0.932 | 0.957 |
ABOD (Kriegel et al., 2008) | 0.970 | 0.984 | $\underline{0.984}$ | 0.984 |
Isolation Forest (Liu et al., 2008) | 0.855 | $\underline{0.993}$ | 0.853 | 0.918 |
latex
% Please add the following required packages to your document preamble: % \usepackage[normalem]{ulem} % \useunder{\uline}{\ul}{} \begin{table}[] \begin{tabular}{|lcccc} \hline \multicolumn{5}{|c|}{Linear} \\ \hline \multicolumn{1}{|l|}{Method} & \multicolumn{1}{c|}{Accuracy} & \multicolumn{1}{c|}{Precision} & \multicolumn{1}{c|}{Recall} & \multicolumn{1}{c|}{F1 Score} \\ \hline \multicolumn{1}{|l|}{Propose Method} & \textbf{0.939} & {\ul \textbf{1.000}} & \textbf{0.936} & \textbf{0.967} \\ \cline{1-1} \multicolumn{1}{|l|}{LOF (Breunig et al., 2000)} & 0.871 & 0.962 & 0.900 & 0.930 \\ \cline{1-1} \multicolumn{1}{|l|}{OCSVM (Sch¨olkopf et al., 2001)} & 0.940 & 0.994 & 0.942 & 0.968 \\ \cline{1-1} \multicolumn{1}{|l|}{ABOD (Kriegel et al., 2008)} & {\ul 0.988} & 0.994 & {\ul 0.994} & {\ul 0.994} \\ \cline{1-1} \multicolumn{1}{|l|}{Isolation Forest (Liu et al., 2008)} & 0.881 & {\ul 1.000} & 0.875 & 0.933 \\ \hline \multicolumn{5}{|c|}{One Dimensional Manifold} \\ \hline \multicolumn{1}{|l|}{Method} & \multicolumn{1}{c|}{Accuracy} & \multicolumn{1}{c|}{Precision} & \multicolumn{1}{c|}{Recall} & \multicolumn{1}{c|}{F1 Score} \\ \hline \multicolumn{1}{|l|}{Propose Method} & \textbf{0.997} & \textbf{0.997} & {\ul \textbf{1.000}} & \textbf{0.998} \\ \cline{1-1} \multicolumn{1}{|l|}{LOF (Breunig et al., 2000)} & 0.866 & 0.987 & 0.892 & 0.937 \\ \cline{1-1} \multicolumn{1}{|l|}{OCSVM (Sch¨olkopf et al., 2001)} & 0.923 & 0.988 & 0.931 & 0.958 \\ \cline{1-1} \multicolumn{1}{|l|}{ABOD (Kriegel et al., 2008)} & {\ul 0.988} & {\ul 0.994} & 0.994 & {\ul 0.994} \\ \cline{1-1} \multicolumn{1}{|l|}{Isolation Forest (Liu et al., 2008)} & 0.477 & 0.989 & 0.455 & 0.623 \\ \hline \multicolumn{5}{|c|}{Two Dimensional Manifold} \\ \hline \multicolumn{1}{|l|}{Method} & \multicolumn{1}{c|}{Accuracy} & \multicolumn{1}{c|}{Precision} & \multicolumn{1}{c|}{Recall} & \multicolumn{1}{c|}{F1 Score} \\ \hline \multicolumn{1}{|l|}{Propose Method} & {\ul \textbf{0.960}} & {\ul \textbf{0.969}} & {\ul 
\textbf{0.990}} & {\ul \textbf{0.979}} \\ \cline{1-1} \multicolumn{1}{|l|}{LOF (Breunig et al., 2000)} & 0.918 & 0.954 & 0.959 & 0.957 \\ \cline{1-1} \multicolumn{1}{|l|}{OCSVM (Sch¨olkopf et al., 2001)} & 0.865 & 0.955 & 0.901 & 0.953 \\ \cline{1-1} \multicolumn{1}{|l|}{ABOD (Kriegel et al., 2008)} & 0.905 & 0.951 & 0.949 & 0.950 \\ \cline{1-1} \multicolumn{1}{|l|}{Isolation Forest (Liu et al., 2008)} & 0.761 & 0.953 & 0.789 & 0.863 \\ \cline{1-1} \end{tabular} \end{table}
Inliers are labeled 1, while outliers are labeled -1
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
from sklearn.svm import OneClassSVM
from sklearn.linear_model import SGDOneClassSVM
from sklearn.kernel_approximation import Nystroem
from sklearn.pipeline import make_pipeline
import pandas as pd
from sklearn.neighbors import LocalOutlierFactor
import rpy2
import rpy2.robjects as ro
from rpy2.robjects.vectors import FloatVector
from rpy2.robjects.packages import importr
from sklearn.datasets import fetch_kddcup99, fetch_covtype, fetch_openml
from sklearn.preprocessing import LabelBinarizer
import tqdm
from pygsp import graphs, filters, plotting, utils
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
import plotly.graph_objects as go
from IPython.display import HTML
import plotly.express as px
from sklearn.covariance import EmpiricalCovariance, MinCovDet
from alibi_detect.od import IForest
from pyod.models.abod import ABOD
from pyod.models.cblof import CBLOF
import seaborn as sns
from PyNomaly import loop
from sklearn import svm
%load_ext rpy2.ipython
%%R
# Simulate a noisy series with a known set of outliers.
library(EbayesThresh)
# Fix the RNG so the noise and the outlier positions are reproducible.
set.seed(1)
# 1000 standard-normal noise draws.
epsilon = rnorm(1000)
# 50 non-zero shocks (25 in [-7,-5], 25 in [5,7]) shuffled among 950 zeros.
signal = sample(c(runif(25,-7,-5), runif(25,5,7), rep(0,950)))
# Positions of the non-zero shocks; which() returns R's 1-based indices.
index_of_trueoutlier = which(signal!=0)
index_of_trueoutlier
# Observed series: shocks plus noise.
x=signal+epsilon
plot(1:1000,x)
# Mark the true outliers with enlarged, coloured points.
points(index_of_trueoutlier,x[index_of_trueoutlier],col=2,cex=4)
#plot(x,type='l')
#mu <- EbayesThresh::ebayesthresh(x,sdev=2)
#lines(mu,col=2,lty=2,lwd=2)
%R -o x
%R -o index_of_trueoutlier
%R -o signal
# Python handle on R's EbayesThresh::ebayesthresh (through rpy2).
ebayesthresh = importr('EbayesThresh').ebayesthresh
# Thresholded version of the noisy series exported from the R cell.
xhat = np.array(ebayesthresh(FloatVector(x)))
# plt.plot(xhat)
# `index_of_trueoutlier` was produced by R's which(), so it holds 1-based
# positions.  It is kept unchanged here for anything that expects R indexing.
outlier_true_index = index_of_trueoutlier
# Shift to 0-based positions before indexing the NumPy array: using the R
# indices directly reads the neighbouring element and raises IndexError when
# position 1000 is an outlier.
outlier_true_value = x[np.asarray(index_of_trueoutlier, dtype=int) - 1]
package와 비교를 위해 outlier는 -1, inlier는 1로 표시
# Ground-truth labels in the scikit-learn convention: -1 = outlier, 1 = inlier.
# A sample is a true outlier exactly when its injected shock is non-zero.
outlier_true_one = [-1 if shock != 0 else 1 for shock in signal]

# Linear trend y = 5x on [0, 2] observed with the simulated disturbance.
_x = np.linspace(0,2,1000)
_y1 = 5*_x
_y = _y1 + x  # x is signal + epsilon from the R cell (shocks plus noise)
_df=pd.DataFrame({'x':_x, 'y':_y})
X = np.array(_df)

# `clf` is otherwise first assigned further down, so build the LOF model here
# to keep this step runnable in file order.  Fit once and reuse the labels.
clf = LocalOutlierFactor(n_neighbors=2)
_lof_labels = clf.fit_predict(X)
outlier_first_index = np.where(_lof_labels==-1)
outlier_first_value = _lof_labels[_lof_labels==-1]
class SIMUL:
    """Graph-Fourier smoother for a signal observed on a fixed graph.

    ``fit`` projects ``df.y`` onto the eigenvectors of the normalized graph
    Laplacian, thresholds the squared coefficients with R's
    ``EbayesThresh::ebayesthresh`` and transforms the surviving coefficients
    back.  The result is stored in ``self.df`` as the columns ``yHat`` and
    ``Residual``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``x`` and ``y``.
    W : array-like, optional
        Square weight matrix of the graph.  When omitted, the module-level
        matrix ``w`` is used, which is what the class always did before.

    Raises
    ------
    ValueError
        If the weight matrix is not ``n x n`` for ``n = len(df.y)``.
    """

    def __init__(self, df, W=None):
        self.df = df
        self.y = df.y.to_numpy()
        #self.y1 = df.y1.to_numpy()
        self.x = df.x.to_numpy()
        self.n = len(self.y)
        # The global `w` is only looked up when no matrix is passed in.
        self.W = w if W is None else np.asarray(W, dtype=float)
        if self.W.shape != (self.n, self.n):
            raise ValueError(
                "weight matrix must have shape (%d, %d), got %s"
                % (self.n, self.n, self.W.shape)
            )

    def _eigen(self):
        """Eigendecompose the normalized Laplacian D^-1/2 (D - W) D^-1/2.

        Sets ``self.L``, ``self.lamb`` (eigenvalues), ``self.Psi``
        (eigenvectors as columns) and ``self.Lamb`` (diagonal eigenvalue matrix).

        Raises
        ------
        ValueError
            If a node has non-positive degree, for which the normalization
            is undefined.
        """
        d = self.W.sum(axis=1)
        if np.any(d <= 0):
            raise ValueError("every node needs a positive degree to normalize the Laplacian")
        inv_sqrt_d = 1 / np.sqrt(d)
        # Row/column scaling by broadcasting equals multiplying by the
        # diagonal matrices on both sides, without two dense matmuls.
        self.L = (np.diag(d) - self.W) * inv_sqrt_d[:, None] * inv_sqrt_d[None, :]
        self.lamb, self.Psi = np.linalg.eigh(self.L)
        self.Lamb = np.diag(self.lamb)

    def fit(self, sd=5):
        """Smooth ``self.y`` by thresholding its graph-Fourier power.

        Parameters
        ----------
        sd : float
            Forwarded to ``ebayesthresh`` as its ``sd`` keyword argument.
        """
        self._eigen()
        self.ybar = self.Psi.T @ self.y  # graph Fourier transform of y
        self.power = self.ybar**2
        ebayesthresh = importr('EbayesThresh').ebayesthresh
        self.power_threshed = np.array(ebayesthresh(FloatVector(self.power), sd=sd))
        # Keep a coefficient only if its thresholded power is positive.
        self.ybar_threshed = np.where(self.power_threshed > 0, self.ybar, 0)
        self.yhat = self.Psi @ self.ybar_threshed
        self.df = self.df.assign(yHat=self.yhat)
        self.df = self.df.assign(Residual=self.df.y - self.df.yHat)
# Adjacency matrix of a 1000-node path graph: each node is linked to its
# immediate neighbours (|i - j| == 1) and the diagonal stays zero.  Built from
# the two off-diagonals instead of a million-iteration Python loop.
w = np.eye(1000, k=1) + np.eye(1000, k=-1)
# Proposed method on the linear data: smooth, then flag squared residuals > 25.
_simul = SIMUL(_df)
_simul.fit(sd=20)
_flagged = _simul.df.query('Residual**2>25').reset_index()
outlier_simul_first_index = np.array(_flagged['index'])
outlier_simul_first_value = np.array(_flagged['y'])
# Labels in the scikit-learn convention: -1 = outlier, 1 = inlier.
outlier_simul_one = [
    -1 if sq_resid > 25 else 1
    for sq_resid in (_simul.df['Residual']**2).tolist()
]

# Confusion matrix against the ground truth, drawn as an annotated heat map.
conf_matrix = confusion_matrix(outlier_true_one, outlier_simul_one)
fig, ax = plt.subplots(figsize=(5, 5))
ax.matshow(conf_matrix, cmap=plt.cm.Oranges, alpha=0.3)
for (i, j), count in np.ndenumerate(conf_matrix):
    ax.text(x=j, y=i, s=count, va='center', ha='center', size='xx-large')
plt.xlabel('Predictions', fontsize=18)
plt.ylabel('Actuals', fontsize=18)
plt.title('Confusion Matrix', fontsize=18)
plt.show()

# The scores are called with their defaults on the -1/1 labels.
for fmt, metric in (
    ('Accuracy: %.3f', accuracy_score),
    ('Precision: %.3f', precision_score),
    ('Recall: %.3f', recall_score),
    ('F1 Score: %.3f', f1_score),
):
    print(fmt % metric(outlier_true_one, outlier_simul_one))
# Local Outlier Factor baseline on the linear data.
clf = LocalOutlierFactor(n_neighbors=2)
# Fit once and reuse the labels; the model used to be refitted for the
# confusion matrix and again for every single metric.
y_pred = clf.fit_predict(X)

conf_matrix = confusion_matrix(outlier_true_one, y_pred)
fig, ax = plt.subplots(figsize=(5, 5))
ax.matshow(conf_matrix, cmap=plt.cm.Oranges, alpha=0.3)
for i in range(conf_matrix.shape[0]):
    for j in range(conf_matrix.shape[1]):
        ax.text(x=j, y=i, s=conf_matrix[i, j], va='center', ha='center', size='xx-large')
plt.xlabel('Predictions', fontsize=18)
plt.ylabel('Actuals', fontsize=18)
plt.title('Confusion Matrix', fontsize=18)
plt.show()
print('Accuracy: %.3f' % accuracy_score(outlier_true_one, y_pred))
print('Precision: %.3f' % precision_score(outlier_true_one, y_pred))
print('Recall: %.3f' % recall_score(outlier_true_one, y_pred))
print('F1 Score: %.3f' % f1_score(outlier_true_one, y_pred))

# Scatter of the data with a circle per point whose size grows with its
# (rescaled) outlier score.
ground_truth = outlier_true_one
n_errors = (y_pred != ground_truth).sum()
X_scores = clf.negative_outlier_factor_
plt.figure(figsize=(10,6))
plt.title("Local Outlier Factor (LOF)")
plt.scatter(X[:, 0], X[:, 1], color="k", s=3.0, label="Data points")
# Rescale so the most negative score maps to radius 1 and the largest to 0.
radius = (X_scores.max() - X_scores) / (X_scores.max() - X_scores.min())
plt.scatter(
    X[:, 0],
    X[:, 1],
    s=1000 * radius,
    edgecolors="r",
    facecolors="none",
    label="Outlier scores",
)
plt.axis("tight")
# plt.xlim((-5, 5))
# plt.ylim((-5, 5))
plt.xlabel("prediction errors: %d" % (n_errors))
legend = plt.legend(loc="upper left")
# Matplotlib renamed Legend.legendHandles to legend_handles; support both.
_handles = getattr(legend, "legend_handles", None)
if _handles is None:
    _handles = legend.legendHandles
_handles[0]._sizes = [10]
_handles[1]._sizes = [20]
plt.show()
# One-Class SVM baseline on the linear data.
X = np.array(_df).reshape(-1,2)
clf = svm.OneClassSVM(nu=0.1, kernel="rbf", gamma=0.1)
clf.fit(X)
y_pred = clf.predict(X)
# predict() returns 1 for inliers and -1 for outliers.
outlier_OSVM_one = list(clf.predict(X))

conf_matrix = confusion_matrix(outlier_true_one, outlier_OSVM_one)
fig, ax = plt.subplots(figsize=(5, 5))
ax.matshow(conf_matrix, cmap=plt.cm.Oranges, alpha=0.3)
for (i, j), count in np.ndenumerate(conf_matrix):
    ax.text(x=j, y=i, s=count, va='center', ha='center', size='xx-large')
plt.xlabel('Predictions', fontsize=18)
plt.ylabel('Actuals', fontsize=18)
plt.title('Confusion Matrix', fontsize=18)
plt.show()

for fmt, metric in (
    ('Accuracy: %.3f', accuracy_score),
    ('Precision: %.3f', precision_score),
    ('Recall: %.3f', recall_score),
    ('F1 Score: %.3f', f1_score),
):
    print(fmt % metric(outlier_true_one, outlier_OSVM_one))
# Angle-Based Outlier Detection (plus CBLOF) on the linear data.
abod_clf = ABOD(contamination=0.05)
abod_clf.fit(_df[['x', 'y']])
cblof_clf = CBLOF(contamination=0.05,check_estimator=False, random_state=77)
cblof_clf.fit(_df[['x', 'y']])
# Store both label vectors next to the data for plotting.
_df['ABOD_Clf'] = abod_clf.labels_
_df['CBLOF_Clf'] = cblof_clf.labels_
sns.scatterplot(data = _df, x = 'x', y = 'y', hue = 'ABOD_Clf')
# Map label 0 to 1 and anything else to -1 to match the ground-truth coding.
outlier_ABOD_one = [1 if label == 0 else -1 for label in abod_clf.labels_]

conf_matrix = confusion_matrix(outlier_true_one, outlier_ABOD_one)
fig, ax = plt.subplots(figsize=(5, 5))
ax.matshow(conf_matrix, cmap=plt.cm.Oranges, alpha=0.3)
for (i, j), count in np.ndenumerate(conf_matrix):
    ax.text(x=j, y=i, s=count, va='center', ha='center', size='xx-large')
plt.xlabel('Predictions', fontsize=18)
plt.ylabel('Actuals', fontsize=18)
plt.title('Confusion Matrix', fontsize=18)
plt.show()

for fmt, metric in (
    ('Accuracy: %.3f', accuracy_score),
    ('Precision: %.3f', precision_score),
    ('Recall: %.3f', recall_score),
    ('F1 Score: %.3f', f1_score),
):
    print(fmt % metric(outlier_true_one, outlier_ABOD_one))
# Isolation Forest (alibi-detect) baseline on the linear data.
od = IForest(threshold=0., n_estimators=100)
od.fit(_df[['x', 'y']])
preds = od.predict(_df[['x', 'y']], return_instance_score=True)
_df['IF_alibi'] = preds['data']['is_outlier']
sns.scatterplot(data = _df, x = 'x', y = 'y', hue = 'IF_alibi')
# Map flag 0 to 1 and anything else to -1 to match the ground-truth coding.
outlier_alibi_one = [1 if flag == 0 else -1 for flag in _df['IF_alibi']]

conf_matrix = confusion_matrix(outlier_true_one, outlier_alibi_one)
fig, ax = plt.subplots(figsize=(5, 5))
ax.matshow(conf_matrix, cmap=plt.cm.Oranges, alpha=0.3)
for (i, j), count in np.ndenumerate(conf_matrix):
    ax.text(x=j, y=i, s=count, va='center', ha='center', size='xx-large')
plt.xlabel('Predictions', fontsize=18)
plt.ylabel('Actuals', fontsize=18)
plt.title('Confusion Matrix', fontsize=18)
plt.show()

for fmt, metric in (
    ('Accuracy: %.3f', accuracy_score),
    ('Precision: %.3f', precision_score),
    ('Recall: %.3f', recall_score),
    ('F1 Score: %.3f', f1_score),
):
    print(fmt % metric(outlier_true_one, outlier_alibi_one))
# Build the 2-D dataset: points on a wavy closed curve carrying a noisy signal.
# Seed NumPy's global random generator.
np.random.seed(777)
pi=np.pi
n=1000
# n equally spaced angles covering [-pi, pi).
ang=np.linspace(-pi,pi-2*pi/n,n)
# Radius oscillating between 4 and 6 (six full cosine periods).
r=5+np.cos(np.linspace(0,12*pi,n))
# Cartesian coordinates of the sample locations.
vx=r*np.cos(ang)
vy=r*np.sin(ang)
# Smooth underlying signal: three sine periods with amplitude 10.
f1=10*np.sin(np.linspace(0,6*pi,n))
# Observed signal: smooth part plus `x`, the disturbance exported from the R cell.
f = f1 + x
_df = pd.DataFrame({'x' : vx, 'y' : vy, 'f' : f})
# Feature matrix with the columns x, y, f for the baseline detectors.
X = np.array(_df)
class SIMUL:
    """Graph-Fourier smoother for a signal ``f`` observed at planar points.

    Typical use: ``get_distance()`` -> ``get_weightmatrix(...)`` -> ``fit(...)``.
    ``fit`` stores the smoothed signal and the residual in ``self.df`` as the
    columns ``fHat`` and ``Residual``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``x``, ``y`` (locations) and ``f`` (signal).
    """

    def __init__(self, df):
        self.df = df
        self.f = df.f.to_numpy()
        self.x = df.x.to_numpy()
        self.y = df.y.to_numpy()
        self.n = len(self.f)
        self.theta = None

    def get_distance(self):
        """Store the pairwise Euclidean distances between locations in ``self.D``."""
        # Vectorized replacement for the former O(n^2) Python double loop.
        dx = np.subtract.outer(self.x, self.x)
        dy = np.subtract.outer(self.y, self.y)
        self.D = np.hypot(dx, dy)

    def get_weightmatrix(self, theta=1, beta=0.5, kappa=4000):
        """Build Gaussian edge weights ``exp(-(D/theta)**2)`` in ``self.W``.

        Parameters
        ----------
        theta : float
            Length scale of the Gaussian kernel.
        beta : float
            Unused; kept so existing calls keep working.
        kappa : float
            Distance cut-off.  Pairs at distance ``>= kappa`` get weight 0.
            Previously their distance was replaced by 0 before the kernel was
            applied, which gave them the maximum weight 1 instead.
        """
        self.theta = theta
        kernel = np.exp(-(self.D / self.theta) ** 2)
        self.W = np.where(self.D < kappa, kernel, 0.0)

    def _eigen(self):
        """Eigendecompose the normalized Laplacian D^-1/2 (D - W) D^-1/2.

        Sets ``self.L``, ``self.lamb``, ``self.Psi`` and ``self.Lamb``.

        Raises
        ------
        ValueError
            If a node has non-positive degree, for which the normalization
            is undefined.
        """
        d = self.W.sum(axis=1)
        if np.any(d <= 0):
            raise ValueError("every node needs a positive degree to normalize the Laplacian")
        inv_sqrt_d = 1 / np.sqrt(d)
        # Scaling rows and columns by broadcasting equals the two diagonal matmuls.
        self.L = (np.diag(d) - self.W) * inv_sqrt_d[:, None] * inv_sqrt_d[None, :]
        self.lamb, self.Psi = np.linalg.eigh(self.L)
        self.Lamb = np.diag(self.lamb)

    def fit(self, sd=5, ref=20):
        """Smooth ``self.f`` by thresholding its graph-Fourier power.

        Parameters
        ----------
        sd : float
            Forwarded to ``ebayesthresh`` as its ``sd`` keyword argument.
        ref : float
            Unused; kept so existing calls keep working.
        """
        self._eigen()
        self.fbar = self.Psi.T @ self.f  # graph Fourier transform of f
        self.power = self.fbar**2
        ebayesthresh = importr('EbayesThresh').ebayesthresh
        self.power_threshed = np.array(ebayesthresh(FloatVector(self.power), sd=sd))
        # Keep a coefficient only if its thresholded power is positive.
        self.fbar_threshed = np.where(self.power_threshed > 0, self.fbar, 0)
        self.fhat = self.Psi @ self.fbar_threshed
        self.df = self.df.assign(fHat=self.fhat)
        self.df = self.df.assign(Residual=self.df.f - self.df.fHat)
        # Plot-geometry attributes set here; nothing in this class reads them.
        self.bottom = np.zeros_like(self.f)
        self.width = 0.05
        self.depth = 0.05
# Proposed method on the 2-D data: smooth, then flag squared residuals > 20.
_simul = SIMUL(_df)
_simul.get_distance()
# Kernel length scale = mean of the strictly positive pairwise distances.
_simul.get_weightmatrix(theta=(_simul.D[_simul.D>0].mean()),kappa=2500)
_simul.fit(sd=15,ref=20)
# Run the query once and reuse the result.
_flagged = _simul.df.query('Residual**2>20').reset_index()
outlier_simul_first_index = np.array(_flagged['index'])
# The observed signal of this dataset is the column 'f'; 'y' is only a
# coordinate here, so reading 'y' returned locations instead of values.
outlier_simul_first_value = np.array(_flagged['f'])
# Labels in the scikit-learn convention: -1 = outlier, 1 = inlier.
outlier_simul_one = [
    -1 if sq_resid > 20 else 1
    for sq_resid in (_simul.df['Residual']**2).tolist()
]

# 3-D scatter: all observations in green, flagged ones in red.
p=plt.figure(figsize=(12,4), dpi=200)  # Make figure object
ax=p.add_subplot(1,1,1, projection='3d')
ax.grid(False)
ax.ticklabel_format(style='sci', axis='x',scilimits=(0,0))
ax.ticklabel_format(style='sci', axis='y',scilimits=(0,0))
ax.ticklabel_format(style='sci', axis='z',scilimits=(0,0))
# Only referenced by the commented-out bar3d call below.
top = f
bottom = np.zeros_like(top)
width=depth=0.05
ax.scatter3D(vx,vy,f,zdir='z',s=10,marker='.',c='green',alpha=0.2)
_hits = _simul.df.loc[outlier_simul_first_index]
ax.scatter3D(_hits['x'],_hits['y'],_hits['f'], zdir='z',s=10,marker='.',c='red',alpha=0.5)
# ax.bar3d(vx, vy, bottom, width, depth, 0, color='Black',shade=False)

conf_matrix = confusion_matrix(outlier_true_one, outlier_simul_one)
fig, ax = plt.subplots(figsize=(5, 5))
ax.matshow(conf_matrix, cmap=plt.cm.Oranges, alpha=0.3)
for i in range(conf_matrix.shape[0]):
    for j in range(conf_matrix.shape[1]):
        ax.text(x=j, y=i, s=conf_matrix[i, j], va='center', ha='center', size='xx-large')
plt.xlabel('Predictions', fontsize=18)
plt.ylabel('Actuals', fontsize=18)
plt.title('Confusion Matrix', fontsize=18)
plt.show()
print('Accuracy: %.3f' % accuracy_score(outlier_true_one, outlier_simul_one))
print('Precision: %.3f' % precision_score(outlier_true_one, outlier_simul_one))
print('Recall: %.3f' % recall_score(outlier_true_one, outlier_simul_one))
print('F1 Score: %.3f' % f1_score(outlier_true_one, outlier_simul_one))
# Local Outlier Factor on the 3-column feature matrix (x, y, f).
clf = LocalOutlierFactor(n_neighbors=2)
ground_truth=outlier_true_one
y_pred = clf.fit_predict(X)
n_errors = (y_pred != ground_truth).sum()
X_scores = clf.negative_outlier_factor_
# A plt.figure(figsize=(10,6)) call used to sit here; it only opened an empty
# extra figure because the 3-D figure below is created right after it.
p=plt.figure(figsize=(12,4), dpi=200)  # Make figure object
ax=p.add_subplot(1,1,1, projection='3d')
ax.grid(False)
ax.ticklabel_format(style='sci', axis='x',scilimits=(0,0))
ax.ticklabel_format(style='sci', axis='y',scilimits=(0,0))
ax.ticklabel_format(style='sci', axis='z',scilimits=(0,0))
# ax.title("Local Outlier Factor (LOF)")
ax.scatter3D(X[:, 0], X[:, 1],X[:, 2], color="k", s=3.0, label="Data points")
# Rescale so the most negative score maps to radius 1 and the largest to 0.
radius = (X_scores.max() - X_scores) / (X_scores.max() - X_scores.min())
ax.scatter3D(
    X[:, 0],
    X[:, 1],
    X[:, 2],
    s=1000 * radius,
    edgecolors="r",
    facecolors="none",
    label="Outlier scores",
)
ax.axis("tight")
# plt.xlim((-5, 5))
# plt.ylim((-5, 5))
# ax.xlabel("prediction errors: %d" % (n_errors))
legend = ax.legend(loc="upper left")
# Matplotlib renamed Legend.legendHandles to legend_handles; support both.
_handles = getattr(legend, "legend_handles", None)
if _handles is None:
    _handles = legend.legendHandles
_handles[0]._sizes = [10]
_handles[1]._sizes = [20]
# ax.show()
# 3-D scatter: all observations in green, LOF-flagged ones in blue.
p=plt.figure(figsize=(12,4), dpi=200)  # Make figure object
ax=p.add_subplot(1,1,1, projection='3d')
ax.grid(False)
ax.ticklabel_format(style='sci', axis='x',scilimits=(0,0))
ax.ticklabel_format(style='sci', axis='y',scilimits=(0,0))
ax.ticklabel_format(style='sci', axis='z',scilimits=(0,0))
# Only referenced by the commented-out bar3d call below.
top = f
bottom = np.zeros_like(top)
width=depth=0.05
ax.scatter3D(vx,vy,f,zdir='z',s=10,marker='.',c='green',alpha=0.2)
# Recompute the flagged rows for the current X first: at this point
# `outlier_first_index` still holds the result from the earlier linear dataset.
outlier_first_index = np.where(clf.fit_predict(X)==-1)
_lof_rows = _df.iloc[outlier_first_index[0]]
ax.scatter3D(_lof_rows['x'],_lof_rows['y'],_lof_rows['f'], zdir='z',s=10,marker='.',c='blue',alpha=0.5)
# ax.bar3d(vx, vy, bottom, width, depth, 0, color='Black',shade=False)
outlier
# LOF labels for the current X; fit once instead of refitting for every use.
_lof_pred = clf.fit_predict(X)
outlier_first_index = np.where(_lof_pred==-1)
outlier_first_value = _lof_pred[_lof_pred==-1]

conf_matrix = confusion_matrix(outlier_true_one, _lof_pred)
fig, ax = plt.subplots(figsize=(5, 5))
ax.matshow(conf_matrix, cmap=plt.cm.Oranges, alpha=0.3)
for i in range(conf_matrix.shape[0]):
    for j in range(conf_matrix.shape[1]):
        ax.text(x=j, y=i, s=conf_matrix[i, j], va='center', ha='center', size='xx-large')
plt.xlabel('Predictions', fontsize=18)
plt.ylabel('Actuals', fontsize=18)
plt.title('Confusion Matrix', fontsize=18)
plt.show()
print('Accuracy: %.3f' % accuracy_score(outlier_true_one, _lof_pred))
print('Precision: %.3f' % precision_score(outlier_true_one, _lof_pred))
print('Recall: %.3f' % recall_score(outlier_true_one, _lof_pred))
print('F1 Score: %.3f' % f1_score(outlier_true_one, _lof_pred))
# One-Class SVM baseline on the (x, y, f) data.
X = np.array(_df).reshape(-1,3)
clf = svm.OneClassSVM(nu=0.1, kernel="rbf", gamma=0.1)
clf.fit(X)
y_pred = clf.predict(X)
# predict() returns 1 for inliers and -1 for outliers.
outlier_OSVM_one = list(clf.predict(X))

conf_matrix = confusion_matrix(outlier_true_one, outlier_OSVM_one)
fig, ax = plt.subplots(figsize=(5, 5))
ax.matshow(conf_matrix, cmap=plt.cm.Oranges, alpha=0.3)
for (i, j), count in np.ndenumerate(conf_matrix):
    ax.text(x=j, y=i, s=count, va='center', ha='center', size='xx-large')
plt.xlabel('Predictions', fontsize=18)
plt.ylabel('Actuals', fontsize=18)
plt.title('Confusion Matrix', fontsize=18)
plt.show()

for fmt, metric in (
    ('Accuracy: %.3f', accuracy_score),
    ('Precision: %.3f', precision_score),
    ('Recall: %.3f', recall_score),
    ('F1 Score: %.3f', f1_score),
):
    print(fmt % metric(outlier_true_one, outlier_OSVM_one))
# Angle-Based Outlier Detection (plus CBLOF) on the (x, y, f) data.
abod_clf = ABOD(contamination=0.05)
abod_clf.fit(_df[['x', 'y','f']])
cblof_clf = CBLOF(contamination=0.05,check_estimator=False, random_state=77)
cblof_clf.fit(_df[['x', 'y', 'f']])
# Store both label vectors next to the data.
_df['ABOD_Clf'] = abod_clf.labels_
_df['CBLOF_Clf'] = cblof_clf.labels_
# Map label 0 to 1 and anything else to -1 to match the ground-truth coding.
outlier_ABOD_one = [1 if label == 0 else -1 for label in abod_clf.labels_]

conf_matrix = confusion_matrix(outlier_true_one, outlier_ABOD_one)
fig, ax = plt.subplots(figsize=(5, 5))
ax.matshow(conf_matrix, cmap=plt.cm.Oranges, alpha=0.3)
for (i, j), count in np.ndenumerate(conf_matrix):
    ax.text(x=j, y=i, s=count, va='center', ha='center', size='xx-large')
plt.xlabel('Predictions', fontsize=18)
plt.ylabel('Actuals', fontsize=18)
plt.title('Confusion Matrix', fontsize=18)
plt.show()

for fmt, metric in (
    ('Accuracy: %.3f', accuracy_score),
    ('Precision: %.3f', precision_score),
    ('Recall: %.3f', recall_score),
    ('F1 Score: %.3f', f1_score),
):
    print(fmt % metric(outlier_true_one, outlier_ABOD_one))
# Isolation Forest on the (x, y, f) data; `od` is the detector object created
# earlier and is refitted here.
od.fit(_df[['x', 'y','f']])
preds = od.predict(_df[['x', 'y','f']], return_instance_score=True)
_df['IF_alibi'] = preds['data']['is_outlier']
sns.scatterplot(data = _df, x = 'x', y = 'y', hue = 'IF_alibi')
# Map flag 0 to 1 and anything else to -1 to match the ground-truth coding.
outlier_alibi_one = [1 if flag == 0 else -1 for flag in _df['IF_alibi']]

conf_matrix = confusion_matrix(outlier_true_one, outlier_alibi_one)
fig, ax = plt.subplots(figsize=(5, 5))
ax.matshow(conf_matrix, cmap=plt.cm.Oranges, alpha=0.3)
for (i, j), count in np.ndenumerate(conf_matrix):
    ax.text(x=j, y=i, s=count, va='center', ha='center', size='xx-large')
plt.xlabel('Predictions', fontsize=18)
plt.ylabel('Actuals', fontsize=18)
plt.title('Confusion Matrix', fontsize=18)
plt.show()

for fmt, metric in (
    ('Accuracy: %.3f', accuracy_score),
    ('Precision: %.3f', precision_score),
    ('Recall: %.3f', recall_score),
    ('F1 Score: %.3f', f1_score),
):
    print(fmt % metric(outlier_true_one, outlier_alibi_one))
# Quadratic trend y = 5x^2 on [0, 2] observed with the simulated disturbance.
_x = np.linspace(0,2,1000)
_y1 = 5*_x**2
_y = _y1 + x  # x is signal + epsilon from the R cell (shocks plus noise)
_df=pd.DataFrame({'x':_x, 'y':_y})
X = np.array(_df)

# Local Outlier Factor baseline; fit once and reuse the labels below.
clf = LocalOutlierFactor(n_neighbors=2)
ground_truth=outlier_true_one
y_pred = clf.fit_predict(X)
n_errors = (y_pred != ground_truth).sum()
X_scores = clf.negative_outlier_factor_
plt.figure(figsize=(10,6))
plt.title("Local Outlier Factor (LOF)")
plt.scatter(X[:, 0], X[:, 1], color="k", s=3.0, label="Data points")
# Rescale so the most negative score maps to radius 1 and the largest to 0.
radius = (X_scores.max() - X_scores) / (X_scores.max() - X_scores.min())
plt.scatter(
    X[:, 0],
    X[:, 1],
    s=1000 * radius,
    edgecolors="r",
    facecolors="none",
    label="Outlier scores",
)
plt.axis("tight")
# plt.xlim((-5, 5))
# plt.ylim((-5, 5))
plt.xlabel("prediction errors: %d" % (n_errors))
legend = plt.legend(loc="upper left")
# Matplotlib renamed Legend.legendHandles to legend_handles; support both.
_handles = getattr(legend, "legend_handles", None)
if _handles is None:
    _handles = legend.legendHandles
_handles[0]._sizes = [10]
_handles[1]._sizes = [20]
plt.show()
# Positions (and labels) of the samples LOF flags, taken from the single fit.
outlier_first_index = np.where(y_pred==-1)
outlier_first_value = y_pred[y_pred==-1]
class SIMUL:
    """Graph-Fourier smoother for a signal observed on a fixed graph.

    ``fit`` projects ``df.y`` onto the eigenvectors of the normalized graph
    Laplacian, thresholds the squared coefficients with R's
    ``EbayesThresh::ebayesthresh`` and transforms the surviving coefficients
    back.  The result is stored in ``self.df`` as the columns ``yHat`` and
    ``Residual``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``x`` and ``y``.
    W : array-like, optional
        Square weight matrix of the graph.  When omitted, the module-level
        matrix ``w`` is used, which is what the class always did before.

    Raises
    ------
    ValueError
        If the weight matrix is not ``n x n`` for ``n = len(df.y)``.
    """

    def __init__(self, df, W=None):
        self.df = df
        self.y = df.y.to_numpy()
        #self.y1 = df.y1.to_numpy()
        self.x = df.x.to_numpy()
        self.n = len(self.y)
        # The global `w` is only looked up when no matrix is passed in.
        self.W = w if W is None else np.asarray(W, dtype=float)
        if self.W.shape != (self.n, self.n):
            raise ValueError(
                "weight matrix must have shape (%d, %d), got %s"
                % (self.n, self.n, self.W.shape)
            )

    def _eigen(self):
        """Eigendecompose the normalized Laplacian D^-1/2 (D - W) D^-1/2.

        Sets ``self.L``, ``self.lamb`` (eigenvalues), ``self.Psi``
        (eigenvectors as columns) and ``self.Lamb`` (diagonal eigenvalue matrix).

        Raises
        ------
        ValueError
            If a node has non-positive degree, for which the normalization
            is undefined.
        """
        d = self.W.sum(axis=1)
        if np.any(d <= 0):
            raise ValueError("every node needs a positive degree to normalize the Laplacian")
        inv_sqrt_d = 1 / np.sqrt(d)
        # Row/column scaling by broadcasting equals multiplying by the
        # diagonal matrices on both sides, without two dense matmuls.
        self.L = (np.diag(d) - self.W) * inv_sqrt_d[:, None] * inv_sqrt_d[None, :]
        self.lamb, self.Psi = np.linalg.eigh(self.L)
        self.Lamb = np.diag(self.lamb)

    def fit(self, sd=5):
        """Smooth ``self.y`` by thresholding its graph-Fourier power.

        Parameters
        ----------
        sd : float
            Forwarded to ``ebayesthresh`` as its ``sd`` keyword argument.
        """
        self._eigen()
        self.ybar = self.Psi.T @ self.y  # graph Fourier transform of y
        self.power = self.ybar**2
        ebayesthresh = importr('EbayesThresh').ebayesthresh
        self.power_threshed = np.array(ebayesthresh(FloatVector(self.power), sd=sd))
        # Keep a coefficient only if its thresholded power is positive.
        self.ybar_threshed = np.where(self.power_threshed > 0, self.ybar, 0)
        self.yhat = self.Psi @ self.ybar_threshed
        self.df = self.df.assign(yHat=self.yhat)
        self.df = self.df.assign(Residual=self.df.y - self.df.yHat)
# Adjacency matrix of a 1000-node path graph: each node is linked to its
# immediate neighbours (|i - j| == 1) and the diagonal stays zero.  Built from
# the two off-diagonals instead of a million-iteration Python loop.
w = np.eye(1000, k=1) + np.eye(1000, k=-1)
# Proposed method on the current dataset: smooth with the default sd, then
# flag samples whose squared residual exceeds 4.5.
_simul = SIMUL(_df)
_simul.fit()
# Row labels and observed values of the flagged samples.
outlier_simul_first_index = np.array(_simul.df.query('Residual**2>4.5').reset_index()['index'])
outlier_simul_first_value = np.array(_simul.df.query('Residual**2>4.5').reset_index()['y'])
# Labels in the scikit-learn convention: -1 = outlier, 1 = inlier.
outlier_simul_one = (_simul.df['Residual']**2).tolist()
outlier_simul_one = list(map(lambda x: -1 if x > 4.5 else 1,outlier_simul_one))
# Samples selected by `outlier_first_index` (red dots) over trend + shocks (green).
plt.figure(figsize=(10,6))
plt.plot(_df.loc[outlier_first_index].reset_index()['index'],_df.loc[outlier_first_index].reset_index()['y'],'r.')
plt.plot(_y1+signal,'go',alpha=0.3)
# Samples flagged by the proposed method (blue dots) over trend + shocks (green).
plt.figure(figsize=(10,6))
plt.plot(outlier_simul_first_index,outlier_simul_first_value,'b.')
plt.plot(_y1+signal,'go',alpha=0.3)
# Evaluate `clf` on X.  Fit once and reuse the labels; the model used to be
# refitted for the confusion matrix and again for every metric.
_lof_pred = clf.fit_predict(X)
conf_matrix = confusion_matrix(outlier_true_one, _lof_pred)
fig, ax = plt.subplots(figsize=(5, 5))
ax.matshow(conf_matrix, cmap=plt.cm.Oranges, alpha=0.3)
for i in range(conf_matrix.shape[0]):
    for j in range(conf_matrix.shape[1]):
        ax.text(x=j, y=i, s=conf_matrix[i, j], va='center', ha='center', size='xx-large')
plt.xlabel('Predictions', fontsize=18)
plt.ylabel('Actuals', fontsize=18)
plt.title('Confusion Matrix', fontsize=18)
plt.show()
print('Accuracy: %.3f' % accuracy_score(outlier_true_one, _lof_pred))
print('Precision: %.3f' % precision_score(outlier_true_one, _lof_pred))
print('Recall: %.3f' % recall_score(outlier_true_one, _lof_pred))
print('F1 Score: %.3f' % f1_score(outlier_true_one, _lof_pred))
# Confusion matrix and scores of the proposed method against the ground truth.
conf_matrix = confusion_matrix(outlier_true_one, outlier_simul_one)
fig, ax = plt.subplots(figsize=(5, 5))
ax.matshow(conf_matrix, cmap=plt.cm.Oranges, alpha=0.3)
for (i, j), count in np.ndenumerate(conf_matrix):
    ax.text(x=j, y=i, s=count, va='center', ha='center', size='xx-large')
plt.xlabel('Predictions', fontsize=18)
plt.ylabel('Actuals', fontsize=18)
plt.title('Confusion Matrix', fontsize=18)
plt.show()

for fmt, metric in (
    ('Accuracy: %.3f', accuracy_score),
    ('Precision: %.3f', precision_score),
    ('Recall: %.3f', recall_score),
    ('F1 Score: %.3f', f1_score),
):
    print(fmt % metric(outlier_true_one, outlier_simul_one))
# Sum-of-sines trend on [0, 2] observed with the simulated disturbance.
_x = np.linspace(0,2,1000)
_y1 = 3*np.sin(_x) + 1*np.sin(_x**2) + 5*np.sin(5*_x)
_y = _y1 + x  # x is signal + epsilon from the R cell (shocks plus noise)
_df=pd.DataFrame({'x':_x, 'y':_y})
X = np.array(_df)

# Local Outlier Factor baseline.
clf = LocalOutlierFactor(n_neighbors=2)
ground_truth=outlier_true_one
y_pred = clf.fit_predict(X)
n_errors = (y_pred != ground_truth).sum()
X_scores = clf.negative_outlier_factor_
plt.figure(figsize=(10,6))
plt.title("Local Outlier Factor (LOF)")
plt.scatter(X[:, 0], X[:, 1], color="k", s=3.0, label="Data points")
# Rescale so the most negative score maps to radius 1 and the largest to 0.
radius = (X_scores.max() - X_scores) / (X_scores.max() - X_scores.min())
plt.scatter(
    X[:, 0],
    X[:, 1],
    s=1000 * radius,
    edgecolors="r",
    facecolors="none",
    label="Outlier scores",
)
plt.axis("tight")
# plt.xlim((-5, 5))
# plt.ylim((-5, 5))
plt.xlabel("prediction errors: %d" % (n_errors))
legend = plt.legend(loc="upper left")
# Matplotlib renamed Legend.legendHandles to legend_handles; support both.
_handles = getattr(legend, "legend_handles", None)
if _handles is None:
    _handles = legend.legendHandles
_handles[0]._sizes = [10]
_handles[1]._sizes = [20]
plt.show()
outlier
# Positions (and labels) of the samples `clf` flags on X.  Fit once and reuse
# the labels instead of refitting the model three times.
_lof_labels = clf.fit_predict(X)
outlier_first_index = np.where(_lof_labels==-1)
outlier_first_value = _lof_labels[_lof_labels==-1]
class SIMUL:
    """Graph-Fourier smoother for a signal observed on a fixed graph.

    ``fit`` projects ``df.y`` onto the eigenvectors of the normalized graph
    Laplacian, thresholds the squared coefficients with R's
    ``EbayesThresh::ebayesthresh`` and transforms the surviving coefficients
    back.  The result is stored in ``self.df`` as the columns ``yHat`` and
    ``Residual``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``x`` and ``y``.
    W : array-like, optional
        Square weight matrix of the graph.  When omitted, the module-level
        matrix ``w`` is used, which is what the class always did before.

    Raises
    ------
    ValueError
        If the weight matrix is not ``n x n`` for ``n = len(df.y)``.
    """

    def __init__(self, df, W=None):
        self.df = df
        self.y = df.y.to_numpy()
        #self.y1 = df.y1.to_numpy()
        self.x = df.x.to_numpy()
        self.n = len(self.y)
        # The global `w` is only looked up when no matrix is passed in.
        self.W = w if W is None else np.asarray(W, dtype=float)
        if self.W.shape != (self.n, self.n):
            raise ValueError(
                "weight matrix must have shape (%d, %d), got %s"
                % (self.n, self.n, self.W.shape)
            )

    def _eigen(self):
        """Eigendecompose the normalized Laplacian D^-1/2 (D - W) D^-1/2.

        Sets ``self.L``, ``self.lamb`` (eigenvalues), ``self.Psi``
        (eigenvectors as columns) and ``self.Lamb`` (diagonal eigenvalue matrix).

        Raises
        ------
        ValueError
            If a node has non-positive degree, for which the normalization
            is undefined.
        """
        d = self.W.sum(axis=1)
        if np.any(d <= 0):
            raise ValueError("every node needs a positive degree to normalize the Laplacian")
        inv_sqrt_d = 1 / np.sqrt(d)
        # Row/column scaling by broadcasting equals multiplying by the
        # diagonal matrices on both sides, without two dense matmuls.
        self.L = (np.diag(d) - self.W) * inv_sqrt_d[:, None] * inv_sqrt_d[None, :]
        self.lamb, self.Psi = np.linalg.eigh(self.L)
        self.Lamb = np.diag(self.lamb)

    def fit(self, sd=5):
        """Smooth ``self.y`` by thresholding its graph-Fourier power.

        Parameters
        ----------
        sd : float
            Forwarded to ``ebayesthresh`` as its ``sd`` keyword argument.
        """
        self._eigen()
        self.ybar = self.Psi.T @ self.y  # graph Fourier transform of y
        self.power = self.ybar**2
        ebayesthresh = importr('EbayesThresh').ebayesthresh
        self.power_threshed = np.array(ebayesthresh(FloatVector(self.power), sd=sd))
        # Keep a coefficient only if its thresholded power is positive.
        self.ybar_threshed = np.where(self.power_threshed > 0, self.ybar, 0)
        self.yhat = self.Psi @ self.ybar_threshed
        self.df = self.df.assign(yHat=self.yhat)
        self.df = self.df.assign(Residual=self.df.y - self.df.yHat)
# Adjacency matrix of a 1000-node path graph: each node is linked to its
# immediate neighbours (|i - j| == 1) and the diagonal stays zero.  Built from
# the two off-diagonals instead of a million-iteration Python loop.
w = np.eye(1000, k=1) + np.eye(1000, k=-1)
# Proposed method on the current dataset: smooth with the default sd, then
# flag samples whose squared residual exceeds 4.
_simul = SIMUL(_df)
_simul.fit()
# Row labels and observed values of the flagged samples.
outlier_simul_first_index = np.array(_simul.df.query('Residual**2>4').reset_index()['index'])
outlier_simul_first_value = np.array(_simul.df.query('Residual**2>4').reset_index()['y'])
# Labels in the scikit-learn convention: -1 = outlier, 1 = inlier.
outlier_simul_one = (_simul.df['Residual']**2).tolist()
outlier_simul_one = list(map(lambda x: -1 if x > 4 else 1,outlier_simul_one))
# Samples selected by `outlier_first_index` (red dots) over trend + shocks (green).
plt.figure(figsize=(10,6))
plt.plot(_df.loc[outlier_first_index].reset_index()['index'],_df.loc[outlier_first_index].reset_index()['y'],'r.')
plt.plot(_y1+signal,'go',alpha=0.3)
# Samples flagged by the proposed method (blue dots) over trend + shocks (green).
plt.figure(figsize=(10,6))
plt.plot(outlier_simul_first_index,outlier_simul_first_value,'b.')
plt.plot(_y1+signal,'go',alpha=0.3)
# Evaluate `clf` on X.  Fit once and reuse the labels; the model used to be
# refitted for the confusion matrix and again for every metric.
_lof_pred = clf.fit_predict(X)
conf_matrix = confusion_matrix(outlier_true_one, _lof_pred)
fig, ax = plt.subplots(figsize=(5, 5))
ax.matshow(conf_matrix, cmap=plt.cm.Oranges, alpha=0.3)
for i in range(conf_matrix.shape[0]):
    for j in range(conf_matrix.shape[1]):
        ax.text(x=j, y=i, s=conf_matrix[i, j], va='center', ha='center', size='xx-large')
plt.xlabel('Predictions', fontsize=18)
plt.ylabel('Actuals', fontsize=18)
plt.title('Confusion Matrix', fontsize=18)
plt.show()
print('Accuracy: %.3f' % accuracy_score(outlier_true_one, _lof_pred))
print('Precision: %.3f' % precision_score(outlier_true_one, _lof_pred))
print('Recall: %.3f' % recall_score(outlier_true_one, _lof_pred))
print('F1 Score: %.3f' % f1_score(outlier_true_one, _lof_pred))
# Confusion matrix and scores of the proposed method against the ground truth.
conf_matrix = confusion_matrix(outlier_true_one, outlier_simul_one)
fig, ax = plt.subplots(figsize=(5, 5))
ax.matshow(conf_matrix, cmap=plt.cm.Oranges, alpha=0.3)
for (i, j), count in np.ndenumerate(conf_matrix):
    ax.text(x=j, y=i, s=count, va='center', ha='center', size='xx-large')
plt.xlabel('Predictions', fontsize=18)
plt.ylabel('Actuals', fontsize=18)
plt.title('Confusion Matrix', fontsize=18)
plt.show()

for fmt, metric in (
    ('Accuracy: %.3f', accuracy_score),
    ('Precision: %.3f', precision_score),
    ('Recall: %.3f', recall_score),
    ('F1 Score: %.3f', f1_score),
):
    print(fmt % metric(outlier_true_one, outlier_simul_one))
# Sum-of-cosines trend on [0, 2] observed with the simulated disturbance
# (`x` is signal + epsilon from the R cell).
_x = np.linspace(0,2,1000)
_y1 = -2+ 3*np.cos(_x) + 1*np.cos(2*_x) + 5*np.cos(5*_x)
_y = _y1 + x
_df=pd.DataFrame({'x':_x, 'y':_y})
X = np.array(_df)

# Local Outlier Factor baseline.
clf = LocalOutlierFactor(n_neighbors=2)
ground_truth=outlier_true_one
y_pred = clf.fit_predict(X)
n_errors = (y_pred != ground_truth).sum()
X_scores = clf.negative_outlier_factor_
plt.figure(figsize=(10,6))
plt.title("Local Outlier Factor (LOF)")
plt.scatter(X[:, 0], X[:, 1], color="k", s=3.0, label="Data points")
# Rescale so the most negative score maps to radius 1 and the largest to 0.
radius = (X_scores.max() - X_scores) / (X_scores.max() - X_scores.min())
plt.scatter(
    X[:, 0],
    X[:, 1],
    s=1000 * radius,
    edgecolors="r",
    facecolors="none",
    label="Outlier scores",
)
plt.axis("tight")
# plt.xlim((-5, 5))
# plt.ylim((-5, 5))
plt.xlabel("prediction errors: %d" % (n_errors))
legend = plt.legend(loc="upper left")
# Matplotlib renamed Legend.legendHandles to legend_handles; support both.
_handles = getattr(legend, "legend_handles", None)
if _handles is None:
    _handles = legend.legendHandles
_handles[0]._sizes = [10]
_handles[1]._sizes = [20]
plt.show()
outlier
# Positions (and labels) of the samples `clf` flags on X.  Fit once and reuse
# the labels instead of refitting the model three times.
_lof_labels = clf.fit_predict(X)
outlier_first_index = np.where(_lof_labels==-1)
outlier_first_value = _lof_labels[_lof_labels==-1]
class SIMUL:
    """Smooth a 1-D signal in the graph-Fourier domain and expose residuals.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs ``x`` and ``y`` columns; ``y`` is the signal that gets smoothed.
    W : numpy.ndarray of shape (n, n), optional
        Weight (adjacency) matrix of the graph. When omitted, the module-level
        ``w`` is used, looked up at construction time as before.
    """

    def __init__(self, df, W=None):
        self.df = df
        self.y = df.y.to_numpy()
        self.x = df.x.to_numpy()
        self.n = len(self.y)
        self.W = w if W is None else W

    def _eigen(self):
        """Eigendecompose the normalized Laplacian D^-1/2 (D - W) D^-1/2.

        Sets ``L``, ``lamb`` (eigenvalues), ``Psi`` (eigenvectors as columns)
        and ``Lamb`` (diagonal eigenvalue matrix).
        """
        d = self.W.sum(axis=1)
        # Zero-degree vertices get a zero scale factor instead of 1/0, so L
        # stays finite when the graph has isolated vertices.
        inv_sqrt_d = np.zeros(len(d))
        connected = d > 0
        inv_sqrt_d[connected] = 1 / np.sqrt(d[connected])
        # Row/column scaling by broadcasting avoids two dense n x n matmuls.
        self.L = inv_sqrt_d[:, None] * (np.diag(d) - self.W) * inv_sqrt_d[None, :]
        self.lamb, self.Psi = np.linalg.eigh(self.L)
        self.Lamb = np.diag(self.lamb)

    def fit(self, sd=5):
        """Threshold the squared Fourier coefficients and rebuild the signal.

        ``sd`` is forwarded as the ``sd`` argument of R's
        ``EbayesThresh::ebayesthresh``. Coefficients whose thresholded power is
        not positive are zeroed. Adds ``yHat`` and ``Residual`` (``y - yHat``)
        columns to ``self.df``.
        """
        self._eigen()
        self.ybar = self.Psi.T @ self.y  # graph Fourier transform of y
        self.power = self.ybar**2
        ebayesthresh = importr('EbayesThresh').ebayesthresh
        self.power_threshed = np.array(ebayesthresh(FloatVector(self.power), sd=sd))
        self.ybar_threshed = np.where(self.power_threshed > 0, self.ybar, 0)
        self.yhat = self.Psi @ self.ybar_threshed
        self.df = self.df.assign(yHat=self.yhat)
        self.df = self.df.assign(Residual=self.df.y - self.df.yHat)
# Adjacency matrix of a 1000-node path graph: w[i, j] = 1 exactly when
# |i - j| == 1, zero elsewhere (including the diagonal). Filled with index
# assignment instead of a 10^6-iteration Python double loop.
w = np.zeros((1000, 1000))
_k = np.arange(1000 - 1)
w[_k, _k + 1] = 1
w[_k + 1, _k] = 1
# Proposed method on the cosine-mixture data: a point is an outlier when its
# squared residual exceeds 4.
_simul = SIMUL(_df)
_simul.fit()
_flagged = _simul.df.query('Residual**2>4').reset_index()
outlier_simul_first_index = np.array(_flagged['index'])
outlier_simul_first_value = np.array(_flagged['y'])
# -1 marks an outlier, 1 an inlier.
outlier_simul_one = [-1 if _sq > 4 else 1 for _sq in (_simul.df['Residual']**2).tolist()]

# LOF detections (red) over the noisy signal (green).
plt.figure(figsize=(10, 6))
_lof_rows = _df.loc[outlier_first_index].reset_index()
plt.plot(_lof_rows['index'], _lof_rows['y'], 'r.')
plt.plot(_y1 + signal, 'go', alpha=0.3)

# Proposed-method detections (blue) over the same signal.
plt.figure(figsize=(10, 6))
plt.plot(outlier_simul_first_index, outlier_simul_first_value, 'b.')
plt.plot(_y1 + signal, 'go', alpha=0.3)
# LOF labels for the cosine-mixture data. Fit once and reuse: the original
# refitted the model five times for one confusion matrix and four scores.
_lof_pred = clf.fit_predict(X)
conf_matrix = confusion_matrix(outlier_true_one, _lof_pred)
fig, ax = plt.subplots(figsize=(5, 5))
ax.matshow(conf_matrix, cmap=plt.cm.Oranges, alpha=0.3)
# Write each cell's count on top of the heat map.
for i in range(conf_matrix.shape[0]):
    for j in range(conf_matrix.shape[1]):
        ax.text(x=j, y=i, s=conf_matrix[i, j], va='center', ha='center', size='xx-large')
plt.xlabel('Predictions', fontsize=18)
plt.ylabel('Actuals', fontsize=18)
plt.title('Confusion Matrix', fontsize=18)
plt.show()

# NOTE(review): labels are -1 (outlier) / 1 (inlier); with sklearn's default
# pos_label=1 precision/recall/F1 score the inlier class — confirm intended.
print('Accuracy: %.3f' % accuracy_score(outlier_true_one, _lof_pred))
print('Precision: %.3f' % precision_score(outlier_true_one, _lof_pred))
print('Recall: %.3f' % recall_score(outlier_true_one, _lof_pred))
print('F1 Score: %.3f' % f1_score(outlier_true_one, _lof_pred))
# Proposed method vs. ground truth on the cosine-mixture data.
conf_matrix = confusion_matrix(outlier_true_one, outlier_simul_one)
fig, ax = plt.subplots(figsize=(5, 5))
ax.matshow(conf_matrix, cmap=plt.cm.Oranges, alpha=0.3)
# Write each cell's count on top of the heat map.
for i, _row in enumerate(conf_matrix):
    for j, _count in enumerate(_row):
        ax.text(x=j, y=i, s=_count, va='center', ha='center', size='xx-large')
plt.xlabel('Predictions', fontsize=18)
plt.ylabel('Actuals', fontsize=18)
plt.title('Confusion Matrix', fontsize=18)
plt.show()

# NOTE(review): labels are -1 (outlier) / 1 (inlier); with sklearn's default
# pos_label=1 precision/recall/F1 score the inlier class — confirm intended.
print('Accuracy: %.3f' % accuracy_score(outlier_true_one, outlier_simul_one))
print('Precision: %.3f' % precision_score(outlier_true_one, outlier_simul_one))
print('Recall: %.3f' % recall_score(outlier_true_one, outlier_simul_one))
print('F1 Score: %.3f' % f1_score(outlier_true_one, outlier_simul_one))
# Bunny graph benchmark: a heat-filtered impulse as the clean signal, plus
# Gaussian noise and 120 injected outliers (60 in [3, 7], 60 in [-7, -3]).
G = graphs.Bunny()
n = G.N
g = filters.Heat(G, tau=75)

normal = np.random.randn(n)
_positive = np.random.uniform(low=3, high=7, size=60)
_negative = np.random.uniform(low=-7, high=-3, size=60)
unif = np.concatenate([_positive, _negative, np.zeros(n - 120)])
np.random.shuffle(unif)
noise = normal + unif
index_of_trueoutlier2 = np.where(unif != 0)

# Clean signal: impulse at node 1000 diffused by the heat kernel.
f = np.zeros(n)
f[1000] = -3234
f = g.filter(f, method='chebyshev')

_W = G.W.toarray()
_x = G.coords[:, 0]
_y = G.coords[:, 1]
_z = -G.coords[:, 2]
_df = pd.DataFrame({'x': _x, 'y': _y, 'z': _z, 'fnoise': f + noise, 'f': f, 'noise': noise})

# Ground truth in the -1 (outlier) / 1 (inlier) convention.
outlier_true_index_2 = np.where(unif != 0)
outlier_true_value_2 = unif[unif != 0]
outlier_true_one_2 = [-1 if _u != 0 else 1 for _u in unif]

# LOF sees only the first four columns: x, y, z, fnoise.
X = np.array(_df)
clf = LocalOutlierFactor(n_neighbors=2)
clf.fit_predict(X[:, :4])
outlier
# Use one LOF fit on the observable features (x, y, z, fnoise) for both
# objects. The original took the values from a second model fitted on all
# six columns — including the ground-truth `f` and `noise` — and masked them
# with the 4-column prediction.
_lof_pred = clf.fit_predict(X[:, :4])
outlier_first_index = np.where(_lof_pred == -1)
outlier_first_value = _lof_pred[_lof_pred == -1]
class SIMUL:
    """Smooth a noisy graph signal in the graph-Fourier domain.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs ``x``, ``y``, ``z``, ``f`` and ``noise`` columns; the observed
        signal is ``f + noise``.
    W : numpy.ndarray of shape (n, n), optional
        Weight (adjacency) matrix of the graph. When omitted, the module-level
        ``_W`` is used, looked up at construction time as before.
    """

    def __init__(self, df, W=None):
        self.df = df
        self.f = df.f.to_numpy()
        self.z = df.z.to_numpy()
        self.x = df.x.to_numpy()
        self.y = df.y.to_numpy()
        self.noise = df.noise.to_numpy()
        self.fnoise = self.f + self.noise
        self.W = _W if W is None else W
        self.n = len(self.f)
        self.theta = None

    def _eigen(self):
        """Eigendecompose the normalized Laplacian D^-1/2 (D - W) D^-1/2.

        Sets ``L``, ``lamb`` (eigenvalues), ``Psi`` (eigenvectors as columns)
        and ``Lamb`` (diagonal eigenvalue matrix).
        """
        d = self.W.sum(axis=1)
        # Zero-degree vertices get a zero scale factor instead of 1/0, so L
        # stays finite when the graph has isolated vertices.
        inv_sqrt_d = np.zeros(len(d))
        connected = d > 0
        inv_sqrt_d[connected] = 1 / np.sqrt(d[connected])
        # Row/column scaling by broadcasting avoids two dense n x n matmuls.
        self.L = inv_sqrt_d[:, None] * (np.diag(d) - self.W) * inv_sqrt_d[None, :]
        self.lamb, self.Psi = np.linalg.eigh(self.L)
        self.Lamb = np.diag(self.lamb)

    def fit(self, sd=5, ref=6):
        """Threshold the squared Fourier coefficients and rebuild the signal.

        ``sd`` is forwarded as the ``sd`` argument of R's
        ``EbayesThresh::ebayesthresh``. ``ref`` is accepted but not used by
        this method. Adds ``fnoise``, ``fHat`` and ``Residual``
        (``f + noise - fHat``) columns to ``self.df``.
        """
        self._eigen()
        self.fbar = self.Psi.T @ self.fnoise  # graph Fourier transform of f + noise
        self.power = self.fbar**2
        ebayesthresh = importr('EbayesThresh').ebayesthresh
        self.power_threshed = np.array(ebayesthresh(FloatVector(self.power), sd=sd))
        self.fbar_threshed = np.where(self.power_threshed > 0, self.fbar, 0)
        self.fhat = self.Psi @ self.fbar_threshed
        self.df = self.df.assign(fnoise=self.fnoise)
        self.df = self.df.assign(fHat=self.fhat)
        self.df = self.df.assign(Residual=self.df.f + self.df.noise - self.df.fHat)
        self.bottom = np.zeros_like(self.f)
        self.width = 0.05
        self.depth = 0.05
# Proposed method on the Bunny data: outlier when the squared residual
# exceeds 10.
_simul = SIMUL(_df)
_simul.fit(sd=20, ref=10)
_flagged = _simul.df.query('Residual**2>10').reset_index()
outlier_simul_first_index = np.array(_flagged['index'])
outlier_simul_first_value = np.array(_flagged['y'])
# -1 marks an outlier, 1 an inlier.
outlier_simul_one = [-1 if _sq > 10 else 1 for _sq in (_simul.df['Residual']**2).tolist()]
# LOF labels on the Bunny features (x, y, z, fnoise). Fit once and reuse: the
# original refitted the model five times.
_lof_pred = clf.fit_predict(X[:, :4])
conf_matrix = confusion_matrix(outlier_true_one_2, _lof_pred)
fig, ax = plt.subplots(figsize=(5, 5))
ax.matshow(conf_matrix, cmap=plt.cm.Oranges, alpha=0.3)
# Write each cell's count on top of the heat map.
for i in range(conf_matrix.shape[0]):
    for j in range(conf_matrix.shape[1]):
        ax.text(x=j, y=i, s=conf_matrix[i, j], va='center', ha='center', size='xx-large')
plt.xlabel('Predictions', fontsize=18)
plt.ylabel('Actuals', fontsize=18)
plt.title('Confusion Matrix', fontsize=18)
plt.show()

# NOTE(review): labels are -1 (outlier) / 1 (inlier); with sklearn's default
# pos_label=1 precision/recall/F1 score the inlier class — confirm intended.
print('Accuracy: %.3f' % accuracy_score(outlier_true_one_2, _lof_pred))
print('Precision: %.3f' % precision_score(outlier_true_one_2, _lof_pred))
print('Recall: %.3f' % recall_score(outlier_true_one_2, _lof_pred))
print('F1 Score: %.3f' % f1_score(outlier_true_one_2, _lof_pred))
# Proposed method vs. ground truth on the Bunny data.
conf_matrix = confusion_matrix(outlier_true_one_2, outlier_simul_one)
fig, ax = plt.subplots(figsize=(5, 5))
ax.matshow(conf_matrix, cmap=plt.cm.Oranges, alpha=0.3)
# Write each cell's count on top of the heat map.
for i, _row in enumerate(conf_matrix):
    for j, _count in enumerate(_row):
        ax.text(x=j, y=i, s=_count, va='center', ha='center', size='xx-large')
plt.xlabel('Predictions', fontsize=18)
plt.ylabel('Actuals', fontsize=18)
plt.title('Confusion Matrix', fontsize=18)
plt.show()

# NOTE(review): labels are -1 (outlier) / 1 (inlier); with sklearn's default
# pos_label=1 precision/recall/F1 score the inlier class — confirm intended.
print('Accuracy: %.3f' % accuracy_score(outlier_true_one_2, outlier_simul_one))
print('Precision: %.3f' % precision_score(outlier_true_one_2, outlier_simul_one))
print('Recall: %.3f' % recall_score(outlier_true_one_2, outlier_simul_one))
print('F1 Score: %.3f' % f1_score(outlier_true_one_2, outlier_simul_one))
contamination =0.05
- 5%의 이상치 감지
# Quadratic-trend data (5 * x**2 on [0, 2]) plus the vector `x` taken from
# the surrounding notebook state.
_x = np.linspace(0, 2, 1000)
_y1 = 5 * _x**2
_y = _y1 + x
_df = pd.DataFrame({'x': _x, 'y': _y})

# Fit ABOD and CBLOF on the (x, y) points with 5% assumed contamination.
_xy = _df[['x', 'y']]
abod_clf = ABOD(contamination=0.05)
abod_clf.fit(_xy)
cblof_clf = CBLOF(contamination=0.05, check_estimator=False, random_state=77)
cblof_clf.fit(_xy)
_df['ABOD_Clf'] = abod_clf.labels_
_df['CBLOF_Clf'] = cblof_clf.labels_
sns.scatterplot(data=_df, x='x', y='y', hue='ABOD_Clf')

# Recode ABOD labels: 0 -> 1 (inlier), anything else -> -1 (outlier).
outlier_ABOD_one = [1 if _label == 0 else -1 for _label in abod_clf.labels_]

conf_matrix = confusion_matrix(outlier_true_one, outlier_ABOD_one)
fig, ax = plt.subplots(figsize=(5, 5))
ax.matshow(conf_matrix, cmap=plt.cm.Oranges, alpha=0.3)
# Write each cell's count on top of the heat map.
for i, _row in enumerate(conf_matrix):
    for j, _count in enumerate(_row):
        ax.text(x=j, y=i, s=_count, va='center', ha='center', size='xx-large')
plt.xlabel('Predictions', fontsize=18)
plt.ylabel('Actuals', fontsize=18)
plt.title('Confusion Matrix', fontsize=18)
plt.show()

# NOTE(review): labels are -1 (outlier) / 1 (inlier); with sklearn's default
# pos_label=1 precision/recall/F1 score the inlier class — confirm intended.
print('Accuracy: %.3f' % accuracy_score(outlier_true_one, outlier_ABOD_one))
print('Precision: %.3f' % precision_score(outlier_true_one, outlier_ABOD_one))
print('Recall: %.3f' % recall_score(outlier_true_one, outlier_ABOD_one))
print('F1 Score: %.3f' % f1_score(outlier_true_one, outlier_ABOD_one))
# Sine-mixture data on [0, 2] plus the vector `x` taken from the surrounding
# notebook state.
_x = np.linspace(0, 2, 1000)
_y1 = 3*np.sin(_x) + 1*np.sin(_x**2) + 5*np.sin(5*_x)
_y = _y1 + x
_df = pd.DataFrame({'x': _x, 'y': _y})

# Fit ABOD and CBLOF on the (x, y) points with 5% assumed contamination.
_xy = _df[['x', 'y']]
abod_clf = ABOD(contamination=0.05)
abod_clf.fit(_xy)
cblof_clf = CBLOF(contamination=0.05, check_estimator=False, random_state=77)
cblof_clf.fit(_xy)
_df['ABOD_Clf'] = abod_clf.labels_
_df['CBLOF_Clf'] = cblof_clf.labels_
sns.scatterplot(data=_df, x='x', y='y', hue='ABOD_Clf')

# Recode ABOD labels: 0 -> 1 (inlier), anything else -> -1 (outlier).
outlier_ABOD_one = [1 if _label == 0 else -1 for _label in abod_clf.labels_]

conf_matrix = confusion_matrix(outlier_true_one, outlier_ABOD_one)
fig, ax = plt.subplots(figsize=(5, 5))
ax.matshow(conf_matrix, cmap=plt.cm.Oranges, alpha=0.3)
# Write each cell's count on top of the heat map.
for i, _row in enumerate(conf_matrix):
    for j, _count in enumerate(_row):
        ax.text(x=j, y=i, s=_count, va='center', ha='center', size='xx-large')
plt.xlabel('Predictions', fontsize=18)
plt.ylabel('Actuals', fontsize=18)
plt.title('Confusion Matrix', fontsize=18)
plt.show()

# NOTE(review): labels are -1 (outlier) / 1 (inlier); with sklearn's default
# pos_label=1 precision/recall/F1 score the inlier class — confirm intended.
print('Accuracy: %.3f' % accuracy_score(outlier_true_one, outlier_ABOD_one))
print('Precision: %.3f' % precision_score(outlier_true_one, outlier_ABOD_one))
print('Recall: %.3f' % recall_score(outlier_true_one, outlier_ABOD_one))
print('F1 Score: %.3f' % f1_score(outlier_true_one, outlier_ABOD_one))
# Cosine-mixture data on [0, 2] plus the vector `x` taken from the
# surrounding notebook state.
_x = np.linspace(0, 2, 1000)
_y1 = -2 + 3*np.cos(_x) + 1*np.cos(2*_x) + 5*np.cos(5*_x)
_y = _y1 + x
_df = pd.DataFrame({'x': _x, 'y': _y})

# Fit ABOD and CBLOF on the (x, y) points with 5% assumed contamination.
_xy = _df[['x', 'y']]
abod_clf = ABOD(contamination=0.05)
abod_clf.fit(_xy)
cblof_clf = CBLOF(contamination=0.05, check_estimator=False, random_state=77)
cblof_clf.fit(_xy)
_df['ABOD_Clf'] = abod_clf.labels_
_df['CBLOF_Clf'] = cblof_clf.labels_
sns.scatterplot(data=_df, x='x', y='y', hue='ABOD_Clf')

# Recode ABOD labels: 0 -> 1 (inlier), anything else -> -1 (outlier).
outlier_ABOD_one = [1 if _label == 0 else -1 for _label in abod_clf.labels_]

conf_matrix = confusion_matrix(outlier_true_one, outlier_ABOD_one)
fig, ax = plt.subplots(figsize=(5, 5))
ax.matshow(conf_matrix, cmap=plt.cm.Oranges, alpha=0.3)
# Write each cell's count on top of the heat map.
for i, _row in enumerate(conf_matrix):
    for j, _count in enumerate(_row):
        ax.text(x=j, y=i, s=_count, va='center', ha='center', size='xx-large')
plt.xlabel('Predictions', fontsize=18)
plt.ylabel('Actuals', fontsize=18)
plt.title('Confusion Matrix', fontsize=18)
plt.show()

# NOTE(review): labels are -1 (outlier) / 1 (inlier); with sklearn's default
# pos_label=1 precision/recall/F1 score the inlier class — confirm intended.
print('Accuracy: %.3f' % accuracy_score(outlier_true_one, outlier_ABOD_one))
print('Precision: %.3f' % precision_score(outlier_true_one, outlier_ABOD_one))
print('Recall: %.3f' % recall_score(outlier_true_one, outlier_ABOD_one))
print('F1 Score: %.3f' % f1_score(outlier_true_one, outlier_ABOD_one))
# Bunny graph benchmark for ABOD: heat-filtered impulse plus Gaussian noise
# and 120 injected outliers (60 in [3, 7], 60 in [-7, -3]).
G = graphs.Bunny()
n = G.N
g = filters.Heat(G, tau=75)
normal = np.random.randn(n)
unif = np.concatenate([np.random.uniform(low=3, high=7, size=60),
                       np.random.uniform(low=-7, high=-3, size=60),
                       np.zeros(n - 120)])
np.random.shuffle(unif)
noise = normal + unif
index_of_trueoutlier2 = np.where(unif != 0)
# `unif` was just re-drawn and re-shuffled, so the ground truth has to be
# rebuilt from it; labels from an earlier draw mark different nodes and
# would make the scores below meaningless.
outlier_true_index_2 = np.where(unif != 0)
outlier_true_value_2 = unif[unif != 0]
outlier_true_one_2 = [-1 if _u != 0 else 1 for _u in unif]

f = np.zeros(n)
f[1000] = -3234
f = g.filter(f, method='chebyshev')
_W = G.W.toarray()
_x = G.coords[:, 0]
_y = G.coords[:, 1]
_z = -G.coords[:, 2]
_df = pd.DataFrame({'x': _x, 'y': _y, 'z': _z, 'fnoise': f + noise, 'f': f, 'noise': noise})

# Detectors only see the coordinates and the observed signal.
abod_clf = ABOD(contamination=0.05)
abod_clf.fit(_df[['x', 'y', 'z', 'fnoise']])
cblof_clf = CBLOF(contamination=0.05, check_estimator=False, random_state=77)
cblof_clf.fit(_df[['x', 'y', 'z', 'fnoise']])
_df['ABOD_Clf'] = abod_clf.labels_
_df['CBLOF_Clf'] = cblof_clf.labels_

# Recode ABOD labels: 0 -> 1 (inlier), anything else -> -1 (outlier).
outlier_ABOD_one = [1 if _label == 0 else -1 for _label in abod_clf.labels_]

conf_matrix = confusion_matrix(outlier_true_one_2, outlier_ABOD_one)
fig, ax = plt.subplots(figsize=(5, 5))
ax.matshow(conf_matrix, cmap=plt.cm.Oranges, alpha=0.3)
# Write each cell's count on top of the heat map.
for i in range(conf_matrix.shape[0]):
    for j in range(conf_matrix.shape[1]):
        ax.text(x=j, y=i, s=conf_matrix[i, j], va='center', ha='center', size='xx-large')
plt.xlabel('Predictions', fontsize=18)
plt.ylabel('Actuals', fontsize=18)
plt.title('Confusion Matrix', fontsize=18)
plt.show()

# NOTE(review): labels are -1 (outlier) / 1 (inlier); with sklearn's default
# pos_label=1 precision/recall/F1 score the inlier class — confirm intended.
print('Accuracy: %.3f' % accuracy_score(outlier_true_one_2, outlier_ABOD_one))
print('Precision: %.3f' % precision_score(outlier_true_one_2, outlier_ABOD_one))
print('Recall: %.3f' % recall_score(outlier_true_one_2, outlier_ABOD_one))
print('F1 Score: %.3f' % f1_score(outlier_true_one_2, outlier_ABOD_one))
# Quadratic-trend data (5 * x**2 on [0, 2]) plus the vector `x` taken from
# the surrounding notebook state.
_x = np.linspace(0, 2, 1000)
_y1 = 5 * _x**2
_y = _y1 + x
_df = pd.DataFrame({'x': _x, 'y': _y})

# NOTE(review): `od` must already exist here; the only IForest construction
# visible in this part of the file comes later — confirm it is defined earlier.
_xy = _df[['x', 'y']]
od.fit(_xy)
preds = od.predict(_xy, return_instance_score=True)
_df['IF_alibi'] = preds['data']['is_outlier']
sns.scatterplot(data=_df, x='x', y='y', hue='IF_alibi')

# Recode the detector flags: 0 -> 1 (inlier), anything else -> -1 (outlier).
outlier_alibi_one = [1 if _flag == 0 else -1 for _flag in _df['IF_alibi']]

conf_matrix = confusion_matrix(outlier_true_one, outlier_alibi_one)
fig, ax = plt.subplots(figsize=(5, 5))
ax.matshow(conf_matrix, cmap=plt.cm.Oranges, alpha=0.3)
# Write each cell's count on top of the heat map.
for i, _row in enumerate(conf_matrix):
    for j, _count in enumerate(_row):
        ax.text(x=j, y=i, s=_count, va='center', ha='center', size='xx-large')
plt.xlabel('Predictions', fontsize=18)
plt.ylabel('Actuals', fontsize=18)
plt.title('Confusion Matrix', fontsize=18)
plt.show()

# NOTE(review): labels are -1 (outlier) / 1 (inlier); with sklearn's default
# pos_label=1 precision/recall/F1 score the inlier class — confirm intended.
print('Accuracy: %.3f' % accuracy_score(outlier_true_one, outlier_alibi_one))
print('Precision: %.3f' % precision_score(outlier_true_one, outlier_alibi_one))
print('Recall: %.3f' % recall_score(outlier_true_one, outlier_alibi_one))
print('F1 Score: %.3f' % f1_score(outlier_true_one, outlier_alibi_one))
# Sine-mixture data on [0, 2] plus the vector `x` taken from the surrounding
# notebook state.
_x = np.linspace(0, 2, 1000)
_y1 = 3*np.sin(_x) + 1*np.sin(_x**2) + 5*np.sin(5*_x)
_y = _y1 + x
_df = pd.DataFrame({'x': _x, 'y': _y})

# Refit the existing isolation-forest detector `od` on the (x, y) points.
_xy = _df[['x', 'y']]
od.fit(_xy)
preds = od.predict(_xy, return_instance_score=True)
_df['IF_alibi'] = preds['data']['is_outlier']
sns.scatterplot(data=_df, x='x', y='y', hue='IF_alibi')

# Recode the detector flags: 0 -> 1 (inlier), anything else -> -1 (outlier).
outlier_alibi_one = [1 if _flag == 0 else -1 for _flag in _df['IF_alibi']]

conf_matrix = confusion_matrix(outlier_true_one, outlier_alibi_one)
fig, ax = plt.subplots(figsize=(5, 5))
ax.matshow(conf_matrix, cmap=plt.cm.Oranges, alpha=0.3)
# Write each cell's count on top of the heat map.
for i, _row in enumerate(conf_matrix):
    for j, _count in enumerate(_row):
        ax.text(x=j, y=i, s=_count, va='center', ha='center', size='xx-large')
plt.xlabel('Predictions', fontsize=18)
plt.ylabel('Actuals', fontsize=18)
plt.title('Confusion Matrix', fontsize=18)
plt.show()

# NOTE(review): labels are -1 (outlier) / 1 (inlier); with sklearn's default
# pos_label=1 precision/recall/F1 score the inlier class — confirm intended.
print('Accuracy: %.3f' % accuracy_score(outlier_true_one, outlier_alibi_one))
print('Precision: %.3f' % precision_score(outlier_true_one, outlier_alibi_one))
print('Recall: %.3f' % recall_score(outlier_true_one, outlier_alibi_one))
print('F1 Score: %.3f' % f1_score(outlier_true_one, outlier_alibi_one))
# Cosine-mixture data on [0, 2] plus the vector `x` taken from the
# surrounding notebook state.
_x = np.linspace(0, 2, 1000)
_y1 = -2 + 3*np.cos(_x) + 1*np.cos(2*_x) + 5*np.cos(5*_x)
_y = _y1 + x
_df = pd.DataFrame({'x': _x, 'y': _y})

# Refit the existing isolation-forest detector `od` on the (x, y) points.
_xy = _df[['x', 'y']]
od.fit(_xy)
preds = od.predict(_xy, return_instance_score=True)
_df['IF_alibi'] = preds['data']['is_outlier']
sns.scatterplot(data=_df, x='x', y='y', hue='IF_alibi')

# Recode the detector flags: 0 -> 1 (inlier), anything else -> -1 (outlier).
outlier_alibi_one = [1 if _flag == 0 else -1 for _flag in _df['IF_alibi']]

conf_matrix = confusion_matrix(outlier_true_one, outlier_alibi_one)
fig, ax = plt.subplots(figsize=(5, 5))
ax.matshow(conf_matrix, cmap=plt.cm.Oranges, alpha=0.3)
# Write each cell's count on top of the heat map.
for i, _row in enumerate(conf_matrix):
    for j, _count in enumerate(_row):
        ax.text(x=j, y=i, s=_count, va='center', ha='center', size='xx-large')
plt.xlabel('Predictions', fontsize=18)
plt.ylabel('Actuals', fontsize=18)
plt.title('Confusion Matrix', fontsize=18)
plt.show()

# NOTE(review): labels are -1 (outlier) / 1 (inlier); with sklearn's default
# pos_label=1 precision/recall/F1 score the inlier class — confirm intended.
print('Accuracy: %.3f' % accuracy_score(outlier_true_one, outlier_alibi_one))
print('Precision: %.3f' % precision_score(outlier_true_one, outlier_alibi_one))
print('Recall: %.3f' % recall_score(outlier_true_one, outlier_alibi_one))
print('F1 Score: %.3f' % f1_score(outlier_true_one, outlier_alibi_one))
# Bunny graph benchmark for the isolation forest: heat-filtered impulse plus
# Gaussian noise and 120 injected outliers (60 in [3, 7], 60 in [-7, -3]).
G = graphs.Bunny()
n = G.N
g = filters.Heat(G, tau=75)
normal = np.random.randn(n)
unif = np.concatenate([np.random.uniform(low=3, high=7, size=60),
                       np.random.uniform(low=-7, high=-3, size=60),
                       np.zeros(n - 120)])
np.random.shuffle(unif)
noise = normal + unif
index_of_trueoutlier2 = np.where(unif != 0)
# `unif` was just re-drawn and re-shuffled, so the ground truth has to be
# rebuilt from it; labels from an earlier draw mark different nodes and
# would make the scores below meaningless.
outlier_true_index_2 = np.where(unif != 0)
outlier_true_value_2 = unif[unif != 0]
outlier_true_one_2 = [-1 if _u != 0 else 1 for _u in unif]

f = np.zeros(n)
f[1000] = -3234
f = g.filter(f, method='chebyshev')
_W = G.W.toarray()
_x = G.coords[:, 0]
_y = G.coords[:, 1]
_z = -G.coords[:, 2]
_df = pd.DataFrame({'x': _x, 'y': _y, 'z': _z, 'fnoise': f + noise, 'f': f, 'noise': noise})

# The detector only sees the coordinates and the observed signal.
od.fit(_df[['x', 'y', 'z', 'fnoise']])
preds = od.predict(
    _df[['x', 'y', 'z', 'fnoise']],
    return_instance_score=True
)
_df['IF_alibi'] = preds['data']['is_outlier']
sns.scatterplot(data=_df, x='x', y='y', hue='IF_alibi')

# Recode the detector flags: 0 -> 1 (inlier), anything else -> -1 (outlier).
outlier_alibi_one = [1 if _flag == 0 else -1 for _flag in _df['IF_alibi']]

conf_matrix = confusion_matrix(outlier_true_one_2, outlier_alibi_one)
fig, ax = plt.subplots(figsize=(5, 5))
ax.matshow(conf_matrix, cmap=plt.cm.Oranges, alpha=0.3)
# Write each cell's count on top of the heat map.
for i in range(conf_matrix.shape[0]):
    for j in range(conf_matrix.shape[1]):
        ax.text(x=j, y=i, s=conf_matrix[i, j], va='center', ha='center', size='xx-large')
plt.xlabel('Predictions', fontsize=18)
plt.ylabel('Actuals', fontsize=18)
plt.title('Confusion Matrix', fontsize=18)
plt.show()

# NOTE(review): labels are -1 (outlier) / 1 (inlier); with sklearn's default
# pos_label=1 precision/recall/F1 score the inlier class — confirm intended.
print('Accuracy: %.3f' % accuracy_score(outlier_true_one_2, outlier_alibi_one))
print('Precision: %.3f' % precision_score(outlier_true_one_2, outlier_alibi_one))
print('Recall: %.3f' % recall_score(outlier_true_one_2, outlier_alibi_one))
print('F1 Score: %.3f' % f1_score(outlier_true_one_2, outlier_alibi_one))
# Quadratic-trend data (5 * x**2 on [0, 2]) plus the vector `x` taken from
# the surrounding notebook state.
_x = np.linspace(0, 2, 1000)
_y1 = 5 * _x**2
_y = _y1 + x
# From here on `_df` is a plain (n, 2) array, not a DataFrame.
_df = np.array(pd.DataFrame({'x': _x, 'y': _y})).reshape(-1, 2)

# One-class SVM; predict returns -1 / 1 labels.
clf = svm.OneClassSVM(nu=0.1, kernel="rbf", gamma=0.1)
clf.fit(_df)
y_pred = clf.predict(_df)
outlier_OSVM_one = list(y_pred)

conf_matrix = confusion_matrix(outlier_true_one, outlier_OSVM_one)
fig, ax = plt.subplots(figsize=(5, 5))
ax.matshow(conf_matrix, cmap=plt.cm.Oranges, alpha=0.3)
# Write each cell's count on top of the heat map.
for i, _row in enumerate(conf_matrix):
    for j, _count in enumerate(_row):
        ax.text(x=j, y=i, s=_count, va='center', ha='center', size='xx-large')
plt.xlabel('Predictions', fontsize=18)
plt.ylabel('Actuals', fontsize=18)
plt.title('Confusion Matrix', fontsize=18)
plt.show()

# NOTE(review): labels are -1 (outlier) / 1 (inlier); with sklearn's default
# pos_label=1 precision/recall/F1 score the inlier class — confirm intended.
print('Accuracy: %.3f' % accuracy_score(outlier_true_one, outlier_OSVM_one))
print('Precision: %.3f' % precision_score(outlier_true_one, outlier_OSVM_one))
print('Recall: %.3f' % recall_score(outlier_true_one, outlier_OSVM_one))
print('F1 Score: %.3f' % f1_score(outlier_true_one, outlier_OSVM_one))
# Sine-mixture data on [0, 2] plus the vector `x` taken from the surrounding
# notebook state.
_x = np.linspace(0, 2, 1000)
_y1 = 3*np.sin(_x) + 1*np.sin(_x**2) + 5*np.sin(5*_x)
_y = _y1 + x
# From here on `_df` is a plain (n, 2) array, not a DataFrame.
_df = np.array(pd.DataFrame({'x': _x, 'y': _y})).reshape(-1, 2)

# One-class SVM; predict returns -1 / 1 labels.
clf = svm.OneClassSVM(nu=0.1, kernel="rbf", gamma=0.1)
clf.fit(_df)
y_pred = clf.predict(_df)
outlier_OSVM_one = list(y_pred)

conf_matrix = confusion_matrix(outlier_true_one, outlier_OSVM_one)
fig, ax = plt.subplots(figsize=(5, 5))
ax.matshow(conf_matrix, cmap=plt.cm.Oranges, alpha=0.3)
# Write each cell's count on top of the heat map.
for i, _row in enumerate(conf_matrix):
    for j, _count in enumerate(_row):
        ax.text(x=j, y=i, s=_count, va='center', ha='center', size='xx-large')
plt.xlabel('Predictions', fontsize=18)
plt.ylabel('Actuals', fontsize=18)
plt.title('Confusion Matrix', fontsize=18)
plt.show()

# NOTE(review): labels are -1 (outlier) / 1 (inlier); with sklearn's default
# pos_label=1 precision/recall/F1 score the inlier class — confirm intended.
print('Accuracy: %.3f' % accuracy_score(outlier_true_one, outlier_OSVM_one))
print('Precision: %.3f' % precision_score(outlier_true_one, outlier_OSVM_one))
print('Recall: %.3f' % recall_score(outlier_true_one, outlier_OSVM_one))
print('F1 Score: %.3f' % f1_score(outlier_true_one, outlier_OSVM_one))
# Cosine-mixture data on [0, 2] plus the vector `x` taken from the
# surrounding notebook state.
_x = np.linspace(0, 2, 1000)
_y1 = -2 + 3*np.cos(_x) + 1*np.cos(2*_x) + 5*np.cos(5*_x)
_y = _y1 + x
# From here on `_df` is a plain (n, 2) array, not a DataFrame.
_df = np.array(pd.DataFrame({'x': _x, 'y': _y})).reshape(-1, 2)

# One-class SVM; predict returns -1 / 1 labels.
clf = svm.OneClassSVM(nu=0.1, kernel="rbf", gamma=0.1)
clf.fit(_df)
y_pred = clf.predict(_df)
outlier_OSVM_one = list(y_pred)

conf_matrix = confusion_matrix(outlier_true_one, outlier_OSVM_one)
fig, ax = plt.subplots(figsize=(5, 5))
ax.matshow(conf_matrix, cmap=plt.cm.Oranges, alpha=0.3)
# Write each cell's count on top of the heat map.
for i, _row in enumerate(conf_matrix):
    for j, _count in enumerate(_row):
        ax.text(x=j, y=i, s=_count, va='center', ha='center', size='xx-large')
plt.xlabel('Predictions', fontsize=18)
plt.ylabel('Actuals', fontsize=18)
plt.title('Confusion Matrix', fontsize=18)
plt.show()

# NOTE(review): labels are -1 (outlier) / 1 (inlier); with sklearn's default
# pos_label=1 precision/recall/F1 score the inlier class — confirm intended.
print('Accuracy: %.3f' % accuracy_score(outlier_true_one, outlier_OSVM_one))
print('Precision: %.3f' % precision_score(outlier_true_one, outlier_OSVM_one))
print('Recall: %.3f' % recall_score(outlier_true_one, outlier_OSVM_one))
print('F1 Score: %.3f' % f1_score(outlier_true_one, outlier_OSVM_one))
# Bunny graph benchmark for the one-class SVM: heat-filtered impulse plus
# Gaussian noise and 120 injected outliers (60 in [3, 7], 60 in [-7, -3]).
G = graphs.Bunny()
n = G.N
g = filters.Heat(G, tau=75)
normal = np.random.randn(n)
unif = np.concatenate([np.random.uniform(low=3, high=7, size=60),
                       np.random.uniform(low=-7, high=-3, size=60),
                       np.zeros(n - 120)])
np.random.shuffle(unif)
noise = normal + unif
index_of_trueoutlier2 = np.where(unif != 0)
# `unif` was just re-drawn and re-shuffled, so the ground truth has to be
# rebuilt from it; labels from an earlier draw mark different nodes and
# would make the scores below meaningless.
outlier_true_index_2 = np.where(unif != 0)
outlier_true_value_2 = unif[unif != 0]
outlier_true_one_2 = [-1 if _u != 0 else 1 for _u in unif]

f = np.zeros(n)
f[1000] = -3234
f = g.filter(f, method='chebyshev')
_W = G.W.toarray()
_x = G.coords[:, 0]
_y = G.coords[:, 1]
_z = -G.coords[:, 2]
# From here on `_df` is a plain (n, 4) array: x, y, z, fnoise.
_df = pd.DataFrame({'x': _x, 'y': _y, 'z': _z, 'fnoise': f + noise})
_df = np.array(_df).reshape(-1, 4)

# One-class SVM; predict returns -1 / 1 labels.
clf = svm.OneClassSVM(nu=0.1, kernel="rbf", gamma=0.1)
clf.fit(_df)
y_pred = clf.predict(_df)
outlier_OSVM_one = list(y_pred)

conf_matrix = confusion_matrix(outlier_true_one_2, outlier_OSVM_one)
fig, ax = plt.subplots(figsize=(5, 5))
ax.matshow(conf_matrix, cmap=plt.cm.Oranges, alpha=0.3)
# Write each cell's count on top of the heat map.
for i in range(conf_matrix.shape[0]):
    for j in range(conf_matrix.shape[1]):
        ax.text(x=j, y=i, s=conf_matrix[i, j], va='center', ha='center', size='xx-large')
plt.xlabel('Predictions', fontsize=18)
plt.ylabel('Actuals', fontsize=18)
plt.title('Confusion Matrix', fontsize=18)
plt.show()

# NOTE(review): labels are -1 (outlier) / 1 (inlier); with sklearn's default
# pos_label=1 precision/recall/F1 score the inlier class — confirm intended.
print('Accuracy: %.3f' % accuracy_score(outlier_true_one_2, outlier_OSVM_one))
print('Precision: %.3f' % precision_score(outlier_true_one_2, outlier_OSVM_one))
print('Recall: %.3f' % recall_score(outlier_true_one_2, outlier_OSVM_one))
print('F1 Score: %.3f' % f1_score(outlier_true_one_2, outlier_OSVM_one))
%%R
# Simulate n noisy observations: standard-normal noise plus a sparse signal
# with 7% negative outliers in [-8, -4] and 3% positive outliers in [6, 7].
library(EbayesThresh)
set.seed(1)
n = 5000
epsilon = rnorm(n)
# Use explicit integer counts that sum to n. Products such as n*0.07 are not
# exact in floating point and R truncates fractional counts, which for other
# n can leave `signal` shorter than `epsilon`.
n_neg = round(n * 0.07)
n_pos = round(n * 0.03)
signal = sample(c(runif(n_neg, -8, -4), runif(n_pos, 6, 7), rep(0, n - n_neg - n_pos)))
# Positions of the injected outliers (1-based, as returned by which()).
index_of_trueoutlier = which(signal != 0)
index_of_trueoutlier
x = signal + epsilon
%R -o x
%R -o index_of_trueoutlier
%R -o signal
n=5000
ebayesthresh = importr('EbayesThresh').ebayesthresh
outlier_true_index = index_of_trueoutlier
outlier_true_value = x[index_of_trueoutlier]
outlier_true_one = signal.copy()
outlier_true_one = list(map(lambda x: -1 if x!=0 else 1,outlier_true_one))
# Circle data: radius 10 perturbed by `x`, swept over angles -3.14 .. 3.14.
r = x + 10
θ = np.linspace(-3.14, 3.14, len(x))
_x = r * np.cos(θ)
_y = r * np.sin(θ)
_df = pd.DataFrame({'x': _x, 'y': _y})

# ABOD on the (x, y) points with 5% assumed contamination.
abod_clf = ABOD(contamination=0.05)
abod_clf.fit(_df[['x', 'y']])
_df['ABOD_Clf'] = abod_clf.labels_
sns.scatterplot(data=_df, x='x', y='y', hue='ABOD_Clf')

# Recode ABOD labels: 0 -> 1 (inlier), anything else -> -1 (outlier).
outlier_ABOD_one = [1 if _label == 0 else -1 for _label in abod_clf.labels_]
class SIMUL:
    """Smooth a 1-D signal in the graph-Fourier domain and expose residuals.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs ``x`` and ``y`` columns; ``y`` is the signal that gets smoothed.
    W : numpy.ndarray of shape (n, n), optional
        Weight (adjacency) matrix of the graph. When omitted, the module-level
        ``w`` is used, looked up at construction time as before.
    """

    def __init__(self, df, W=None):
        self.df = df
        self.y = df.y.to_numpy()
        self.x = df.x.to_numpy()
        self.n = len(self.y)
        self.W = w if W is None else W

    def _eigen(self):
        """Eigendecompose the normalized Laplacian D^-1/2 (D - W) D^-1/2.

        Sets ``L``, ``lamb`` (eigenvalues), ``Psi`` (eigenvectors as columns)
        and ``Lamb`` (diagonal eigenvalue matrix).
        """
        d = self.W.sum(axis=1)
        # Zero-degree vertices get a zero scale factor instead of 1/0, so L
        # stays finite when the graph has isolated vertices.
        inv_sqrt_d = np.zeros(len(d))
        connected = d > 0
        inv_sqrt_d[connected] = 1 / np.sqrt(d[connected])
        # Row/column scaling by broadcasting avoids two dense n x n matmuls,
        # which matters for the 5000-node graph used with this class.
        self.L = inv_sqrt_d[:, None] * (np.diag(d) - self.W) * inv_sqrt_d[None, :]
        self.lamb, self.Psi = np.linalg.eigh(self.L)
        self.Lamb = np.diag(self.lamb)

    def fit(self, sd=5):
        """Threshold the squared Fourier coefficients and rebuild the signal.

        ``sd`` is forwarded as the ``sd`` argument of R's
        ``EbayesThresh::ebayesthresh``. Coefficients whose thresholded power is
        not positive are zeroed. Adds ``yHat`` and ``Residual`` (``y - yHat``)
        columns to ``self.df``.
        """
        self._eigen()
        self.ybar = self.Psi.T @ self.y  # graph Fourier transform of y
        self.power = self.ybar**2
        ebayesthresh = importr('EbayesThresh').ebayesthresh
        self.power_threshed = np.array(ebayesthresh(FloatVector(self.power), sd=sd))
        self.ybar_threshed = np.where(self.power_threshed > 0, self.ybar, 0)
        self.yhat = self.Psi @ self.ybar_threshed
        self.df = self.df.assign(yHat=self.yhat)
        self.df = self.df.assign(Residual=self.df.y - self.df.yHat)
# Adjacency matrix of an n-node path graph: w[i, j] = 1 exactly when
# |i - j| == 1, zero elsewhere (including the diagonal). Filled with index
# assignment; the nested Python loop needed n*n iterations (25 million for
# n = 5000).
w = np.zeros((n, n))
_k = np.arange(n - 1)
w[_k, _k + 1] = 1
w[_k + 1, _k] = 1
# Proposed method on the circle data: outlier when the squared residual
# exceeds 2.
_simul = SIMUL(_df)
_simul.fit(sd=20)
_flagged = _simul.df.query('Residual**2>2').reset_index()
outlier_simul_first_index = np.array(_flagged['index'])
outlier_simul_first_value = np.array(_flagged['y'])
# -1 marks an outlier, 1 an inlier.
outlier_simul_one = [-1 if _sq > 2 else 1 for _sq in (_simul.df['Residual']**2).tolist()]
_simul.df = _simul.df.assign(one=outlier_simul_one)

# All points in black, flagged points over-plotted in blue.
plt.figure(figsize=(10, 6))
plt.plot(_simul.df['x'], _simul.df['y'], 'k.')
_hits = _simul.df.iloc[outlier_simul_first_index]
plt.plot(_hits['x'], _hits['y'], 'b.', alpha=0.5)

conf_matrix = confusion_matrix(outlier_true_one, outlier_simul_one)
fig, ax = plt.subplots(figsize=(5, 5))
ax.matshow(conf_matrix, cmap=plt.cm.Oranges, alpha=0.3)
# Write each cell's count on top of the heat map.
for i, _row in enumerate(conf_matrix):
    for j, _count in enumerate(_row):
        ax.text(x=j, y=i, s=_count, va='center', ha='center', size='xx-large')
plt.xlabel('Predictions', fontsize=18)
plt.ylabel('Actuals', fontsize=18)
plt.title('Confusion Matrix', fontsize=18)
plt.show()

# NOTE(review): labels are -1 (outlier) / 1 (inlier); with sklearn's default
# pos_label=1 precision/recall/F1 score the inlier class — confirm intended.
print('Accuracy: %.3f' % accuracy_score(outlier_true_one, outlier_simul_one))
print('Precision: %.3f' % precision_score(outlier_true_one, outlier_simul_one))
print('Recall: %.3f' % recall_score(outlier_true_one, outlier_simul_one))
print('F1 Score: %.3f' % f1_score(outlier_true_one, outlier_simul_one))
# ABOD vs. ground truth on the circle data.
conf_matrix = confusion_matrix(outlier_true_one, outlier_ABOD_one)
fig, ax = plt.subplots(figsize=(5, 5))
ax.matshow(conf_matrix, cmap=plt.cm.Oranges, alpha=0.3)
# Write each cell's count on top of the heat map.
for i, _row in enumerate(conf_matrix):
    for j, _count in enumerate(_row):
        ax.text(x=j, y=i, s=_count, va='center', ha='center', size='xx-large')
plt.xlabel('Predictions', fontsize=18)
plt.ylabel('Actuals', fontsize=18)
plt.title('Confusion Matrix', fontsize=18)
plt.show()

# NOTE(review): labels are -1 (outlier) / 1 (inlier); with sklearn's default
# pos_label=1 precision/recall/F1 score the inlier class — confirm intended.
print('Accuracy: %.3f' % accuracy_score(outlier_true_one, outlier_ABOD_one))
print('Precision: %.3f' % precision_score(outlier_true_one, outlier_ABOD_one))
print('Recall: %.3f' % recall_score(outlier_true_one, outlier_ABOD_one))
print('F1 Score: %.3f' % f1_score(outlier_true_one, outlier_ABOD_one))
# Feed LOF only the coordinates. `_df` also carries the `ABOD_Clf` column at
# this point, and np.array(_df) would hand another detector's labels to LOF
# as a third feature.
X = np.array(_df[['x', 'y']])
clf = LocalOutlierFactor(n_neighbors=2)
outlier
# Fit LOF once and reuse the labels; the original refitted the model three
# times to build these two objects.
_lof_pred = clf.fit_predict(X)
outlier_first_index = np.where(_lof_pred == -1)
# Entries of the prediction vector at the flagged positions (all equal to -1).
outlier_first_value = _lof_pred[_lof_pred == -1]
# LOF labels for the circle data. Fit once and reuse: the original refitted
# the model five times for one confusion matrix and four scores.
_lof_pred = clf.fit_predict(X)
conf_matrix = confusion_matrix(outlier_true_one, _lof_pred)
fig, ax = plt.subplots(figsize=(5, 5))
ax.matshow(conf_matrix, cmap=plt.cm.Oranges, alpha=0.3)
# Write each cell's count on top of the heat map.
for i in range(conf_matrix.shape[0]):
    for j in range(conf_matrix.shape[1]):
        ax.text(x=j, y=i, s=conf_matrix[i, j], va='center', ha='center', size='xx-large')
plt.xlabel('Predictions', fontsize=18)
plt.ylabel('Actuals', fontsize=18)
plt.title('Confusion Matrix', fontsize=18)
plt.show()

# NOTE(review): labels are -1 (outlier) / 1 (inlier); with sklearn's default
# pos_label=1 precision/recall/F1 score the inlier class — confirm intended.
print('Accuracy: %.3f' % accuracy_score(outlier_true_one, _lof_pred))
print('Precision: %.3f' % precision_score(outlier_true_one, _lof_pred))
print('Recall: %.3f' % recall_score(outlier_true_one, _lof_pred))
print('F1 Score: %.3f' % f1_score(outlier_true_one, _lof_pred))
# Isolation forest with 100 trees and a score threshold of 0.
od = IForest(threshold=0., n_estimators=100)

# Fit and score on the circle coordinates only.
_xy = _df[['x', 'y']]
od.fit(_xy)
preds = od.predict(_xy, return_instance_score=True)
_df['IF_alibi'] = preds['data']['is_outlier']
sns.scatterplot(data=_df, x='x', y='y', hue='IF_alibi')

# Recode the detector flags: 0 -> 1 (inlier), anything else -> -1 (outlier).
outlier_alibi_one = [1 if _flag == 0 else -1 for _flag in _df['IF_alibi']]

conf_matrix = confusion_matrix(outlier_true_one, outlier_alibi_one)
fig, ax = plt.subplots(figsize=(5, 5))
ax.matshow(conf_matrix, cmap=plt.cm.Oranges, alpha=0.3)
# Write each cell's count on top of the heat map.
for i, _row in enumerate(conf_matrix):
    for j, _count in enumerate(_row):
        ax.text(x=j, y=i, s=_count, va='center', ha='center', size='xx-large')
plt.xlabel('Predictions', fontsize=18)
plt.ylabel('Actuals', fontsize=18)
plt.title('Confusion Matrix', fontsize=18)
plt.show()

# NOTE(review): labels are -1 (outlier) / 1 (inlier); with sklearn's default
# pos_label=1 precision/recall/F1 score the inlier class — confirm intended.
print('Accuracy: %.3f' % accuracy_score(outlier_true_one, outlier_alibi_one))
print('Precision: %.3f' % precision_score(outlier_true_one, outlier_alibi_one))
print('Recall: %.3f' % recall_score(outlier_true_one, outlier_alibi_one))
print('F1 Score: %.3f' % f1_score(outlier_true_one, outlier_alibi_one))
# Rebuild the circle coordinates as a plain (n, 2) array for the one-class SVM.
_df = np.array(pd.DataFrame({'x': _x, 'y': _y})).reshape(-1, 2)

# One-class SVM; predict returns -1 / 1 labels.
clf = svm.OneClassSVM(nu=0.1, kernel="rbf", gamma=0.1)
clf.fit(_df)
y_pred = clf.predict(_df)
_df.shape
outlier_OSVM_one = list(y_pred)

conf_matrix = confusion_matrix(outlier_true_one, outlier_OSVM_one)
fig, ax = plt.subplots(figsize=(5, 5))
ax.matshow(conf_matrix, cmap=plt.cm.Oranges, alpha=0.3)
# Write each cell's count on top of the heat map.
for i, _row in enumerate(conf_matrix):
    for j, _count in enumerate(_row):
        ax.text(x=j, y=i, s=_count, va='center', ha='center', size='xx-large')
plt.xlabel('Predictions', fontsize=18)
plt.ylabel('Actuals', fontsize=18)
plt.title('Confusion Matrix', fontsize=18)
plt.show()

# NOTE(review): labels are -1 (outlier) / 1 (inlier); with sklearn's default
# pos_label=1 precision/recall/F1 score the inlier class — confirm intended.
print('Accuracy: %.3f' % accuracy_score(outlier_true_one, outlier_OSVM_one))
print('Precision: %.3f' % precision_score(outlier_true_one, outlier_OSVM_one))
print('Recall: %.3f' % recall_score(outlier_true_one, outlier_OSVM_one))
print('F1 Score: %.3f' % f1_score(outlier_true_one, outlier_OSVM_one))