import GODE_ebayes
This tutorial is about GODE: Graph fourier transform based Outlier Detection using Emprical Bayesian thresholding paper.
0. Import
import numpy as np
import pandas as pd
from pygsp import graphs, filters, plotting, utils
import pickle
1. Linear
1.1. Data
Data description
- Graph \({\cal G}=({\cal V},E,{\bf W})\)
- Vertex set \({\cal V}=\{1,2,\dots,n\}=\{ v_1,v_2,\dots,v_N\}\)
- \(N=1000\)
- \(W_{ij} = \begin{cases} 1, & \text{ if } j-i = 1 \\ 0, & \text{ otherwise}\end{cases}\)
- Graph signal \(y:{\cal V} \to \mathbb{R}\)
\[y_i=\frac{10}{N}v_i+\eta_i+\epsilon_i,\]
- \(\eta_i = \begin{cases} U(5,7) & \text{ with probability 0.025}\\ U(-7,-5) & \text{ with probability 0.025} \\ 0 & \text{ otherwise } \end{cases}\)
- \(\epsilon_i \sim N(0,1)\)
Generate Dataset
6)
np.random.seed(
= 1000
n = 0.05
eta_sparsity
= np.around(np.random.normal(size=n),15)
epsilon = np.random.choice(np.concatenate((np.random.uniform(-7, -5, round(n*eta_sparsity/2)).round(15), np.random.uniform(5, 7, round(n*eta_sparsity/2)).round(15), np.repeat(0, n - round(n*eta_sparsity)))), n)
signal = signal + epsilon
eta
= signal.copy()
outlier_true_linear= list(map(lambda x: 1 if x!=0 else 0,outlier_true_linear))
outlier_true_linear
= np.linspace(0,2,n)
x_1 = 5 * x_1
y1_1 = y1_1 + eta # eta = signal + epsilon
y_1
=pd.DataFrame({'x':x_1, 'y':y_1})
_df
= signal!=0 index_of_trueoutlier_bool
1.2. GODE
= GODE_ebayes.Linear(_df) Lin
=20) Lin.fit(sd
= GODE_ebayes.GODE_Anomalous(Lin(_df),contamination=0.05) outlier_old_linear, outlier_linear, outlier_index_linear
1.3. Plot
GODE_ebayes.Linear_plot(Lin(_df),index_of_trueoutlier_bool, outlier_index_linear)
how to access data
'x'][:10] Lin(_df)[
array([0. , 0.002002 , 0.004004 , 0.00600601, 0.00800801,
0.01001001, 0.01201201, 0.01401401, 0.01601602, 0.01801802])
'y'][:10] Lin(_df)[
array([-0.31178367, -6.08358567, 0.23784081, -0.86906177, -2.44674061,
0.96330157, 1.18712379, -1.44402316, 1.71937116, -0.33980351])
'yhat'][:10] Lin(_df)[
array([0.26224717, 0.37091714, 0.37104804, 0.3712662 , 0.3715716 ,
0.37196424, 0.37244408, 0.37301111, 0.3736653 , 0.37440661])
1.4. Confusion matrix
= GODE_ebayes.Conf_matrx(outlier_true_linear,outlier_linear) Conf_linear
'GODE') Conf_linear.conf(
Accuracy: 0.999
Precision: 1.000
Recall: 0.980
F1 Score: 0.990
Conf_linear()
{'Accuracy': 0.999,
'Precision': 1.0,
'Recall': 0.9803921568627451,
'F1 Score': 0.99009900990099}
2. Orbit
2.1. Data
Data description
- Graph \({\cal G}=({\cal V},E,{\bf W})\)
- \(N=1000\)
- \({\boldsymbol v}_i=(r_i \cos(\theta_i), r_i \sin(\theta_i))\)
- \(r_i= 5 + \cos(\frac{12\pi (i - 1)}{n - 1})\)
- \(\theta_i= -\pi + \frac{{\pi(N-2)(i - 1)}}{N(N - 1)}\)
- graph signal \(y:{\cal V} \to \mathbb{R}\)
\[y_i = 10 \cdot \sin\left(\frac{{6\pi \cdot (i - 1)}}{{N - 1}}\right)+\epsilon_i+\eta_i\]
- \(\eta_i = \begin{cases} U(1,4) & \text{ with probability 0.025}\\ U(-4,-1) & \text{ with probability 0.025} \\ 0 & \text{ otherwise } \end{cases}\)
- \(\epsilon_i \sim N(0,1)\)
- \({\bf W} = \begin{cases} \exp\left(-\frac{|{\boldsymbol v}_i -{\boldsymbol v}_j|^2}{2\theta^2}\right), & \text{ if } |{\boldsymbol v}_i - {\boldsymbol v}_j| \le \kappa \\ 0, & \text{ otherwise}\end{cases}\)
- \(\theta=6.45\) and \(\kappa=1.21\)
Generate Dataset
= 1000
n = 0.05
eta_sparsity =77
random_seed777)
np.random.seed(= np.around(np.random.normal(size=n),15)
epsilon = np.random.choice(np.concatenate((np.random.uniform(-4, -1, round(n * eta_sparsity / 2)).round(15), np.random.uniform(1, 4, round(n * eta_sparsity / 2)).round(15), np.repeat(0, n - round(n * eta_sparsity)))), n)
signal = signal + epsilon
eta =np.pi
pi=np.linspace(-pi,pi-2*pi/n,n)
ang=5+np.cos(np.linspace(0,12*pi,n))
r=r*np.cos(ang)
vx=r*np.sin(ang)
vy=10*np.sin(np.linspace(0,6*pi,n))
f1= f1 + eta
f = pd.DataFrame({'x' : vx, 'y' : vy, 'f' : f,'f1':f1})
_df = signal.copy()
outlier_true_orbit = list(map(lambda x: 1 if x!=0 else 0,outlier_true_orbit))
outlier_true_orbit = signal!=0 index_of_trueoutlier_bool
2.2. GODE
= GODE_ebayes.Orbit(_df) Or
=15,method = 'Euclidean',kappa=1.21) Or.fit(sd
100%|██████████| 1000/1000 [00:01<00:00, 683.67it/s]
= GODE_ebayes.GODE_Anomalous(Or(_df),contamination=0.05) outlier_old_orbit, outlier_orbit, outlier_index_orbit
2.3. Plot
GODE_ebayes.Orbit_plot(Or(_df),index_of_trueoutlier_bool,outlier_index_orbit)
how to access data
'x'][:5] Or(_df)[
array([-6. , -5.99916963, -5.9966797 , -5.99253378, -5.98673778])
'y'][:5] Or(_df)[
array([-7.34788079e-16, -3.76943905e-02, -7.53604665e-02, -1.12969981e-01,
-1.50494820e-01])
'f1'][:5] Or(_df)[
array([0. , 0.18867305, 0.37727893, 0.56575049, 0.75402065])
'f'][:5] Or(_df)[
array([-0.46820879, -0.63415181, 0.31189882, -0.14761143, 1.66037153])
'fhat'][:5] Or(_df)[
array([-0.12358846, 0.28544397, 0.69162952, 0.75432307, 0.83443358])
2.4. Confusion matrix
= GODE_ebayes.Conf_matrx(outlier_true_orbit,outlier_orbit) Conf_orbit
'GODE') Conf_orbit.conf(
Accuracy: 0.957
Precision: 0.560
Recall: 0.571
F1 Score: 0.566
Conf_orbit()
{'Accuracy': 0.957,
'Precision': 0.56,
'Recall': 0.5714285714285714,
'F1 Score': 0.5656565656565656}
3. Bunny
3.1. Data
Data description
- Stanford bunny data
- \(n=2503\)
Generate Dataset
= 0.05
eta_sparsity =77
random_seed= 2503
n with open("Bunny.pkl", "rb") as file:
= pickle.load(file)
loaded_obj = pd.DataFrame({'x':loaded_obj['x'],'y':loaded_obj['y'],'z':loaded_obj['z'],'f':loaded_obj['f']+loaded_obj['noise'],'f1':loaded_obj['f'],'noise':loaded_obj['noise']})
_df = loaded_obj['unif'].copy()
outlier_true_bunny = list(map(lambda x: 1 if x !=0 else 0,outlier_true_bunny))
outlier_true_bunny = loaded_obj['unif']!=0
index_of_trueoutlier_bool_bunny = loaded_obj['W'].copy() _W
3.2. GODE
= GODE_ebayes.BUNNY(_df,_W) bu
=20) bu.fit(sd
= GODE_ebayes.GODE_Anomalous(bu(_df),contamination=eta_sparsity) outlier_old_bunny, outlier_bunny, outlier_index_bunny
3.3. Plot
GODE_ebayes.Bunny_plot(bu(_df),index_of_trueoutlier_bool_bunny,outlier_index_bunny)
how to access data
bu(_df)
{'x': array([ 0.26815193, -0.58456893, -0.02730755, ..., 0.15397547,
-0.45056488, -0.29405249]),
'y': array([ 0.39314334, 0.63468595, 0.33280949, ..., 0.80205526,
0.6207154 , -0.40187451]),
'z': array([-0.13834514, -0.22438843, 0.08658215, ..., 0.33698514,
0.58353051, -0.08647485]),
'f1': array([-1.54422488, -0.03596483, -0.93972715, ..., -0.01924028,
-0.02470869, -0.26266752]),
'f': array([-1.63569131, 0.49423926, -1.04026277, ..., -1.0694093 ,
-0.24395499, 0.41729667]),
'fhat': array([-1.66060553, 0.09516849, -1.11731239, ..., 0.23695946,
0.16766298, -0.11652469])}
3.4. Confusion matrix
= GODE_ebayes.Conf_matrx(outlier_true_bunny,outlier_bunny) Conf_bunny
'GODE') Conf_bunny.conf(
Accuracy: 0.988
Precision: 0.864
Recall: 0.900
F1 Score: 0.882
Conf_bunny()
{'Accuracy': 0.9884139033160207,
'Precision': 0.864,
'Recall': 0.9,
'F1 Score': 0.8816326530612244}
4. Earthquake
4.1. Data
Data description
USGS data from \(2010\) to \(2014\)
Graph \({\cal G}=({\cal V},E,{\bf W})\)
Vertex set \({\cal V}= \{v_1, v_2, \ldots, v_n\}\)
\(v:= \tt{( Latitude}\), \(\tt{Longitude)}\)
\(N=10762\)
\({\bf W_{ij}} = \begin{cases} \exp(-\frac{\rho(i,j)}{2 \theta^2}), & \text{ if } |\rho(i,j)| \le \kappa \\ 0, & \text{ otherwise}\end{cases}\)
\(\rho(i,j)=hs({\boldsymbol v}_i, {\boldsymbol v}_j)\)
\(\theta = 8810.87\) and \(\kappa = 2500\)
graph signal \(y:{\cal V} \to \mathbb{R}\)
= pd.read_csv('./earthquake_tutorial.csv') _df
= _df.assign(Year=list(map(lambda x: x.split('-')[0], _df.time))).rename(columns={'latitude' : 'x', 'longitude' : 'y', 'mag': 'f'}).iloc[:,1:]
_df = _df.Year.astype(np.float64) _df.Year
4.2. GODE
= GODE_ebayes.Earthquake(_df.query("2010 <= Year < 2011")) Er
=20, method = 'Haversine',kappa=2500) Er.fit(sd
100%|██████████| 4790/4790 [00:30<00:00, 159.66it/s]
= GODE_ebayes.GODE_Anomalous(Er(_df.query("2010 <= Year < 2011")),contamination=0.05) outlier_old_earthquake, outlier_earthquake, outlier_index_earthquake
4.3. Plot
"2010 <= Year < 2011")),outlier_index_earthquake,lat_center=37.7749, lon_center=-122.4194,fThresh=7,adjzoom=5,adjmarkersize = 40) GODE_ebayes.Earthquake_plot(Er(_df.query(
how to access data
"2010 <= Year < 2011")) Er(_df.query(
{'x': array([ 0.663, -19.209, -31.83 , ..., 40.726, 30.646, 26.29 ]),
'y': array([ -26.045, 167.902, -178.135, ..., 51.925, 83.791, 99.866]),
'f': array([5.5, 5.1, 5. , ..., 5. , 5.2, 5. ]),
'fhat': array([5.55678619, 5.05719136, 5.02115785, ..., 5.49869582, 5.45532648,
5.42616814])}