import GODE
This tutorial covers GODE, the method from the paper "Graph Fourier transform based Outlier Detection using Empirical Bayesian thresholding".
0. Import
Import the necessary packages.
import numpy as np
import pandas as pd
from pygsp import graphs, filters, plotting, utils
1. Linear
1.1. Data
Data description
- Graph
- Vertex set
- Graph signal
# Simulated data for the linear example: Gaussian noise plus sparse,
# large-magnitude outliers added to the line y = 5x.
# NOTE(review): the seed literal was dropped by the export; 6 is inferred
# from the y values printed further down -- confirm against the notebook.
np.random.seed(6)
epsilon = np.around(np.random.normal(size=1000), 15)
# 1000 draws (with replacement) from a pool of 25 values in [-7, -5],
# 25 values in [5, 7] and 950 zeros; non-zero draws are the outliers.
signal = np.random.choice(
    np.concatenate((
        np.random.uniform(-7, -5, 25).round(15),
        np.random.uniform(5, 7, 25).round(15),
        np.repeat(0, 950),
    )),
    1000,
)
eta = signal + epsilon

# Ground truth: 1 where an outlier was injected, 0 elsewhere.
outlier_true_linear = [1 if s != 0 else 0 for s in signal]
index_of_trueoutlier_bool = signal != 0

x_1 = np.linspace(0, 2, 1000)
y1_1 = 5 * x_1
y_1 = y1_1 + eta  # eta = signal + epsilon
_df = pd.DataFrame({'x': x_1, 'y': y_1})
1.2. GODE
# Build the GODE object for the linear example from the simulated frame.
Lin = GODE.Linear(_df)
# The detector returns three values; contamination is fixed at 0.05.
outlier_old_linear, outlier_linear, outlier_index_linear = GODE.GODE_Anomalous(
    Lin(_df), contamination=0.05
)
1.3. Plot
GODE.Linear_plot(Lin(_df), index_of_trueoutlier_bool, outlier_index_linear)
# Preview the first ten x values held by the fitted object.
Lin(_df)['x'][:10]
array([0. , 0.002002 , 0.004004 , 0.00600601, 0.00800801,
0.01001001, 0.01201201, 0.01401401, 0.01601602, 0.01801802])
Lin(_df)['y'][:10]
array([-0.31178367, -6.08358567, 0.23784081, -0.86906177, -2.44674061,
0.96330157, 1.18712379, -1.44402316, 1.71937116, -0.33980351])
Lin(_df)['yhat'][:10]
array([0.26224717, 0.37091714, 0.37104804, 0.3712662 , 0.3715716 ,
0.37196424, 0.37244408, 0.37301111, 0.3736653 , 0.37440661])
1.4. Confusion matrix
# Compare the injected-outlier labels with the detector's labels.
Conf_linear = GODE.Conf_matrx(outlier_true_linear, outlier_linear)
Conf_linear.conf('GODE')
Accuracy: 0.999
Precision: 1.000
Recall: 0.980
F1 Score: 0.990
{'Accuracy': 0.999,
'Precision': 1.0,
'Recall': 0.9803921568627451,
'F1 Score': 0.99009900990099}
2. Orbit
2.1. Data
Data description
- Graph
- Vertex set
- Graph signal
# Simulated data for the orbit example: a smooth signal on a closed curve,
# with Gaussian noise plus sparse injected outliers.
# NOTE(review): the export dropped the seed literal and the lines defining
# n, ang, r, vx, vy and f1. The geometry below is reconstructed so that x, y
# and f1 reproduce the leading values printed in section 2.3; the seed (777)
# is an assumption -- confirm both against the original notebook.
np.random.seed(777)
epsilon = np.around(np.random.normal(size=1000), 15)
# 1000 draws (with replacement) from a pool of 25 values in [-4, -1],
# 25 values in [1, 4] and 950 zeros; non-zero draws are the outliers.
signal = np.random.choice(
    np.concatenate((
        np.random.uniform(-4, -1, 25).round(15),
        np.random.uniform(1, 4, 25).round(15),
        np.repeat(0, 950),
    )),
    1000,
)
eta = signal + epsilon

pi = np.pi
n = 1000
ang = np.linspace(-pi, pi - 2 * pi / n, n)
r = 5 + np.cos(np.linspace(0, 12 * pi, n))
vx = r * np.cos(ang)
vy = r * np.sin(ang)
f1 = 10 * np.sin(np.linspace(0, 6 * pi, n))
f = f1 + eta
_df = pd.DataFrame({'x': vx, 'y': vy, 'f1': f1, 'f': f})

# Ground truth: 1 where an outlier was injected, 0 elsewhere.
outlier_true_orbit = [1 if s != 0 else 0 for s in signal]
index_of_trueoutlier_bool = signal != 0
2.2. GODE
Or = GODE.Orbit(_df)
=15,method = 'Euclidean')
# The detector returns three values; contamination is fixed at 0.05.
outlier_old_orbit, outlier_orbit, outlier_index_orbit = GODE.GODE_Anomalous(
    Or(_df), contamination=0.05
)
2.3. Plot
Or(_df)['x'][:5]
array([-6. , -5.99916963, -5.9966797 , -5.99253378, -5.98673778])
Or(_df)['y'][:5]
array([-7.34788079e-16, -3.76943905e-02, -7.53604665e-02, -1.12969981e-01,
Or(_df)['f1'][:5]
array([0. , 0.18867305, 0.37727893, 0.56575049, 0.75402065])
Or(_df)['f'][:5]
array([-0.46820879, -0.63415181, 0.31189882, -0.14761143, 1.66037153])
Or(_df)['fhat'][:5]
array([0.08086037, 0.2425556 , 0.40437747, 0.5665164 , 0.72915801])
2.4. Confusion matrix
# Compare the injected-outlier labels with the detector's labels.
Conf_orbit = GODE.Conf_matrx(outlier_true_orbit, outlier_orbit)
Conf_orbit.conf('GODE')
Accuracy: 0.955
Precision: 0.540
Recall: 0.551
F1 Score: 0.545
{'Accuracy': 0.955,
'Precision': 0.54,
'Recall': 0.5510204081632653,
'F1 Score': 0.5454545454545455}
3. Bunny
3.1. Data
Data description
- Stanford bunny data
# Simulated data for the Stanford bunny example: a heat-kernel-smoothed
# impulse on the bunny graph, with Gaussian noise plus sparse outliers.
G = graphs.Bunny()
# Estimate the largest eigenvalue up front; otherwise pygsp logs a warning
# asking for exactly this call when the filter first needs G.lmax.
G.estimate_lmax()
n = G.N  # 2503 vertices for the bunny graph
g = filters.Heat(G, tau=75)

# NOTE(review): the seed literal was dropped by the export and cannot be
# cross-checked from the printed output; 1212 is an unverified guess --
# restore the original value to reproduce the numbers shown below.
np.random.seed(1212)
normal = np.around(np.random.normal(size=n), 15)
# 60 positive and 60 negative outlier magnitudes, zeros elsewhere, then
# shuffled in place so the outliers land on random vertices.
unif = np.concatenate([
    np.random.uniform(low=3, high=7, size=60),
    np.random.uniform(low=-7, high=-3, size=60),
    np.zeros(n - 120),
])
np.random.shuffle(unif)
noise = normal + unif

# Smooth graph signal: a single impulse at vertex 1000 passed through the
# heat filter.
f = np.zeros(n)
f[1000] = -3234
f = g.filter(f, method='chebyshev')

# Ground truth: 1 where an outlier was injected, 0 elsewhere.
outlier_true_bunny = [1 if u != 0 else 0 for u in unif]
index_of_trueoutlier_bool_bunny = unif != 0
2023-11-30 13:14:41,369:[WARNING](pygsp.graphs.graph.lmax): The largest eigenvalue G.lmax is not available, we need to estimate it. Explicitly call G.estimate_lmax() or G.compute_fourier_basis() once beforehand to suppress the warning.
# Dense weight matrix and vertex coordinates of the bunny graph.
_W = G.W.toarray()
_x = G.coords[:, 0]
_y = G.coords[:, 1]
_z = -G.coords[:, 2]  # third coordinate is negated
# 'f1' is the smooth signal, 'f' the observed (noisy) signal.
_df = pd.DataFrame({'x': _x, 'y': _y, 'z': _z, 'f1': f, 'f': f + noise, 'noise': noise})
3.2. GODE
# Build the GODE object for the bunny example from the frame and weights.
bu = GODE.BUNNY(_df, _W)
# The detector returns three values; contamination is fixed at 0.05.
outlier_old_bunny, outlier_bunny, outlier_index_bunny = GODE.GODE_Anomalous(
    bu(_df), contamination=0.05
)
3.3. Plot
{'x': array([ 0.26815193, -0.58456893, -0.02730755, ..., 0.15397547,
-0.45056488, -0.29405249]),
'y': array([ 0.39314334, 0.63468595, 0.33280949, ..., 0.80205526,
0.6207154 , -0.40187451]),
'z': array([-0.13834514, -0.22438843, 0.08658215, ..., 0.33698514,
0.58353051, -0.08647485]),
'f1': array([-1.54422488, -0.03596483, -0.93972715, ..., -0.01924028,
-0.02470869, -0.26266752]),
'f': array([-0.8068728 , -0.65326195, -4.41287087, ..., -2.06257107,
0.72882576, -0.47420275]),
'fhat': array([-1.82796431, 0.04748775, -1.12152947, ..., -0.03652692,
0.06627654, 0.1743586 ])}
3.4. Confusion matrix
# Compare the injected-outlier labels with the detector's labels.
Conf_bunny = GODE.Conf_matrx(outlier_true_bunny, outlier_bunny)
Conf_bunny.conf('GODE')
Accuracy: 0.988
Precision: 0.864
Recall: 0.900
F1 Score: 0.882
{'Accuracy': 0.9884139033160207,
'Precision': 0.864,
'Recall': 0.9,
'F1 Score': 0.8816326530612244}
4. Earthquake
4.1. Data
Data description
USGS data from
- Vertex set
- Graph signal
# Load the earthquake table and normalise it to the x / y / f column names
# used in the other examples.
_df = pd.read_csv('./earthquake_tutorial.csv')
_df = (
    # Year is the text before the first '-' of the time column, as a float.
    _df.assign(Year=_df['time'].str.split('-').str[0].astype(np.float64))
    .rename(columns={'latitude': 'x', 'longitude': 'y', 'mag': 'f'})
    # Drop the first column (presumably a saved index -- verify against the CSV).
    .iloc[:, 1:]
)
4.2. GODE
# Restrict the analysis to events whose Year is in [2010, 2011).
Er = GODE.Earthquake(_df.query("2010 <= Year < 2011"))
=20, method = 'Haversine')
"2010 <= Year < 2011")) Er(_df.query(
{'x': array([ 0.663, -19.209, -31.83 , ..., 40.726, 30.646, 26.29 ]),
'y': array([ -26.045, 167.902, -178.135, ..., 51.925, 83.791, 99.866]),
'f': array([5.5, 5.1, 5. , ..., 5. , 5.2, 5. ]),
'fhat': array([5.62793904, 5.15719404, 4.99555904, ..., 5.42038559, 5.27983909,
# The detector returns three values; contamination is fixed at 0.05.
outlier_old_earthquake, outlier_earthquake, outlier_index_earthquake = GODE.GODE_Anomalous(
    Er(_df.query("2010 <= Year < 2011")), contamination=0.05
)
4.3. Plot
"2010 <= Year < 2011")),outlier_index_earthquake,lat_center=37.7749, lon_center=-122.4194,fThresh=7,adjzoom=5,adjmarkersize = 40) GODE.Earthquake_plot(Er(_df.query(