import GODE
This tutorial accompanies the paper "GODE: Graph Fourier transform based Outlier Detection using Empirical Bayesian thresholding".
0. Import
Import the necessary packages.
import numpy as np
import pandas as pd
from pygsp import graphs, filters, plotting, utils
1. Linear
1.1. Data
Data description
- Graph
- Vertex set
- Graph signal
# Simulate the linear example: the line y = 5x observed with Gaussian noise
# plus a sparse set of large-magnitude outliers.
np.random.seed(6)  # fixed seed so the outputs shown below are reproducible
epsilon = np.around(np.random.normal(size=1000), 15)  # standard normal noise
# Outlier pool: 25 draws from [-7, -5), 25 draws from [5, 7) and 950 zeros.
# np.random.choice samples the pool with replacement, so the number of
# non-zero entries in `signal` is itself random (about 5% on average).
signal = np.random.choice(np.concatenate((np.random.uniform(-7, -5, 25).round(15), np.random.uniform(5, 7, 25).round(15), np.repeat(0, 950))), 1000)
eta = signal + epsilon
# Ground truth labels: 1 where an outlier was injected, 0 otherwise.
outlier_true_linear = signal.copy()
outlier_true_linear = list(map(lambda x: 1 if x != 0 else 0, outlier_true_linear))
index_of_trueoutlier_bool = signal != 0  # same ground truth as a boolean mask
x_1 = np.linspace(0, 2, 1000)
y1_1 = 5 * x_1  # noise-free trend
y_1 = y1_1 + eta  # eta = signal + epsilon
_df = pd.DataFrame({'x': x_1, 'y': y_1})
1.2. GODE
# Build the GODE model for the linear example and fit it with sd=20.
Lin = GODE.Linear(_df)
Lin.fit(sd=20)
# Run the outlier detection on the fitted result. The call returns three
# values; contamination=0.05 is presumably the expected outlier fraction
# (confirm against the GODE documentation).
outlier_old_linear, outlier_linear, outlier_index_linear = GODE.GODE_Anomalous(Lin(_df), contamination=0.05)
1.3. Plot
# Plot the linear example: passes the Lin(_df) result, the boolean mask of the
# true outliers, and the indices of the detected outliers.
GODE.Linear_plot(Lin(_df),index_of_trueoutlier_bool, outlier_index_linear)
Lin(_df)['x'][:10]  # first 10 entries of 'x' in the result
array([0. , 0.002002 , 0.004004 , 0.00600601, 0.00800801,
0.01001001, 0.01201201, 0.01401401, 0.01601602, 0.01801802])
Lin(_df)['y'][:10]  # first 10 entries of 'y' in the result
array([-0.31178367, -6.08358567, 0.23784081, -0.86906177, -2.44674061,
0.96330157, 1.18712379, -1.44402316, 1.71937116, -0.33980351])
Lin(_df)['yhat'][:10]  # first 10 entries of 'yhat' in the result
array([0.26224717, 0.37091714, 0.37104804, 0.3712662 , 0.3715716 ,
0.37196424, 0.37244408, 0.37301111, 0.3736653 , 0.37440661])
1.4. Confusion matrix
# Compare the true labels with the detected labels; conf('GODE') prints the
# metrics shown below.
Conf_linear = GODE.Conf_matrx(outlier_true_linear, outlier_linear)
Conf_linear.conf('GODE')
Accuracy: 0.999
Precision: 1.000
Recall: 0.980
F1 Score: 0.990
# Calling the object displays the metrics as a dict (output below).
Conf_linear()
{'Accuracy': 0.999,
'Precision': 1.0,
'Recall': 0.9803921568627451,
'F1 Score': 0.99009900990099}
2. Orbit
2.1. Data
Data description
- Graph
- Vertex set
- Graph signal
# Simulate the orbit example: a smooth signal on a closed curve, observed with
# Gaussian noise plus a sparse set of outliers.
np.random.seed(777)  # fixed seed so the outputs shown below are reproducible
epsilon = np.around(np.random.normal(size=1000), 15)  # standard normal noise
# Outlier pool: 25 draws from [-4, -1), 25 draws from [1, 4) and 950 zeros,
# sampled with replacement by np.random.choice.
signal = np.random.choice(np.concatenate((np.random.uniform(-4, -1, 25).round(15), np.random.uniform(1, 4, 25).round(15), np.repeat(0, 950))), 1000)
eta = signal + epsilon
pi = np.pi
n = 1000
# Polar coordinates of the curve: n angles covering one turn (end point
# excluded) and a radius oscillating between 4 and 6.
ang = np.linspace(-pi, pi - 2 * pi / n, n)
r = 5 + np.cos(np.linspace(0, 12 * pi, n))
vx = r * np.cos(ang)
vy = r * np.sin(ang)
f1 = 10 * np.sin(np.linspace(0, 6 * pi, n))  # noise-free signal
f = f1 + eta  # observed signal
_df = pd.DataFrame({'x': vx, 'y': vy, 'f1': f1, 'f': f})
# Ground truth labels: 1 where an outlier was injected, 0 otherwise.
outlier_true_orbit = signal.copy()
outlier_true_orbit = list(map(lambda x: 1 if x != 0 else 0, outlier_true_orbit))
index_of_trueoutlier_bool = signal != 0  # same ground truth as a boolean mask
2.2. GODE
# Build the GODE model for the orbit example and fit it with sd=15 using the
# 'Euclidean' method (the progress bar below is printed by this fit).
Or = GODE.Orbit(_df)
Or.fit(sd=15, method='Euclidean')
100%|██████████| 1000/1000 [00:01<00:00, 613.53it/s]
# Run the outlier detection on the fitted orbit result (three return values).
outlier_old_orbit, outlier_orbit, outlier_index_orbit = GODE.GODE_Anomalous(Or(_df), contamination=0.05)
2.3. Plot
# Plot the orbit example: passes the Or(_df) result, the boolean mask of the
# true outliers, and the indices of the detected outliers.
GODE.Orbit_plot(Or(_df),index_of_trueoutlier_bool,outlier_index_orbit)
Or(_df)['x'][:5]  # first 5 entries of 'x' in the result
array([-6. , -5.99916963, -5.9966797 , -5.99253378, -5.98673778])
Or(_df)['y'][:5]  # first 5 entries of 'y' in the result
array([-7.34788079e-16, -3.76943905e-02, -7.53604665e-02, -1.12969981e-01,
-1.50494820e-01])
Or(_df)['f1'][:5]  # first 5 entries of 'f1' in the result
array([0. , 0.18867305, 0.37727893, 0.56575049, 0.75402065])
Or(_df)['f'][:5]  # first 5 entries of 'f' in the result
array([-0.46820879, -0.63415181, 0.31189882, -0.14761143, 1.66037153])
Or(_df)['fhat'][:5]  # first 5 entries of 'fhat' in the result
array([0.08086037, 0.2425556 , 0.40437747, 0.5665164 , 0.72915801])
2.4. Confusion matrix
# Compare the true labels with the detected labels; conf('GODE') prints the
# metrics shown below.
Conf_orbit = GODE.Conf_matrx(outlier_true_orbit, outlier_orbit)
Conf_orbit.conf('GODE')
Accuracy: 0.955
Precision: 0.540
Recall: 0.551
F1 Score: 0.545
# Calling the object displays the metrics as a dict (output below).
Conf_orbit()
{'Accuracy': 0.955,
'Precision': 0.54,
'Recall': 0.5510204081632653,
'F1 Score': 0.5454545454545455}
3. Bunny
3.1. Data
Data description
- Stanford bunny data
# Simulate the bunny example: a heat-diffused impulse on the Stanford bunny
# graph, with Gaussian noise plus sparse outliers to be added on top.
G = graphs.Bunny()
n = G.N
g = filters.Heat(G, tau=75)
n = 2503  # explicit vertex count; overrides the value just read from G.N
np.random.seed(1212)  # fixed seed so the outputs shown below are reproducible
normal = np.around(np.random.normal(size=n), 15)  # standard normal noise
# Exactly 120 outliers (60 in [3, 7), 60 in [-7, -3)) and n-120 zeros,
# shuffled in place so the outlier positions are random.
unif = np.concatenate([np.random.uniform(low=3, high=7, size=60), np.random.uniform(low=-7, high=-3, size=60), np.zeros(n - 120)]); np.random.shuffle(unif)
noise = normal + unif
# Smooth signal: a single impulse at vertex 1000 passed through the heat filter.
f = np.zeros(n)
f[1000] = -3234
f = g.filter(f, method='chebyshev')
# Ground truth labels: 1 where an outlier was injected, 0 otherwise.
outlier_true_bunny = unif.copy()
outlier_true_bunny = list(map(lambda x: 1 if x != 0 else 0, outlier_true_bunny))
index_of_trueoutlier_bool_bunny = unif != 0  # same ground truth as a boolean mask
2023-11-30 13:14:41,369:[WARNING](pygsp.graphs.graph.lmax): The largest eigenvalue G.lmax is not available, we need to estimate it. Explicitly call G.estimate_lmax() or G.compute_fourier_basis() once beforehand to suppress the warning.
G.coords.shape  # bare expression kept from the original cell; has no effect here
_W = G.W.toarray()  # dense copy of the graph weight matrix
# Vertex coordinates, one column per axis; the third axis is negated.
_x = G.coords[:, 0]
_y = G.coords[:, 1]
_z = -G.coords[:, 2]
# 'f1' is the smooth signal, 'f' the observed signal (f1 + noise).
_df = pd.DataFrame({'x': _x, 'y': _y, 'z': _z, 'f1': f, 'f': f + noise, 'noise': noise})
3.2. GODE
# Build the GODE model for the bunny example from the data frame and the
# weight matrix, fit it with sd=20, then run the outlier detection.
bu = GODE.BUNNY(_df, _W)
bu.fit(sd=20)
outlier_old_bunny, outlier_bunny, outlier_index_bunny = GODE.GODE_Anomalous(bu(_df), contamination=0.05)
3.3. Plot
# Plot the bunny example: passes the bu(_df) result, the boolean mask of the
# true outliers, and the indices of the detected outliers.
GODE.Bunny_plot(bu(_df),index_of_trueoutlier_bool_bunny,outlier_index_bunny)
# Display the full result of calling bu on the data (dict printed below).
bu(_df)
{'x': array([ 0.26815193, -0.58456893, -0.02730755, ..., 0.15397547,
-0.45056488, -0.29405249]),
'y': array([ 0.39314334, 0.63468595, 0.33280949, ..., 0.80205526,
0.6207154 , -0.40187451]),
'z': array([-0.13834514, -0.22438843, 0.08658215, ..., 0.33698514,
0.58353051, -0.08647485]),
'f1': array([-1.54422488, -0.03596483, -0.93972715, ..., -0.01924028,
-0.02470869, -0.26266752]),
'f': array([-0.8068728 , -0.65326195, -4.41287087, ..., -2.06257107,
0.72882576, -0.47420275]),
'fhat': array([-1.82796431, 0.04748775, -1.12152947, ..., -0.03652692,
0.06627654, 0.1743586 ])}
3.4. Confusion matrix
# Compare the true labels with the detected labels; conf('GODE') prints the
# metrics shown below.
Conf_bunny = GODE.Conf_matrx(outlier_true_bunny, outlier_bunny)
Conf_bunny.conf('GODE')
Accuracy: 0.988
Precision: 0.864
Recall: 0.900
F1 Score: 0.882
# Calling the object displays the metrics as a dict (output below).
Conf_bunny()
{'Accuracy': 0.9884139033160207,
'Precision': 0.864,
'Recall': 0.9,
'F1 Score': 0.8816326530612244}
4. Earthquake
4.1. Data
Data description
USGS data from
to Graph
Vertex set
, and graph signal
# Load the earthquake records from the CSV file next to this tutorial.
_df = pd.read_csv('./earthquake_tutorial.csv')
# Year is the text before the first '-' in `time`. Rename the columns to the
# names used in this tutorial (x, y, f) and drop the first column.
_df = _df.assign(Year=list(map(lambda x: x.split('-')[0], _df.time))).rename(columns={'latitude': 'x', 'longitude': 'y', 'mag': 'f'}).iloc[:, 1:]
# Year is a string at this point; make it numeric so it can be range-filtered.
_df.Year = _df.Year.astype(np.float64)
4.2. GODE
# Build the GODE model on the rows with 2010 <= Year < 2011 and fit it with
# sd=20 using the 'Haversine' method (the progress bar below is printed by
# this fit).
Er = GODE.Earthquake(_df.query("2010 <= Year < 2011"))
Er.fit(sd=20, method='Haversine')
100%|██████████| 4790/4790 [00:30<00:00, 158.67it/s]
Er(_df.query("2010 <= Year < 2011"))
{'x': array([ 0.663, -19.209, -31.83 , ..., 40.726, 30.646, 26.29 ]),
'y': array([ -26.045, 167.902, -178.135, ..., 51.925, 83.791, 99.866]),
'f': array([5.5, 5.1, 5. , ..., 5. , 5.2, 5. ]),
'fhat': array([5.62793904, 5.15719404, 4.99555904, ..., 5.42038559, 5.27983909,
5.08949907])}
# Run the outlier detection on the fitted result for the 2010 rows (three
# return values).
outlier_old_earthquake, outlier_earthquake, outlier_index_earthquake = GODE.GODE_Anomalous(Er(_df.query("2010 <= Year < 2011")), contamination=0.05)
4.3. Plot
GODE.Earthquake_plot(Er(_df.query("2010 <= Year < 2011")),outlier_index_earthquake,lat_center=37.7749, lon_center=-122.4194,fThresh=7,adjzoom=5,adjmarkersize = 40)