imports

import tqdm
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import plotly.express as px
import warnings
warnings.simplefilter("ignore", np.ComplexWarning)
from haversine import haversine
from IPython.display import HTML

load data and clean it

- load

df= pd.read_csv('https://raw.githubusercontent.com/plotly/datasets/master/earthquakes-23k.csv')
df
Date Latitude Longitude Magnitude
0 01/02/1965 19.2460 145.6160 6.0
1 01/04/1965 1.8630 127.3520 5.8
2 01/05/1965 -20.5790 -173.9720 6.2
3 01/08/1965 -59.0760 -23.5570 5.8
4 01/09/1965 11.9380 126.4270 5.8
... ... ... ... ...
23407 12/28/2016 38.3917 -118.8941 5.6
23408 12/28/2016 38.3777 -118.8957 5.5
23409 12/28/2016 36.9179 140.4262 5.9
23410 12/29/2016 -9.0283 118.6639 6.3
23411 12/30/2016 37.3973 141.4103 5.5

23412 rows × 4 columns

- cleaning

df.Date[df.Date == '1975-02-23T02:58:41.000Z']
3378    1975-02-23T02:58:41.000Z
Name: Date, dtype: object
df.iloc[3378,0] = '02/03/1975'
df.Date[df.Date == '1985-04-28T02:53:41.530Z']
7512    1985-04-28T02:53:41.530Z
Name: Date, dtype: object
df.iloc[7512,0] = '04/28/1985'
df.Date[df.Date == '2011-03-13T02:23:34.520Z']
20650    2011-03-13T02:23:34.520Z
Name: Date, dtype: object
df.iloc[20650,0] = '03/13/2011'
df= df.assign(Year=list(map(lambda x: x.split('/')[-1], df.Date))).iloc[:,1:]
df
Latitude Longitude Magnitude Year
0 19.2460 145.6160 6.0 1965
1 1.8630 127.3520 5.8 1965
2 -20.5790 -173.9720 6.2 1965
3 -59.0760 -23.5570 5.8 1965
4 11.9380 126.4270 5.8 1965
... ... ... ... ...
23407 38.3917 -118.8941 5.6 2016
23408 38.3777 -118.8957 5.5 2016
23409 36.9179 140.4262 5.9 2016
23410 -9.0283 118.6639 6.3 2016
23411 37.3973 141.4103 5.5 2016

23412 rows × 4 columns

df.Year = df.Year.astype(np.float64)

define class

class MooYaHo:
    def __init__(self,df):
        self.df = df 
        self.f = df.Magnitude.to_numpy()
        self.year = df.Year.to_numpy()
        self.lat = df.Latitude.to_numpy()
        self.long = df.Longitude.to_numpy()
        self.n = len(self.f)
        self.theta= None
    def get_distance(self):
        self.D = np.zeros([self.n,self.n])
        locations = np.stack([self.lat, self.long],axis=1)
        for i in tqdm.tqdm(range(self.n)):
            for j in range(i,self.n): 
                self.D[i,j]=haversine(locations[i],locations[j])
        self.D = self.D+self.D.T
    def get_weightmatrix(self,theta=1,beta=0.5,kappa=4000):
        self.theta = theta
        dist = np.where(self.D<kappa,self.D,0)
        self.W = np.exp(-(dist/self.theta)**2)
#         nlst = self.df.groupby('Year').aggregate(len).Latitude.tolist()
#         nlst = [0]+np.cumsum(nlst).tolist()
#         beta_matrix = np.zeros([self.n,self.n])
#         for i in range(len(nlst)-1):
#             beta_matrix[nlst[i]:nlst[i+1],nlst[i]:nlst[i+1]] = beta
#         self.W = self.W * beta_matrix 
#     def _get_laplacian(self):
#         self.L = np.diag(1/np.sqrt(d)) @ (D-self.W) @ np.diag(1/np.sqrt(d))
#         self.lamb, self.Psi = np.linalg.eigh(self.L)
#         self.Lamb = np.diag(self.lamb)  

    def _eigen(self):
        d= self.W.sum(axis=1)
        D= np.diag(d)
        self.L = np.diag(1/np.sqrt(d)) @ (D-self.W) @ np.diag(1/np.sqrt(d))
        self.lamb, self.Psi = np.linalg.eigh(self.L)
        self.Lamb = np.diag(self.lamb)        
    def fit(self,m):
        self._eigen()
        self.fhat = self.Psi[:,0:m]@self.Psi[:,0:m].T@self.f
        self.df = self.df.assign(MagnitudeHat = self.fhat)
        self.df = self.df.assign(Residual = self.df.Magnitude- self.df.MagnitudeHat)
        plt.plot(self.f,'.')
        plt.plot(self.fhat,'x')
        
    def vis(self,MagThresh=7,ResThresh=1):
        fig = px.density_mapbox(self.df, 
                        lat='Latitude', 
                        lon='Longitude', 
                        z='Magnitude', 
                        radius=15,
                        center=dict(lat=37, lon=160), 
                        zoom=1.5,
                        height=900,
                        opacity = 0.3,
                        mapbox_style="stamen-terrain")
        fig.update_layout(margin={"r":0,"t":0,"l":0,"b":0})
        fig.add_scattermapbox(lat = self.df.query('Magnitude > @MagThresh')['Latitude'],
                      lon = self.df.query('Magnitude > @MagThresh')['Longitude'],
                      marker_size= 8,
                      marker_color= 'red',
                      opacity = 1
                      )
        fig.add_scattermapbox(lat = self.df.query('Residual**2 > @ResThresh')['Latitude'],
                      lon = self.df.query('Residual**2 > @ResThresh')['Longitude'],
                      marker_size= 8,
                      marker_color= 'blue',
                      opacity = 1
                      )
        return HTML(fig.to_html(include_mathjax=False, config=dict({'scrollZoom':False})))

analysis

- make instance for analysis

moo=MooYaHo(df.query("Year == 2016"))
  • 테스트용으로 2016년 자료만 수집

- get distance

moo.get_distance()
100%|██████████| 469/469 [00:00<00:00, 2052.10it/s]
moo.D[moo.D>0].mean()
8647.918085847994
  • 0이 아닌 거리의 평균은 8600정도?
plt.hist(moo.D[moo.D>0])
(array([18438., 25088., 29216., 29204., 30978., 28712., 22932., 17002.,
        13158.,  4764.]),
 array([3.09519959e-01, 1.99516074e+03, 3.99001197e+03, 5.98486319e+03,
        7.97971442e+03, 9.97456564e+03, 1.19694169e+04, 1.39642681e+04,
        1.59591193e+04, 1.79539705e+04, 1.99488218e+04]),
 <BarContainer object of 10 artists>)
  • 히스토그램결과 -> 2500보다 거리가 작으면 거의 같은 지역이라고 봐도 무방할듯

- weight matrix

moo.get_weightmatrix(theta=(8647.92),kappa=2500) 
  • 평균적으로 노드는 np.exp(-(dist/8647)**2)=np.exp(-1)=0.36787944117144233 정도의 연결강도를 가진다.
  • 거리가 2500이하이면 weight를 1로 설정한다.

- fit

moo.fit(m=150)
moo.df
Latitude Longitude Magnitude Year MagnitudeHat Residual
22943 -50.5575 139.4489 6.3 2016.0 6.083575 0.216425
22944 -28.6278 -177.2810 5.8 2016.0 5.825087 -0.025087
22945 44.8069 129.9406 5.8 2016.0 5.832089 -0.032089
22946 24.8036 93.6505 6.7 2016.0 6.177317 0.522683
22947 30.6132 132.7337 5.8 2016.0 5.933505 -0.133505
... ... ... ... ... ... ...
23407 38.3917 -118.8941 5.6 2016.0 5.562585 0.037415
23408 38.3777 -118.8957 5.5 2016.0 5.565917 -0.065917
23409 36.9179 140.4262 5.9 2016.0 6.022761 -0.122761
23410 -9.0283 118.6639 6.3 2016.0 6.546979 -0.246979
23411 37.3973 141.4103 5.5 2016.0 5.893543 -0.393543

469 rows × 6 columns

moo.df.query('Magnitude>7')
Latitude Longitude Magnitude Year MagnitudeHat Residual
22969 59.6363 -153.4051 7.1 2016.0 6.443695 0.656305
22976 53.9776 158.5463 7.2 2016.0 6.420937 0.779063
23005 -4.9521 94.3299 7.8 2016.0 6.487492 1.312508
23066 0.3819 -79.9218 7.8 2016.0 6.087327 1.712673
23116 -56.2409 -26.9353 7.2 2016.0 6.000830 1.199170
23193 18.5429 145.5073 7.7 2016.0 6.558712 1.141288
23203 -22.4765 173.1167 7.2 2016.0 6.126411 1.073589
23213 -55.2852 -31.8766 7.4 2016.0 6.132357 1.267643
23230 -0.0456 -17.8255 7.1 2016.0 6.245957 0.854043
23326 -42.7358 173.0499 7.8 2016.0 6.700400 1.099600
23369 -10.6787 161.3214 7.8 2016.0 5.920656 1.879344
23386 -4.5049 153.5216 7.9 2016.0 6.038008 1.861992
23403 -43.4029 -73.9395 7.6 2016.0 6.243961 1.356039
plt.plot((moo.df.Residual)**2)
[<matplotlib.lines.Line2D at 0x7fd98087d400>]

- vis

#moo.vis(MagThresh=7,ResThresh=1) # <- 실행해봐요
  • 전체자료는 underlying에 위치
  • 지진강도가 MagThresh 이상인 자료는 붉은점으로 시각화함
  • 이상치정도(우리가 제안하는 새로운 메져) ResThresh 이상인 자료는 파란점으로 시각화함

한계점 (?)

- 시간효과를 따로 모형화하지는 않음