2nd ST-GCN Example: Dividing Train and Test

ST-GCN

Author

SEOYEON CHOI

Published

January 17, 2023

Dividing the data into train and test sets (ST-GCN, WikiMathsDatasetLoader)

import

import rpy2
import rpy2.robjects as ro 
from rpy2.robjects.vectors import FloatVector 
from rpy2.robjects.packages import importr

import torch
import numpy as np
from tqdm import tqdm

import torch.nn.functional as F
from torch_geometric_temporal.nn.recurrent import GConvGRU

import matplotlib.pyplot as plt
import pandas as pd

import time

from scipy.interpolate import interp1d

class RecurrentGCN(torch.nn.Module):
    """Recurrent graph layer followed by ReLU and a linear read-out.

    The forward pass feeds the node features through ``GConvGRU``, applies
    ReLU to the result and maps it to a single output value per node.
    """

    def __init__(self, node_features, filters):
        """Create the layers.

        Args:
            node_features: Passed to ``GConvGRU`` as its first argument.
            filters: Passed to ``GConvGRU`` as its second argument and used
                as the input width of the final linear layer.
        """
        super().__init__()
        # The third GConvGRU argument is fixed at 2
        # (presumably the Chebyshev filter size K - TODO confirm).
        self.recurrent = GConvGRU(node_features, filters, 2)
        self.linear = torch.nn.Linear(filters, 1)

    def forward(self, x, edge_index, edge_weight):
        """Return the linear read-out of the ReLU-activated recurrent output."""
        hidden = self.recurrent(x, edge_index, edge_weight)
        return self.linear(F.relu(hidden))

Data

from torch_geometric_temporal.dataset import WikiMathsDatasetLoader
from torch_geometric_temporal.signal import temporal_signal_split

# Create the loader for the WikiMaths dataset.
loader = WikiMathsDatasetLoader()

# Build the snapshot sequence with lags=1.
dataset = loader.get_dataset(lags=1)

# Split the snapshot sequence into a train part and a test part (train_ratio=0.8).
train_dataset, test_dataset = temporal_signal_split(dataset, train_ratio=0.8)

Train

# Materialise the training snapshots as [index, snapshot] pairs.
# NOTE(review): the loop variable `time` shadows the `time` module imported
# at the top of the file and is read again after the loop, where it holds the
# last index produced by enumerate.
data_train=[]
for time, snapshot in enumerate(train_dataset):
    data_train.append([time,snapshot])

# Shapes of the first snapshot: node features, targets, edge index, edge attributes.
data_train[0][1].x.shape,data_train[0][1].y.shape,data_train[0][1].edge_index.shape,data_train[0][1].edge_attr.shape

(torch.Size([1068, 1]),
 torch.Size([1068]),
 torch.Size([2, 27079]),
 torch.Size([27079]))

# Last index produced by the enumerate loop over train_dataset.
time

# NOTE(review): `time` is the last index, so T_train is one less than
# len(data_train); the range(time) loops that build x_train / y_train use the
# same value and therefore leave out the final training snapshot. Kept as-is
# because later reshapes and slices depend on T_train.
T_train = time
# Number of nodes, read from the first training snapshot.
N = len(data_train[0][1].x)

# Graph structure is taken from the first training snapshot and passed
# unchanged to every model call further down.
edge_index = data_train[0][1].edge_index
edge_attr = data_train[0][1].edge_attr

# Node features of the first `time` training snapshots.
# NOTE(review): `time` is the last enumerate index, so the final entry of
# data_train is not included here.
x_train = [data_train[i][1].x for i in range(time)]

# Stack once along a new leading axis -> (time, nodes, features). This avoids
# re-copying a growing tensor with torch.cat on every iteration and does not
# hard-code the node count.
x_train = torch.stack(x_train)
x_train.shape

torch.Size([583, 1068, 1])

# Targets of the first `time` training snapshots.
# NOTE(review): `time` is the last enumerate index, so the final entry of
# data_train is not included here.
y_train = [data_train[i][1].y for i in range(time)]

# Stack once along a new leading axis -> (time, nodes) instead of growing a
# tensor with repeated torch.cat calls and reshaping with a hard-coded size.
y_train = torch.stack(y_train)
y_train.shape

torch.Size([583, 1068])

# Shapes of the assembled training features and targets.
x_train.shape, y_train.shape

(torch.Size([583, 1068, 1]), torch.Size([583, 1068]))

Test

# Materialise the test snapshots as [index, snapshot] pairs.
# NOTE(review): the loop variable `time` shadows the `time` module imported
# at the top of the file and is read again after the loop, where it holds the
# last index produced by enumerate.
data_test=[]
for time, snapshot in enumerate(test_dataset):
    data_test.append([time,snapshot])

# Shapes of the first test snapshot: node features, targets, edge index, edge attributes.
data_test[0][1].x.shape,data_test[0][1].y.shape,data_test[0][1].edge_index.shape,data_test[0][1].edge_attr.shape

(torch.Size([1068, 1]),
 torch.Size([1068]),
 torch.Size([2, 27079]),
 torch.Size([27079]))

# Last index produced by the enumerate loop over test_dataset.
time

# NOTE(review): `time` is the last index, so T_test is one less than
# len(data_test); the range(time) loops that build x_test / y_test use the
# same value.
T_test = time

# Node features of the first `time` test snapshots.
# NOTE(review): `time` is the last enumerate index, so the final entry of
# data_test is not included here.
x_test = [data_test[i][1].x for i in range(time)]

# Stack once along a new leading axis -> (time, nodes, features). This avoids
# re-copying a growing tensor with torch.cat on every iteration and does not
# hard-code the node count.
x_test = torch.stack(x_test)
x_test.shape

torch.Size([145, 1068, 1])

# Targets of the first `time` test snapshots.
# NOTE(review): `time` is the last enumerate index, so the final entry of
# data_test is not included here.
y_test = [data_test[i][1].y for i in range(time)]

# Stack once along a new leading axis -> (time, nodes) instead of growing a
# tensor with repeated torch.cat calls and reshaping with a hard-coded size.
y_test = torch.stack(y_test)
y_test.shape

torch.Size([145, 1068])

# Shapes of the assembled test features and targets.
x_test.shape, y_test.shape

(torch.Size([145, 1068, 1]), torch.Size([145, 1068]))

Randomly Missing Values

# Flatten to 2-D with N columns; with one feature per node this gives one row
# per time step and one column per node.
x_train = x_train.reshape(-1,N)

# Fixed seed so the same rows are selected on every run.
np.random.seed(90)
# Draw 290 distinct row indices (sampling without replacement).
seed_number = np.random.choice(len(x_train),290,replace=False)

# Every selected row is overwritten entirely, so all N columns of that row become NaN.
x_train[seed_number] = float('nan')

1) Missing Value - Mean

# Work on a copy so x_train keeps its NaN entries for the interpolation variant below.
x_train_mean = x_train.clone()

df = pd.DataFrame(x_train_mean.tolist())
mean_value = df.mean() # per-column mean (one value per column of df)
df = df.fillna(mean_value) # replace each missing value with the mean of its own column

# Back to a torch tensor for the model.
x_train_mean = torch.Tensor(df.values)

ST-GCN

# Mean-imputed series reshaped to (T_train, N, 1) and cast with .float().
mean_f_train = x_train_mean.reshape(T_train,N,1).float()

# Inputs are the first T_train - T_test steps; targets are the same series
# shifted forward by T_test steps, so each input is paired with the value
# T_test steps later. With the shapes shown in this notebook these bounds are
# 438 and 145, the constants that were previously hard-coded.
mean_X = mean_f_train[:T_train - T_test,:,:]
mean_y = mean_f_train[T_test:,:,:]

mean_X.shape,mean_y.shape

(torch.Size([438, 1068, 1]), torch.Size([438, 1068, 1]))

# Fresh model for the mean-imputed series.
model = RecurrentGCN(node_features=1, filters=4)

optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

model.train()

# 50 passes over the series with one optimizer step per (input, target) pair.
# The pairs are iterated directly: an enumerate index is not needed here and
# would only rebind the name `time`.
for epoch in tqdm(range(50)):
    for xt, yt in zip(mean_X, mean_y):
        y_hat = model(xt, edge_index, edge_attr)
        # Mean squared error over all nodes at this time step.
        cost = torch.mean((y_hat-yt)**2)
        cost.backward()
        optimizer.step()
        optimizer.zero_grad()

100%|██████████| 50/50 [04:17<00:00,  5.15s/it]

# The last T_test steps of the training series are the forecast inputs
# (index 438 onward for the shapes shown in this notebook).
mean_X_fore = mean_f_train[T_train - T_test:,:]

# Inference only: no_grad avoids recording an autograd graph for each forward pass.
with torch.no_grad():
    mean_fhat = torch.stack([model(xt, edge_index, edge_attr) for xt in mean_X_fore]).detach().numpy()

# The forecast inputs should have the same shape as x_test.
mean_X_fore.shape,x_test.shape

(torch.Size([145, 1068, 1]), torch.Size([145, 1068, 1]))

2) Missing Value - Linear Interpolation

# Linear interpolation down each column (along the row index); any NaN that
# is still present afterwards is set to 0.
df = pd.DataFrame(x_train.tolist()).interpolate(method='linear').fillna(0)

# Back to a torch tensor shaped (T_train, N, 1) for the model.
x_train_linear = torch.Tensor(df.values).reshape(T_train,N,1)

ST-GCN

# Independent copy of the linearly interpolated series.
linear_f_train = x_train_linear.clone()

# Inputs are the first T_train - T_test steps; targets are the same series
# shifted forward by T_test steps, so each input is paired with the value
# T_test steps later. With the shapes shown in this notebook these bounds are
# 438 and 145, the constants that were previously hard-coded.
linear_X = linear_f_train[:T_train - T_test,:,:]
linear_y = linear_f_train[T_test:,:,:]

# Fresh model for the linearly interpolated series (replaces the previous `model`).
model = RecurrentGCN(node_features=1, filters=4)

optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

model.train()

# 50 passes over the series with one optimizer step per (input, target) pair.
# The pairs are iterated directly: an enumerate index is not needed here and
# would only rebind the name `time`.
for epoch in tqdm(range(50)):
    for xt, yt in zip(linear_X, linear_y):
        y_hat = model(xt, edge_index, edge_attr)
        # Mean squared error over all nodes at this time step.
        cost = torch.mean((y_hat-yt)**2)
        cost.backward()
        optimizer.step()
        optimizer.zero_grad()

100%|██████████| 50/50 [04:20<00:00,  5.22s/it]

# The last T_test steps of the training series are the forecast inputs
# (index 438 onward for the shapes shown in this notebook).
linear_X_fore = linear_f_train[T_train - T_test:,:]

linear_X_fore.shape

torch.Size([145, 1068, 1])

# Inference only: no_grad avoids recording an autograd graph for each forward pass.
with torch.no_grad():
    linear_fhat = torch.stack([model(xt, edge_index, edge_attr) for xt in linear_X_fore]).detach().numpy()

# The forecast inputs should have the same shape as x_test.
linear_X_fore.shape,x_test.shape

(torch.Size([145, 1068, 1]), torch.Size([145, 1068, 1]))

Comparison

MSE

# Squared error between the mean-imputation forecasts and x_test, averaged
# down each column: one MSE value per column of the (T_test, N) frames.
((pd.DataFrame(mean_fhat.reshape(T_test,N)) -  pd.DataFrame(x_test.reshape(T_test,N)))**2).mean()

0       0.802389
1       0.442226
2       1.248909
3       0.389663
4       0.823464
          ...   
1063    0.170553
1064    0.774679
1065    0.701724
1066    1.213499
1067    0.787685
Length: 1068, dtype: float64

# Squared error between the linear-interpolation forecasts and x_test, averaged
# down each column: one MSE value per column of the (T_test, N) frames.
((pd.DataFrame(linear_fhat.reshape(T_test,N)) -  pd.DataFrame(x_test.reshape(T_test,N)))**2).mean()

0       0.802464
1       0.441771
2       1.248262
3       0.389308
4       0.822545
          ...   
1063    0.183206
1064    0.774763
1065    0.690184
1066    1.213436
1067    0.787685
Length: 1068, dtype: float64