[Kaggle] Heart Disease Cleveland

Author

SEOYEON CHOI

Published

December 1, 2025

data

import pandas as pd
import numpy as np

import torch
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import NearestNeighbors

from torch_geometric.nn import GCNConv
import torch.nn.functional as F

from sklearn.cluster import KMeans
# Load the Cleveland heart-disease CSV into a DataFrame and print a
# per-column summary (non-null counts and dtypes).
df = pd.read_csv('../../../delete/Heart_disease_cleveland_new.csv')
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       303 non-null    int64  
 1   sex       303 non-null    int64  
 2   cp        303 non-null    int64  
 3   trestbps  303 non-null    int64  
 4   chol      303 non-null    int64  
 5   fbs       303 non-null    int64  
 6   restecg   303 non-null    int64  
 7   thalach   303 non-null    int64  
 8   exang     303 non-null    int64  
 9   oldpeak   303 non-null    float64
 10  slope     303 non-null    int64  
 11  ca        303 non-null    int64  
 12  thal      303 non-null    int64  
 13  target    303 non-null    int64  
dtypes: float64(1), int64(13)
memory usage: 33.3 KB
df.head(5)
age sex cp trestbps chol fbs restecg thalach exang oldpeak slope ca thal target
0 63 1 0 145 233 1 2 150 0 2.3 2 0 2 0
1 67 1 3 160 286 0 2 108 1 1.5 1 3 1 1
2 67 1 3 120 229 0 2 129 1 2.6 1 2 3 1
3 37 1 2 130 250 0 0 187 0 3.5 2 0 1 0
4 41 0 1 130 204 0 2 172 0 1.4 0 0 1 0
# Standardise every column except the 'target' label in a single step,
# then set up a 5-nearest-neighbour search over the samples.
X = StandardScaler().fit_transform(df.drop(columns=['target']).values)
knn = NearestNeighbors(n_neighbors=5)
knn
NearestNeighbors()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
knn.fit(X)
NearestNeighbors()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
# Neighbour indices only (no distances) for every row of X. The query set is
# the same X the index was fitted on, so each row's own index shows up in its
# neighbour list (it is the first column in the rows printed below).
edges = knn.kneighbors(X,return_distance=False)
edges
array([[  0, 124,  49, 196, 139],
       [  1,  37, 155, 235,  65],
       [  2,  24, 206,  76,  62],
       ...,
       [300, 250,  64, 177, 127],
       [301, 135, 125, 230, 276],
       [302, 269, 283,  35, 190]])
# Flatten the (n_samples, k) neighbour table into a list of
# [source, neighbour] pairs: one directed edge per table entry.
edge_index = [
    [i, j]
    for i, neighbours in enumerate(edges)
    for j in neighbours
]
len(edge_index)
1515
torch.tensor(edge_index).shape
torch.Size([1515, 2])
torch.tensor(edge_index).t().shape
torch.Size([2, 1515])
torch.tensor(edge_index).t().contiguous().shape
torch.Size([2, 1515])
# Turn the (num_edges, 2) pair list into a contiguous (2, num_edges) tensor.
# NOTE: this rebinds `edge_index`, so re-running this cell on its own would
# transpose the already-converted tensor again.
edge_index = torch.tensor(edge_index).transpose(0, 1).contiguous()
# Node feature matrix as 32-bit floats.
x = torch.tensor(X, dtype=torch.float32)
x
tensor([[ 0.9487,  0.6862, -2.2518,  ...,  2.2746, -0.7111,  0.1762],
        [ 1.3920,  0.6862,  0.8780,  ...,  0.6491,  2.5049, -0.8708],
        [ 1.3920,  0.6862,  0.8780,  ...,  0.6491,  1.4329,  1.2232],
        ...,
        [ 0.2838,  0.6862,  0.8780,  ...,  0.6491,  0.3609,  1.2232],
        [ 0.2838, -1.4573, -1.2085,  ...,  0.6491,  0.3609, -0.8708],
        [-1.8217,  0.6862, -0.1653,  ..., -0.9764, -0.7111, -0.8708]])
class GCNencode(torch.nn.Module):
    """Stack of three GCNConv layers with ReLU between them.

    Args:
        in_channels: Number of input features per node.
        hidden_channels: Output widths of the first and second layers.
            Defaults to ``(64, 32)``.
        out_channels: Output width of the third layer. Defaults to ``13``.
    """

    def __init__(self, in_channels, hidden_channels=(64, 32), out_channels=13):
        super().__init__()
        first_width, second_width = hidden_channels
        self.conv1 = GCNConv(in_channels, first_width)
        self.conv2 = GCNConv(first_width, second_width)
        self.conv3 = GCNConv(second_width, out_channels)

    def forward(self, x, edge_index):
        """Run the three convolutions over the graph.

        Args:
            x: Node feature tensor fed to the first layer.
            edge_index: Graph connectivity, passed unchanged to every layer.

        Returns:
            The output of the third convolution; no activation is applied
            after the last layer.
        """
        x = F.relu(self.conv1(x, edge_index))
        x = F.relu(self.conv2(x, edge_index))
        return self.conv3(x, edge_index)
x.shape
torch.Size([303, 13])
x.shape[0]
303
x.shape[1]
13
# The input width is read from the feature matrix (13 columns here).
model = GCNencode(x.shape[1])
model
GCNencode(
  (conv1): GCNConv(13, 64)
  (conv2): GCNConv(64, 32)
  (conv3): GCNConv(32, 13)
)
# Adam over all of the model's parameters, learning rate 0.01.
optimizer = torch.optim.Adam(model.parameters(), lr = 0.01)
x[:,:13]
tensor([[ 0.9487,  0.6862, -2.2518,  ...,  2.2746, -0.7111,  0.1762],
        [ 1.3920,  0.6862,  0.8780,  ...,  0.6491,  2.5049, -0.8708],
        [ 1.3920,  0.6862,  0.8780,  ...,  0.6491,  1.4329,  1.2232],
        ...,
        [ 0.2838,  0.6862,  0.8780,  ...,  0.6491,  0.3609,  1.2232],
        [ 0.2838, -1.4573, -1.2085,  ...,  0.6491,  0.3609, -0.8708],
        [-1.8217,  0.6862, -0.1653,  ..., -0.9764, -0.7111, -0.8708]])
# Train for 100 full-graph steps, minimising the mean squared error between
# the network output and the first 13 columns of the input features.
model.train()  # the mode never changes between steps, so set it once
for epoch in range(100):
    optimizer.zero_grad()

    z = model(x, edge_index)
    loss = F.mse_loss(z, x[:, :13])
    loss.backward()
    optimizer.step()
model.eval()
GCNencode(
  (conv1): GCNConv(13, 64)
  (conv2): GCNConv(64, 32)
  (conv3): GCNConv(32, 13)
)
# Inference only: run the forward pass without recording an autograd graph,
# then hand the result to NumPy.
with torch.no_grad():
    embeddings = model(x, edge_index).numpy()
embeddings
array([[ 0.93777424,  1.1018754 , -2.2913668 , ...,  1.904715  ,
        -0.5608    , -0.23465104],
       [ 1.4721875 ,  0.82115763, -0.11787386, ...,  0.45280465,
         2.554476  , -0.60368335],
       [ 0.7943949 ,  0.8147143 ,  0.9416017 , ...,  0.30953175,
         1.5317379 ,  1.3029628 ],
       ...,
       [ 0.28482667,  0.437107  ,  0.8282606 , ...,  1.0481589 ,
        -0.33025122,  0.8082806 ],
       [ 0.0184263 , -1.4629475 , -1.0692531 , ...,  0.26454   ,
        -0.32980496, -0.8803086 ],
       [-1.9536707 ,  0.6859256 , -0.09410449, ..., -1.2229915 ,
        -0.93635803, -0.6885667 ]], dtype=float32)
embeddings.shape
(303, 13)
# Two-cluster k-means estimator (not fitted yet at this point).
# NOTE(review): no random_state is passed, so the initialisation, and with it
# the cluster labels, may differ from run to run.
kmeans = KMeans(n_clusters=2)
kmeans
KMeans(n_clusters=2)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.