쓸 수 있는 real data
어떤 data에서 적용이 되지 않을까?
모든 노드가 연결되었을때 오히려 정확도가 낮아지는 등
결측값이 존재한다면 그 값을 어떤 확률로 0/1을 채워넣어야 할까?
X 수가 일정하지 않는다면?

review data?

x는 단어, y의 class는 별점!

설문조사

할목별 문항을 이진화, class는 설문조사 결과

면담

plt.plot([1,2,3,4,5,6],[np.mean([98.37,99.19,98.66,99.8,97.87]),np.mean([98.37,98.39,99.81,98.43,98.84]),
                       np.mean([98.17,97.98,97.52,98.62,99.42]),np.mean([96.95,99.4,98.85,98.43,97.87]),
                       np.mean([97.76,99.4,98.47,99.8,98.65]),np.mean([96.75,99.8,99.81,99.8,99.42])])

[<matplotlib.lines.Line2D at 0x7f3c4ffab3a0>]

X의 p = 0.1	시도1	시도2	시도3	시도4	시도5
baseline	98.37%	99.19%	98.66%	99.8%	97.87%
p of citation 0.1 vs 0.9	98.37%	98.39%	99.81%	98.43%	98.84%
p of citation 0.2 vs 0.8	98.17%	97.98%	97.52%	98.62%	99.42%
p of citation 0.3 vs 0.7	96.95%	99.4%	98.85%	98.43%	97.87%
p of citation 0.4 vs 0.6	97.76%	99.4%	98.47%	99.8%	98.65%
p of citation 0.5 vs 0.5	96.75%	99.8%	99.81%	99.8%	99.42%

plt.plot([1,2,3,4,5,6],[np.mean([94.8,94.16,94.88,93.75,89.74]),np.mean([98.6,94.16,98.16,97.78,98.42]),
                       np.mean([98.2,98.25,92.01,98.39,93.1]),np.mean([93.2,98.64,97.34,96.57,99.41]),
                       np.mean([96.6,96.11,96.11,93.95,97.44]),np.mean([97.0,96.89,97.54,96.98,97.04])])

[<matplotlib.lines.Line2D at 0x7f3c4ff65520>]

X의 p = 0.2	시도1	시도2	시도3	시도4	시도5
baseline	94.8%	94.16%	94.88%	93.75%	89.74%
p of citation 0.1 vs 0.9	98.6%	94.16%	98.16%	97.78%	98.42%
p of citation 0.2 vs 0.8	98.2%	98.25%	92.01%	98.39%	93.1%
p of citation 0.3 vs 0.7	93.2%	98.64%	97.34%	96.57%	99.41%
p of citation 0.4 vs 0.6	96.6%	96.11%	96.11%	93.95%	97.44%
p of citation 0.5 vs 0.5	97.0%	96.89%	97.54%	96.98%	97.04%

plt.plot([1,2,3,4,5,6],[np.mean([82.41,83.13,85.28,81.66,83.66]),np.mean([92.02,95.93,86.8,91.7,87.8]),
                       np.mean([97.34,96.54,87.01,96.91,99.02]),np.mean([96.73,85.57,97.84,97.1,97.24]),
                       np.mean([97.55,82.93,97.62,90.73,92.91]),np.mean([87.12,83.54,97.19,88.61,83.27])])

[<matplotlib.lines.Line2D at 0x7f3c4ff3d6a0>]

X의 p = 0.3	시도1	시도2	시도3	시도4	시도5
baseline	82.41%	83.13%	85.28%	81.66%	83.66%
p of citation 0.1 vs 0.9	92.02%	95.93%	86.8%	91.7%	87.8%
p of citation 0.2 vs 0.8	97.34%	96.54%	87.01%	96.91%	99.02%
p of citation 0.3 vs 0.7	96.73%	85.57%	97.84%	97.1%	97.24%
p of citation 0.4 vs 0.6	97.55%	82.93%	97.62%	90.73%	92.91%
p of citation 0.5 vs 0.5	87.12%	83.54%	97.19%	88.61%	83.27%

plt.plot([1,2,3,4,5,6],[np.mean([65.56,72.73,69.67,67.7,66.6]),np.mean([87.08,92.23,87.28,90.53,78.51]),
                       np.mean([76.13,92.99,61.06,62.55,67.02]),np.mean([87.87,68.37,66.93,58.02,80.21]),
                       np.mean([86.5,88.45,85.71,60.08,83.19]),np.mean([85.91,71.02,82.0,91.36,88.51])])

[<matplotlib.lines.Line2D at 0x7f3c4fe9e6a0>]

X의 p = 0.4	시도1	시도2	시도3	시도4	시도5
baseline	65.56%	72.73%	69.67%	67.7%	66.6%
p of citation 0.1 vs 0.9	87.08%	92.23%	87.28%	90.53%	78.51%
p of citation 0.2 vs 0.8	76.13%	92.99%	61.06%	62.55%	67.02%
p of citation 0.3 vs 0.7	87.87%	68.37%	66.93%	58.02%	80.21%
p of citation 0.4 vs 0.6	86.5%	88.45%	85.71%	60.08%	83.19%
p of citation 0.5 vs 0.5	85.91%	71.02%	82.0%	91.36%	88.51%

plt.plot([1,2,3,4,5,6],[np.mean([52.76,49.9,47.43,52.92,51.13]),np.mean([43.5,46.97,46.84,49.81,49.28]),
                       np.mean([49.8,49.9,46.84,51.17,48.67]),np.mean([52.36,45.21,46.05,49.22,49.49]),
                       np.mean([45.87,47.36,46.84,50.97,49.28]),np.mean([51.38,48.53,46.84,48.83,51.33])])

[<matplotlib.lines.Line2D at 0x7f3c4fe0b250>]

X의 p = 0.5	시도1	시도2	시도3	시도4	시도5
baseline	52.76%	49.9%	47.43%	52.92%	51.13%
p of citation 0.1 vs 0.9	43.5%	46.97%	46.84%	49.81%	49.28%
p of citation 0.2 vs 0.8	49.8%	49.9%	46.84%	51.17%	48.67%
p of citation 0.3 vs 0.7	52.36%	45.21%	46.05%	49.22%	49.49%
p of citation 0.4 vs 0.6	45.87%	47.36%	46.84%	50.97%	49.28%
p of citation 0.5 vs 0.5	51.38%	48.53%	46.84%	48.83%	51.33%

https://www.kaggle.com/code/itzsalil/starter-mushroom-classification-1a2d7941-1

Import

import os
import pandas as pd
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

2022-06-29 17:02:57.187673: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-06-29 17:02:57.187700: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.

p=0.1

데이터 구성

p=0.1

papers =pd.concat([pd.concat([pd.DataFrame(np.random.choice(2, 1500, p=[p,1-p]).reshape(500,3),columns=['X1','X2','X3']),
           pd.DataFrame(np.random.choice(2, 1500, p=[1-p,p]).reshape(500,3),columns=['X4','X5','X6']),
                             pd.DataFrame(np.array([['Deep learning']*500]).reshape(500,1),columns=['subject'])],axis=1),
            pd.concat([ pd.DataFrame(np.random.choice(2, 1500, p=[1-p,p]).reshape(500,3),columns=['X1','X2','X3']),
              pd.DataFrame(np.random.choice(2, 1500, p=[p,1-p]).reshape(500,3),columns=['X4','X5','X6']),
                      pd.DataFrame(np.array([['Reinforcement learning']*500]).reshape(500,1),columns=['subject'])],axis=1)],axis=0,ignore_index=True).reset_index().rename(columns={'index':'paper_id'})
papers['paper_id'] = papers['paper_id']+1
papers

0.5 vs 0.5

citations =  pd.concat([pd.DataFrame(np.array([np.random.choice(range(1,501),size=(1000,1)),np.random.choice(range(1,501),size=(1000,1))]).reshape(1000,2)),
                          pd.DataFrame(np.array([np.random.choice(range(501,1001),size=(1000,1)),np.random.choice(range(501,1001),size=(1000,1))]).reshape(1000,2)),
                       pd.DataFrame(np.array([np.random.choice(range(1,501),size=(2000,1)),np.random.choice(range(501,1001),size=(2000,1))]).reshape(2000,2))],ignore_index=True).rename(columns={0:'target',1:'source'})

그래프 표현

class_values = sorted(papers["subject"].unique())
class_idx = {name: id for id, name in enumerate(class_values)}
paper_idx = {name: idx for idx, name in enumerate(sorted(papers["paper_id"].unique()))}

papers["paper_id"] = papers["paper_id"].apply(lambda name: paper_idx[name])
citations["source"] = citations["source"].apply(lambda name: paper_idx[name])
citations["target"] = citations["target"].apply(lambda name: paper_idx[name])
papers["subject"] = papers["subject"].apply(lambda value: class_idx[value])

plt.figure(figsize=(10, 10))
colors = papers["subject"].tolist()
cora_graph = nx.from_pandas_edgelist(citations.sample(n=800))
subjects = list(papers[papers["paper_id"].isin(list(cora_graph.nodes))]["subject"])
nx.draw_spring(cora_graph, node_size=15, node_color=subjects)

Test vs Train

train_data, test_data = [], []

for _, group_data in papers.groupby("subject"):
    # Select around 50% of the dataset for training.
    random_selection = np.random.rand(len(group_data.index)) <= 0.5
    train_data.append(group_data[random_selection])
    test_data.append(group_data[~random_selection])

train_data = pd.concat(train_data).sample(frac=1)
test_data = pd.concat(test_data).sample(frac=1)

print("Train data shape:", train_data.shape)
print("Test data shape:", test_data.shape)

Train data shape: (483, 8)
Test data shape: (517, 8)

hidden_units = [32,32]
learning_rate = 0.01
dropout_rate = 0.5
num_epochs = 300
batch_size = 256

def run_experiment(model, x_train, y_train):
    # Compile the model.
    model.compile(
        optimizer=keras.optimizers.Adam(learning_rate),
        loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
        metrics=[keras.metrics.SparseCategoricalAccuracy(name="acc")],
    )
    # Create an early stopping callback.
    early_stopping = keras.callbacks.EarlyStopping(
        monitor="val_acc", patience=50, restore_best_weights=True
    )
    # Fit the model.
    history = model.fit(
        x=x_train,
        y=y_train,
        epochs=num_epochs,
        batch_size=batch_size,
        validation_split=0.15,
        callbacks=[early_stopping],verbose=0,
    )

    return history

def display_learning_curves(history):
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))

    ax1.plot(history.history["loss"])
    ax1.plot(history.history["val_loss"])
    ax1.legend(["train", "test"], loc="upper right")
    ax1.set_xlabel("Epochs")
    ax1.set_ylabel("Loss")

    ax2.plot(history.history["acc"])
    ax2.plot(history.history["val_acc"])
    ax2.legend(["train", "test"], loc="upper right")
    ax2.set_xlabel("Epochs")
    ax2.set_ylabel("Accuracy")
    plt.show()

def create_ffn(hidden_units, dropout_rate, name=None):
    fnn_layers = []

    for units in hidden_units:
        fnn_layers.append(layers.BatchNormalization())
        fnn_layers.append(layers.Dropout(dropout_rate))
        fnn_layers.append(layers.Dense(units, activation=tf.nn.gelu))

    return keras.Sequential(fnn_layers, name=name)

feature_names = set(papers.columns) - {"paper_id", "subject"}
num_features = len(feature_names)
num_classes = len(class_idx)

# Create train and test features as a numpy array.
x_train = train_data[feature_names].to_numpy()
x_test = test_data[feature_names].to_numpy()
# Create train and test targets as a numpy array.
y_train = train_data["subject"]
y_test = test_data["subject"]

def create_baseline_model(hidden_units, num_classes, dropout_rate=0.2):
    inputs = layers.Input(shape=(num_features,), name="input_features")
    x = create_ffn(hidden_units, dropout_rate, name=f"ffn_block1")(inputs)
    for block_idx in range(4):
        # Create an FFN block.
        x1 = create_ffn(hidden_units, dropout_rate, name=f"ffn_block{block_idx + 2}")(x)
        # Add skip connection.
        x = layers.Add(name=f"skip_connection{block_idx + 2}")([x, x1])
    # Compute logits.
    logits = layers.Dense(num_classes, name="logits")(x)
    # Create the model.
    return keras.Model(inputs=inputs, outputs=logits, name="baseline")


baseline_model = create_baseline_model(hidden_units, num_classes, dropout_rate)
baseline_model.summary()

Model: "baseline"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
==================================================================================================
 input_features (InputLayer)    [(None, 6)]          0           []                               
                                                                                                  
 ffn_block1 (Sequential)        (None, 32)           1432        ['input_features[0][0]']         
                                                                                                  
 ffn_block2 (Sequential)        (None, 32)           2368        ['ffn_block1[0][0]']             
                                                                                                  
 skip_connection2 (Add)         (None, 32)           0           ['ffn_block1[0][0]',             
                                                                  'ffn_block2[0][0]']             
                                                                                                  
 ffn_block3 (Sequential)        (None, 32)           2368        ['skip_connection2[0][0]']       
                                                                                                  
 skip_connection3 (Add)         (None, 32)           0           ['skip_connection2[0][0]',       
                                                                  'ffn_block3[0][0]']             
                                                                                                  
 ffn_block4 (Sequential)        (None, 32)           2368        ['skip_connection3[0][0]']       
                                                                                                  
 skip_connection4 (Add)         (None, 32)           0           ['skip_connection3[0][0]',       
                                                                  'ffn_block4[0][0]']             
                                                                                                  
 ffn_block5 (Sequential)        (None, 32)           2368        ['skip_connection4[0][0]']       
                                                                                                  
 skip_connection5 (Add)         (None, 32)           0           ['skip_connection4[0][0]',       
                                                                  'ffn_block5[0][0]']             
                                                                                                  
 logits (Dense)                 (None, 2)            66          ['skip_connection5[0][0]']       
                                                                                                  
==================================================================================================
Total params: 10,970
Trainable params: 10,382
Non-trainable params: 588
__________________________________________________________________________________________________

history = run_experiment(baseline_model, x_train, y_train)

display_learning_curves(history)

_, test_accuracy = baseline_model.evaluate(x=x_test, y=y_test, verbose=0)
print(f"Test accuracy: {round(test_accuracy * 100, 2)}%")

Test accuracy: 97.87%

p = 0.1 Test accuracy: 1) 98.37% 2) 99.19% 3) 98.66% 4) 99.8% 5) 97.87%

baseline 모델 예측

def generate_random_instances(num_instances):
    token_probability = x_train.mean(axis=0)
    instances = []
    for _ in range(num_instances):
        probabilities = np.random.uniform(size=len(token_probability))
        instance = (probabilities <= token_probability).astype(int)
        instances.append(instance)

    return np.array(instances)


def display_class_probabilities(probabilities):
    for instance_idx, probs in enumerate(probabilities):
        print(f"Instance {instance_idx + 1}:")
        for class_idx, prob in enumerate(probs):
            print(f"- {class_values[class_idx]}: {round(prob * 100, 2)}%")

new_instances = generate_random_instances(num_classes)
logits = baseline_model.predict(new_instances)
probabilities = keras.activations.softmax(tf.convert_to_tensor(logits)).numpy()
display_class_probabilities(probabilities)

Instance 1:
- Deep learning: 24.39%
- Reinforcement learning: 75.61%
Instance 2:
- Deep learning: 46.87%
- Reinforcement learning: 53.13%

# Create an edges array (sparse adjacency matrix) of shape [2, num_edges].
edges = citations[["source", "target"]].to_numpy().T
# Create an edge weights array of ones.
edge_weights = tf.ones(shape=edges.shape[1])
# Create a node features array of shape [num_nodes, num_features].
node_features = tf.cast(
    papers.sort_values("paper_id")[feature_names].to_numpy(), dtype=tf.dtypes.float32
)
# Create graph info tuple with node_features, edges, and edge_weights.
graph_info = (node_features, edges, edge_weights)

print("Edges shape:", edges.shape)
print("Edge weight shape:", edge_weights.shape)
print("Nodes shape:", node_features.shape)

Edges shape: (2, 4000)
Edge weight shape: (4000,)
Nodes shape: (1000, 6)

class GraphConvLayer(layers.Layer):
    def __init__(
        self,
        hidden_units,
        dropout_rate=0.2,
        aggregation_type="mean",
        combination_type="concat",
        normalize=False,
        *args,
        **kwargs,
    ):
        super(GraphConvLayer, self).__init__(*args, **kwargs)

        self.aggregation_type = aggregation_type
        self.combination_type = combination_type
        self.normalize = normalize

        self.ffn_prepare = create_ffn(hidden_units, dropout_rate)
        if self.combination_type == "gated":
            self.update_fn = layers.GRU(
                units=hidden_units,
                activation="tanh",
                recurrent_activation="sigmoid",
                dropout=dropout_rate,
                return_state=True,
                recurrent_dropout=dropout_rate,
            )
        else:
            self.update_fn = create_ffn(hidden_units, dropout_rate)

    def prepare(self, node_repesentations, weights=None):
        # node_repesentations shape is [num_edges, embedding_dim].
        messages = self.ffn_prepare(node_repesentations)
        if weights is not None:
            messages = messages * tf.expand_dims(weights, -1)
        return messages

    def aggregate(self, node_indices, neighbour_messages):
        # node_indices shape is [num_edges].
        # neighbour_messages shape: [num_edges, representation_dim].
        num_nodes = tf.math.reduce_max(node_indices) + 1
        if self.aggregation_type == "sum":
            aggregated_message = tf.math.unsorted_segment_sum(
                neighbour_messages, node_indices, num_segments=num_nodes
            )
        elif self.aggregation_type == "mean":
            aggregated_message = tf.math.unsorted_segment_mean(
                neighbour_messages, node_indices, num_segments=num_nodes
            )
        elif self.aggregation_type == "max":
            aggregated_message = tf.math.unsorted_segment_max(
                neighbour_messages, node_indices, num_segments=num_nodes
            )
        else:
            raise ValueError(f"Invalid aggregation type: {self.aggregation_type}.")

        return aggregated_message

    def update(self, node_repesentations, aggregated_messages):
        # node_repesentations shape is [num_nodes, representation_dim].
        # aggregated_messages shape is [num_nodes, representation_dim].
        if self.combination_type == "gru":
            # Create a sequence of two elements for the GRU layer.
            h = tf.stack([node_repesentations, aggregated_messages], axis=1)
        elif self.combination_type == "concat":
            # Concatenate the node_repesentations and aggregated_messages.
            h = tf.concat([node_repesentations, aggregated_messages], axis=1)
        elif self.combination_type == "add":
            # Add node_repesentations and aggregated_messages.
            h = node_repesentations + aggregated_messages
        else:
            raise ValueError(f"Invalid combination type: {self.combination_type}.")

        # Apply the processing function.
        node_embeddings = self.update_fn(h)
        if self.combination_type == "gru":
            node_embeddings = tf.unstack(node_embeddings, axis=1)[-1]

        if self.normalize:
            node_embeddings = tf.nn.l2_normalize(node_embeddings, axis=-1)
        return node_embeddings

    def call(self, inputs):
        """Process the inputs to produce the node_embeddings.

        inputs: a tuple of three elements: node_repesentations, edges, edge_weights.
        Returns: node_embeddings of shape [num_nodes, representation_dim].
        """

        node_repesentations, edges, edge_weights = inputs
        # Get node_indices (source) and neighbour_indices (target) from edges.
        node_indices, neighbour_indices = edges[0], edges[1]
        # neighbour_repesentations shape is [num_edges, representation_dim].
        neighbour_repesentations = tf.gather(node_repesentations, neighbour_indices)

        # Prepare the messages of the neighbours.
        neighbour_messages = self.prepare(neighbour_repesentations, edge_weights)
        # Aggregate the neighbour messages.
        aggregated_messages = self.aggregate(node_indices, neighbour_messages)
        # Update the node embedding with the neighbour messages.
        return self.update(node_repesentations, aggregated_messages)

class GNNNodeClassifier(tf.keras.Model):
    def __init__(
        self,
        graph_info,
        num_classes,
        hidden_units,
        aggregation_type="sum",
        combination_type="concat",
        dropout_rate=0.2,
        normalize=True,
        *args,
        **kwargs,
    ):
        super(GNNNodeClassifier, self).__init__(*args, **kwargs)

        # Unpack graph_info to three elements: node_features, edges, and edge_weight.
        node_features, edges, edge_weights = graph_info
        self.node_features = node_features
        self.edges = edges
        self.edge_weights = edge_weights
        # Set edge_weights to ones if not provided.
        if self.edge_weights is None:
            self.edge_weights = tf.ones(shape=edges.shape[1])
        # Scale edge_weights to sum to 1.
        self.edge_weights = self.edge_weights / tf.math.reduce_sum(self.edge_weights)

        # Create a process layer.
        self.preprocess = create_ffn(hidden_units, dropout_rate, name="preprocess")
        # Create the first GraphConv layer.
        self.conv1 = GraphConvLayer(
            hidden_units,
            dropout_rate,
            aggregation_type,
            combination_type,
            normalize,
            name="graph_conv1",
        )
        # Create the second GraphConv layer.
        self.conv2 = GraphConvLayer(
            hidden_units,
            dropout_rate,
            aggregation_type,
            combination_type,
            normalize,
            name="graph_conv2",
        )
        # Create a postprocess layer.
        self.postprocess = create_ffn(hidden_units, dropout_rate, name="postprocess")
        # Create a compute logits layer.
        self.compute_logits = layers.Dense(units=num_classes, name="logits")

    def call(self, input_node_indices):
        # Preprocess the node_features to produce node representations.
        x = self.preprocess(self.node_features)
        # Apply the first graph conv layer.
        x1 = self.conv1((x, self.edges, self.edge_weights))
        # Skip connection.
        x = x1 + x
        # Apply the second graph conv layer.
        x2 = self.conv2((x, self.edges, self.edge_weights))
        # Skip connection.
        x = x2 + x
        # Postprocess node embedding.
        x = self.postprocess(x)
        # Fetch node embeddings for the input node_indices.
        node_embeddings = tf.gather(x, input_node_indices)
        # Compute logits
        return self.compute_logits(node_embeddings)

Train the GNN model

gnn_model = GNNNodeClassifier(
    graph_info=graph_info,
    num_classes=num_classes,
    hidden_units=hidden_units,
    dropout_rate=dropout_rate,
    name="gnn_model",
)

print("GNN output shape:", gnn_model([1, 10, 100]))

gnn_model.summary()

GNN output shape: tf.Tensor(
[[-0.04469565 -0.05246405]
 [-0.16625741 -0.04515836]
 [-0.16618755 -0.04519066]], shape=(3, 2), dtype=float32)
Model: "gnn_model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
=================================================================
 preprocess (Sequential)     (1000, 32)                1432      
                                                                 
 graph_conv1 (GraphConvLayer  multiple                 5888      
 )                                                               
                                                                 
 graph_conv2 (GraphConvLayer  multiple                 5888      
 )                                                               
                                                                 
 postprocess (Sequential)    (1000, 32)                2368      
                                                                 
 logits (Dense)              multiple                  66        
                                                                 
=================================================================
Total params: 15,642
Trainable params: 14,798
Non-trainable params: 844
_________________________________________________________________

x_train = train_data.paper_id.to_numpy()
history = run_experiment(gnn_model, x_train, y_train)

display_learning_curves(history)

x_test = test_data.paper_id.to_numpy()
_, test_accuracy = gnn_model.evaluate(x=x_test, y=y_test, verbose=0)
print(f"Test accuracy: {round(test_accuracy * 100, 2)}%")

Test accuracy: 99.42%

p = 0.1, 0.5 vs 0.5 Test accuracy: 1) 96.75% 2) 99.8% 3) 99.81% 4) 99.8% 5) 99.42%

# First we add the N new_instances as nodes to the graph
# by appending the new_instance to node_features.
num_nodes = node_features.shape[0]
new_node_features = np.concatenate([node_features, new_instances])
# Second we add the M edges (citations) from each new node to a set
# of existing nodes in a particular subject
new_node_indices = [i + num_nodes for i in range(num_classes)]
new_citations = []
for subject_idx, group in papers.groupby("subject"):
    subject_papers = list(group.paper_id)
    # Select random x papers specific subject.
    selected_paper_indices1 = np.random.choice(subject_papers, 5)
    # Select random y papers from any subject (where y < x).
    selected_paper_indices2 = np.random.choice(list(papers.paper_id), 2)
    # Merge the selected paper indices.
    selected_paper_indices = np.concatenate(
        [selected_paper_indices1, selected_paper_indices2], axis=0
    )
    # Create edges between a citing paper idx and the selected cited papers.
    citing_paper_indx = new_node_indices[subject_idx]
    for cited_paper_idx in selected_paper_indices:
        new_citations.append([citing_paper_indx, cited_paper_idx])

new_citations = np.array(new_citations).T
new_edges = np.concatenate([edges, new_citations], axis=1)

print("Original node_features shape:", gnn_model.node_features.shape)
print("Original edges shape:", gnn_model.edges.shape)
gnn_model.node_features = new_node_features
gnn_model.edges = new_edges
gnn_model.edge_weights = tf.ones(shape=new_edges.shape[1])
print("New node_features shape:", gnn_model.node_features.shape)
print("New edges shape:", gnn_model.edges.shape)

logits = gnn_model.predict(tf.convert_to_tensor(new_node_indices))
probabilities = keras.activations.softmax(tf.convert_to_tensor(logits)).numpy()
display_class_probabilities(probabilities)

Original node_features shape: (1000, 6)
Original edges shape: (2, 4000)
New node_features shape: (1002, 6)
New edges shape: (2, 4014)
Instance 1:
- Deep learning: 93.17%
- Reinforcement learning: 6.83%
Instance 2:
- Deep learning: 66.46%
- Reinforcement learning: 33.54%

p = 0.1, 0.1 vs 0.9

citations =  pd.concat([pd.DataFrame(np.array([np.random.choice(range(1,501),size=(1800,1)),np.random.choice(range(1,501),size=(1800,1))]).reshape(1800,2)),
                          pd.DataFrame(np.array([np.random.choice(range(501,1001),size=(1800,1)),np.random.choice(range(501,1001),size=(1800,1))]).reshape(1800,2)),
                       pd.DataFrame(np.array([np.random.choice(range(1,501),size=(400,1)),np.random.choice(range(501,1001),size=(400,1))]).reshape(400,2))],ignore_index=True).rename(columns={0:'target',1:'source'})

citations["source"] = citations["source"].apply(lambda name: paper_idx[name])
citations["target"] = citations["target"].apply(lambda name: paper_idx[name])

edges = citations[["source", "target"]].to_numpy().T
edge_weights = tf.ones(shape=edges.shape[1])
node_features = tf.cast(
    papers.sort_values("paper_id")[feature_names].to_numpy(), dtype=tf.dtypes.float32
)
graph_info = (node_features, edges, edge_weights)

Train the GNN model

gnn_model = GNNNodeClassifier(
    graph_info=graph_info,
    num_classes=num_classes,
    hidden_units=hidden_units,
    dropout_rate=dropout_rate,
    name="gnn_model",
)

history = run_experiment(gnn_model, x_train, y_train)

display_learning_curves(history)

x_test = test_data.paper_id.to_numpy()
_, test_accuracy = gnn_model.evaluate(x=x_test, y=y_test, verbose=0)
print(f"Test accuracy: {round(test_accuracy * 100, 2)}%")

Test accuracy: 98.84%

p = 0.1, 0.1 vs 0.9 Test accuracy: 1) 98.37% 2) 98.39% 3) 99.81% 4) 98.43% 5) 98.84%

num_nodes = node_features.shape[0]
new_node_features = np.concatenate([node_features, new_instances])
new_node_indices = [i + num_nodes for i in range(num_classes)]
new_citations = []
for subject_idx, group in papers.groupby("subject"):
    subject_papers = list(group.paper_id)
    # Select random x papers specific subject.
    selected_paper_indices1 = np.random.choice(subject_papers, 5)
    # Select random y papers from any subject (where y < x).
    selected_paper_indices2 = np.random.choice(list(papers.paper_id), 2)
    # Merge the selected paper indices.
    selected_paper_indices = np.concatenate(
        [selected_paper_indices1, selected_paper_indices2], axis=0
    )
    # Create edges between a citing paper idx and the selected cited papers.
    citing_paper_indx = new_node_indices[subject_idx]
    for cited_paper_idx in selected_paper_indices:
        new_citations.append([citing_paper_indx, cited_paper_idx])

new_citations = np.array(new_citations).T
new_edges = np.concatenate([edges, new_citations], axis=1)

print("Original node_features shape:", gnn_model.node_features.shape)
print("Original edges shape:", gnn_model.edges.shape)
gnn_model.node_features = new_node_features
gnn_model.edges = new_edges
gnn_model.edge_weights = tf.ones(shape=new_edges.shape[1])
print("New node_features shape:", gnn_model.node_features.shape)
print("New edges shape:", gnn_model.edges.shape)

logits = gnn_model.predict(tf.convert_to_tensor(new_node_indices))
probabilities = keras.activations.softmax(tf.convert_to_tensor(logits)).numpy()
display_class_probabilities(probabilities)

Original node_features shape: (1000, 6)
Original edges shape: (2, 4000)
New node_features shape: (1002, 6)
New edges shape: (2, 4014)
Instance 1:
- Deep learning: 77.03%
- Reinforcement learning: 22.97%
Instance 2:
- Deep learning: 43.49%
- Reinforcement learning: 56.51%

p = 0.1, 0.2 vs 0.8

citations =  pd.concat([pd.DataFrame(np.array([np.random.choice(range(1,501),size=(1600,1)),np.random.choice(range(1,501),size=(1600,1))]).reshape(1600,2)),
                          pd.DataFrame(np.array([np.random.choice(range(501,1001),size=(1600,1)),np.random.choice(range(501,1001),size=(1600,1))]).reshape(1600,2)),
                       pd.DataFrame(np.array([np.random.choice(range(1,501),size=(800,1)),np.random.choice(range(501,1001),size=(800,1))]).reshape(800,2))],ignore_index=True).rename(columns={0:'target',1:'source'})

citations["source"] = citations["source"].apply(lambda name: paper_idx[name])
citations["target"] = citations["target"].apply(lambda name: paper_idx[name])

edges = citations[["source", "target"]].to_numpy().T
edge_weights = tf.ones(shape=edges.shape[1])
node_features = tf.cast(
    papers.sort_values("paper_id")[feature_names].to_numpy(), dtype=tf.dtypes.float32
)
graph_info = (node_features, edges, edge_weights)

Train the GNN model

gnn_model = GNNNodeClassifier(
    graph_info=graph_info,
    num_classes=num_classes,
    hidden_units=hidden_units,
    dropout_rate=dropout_rate,
    name="gnn_model",
)

history = run_experiment(gnn_model, x_train, y_train)

display_learning_curves(history)

x_test = test_data.paper_id.to_numpy()
_, test_accuracy = gnn_model.evaluate(x=x_test, y=y_test, verbose=0)
print(f"Test accuracy: {round(test_accuracy * 100, 2)}%")

Test accuracy: 99.42%

p = 0.1, 0.2 vs 0.8 Test accuracy: 1) 98.17% 2) 97.98% 3) 97.52% 4) 98.62% 5) 99.42%

num_nodes = node_features.shape[0]
new_node_features = np.concatenate([node_features, new_instances])
new_node_indices = [i + num_nodes for i in range(num_classes)]
new_citations = []
for subject_idx, group in papers.groupby("subject"):
    subject_papers = list(group.paper_id)
    # Select random x papers specific subject.
    selected_paper_indices1 = np.random.choice(subject_papers, 5)
    # Select random y papers from any subject (where y < x).
    selected_paper_indices2 = np.random.choice(list(papers.paper_id), 2)
    # Merge the selected paper indices.
    selected_paper_indices = np.concatenate(
        [selected_paper_indices1, selected_paper_indices2], axis=0
    )
    # Create edges between a citing paper idx and the selected cited papers.
    citing_paper_indx = new_node_indices[subject_idx]
    for cited_paper_idx in selected_paper_indices:
        new_citations.append([citing_paper_indx, cited_paper_idx])

new_citations = np.array(new_citations).T
new_edges = np.concatenate([edges, new_citations], axis=1)

print("Original node_features shape:", gnn_model.node_features.shape)
print("Original edges shape:", gnn_model.edges.shape)
gnn_model.node_features = new_node_features
gnn_model.edges = new_edges
gnn_model.edge_weights = tf.ones(shape=new_edges.shape[1])
print("New node_features shape:", gnn_model.node_features.shape)
print("New edges shape:", gnn_model.edges.shape)

logits = gnn_model.predict(tf.convert_to_tensor(new_node_indices))
probabilities = keras.activations.softmax(tf.convert_to_tensor(logits)).numpy()
display_class_probabilities(probabilities)

Original node_features shape: (1000, 6)
Original edges shape: (2, 4000)
New node_features shape: (1002, 6)
New edges shape: (2, 4014)
Instance 1:
- Deep learning: 90.78%
- Reinforcement learning: 9.22%
Instance 2:
- Deep learning: 54.59%
- Reinforcement learning: 45.41%

p = 0.1, 0.3 vs 0.7

citations =  pd.concat([pd.DataFrame(np.array([np.random.choice(range(1,501),size=(2800,1)),np.random.choice(range(1,501),size=(2800,1))]).reshape(2800,2)),
                          pd.DataFrame(np.array([np.random.choice(range(501,1001),size=(2800,1)),np.random.choice(range(501,1001),size=(2800,1))]).reshape(2800,2)),
                       pd.DataFrame(np.array([np.random.choice(range(1,501),size=(1200,1)),np.random.choice(range(501,1001),size=(1200,1))]).reshape(1200,2))],ignore_index=True).rename(columns={0:'target',1:'source'})

citations["source"] = citations["source"].apply(lambda name: paper_idx[name])
citations["target"] = citations["target"].apply(lambda name: paper_idx[name])

edges = citations[["source", "target"]].to_numpy().T
edge_weights = tf.ones(shape=edges.shape[1])
node_features = tf.cast(
    papers.sort_values("paper_id")[feature_names].to_numpy(), dtype=tf.dtypes.float32
)
graph_info = (node_features, edges, edge_weights)

Train the GNN model

gnn_model = GNNNodeClassifier(
    graph_info=graph_info,
    num_classes=num_classes,
    hidden_units=hidden_units,
    dropout_rate=dropout_rate,
    name="gnn_model",
)

history = run_experiment(gnn_model, x_train, y_train)

display_learning_curves(history)

x_test = test_data.paper_id.to_numpy()
_, test_accuracy = gnn_model.evaluate(x=x_test, y=y_test, verbose=0)
print(f"Test accuracy: {round(test_accuracy * 100, 2)}%")

Test accuracy: 97.87%

p = 0.1, 0.3 vs 0.7 Test accuracy: 1) 96.95% 2) 99.4% 3) 98.85% 4) 98.43% 5) 97.87%

num_nodes = node_features.shape[0]
new_node_features = np.concatenate([node_features, new_instances])
new_node_indices = [i + num_nodes for i in range(num_classes)]
new_citations = []
for subject_idx, group in papers.groupby("subject"):
    subject_papers = list(group.paper_id)
    # Select random x papers specific subject.
    selected_paper_indices1 = np.random.choice(subject_papers, 5)
    # Select random y papers from any subject (where y < x).
    selected_paper_indices2 = np.random.choice(list(papers.paper_id), 2)
    # Merge the selected paper indices.
    selected_paper_indices = np.concatenate(
        [selected_paper_indices1, selected_paper_indices2], axis=0
    )
    # Create edges between a citing paper idx and the selected cited papers.
    citing_paper_indx = new_node_indices[subject_idx]
    for cited_paper_idx in selected_paper_indices:
        new_citations.append([citing_paper_indx, cited_paper_idx])

new_citations = np.array(new_citations).T
new_edges = np.concatenate([edges, new_citations], axis=1)

print("Original node_features shape:", gnn_model.node_features.shape)
print("Original edges shape:", gnn_model.edges.shape)
gnn_model.node_features = new_node_features
gnn_model.edges = new_edges
gnn_model.edge_weights = tf.ones(shape=new_edges.shape[1])
print("New node_features shape:", gnn_model.node_features.shape)
print("New edges shape:", gnn_model.edges.shape)

logits = gnn_model.predict(tf.convert_to_tensor(new_node_indices))
probabilities = keras.activations.softmax(tf.convert_to_tensor(logits)).numpy()
display_class_probabilities(probabilities)

Original node_features shape: (1000, 6)
Original edges shape: (2, 6800)
New node_features shape: (1002, 6)
New edges shape: (2, 6814)
WARNING:tensorflow:5 out of the last 5 calls to <function Model.make_predict_function.<locals>.predict_function at 0x7f3d10546ca0> triggered tf.function retracing. Tracing is expensive and the excessive number of tracings could be due to (1) creating @tf.function repeatedly in a loop, (2) passing tensors with different shapes, (3) passing Python objects instead of tensors. For (1), please define your @tf.function outside of the loop. For (2), @tf.function has experimental_relax_shapes=True option that relaxes argument shapes that can avoid unnecessary retracing. For (3), please refer to https://www.tensorflow.org/guide/function#controlling_retracing and https://www.tensorflow.org/api_docs/python/tf/function for  more details.
Instance 1:
- Deep learning: 55.63%
- Reinforcement learning: 44.37%
Instance 2:
- Deep learning: 56.03%
- Reinforcement learning: 43.97%

p = 0.1, 0.4 vs 0.6

citations =  pd.concat([pd.DataFrame(np.array([np.random.choice(range(1,501),size=(1200,1)),np.random.choice(range(1,501),size=(1200,1))]).reshape(1200,2)),
                          pd.DataFrame(np.array([np.random.choice(range(501,1001),size=(1200,1)),np.random.choice(range(501,1001),size=(1200,1))]).reshape(1200,2)),
                       pd.DataFrame(np.array([np.random.choice(range(1,501),size=(1600,1)),np.random.choice(range(501,1001),size=(1600,1))]).reshape(1600,2))],ignore_index=True).rename(columns={0:'target',1:'source'})

citations["source"] = citations["source"].apply(lambda name: paper_idx[name])
citations["target"] = citations["target"].apply(lambda name: paper_idx[name])

edges = citations[["source", "target"]].to_numpy().T
edge_weights = tf.ones(shape=edges.shape[1])
node_features = tf.cast(
    papers.sort_values("paper_id")[feature_names].to_numpy(), dtype=tf.dtypes.float32
)
graph_info = (node_features, edges, edge_weights)

Train the GNN model

gnn_model = GNNNodeClassifier(
    graph_info=graph_info,
    num_classes=num_classes,
    hidden_units=hidden_units,
    dropout_rate=dropout_rate,
    name="gnn_model",
)

history = run_experiment(gnn_model, x_train, y_train)

display_learning_curves(history)

x_test = test_data.paper_id.to_numpy()
_, test_accuracy = gnn_model.evaluate(x=x_test, y=y_test, verbose=0)
print(f"Test accuracy: {round(test_accuracy * 100, 2)}%")

Test accuracy: 98.65%

p = 0.1, 0.4 vs 0.6 Test accuracy: 1) 97.76% 2) 99.4% 3) 98.47% 4) 99.8% 5) 98.65%

num_nodes = node_features.shape[0]
new_node_features = np.concatenate([node_features, new_instances])
new_node_indices = [i + num_nodes for i in range(num_classes)]
new_citations = []
for subject_idx, group in papers.groupby("subject"):
    subject_papers = list(group.paper_id)
    # Select random x papers specific subject.
    selected_paper_indices1 = np.random.choice(subject_papers, 5)
    # Select random y papers from any subject (where y < x).
    selected_paper_indices2 = np.random.choice(list(papers.paper_id), 2)
    # Merge the selected paper indices.
    selected_paper_indices = np.concatenate(
        [selected_paper_indices1, selected_paper_indices2], axis=0
    )
    # Create edges between a citing paper idx and the selected cited papers.
    citing_paper_indx = new_node_indices[subject_idx]
    for cited_paper_idx in selected_paper_indices:
        new_citations.append([citing_paper_indx, cited_paper_idx])

new_citations = np.array(new_citations).T
new_edges = np.concatenate([edges, new_citations], axis=1)

print("Original node_features shape:", gnn_model.node_features.shape)
print("Original edges shape:", gnn_model.edges.shape)
gnn_model.node_features = new_node_features
gnn_model.edges = new_edges
gnn_model.edge_weights = tf.ones(shape=new_edges.shape[1])
print("New node_features shape:", gnn_model.node_features.shape)
print("New edges shape:", gnn_model.edges.shape)

logits = gnn_model.predict(tf.convert_to_tensor(new_node_indices))
probabilities = keras.activations.softmax(tf.convert_to_tensor(logits)).numpy()
display_class_probabilities(probabilities)

Original node_features shape: (1000, 6)
Original edges shape: (2, 4000)
New node_features shape: (1002, 6)
New edges shape: (2, 4014)
WARNING:tensorflow:6 out of the last 6 calls to <function Model.make_predict_function.<locals>.predict_function at 0x7f3c8c334c10> triggered tf.function retracing. Tracing is expensive and the excessive number of tracings could be due to (1) creating @tf.function repeatedly in a loop, (2) passing tensors with different shapes, (3) passing Python objects instead of tensors. For (1), please define your @tf.function outside of the loop. For (2), @tf.function has experimental_relax_shapes=True option that relaxes argument shapes that can avoid unnecessary retracing. For (3), please refer to https://www.tensorflow.org/guide/function#controlling_retracing and https://www.tensorflow.org/api_docs/python/tf/function for  more details.
Instance 1:
- Deep learning: 45.11%
- Reinforcement learning: 54.89%
Instance 2:
- Deep learning: 51.81%
- Reinforcement learning: 48.19%

p=0.2

데이터 구성

p=0.2

papers =pd.concat([pd.concat([pd.DataFrame(np.random.choice(2, 1500, p=[p,1-p]).reshape(500,3),columns=['X1','X2','X3']),
           pd.DataFrame(np.random.choice(2, 1500, p=[1-p,p]).reshape(500,3),columns=['X4','X5','X6']),
                             pd.DataFrame(np.array([['Deep learning']*500]).reshape(500,1),columns=['subject'])],axis=1),
            pd.concat([ pd.DataFrame(np.random.choice(2, 1500, p=[1-p,p]).reshape(500,3),columns=['X1','X2','X3']),
              pd.DataFrame(np.random.choice(2, 1500, p=[p,1-p]).reshape(500,3),columns=['X4','X5','X6']),
                      pd.DataFrame(np.array([['Reinforcement learning']*500]).reshape(500,1),columns=['subject'])],axis=1)],axis=0,ignore_index=True).reset_index().rename(columns={'index':'paper_id'})
papers['paper_id'] = papers['paper_id']+1

citations =  pd.concat([pd.DataFrame(np.array([np.random.choice(range(1,501),size=(1000,1)),np.random.choice(range(1,501),size=(1000,1))]).reshape(1000,2)),
                          pd.DataFrame(np.array([np.random.choice(range(501,1001),size=(1000,1)),np.random.choice(range(501,1001),size=(1000,1))]).reshape(1000,2)),
                       pd.DataFrame(np.array([np.random.choice(range(1,501),size=(2000,1)),np.random.choice(range(501,1001),size=(2000,1))]).reshape(2000,2))],ignore_index=True).rename(columns={0:'target',1:'source'})

그래프 표현

class_values = sorted(papers["subject"].unique())
class_idx = {name: id for id, name in enumerate(class_values)}
paper_idx = {name: idx for idx, name in enumerate(sorted(papers["paper_id"].unique()))}

papers["paper_id"] = papers["paper_id"].apply(lambda name: paper_idx[name])
citations["source"] = citations["source"].apply(lambda name: paper_idx[name])
citations["target"] = citations["target"].apply(lambda name: paper_idx[name])
papers["subject"] = papers["subject"].apply(lambda value: class_idx[value])

plt.figure(figsize=(10, 10))
colors = papers["subject"].tolist()
cora_graph = nx.from_pandas_edgelist(citations.sample(n=800))
subjects = list(papers[papers["paper_id"].isin(list(cora_graph.nodes))]["subject"])
nx.draw_spring(cora_graph, node_size=15, node_color=subjects)

Test vs Train

train_data, test_data = [], []

for _, group_data in papers.groupby("subject"):
    # Select around 50% of the dataset for training.
    random_selection = np.random.rand(len(group_data.index)) <= 0.5
    train_data.append(group_data[random_selection])
    test_data.append(group_data[~random_selection])

train_data = pd.concat(train_data).sample(frac=1)
test_data = pd.concat(test_data).sample(frac=1)

print("Train data shape:", train_data.shape)
print("Test data shape:", test_data.shape)

Train data shape: (493, 8)
Test data shape: (507, 8)

hidden_units = [32,32]
learning_rate = 0.01
dropout_rate = 0.5
num_epochs = 300
batch_size = 256

feature_names = set(papers.columns) - {"paper_id", "subject"}
num_features = len(feature_names)
num_classes = len(class_idx)

# Create train and test features as a numpy array.
x_train = train_data[feature_names].to_numpy()
x_test = test_data[feature_names].to_numpy()
# Create train and test targets as a numpy array.
y_train = train_data["subject"]
y_test = test_data["subject"]

history = run_experiment(baseline_model, x_train, y_train)

display_learning_curves(history)

_, test_accuracy = baseline_model.evaluate(x=x_test, y=y_test, verbose=0)
print(f"Test accuracy: {round(test_accuracy * 100, 2)}%")

Test accuracy: 89.74%

p=0.2 Test accuracy: 1) 94.8% 2) 94.16% 3) 94.88% 4) 93.75% 5) 89.74%

baseline 모델 예측

new_instances = generate_random_instances(num_classes)
logits = baseline_model.predict(new_instances)
probabilities = keras.activations.softmax(tf.convert_to_tensor(logits)).numpy()
display_class_probabilities(probabilities)

Instance 1:
- Deep learning: 16.09%
- Reinforcement learning: 83.91%
Instance 2:
- Deep learning: 52.74%
- Reinforcement learning: 47.26%

# Create an edges array (sparse adjacency matrix) of shape [2, num_edges].
edges = citations[["source", "target"]].to_numpy().T
# Create an edge weights array of ones.
edge_weights = tf.ones(shape=edges.shape[1])
# Create a node features array of shape [num_nodes, num_features].
node_features = tf.cast(
    papers.sort_values("paper_id")[feature_names].to_numpy(), dtype=tf.dtypes.float32
)
# Create graph info tuple with node_features, edges, and edge_weights.
graph_info = (node_features, edges, edge_weights)

print("Edges shape:", edges.shape)
print("Edge weight shape:", edge_weights.shape)
print("Nodes shape:", node_features.shape)

Edges shape: (2, 4000)
Edge weight shape: (4000,)
Nodes shape: (1000, 6)

gnn_model = GNNNodeClassifier(
    graph_info=graph_info,
    num_classes=num_classes,
    hidden_units=hidden_units,
    dropout_rate=dropout_rate,
    name="gnn_model",
)

print("GNN output shape:", gnn_model([1, 10, 100]))

gnn_model.summary()

GNN output shape: tf.Tensor(
[[ 0.04556822  0.07794137]
 [-0.06676228  0.00840319]
 [-0.06674671  0.00836377]], shape=(3, 2), dtype=float32)
Model: "gnn_model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
=================================================================
 preprocess (Sequential)     (1000, 32)                1432      
                                                                 
 graph_conv1 (GraphConvLayer  multiple                 5888      
 )                                                               
                                                                 
 graph_conv2 (GraphConvLayer  multiple                 5888      
 )                                                               
                                                                 
 postprocess (Sequential)    (1000, 32)                2368      
                                                                 
 logits (Dense)              multiple                  66        
                                                                 
=================================================================
Total params: 15,642
Trainable params: 14,798
Non-trainable params: 844
_________________________________________________________________

Train the GNN model

x_train = train_data.paper_id.to_numpy()
history = run_experiment(gnn_model, x_train, y_train)

display_learning_curves(history)

x_test = test_data.paper_id.to_numpy()
_, test_accuracy = gnn_model.evaluate(x=x_test, y=y_test, verbose=0)
print(f"Test accuracy: {round(test_accuracy * 100, 2)}%")

Test accuracy: 97.04%

p=0.2, 0.5 vs 0.5 Test accuracy: 1) 97.0% 2) 96.89% 3) 97.54% 4) 96.98% 5) 97.04%

# First we add the N new_instances as nodes to the graph
# by appending the new_instance to node_features.
num_nodes = node_features.shape[0]
new_node_features = np.concatenate([node_features, new_instances])
# Second we add the M edges (citations) from each new node to a set
# of existing nodes in a particular subject
new_node_indices = [i + num_nodes for i in range(num_classes)]
new_citations = []
for subject_idx, group in papers.groupby("subject"):
    subject_papers = list(group.paper_id)
    # Select random x papers specific subject.
    selected_paper_indices1 = np.random.choice(subject_papers, 5)
    # Select random y papers from any subject (where y < x).
    selected_paper_indices2 = np.random.choice(list(papers.paper_id), 2)
    # Merge the selected paper indices.
    selected_paper_indices = np.concatenate(
        [selected_paper_indices1, selected_paper_indices2], axis=0
    )
    # Create edges between a citing paper idx and the selected cited papers.
    citing_paper_indx = new_node_indices[subject_idx]
    for cited_paper_idx in selected_paper_indices:
        new_citations.append([citing_paper_indx, cited_paper_idx])

new_citations = np.array(new_citations).T
new_edges = np.concatenate([edges, new_citations], axis=1)

print("Original node_features shape:", gnn_model.node_features.shape)
print("Original edges shape:", gnn_model.edges.shape)
gnn_model.node_features = new_node_features
gnn_model.edges = new_edges
gnn_model.edge_weights = tf.ones(shape=new_edges.shape[1])
print("New node_features shape:", gnn_model.node_features.shape)
print("New edges shape:", gnn_model.edges.shape)

logits = gnn_model.predict(tf.convert_to_tensor(new_node_indices))
probabilities = keras.activations.softmax(tf.convert_to_tensor(logits)).numpy()
display_class_probabilities(probabilities)

Original node_features shape: (1000, 6)
Original edges shape: (2, 4000)
New node_features shape: (1002, 6)
New edges shape: (2, 4014)
Instance 1:
- Deep learning: 12.82%
- Reinforcement learning: 87.18%
Instance 2:
- Deep learning: 44.16%
- Reinforcement learning: 55.84%

p = 0.2, 0.1 vs 0.9

citations =  pd.concat([pd.DataFrame(np.array([np.random.choice(range(1,501),size=(1800,1)),np.random.choice(range(1,501),size=(1800,1))]).reshape(1800,2)),
                          pd.DataFrame(np.array([np.random.choice(range(501,1001),size=(1800,1)),np.random.choice(range(501,1001),size=(1800,1))]).reshape(1800,2)),
                       pd.DataFrame(np.array([np.random.choice(range(1,501),size=(400,1)),np.random.choice(range(501,1001),size=(400,1))]).reshape(400,2))],ignore_index=True).rename(columns={0:'target',1:'source'})

citations["source"] = citations["source"].apply(lambda name: paper_idx[name])
citations["target"] = citations["target"].apply(lambda name: paper_idx[name])

edges = citations[["source", "target"]].to_numpy().T
edge_weights = tf.ones(shape=edges.shape[1])
node_features = tf.cast(
    papers.sort_values("paper_id")[feature_names].to_numpy(), dtype=tf.dtypes.float32
)
graph_info = (node_features, edges, edge_weights)

Train the GNN model

gnn_model = GNNNodeClassifier(
    graph_info=graph_info,
    num_classes=num_classes,
    hidden_units=hidden_units,
    dropout_rate=dropout_rate,
    name="gnn_model",
)

history = run_experiment(gnn_model, x_train, y_train)

display_learning_curves(history)

x_test = test_data.paper_id.to_numpy()
_, test_accuracy = gnn_model.evaluate(x=x_test, y=y_test, verbose=0)
print(f"Test accuracy: {round(test_accuracy * 100, 2)}%")

Test accuracy: 98.42%

p = 0.2, 0.1 vs 0.9 Test accuracy: 1) 98.6% 2) 94.16% 3) 98.16% 4) 97.78% 5) 98.42%

num_nodes = node_features.shape[0]
new_node_features = np.concatenate([node_features, new_instances])
new_node_indices = [i + num_nodes for i in range(num_classes)]
new_citations = []
for subject_idx, group in papers.groupby("subject"):
    subject_papers = list(group.paper_id)
    # Select random x papers specific subject.
    selected_paper_indices1 = np.random.choice(subject_papers, 5)
    # Select random y papers from any subject (where y < x).
    selected_paper_indices2 = np.random.choice(list(papers.paper_id), 2)
    # Merge the selected paper indices.
    selected_paper_indices = np.concatenate(
        [selected_paper_indices1, selected_paper_indices2], axis=0
    )
    # Create edges between a citing paper idx and the selected cited papers.
    citing_paper_indx = new_node_indices[subject_idx]
    for cited_paper_idx in selected_paper_indices:
        new_citations.append([citing_paper_indx, cited_paper_idx])

new_citations = np.array(new_citations).T
new_edges = np.concatenate([edges, new_citations], axis=1)

print("Original node_features shape:", gnn_model.node_features.shape)
print("Original edges shape:", gnn_model.edges.shape)
gnn_model.node_features = new_node_features
gnn_model.edges = new_edges
gnn_model.edge_weights = tf.ones(shape=new_edges.shape[1])
print("New node_features shape:", gnn_model.node_features.shape)
print("New edges shape:", gnn_model.edges.shape)

logits = gnn_model.predict(tf.convert_to_tensor(new_node_indices))
probabilities = keras.activations.softmax(tf.convert_to_tensor(logits)).numpy()
display_class_probabilities(probabilities)

Original node_features shape: (1000, 6)
Original edges shape: (2, 4000)
New node_features shape: (1002, 6)
New edges shape: (2, 4014)
Instance 1:
- Deep learning: 99.7%
- Reinforcement learning: 0.3%
Instance 2:
- Deep learning: 9.46%
- Reinforcement learning: 90.54%

p = 0.2, 0.2 vs 0.8

citations =  pd.concat([pd.DataFrame(np.array([np.random.choice(range(1,501),size=(1600,1)),np.random.choice(range(1,501),size=(1600,1))]).reshape(1600,2)),
                          pd.DataFrame(np.array([np.random.choice(range(501,1001),size=(1600,1)),np.random.choice(range(501,1001),size=(1600,1))]).reshape(1600,2)),
                       pd.DataFrame(np.array([np.random.choice(range(1,501),size=(800,1)),np.random.choice(range(501,1001),size=(800,1))]).reshape(800,2))],ignore_index=True).rename(columns={0:'target',1:'source'})

citations["source"] = citations["source"].apply(lambda name: paper_idx[name])
citations["target"] = citations["target"].apply(lambda name: paper_idx[name])

edges = citations[["source", "target"]].to_numpy().T
edge_weights = tf.ones(shape=edges.shape[1])
node_features = tf.cast(
    papers.sort_values("paper_id")[feature_names].to_numpy(), dtype=tf.dtypes.float32
)
graph_info = (node_features, edges, edge_weights)

Train the GNN model

gnn_model = GNNNodeClassifier(
    graph_info=graph_info,
    num_classes=num_classes,
    hidden_units=hidden_units,
    dropout_rate=dropout_rate,
    name="gnn_model",
)

history = run_experiment(gnn_model, x_train, y_train)

display_learning_curves(history)

x_test = test_data.paper_id.to_numpy()
_, test_accuracy = gnn_model.evaluate(x=x_test, y=y_test, verbose=0)
print(f"Test accuracy: {round(test_accuracy * 100, 2)}%")

Test accuracy: 93.1%

p = 0.2, 0.2 vs 0.8 Test accuracy: 1) 98.2% 2) 98.25% 3) 92.01% 4) 98.39% 5) 93.1%

num_nodes = node_features.shape[0]
new_node_features = np.concatenate([node_features, new_instances])
new_node_indices = [i + num_nodes for i in range(num_classes)]
new_citations = []
for subject_idx, group in papers.groupby("subject"):
    subject_papers = list(group.paper_id)
    # Select random x papers specific subject.
    selected_paper_indices1 = np.random.choice(subject_papers, 5)
    # Select random y papers from any subject (where y < x).
    selected_paper_indices2 = np.random.choice(list(papers.paper_id), 2)
    # Merge the selected paper indices.
    selected_paper_indices = np.concatenate(
        [selected_paper_indices1, selected_paper_indices2], axis=0
    )
    # Create edges between a citing paper idx and the selected cited papers.
    citing_paper_indx = new_node_indices[subject_idx]
    for cited_paper_idx in selected_paper_indices:
        new_citations.append([citing_paper_indx, cited_paper_idx])

new_citations = np.array(new_citations).T
new_edges = np.concatenate([edges, new_citations], axis=1)

print("Original node_features shape:", gnn_model.node_features.shape)
print("Original edges shape:", gnn_model.edges.shape)
gnn_model.node_features = new_node_features
gnn_model.edges = new_edges
gnn_model.edge_weights = tf.ones(shape=new_edges.shape[1])
print("New node_features shape:", gnn_model.node_features.shape)
print("New edges shape:", gnn_model.edges.shape)

logits = gnn_model.predict(tf.convert_to_tensor(new_node_indices))
probabilities = keras.activations.softmax(tf.convert_to_tensor(logits)).numpy()
display_class_probabilities(probabilities)

Original node_features shape: (1000, 6)
Original edges shape: (2, 4000)
New node_features shape: (1002, 6)
New edges shape: (2, 4014)
Instance 1:
- Deep learning: 20.61%
- Reinforcement learning: 79.39%
Instance 2:
- Deep learning: 31.04%
- Reinforcement learning: 68.96%

p = 0.2, 0.3 vs 0.7

citations =  pd.concat([pd.DataFrame(np.array([np.random.choice(range(1,501),size=(1400,1)),np.random.choice(range(1,501),size=(1400,1))]).reshape(1400,2)),
                          pd.DataFrame(np.array([np.random.choice(range(501,1001),size=(1400,1)),np.random.choice(range(501,1001),size=(1400,1))]).reshape(1400,2)),
                       pd.DataFrame(np.array([np.random.choice(range(1,501),size=(1200,1)),np.random.choice(range(501,1001),size=(1200,1))]).reshape(1200,2))],ignore_index=True).rename(columns={0:'target',1:'source'})

citations["source"] = citations["source"].apply(lambda name: paper_idx[name])
citations["target"] = citations["target"].apply(lambda name: paper_idx[name])

edges = citations[["source", "target"]].to_numpy().T
edge_weights = tf.ones(shape=edges.shape[1])
node_features = tf.cast(
    papers.sort_values("paper_id")[feature_names].to_numpy(), dtype=tf.dtypes.float32
)
graph_info = (node_features, edges, edge_weights)

Train the GNN model

gnn_model = GNNNodeClassifier(
    graph_info=graph_info,
    num_classes=num_classes,
    hidden_units=hidden_units,
    dropout_rate=dropout_rate,
    name="gnn_model",
)

history = run_experiment(gnn_model, x_train, y_train)

display_learning_curves(history)

x_test = test_data.paper_id.to_numpy()
_, test_accuracy = gnn_model.evaluate(x=x_test, y=y_test, verbose=0)
print(f"Test accuracy: {round(test_accuracy * 100, 2)}%")

Test accuracy: 99.41%

p = 0.2, 0.3 vs 0.7 Test accuracy: 1) 93.2% 2) 98.64% 3) 97.34% 4) 96.57% 5) 99.41%

num_nodes = node_features.shape[0]
new_node_features = np.concatenate([node_features, new_instances])
new_node_indices = [i + num_nodes for i in range(num_classes)]
new_citations = []
for subject_idx, group in papers.groupby("subject"):
    subject_papers = list(group.paper_id)
    # Select random x papers specific subject.
    selected_paper_indices1 = np.random.choice(subject_papers, 5)
    # Select random y papers from any subject (where y < x).
    selected_paper_indices2 = np.random.choice(list(papers.paper_id), 2)
    # Merge the selected paper indices.
    selected_paper_indices = np.concatenate(
        [selected_paper_indices1, selected_paper_indices2], axis=0
    )
    # Create edges between a citing paper idx and the selected cited papers.
    citing_paper_indx = new_node_indices[subject_idx]
    for cited_paper_idx in selected_paper_indices:
        new_citations.append([citing_paper_indx, cited_paper_idx])

new_citations = np.array(new_citations).T
new_edges = np.concatenate([edges, new_citations], axis=1)

print("Original node_features shape:", gnn_model.node_features.shape)
print("Original edges shape:", gnn_model.edges.shape)
gnn_model.node_features = new_node_features
gnn_model.edges = new_edges
gnn_model.edge_weights = tf.ones(shape=new_edges.shape[1])
print("New node_features shape:", gnn_model.node_features.shape)
print("New edges shape:", gnn_model.edges.shape)

logits = gnn_model.predict(tf.convert_to_tensor(new_node_indices))
probabilities = keras.activations.softmax(tf.convert_to_tensor(logits)).numpy()
display_class_probabilities(probabilities)

Original node_features shape: (1000, 6)
Original edges shape: (2, 4000)
New node_features shape: (1002, 6)
New edges shape: (2, 4014)
Instance 1:
- Deep learning: 96.88%
- Reinforcement learning: 3.12%
Instance 2:
- Deep learning: 1.36%
- Reinforcement learning: 98.64%

p = 0.2, 0.4 vs 0.6

citations =  pd.concat([pd.DataFrame(np.array([np.random.choice(range(1,501),size=(1200,1)),np.random.choice(range(1,501),size=(1200,1))]).reshape(1200,2)),
                          pd.DataFrame(np.array([np.random.choice(range(501,1001),size=(1200,1)),np.random.choice(range(501,1001),size=(1200,1))]).reshape(1200,2)),
                       pd.DataFrame(np.array([np.random.choice(range(1,501),size=(1600,1)),np.random.choice(range(501,1001),size=(1600,1))]).reshape(1600,2))],ignore_index=True).rename(columns={0:'target',1:'source'})

citations["source"] = citations["source"].apply(lambda name: paper_idx[name])
citations["target"] = citations["target"].apply(lambda name: paper_idx[name])

edges = citations[["source", "target"]].to_numpy().T
edge_weights = tf.ones(shape=edges.shape[1])
node_features = tf.cast(
    papers.sort_values("paper_id")[feature_names].to_numpy(), dtype=tf.dtypes.float32
)
graph_info = (node_features, edges, edge_weights)

Train the GNN model

gnn_model = GNNNodeClassifier(
    graph_info=graph_info,
    num_classes=num_classes,
    hidden_units=hidden_units,
    dropout_rate=dropout_rate,
    name="gnn_model",
)

history = run_experiment(gnn_model, x_train, y_train)

display_learning_curves(history)

x_test = test_data.paper_id.to_numpy()
_, test_accuracy = gnn_model.evaluate(x=x_test, y=y_test, verbose=0)
print(f"Test accuracy: {round(test_accuracy * 100, 2)}%")

Test accuracy: 97.44%

p = 0.2, 0.4 vs 0.6 Test accuracy: 1) 96.6% 2) 96.11% 3) 96.11% 4) 93.95% 5) 97.44%

num_nodes = node_features.shape[0]
new_node_features = np.concatenate([node_features, new_instances])
new_node_indices = [i + num_nodes for i in range(num_classes)]
new_citations = []
for subject_idx, group in papers.groupby("subject"):
    subject_papers = list(group.paper_id)
    # Select random x papers specific subject.
    selected_paper_indices1 = np.random.choice(subject_papers, 5)
    # Select random y papers from any subject (where y < x).
    selected_paper_indices2 = np.random.choice(list(papers.paper_id), 2)
    # Merge the selected paper indices.
    selected_paper_indices = np.concatenate(
        [selected_paper_indices1, selected_paper_indices2], axis=0
    )
    # Create edges between a citing paper idx and the selected cited papers.
    citing_paper_indx = new_node_indices[subject_idx]
    for cited_paper_idx in selected_paper_indices:
        new_citations.append([citing_paper_indx, cited_paper_idx])

new_citations = np.array(new_citations).T
new_edges = np.concatenate([edges, new_citations], axis=1)

print("Original node_features shape:", gnn_model.node_features.shape)
print("Original edges shape:", gnn_model.edges.shape)
gnn_model.node_features = new_node_features
gnn_model.edges = new_edges
gnn_model.edge_weights = tf.ones(shape=new_edges.shape[1])
print("New node_features shape:", gnn_model.node_features.shape)
print("New edges shape:", gnn_model.edges.shape)

logits = gnn_model.predict(tf.convert_to_tensor(new_node_indices))
probabilities = keras.activations.softmax(tf.convert_to_tensor(logits)).numpy()
display_class_probabilities(probabilities)

Original node_features shape: (1000, 6)
Original edges shape: (2, 4000)
New node_features shape: (1002, 6)
New edges shape: (2, 4014)
Instance 1:
- Deep learning: 59.78%
- Reinforcement learning: 40.22%
Instance 2:
- Deep learning: 86.98%
- Reinforcement learning: 13.02%

p=0.3

데이터 구성

p=0.3

papers =pd.concat([pd.concat([pd.DataFrame(np.random.choice(2, 1500, p=[p,1-p]).reshape(500,3),columns=['X1','X2','X3']),
           pd.DataFrame(np.random.choice(2, 1500, p=[1-p,p]).reshape(500,3),columns=['X4','X5','X6']),
                             pd.DataFrame(np.array([['Deep learning']*500]).reshape(500,1),columns=['subject'])],axis=1),
            pd.concat([ pd.DataFrame(np.random.choice(2, 1500, p=[1-p,p]).reshape(500,3),columns=['X1','X2','X3']),
              pd.DataFrame(np.random.choice(2, 1500, p=[p,1-p]).reshape(500,3),columns=['X4','X5','X6']),
                      pd.DataFrame(np.array([['Reinforcement learning']*500]).reshape(500,1),columns=['subject'])],axis=1)],axis=0,ignore_index=True).reset_index().rename(columns={'index':'paper_id'})
papers['paper_id'] = papers['paper_id']+1

citations =  pd.concat([pd.DataFrame(np.array([np.random.choice(range(1,501),size=(1000,1)),np.random.choice(range(1,501),size=(1000,1))]).reshape(1000,2)),
                          pd.DataFrame(np.array([np.random.choice(range(501,1001),size=(1000,1)),np.random.choice(range(501,1001),size=(1000,1))]).reshape(1000,2)),
                       pd.DataFrame(np.array([np.random.choice(range(1,501),size=(2000,1)),np.random.choice(range(501,1001),size=(2000,1))]).reshape(2000,2))],ignore_index=True).rename(columns={0:'target',1:'source'})

그래프 표현

class_values = sorted(papers["subject"].unique())
class_idx = {name: id for id, name in enumerate(class_values)}
paper_idx = {name: idx for idx, name in enumerate(sorted(papers["paper_id"].unique()))}

papers["paper_id"] = papers["paper_id"].apply(lambda name: paper_idx[name])
citations["source"] = citations["source"].apply(lambda name: paper_idx[name])
citations["target"] = citations["target"].apply(lambda name: paper_idx[name])
papers["subject"] = papers["subject"].apply(lambda value: class_idx[value])

plt.figure(figsize=(10, 10))
colors = papers["subject"].tolist()
cora_graph = nx.from_pandas_edgelist(citations.sample(n=800))
subjects = list(papers[papers["paper_id"].isin(list(cora_graph.nodes))]["subject"])
nx.draw_spring(cora_graph, node_size=15, node_color=subjects)

Test vs Train

train_data, test_data = [], []

for _, group_data in papers.groupby("subject"):
    # Select around 50% of the dataset for training.
    random_selection = np.random.rand(len(group_data.index)) <= 0.5
    train_data.append(group_data[random_selection])
    test_data.append(group_data[~random_selection])

train_data = pd.concat(train_data).sample(frac=1)
test_data = pd.concat(test_data).sample(frac=1)

print("Train data shape:", train_data.shape)
print("Test data shape:", test_data.shape)

Train data shape: (492, 8)
Test data shape: (508, 8)

hidden_units = [32,32]
learning_rate = 0.01
dropout_rate = 0.5
num_epochs = 300
batch_size = 256

feature_names = set(papers.columns) - {"paper_id", "subject"}
num_features = len(feature_names)
num_classes = len(class_idx)

# Create train and test features as a numpy array.
x_train = train_data[feature_names].to_numpy()
x_test = test_data[feature_names].to_numpy()
# Create train and test targets as a numpy array.
y_train = train_data["subject"]
y_test = test_data["subject"]

history = run_experiment(baseline_model, x_train, y_train)

display_learning_curves(history)

_, test_accuracy = baseline_model.evaluate(x=x_test, y=y_test, verbose=0)
print(f"Test accuracy: {round(test_accuracy * 100, 2)}%")

Test accuracy: 83.66%

p=0.3 Test accuracy: 1) 82.41% 2) 83.13% 3) 85.28% 4) 81.66% 5) 83.66%

baseline 모델 예측

new_instances = generate_random_instances(num_classes)
logits = baseline_model.predict(new_instances)
probabilities = keras.activations.softmax(tf.convert_to_tensor(logits)).numpy()
display_class_probabilities(probabilities)

Instance 1:
- Deep learning: 69.02%
- Reinforcement learning: 30.98%
Instance 2:
- Deep learning: 44.54%
- Reinforcement learning: 55.46%

# Create an edges array (sparse adjacency matrix) of shape [2, num_edges].
edges = citations[["source", "target"]].to_numpy().T
# Create an edge weights array of ones.
edge_weights = tf.ones(shape=edges.shape[1])
# Create a node features array of shape [num_nodes, num_features].
node_features = tf.cast(
    papers.sort_values("paper_id")[feature_names].to_numpy(), dtype=tf.dtypes.float32
)
# Create graph info tuple with node_features, edges, and edge_weights.
graph_info = (node_features, edges, edge_weights)

print("Edges shape:", edges.shape)
print("Edge weight shape:", edge_weights.shape)
print("Nodes shape:", node_features.shape)

Edges shape: (2, 4000)
Edge weight shape: (4000,)
Nodes shape: (1000, 6)

gnn_model = GNNNodeClassifier(
    graph_info=graph_info,
    num_classes=num_classes,
    hidden_units=hidden_units,
    dropout_rate=dropout_rate,
    name="gnn_model",
)

print("GNN output shape:", gnn_model([1, 10, 100]))

gnn_model.summary()

GNN output shape: tf.Tensor(
[[ 0.02474353 -0.01293221]
 [ 0.00420968 -0.01798705]
 [ 0.00666042 -0.01688628]], shape=(3, 2), dtype=float32)
Model: "gnn_model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
=================================================================
 preprocess (Sequential)     (1000, 32)                1432      
                                                                 
 graph_conv1 (GraphConvLayer  multiple                 5888      
 )                                                               
                                                                 
 graph_conv2 (GraphConvLayer  multiple                 5888      
 )                                                               
                                                                 
 postprocess (Sequential)    (1000, 32)                2368      
                                                                 
 logits (Dense)              multiple                  66        
                                                                 
=================================================================
Total params: 15,642
Trainable params: 14,798
Non-trainable params: 844
_________________________________________________________________

Train the GNN model

x_train = train_data.paper_id.to_numpy()
history = run_experiment(gnn_model, x_train, y_train)

display_learning_curves(history)

x_test = test_data.paper_id.to_numpy()
_, test_accuracy = gnn_model.evaluate(x=x_test, y=y_test, verbose=0)
print(f"Test accuracy: {round(test_accuracy * 100, 2)}%")

Test accuracy: 83.27%

p=0.3, 0.5 vs 0.5 Test accuracy: 1) 87.12% 2) 83.54% 3) 97.19% 4) 88.61% 5) 83.27%

# First we add the N new_instances as nodes to the graph
# by appending the new_instance to node_features.
num_nodes = node_features.shape[0]
new_node_features = np.concatenate([node_features, new_instances])
# Second we add the M edges (citations) from each new node to a set
# of existing nodes in a particular subject
new_node_indices = [i + num_nodes for i in range(num_classes)]
new_citations = []
for subject_idx, group in papers.groupby("subject"):
    subject_papers = list(group.paper_id)
    # Select random x papers specific subject.
    selected_paper_indices1 = np.random.choice(subject_papers, 5)
    # Select random y papers from any subject (where y < x).
    selected_paper_indices2 = np.random.choice(list(papers.paper_id), 2)
    # Merge the selected paper indices.
    selected_paper_indices = np.concatenate(
        [selected_paper_indices1, selected_paper_indices2], axis=0
    )
    # Create edges between a citing paper idx and the selected cited papers.
    citing_paper_indx = new_node_indices[subject_idx]
    for cited_paper_idx in selected_paper_indices:
        new_citations.append([citing_paper_indx, cited_paper_idx])

new_citations = np.array(new_citations).T
new_edges = np.concatenate([edges, new_citations], axis=1)

print("Original node_features shape:", gnn_model.node_features.shape)
print("Original edges shape:", gnn_model.edges.shape)
gnn_model.node_features = new_node_features
gnn_model.edges = new_edges
gnn_model.edge_weights = tf.ones(shape=new_edges.shape[1])
print("New node_features shape:", gnn_model.node_features.shape)
print("New edges shape:", gnn_model.edges.shape)

logits = gnn_model.predict(tf.convert_to_tensor(new_node_indices))
probabilities = keras.activations.softmax(tf.convert_to_tensor(logits)).numpy()
display_class_probabilities(probabilities)

Original node_features shape: (1000, 6)
Original edges shape: (2, 4000)
New node_features shape: (1002, 6)
New edges shape: (2, 4014)
Instance 1:
- Deep learning: 55.75%
- Reinforcement learning: 44.25%
Instance 2:
- Deep learning: 43.46%
- Reinforcement learning: 56.54%

p = 0.3, 0.1 vs 0.9

citations =  pd.concat([pd.DataFrame(np.array([np.random.choice(range(1,501),size=(1800,1)),np.random.choice(range(1,501),size=(1800,1))]).reshape(1800,2)),
                          pd.DataFrame(np.array([np.random.choice(range(501,1001),size=(1800,1)),np.random.choice(range(501,1001),size=(1800,1))]).reshape(1800,2)),
                       pd.DataFrame(np.array([np.random.choice(range(1,501),size=(400,1)),np.random.choice(range(501,1001),size=(400,1))]).reshape(400,2))],ignore_index=True).rename(columns={0:'target',1:'source'})

citations["source"] = citations["source"].apply(lambda name: paper_idx[name])
citations["target"] = citations["target"].apply(lambda name: paper_idx[name])

edges = citations[["source", "target"]].to_numpy().T
edge_weights = tf.ones(shape=edges.shape[1])
node_features = tf.cast(
    papers.sort_values("paper_id")[feature_names].to_numpy(), dtype=tf.dtypes.float32
)
graph_info = (node_features, edges, edge_weights)

Train the GNN model

gnn_model = GNNNodeClassifier(
    graph_info=graph_info,
    num_classes=num_classes,
    hidden_units=hidden_units,
    dropout_rate=dropout_rate,
    name="gnn_model",
)

history = run_experiment(gnn_model, x_train, y_train)

display_learning_curves(history)

x_test = test_data.paper_id.to_numpy()
_, test_accuracy = gnn_model.evaluate(x=x_test, y=y_test, verbose=0)
print(f"Test accuracy: {round(test_accuracy * 100, 2)}%")

Test accuracy: 87.8%

p = 0.3, 0.1 vs 0.9 Test accuracy: 1) 92.02% 2) 95.93% 3) 86.8% 4) 91.7% 5) 87.8%

num_nodes = node_features.shape[0]
new_node_features = np.concatenate([node_features, new_instances])
new_node_indices = [i + num_nodes for i in range(num_classes)]
new_citations = []
for subject_idx, group in papers.groupby("subject"):
    subject_papers = list(group.paper_id)
    # Select random x papers specific subject.
    selected_paper_indices1 = np.random.choice(subject_papers, 5)
    # Select random y papers from any subject (where y < x).
    selected_paper_indices2 = np.random.choice(list(papers.paper_id), 2)
    # Merge the selected paper indices.
    selected_paper_indices = np.concatenate(
        [selected_paper_indices1, selected_paper_indices2], axis=0
    )
    # Create edges between a citing paper idx and the selected cited papers.
    citing_paper_indx = new_node_indices[subject_idx]
    for cited_paper_idx in selected_paper_indices:
        new_citations.append([citing_paper_indx, cited_paper_idx])

new_citations = np.array(new_citations).T
new_edges = np.concatenate([edges, new_citations], axis=1)

print("Original node_features shape:", gnn_model.node_features.shape)
print("Original edges shape:", gnn_model.edges.shape)
gnn_model.node_features = new_node_features
gnn_model.edges = new_edges
gnn_model.edge_weights = tf.ones(shape=new_edges.shape[1])
print("New node_features shape:", gnn_model.node_features.shape)
print("New edges shape:", gnn_model.edges.shape)

logits = gnn_model.predict(tf.convert_to_tensor(new_node_indices))
probabilities = keras.activations.softmax(tf.convert_to_tensor(logits)).numpy()
display_class_probabilities(probabilities)

Original node_features shape: (1000, 6)
Original edges shape: (2, 4000)
New node_features shape: (1002, 6)
New edges shape: (2, 4014)
Instance 1:
- Deep learning: 85.95%
- Reinforcement learning: 14.05%
Instance 2:
- Deep learning: 15.34%
- Reinforcement learning: 84.66%

p = 0.3, 0.2 vs 0.8

citations =  pd.concat([pd.DataFrame(np.array([np.random.choice(range(1,501),size=(1600,1)),np.random.choice(range(1,501),size=(1600,1))]).reshape(1600,2)),
                          pd.DataFrame(np.array([np.random.choice(range(501,1001),size=(1600,1)),np.random.choice(range(501,1001),size=(1600,1))]).reshape(1600,2)),
                       pd.DataFrame(np.array([np.random.choice(range(1,501),size=(800,1)),np.random.choice(range(501,1001),size=(800,1))]).reshape(800,2))],ignore_index=True).rename(columns={0:'target',1:'source'})

citations["source"] = citations["source"].apply(lambda name: paper_idx[name])
citations["target"] = citations["target"].apply(lambda name: paper_idx[name])

edges = citations[["source", "target"]].to_numpy().T
edge_weights = tf.ones(shape=edges.shape[1])
node_features = tf.cast(
    papers.sort_values("paper_id")[feature_names].to_numpy(), dtype=tf.dtypes.float32
)
graph_info = (node_features, edges, edge_weights)

Train the GNN model

gnn_model = GNNNodeClassifier(
    graph_info=graph_info,
    num_classes=num_classes,
    hidden_units=hidden_units,
    dropout_rate=dropout_rate,
    name="gnn_model",
)

history = run_experiment(gnn_model, x_train, y_train)

display_learning_curves(history)

x_test = test_data.paper_id.to_numpy()
_, test_accuracy = gnn_model.evaluate(x=x_test, y=y_test, verbose=0)
print(f"Test accuracy: {round(test_accuracy * 100, 2)}%")

Test accuracy: 99.02%

p = 0.3, 0.2 vs 0.8 Test accuracy: 1) 97.34% 2) 96.54% 3) 87.01% 4) 96.91% 5) 99.02%

num_nodes = node_features.shape[0]
new_node_features = np.concatenate([node_features, new_instances])
new_node_indices = [i + num_nodes for i in range(num_classes)]
new_citations = []
for subject_idx, group in papers.groupby("subject"):
    subject_papers = list(group.paper_id)
    # Select random x papers specific subject.
    selected_paper_indices1 = np.random.choice(subject_papers, 5)
    # Select random y papers from any subject (where y < x).
    selected_paper_indices2 = np.random.choice(list(papers.paper_id), 2)
    # Merge the selected paper indices.
    selected_paper_indices = np.concatenate(
        [selected_paper_indices1, selected_paper_indices2], axis=0
    )
    # Create edges between a citing paper idx and the selected cited papers.
    citing_paper_indx = new_node_indices[subject_idx]
    for cited_paper_idx in selected_paper_indices:
        new_citations.append([citing_paper_indx, cited_paper_idx])

new_citations = np.array(new_citations).T
new_edges = np.concatenate([edges, new_citations], axis=1)

print("Original node_features shape:", gnn_model.node_features.shape)
print("Original edges shape:", gnn_model.edges.shape)
gnn_model.node_features = new_node_features
gnn_model.edges = new_edges
gnn_model.edge_weights = tf.ones(shape=new_edges.shape[1])
print("New node_features shape:", gnn_model.node_features.shape)
print("New edges shape:", gnn_model.edges.shape)

logits = gnn_model.predict(tf.convert_to_tensor(new_node_indices))
probabilities = keras.activations.softmax(tf.convert_to_tensor(logits)).numpy()
display_class_probabilities(probabilities)

Original node_features shape: (1000, 6)
Original edges shape: (2, 4000)
New node_features shape: (1002, 6)
New edges shape: (2, 4014)
Instance 1:
- Deep learning: 99.98%
- Reinforcement learning: 0.02%
Instance 2:
- Deep learning: 0.03%
- Reinforcement learning: 99.97%

p = 0.3, 0.3 vs 0.7

citations =  pd.concat([pd.DataFrame(np.array([np.random.choice(range(1,501),size=(1400,1)),np.random.choice(range(1,501),size=(1400,1))]).reshape(1400,2)),
                          pd.DataFrame(np.array([np.random.choice(range(501,1001),size=(1400,1)),np.random.choice(range(501,1001),size=(1400,1))]).reshape(1400,2)),
                       pd.DataFrame(np.array([np.random.choice(range(1,501),size=(1200,1)),np.random.choice(range(501,1001),size=(1200,1))]).reshape(1200,2))],ignore_index=True).rename(columns={0:'target',1:'source'})

citations["source"] = citations["source"].apply(lambda name: paper_idx[name])
citations["target"] = citations["target"].apply(lambda name: paper_idx[name])

edges = citations[["source", "target"]].to_numpy().T
edge_weights = tf.ones(shape=edges.shape[1])
node_features = tf.cast(
    papers.sort_values("paper_id")[feature_names].to_numpy(), dtype=tf.dtypes.float32
)
graph_info = (node_features, edges, edge_weights)

Train the GNN model

gnn_model = GNNNodeClassifier(
    graph_info=graph_info,
    num_classes=num_classes,
    hidden_units=hidden_units,
    dropout_rate=dropout_rate,
    name="gnn_model",
)

history = run_experiment(gnn_model, x_train, y_train)

display_learning_curves(history)

x_test = test_data.paper_id.to_numpy()
_, test_accuracy = gnn_model.evaluate(x=x_test, y=y_test, verbose=0)
print(f"Test accuracy: {round(test_accuracy * 100, 2)}%")

Test accuracy: 97.24%

p = 0.3, 0.3 vs 0.7 Test accuracy: 1) 96.73% 2) 85.57% 3) 97.84% 4) 97.1% 5) 97.24%

num_nodes = node_features.shape[0]
new_node_features = np.concatenate([node_features, new_instances])
new_node_indices = [i + num_nodes for i in range(num_classes)]
new_citations = []
for subject_idx, group in papers.groupby("subject"):
    subject_papers = list(group.paper_id)
    # Select random x papers specific subject.
    selected_paper_indices1 = np.random.choice(subject_papers, 5)
    # Select random y papers from any subject (where y < x).
    selected_paper_indices2 = np.random.choice(list(papers.paper_id), 2)
    # Merge the selected paper indices.
    selected_paper_indices = np.concatenate(
        [selected_paper_indices1, selected_paper_indices2], axis=0
    )
    # Create edges between a citing paper idx and the selected cited papers.
    citing_paper_indx = new_node_indices[subject_idx]
    for cited_paper_idx in selected_paper_indices:
        new_citations.append([citing_paper_indx, cited_paper_idx])

new_citations = np.array(new_citations).T
new_edges = np.concatenate([edges, new_citations], axis=1)

print("Original node_features shape:", gnn_model.node_features.shape)
print("Original edges shape:", gnn_model.edges.shape)
gnn_model.node_features = new_node_features
gnn_model.edges = new_edges
gnn_model.edge_weights = tf.ones(shape=new_edges.shape[1])
print("New node_features shape:", gnn_model.node_features.shape)
print("New edges shape:", gnn_model.edges.shape)

logits = gnn_model.predict(tf.convert_to_tensor(new_node_indices))
probabilities = keras.activations.softmax(tf.convert_to_tensor(logits)).numpy()
display_class_probabilities(probabilities)

Original node_features shape: (1000, 6)
Original edges shape: (2, 4000)
New node_features shape: (1002, 6)
New edges shape: (2, 4014)
Instance 1:
- Deep learning: 98.62%
- Reinforcement learning: 1.38%
Instance 2:
- Deep learning: 16.43%
- Reinforcement learning: 83.57%

p = 0.3, 0.4 vs 0.6

citations =  pd.concat([pd.DataFrame(np.array([np.random.choice(range(1,501),size=(1200,1)),np.random.choice(range(1,501),size=(1200,1))]).reshape(1200,2)),
                          pd.DataFrame(np.array([np.random.choice(range(501,1001),size=(1200,1)),np.random.choice(range(501,1001),size=(1200,1))]).reshape(1200,2)),
                       pd.DataFrame(np.array([np.random.choice(range(1,501),size=(1600,1)),np.random.choice(range(501,1001),size=(1600,1))]).reshape(1600,2))],ignore_index=True).rename(columns={0:'target',1:'source'})

citations["source"] = citations["source"].apply(lambda name: paper_idx[name])
citations["target"] = citations["target"].apply(lambda name: paper_idx[name])

edges = citations[["source", "target"]].to_numpy().T
edge_weights = tf.ones(shape=edges.shape[1])
node_features = tf.cast(
    papers.sort_values("paper_id")[feature_names].to_numpy(), dtype=tf.dtypes.float32
)
graph_info = (node_features, edges, edge_weights)

Train the GNN model

gnn_model = GNNNodeClassifier(
    graph_info=graph_info,
    num_classes=num_classes,
    hidden_units=hidden_units,
    dropout_rate=dropout_rate,
    name="gnn_model",
)

history = run_experiment(gnn_model, x_train, y_train)

display_learning_curves(history)

x_test = test_data.paper_id.to_numpy()
_, test_accuracy = gnn_model.evaluate(x=x_test, y=y_test, verbose=0)
print(f"Test accuracy: {round(test_accuracy * 100, 2)}%")

Test accuracy: 92.91%

p = 0.3, 0.4 vs 0.6 Test accuracy: 1) 97.55% 2) 82.93% 3) 97.62% 4) 90.73% 5) 92.91%

num_nodes = node_features.shape[0]
new_node_features = np.concatenate([node_features, new_instances])
new_node_indices = [i + num_nodes for i in range(num_classes)]
new_citations = []
for subject_idx, group in papers.groupby("subject"):
    subject_papers = list(group.paper_id)
    # Select random x papers specific subject.
    selected_paper_indices1 = np.random.choice(subject_papers, 5)
    # Select random y papers from any subject (where y < x).
    selected_paper_indices2 = np.random.choice(list(papers.paper_id), 2)
    # Merge the selected paper indices.
    selected_paper_indices = np.concatenate(
        [selected_paper_indices1, selected_paper_indices2], axis=0
    )
    # Create edges between a citing paper idx and the selected cited papers.
    citing_paper_indx = new_node_indices[subject_idx]
    for cited_paper_idx in selected_paper_indices:
        new_citations.append([citing_paper_indx, cited_paper_idx])

new_citations = np.array(new_citations).T
new_edges = np.concatenate([edges, new_citations], axis=1)

print("Original node_features shape:", gnn_model.node_features.shape)
print("Original edges shape:", gnn_model.edges.shape)
gnn_model.node_features = new_node_features
gnn_model.edges = new_edges
gnn_model.edge_weights = tf.ones(shape=new_edges.shape[1])
print("New node_features shape:", gnn_model.node_features.shape)
print("New edges shape:", gnn_model.edges.shape)

logits = gnn_model.predict(tf.convert_to_tensor(new_node_indices))
probabilities = keras.activations.softmax(tf.convert_to_tensor(logits)).numpy()
display_class_probabilities(probabilities)

Original node_features shape: (1000, 6)
Original edges shape: (2, 4000)
New node_features shape: (1002, 6)
New edges shape: (2, 4014)
Instance 1:
- Deep learning: 96.98%
- Reinforcement learning: 3.02%
Instance 2:
- Deep learning: 50.9%
- Reinforcement learning: 49.1%

p=0.4

데이터 구성

p=0.4

papers =pd.concat([pd.concat([pd.DataFrame(np.random.choice(2, 1500, p=[p,1-p]).reshape(500,3),columns=['X1','X2','X3']),
           pd.DataFrame(np.random.choice(2, 1500, p=[1-p,p]).reshape(500,3),columns=['X4','X5','X6']),
                             pd.DataFrame(np.array([['Deep learning']*500]).reshape(500,1),columns=['subject'])],axis=1),
            pd.concat([ pd.DataFrame(np.random.choice(2, 1500, p=[1-p,p]).reshape(500,3),columns=['X1','X2','X3']),
              pd.DataFrame(np.random.choice(2, 1500, p=[p,1-p]).reshape(500,3),columns=['X4','X5','X6']),
                      pd.DataFrame(np.array([['Reinforcement learning']*500]).reshape(500,1),columns=['subject'])],axis=1)],axis=0,ignore_index=True).reset_index().rename(columns={'index':'paper_id'})
papers['paper_id'] = papers['paper_id']+1

citations =  pd.concat([pd.DataFrame(np.array([np.random.choice(range(1,501),size=(1000,1)),np.random.choice(range(1,501),size=(1000,1))]).reshape(1000,2)),
                          pd.DataFrame(np.array([np.random.choice(range(501,1001),size=(1000,1)),np.random.choice(range(501,1001),size=(1000,1))]).reshape(1000,2)),
                       pd.DataFrame(np.array([np.random.choice(range(1,501),size=(2000,1)),np.random.choice(range(501,1001),size=(2000,1))]).reshape(2000,2))],ignore_index=True).rename(columns={0:'target',1:'source'})

그래프 표현

class_values = sorted(papers["subject"].unique())
class_idx = {name: id for id, name in enumerate(class_values)}
paper_idx = {name: idx for idx, name in enumerate(sorted(papers["paper_id"].unique()))}

papers["paper_id"] = papers["paper_id"].apply(lambda name: paper_idx[name])
citations["source"] = citations["source"].apply(lambda name: paper_idx[name])
citations["target"] = citations["target"].apply(lambda name: paper_idx[name])
papers["subject"] = papers["subject"].apply(lambda value: class_idx[value])

plt.figure(figsize=(10, 10))
colors = papers["subject"].tolist()
cora_graph = nx.from_pandas_edgelist(citations.sample(n=800))
subjects = list(papers[papers["paper_id"].isin(list(cora_graph.nodes))]["subject"])
nx.draw_spring(cora_graph, node_size=15, node_color=subjects)

Test vs Train

train_data, test_data = [], []

for _, group_data in papers.groupby("subject"):
    # Select around 50% of the dataset for training.
    random_selection = np.random.rand(len(group_data.index)) <= 0.5
    train_data.append(group_data[random_selection])
    test_data.append(group_data[~random_selection])

train_data = pd.concat(train_data).sample(frac=1)
test_data = pd.concat(test_data).sample(frac=1)

print("Train data shape:", train_data.shape)
print("Test data shape:", test_data.shape)

Train data shape: (530, 8)
Test data shape: (470, 8)

hidden_units = [32,32]
learning_rate = 0.01
dropout_rate = 0.5
num_epochs = 300
batch_size = 256

feature_names = set(papers.columns) - {"paper_id", "subject"}
num_features = len(feature_names)
num_classes = len(class_idx)

# Create train and test features as a numpy array.
x_train = train_data[feature_names].to_numpy()
x_test = test_data[feature_names].to_numpy()
# Create train and test targets as a numpy array.
y_train = train_data["subject"]
y_test = test_data["subject"]

history = run_experiment(baseline_model, x_train, y_train)

display_learning_curves(history)

_, test_accuracy = baseline_model.evaluate(x=x_test, y=y_test, verbose=0)
print(f"Test accuracy: {round(test_accuracy * 100, 2)}%")

Test accuracy: 66.6%

p=0.4 Test accuracy: 1) 65.56% 2) 72.73% 3) 69.67% 4) 67.7% 5) 66.6%

baseline 모델 예측

new_instances = generate_random_instances(num_classes)
logits = baseline_model.predict(new_instances)
probabilities = keras.activations.softmax(tf.convert_to_tensor(logits)).numpy()
display_class_probabilities(probabilities)

Instance 1:
- Deep learning: 48.7%
- Reinforcement learning: 51.3%
Instance 2:
- Deep learning: 52.25%
- Reinforcement learning: 47.75%

# Create an edges array (sparse adjacency matrix) of shape [2, num_edges].
edges = citations[["source", "target"]].to_numpy().T
# Create an edge weights array of ones.
edge_weights = tf.ones(shape=edges.shape[1])
# Create a node features array of shape [num_nodes, num_features].
node_features = tf.cast(
    papers.sort_values("paper_id")[feature_names].to_numpy(), dtype=tf.dtypes.float32
)
# Create graph info tuple with node_features, edges, and edge_weights.
graph_info = (node_features, edges, edge_weights)

print("Edges shape:", edges.shape)
print("Edge weight shape:", edge_weights.shape)
print("Nodes shape:", node_features.shape)

Edges shape: (2, 4000)
Edge weight shape: (4000,)
Nodes shape: (1000, 6)

gnn_model = GNNNodeClassifier(
    graph_info=graph_info,
    num_classes=num_classes,
    hidden_units=hidden_units,
    dropout_rate=dropout_rate,
    name="gnn_model",
)

print("GNN output shape:", gnn_model([1, 10, 100]))

gnn_model.summary()

GNN output shape: tf.Tensor(
[[-0.06100957  0.00051374]
 [-0.00893221 -0.1086567 ]
 [-0.1062933  -0.13349164]], shape=(3, 2), dtype=float32)
Model: "gnn_model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
=================================================================
 preprocess (Sequential)     (1000, 32)                1432      
                                                                 
 graph_conv1 (GraphConvLayer  multiple                 5888      
 )                                                               
                                                                 
 graph_conv2 (GraphConvLayer  multiple                 5888      
 )                                                               
                                                                 
 postprocess (Sequential)    (1000, 32)                2368      
                                                                 
 logits (Dense)              multiple                  66        
                                                                 
=================================================================
Total params: 15,642
Trainable params: 14,798
Non-trainable params: 844
_________________________________________________________________

Train the GNN model

x_train = train_data.paper_id.to_numpy()
history = run_experiment(gnn_model, x_train, y_train)

display_learning_curves(history)

x_test = test_data.paper_id.to_numpy()
_, test_accuracy = gnn_model.evaluate(x=x_test, y=y_test, verbose=0)
print(f"Test accuracy: {round(test_accuracy * 100, 2)}%")

Test accuracy: 88.51%

p=0.4, 0.5 vs 0.5 Test accuracy: 1) 85.91% 2) 71.02% 3) 82.0% 4) 91.36% 5) 88.51%

# First we add the N new_instances as nodes to the graph
# by appending the new_instance to node_features.
num_nodes = node_features.shape[0]
new_node_features = np.concatenate([node_features, new_instances])
# Second we add the M edges (citations) from each new node to a set
# of existing nodes in a particular subject
new_node_indices = [i + num_nodes for i in range(num_classes)]
new_citations = []
for subject_idx, group in papers.groupby("subject"):
    subject_papers = list(group.paper_id)
    # Select random x papers specific subject.
    selected_paper_indices1 = np.random.choice(subject_papers, 5)
    # Select random y papers from any subject (where y < x).
    selected_paper_indices2 = np.random.choice(list(papers.paper_id), 2)
    # Merge the selected paper indices.
    selected_paper_indices = np.concatenate(
        [selected_paper_indices1, selected_paper_indices2], axis=0
    )
    # Create edges between a citing paper idx and the selected cited papers.
    citing_paper_indx = new_node_indices[subject_idx]
    for cited_paper_idx in selected_paper_indices:
        new_citations.append([citing_paper_indx, cited_paper_idx])

new_citations = np.array(new_citations).T
new_edges = np.concatenate([edges, new_citations], axis=1)

print("Original node_features shape:", gnn_model.node_features.shape)
print("Original edges shape:", gnn_model.edges.shape)
gnn_model.node_features = new_node_features
gnn_model.edges = new_edges
gnn_model.edge_weights = tf.ones(shape=new_edges.shape[1])
print("New node_features shape:", gnn_model.node_features.shape)
print("New edges shape:", gnn_model.edges.shape)

logits = gnn_model.predict(tf.convert_to_tensor(new_node_indices))
probabilities = keras.activations.softmax(tf.convert_to_tensor(logits)).numpy()
display_class_probabilities(probabilities)

Original node_features shape: (1000, 6)
Original edges shape: (2, 4000)
New node_features shape: (1002, 6)
New edges shape: (2, 4014)
Instance 1:
- Deep learning: 43.02%
- Reinforcement learning: 56.98%
Instance 2:
- Deep learning: 12.97%
- Reinforcement learning: 87.03%

p = 0.4, 0.1 vs 0.9

citations =  pd.concat([pd.DataFrame(np.array([np.random.choice(range(1,501),size=(1800,1)),np.random.choice(range(1,501),size=(1800,1))]).reshape(1800,2)),
                          pd.DataFrame(np.array([np.random.choice(range(501,1001),size=(1800,1)),np.random.choice(range(501,1001),size=(1800,1))]).reshape(1800,2)),
                       pd.DataFrame(np.array([np.random.choice(range(1,501),size=(400,1)),np.random.choice(range(501,1001),size=(400,1))]).reshape(400,2))],ignore_index=True).rename(columns={0:'target',1:'source'})

citations["source"] = citations["source"].apply(lambda name: paper_idx[name])
citations["target"] = citations["target"].apply(lambda name: paper_idx[name])

edges = citations[["source", "target"]].to_numpy().T
edge_weights = tf.ones(shape=edges.shape[1])
node_features = tf.cast(
    papers.sort_values("paper_id")[feature_names].to_numpy(), dtype=tf.dtypes.float32
)
graph_info = (node_features, edges, edge_weights)

Train the GNN model

gnn_model = GNNNodeClassifier(
    graph_info=graph_info,
    num_classes=num_classes,
    hidden_units=hidden_units,
    dropout_rate=dropout_rate,
    name="gnn_model",
)

history = run_experiment(gnn_model, x_train, y_train)

display_learning_curves(history)

x_test = test_data.paper_id.to_numpy()
_, test_accuracy = gnn_model.evaluate(x=x_test, y=y_test, verbose=0)
print(f"Test accuracy: {round(test_accuracy * 100, 2)}%")

Test accuracy: 78.51%

p = 0.4, 0.1 vs 0.9 Test accuracy: 1) 87.08% 2) 92.23% 3) 87.28% 4) 90.53% 5) 78.51%

num_nodes = node_features.shape[0]
new_node_features = np.concatenate([node_features, new_instances])
new_node_indices = [i + num_nodes for i in range(num_classes)]
new_citations = []
for subject_idx, group in papers.groupby("subject"):
    subject_papers = list(group.paper_id)
    # Select random x papers specific subject.
    selected_paper_indices1 = np.random.choice(subject_papers, 5)
    # Select random y papers from any subject (where y < x).
    selected_paper_indices2 = np.random.choice(list(papers.paper_id), 2)
    # Merge the selected paper indices.
    selected_paper_indices = np.concatenate(
        [selected_paper_indices1, selected_paper_indices2], axis=0
    )
    # Create edges between a citing paper idx and the selected cited papers.
    citing_paper_indx = new_node_indices[subject_idx]
    for cited_paper_idx in selected_paper_indices:
        new_citations.append([citing_paper_indx, cited_paper_idx])

new_citations = np.array(new_citations).T
new_edges = np.concatenate([edges, new_citations], axis=1)

print("Original node_features shape:", gnn_model.node_features.shape)
print("Original edges shape:", gnn_model.edges.shape)
gnn_model.node_features = new_node_features
gnn_model.edges = new_edges
gnn_model.edge_weights = tf.ones(shape=new_edges.shape[1])
print("New node_features shape:", gnn_model.node_features.shape)
print("New edges shape:", gnn_model.edges.shape)

logits = gnn_model.predict(tf.convert_to_tensor(new_node_indices))
probabilities = keras.activations.softmax(tf.convert_to_tensor(logits)).numpy()
display_class_probabilities(probabilities)

Original node_features shape: (1000, 6)
Original edges shape: (2, 4000)
New node_features shape: (1002, 6)
New edges shape: (2, 4014)
Instance 1:
- Deep learning: 17.3%
- Reinforcement learning: 82.7%
Instance 2:
- Deep learning: 9.68%
- Reinforcement learning: 90.32%

p = 0.4, 0.2 vs 0.8

citations =  pd.concat([pd.DataFrame(np.array([np.random.choice(range(1,501),size=(1600,1)),np.random.choice(range(1,501),size=(1600,1))]).reshape(1600,2)),
                          pd.DataFrame(np.array([np.random.choice(range(501,1001),size=(1600,1)),np.random.choice(range(501,1001),size=(1600,1))]).reshape(1600,2)),
                       pd.DataFrame(np.array([np.random.choice(range(1,501),size=(800,1)),np.random.choice(range(501,1001),size=(800,1))]).reshape(800,2))],ignore_index=True).rename(columns={0:'target',1:'source'})

citations["source"] = citations["source"].apply(lambda name: paper_idx[name])
citations["target"] = citations["target"].apply(lambda name: paper_idx[name])

edges = citations[["source", "target"]].to_numpy().T
edge_weights = tf.ones(shape=edges.shape[1])
node_features = tf.cast(
    papers.sort_values("paper_id")[feature_names].to_numpy(), dtype=tf.dtypes.float32
)
graph_info = (node_features, edges, edge_weights)

Train the GNN model

gnn_model = GNNNodeClassifier(
    graph_info=graph_info,
    num_classes=num_classes,
    hidden_units=hidden_units,
    dropout_rate=dropout_rate,
    name="gnn_model",
)

history = run_experiment(gnn_model, x_train, y_train)

display_learning_curves(history)

x_test = test_data.paper_id.to_numpy()
_, test_accuracy = gnn_model.evaluate(x=x_test, y=y_test, verbose=0)
print(f"Test accuracy: {round(test_accuracy * 100, 2)}%")

Test accuracy: 67.02%

p = 0.4, 0.2 vs 0.8 Test accuracy: 1) 76.13% 2) 92.99% 3) 61.06% 4) 62.55% 5) 67.02%

num_nodes = node_features.shape[0]
new_node_features = np.concatenate([node_features, new_instances])
new_node_indices = [i + num_nodes for i in range(num_classes)]
new_citations = []
for subject_idx, group in papers.groupby("subject"):
    subject_papers = list(group.paper_id)
    # Select random x papers specific subject.
    selected_paper_indices1 = np.random.choice(subject_papers, 5)
    # Select random y papers from any subject (where y < x).
    selected_paper_indices2 = np.random.choice(list(papers.paper_id), 2)
    # Merge the selected paper indices.
    selected_paper_indices = np.concatenate(
        [selected_paper_indices1, selected_paper_indices2], axis=0
    )
    # Create edges between a citing paper idx and the selected cited papers.
    citing_paper_indx = new_node_indices[subject_idx]
    for cited_paper_idx in selected_paper_indices:
        new_citations.append([citing_paper_indx, cited_paper_idx])

new_citations = np.array(new_citations).T
new_edges = np.concatenate([edges, new_citations], axis=1)

print("Original node_features shape:", gnn_model.node_features.shape)
print("Original edges shape:", gnn_model.edges.shape)
gnn_model.node_features = new_node_features
gnn_model.edges = new_edges
gnn_model.edge_weights = tf.ones(shape=new_edges.shape[1])
print("New node_features shape:", gnn_model.node_features.shape)
print("New edges shape:", gnn_model.edges.shape)

logits = gnn_model.predict(tf.convert_to_tensor(new_node_indices))
probabilities = keras.activations.softmax(tf.convert_to_tensor(logits)).numpy()
display_class_probabilities(probabilities)

Original node_features shape: (1000, 6)
Original edges shape: (2, 4000)
New node_features shape: (1002, 6)
New edges shape: (2, 4014)
Instance 1:
- Deep learning: 48.39%
- Reinforcement learning: 51.61%
Instance 2:
- Deep learning: 41.17%
- Reinforcement learning: 58.83%

p = 0.4, 0.3 vs 0.7

citations =  pd.concat([pd.DataFrame(np.array([np.random.choice(range(1,501),size=(1400,1)),np.random.choice(range(1,501),size=(1400,1))]).reshape(1400,2)),
                          pd.DataFrame(np.array([np.random.choice(range(501,1001),size=(1400,1)),np.random.choice(range(501,1001),size=(1400,1))]).reshape(1400,2)),
                       pd.DataFrame(np.array([np.random.choice(range(1,501),size=(1200,1)),np.random.choice(range(501,1001),size=(1200,1))]).reshape(1200,2))],ignore_index=True).rename(columns={0:'target',1:'source'})

citations["source"] = citations["source"].apply(lambda name: paper_idx[name])
citations["target"] = citations["target"].apply(lambda name: paper_idx[name])

edges = citations[["source", "target"]].to_numpy().T
edge_weights = tf.ones(shape=edges.shape[1])
node_features = tf.cast(
    papers.sort_values("paper_id")[feature_names].to_numpy(), dtype=tf.dtypes.float32
)
graph_info = (node_features, edges, edge_weights)

Train the GNN model

gnn_model = GNNNodeClassifier(
    graph_info=graph_info,
    num_classes=num_classes,
    hidden_units=hidden_units,
    dropout_rate=dropout_rate,
    name="gnn_model",
)

history = run_experiment(gnn_model, x_train, y_train)

display_learning_curves(history)

x_test = test_data.paper_id.to_numpy()
_, test_accuracy = gnn_model.evaluate(x=x_test, y=y_test, verbose=0)
print(f"Test accuracy: {round(test_accuracy * 100, 2)}%")

Test accuracy: 80.21%

p = 0.4, 0.3 vs 0.7 Test accuracy: 1) 87.87% 2) 68.37% 3) 66.93% 4) 58.02% 5) 80.21%

num_nodes = node_features.shape[0]
new_node_features = np.concatenate([node_features, new_instances])
new_node_indices = [i + num_nodes for i in range(num_classes)]
new_citations = []
for subject_idx, group in papers.groupby("subject"):
    subject_papers = list(group.paper_id)
    # Select random x papers specific subject.
    selected_paper_indices1 = np.random.choice(subject_papers, 5)
    # Select random y papers from any subject (where y < x).
    selected_paper_indices2 = np.random.choice(list(papers.paper_id), 2)
    # Merge the selected paper indices.
    selected_paper_indices = np.concatenate(
        [selected_paper_indices1, selected_paper_indices2], axis=0
    )
    # Create edges between a citing paper idx and the selected cited papers.
    citing_paper_indx = new_node_indices[subject_idx]
    for cited_paper_idx in selected_paper_indices:
        new_citations.append([citing_paper_indx, cited_paper_idx])

new_citations = np.array(new_citations).T
new_edges = np.concatenate([edges, new_citations], axis=1)

print("Original node_features shape:", gnn_model.node_features.shape)
print("Original edges shape:", gnn_model.edges.shape)
gnn_model.node_features = new_node_features
gnn_model.edges = new_edges
gnn_model.edge_weights = tf.ones(shape=new_edges.shape[1])
print("New node_features shape:", gnn_model.node_features.shape)
print("New edges shape:", gnn_model.edges.shape)

logits = gnn_model.predict(tf.convert_to_tensor(new_node_indices))
probabilities = keras.activations.softmax(tf.convert_to_tensor(logits)).numpy()
display_class_probabilities(probabilities)

Original node_features shape: (1000, 6)
Original edges shape: (2, 4000)
New node_features shape: (1002, 6)
New edges shape: (2, 4014)
Instance 1:
- Deep learning: 85.54%
- Reinforcement learning: 14.46%
Instance 2:
- Deep learning: 87.09%
- Reinforcement learning: 12.91%

p = 0.4, 0.4 vs 0.6

citations =  pd.concat([pd.DataFrame(np.array([np.random.choice(range(1,501),size=(1200,1)),np.random.choice(range(1,501),size=(1200,1))]).reshape(1200,2)),
                          pd.DataFrame(np.array([np.random.choice(range(501,1001),size=(1200,1)),np.random.choice(range(501,1001),size=(1200,1))]).reshape(1200,2)),
                       pd.DataFrame(np.array([np.random.choice(range(1,501),size=(1600,1)),np.random.choice(range(501,1001),size=(1600,1))]).reshape(1600,2))],ignore_index=True).rename(columns={0:'target',1:'source'})

citations["source"] = citations["source"].apply(lambda name: paper_idx[name])
citations["target"] = citations["target"].apply(lambda name: paper_idx[name])

edges = citations[["source", "target"]].to_numpy().T
edge_weights = tf.ones(shape=edges.shape[1])
node_features = tf.cast(
    papers.sort_values("paper_id")[feature_names].to_numpy(), dtype=tf.dtypes.float32
)
graph_info = (node_features, edges, edge_weights)

Train the GNN model

gnn_model = GNNNodeClassifier(
    graph_info=graph_info,
    num_classes=num_classes,
    hidden_units=hidden_units,
    dropout_rate=dropout_rate,
    name="gnn_model",
)

history = run_experiment(gnn_model, x_train, y_train)

display_learning_curves(history)

x_test = test_data.paper_id.to_numpy()
_, test_accuracy = gnn_model.evaluate(x=x_test, y=y_test, verbose=0)
print(f"Test accuracy: {round(test_accuracy * 100, 2)}%")

Test accuracy: 83.19%

p = 0.4, 0.4 vs 0.6 Test accuracy: 1) 86.5% 2) 88.45% 3) 85.71% 4) 60.08% 5) 83.19%

num_nodes = node_features.shape[0]
new_node_features = np.concatenate([node_features, new_instances])
new_node_indices = [i + num_nodes for i in range(num_classes)]
new_citations = []
for subject_idx, group in papers.groupby("subject"):
    subject_papers = list(group.paper_id)
    # Select random x papers specific subject.
    selected_paper_indices1 = np.random.choice(subject_papers, 5)
    # Select random y papers from any subject (where y < x).
    selected_paper_indices2 = np.random.choice(list(papers.paper_id), 2)
    # Merge the selected paper indices.
    selected_paper_indices = np.concatenate(
        [selected_paper_indices1, selected_paper_indices2], axis=0
    )
    # Create edges between a citing paper idx and the selected cited papers.
    citing_paper_indx = new_node_indices[subject_idx]
    for cited_paper_idx in selected_paper_indices:
        new_citations.append([citing_paper_indx, cited_paper_idx])

new_citations = np.array(new_citations).T
new_edges = np.concatenate([edges, new_citations], axis=1)

print("Original node_features shape:", gnn_model.node_features.shape)
print("Original edges shape:", gnn_model.edges.shape)
gnn_model.node_features = new_node_features
gnn_model.edges = new_edges
gnn_model.edge_weights = tf.ones(shape=new_edges.shape[1])
print("New node_features shape:", gnn_model.node_features.shape)
print("New edges shape:", gnn_model.edges.shape)

logits = gnn_model.predict(tf.convert_to_tensor(new_node_indices))
probabilities = keras.activations.softmax(tf.convert_to_tensor(logits)).numpy()
display_class_probabilities(probabilities)

Original node_features shape: (1000, 6)
Original edges shape: (2, 4000)
New node_features shape: (1002, 6)
New edges shape: (2, 4014)
Instance 1:
- Deep learning: 97.81%
- Reinforcement learning: 2.19%
Instance 2:
- Deep learning: 1.24%
- Reinforcement learning: 98.76%

p=0.5

데이터 구성

p=0.5

papers =pd.concat([pd.concat([pd.DataFrame(np.random.choice(2, 1500, p=[p,1-p]).reshape(500,3),columns=['X1','X2','X3']),
           pd.DataFrame(np.random.choice(2, 1500, p=[1-p,p]).reshape(500,3),columns=['X4','X5','X6']),
                             pd.DataFrame(np.array([['Deep learning']*500]).reshape(500,1),columns=['subject'])],axis=1),
            pd.concat([ pd.DataFrame(np.random.choice(2, 1500, p=[1-p,p]).reshape(500,3),columns=['X1','X2','X3']),
              pd.DataFrame(np.random.choice(2, 1500, p=[p,1-p]).reshape(500,3),columns=['X4','X5','X6']),
                      pd.DataFrame(np.array([['Reinforcement learning']*500]).reshape(500,1),columns=['subject'])],axis=1)],axis=0,ignore_index=True).reset_index().rename(columns={'index':'paper_id'})
papers['paper_id'] = papers['paper_id']+1

citations =  pd.concat([pd.DataFrame(np.array([np.random.choice(range(1,501),size=(1000,1)),np.random.choice(range(1,501),size=(1000,1))]).reshape(1000,2)),
                          pd.DataFrame(np.array([np.random.choice(range(501,1001),size=(1000,1)),np.random.choice(range(501,1001),size=(1000,1))]).reshape(1000,2)),
                       pd.DataFrame(np.array([np.random.choice(range(1,501),size=(2000,1)),np.random.choice(range(501,1001),size=(2000,1))]).reshape(2000,2))],ignore_index=True).rename(columns={0:'target',1:'source'})

그래프 표현

class_values = sorted(papers["subject"].unique())
class_idx = {name: id for id, name in enumerate(class_values)}
paper_idx = {name: idx for idx, name in enumerate(sorted(papers["paper_id"].unique()))}

papers["paper_id"] = papers["paper_id"].apply(lambda name: paper_idx[name])
citations["source"] = citations["source"].apply(lambda name: paper_idx[name])
citations["target"] = citations["target"].apply(lambda name: paper_idx[name])
papers["subject"] = papers["subject"].apply(lambda value: class_idx[value])

plt.figure(figsize=(10, 10))
colors = papers["subject"].tolist()
cora_graph = nx.from_pandas_edgelist(citations.sample(n=800))
subjects = list(papers[papers["paper_id"].isin(list(cora_graph.nodes))]["subject"])
nx.draw_spring(cora_graph, node_size=15, node_color=subjects)

Test vs Train

train_data, test_data = [], []

for _, group_data in papers.groupby("subject"):
    # Select around 50% of the dataset for training.
    random_selection = np.random.rand(len(group_data.index)) <= 0.5
    train_data.append(group_data[random_selection])
    test_data.append(group_data[~random_selection])

train_data = pd.concat(train_data).sample(frac=1)
test_data = pd.concat(test_data).sample(frac=1)

print("Train data shape:", train_data.shape)
print("Test data shape:", test_data.shape)

Train data shape: (513, 8)
Test data shape: (487, 8)

hidden_units = [32,32]
learning_rate = 0.01
dropout_rate = 0.5
num_epochs = 300
batch_size = 256

feature_names = set(papers.columns) - {"paper_id", "subject"}
num_features = len(feature_names)
num_classes = len(class_idx)

# Create train and test features as a numpy array.
x_train = train_data[feature_names].to_numpy()
x_test = test_data[feature_names].to_numpy()
# Create train and test targets as a numpy array.
y_train = train_data["subject"]
y_test = test_data["subject"]

history = run_experiment(baseline_model, x_train, y_train)

display_learning_curves(history)

_, test_accuracy = baseline_model.evaluate(x=x_test, y=y_test, verbose=0)
print(f"Test accuracy: {round(test_accuracy * 100, 2)}%")

Test accuracy: 51.13%

p=0.5 Test accuracy: 1) 52.76% 2) 49.9% 3) 47.43% 4) 52.92% 5) 51.13%

baseline 모델 예측

new_instances = generate_random_instances(num_classes)
logits = baseline_model.predict(new_instances)
probabilities = keras.activations.softmax(tf.convert_to_tensor(logits)).numpy()
display_class_probabilities(probabilities)

Instance 1:
- Deep learning: 52.78%
- Reinforcement learning: 47.22%
Instance 2:
- Deep learning: 50.34%
- Reinforcement learning: 49.66%

# Create an edges array (sparse adjacency matrix) of shape [2, num_edges].
edges = citations[["source", "target"]].to_numpy().T
# Create an edge weights array of ones.
edge_weights = tf.ones(shape=edges.shape[1])
# Create a node features array of shape [num_nodes, num_features].
node_features = tf.cast(
    papers.sort_values("paper_id")[feature_names].to_numpy(), dtype=tf.dtypes.float32
)
# Create graph info tuple with node_features, edges, and edge_weights.
graph_info = (node_features, edges, edge_weights)

print("Edges shape:", edges.shape)
print("Edge weight shape:", edge_weights.shape)
print("Nodes shape:", node_features.shape)

Edges shape: (2, 4000)
Edge weight shape: (4000,)
Nodes shape: (1000, 6)

gnn_model = GNNNodeClassifier(
    graph_info=graph_info,
    num_classes=num_classes,
    hidden_units=hidden_units,
    dropout_rate=dropout_rate,
    name="gnn_model",
)

print("GNN output shape:", gnn_model([1, 10, 100]))

gnn_model.summary()

GNN output shape: tf.Tensor(
[[0.00731459 0.09327544]
 [0.03734017 0.0862596 ]
 [0.04290678 0.16998649]], shape=(3, 2), dtype=float32)
Model: "gnn_model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
=================================================================
 preprocess (Sequential)     (1000, 32)                1432      
                                                                 
 graph_conv1 (GraphConvLayer  multiple                 5888      
 )                                                               
                                                                 
 graph_conv2 (GraphConvLayer  multiple                 5888      
 )                                                               
                                                                 
 postprocess (Sequential)    (1000, 32)                2368      
                                                                 
 logits (Dense)              multiple                  66        
                                                                 
=================================================================
Total params: 15,642
Trainable params: 14,798
Non-trainable params: 844
_________________________________________________________________

Train the GNN model

x_train = train_data.paper_id.to_numpy()
history = run_experiment(gnn_model, x_train, y_train)

display_learning_curves(history)

x_test = test_data.paper_id.to_numpy()
_, test_accuracy = gnn_model.evaluate(x=x_test, y=y_test, verbose=0)
print(f"Test accuracy: {round(test_accuracy * 100, 2)}%")

Test accuracy: 51.33%

p=0.5, 0.5 vs 0.5 Test accuracy: 1) 51.38% 2) 48.53% 3) 46.84% 4) 48.83% 5) 51.33%

# First we add the N new_instances as nodes to the graph
# by appending the new_instance to node_features.
num_nodes = node_features.shape[0]
new_node_features = np.concatenate([node_features, new_instances])
# Second we add the M edges (citations) from each new node to a set
# of existing nodes in a particular subject
new_node_indices = [i + num_nodes for i in range(num_classes)]
new_citations = []
for subject_idx, group in papers.groupby("subject"):
    subject_papers = list(group.paper_id)
    # Select random x papers specific subject.
    selected_paper_indices1 = np.random.choice(subject_papers, 5)
    # Select random y papers from any subject (where y < x).
    selected_paper_indices2 = np.random.choice(list(papers.paper_id), 2)
    # Merge the selected paper indices.
    selected_paper_indices = np.concatenate(
        [selected_paper_indices1, selected_paper_indices2], axis=0
    )
    # Create edges between a citing paper idx and the selected cited papers.
    citing_paper_indx = new_node_indices[subject_idx]
    for cited_paper_idx in selected_paper_indices:
        new_citations.append([citing_paper_indx, cited_paper_idx])

new_citations = np.array(new_citations).T
new_edges = np.concatenate([edges, new_citations], axis=1)

print("Original node_features shape:", gnn_model.node_features.shape)
print("Original edges shape:", gnn_model.edges.shape)
gnn_model.node_features = new_node_features
gnn_model.edges = new_edges
gnn_model.edge_weights = tf.ones(shape=new_edges.shape[1])
print("New node_features shape:", gnn_model.node_features.shape)
print("New edges shape:", gnn_model.edges.shape)

logits = gnn_model.predict(tf.convert_to_tensor(new_node_indices))
probabilities = keras.activations.softmax(tf.convert_to_tensor(logits)).numpy()
display_class_probabilities(probabilities)

Original node_features shape: (1000, 6)
Original edges shape: (2, 4000)
New node_features shape: (1002, 6)
New edges shape: (2, 4014)
Instance 1:
- Deep learning: 53.7%
- Reinforcement learning: 46.3%
Instance 2:
- Deep learning: 54.63%
- Reinforcement learning: 45.37%

p = 0.5, 0.1 vs 0.9

citations =  pd.concat([pd.DataFrame(np.array([np.random.choice(range(1,501),size=(1800,1)),np.random.choice(range(1,501),size=(1800,1))]).reshape(1800,2)),
                          pd.DataFrame(np.array([np.random.choice(range(501,1001),size=(1800,1)),np.random.choice(range(501,1001),size=(1800,1))]).reshape(1800,2)),
                       pd.DataFrame(np.array([np.random.choice(range(1,501),size=(400,1)),np.random.choice(range(501,1001),size=(400,1))]).reshape(400,2))],ignore_index=True).rename(columns={0:'target',1:'source'})

citations["source"] = citations["source"].apply(lambda name: paper_idx[name])
citations["target"] = citations["target"].apply(lambda name: paper_idx[name])

edges = citations[["source", "target"]].to_numpy().T
edge_weights = tf.ones(shape=edges.shape[1])
node_features = tf.cast(
    papers.sort_values("paper_id")[feature_names].to_numpy(), dtype=tf.dtypes.float32
)
graph_info = (node_features, edges, edge_weights)

Train the GNN model

gnn_model = GNNNodeClassifier(
    graph_info=graph_info,
    num_classes=num_classes,
    hidden_units=hidden_units,
    dropout_rate=dropout_rate,
    name="gnn_model",
)

history = run_experiment(gnn_model, x_train, y_train)

display_learning_curves(history)

x_test = test_data.paper_id.to_numpy()
_, test_accuracy = gnn_model.evaluate(x=x_test, y=y_test, verbose=0)
print(f"Test accuracy: {round(test_accuracy * 100, 2)}%")

Test accuracy: 49.28%

p = 0.5, 0.1 vs 0.9 Test accuracy: 1) 43.5% 2) 46.97% 3) 46.84% 4) 49.81% 5) 49.28%

num_nodes = node_features.shape[0]
new_node_features = np.concatenate([node_features, new_instances])
new_node_indices = [i + num_nodes for i in range(num_classes)]
new_citations = []
for subject_idx, group in papers.groupby("subject"):
    subject_papers = list(group.paper_id)
    # Select random x papers specific subject.
    selected_paper_indices1 = np.random.choice(subject_papers, 5)
    # Select random y papers from any subject (where y < x).
    selected_paper_indices2 = np.random.choice(list(papers.paper_id), 2)
    # Merge the selected paper indices.
    selected_paper_indices = np.concatenate(
        [selected_paper_indices1, selected_paper_indices2], axis=0
    )
    # Create edges between a citing paper idx and the selected cited papers.
    citing_paper_indx = new_node_indices[subject_idx]
    for cited_paper_idx in selected_paper_indices:
        new_citations.append([citing_paper_indx, cited_paper_idx])

new_citations = np.array(new_citations).T
new_edges = np.concatenate([edges, new_citations], axis=1)

print("Original node_features shape:", gnn_model.node_features.shape)
print("Original edges shape:", gnn_model.edges.shape)
gnn_model.node_features = new_node_features
gnn_model.edges = new_edges
gnn_model.edge_weights = tf.ones(shape=new_edges.shape[1])
print("New node_features shape:", gnn_model.node_features.shape)
print("New edges shape:", gnn_model.edges.shape)

logits = gnn_model.predict(tf.convert_to_tensor(new_node_indices))
probabilities = keras.activations.softmax(tf.convert_to_tensor(logits)).numpy()
display_class_probabilities(probabilities)

Original node_features shape: (1000, 6)
Original edges shape: (2, 4000)
New node_features shape: (1002, 6)
New edges shape: (2, 4014)
Instance 1:
- Deep learning: 50.71%
- Reinforcement learning: 49.29%
Instance 2:
- Deep learning: 50.95%
- Reinforcement learning: 49.05%

p = 0.5, 0.2 vs 0.8

citations =  pd.concat([pd.DataFrame(np.array([np.random.choice(range(1,501),size=(1600,1)),np.random.choice(range(1,501),size=(1600,1))]).reshape(1600,2)),
                          pd.DataFrame(np.array([np.random.choice(range(501,1001),size=(1600,1)),np.random.choice(range(501,1001),size=(1600,1))]).reshape(1600,2)),
                       pd.DataFrame(np.array([np.random.choice(range(1,501),size=(800,1)),np.random.choice(range(501,1001),size=(800,1))]).reshape(800,2))],ignore_index=True).rename(columns={0:'target',1:'source'})

citations["source"] = citations["source"].apply(lambda name: paper_idx[name])
citations["target"] = citations["target"].apply(lambda name: paper_idx[name])

edges = citations[["source", "target"]].to_numpy().T
edge_weights = tf.ones(shape=edges.shape[1])
node_features = tf.cast(
    papers.sort_values("paper_id")[feature_names].to_numpy(), dtype=tf.dtypes.float32
)
graph_info = (node_features, edges, edge_weights)

Train the GNN model

gnn_model = GNNNodeClassifier(
    graph_info=graph_info,
    num_classes=num_classes,
    hidden_units=hidden_units,
    dropout_rate=dropout_rate,
    name="gnn_model",
)

history = run_experiment(gnn_model, x_train, y_train)

display_learning_curves(history)

x_test = test_data.paper_id.to_numpy()
_, test_accuracy = gnn_model.evaluate(x=x_test, y=y_test, verbose=0)
print(f"Test accuracy: {round(test_accuracy * 100, 2)}%")

Test accuracy: 48.67%

p = 0.5, 0.2 vs 0.8 Test accuracy: 1) 49.8% 2) 49.9% 3) 46.84% 4) 51.17% 5) 48.67%

num_nodes = node_features.shape[0]
new_node_features = np.concatenate([node_features, new_instances])
new_node_indices = [i + num_nodes for i in range(num_classes)]
new_citations = []
for subject_idx, group in papers.groupby("subject"):
    subject_papers = list(group.paper_id)
    # Select random x papers specific subject.
    selected_paper_indices1 = np.random.choice(subject_papers, 5)
    # Select random y papers from any subject (where y < x).
    selected_paper_indices2 = np.random.choice(list(papers.paper_id), 2)
    # Merge the selected paper indices.
    selected_paper_indices = np.concatenate(
        [selected_paper_indices1, selected_paper_indices2], axis=0
    )
    # Create edges between a citing paper idx and the selected cited papers.
    citing_paper_indx = new_node_indices[subject_idx]
    for cited_paper_idx in selected_paper_indices:
        new_citations.append([citing_paper_indx, cited_paper_idx])

new_citations = np.array(new_citations).T
new_edges = np.concatenate([edges, new_citations], axis=1)

print("Original node_features shape:", gnn_model.node_features.shape)
print("Original edges shape:", gnn_model.edges.shape)
gnn_model.node_features = new_node_features
gnn_model.edges = new_edges
gnn_model.edge_weights = tf.ones(shape=new_edges.shape[1])
print("New node_features shape:", gnn_model.node_features.shape)
print("New edges shape:", gnn_model.edges.shape)

logits = gnn_model.predict(tf.convert_to_tensor(new_node_indices))
probabilities = keras.activations.softmax(tf.convert_to_tensor(logits)).numpy()
display_class_probabilities(probabilities)

Original node_features shape: (1000, 6)
Original edges shape: (2, 4000)
New node_features shape: (1002, 6)
New edges shape: (2, 4014)
Instance 1:
- Deep learning: 49.08%
- Reinforcement learning: 50.92%
Instance 2:
- Deep learning: 49.33%
- Reinforcement learning: 50.67%

p = 0.5, 0.3 vs 0.7

citations =  pd.concat([pd.DataFrame(np.array([np.random.choice(range(1,501),size=(1400,1)),np.random.choice(range(1,501),size=(1400,1))]).reshape(1400,2)),
                          pd.DataFrame(np.array([np.random.choice(range(501,1001),size=(1400,1)),np.random.choice(range(501,1001),size=(1400,1))]).reshape(1400,2)),
                       pd.DataFrame(np.array([np.random.choice(range(1,501),size=(1200,1)),np.random.choice(range(501,1001),size=(1200,1))]).reshape(1200,2))],ignore_index=True).rename(columns={0:'target',1:'source'})

citations["source"] = citations["source"].apply(lambda name: paper_idx[name])
citations["target"] = citations["target"].apply(lambda name: paper_idx[name])

edges = citations[["source", "target"]].to_numpy().T
edge_weights = tf.ones(shape=edges.shape[1])
node_features = tf.cast(
    papers.sort_values("paper_id")[feature_names].to_numpy(), dtype=tf.dtypes.float32
)
graph_info = (node_features, edges, edge_weights)

Train the GNN model

gnn_model = GNNNodeClassifier(
    graph_info=graph_info,
    num_classes=num_classes,
    hidden_units=hidden_units,
    dropout_rate=dropout_rate,
    name="gnn_model",
)

history = run_experiment(gnn_model, x_train, y_train)

display_learning_curves(history)

x_test = test_data.paper_id.to_numpy()
_, test_accuracy = gnn_model.evaluate(x=x_test, y=y_test, verbose=0)
print(f"Test accuracy: {round(test_accuracy * 100, 2)}%")

Test accuracy: 49.49%

p = 0.5, 0.3 vs 0.7 Test accuracy: 1) 52.36% 2) 45.21% 3) 46.05% 4) 49.22% 5) 49.49%

num_nodes = node_features.shape[0]
new_node_features = np.concatenate([node_features, new_instances])
new_node_indices = [i + num_nodes for i in range(num_classes)]
new_citations = []
for subject_idx, group in papers.groupby("subject"):
    subject_papers = list(group.paper_id)
    # Select random x papers specific subject.
    selected_paper_indices1 = np.random.choice(subject_papers, 5)
    # Select random y papers from any subject (where y < x).
    selected_paper_indices2 = np.random.choice(list(papers.paper_id), 2)
    # Merge the selected paper indices.
    selected_paper_indices = np.concatenate(
        [selected_paper_indices1, selected_paper_indices2], axis=0
    )
    # Create edges between a citing paper idx and the selected cited papers.
    citing_paper_indx = new_node_indices[subject_idx]
    for cited_paper_idx in selected_paper_indices:
        new_citations.append([citing_paper_indx, cited_paper_idx])

new_citations = np.array(new_citations).T
new_edges = np.concatenate([edges, new_citations], axis=1)

print("Original node_features shape:", gnn_model.node_features.shape)
print("Original edges shape:", gnn_model.edges.shape)
gnn_model.node_features = new_node_features
gnn_model.edges = new_edges
gnn_model.edge_weights = tf.ones(shape=new_edges.shape[1])
print("New node_features shape:", gnn_model.node_features.shape)
print("New edges shape:", gnn_model.edges.shape)

logits = gnn_model.predict(tf.convert_to_tensor(new_node_indices))
probabilities = keras.activations.softmax(tf.convert_to_tensor(logits)).numpy()
display_class_probabilities(probabilities)

Original node_features shape: (1000, 6)
Original edges shape: (2, 4000)
New node_features shape: (1002, 6)
New edges shape: (2, 4014)
Instance 1:
- Deep learning: 50.72%
- Reinforcement learning: 49.28%
Instance 2:
- Deep learning: 50.78%
- Reinforcement learning: 49.22%

p = 0.5, 0.4 vs 0.6

citations =  pd.concat([pd.DataFrame(np.array([np.random.choice(range(1,501),size=(1200,1)),np.random.choice(range(1,501),size=(1200,1))]).reshape(1200,2)),
                          pd.DataFrame(np.array([np.random.choice(range(501,1001),size=(1200,1)),np.random.choice(range(501,1001),size=(1200,1))]).reshape(1200,2)),
                       pd.DataFrame(np.array([np.random.choice(range(1,501),size=(1600,1)),np.random.choice(range(501,1001),size=(1600,1))]).reshape(1600,2))],ignore_index=True).rename(columns={0:'target',1:'source'})

citations["source"] = citations["source"].apply(lambda name: paper_idx[name])
citations["target"] = citations["target"].apply(lambda name: paper_idx[name])

edges = citations[["source", "target"]].to_numpy().T
edge_weights = tf.ones(shape=edges.shape[1])
node_features = tf.cast(
    papers.sort_values("paper_id")[feature_names].to_numpy(), dtype=tf.dtypes.float32
)
graph_info = (node_features, edges, edge_weights)

Train the GNN model

gnn_model = GNNNodeClassifier(
    graph_info=graph_info,
    num_classes=num_classes,
    hidden_units=hidden_units,
    dropout_rate=dropout_rate,
    name="gnn_model",
)

history = run_experiment(gnn_model, x_train, y_train)

display_learning_curves(history)

x_test = test_data.paper_id.to_numpy()
_, test_accuracy = gnn_model.evaluate(x=x_test, y=y_test, verbose=0)
print(f"Test accuracy: {round(test_accuracy * 100, 2)}%")

Test accuracy: 49.28%

p = 0.4, 0.4 vs 0.6 Test accuracy: 1) 45.87% 2) 47.36% 3) 46.84% 4) 50.97% 5) 49.28%

num_nodes = node_features.shape[0]
new_node_features = np.concatenate([node_features, new_instances])
new_node_indices = [i + num_nodes for i in range(num_classes)]
new_citations = []
for subject_idx, group in papers.groupby("subject"):
    subject_papers = list(group.paper_id)
    # Select random x papers specific subject.
    selected_paper_indices1 = np.random.choice(subject_papers, 5)
    # Select random y papers from any subject (where y < x).
    selected_paper_indices2 = np.random.choice(list(papers.paper_id), 2)
    # Merge the selected paper indices.
    selected_paper_indices = np.concatenate(
        [selected_paper_indices1, selected_paper_indices2], axis=0
    )
    # Create edges between a citing paper idx and the selected cited papers.
    citing_paper_indx = new_node_indices[subject_idx]
    for cited_paper_idx in selected_paper_indices:
        new_citations.append([citing_paper_indx, cited_paper_idx])

new_citations = np.array(new_citations).T
new_edges = np.concatenate([edges, new_citations], axis=1)

print("Original node_features shape:", gnn_model.node_features.shape)
print("Original edges shape:", gnn_model.edges.shape)
gnn_model.node_features = new_node_features
gnn_model.edges = new_edges
gnn_model.edge_weights = tf.ones(shape=new_edges.shape[1])
print("New node_features shape:", gnn_model.node_features.shape)
print("New edges shape:", gnn_model.edges.shape)

logits = gnn_model.predict(tf.convert_to_tensor(new_node_indices))
probabilities = keras.activations.softmax(tf.convert_to_tensor(logits)).numpy()
display_class_probabilities(probabilities)

Original node_features shape: (1000, 6)
Original edges shape: (2, 4000)
New node_features shape: (1002, 6)
New edges shape: (2, 4014)
Instance 1:
- Deep learning: 50.61%
- Reinforcement learning: 49.39%
Instance 2:
- Deep learning: 51.89%
- Reinforcement learning: 48.11%

	paper_id	X1	X2	X3	X4	X5	X6	subject
0	1	1	0	1	0	0	0	Deep learning
1	2	1	1	1	0	0	1	Deep learning
2	3	1	1	1	0	0	0	Deep learning
3	4	1	0	1	0	1	0	Deep learning
4	5	1	1	1	0	1	0	Deep learning
...	...	...	...	...	...	...	...	...
995	996	0	1	0	1	1	1	Reinforcement learning
996	997	1	0	0	1	1	1	Reinforcement learning
997	998	1	0	0	0	1	1	Reinforcement learning
998	999	0	0	0	1	1	1	Reinforcement learning
999	1000	0	0	0	1	1	1	Reinforcement learning