#!/usr/bin/env python
# coding: utf-8
# 3rd Model: DeepGraphCNN: Stock price prediction using DeepGraphCNN neural networks. The model includes GCN layers and CNN layers; I have added an MLP as the last layer to predict stock prices.
#
# Input graphs were created from Pearson, Spearman, and Kendall Tau correlation coefficients computed from historical stock prices. Another graph was created based on financial news articles.
#
# To make execution easier (and possible in one run), I have kept the multiple approaches (Pearson, Spearman, Kendall Tau, and news-based) in the same file. One big code file can be difficult to handle; this is done only to simplify execution.
#
# Because I initially tried the approaches separately and later brought the code together, some code might be a bit redundant/repetitive; I have done some cleaning where possible.
#
# A use case of DeepGraphCNN for graph classification:
# https://stellargraph.readthedocs.io/en/latest/demos/graph-classification/dgcnn-graph-classification.html
#
# Import Libraries
# In[1]:
# Import libraries
import os
import pandas as pd
import math
# In[2]:
# Import libraries for graphs, GNNs, and GCNs
import stellargraph as sg
from stellargraph import StellarGraph
from stellargraph.layer import DeepGraphCNN
from stellargraph.mapper import FullBatchNodeGenerator
from stellargraph.mapper import PaddedGraphGenerator
from stellargraph.layer import GCN
# In[3]:
# Machine learning related library imports
from tensorflow.keras import layers, optimizers, losses, metrics, Model
from sklearn import preprocessing, model_selection
from IPython.display import display, HTML
import matplotlib.pyplot as plt
get_ipython().run_line_magic('matplotlib', 'inline')
from tensorflow.keras.layers import Dense, Conv1D, MaxPool1D, Dropout, Flatten
from tensorflow import keras
# In[4]:
# Options to drop NaN values column- or row-wise from the stock price data
# I did not need to use these options much
drop_cols_with_na = 1
drop_rows_with_na = 1
# Dataset: Using 30 companies from the Fortune 500 companies (the paper used these stocks)
# In[5]:
df_s = pd.DataFrame()
data_file = "per-day-fortune-30-company-stock-price-data.csv"
df_s = pd.read_csv("./data/" + data_file, low_memory = False)
df_s.head()
# In[6]:
# You can see that the ANTM stock price data is empty
# Cure data such as replace missing/null values, use correct data type, sort by date (not really required)
# In[7]:
# Convert the Date field to a datetime type
df_s["Date"] = df_s["Date"].astype('datetime64[ns]')
# Sort data by date; this is no longer strictly needed, as the data was already sorted when it was generated
df_s = df_s.sort_values( by = ['Ticker','Date'], ascending = True )
df_s = df_s.sort_values( by = 'Date', ascending = True )
df_s.head()
# In[8]:
# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.interpolate.html
df_s_transpose = df_s
try:
    df_s_transpose = df_s_transpose.interpolate(inplace = False)
except Exception:
    print("An exception occurred. Operation ignored")

# Check if any value is null
df_s_transpose.isnull().values.any()

# Show the rows that contain any null value (axis = 1 checks across columns)
df_s_transpose[df_s_transpose.isna().any(axis = 1)]
# In[9]:
df_s_transpose
# In[10]:
df_s_transpose = df_s
if drop_cols_with_na == 1:
    df_s_transpose = df_s_transpose.dropna(axis = 1)
print(df_s_transpose.shape)
df_s_transpose.head()
# In[11]:
# Further check and verify
df_s_transpose.isnull().values.any()
df_s_transpose[df_s_transpose.isna().any( axis = 1 )]
# In[12]:
# Make the Date column the index of the dataset
df_s_transpose.index = df_s_transpose['Date']
df_s_transpose.index = df_s_transpose.index.astype('datetime64[ns]')
# Spearman Correlation Coefficient
# In[13]:
df_s_transpose_spearman = df_s_transpose.corr(method = 'spearman', numeric_only = True)
df_s_transpose_spearman
# Spearman Correlation Coefficient based Adjacency Matrix
# In[14]:
df_s_transpose_spearman[df_s_transpose_spearman >= 0.4] = 1
df_s_transpose_spearman[df_s_transpose_spearman < 0.4] = 0
df_s_transpose_spearman
# In[15]:
# Make the diagonal elements zero: no self loops/edges
import numpy as np
np.fill_diagonal(df_s_transpose_spearman.values, 0)
df_s_transpose_spearman
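# The two cells above implement the correlation-to-adjacency conversion that is
# repeated for every graph in this notebook: compute the Spearman matrix,
# threshold it at 0.4, and zero the diagonal. The helper below is an
# illustrative consolidation of that pattern (the function name and defaults
# are my own, not part of the original code).
def corr_to_adjacency(price_df, method = 'spearman', threshold = 0.4):
    adj = price_df.corr(method = method, numeric_only = True)
    adj[adj >= threshold] = 1
    adj[adj < threshold] = 0
    np.fill_diagonal(adj.values, 0)  # no self loops/edges
    return adj

# Example: equivalent to the cells above
# df_s_transpose_spearman = corr_to_adjacency(df_s_transpose)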
# Create and visualize the graphs
# In[17]:
import networkx as nx
Graph_spearman = nx.Graph(df_s_transpose_spearman)
# In[18]:
nx.draw_networkx(Graph_spearman, pos = nx.circular_layout( Graph_spearman ), node_color = 'r', edge_color = 'b')
# Experiment: we will divide the data into train, test, and validation graphs
# In[19]:
df_s_transpose.corr(method = 'spearman', numeric_only = True)
#df_s_transpose[[{1,2,3}]]
#df_s_transpose.iloc[:, 0:10]
# In[20]:
# Train Graph
# In[21]:
df_s_spearman_train = df_s_transpose.iloc[:, 0:15]
df_s_transpose_spearman_train = df_s_spearman_train.corr(method = 'spearman', numeric_only = True)
np.fill_diagonal(df_s_transpose_spearman_train.values, 0)
df_s_transpose_spearman_train[df_s_transpose_spearman_train >= 0.4] = 1
df_s_transpose_spearman_train[df_s_transpose_spearman_train < 0.4] = 0
df_s_transpose_spearman_train
# Test Graph
# In[22]:
df_s_spearman_test = df_s_transpose.iloc[:, 15:] #df_s_transpose.iloc[:, 15:23]
df_s_transpose_spearman_test = df_s_spearman_test.corr(method = 'spearman', numeric_only = True)
np.fill_diagonal(df_s_transpose_spearman_test.values, 0)
df_s_transpose_spearman_test[df_s_transpose_spearman_test >= 0.4] = 1
df_s_transpose_spearman_test[df_s_transpose_spearman_test < 0.4] = 0
df_s_transpose_spearman_test
# Validation Graph
# In[23]:
df_s_spearman_validation = df_s_transpose.iloc[:, 15:] #df_s_transpose.iloc[:, 23:]
df_s_transpose_spearman_validation = df_s_spearman_validation.corr(method = 'spearman', numeric_only = True)
np.fill_diagonal(df_s_transpose_spearman_validation.values, 0)
df_s_transpose_spearman_validation[df_s_transpose_spearman_validation >= 0.4] = 1
df_s_transpose_spearman_validation[df_s_transpose_spearman_validation < 0.4] = 0
df_s_transpose_spearman_validation
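# The three cells above repeat the same thresholding steps on different column
# slices of the data. With the illustrative corr_to_adjacency helper sketched
# earlier, the same train/test/validation adjacency matrices could be built in
# one line each (the slice boundaries are the ones used above):
# df_s_transpose_spearman_train      = corr_to_adjacency(df_s_transpose.iloc[:, 0:15])
# df_s_transpose_spearman_test       = corr_to_adjacency(df_s_transpose.iloc[:, 15:])
# df_s_transpose_spearman_validation = corr_to_adjacency(df_s_transpose.iloc[:, 15:])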
# In[24]:
graph_spearman_train = nx.Graph(df_s_transpose_spearman_train)
graph_spearman_test = nx.Graph(df_s_transpose_spearman_test)
graph_spearman_validation = nx.Graph(df_s_transpose_spearman_validation)
nx.draw_networkx(graph_spearman_train, pos = nx.circular_layout( graph_spearman_train ), node_color = 'r', edge_color = 'b')
# In[25]:
df_s_spearman_train.corr(numeric_only = True)
# In[26]:
nx.draw_networkx(graph_spearman_test, pos = nx.circular_layout( graph_spearman_test ), node_color = 'r', edge_color = 'b')
# In[27]:
nx.draw_networkx(graph_spearman_validation, pos = nx.circular_layout( graph_spearman_validation ), node_color = 'r', edge_color = 'b')
# Create the GCN layer (Spearman)
# Find all stocks (= nodes)
# In[28]:
# Improvement: make sure only stocks/nodes that are in the graph are taken
all_stock_nodes = df_s_transpose_spearman.index.to_list()
all_stock_nodes[:5]
# Find all edges between nodes
#
# This may need adjustment to reflect the train, test, and validation graphs
# In[29]:
source = []
target = []
edge_feature = []

for aStock in all_stock_nodes:
    for anotherStock in all_stock_nodes:
        if df_s_transpose_spearman[aStock][anotherStock] > 0:
            #print(df_s_transpose_spearman[aStock][anotherStock])
            source.append(aStock)
            target.append(anotherStock)
            edge_feature.append(1)

# The edge feature is not required except for the news-based graph
source, target, edge_feature
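# An equivalent vectorized sketch (illustrative; the variable names below are
# my own, not part of the original code): stack the adjacency matrix and keep
# the nonzero pairs instead of looping over every node combination.
adj_pairs = df_s_transpose_spearman.stack()
adj_pairs = adj_pairs[adj_pairs > 0]
spearman_edge_list = pd.DataFrame({
    "source": adj_pairs.index.get_level_values(0),
    "target": adj_pairs.index.get_level_values(1),
    "edge_feature": 1,
})
spearman_edge_list.head()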
# Find all edges in Train, Test, and Validation Graphs
# In[30]:
trainSource = []
trainTarget = []
trainEdge_feature = []
trainNodeList = df_s_transpose_spearman_train.index.to_list()

testSource = []
testTarget = []
testEdge_feature = []
testNodeList = df_s_transpose_spearman_test.index.to_list()

validationSource = []
validationTarget = []
validationEdge_feature = []
validationNodeList = df_s_transpose_spearman_validation.index.to_list()

for aStock in trainNodeList:
    for anotherStock in trainNodeList:
        if df_s_transpose_spearman_train[aStock][anotherStock] > 0:
            #print(df_s_transpose_spearman[aStock][anotherStock])
            trainSource.append(aStock)
            trainTarget.append(anotherStock)
            trainEdge_feature.append(1)

for aStock in testNodeList:
    for anotherStock in testNodeList:
        if df_s_transpose_spearman_test[aStock][anotherStock] > 0:
            #print(df_s_transpose_spearman[aStock][anotherStock])
            testSource.append(aStock)
            testTarget.append(anotherStock)
            testEdge_feature.append(1)

for aStock in validationNodeList:
    for anotherStock in validationNodeList:
        if df_s_transpose_spearman_validation[aStock][anotherStock] > 0:
            #print(df_s_transpose_spearman[aStock][anotherStock])
            validationSource.append(aStock)
            validationTarget.append(anotherStock)
            validationEdge_feature.append(1)

# The edge feature is not required except for the news-based graph
trainSource, trainTarget, trainEdge_feature
testSource, testTarget, testEdge_feature
validationSource, validationTarget, validationEdge_feature
# Create variables to create stellar graph
# Edges
# In[31]:
# https://stellargraph.readthedocs.io/en/stable/demos/basics/loading-pandas.html
spearman_edges = pd.DataFrame(
{"source": source, "target": target}
)
spearman_edges_data = pd.DataFrame(
{"source": source, "target": target, "edge_feature": edge_feature}
)
# https://stellargraph.readthedocs.io/en/stable/demos/basics/loading-pandas.html
spearman_edges_train = pd.DataFrame(
{"source": trainSource, "target": trainTarget}
)
spearman_edges_data_train = pd.DataFrame(
{"source": trainSource, "target": trainTarget, "edge_feature": trainEdge_feature}
)
spearman_edges_test = pd.DataFrame(
{"source": testSource, "target": testTarget}
)
spearman_edges_data_test = pd.DataFrame(
{"source": testSource, "target": testTarget, "edge_feature": testEdge_feature}
)
spearman_edges_validation = pd.DataFrame(
{"source": validationSource, "target": validationTarget}
)
spearman_edges_train[:10]
# Have the time series data as part of the nodes
# Structure the Feature Matrix so that it can be passed to the GCN
# In[32]:
df_s_transpose_feature = df_s_transpose.reset_index(drop = True, inplace = False)
# Keep df_s_transpose_feature as a DataFrame; converting it to a plain list would break the per-ticker access below
#df_s_transpose_feature = df_s_transpose_feature.values.tolist()
#print(df_s_transpose_feature.values.tolist())
#df_s_transpose_feature['WY'].values
df_s_transpose_feature['AAPL'].shape, df_s_transpose_feature['AAPL'].values
# In[33]:
len(all_stock_nodes)
# In[34]:
# Bring/assign data to nodes
node_Data = []
for x in all_stock_nodes:
    node_Data.append( df_s_transpose_feature[x].values )
node_Data
# In[35]:
# Convert the node data variable into a DataFrame so that the data structure is compatible with the graph NN
spearman_graph_node_data = pd.DataFrame(node_Data, index = all_stock_nodes)
spearman_graph_node_data.head()
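# A quick sanity check (illustrative; the exact shape depends on how many
# trading days are in the CSV): the node-feature matrix should have one row
# per ticker and one column per time step, which is what the StellarGraph
# constructors below expect.
print(spearman_graph_node_data.shape)  # expected: (number of tickers, number of trading days)
assert spearman_graph_node_data.shape[0] == len(all_stock_nodes)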
# In[36]:
node_Data[14:15]
len(validationNodeList)
len(testNodeList)
# In[37]:
# Node time series data based on the train, test, and validation graphs
# In[38]:
# Convert the node data variables into DataFrames so that the data structure is compatible with the graph NN
spearman_graph_node_data_train = pd.DataFrame(node_Data[0:14], index = trainNodeList)
spearman_graph_node_data_train.head()
spearman_graph_node_data_test = pd.DataFrame(node_Data[14:], index = testNodeList) #pd.DataFrame(node_Data[15:23], index = testNodeList)
spearman_graph_node_data_test.head()
spearman_graph_node_data_validation = pd.DataFrame(node_Data[14:], index = validationNodeList) #pd.DataFrame(node_Data[22:30], index = validationNodeList)
spearman_graph_node_data_validation.head()
# In[39]:
spearman_graph_node_data_train
# StellarGraph objects with features as part of the nodes
# In[40]:
# Overall graph
spearman_graph_with_node_features = StellarGraph(spearman_graph_node_data, edges = spearman_edges, node_type_default = "corner", edge_type_default = "line")
print(spearman_graph_with_node_features.info())
# Train graph
spearman_train_graph_with_node_features = StellarGraph(spearman_graph_node_data_train, edges = spearman_edges_train, node_type_default = "corner", edge_type_default = "line")
print(spearman_train_graph_with_node_features.info())
# Test graph
spearman_test_graph_with_node_features = StellarGraph(spearman_graph_node_data_test, edges = spearman_edges_test, node_type_default = "corner", edge_type_default = "line")
print(spearman_test_graph_with_node_features.info())
# Validation graph
spearman_validation_graph_with_node_features = StellarGraph(spearman_graph_node_data_validation, edges = spearman_edges_validation, node_type_default = "corner", edge_type_default = "line")
print(spearman_validation_graph_with_node_features.info())
# Adapting everything for DeepGraphCNN
# In[41]:
spearman_graph_node_data.iloc[0:15, :]
# Graphs to be used for DeepGraphCNN
# In[42]:
graphs = list()
#graphs.append(spearman_graph_with_node_features)
graphs.append(spearman_train_graph_with_node_features)
graphs.append(spearman_test_graph_with_node_features)
graphs.append(spearman_validation_graph_with_node_features)
# In[43]:
summary = pd.DataFrame(
[(g.number_of_nodes(), g.number_of_edges()) for g in graphs],
columns=["nodes", "edges"],
)
summary.describe().round()
# In[44]:
graph_labels = all_stock_nodes
# In[45]:
# Generator
#generator = FullBatchNodeGenerator(spearman_graph_with_node_features, method = "gcn") # , sparse = False
#vars(generator)
generator = PaddedGraphGenerator( graphs = graphs)
# PaddedGraphGenerator expects a collection of graphs; the single-graph call below would override the generator above, so it is kept only for reference
#generator = PaddedGraphGenerator( graphs = [spearman_graph_with_node_features])
# In[46]:
vars(generator)
# Train Test Split
# Commented out on 2023-04-18
#train_subjects, test_subjects = model_selection.train_test_split(
#    spearman_graph_node_data
#)
#
#val_subjects, test_subjects_step_2 = model_selection.train_test_split(
#    test_subjects
#)
#
#, train_size = 500, test_size = None, stratify = test_subjects
#
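# DeepGraphCNN model (sketch)
# With the graph list and PaddedGraphGenerator in place, the following is a
# minimal sketch of the DeepGraphCNN model described in the introduction
# (GCN layers, 1-D CNN layers, and an MLP head for price prediction). It
# follows the StellarGraph DGCNN demo linked at the top of this file; the
# layer sizes, k, learning rate, and output activation are illustrative
# placeholders, not tuned values, and training targets are not defined in
# this section.

dgcnn_layer_sizes = [32, 32, 32, 1]
k = 10  # number of nodes kept by the SortPooling step

dgcnn_model = DeepGraphCNN(
    layer_sizes = dgcnn_layer_sizes,
    activations = ["tanh", "tanh", "tanh", "tanh"],
    k = k,
    bias = False,
    generator = generator,
)
x_inp, x_out = dgcnn_model.in_out_tensors()

# 1-D convolutions and pooling over the sorted node embeddings, then an MLP head
x_out = Conv1D(filters = 16, kernel_size = sum(dgcnn_layer_sizes), strides = sum(dgcnn_layer_sizes))(x_out)
x_out = MaxPool1D(pool_size = 2)(x_out)
x_out = Conv1D(filters = 32, kernel_size = 5, strides = 1)(x_out)
x_out = Flatten()(x_out)
x_out = Dense(units = 128, activation = "relu")(x_out)
x_out = Dropout(rate = 0.5)(x_out)
predictions = Dense(units = 1, activation = "linear")(x_out)  # regression output for the predicted price

model = Model(inputs = x_inp, outputs = predictions)
model.compile(
    optimizer = optimizers.Adam(learning_rate = 0.001),
    loss = losses.mean_squared_error,
    metrics = [metrics.mean_squared_error],
)
model.summary()

# Training would use generator.flow(...) with per-graph targets, e.g. (train_targets is a placeholder, not defined here):
# train_gen = generator.flow([0], targets = train_targets, batch_size = 1)  # graph index 0 = the train graph
# model.fit(train_gen, epochs = 100)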