# Spearman Correlation Coefficient and Graph Mining

#!/usr/bin/env python

# coding: utf-8

# 3rd Model: Deepgraph CNN: Stock Price Prediction using DeepGraphCNN Neural Networks. It includes GCN layers and CNN layers. I have added an MLP at the last layer to predict stock prices.

# Input graphs were created for Pearson, Spearman, and Kendall Tau correlations/coefficients from historical stock prices. Also, another graph is created based on financial news articles.

# For the sake of making execution easier (and at once), I have kept multiple approaches (Pearson, Spearman, Kendall Tau, and news-based) in the same file. One big code file can be difficult to handle; this is done just to make execution easier.

# Because I initially tried separately and brought the code together, some code might be a bit redundant/repeating. I may have done some cleaning.

# A use case of DeepGraphCNN for Node Classification

# https://stellargraph.readthedocs.io/en/latest/demos/graph-classification/dgcnn-graph-classification.html

# Import Libraries

# In[1]:

# import libraries

import os
import pandas as pd
import math

# In[2]:

# Import Libraries for Graph, GNN, and GCN

import stellargraph as sg
from stellargraph import StellarGraph
from stellargraph.layer import DeepGraphCNN
from stellargraph.mapper import FullBatchNodeGenerator
from stellargraph.mapper import PaddedGraphGenerator
from stellargraph.layer import GCN

# In[3]:

# Machine Learning related library imports

from tensorflow.keras import layers, optimizers, losses, metrics, Model
from sklearn import preprocessing, model_selection
from IPython.display import display, HTML
import matplotlib.pyplot as plt
# Render matplotlib figures inline in the notebook output (IPython only).
get_ipython().run_line_magic('matplotlib', 'inline')
from tensorflow.keras.layers import Dense, Conv1D, MaxPool1D, Dropout, Flatten
from tensorflow import keras

# In[4]:

# Flags for dropping NaN values column-wise or row-wise from the stock price
# data. A value of 1 enables the corresponding drop.
# I did not need to use these options that much.

drop_cols_with_na = 1
drop_rows_with_na = 1

# Dataset: Using 30 companies from the Fortune 500 companies (the paper used these stocks)

# In[5]:

# Load the per-day stock price data from the local ./data/ directory.
# (The original pre-assigned an empty DataFrame to df_s; read_csv overwrites
# it immediately, so that placeholder is dropped.)
data_file = "per-day-fortune-30-company-stock-price-data.csv"
df_s = pd.read_csv("./data/" + data_file, low_memory=False)
df_s.head()

# In[6]:

# You can see ANTM stock price data is empty

# Cure data such as replace missing/null values, use correct data type, sort by date (not really required)

# In[7]:

# Convert the Date field to a datetime type.
df_s["Date"] = df_s["Date"].astype('datetime64[ns]')

# Sort data by date, although this is no longer needed because the data was
# already sorted when it was generated.
# df_s = df_s.sort_values(by=['Ticker', 'Date'], ascending=True)
df_s = df_s.sort_values(by='Date', ascending=True)
df_s.head()

# In[8]:

# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.interpolate.html

df_s_transpose = df_s

# Best effort: fill gaps by interpolation. If interpolation fails, keep the
# un-interpolated frame and continue. The original code ended the handler
# with a bare `exit` expression, which does nothing, so execution always
# continued past this point.
try:
    df_s_transpose = df_s_transpose.interpolate(inplace=False)
except Exception:
    print("An exception occurred. Operation ignored")

# check if any value is null
df_s_transpose.isnull().values.any()

# show the rows that contain at least one null value (any(axis=1) reduces
# across the columns of each row)
df_s_transpose[df_s_transpose.isna().any(axis=1)]

# In[9]:

df_s_transpose

# In[10]:

# NOTE(review): this rebinds df_s_transpose to the raw df_s, so the result of
# the interpolation performed earlier in the file is not used from here on.
# Confirm that this is intended.
df_s_transpose = df_s

# Drop every column that contains at least one NaN when the flag is set.
if drop_cols_with_na == 1:
    df_s_transpose = df_s_transpose.dropna(axis=1)

print(df_s_transpose.shape)
df_s_transpose.head()

# In[11]:

# further check and verify: is any value null, and which rows contain nulls
df_s_transpose.isnull().values.any()
df_s_transpose[df_s_transpose.isna().any(axis=1)]

# In[12]:

# making the date column the index column for the dataset
df_s_transpose.index = df_s_transpose['Date']

# make sure the index has a datetime dtype
df_s_transpose.index = df_s_transpose.index.astype('datetime64[ns]')

# spearman Correlation Coefficient

# In[13]:

# Pairwise Spearman correlation between the numeric columns.
df_s_transpose_spearman = df_s_transpose.corr(method='spearman', numeric_only=True)
df_s_transpose_spearman

# spearman Correlation Coefficient based Adjacency Graph Matrix

# In[14]:

# Binarize in place: correlations >= 0.4 become 1 (edge), the rest become 0.
# The order matters: the 1s written by the first line are still >= 0.4, so
# the second line leaves them untouched.
df_s_transpose_spearman[df_s_transpose_spearman >= 0.4] = 1
df_s_transpose_spearman[df_s_transpose_spearman < 0.4] = 0
df_s_transpose_spearman

# In[15]:

# make the diagonal element to be zero. No self loop/edge
import numpy as np
np.fill_diagonal(df_s_transpose_spearman.values, 0)
df_s_transpose_spearman

# Create and visualize the Graphs

# In[17]:

import networkx as nx

# Build an undirected graph from the adjacency matrix.
Graph_spearman = nx.Graph(df_s_transpose_spearman)

# In[18]:

nx.draw_networkx(
    Graph_spearman,
    pos=nx.circular_layout(Graph_spearman),
    node_color='r',
    edge_color='b',
)

# Experiment, we will divide the data into train, test, and validation graphs

# In[19]:

df_s_transpose.corr(method='spearman', numeric_only=True)
#df_s_transpose[[{1,2,3}]]
#df_s_transpose.iloc[:, 0:10]

# In[20]:

# Train Graph

# In[21]:

# The train graph is built from the first 15 columns of the price frame.
df_s_spearman_train = df_s_transpose.iloc[:, 0:15]
df_s_transpose_spearman_train = df_s_spearman_train.corr(method='spearman', numeric_only=True)
np.fill_diagonal(df_s_transpose_spearman_train.values, 0)

# Binarize in place: >= 0.4 becomes 1 (edge), everything else becomes 0.
df_s_transpose_spearman_train[df_s_transpose_spearman_train >= 0.4] = 1
df_s_transpose_spearman_train[df_s_transpose_spearman_train < 0.4] = 0
df_s_transpose_spearman_train

# Test Graph

# In[22]:

# The test graph is built from the columns starting at position 15.
df_s_spearman_test = df_s_transpose.iloc[:, 15:]  #df_s_transpose.iloc[:, 15:23]
df_s_transpose_spearman_test = df_s_spearman_test.corr(method='spearman', numeric_only=True)
np.fill_diagonal(df_s_transpose_spearman_test.values, 0)

# Binarize the *test* matrix in place: >= 0.4 becomes 1 (edge), the rest 0.
# The original wrote into df_s_transpose_spearman_train here (copy-paste
# slip), which left the test matrix un-thresholded.
df_s_transpose_spearman_test[df_s_transpose_spearman_test >= 0.4] = 1
df_s_transpose_spearman_test[df_s_transpose_spearman_test < 0.4] = 0
df_s_transpose_spearman_test

# Validation Graph

# In[23]:

# The validation graph currently uses the same columns as the test graph.
df_s_spearman_validation = df_s_transpose.iloc[:, 15:]  #df_s_transpose.iloc[:, 23:]
df_s_transpose_spearman_validation = df_s_spearman_validation.corr(method='spearman', numeric_only=True)
np.fill_diagonal(df_s_transpose_spearman_validation.values, 0)
df_s_transpose_spearman_validation

# Binarize in place: >= 0.4 becomes 1 (edge), everything else becomes 0.
df_s_transpose_spearman_validation[df_s_transpose_spearman_validation >= 0.4] = 1
df_s_transpose_spearman_validation[df_s_transpose_spearman_validation < 0.4] = 0
df_s_transpose_spearman_validation

# In[24]:

# Build one undirected graph per split from its adjacency matrix.
graph_spearman_train = nx.Graph(df_s_transpose_spearman_train)
graph_spearman_test = nx.Graph(df_s_transpose_spearman_test)
graph_spearman_validation = nx.Graph(df_s_transpose_spearman_validation)

nx.draw_networkx(
    graph_spearman_train,
    pos=nx.circular_layout(graph_spearman_train),
    node_color='r',
    edge_color='b',
)

# In[25]:

df_s_spearman_train.corr(numeric_only=True)

# In[26]:

nx.draw_networkx(
    graph_spearman_test,
    pos=nx.circular_layout(graph_spearman_test),
    node_color='r',
    edge_color='b',
)

# In[27]:

nx.draw_networkx(
    graph_spearman_validation,
    pos=nx.circular_layout(graph_spearman_validation),
    node_color='r',
    edge_color='b',
)

# Create GCN layer. spearman

# Find all stocks = nodes

# In[28]:

# improvement: make sure only stocks/nodes that are in the graph are taken
all_stock_nodes = df_s_transpose_spearman.index.to_list()
all_stock_nodes[:5]

# Find all edges between nodes

# This may need adjustment to reflect train, test, validation graphs

# In[29]:

source = []
target = []
edge_feature = []

# Record one (source, target) pair for every positive adjacency entry.
# Both directions of each edge are visited, so each pair appears twice,
# once per orientation.
for aStock in all_stock_nodes:
    for anotherStock in all_stock_nodes:
        if df_s_transpose_spearman[aStock][anotherStock] > 0:
            #print(df_s_transpose_spearman[aStock][anotherStock])
            source.append(aStock)
            target.append(anotherStock)
            edge_feature.append(1)

# edge feature is not required except for news based graph
source, target, edge_feature

# Find all edges in Train, Test, and Validation Graphs

# In[30]:

trainSource = []
trainTarget = []
trainEdge_feature = []
trainNodeList = df_s_transpose_spearman_train.index.to_list()

testSource = []
testTarget = []
testEdge_feature = []
testNodeList = df_s_transpose_spearman_test.index.to_list()

validationSource = []
validationTarget = []
validationEdge_feature = []
validationNodeList = df_s_transpose_spearman_validation.index.to_list()

# For each split, record one (source, target) pair for every positive entry
# of that split's adjacency matrix.
for aStock in trainNodeList:
    for anotherStock in trainNodeList:
        if df_s_transpose_spearman_train[aStock][anotherStock] > 0:
            #print(df_s_transpose_spearman[aStock][anotherStock])
            trainSource.append(aStock)
            trainTarget.append(anotherStock)
            trainEdge_feature.append(1)

for aStock in testNodeList:
    for anotherStock in testNodeList:
        if df_s_transpose_spearman_test[aStock][anotherStock] > 0:
            #print(df_s_transpose_spearman[aStock][anotherStock])
            testSource.append(aStock)
            testTarget.append(anotherStock)
            testEdge_feature.append(1)

for aStock in validationNodeList:
    for anotherStock in validationNodeList:
        if df_s_transpose_spearman_validation[aStock][anotherStock] > 0:
            # Debug print kept disabled, matching the train and test loops.
            # print(df_s_transpose_spearman[aStock][anotherStock])
            validationSource.append(aStock)
            validationTarget.append(anotherStock)
            validationEdge_feature.append(1)

# edge feature is not required except for news based graph
trainSource, trainTarget, trainEdge_feature
testSource, testTarget, testEdge_feature
validationSource, validationTarget, validationEdge_feature

# Create variables to create stellar graph

# Edges

# In[31]:

# https://stellargraph.readthedocs.io/en/stable/demos/basics/loading-pandas.html

# Edge tables for the overall graph: one without and one with edge features.
spearman_edges = pd.DataFrame(
    {"source": source, "target": target}
)

spearman_edges_data = pd.DataFrame(
    {"source": source, "target": target, "edge_feature": edge_feature}
)

# https://stellargraph.readthedocs.io/en/stable/demos/basics/loading-pandas.html

# Edge tables for the train, test, and validation graphs.
spearman_edges_train = pd.DataFrame(
    {"source": trainSource, "target": trainTarget}
)

spearman_edges_data_train = pd.DataFrame(
    {"source": trainSource, "target": trainTarget, "edge_feature": trainEdge_feature}
)

spearman_edges_test = pd.DataFrame(
    {"source": testSource, "target": testTarget}
)

spearman_edges_data_test = pd.DataFrame(
    {"source": testSource, "target": testTarget, "edge_feature": testEdge_feature}
)

spearman_edges_validation = pd.DataFrame(
    {"source": validationSource, "target": validationTarget}
)

spearman_edges_train[:10]

# Have the time series data as part of the nodes

# Structure the Feature Matrix so that it can be passed to the GCN

# In[32]:

# Drop the date index so that each column is a plain per-stock time series.
df_s_transpose_feature = df_s_transpose.reset_index(drop=True, inplace=False)

# Disabled experiments. They must stay disabled: turning the frame into a
# list would break the column lookups below.
# df_s_transpose_feature = df_s_transpose_feature.values.tolist()
# print(df_s_transpose_feature.values.tolist())

#df_s_transpose_feature['WY'].values
df_s_transpose_feature['AAPL'].shape, df_s_transpose_feature['AAPL'].values

# In[33]:

len(all_stock_nodes)

# In[34]:

# bring/assign data to nodes: one array of values per stock, in the same
# order as all_stock_nodes
node_Data = []
for x in all_stock_nodes:
    node_Data.append(df_s_transpose_feature[x].values)

node_Data

# In[35]:

# convert node data variable into a dataframe so that the data structure is
# compatible with graph NN
spearman_graph_node_data = pd.DataFrame(node_Data, index=all_stock_nodes)
spearman_graph_node_data.head()

# In[36]:

# Quick inspection of one node's data and of the split sizes.
node_Data[14:15],
len(validationNodeList)
len(testNodeList)

# In[37]:

# Node time series data based on train, test, validation graph

# In[38]:

# convert node data variable into a dataframe so that the data structure is
# compatible with graph NN

# node_Data follows the order of all_stock_nodes. The train nodes come from
# the leading columns of the price frame and the test/validation nodes from
# the remaining ones, so the split point is the number of train nodes. The
# original hard-coded 14 here, which only works for one particular column
# layout.
n_train_nodes = len(trainNodeList)

spearman_graph_node_data_train = pd.DataFrame(node_Data[:n_train_nodes], index=trainNodeList)
spearman_graph_node_data_train.head()

spearman_graph_node_data_test = pd.DataFrame(node_Data[n_train_nodes:], index=testNodeList)  #pd.DataFrame(node_Data[15:23], index = testNodeList)
spearman_graph_node_data_test.head()

spearman_graph_node_data_validation = pd.DataFrame(node_Data[n_train_nodes:], index=validationNodeList)  #pd.DataFrame(node_Data[22:30], index = validationNodeList)
spearman_graph_node_data_validation.head()

# In[39]:

spearman_graph_node_data_train

# Graph (stellar) with features as part of Nodes

# In[40]:

# Overall
spearman_graph_with_node_features = StellarGraph(
    spearman_graph_node_data,
    edges=spearman_edges,
    node_type_default="corner",
    edge_type_default="line",
)
print(spearman_graph_with_node_features.info())

# train nodes
spearman_train_graph_with_node_features = StellarGraph(
    spearman_graph_node_data_train,
    edges=spearman_edges_train,
    node_type_default="corner",
    edge_type_default="line",
)
print(spearman_train_graph_with_node_features.info())

# test
spearman_test_graph_with_node_features = StellarGraph(
    spearman_graph_node_data_test,
    edges=spearman_edges_test,
    node_type_default="corner",
    edge_type_default="line",
)
print(spearman_test_graph_with_node_features.info())

# validation
spearman_validation_graph_with_node_features = StellarGraph(
    spearman_graph_node_data_validation,
    edges=spearman_edges_validation,
    node_type_default="corner",
    edge_type_default="line",
)
print(spearman_validation_graph_with_node_features.info())

# Adapting everything for DeepGraphCNN

# In[41]:

spearman_graph_node_data.iloc[0:15, :]

# Graphs to be used for DeepGraphCNN

# In[42]:

# Collect the per-split graphs; the overall graph is left out.
graphs = list()
#graphs.append(spearman_graph_with_node_features)
graphs.append(spearman_train_graph_with_node_features)
graphs.append(spearman_test_graph_with_node_features)
graphs.append(spearman_validation_graph_with_node_features)

# In[43]:

# Node and edge counts per graph.
summary = pd.DataFrame(
    [(g.number_of_nodes(), g.number_of_edges()) for g in graphs],
    columns=["nodes", "edges"],
)
summary.describe().round()

# In[44]:

graph_labels = all_stock_nodes

# In[45]:

# Generator

#generator = FullBatchNodeGenerator(spearman_graph_with_node_features, method = "gcn") # , sparse = False
#vars(generator)

# PaddedGraphGenerator is given the list of graphs collected above.
generator = PaddedGraphGenerator(graphs=graphs)

# Disabled alternative: it passes a single graph instead of a list and would
# replace the generator built above.
# generator = PaddedGraphGenerator( spearman_graph_with_node_features)

# In[46]:

vars(generator)

# Train Test Split

# Commented out on 2023-04-18

# train_subjects, test_subjects = model_selection.train_test_split(

# spearman_graph_node_data

# )

# val_subjects, test_subjects_step_2 = model_selection.train_test_split(

# test_subjects

# )

#, train_size = 500, test_size = None, stratify = test_subjects

# train_subjects.shape, test_subjects.shape, val_subjects.shape, test_subjects_step_2.shape