K-Means Clustering

Click on the images to see them clearly

#!/usr/bin/env python

coding: utf-8

In[1]:

k-means clustering

from numpy import unique
from numpy import where
from sklearn.datasets import make_classification
from sklearn.cluster import KMeans
from matplotlib import pyplot
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
get_ipython().run_line_magic(‘matplotlib’, ‘inline’)
import pandas as pd
import numpy as np
import numpy as np
from sklearn.cluster import KMeans
from sklearn import datasets
from sklearn.preprocessing import StandardScaler

In[2]:

import warnings
warnings.filterwarnings(‘ignore’)

In[3]:

the combined data

data_folder = ‘./nhanes_input_data/’

import the CSV as a pandas dataframe

df = pd.read_csv( data_folder + ‘0_dietaryIntakeDataForClassificationAndAnalysisData.csv’)
df.shape

In[4]:

df.head(5)

In[5]:

parameters to be used for KMeans clustring: centres

X and/or kdf will have only features we want to create cluster around

kdf = df[
   
    [
        'RIDAGEYR_Age_in_years_at_screening'
        ,'URDACT_Albumin_creatinine_ratio_mg_g'
    ]
]
X = kdf
X[:5]

In[6]:

ref: internet (not my code, using as a library)

def clean_dataset(df):
assert isinstance(df, pd.DataFrame), "df needs to be a pd.DataFrame"
df.dropna(inplace=True)
indices_to_keep = ~df.isin([np.nan, np.inf, -np.inf]).any(1)
return df[indices_to_keep].astype(np.float64)

In[7]:

X has the features to cluster around (centres: Age, ACR). df has the complete data

after clustering is done using features in X, we find positions (index) for each data

in a cluster then we use those index positions to cluster the data from df

X.shape, df.shape

In[8]:

X = clean_dataset(X)

In[9]:

define the model

model = KMeans(n_clusters = 10) #,random_state=0, n_init="auto"

fit the model

model.fit(X)
#model.labels_

In[10]:

Create csv files with the cluster daya

One csv for one Cluster

In[11]:

howManyClusters = 10
for clusterId in range (howManyClusters):
ind_list = np.where(model.labels_ == clusterId )[0]
cluster = df.iloc[ind_list]
cluster.to_csv(‘./nhanes_output_data/classifiedGroups/kmeanscluster/cluster-‘
+ str(clusterId) + ‘.csv’);

In[12]:

model.cluster_centers_

In[13]:

Scatter plot to see each cluster points visually

std_data = StandardScaler().fit_transform(X)
plt.scatter(std_data[:,0], std_data[:,1], c = model.labels_, cmap = "rainbow")
plt.title("K-means Clustering of Diet and ACR data")
plt.show()

# References:

# print("Shape of cluster:", model.cluster_centers_.shape)

# https://stackoverflow.com/questions/50297142/get-cluster-points-after-kmeans-in-a-list-format

#

# https://machinelearningmastery.com/clustering-algorithms-with-python/

# https://stackoverflow.com/questions/50297142/get-cluster-points-after-kmeans-in-a-list-format

# https://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html

# https://datascience.stackexchange.com/questions/48693/perform-k-means-clustering-over-multiple-columns

# https://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html

#

>>> from sklearn.cluster import KMeans

>>> import numpy as np

>>> X = np.array([[1, 2], [1, 4], [1, 0],

… [10, 2], [10, 4], [10, 0]])

>>> kmeans = KMeans(n_clusters=2, random_state=0, n_init="auto").fit(X)

>>> kmeans.labels_

array([1, 1, 1, 0, 0, 0], dtype=int32)

>>> kmeans.predict([[0, 0], [12, 3]])

array([1, 0], dtype=int32)

>>> kmeans.cluster_centers_

array([[10.,  2.],

[ 1.,  2.]])

In[ ]: