{"id":76075,"date":"2024-05-18T21:44:43","date_gmt":"2024-05-19T01:44:43","guid":{"rendered":"http:\/\/bangla.sitestree.com\/?p=76075"},"modified":"2024-05-18T21:44:43","modified_gmt":"2024-05-19T01:44:43","slug":"k-means-clustering","status":"publish","type":"post","link":"http:\/\/bangla.sitestree.com\/?p=76075","title":{"rendered":"K-Means Clustering"},"content":{"rendered":"<p>Click on the images to see them clearly<\/p>\n<p><a href=\"https:\/\/i0.wp.com\/bangla.sitestree.com\/wp-content\/uploads\/2024\/05\/image-40.png\" rel=\"attachment wp-att-76076\"><img data-recalc-dims=\"1\" loading=\"lazy\" decoding=\"async\" src=\"https:\/\/i0.wp.com\/bangla.sitestree.com\/wp-content\/uploads\/2024\/05\/image-40.png?resize=750%2C750\" alt=\"\" title=\"image-40-png\" width=\"750\" height=\"750\" class=\"alignnone size-full wp-image-76076\" \/><\/a><\/p>\n<p>#!\/usr\/bin\/env python<\/p>\n<h1>coding: utf-8<\/h1>\n<h1>In[1]:<\/h1>\n<h1>k-means clustering<\/h1>\n<p>from numpy import unique<br \/>\nfrom numpy import where<br \/>\nfrom sklearn.datasets import make_classification<br \/>\nfrom sklearn.cluster import KMeans<br \/>\nfrom matplotlib import pyplot<br \/>\nimport numpy as np<br \/>\nimport pandas as pd<br \/>\nimport seaborn as sns<br \/>\nimport matplotlib.pyplot as plt<br \/>\nget_ipython().run_line_magic(&#8216;matplotlib&#8217;, &#8216;inline&#8217;)<br \/>\nimport pandas as pd<br \/>\nimport numpy as np<br \/>\nimport numpy as np<br \/>\nfrom sklearn.cluster import KMeans<br \/>\nfrom sklearn import datasets<br \/>\nfrom sklearn.preprocessing import StandardScaler<\/p>\n<h1>In[2]:<\/h1>\n<p>import warnings<br \/>\nwarnings.filterwarnings(&#8216;ignore&#8217;)<\/p>\n<h1>In[3]:<\/h1>\n<h1>the combined data<\/h1>\n<p>data_folder = &#8216;.\/nhanes_input_data\/&#8217;<\/p>\n<h1>import the CSV as a pandas dataframe<\/h1>\n<p>df = pd.read_csv( data_folder + &#8216;0_dietaryIntakeDataForClassificationAndAnalysisData.csv&#8217;)<br \/>\ndf.shape<\/p>\n<h1>In[4]:<\/h1>\n<p>df.head(5)<\/p>\n<h1>In[5]:<\/h1>\n<h1>parameters to be used for KMeans clustring: centres<\/h1>\n<h1>X and\/or kdf will have only features we want to create cluster around<\/h1>\n<p>kdf = df[<br \/>\u00a0 \u00a0 <br \/>\u00a0 \u00a0 [<br \/>\u00a0 \u00a0 \u00a0 \u00a0 &#39;RIDAGEYR_Age_in_years_at_screening&#39;<br \/>\u00a0 \u00a0 \u00a0 \u00a0 ,&#39;URDACT_Albumin_creatinine_ratio_mg_g&#39;<br \/>\u00a0 \u00a0 ]<br \/>\n]<br \/>\nX = kdf<br \/>\nX[:5]<\/p>\n<h1>In[6]:<\/h1>\n<h1>ref: internet (not my code, using as a library)<\/h1>\n<p>def clean_dataset(df):<br \/>\nassert isinstance(df, pd.DataFrame), &quot;df needs to be a pd.DataFrame&quot;<br \/>\ndf.dropna(inplace=True)<br \/>\nindices_to_keep = ~df.isin([np.nan, np.inf, -np.inf]).any(1)<br \/>\nreturn df[indices_to_keep].astype(np.float64)<\/p>\n<h1>In[7]:<\/h1>\n<h1>X has the features to cluster around (centres: Age, ACR). df has the complete data<\/h1>\n<h1>after clustering is done using features in X, we find positions (index) for each data<\/h1>\n<h1>in a cluster then we use those index positions to cluster the data from df<\/h1>\n<p>X.shape, df.shape<\/p>\n<h1>In[8]:<\/h1>\n<p>X = clean_dataset(X)<\/p>\n<h1>In[9]:<\/h1>\n<h1>define the model<\/h1>\n<p>model = KMeans(n_clusters = 10) #,random_state=0, n_init=&quot;auto&quot;<\/p>\n<h1>fit the model<\/h1>\n<p>model.fit(X)<br \/>\n#model.labels_<\/p>\n<h1>In[10]:<\/h1>\n<h1>Create csv files with the cluster daya<\/h1>\n<h1>One csv for one Cluster<\/h1>\n<h1>In[11]:<\/h1>\n<p>howManyClusters = 10<br \/>\nfor clusterId in range (howManyClusters):<br \/>\nind_list = np.where(model.labels_ == clusterId )[0]<br \/>\ncluster = df.iloc[ind_list]<br \/>\ncluster.to_csv(&#8216;.\/nhanes_output_data\/classifiedGroups\/kmeanscluster\/cluster-&#8216;<br \/>\n+ str(clusterId) + &#8216;.csv&#8217;);<\/p>\n<h1>In[12]:<\/h1>\n<p>model.cluster_centers_<\/p>\n<h1>In[13]:<\/h1>\n<h1>Scatter plot to see each cluster points visually<\/h1>\n<p>std_data = StandardScaler().fit_transform(X)<br \/>\nplt.scatter(std_data[:,0], std_data[:,1], c = model.labels_, cmap = &quot;rainbow&quot;)<br \/>\nplt.title(&quot;K-means Clustering of Diet and ACR data&quot;)<br \/>\nplt.show()<\/p>\n<h1># References:<\/h1>\n<h1># print(&quot;Shape of cluster:&quot;, model.cluster_centers_.shape)<\/h1>\n<h1># <a href=\"https:\/\/stackoverflow.com\/questions\/50297142\/get-cluster-points-after-kmeans-in-a-list-format\">https:\/\/stackoverflow.com\/questions\/50297142\/get-cluster-points-after-kmeans-in-a-list-format<\/a><\/h1>\n<p>#<\/p>\n<h1># <a href=\"https:\/\/machinelearningmastery.com\/clustering-algorithms-with-python\/\">https:\/\/machinelearningmastery.com\/clustering-algorithms-with-python\/<\/a><\/h1>\n<h1># <a href=\"https:\/\/stackoverflow.com\/questions\/50297142\/get-cluster-points-after-kmeans-in-a-list-format\">https:\/\/stackoverflow.com\/questions\/50297142\/get-cluster-points-after-kmeans-in-a-list-format<\/a><\/h1>\n<h1># <a href=\"https:\/\/scikit-learn.org\/stable\/modules\/generated\/sklearn.cluster.KMeans.html\">https:\/\/scikit-learn.org\/stable\/modules\/generated\/sklearn.cluster.KMeans.html<\/a><\/h1>\n<h1># <a href=\"https:\/\/datascience.stackexchange.com\/questions\/48693\/perform-k-means-clustering-over-multiple-columns\">https:\/\/datascience.stackexchange.com\/questions\/48693\/perform-k-means-clustering-over-multiple-columns<\/a><\/h1>\n<h1># <a href=\"https:\/\/scikit-learn.org\/stable\/modules\/generated\/sklearn.cluster.KMeans.html\">https:\/\/scikit-learn.org\/stable\/modules\/generated\/sklearn.cluster.KMeans.html<\/a><\/h1>\n<p>#<\/p>\n<h1>&gt;&gt;&gt; from sklearn.cluster import KMeans<\/h1>\n<h1>&gt;&gt;&gt; import numpy as np<\/h1>\n<h1>&gt;&gt;&gt; X = np.array([[1, 2], [1, 4], [1, 0],<\/h1>\n<h1>&#8230; [10, 2], [10, 4], [10, 0]])<\/h1>\n<h1>&gt;&gt;&gt; kmeans = KMeans(n_clusters=2, random_state=0, n_init=&quot;auto&quot;).fit(X)<\/h1>\n<h1>&gt;&gt;&gt; kmeans.labels_<\/h1>\n<h1>array([1, 1, 1, 0, 0, 0], dtype=int32)<\/h1>\n<h1>&gt;&gt;&gt; kmeans.predict([[0, 0], [12, 3]])<\/h1>\n<h1>array([1, 0], dtype=int32)<\/h1>\n<h1>&gt;&gt;&gt; kmeans.cluster_centers_<\/h1>\n<h1>array([[10., \u00a02.],<\/h1>\n<h1>[ 1., \u00a02.]])<\/h1>\n<h1>In[ ]:<\/h1>\n<p><a href=\"https:\/\/i0.wp.com\/bangla.sitestree.com\/wp-content\/uploads\/2024\/05\/image-41.png\" rel=\"attachment wp-att-76077\"><img data-recalc-dims=\"1\" loading=\"lazy\" decoding=\"async\" src=\"https:\/\/i0.wp.com\/bangla.sitestree.com\/wp-content\/uploads\/2024\/05\/image-41.png?resize=750%2C750\" alt=\"\" title=\"image-41-png\" width=\"750\" height=\"750\" class=\"alignnone size-full wp-image-76077\" \/><\/a><\/p>\n","protected":false},"excerpt":{"rendered":"<p>Click on the images to see them clearly #!\/usr\/bin\/env python coding: utf-8 In[1]: k-means clustering from numpy import unique from numpy import where from sklearn.datasets import make_classification from sklearn.cluster import KMeans from matplotlib import pyplot import numpy as np import pandas as pd import seaborn as sns import matplotlib.pyplot as plt get_ipython().run_line_magic(&#8216;matplotlib&#8217;, &#8216;inline&#8217;) import pandas &hellip; <\/p>\n<p><a class=\"more-link btn\" href=\"http:\/\/bangla.sitestree.com\/?p=76075\">Continue reading<\/a><\/p>\n","protected":false},"author":2,"featured_media":0,"comment_status":"open","ping_status":"open","sticky":false,"template":"","format":"standard","meta":{"_jetpack_memberships_contains_paid_content":false,"footnotes":""},"categories":[182],"tags":[],"class_list":["post-76075","post","type-post","status-publish","format-standard","hentry","category---blog","item-wrap"],"jetpack_featured_media_url":"","jetpack_sharing_enabled":true,"jetpack-related-posts":[{"id":16923,"url":"http:\/\/bangla.sitestree.com\/?p=16923","url_meta":{"origin":76075,"position":0},"title":"Python Libraries for Data Science esp. for NLP &#8211; Natural Language Processing","author":"Sayed","date":"February 14, 2020","format":false,"excerpt":"For NLP tasks, either you will come across these libraries or you will have to use many of these Python libraries. import nltk # tokenizer nltk.download(\"punkt\") # stop words nltk.download(\"stopwords\") from nltk.tokenize import TreebankWordTokenizer from nltk.tokenize import WordPunctTokenizer from nltk.tokenize import RegexpTokenizer from nltk.tokenize import sent_tokenize from nltk.corpus import stopwords\u2026","rel":"","context":"In &quot;\u09ac\u09cd\u09b2\u0997 \u0964 Blog&quot;","block_context":{"text":"\u09ac\u09cd\u09b2\u0997 \u0964 Blog","link":"http:\/\/bangla.sitestree.com\/?cat=182"},"img":{"alt_text":"","src":"","width":0,"height":0},"classes":[]},{"id":76436,"url":"http:\/\/bangla.sitestree.com\/?p=76436","url_meta":{"origin":76075,"position":1},"title":"1. Libraries used for the project: Predict Future Stock Price using Graph Theory, Machine Learning and Deep Learning)","author":"Sayed","date":"December 4, 2024","format":false,"excerpt":"#import libraries import osimport pandas as pdimport math #Import Libraries for Graph, GNN, and GCN import stellargraph as sgfrom stellargraph import StellarGraphfrom stellargraph.layer import DeepGraphCNNfrom stellargraph.mapper import FullBatchNodeGeneratorfrom stellargraph.mapper import PaddedGraphGeneratorfrom stellargraph.layer import GCN #Machine Learnig related library Imports from tensorflow.keras import layers, optimizers, losses, metrics, Modelfrom sklearn import preprocessing,\u2026","rel":"","context":"In &quot;Code: Predict Future Stock Price using Graph Theory, Machine Learning and Deep Learning)&quot;","block_context":{"text":"Code: Predict Future Stock Price using Graph Theory, Machine Learning and Deep Learning)","link":"http:\/\/bangla.sitestree.com\/?cat=1969"},"img":{"alt_text":"","src":"","width":0,"height":0},"classes":[]},{"id":76580,"url":"http:\/\/bangla.sitestree.com\/?p=76580","url_meta":{"origin":76075,"position":2},"title":"3D Scatter Plot in Python","author":"Sayed","date":"January 12, 2025","format":false,"excerpt":"Visualizing 3-D numeric data with Scatter Plots length, breadth and depth Ref: https:\/\/towardsdatascience.com\/the-art-of-effective-visualization-of-multi-dimensional-data-6c7202990c57 import pandas as pdimport matplotlib.pyplot as pltfrom mpl_toolkits.mplot3d import Axes3Dimport matplotlib as mplimport numpy as npimport seaborn as sns%matplotlib inline fig = plt.figure(figsize=(8, 6))ax = fig.add_subplot(111, projection='3d') xs = wines['residual sugar']ys = wines['fixed acidity']zs = wines['alcohol']ax.scatter(xs, ys,\u2026","rel":"","context":"In &quot;Data Visualization&quot;","block_context":{"text":"Data Visualization","link":"http:\/\/bangla.sitestree.com\/?cat=1903"},"img":{"alt_text":"","src":"https:\/\/i0.wp.com\/bangla.sitestree.com\/wp-content\/uploads\/2025\/01\/image-17.png?resize=350%2C200","width":350,"height":200,"srcset":"https:\/\/i0.wp.com\/bangla.sitestree.com\/wp-content\/uploads\/2025\/01\/image-17.png?resize=350%2C200 1x, https:\/\/i0.wp.com\/bangla.sitestree.com\/wp-content\/uploads\/2025\/01\/image-17.png?resize=525%2C300 1.5x, https:\/\/i0.wp.com\/bangla.sitestree.com\/wp-content\/uploads\/2025\/01\/image-17.png?resize=700%2C400 2x"},"classes":[]},{"id":14562,"url":"http:\/\/bangla.sitestree.com\/?p=14562","url_meta":{"origin":76075,"position":3},"title":"Basic Numpy Operations","author":"Sayed","date":"January 5, 2019","format":false,"excerpt":"Basic Numpy Operations import numpy as np a = np.arange(15).reshape(3, 5) print(a) print(a.shape) print(a.ndim) print(a.dtype.name) print(a.itemsize) print(a.size) print(type(a)) b = np.array([6, 7, 8]) print(b) type(b) #","rel":"","context":"In &quot;\u09ac\u09cd\u09b2\u0997 \u0964 Blog&quot;","block_context":{"text":"\u09ac\u09cd\u09b2\u0997 \u0964 Blog","link":"http:\/\/bangla.sitestree.com\/?cat=182"},"img":{"alt_text":"","src":"","width":0,"height":0},"classes":[]},{"id":24917,"url":"http:\/\/bangla.sitestree.com\/?p=24917","url_meta":{"origin":76075,"position":4},"title":"Basic Numpy Operations #Root","author":"Author-Check- Article-or-Video","date":"April 13, 2021","format":false,"excerpt":"Basic Numpy Operations import numpy as np a = np.arange(15).reshape(3, 5) print(a) print(a.shape) print(a.ndim) print(a.dtype.name) print(a.itemsize) print(a.size) print(type(a)) b = np.array([6, 7, 8]) print(b) type(b) # From: https:\/\/sitestree.com\/basic-numpy-operations\/ Categories:RootTags: Post Data:2019-01-05 15:20:17 Shop Online: https:\/\/www.ShopForSoul.com\/ (Big Data, Cloud, Security, Machine Learning): Courses: http:\/\/Training.SitesTree.com In Bengali: http:\/\/Bangla.SaLearningSchool.com http:\/\/SitesTree.com 8112223 Canada Inc.\/JustEtc:\u2026","rel":"","context":"In &quot;FromSitesTree.com&quot;","block_context":{"text":"FromSitesTree.com","link":"http:\/\/bangla.sitestree.com\/?cat=1917"},"img":{"alt_text":"","src":"","width":0,"height":0},"classes":[]},{"id":76087,"url":"http:\/\/bangla.sitestree.com\/?p=76087","url_meta":{"origin":76075,"position":5},"title":"Python\/ML Correlation Coefficients","author":"Sayed","date":"May 19, 2024","format":false,"excerpt":"df_s_transpose_pearson = df_s_transpose.corr(method = 'pearson', numeric_only = True) df_s_transpose_pearson # Pearson Correlation Coefficient df_s_transpose_pearson = df_s_transpose.corr(method = 'pearson', numeric_only = True) df_s_transpose_pearson Pearson Correlation Coefficient based Adjacency Graph Matrix df_s_transpose_pearson[df_s_transpose_pearson >= 0.5] = 1 df_s_transpose_pearson[df_s_transpose_pearson < 0.5] = 0 df_s_transpose_pearson Create a Graph import networkx as nx Graph_pearson = nx.Graph(df_s_transpose_pearson)\u2026","rel":"","context":"In &quot;\u09ac\u09cd\u09b2\u0997 \u0964 Blog&quot;","block_context":{"text":"\u09ac\u09cd\u09b2\u0997 \u0964 Blog","link":"http:\/\/bangla.sitestree.com\/?cat=182"},"img":{"alt_text":"","src":"https:\/\/i0.wp.com\/bangla.sitestree.com\/wp-content\/uploads\/2024\/05\/image-44.png?resize=350%2C200","width":350,"height":200,"srcset":"https:\/\/i0.wp.com\/bangla.sitestree.com\/wp-content\/uploads\/2024\/05\/image-44.png?resize=350%2C200 1x, https:\/\/i0.wp.com\/bangla.sitestree.com\/wp-content\/uploads\/2024\/05\/image-44.png?resize=525%2C300 1.5x, https:\/\/i0.wp.com\/bangla.sitestree.com\/wp-content\/uploads\/2024\/05\/image-44.png?resize=700%2C400 2x"},"classes":[]}],"_links":{"self":[{"href":"http:\/\/bangla.sitestree.com\/index.php?rest_route=\/wp\/v2\/posts\/76075","targetHints":{"allow":["GET"]}}],"collection":[{"href":"http:\/\/bangla.sitestree.com\/index.php?rest_route=\/wp\/v2\/posts"}],"about":[{"href":"http:\/\/bangla.sitestree.com\/index.php?rest_route=\/wp\/v2\/types\/post"}],"author":[{"embeddable":true,"href":"http:\/\/bangla.sitestree.com\/index.php?rest_route=\/wp\/v2\/users\/2"}],"replies":[{"embeddable":true,"href":"http:\/\/bangla.sitestree.com\/index.php?rest_route=%2Fwp%2Fv2%2Fcomments&post=76075"}],"version-history":[{"count":1,"href":"http:\/\/bangla.sitestree.com\/index.php?rest_route=\/wp\/v2\/posts\/76075\/revisions"}],"predecessor-version":[{"id":76078,"href":"http:\/\/bangla.sitestree.com\/index.php?rest_route=\/wp\/v2\/posts\/76075\/revisions\/76078"}],"wp:attachment":[{"href":"http:\/\/bangla.sitestree.com\/index.php?rest_route=%2Fwp%2Fv2%2Fmedia&parent=76075"}],"wp:term":[{"taxonomy":"category","embeddable":true,"href":"http:\/\/bangla.sitestree.com\/index.php?rest_route=%2Fwp%2Fv2%2Fcategories&post=76075"},{"taxonomy":"post_tag","embeddable":true,"href":"http:\/\/bangla.sitestree.com\/index.php?rest_route=%2Fwp%2Fv2%2Ftags&post=76075"}],"curies":[{"name":"wp","href":"https:\/\/api.w.org\/{rel}","templated":true}]}}