This Jupyter notebook constructs a K-means model from scratch and then compares it with sklearn.cluster.KMeans.
# Load libraries
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
# Load Data
X = load_iris().data[:,:2] # for plotting convenience, keep only the first two features
y = load_iris().target
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.2, random_state = 42)
What k-means does: start from an initial cluster assignment (here, random labels), then repeat two steps for at most max_iter iterations: (1) update each cluster's centroid as the mean of the points currently assigned to it, and (2) reassign every point to the cluster whose centroid is closest in Euclidean distance.
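Equivalently, k-means tries to minimize the within-cluster sum of squared distances to the centroids (with $S_c$ the set of points in cluster $c$ and $\mu_c$ its centroid):

$$
\min_{S_1,\dots,S_k}\ \sum_{c=1}^{k} \sum_{x \in S_c} \lVert x - \mu_c \rVert^2,
\qquad
\mu_c = \frac{1}{|S_c|} \sum_{x \in S_c} x .
$$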
class kmeans:
    '''
    This class performs k-means clustering from scratch.
    '''
    def __init__(self, n_clusters, max_iter, random_state):
        '''
        Parameters
        ----------
        n_clusters: The number of clusters
        max_iter: Maximum number of iterations of the k-means algorithm for a single run
        random_state: int, seed used for the random initial cluster assignment
        '''
        self.n_clusters = n_clusters
        self.max_iter = max_iter
        self.random_state = random_state
        self.label = []
        self.centroid = [None] * n_clusters
    def fit(self, X):
        '''
        Fit the model: alternate between updating centroids and reassigning points.
        '''
        # initialise every point with a random cluster label (only on the first fit)
        if len(self.label) == 0:
            np.random.seed(self.random_state)
            self.label = np.random.choice(self.n_clusters, size = X.shape[0], replace = True)
        for i in range(self.max_iter):
            # step 1: recompute each centroid as the mean of the points assigned to it
            # (assumes every cluster keeps at least one point; an empty cluster would give a NaN centroid)
            for c in range(self.n_clusters):
                data = X[self.label == c]
                self.centroid[c] = data.mean(axis = 0)
            # step 2: reassign every point to its nearest centroid (Euclidean distance)
            for index, point in enumerate(X):
                dist = [np.linalg.norm(point - self.centroid[c]) for c in range(self.n_clusters)]
                self.label[index] = np.argmin(dist)
    def predict(self, new_X):
        '''
        Assign each new point to the cluster with the nearest learned centroid.
        '''
        label_new = [None] * new_X.shape[0]
        for index, point in enumerate(new_X):
            dist = [np.linalg.norm(point - self.centroid[c]) for c in range(self.n_clusters)]
            label_new[index] = np.argmin(dist)
        return label_new
km = kmeans(n_clusters=3, max_iter=300, random_state=42)
km.fit(X_train)
km.label # cluster assignment of each training point
np.array(km.predict(X_test)).reshape(1,-1) # cluster assignment of each held-out point
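As a quick sanity check we can also compute the within-cluster sum of squared distances (the quantity scikit-learn exposes as inertia_) from the fitted centroids and labels. The class above does not store this itself, so the following is just a small sketch built on the km object:

# within-cluster sum of squares for our model (same quantity as sklearn's inertia_)
inertia = sum(
    np.linalg.norm(point - km.centroid[c]) ** 2
    for point, c in zip(X_train, km.label)
)
print(inertia)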
from sklearn.cluster import KMeans
km_new = KMeans(n_clusters=3) # scikit-learn's implementation with default settings
km_new = km_new.fit(X_train)
km_new.labels_ # cluster assignment of each training point
We can see the results are almost the same. Since the two runs may assign different integer IDs to the same clusters, the label arrays are hard to compare directly, so let me make some plots.
fig = plt.figure(figsize = (10,5))
ax1 = fig.add_subplot(1,2,1)
ax1.scatter(X_train[:,0], X_train[:,1], c = km.label)
ax1.set_title('K-means Clustering Using Our Function')
fakeLine1 = plt.Line2D([0,0],[0,1], color='yellow', marker='o', linestyle='') # create legend handler
fakeLine2 = plt.Line2D([0,0],[0,1], color='Purple', marker='o', linestyle='')
fakeLine3 = plt.Line2D([0,0],[0,1], color='teal', marker='o', linestyle='')
ax1.legend([fakeLine1,fakeLine2,fakeLine3], ['cluster1', 'cluster2', 'cluster3'])
ax2 = fig.add_subplot(1,2,2)
ax2.scatter(X_train[:,0], X_train[:,1], c = y_train)
ax2.set_title('Original Dataset (true species labels)')
ax2.legend([fakeLine1,fakeLine2,fakeLine3], ['species 1', 'species 2', 'species 3'])
plt.show()
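The two panels show essentially the same grouping, just with the clusters relabelled. To complement the visual comparison, a permutation-invariant score such as the adjusted Rand index can quantify how well two label assignments agree; a minimal sketch:

from sklearn.metrics import adjusted_rand_score

# 1.0 means identical clusterings (up to relabelling); values near 0 mean chance-level agreement
print(adjusted_rand_score(km.label, km_new.labels_)) # our model vs scikit-learn
print(adjusted_rand_score(y_train, km.label))        # our model vs the true species labels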