K-means¶

This jupyter notebook tries to construct K-means model from scratch. After that I will try to compare it with sklearn.cluster.KMeans

1. Load Library and Generate Data¶

# Load library
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

# Generate Data
X = load_iris().data[:,:2] # For plot convenience, suppose we only have two features
y = load_iris().target
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.2, random_state = 42)

2. Model Construction¶

what kmeans does is that:

First we randomly assign each data point to a cluster
Calculate the mean value in each cluster
calculate the distance between each data point and the centroid, re-assign the data to the nearest cluster
Repeat until max_iter (or the decrease of variance is small)

class kmeans:
    '''
    This class aims to help perform kmeans clustering
    '''
    
    def __init__(self, n_clusters, max_iter, random_state):
        '''
        Parameter
        ---------
        n_clusters: The number of clusters
        max_iter: Maximum number of iterations of the k-means algorithm for a single run
        random_state: int, RandomState instance
        '''
        self.n_clusters = n_clusters
        self.max_iter = max_iter
        self.random_state = random_state
        self.label = []
        self.centroid = [None] * n_clusters
        
    def fit(self, X):
        '''
        This method aims to fit data 
        '''
        if len(self.label) == 0:
            np.random.seed(self.random_state)
            self.label = np.random.choice(self.n_clusters, size = X.shape[0], replace = True)
        
        for i in range(self.max_iter):
            for c in range(self.n_clusters):
                data = X[self.label == c]
                self.centroid[c] = data.mean(axis = 0)
                
            for index, point in enumerate(X):
                dist = [np.linalg.norm(point - self.centroid[c]) for c in range(self.n_clusters)]
                self.label[index] = np.argmin(dist)
        
    def predict(self, new_X):
        '''
        This method aims to predict the cluster based on our centroid
        '''
        label_new = [None] * new_X.shape[0]
        for index, point in enumerate(new_X):
            dist = [np.linalg.norm(point - self.centroid[c]) for c in range(self.n_clusters)]
            label_new[index] = np.argmin(dist) 
            
        return(label_new)

3. Model Evaluation¶

km = kmeans(n_clusters=3, max_iter=300, random_state=42)
km.fit(X_train)

km.label

array([2, 2, 1, 2, 2, 0, 1, 2, 2, 2, 0, 0, 1, 2, 2, 0, 0, 1, 1, 1, 0, 1,
       0, 2, 1, 0, 2, 2, 2, 0, 0, 2, 2, 2, 0, 2, 0, 1, 2, 0, 0, 2, 0, 0,
       0, 0, 1, 0, 2, 0, 1, 2, 2, 0, 1, 2, 0, 2, 2, 0, 0, 1, 0, 0, 1, 0,
       2, 2, 0, 1, 2, 2, 2, 0, 1, 2, 1, 1, 2, 0, 1, 0, 1, 1, 2, 1, 0, 1,
       0, 0, 0, 2, 1, 0, 2, 0, 1, 1, 2, 0, 1, 1, 2, 1, 2, 1, 1, 1, 0, 1,
       0, 1, 0, 0, 2, 0, 0, 2, 0, 1])

np.array(km.predict(X_test)).reshape(1,-1)

array([[0, 2, 1, 0, 1, 2, 0, 1, 0, 0, 1, 2, 2, 2, 2, 1, 1, 0, 0, 1, 2, 0,
        2, 1, 1, 1, 1, 1, 2, 2]])

from sklearn.cluster import KMeans

km_new = KMeans(n_clusters=3)

km_new = km_new.fit(X_train)
km_new.labels_

array([0, 0, 1, 0, 0, 2, 1, 0, 0, 0, 2, 2, 1, 0, 0, 2, 2, 1, 1, 1, 2, 1,
       2, 0, 1, 2, 0, 0, 0, 2, 2, 0, 0, 0, 2, 0, 2, 1, 0, 2, 2, 0, 2, 2,
       2, 2, 1, 2, 0, 2, 1, 0, 0, 2, 1, 0, 2, 0, 0, 2, 2, 1, 2, 2, 1, 2,
       0, 0, 2, 1, 0, 0, 0, 2, 1, 0, 1, 1, 0, 2, 1, 2, 1, 1, 0, 1, 2, 1,
       2, 2, 2, 0, 1, 2, 0, 2, 1, 1, 0, 2, 1, 1, 0, 1, 0, 1, 1, 1, 2, 1,
       2, 1, 2, 2, 0, 2, 2, 0, 2, 1], dtype=int32)

We could see the result is almost the same. Since the cluster label is little bit different, it is hard to compare, so then let me make some plots.

fig = plt.figure(figsize = (10,5))
ax1 = fig.add_subplot(1,2,1)
ax1.scatter(X_train[:,0], X_train[:,1], c = km.label)
ax1.set_title('K-means Clustering Using Our Function')
fakeLine1 = plt.Line2D([0,0],[0,1], color='yellow', marker='o', linestyle='') # create legend handler
fakeLine2 = plt.Line2D([0,0],[0,1], color='Purple', marker='o', linestyle='')
fakeLine3 = plt.Line2D([0,0],[0,1], color='teal', marker='o', linestyle='')
ax1.legend([fakeLine1,fakeLine2,fakeLine3], ['cluster1', 'cluster2', 'cluster3'])

ax2 = fig.add_subplot(1,2,2)
ax2.scatter(X_train[:,0], X_train[:,1], c = y_train)
ax2.set_title('Original Dataset')
ax2.legend([fakeLine1,fakeLine2,fakeLine3], ['cluster1', 'cluster2', 'cluster3'])
plt.show()