K-means

This jupyter notebook tries to construct K-means model from scratch. After that I will try to compare it with sklearn.cluster.KMeans

1. Load Library and Generate Data

In [1]:
# Load library
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
In [2]:
# Generate Data
X = load_iris().data[:,:2] # For plot convenience, suppose we only have two features
y = load_iris().target
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.2, random_state = 42)

2. Model Construction

what kmeans does is that:

  1. First we randomly assign each data point to a cluster
  2. Calculate the mean value in each cluster
  3. calculate the distance between each data point and the centroid, re-assign the data to the nearest cluster
  4. Repeat until max_iter (or the decrease of variance is small)
In [23]:
class kmeans:
    '''
    This class aims to help perform kmeans clustering
    '''
    
    def __init__(self, n_clusters, max_iter, random_state):
        '''
        Parameter
        ---------
        n_clusters: The number of clusters
        max_iter: Maximum number of iterations of the k-means algorithm for a single run
        random_state: int, RandomState instance
        '''
        self.n_clusters = n_clusters
        self.max_iter = max_iter
        self.random_state = random_state
        self.label = []
        self.centroid = [None] * n_clusters
        
    def fit(self, X):
        '''
        This method aims to fit data 
        '''
        if len(self.label) == 0:
            np.random.seed(self.random_state)
            self.label = np.random.choice(self.n_clusters, size = X.shape[0], replace = True)
        
        for i in range(self.max_iter):
            for c in range(self.n_clusters):
                data = X[self.label == c]
                self.centroid[c] = data.mean(axis = 0)
                
            for index, point in enumerate(X):
                dist = [np.linalg.norm(point - self.centroid[c]) for c in range(self.n_clusters)]
                self.label[index] = np.argmin(dist)
        
    def predict(self, new_X):
        '''
        This method aims to predict the cluster based on our centroid
        '''
        label_new = [None] * new_X.shape[0]
        for index, point in enumerate(new_X):
            dist = [np.linalg.norm(point - self.centroid[c]) for c in range(self.n_clusters)]
            label_new[index] = np.argmin(dist) 
            
        return(label_new)

3. Model Evaluation

In [24]:
km = kmeans(n_clusters=3, max_iter=300, random_state=42)
km.fit(X_train)
In [25]:
km.label
Out[25]:
array([2, 2, 1, 2, 2, 0, 1, 2, 2, 2, 0, 0, 1, 2, 2, 0, 0, 1, 1, 1, 0, 1,
       0, 2, 1, 0, 2, 2, 2, 0, 0, 2, 2, 2, 0, 2, 0, 1, 2, 0, 0, 2, 0, 0,
       0, 0, 1, 0, 2, 0, 1, 2, 2, 0, 1, 2, 0, 2, 2, 0, 0, 1, 0, 0, 1, 0,
       2, 2, 0, 1, 2, 2, 2, 0, 1, 2, 1, 1, 2, 0, 1, 0, 1, 1, 2, 1, 0, 1,
       0, 0, 0, 2, 1, 0, 2, 0, 1, 1, 2, 0, 1, 1, 2, 1, 2, 1, 1, 1, 0, 1,
       0, 1, 0, 0, 2, 0, 0, 2, 0, 1])
In [30]:
np.array(km.predict(X_test)).reshape(1,-1)
Out[30]:
array([[0, 2, 1, 0, 1, 2, 0, 1, 0, 0, 1, 2, 2, 2, 2, 1, 1, 0, 0, 1, 2, 0,
        2, 1, 1, 1, 1, 1, 2, 2]])
In [9]:
from sklearn.cluster import KMeans
In [10]:
km_new = KMeans(n_clusters=3)
In [12]:
km_new = km_new.fit(X_train)
km_new.labels_
Out[12]:
array([0, 0, 1, 0, 0, 2, 1, 0, 0, 0, 2, 2, 1, 0, 0, 2, 2, 1, 1, 1, 2, 1,
       2, 0, 1, 2, 0, 0, 0, 2, 2, 0, 0, 0, 2, 0, 2, 1, 0, 2, 2, 0, 2, 2,
       2, 2, 1, 2, 0, 2, 1, 0, 0, 2, 1, 0, 2, 0, 0, 2, 2, 1, 2, 2, 1, 2,
       0, 0, 2, 1, 0, 0, 0, 2, 1, 0, 1, 1, 0, 2, 1, 2, 1, 1, 0, 1, 2, 1,
       2, 2, 2, 0, 1, 2, 0, 2, 1, 1, 0, 2, 1, 1, 0, 1, 0, 1, 1, 1, 2, 1,
       2, 1, 2, 2, 0, 2, 2, 0, 2, 1], dtype=int32)

We could see the result is almost the same. Since the cluster label is little bit different, it is hard to compare, so then let me make some plots.

In [55]:
fig = plt.figure(figsize = (10,5))
ax1 = fig.add_subplot(1,2,1)
ax1.scatter(X_train[:,0], X_train[:,1], c = km.label)
ax1.set_title('K-means Clustering Using Our Function')
fakeLine1 = plt.Line2D([0,0],[0,1], color='yellow', marker='o', linestyle='') # create legend handler
fakeLine2 = plt.Line2D([0,0],[0,1], color='Purple', marker='o', linestyle='')
fakeLine3 = plt.Line2D([0,0],[0,1], color='teal', marker='o', linestyle='')
ax1.legend([fakeLine1,fakeLine2,fakeLine3], ['cluster1', 'cluster2', 'cluster3'])

ax2 = fig.add_subplot(1,2,2)
ax2.scatter(X_train[:,0], X_train[:,1], c = y_train)
ax2.set_title('Original Dataset')
ax2.legend([fakeLine1,fakeLine2,fakeLine3], ['cluster1', 'cluster2', 'cluster3'])
plt.show()