This Jupyter notebook constructs the k-nearest-neighbors (KNN) algorithm from scratch. As usual, I will then compare my results with those of sklearn.neighbors.KNeighborsClassifier.
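As a quick refresher, KNN classifies a query point $x$ by finding its $k$ nearest training points under Euclidean distance, $N_k(x)$, and estimating each class probability as the fraction of those neighbors carrying that label:

$$\hat{p}(c \mid x) = \frac{1}{k} \sum_{(x_i, y_i) \in N_k(x)} \mathbb{1}(y_i = c)$$

The predicted class is the one with the largest estimated probability. This vote fraction is exactly what the fit and predict methods below compute.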
# Load libraries
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
# Load data
X = load_iris().data[:, :2]  # for plotting convenience, keep only the first two features
y = load_iris().target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
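As a quick sanity check (not needed for anything later): 150 iris samples with test_size = 0.2 should give 120 training and 30 test samples.

print(X_train.shape, X_test.shape)  # expect (120, 2) (30, 2)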
class knn:
    '''
    This class implements the KNN classification algorithm.
    '''
    def __init__(self, k):
        '''
        Parameters
        ----------
        k: the number of neighbors to use
        '''
        self.k = k
        self.prob = []          # class-probability estimates for the training data
        self.train_data = None

    def fit(self, X_train, y_train):
        '''
        Fit the KNN model and store the class-probability
        estimate for each training point in self.prob.
        '''
        # store the training data for use in predict
        self.train_data = np.concatenate((X_train, y_train.reshape(-1, 1)), axis=1)
        self.prob = []  # reset in case fit is called more than once
        for data in X_train:
            # Euclidean distance from this point to every training point
            # (including itself, so each point is its own nearest neighbor)
            dist = [np.linalg.norm(data - X_train[i]) for i in range(X_train.shape[0])]
            # for convenience, combine the distances and labels first
            dist_y = np.vstack((dist, y_train)).transpose()
            # sort by distance
            dist_y = dist_y[dist_y[:, 0].argsort()]
            # keep the nearest k neighbors
            neighbors = dist_y[:self.k, :]
            # fraction of the k neighbors belonging to each class
            prob = []
            for cls in np.unique(y_train):
                cls_count = np.sum(neighbors[:, 1] == cls)
                prob.append(cls_count / self.k)
            self.prob.append(prob)

    def predict(self, X_test):
        '''
        Return the estimated class probabilities for each test point.
        '''
        prediction = []
        for data in X_test:
            # Euclidean distance from this test point to every training point
            dist = [np.linalg.norm(data - self.train_data[i, :-1]) for i in range(self.train_data.shape[0])]
            dist_y = np.vstack((dist, self.train_data[:, -1])).transpose()
            # sort by distance
            dist_y = dist_y[dist_y[:, 0].argsort()]
            # keep the nearest k neighbors
            neighbors = dist_y[:self.k, :]
            # fraction of the k neighbors belonging to each class
            prob = []
            for cls in np.unique(self.train_data[:, -1]):
                cls_count = np.sum(neighbors[:, 1] == cls)
                prob.append(cls_count / self.k)
            prediction.append(prob)
        return prediction
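As a side note, the per-point distance loops above can be vectorized with NumPy broadcasting. Here is a minimal sketch of a hypothetical helper (not part of the class above), assuming the same Euclidean metric:

import numpy as np

def pairwise_distances(A, B):
    # Hypothetical helper: Euclidean distance between every row of A and
    # every row of B via broadcasting; result has shape (len(A), len(B))
    diff = A[:, None, :] - B[None, :, :]
    return np.sqrt((diff ** 2).sum(axis=-1))

With this helper, the k nearest training neighbors of every test point can be found in a single call: np.argsort(pairwise_distances(X_test, X_train), axis=1)[:, :k].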
clf = knn(k=10)
clf.fit(X_train, y_train)
np.mean(np.argmax(clf.prob, axis=1) == y_train)  # training accuracy
np.mean(np.argmax(clf.predict(X_test), axis=1) == y_test)  # test accuracy
from sklearn.neighbors import KNeighborsClassifier
k_new = KNeighborsClassifier(n_neighbors=10).fit(X_train, y_train)
np.mean(k_new.predict(X_test) == y_test)
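Beyond the hard labels, KNeighborsClassifier.predict_proba returns the same neighbor-vote fractions that our predict method produces, so the probability estimates can be compared directly. (Distance ties could in principle make the two implementations pick different neighbor sets, so exact agreement is not guaranteed.)

np.allclose(k_new.predict_proba(X_test), clf.predict(X_test))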
We can see that our model achieves exactly the same test accuracy as sklearn's. Let me now make a plot to show how k influences the accuracy of our model.
fig = plt.figure(figsize=(10, 6))
ax = fig.add_subplot(111)
for k in range(1, 120):
    model = knn(k)
    model.fit(X_train, y_train)
    train_acc = np.mean(np.argmax(model.prob, axis=1) == y_train)
    test_acc = np.mean(np.argmax(model.predict(X_test), axis=1) == y_test)
    ax.scatter(k, train_acc, c='red')
    ax.scatter(k, test_acc, c='blue')
# proxy artists so the legend shows one marker per series
fakeLine1 = plt.Line2D([0, 0], [0, 1], color='red', marker='o', linestyle='')
fakeLine2 = plt.Line2D([0, 0], [0, 1], color='blue', marker='o', linestyle='')
ax.legend([fakeLine1, fakeLine2], ['training accuracy', 'test accuracy'], loc='best')
ax.set_xlabel('The number of nearest neighbors')
ax.set_ylabel('Accuracy')
plt.show()
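Note that the training accuracy is inflated at small k because each training point counts itself among its own nearest neighbors in this implementation. To choose k less anecdotally, one option is cross-validation; here is a minimal sketch using sklearn's cross_val_score (an assumed workflow, not something done above):

from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier

# 5-fold cross-validation over a range of k; keep the k with the best
# mean validation accuracy
scores = {k: cross_val_score(KNeighborsClassifier(n_neighbors=k),
                             X_train, y_train, cv=5).mean()
          for k in range(1, 31)}
best_k = max(scores, key=scores.get)
print(best_k, scores[best_k])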