This notebook constructs a Random Forest model from scratch. Since we have already built a decision tree model before, I will use the sklearn package to construct our base trees. For the dataset, I will again use the classification data I generated before. Also, this time I will use a class instead of functions. In the end, I will compare our model against the sklearn.ensemble.RandomForestClassifier module.
# Load Library
import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn import tree
import matplotlib.pyplot as plt
from scipy.stats import mode
from sklearn.metrics import confusion_matrix, roc_curve, auc
# Generate Data
X, y = make_classification(n_samples = 5000, n_features = 5, n_classes = 2, random_state = 42)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)
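Before building the forest, a quick sanity check on the generated data is worthwhile. This is a minimal sketch (using only the arrays defined above) confirming the split shapes and that the two classes are roughly balanced:
# Quick sanity check on the data
print(X_train.shape, X_test.shape) # expected: (4000, 5) (1000, 5)
print(np.bincount(y_train) / len(y_train)) # class proportions in the training set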
class RandomForest:
'''
    This class constructs a RandomForest object that can fit data and make predictions
'''
def __init__(self, n_estimators = 100, max_depth = None, min_samples_split = 5):
'''
        Parameters
        ----------
        n_estimators: The number of trees in the forest
        max_depth: The maximum depth of each tree
        min_samples_split: The minimum number of samples required to split an internal node
'''
self.n_estimators = n_estimators
self.max_depth = max_depth
self.min_samples_split = min_samples_split
self.feature_selected = []
self.total_tree = []
def fit(self, X_train, y_train):
'''
        This function trains the random forest on the training data. Each tree is fit on a bootstrap sample of the observations, using a random subset of sqrt(p) of the features.
'''
num_feature = X_train.shape[1] # total number of features
num_obs = X_train.shape[0] # total number of observations
max_feature = int(np.sqrt(num_feature)) # number of feature we will use for each tree
for i in range(self.n_estimators):
# construct the base tree for each of the bootstrapped data
obs_selected = np.random.choice(range(num_obs), num_obs, replace = True) # The selected obs
X_bootstrapped = X_train[obs_selected,:] # bootstrapped training X
y_bootstrapped = y_train[obs_selected] # bootstrapped training Y
            feature_selected = np.random.choice(num_feature, max_feature, replace = False) # sample sqrt(p) distinct features for this tree (without replacement, as in standard random forests)
self.feature_selected.append(feature_selected)
tr = tree.DecisionTreeClassifier(max_depth = self.max_depth, min_samples_split = self.min_samples_split)
tr = tr.fit(X_bootstrapped[:,feature_selected], y_bootstrapped)
self.total_tree.append(tr)
def predict(self, X_test):
'''
        This function makes predictions on new data by majority vote over the individual trees
'''
result = np.empty((0,X_test.shape[0]), dtype = int)
for i in range(self.n_estimators):
feature_selected = self.feature_selected[i] # the selected feature to train the model
tr = self.total_tree[i] # the training tree
            result = np.vstack((result, tr.predict(X_test[:,feature_selected]))) # stack this tree's predictions as a new row
        label = mode(result, axis = 0).mode.flatten() # majority vote across trees for each observation
        prob = np.mean(result, axis = 0) # fraction of trees voting for class 1, usable as a probability score
        return (label, prob)
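To make the voting step in predict concrete, here is a tiny illustration with made-up votes from three trees on four observations; mode aggregates column-wise, matching how we stack one row of predictions per tree:
# Toy example of the majority-voting logic
votes = np.array([[0, 1, 1, 0],
                  [0, 1, 0, 0],
                  [1, 1, 0, 1]]) # rows = trees, columns = observations
print(mode(votes, axis = 0).mode.flatten()) # majority label per observation: [0 1 0 0]
print(np.mean(votes, axis = 0)) # vote fraction per observation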
# Construct the random forest object
rf = RandomForest()
rf.fit(X_train, y_train)
label, prob = rf.predict(X_test)
# calculate the accuracy
np.mean(label == y_test)
# Evaluation metrics
confusion_matrix(y_test, label)
fpr, tpr, threshold = roc_curve(y_test, prob)
roc_auc = auc(fpr, tpr)
plt.title('Receiver Operating Characteristic')
plt.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc, lw=2)
plt.legend(loc = 'lower right')
plt.plot([0, 1], [0, 1],'r--')
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()
We can see that our model performs quite well. Recall that our Naive Bayes model reached around 87% accuracy. Let us now compare it with the sklearn.ensemble.RandomForestClassifier model.
from sklearn.ensemble import RandomForestClassifier
rf2 = RandomForestClassifier()
pred = rf2.fit(X_train, y_train).predict(X_test)
np.mean(pred == y_test)
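Note that the two models ran with different settings (for example, our class defaults to min_samples_split = 5, while sklearn defaults to 2). For a more direct comparison, we could match the hyperparameters; this is a minimal sketch using sklearn's documented parameters, with rf3 and pred3 as hypothetical names:
# Match sklearn's forest to our settings for a fairer comparison
rf3 = RandomForestClassifier(n_estimators = 100, min_samples_split = 5, max_features = 'sqrt', random_state = 42)
pred3 = rf3.fit(X_train, y_train).predict(X_test)
np.mean(pred3 == y_test)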
Good! Our model has the same accuracy as sklearn's.