Working with the MNIST dataset

In [2]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn import cross_validation
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
import timeit

Getting the data

Using a 80/20 ratio for training and testing

In [5]:
train = pd.read_csv("train.csv")
features = train.columns[1:]
X = train[features]
y = train['label']
X_train, X_test, y_train, y_test = cross_validation.train_test_split(X/255.,y,test_size=0.2,random_state=0)

Random Forest

In [3]:
rf = RandomForestClassifier()
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)
acc_rf = accuracy_score(y_test, y_pred_rf)
print "Random Forest Accuracy: ",acc_rf
Random Forest Accuracy:  0.941785714286

Finding the optimal number of trees (between one and hundred trees)

In [4]:
accuracyList = []
for i in range(1,101):
    rf = RandomForestClassifier(n_estimators = i)
    rf.fit(X_train, y_train)
    y_pred_rf = rf.predict(X_test)
    accuracyList.append(accuracy_score(y_test, y_pred_rf))
trees = (accuracyList.index(max(accuracyList))) + 1
print "Number of Trees: ", trees  
print "Higest Random Forest Accuracy: ", max(accuracyList)
Number of Trees:  81
Higest Random Forest Accuracy:  0.965952380952

Plotting the accuracy rate vs. the number of trees

In [ ]:
numbers = list(range(1,101))
plt.plot(numbers, accuracyList)
plt.ylabel('Accuracy Rate')
plt.xlabel('Number of Trees')
plt.savefig("RFGraph.png")

Accuracy vs. Number of Trees

KNN

KNN where K = 1
In [6]:
knn_1 = KNeighborsClassifier(n_neighbors=1)
knn_1.fit(X_train, y_train)
y_pred_knn1 = knn_1.predict(X_test)
acc_knn1 = accuracy_score(y_test, y_pred_knn1)
print "Number of Neighbors: 1" 
print "K Nearest Neighbors Accuracy: ", acc_knn1
Number of Neighbors: 1
K Nearest Neighbors Accuracy:  0.970952380952
KNN where K = 3
In [7]:
knn_3 = KNeighborsClassifier(n_neighbors=3)
knn_3.fit(X_train, y_train)
y_pred_knn3 = knn_3.predict(X_test)
acc_knn3 = accuracy_score(y_test, y_pred_knn3)
print "Number of Neighbors: 3" 
print "K Nearest Neighbors Accuracy: ", acc_knn3
Number of Neighbors: 3
K Nearest Neighbors Accuracy:  0.969761904762
KNN where K = 5
In [8]:
knn_5 = KNeighborsClassifier(n_neighbors=5)
knn_5.fit(X_train, y_train)
y_pred_knn5 = knn_5.predict(X_test)
acc_knn5 = accuracy_score(y_test, y_pred_knn5)
print "Number of Neighbors: 5" 
print "K Nearest Neighbors Accuracy: ", acc_knn5
Number of Neighbors: 5
K Nearest Neighbors Accuracy:  0.968095238095
KNN where K = 10
In [11]:
knn_10 = KNeighborsClassifier(n_neighbors=10)
knn_10.fit(X_train, y_train)
y_pred_knn10 = knn_10.predict(X_test)
acc_knn10 = accuracy_score(y_test, y_pred_knn10)
print "Number of Neighbors: 10" 
print "K Nearest Neighbors Accuracy: ", acc_knn10
Number of Neighbors: 10
K Nearest Neighbors Accuracy:  0.965119047619

SVM and PCA

Using both a linear kernel and a radial basis function kernel

Without PCA

In [18]:
svcLinear = SVC(kernel = "linear")
svcLinear.fit(X_train, y_train)
predictLinear = svcLinear.predict(X_test)
svc_acc_lin = accuracy_score(y_test, predictLinear)
svcNonLinear = SVC()
svcNonLinear.fit(X_train, y_train)
predictNonLinear = svcNonLinear.predict(X_test)
svc_acc_non = accuracy_score(y_test, predictNonLinear)
print "Linear SVM accuracy: ", svc_acc_lin
print "SVM accuracy: ", svc_acc_non
Linear SVM accuracy:  0.932142857143
SVM accuracy:  0.936666666667

With PCA

In [17]:
pca = PCA(n_components=50, whiten=True)
pca.fit(X_train)
train_data = pca.transform(X_train)
test_data = pca.transform(X_test)
svcPCA = SVC()
svcPCA.fit(train_data, y_train)
predictPCA = svcPCA.predict(test_data)
svc_accPCA = accuracy_score(y_test, predictPCA)
svcPCALin = SVC(kernel = "linear")
svcPCALin.fit(train_data, y_train)
predictPCALin = svcPCALin.predict(test_data)
svc_accPCALin = accuracy_score(y_test, predictPCALin)
print "Linear SVM with PCA accuracy: ", svc_accPCALin
print "SVM with PCA accuracy: ", svc_accPCA
Linear SVM with PCA accuracy:  0.934761904762
SVM with PCA accuracy:  0.979404761905
In [ ]: