In [ ]:
# Digit recognition with scikit-learn's KNeighborsClassifier
In [1]:
import numpy as np
import pandas as pd
from pandas import DataFrame
#from sklearn.datasets import fetch_mldata
# Load the labelled training set: the 'label' column is the prediction target,
# every other column is used as a feature.
mnist = pd.read_csv('train.csv')
# Work with the first 28000 rows only. Each name is derived from `mnist` in a
# single step, so re-running this cell always yields the same frames.
data = mnist.drop('label', axis=1).iloc[:28000]
target = mnist['label'].iloc[:28000]
# Single-argument print(...) gives the same text on Python 2 and Python 3.
print("Shape of Data:  %s" % (data.shape,))
print("Shape of Target %s" % (target.shape,))
# Second input file; presumably the unlabelled competition images -- confirm
# that it has the same feature columns as `data`.
x_test = pd.read_csv('test.csv')
Shape of Data:  (28000, 784)
Shape of Target (28000,)
In [2]:
# Split the labelled rows into a training part (X, Y) and a held-out part
# (kaggle_x, kaggle_y) that is used further down for a local accuracy check.
# train_size=0.8 keeps 80% of the rows for training; random_state is fixed so
# the same rows are selected on every run (leave it unset for a fresh random
# split each time).

# sklearn.cross_validation was removed in scikit-learn 0.20; the same function
# lives in sklearn.model_selection. Fall back to the old module on old installs.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

X, kaggle_x, Y, kaggle_y = train_test_split(data, target,
                                            train_size=0.8, random_state=42)
In [3]:
# sklearn.cross_validation was removed in scikit-learn 0.20; prefer
# sklearn.model_selection and fall back to the old module on old installs.
try:
    from sklearn.model_selection import cross_val_score
except ImportError:
    from sklearn.cross_validation import cross_val_score
from sklearn.neighbors import KNeighborsClassifier

# The classifier uses its default parameters; they can be tuned by passing them
# explicitly, see
# http://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html
knnmodel = KNeighborsClassifier()

# cv=3 is stated explicitly: it was the default when this notebook was written
# (three scores are reported), but newer scikit-learn releases default to 5 folds.
# n_jobs=-1 runs the folds on all available cores.
knncvscore = cross_val_score(knnmodel, X, Y,
                             scoring='accuracy', cv=3, n_jobs=-1)
In [4]:
# Single-argument print(...) works on Python 2 and 3; the array is wrapped in a
# tuple so that '%' formats it as one value.
print("Knn CV Score is:  %s" % (knncvscore,))
Knn CV Score is:  [ 0.95395529  0.9541862   0.95484995]
In [6]:
# Train the KNN classifier on the training split, then label the held-out rows.
# fit() hands back the estimator itself, so `finalmodel` is the same object as
# `knnmodel`.
finalmodel = knnmodel.fit(X, Y)
predictions = finalmodel.predict(kaggle_x)
In [20]:
# Check how well the predictions went on the held-out split. Really useful when
# experimenting with the parameters of KNeighborsClassifier().
from sklearn.metrics import accuracy_score
# accuracy_score expects (y_true, y_pred): true labels first, predictions second.
accscore = accuracy_score(kaggle_y, predictions)
In [21]:
# Single-argument print(...) works on both Python 2 and Python 3.
print("The accuracy score is %s" % (accscore,))
The accuracy score is 0.964107142857
In [8]:
from pandas import DataFrame
import numpy as np

# `predictions` holds the labels for the held-out split and is only meant for
# the local accuracy check. The submission needs one label per row of test.csv,
# so score `x_test` here.
# NOTE(review): assumes x_test has exactly the feature columns the model was
# fitted on -- confirm against test.csv.
submission_predictions = finalmodel.predict(x_test)

# One row per test image; the ImageId index starts at 1 and is sized from the
# predictions instead of a hard-coded row count.
submission = DataFrame({'Label': submission_predictions},
                       index=np.arange(1, len(submission_predictions) + 1))
submission.index.names = ['ImageId']

# The named index is written as the first column, giving an 'ImageId,Label' header.
submission.to_csv('Data_science_club_MNIST_submission.csv', index=True, header=True)
Out[8]:
"\nsubmission = np.savetxt('Data_science_club_MNIST_submission.csv', \n           np.c_[range(1,len(predictions)+1),predictions], \n           delimiter=',', \n           header = 'ImageId,Label', \n           comments = '', \n           fmt='%d')\n"
In [9]:
import csv

# Re-read the submission file and count its lines (the header line is included
# in the count). The rows are stored under their own name so that `data`, the
# training-feature DataFrame, is not overwritten.
with open('Data_science_club_MNIST_submission.csv', "r") as f:
    reader = csv.reader(f, delimiter=",")
    submission_rows = list(reader)
row_count = len(submission_rows)
In [10]:
# Parenthesised single-argument form prints the same text on Python 2 and 3.
print(row_count)
28001