import os, sys, sklearn
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.cm as cm
%pylab inline
!ls OpticalDigits
# Locations of the training and test splits of the optical-digits data set.
filepath_train = 'OpticalDigits/optdigits.tra'
filepath_test = 'OpticalDigits/optdigits.tes'

# Both files are read without a header row and without an index column.
# pd.read_csv is used because pd.DataFrame.from_csv is no longer available
# in current pandas releases; read_csv accepts the same arguments here.
data_train = pd.read_csv(filepath_train, header=None, index_col=None)
data_test = pd.read_csv(filepath_test, header=None, index_col=None)

# Single-argument print(...) behaves identically under Python 2 and Python 3.
print("Size of training set: %d" % len(data_train))
print("Size of test set: %d" % len(data_test))

# split the data into features and labels:
# the first 64 columns are the features, column 64 holds the label
train_x = data_train[data_train.columns[:64]]
train_y = data_train[data_train.columns[64]]
test_x = data_test[data_test.columns[:64]]
test_y = data_test[data_test.columns[64]]

# verify that the data is correct by drawing the first five training rows
# as 8x8 greyscale images next to their labels
for i in range(5):
    print("Label: %d" % train_y[i])
    img = plt.figure()
    # Reshape the underlying ndarray: pandas Series no longer has .reshape().
    plt.imshow(train_x.loc[i].values.reshape((8, 8)), cmap=cm.Greys_r)
    img.set_size_inches((0.8, 0.8))
    plt.show()

# look at the label distribution of the data
train_y.hist()
# Reference: http://cs231n.github.io/classification/
import numpy as np
class NearestNeighbor:
    """1-nearest-neighbour classifier using the L1 distance.

    ``train`` memorises the training set; ``predict`` labels each query row
    with the label of the training row whose sum of absolute feature
    differences is smallest.
    """

    def __init__(self):
        pass

    @staticmethod
    def _as_signed(arr):
        """Return ``arr`` as an ndarray that can be subtracted safely.

        Unsigned-integer arrays wrap around on subtraction (e.g. uint8
        ``0 - 10`` gives ``246``) and boolean arrays reject ``-`` outright,
        so both are promoted to int64. Other dtypes are returned unchanged.
        """
        arr = np.asarray(arr)
        if arr.dtype.kind in "ub":
            return arr.astype(np.int64)
        return arr

    def train(self, X, y):
        """Remember the training data.

        X is N x D where each row is an example. y is 1-dimensional of
        size N. Both are stored as ndarrays (ndarray inputs are stored
        as-is, anything else is converted with ``np.asarray``).
        """
        # the nearest neighbor classifier simply remembers all the training data
        self.Xtr = np.asarray(X)
        self.ytr = np.asarray(y)

    def predict(self, X):
        """Predict a label for every row of X.

        X is M x D where each row is an example we wish to predict a label
        for. Returns a 1-dimensional array of size M with the same dtype as
        the training labels. Ties are resolved in favour of the earliest
        training row (``np.argmin`` returns the first minimum).
        """
        # Hoisted out of the loop: dtype promotion only has to happen once.
        train_rows = self._as_signed(self.Xtr)
        queries = self._as_signed(X)
        # Positional lookup into a plain array, whatever was assigned to ytr.
        labels = np.asarray(self.ytr)

        num_test = queries.shape[0]
        # lets make sure that the output type matches the input type
        Ypred = np.zeros(num_test, dtype=labels.dtype)

        # loop over all test rows (range works on both Python 2 and 3)
        for i in range(num_test):
            # L1 distance (sum of absolute value differences) from the
            # i'th test row to every training row
            distances = np.sum(np.abs(train_rows - queries[i, :]), axis=1)
            # predict the label of the nearest training example
            Ypred[i] = labels[np.argmin(distances)]

        return Ypred
knn = NearestNeighbor()
knn.train(train_x.as_matrix(), train_y.as_matrix())
test_x.head(5)
print "Predicting %d examples..."%len(test_x)
%time pred = knn.predict(test_x.as_matrix())
import sklearn.metrics as metrics
print "Accuracy: %.2f"%metrics.accuracy_score(test_y, pred)
print metrics.classification_report(test_y, pred)
print metrics.confusion_matrix(test_y, pred)
print "X-axis: predicted, Y-axis: true"
from sklearn import svm

# Support-vector classifier with a one-vs-one decision function shape.
clf = svm.SVC(decision_function_shape='ovo')

# Fit and predict on plain ndarrays, so both calls see the same input type.
# .values is used instead of .as_matrix(), which was removed in pandas 1.0.
clf.fit(train_x.values, train_y)
dec = clf.predict(test_x.values)

# Confusion matrix and overall accuracy on the test set.
print(metrics.confusion_matrix(test_y, dec))
print("Accuracy: %.2f" % metrics.accuracy_score(test_y, dec))