Using data from https://www.kaggle.com/c/digit-recognizer/data, the goal of this notebook is to use a classification tool, specifically KNN, to train on and predict digits. All the modules used are imported below; the main ones are numpy, pandas and sklearn.
import os
import numpy as np
import pandas as pd
import csv
import matplotlib.pyplot as plt
from sklearn import neighbors, metrics, grid_search, cross_validation
import matplotlib.pyplot as plt
##import seaborn as sns
import statsmodels.api as sm
import statsmodels.formula.api as smf
from sklearn import feature_selection, linear_model
import sklearn
pd.set_option('display.max_rows', 10)
pd.set_option('display.notebook_repr_html', True)
pd.set_option('display.max_columns', 10)
%matplotlib inline
plt.style.use('ggplot')
# Load the training CSV from the user's downloads folder into a DataFrame.
train_csv_path = os.path.join('~/', 'downloads', 'train_digit.csv')
df = pd.read_csv(train_csv_path)
df
# Select the first 5000 rows of data and split them into two DataFrames:
# the first column (label) and all remaining columns (pixels).
# .copy() makes both frames independent of df, so the in-place
# binarisation applied to `images` later neither writes through to df nor
# triggers pandas' SettingWithCopyWarning.
images = df.iloc[0:5000, 1:].copy()
labels = df.iloc[0:5000, :1].copy()
images
# Reshape one row of pixels into its 28x28 "matrix" form and display it,
# using the row's label as the plot title.
i = 2
# .values replaces as_matrix(), which is deprecated and removed in newer
# pandas releases; both return the underlying numpy array.
img = images.iloc[i].values.reshape((28, 28))
plt.imshow(img, cmap='gray')
plt.title(labels.iloc[i, 0])
# Plot the distribution of greyscale values for the same image. A fresh
# figure is opened so the histogram is not drawn on top of the image axes
# when both plots are produced in the same run.
plt.figure()
plt.hist(images.iloc[i])
# The pixel values are greyscale (0-255). Simplifying them to binary
# (black/white) makes the image more distinct and therefore easier for the
# classifier.
# Build a new 0/1 frame instead of assigning through a boolean mask: `images`
# may be a slice of df, and a masked in-place write on a slice is what raises
# SettingWithCopyWarning.
images = (images > 0).astype(int)
img = images.iloc[i].values.reshape((28, 28))
# Separate figure so the binary image does not share axes with earlier plots.
plt.figure()
plt.imshow(img, cmap='binary')
# Use the scalar label (as in the greyscale plot) rather than a whole Series.
plt.title(labels.iloc[i, 0])
# KNN classification
# Use the first 5000 images as training images to create the model.
X = images
y = labels
model = neighbors.KNeighborsClassifier(n_neighbors=5, weights='uniform')
# `y` is a single-column DataFrame; scikit-learn expects a 1-D target, so the
# column is flattened before fitting/scoring to avoid a column-vector warning.
train_target = y.values.ravel()
model.fit(X, train_target)
# Accuracy on the training images themselves (not a held-out estimate).
# Parenthesised single-argument print behaves the same on Python 2 and 3.
print(model.score(X, train_target))
# Use the next 5000 images as test images to evaluate the model previously
# created. The pixels are binarised exactly like the training images.
# A new 0/1 frame is built (instead of a masked in-place write) so that the
# slice of df is never modified and no SettingWithCopyWarning is raised.
test_images = (df.iloc[5000:10000, 1:] > 0).astype(int)
test_labels = df.iloc[5000:10000, :1].copy()
# Flatten the single label column to the 1-D target scikit-learn expects.
model.score(test_images, test_labels.values.ravel())
# Use the model to output predicted values and compare them with the
# original values.
# check: 1 = correct, 0 = wrong
pred = model.predict(test_images)
# Build a DataFrame with the original label, the predicted label and whether
# they match. Only the first (label) column is copied, so test_labels itself
# is left untouched.
check_df = test_labels.iloc[:, :1].copy()
check_df['pred_labels'] = pred
# One vectorised comparison replaces the row-by-row loop; the old chained
# `check_df.ix[x][2] = 1` was slow, relied on the deprecated .ix indexer and
# could end up writing to a temporary row instead of the frame.
check_df['check'] = (check_df.iloc[:, 0] == check_df['pred_labels']).astype(int)
check_df
correct_total = check_df['check'].sum()
print("total correctly predicted values: {}".format(correct_total))
# Divide by the actual number of evaluated rows rather than a hard-coded 5000.
print("accuracy:  {}".format(float(correct_total) / float(len(check_df))))
# Look at the accuracy of the model for each digit, 0-9.
# Build a DataFrame with one row per digit:
#   label   - the digit
#   total   - number of correctly predicted images of that digit
#   count   - number of images of that digit in check_df
#   percent - total / count
# A single groupby replaces the per-digit loop of chained
# `cont_df[col].ix[x] = ...` assignments (deprecated indexer, chained write).
digit_range = list(range(0, 10))
per_label = (check_df.groupby('label')['check']
             .agg(['sum', 'count'])
             .reindex(digit_range, fill_value=0))
cont_df = pd.DataFrame({'label': digit_range})
cont_df['total'] = per_label['sum'].values
cont_df['count'] = per_label['count'].values
# Float division; a digit with no samples yields NaN instead of raising.
cont_df['percent'] = cont_df['total'] / cont_df['count'].astype(float)
cont_df
plot_df = cont_df[['total', 'count']]
plot_df.plot(kind='bar')
plt.title('# predicted VS # OG')
# ylim is given as the documented (bottom, top) pair; accuracy cannot exceed 1.
cont_df.plot(x='label', y='percent', kind='bar', ylim=(0.7, 1.0))
plt.title('% of correct prediction per label')
From the two graphs above, the digits 4 and 8 have the lowest percentage of correct predictions, while 0 and 1 have the highest.
# Train on the whole data set: a reproducible random 80% of the rows is used
# for training and the remaining 20% is kept for testing.
df = pd.read_csv(os.path.join('~/', 'downloads', 'train_digit.csv'))
# sort_index() restores the original row order; the argument-less
# DataFrame.sort() used previously has been removed from pandas.
train_df = df.sample(frac=.8, random_state=0).sort_index()
test_df = df.drop(train_df.index)
# Binarise the pixel columns into new 0/1 frames rather than writing through
# a boolean mask into slices of train_df/test_df (SettingWithCopyWarning).
train_images = (train_df.iloc[0:, 1:] > 0).astype(int)
train_labels = train_df.iloc[0:, :1].copy()
test_images = (test_df.iloc[0:, 1:] > 0).astype(int)
test_labels = test_df.iloc[0:, :1].copy()
X = train_images
y = train_labels
allmodel = neighbors.KNeighborsClassifier(n_neighbors=5, weights='uniform')
# Flatten the single label column to the 1-D target scikit-learn expects.
train_target = y.values.ravel()
allmodel.fit(X, train_target)
# Accuracy on the training rows themselves (not a held-out estimate).
print(allmodel.score(X, train_target))
# Predict the held-out 20% with allmodel and record, per image, the original
# label, the predicted label and whether they match (check: 1 = correct,
# 0 = wrong).
pred = allmodel.predict(test_images)
# Copy only the label column so test_labels itself is not modified.
check_df = test_labels.iloc[:, :1].copy()
check_df['pred_labels'] = pred
# Vectorised comparison instead of a row loop with chained `.ix[x][3] = 1`,
# which was slow, used a deprecated indexer and could write to a temporary.
check_df['check'] = (check_df.iloc[:, 0] == check_df['pred_labels']).astype(int)
# Keep the original row numbers as an 'index' column, as before.
check_df = check_df.reset_index()
correct_total = check_df['check'].sum()
print("total correctly predicted values: {}".format(correct_total))
print("accuracy:  {}".format(float(correct_total) / float(len(check_df))))
# Per-digit accuracy table for the predictions stored in check_df:
#   label   - the digit (0-9)
#   total   - number of correctly predicted images of that digit
#   count   - number of images of that digit in check_df
#   percent - total / count
# A single groupby replaces the per-digit loop of chained
# `cont_df[col].ix[x] = ...` assignments (deprecated indexer, chained write).
digit_range = list(range(0, 10))
per_label = (check_df.groupby('label')['check']
             .agg(['sum', 'count'])
             .reindex(digit_range, fill_value=0))
cont_df = pd.DataFrame({'label': digit_range})
cont_df['total'] = per_label['sum'].values
cont_df['count'] = per_label['count'].values
# Float division; a digit with no samples yields NaN instead of raising.
cont_df['percent'] = cont_df['total'] / cont_df['count'].astype(float)
plot_df = cont_df[['total', 'count']]
plot_df.plot(kind='bar')
plt.title('# predicted VS # OG')
# ylim is given as the documented (bottom, top) pair; accuracy cannot exceed 1.
cont_df.plot(x='label', y='percent', kind='bar', ylim=(0.7, 1.0))
plt.title('% of correct prediction per label')
# Read the test data set from Kaggle, run it through the model and write the
# predictions to a csv file.
test_digit_df = pd.read_csv(os.path.join('~/', 'downloads', 'test_digit.csv'))
# allmodel was fitted on binarised (0/1) pixels, so the test pixels must get
# the same preprocessing before prediction; previously the raw greyscale
# values were passed in, which does not match the training features.
test_pixels = (test_digit_df > 0).astype(int)
pred = allmodel.predict(test_pixels)
# ImageId is 1-based, one id per test row; column order is fixed explicitly.
subm = pd.DataFrame(
    {'ImageId': np.arange(1, len(test_digit_df) + 1), 'Label': pred},
    columns=['ImageId', 'Label'])
subm.head()
# index=False keeps the file to the two submission columns; otherwise pandas
# writes the row index as an extra unnamed first column.
subm.to_csv('sub.csv', index=False)
# Compare different values of k, using the first 5000 images for training to
# keep the run time down.
df = pd.read_csv(os.path.join('~/', 'downloads', 'train_digit.csv'))
# Binarise into new 0/1 frames instead of a masked in-place write on a slice
# of df (which raises SettingWithCopyWarning).
images = (df.iloc[0:5000, 1:] > 0).astype(int)
labels = df.iloc[0:5000, :1].copy()
X = images
y = labels
# Flatten the single label column to the 1-D target scikit-learn expects.
train_target = y.values.ravel()
# The training score alone favours small k (with k=1 every training image is
# its own nearest neighbour), so the next 5000 images are scored as well to
# show how each k performs on images the model has not seen.
held_out_images = (df.iloc[5000:10000, 1:] > 0).astype(int)
held_out_target = df.iloc[5000:10000, 0].values
# k = 1..14, plus one much larger value (50) for contrast.
for k in list(range(1, 15)) + [50]:
    model = neighbors.KNeighborsClassifier(n_neighbors=k, weights='uniform')
    model.fit(X, train_target)
    print("k:  {} score:  {} held-out score:  {}".format(
        k,
        model.score(X, train_target),
        model.score(held_out_images, held_out_target)))
As the value of K increases, the model's score on the training images slowly decreases. Thus the most suitable K value is 5.
# Same experiment as the uniform-weight model, but with distance-weighted
# neighbours: train on the first 5000 images, test on the next 5000.
df = pd.read_csv(os.path.join('~/', 'downloads', 'train_digit.csv'))
# Binarise into new 0/1 frames instead of masked in-place writes on slices
# of df (which raise SettingWithCopyWarning).
images = (df.iloc[0:5000, 1:] > 0).astype(int)
labels = df.iloc[0:5000, :1].copy()
X = images
y = labels
model = neighbors.KNeighborsClassifier(n_neighbors=5, weights='distance')
# Flatten the single label column to the 1-D target scikit-learn expects.
train_target = y.values.ravel()
model.fit(X, train_target)
# NOTE: this is the score on the training images; with distance weighting
# each training image is its own zero-distance neighbour, so this value is
# expected to be (near) perfect and says little about generalisation.
print("score:  {}".format(model.score(X, train_target)))
test_images = (df.iloc[5000:10000, 1:] > 0).astype(int)
test_labels = df.iloc[5000:10000, :1].copy()
# Score on the held-out images.
print(model.score(test_images, test_labels.values.ravel()))
# Per-digit accuracy of `model` on the test images.
pred = model.predict(test_images)
# check_df: original label, predicted label and a 1/0 'check' flag per image.
# Only the label column is copied, so test_labels itself is not modified.
check_df = test_labels.iloc[:, :1].copy()
check_df['pred_labels'] = pred
# Vectorised comparison instead of a row loop with chained `.ix[x][3] = 1`,
# which was slow, used a deprecated indexer and could write to a temporary.
check_df['check'] = (check_df.iloc[:, 0] == check_df['pred_labels']).astype(int)
# Keep the original row numbers as an 'index' column, as before.
check_df = check_df.reset_index()
# cont_df: one row per digit with
#   total   - number of correctly predicted images of that digit
#   count   - number of images of that digit
#   percent - total / count
digit_range = list(range(0, 10))
per_label = (check_df.groupby('label')['check']
             .agg(['sum', 'count'])
             .reindex(digit_range, fill_value=0))
cont_df = pd.DataFrame({'label': digit_range})
cont_df['total'] = per_label['sum'].values
cont_df['count'] = per_label['count'].values
# Float division; a digit with no samples yields NaN instead of raising.
cont_df['percent'] = cont_df['total'] / cont_df['count'].astype(float)
# ylim is given as the documented (bottom, top) pair; accuracy cannot exceed 1.
cont_df.plot(x='label', y='percent', kind='bar', ylim=(0.7, 1.0))
plt.title('% of correct prediction per label')
# Fit both a distance-weighted and a uniform-weighted KNN model on the whole
# training file.
df = pd.read_csv(os.path.join('~/', 'downloads', 'train_digit.csv'))
# Binarise the pixels. The previous line referenced `all_images`, a name that
# is never defined (NameError), so `images` was never binarised here; a new
# 0/1 frame is built from the pixel columns instead.
images = (df.iloc[0:, 1:] > 0).astype(int)
labels = df.iloc[0:, :1].copy()
X = images
y = labels
# Flatten the single label column to the 1-D target scikit-learn expects.
train_target = y.values.ravel()
distmodel = neighbors.KNeighborsClassifier(n_neighbors=5, weights='distance')
distmodel.fit(X, train_target)
unimodel = neighbors.KNeighborsClassifier(n_neighbors=5, weights='uniform')
unimodel.fit(X, train_target)
# Both scores are computed on the training data itself.
print("distance score:  {}".format(distmodel.score(X, train_target)))
print("uni score:  {}".format(unimodel.score(X, train_target)))
# Per-digit accuracy of distmodel on the data it was fitted on (X, y).
pred = distmodel.predict(X)
# check_df: original label, predicted label and a 1/0 'check' flag per image.
# Only the first (label) column of y is copied; assigning `check_df = y`
# would add the helper columns to y itself.
check_df = y.iloc[:, :1].copy()
check_df['pred_labels'] = pred
# Vectorised comparison instead of a row loop with chained `.ix[x][3] = 1`,
# which was slow, used a deprecated indexer and could write to a temporary.
check_df['check'] = (check_df.iloc[:, 0] == check_df['pred_labels']).astype(int)
# Keep the original row numbers as an 'index' column, as before.
check_df = check_df.reset_index()
# cont_df: one row per digit with
#   total   - number of correctly predicted images of that digit
#   count   - number of images of that digit
#   percent - total / count
digit_range = list(range(0, 10))
per_label = (check_df.groupby('label')['check']
             .agg(['sum', 'count'])
             .reindex(digit_range, fill_value=0))
cont_df = pd.DataFrame({'label': digit_range})
cont_df['total'] = per_label['sum'].values
cont_df['count'] = per_label['count'].values
# Float division; a digit with no samples yields NaN instead of raising.
cont_df['percent'] = cont_df['total'] / cont_df['count'].astype(float)
# ylim is given as the documented (bottom, top) pair; accuracy cannot exceed 1.
cont_df.plot(x='label', y='percent', kind='bar', ylim=(0.7, 1.0))
plt.title('% of correct prediction per label')
# Per-digit accuracy of unimodel on the data it was fitted on (X, y).
pred = unimodel.predict(X)
# check_df: original label, predicted label and a 1/0 'check' flag per image.
# Only the first (label) column of y is copied, so y is not modified and any
# extra columns y may already carry are ignored.
check_df = y.iloc[:, :1].copy()
check_df['pred_labels'] = pred
# Vectorised comparison instead of a row loop with chained `.ix[x][3] = 1`,
# which was slow, used a deprecated indexer and could write to a temporary.
check_df['check'] = (check_df.iloc[:, 0] == check_df['pred_labels']).astype(int)
# Keep the original row numbers as an 'index' column, as before.
check_df = check_df.reset_index()
# cont_df: one row per digit with
#   total   - number of correctly predicted images of that digit
#   count   - number of images of that digit
#   percent - total / count
digit_range = list(range(0, 10))
per_label = (check_df.groupby('label')['check']
             .agg(['sum', 'count'])
             .reindex(digit_range, fill_value=0))
cont_df = pd.DataFrame({'label': digit_range})
cont_df['total'] = per_label['sum'].values
cont_df['count'] = per_label['count'].values
# Float division; a digit with no samples yields NaN instead of raising.
cont_df['percent'] = cont_df['total'] / cont_df['count'].astype(float)
# ylim is given as the documented (bottom, top) pair; accuracy cannot exceed 1.
cont_df.plot(x='label', y='percent', kind='bar', ylim=(0.7, 1.0))
plt.title('% of correct prediction per label')
# Predict the Kaggle test file with the distance-weighted model and write the
# predictions to a csv file.
test_digit_df = pd.read_csv(os.path.join('~/', 'downloads', 'test_digit.csv'))
# The models in this notebook are trained on binarised (0/1) pixels, so the
# test pixels get the same preprocessing before prediction; previously the
# raw greyscale values were passed in.
test_pixels = (test_digit_df > 0).astype(int)
pred = distmodel.predict(test_pixels)
# ImageId is 1-based, one id per test row; column order is fixed explicitly.
subm = pd.DataFrame(
    {'ImageId': np.arange(1, len(test_digit_df) + 1), 'Label': pred},
    columns=['ImageId', 'Label'])
subm.head()
# index=False keeps the file to the two submission columns; otherwise pandas
# writes the row index as an extra unnamed first column.
subm.to_csv('distsub1.csv', index=False)