# Error analysis on a model trained to recognize handwritten digits

## Importing libraries and packages

# Mathematical operations and data manipulation
import numpy as np
import pandas as pd

# Dataset
from sklearn.datasets import load_digits

# Model
from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn.metrics import accuracy_score

# Warnings
import warnings

# Silence every warning category for the rest of the session so that
# library warnings do not clutter the cell outputs.
warnings.simplefilter("ignore")

## Set paths

# Directory that results are saved to.
assets_path = "./assets"
# Directory that holds the datasets.
data_path = "./datasets"

## Loading dataset

`sklearn.datasets.load_digits` returns a dictionary-like object that keeps the features and the target separate, exposing them as two attributes: `data` (the features) and `target` (the labels).

# Load the digits dataset; its `data` and `target` attributes are read when
# the feature and target frames are built.
dataset = load_digits()

## Partitioning and training

# Wrap the features (dataset.data) and the target (dataset.target) in
# Pandas DataFrames, in that order.
X, Y = (pd.DataFrame(values) for values in (dataset.data, dataset.target))

# Report the dimensions of both frames.
for label, frame in (("X", X), ("Y", Y)):
    print(f"Shape of {label}: ", frame.shape)

Shape of X:  (1797, 64)
Shape of Y:  (1797, 1)

# Hold out 10% of the rows as the test set.
X_new, X_test, Y_new, Y_test = train_test_split(
    X, Y, test_size=0.1, random_state=101
)
# Carve a dev set out of the remaining rows with exactly as many rows as the
# test set. The row count is passed as an integer rather than as the ratio
# X_test.shape[0] / X_new.shape[0]: scikit-learn turns a float test_size back
# into a row count by multiplying by the number of samples and rounding up,
# so floating-point error in that round trip could make the dev set one row
# larger than the test set.
dev_size = X_test.shape[0]
X_train, X_dev, Y_train, Y_dev = train_test_split(
    X_new, Y_new, test_size=dev_size, random_state=101
)

# Report the dimensions of every partition produced by the two splits.
for label, part in (
    ("X_train", X_train),
    ("Y_train", Y_train),
    ("X_dev", X_dev),
    ("Y_dev", Y_dev),
    ("X_test", X_test),
    ("Y_test", Y_test),
):
    print(f"Shape of {label}: ", part.shape)

Shape of X_train:  (1437, 64)
Shape of Y_train:  (1437, 1)
Shape of X_dev:  (180, 64)
Shape of Y_dev:  (180, 1)
Shape of X_test:  (180, 64)
Shape of Y_test:  (180, 1)

# Fix the global NumPy seed so the same rows are sampled on every run.
np.random.seed(101)
# Draw 90 row positions from the train set first, then 90 from the dev set.
# Each position is drawn independently, so a row may be picked more than once.
indices_train = np.random.randint(len(X_train), size=90)
indices_dev = np.random.randint(len(X_dev), size=90)
# Stack the sampled train rows on top of the sampled dev rows. The same
# positions are applied to the features and to the targets so that every
# sampled row keeps its own label.
X_train_dev, Y_train_dev = (
    pd.concat([from_train.iloc[indices_train], from_dev.iloc[indices_dev]])
    for from_train, from_dev in ((X_train, X_dev), (Y_train, Y_dev))
)
# Report the dimensions of the combined train/dev set.
print(X_train_dev.shape, Y_train_dev.shape)

(180, 64) (180, 1)

# Fit a decision tree classifier on the train set only; random_state is
# fixed to 101 for reproducibility.
model = tree.DecisionTreeClassifier(random_state=101).fit(X_train, Y_train)

## Metrics

# Score the fitted model on every set (train, train/dev, dev, and test).
# The three lists are parallel: position k of each describes the same set.
sets = ["Training", "Train/dev", "Validation", "Testing"]
X_sets = [X_train, X_train_dev, X_dev, X_test]
Y_sets = [Y_train, Y_train_dev, Y_dev, Y_test]

# Map each set name to the accuracy of the model's predictions on that set.
# The lists are walked in lockstep rather than indexed by position.
scores = {}
for set_name, X_set, Y_set in zip(sets, X_sets, Y_sets):
    pred = model.predict(X_set)
    score = accuracy_score(Y_set, pred)
    scores[set_name] = score

print(scores)

{'Training': 1.0, 'Train/dev': 0.9444444444444444, 'Validation': 0.8833333333333333, 'Testing': 0.8833333333333333}