# Error analysis on a model trained to recognize handwritten digits
## Importing libraries and packages
# Warnings
import warnings

# Mathematical operations and data manipulation
import numpy as np
import pandas as pd

# Dataset
from sklearn.datasets import load_digits

# Model
from sklearn import tree
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Silence every warning for the rest of the session so the printed cell
# outputs stay uncluttered. NOTE: this is a blanket filter, so it also hides
# warnings that could point at real problems.
warnings.filterwarnings("ignore")
## Set paths
# Directory the datasets are read from
data_path = "./datasets"
# Directory results are saved to
assets_path = "./assets"
## Loading dataset
`sklearn.datasets.load_digits` returns a dictionary-like object that keeps the features and the target in two separate attributes: the features are accessible as `data` and the target as `target`.
# Load the handwritten digits dataset; features and labels are read from
# its `data` and `target` attributes below.
dataset = load_digits()
## Partitioning and training
# Wrap the features (data) and the labels (target) in Pandas DataFrames so
# both can be split and indexed the same way later on.
X = pd.DataFrame(dataset.data)
Y = pd.DataFrame(dataset.target)

print("Shape of X: ", X.shape)
print("Shape of Y: ", Y.shape)
Shape of X: (1797, 64)
Shape of Y: (1797, 1)
# First split: hold out 10% of all rows as the test set.
X_new, X_test, Y_new, Y_test = train_test_split(
    X, Y, test_size=0.1, random_state=101
)
# Second split: express the test set's row count as a fraction of the
# remaining rows, so the dev set carved out of X_new is as large as the
# test set.
test_size = X_test.shape[0] / X_new.shape[0]
X_train, X_dev, Y_train, Y_dev = train_test_split(
    X_new, Y_new, test_size=test_size, random_state=101
)

# Report the shape of every partition.
for name, frame in [
    ("X_train", X_train),
    ("Y_train", Y_train),
    ("X_dev", X_dev),
    ("Y_dev", Y_dev),
    ("X_test", X_test),
    ("Y_test", Y_test),
]:
    print(f"Shape of {name}: ", frame.shape)
Shape of X_train: (1437, 64)
Shape of Y_train: (1437, 1)
Shape of X_dev: (180, 64)
Shape of Y_dev: (180, 1)
Shape of X_test: (180, 64)
Shape of Y_test: (180, 1)
# Seed NumPy's global random generator so the same rows are drawn each run.
np.random.seed(101)
# Draw 90 random row positions from the train set, then 90 from the dev set.
# The order of the two draws matters: both consume the same seeded generator.
# NOTE: np.random.randint draws each position independently (with
# replacement), so the same row can be selected more than once.
indices_train = np.random.randint(0, len(X_train), 90)
indices_dev = np.random.randint(0, len(X_dev), 90)
# Build the train/dev set: the selected train rows followed by the selected
# dev rows, with the labels stacked in the same order as the features.
X_train_dev = pd.concat(
    [X_train.iloc[indices_train, :], X_dev.iloc[indices_dev, :]]
)
Y_train_dev = pd.concat(
    [Y_train.iloc[indices_train, :], Y_dev.iloc[indices_dev, :]]
)
# Print the resulting shapes of the combined set
print(X_train_dev.shape, Y_train_dev.shape)
(180, 64) (180, 1)
# Fit a decision tree classifier on the train set only; random_state pins
# the tree's internal randomness so the fitted model is reproducible.
model = tree.DecisionTreeClassifier(random_state=101)
model = model.fit(X_train, Y_train)
## Metrics
# Score the fitted model on every partition (train, train/dev, dev, test).
# The three lists are parallel: position k holds the display name, the
# features and the labels of the same partition.
sets = ["Training", "Train/dev", "Validation", "Testing"]
X_sets = [X_train, X_train_dev, X_dev, X_test]
Y_sets = [Y_train, Y_train_dev, Y_dev, Y_test]

# Map each partition name to the accuracy of the model's predictions on it.
scores = {}
for name, X_part, Y_part in zip(sets, X_sets, Y_sets):
    pred = model.predict(X_part)
    score = accuracy_score(Y_part, pred)
    scores[name] = score

print(scores)
{'Training': 1.0, 'Train/dev': 0.9444444444444444, 'Validation': 0.8833333333333333, 'Testing': 0.8833333333333333}