Error analysis on classifications of handwritten digits#

To perform error analysis, the predict method is applied to each of the three data splits (training, validation, and testing). The following code snippets present a clean way of measuring all three metrics — accuracy, precision, and recall — on all three sets at once for the Naive Bayes, Decision Tree, and SVM models trained on the handwritten digits dataset.

Importing libraries and packages#

 1# Mathematical operations and data manipulation
 2import pandas as pd
 3
 4# Dataset
 5from sklearn.datasets import load_digits
 6
 7# Models
 8from sklearn.model_selection import train_test_split
 9from sklearn.naive_bayes import GaussianNB
10from sklearn.tree import DecisionTreeClassifier
11from sklearn.svm import SVC
12from sklearn.metrics import accuracy_score, precision_score, recall_score
13
14# Warnings
15import warnings
16
17warnings.filterwarnings("ignore")

Set paths#

# Directory that holds the input datasets
data_path = "./datasets"

# Directory where generated results are saved
assets_path = "./assets"

Loading dataset#

sklearn.datasets.load_digits - The output is a dictionary-like object that exposes the features and the target as two separate attributes, accessible as data and target respectively.

# Load the handwritten-digits dataset: a dict-like object exposing the
# pixel features (.data) and the digit labels (.target).
dataset = load_digits()

Partitioning dataset#

# Convert each attribute (data and target) into a Pandas DataFrame:
# X holds the flattened 8x8 pixel features per image, Y the digit labels.
X = pd.DataFrame(dataset.data)
Y = pd.DataFrame(dataset.target)

print("Shape of X: ", X.shape)
print("Shape of Y: ", Y.shape)
Shape of X:  (1797, 64)
Shape of Y:  (1797, 1)
1X.head()
0 1 2 3 4 5 6 7 8 9 ... 54 55 56 57 58 59 60 61 62 63
0 0.0 0.0 5.0 13.0 9.0 1.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 6.0 13.0 10.0 0.0 0.0 0.0
1 0.0 0.0 0.0 12.0 13.0 5.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 11.0 16.0 10.0 0.0 0.0
2 0.0 0.0 0.0 4.0 15.0 12.0 0.0 0.0 0.0 0.0 ... 5.0 0.0 0.0 0.0 0.0 3.0 11.0 16.0 9.0 0.0
3 0.0 0.0 7.0 15.0 13.0 1.0 0.0 0.0 0.0 8.0 ... 9.0 0.0 0.0 0.0 7.0 13.0 13.0 9.0 0.0 0.0
4 0.0 0.0 0.0 1.0 11.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 2.0 16.0 4.0 0.0 0.0

5 rows × 64 columns

# First split of the data using the train_test_split function:
# hold out 20% of the samples as the final test set.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=0
)

print("Shape of X_train: ", X_train.shape)
print("Shape of X_test: ", X_test.shape)
# Fixed: previously printed Y.shape (the full target, (1797, 1)) instead
# of the training target's shape.
print("Shape of Y_train: ", Y_train.shape)
print("Shape of Y_test: ", Y_test.shape)
Shape of X_train:  (1437, 64)
Shape of X_test:  (360, 64)
Shape of Y_train:  (1797, 1)
Shape of Y_test:  (360, 1)
# Second split for a validation set (dev set): to obtain a dev set
# that's the same shape as the test set, it is necessary to calculate
# the proportion of the size of the test set over the size of the
# train set before creating a validation set.
dev_size = X_test.shape[0] / X_train.shape[0]
print(dev_size)
 1X_train, X_dev, Y_train, Y_dev = train_test_split(
 2    X_train, Y_train, test_size=dev_size
 3)
 4
 5print("Shape of X_train: ", X_train.shape)
 6print("Shape of Y_train: ", Y_train.shape)
 7print("Shape of X_dev: ", X_dev.shape)
 8print("Shape of Y_dev: ", Y_dev.shape)
 9print("Shape of X_test: ", X_test.shape)
10print("Shape of Y_test: ", Y_test.shape)
Shape of X_train:  (1077, 64)
Shape of Y_train:  (1077, 1)
Shape of X_dev:  (360, 64)
Shape of Y_dev:  (360, 1)
Shape of X_test:  (360, 64)
Shape of Y_test:  (360, 1)

Naïve Bayes algorithm#

# Gaussian Naive Bayes classifier fitted on the training split.
# NOTE(review): Y_train is an (n, 1) DataFrame; sklearn expects a 1-D y,
# which presumably triggers the warnings silenced at the top of the file.
model_NB = GaussianNB()
model_NB.fit(X_train, Y_train)
GaussianNB()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
 1pred_1 = model_NB.predict(
 2    [
 3        [
 4            0.0,
 5            0.0,
 6            5.0,
 7            13.0,
 8            9.0,
 9            1.0,
10            0.0,
11            0.0,
12            0.0,
13            0.0,
14            13.0,
15            15.0,
16            10.0,
17            15.0,
18            5.0,
19            0.0,
20            0.0,
21            3.0,
22            15.0,
23            2.0,
24            0.0,
25            11.0,
26            8.0,
27            0.0,
28            0.0,
29            4.0,
30            12.0,
31            0.0,
32            0.0,
33            8.0,
34            8.0,
35            0.0,
36            0.0,
37            5.0,
38            8.0,
39            0.0,
40            0.0,
41            9.0,
42            8.0,
43            0.0,
44            0.0,
45            4.0,
46            11.0,
47            0.0,
48            1.0,
49            12.0,
50            7.0,
51            0.0,
52            0.0,
53            2.0,
54            14.0,
55            5.0,
56            10.0,
57            12.0,
58            0.0,
59            0.0,
60            0.0,
61            0.0,
62            6.0,
63            13.0,
64            10.0,
65            0.0,
66            0.0,
67            0.0,
68        ]
69    ]
70)
71print(pred_1)
[0]

Decision tree algorithm#

# Decision tree classifier; random_state fixes the tie-breaking so the
# fitted tree is reproducible across runs.
model_tree = DecisionTreeClassifier(random_state=101)
model_tree.fit(X_train, Y_train)
DecisionTreeClassifier(random_state=101)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
 1pred_2 = model_tree.predict(
 2    [
 3        [
 4            0.0,
 5            0.0,
 6            5.0,
 7            13.0,
 8            9.0,
 9            1.0,
10            0.0,
11            0.0,
12            0.0,
13            0.0,
14            13.0,
15            15.0,
16            10.0,
17            15.0,
18            5.0,
19            0.0,
20            0.0,
21            3.0,
22            15.0,
23            2.0,
24            0.0,
25            11.0,
26            8.0,
27            0.0,
28            0.0,
29            4.0,
30            12.0,
31            0.0,
32            0.0,
33            8.0,
34            8.0,
35            0.0,
36            0.0,
37            5.0,
38            8.0,
39            0.0,
40            0.0,
41            9.0,
42            8.0,
43            0.0,
44            0.0,
45            4.0,
46            11.0,
47            0.0,
48            1.0,
49            12.0,
50            7.0,
51            0.0,
52            0.0,
53            2.0,
54            14.0,
55            5.0,
56            10.0,
57            12.0,
58            0.0,
59            0.0,
60            0.0,
61            0.0,
62            6.0,
63            13.0,
64            10.0,
65            0.0,
66            0.0,
67            0.0,
68        ]
69    ]
70)
71print(pred_2)
[0]

Support vector machine algorithm#

# Support vector classifier (RBF kernel by default); random_state only
# affects probability estimation shuffling, kept for consistency.
model_svm = SVC(random_state=101)
model_svm.fit(X_train, Y_train)
SVC(random_state=101)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
 1pred_3 = model_svm.predict(
 2    [
 3        [
 4            0.0,
 5            0.0,
 6            5.0,
 7            13.0,
 8            9.0,
 9            1.0,
10            0.0,
11            0.0,
12            0.0,
13            0.0,
14            13.0,
15            15.0,
16            10.0,
17            15.0,
18            5.0,
19            0.0,
20            0.0,
21            3.0,
22            15.0,
23            2.0,
24            0.0,
25            11.0,
26            8.0,
27            0.0,
28            0.0,
29            4.0,
30            12.0,
31            0.0,
32            0.0,
33            8.0,
34            8.0,
35            0.0,
36            0.0,
37            5.0,
38            8.0,
39            0.0,
40            0.0,
41            9.0,
42            8.0,
43            0.0,
44            0.0,
45            4.0,
46            11.0,
47            0.0,
48            1.0,
49            12.0,
50            7.0,
51            0.0,
52            0.0,
53            2.0,
54            14.0,
55            5.0,
56            10.0,
57            12.0,
58            0.0,
59            0.0,
60            0.0,
61            0.0,
62            6.0,
63            13.0,
64            10.0,
65            0.0,
66            0.0,
67            0.0,
68        ]
69    ]
70)
71print(pred_3)
[0]

Error analysis#

# Bundle the three partitions so all metrics can be computed in one loop.
X_sets = [X_train, X_dev, X_test]
Y_sets = [Y_train, Y_dev, Y_test]
# One accumulator per model; each metric list will receive one value per
# data split (train, dev, test), in that order.
metrics = {
    model: {"Acc": [], "Pre": [], "Rec": []}
    for model in ("NB", "DT", "SVM")
}

The default value for the argument ‘average’ in precision_score and recall_score is ‘binary’, but this is a multi-class classification task. ‘weighted’ calculates the metric for each label and then finds their average weighted by support (the number of true instances for each label).

 1for i in range(0, len(X_sets)):
 2    pred_NB = model_NB.predict(X_sets[i])
 3    metrics["NB"]["Acc"].append(accuracy_score(Y_sets[i], pred_NB))
 4    metrics["NB"]["Pre"].append(
 5        precision_score(Y_sets[i], pred_NB, average="weighted")
 6    )
 7    metrics["NB"]["Rec"].append(
 8        recall_score(Y_sets[i], pred_NB, average="weighted")
 9    )
10
11    pred_tree = model_tree.predict(X_sets[i])
12    metrics["DT"]["Acc"].append(accuracy_score(Y_sets[i], pred_tree))
13    metrics["DT"]["Pre"].append(
14        precision_score(Y_sets[i], pred_tree, average="weighted")
15    )
16    metrics["DT"]["Rec"].append(
17        recall_score(Y_sets[i], pred_tree, average="weighted")
18    )
19
20    pred_svm = model_svm.predict(X_sets[i])
21    metrics["SVM"]["Acc"].append(accuracy_score(Y_sets[i], pred_svm))
22    metrics["SVM"]["Pre"].append(
23        precision_score(Y_sets[i], pred_svm, average="weighted")
24    )
25    metrics["SVM"]["Rec"].append(
26        recall_score(Y_sets[i], pred_svm, average="weighted")
27    )
1print(metrics)
{'NB': {'Acc': [0.8774373259052924, 0.8527777777777777, 0.8416666666666667], 'Pre': [0.894594794327501, 0.8707830700894467, 0.8718887380991854], 'Rec': [0.8774373259052924, 0.8527777777777777, 0.8416666666666667]}, 'DT': {'Acc': [1.0, 0.8638888888888889, 0.8861111111111111], 'Pre': [1.0, 0.8693356298220537, 0.8890506509622977], 'Rec': [1.0, 0.8638888888888889, 0.8861111111111111]}, 'SVM': {'Acc': [0.9962859795728877, 0.9805555555555555, 0.9888888888888889], 'Pre': [0.996302725482844, 0.9809229390681004, 0.9888966049382717], 'Rec': [0.9962859795728877, 0.9805555555555555, 0.9888888888888889]}}