Partitioning the wine dataset#

The split ratio to partition data is not fixed and should be decided by taking into account the amount of data available, the type of algorithm to be used, and the distribution of the data.

Importing libraries and packages#

# Mathematical operations and data manipulation
import pandas as pd

# Dataset
from sklearn.datasets import load_wine

# Model
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold

# Warnings
import warnings

# Silence every warning for the rest of the session (presumably to keep the
# notebook output uncluttered). NOTE: this also hides deprecation notices.
warnings.filterwarnings("ignore")

Set paths#

# Directory locations used by this notebook:
#   data_path   -> datasets directory
#   assets_path -> assets directory (results are saved here)
data_path, assets_path = "./datasets", "./assets"

Loading dataset#

sklearn.datasets.load_wine - The output is a dictionary-like object that keeps the features and the target in two separate attributes: the features are accessible as `data` and the target as `target`.

# Load the wine dataset bundled with scikit-learn; the features and the
# target are read from it later as `dataset.data` and `dataset.target`.
dataset = load_wine()

Conventional partitioning#

60/20/20% training, validation, and testing

# Wrap each attribute of the dataset (data, then target) in its own
# pandas DataFrame.
X, Y = (pd.DataFrame(part) for part in (dataset.data, dataset.target))

# Report the dimensions of both frames, one per line.
print(
    *(
        f"{label} {frame.shape}"
        for label, frame in (("Shape of X: ", X), ("Shape of Y: ", Y))
    ),
    sep="\n",
)

Shape of X:  (178, 13)
Shape of Y:  (178, 1)

# First split of the data using the train_test_split function:
# 20% of the rows are held out as the test set, the remaining 80% stay
# in the training set.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)

print("Shape of X_train: ", X_train.shape)
print("Shape of X_test: ", X_test.shape)
# Report the training labels themselves (not the full Y) so that the row
# count shown here matches X_train.
print("Shape of Y_train: ", Y_train.shape)
print("Shape of Y_test: ", Y_test.shape)

Shape of X_train:  (142, 13)
Shape of X_test:  (36, 13)
Shape of Y_train:  (178, 1)
Shape of Y_test:  (36, 1)

# Second split, for a validation set (dev set). To obtain a dev set with
# the same number of rows as the test set, express the test-set row count
# as a fraction of the current training-set row count; that fraction is
# the split ratio to use when carving out the validation set.
dev_size = len(X_test) / len(X_train)
print(dev_size)

0.2535211267605634

# Carve the dev set out of the current training data using the dev_size
# ratio; X_train / Y_train are rebound to the rows that remain.
X_train, X_dev, Y_train, Y_dev = train_test_split(
    X_train,
    Y_train,
    test_size=dev_size,
)

# Report the dimensions of every partition, one per line.
print(
    *(
        f"{label} {frame.shape}"
        for label, frame in (
            ("Shape of X_train: ", X_train),
            ("Shape of Y_train: ", Y_train),
            ("Shape of X_dev: ", X_dev),
            ("Shape of Y_dev: ", Y_dev),
            ("Shape of X_test: ", X_test),
            ("Shape of Y_test: ", Y_test),
        )
    ),
    sep="\n",
)

Shape of X_train:  (106, 13)
Shape of Y_train:  (106, 1)
Shape of X_dev:  (36, 13)
Shape of Y_dev:  (36, 1)
Shape of X_test:  (36, 13)
Shape of Y_test:  (36, 1)

Cross validation partitioning#

# Show the dimensions of the feature and target frames before partitioning.
print(
    *(
        f"{label} {frame.shape}"
        for label, frame in (("Shape of X: ", X), ("Shape of Y: ", Y))
    ),
    sep="\n",
)

Shape of X:  (178, 13)
Shape of Y:  (178, 1)

# Hold out 10% of the rows as the test set. X and Y are rebound to the
# remaining 90%, which is what the folds below are drawn from.
X, X_test, Y, Y_test = train_test_split(X, Y, test_size=0.10)

# Cross-validation splitter configured for 10 folds.
kf = KFold(n_splits=10)

# Apply the split method to the rows of X. It yields, for each fold, the
# indices of the instances to use as training set and as validation set.
splits = kf.split(X)

# Walk through the fold configurations and slice out the training and
# validation subsets for each one.
#
# The code to train and evaluate the model belongs inside this loop body:
# the point of cross-validation is to train and validate the model once
# per split configuration.
for train_index, test_index in splits:
    X_train = X.iloc[train_index]
    X_dev = X.iloc[test_index]
    Y_train = Y.iloc[train_index]
    Y_dev = Y.iloc[test_index]

# After the loop the variables hold the subsets of the last fold.
print(
    *(
        f"{label} {frame.shape}"
        for label, frame in (
            ("Shape of X_train: ", X_train),
            ("Shape of Y_train: ", Y_train),
            ("Shape of X_dev: ", X_dev),
            ("Shape of Y_dev: ", Y_dev),
            ("Shape of X_test: ", X_test),
            ("Shape of Y_test: ", Y_test),
        )
    ),
    sep="\n",
)

Shape of X_train:  (144, 13)
Shape of Y_train:  (144, 1)
Shape of X_dev:  (16, 13)
Shape of Y_dev:  (16, 1)
Shape of X_test:  (18, 13)
Shape of Y_test:  (18, 1)