Preprocessing the tips dataset

Importing libraries and packages

 1# Mathematical operations and data manipulation
 2import numpy as np
 3from sklearn.preprocessing import LabelEncoder
 4
 5# Plotting
 6import matplotlib.pyplot as plt
 7import seaborn as sns
 8
 9# Warnings
10import warnings
11
12warnings.filterwarnings("ignore")
13
14%matplotlib inline

Set paths

# Directory that holds the datasets
data_path = "./datasets"
# Directory that results are saved to
assets_path = "./assets"

Loading dataset

1dataset = sns.load_dataset("tips")

Exploring dataset

1# Shape of the dataset
2print("Shape of the dataset: ", dataset.shape)
3# Whole dataset (the display is truncated to the first and last rows)
4dataset
Shape of the dataset:  (244, 7)
total_bill tip sex smoker day time size
0 16.99 1.01 Female No Sun Dinner 2
1 10.34 1.66 Male No Sun Dinner 3
2 21.01 3.50 Male No Sun Dinner 3
3 23.68 3.31 Male No Sun Dinner 2
4 24.59 3.61 Female No Sun Dinner 4
... ... ... ... ... ... ... ...
239 29.03 5.92 Male No Sat Dinner 3
240 27.18 2.00 Female Yes Sat Dinner 2
241 22.67 2.00 Male Yes Sat Dinner 2
242 17.82 1.75 Male No Sat Dinner 2
243 18.78 3.00 Female No Thur Dinner 2

244 rows × 7 columns

Creating the features and target matrices

# Feature matrix: every column of the dataset except the target, "tip".
# NOTE: drop() returns a new DataFrame, so changes made to `dataset`
# after this point are not reflected in X.
X = dataset.drop(columns="tip")
X.head(10)
total_bill sex smoker day time size
0 16.99 Female No Sun Dinner 2
1 10.34 Male No Sun Dinner 3
2 21.01 Male No Sun Dinner 3
3 23.68 Male No Sun Dinner 2
4 24.59 Female No Sun Dinner 4
5 25.29 Male No Sun Dinner 4
6 8.77 Male No Sun Dinner 2
7 26.88 Male No Sun Dinner 4
8 15.04 Male No Sun Dinner 2
9 14.78 Male No Sun Dinner 2
1X.shape
(244, 6)
1# Target
2Y = dataset["tip"]
3Y.head(10)
0    1.01
1    1.66
2    3.50
3    3.31
4    3.61
5    4.71
6    2.00
7    3.12
8    1.96
9    3.23
Name: tip, dtype: float64
1Y.shape
(244,)

Data cleaning

# Simulate missing data by blanking the "size" value of the first 16 rows.
# The column is cast to float first because NaN cannot be stored in an
# integer column without an implicit (deprecated) dtype change.
dataset["size"] = dataset["size"].astype("float64")
# Write through the DataFrame itself rather than through a column reference,
# so the update does not depend on pandas' view/copy behaviour.
# .loc slices by label and includes the end label: ":15" covers rows 0-15.
dataset.loc[:15, "size"] = np.nan
# Convenience handle to the column, used by the following cells.
size = dataset["size"]
size.head(20)
0     NaN
1     NaN
2     NaN
3     NaN
4     NaN
5     NaN
6     NaN
7     NaN
8     NaN
9     NaN
10    NaN
11    NaN
12    NaN
13    NaN
14    NaN
15    NaN
16    3.0
17    3.0
18    3.0
19    3.0
Name: size, dtype: float64
1size.shape
(244,)
1size.isnull().sum()
16
# Imputation value: the mean of the column (pandas skips NaN entries by
# default), rounded to the nearest integer.
mean = round(size.mean())
print(mean)
3
# Replace the missing entries with the rounded mean.
# fillna() returns a new Series; it is assigned back to the DataFrame column
# explicitly, because an in-place edit made through a column reference is not
# guaranteed to reach the parent DataFrame.
size = size.fillna(mean)
dataset["size"] = size
size.head(20)
0     3.0
1     3.0
2     3.0
3     3.0
4     3.0
5     3.0
6     3.0
7     3.0
8     3.0
9     3.0
10    3.0
11    3.0
12    3.0
13    3.0
14    3.0
15    3.0
16    3.0
17    3.0
18    3.0
19    3.0
Name: size, dtype: float64

Visualisation

# Histogram of the "size" column, drawn with matplotlib's default binning
fig, ax = plt.subplots()
ax.hist(size)
plt.show()
../../_images/080b1f67a4005a30a2deb4a901b065deaac8c46223392184c44718698c2f559c.png

Feature engineering

Converting categorical features into numeric values

# Replace each categorical column with integer codes.
# A separate encoder is fitted per column and kept in `encoders`, so the
# code -> label mapping of every column (encoders[col].classes_) stays
# available; refitting one shared encoder would keep only the last mapping.
encoders = {}
for column in ("sex", "smoker", "day", "time"):
    enc = LabelEncoder()
    # fit_transform() learns the distinct labels and returns their codes
    dataset[column] = enc.fit_transform(dataset[column].astype("str"))
    encoders[column] = enc
# After the loop `enc` is the encoder fitted on "time", as before.

dataset.head()
total_bill tip sex smoker day time size
0 16.99 1.01 0 0 2 0 3.0
1 10.34 1.66 1 0 2 0 3.0
2 21.01 3.50 1 0 2 0 3.0
3 23.68 3.31 1 0 2 0 3.0
4 24.59 3.61 0 0 2 0 3.0

Dealing with outliers

1min_val = size.mean() - (3 * size.std())
2print(min_val)
-0.19743490657874485
1max_val = size.mean() + (3 * size.std())
2print(max_val)
5.369566054119728
# Outliers are the values outside the mean +/- 3 standard deviations band.
# Both bounds are checked; testing only the upper one would silently ignore
# the lower bound computed above.
outliers = size[(size < min_val) | (size > max_val)]
outliers.count()
4
1print(outliers)
125    6.0
141    6.0
143    6.0
156    6.0
Name: size, dtype: float64
# Keep only the "size" values that lie inside the mean +/- 3 standard
# deviations band (both bounds inclusive).
size_filtered = size[(size >= min_val) & (size <= max_val)]
# This cell originally stored the result under the name `age`, although it
# holds "size" values; the alias is kept so any later use of `age` still works.
age = size_filtered
size_filtered.shape
(240,)

Normalizing and standardizing data

# Min-max scaling: (value - column minimum) / (column maximum - column minimum),
# applied to every column of the dataset.
col_min = dataset.min()
col_range = dataset.max() - col_min
tips_normalized = (dataset - col_min) / col_range
tips_normalized.head(10)
total_bill tip sex smoker day time size
0 0.291579 0.001111 0.0 0.0 0.666667 0.0 0.4
1 0.152283 0.073333 1.0 0.0 0.666667 0.0 0.4
2 0.375786 0.277778 1.0 0.0 0.666667 0.0 0.4
3 0.431713 0.256667 1.0 0.0 0.666667 0.0 0.4
4 0.450775 0.290000 0.0 0.0 0.666667 0.0 0.4
5 0.465438 0.412222 1.0 0.0 0.666667 0.0 0.4
6 0.119397 0.111111 1.0 0.0 0.666667 0.0 0.4
7 0.498743 0.235556 1.0 0.0 0.666667 0.0 0.4
8 0.250733 0.106667 1.0 0.0 0.666667 0.0 0.4
9 0.245287 0.247778 1.0 0.0 0.666667 0.0 0.4
# Z-score standardisation: subtract each column's mean and divide by its
# standard deviation (pandas' std() is the sample standard deviation by default).
tips_standardized = dataset.sub(dataset.mean()).div(dataset.std())
tips_standardized.head(10)
total_bill tip sex smoker day time size
0 -0.314066 -1.436993 -1.340598 -0.783179 0.278585 -0.620307 0.44613
1 -1.061054 -0.967217 0.742879 -0.783179 0.278585 -0.620307 0.44613
2 0.137497 0.362610 0.742879 -0.783179 0.278585 -0.620307 0.44613
3 0.437416 0.225291 0.742879 -0.783179 0.278585 -0.620307 0.44613
4 0.539635 0.442111 -1.340598 -0.783179 0.278585 -0.620307 0.44613
5 0.618266 1.237116 0.742879 -0.783179 0.278585 -0.620307 0.44613
6 -1.237411 -0.721488 0.742879 -0.783179 0.278585 -0.620307 0.44613
7 0.796869 0.087972 0.742879 -0.783179 0.278585 -0.620307 0.44613
8 -0.533108 -0.750398 0.742879 -0.783179 0.278585 -0.620307 0.44613
9 -0.562313 0.167472 0.742879 -0.783179 0.278585 -0.620307 0.44613