Loading the set#

In this section we load the data and perform some initial exploration to gain basic knowledge about it, such as how the various features are distributed.

Importing libraries and packages#

1# Mathematical operations and data manipulation
2import pandas as pd
3
4# Warnings
5import warnings
6
7warnings.filterwarnings("ignore")
8
9%matplotlib inline

Set paths#

# Directory the input datasets are read from
data_path = "./datasets"
# Directory that results are saved to
assets_path = "./assets"

Loading dataset#

# Load the online shoppers intention CSV from the datasets directory
dataset = pd.read_csv(f"{data_path}/online_shoppers_intention.csv")
# Preview the first five rows, transposed so each feature is on its own row
dataset.head().T
0 1 2 3 4
Administrative 0 0 0 0 0
Administrative_Duration 0.0 0.0 0.0 0.0 0.0
Informational 0 0 0 0 0
Informational_Duration 0.0 0.0 0.0 0.0 0.0
ProductRelated 1 2 1 2 10
ProductRelated_Duration 0.0 64.0 0.0 2.666667 627.5
BounceRates 0.2 0.0 0.2 0.05 0.02
ExitRates 0.2 0.1 0.2 0.14 0.05
PageValues 0.0 0.0 0.0 0.0 0.0
SpecialDay 0.0 0.0 0.0 0.0 0.0
Month Feb Feb Feb Feb Feb
OperatingSystems 1 2 4 3 3
Browser 1 2 1 2 3
Region 1 1 9 2 1
TrafficType 1 2 3 4 4
VisitorType Returning_Visitor Returning_Visitor Returning_Visitor Returning_Visitor Returning_Visitor
Weekend False False False False True
Revenue False False False False False

Exploring dataset#

# Print the dimensionality of the data, then one line per column with its
# dtype and number of missing values
print(f"Data dimension: {dataset.shape}")
# Count the missing values of every column once, instead of once per column
missing_counts = dataset.isna().sum()
for col, dtype in dataset.dtypes.items():
    print(
        f"Column: {col:35} | "
        f"type: {str(dtype):7} | "
        f"missing values: {missing_counts[col]:3d}"
    )
Data dimension: (12330, 18)
Column: Administrative                      | type: int64   | missing values:   0
Column: Administrative_Duration             | type: float64 | missing values:   0
Column: Informational                       | type: int64   | missing values:   0
Column: Informational_Duration              | type: float64 | missing values:   0
Column: ProductRelated                      | type: int64   | missing values:   0
Column: ProductRelated_Duration             | type: float64 | missing values:   0
Column: BounceRates                         | type: float64 | missing values:   0
Column: ExitRates                           | type: float64 | missing values:   0
Column: PageValues                          | type: float64 | missing values:   0
Column: SpecialDay                          | type: float64 | missing values:   0
Column: Month                               | type: object  | missing values:   0
Column: OperatingSystems                    | type: int64   | missing values:   0
Column: Browser                             | type: int64   | missing values:   0
Column: Region                              | type: int64   | missing values:   0
Column: TrafficType                         | type: int64   | missing values:   0
Column: VisitorType                         | type: object  | missing values:   0
Column: Weekend                             | type: bool    | missing values:   0
Column: Revenue                             | type: bool    | missing values:   0
# Compute summary statistics for the numerical features (object and bool
# columns are left out), transposed so each feature is on its own row
dataset.describe().T
count mean std min 25% 50% 75% max
Administrative 12330.0 2.315166 3.321784 0.0 0.000000 1.000000 4.000000 27.000000
Administrative_Duration 12330.0 80.818611 176.779107 0.0 0.000000 7.500000 93.256250 3398.750000
Informational 12330.0 0.503569 1.270156 0.0 0.000000 0.000000 0.000000 24.000000
Informational_Duration 12330.0 34.472398 140.749294 0.0 0.000000 0.000000 0.000000 2549.375000
ProductRelated 12330.0 31.731468 44.475503 0.0 7.000000 18.000000 38.000000 705.000000
ProductRelated_Duration 12330.0 1194.746220 1913.669288 0.0 184.137500 598.936905 1464.157214 63973.522230
BounceRates 12330.0 0.022191 0.048488 0.0 0.000000 0.003112 0.016813 0.200000
ExitRates 12330.0 0.043073 0.048597 0.0 0.014286 0.025156 0.050000 0.200000
PageValues 12330.0 5.889258 18.568437 0.0 0.000000 0.000000 0.000000 361.763742
SpecialDay 12330.0 0.061427 0.198917 0.0 0.000000 0.000000 0.000000 1.000000
OperatingSystems 12330.0 2.124006 0.911325 1.0 2.000000 2.000000 3.000000 8.000000
Browser 12330.0 2.357097 1.717277 1.0 2.000000 2.000000 2.000000 13.000000
Region 12330.0 3.147364 2.401591 1.0 1.000000 3.000000 4.000000 9.000000
TrafficType 12330.0 4.069586 4.025169 1.0 2.000000 2.000000 4.000000 20.000000