Exploratory data analysis#
Uncovering underlying data structures.
Importing libraries and packages#
1# Mathematical operations and data manipulation
2import pandas as pd
3
4# Plotting
5import seaborn as sns
6import matplotlib.pyplot as plt
7
8# Warnings
9import warnings
10
11warnings.filterwarnings("ignore")
12
13%matplotlib inline
Set paths#
1# Path to datasets directory
2data_path = "./datasets"
3# Path to assets directory (for saving results to)
4assets_path = "./assets"
Loading dataset#
1# load data
2dataset = pd.read_csv(f"{data_path}/online_shoppers_intention.csv")
3dataset.head().T
0 | 1 | 2 | 3 | 4 | |
---|---|---|---|---|---|
Administrative | 0 | 0 | 0 | 0 | 0 |
Administrative_Duration | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
Informational | 0 | 0 | 0 | 0 | 0 |
Informational_Duration | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
ProductRelated | 1 | 2 | 1 | 2 | 10 |
ProductRelated_Duration | 0.0 | 64.0 | 0.0 | 2.666667 | 627.5 |
BounceRates | 0.2 | 0.0 | 0.2 | 0.05 | 0.02 |
ExitRates | 0.2 | 0.1 | 0.2 | 0.14 | 0.05 |
PageValues | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
SpecialDay | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
Month | Feb | Feb | Feb | Feb | Feb |
OperatingSystems | 1 | 2 | 4 | 3 | 3 |
Browser | 1 | 2 | 1 | 2 | 3 |
Region | 1 | 1 | 9 | 2 | 1 |
TrafficType | 1 | 2 | 3 | 4 | 4 |
VisitorType | Returning_Visitor | Returning_Visitor | Returning_Visitor | Returning_Visitor | Returning_Visitor |
Weekend | False | False | False | False | True |
Revenue | False | False | False | False | False |
Exploring dataset#
1# Printing dimensionality of the data, columns, types and missing values
2print(f"Data dimension: {dataset.shape}")
3for col in dataset.columns:
4 print(
5 f"Column: {col:35} | "
6 f"type: {str(dataset[col].dtype):7} | "
7 f"missing values: {dataset[col].isna().sum():3d}"
8 )
Data dimension: (12330, 18)
Column: Administrative | type: int64 | missing values: 0
Column: Administrative_Duration | type: float64 | missing values: 0
Column: Informational | type: int64 | missing values: 0
Column: Informational_Duration | type: float64 | missing values: 0
Column: ProductRelated | type: int64 | missing values: 0
Column: ProductRelated_Duration | type: float64 | missing values: 0
Column: BounceRates | type: float64 | missing values: 0
Column: ExitRates | type: float64 | missing values: 0
Column: PageValues | type: float64 | missing values: 0
Column: SpecialDay | type: float64 | missing values: 0
Column: Month | type: object | missing values: 0
Column: OperatingSystems | type: int64 | missing values: 0
Column: Browser | type: int64 | missing values: 0
Column: Region | type: int64 | missing values: 0
Column: TrafficType | type: int64 | missing values: 0
Column: VisitorType | type: object | missing values: 0
Column: Weekend | type: bool | missing values: 0
Column: Revenue | type: bool | missing values: 0
1# Computing statistics on numerical features
2dataset.describe().T
count | mean | std | min | 25% | 50% | 75% | max | |
---|---|---|---|---|---|---|---|---|
Administrative | 12330.0 | 2.315166 | 3.321784 | 0.0 | 0.000000 | 1.000000 | 4.000000 | 27.000000 |
Administrative_Duration | 12330.0 | 80.818611 | 176.779107 | 0.0 | 0.000000 | 7.500000 | 93.256250 | 3398.750000 |
Informational | 12330.0 | 0.503569 | 1.270156 | 0.0 | 0.000000 | 0.000000 | 0.000000 | 24.000000 |
Informational_Duration | 12330.0 | 34.472398 | 140.749294 | 0.0 | 0.000000 | 0.000000 | 0.000000 | 2549.375000 |
ProductRelated | 12330.0 | 31.731468 | 44.475503 | 0.0 | 7.000000 | 18.000000 | 38.000000 | 705.000000 |
ProductRelated_Duration | 12330.0 | 1194.746220 | 1913.669288 | 0.0 | 184.137500 | 598.936905 | 1464.157214 | 63973.522230 |
BounceRates | 12330.0 | 0.022191 | 0.048488 | 0.0 | 0.000000 | 0.003112 | 0.016813 | 0.200000 |
ExitRates | 12330.0 | 0.043073 | 0.048597 | 0.0 | 0.014286 | 0.025156 | 0.050000 | 0.200000 |
PageValues | 12330.0 | 5.889258 | 18.568437 | 0.0 | 0.000000 | 0.000000 | 0.000000 | 361.763742 |
SpecialDay | 12330.0 | 0.061427 | 0.198917 | 0.0 | 0.000000 | 0.000000 | 0.000000 | 1.000000 |
OperatingSystems | 12330.0 | 2.124006 | 0.911325 | 1.0 | 2.000000 | 2.000000 | 3.000000 | 8.000000 |
Browser | 12330.0 | 2.357097 | 1.717277 | 1.0 | 2.000000 | 2.000000 | 2.000000 | 13.000000 |
Region | 12330.0 | 3.147364 | 2.401591 | 1.0 | 1.000000 | 3.000000 | 4.000000 | 9.000000 |
TrafficType | 12330.0 | 4.069586 | 4.025169 | 1.0 | 2.000000 | 2.000000 | 4.000000 | 20.000000 |
Univariate analysis#
1# Baseline Conversion Rate from the Revenue Column
2sns.countplot(dataset["Revenue"])
3plt.title("Baseline Revenue conversion", fontsize=20)
4plt.show()
1print(dataset["Revenue"].value_counts())
2print()
3print(dataset["Revenue"].value_counts(normalize=True))
False 10422
True 1908
Name: Revenue, dtype: int64
False 0.845255
True 0.154745
Name: Revenue, dtype: float64
The baseline conversion rate is the number of online sessions that led to a purchase divided by the total number of sessions.
1print(1908 / 12330 * 100)
15.474452554744525
1# Visitor-Wise Distribution
2sns.countplot(dataset["VisitorType"])
3plt.title("Visitor Type wise Distribution", fontsize=20)
4plt.show()
1# Calculating the exact number of each visitor type
2print(dataset["VisitorType"].value_counts())
3print()
4print(dataset["VisitorType"].value_counts(normalize=True))
Returning_Visitor 10551
New_Visitor 1694
Other 85
Name: VisitorType, dtype: int64
Returning_Visitor 0.855718
New_Visitor 0.137388
Other 0.006894
Name: VisitorType, dtype: float64
Returning visitors (about 85.6% of sessions) far outnumber new visitors (about 13.7%).
1# Traffic-Wise Distribution
2sns.countplot(dataset["TrafficType"])
3plt.title("Traffic Type wise Distribution", fontsize=20)
4plt.show()
1print(dataset["TrafficType"].value_counts(normalize=True))
2 0.317356
1 0.198783
3 0.166423
4 0.086699
13 0.059854
10 0.036496
6 0.036010
8 0.027818
5 0.021087
11 0.020032
20 0.016058
9 0.003406
7 0.003244
15 0.003082
19 0.001379
14 0.001054
18 0.000811
16 0.000243
12 0.000081
17 0.000081
Name: TrafficType, dtype: float64
Sources 2, 1, 3, and 4 together account for about 77% of our web traffic.
1# Distribution of Customers Session on the Website
2sns.countplot(dataset["Weekend"])
3plt.title("Weekend Session Distribution", fontsize=20)
4plt.show()
1# Count of each subcategory in the weekend column
2print(dataset["Weekend"].value_counts())
3print()
4print(dataset["Weekend"].value_counts(normalize=True))
False 9462
True 2868
Name: Weekend, dtype: int64
False 0.767397
True 0.232603
Name: Weekend, dtype: float64
More sessions take place on weekdays (76.7%) than on weekend days (23.3%).
1# Region-Wise Distribution
2sns.countplot(dataset["Region"])
3plt.title("Region wise Distribution", fontsize=20)
4plt.show()
1print(dataset["Region"].value_counts())
2print()
3print(dataset["Region"].value_counts(normalize=True))
1 4780
3 2403
4 1182
2 1136
6 805
7 761
9 511
8 434
5 318
Name: Region, dtype: int64
1 0.387672
3 0.194891
4 0.095864
2 0.092133
6 0.065288
7 0.061719
9 0.041444
8 0.035199
5 0.025791
Name: Region, dtype: float64
Regions 1 and 3 together account for about 58% of online sessions.
1# Browser and OS Distribution of Customers
2sns.countplot(dataset["Browser"])
3plt.title("Browser wise session Distribution", fontsize=20)
4plt.show()
1print(dataset["Browser"].value_counts())
2print()
3print(dataset["Browser"].value_counts(normalize=True))
2 7961
1 2462
4 736
5 467
6 174
10 163
8 135
3 105
13 61
7 49
12 10
11 6
9 1
Name: Browser, dtype: int64
2 0.645661
1 0.199676
4 0.059692
5 0.037875
6 0.014112
10 0.013220
8 0.010949
3 0.008516
13 0.004947
7 0.003974
12 0.000811
11 0.000487
9 0.000081
Name: Browser, dtype: float64
1sns.countplot(dataset["OperatingSystems"])
2plt.title("OS wise session Distribution", fontsize=20)
3plt.show()
1print(dataset["OperatingSystems"].value_counts())
2print()
3print(dataset["OperatingSystems"].value_counts(normalize=True))
2 6601
1 2585
3 2555
4 478
8 79
6 19
7 7
5 6
Name: OperatingSystems, dtype: int64
2 0.535361
1 0.209651
3 0.207218
4 0.038767
8 0.006407
6 0.001541
7 0.000568
5 0.000487
Name: OperatingSystems, dtype: float64
1# Administrative Pageview Distribution
2sns.countplot(dataset["Administrative"])
3plt.title("Administrative Pageview Distribution", fontsize=16)
4plt.show()
The most frequent value is 0: sessions with no administrative pageviews form the single largest group.
1# Information Pageview Distribution
2sns.countplot(dataset["Informational"])
3plt.title("Information Pageview Distribution", fontsize=16)
4plt.show()
1# Percentage count for each information page
2print(dataset["Informational"].value_counts(normalize=True))
0 0.786618
1 0.084428
2 0.059043
3 0.030819
4 0.018005
5 0.008029
6 0.006326
7 0.002920
9 0.001217
8 0.001135
10 0.000568
12 0.000406
14 0.000162
16 0.000081
11 0.000081
24 0.000081
13 0.000081
Name: Informational, dtype: float64
About 79% of sessions include no informational pageviews at all, and about 87% include at most one.
1# Special Day Session Distribution
2sns.countplot(dataset["SpecialDay"])
3plt.title("Special Day session Distribution", fontsize=16)
4plt.show()
1# Percentage distribution for special days
2print(dataset["SpecialDay"].value_counts(normalize=True))
0.0 0.898540
0.6 0.028467
0.8 0.026358
0.4 0.019708
0.2 0.014436
1.0 0.012490
Name: SpecialDay, dtype: float64
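89.9% of sessions took place on a non-special day (SpecialDay subcategory 0), which suggests that special days bring in comparatively little traffic. This share alone does not show how well special days convert; a conversion-rate comparison would be needed to judge that.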
Bivariate analysis#
1# Revenue Versus Visitor Type
2g = sns.catplot(
3 "Revenue",
4 col="VisitorType",
5 col_wrap=3,
6 data=dataset,
7 kind="count",
8 height=5,
9 aspect=1,
10)
11plt.show()
In absolute numbers, more revenue conversions come from returning visitors than from new visitors; note that returning visitors also make up the large majority of sessions.
1# Revenue Versus Traffic Type
2sns.countplot(x="TrafficType", hue="Revenue", data=dataset)
3plt.legend(loc="right")
4plt.show()
Most revenue conversions come from web traffic generated by source 2.
Relationship between Revenue and Other Variables#
1sns.countplot(x="Region", hue="Revenue", data=dataset)
2plt.show()
Region 1 accounts for most sales, and region 3 the second most.
1sns.countplot(x="Browser", hue="Revenue", data=dataset)
2plt.show()
More revenue-generating transactions have been performed from Browser 2.
1sns.countplot(x="OperatingSystems", hue="Revenue", data=dataset)
2plt.show()
More revenue-generating transactions happened with OS 2 than the other types.
1sns.countplot(
2 x="Month",
3 hue="Revenue",
4 data=dataset,
5 order=[
6 "Feb",
7 "Mar",
8 "May",
9 "June",
10 "Jul",
11 "Aug",
12 "Sep",
13 "Oct",
14 "Nov",
15 "Dec",
16 ],
17)
18plt.show()
The greatest number of purchases were made in November.
Linear relationships#
1# Bounce Rate versus Exit Rate
2sns.set(style="whitegrid")
3ax = sns.lmplot(x="BounceRates", y="ExitRates", data=dataset)
There is a positive correlation between the bounce rate and the exit rate. With the increase in bounce rate, the exit rate of the page increases.
1# Page Value versus Bounce Rate
2sns.set(style="whitegrid")
3ax = sns.lmplot(x="PageValues", y="BounceRates", data=dataset)
There is a negative correlation between page value and bounce rate. As the page value increases, the bounce rate decreases.
1# Page Value versus Exit Rate
2sns.set(style="whitegrid")
3ax = sns.lmplot(x="PageValues", y="ExitRates", data=dataset)
There is a negative correlation between page value and exit rate: web pages with a higher page value have a lower exit rate.
1# Impact of Administration Page Views and
2# Administrative Pageview Duration on Revenue
3sns.set(style="whitegrid")
4ax = sns.lmplot(
5 x="Administrative",
6 y="Administrative_Duration",
7 hue="Revenue",
8 data=dataset,
9)
Administrative-related pageviews and the administrative-related pageview duration are positively correlated. When there is an increase in the number of administrative pageviews, the administrative pageview duration also increases.
1# Impact of Information Page Views and Information Pageview Duration on Revenue
2sns.set(style="whitegrid")
3ax = sns.lmplot(
4 x="Informational", y="Informational_Duration", hue="Revenue", data=dataset
5)
Informational pageviews and informational pageview duration are positively correlated. Customers who made online purchases visited fewer informational pages. This implies that informational pageviews do not have much effect on revenue generation.