Clustering#

Importing libraries and packages#

 1# Mathematical operations and data manipulation
 2import pandas as pd
 3
 4# Plotting
 5import matplotlib.pyplot as plt
 6
 7# Modelling
 8from sklearn.cluster import KMeans
 9
10# Warnings
11import warnings
12
13warnings.filterwarnings("ignore")
14
15%matplotlib inline

Set paths#

# Relative directories used by the notebook: input datasets are read from the
# first, generated results are saved to the second.
data_path, assets_path = "./datasets", "./assets"

Loading dataset#

# Read the online-shoppers CSV from the datasets directory, then preview the
# first rows transposed so that every feature is shown on its own line.
csv_file = f"{data_path}/online_shoppers_intention.csv"
dataset = pd.read_csv(csv_file)
dataset.head().transpose()
0 1 2 3 4
Administrative 0 0 0 0 0
Administrative_Duration 0.0 0.0 0.0 0.0 0.0
Informational 0 0 0 0 0
Informational_Duration 0.0 0.0 0.0 0.0 0.0
ProductRelated 1 2 1 2 10
ProductRelated_Duration 0.0 64.0 0.0 2.666667 627.5
BounceRates 0.2 0.0 0.2 0.05 0.02
ExitRates 0.2 0.1 0.2 0.14 0.05
PageValues 0.0 0.0 0.0 0.0 0.0
SpecialDay 0.0 0.0 0.0 0.0 0.0
Month Feb Feb Feb Feb Feb
OperatingSystems 1 2 4 3 3
Browser 1 2 1 2 3
Region 1 1 9 2 1
TrafficType 1 2 3 4 4
VisitorType Returning_Visitor Returning_Visitor Returning_Visitor Returning_Visitor Returning_Visitor
Weekend False False False False True
Revenue False False False False False

Exploring dataset#

# Report the shape of the data, then one line per column giving its dtype
# and its number of missing values.
print(f"Data dimension: {dataset.shape}")
missing_counts = dataset.isna().sum()  # missing values per column, computed once
for col, dtype in dataset.dtypes.items():
    print(
        f"Column: {col:35} | "
        f"type: {str(dtype):7} | "
        f"missing values: {missing_counts[col]:3d}"
    )
Data dimension: (12330, 18)
Column: Administrative                      | type: int64   | missing values:   0
Column: Administrative_Duration             | type: float64 | missing values:   0
Column: Informational                       | type: int64   | missing values:   0
Column: Informational_Duration              | type: float64 | missing values:   0
Column: ProductRelated                      | type: int64   | missing values:   0
Column: ProductRelated_Duration             | type: float64 | missing values:   0
Column: BounceRates                         | type: float64 | missing values:   0
Column: ExitRates                           | type: float64 | missing values:   0
Column: PageValues                          | type: float64 | missing values:   0
Column: SpecialDay                          | type: float64 | missing values:   0
Column: Month                               | type: object  | missing values:   0
Column: OperatingSystems                    | type: int64   | missing values:   0
Column: Browser                             | type: int64   | missing values:   0
Column: Region                              | type: int64   | missing values:   0
Column: TrafficType                         | type: int64   | missing values:   0
Column: VisitorType                         | type: object  | missing values:   0
Column: Weekend                             | type: bool    | missing values:   0
Column: Revenue                             | type: bool    | missing values:   0
# Summary statistics of the numerical features, shown one feature per row
numeric_summary = dataset.describe()
numeric_summary.transpose()
count mean std min 25% 50% 75% max
Administrative 12330.0 2.315166 3.321784 0.0 0.000000 1.000000 4.000000 27.000000
Administrative_Duration 12330.0 80.818611 176.779107 0.0 0.000000 7.500000 93.256250 3398.750000
Informational 12330.0 0.503569 1.270156 0.0 0.000000 0.000000 0.000000 24.000000
Informational_Duration 12330.0 34.472398 140.749294 0.0 0.000000 0.000000 0.000000 2549.375000
ProductRelated 12330.0 31.731468 44.475503 0.0 7.000000 18.000000 38.000000 705.000000
ProductRelated_Duration 12330.0 1194.746220 1913.669288 0.0 184.137500 598.936905 1464.157214 63973.522230
BounceRates 12330.0 0.022191 0.048488 0.0 0.000000 0.003112 0.016813 0.200000
ExitRates 12330.0 0.043073 0.048597 0.0 0.014286 0.025156 0.050000 0.200000
PageValues 12330.0 5.889258 18.568437 0.0 0.000000 0.000000 0.000000 361.763742
SpecialDay 12330.0 0.061427 0.198917 0.0 0.000000 0.000000 0.000000 1.000000
OperatingSystems 12330.0 2.124006 0.911325 1.0 2.000000 2.000000 3.000000 8.000000
Browser 12330.0 2.357097 1.717277 1.0 2.000000 2.000000 2.000000 13.000000
Region 12330.0 3.147364 2.401591 1.0 1.000000 3.000000 4.000000 9.000000
TrafficType 12330.0 4.069586 4.025169 1.0 2.000000 2.000000 4.000000 20.000000

Clustering#

K-means Clustering for Informational Duration versus Bounce Rate#

 1x = dataset.iloc[:, [3, 6]].values
 2
 3wcss = []
 4for i in range(1, 11):
 5    km = KMeans(
 6        n_clusters=i,
 7        init="k-means++",
 8        max_iter=300,
 9        n_init=10,
10        random_state=0,
11        algorithm="elkan",
12        tol=0.001,
13    )
14    km.fit(x)
15    labels = km.labels_
16    wcss.append(km.inertia_)
17
18plt.rcParams["figure.figsize"] = (15, 7)
19plt.plot(range(1, 11), wcss)
20plt.grid()
21plt.tight_layout()
22plt.title("The Elbow Method", fontsize=20)
23plt.xlabel("No. of Clusters")
24plt.ylabel("wcss")
25plt.show()
../../_images/96a266bf0e81432572f5c267a35cd7dcdb9d9a78997b6c5c8f9e064eab965e6c.png

Based on the elbow plot, k=2 is the optimum number of clusters.

 1km = KMeans(
 2    n_clusters=2, init="k-means++", max_iter=300, n_init=10, random_state=0
 3)
 4y_means = km.fit_predict(x)
 5
 6plt.scatter(
 7    x[y_means == 0, 0],
 8    x[y_means == 0, 1],
 9    s=100,
10    c="pink",
11    label="Un-interested Customers",
12)
13plt.scatter(
14    x[y_means == 1, 0],
15    x[y_means == 1, 1],
16    s=100,
17    c="yellow",
18    label="Target Customers",
19)
20plt.scatter(
21    km.cluster_centers_[:, 0],
22    km.cluster_centers_[:, 1],
23    s=50,
24    c="blue",
25    label="centeroid",
26)
27
28plt.title("Informational Duration vs Bounce Rates", fontsize=20)
29plt.grid()
30plt.xlabel("Informational Duration")
31plt.ylabel("Bounce Rates")
32plt.legend()
33plt.show()
../../_images/e50db88db8fa5fffdd2cf891d75fa6277c531ea16c1a9642186d77f101779323.png

Target customers spend around 850-900 seconds on average on Informational pages.

K-means Clustering for Informational Duration versus Exit Rate#

 1x = dataset.iloc[:, [4, 7]].values
 2
 3wcss = []
 4for i in range(1, 11):
 5    km = KMeans(
 6        n_clusters=i,
 7        init="k-means++",
 8        max_iter=300,
 9        n_init=10,
10        random_state=0,
11        algorithm="elkan",
12        tol=0.001,
13    )
14    km.fit(x)
15    labels = km.labels_
16    wcss.append(km.inertia_)
17
18plt.rcParams["figure.figsize"] = (15, 7)
19plt.plot(range(1, 11), wcss)
20plt.grid()
21plt.tight_layout()
22plt.title("The Elbow Method", fontsize=20)
23plt.xlabel("No. of Clusters")
24plt.ylabel("wcss")
25plt.show()
../../_images/70fa4a17fe4b3f579140102df3fb9d027f1203c880d8e629b479a057af65a328.png
 1km = KMeans(
 2    n_clusters=2, init="k-means++", max_iter=300, n_init=10, random_state=0
 3)
 4y_means = km.fit_predict(x)
 5
 6plt.scatter(
 7    x[y_means == 0, 0],
 8    x[y_means == 0, 1],
 9    s=100,
10    c="pink",
11    label="Un-interested Customers",
12)
13plt.scatter(
14    x[y_means == 1, 0],
15    x[y_means == 1, 1],
16    s=100,
17    c="yellow",
18    label="Target Customers",
19)
20plt.scatter(
21    km.cluster_centers_[:, 0],
22    km.cluster_centers_[:, 1],
23    s=50,
24    c="blue",
25    label="centeroid",
26)
27
28plt.title("Informational Duration vs Exit Rates", fontsize=20)
29plt.grid()
30plt.xlabel("Informational Duration")
31plt.ylabel("Exit Rates")
32plt.legend()
33plt.show()
../../_images/6fb46db11136a1ae5839ae950528e29e899c2698d2145cbddd67b5053ac55625.png

K-means Clustering for Administrative Duration versus Bounce Rate and Administrative Duration versus Exit Rate#

# Administrative duration vs Bounce Rate
# Positions 1 and 6 are the Administrative_Duration and BounceRates columns
feature_positions = [1, 6]
x = dataset.iloc[:, feature_positions].values
x.shape
(12330, 2)
 1wcss = []
 2for i in range(1, 11):
 3    km = KMeans(
 4        n_clusters=i,
 5        init="k-means++",
 6        max_iter=300,
 7        n_init=10,
 8        random_state=0,
 9        algorithm="elkan",
10        tol=0.001,
11    )
12    km.fit(x)
13    labels = km.labels_
14    wcss.append(km.inertia_)
15
16plt.rcParams["figure.figsize"] = (15, 7)
17plt.plot(range(1, 11), wcss)
18plt.grid()
19plt.tight_layout()
20plt.title("The Elbow Method", fontsize=20)
21plt.xlabel("No. of Clusters")
22plt.ylabel("wcss")
23plt.show()
../../_images/a22b4a9924933f31854079f1d4adcda02e538a82c68fbb6072b9c73c56bfd0c9.png
 1km = KMeans(
 2    n_clusters=2, init="k-means++", max_iter=300, n_init=10, random_state=0
 3)
 4y_means = km.fit_predict(x)
 5
 6plt.scatter(
 7    x[y_means == 0, 0],
 8    x[y_means == 0, 1],
 9    s=100,
10    c="pink",
11    label="Un-interested Customers",
12)
13plt.scatter(
14    x[y_means == 1, 0],
15    x[y_means == 1, 1],
16    s=100,
17    c="cyan",
18    label="Target Customers",
19)
20plt.scatter(
21    km.cluster_centers_[:, 0],
22    km.cluster_centers_[:, 1],
23    s=50,
24    c="blue",
25    label="centeroid",
26)
27
28plt.title("Administrative Duration vs Bounce Rate", fontsize=20)
29plt.grid()
30plt.xlabel("Administrative Duration")
31plt.ylabel("Bounce Rates")
32plt.legend()
33plt.show()
../../_images/075d68d25769bfc5b016489518eccc89b8399582204c3ce8ab69c724f0b22ff6.png
 1# Administrative duration vs Exit Rate
 2x = dataset.iloc[:, [1, 7]].values
 3
 4wcss = []
 5for i in range(1, 11):
 6    km = KMeans(
 7        n_clusters=i,
 8        init="k-means++",
 9        max_iter=300,
10        n_init=10,
11        random_state=0,
12        algorithm="elkan",
13        tol=0.001,
14    )
15    km.fit(x)
16    labels = km.labels_
17    wcss.append(km.inertia_)
18
19plt.rcParams["figure.figsize"] = (15, 7)
20plt.plot(range(1, 11), wcss)
21plt.grid()
22plt.tight_layout()
23plt.title("The Elbow Method", fontsize=20)
24plt.xlabel("No. of Clusters")
25plt.ylabel("wcss")
26plt.show()
../../_images/a22b4a9924933f31854079f1d4adcda02e538a82c68fbb6072b9c73c56bfd0c9.png
 1km = KMeans(
 2    n_clusters=2, init="k-means++", max_iter=300, n_init=10, random_state=0
 3)
 4y_means = km.fit_predict(x)
 5
 6plt.scatter(
 7    x[y_means == 0, 0],
 8    x[y_means == 0, 1],
 9    s=100,
10    c="pink",
11    label="Un-interested Customers",
12)
13plt.scatter(
14    x[y_means == 1, 0],
15    x[y_means == 1, 1],
16    s=100,
17    c="yellow",
18    label="Target Customers",
19)
20plt.scatter(
21    km.cluster_centers_[:, 0],
22    km.cluster_centers_[:, 1],
23    s=50,
24    c="blue",
25    label="centeroid",
26)
27
28plt.title("Administrative Duration vs Exit Rates", fontsize=20)
29plt.grid()
30plt.xlabel("Administrative Duration")
31plt.ylabel("Exit Rates")
32plt.legend()
33plt.show()
../../_images/7e821f451b49793f961aebef50562e46faa8cf431b98ad82d696398c9c146269.png

Uninterested customers spend less time on administrative pages than the target customers, who spend around 750 seconds on administrative pages before exiting.

  • The conversion rates of new visitors are high compared to those of returning customers.

  • While the number of returning customers to the website is high, the conversion rate is low compared to that of new customers.

  • Pages with a high page value have a lower bounce rate. We should talk with our tech team to find ways to improve the page value of the web pages.