Data analysis#

  • Which customers placed the most and fewest orders?

  • Which customers spent the most and least money?

  • Which months were the most and least popular for this online retail store?

  • Which dates of the month were the most and least popular for this online retail store?

  • Which days were the most and least popular for this online retail store?

  • Which hours of the day were most and least popular for this online retail store?

  • Which items were ordered the most and least?

  • Which countries placed the most and fewest orders?

  • Which countries spent the most and least money?

Importing libraries and packages#

 1# Mathematical operations and data manipulation
 2import pandas as pd
 3
 4# Visualisation
 5import matplotlib.pyplot as plt
 6
 7# Warnings
 8import warnings
 9
10warnings.filterwarnings("ignore")
11
12%matplotlib inline

Set paths#

1# Path to datasets directory
2data_path = "./datasets"
3# Path to assets directory (for saving results to)
4assets_path = "./assets"

Loading dataset#

1# load data
2dataset = pd.read_csv(f"{data_path}/engineered_retail.csv")
3dataset.head().T
0 1 2 3 4
invoice 489434 489434 489434 489434 489434
country United Kingdom United Kingdom United Kingdom United Kingdom United Kingdom
cust_id 13085.0 13085.0 13085.0 13085.0 13085.0
stock_code 85048 79323P 79323W 22041 21232
desc 15cm christmas glass ball 20 lights pink cherry lights white cherry lights record frame 7" single size strawberry ceramic trinket box
quantity 12 12 12 48 24
unit_price 6.95 6.75 6.75 2.1 1.25
date 2009-12-01 07:45:00 2009-12-01 07:45:00 2009-12-01 07:45:00 2009-12-01 07:45:00 2009-12-01 07:45:00
spent 83.4 81.0 81.0 100.8 30.0
year_month 200912 200912 200912 200912 200912
year 2009 2009 2009 2009 2009
month 12 12 12 12 12
day 1 1 1 1 1
day_of_week 2 2 2 2 2
hour 7 7 7 7 7

Data analysis#

Which customers placed the most and fewest orders?

1# Orders by each customer
2ord_cust = dataset.groupby(by=["cust_id", "country"], as_index=False)[
3    "invoice"
4].count()
5ord_cust.head(10)
cust_id country invoice
0 12346.0 United Kingdom 33
1 12347.0 Iceland 71
2 12348.0 Finland 20
3 12349.0 Italy 102
4 12351.0 Unspecified 21
5 12352.0 Norway 18
6 12353.0 Bahrain 20
7 12355.0 Bahrain 22
8 12356.0 Portugal 84
9 12357.0 Switzerland 165
1plt.subplots(figsize=(15, 6))
2oc = plt.plot(ord_cust.cust_id, ord_cust.invoice)
3plt.xlabel("Customer ID")
4plt.ylabel("Number of Orders")
5plt.title("Number of Orders made by Customers")
6plt.show()
../../_images/d9801289e38b7d1ff69464407eac95cf58d90dd24c2638da1738fb40d11dfc2f.png
1# The top 5 customers
2ord_cust.describe()
cust_id invoice
count 4315.000000 4315.000000
mean 15346.442642 94.474623
std 1702.986420 201.977000
min 12346.000000 1.000000
25% 13878.500000 18.000000
50% 15346.000000 44.000000
75% 16833.500000 102.000000
max 18287.000000 5570.000000
1# 5 customers who ordered the most often
2ord_cust.sort_values(by="invoice", ascending=False).head()
cust_id country invoice
1844 14911.0 EIRE 5570
3992 17841.0 United Kingdom 5043
1610 14606.0 United Kingdom 3866
1273 14156.0 EIRE 2648
256 12748.0 United Kingdom 2633
1# Who placed the fewest orders
2ord_cust.sort_values(by="invoice", ascending=False).tail()
cust_id country invoice
1233 14095.0 United Kingdom 1
1239 14106.0 United Kingdom 1
2752 16165.0 United Kingdom 1
3655 17378.0 United Kingdom 1
1427 14366.0 United Kingdom 1

Which customers spent the most and least money on an item?

1spent_cust = dataset.groupby(
2    by=["cust_id", "country", "quantity", "unit_price"], as_index=False
3)["spent"].sum()
4spent_cust.head()
cust_id country quantity unit_price spent
0 12346.0 United Kingdom 1 1.00 1.00
1 12346.0 United Kingdom 1 3.25 3.25
2 12346.0 United Kingdom 1 5.95 23.80
3 12346.0 United Kingdom 1 7.49 142.31
4 12346.0 United Kingdom 5 4.50 157.50
1plt.subplots(figsize=(15, 6))
2sc = plt.plot(spent_cust.cust_id, spent_cust.spent)
3plt.xlabel("Customer ID")
4plt.ylabel("Total Amount Spent")
5plt.title("Amount Spent by Customers")
6plt.show()
../../_images/d820b3fd20a542376dd96f86a4ad73cf0aef8efa17581e86b77824b269719bb1.png
1# Spent the most
2spent_cust.sort_values(by="spent", ascending=False).head()
cust_id country quantity unit_price spent
144871 18102.0 United Kingdom 300 4.58 43968.0
144915 18102.0 United Kingdom 600 3.00 18000.0
82744 15769.0 United Kingdom 200 1.65 17490.0
84312 15838.0 United Kingdom 9360 1.69 15818.4
144912 18102.0 United Kingdom 576 3.00 13824.0
1# Spent the least
2spent_cust.sort_values(by="spent", ascending=False).tail()
cust_id country quantity unit_price spent
21568 13317.0 United Kingdom 1 0.001 0.001
61105 14857.0 United Kingdom 1 0.001 0.001
48967 14459.0 United Kingdom 1 0.001 0.001
6111 12671.0 Germany 1 0.001 0.001
32451 13765.0 United Kingdom 1 0.001 0.001

Which months were the most and least popular for this online retail store?

1ord_month = (
2    dataset.groupby(["invoice"])["year_month"]
3    .unique()
4    .value_counts()
5    .sort_index()
6)
7ord_month
[200912]    1512
[201001]    1010
[201002]    1104
[201003]    1521
[201004]    1329
[201005]    1377
[201006]    1497
[201007]    1381
[201008]    1293
[201009]    1688
[201010]    2133
[201011]    2587
[201012]     776
Name: year_month, dtype: int64
 1om = ord_month.plot(kind="bar", figsize=(15, 6))
 2om.set_xlabel("Month")
 3om.set_ylabel("Number of Orders")
 4om.set_title("Orders per Month")
 5om.set_xticklabels(
 6    (
 7        "Dec 09",
 8        "Jan 10",
 9        "Feb 10",
10        "Mar 10",
11        "Apr 10",
12        "May 10",
13        "Jun 10",
14        "Jul 10",
15        "Aug 10",
16        "Sep 10",
17        "Oct 10",
18        "Nov 10",
19        "Dec 10",
20    ),
21    rotation="horizontal",
22)
23plt.show()
../../_images/d5c673b25391f14c3fdc43cda8dad1ced0415de0f03c04a7da67c1117361ff07.png

Which dates of the month were the most and least popular for this online retail store?

1ord_day = (
2    dataset.groupby("invoice")["day"].unique().value_counts().sort_index()
3)
4ord_day
[1]     708
[2]     696
[3]     610
[4]     595
[5]     661
[6]     572
[7]     812
[8]     827
[9]     689
[10]    609
[11]    655
[12]    576
[13]    512
[14]    634
[15]    732
[16]    617
[17]    600
[18]    687
[19]    601
[20]    506
[21]    649
[22]    636
[23]    573
[24]    602
[25]    667
[26]    672
[27]    517
[28]    671
[29]    614
[30]    457
[31]    251
Name: day, dtype: int64
1od = ord_day.plot(kind="bar", figsize=(15, 6))
2od.set_xlabel("Day of the Month")
3od.set_ylabel("Number of Orders")
4od.set_title("Orders per Day of the Month")
5od.set_xticklabels(labels=[i for i in range(1, 32)], rotation="horizontal")
6plt.show()
../../_images/b44962e3ea604523e1ab04f70a06ece9e9d98618ace45dfd27c067025cdc090b.png

Which days were the most and least popular for this online retail store?

1ord_dayofweek = (
2    dataset.groupby("invoice")["day_of_week"]
3    .unique()
4    .value_counts()
5    .sort_index()
6)
7ord_dayofweek
[1]    2985
[2]    3513
[3]    3426
[4]    3976
[5]    2612
[6]      30
[7]    2666
Name: day_of_week, dtype: int64
1odw = ord_dayofweek.plot(kind="bar", figsize=(15, 6))
2odw.set_xlabel("Day of the Week")
3odw.set_ylabel("Number of Orders")
4odw.set_title("Orders per Day of the Week")
5odw.set_xticklabels(
6    labels=["Mon", "Tues", "Wed", "Thurs", "Fri", "Sat", "Sun"],
7    rotation="horizontal",
8)
9plt.show()
../../_images/bdacd643799a32ec28d30427347a03c9c733d7f39885280ce0d23c1f3b024a96.png

Which hours of the day were most and least popular for this online retail store?

1ord_hour = (
2    dataset.groupby(by=["invoice"])["hour"]
3    .unique()
4    .value_counts()
5    .sort_index()
6)
7ord_hour
[7]       49
[8]      444
[9]     1209
[10]    2232
[11]    2407
[12]    3173
[13]    2891
[14]    2365
[15]    2061
[16]    1263
[17]     637
[18]     258
[19]     185
[20]      34
Name: hour, dtype: int64
1oh = ord_hour.plot(kind="bar", figsize=(15, 6))
2oh.set_xlabel("Hour of the Day")
3oh.set_ylabel("Number of Orders")
4oh.set_title("Orders per Hour of the Day")
5oh.set_xticklabels(labels=[i for i in range(7, 21)], rotation="horizontal")
6plt.show()
../../_images/ec503ad772a1e569436b9370d5531120288e38a38199d6821050ef2ea1fabcb4.png

Which items were ordered the most and least?

1q_item = dataset.groupby(by=["desc"], as_index=False)["quantity"].sum()
2q_item.head()
desc quantity
0 doormat union jack guns and roses 169
1 3 stripey mice feltcraft 663
2 4 purple flock dinner candles 200
3 animal stickers 385
4 bank charges 2
1q_item.sort_values(by="quantity", ascending=False).head()
desc quantity
4260 white hanging heart t-light holder 56915
4366 world war 2 gliders asstd designs 54754
691 brocade ring purse 48166
2632 pack of 72 retro spot cake cases 45156
262 assorted colour bird ornament 44551
1q_item.sort_values(by="quantity", ascending=False).tail()
desc quantity
2544 opal white/silver flower necklace 1
1789 green chenille shaggy c/cover 1
2337 midnight blue crystal drop earrings 1
3728 silicon cube 25w, blue 1
1381 f.fairy s/3 sml candle, lavender 1

Which countries placed the most and fewest orders?

1ord_coun = dataset.groupby(["country"])["invoice"].count().sort_values()
2ord_coun.head()
country
Nigeria        30
Bahrain        42
Korea          53
West Indies    54
Brazil         62
Name: invoice, dtype: int64
1ocoun = ord_coun.plot(kind="barh", figsize=(15, 6))
2ocoun.set_xlabel("Number of Orders")
3ocoun.set_ylabel("Country")
4ocoun.set_title("Orders per Country")
5plt.show()
../../_images/c98781b63f6a5ba33364557aefe3e9bfaa02bc5e7583addf1eec19af1c358289.png
1del ord_coun["United Kingdom"]
2
3ocoun2 = ord_coun.plot(kind="barh", figsize=(15, 6))
4ocoun2.set_xlabel("Number of Orders")
5ocoun2.set_ylabel("Country")
6ocoun2.set_title("Orders per Country")
7plt.show()
../../_images/23f7ac81a82ad873b05387957f06c5094a48eb95127f5984856b608d38583bfc.png

Which countries spent the most and least money?

1coun_spent = dataset.groupby("country")["spent"].sum().sort_values()
2
3cs = coun_spent.plot(kind="barh", figsize=(15, 6))
4cs.set_xlabel("Amount Spent")
5cs.set_ylabel("Country")
6cs.set_title("Amount Spent per Country")
7plt.show()
../../_images/00ea5719ed43f753ea0369f5be1708bc04fb7b961f11a7aac12b43f6b65995ff.png
1del coun_spent["United Kingdom"]
2
3cs2 = coun_spent.plot(kind="barh", figsize=(15, 6))
4cs2.set_xlabel("Amount Spent")
5cs2.set_ylabel("Country")
6cs2.set_title("Amount Spent per Country")
7plt.show()
../../_images/05a3347272ea57c9f7875dfc338be5bcff02cb358087a17b5c890ae454486bd2.png