Data preparation and feature engineering#

Modifying already existing features and transforming them into new features.

Importing libraries and packages#

1# Mathematical operations and data manipulation
2import pandas as pd
3
4# Warnings
5import warnings
6
7warnings.filterwarnings("ignore")
8
9%matplotlib inline

Set paths#

1# Path to datasets directory
2data_path = "./datasets"
3# Path to assets directory (for saving results to)
4assets_path = "./assets"

Loading dataset#

1# load data
2dataset = pd.read_csv(f"{data_path}/cleaned_retail.csv")
3dataset.head().T
0 1 2 3 4
invoice 489434 489434 489434 489434 489434
stock_code 85048 79323P 79323W 22041 21232
desc 15CM CHRISTMAS GLASS BALL 20 LIGHTS PINK CHERRY LIGHTS WHITE CHERRY LIGHTS RECORD FRAME 7" SINGLE SIZE STRAWBERRY CERAMIC TRINKET BOX
quantity 12 12 12 48 24
date 01/12/2009 07:45 01/12/2009 07:45 01/12/2009 07:45 01/12/2009 07:45 01/12/2009 07:45
unit_price 6.95 6.75 6.75 2.1 1.25
cust_id 13085.0 13085.0 13085.0 13085.0 13085.0
country United Kingdom United Kingdom United Kingdom United Kingdom United Kingdom

Data preparation and feature engineering#

1dataset.desc = dataset.desc.str.lower()
2dataset.head().T
0 1 2 3 4
invoice 489434 489434 489434 489434 489434
stock_code 85048 79323P 79323W 22041 21232
desc 15cm christmas glass ball 20 lights pink cherry lights white cherry lights record frame 7" single size strawberry ceramic trinket box
quantity 12 12 12 48 24
date 01/12/2009 07:45 01/12/2009 07:45 01/12/2009 07:45 01/12/2009 07:45 01/12/2009 07:45
unit_price 6.95 6.75 6.75 2.1 1.25
cust_id 13085.0 13085.0 13085.0 13085.0 13085.0
country United Kingdom United Kingdom United Kingdom United Kingdom United Kingdom
1dataset["date"] = pd.to_datetime(dataset.date, format="%d/%m/%Y %H:%M")
2dataset.head()
invoice stock_code desc quantity date unit_price cust_id country
0 489434 85048 15cm christmas glass ball 20 lights 12 2009-12-01 07:45:00 6.95 13085.0 United Kingdom
1 489434 79323P pink cherry lights 12 2009-12-01 07:45:00 6.75 13085.0 United Kingdom
2 489434 79323W white cherry lights 12 2009-12-01 07:45:00 6.75 13085.0 United Kingdom
3 489434 22041 record frame 7" single size 48 2009-12-01 07:45:00 2.10 13085.0 United Kingdom
4 489434 21232 strawberry ceramic trinket box 24 2009-12-01 07:45:00 1.25 13085.0 United Kingdom

Expanding the date column to create six new features that will help in analysing the dataset better.

 1# Add the first column at location 4, and increment the location
 2# by 1 with each new column. The first column is year_month and
 3# the values are the year and the month put together.
 4dataset.insert(
 5    loc=4,
 6    column="year_month",
 7    value=dataset.date.map(lambda x: 100 * x.year + x.month),
 8)
 9dataset.insert(loc=5, column="year", value=dataset.date.dt.year)
10dataset.insert(loc=6, column="month", value=dataset.date.dt.month)
11dataset.insert(loc=7, column="day", value=dataset.date.dt.day)
12dataset.insert(loc=8, column="hour", value=dataset.date.dt.hour)
13# Adding 1 to the day of the week so that the days are numbered
14# 1-7 instead of 0-6
15dataset.insert(
16    loc=9, column="day_of_week", value=dataset.date.dt.dayofweek + 1
17)
18dataset.head().T
0 1 2 3 4
invoice 489434 489434 489434 489434 489434
stock_code 85048 79323P 79323W 22041 21232
desc 15cm christmas glass ball 20 lights pink cherry lights white cherry lights record frame 7" single size strawberry ceramic trinket box
quantity 12 12 12 48 24
year_month 200912 200912 200912 200912 200912
year 2009 2009 2009 2009 2009
month 12 12 12 12 12
day 1 1 1 1 1
hour 7 7 7 7 7
day_of_week 2 2 2 2 2
date 2009-12-01 07:45:00 2009-12-01 07:45:00 2009-12-01 07:45:00 2009-12-01 07:45:00 2009-12-01 07:45:00
unit_price 6.95 6.75 6.75 2.1 1.25
cust_id 13085.0 13085.0 13085.0 13085.0 13085.0
country United Kingdom United Kingdom United Kingdom United Kingdom United Kingdom

The price of an item is given by unit_price, but that doesn’t give the total amount spent in a particular instance.

1dataset.insert(
2    loc=11, column="spent", value=(dataset["quantity"] * dataset["unit_price"])
3)
4dataset.head().T
0 1 2 3 4
invoice 489434 489434 489434 489434 489434
stock_code 85048 79323P 79323W 22041 21232
desc 15cm christmas glass ball 20 lights pink cherry lights white cherry lights record frame 7" single size strawberry ceramic trinket box
quantity 12 12 12 48 24
year_month 200912 200912 200912 200912 200912
year 2009 2009 2009 2009 2009
month 12 12 12 12 12
day 1 1 1 1 1
hour 7 7 7 7 7
day_of_week 2 2 2 2 2
date 2009-12-01 07:45:00 2009-12-01 07:45:00 2009-12-01 07:45:00 2009-12-01 07:45:00 2009-12-01 07:45:00
spent 83.4 81.0 81.0 100.8 30.0
unit_price 6.95 6.75 6.75 2.1 1.25
cust_id 13085.0 13085.0 13085.0 13085.0 13085.0
country United Kingdom United Kingdom United Kingdom United Kingdom United Kingdom
 1# Repositioning the columns for readability
 2dataset = dataset[
 3    [
 4        "invoice",
 5        "country",
 6        "cust_id",
 7        "stock_code",
 8        "desc",
 9        "quantity",
10        "unit_price",
11        "date",
12        "spent",
13        "year_month",
14        "year",
15        "month",
16        "day",
17        "day_of_week",
18        "hour",
19    ]
20]
21dataset.head().T
0 1 2 3 4
invoice 489434 489434 489434 489434 489434
country United Kingdom United Kingdom United Kingdom United Kingdom United Kingdom
cust_id 13085.0 13085.0 13085.0 13085.0 13085.0
stock_code 85048 79323P 79323W 22041 21232
desc 15cm christmas glass ball 20 lights pink cherry lights white cherry lights record frame 7" single size strawberry ceramic trinket box
quantity 12 12 12 48 24
unit_price 6.95 6.75 6.75 2.1 1.25
date 2009-12-01 07:45:00 2009-12-01 07:45:00 2009-12-01 07:45:00 2009-12-01 07:45:00 2009-12-01 07:45:00
spent 83.4 81.0 81.0 100.8 30.0
year_month 200912 200912 200912 200912 200912
year 2009 2009 2009 2009 2009
month 12 12 12 12 12
day 1 1 1 1 1
day_of_week 2 2 2 2 2
hour 7 7 7 7 7
1dataset.to_csv(f"{data_path}/engineered_retail.csv", index=False)