Preprocessing#

Load the data and perform some initial exploration to gain a basic understanding of it, in particular how the various features are distributed.

Importing libraries and packages#

# Mathematical operations and data manipulation
import pandas as pd

# Warnings
import warnings

# Suppress every warning for the rest of the session
# (presumably to keep cell output free of library warnings - confirm).
warnings.filterwarnings("ignore")

# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline

Set paths#

# Directory that holds the input datasets
data_path = "./datasets"
# Directory where generated results are saved
assets_path = "./assets"

Loading dataset#

# Read the raw retail CSV from the datasets directory
raw_csv = f"{data_path}/online_retail_II.csv"
dataset = pd.read_csv(raw_csv)
# Show the first five rows transposed, so each column reads as a row
dataset.head().transpose()

	0	1	2	3	4
Invoice	489434	489434	489434	489434	489434
StockCode	85048	79323P	79323W	22041	21232
Description	15CM CHRISTMAS GLASS BALL 20 LIGHTS	PINK CHERRY LIGHTS	WHITE CHERRY LIGHTS	RECORD FRAME 7" SINGLE SIZE	STRAWBERRY CERAMIC TRINKET BOX
Quantity	12	12	12	48	24
InvoiceDate	01/12/2009 07:45	01/12/2009 07:45	01/12/2009 07:45	01/12/2009 07:45	01/12/2009 07:45
Price	6.95	6.75	6.75	2.1	1.25
Customer ID	13085.0	13085.0	13085.0	13085.0	13085.0
Country	United Kingdom	United Kingdom	United Kingdom	United Kingdom	United Kingdom

Exploring dataset#

# Report the frame's shape, then one line per column giving its dtype
# and its number of missing entries.
print(f"Data dimension: {dataset.shape}")
missing_per_column = dataset.isna().sum()
for name, dtype in dataset.dtypes.items():
    print(
        f"Column: {name:35} | "
        f"type: {str(dtype):7} | "
        f"missing values: {missing_per_column[name]:3d}"
    )

Data dimension: (525461, 8)
Column: Invoice                             | type: object  | missing values:   0
Column: StockCode                           | type: object  | missing values:   0
Column: Description                         | type: object  | missing values: 2928
Column: Quantity                            | type: int64   | missing values:   0
Column: InvoiceDate                         | type: object  | missing values:   0
Column: Price                               | type: float64 | missing values:   0
Column: Customer ID                         | type: float64 | missing values: 107927
Column: Country                             | type: object  | missing values:   0

The Description column has a small number of missing values (2,928), while Customer ID is missing in about 20% of the rows (107,927 of 525,461).

# Summary statistics of the numerical columns, one column per row
dataset.describe().transpose()

	count	mean	std	min	25%	50%	75%	max
Quantity	525461.0	10.337667	107.424110	-9600.00	1.00	3.0	10.00	19152.00
Price	525461.0	4.688834	146.126914	-53594.36	1.25	2.1	4.21	25111.09
Customer ID	417534.0	15360.645478	1680.811316	12346.00	13983.00	15311.0	16799.00	18287.00

Preprocessing#

# Map the raw CSV headers onto short snake_case names.
column_names = {
    "Invoice": "invoice",
    "StockCode": "stock_code",
    "Description": "desc",
    "Quantity": "quantity",
    "InvoiceDate": "date",
    "Price": "unit_price",
    "Customer ID": "cust_id",
    "Country": "country",
}
# NOTE(review): index=str also passes every row label through str(), so the
# index holds strings afterwards - confirm this is intended.
dataset.rename(index=str, columns=column_names, inplace=True)

# Print the frame summary (column names, non-null counts, dtypes, memory usage)
dataset.info()

<class 'pandas.core.frame.DataFrame'>
Index: 525461 entries, 0 to 525460
Data columns (total 8 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   invoice     525461 non-null  object 
 1   stock_code  525461 non-null  object 
 2   desc        522533 non-null  object 
 3   quantity    525461 non-null  int64  
 4   date        525461 non-null  object 
 5   unit_price  525461 non-null  float64
 6   cust_id     417534 non-null  float64
 7   country     525461 non-null  object 
dtypes: float64(2), int64(1), object(5)
memory usage: 36.1+ MB

# Write the renamed table to the datasets directory; the row index is not saved.
preprocessed_csv = f"{data_path}/preprocessed_retail.csv"
dataset.to_csv(preprocessed_csv, index=False)