Profiling#

Importing libraries and packages#

 1# Warnings
 2import warnings
 3
 4# Mathematical operations and data manipulation
 5import pandas as pd
 6
 7# import pandas_profiling as pp
 8
 9# For loading .arff files
10from scipy.io import arff
11
12warnings.filterwarnings("ignore")
13warnings.filterwarnings("ignore", category=DeprecationWarning)
14%matplotlib inline

Set paths#

1# Path to datasets directory
2data_path = "./datasets"
3# Path to assets directory (for saving results to)
4assets_path = "./assets"

Loading dataset#

 1# Loading the 5 raw .arff files into a list
 2def load_arff_raw_data():
 3    n = 5
 4    return [
 5        arff.loadarff(f"{data_path}/" + str(i + 1) + "year.arff")
 6        for i in range(n)
 7    ]
 8
 9
10# Loading the 5 raw .arff files into pandas dataframes
11def load_dataframes():
12    return [
13        pd.DataFrame(data_i_year[0]) for data_i_year in load_arff_raw_data()
14    ]
15
16
17# Setting column headers from X1 ... X64 and the class label as Y,
18# for all the 5 dataframes.
19def set_new_headers(dataframes):
20    cols = ["X" + str(i + 1) for i in range(len(dataframes[0].columns) - 1)]
21    cols.append("Y")
22    for df in dataframes:
23        df.columns = cols
24
25
26# dataframes is the list of pandas dataframes for the 5 year datafiles.
27dfs = load_dataframes()
28
29# Set new headers for the dataframes: the renamed set of feature (X1 to X64)
30set_new_headers(dfs)
31
32# print the first 5 rows of a dataset 'year1'
33dfs[0].head()
X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 ... X56 X57 X58 X59 X60 X61 X62 X63 X64 Y
0 0.200550 0.37951 0.39641 2.0472 32.3510 0.38825 0.249760 1.33050 1.1389 0.50494 ... 0.121960 0.39718 0.87804 0.001924 8.4160 5.1372 82.658 4.4158 7.4277 b'0'
1 0.209120 0.49988 0.47225 1.9447 14.7860 0.00000 0.258340 0.99601 1.6996 0.49788 ... 0.121300 0.42002 0.85300 0.000000 4.1486 3.2732 107.350 3.4000 60.9870 b'0'
2 0.248660 0.69592 0.26713 1.5548 -1.1523 0.00000 0.309060 0.43695 1.3090 0.30408 ... 0.241140 0.81774 0.76599 0.694840 4.9909 3.9510 134.270 2.7185 5.2078 b'0'
3 0.081483 0.30734 0.45879 2.4928 51.9520 0.14988 0.092704 1.86610 1.0571 0.57353 ... 0.054015 0.14207 0.94598 0.000000 4.5746 3.6147 86.435 4.2228 5.5497 b'0'
4 0.187320 0.61323 0.22960 1.4063 -7.3128 0.18732 0.187320 0.63070 1.1559 0.38677 ... 0.134850 0.48431 0.86515 0.124440 6.3985 4.3158 127.210 2.8692 7.8980 b'0'

5 rows × 65 columns

1dfs[0].shape
(7027, 65)
 1# Converting dtypes of all the columns (other than the class
 2# label columns) to float.
 3def convert_columns_type_float(dataframes):
 4    for i in range(5):
 5        index = 1
 6        while index <= 63:
 7            colname = dfs[i].columns[index]
 8            col = getattr(dfs[i], colname)
 9            dataframes[i][colname] = col.astype(float)
10            index += 1
11
12
13convert_columns_type_float(dfs)
1# The class labels for all the dataframes are originally in object type.
2# Converting to int types
3def convert_class_label_type_int(dataframes):
4    for i in range(len(dataframes)):
5        col = getattr(dfs[i], "Y")
6        dataframes[i]["Y"] = col.astype(int)
7
8
9convert_class_label_type_int(dfs)
1# Shape of the year1 DataFrame
2dfs[0].head()
X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 ... X56 X57 X58 X59 X60 X61 X62 X63 X64 Y
0 0.200550 0.37951 0.39641 2.0472 32.3510 0.38825 0.249760 1.33050 1.1389 0.50494 ... 0.121960 0.39718 0.87804 0.001924 8.4160 5.1372 82.658 4.4158 7.4277 0
1 0.209120 0.49988 0.47225 1.9447 14.7860 0.00000 0.258340 0.99601 1.6996 0.49788 ... 0.121300 0.42002 0.85300 0.000000 4.1486 3.2732 107.350 3.4000 60.9870 0
2 0.248660 0.69592 0.26713 1.5548 -1.1523 0.00000 0.309060 0.43695 1.3090 0.30408 ... 0.241140 0.81774 0.76599 0.694840 4.9909 3.9510 134.270 2.7185 5.2078 0
3 0.081483 0.30734 0.45879 2.4928 51.9520 0.14988 0.092704 1.86610 1.0571 0.57353 ... 0.054015 0.14207 0.94598 0.000000 4.5746 3.6147 86.435 4.2228 5.5497 0
4 0.187320 0.61323 0.22960 1.4063 -7.3128 0.18732 0.187320 0.63070 1.1559 0.38677 ... 0.134850 0.48431 0.86515 0.124440 6.3985 4.3158 127.210 2.8692 7.8980 0

5 rows × 65 columns

X1 net profit / total assets
X2 total liabilities / total assets
X3 working capital / total assets
X4 current assets / short-term liabilities
X5 ((cash + short-term securities + receivables - short-term liabilities) / (operating expenses - depreciation)) * 365
X6 retained earnings / total assets
X7 EBIT / total assets
X8 book value of equity / total liabilities
X9 sales / total assets
X10 equity / total assets
X11 (gross profit + extraordinary items + financial expenses) / total assets
X12 gross profit / short-term liabilities
X13 (gross profit + depreciation) / sales
X14 (gross profit + interest) / total assets
X15 (total liabilities * 365) / (gross profit + depreciation)
X16 (gross profit + depreciation) / total liabilities
X17 total assets / total liabilities
X18 gross profit / total assets
X19 gross profit / sales
X20 (inventory * 365) / sales
X21 sales (n) / sales (n-1)
X22 profit on operating activities / total assets
X23 net profit / sales
X24 gross profit (in 3 years) / total assets
X25 (equity - share capital) / total assets
X26 (net profit + depreciation) / total liabilities
X27 profit on operating activities / financial expenses
X28 working capital / fixed assets
X29 logarithm of total assets
X30 (total liabilities - cash) / sales
X31 (gross profit + interest) / sales
X32 (current liabilities * 365) / cost of products sold
X33 operating expenses / short-term liabilities
X34 operating expenses / total liabilities
X35 profit on sales / total assets
X36 total sales / total assets
X37 (current assets - inventories) / long-term liabilities
X38 constant capital / total assets
X39 profit on sales / sales
X40 (current assets - inventory - receivables) / short-term liabilities
X41 total liabilities / ((profit on operating activities + depreciation) * (12/365))
X42 profit on operating activities / sales
X43 rotation receivables + inventory turnover in days
X44 (receivables * 365) / sales
X45 net profit / inventory
X46 (current assets - inventory) / short-term liabilities
X47 (inventory * 365) / cost of products sold
X48 EBITDA (profit on operating activities - depreciation) / total assets
X49 EBITDA (profit on operating activities - depreciation) / sales
X50 current assets / total liabilities
X51 short-term liabilities / total assets
X52 (short-term liabilities * 365) / cost of products sold)
X53 equity / fixed assets
X54 constant capital / fixed assets
X55 working capital
X56 (sales - cost of products sold) / sales
X57 (current assets - inventory - short-term liabilities) / (sales - gross profit - depreciation)
X58 total costs /total sales
X59 long-term liabilities / equity
X60 sales / inventory
X61 sales / receivables
X62 (short-term liabilities *365) / sales
X63 sales / short-term liabilities
X64 sales / fixed assets

Pandas profiling#

Took too long. Put below in code cell to get the report in the assets directory.

for j in range(0, 5):
    profile = pp.ProfileReport(
        dfs[j],
        title="Pandas Profiling Report",
        plot={"histogram": {"bins": 8}},
    )
    profile.to_file(output_file=f"{assets_path}/" + str(j) + "output.html")