Loading arff data#

Importing libraries and packages#

 1# Warnings
 2import warnings
 3
 4# Mathematical operations and data manipulation
 5import pandas as pd
 6
 7# For loading .arff files
 8from scipy.io import arff
 9
10warnings.filterwarnings("ignore")
11warnings.filterwarnings("ignore", category=DeprecationWarning)
12%matplotlib inline

Set paths#

1# Path to the datasets directory (the yearly .arff files are read from here)
2data_path = "./datasets"
3# Path to the assets directory (for saving results to)
4assets_path = "./assets"

Loading dataset#

 1# Loading the 5 raw .arff files into a list
 2def load_arff_raw_data():
 3    n = 5
 4    return [
 5        arff.loadarff(f"{data_path}/" + str(i + 1) + "year.arff")
 6        for i in range(n)
 7    ]
 8
 9
10# Loading the 5 raw .arff files into pandas dataframes
def load_dataframes():
    """Build one pandas DataFrame per yearly dataset.

    Every item produced by ``load_arff_raw_data()`` is indexed at position
    0 and that element is handed to ``pd.DataFrame``. The frames are
    returned as a list in the same order as the loaded files.
    """
    frames = []
    for raw_year in load_arff_raw_data():
        records = raw_year[0]
        frames.append(pd.DataFrame(records))
    return frames
15
16
17# Setting column headers from X1 ... X64 and the class label as Y,
18# for all the 5 dataframes.
def set_new_headers(dataframes):
    """Rename the columns of every dataframe to X1 ... Xn followed by Y.

    The number of feature columns is taken from the first dataframe (its
    column count minus one for the class label), and the same header list
    is then assigned to each dataframe in ``dataframes``. The frames are
    modified in place; nothing is returned.
    """
    feature_count = len(dataframes[0].columns) - 1
    headers = [f"X{number}" for number in range(1, feature_count + 1)]
    headers += ["Y"]
    for frame in dataframes:
        frame.columns = headers
24
25
26# dfs is the list of pandas dataframes, one per yearly data file.
27dfs = load_dataframes()
28
29# Set new headers on every dataframe: the features are renamed
30# X1 to X64 and the class label column becomes Y.
31set_new_headers(dfs)
32
33# Show the first 5 rows of the 'year1' dataset
34dfs[0].head()
X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 ... X56 X57 X58 X59 X60 X61 X62 X63 X64 Y
0 0.200550 0.37951 0.39641 2.0472 32.3510 0.38825 0.249760 1.33050 1.1389 0.50494 ... 0.121960 0.39718 0.87804 0.001924 8.4160 5.1372 82.658 4.4158 7.4277 b'0'
1 0.209120 0.49988 0.47225 1.9447 14.7860 0.00000 0.258340 0.99601 1.6996 0.49788 ... 0.121300 0.42002 0.85300 0.000000 4.1486 3.2732 107.350 3.4000 60.9870 b'0'
2 0.248660 0.69592 0.26713 1.5548 -1.1523 0.00000 0.309060 0.43695 1.3090 0.30408 ... 0.241140 0.81774 0.76599 0.694840 4.9909 3.9510 134.270 2.7185 5.2078 b'0'
3 0.081483 0.30734 0.45879 2.4928 51.9520 0.14988 0.092704 1.86610 1.0571 0.57353 ... 0.054015 0.14207 0.94598 0.000000 4.5746 3.6147 86.435 4.2228 5.5497 b'0'
4 0.187320 0.61323 0.22960 1.4063 -7.3128 0.18732 0.187320 0.63070 1.1559 0.38677 ... 0.134850 0.48431 0.86515 0.124440 6.3985 4.3158 127.210 2.8692 7.8980 b'0'

5 rows × 65 columns

1dfs[0].shape  # (rows, columns) of the 'year1' dataframe
(7027, 65)
 1# Converting dtypes of all the columns (other than the class
 2# label columns) to float.
 3def convert_columns_type_float(dataframes):
 4    for i in range(5):
 5        index = 1
 6        while index <= 63:
 7            colname = dfs[i].columns[index]
 8            col = getattr(dfs[i], colname)
 9            dataframes[i][colname] = col.astype(float)
10            index += 1
11
12
13convert_columns_type_float(dfs)  # the dataframes in dfs are modified in place
1# The class labels for all the dataframes are originally in object type.
2# Converting to int types
def convert_class_label_type_int(dataframes):
    """Cast the class label column "Y" of each dataframe to int, in place.

    The labels arrive as objects (e.g. ``b'0'``) after loading; this
    replaces the "Y" column of every frame with its int-typed version.

    Args:
        dataframes: iterable of pandas DataFrames, each with a "Y" column.

    Returns:
        None. The dataframes are modified in place.

    Raises:
        KeyError: if a dataframe has no "Y" column.
    """
    # Operate on the frames that were passed in, not on a module-level list.
    for frame in dataframes:
        frame["Y"] = frame["Y"].astype(int)
7
8
9convert_class_label_type_int(dfs)  # the "Y" column of each dataframe in dfs is replaced in place
1# First 5 rows of the year1 DataFrame
2dfs[0].head()
X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 ... X56 X57 X58 X59 X60 X61 X62 X63 X64 Y
0 0.200550 0.37951 0.39641 2.0472 32.3510 0.38825 0.249760 1.33050 1.1389 0.50494 ... 0.121960 0.39718 0.87804 0.001924 8.4160 5.1372 82.658 4.4158 7.4277 0
1 0.209120 0.49988 0.47225 1.9447 14.7860 0.00000 0.258340 0.99601 1.6996 0.49788 ... 0.121300 0.42002 0.85300 0.000000 4.1486 3.2732 107.350 3.4000 60.9870 0
2 0.248660 0.69592 0.26713 1.5548 -1.1523 0.00000 0.309060 0.43695 1.3090 0.30408 ... 0.241140 0.81774 0.76599 0.694840 4.9909 3.9510 134.270 2.7185 5.2078 0
3 0.081483 0.30734 0.45879 2.4928 51.9520 0.14988 0.092704 1.86610 1.0571 0.57353 ... 0.054015 0.14207 0.94598 0.000000 4.5746 3.6147 86.435 4.2228 5.5497 0
4 0.187320 0.61323 0.22960 1.4063 -7.3128 0.18732 0.187320 0.63070 1.1559 0.38677 ... 0.134850 0.48431 0.86515 0.124440 6.3985 4.3158 127.210 2.8692 7.8980 0

5 rows × 65 columns

X1 net profit / total assets
X2 total liabilities / total assets
X3 working capital / total assets
X4 current assets / short-term liabilities
X5 ((cash + short-term securities + receivables - short-term liabilities) / (operating expenses - depreciation)) * 365
X6 retained earnings / total assets
X7 EBIT / total assets
X8 book value of equity / total liabilities
X9 sales / total assets
X10 equity / total assets
X11 (gross profit + extraordinary items + financial expenses) / total assets
X12 gross profit / short-term liabilities
X13 (gross profit + depreciation) / sales
X14 (gross profit + interest) / total assets
X15 (total liabilities * 365) / (gross profit + depreciation)
X16 (gross profit + depreciation) / total liabilities
X17 total assets / total liabilities
X18 gross profit / total assets
X19 gross profit / sales
X20 (inventory * 365) / sales
X21 sales (n) / sales (n-1)
X22 profit on operating activities / total assets
X23 net profit / sales
X24 gross profit (in 3 years) / total assets
X25 (equity - share capital) / total assets
X26 (net profit + depreciation) / total liabilities
X27 profit on operating activities / financial expenses
X28 working capital / fixed assets
X29 logarithm of total assets
X30 (total liabilities - cash) / sales
X31 (gross profit + interest) / sales
X32 (current liabilities * 365) / cost of products sold
X33 operating expenses / short-term liabilities
X34 operating expenses / total liabilities
X35 profit on sales / total assets
X36 total sales / total assets
X37 (current assets - inventories) / long-term liabilities
X38 constant capital / total assets
X39 profit on sales / sales
X40 (current assets - inventory - receivables) / short-term liabilities
X41 total liabilities / ((profit on operating activities + depreciation) * (12/365))
X42 profit on operating activities / sales
X43 rotation receivables + inventory turnover in days
X44 (receivables * 365) / sales
X45 net profit / inventory
X46 (current assets - inventory) / short-term liabilities
X47 (inventory * 365) / cost of products sold
X48 EBITDA (profit on operating activities - depreciation) / total assets
X49 EBITDA (profit on operating activities - depreciation) / sales
X50 current assets / total liabilities
X51 short-term liabilities / total assets
X52 (short-term liabilities * 365) / cost of products sold
X53 equity / fixed assets
X54 constant capital / fixed assets
X55 working capital
X56 (sales - cost of products sold) / sales
X57 (current assets - inventory - short-term liabilities) / (sales - gross profit - depreciation)
X58 total costs / total sales
X59 long-term liabilities / equity
X60 sales / inventory
X61 sales / receivables
X62 (short-term liabilities * 365) / sales
X63 sales / short-term liabilities
X64 sales / fixed assets