Reading a PDF file#

Parsing PDF files for table extraction with tabula-py.

Importing libraries and packages#

# Mathematical operations and data manipulation
import pandas as pd

# Reading from file
import tabula

Set paths#

# Input datasets are read from data_path; results are saved under assets_path.
data_path, assets_path = "./datasets", "./assets"

Tabula checks#

See the tabula-py documentation: https://tabula-py.readthedocs.io/

# Check that a Java runtime is installed and reachable from the shell:
# tabula-py depends on it, so table extraction cannot run without Java.
!java -version

openjdk version "11.0.13" 2021-10-19
OpenJDK Runtime Environment JBR-11.0.13.7-1751.21-jcef (build 11.0.13+7-b1751.21)
OpenJDK 64-Bit Server VM JBR-11.0.13.7-1751.21-jcef (build 11.0.13+7-b1751.21, mixed mode)

# Print the Python, Java and tabula-py versions together with platform
# details (useful when debugging or reporting extraction problems).
tabula.environment_info()

Python version:
    3.9.12 (main, Jun  1 2022, 11:38:51) 
[GCC 7.5.0]
Java version:
    openjdk version "11.0.13" 2021-10-19
OpenJDK Runtime Environment JBR-11.0.13.7-1751.21-jcef (build 11.0.13+7-b1751.21)
OpenJDK 64-Bit Server VM JBR-11.0.13.7-1751.21-jcef (build 11.0.13+7-b1751.21, mixed mode)
tabula-py version: 2.3.0
platform: Linux-5.15.0-1004-aws-x86_64-with-glibc2.31
uname:
    uname_result(system='Linux', node='build-19253410-project-832800-wrangling', release='5.15.0-1004-aws', version='#6-Ubuntu SMP Thu Mar 31 09:44:20 UTC 2022', machine='x86_64')
linux_distribution: ('Ubuntu', '20.04', 'focal')
mac_ver: ('', ('', '', ''), '')
    

Loading dataset#

# tabula.read_pdf returns a list of DataFrames, so the extracted table is
# taken from the result with [0]. Only page 1 is read by default; use the
# pages parameter to extract other (or all) pages. header=None keeps the
# first row as data, leaving the columns with integer labels.
dataset_18_1 = tabula.read_pdf(
    input_path=f"{data_path}/Housing_data.pdf",
    pandas_options={"header": None},
    pages=1,
)
dataset_18_1[0]

	0	1	2	3	4	5	6	7	8	9
0	0.17004	12.5	7.87	0	0.524	6.004	85.9	6.5921	5	311
1	0.22489	12.5	7.87	0	0.524	6.377	94.3	6.3467	5	311
2	0.11747	12.5	7.87	0	0.524	6.009	82.9	6.2267	5	311
3	0.09378	12.5	7.87	0	0.524	5.889	39.0	5.4509	5	311

# Same extraction for page 2, again without a header row.
dataset_18_2 = tabula.read_pdf(
    input_path=f"{data_path}/Housing_data.pdf",
    pandas_options={"header": None},
    pages=2,
)
dataset_18_2[0]

	0	1	2	3
0	15.2	386.71	17.10	18.9
1	15.2	392.52	20.45	15.0
2	15.2	396.90	13.27	18.9
3	15.2	390.50	15.71	21.7

Wrangling#

# Both pages were read without a header, so their columns carry positional
# integer labels. Placing the two tables side by side therefore yields
# duplicated column labels (0-3 appear twice), which is why explicit column
# names are assigned in the following step.
# read_pdf already returns DataFrames, so no pd.DataFrame(...) wrapping is
# needed before concatenating.
df1 = dataset_18_1[0]
df2 = dataset_18_2[0]
df18 = pd.concat([df1, df2], axis=1)
df18

	0	1	2	3	4	5	6	7	8	9	0	1	2	3
0	0.17004	12.5	7.87	0	0.524	6.004	85.9	6.5921	5	311	15.2	386.71	17.10	18.9
1	0.22489	12.5	7.87	0	0.524	6.377	94.3	6.3467	5	311	15.2	392.52	20.45	15.0
2	0.11747	12.5	7.87	0	0.524	6.009	82.9	6.2267	5	311	15.2	396.90	13.27	18.9
3	0.09378	12.5	7.87	0	0.524	5.889	39.0	5.4509	5	311	15.2	390.50	15.71	21.7

# Column labels in table order: the first ten belong to the table on page 1
# of the PDF, the last four to the table on page 2.
names = (
    ["CRIM", "ZN", "INDUS", "CHAS", "NOX", "RM", "AGE", "DIS", "RAD", "TAX"]
    + ["PTRATIO", "B", "LSTAT", "PRICE"]
)

# Read each page again, this time supplying the column labels: page 1 takes
# names[:10] and page 2 takes names[10:].
dataset_18_1 = tabula.read_pdf(
    input_path=f"{data_path}/Housing_data.pdf",
    pandas_options={"header": None, "names": names[:10]},
    pages=1,
)
df_1 = dataset_18_1[0]  # first table returned for page 1
print(df_1)

dataset_18_2 = tabula.read_pdf(
    input_path=f"{data_path}/Housing_data.pdf",
    pandas_options={"header": None, "names": names[10:]},
    pages=2,
)
df_2 = dataset_18_2[0]  # first table returned for page 2
print(df_2)

      CRIM    ZN  INDUS  CHAS    NOX     RM   AGE     DIS  RAD  TAX
0.17004  12.5   7.87     0  0.524  6.004  85.9  6.5921    5  311
0.22489  12.5   7.87     0  0.524  6.377  94.3  6.3467    5  311
0.11747  12.5   7.87     0  0.524  6.009  82.9  6.2267    5  311
0.09378  12.5   7.87     0  0.524  5.889  39.0  5.4509    5  311

   PTRATIO       B  LSTAT  PRICE
   15.2  386.71  17.10   18.9
   15.2  392.52  20.45   15.0
   15.2  396.90  13.27   18.9
   15.2  390.50  15.71   21.7

# df_1 (page 1) and df_2 (page 2) were already extracted with their column
# names above, so the PDF does not need to be parsed a second time here.
# Join the two tables side by side (axis=1) into the complete dataset; rows
# are aligned on their index labels.
dataset_18 = pd.concat([df_1, df_2], axis=1)
dataset_18

	CRIM	ZN	INDUS	CHAS	NOX	RM	AGE	DIS	RAD	TAX	PTRATIO	B	LSTAT	PRICE
0	0.17004	12.5	7.87	0	0.524	6.004	85.9	6.5921	5	311	15.2	386.71	17.10	18.9
1	0.22489	12.5	7.87	0	0.524	6.377	94.3	6.3467	5	311	15.2	392.52	20.45	15.0
2	0.11747	12.5	7.87	0	0.524	6.009	82.9	6.2267	5	311	15.2	396.90	13.27	18.9
3	0.09378	12.5	7.87	0	0.524	5.889	39.0	5.4509	5	311	15.2	390.50	15.71	21.7