Reading a PDF file#

Parsing PDF files for table extraction with tabula-py.

Importing libraries and packages#

1# Mathematical operations and data manipulation
2import pandas as pd
3
4# Reading from file
5import tabula

Set paths#

1# Path to datasets directory
2data_path = "./datasets"
3# Path to assets directory (for saving results to)
4assets_path = "./assets"

Tabula checks#

tabula_py documentation

1# Check java is installed (tabula_py depends on it)
2!java -version
openjdk version "11.0.13" 2021-10-19
OpenJDK Runtime Environment JBR-11.0.13.7-1751.21-jcef (build 11.0.13+7-b1751.21)
OpenJDK 64-Bit Server VM JBR-11.0.13.7-1751.21-jcef (build 11.0.13+7-b1751.21, mixed mode)
1tabula.environment_info()
Python version:
    3.9.12 (main, Jun  1 2022, 11:38:51) 
[GCC 7.5.0]
Java version:
    openjdk version "11.0.13" 2021-10-19
OpenJDK Runtime Environment JBR-11.0.13.7-1751.21-jcef (build 11.0.13+7-b1751.21)
OpenJDK 64-Bit Server VM JBR-11.0.13.7-1751.21-jcef (build 11.0.13+7-b1751.21, mixed mode)
tabula-py version: 2.3.0
platform: Linux-5.15.0-1004-aws-x86_64-with-glibc2.31
uname:
    uname_result(system='Linux', node='build-19253410-project-832800-wrangling', release='5.15.0-1004-aws', version='#6-Ubuntu SMP Thu Mar 31 09:44:20 UTC 2022', machine='x86_64')
linux_distribution: ('Ubuntu', '20.04', 'focal')
mac_ver: ('', ('', '', ''), '')
    

Loading dataset#

1# tabula.read_pdf will return a list of DataFrames as output.
2# Extract the first DataFrame using df[0]. Also, read_pdf()
3# function reads only page 1 by default. For extracting other
4# or all pages, use the pages parameter.
5
6dataset_18_1 = tabula.read_pdf(
7    f"{data_path}/Housing_data.pdf", pages=1, pandas_options={"header": None}
8)
9dataset_18_1[0]
0 1 2 3 4 5 6 7 8 9
0 0.17004 12.5 7.87 0 0.524 6.004 85.9 6.5921 5 311
1 0.22489 12.5 7.87 0 0.524 6.377 94.3 6.3467 5 311
2 0.11747 12.5 7.87 0 0.524 6.009 82.9 6.2267 5 311
3 0.09378 12.5 7.87 0 0.524 5.889 39.0 5.4509 5 311
1dataset_18_2 = tabula.read_pdf(
2    f"{data_path}/Housing_data.pdf", pages=2, pandas_options={"header": None}
3)
4dataset_18_2[0]
0 1 2 3
0 15.2 386.71 17.10 18.9
1 15.2 392.52 20.45 15.0
2 15.2 396.90 13.27 18.9
3 15.2 390.50 15.71 21.7

Wrangling#

1# names = ['CRIM','ZN','INDUS','CHAS','NOX','RM','AGE','DIS','RAD','TAX']
2#
3# df = pd.DataFrame (dataset_18_1[0], columns = ['names'])
4# df
5# dataset_18_1 = tabula.read_pdf(f'{data_path}/Housing_data.pdf',
6# pages=1, pandas_options={'header':None, 'names':names[:10]})
7# dataset_18_1
1df1 = pd.DataFrame(dataset_18_1[0])
2df2 = pd.DataFrame(dataset_18_2[0])
3df18 = pd.concat([df1, df2], axis=1)
4df18
0 1 2 3 4 5 6 7 8 9 0 1 2 3
0 0.17004 12.5 7.87 0 0.524 6.004 85.9 6.5921 5 311 15.2 386.71 17.10 18.9
1 0.22489 12.5 7.87 0 0.524 6.377 94.3 6.3467 5 311 15.2 392.52 20.45 15.0
2 0.11747 12.5 7.87 0 0.524 6.009 82.9 6.2267 5 311 15.2 396.90 13.27 18.9
3 0.09378 12.5 7.87 0 0.524 5.889 39.0 5.4509 5 311 15.2 390.50 15.71 21.7
 1names = [
 2    "CRIM",
 3    "ZN",
 4    "INDUS",
 5    "CHAS",
 6    "NOX",
 7    "RM",
 8    "AGE",
 9    "DIS",
10    "RAD",
11    "TAX",
12    "PTRATIO",
13    "B",
14    "LSTAT",
15    "PRICE",
16]
17
18dataset_18_1 = tabula.read_pdf(
19    f"{data_path}/Housing_data.pdf",
20    pages=1,
21    pandas_options={"header": None, "names": names[:10]},
22)
23df_1 = dataset_18_1[0]
24print(df_1)
25
26dataset_18_2 = tabula.read_pdf(
27    f"{data_path}/Housing_data.pdf",
28    pages=2,
29    pandas_options={"header": None, "names": names[10:]},
30)
31df_2 = dataset_18_2[0]
32print(df_2)
      CRIM    ZN  INDUS  CHAS    NOX     RM   AGE     DIS  RAD  TAX
0  0.17004  12.5   7.87     0  0.524  6.004  85.9  6.5921    5  311
1  0.22489  12.5   7.87     0  0.524  6.377  94.3  6.3467    5  311
2  0.11747  12.5   7.87     0  0.524  6.009  82.9  6.2267    5  311
3  0.09378  12.5   7.87     0  0.524  5.889  39.0  5.4509    5  311
   PTRATIO       B  LSTAT  PRICE
0     15.2  386.71  17.10   18.9
1     15.2  392.52  20.45   15.0
2     15.2  396.90  13.27   18.9
3     15.2  390.50  15.71   21.7
1dataset_18_2 = tabula.read_pdf(
2    f"{data_path}/Housing_data.pdf",
3    pages=2,
4    pandas_options={"header": None, "names": names[10:]},
5)
6df_2 = dataset_18_2[0]
1dataset_18 = pd.concat([df_1, df_2], axis=1)
2dataset_18
CRIM ZN INDUS CHAS NOX RM AGE DIS RAD TAX PTRATIO B LSTAT PRICE
0 0.17004 12.5 7.87 0 0.524 6.004 85.9 6.5921 5 311 15.2 386.71 17.10 18.9
1 0.22489 12.5 7.87 0 0.524 6.377 94.3 6.3467 5 311 15.2 392.52 20.45 15.0
2 0.11747 12.5 7.87 0 0.524 6.009 82.9 6.2267 5 311 15.2 396.90 13.27 18.9
3 0.09378 12.5 7.87 0 0.524 5.889 39.0 5.4509 5 311 15.2 390.50 15.71 21.7