Skiprows and nrows#

The first few rows of a CSV data file often contain metadata about the data source or similar information, and the footer may also hold information that is not useful for further processing. Both can be skipped when reading the file: the leading lines with the skiprows parameter and the trailing lines with the skipfooter parameter. In addition, skiprows and nrows can be combined to read the data in small chunks.

Importing libraries and packages#

# Mathematical operations and data manipulation
import pandas as pd

Set paths#

# Directory that holds the input CSV files read in the cells below
data_path = "./datasets"
# Directory intended for saving results to
assets_path = "./assets"

Loading datasets#

# Load the file untouched to show that its first two lines are metadata
# rather than part of the table.
dataset_5 = pd.read_csv(
    filepath_or_buffer=f"{data_path}/skiprows.csv",
)
dataset_5

	Filetype: CSV	Unnamed: 1	Unnamed: 2	Unnamed: 3
0	NaN	Info	NaN	NaN
1	Column 1	Column 2	Column 3	Column 4
2	2	1500	Good	300000
3	3	1300	Fair	240000
4	3	1900	Very good	450000
5	3	1850	Bad	280000
6	2	1640	Good	310000

# Drop the two leading metadata lines so that the third line of the file
# is parsed as the header row.
dataset_5 = pd.read_csv(
    skiprows=2,
    filepath_or_buffer=f"{data_path}/skiprows.csv",
)
dataset_5

	Column 1	Column 2	Column 3	Column 4
0	2	1500	Good	300000
1	3	1300	Fair	240000
2	3	1900	Very good	450000
3	3	1850	Bad	280000
4	2	1640	Good	310000

# Load the file untouched: besides the two metadata lines at the top, it
# also ends with a footer line that is not part of the data.
dataset_6 = pd.read_csv(
    filepath_or_buffer=f"{data_path}/skipfooter.csv",
)
dataset_6

	Filetype: CSV	Unnamed: 1	Unnamed: 2	Unnamed: 3
0	NaN	Info	NaN	NaN
1	Column 1	Column 2	Column 3	Column 4
2	2	1500	Good	300000
3	3	1300	Fair	240000
4	3	1900	Very good	450000
5	3	1850	Bad	280000
6	2	1640	Good	310000
7	NaN	This is the end of file	NaN	NaN

# skipfooter trims lines from the end of the file, while skiprows trims
# them from the start. The default C parser does not support skipfooter,
# so the Python engine has to be requested explicitly.
dataset_6 = pd.read_csv(
    f"{data_path}/skipfooter.csv",
    engine="python",
    skiprows=2,
    skipfooter=1,
)
dataset_6

	Column 1	Column 2	Column 3	Column 4
0	2	1500	Good	300000
1	3	1300	Fair	240000
2	3	1900	Very good	450000
3	3	1850	Bad	280000
4	2	1640	Good	310000

# nrows limits how many data rows are parsed; here only the first two
dataset_7 = pd.read_csv(
    filepath_or_buffer=f"{data_path}/example_1.csv",
    nrows=2,
)
dataset_7

	Column 1	Column 2	Column 3	Column 4
0	2	1500	Good	300000
1	3	1300	Fair	240000

# Read the file piecewise: every pass skips the lines that earlier passes
# already covered (skiprows) and then parses a fixed-size window (nrows).
rows_in_a_chunk = 10
number_of_chunks = 5
list_of_dataframes = []

# A small two-row read is enough to learn the column labels, which are
# re-applied to every chunk below.
dummy = pd.read_csv(f"{data_path}/cleaned_mpi_disagg_by_groups.csv", nrows=2)
columns = dummy.columns

total_rows = number_of_chunks * rows_in_a_chunk
for i in range(0, total_rows, rows_in_a_chunk):
    # header=0 consumes the first unskipped line as the header row and
    # names= replaces it. For i > 0 that line is the last data row of the
    # previous chunk, so no row is lost or read twice.
    dataset = pd.read_csv(
        f"{data_path}/cleaned_mpi_disagg_by_groups.csv",
        names=columns,
        header=0,
        skiprows=i,
        nrows=rows_in_a_chunk,
    )
    list_of_dataframes += [dataset]

# Keep blank lines in the file as all-NaN rows instead of letting the
# parser drop them, which is what it does by default.
dataset_8 = pd.read_csv(
    filepath_or_buffer=f"{data_path}/blankline.csv",
    skip_blank_lines=False,
)
dataset_8

	Column 1	Column 2	Column 3	Column 4
0	2.0	1500.0	Good	300000.0
1	3.0	1300.0	Fair	240000.0
2	NaN	NaN	NaN	NaN
3	3.0	1900.0	Very good	450000.0
4	3.0	1850.0	Bad	280000.0
5	NaN	NaN	NaN	NaN
6	2.0	1640.0	Good	310000.0