Skiprows and nrows

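The first few rows of a CSV data file are often metadata about the data source or similar information, and the footer can also contain information that is not useful for further processing. Both can be skipped when the file is read with `pd.read_csv`: `skiprows` drops lines at the top of the file and `skipfooter` drops lines at the bottom. The `nrows` option limits how many rows are read, and combining `skiprows` with `nrows` makes it possible to read the data in small chunks.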

Importing libraries and packages

# Mathematical operations and data manipulation
import pandas as pd

Set paths

# Directory that holds the input datasets
data_path = "./datasets"
# Directory that results are saved to
assets_path = "./assets"

Loading datasets

# Read the file with default options. The first metadata line is taken as
# the header, and the second metadata line and the real column names end up
# as data rows (see the output below).
dataset_5 = pd.read_csv(f"{data_path}/skiprows.csv")
dataset_5
Filetype: CSV Unnamed: 1 Unnamed: 2 Unnamed: 3
0 NaN Info NaN NaN
1 Column 1 Column 2 Column 3 Column 4
2 2 1500 Good 300000
3 3 1300 Fair 240000
4 3 1900 Very good 450000
5 3 1850 Bad 280000
6 2 1640 Good 310000
# Skip the two metadata lines at the top so that the third line of the file
# supplies the column names
dataset_5 = pd.read_csv(f"{data_path}/skiprows.csv", skiprows=2)
dataset_5
Column 1 Column 2 Column 3 Column 4
0 2 1500 Good 300000
1 3 1300 Fair 240000
2 3 1900 Very good 450000
3 3 1850 Bad 280000
4 2 1640 Good 310000
# Default read: the leading metadata lines and the trailing footer line are
# parsed as if they were part of the table
dataset_6 = pd.read_csv(f"{data_path}/skipfooter.csv")
dataset_6
Filetype: CSV Unnamed: 1 Unnamed: 2 Unnamed: 3
0 NaN Info NaN NaN
1 Column 1 Column 2 Column 3 Column 4
2 2 1500 Good 300000
3 3 1300 Fair 240000
4 3 1900 Very good 450000
5 3 1850 Bad 280000
6 2 1640 Good 310000
7 NaN This is the end of file NaN NaN
# skipfooter drops lines from the end of the file. read_csv has two parsing
# engines, one based on C and one on Python, and only the Python engine
# supports skipfooter, hence engine="python".
dataset_6 = pd.read_csv(
    f"{data_path}/skipfooter.csv", skiprows=2, skipfooter=1, engine="python"
)
dataset_6
Column 1 Column 2 Column 3 Column 4
0 2 1500 Good 300000
1 3 1300 Fair 240000
2 3 1900 Very good 450000
3 3 1850 Bad 280000
4 2 1640 Good 310000
# Read only the first N rows (here N = 2)
dataset_7 = pd.read_csv(f"{data_path}/example_1.csv", nrows=2)
dataset_7
Column 1 Column 2 Column 3 Column 4
0 2 1500 Good 300000
1 3 1300 Fair 240000
 1# Combining skiprows and nrows to read data in small chunks
 2list_of_dataframes = []
 3rows_in_a_chunk = 10
 4number_of_chunks = 5
 5
 6dummy = pd.read_csv(f"{data_path}/cleaned_mpi_disagg_by_groups.csv", nrows=2)
 7columns = dummy.columns
 8
 9for i in range(0, number_of_chunks * rows_in_a_chunk, rows_in_a_chunk):
10    dataset = pd.read_csv(
11        f"{data_path}/cleaned_mpi_disagg_by_groups.csv",
12        header=0,
13        skiprows=i,
14        nrows=rows_in_a_chunk,
15        names=columns,
16    )
17    list_of_dataframes.append(dataset)
# Keep blank lines instead of dropping them; each blank line shows up as a
# row of NaN values in the output below
dataset_8 = pd.read_csv(f"{data_path}/blankline.csv", skip_blank_lines=False)
dataset_8
Column 1 Column 2 Column 3 Column 4
0 2.0 1500.0 Good 300000.0
1 3.0 1300.0 Fair 240000.0
2 NaN NaN NaN NaN
3 3.0 1900.0 Very good 450000.0
4 3.0 1850.0 Bad 280000.0
5 NaN NaN NaN NaN
6 2.0 1640.0 Good 310000.0