Joining#

Joining is done based on index keys and combines the columns of two potentially differently indexed DataFrames into a single one. Useful if the records in different tables are indexed differently but represent the same data.

Importing libraries and packages#

# Mathematical operations and data manipulation
import pandas as pd

Set paths#

# Path to datasets directory
data_path = "./datasets"
# Path to assets directory (for saving results to)
assets_path = "./assets"

Loading dataset#

dataset = pd.read_csv(f"{data_path}/cleaned_mpi_disagg_by_groups.csv")

Wrangling#

dataset.head()

	Country	Type of survey	Survey year	Ethnic/racial/caste group	MPI: Value for the country	MPI: Value for the group	Headcount (%)	Number of multidimensionally poor people by group (thousands)	Intensity of deprivation (%)	Health (%)	...	Cooking fuel (%)	Sanitation (%)	Drinking water (%)	Electricity (%)	Housing (%)	Assets (%)	Population share by group (%)	Population size by group (thousands)	Population size (thousands)	Region
0	Bangladesh	MICS	2019	Bengali	0.104060	0.102702	24.384759	39284.990511	42.117223	17.441109	...	12.484664	8.274627	0.569494	2.308714	12.478603	8.562996	98.809242	161104.688057	163046.173	South Asia
1	Bangladesh	MICS	2019	Other	0.104060	0.216783	45.868093	890.521140	47.262356	10.881517	...	11.733451	10.198139	8.354676	8.593331	11.536150	10.738271	1.190756	1941.482818	163046.173	South Asia
2	Belize	MICS	2015/2016	Creole	0.017109	0.003768	1.051818	0.940881	35.820526	52.086931	...	1.126231	3.964365	1.126231	3.383591	6.162911	4.409921	22.916001	89.452839	390.351	Latin America and the Caribbean
3	Belize	MICS	2015/2016	Garifuna	0.017109	0.003887	1.097083	0.224891	35.433114	85.184902	...	2.963020	2.963020	0.000000	2.963020	2.963020	2.963020	5.251431	20.499014	390.351	Latin America and the Caribbean
4	Belize	MICS	2015/2016	Maya	0.017109	0.078922	18.631953	8.557940	42.358151	37.911840	...	11.931632	7.811719	2.319572	9.465594	11.165109	4.267081	11.766724	45.931523	390.351	Latin America and the Caribbean

5 rows × 26 columns

dataset_1 = dataset[
    ["Child mortality (%)", "Nutrition (%)", "Ethnic/racial/caste group"]
][0:4]
dataset_1.set_index(["Ethnic/racial/caste group"], inplace=True)
dataset_1

	Child mortality (%)	Nutrition (%)
Ethnic/racial/caste group
Bengali	2.115224	15.325885
Other	1.010234	9.871282
Creole	14.073077	38.013855
Garifuna	38.147920	47.036982

dataset_2 = dataset[
    ["Ethnic/racial/caste group", "Sanitation (%)", "Drinking water (%)"]
][4:8]
dataset_2.set_index(["Ethnic/racial/caste group"], inplace=True)
dataset_2

	Sanitation (%)	Drinking water (%)
Ethnic/racial/caste group
Maya	7.811719	2.319572
Mestizo/Spanish/Latino	6.440338	3.184134
Other	2.080374	0.000000
Aymara	14.212216	5.715832

inner_join = pd.merge(
    dataset_1, dataset_2, how="inner", on="Ethnic/racial/caste group"
).drop_duplicates()
inner_join

	Child mortality (%)	Nutrition (%)	Sanitation (%)	Drinking water (%)
Ethnic/racial/caste group
Other	1.010234	9.871282	2.080374	0.0

outer_join = pd.merge(
    dataset_1, dataset_2, how="outer", on="Ethnic/racial/caste group"
).drop_duplicates()
outer_join

	Child mortality (%)	Nutrition (%)	Sanitation (%)	Drinking water (%)
Ethnic/racial/caste group
Bengali	2.115224	15.325885	NaN	NaN
Other	1.010234	9.871282	2.080374	0.000000
Creole	14.073077	38.013855	NaN	NaN
Garifuna	38.147920	47.036982	NaN	NaN
Maya	NaN	NaN	7.811719	2.319572
Mestizo/Spanish/Latino	NaN	NaN	6.440338	3.184134
Aymara	NaN	NaN	14.212216	5.715832

dataset_1.join(dataset_2, how="left").drop_duplicates()

	Child mortality (%)	Nutrition (%)	Sanitation (%)	Drinking water (%)
Ethnic/racial/caste group
Bengali	2.115224	15.325885	NaN	NaN
Other	1.010234	9.871282	2.080374	0.0
Creole	14.073077	38.013855	NaN	NaN
Garifuna	38.147920	47.036982	NaN	NaN

dataset_1.join(dataset_2, how="right").drop_duplicates()

	Child mortality (%)	Nutrition (%)	Sanitation (%)	Drinking water (%)
Ethnic/racial/caste group
Maya	NaN	NaN	7.811719	2.319572
Mestizo/Spanish/Latino	NaN	NaN	6.440338	3.184134
Other	1.010234	9.871282	2.080374	0.000000
Aymara	NaN	NaN	14.212216	5.715832

dataset_join_index = pd.merge(
    dataset_1, dataset_2, right_index=True, left_index=True
)
dataset_join_index

	Child mortality (%)	Nutrition (%)	Sanitation (%)	Drinking water (%)
Ethnic/racial/caste group
Other	1.010234	9.871282	2.080374	0.0