Ames housing#

No fake data, and much wrangling needed.

Importing libraries and packages#

 1import pandas as pd
 2import numpy as np
 3import matplotlib.pyplot as plt
 4import seaborn as sns
 5
 6# The style package adds support for easy-to-switch plotting
 7# "styles" with the same parameters as a matplotlib rc file
 8# (which is read at startup to configure Matplotlib).
 9# There are a number of pre-defined styles provided by Matplotlib.
10print(plt.style.available)
['Solarize_Light2', '_classic_test_patch', '_mpl-gallery', '_mpl-gallery-nogrid', 'bmh', 'classic', 'dark_background', 'fast', 'fivethirtyeight', 'ggplot', 'grayscale', 'seaborn', 'seaborn-bright', 'seaborn-colorblind', 'seaborn-dark', 'seaborn-dark-palette', 'seaborn-darkgrid', 'seaborn-deep', 'seaborn-muted', 'seaborn-notebook', 'seaborn-paper', 'seaborn-pastel', 'seaborn-poster', 'seaborn-talk', 'seaborn-ticks', 'seaborn-white', 'seaborn-whitegrid', 'tableau-colorblind10']
1plt.style.use("fivethirtyeight")
2
3# To render matplotlib images in Python notebooks at double the
4# standard resolution, providing users of high-resolution screens
5# with a better visualization experience, set the retina or
6# png2x option:
7# %config InlineBackend.figure_format = 'retina'

Set paths#

1# Path to datasets directory
2data_path = "./datasets"
3# Path to assets directory (for saving results to)
4assets_path = "./assets"

Loading dataset#

1# Get the data
2ames = pd.read_csv("datasets/AmesHousing.csv")
1ames.head()
Order PID MS SubClass MS Zoning Lot Frontage Lot Area Street Alley Lot Shape Land Contour ... Pool Area Pool QC Fence Misc Feature Misc Val Mo Sold Yr Sold Sale Type Sale Condition SalePrice
0 1 526301100 20 RL 141.0 31770 Pave NaN IR1 Lvl ... 0 NaN NaN NaN 0 5 2010 WD Normal 215000
1 2 526350040 20 RH 80.0 11622 Pave NaN Reg Lvl ... 0 NaN MnPrv NaN 0 6 2010 WD Normal 105000
2 3 526351010 20 RL 81.0 14267 Pave NaN IR1 Lvl ... 0 NaN NaN Gar2 12500 6 2010 WD Normal 172000
3 4 526353030 20 RL 93.0 11160 Pave NaN Reg Lvl ... 0 NaN NaN NaN 0 4 2010 WD Normal 244000
4 5 527105010 60 RL 74.0 13830 Pave NaN IR1 Lvl ... 0 NaN MnPrv NaN 0 3 2010 WD Normal 189900

5 rows × 82 columns

Wrangling#

1# Change columns names to lower case
2ames.columns = ames.columns.str.lower().str.replace(" ", "_")
3
4# Increase pandas default max_rows and max_colums
5pd.options.display.max_rows = 250
6pd.options.display.max_columns = 250
7
8# Look at shape, information on, and summary statistics of the dataset
9ames.shape
(2930, 82)
1ames.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2930 entries, 0 to 2929
Data columns (total 82 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   order            2930 non-null   int64  
 1   pid              2930 non-null   int64  
 2   ms_subclass      2930 non-null   int64  
 3   ms_zoning        2930 non-null   object 
 4   lot_frontage     2440 non-null   float64
 5   lot_area         2930 non-null   int64  
 6   street           2930 non-null   object 
 7   alley            198 non-null    object 
 8   lot_shape        2930 non-null   object 
 9   land_contour     2930 non-null   object 
 10  utilities        2930 non-null   object 
 11  lot_config       2930 non-null   object 
 12  land_slope       2930 non-null   object 
 13  neighborhood     2930 non-null   object 
 14  condition_1      2930 non-null   object 
 15  condition_2      2930 non-null   object 
 16  bldg_type        2930 non-null   object 
 17  house_style      2930 non-null   object 
 18  overall_qual     2930 non-null   int64  
 19  overall_cond     2930 non-null   int64  
 20  year_built       2930 non-null   int64  
 21  year_remod/add   2930 non-null   int64  
 22  roof_style       2930 non-null   object 
 23  roof_matl        2930 non-null   object 
 24  exterior_1st     2930 non-null   object 
 25  exterior_2nd     2930 non-null   object 
 26  mas_vnr_type     2907 non-null   object 
 27  mas_vnr_area     2907 non-null   float64
 28  exter_qual       2930 non-null   object 
 29  exter_cond       2930 non-null   object 
 30  foundation       2930 non-null   object 
 31  bsmt_qual        2850 non-null   object 
 32  bsmt_cond        2850 non-null   object 
 33  bsmt_exposure    2847 non-null   object 
 34  bsmtfin_type_1   2850 non-null   object 
 35  bsmtfin_sf_1     2929 non-null   float64
 36  bsmtfin_type_2   2849 non-null   object 
 37  bsmtfin_sf_2     2929 non-null   float64
 38  bsmt_unf_sf      2929 non-null   float64
 39  total_bsmt_sf    2929 non-null   float64
 40  heating          2930 non-null   object 
 41  heating_qc       2930 non-null   object 
 42  central_air      2930 non-null   object 
 43  electrical       2929 non-null   object 
 44  1st_flr_sf       2930 non-null   int64  
 45  2nd_flr_sf       2930 non-null   int64  
 46  low_qual_fin_sf  2930 non-null   int64  
 47  gr_liv_area      2930 non-null   int64  
 48  bsmt_full_bath   2928 non-null   float64
 49  bsmt_half_bath   2928 non-null   float64
 50  full_bath        2930 non-null   int64  
 51  half_bath        2930 non-null   int64  
 52  bedroom_abvgr    2930 non-null   int64  
 53  kitchen_abvgr    2930 non-null   int64  
 54  kitchen_qual     2930 non-null   object 
 55  totrms_abvgrd    2930 non-null   int64  
 56  functional       2930 non-null   object 
 57  fireplaces       2930 non-null   int64  
 58  fireplace_qu     1508 non-null   object 
 59  garage_type      2773 non-null   object 
 60  garage_yr_blt    2771 non-null   float64
 61  garage_finish    2771 non-null   object 
 62  garage_cars      2929 non-null   float64
 63  garage_area      2929 non-null   float64
 64  garage_qual      2771 non-null   object 
 65  garage_cond      2771 non-null   object 
 66  paved_drive      2930 non-null   object 
 67  wood_deck_sf     2930 non-null   int64  
 68  open_porch_sf    2930 non-null   int64  
 69  enclosed_porch   2930 non-null   int64  
 70  3ssn_porch       2930 non-null   int64  
 71  screen_porch     2930 non-null   int64  
 72  pool_area        2930 non-null   int64  
 73  pool_qc          13 non-null     object 
 74  fence            572 non-null    object 
 75  misc_feature     106 non-null    object 
 76  misc_val         2930 non-null   int64  
 77  mo_sold          2930 non-null   int64  
 78  yr_sold          2930 non-null   int64  
 79  sale_type        2930 non-null   object 
 80  sale_condition   2930 non-null   object 
 81  saleprice        2930 non-null   int64  
dtypes: float64(11), int64(28), object(43)
memory usage: 1.8+ MB
1ames.describe().T
count mean std min 25% 50% 75% max
order 2930.0 1.465500e+03 8.459625e+02 1.0 7.332500e+02 1465.5 2.197750e+03 2.930000e+03
pid 2930.0 7.144645e+08 1.887308e+08 526301100.0 5.284770e+08 535453620.0 9.071811e+08 1.007100e+09
ms_subclass 2930.0 5.738737e+01 4.263802e+01 20.0 2.000000e+01 50.0 7.000000e+01 1.900000e+02
lot_frontage 2440.0 6.922459e+01 2.336533e+01 21.0 5.800000e+01 68.0 8.000000e+01 3.130000e+02
lot_area 2930.0 1.014792e+04 7.880018e+03 1300.0 7.440250e+03 9436.5 1.155525e+04 2.152450e+05
overall_qual 2930.0 6.094881e+00 1.411026e+00 1.0 5.000000e+00 6.0 7.000000e+00 1.000000e+01
overall_cond 2930.0 5.563140e+00 1.111537e+00 1.0 5.000000e+00 5.0 6.000000e+00 9.000000e+00
year_built 2930.0 1.971356e+03 3.024536e+01 1872.0 1.954000e+03 1973.0 2.001000e+03 2.010000e+03
year_remod/add 2930.0 1.984267e+03 2.086029e+01 1950.0 1.965000e+03 1993.0 2.004000e+03 2.010000e+03
mas_vnr_area 2907.0 1.018968e+02 1.791126e+02 0.0 0.000000e+00 0.0 1.640000e+02 1.600000e+03
bsmtfin_sf_1 2929.0 4.426296e+02 4.555908e+02 0.0 0.000000e+00 370.0 7.340000e+02 5.644000e+03
bsmtfin_sf_2 2929.0 4.972243e+01 1.691685e+02 0.0 0.000000e+00 0.0 0.000000e+00 1.526000e+03
bsmt_unf_sf 2929.0 5.592625e+02 4.394942e+02 0.0 2.190000e+02 466.0 8.020000e+02 2.336000e+03
total_bsmt_sf 2929.0 1.051615e+03 4.406151e+02 0.0 7.930000e+02 990.0 1.302000e+03 6.110000e+03
1st_flr_sf 2930.0 1.159558e+03 3.918909e+02 334.0 8.762500e+02 1084.0 1.384000e+03 5.095000e+03
2nd_flr_sf 2930.0 3.354560e+02 4.283957e+02 0.0 0.000000e+00 0.0 7.037500e+02 2.065000e+03
low_qual_fin_sf 2930.0 4.676792e+00 4.631051e+01 0.0 0.000000e+00 0.0 0.000000e+00 1.064000e+03
gr_liv_area 2930.0 1.499690e+03 5.055089e+02 334.0 1.126000e+03 1442.0 1.742750e+03 5.642000e+03
bsmt_full_bath 2928.0 4.313525e-01 5.248202e-01 0.0 0.000000e+00 0.0 1.000000e+00 3.000000e+00
bsmt_half_bath 2928.0 6.113388e-02 2.452536e-01 0.0 0.000000e+00 0.0 0.000000e+00 2.000000e+00
full_bath 2930.0 1.566553e+00 5.529406e-01 0.0 1.000000e+00 2.0 2.000000e+00 4.000000e+00
half_bath 2930.0 3.795222e-01 5.026293e-01 0.0 0.000000e+00 0.0 1.000000e+00 2.000000e+00
bedroom_abvgr 2930.0 2.854266e+00 8.277311e-01 0.0 2.000000e+00 3.0 3.000000e+00 8.000000e+00
kitchen_abvgr 2930.0 1.044369e+00 2.140762e-01 0.0 1.000000e+00 1.0 1.000000e+00 3.000000e+00
totrms_abvgrd 2930.0 6.443003e+00 1.572964e+00 2.0 5.000000e+00 6.0 7.000000e+00 1.500000e+01
fireplaces 2930.0 5.993174e-01 6.479209e-01 0.0 0.000000e+00 1.0 1.000000e+00 4.000000e+00
garage_yr_blt 2771.0 1.978132e+03 2.552841e+01 1895.0 1.960000e+03 1979.0 2.002000e+03 2.207000e+03
garage_cars 2929.0 1.766815e+00 7.605664e-01 0.0 1.000000e+00 2.0 2.000000e+00 5.000000e+00
garage_area 2929.0 4.728197e+02 2.150465e+02 0.0 3.200000e+02 480.0 5.760000e+02 1.488000e+03
wood_deck_sf 2930.0 9.375188e+01 1.263616e+02 0.0 0.000000e+00 0.0 1.680000e+02 1.424000e+03
open_porch_sf 2930.0 4.753345e+01 6.748340e+01 0.0 0.000000e+00 27.0 7.000000e+01 7.420000e+02
enclosed_porch 2930.0 2.301160e+01 6.413906e+01 0.0 0.000000e+00 0.0 0.000000e+00 1.012000e+03
3ssn_porch 2930.0 2.592491e+00 2.514133e+01 0.0 0.000000e+00 0.0 0.000000e+00 5.080000e+02
screen_porch 2930.0 1.600205e+01 5.608737e+01 0.0 0.000000e+00 0.0 0.000000e+00 5.760000e+02
pool_area 2930.0 2.243345e+00 3.559718e+01 0.0 0.000000e+00 0.0 0.000000e+00 8.000000e+02
misc_val 2930.0 5.063515e+01 5.663443e+02 0.0 0.000000e+00 0.0 0.000000e+00 1.700000e+04
mo_sold 2930.0 6.216041e+00 2.714492e+00 1.0 4.000000e+00 6.0 8.000000e+00 1.200000e+01
yr_sold 2930.0 2.007790e+03 1.316613e+00 2006.0 2.007000e+03 2008.0 2.009000e+03 2.010000e+03
saleprice 2930.0 1.807961e+05 7.988669e+04 12789.0 1.295000e+05 160000.0 2.135000e+05 7.550000e+05
1# Investigate nulls
2ames.isnull().sum()[0:90]
order                 0
pid                   0
ms_subclass           0
ms_zoning             0
lot_frontage        490
lot_area              0
street                0
alley              2732
lot_shape             0
land_contour          0
utilities             0
lot_config            0
land_slope            0
neighborhood          0
condition_1           0
condition_2           0
bldg_type             0
house_style           0
overall_qual          0
overall_cond          0
year_built            0
year_remod/add        0
roof_style            0
roof_matl             0
exterior_1st          0
exterior_2nd          0
mas_vnr_type         23
mas_vnr_area         23
exter_qual            0
exter_cond            0
foundation            0
bsmt_qual            80
bsmt_cond            80
bsmt_exposure        83
bsmtfin_type_1       80
bsmtfin_sf_1          1
bsmtfin_type_2       81
bsmtfin_sf_2          1
bsmt_unf_sf           1
total_bsmt_sf         1
heating               0
heating_qc            0
central_air           0
electrical            1
1st_flr_sf            0
2nd_flr_sf            0
low_qual_fin_sf       0
gr_liv_area           0
bsmt_full_bath        2
bsmt_half_bath        2
full_bath             0
half_bath             0
bedroom_abvgr         0
kitchen_abvgr         0
kitchen_qual          0
totrms_abvgrd         0
functional            0
fireplaces            0
fireplace_qu       1422
garage_type         157
garage_yr_blt       159
garage_finish       159
garage_cars           1
garage_area           1
garage_qual         159
garage_cond         159
paved_drive           0
wood_deck_sf          0
open_porch_sf         0
enclosed_porch        0
3ssn_porch            0
screen_porch          0
pool_area             0
pool_qc            2917
fence              2358
misc_feature       2824
misc_val              0
mo_sold               0
yr_sold               0
sale_type             0
sale_condition        0
saleprice             0
dtype: int64
1# Missing values: Lot Frontage (Linear feet of street connected to property)
2ames["lot_frontage"].value_counts().head()
60.0    276
80.0    137
70.0    133
50.0    117
75.0    105
Name: lot_frontage, dtype: int64
1# Filter to examine only null values:
2lot_nulls = ames[ames["lot_frontage"].isna()]
3lot_nulls.head()
order pid ms_subclass ms_zoning lot_frontage lot_area street alley lot_shape land_contour utilities lot_config land_slope neighborhood condition_1 condition_2 bldg_type house_style overall_qual overall_cond year_built year_remod/add roof_style roof_matl exterior_1st exterior_2nd mas_vnr_type mas_vnr_area exter_qual exter_cond foundation bsmt_qual bsmt_cond bsmt_exposure bsmtfin_type_1 bsmtfin_sf_1 bsmtfin_type_2 bsmtfin_sf_2 bsmt_unf_sf total_bsmt_sf heating heating_qc central_air electrical 1st_flr_sf 2nd_flr_sf low_qual_fin_sf gr_liv_area bsmt_full_bath bsmt_half_bath full_bath half_bath bedroom_abvgr kitchen_abvgr kitchen_qual totrms_abvgrd functional fireplaces fireplace_qu garage_type garage_yr_blt garage_finish garage_cars garage_area garage_qual garage_cond paved_drive wood_deck_sf open_porch_sf enclosed_porch 3ssn_porch screen_porch pool_area pool_qc fence misc_feature misc_val mo_sold yr_sold sale_type sale_condition saleprice
11 12 527165230 20 RL NaN 7980 Pave NaN IR1 Lvl AllPub Inside Gtl Gilbert Norm Norm 1Fam 1Story 6 7 1992 2007 Gable CompShg HdBoard HdBoard None 0.0 TA Gd PConc Gd TA No ALQ 935.0 Unf 0.0 233.0 1168.0 GasA Ex Y SBrkr 1187 0 0 1187 1.0 0.0 2 0 3 1 TA 6 Typ 0 NaN Attchd 1992.0 Fin 2.0 420.0 TA TA Y 483 21 0 0 0 0 NaN GdPrv Shed 500 3 2010 WD Normal 185000
14 15 527182190 120 RL NaN 6820 Pave NaN IR1 Lvl AllPub Corner Gtl StoneBr Norm Norm TwnhsE 1Story 8 5 1985 1985 Gable CompShg HdBoard HdBoard None 0.0 Gd TA PConc Gd TA Av GLQ 368.0 BLQ 1120.0 0.0 1488.0 GasA TA Y SBrkr 1502 0 0 1502 1.0 0.0 1 1 1 1 Gd 4 Typ 0 NaN Attchd 1985.0 RFn 2.0 528.0 TA TA Y 0 54 0 0 140 0 NaN NaN NaN 0 6 2010 WD Normal 212000
22 23 527368020 60 FV NaN 7500 Pave NaN Reg Lvl AllPub Inside Gtl Somerst Norm Norm 1Fam 2Story 7 5 2000 2000 Gable CompShg VinylSd VinylSd None 0.0 Gd TA PConc Gd TA No GLQ 533.0 Unf 0.0 281.0 814.0 GasA Ex Y SBrkr 814 860 0 1674 1.0 0.0 2 1 3 1 Gd 7 Typ 0 NaN Attchd 2000.0 RFn 2.0 663.0 TA TA Y 0 96 0 0 0 0 NaN NaN NaN 0 1 2010 WD Normal 216000
23 24 527402200 20 RL NaN 11241 Pave NaN IR1 Lvl AllPub CulDSac Gtl NAmes Norm Norm 1Fam 1Story 6 7 1970 1970 Gable CompShg Wd Sdng Wd Sdng BrkFace 180.0 TA TA CBlock TA TA No ALQ 578.0 Unf 0.0 426.0 1004.0 GasA Ex Y SBrkr 1004 0 0 1004 1.0 0.0 1 0 2 1 TA 5 Typ 1 TA Attchd 1970.0 Fin 2.0 480.0 TA TA Y 0 0 0 0 0 0 NaN NaN Shed 700 3 2010 WD Normal 149000
24 25 527402250 20 RL NaN 12537 Pave NaN IR1 Lvl AllPub CulDSac Gtl NAmes Norm Norm 1Fam 1Story 5 6 1971 2008 Gable CompShg VinylSd VinylSd None 0.0 TA TA CBlock TA TA No GLQ 734.0 Unf 0.0 344.0 1078.0 GasA Ex Y SBrkr 1078 0 0 1078 1.0 0.0 1 1 3 1 TA 6 Typ 1 Fa Attchd 1971.0 Fin 2.0 500.0 TA TA Y 0 0 0 0 0 0 NaN NaN NaN 0 4 2010 WD Normal 149900
1# Group lots by lot_config
2grouped_lots = ames.groupby(["lot_config"]).agg(
3    {
4        "lot_frontage": "mean",
5    }
6)
7grouped_lots
lot_frontage
lot_config
Corner 82.690418
CulDSac 56.304348
FR2 61.276923
FR3 79.300000
Inside 67.147374
 1# Impute the missing values with the mean for that lot configuration type:
 2ames.loc[
 3    (ames["lot_frontage"].isna()) & (ames["lot_config"] == "Corner"),
 4    "lot_frontage",
 5] = 83.25
 6ames.loc[
 7    (ames["lot_frontage"].isna()) & (ames["lot_config"] == "CulDSac"),
 8    "lot_frontage",
 9] = 54.734375
10ames.loc[
11    (ames["lot_frontage"].isna()) & (ames["lot_config"] == "FR2"),
12    "lot_frontage",
13] = 60.836735
14ames.loc[
15    (ames["lot_frontage"].isna()) & (ames["lot_config"] == "FR3"),
16    "lot_frontage",
17] = 87.000000
18ames.loc[
19    (ames["lot_frontage"].isna()) & (ames["lot_config"] == "Inside"),
20    "lot_frontage",
21] = 66.759571
22
23# Check no null values are left
24ames["lot_frontage"].isna().sum()
0
1# Check for outliers:
2ames["lot_frontage"].sort_values().tail()
1745    182.0
1638    195.0
2278    200.0
1498    313.0
1265    313.0
Name: lot_frontage, dtype: float64
1x_frontage_maybe_drop = ames[ames["lot_frontage"] == 313]
2x_frontage_maybe_drop
order pid ms_subclass ms_zoning lot_frontage lot_area street alley lot_shape land_contour utilities lot_config land_slope neighborhood condition_1 condition_2 bldg_type house_style overall_qual overall_cond year_built year_remod/add roof_style roof_matl exterior_1st exterior_2nd mas_vnr_type mas_vnr_area exter_qual exter_cond foundation bsmt_qual bsmt_cond bsmt_exposure bsmtfin_type_1 bsmtfin_sf_1 bsmtfin_type_2 bsmtfin_sf_2 bsmt_unf_sf total_bsmt_sf heating heating_qc central_air electrical 1st_flr_sf 2nd_flr_sf low_qual_fin_sf gr_liv_area bsmt_full_bath bsmt_half_bath full_bath half_bath bedroom_abvgr kitchen_abvgr kitchen_qual totrms_abvgrd functional fireplaces fireplace_qu garage_type garage_yr_blt garage_finish garage_cars garage_area garage_qual garage_cond paved_drive wood_deck_sf open_porch_sf enclosed_porch 3ssn_porch screen_porch pool_area pool_qc fence misc_feature misc_val mo_sold yr_sold sale_type sale_condition saleprice
1265 1266 535427070 20 RL 313.0 27650 Pave NaN IR2 HLS AllPub Inside Mod NAmes PosA Norm 1Fam 1Story 7 7 1960 2007 Flat Tar&Grv Wd Sdng Wd Sdng None 0.0 TA TA CBlock Gd TA Gd GLQ 425.0 Unf 0.0 160.0 585.0 GasA Ex Y SBrkr 2069 0 0 2069 1.0 0.0 2 0 4 1 Gd 9 Typ 1 Gd Attchd 1960.0 RFn 2.0 505.0 TA TA Y 0 0 0 0 0 0 NaN NaN NaN 0 11 2008 WD Normal 242000
1498 1499 908154235 60 RL 313.0 63887 Pave NaN IR3 Bnk AllPub Corner Gtl Edwards Feedr Norm 1Fam 2Story 10 5 2008 2008 Hip ClyTile Stucco Stucco Stone 796.0 Ex TA PConc Ex TA Gd GLQ 5644.0 Unf 0.0 466.0 6110.0 GasA Ex Y SBrkr 4692 950 0 5642 2.0 0.0 2 1 3 1 Ex 12 Typ 3 Gd Attchd 2008.0 Fin 2.0 1418.0 TA TA Y 214 292 0 0 0 480 Gd NaN NaN 0 1 2008 New Partial 160000
1# Missing values: Alleys
2ames["alley"].value_counts()
Grvl    120
Pave     78
Name: alley, dtype: int64
1ames["alley"].isnull().sum()
2732
1# 2732 houses without alley. Maybe hot encode this one. Later.
2# For now replace NaN with NA
3ames["alley"] = ames["alley"].replace(np.nan, "NA")
4
5# Check with:
6ames["alley"].value_counts()
NA      2732
Grvl     120
Pave      78
Name: alley, dtype: int64
1# Missing values: Masonry veneer type and area
2# (mas_vnr_type resp. mas_vnr_area)
3print(ames["mas_vnr_type"].value_counts())
4print(ames["mas_vnr_area"].value_counts())
5
6ames[ames["mas_vnr_type"].isna()].head()
None       1752
BrkFace     880
Stone       249
BrkCmn       25
CBlock        1
Name: mas_vnr_type, dtype: int64
0.0       1748
120.0       15
176.0       13
200.0       13
216.0       12
          ... 
714.0        1
647.0        1
1290.0       1
495.0        1
443.0        1
Name: mas_vnr_area, Length: 445, dtype: int64
order pid ms_subclass ms_zoning lot_frontage lot_area street alley lot_shape land_contour utilities lot_config land_slope neighborhood condition_1 condition_2 bldg_type house_style overall_qual overall_cond year_built year_remod/add roof_style roof_matl exterior_1st exterior_2nd mas_vnr_type mas_vnr_area exter_qual exter_cond foundation bsmt_qual bsmt_cond bsmt_exposure bsmtfin_type_1 bsmtfin_sf_1 bsmtfin_type_2 bsmtfin_sf_2 bsmt_unf_sf total_bsmt_sf heating heating_qc central_air electrical 1st_flr_sf 2nd_flr_sf low_qual_fin_sf gr_liv_area bsmt_full_bath bsmt_half_bath full_bath half_bath bedroom_abvgr kitchen_abvgr kitchen_qual totrms_abvgrd functional fireplaces fireplace_qu garage_type garage_yr_blt garage_finish garage_cars garage_area garage_qual garage_cond paved_drive wood_deck_sf open_porch_sf enclosed_porch 3ssn_porch screen_porch pool_area pool_qc fence misc_feature misc_val mo_sold yr_sold sale_type sale_condition saleprice
55 56 528240070 60 RL 66.759571 7851 Pave NA Reg Lvl AllPub Inside Gtl Gilbert Norm Norm 1Fam 2Story 6 5 2002 2002 Gable CompShg VinylSd VinylSd NaN NaN Gd TA PConc Gd TA No GLQ 625.0 Unf 0.0 235.0 860.0 GasA Ex Y SBrkr 860 1100 0 1960 1.0 0.0 2 1 4 1 Gd 8 Typ 2 TA BuiltIn 2002.0 Fin 2.0 440.0 TA TA Y 288 48 0 0 0 0 NaN NaN NaN 0 5 2010 WD Normal 216500
484 485 528275160 60 RL 83.250000 12891 Pave NA IR1 Lvl AllPub Corner Gtl Gilbert Norm Norm 1Fam 2Story 8 5 2002 2002 Gable CompShg VinylSd VinylSd NaN NaN Gd TA PConc Ex TA No GLQ 651.0 Unf 0.0 219.0 870.0 GasA Ex Y SBrkr 878 1126 0 2004 1.0 0.0 2 1 4 1 Gd 8 Typ 1 Gd BuiltIn 2002.0 Fin 3.0 644.0 TA TA Y 0 48 0 0 0 0 NaN NaN NaN 0 7 2009 WD Normal 257500
517 518 528458020 20 FV 90.000000 7993 Pave NA IR1 Lvl AllPub Inside Gtl Somerst Norm Norm 1Fam 1Story 7 5 2008 2009 Gable CompShg VinylSd VinylSd NaN NaN Gd TA PConc Ex TA No Unf 0.0 Unf 0.0 1436.0 1436.0 GasA Ex Y SBrkr 1436 0 0 1436 0.0 0.0 2 0 3 1 Gd 6 Typ 0 NaN Attchd 2008.0 Fin 2.0 529.0 TA TA Y 0 121 0 0 0 0 NaN NaN NaN 0 10 2009 New Partial 225000
538 539 531371050 20 RL 67.000000 10083 Pave NA Reg Lvl AllPub Inside Gtl SawyerW Norm Norm 1Fam 1Story 7 5 2003 2003 Gable CompShg VinylSd VinylSd NaN NaN Gd TA PConc Gd TA No GLQ 833.0 Unf 0.0 343.0 1176.0 GasA Ex Y SBrkr 1200 0 0 1200 1.0 0.0 2 0 2 1 Gd 5 Typ 0 NaN Attchd 2003.0 RFn 2.0 555.0 TA TA Y 0 41 0 0 0 0 NaN NaN NaN 0 8 2009 WD Normal 184900
867 868 907260030 60 RL 70.000000 8749 Pave NA Reg Lvl AllPub Inside Gtl CollgCr Norm Norm 1Fam 2Story 7 5 2002 2002 Gable CompShg VinylSd VinylSd NaN NaN Gd TA PConc Gd TA No Unf 0.0 Unf 0.0 840.0 840.0 GasA Ex Y SBrkr 840 885 0 1725 0.0 0.0 2 1 3 1 Gd 6 Typ 0 NaN Attchd 2002.0 RFn 2.0 550.0 TA TA Y 0 48 0 0 0 0 NaN NaN NaN 0 11 2009 WD Normal 198500
1# Replace masonry type np.nan with None (most frequent)
2ames["mas_vnr_type"] = ames["mas_vnr_type"].replace(np.nan, "None")
3
4# Replace masonry area np.nan with 0 (most frequent)
5ames["mas_vnr_area"] = ames["mas_vnr_area"].replace(np.nan, 0)
1# Missing values: Basement related variables
2ames["bsmt_qual"].value_counts()
TA    1283
Gd    1219
Ex     258
Fa      88
Po       2
Name: bsmt_qual, dtype: int64
 1# Missing NA if no basement
 2ames[
 3    [
 4        "bsmt_qual",
 5        "bsmt_cond",
 6        "bsmt_exposure",
 7        "bsmtfin_type_1",
 8        "bsmtfin_sf_1",
 9        "bsmtfin_type_2",
10        "bsmtfin_sf_2",
11        "bsmt_unf_sf",
12        "total_bsmt_sf",
13        "bsmt_full_bath",
14        "bsmt_half_bath",
15    ]
16].isnull().sum()
bsmt_qual         80
bsmt_cond         80
bsmt_exposure     83
bsmtfin_type_1    80
bsmtfin_sf_1       1
bsmtfin_type_2    81
bsmtfin_sf_2       1
bsmt_unf_sf        1
total_bsmt_sf      1
bsmt_full_bath     2
bsmt_half_bath     2
dtype: int64
 1# NA if no basement
 2basement_to_na = [
 3    "bsmt_qual",
 4    "bsmt_cond",
 5    "bsmt_exposure",
 6    "bsmtfin_type_1",
 7    "bsmtfin_type_2",
 8]
 9
10# 0 (sqft) if basement is NA
11basement_to_zero = [
12    "bsmtfin_sf_2",
13    "bsmt_unf_sf",
14    "total_bsmt_sf",
15    "bsmtfin_sf_1",
16    "bsmt_full_bath",
17    "bsmt_half_bath",
18]
19
20for i in basement_to_na:
21    ames[i] = ames[i].replace(np.nan, "NA")
22
23# The missing values are for NA basement properties.
24# Setting sqft value to zero.
25ames[ames["bsmtfin_sf_1"].isna()]
26
27# Set sqft to 0 IF basement is NA, else drop as incomplete
28# basement data and only 1 null record for each
29for i in basement_to_zero:
30    ames[i] = ames[i].replace(np.nan, 0)
1# Missing values: Fireplace quality
2ames["fireplace_qu"].isnull().sum()
1422
1len(ames["fireplace_qu"])
2930
1ames["fireplace_qu"].unique()
array(['Gd', nan, 'TA', 'Po', 'Ex', 'Fa'], dtype=object)
1# Check houses without fireplaces are also houses where fireplace
2# quality is a missing value
3len(ames[(ames["fireplace_qu"].isna()) & (ames["fireplaces"] < 1)])
1422
1# Yep, so replace:
2ames["fireplace_qu"] = ames["fireplace_qu"].replace(np.nan, "NA")
1# Missing values: Garage related features
2ames[
3    ["garage_type", "garage_finish", "garage_qual", "garage_cond"]
4].isnull().sum()
garage_type      157
garage_finish    159
garage_qual      159
garage_cond      159
dtype: int64
 1garage_to_na = ["garage_type", "garage_finish", "garage_qual", "garage_cond"]
 2garage_to_0 = ["garage_yr_blt", "garage_cars", "garage_area"]
 3
 4# Replace ordinal columns with NA based on data dictionary:
 5for i in garage_to_na:
 6    ames[i] = ames[i].replace(np.nan, "NA")
 7
 8for i in basement_to_zero:
 9    ames[i] = ames[i].replace(np.nan, 0)
10
11# Not getting that to work, so one by one:
12
13ames["garage_yr_blt"] = ames["garage_yr_blt"].replace(np.nan, 0)
14ames["garage_cars"] = ames["garage_cars"].replace(np.nan, 0)
15ames["garage_area"] = ames["garage_area"].replace(np.nan, 0)
 1ames[
 2    [
 3        "garage_type",
 4        "garage_finish",
 5        "garage_qual",
 6        "garage_cond",
 7        "garage_yr_blt",
 8        "garage_cars",
 9        "garage_area",
10    ]
11].isnull().sum()
garage_type      0
garage_finish    0
garage_qual      0
garage_cond      0
garage_yr_blt    0
garage_cars      0
garage_area      0
dtype: int64
1# Missing values: Pool
2ames["pool_qc"].isnull().sum()
2917
1print(ames["pool_qc"].unique())
[nan 'Ex' 'Gd' 'TA' 'Fa']
1# For now:
2for i in ames[(ames["pool_qc"].isna()) & (ames["pool_area"] == 0)]:
3    ames["pool_qc"] = ames["pool_qc"].replace(np.nan, "NA")
1# Missing values: Fence feature
2ames["fence"].isnull().sum()
2358
1ames["fence"] = ames["fence"].replace(np.nan, "NA")
2ames["fence"].isnull().sum()
0
1# Missing values: Misc feature
2ames["misc_feature"].isnull().sum()
2824
1for i in ames[(ames["misc_feature"].isna()) & (ames["misc_val"] == 0)]:
2    ames["misc_feature"] = ames["misc_feature"].replace(np.nan, "NA")
3
4ames["misc_feature"].isnull().sum()
0
1ames.isnull().sum()
order              0
pid                0
ms_subclass        0
ms_zoning          0
lot_frontage       0
lot_area           0
street             0
alley              0
lot_shape          0
land_contour       0
utilities          0
lot_config         0
land_slope         0
neighborhood       0
condition_1        0
condition_2        0
bldg_type          0
house_style        0
overall_qual       0
overall_cond       0
year_built         0
year_remod/add     0
roof_style         0
roof_matl          0
exterior_1st       0
exterior_2nd       0
mas_vnr_type       0
mas_vnr_area       0
exter_qual         0
exter_cond         0
foundation         0
bsmt_qual          0
bsmt_cond          0
bsmt_exposure      0
bsmtfin_type_1     0
bsmtfin_sf_1       0
bsmtfin_type_2     0
bsmtfin_sf_2       0
bsmt_unf_sf        0
total_bsmt_sf      0
heating            0
heating_qc         0
central_air        0
electrical         1
1st_flr_sf         0
2nd_flr_sf         0
low_qual_fin_sf    0
gr_liv_area        0
bsmt_full_bath     0
bsmt_half_bath     0
full_bath          0
half_bath          0
bedroom_abvgr      0
kitchen_abvgr      0
kitchen_qual       0
totrms_abvgrd      0
functional         0
fireplaces         0
fireplace_qu       0
garage_type        0
garage_yr_blt      0
garage_finish      0
garage_cars        0
garage_area        0
garage_qual        0
garage_cond        0
paved_drive        0
wood_deck_sf       0
open_porch_sf      0
enclosed_porch     0
3ssn_porch         0
screen_porch       0
pool_area          0
pool_qc            0
fence              0
misc_feature       0
misc_val           0
mo_sold            0
yr_sold            0
sale_type          0
sale_condition     0
saleprice          0
dtype: int64
  1# Check the datatypes against the data dictionary to make sure
  2# the data frame is correctly reading the data and create lists
  3# of each data type for use in EDA
  4ames_indexing = [["id", "pid"]]
  5
  6ames_ordinal = ames[
  7    [
  8        "saleprice",
  9        "lot_shape",
 10        "utilities",
 11        "land_slope",
 12        "overall_qual",
 13        "overall_cond",
 14        "exter_qual",
 15        "exter_cond",
 16        "bsmt_qual",
 17        "bsmt_cond",
 18        "bsmt_exposure",
 19        "bsmtfin_type_1",
 20        "bsmtfin_type_2",
 21        "heating_qc",
 22        "electrical",
 23        "kitchen_qual",
 24        "functional",
 25        "fireplace_qu",
 26        "garage_finish",
 27        "garage_qual",
 28        "garage_cond",
 29        "paved_drive",
 30        "pool_qc",
 31        "fence",
 32    ]
 33]
 34
 35ames_nominal = ames[
 36    [
 37        "ms_subclass",
 38        "ms_zoning",
 39        "street",
 40        "alley",
 41        "land_contour",
 42        "lot_config",
 43        "neighborhood",
 44        "condition_1",
 45        "condition_2",
 46        "bldg_type",
 47        "house_style",
 48        "roof_style",
 49        "roof_matl",
 50        "exterior_1st",
 51        "exterior_2nd",
 52        "mas_vnr_type",
 53        "foundation",
 54        "heating",
 55        "central_air",
 56        "garage_type",
 57        "misc_feature",
 58        "sale_type",
 59    ]
 60]
 61
 62ames_categorical = ames[
 63    [
 64        "street",
 65        "alley",
 66        "lot_shape",
 67        "land_contour",
 68        "utilities",
 69        "lot_config",
 70        "land_slope",
 71        "neighborhood",
 72        "condition_1",
 73        "condition_2",
 74        "bldg_type",
 75        "house_style",
 76        "roof_style",
 77        "roof_matl",
 78        "exterior_1st",
 79        "exterior_2nd",
 80        "mas_vnr_type",
 81        "exter_qual",
 82        "exter_cond",
 83        "foundation",
 84        "bsmt_qual",
 85        "bsmt_cond",
 86        "bsmt_exposure",
 87        "bsmtfin_type_1",
 88        "bsmtfin_type_2",
 89        "heating",
 90        "heating_qc",
 91        "central_air",
 92        "electrical",
 93        "kitchen_qual",
 94        "functional",
 95        "fireplace_qu",
 96        "garage_type",
 97        "garage_finish",
 98        "garage_qual",
 99        "garage_cond",
100        "paved_drive",
101        "pool_qc",
102        "fence",
103        "misc_feature",
104        "sale_type",
105    ]
106]
107
108ames_numeric_category = ames[
109    [
110        "saleprice",
111        "ms_subclass",
112        "ms_zoning",
113        "overall_qual",
114        "overall_cond",
115        "mo_sold",
116    ]
117]
118
119ames_numeric = ames[
120    [
121        "saleprice",
122        "lot_frontage",
123        "lot_area",
124        "year_built",
125        "year_remod/add",
126        "mas_vnr_area",
127        "bsmtfin_sf_2",
128        "bsmt_unf_sf",
129        "total_bsmt_sf",
130        "1st_flr_sf",
131        "2nd_flr_sf",
132        "low_qual_fin_sf",
133        "gr_liv_area",
134        "bsmt_full_bath",
135        "bsmt_half_bath",
136        "full_bath",
137        "half_bath",
138        "bedroom_abvgr",
139        "kitchen_abvgr",
140        "totrms_abvgrd",
141        "fireplaces",
142        "garage_yr_blt",
143        "garage_cars",
144        "garage_area",
145        "wood_deck_sf",
146        "open_porch_sf",
147        "enclosed_porch",
148        "3ssn_porch",
149        "screen_porch",
150        "pool_area",
151        "misc_val",
152        "yr_sold",
153    ]
154]
155
156# Target variable
157y = ames["saleprice"]
1sns.pairplot(ames_numeric_category)
<seaborn.axisgrid.PairGrid at 0x7eff33cf86a0>
../../_images/a1ce9222edd4281aeb9a31ab724fbadc0df995888aaa443c26f5617710b9a5d9.png
1# Ordinal columns
2ames_ordinal.head()
saleprice lot_shape utilities land_slope overall_qual overall_cond exter_qual exter_cond bsmt_qual bsmt_cond bsmt_exposure bsmtfin_type_1 bsmtfin_type_2 heating_qc electrical kitchen_qual functional fireplace_qu garage_finish garage_qual garage_cond paved_drive pool_qc fence
0 215000 IR1 AllPub Gtl 6 5 TA TA TA Gd Gd BLQ Unf Fa SBrkr TA Typ Gd Fin TA TA P NA NA
1 105000 Reg AllPub Gtl 5 6 TA TA TA TA No Rec LwQ TA SBrkr TA Typ NA Unf TA TA Y NA MnPrv
2 172000 IR1 AllPub Gtl 6 6 TA TA TA TA No ALQ Unf TA SBrkr Gd Typ NA Unf TA TA Y NA NA
3 244000 Reg AllPub Gtl 7 5 Gd TA TA TA No ALQ Unf Ex SBrkr Ex Typ TA Fin TA TA Y NA NA
4 189900 IR1 AllPub Gtl 5 5 TA TA Gd TA No GLQ Unf Gd SBrkr TA Typ TA Fin TA TA Y NA MnPrv