Ames housing#
No fake data, and much wrangling needed.
Importing libraries and packages#
1import pandas as pd
2import numpy as np
3import matplotlib.pyplot as plt
4import seaborn as sns
5
6# The style package adds support for easy-to-switch plotting
7# "styles" with the same parameters as a matplotlib rc file
8# (which is read at startup to configure Matplotlib).
9# There are a number of pre-defined styles provided by Matplotlib.
10print(plt.style.available)
['Solarize_Light2', '_classic_test_patch', '_mpl-gallery', '_mpl-gallery-nogrid', 'bmh', 'classic', 'dark_background', 'fast', 'fivethirtyeight', 'ggplot', 'grayscale', 'seaborn', 'seaborn-bright', 'seaborn-colorblind', 'seaborn-dark', 'seaborn-dark-palette', 'seaborn-darkgrid', 'seaborn-deep', 'seaborn-muted', 'seaborn-notebook', 'seaborn-paper', 'seaborn-pastel', 'seaborn-poster', 'seaborn-talk', 'seaborn-ticks', 'seaborn-white', 'seaborn-whitegrid', 'tableau-colorblind10']
1plt.style.use("fivethirtyeight")
2
3# To render matplotlib images in Python notebooks at double the
4# standard resolution, providing users of high-resolution screens
5# with a better visualization experience, set the retina or
6# png2x option:
7# %config InlineBackend.figure_format = 'retina'
Set paths#
1# Path to datasets directory
2data_path = "./datasets"
3# Path to assets directory (for saving results to)
4assets_path = "./assets"
Loading dataset#
1# Get the data
2ames = pd.read_csv("datasets/AmesHousing.csv")
1ames.head()
Order | PID | MS SubClass | MS Zoning | Lot Frontage | Lot Area | Street | Alley | Lot Shape | Land Contour | ... | Pool Area | Pool QC | Fence | Misc Feature | Misc Val | Mo Sold | Yr Sold | Sale Type | Sale Condition | SalePrice | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 526301100 | 20 | RL | 141.0 | 31770 | Pave | NaN | IR1 | Lvl | ... | 0 | NaN | NaN | NaN | 0 | 5 | 2010 | WD | Normal | 215000 |
1 | 2 | 526350040 | 20 | RH | 80.0 | 11622 | Pave | NaN | Reg | Lvl | ... | 0 | NaN | MnPrv | NaN | 0 | 6 | 2010 | WD | Normal | 105000 |
2 | 3 | 526351010 | 20 | RL | 81.0 | 14267 | Pave | NaN | IR1 | Lvl | ... | 0 | NaN | NaN | Gar2 | 12500 | 6 | 2010 | WD | Normal | 172000 |
3 | 4 | 526353030 | 20 | RL | 93.0 | 11160 | Pave | NaN | Reg | Lvl | ... | 0 | NaN | NaN | NaN | 0 | 4 | 2010 | WD | Normal | 244000 |
4 | 5 | 527105010 | 60 | RL | 74.0 | 13830 | Pave | NaN | IR1 | Lvl | ... | 0 | NaN | MnPrv | NaN | 0 | 3 | 2010 | WD | Normal | 189900 |
5 rows × 82 columns
Wrangling#
1# Change columns names to lower case
2ames.columns = ames.columns.str.lower().str.replace(" ", "_")
3
4# Increase pandas default max_rows and max_colums
5pd.options.display.max_rows = 250
6pd.options.display.max_columns = 250
7
8# Look at shape, information on, and summary statistics of the dataset
9ames.shape
(2930, 82)
1ames.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2930 entries, 0 to 2929
Data columns (total 82 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 order 2930 non-null int64
1 pid 2930 non-null int64
2 ms_subclass 2930 non-null int64
3 ms_zoning 2930 non-null object
4 lot_frontage 2440 non-null float64
5 lot_area 2930 non-null int64
6 street 2930 non-null object
7 alley 198 non-null object
8 lot_shape 2930 non-null object
9 land_contour 2930 non-null object
10 utilities 2930 non-null object
11 lot_config 2930 non-null object
12 land_slope 2930 non-null object
13 neighborhood 2930 non-null object
14 condition_1 2930 non-null object
15 condition_2 2930 non-null object
16 bldg_type 2930 non-null object
17 house_style 2930 non-null object
18 overall_qual 2930 non-null int64
19 overall_cond 2930 non-null int64
20 year_built 2930 non-null int64
21 year_remod/add 2930 non-null int64
22 roof_style 2930 non-null object
23 roof_matl 2930 non-null object
24 exterior_1st 2930 non-null object
25 exterior_2nd 2930 non-null object
26 mas_vnr_type 2907 non-null object
27 mas_vnr_area 2907 non-null float64
28 exter_qual 2930 non-null object
29 exter_cond 2930 non-null object
30 foundation 2930 non-null object
31 bsmt_qual 2850 non-null object
32 bsmt_cond 2850 non-null object
33 bsmt_exposure 2847 non-null object
34 bsmtfin_type_1 2850 non-null object
35 bsmtfin_sf_1 2929 non-null float64
36 bsmtfin_type_2 2849 non-null object
37 bsmtfin_sf_2 2929 non-null float64
38 bsmt_unf_sf 2929 non-null float64
39 total_bsmt_sf 2929 non-null float64
40 heating 2930 non-null object
41 heating_qc 2930 non-null object
42 central_air 2930 non-null object
43 electrical 2929 non-null object
44 1st_flr_sf 2930 non-null int64
45 2nd_flr_sf 2930 non-null int64
46 low_qual_fin_sf 2930 non-null int64
47 gr_liv_area 2930 non-null int64
48 bsmt_full_bath 2928 non-null float64
49 bsmt_half_bath 2928 non-null float64
50 full_bath 2930 non-null int64
51 half_bath 2930 non-null int64
52 bedroom_abvgr 2930 non-null int64
53 kitchen_abvgr 2930 non-null int64
54 kitchen_qual 2930 non-null object
55 totrms_abvgrd 2930 non-null int64
56 functional 2930 non-null object
57 fireplaces 2930 non-null int64
58 fireplace_qu 1508 non-null object
59 garage_type 2773 non-null object
60 garage_yr_blt 2771 non-null float64
61 garage_finish 2771 non-null object
62 garage_cars 2929 non-null float64
63 garage_area 2929 non-null float64
64 garage_qual 2771 non-null object
65 garage_cond 2771 non-null object
66 paved_drive 2930 non-null object
67 wood_deck_sf 2930 non-null int64
68 open_porch_sf 2930 non-null int64
69 enclosed_porch 2930 non-null int64
70 3ssn_porch 2930 non-null int64
71 screen_porch 2930 non-null int64
72 pool_area 2930 non-null int64
73 pool_qc 13 non-null object
74 fence 572 non-null object
75 misc_feature 106 non-null object
76 misc_val 2930 non-null int64
77 mo_sold 2930 non-null int64
78 yr_sold 2930 non-null int64
79 sale_type 2930 non-null object
80 sale_condition 2930 non-null object
81 saleprice 2930 non-null int64
dtypes: float64(11), int64(28), object(43)
memory usage: 1.8+ MB
1ames.describe().T
count | mean | std | min | 25% | 50% | 75% | max | |
---|---|---|---|---|---|---|---|---|
order | 2930.0 | 1.465500e+03 | 8.459625e+02 | 1.0 | 7.332500e+02 | 1465.5 | 2.197750e+03 | 2.930000e+03 |
pid | 2930.0 | 7.144645e+08 | 1.887308e+08 | 526301100.0 | 5.284770e+08 | 535453620.0 | 9.071811e+08 | 1.007100e+09 |
ms_subclass | 2930.0 | 5.738737e+01 | 4.263802e+01 | 20.0 | 2.000000e+01 | 50.0 | 7.000000e+01 | 1.900000e+02 |
lot_frontage | 2440.0 | 6.922459e+01 | 2.336533e+01 | 21.0 | 5.800000e+01 | 68.0 | 8.000000e+01 | 3.130000e+02 |
lot_area | 2930.0 | 1.014792e+04 | 7.880018e+03 | 1300.0 | 7.440250e+03 | 9436.5 | 1.155525e+04 | 2.152450e+05 |
overall_qual | 2930.0 | 6.094881e+00 | 1.411026e+00 | 1.0 | 5.000000e+00 | 6.0 | 7.000000e+00 | 1.000000e+01 |
overall_cond | 2930.0 | 5.563140e+00 | 1.111537e+00 | 1.0 | 5.000000e+00 | 5.0 | 6.000000e+00 | 9.000000e+00 |
year_built | 2930.0 | 1.971356e+03 | 3.024536e+01 | 1872.0 | 1.954000e+03 | 1973.0 | 2.001000e+03 | 2.010000e+03 |
year_remod/add | 2930.0 | 1.984267e+03 | 2.086029e+01 | 1950.0 | 1.965000e+03 | 1993.0 | 2.004000e+03 | 2.010000e+03 |
mas_vnr_area | 2907.0 | 1.018968e+02 | 1.791126e+02 | 0.0 | 0.000000e+00 | 0.0 | 1.640000e+02 | 1.600000e+03 |
bsmtfin_sf_1 | 2929.0 | 4.426296e+02 | 4.555908e+02 | 0.0 | 0.000000e+00 | 370.0 | 7.340000e+02 | 5.644000e+03 |
bsmtfin_sf_2 | 2929.0 | 4.972243e+01 | 1.691685e+02 | 0.0 | 0.000000e+00 | 0.0 | 0.000000e+00 | 1.526000e+03 |
bsmt_unf_sf | 2929.0 | 5.592625e+02 | 4.394942e+02 | 0.0 | 2.190000e+02 | 466.0 | 8.020000e+02 | 2.336000e+03 |
total_bsmt_sf | 2929.0 | 1.051615e+03 | 4.406151e+02 | 0.0 | 7.930000e+02 | 990.0 | 1.302000e+03 | 6.110000e+03 |
1st_flr_sf | 2930.0 | 1.159558e+03 | 3.918909e+02 | 334.0 | 8.762500e+02 | 1084.0 | 1.384000e+03 | 5.095000e+03 |
2nd_flr_sf | 2930.0 | 3.354560e+02 | 4.283957e+02 | 0.0 | 0.000000e+00 | 0.0 | 7.037500e+02 | 2.065000e+03 |
low_qual_fin_sf | 2930.0 | 4.676792e+00 | 4.631051e+01 | 0.0 | 0.000000e+00 | 0.0 | 0.000000e+00 | 1.064000e+03 |
gr_liv_area | 2930.0 | 1.499690e+03 | 5.055089e+02 | 334.0 | 1.126000e+03 | 1442.0 | 1.742750e+03 | 5.642000e+03 |
bsmt_full_bath | 2928.0 | 4.313525e-01 | 5.248202e-01 | 0.0 | 0.000000e+00 | 0.0 | 1.000000e+00 | 3.000000e+00 |
bsmt_half_bath | 2928.0 | 6.113388e-02 | 2.452536e-01 | 0.0 | 0.000000e+00 | 0.0 | 0.000000e+00 | 2.000000e+00 |
full_bath | 2930.0 | 1.566553e+00 | 5.529406e-01 | 0.0 | 1.000000e+00 | 2.0 | 2.000000e+00 | 4.000000e+00 |
half_bath | 2930.0 | 3.795222e-01 | 5.026293e-01 | 0.0 | 0.000000e+00 | 0.0 | 1.000000e+00 | 2.000000e+00 |
bedroom_abvgr | 2930.0 | 2.854266e+00 | 8.277311e-01 | 0.0 | 2.000000e+00 | 3.0 | 3.000000e+00 | 8.000000e+00 |
kitchen_abvgr | 2930.0 | 1.044369e+00 | 2.140762e-01 | 0.0 | 1.000000e+00 | 1.0 | 1.000000e+00 | 3.000000e+00 |
totrms_abvgrd | 2930.0 | 6.443003e+00 | 1.572964e+00 | 2.0 | 5.000000e+00 | 6.0 | 7.000000e+00 | 1.500000e+01 |
fireplaces | 2930.0 | 5.993174e-01 | 6.479209e-01 | 0.0 | 0.000000e+00 | 1.0 | 1.000000e+00 | 4.000000e+00 |
garage_yr_blt | 2771.0 | 1.978132e+03 | 2.552841e+01 | 1895.0 | 1.960000e+03 | 1979.0 | 2.002000e+03 | 2.207000e+03 |
garage_cars | 2929.0 | 1.766815e+00 | 7.605664e-01 | 0.0 | 1.000000e+00 | 2.0 | 2.000000e+00 | 5.000000e+00 |
garage_area | 2929.0 | 4.728197e+02 | 2.150465e+02 | 0.0 | 3.200000e+02 | 480.0 | 5.760000e+02 | 1.488000e+03 |
wood_deck_sf | 2930.0 | 9.375188e+01 | 1.263616e+02 | 0.0 | 0.000000e+00 | 0.0 | 1.680000e+02 | 1.424000e+03 |
open_porch_sf | 2930.0 | 4.753345e+01 | 6.748340e+01 | 0.0 | 0.000000e+00 | 27.0 | 7.000000e+01 | 7.420000e+02 |
enclosed_porch | 2930.0 | 2.301160e+01 | 6.413906e+01 | 0.0 | 0.000000e+00 | 0.0 | 0.000000e+00 | 1.012000e+03 |
3ssn_porch | 2930.0 | 2.592491e+00 | 2.514133e+01 | 0.0 | 0.000000e+00 | 0.0 | 0.000000e+00 | 5.080000e+02 |
screen_porch | 2930.0 | 1.600205e+01 | 5.608737e+01 | 0.0 | 0.000000e+00 | 0.0 | 0.000000e+00 | 5.760000e+02 |
pool_area | 2930.0 | 2.243345e+00 | 3.559718e+01 | 0.0 | 0.000000e+00 | 0.0 | 0.000000e+00 | 8.000000e+02 |
misc_val | 2930.0 | 5.063515e+01 | 5.663443e+02 | 0.0 | 0.000000e+00 | 0.0 | 0.000000e+00 | 1.700000e+04 |
mo_sold | 2930.0 | 6.216041e+00 | 2.714492e+00 | 1.0 | 4.000000e+00 | 6.0 | 8.000000e+00 | 1.200000e+01 |
yr_sold | 2930.0 | 2.007790e+03 | 1.316613e+00 | 2006.0 | 2.007000e+03 | 2008.0 | 2.009000e+03 | 2.010000e+03 |
saleprice | 2930.0 | 1.807961e+05 | 7.988669e+04 | 12789.0 | 1.295000e+05 | 160000.0 | 2.135000e+05 | 7.550000e+05 |
1# Investigate nulls
2ames.isnull().sum()[0:90]
order 0
pid 0
ms_subclass 0
ms_zoning 0
lot_frontage 490
lot_area 0
street 0
alley 2732
lot_shape 0
land_contour 0
utilities 0
lot_config 0
land_slope 0
neighborhood 0
condition_1 0
condition_2 0
bldg_type 0
house_style 0
overall_qual 0
overall_cond 0
year_built 0
year_remod/add 0
roof_style 0
roof_matl 0
exterior_1st 0
exterior_2nd 0
mas_vnr_type 23
mas_vnr_area 23
exter_qual 0
exter_cond 0
foundation 0
bsmt_qual 80
bsmt_cond 80
bsmt_exposure 83
bsmtfin_type_1 80
bsmtfin_sf_1 1
bsmtfin_type_2 81
bsmtfin_sf_2 1
bsmt_unf_sf 1
total_bsmt_sf 1
heating 0
heating_qc 0
central_air 0
electrical 1
1st_flr_sf 0
2nd_flr_sf 0
low_qual_fin_sf 0
gr_liv_area 0
bsmt_full_bath 2
bsmt_half_bath 2
full_bath 0
half_bath 0
bedroom_abvgr 0
kitchen_abvgr 0
kitchen_qual 0
totrms_abvgrd 0
functional 0
fireplaces 0
fireplace_qu 1422
garage_type 157
garage_yr_blt 159
garage_finish 159
garage_cars 1
garage_area 1
garage_qual 159
garage_cond 159
paved_drive 0
wood_deck_sf 0
open_porch_sf 0
enclosed_porch 0
3ssn_porch 0
screen_porch 0
pool_area 0
pool_qc 2917
fence 2358
misc_feature 2824
misc_val 0
mo_sold 0
yr_sold 0
sale_type 0
sale_condition 0
saleprice 0
dtype: int64
1# Missing values: Lot Frontage (Linear feet of street connected to property)
2ames["lot_frontage"].value_counts().head()
60.0 276
80.0 137
70.0 133
50.0 117
75.0 105
Name: lot_frontage, dtype: int64
1# Filter to examine only null values:
2lot_nulls = ames[ames["lot_frontage"].isna()]
3lot_nulls.head()
order | pid | ms_subclass | ms_zoning | lot_frontage | lot_area | street | alley | lot_shape | land_contour | utilities | lot_config | land_slope | neighborhood | condition_1 | condition_2 | bldg_type | house_style | overall_qual | overall_cond | year_built | year_remod/add | roof_style | roof_matl | exterior_1st | exterior_2nd | mas_vnr_type | mas_vnr_area | exter_qual | exter_cond | foundation | bsmt_qual | bsmt_cond | bsmt_exposure | bsmtfin_type_1 | bsmtfin_sf_1 | bsmtfin_type_2 | bsmtfin_sf_2 | bsmt_unf_sf | total_bsmt_sf | heating | heating_qc | central_air | electrical | 1st_flr_sf | 2nd_flr_sf | low_qual_fin_sf | gr_liv_area | bsmt_full_bath | bsmt_half_bath | full_bath | half_bath | bedroom_abvgr | kitchen_abvgr | kitchen_qual | totrms_abvgrd | functional | fireplaces | fireplace_qu | garage_type | garage_yr_blt | garage_finish | garage_cars | garage_area | garage_qual | garage_cond | paved_drive | wood_deck_sf | open_porch_sf | enclosed_porch | 3ssn_porch | screen_porch | pool_area | pool_qc | fence | misc_feature | misc_val | mo_sold | yr_sold | sale_type | sale_condition | saleprice | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
11 | 12 | 527165230 | 20 | RL | NaN | 7980 | Pave | NaN | IR1 | Lvl | AllPub | Inside | Gtl | Gilbert | Norm | Norm | 1Fam | 1Story | 6 | 7 | 1992 | 2007 | Gable | CompShg | HdBoard | HdBoard | None | 0.0 | TA | Gd | PConc | Gd | TA | No | ALQ | 935.0 | Unf | 0.0 | 233.0 | 1168.0 | GasA | Ex | Y | SBrkr | 1187 | 0 | 0 | 1187 | 1.0 | 0.0 | 2 | 0 | 3 | 1 | TA | 6 | Typ | 0 | NaN | Attchd | 1992.0 | Fin | 2.0 | 420.0 | TA | TA | Y | 483 | 21 | 0 | 0 | 0 | 0 | NaN | GdPrv | Shed | 500 | 3 | 2010 | WD | Normal | 185000 |
14 | 15 | 527182190 | 120 | RL | NaN | 6820 | Pave | NaN | IR1 | Lvl | AllPub | Corner | Gtl | StoneBr | Norm | Norm | TwnhsE | 1Story | 8 | 5 | 1985 | 1985 | Gable | CompShg | HdBoard | HdBoard | None | 0.0 | Gd | TA | PConc | Gd | TA | Av | GLQ | 368.0 | BLQ | 1120.0 | 0.0 | 1488.0 | GasA | TA | Y | SBrkr | 1502 | 0 | 0 | 1502 | 1.0 | 0.0 | 1 | 1 | 1 | 1 | Gd | 4 | Typ | 0 | NaN | Attchd | 1985.0 | RFn | 2.0 | 528.0 | TA | TA | Y | 0 | 54 | 0 | 0 | 140 | 0 | NaN | NaN | NaN | 0 | 6 | 2010 | WD | Normal | 212000 |
22 | 23 | 527368020 | 60 | FV | NaN | 7500 | Pave | NaN | Reg | Lvl | AllPub | Inside | Gtl | Somerst | Norm | Norm | 1Fam | 2Story | 7 | 5 | 2000 | 2000 | Gable | CompShg | VinylSd | VinylSd | None | 0.0 | Gd | TA | PConc | Gd | TA | No | GLQ | 533.0 | Unf | 0.0 | 281.0 | 814.0 | GasA | Ex | Y | SBrkr | 814 | 860 | 0 | 1674 | 1.0 | 0.0 | 2 | 1 | 3 | 1 | Gd | 7 | Typ | 0 | NaN | Attchd | 2000.0 | RFn | 2.0 | 663.0 | TA | TA | Y | 0 | 96 | 0 | 0 | 0 | 0 | NaN | NaN | NaN | 0 | 1 | 2010 | WD | Normal | 216000 |
23 | 24 | 527402200 | 20 | RL | NaN | 11241 | Pave | NaN | IR1 | Lvl | AllPub | CulDSac | Gtl | NAmes | Norm | Norm | 1Fam | 1Story | 6 | 7 | 1970 | 1970 | Gable | CompShg | Wd Sdng | Wd Sdng | BrkFace | 180.0 | TA | TA | CBlock | TA | TA | No | ALQ | 578.0 | Unf | 0.0 | 426.0 | 1004.0 | GasA | Ex | Y | SBrkr | 1004 | 0 | 0 | 1004 | 1.0 | 0.0 | 1 | 0 | 2 | 1 | TA | 5 | Typ | 1 | TA | Attchd | 1970.0 | Fin | 2.0 | 480.0 | TA | TA | Y | 0 | 0 | 0 | 0 | 0 | 0 | NaN | NaN | Shed | 700 | 3 | 2010 | WD | Normal | 149000 |
24 | 25 | 527402250 | 20 | RL | NaN | 12537 | Pave | NaN | IR1 | Lvl | AllPub | CulDSac | Gtl | NAmes | Norm | Norm | 1Fam | 1Story | 5 | 6 | 1971 | 2008 | Gable | CompShg | VinylSd | VinylSd | None | 0.0 | TA | TA | CBlock | TA | TA | No | GLQ | 734.0 | Unf | 0.0 | 344.0 | 1078.0 | GasA | Ex | Y | SBrkr | 1078 | 0 | 0 | 1078 | 1.0 | 0.0 | 1 | 1 | 3 | 1 | TA | 6 | Typ | 1 | Fa | Attchd | 1971.0 | Fin | 2.0 | 500.0 | TA | TA | Y | 0 | 0 | 0 | 0 | 0 | 0 | NaN | NaN | NaN | 0 | 4 | 2010 | WD | Normal | 149900 |
1# Group lots by lot_config
2grouped_lots = ames.groupby(["lot_config"]).agg(
3 {
4 "lot_frontage": "mean",
5 }
6)
7grouped_lots
lot_frontage | |
---|---|
lot_config | |
Corner | 82.690418 |
CulDSac | 56.304348 |
FR2 | 61.276923 |
FR3 | 79.300000 |
Inside | 67.147374 |
1# Impute the missing values with the mean for that lot configuration type:
2ames.loc[
3 (ames["lot_frontage"].isna()) & (ames["lot_config"] == "Corner"),
4 "lot_frontage",
5] = 83.25
6ames.loc[
7 (ames["lot_frontage"].isna()) & (ames["lot_config"] == "CulDSac"),
8 "lot_frontage",
9] = 54.734375
10ames.loc[
11 (ames["lot_frontage"].isna()) & (ames["lot_config"] == "FR2"),
12 "lot_frontage",
13] = 60.836735
14ames.loc[
15 (ames["lot_frontage"].isna()) & (ames["lot_config"] == "FR3"),
16 "lot_frontage",
17] = 87.000000
18ames.loc[
19 (ames["lot_frontage"].isna()) & (ames["lot_config"] == "Inside"),
20 "lot_frontage",
21] = 66.759571
22
23# Check no null values are left
24ames["lot_frontage"].isna().sum()
0
1# Check for outliers:
2ames["lot_frontage"].sort_values().tail()
1745 182.0
1638 195.0
2278 200.0
1498 313.0
1265 313.0
Name: lot_frontage, dtype: float64
1x_frontage_maybe_drop = ames[ames["lot_frontage"] == 313]
2x_frontage_maybe_drop
order | pid | ms_subclass | ms_zoning | lot_frontage | lot_area | street | alley | lot_shape | land_contour | utilities | lot_config | land_slope | neighborhood | condition_1 | condition_2 | bldg_type | house_style | overall_qual | overall_cond | year_built | year_remod/add | roof_style | roof_matl | exterior_1st | exterior_2nd | mas_vnr_type | mas_vnr_area | exter_qual | exter_cond | foundation | bsmt_qual | bsmt_cond | bsmt_exposure | bsmtfin_type_1 | bsmtfin_sf_1 | bsmtfin_type_2 | bsmtfin_sf_2 | bsmt_unf_sf | total_bsmt_sf | heating | heating_qc | central_air | electrical | 1st_flr_sf | 2nd_flr_sf | low_qual_fin_sf | gr_liv_area | bsmt_full_bath | bsmt_half_bath | full_bath | half_bath | bedroom_abvgr | kitchen_abvgr | kitchen_qual | totrms_abvgrd | functional | fireplaces | fireplace_qu | garage_type | garage_yr_blt | garage_finish | garage_cars | garage_area | garage_qual | garage_cond | paved_drive | wood_deck_sf | open_porch_sf | enclosed_porch | 3ssn_porch | screen_porch | pool_area | pool_qc | fence | misc_feature | misc_val | mo_sold | yr_sold | sale_type | sale_condition | saleprice | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1265 | 1266 | 535427070 | 20 | RL | 313.0 | 27650 | Pave | NaN | IR2 | HLS | AllPub | Inside | Mod | NAmes | PosA | Norm | 1Fam | 1Story | 7 | 7 | 1960 | 2007 | Flat | Tar&Grv | Wd Sdng | Wd Sdng | None | 0.0 | TA | TA | CBlock | Gd | TA | Gd | GLQ | 425.0 | Unf | 0.0 | 160.0 | 585.0 | GasA | Ex | Y | SBrkr | 2069 | 0 | 0 | 2069 | 1.0 | 0.0 | 2 | 0 | 4 | 1 | Gd | 9 | Typ | 1 | Gd | Attchd | 1960.0 | RFn | 2.0 | 505.0 | TA | TA | Y | 0 | 0 | 0 | 0 | 0 | 0 | NaN | NaN | NaN | 0 | 11 | 2008 | WD | Normal | 242000 |
1498 | 1499 | 908154235 | 60 | RL | 313.0 | 63887 | Pave | NaN | IR3 | Bnk | AllPub | Corner | Gtl | Edwards | Feedr | Norm | 1Fam | 2Story | 10 | 5 | 2008 | 2008 | Hip | ClyTile | Stucco | Stucco | Stone | 796.0 | Ex | TA | PConc | Ex | TA | Gd | GLQ | 5644.0 | Unf | 0.0 | 466.0 | 6110.0 | GasA | Ex | Y | SBrkr | 4692 | 950 | 0 | 5642 | 2.0 | 0.0 | 2 | 1 | 3 | 1 | Ex | 12 | Typ | 3 | Gd | Attchd | 2008.0 | Fin | 2.0 | 1418.0 | TA | TA | Y | 214 | 292 | 0 | 0 | 0 | 480 | Gd | NaN | NaN | 0 | 1 | 2008 | New | Partial | 160000 |
1# Missing values: Alleys
2ames["alley"].value_counts()
Grvl 120
Pave 78
Name: alley, dtype: int64
1ames["alley"].isnull().sum()
2732
1# 2732 houses without alley. Maybe hot encode this one. Later.
2# For now replace NaN with NA
3ames["alley"] = ames["alley"].replace(np.nan, "NA")
4
5# Check with:
6ames["alley"].value_counts()
NA 2732
Grvl 120
Pave 78
Name: alley, dtype: int64
1# Missing values: Masonry veneer type and area
2# (mas_vnr_type resp. mas_vnr_area)
3print(ames["mas_vnr_type"].value_counts())
4print(ames["mas_vnr_area"].value_counts())
5
6ames[ames["mas_vnr_type"].isna()].head()
None 1752
BrkFace 880
Stone 249
BrkCmn 25
CBlock 1
Name: mas_vnr_type, dtype: int64
0.0 1748
120.0 15
176.0 13
200.0 13
216.0 12
...
714.0 1
647.0 1
1290.0 1
495.0 1
443.0 1
Name: mas_vnr_area, Length: 445, dtype: int64
order | pid | ms_subclass | ms_zoning | lot_frontage | lot_area | street | alley | lot_shape | land_contour | utilities | lot_config | land_slope | neighborhood | condition_1 | condition_2 | bldg_type | house_style | overall_qual | overall_cond | year_built | year_remod/add | roof_style | roof_matl | exterior_1st | exterior_2nd | mas_vnr_type | mas_vnr_area | exter_qual | exter_cond | foundation | bsmt_qual | bsmt_cond | bsmt_exposure | bsmtfin_type_1 | bsmtfin_sf_1 | bsmtfin_type_2 | bsmtfin_sf_2 | bsmt_unf_sf | total_bsmt_sf | heating | heating_qc | central_air | electrical | 1st_flr_sf | 2nd_flr_sf | low_qual_fin_sf | gr_liv_area | bsmt_full_bath | bsmt_half_bath | full_bath | half_bath | bedroom_abvgr | kitchen_abvgr | kitchen_qual | totrms_abvgrd | functional | fireplaces | fireplace_qu | garage_type | garage_yr_blt | garage_finish | garage_cars | garage_area | garage_qual | garage_cond | paved_drive | wood_deck_sf | open_porch_sf | enclosed_porch | 3ssn_porch | screen_porch | pool_area | pool_qc | fence | misc_feature | misc_val | mo_sold | yr_sold | sale_type | sale_condition | saleprice | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
55 | 56 | 528240070 | 60 | RL | 66.759571 | 7851 | Pave | NA | Reg | Lvl | AllPub | Inside | Gtl | Gilbert | Norm | Norm | 1Fam | 2Story | 6 | 5 | 2002 | 2002 | Gable | CompShg | VinylSd | VinylSd | NaN | NaN | Gd | TA | PConc | Gd | TA | No | GLQ | 625.0 | Unf | 0.0 | 235.0 | 860.0 | GasA | Ex | Y | SBrkr | 860 | 1100 | 0 | 1960 | 1.0 | 0.0 | 2 | 1 | 4 | 1 | Gd | 8 | Typ | 2 | TA | BuiltIn | 2002.0 | Fin | 2.0 | 440.0 | TA | TA | Y | 288 | 48 | 0 | 0 | 0 | 0 | NaN | NaN | NaN | 0 | 5 | 2010 | WD | Normal | 216500 |
484 | 485 | 528275160 | 60 | RL | 83.250000 | 12891 | Pave | NA | IR1 | Lvl | AllPub | Corner | Gtl | Gilbert | Norm | Norm | 1Fam | 2Story | 8 | 5 | 2002 | 2002 | Gable | CompShg | VinylSd | VinylSd | NaN | NaN | Gd | TA | PConc | Ex | TA | No | GLQ | 651.0 | Unf | 0.0 | 219.0 | 870.0 | GasA | Ex | Y | SBrkr | 878 | 1126 | 0 | 2004 | 1.0 | 0.0 | 2 | 1 | 4 | 1 | Gd | 8 | Typ | 1 | Gd | BuiltIn | 2002.0 | Fin | 3.0 | 644.0 | TA | TA | Y | 0 | 48 | 0 | 0 | 0 | 0 | NaN | NaN | NaN | 0 | 7 | 2009 | WD | Normal | 257500 |
517 | 518 | 528458020 | 20 | FV | 90.000000 | 7993 | Pave | NA | IR1 | Lvl | AllPub | Inside | Gtl | Somerst | Norm | Norm | 1Fam | 1Story | 7 | 5 | 2008 | 2009 | Gable | CompShg | VinylSd | VinylSd | NaN | NaN | Gd | TA | PConc | Ex | TA | No | Unf | 0.0 | Unf | 0.0 | 1436.0 | 1436.0 | GasA | Ex | Y | SBrkr | 1436 | 0 | 0 | 1436 | 0.0 | 0.0 | 2 | 0 | 3 | 1 | Gd | 6 | Typ | 0 | NaN | Attchd | 2008.0 | Fin | 2.0 | 529.0 | TA | TA | Y | 0 | 121 | 0 | 0 | 0 | 0 | NaN | NaN | NaN | 0 | 10 | 2009 | New | Partial | 225000 |
538 | 539 | 531371050 | 20 | RL | 67.000000 | 10083 | Pave | NA | Reg | Lvl | AllPub | Inside | Gtl | SawyerW | Norm | Norm | 1Fam | 1Story | 7 | 5 | 2003 | 2003 | Gable | CompShg | VinylSd | VinylSd | NaN | NaN | Gd | TA | PConc | Gd | TA | No | GLQ | 833.0 | Unf | 0.0 | 343.0 | 1176.0 | GasA | Ex | Y | SBrkr | 1200 | 0 | 0 | 1200 | 1.0 | 0.0 | 2 | 0 | 2 | 1 | Gd | 5 | Typ | 0 | NaN | Attchd | 2003.0 | RFn | 2.0 | 555.0 | TA | TA | Y | 0 | 41 | 0 | 0 | 0 | 0 | NaN | NaN | NaN | 0 | 8 | 2009 | WD | Normal | 184900 |
867 | 868 | 907260030 | 60 | RL | 70.000000 | 8749 | Pave | NA | Reg | Lvl | AllPub | Inside | Gtl | CollgCr | Norm | Norm | 1Fam | 2Story | 7 | 5 | 2002 | 2002 | Gable | CompShg | VinylSd | VinylSd | NaN | NaN | Gd | TA | PConc | Gd | TA | No | Unf | 0.0 | Unf | 0.0 | 840.0 | 840.0 | GasA | Ex | Y | SBrkr | 840 | 885 | 0 | 1725 | 0.0 | 0.0 | 2 | 1 | 3 | 1 | Gd | 6 | Typ | 0 | NaN | Attchd | 2002.0 | RFn | 2.0 | 550.0 | TA | TA | Y | 0 | 48 | 0 | 0 | 0 | 0 | NaN | NaN | NaN | 0 | 11 | 2009 | WD | Normal | 198500 |
1# Replace masonry type np.nan with None (most frequent)
2ames["mas_vnr_type"] = ames["mas_vnr_type"].replace(np.nan, "None")
3
4# Replace masonry area np.nan with 0 (most frequent)
5ames["mas_vnr_area"] = ames["mas_vnr_area"].replace(np.nan, 0)
1# Missing values: Basement related variables
2ames["bsmt_qual"].value_counts()
TA 1283
Gd 1219
Ex 258
Fa 88
Po 2
Name: bsmt_qual, dtype: int64
1# Missing NA if no basement
2ames[
3 [
4 "bsmt_qual",
5 "bsmt_cond",
6 "bsmt_exposure",
7 "bsmtfin_type_1",
8 "bsmtfin_sf_1",
9 "bsmtfin_type_2",
10 "bsmtfin_sf_2",
11 "bsmt_unf_sf",
12 "total_bsmt_sf",
13 "bsmt_full_bath",
14 "bsmt_half_bath",
15 ]
16].isnull().sum()
bsmt_qual 80
bsmt_cond 80
bsmt_exposure 83
bsmtfin_type_1 80
bsmtfin_sf_1 1
bsmtfin_type_2 81
bsmtfin_sf_2 1
bsmt_unf_sf 1
total_bsmt_sf 1
bsmt_full_bath 2
bsmt_half_bath 2
dtype: int64
1# NA if no basement
2basement_to_na = [
3 "bsmt_qual",
4 "bsmt_cond",
5 "bsmt_exposure",
6 "bsmtfin_type_1",
7 "bsmtfin_type_2",
8]
9
10# 0 (sqft) if basement is NA
11basement_to_zero = [
12 "bsmtfin_sf_2",
13 "bsmt_unf_sf",
14 "total_bsmt_sf",
15 "bsmtfin_sf_1",
16 "bsmt_full_bath",
17 "bsmt_half_bath",
18]
19
20for i in basement_to_na:
21 ames[i] = ames[i].replace(np.nan, "NA")
22
23# The missing values are for NA basement properties.
24# Setting sqft value to zero.
25ames[ames["bsmtfin_sf_1"].isna()]
26
27# Set sqft to 0 IF basement is NA, else drop as incomplete
28# basement data and only 1 null record for each
29for i in basement_to_zero:
30 ames[i] = ames[i].replace(np.nan, 0)
1# Missing values: Fireplace quality
2ames["fireplace_qu"].isnull().sum()
1422
1len(ames["fireplace_qu"])
2930
1ames["fireplace_qu"].unique()
array(['Gd', nan, 'TA', 'Po', 'Ex', 'Fa'], dtype=object)
1# Check houses without fireplaces are also houses where fireplace
2# quality is a missing value
3len(ames[(ames["fireplace_qu"].isna()) & (ames["fireplaces"] < 1)])
1422
1# Yep, so replace:
2ames["fireplace_qu"] = ames["fireplace_qu"].replace(np.nan, "NA")
1# Missing values: Garage related features
2ames[
3 ["garage_type", "garage_finish", "garage_qual", "garage_cond"]
4].isnull().sum()
garage_type 157
garage_finish 159
garage_qual 159
garage_cond 159
dtype: int64
1garage_to_na = ["garage_type", "garage_finish", "garage_qual", "garage_cond"]
2garage_to_0 = ["garage_yr_blt", "garage_cars", "garage_area"]
3
4# Replace ordinal columns with NA based on data dictionary:
5for i in garage_to_na:
6 ames[i] = ames[i].replace(np.nan, "NA")
7
8for i in basement_to_zero:
9 ames[i] = ames[i].replace(np.nan, 0)
10
11# Not getting that to work, so one by one:
12
13ames["garage_yr_blt"] = ames["garage_yr_blt"].replace(np.nan, 0)
14ames["garage_cars"] = ames["garage_cars"].replace(np.nan, 0)
15ames["garage_area"] = ames["garage_area"].replace(np.nan, 0)
1ames[
2 [
3 "garage_type",
4 "garage_finish",
5 "garage_qual",
6 "garage_cond",
7 "garage_yr_blt",
8 "garage_cars",
9 "garage_area",
10 ]
11].isnull().sum()
garage_type 0
garage_finish 0
garage_qual 0
garage_cond 0
garage_yr_blt 0
garage_cars 0
garage_area 0
dtype: int64
1# Missing values: Pool
2ames["pool_qc"].isnull().sum()
2917
1print(ames["pool_qc"].unique())
[nan 'Ex' 'Gd' 'TA' 'Fa']
1# For now:
2for i in ames[(ames["pool_qc"].isna()) & (ames["pool_area"] == 0)]:
3 ames["pool_qc"] = ames["pool_qc"].replace(np.nan, "NA")
1# Missing values: Fence feature
2ames["fence"].isnull().sum()
2358
1ames["fence"] = ames["fence"].replace(np.nan, "NA")
2ames["fence"].isnull().sum()
0
1# Missing values: Misc feature
2ames["misc_feature"].isnull().sum()
2824
1for i in ames[(ames["misc_feature"].isna()) & (ames["misc_val"] == 0)]:
2 ames["misc_feature"] = ames["misc_feature"].replace(np.nan, "NA")
3
4ames["misc_feature"].isnull().sum()
0
1ames.isnull().sum()
order 0
pid 0
ms_subclass 0
ms_zoning 0
lot_frontage 0
lot_area 0
street 0
alley 0
lot_shape 0
land_contour 0
utilities 0
lot_config 0
land_slope 0
neighborhood 0
condition_1 0
condition_2 0
bldg_type 0
house_style 0
overall_qual 0
overall_cond 0
year_built 0
year_remod/add 0
roof_style 0
roof_matl 0
exterior_1st 0
exterior_2nd 0
mas_vnr_type 0
mas_vnr_area 0
exter_qual 0
exter_cond 0
foundation 0
bsmt_qual 0
bsmt_cond 0
bsmt_exposure 0
bsmtfin_type_1 0
bsmtfin_sf_1 0
bsmtfin_type_2 0
bsmtfin_sf_2 0
bsmt_unf_sf 0
total_bsmt_sf 0
heating 0
heating_qc 0
central_air 0
electrical 1
1st_flr_sf 0
2nd_flr_sf 0
low_qual_fin_sf 0
gr_liv_area 0
bsmt_full_bath 0
bsmt_half_bath 0
full_bath 0
half_bath 0
bedroom_abvgr 0
kitchen_abvgr 0
kitchen_qual 0
totrms_abvgrd 0
functional 0
fireplaces 0
fireplace_qu 0
garage_type 0
garage_yr_blt 0
garage_finish 0
garage_cars 0
garage_area 0
garage_qual 0
garage_cond 0
paved_drive 0
wood_deck_sf 0
open_porch_sf 0
enclosed_porch 0
3ssn_porch 0
screen_porch 0
pool_area 0
pool_qc 0
fence 0
misc_feature 0
misc_val 0
mo_sold 0
yr_sold 0
sale_type 0
sale_condition 0
saleprice 0
dtype: int64
1# Check the datatypes against the data dictionary to make sure
2# the data frame is correctly reading the data and create lists
3# of each data type for use in EDA
4ames_indexing = [["id", "pid"]]
5
6ames_ordinal = ames[
7 [
8 "saleprice",
9 "lot_shape",
10 "utilities",
11 "land_slope",
12 "overall_qual",
13 "overall_cond",
14 "exter_qual",
15 "exter_cond",
16 "bsmt_qual",
17 "bsmt_cond",
18 "bsmt_exposure",
19 "bsmtfin_type_1",
20 "bsmtfin_type_2",
21 "heating_qc",
22 "electrical",
23 "kitchen_qual",
24 "functional",
25 "fireplace_qu",
26 "garage_finish",
27 "garage_qual",
28 "garage_cond",
29 "paved_drive",
30 "pool_qc",
31 "fence",
32 ]
33]
34
35ames_nominal = ames[
36 [
37 "ms_subclass",
38 "ms_zoning",
39 "street",
40 "alley",
41 "land_contour",
42 "lot_config",
43 "neighborhood",
44 "condition_1",
45 "condition_2",
46 "bldg_type",
47 "house_style",
48 "roof_style",
49 "roof_matl",
50 "exterior_1st",
51 "exterior_2nd",
52 "mas_vnr_type",
53 "foundation",
54 "heating",
55 "central_air",
56 "garage_type",
57 "misc_feature",
58 "sale_type",
59 ]
60]
61
62ames_categorical = ames[
63 [
64 "street",
65 "alley",
66 "lot_shape",
67 "land_contour",
68 "utilities",
69 "lot_config",
70 "land_slope",
71 "neighborhood",
72 "condition_1",
73 "condition_2",
74 "bldg_type",
75 "house_style",
76 "roof_style",
77 "roof_matl",
78 "exterior_1st",
79 "exterior_2nd",
80 "mas_vnr_type",
81 "exter_qual",
82 "exter_cond",
83 "foundation",
84 "bsmt_qual",
85 "bsmt_cond",
86 "bsmt_exposure",
87 "bsmtfin_type_1",
88 "bsmtfin_type_2",
89 "heating",
90 "heating_qc",
91 "central_air",
92 "electrical",
93 "kitchen_qual",
94 "functional",
95 "fireplace_qu",
96 "garage_type",
97 "garage_finish",
98 "garage_qual",
99 "garage_cond",
100 "paved_drive",
101 "pool_qc",
102 "fence",
103 "misc_feature",
104 "sale_type",
105 ]
106]
107
108ames_numeric_category = ames[
109 [
110 "saleprice",
111 "ms_subclass",
112 "ms_zoning",
113 "overall_qual",
114 "overall_cond",
115 "mo_sold",
116 ]
117]
118
119ames_numeric = ames[
120 [
121 "saleprice",
122 "lot_frontage",
123 "lot_area",
124 "year_built",
125 "year_remod/add",
126 "mas_vnr_area",
127 "bsmtfin_sf_2",
128 "bsmt_unf_sf",
129 "total_bsmt_sf",
130 "1st_flr_sf",
131 "2nd_flr_sf",
132 "low_qual_fin_sf",
133 "gr_liv_area",
134 "bsmt_full_bath",
135 "bsmt_half_bath",
136 "full_bath",
137 "half_bath",
138 "bedroom_abvgr",
139 "kitchen_abvgr",
140 "totrms_abvgrd",
141 "fireplaces",
142 "garage_yr_blt",
143 "garage_cars",
144 "garage_area",
145 "wood_deck_sf",
146 "open_porch_sf",
147 "enclosed_porch",
148 "3ssn_porch",
149 "screen_porch",
150 "pool_area",
151 "misc_val",
152 "yr_sold",
153 ]
154]
155
156# Target variable
157y = ames["saleprice"]
1sns.pairplot(ames_numeric_category)
<seaborn.axisgrid.PairGrid at 0x7eff33cf86a0>
1# Ordinal columns
2ames_ordinal.head()
saleprice | lot_shape | utilities | land_slope | overall_qual | overall_cond | exter_qual | exter_cond | bsmt_qual | bsmt_cond | bsmt_exposure | bsmtfin_type_1 | bsmtfin_type_2 | heating_qc | electrical | kitchen_qual | functional | fireplace_qu | garage_finish | garage_qual | garage_cond | paved_drive | pool_qc | fence | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 215000 | IR1 | AllPub | Gtl | 6 | 5 | TA | TA | TA | Gd | Gd | BLQ | Unf | Fa | SBrkr | TA | Typ | Gd | Fin | TA | TA | P | NA | NA |
1 | 105000 | Reg | AllPub | Gtl | 5 | 6 | TA | TA | TA | TA | No | Rec | LwQ | TA | SBrkr | TA | Typ | NA | Unf | TA | TA | Y | NA | MnPrv |
2 | 172000 | IR1 | AllPub | Gtl | 6 | 6 | TA | TA | TA | TA | No | ALQ | Unf | TA | SBrkr | Gd | Typ | NA | Unf | TA | TA | Y | NA | NA |
3 | 244000 | Reg | AllPub | Gtl | 7 | 5 | Gd | TA | TA | TA | No | ALQ | Unf | Ex | SBrkr | Ex | Typ | TA | Fin | TA | TA | Y | NA | NA |
4 | 189900 | IR1 | AllPub | Gtl | 5 | 5 | TA | TA | Gd | TA | No | GLQ | Unf | Gd | SBrkr | TA | Typ | TA | Fin | TA | TA | Y | NA | MnPrv |