Ames Housing Feature Selection#

In this notebook we select a subset of the most predictive features with a Lasso regression, which involves an element of randomness in its fitting.

To ensure reproducibility between runs of the same notebook, and between the research and production environments, it is extremely important to set the seed in every step that involves an element of randomness.
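As a minimal illustration, fixing a seed means pinning NumPy's global random state as well as the random_state argument of any estimator that accepts one:

import numpy as np
from sklearn.linear_model import Lasso

np.random.seed(0)  # fixes any numpy-based randomness
lasso = Lasso(alpha=0.001, random_state=0)  # fixes the estimator's own seed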

Libraries and packages#

# to handle datasets
import pandas as pd
import numpy as np

# to build the models
from sklearn.linear_model import Lasso
from sklearn.feature_selection import SelectFromModel

# to visualise all the columns in the dataframe
pd.set_option("display.max_columns", None)

Paths#

# Path to datasets directory
data_path = "./datasets"

# Path to assets directory (for saving results to)
assets_path = "./assets"

Loading dataset#

# load the train and test sets with the variables engineered in
# the Ames Housing feature engineering notebook

X_train = pd.read_csv(f"{data_path}/xtrain.csv")
X_test = pd.read_csv(f"{data_path}/xtest.csv")

X_train.head()
[Output: the first five rows of X_train, a table of 81 engineered feature columns (MSSubClass, MSZoning, LotFrontage, ..., LotFrontage_na, MasVnrArea_na, GarageYrBlt_na), with all values scaled to the [0, 1] interval.]
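As a quick, optional sanity check (not in the original notebook), we can confirm that every engineered column lies within the [0, 1] range produced by the scaling step:

# sanity check: min and max of every engineered feature
# should fall within [0, 1]
X_train.describe().loc[["min", "max"]]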
# load the target (remember that the target is log transformed)
y_train = pd.read_csv(f"{data_path}/ytrain.csv")
y_test = pd.read_csv(f"{data_path}/ytest.csv")

y_train.head()
SalePrice
0 12.211060
1 11.887931
2 12.675764
3 12.278393
4 12.103486
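Because the target was log-transformed in the feature engineering notebook, values on this scale can be mapped back to dollars with np.exp; a quick illustration:

# undo the log transform to view the target in dollars
# (e.g. exp(12.211060) is roughly 201,000)
np.exp(y_train.head())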

Feature Selection#

# We do the model fitting and the feature selection
# together in a few lines of code.

# First, we specify the Lasso regression model and select a
# suitable alpha (the penalty parameter).
# The bigger the alpha, the fewer features will be selected.

# Then we use the SelectFromModel object from sklearn, which
# automatically selects the features whose coefficients are non-zero.

# set the seed via the random_state argument
sel_ = SelectFromModel(Lasso(alpha=0.001, random_state=0))

# train the Lasso model and select features
sel_.fit(X_train, y_train)
SelectFromModel(estimator=Lasso(alpha=0.001, random_state=0))
sel_.get_support().sum()
36
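To make the effect of the penalty concrete, here is a small illustrative sweep over a few alpha values (not part of the original pipeline; the exact counts depend on the data): larger alphas shrink more coefficients to zero, so fewer features survive.

# the number of selected features shrinks as alpha grows
for alpha in [0.0001, 0.001, 0.01, 0.1]:
    sel = SelectFromModel(Lasso(alpha=alpha, random_state=0))
    sel.fit(X_train, y_train)
    print(f"alpha={alpha}: {sel.get_support().sum()} features selected")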
# visualise the selected features (marked with True)
sel_.get_support()
array([ True,  True, False,  True, False, False,  True,  True, False,
        True, False,  True, False, False, False, False,  True,  True,
       False,  True,  True, False,  True, False, False, False,  True,
       False,  True,  True, False,  True,  True, False, False, False,
       False, False, False,  True,  True, False,  True,  True, False,
        True,  True, False,  True,  True, False, False,  True,  True,
        True,  True,  True, False, False,  True,  True, False, False,
       False,  True,  True, False, False, False,  True, False, False,
       False, False, False, False, False,  True, False, False, False])
# print the number of total and selected features

# this is how we can make a list of the selected features
selected_feats = X_train.columns[sel_.get_support()]

# print some stats
print(f"total features: {X_train.shape[1]}")
print(f"selected features: {len(selected_feats)}")
print(
    f"features with coefficients shrunk to zero: "
    f"{np.sum(sel_.estimator_.coef_ == 0)}"
)
total features: 81
selected features: 36
features with coefficients shrunk to zero: 45
# print the selected features
selected_feats
Index(['MSSubClass', 'MSZoning', 'LotArea', 'LotShape', 'LandContour',
       'LotConfig', 'Neighborhood', 'OverallQual', 'OverallCond',
       'YearRemodAdd', 'RoofStyle', 'Exterior1st', 'ExterQual', 'Foundation',
       'BsmtQual', 'BsmtExposure', 'BsmtFinType1', 'HeatingQC', 'CentralAir',
       '1stFlrSF', '2ndFlrSF', 'GrLivArea', 'BsmtFullBath', 'FullBath',
       'HalfBath', 'KitchenQual', 'TotRmsAbvGrd', 'Functional', 'Fireplaces',
       'FireplaceQu', 'GarageFinish', 'GarageCars', 'PavedDrive', 'WoodDeckSF',
       'ScreenPorch', 'SaleCondition'],
      dtype='object')
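As an optional follow-up (a sketch, not part of the original notebook), the fitted Lasso inside the selector exposes its coefficients, so we can rank the surviving features by coefficient magnitude:

# rank the selected features by the absolute size of their Lasso
# coefficients; np.ravel guards against a 2-D coef_ array, which can
# occur because y_train was loaded as a one-column dataframe
coefs = pd.Series(np.ravel(sel_.estimator_.coef_), index=X_train.columns)
coefs[selected_feats].abs().sort_values(ascending=False).head(10)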
# save the selected features
pd.Series(selected_feats).to_csv(
    f"{data_path}/selected_features.csv", index=False
)
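A downstream notebook can then reload this list; a minimal sketch (the unnamed Series is saved with a default column header, hence the positional lookup):

# reload the saved feature list and subset the training data
features = pd.read_csv(f"{data_path}/selected_features.csv")
features = features.iloc[:, 0].tolist()  # the single column holds the names
X_train_selected = X_train[features]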