Ames Housing Feature Selection#

In this notebook we select a subset of the most predictive features with a Lasso regression, which involves an element of randomness in its fitting.

To ensure reproducibility between runs of the same notebook, and between the research and production environments, it is extremely important to set the seed in every step that involves an element of randomness.
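As a minimal illustration, fixing a seed means pinning NumPy's global random state as well as the random_state argument of any estimator that accepts one:

import numpy as np
from sklearn.linear_model import Lasso

np.random.seed(0)  # fixes any numpy-based randomness
lasso = Lasso(alpha=0.001, random_state=0)  # fixes the estimator's own seed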

Libraries and packages#

# to handle datasets
import pandas as pd
import numpy as np

# to build the models
from sklearn.linear_model import Lasso
from sklearn.feature_selection import SelectFromModel

# to visualise all the columns in the dataframe
pd.set_option("display.max_columns", None)

Paths#

# Path to datasets directory
data_path = "./datasets"

# Path to assets directory (for saving results to)
assets_path = "./assets"

Loading dataset#

# load the train and test sets with the variables engineered in
# the Ames Housing feature engineering notebook

X_train = pd.read_csv(f"{data_path}/xtrain.csv")
X_test = pd.read_csv(f"{data_path}/xtest.csv")

X_train.head()
[Output: the first five rows of X_train, a table of 81 engineered feature columns (MSSubClass, MSZoning, LotFrontage, ..., LotFrontage_na, MasVnrArea_na, GarageYrBlt_na), with all values scaled to the [0, 1] interval.]
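As a quick, optional sanity check (not in the original notebook), we can confirm that every engineered column lies within the [0, 1] range produced by the scaling step:

# sanity check: min and max of every engineered feature
# should fall within [0, 1]
X_train.describe().loc[["min", "max"]]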
# load the target (remember that the target is log transformed)
y_train = pd.read_csv(f"{data_path}/ytrain.csv")
y_test = pd.read_csv(f"{data_path}/ytest.csv")

y_train.head()
SalePrice
0 12.211060
1 11.887931
2 12.675764
3 12.278393
4 12.103486
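Because the target was log-transformed in the feature engineering notebook, values on this scale can be mapped back to dollars with np.exp; a quick illustration:

# undo the log transform to view the target in dollars
# (e.g. exp(12.211060) is roughly 201,000)
np.exp(y_train.head())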

Feature Selection#

# We do the model fitting and the feature selection
# together in a few lines of code.

# First, we specify the Lasso regression model and select a
# suitable alpha (the penalty parameter).
# The bigger the alpha, the fewer features will be selected.

# Then we use the SelectFromModel object from sklearn, which
# automatically selects the features whose coefficients are non-zero.

# set the seed via the random_state argument
sel_ = SelectFromModel(Lasso(alpha=0.001, random_state=0))

# train the Lasso model and select features
sel_.fit(X_train, y_train)
SelectFromModel(estimator=Lasso(alpha=0.001, random_state=0))
sel_.get_support().sum()
36
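To make the effect of the penalty concrete, here is a small illustrative sweep over a few alpha values (not part of the original pipeline; the exact counts depend on the data): larger alphas shrink more coefficients to zero, so fewer features survive.

# the number of selected features shrinks as alpha grows
for alpha in [0.0001, 0.001, 0.01, 0.1]:
    sel = SelectFromModel(Lasso(alpha=alpha, random_state=0))
    sel.fit(X_train, y_train)
    print(f"alpha={alpha}: {sel.get_support().sum()} features selected")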
# visualise the selected features (marked with True)
sel_.get_support()
array([ True,  True, False,  True, False, False,  True,  True, False,
        True, False,  True, False, False, False, False,  True,  True,
       False,  True,  True, False,  True, False, False, False,  True,
       False,  True,  True, False,  True,  True, False, False, False,
       False, False, False,  True,  True, False,  True,  True, False,
        True,  True, False,  True,  True, False, False,  True,  True,
        True,  True,  True, False, False,  True,  True, False, False,
       False,  True,  True, False, False, False,  True, False, False,
       False, False, False, False, False,  True, False, False, False])
# print the number of total and selected features

# this is how we can make a list of the selected features
selected_feats = X_train.columns[sel_.get_support()]

# print some stats
print(f"total features: {X_train.shape[1]}")
print(f"selected features: {len(selected_feats)}")
print(
    f"features with coefficients shrunk to zero: "
    f"{np.sum(sel_.estimator_.coef_ == 0)}"
)
total features: 81
selected features: 36
features with coefficients shrunk to zero: 45
# print the selected features
selected_feats
Index(['MSSubClass', 'MSZoning', 'LotArea', 'LotShape', 'LandContour',
       'LotConfig', 'Neighborhood', 'OverallQual', 'OverallCond',
       'YearRemodAdd', 'RoofStyle', 'Exterior1st', 'ExterQual', 'Foundation',
       'BsmtQual', 'BsmtExposure', 'BsmtFinType1', 'HeatingQC', 'CentralAir',
       '1stFlrSF', '2ndFlrSF', 'GrLivArea', 'BsmtFullBath', 'FullBath',
       'HalfBath', 'KitchenQual', 'TotRmsAbvGrd', 'Functional', 'Fireplaces',
       'FireplaceQu', 'GarageFinish', 'GarageCars', 'PavedDrive', 'WoodDeckSF',
       'ScreenPorch', 'SaleCondition'],
      dtype='object')
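As an optional follow-up (a sketch, not part of the original notebook), the fitted Lasso inside the selector exposes its coefficients, so we can rank the surviving features by coefficient magnitude:

# rank the selected features by the absolute size of their Lasso
# coefficients; np.ravel guards against a 2-D coef_ array, which can
# occur because y_train was loaded as a one-column dataframe
coefs = pd.Series(np.ravel(sel_.estimator_.coef_), index=X_train.columns)
coefs[selected_feats].abs().sort_values(ascending=False).head(10)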
# save the selected features
pd.Series(selected_feats).to_csv(
    f"{data_path}/selected_features.csv", index=False
)
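A downstream notebook can then reload this list; a minimal sketch (the unnamed Series is saved with a default column header, hence the positional lookup):

# reload the saved feature list and subset the training data
features = pd.read_csv(f"{data_path}/selected_features.csv")
features = features.iloc[:, 0].tolist()  # the single column holds the names
X_train_selected = X_train[features]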