Hypothesis testing

Hypothesis testing draws a general conclusion about a large group (a population) from the analysis and measurements performed on a smaller group (a sample).

In general, a null hypothesis and a complementary alternative hypothesis are defined, a test statistic (a quantity calculated from the sample) whose value is the basis for rejecting or failing to reject the null hypothesis is identified, and a significance level is specified. Once the significance level is specified, the rejection points are computed: these are the values against which the test statistic is compared.
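
As a minimal sketch of this procedure, assuming a two-sided one-sample t-test, a significance level of 0.05, and an illustrative sample size and test statistic (none of these values come from the dataset used below), the rejection points can be computed from the t distribution and compared against the test statistic:

from scipy.stats import t

alpha = 0.05  # assumed significance level
n = 30        # hypothetical sample size
df = n - 1    # degrees of freedom for a one-sample t-test

# Rejection points for a two-sided test: reject H0 if the test
# statistic falls outside [-critical_value, critical_value]
critical_value = t.ppf(1 - alpha / 2, df)

test_statistic = -2.3  # hypothetical value computed from a sample
reject_h0 = abs(test_statistic) > critical_value
print(f"Critical value: {critical_value:.3f}, reject H0: {reject_h0}")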

Importing libraries and packages

# Warnings
import warnings

# Mathematical operations and data manipulation
import pandas as pd

# Plotting
import seaborn as sns
import matplotlib.pyplot as plt

# Statistics
from scipy.stats import ttest_1samp
from scipy.stats import ttest_ind

warnings.filterwarnings("ignore")
%matplotlib inline

Set paths

# Path to datasets directory
data_path = "./datasets"
# Path to assets directory (for saving results to)
assets_path = "./assets"

Loading dataset

# Load hourly data
dataset = pd.read_csv(f"{data_path}/preprocessed_hour.csv")
dataset.head()
instant dteday season yr mnth hr holiday weekday workingday weathersit temp atemp hum windspeed casual registered cnt
0 1 2011-01-01 winter 2011 1 0 0 Saturday 0 clear 0.24 0.2879 81.0 0.0 3 13 16
1 2 2011-01-01 winter 2011 1 1 0 Saturday 0 clear 0.22 0.2727 80.0 0.0 8 32 40
2 3 2011-01-01 winter 2011 1 2 0 Saturday 0 clear 0.22 0.2727 80.0 0.0 5 27 32
3 4 2011-01-01 winter 2011 1 3 0 Saturday 0 clear 0.24 0.2879 75.0 0.0 3 10 13
4 5 2011-01-01 winter 2011 1 4 0 Saturday 0 clear 0.24 0.2879 75.0 0.0 0 1 1
# Print some generic statistics about the data
print(f"Shape of data: {dataset.shape}")
print(f"Number of missing values in the data: {dataset.isnull().sum().sum()}")

# Get statistics on the numerical columns
dataset.describe().T
Shape of data: (17379, 17)
Number of missing values in the data: 0
count mean std min 25% 50% 75% max
instant 17379.0 8690.000000 5017.029500 1.00 4345.5000 8690.0000 13034.5000 17379.0000
yr 17379.0 2011.502561 0.500008 2011.00 2011.0000 2012.0000 2012.0000 2012.0000
mnth 17379.0 6.537775 3.438776 1.00 4.0000 7.0000 10.0000 12.0000
hr 17379.0 11.546752 6.914405 0.00 6.0000 12.0000 18.0000 23.0000
holiday 17379.0 0.028770 0.167165 0.00 0.0000 0.0000 0.0000 1.0000
workingday 17379.0 0.682721 0.465431 0.00 0.0000 1.0000 1.0000 1.0000
temp 17379.0 0.496987 0.192556 0.02 0.3400 0.5000 0.6600 1.0000
atemp 17379.0 0.475775 0.171850 0.00 0.3333 0.4848 0.6212 1.0000
hum 17379.0 62.722884 19.292983 0.00 48.0000 63.0000 78.0000 100.0000
windspeed 17379.0 12.736540 8.196795 0.00 7.0015 12.9980 16.9979 56.9969
casual 17379.0 35.676218 49.305030 0.00 4.0000 17.0000 48.0000 367.0000
registered 17379.0 153.786869 151.357286 0.00 34.0000 115.0000 220.0000 886.0000
cnt 17379.0 189.463088 181.387599 1.00 40.0000 142.0000 281.0000 977.0000
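
Before filtering on season, year, or weekday, it can help to confirm which categorical values the preprocessed file actually contains; a quick check on the columns used below might look like this:

# Inspect the columns used for filtering in the following cells
print(dataset.season.unique())
print(dataset.weekday.unique())
print(dataset.yr.unique())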

Estimating average registered rides

# Computing population mean of registered rides
population_mean = dataset.registered.mean()

# Sample of the data (summer 2011)
sample = dataset[
    (dataset.season == "summer") & (dataset.yr == 2011)
].registered

# t-test and computing p-value
test_result = ttest_1samp(sample, population_mean)
print(
    f"Test statistic: {test_result[0]:.03f}, "
    f"p-value: {test_result[1]:.03f}"
)

# Sample as 5% of the full data (random_state makes the draw reproducible)
sample_unbiased = dataset.registered.sample(frac=0.05, random_state=111)
test_result_unbiased = ttest_1samp(sample_unbiased, population_mean)
print(
    f"Unbiased test statistic: {test_result_unbiased[0]:.03f}, "
    f"p-value: {test_result_unbiased[1]:.03f}"
)
Test statistic: -3.492, p-value: 0.000
Unbiased test statistic: -0.874, p-value: 0.382
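
Assuming a significance level of 0.05 (a conventional threshold, not stated above), the decision rule can be applied directly to the two p-values just computed; a minimal sketch:

alpha = 0.05  # assumed significance level

for name, result in [
    ("seasonal sample", test_result),
    ("random sample", test_result_unbiased),
]:
    decision = "reject" if result[1] < alpha else "fail to reject"
    print(f"{name}: p-value = {result[1]:.3f} -> {decision} the null hypothesis")

Under this rule the biased seasonal sample leads to rejecting the null hypothesis that its mean equals the population mean, while the 5% random sample does not.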

Hypothesis testing on registered rides

# Defining masks indicating whether a day is a weekend day or a working day
weekend_days = ["Saturday", "Sunday"]
weekend_mask = dataset.weekday.isin(weekend_days)
workingdays_mask = ~dataset.weekday.isin(weekend_days)

# Selecting registered rides for the weekend and working days
weekend_data = dataset.registered[weekend_mask]
workingdays_data = dataset.registered[workingdays_mask]

# t-test
test_res = ttest_ind(weekend_data, workingdays_data)
print(f"Statistic value: {test_res[0]:.03f}, p-value: {test_res[1]:.03f}")

# Plotting distributions of registered rides for working vs weekend days
sns.histplot(weekend_data, kde=True, stat="density", label="weekend days")
sns.histplot(workingdays_data, kde=True, stat="density", label="working days")
plt.legend()
plt.xlabel("rides")
plt.ylabel("frequency")
plt.title("Registered rides distributions")
plt.savefig(f"{assets_path}/hypothesis_testing_a.png", format="png")
Statistic value: -16.004, p-value: 0.000
[Figure: Registered rides distributions for weekend vs. working days]
# Selecting casual rides for the weekend and working days
weekend_data = dataset.casual[weekend_mask]
workingdays_data = dataset.casual[workingdays_mask]

# t-test
test_res = ttest_ind(weekend_data, workingdays_data)
print(f"Statistic value: {test_res[0]:.03f}, p-value: {test_res[1]:.03f}")

# Plotting distributions of casual rides for working vs weekend days
sns.histplot(weekend_data, kde=True, stat="density", label="weekend days")
sns.histplot(workingdays_data, kde=True, stat="density", label="working days")
plt.legend()
plt.xlabel("rides")
plt.ylabel("frequency")
plt.title("Casual rides distributions")
plt.savefig(f"{assets_path}/hypothesis_testing_b.png", format="png")
Statistic value: 41.077, p-value: 0.000
[Figure: Casual rides distributions for weekend vs. working days]
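
By default ttest_ind assumes equal variances in the two groups. As a robustness check (not part of the analysis above), Welch's t-test can be run on the same selections by passing equal_var=False:

# Welch's t-test: does not assume equal variances between the groups
welch_res = ttest_ind(weekend_data, workingdays_data, equal_var=False)
print(f"Welch statistic: {welch_res[0]:.03f}, p-value: {welch_res[1]:.03f}")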