# !pip install matplotlib --upgrade
Note:
Upgrade the matplotlib version from 3.2.2 to 3.5.3
Use command
!pip install matplotlib --upgrade
About the Business
This business case is about India’s leading micro-mobility service provider, which offers unique vehicles for the daily commute. Starting off as a mission to eliminate traffic congestion in India, the company provides the safest commute solution through a user-friendly mobile app to enable shared, solo and sustainable commuting.
Special zones are located at all the appropriate locations (including metro stations, bus stands, office spaces, residential areas, corporate offices, etc) to make those first and last-miles smooth, affordable, and convenient!
Business Problem
The company has recently suffered considerable dips in their revenues. They have contracted a consulting company to understand the factors on which the demand for these shared electric cycles depends. Specifically, they want to understand the factors affecting the demand for these shared electric cycles in the American market.
Analysis Approach and Assumptions
Since the significance level to test the hypothesis is not mentioned, we will consider 0.05 as a threshold, as it is most commonly used.
We will carry out the following steps:
Importing Required Libraries
# !pip install matplotlib==3.6.2
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
from matplotlib.ticker import (MultipleLocator, AutoMinorLocator)
import seaborn as sns
import textwrap
import math
from scipy.stats import norm
from scipy import stats
df = pd.read_csv('bike_sharing.csv')
df.head()
| datetime | season | holiday | workingday | weather | temp | atemp | humidity | windspeed | casual | registered | count | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2011-01-01 00:00:00 | 1 | 0 | 0 | 1 | 9.84 | 14.395 | 81 | 0.0 | 3 | 13 | 16 |
| 1 | 2011-01-01 01:00:00 | 1 | 0 | 0 | 1 | 9.02 | 13.635 | 80 | 0.0 | 8 | 32 | 40 |
| 2 | 2011-01-01 02:00:00 | 1 | 0 | 0 | 1 | 9.02 | 13.635 | 80 | 0.0 | 5 | 27 | 32 |
| 3 | 2011-01-01 03:00:00 | 1 | 0 | 0 | 1 | 9.84 | 14.395 | 75 | 0.0 | 3 | 10 | 13 |
| 4 | 2011-01-01 04:00:00 | 1 | 0 | 0 | 1 | 9.84 | 14.395 | 75 | 0.0 | 0 | 1 | 1 |
Custom function to get Descriptive Summary of the Dataset
def get_df_summary(df_, print_summary=True, properties_as_columns=True):
    """Build a descriptive summary of every column of ``df_``.

    For each column the summary holds: dtype, missing-value count, number of
    uniques, the 10 most frequent values (with their percentage share), and —
    for numeric columns — quantiles, Tukey whisker bounds, the mean +/- 3*std
    band and outlier counts under both criteria.

    Parameters
    ----------
    df_ : pd.DataFrame
        Data to summarise; not modified.
    print_summary : bool
        If True, print the resulting summary frame.
    properties_as_columns : bool
        If True, one row per data column and one column per property;
        otherwise the transpose.

    Returns
    -------
    pd.DataFrame
        Summary frame with every cell cast to ``str``.
    """
    # Shape and memory usage of DataFrame
    print(f'RangeIndex: {df_.shape[0]} entries; Data columns (total {df_.shape[1]} columns)')
    memory_used = df_.memory_usage().sum()/1024  # in KB
    if memory_used > 1024*1024:
        memory_used = f'{round(memory_used/1024/1024, 1)}+ GB'
    elif memory_used > 1024:
        memory_used = f'{round(memory_used/1024, 1)}+ MB'
    else:
        memory_used = f'{round(memory_used, 1)}+ KB'
    print(f'memory usage: {memory_used}\n')
    # First (None-named) column holds the property label; the rest are data columns.
    summary_columns = [None] + df_.columns.to_list()

    def _row(label, values):
        # One property row aligned to the summary columns.
        return pd.DataFrame([[label] + values], columns=summary_columns)

    rows = [
        _row('dtype', [df_[clm].dtype for clm in df_.columns]),
        _row('Missing Counts', [df_[clm].isna().sum() for clm in df_.columns]),
        _row('nUniques', [df_[clm].nunique() for clm in df_.columns]),
    ]
    # Top 10 most frequent values per column, each with its share in percent.
    top10 = []
    for clm in df_.columns:
        df1 = df_[clm].value_counts().reset_index()
        # Fix: name the columns explicitly — the column names produced by
        # value_counts().reset_index() changed between pandas 1.x and 2.x.
        df1.columns = ['value', 'counts']
        df1['margin'] = df1['counts']*100/df1['counts'].sum()
        top10.append(', '.join(f"{v} ({math.floor(m)}%)"
                               for v, m in zip(df1['value'].iloc[:10], df1['margin'].iloc[:10])))
    rows.append(_row('Top 10 Unique Values', top10))
    # Fix: DataFrame.append was removed in pandas 2.0 — use pd.concat instead.
    df2 = pd.concat(rows, ignore_index=True)
    # Numeric statistics: align describe() onto all columns (non-numeric -> NaN).
    df4 = df_.describe().reindex(columns=df_.columns.to_list())
    df4 = df4.drop('count').rename({
        '25%': 'Q1',
        '50%': 'Median',
        '75%': 'Q3'
    }).reset_index().set_index('index').round(1)
    df4 = df4.T
    # Tukey whiskers (clipped to observed min/max) and the mean +/- 3*std band.
    df4['LW (1.5)'] = df4.apply(lambda x: max(x['min'], x['Q1'] - 1.5*(x['Q3']-x['Q1'])), axis=1)
    df4['UW (1.5)'] = df4.apply(lambda x: min(x['max'], x['Q3'] + 1.5*(x['Q3']-x['Q1'])), axis=1)
    df4['mean-3*std'] = df4.apply(lambda x: max(x['min'], x['mean'] - 3*x['std']), axis=1)
    df4['mean+3*std'] = df4.apply(lambda x: min(x['max'], x['mean'] + 3*x['std']), axis=1)
    lst_IQR_Outlier = []
    lst_std_Outlier = []
    numeric_columns = df_.describe().columns
    for clm in df4.index:
        if clm in numeric_columns:
            iqr_outlier_count = df_.loc[(df_[clm]<df4.loc[clm,'LW (1.5)']) | (df_[clm]>df4.loc[clm,'UW (1.5)'])].shape[0]
            iqr_outlier_pct = f'({round(iqr_outlier_count * 100.0 / len(df_), 1)}%)' if iqr_outlier_count != 0 else ''
            std_outlier_count = df_.loc[(df_[clm]<df4.loc[clm,'mean-3*std']) | (df_[clm]>df4.loc[clm,'mean+3*std'])].shape[0]
            std_outlier_pct = f'({round(std_outlier_count * 100.0 / len(df_), 1)}%)' if std_outlier_count != 0 else ''
            lst_IQR_Outlier.append(f'{iqr_outlier_count} {iqr_outlier_pct}')
            lst_std_Outlier.append(f'{std_outlier_count} {std_outlier_pct}')
        else:
            # Non-numeric columns get NaN stats.
            lst_IQR_Outlier.append(np.nan)
            lst_std_Outlier.append(np.nan)
    df4['Outlier Count (1.5*IQR)'] = lst_IQR_Outlier
    df4['Outlier Count (3*std)'] = lst_std_Outlier
    df4 = df4.round(1).T.reset_index().rename({'index': None}, axis=1)
    df2 = pd.concat([df2, df4], ignore_index=True)
    # Pivot so properties become columns; sort data columns by dtype name
    # (string sort: object > int64 > float64 > category).
    df2 = df2.set_index(None).T.astype(str).sort_values('dtype', ascending=False)
    df2 = df2[['dtype', 'Missing Counts', 'nUniques', 'Top 10 Unique Values','min','max',
               'LW (1.5)', 'Q1', 'Median', 'Q3', 'UW (1.5)', 'Outlier Count (1.5*IQR)',
               'mean-3*std', 'mean', 'std', 'mean+3*std', 'Outlier Count (3*std)']]
    if not properties_as_columns: df2 = df2.T
    if print_summary: print(df2)
    return df2
Descriptive Summary
df_summary = get_df_summary(df, print_summary=False, properties_as_columns=False)
df_summary
RangeIndex: 10886 entries; Data columns (total 12 columns) memory usage: 1020.7+ KB
| datetime | season | holiday | workingday | weather | humidity | casual | registered | count | temp | atemp | windspeed | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| dtype | object | int64 | int64 | int64 | int64 | int64 | int64 | int64 | int64 | float64 | float64 | float64 |
| Missing Counts | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| nUniques | 10886 | 4 | 2 | 2 | 4 | 89 | 309 | 731 | 822 | 49 | 60 | 28 |
| Top 10 Unique Values | 2011-01-01 00:00:00 (0%), 2012-05-01 21:00:00 ... | 4.0 (25%), 2.0 (25%), 3.0 (25%), 1.0 (24%) | 0.0 (97%), 1.0 (2%) | 1.0 (68%), 0.0 (31%) | 1.0 (66%), 2.0 (26%), 3.0 (7%), 4.0 (0%) | 88.0 (3%), 94.0 (2%), 83.0 (2%), 87.0 (2%), 70... | 0.0 (9%), 1.0 (6%), 2.0 (4%), 3.0 (4%), 4.0 (3... | 3.0 (1%), 4.0 (1%), 5.0 (1%), 6.0 (1%), 2.0 (1... | 5.0 (1%), 4.0 (1%), 3.0 (1%), 6.0 (1%), 2.0 (1... | 14.76 (4%), 26.24 (4%), 28.7 (3%), 13.94 (3%),... | 31.06 (6%), 25.76 (3%), 22.725 (3%), 20.455 (3... | 0.0 (12%), 8.9981 (10%), 11.0014 (9%), 12.998 ... |
| min | nan | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.8 | 0.8 | 0.0 |
| max | nan | 4.0 | 1.0 | 1.0 | 4.0 | 100.0 | 367.0 | 886.0 | 977.0 | 41.0 | 45.5 | 57.0 |
| LW (1.5) | nan | 1.0 | 0.0 | 0.0 | 1.0 | 2.0 | 0.0 | 0.0 | 1.0 | 0.8 | 0.8 | 0.0 |
| Q1 | nan | 2.0 | 0.0 | 0.0 | 1.0 | 47.0 | 4.0 | 36.0 | 42.0 | 13.9 | 16.7 | 7.0 |
| Median | nan | 3.0 | 0.0 | 1.0 | 1.0 | 62.0 | 17.0 | 118.0 | 145.0 | 20.5 | 24.2 | 13.0 |
| Q3 | nan | 4.0 | 0.0 | 1.0 | 2.0 | 77.0 | 49.0 | 222.0 | 284.0 | 26.2 | 31.1 | 17.0 |
| UW (1.5) | nan | 4.0 | 0.0 | 1.0 | 3.5 | 100.0 | 116.5 | 501.0 | 647.0 | 41.0 | 45.5 | 32.0 |
| Outlier Count (1.5*IQR) | nan | 0 | 311 (2.9%) | 0 | 1 (0.0%) | 22 (0.2%) | 749 (6.9%) | 423 (3.9%) | 300 (2.8%) | 0 | 2 (0.0%) | 227 (2.1%) |
| mean-3*std | nan | 1.0 | 0.0 | 0.0 | 1.0 | 4.3 | 0.0 | 0.0 | 1.0 | 0.8 | 0.8 | 0.0 |
| mean | nan | 2.5 | 0.0 | 0.7 | 1.4 | 61.9 | 36.0 | 155.6 | 191.6 | 20.2 | 23.7 | 12.8 |
| std | nan | 1.1 | 0.2 | 0.5 | 0.6 | 19.2 | 50.0 | 151.0 | 181.1 | 7.8 | 8.5 | 8.2 |
| mean+3*std | nan | 4.0 | 0.6 | 1.0 | 3.2 | 100.0 | 186.0 | 608.6 | 734.9 | 41.0 | 45.5 | 37.4 |
| Outlier Count (3*std) | nan | 0 | 311 (2.9%) | 0 | 1 (0.0%) | 22 (0.2%) | 286 (2.6%) | 235 (2.2%) | 147 (1.4%) | 0 | 2 (0.0%) | 67 (0.6%) |
Convert season, holiday, workingday and weather variable dtypes to Category
This can lead to a reduction in memory requirements and provide performance benefits
for col in ['season', 'holiday', 'workingday', 'weather']:
df[col] = df[col].astype('category')
Check potential outliers based on the criteria 1.5*IQR and 3*Std
df_potiential_outliers = df.loc[(df['count']<float(df_summary.loc['LW (1.5)','count'])) | (df['count']>float(df_summary.loc['UW (1.5)','count']))]
df_potiential_outliers
| datetime | season | holiday | workingday | weather | temp | atemp | humidity | windspeed | casual | registered | count | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 6611 | 2012-03-12 18:00:00 | 1 | 0 | 1 | 2 | 24.60 | 31.060 | 43 | 12.9980 | 89 | 623 | 712 |
| 6634 | 2012-03-13 17:00:00 | 1 | 0 | 1 | 1 | 28.70 | 31.820 | 37 | 7.0015 | 62 | 614 | 676 |
| 6635 | 2012-03-13 18:00:00 | 1 | 0 | 1 | 1 | 28.70 | 31.820 | 34 | 19.9995 | 96 | 638 | 734 |
| 6649 | 2012-03-14 08:00:00 | 1 | 0 | 1 | 1 | 18.04 | 21.970 | 82 | 0.0000 | 34 | 628 | 662 |
| 6658 | 2012-03-14 17:00:00 | 1 | 0 | 1 | 1 | 28.70 | 31.820 | 28 | 6.0032 | 140 | 642 | 782 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 10678 | 2012-12-11 08:00:00 | 4 | 0 | 1 | 2 | 13.94 | 15.150 | 61 | 19.9995 | 16 | 708 | 724 |
| 10702 | 2012-12-12 08:00:00 | 4 | 0 | 1 | 2 | 10.66 | 12.880 | 65 | 11.0014 | 18 | 670 | 688 |
| 10726 | 2012-12-13 08:00:00 | 4 | 0 | 1 | 1 | 9.84 | 11.365 | 60 | 12.9980 | 24 | 655 | 679 |
| 10846 | 2012-12-18 08:00:00 | 4 | 0 | 1 | 1 | 15.58 | 19.695 | 94 | 0.0000 | 10 | 652 | 662 |
| 10870 | 2012-12-19 08:00:00 | 4 | 0 | 1 | 1 | 9.84 | 12.880 | 87 | 7.0015 | 13 | 665 | 678 |
300 rows × 12 columns
df_potiential_outliers_summary = get_df_summary(df_potiential_outliers, print_summary=False, properties_as_columns=False)
df_potiential_outliers_summary
RangeIndex: 300 entries; Data columns (total 12 columns) memory usage: 22.9+ KB
| datetime | humidity | casual | registered | count | temp | atemp | windspeed | season | holiday | workingday | weather | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| dtype | object | int64 | int64 | int64 | int64 | float64 | float64 | float64 | category | category | category | category |
| Missing Counts | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| nUniques | 300 | 67 | 165 | 202 | 180 | 35 | 39 | 19 | 4 | 2 | 2 | 3 |
| Top 10 Unique Values | 2012-03-12 18:00:00 (0%), 2012-09-12 08:00:00 ... | 36.0 (5%), 43.0 (4%), 34.0 (3%), 41.0 (3%), 53... | 102.0 (2%), 43.0 (2%), 33.0 (2%), 86.0 (1%), 9... | 670.0 (1%), 677.0 (1%), 665.0 (1%), 697.0 (1%)... | 668.0 (2%), 678.0 (2%), 681.0 (1%), 671.0 (1%)... | 26.24 (8%), 28.7 (8%), 24.6 (7%), 29.52 (7%), ... | 31.06 (18%), 31.82 (8%), 32.575 (6%), 30.305 (... | 11.0014 (12%), 15.0013 (10%), 16.9979 (9%), 8.... | 3.0 (38%), 2.0 (33%), 4.0 (23%), 1.0 (5%) | 0.0 (99%), 1.0 (0%) | 1.0 (83%), 0.0 (16%) | 1.0 (75%), 2.0 (21%), 3.0 (3%), 4.0 (0%) |
| min | nan | 17.0 | 10.0 | 316.0 | 648.0 | 9.0 | 10.6 | 0.0 | nan | nan | nan | nan |
| max | nan | 94.0 | 367.0 | 886.0 | 977.0 | 37.7 | 42.4 | 39.0 | nan | nan | nan | nan |
| LW (1.5) | nan | 17.0 | 10.0 | 450.2 | 648.0 | 13.2 | 16.3 | 0.0 | nan | nan | nan | nan |
| Q1 | nan | 37.8 | 43.8 | 615.8 | 681.8 | 23.0 | 26.5 | 9.0 | nan | nan | nan | nan |
| Median | nan | 51.0 | 90.0 | 665.0 | 731.5 | 27.1 | 31.1 | 13.0 | nan | nan | nan | nan |
| Q3 | nan | 65.0 | 117.0 | 726.2 | 812.0 | 29.5 | 33.3 | 19.0 | nan | nan | nan | nan |
| UW (1.5) | nan | 94.0 | 226.8 | 886.0 | 977.0 | 37.7 | 42.4 | 34.0 | nan | nan | nan | nan |
| Outlier Count (1.5*IQR) | nan | 0 | 43 (14.3%) | 40 (13.3%) | 0 | 12 (4.0%) | 13 (4.3%) | 8 (2.7%) | nan | nan | nan | nan |
| mean-3*std | nan | 17.0 | 10.0 | 316.0 | 648.0 | 10.0 | 12.3 | 0.0 | nan | nan | nan | nan |
| mean | nan | 52.1 | 108.3 | 642.8 | 751.1 | 26.2 | 30.0 | 13.6 | nan | nan | nan | nan |
| std | nan | 17.0 | 85.8 | 122.6 | 77.5 | 5.4 | 5.9 | 7.8 | nan | nan | nan | nan |
| mean+3*std | nan | 94.0 | 365.7 | 886.0 | 977.0 | 37.7 | 42.4 | 37.0 | nan | nan | nan | nan |
| Outlier Count (3*std) | nan | 0 | 1 (0.3%) | 0 | 0 | 5 (1.7%) | 6 (2.0%) | 1 (0.3%) | nan | nan | nan | nan |
It should be noted that the records that looked like outliers is actual data and is not accidentally inserted. Furthermore, the potential outliers represent a very small portion of the data, and the variation is not large enough to affect our analysis. Therefore, we decide to keep these outliers for further analysis.
Create a save point for the cleaned version of dataframe
# Snapshot of the cleaned data. Fix: use .copy() so the save point is a real
# snapshot — a bare assignment is only an alias, and later in-place edits to
# df would silently change df_v01 too.
df_v01 = df.copy()
Rename the old columns and reorder
df = df[['datetime', 'season', 'holiday', 'workingday', 'weather', 'temp', 'atemp', 'humidity', 'windspeed', 'casual', 'registered', 'count']]
df.head()
| datetime | season | holiday | workingday | weather | temp | atemp | humidity | windspeed | casual | registered | count | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2011-01-01 00:00:00 | 1 | 0 | 0 | 1 | 9.84 | 14.395 | 81 | 0.0 | 3 | 13 | 16 |
| 1 | 2011-01-01 01:00:00 | 1 | 0 | 0 | 1 | 9.02 | 13.635 | 80 | 0.0 | 8 | 32 | 40 |
| 2 | 2011-01-01 02:00:00 | 1 | 0 | 0 | 1 | 9.02 | 13.635 | 80 | 0.0 | 5 | 27 | 32 |
| 3 | 2011-01-01 03:00:00 | 1 | 0 | 0 | 1 | 9.84 | 14.395 | 75 | 0.0 | 3 | 10 | 13 |
| 4 | 2011-01-01 04:00:00 | 1 | 0 | 0 | 1 | 9.84 | 14.395 | 75 | 0.0 | 0 | 1 | 1 |
Final Descriptive Summary of the Dataset
df_summary = get_df_summary(df, print_summary=False, properties_as_columns=False)
df_summary
RangeIndex: 10886 entries; Data columns (total 12 columns) memory usage: 723.7+ KB
| datetime | humidity | casual | registered | count | temp | atemp | windspeed | season | holiday | workingday | weather | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| dtype | object | int64 | int64 | int64 | int64 | float64 | float64 | float64 | category | category | category | category |
| Missing Counts | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| nUniques | 10886 | 89 | 309 | 731 | 822 | 49 | 60 | 28 | 4 | 2 | 2 | 4 |
| Top 10 Unique Values | 2011-01-01 00:00:00 (0%), 2012-05-01 21:00:00 ... | 88.0 (3%), 94.0 (2%), 83.0 (2%), 87.0 (2%), 70... | 0.0 (9%), 1.0 (6%), 2.0 (4%), 3.0 (4%), 4.0 (3... | 3.0 (1%), 4.0 (1%), 5.0 (1%), 6.0 (1%), 2.0 (1... | 5.0 (1%), 4.0 (1%), 3.0 (1%), 6.0 (1%), 2.0 (1... | 14.76 (4%), 26.24 (4%), 28.7 (3%), 13.94 (3%),... | 31.06 (6%), 25.76 (3%), 22.725 (3%), 20.455 (3... | 0.0 (12%), 8.9981 (10%), 11.0014 (9%), 12.998 ... | 4.0 (25%), 2.0 (25%), 3.0 (25%), 1.0 (24%) | 0.0 (97%), 1.0 (2%) | 1.0 (68%), 0.0 (31%) | 1.0 (66%), 2.0 (26%), 3.0 (7%), 4.0 (0%) |
| min | nan | 0.0 | 0.0 | 0.0 | 1.0 | 0.8 | 0.8 | 0.0 | nan | nan | nan | nan |
| max | nan | 100.0 | 367.0 | 886.0 | 977.0 | 41.0 | 45.5 | 57.0 | nan | nan | nan | nan |
| LW (1.5) | nan | 2.0 | 0.0 | 0.0 | 1.0 | 0.8 | 0.8 | 0.0 | nan | nan | nan | nan |
| Q1 | nan | 47.0 | 4.0 | 36.0 | 42.0 | 13.9 | 16.7 | 7.0 | nan | nan | nan | nan |
| Median | nan | 62.0 | 17.0 | 118.0 | 145.0 | 20.5 | 24.2 | 13.0 | nan | nan | nan | nan |
| Q3 | nan | 77.0 | 49.0 | 222.0 | 284.0 | 26.2 | 31.1 | 17.0 | nan | nan | nan | nan |
| UW (1.5) | nan | 100.0 | 116.5 | 501.0 | 647.0 | 41.0 | 45.5 | 32.0 | nan | nan | nan | nan |
| Outlier Count (1.5*IQR) | nan | 22 (0.2%) | 749 (6.9%) | 423 (3.9%) | 300 (2.8%) | 0 | 2 (0.0%) | 227 (2.1%) | nan | nan | nan | nan |
| mean-3*std | nan | 4.3 | 0.0 | 0.0 | 1.0 | 0.8 | 0.8 | 0.0 | nan | nan | nan | nan |
| mean | nan | 61.9 | 36.0 | 155.6 | 191.6 | 20.2 | 23.7 | 12.8 | nan | nan | nan | nan |
| std | nan | 19.2 | 50.0 | 151.0 | 181.1 | 7.8 | 8.5 | 8.2 | nan | nan | nan | nan |
| mean+3*std | nan | 100.0 | 186.0 | 608.6 | 734.9 | 41.0 | 45.5 | 37.4 | nan | nan | nan | nan |
| Outlier Count (3*std) | nan | 22 (0.2%) | 286 (2.6%) | 235 (2.2%) | 147 (1.4%) | 0 | 2 (0.0%) | 67 (0.6%) | nan | nan | nan | nan |
def plot_hist_custom(ax_, df_, var_, title_, scale_x_major, scale_x_minor, color_='#DC3535'):
    """Histogram + KDE of df_[var_] on ax_, with dashed vertical guides for
    the mean (red), median (cyan) and Q1/Q3 (black).

    scale_x_minor doubles as the histogram bin width; scale_x_major sets the
    major x-tick spacing. Returns the pyplot module for chaining .show().
    """
    if pd.api.types.is_datetime64_any_dtype(df_[var_]):
        # Fix: plot the year on a local copy instead of overwriting the
        # caller's datetime column in place.
        df_ = df_.assign(**{var_: df_[var_].dt.year})
    m1 = df_[var_].mean()
    q1 = df_[var_].quantile(.25)
    q2 = df_[var_].median()
    q3 = df_[var_].quantile(.75)
    sns.histplot(data=df_, x=var_, kde=True,
                 binwidth=scale_x_minor,
                 color=color_, ax=ax_, linewidth=2)
    # Two-point frames spanning the current y-range draw the vertical guides.
    df_mean = pd.DataFrame({'x': [m1, m1], 'y': ax_.get_ybound()})
    df_q1 = pd.DataFrame({'x': [q1, q1], 'y': ax_.get_ybound()})
    df_q2 = pd.DataFrame({'x': [q2, q2], 'y': ax_.get_ybound()})
    df_q3 = pd.DataFrame({'x': [q3, q3], 'y': ax_.get_ybound()})
    sns.lineplot(data=df_mean, x='x', y='y', color='red', ax=ax_, linestyle='--',
                 estimator=None, linewidth = 2)
    sns.lineplot(data=df_q1, x='x', y='y', color='black', ax=ax_, linestyle='--',
                 estimator=None, linewidth = 1)
    sns.lineplot(data=df_q2, x='x', y='y', color='cyan', ax=ax_, linestyle='--',
                 estimator=None, linewidth = 2)
    sns.lineplot(data=df_q3, x='x', y='y', color='black', ax=ax_, linestyle='--',
                 estimator=None, linewidth = 1)
    plt.sca(ax_)
    plt.title(title_, size=16, color='grey')
    plt.tick_params(
        axis='x',          # changes apply to the x-axis
        which='both',      # both major and minor ticks are affected
        bottom=False,      # ticks along the bottom edge are off
        top=False,         # ticks along the top edge are off
        labelbottom=False) # labels along the bottom edge are off
    plt.xlabel('')
    plt.ylabel('Count', size=12)
    plt.yticks(size=12)
    sns.despine(bottom=True, left=False, trim=True, ax=ax_)
    ax_.spines['left'].set_color('grey')
    ax_.xaxis.set_major_locator(MultipleLocator(scale_x_major))
    ax_.xaxis.set_major_formatter('{x:.0f}')
    ax_.xaxis.set_minor_locator(MultipleLocator(scale_x_minor))
    ax_.figure.set_size_inches(16,11)
    plt.subplots_adjust(top=0.95, right=0.869, hspace=0, wspace=0.1)
    return plt
def plot_box_custom(ax_, df_, var_, xlabel_, scale_x_major, scale_x_minor, color_='#DC3535'):
    """Horizontal box plot of df_[var_] on ax_, annotated with the mean/std
    (red guide) and the three quartiles (black/cyan guides).

    scale_x_major / scale_x_minor set the x-tick spacing. Returns the pyplot
    module for chaining .show().
    """
    if pd.api.types.is_datetime64_any_dtype(df_[var_]):
        # Fix: plot the year on a local copy instead of overwriting the
        # caller's datetime column in place.
        df_ = df_.assign(**{var_: df_[var_].dt.year})
    m1 = df_[var_].mean()
    st1 = df_[var_].std()
    q1 = df_[var_].quantile(.25)
    q2 = df_[var_].median()
    q3 = df_[var_].quantile(.75)
    # Build two-point frames, then shrink them to short guide segments around
    # the single box (y is in data units of the box axis).
    df_mean = pd.DataFrame({'x': [m1, m1], 'y': ax_.get_ybound()})
    df_q1 = pd.DataFrame({'x': [q1, q1], 'y': ax_.get_ybound()})
    df_q2 = pd.DataFrame({'x': [q2, q2], 'y': ax_.get_ybound()})
    df_q3 = pd.DataFrame({'x': [q3, q3], 'y': ax_.get_ybound()})
    df_mean['y'] = [-0.3, 0.2]
    df_q1['y'] = [0.2, 0.25]
    df_q2['y'] = [0.1, 0.25]
    df_q3['y'] = [0.2, 0.25]
    sns.boxplot(data=df_, x=var_, ax=ax_,
                color=color_, showmeans=True,
                flierprops={"marker": "x"}, medianprops={"color": "cyan"},
                width=0.4, fliersize=1, linewidth=2, notch=True)
    sns.lineplot(data=df_mean, x='x', y='y', color='red', ax=ax_, linestyle='--',
                 estimator=None, linewidth = 2)
    text = f' μ={m1:.1f}\n σ={st1:.1f}'
    ax_.annotate(text, xy=(m1, -0.3), rotation=90)
    sns.lineplot(data=df_q1, x='x', y='y', color='black', ax=ax_, linestyle='--',
                 estimator=None, linewidth = 1)
    text = f'Q1={q1:.1f} '
    ax_.annotate(text, xy=(q1-0.1, 0.25), rotation=90, va='top', ha='right')
    sns.lineplot(data=df_q2, x='x', y='y', color='cyan', ax=ax_, linestyle='--',
                 estimator=None, linewidth = 2)
    text = f'med={q2:.1f} '
    ax_.annotate(text, xy=(q2, 0.25), rotation=90, va='top', ha='center')
    sns.lineplot(data=df_q3, x='x', y='y', color='black', ax=ax_, linestyle='--',
                 estimator=None, linewidth = 1)
    text = f'Q3={q3:.1f} '
    ax_.annotate(text, xy=(q3+0.1, 0.25), rotation=90, va='top', ha='left')
    plt.sca(ax_)
    plt.xlabel(xlabel_, size=12)
    plt.ylabel('')
    plt.xticks(size=12)
    sns.despine(bottom=False, left=False, ax=ax_)
    ax_.spines['bottom'].set_color('grey')
    ax_.spines['left'].set_color('grey')
    ax_.xaxis.set_major_locator(MultipleLocator(scale_x_major))
    ax_.xaxis.set_major_formatter('{x:.0f}')
    ax_.xaxis.set_minor_locator(MultipleLocator(scale_x_minor))
    ax_.figure.set_size_inches(16,11)
    plt.subplots_adjust(top=0.95, right=0.869, hspace=0, wspace=0.1)
    return plt
def plt_dist_plot(ax_, df_, type_var_, var_, title_, xlabel_, ylabel_):
    """Overlaid KDE curves of df_[var_] split by the categories of
    df_[type_var_]. Returns the pyplot module for chaining .show()."""
    # Fix: derive the hue order from the df_ argument, not the module-level df.
    hue_order = df_[type_var_].unique()
    # NOTE(review): the palette lists only 3 colors, so type_var_ columns with
    # more than 3 levels would fail — confirm intended usage before reuse.
    sns.kdeplot(data=df_, x=var_, ax=ax_, fill=True,
                hue=type_var_, hue_order=hue_order[::-1], lw=4, palette=['#3A5BA0', '#DC3535', '#18978F'])
    plt.sca(ax_)
    plt.title(title_, size=16, color='grey')
    plt.xlabel(xlabel_, size=12)
    plt.ylabel(ylabel_, size=12)
    plt.xticks(size=12)
    plt.yticks(size=12)
    plt.legend(labels=hue_order, loc='best', fontsize=12)
    sns.despine(right=True, top=True, ax=ax_)
    ax_.spines['left'].set_color('grey')
    ax_.spines['bottom'].set_color('grey')
    ax_.figure.set_size_inches(16,8)
    ax_.figure.subplots_adjust(top=0.81,right=0.86)
    return plt
def plt_most10_count_plot(ax_, df_, type_var_, var_, type_, title_, independant_label_, dependant_label_, unit_='', highlight_num=None, highlight_color='#DC3535', orientation_='v', agg_func='Count', agg_var=None):
    """Bar plot of the 10 most frequent values of df_[var_] (agg_func='Count')
    or the 10 largest sums of df_[agg_var] grouped by var_ (agg_func='Sum').

    The first `highlight_num` bars are drawn in `highlight_color`, the rest in
    grey; when highlight_num is None, bars holding more than 10% of the total
    are highlighted automatically. orientation_ is 'v' or 'h'. Each bar is
    annotated inside with its scaled value (k/m/b) and percentage share.
    Returns the pyplot module for chaining .show().
    """
    agg_func = agg_func.lower()
    if agg_func == 'count':
        total_records_count = df_[var_].count()
        if type_var_ and type_:
            # Restrict to one category of type_var_ before ranking.
            list_most10 = df_.loc[df_[type_var_]==type_, var_].value_counts().iloc[:10].index
            df_ = df_.loc[(df_[type_var_]==type_) & df_[var_].isin(list_most10), [var_]]
        else:
            list_most10 = df_[var_].value_counts().iloc[:10].index
            df_ = df_.loc[df_[var_].isin(list_most10), [var_]]
    elif agg_func == 'sum':
        total_records_count = df_[agg_var].sum()
        if type_var_ and type_:
            # NOTE(review): this branch ranks by frequency rather than by the
            # sum of agg_var, and list_most10 ends up as an Index (no .index
            # attribute for the barplot below) — confirm before using it.
            list_most10 = df_.loc[df_[type_var_]==type_, var_].value_counts().iloc[:10].index
            df_ = df_.loc[(df_[type_var_]==type_) & df_[var_].isin(list_most10), [var_]]
        else:
            # Fix: use the df_ argument, not the module-level df.
            list_most10 = df_[[var_, agg_var]].groupby(var_)[agg_var].sum().sort_values(ascending=False).iloc[:10]
    else:
        pass
    if not highlight_num:
        # Auto-highlight: bars holding more than 10% of the total.
        if agg_func == 'count':
            list_highlight = df_[var_].value_counts()/total_records_count
        elif agg_func == 'sum':
            # Fix: divide the pandas object itself — dividing the plain Python
            # list returned by .to_list() raised a TypeError.
            list_highlight = list_most10/total_records_count
        else:
            pass
        list_highlight = list_highlight[list_highlight > 0.1]
        highlight_num = len(list_highlight)
    custom_palette = [highlight_color for i in range(highlight_num)] + ['grey' for i in range(10-highlight_num)]
    if orientation_ == 'v':
        if agg_func == 'count':
            sns.countplot(data=df_, x=var_, order=list_most10, ax=ax_,
                          palette = custom_palette)
        elif agg_func == 'sum':
            sns.barplot(x=[str(i) for i in list_most10.index.to_list()], y=list_most10.to_list(), ax=ax_,
                        palette = custom_palette)
        else:
            pass
        plt.sca(ax_)
        plt.xlabel(independant_label_)
        plt.ylabel(dependant_label_)
        plt.tick_params(
            axis='y',         # changes apply to the y-axis
            which='both',     # both major and minor ticks are affected
            left=False,       # ticks along the left edge are off
            right=False,      # ticks along the right edge are off
            labelleft=False)  # labels along the left edge are off
        # Wrap long category labels over multiple lines.
        labels = []
        for label in ax_.get_xticklabels():
            text = label.get_text()
            labels.append(textwrap.fill(text, width=10,
                                        break_long_words=False))
        ax_.set_xticklabels(labels, rotation=60)
        bar_labels=[]
        for container in ax_.containers:
            for rect in container:
                # Annotate inside the bar, rotated vertically; the label and
                # font size shrink/vanish for bars too short to hold them
                # (thresholds are relative to the tallest bar, container[0]).
                yloc = 4
                clr = 'white'
                align = 'bottom'
                rotation_ = 90
                count_ = 0 if np.isnan(rect.get_height()) else rect.get_height()
                pct_ = int(count_*100/total_records_count)
                pct_unit = f'({pct_}%) {unit_}'
                label_text = f'{count_/1000000000: .1f} b' if count_ > 1000000000 else f'{count_/1000000: .1f} m' if count_ > 1000000 else f'{count_/1000: .1f} k' if count_ > 1000 else ''
                label_text = f'{label_text} {pct_unit}' if label_text and count_/container[0].get_height() > 0.055+0.023*len(pct_unit) else label_text if count_/container[0].get_height() > 0.055 else ''
                size_ = 10 if count_/container[0].get_height() > 0.1 else 9 if count_/container[0].get_height() > 0.06 else 8 if count_/container[0].get_height() > 0.055 else 7
                xloc = rect.get_x() + rect.get_width() / 2
                ax_.annotate(label_text, xy=(xloc, 0), xytext=(0, yloc),
                             textcoords="offset points", size=size_,
                             ha='center', va=align, rotation=rotation_,
                             color=clr, clip_on=True)
    else:
        if agg_func == 'count':
            sns.countplot(data=df_, y=var_, order=list_most10, ax=ax_,
                          palette = custom_palette)
        elif agg_func == 'sum':
            sns.barplot(y=[str(i) for i in list_most10.index.to_list()], x=list_most10.to_list(), ax=ax_,
                        palette = custom_palette)
        else:
            pass
        plt.sca(ax_)
        plt.xlabel(dependant_label_)
        plt.ylabel(independant_label_)
        plt.tick_params(
            axis='x',           # changes apply to the x-axis
            which='both',       # both major and minor ticks are affected
            bottom=False,       # ticks along the bottom edge are off
            top=False,          # ticks along the top edge are off
            labelbottom=False)  # labels along the bottom edge are off
        # Wrap long category labels over multiple lines.
        labels = []
        for label in ax_.get_yticklabels():
            text = label.get_text()
            labels.append(textwrap.fill(text, width=15,
                                        break_long_words=False))
        ax_.set_yticklabels(labels, rotation=0)
        bar_labels=[]
        for container in ax_.containers:
            for rect in container:
                # Annotate inside the bar; same shrinking rules as the
                # vertical case, using widths instead of heights.
                xloc = 2
                clr = 'white'
                align = 'left'
                count_ = rect.get_width()
                pct_ = int(count_*100/total_records_count)
                pct_unit = f'({pct_}%) {unit_}'
                label_text = f'{count_/1000000000: .1f} b' if count_ > 1000000000 else f'{count_/1000000: .1f} m' if count_ > 1000000 else f'{count_/1000: .1f} k' if count_ > 1000 else ''
                label_text = f'{label_text} {pct_unit}' if label_text and count_/container[0].get_width() > 0.055+0.023*len(pct_unit) else label_text if count_/container[0].get_width() > 0.055 else ''
                size_ = 10 if count_/container[0].get_width() > 0.1 else 9 if count_/container[0].get_width() > 0.06 else 8 if count_/container[0].get_width() > 0.055 else 7
                # Center the text vertically in the bar
                yloc = rect.get_y() + rect.get_height() / 2
                ax_.annotate(label_text, xy=(0, yloc), xytext=(xloc, 0),
                             textcoords="offset points", size=size_,
                             ha=align, va='center',
                             color=clr, clip_on=True)
    sns.despine(left=True, bottom=True, ax=ax_)
    plt.title(title_, size=16, color='grey')
    plt.xticks(size=12)
    plt.yticks(size=12)
    ax_.spines['left'].set_color('grey')
    ax_.spines['bottom'].set_color('grey')
    ax_.figure.set_size_inches(16,8)
    ax_.figure.subplots_adjust(top=0.81,right=0.86)
    return plt
def plot_numeric_distribution(df_, var_, main_title, xlablel_, title_, highligh_num = None, box_major=None, box_minor=None):
    """Show a two-panel figure for df_[var_]: histogram + KDE on top and a
    box plot below, sharing the x axis.

    box_major / box_minor set the tick spacing (box_minor is also the bin
    width). main_title, xlablel_, title_ and highligh_num are currently
    unused; they are kept for interface compatibility with existing calls.
    """
    fig = plt.figure()
    ax1 = plt.subplot(2, 1, 1)
    ax2 = plt.subplot(2, 1, 2, sharex=ax1)
    # Fix: plot the df_ argument instead of the module-level df (every current
    # call site passes df, so rendered output is unchanged).
    plot_hist_custom(ax_=ax1, df_=df_, var_=var_, title_=f'Distribution of {var_}', scale_x_major=box_major, scale_x_minor=box_minor, color_='#18978F')
    plot_box_custom(ax_=ax2, df_=df_, var_=var_, xlabel_=var_, scale_x_major=box_major, scale_x_minor=box_minor, color_='#18978F')
    fig.set_size_inches(8,10)
    plt.subplots_adjust(hspace = 0, wspace=0.25)
    plt.show()
def plot_kde_distribution(ax_, df_, var_, summary_var_, title_, x_label_='', y_label_=''):
    """Overlay KDE curves of df_[summary_var_] for the ten categories of
    df_[var_] with the largest summed summary_var_. Returns the pyplot
    module for chaining .show()."""
    # Rank categories by total summary_var_ and keep the top ten.
    ranked = df_[[var_, summary_var_]].groupby(var_)[summary_var_].sum().sort_values(ascending=False).iloc[:10]
    categories = ranked.index.to_list()
    # One Series of raw values per retained category.
    samples = [df_.loc[df_[var_] == category, summary_var_] for category in categories]
    # Long-form frame: one (Category, value) pair per row.
    long_form = pd.DataFrame(categories, columns=['Category'])
    long_form['values'] = samples
    long_form = long_form.explode('values').reset_index(drop=True)
    sns.kdeplot(data=long_form, x='values', ax=ax_, fill=True, hue='Category',
                hue_order=categories[::-1], lw=4, palette='deep')
    plt.sca(ax_)
    plt.title(title_, size=16, color='grey')
    plt.xlabel(x_label_, size=12)
    plt.ylabel(y_label_, size=12)
    plt.xticks(size=12)
    plt.yticks([])
    plt.legend(labels=categories, loc='upper left', fontsize=12)
    # Leave a 10% margin to the left of the smallest observed value.
    ax_.set_xlim(xmin=np.min([np.min(sample) for sample in samples])*0.9)
    sns.despine(right=True, top=True, left=True, bottom=False, ax=ax_)
    ax_.spines['bottom'].set_color('grey')
    ax_.figure.set_size_inches(16,8)
    ax_.figure.subplots_adjust(top=0.81,right=0.86)
    return plt
Count
plot_numeric_distribution(df, 'count', 'Tittle_', 'xlablel_', 'title_', box_major=100, box_minor=50)
Distribution is right skewed.
Add a new variable: log(count) and check its distribution.
df['count_log'] = np.log(df['count'])
plot_numeric_distribution(df, 'count_log', 'Tittle_', 'xlablel_', 'title_', box_major=2, box_minor=0.5)
The distribution is nearly log normal.
Casual
plot_numeric_distribution(df, 'casual', 'Tittle_', 'xlablel_', 'title_', box_major=100, box_minor=20)
Distribution is right skewed.
Add a new variable: log(casual) and check its distribution.
df['casual_log'] = np.log(df['casual'])
plot_numeric_distribution(df, 'casual_log', 'Tittle_', 'xlablel_', 'title_', box_major=2, box_minor=0.5)
Registered
plot_numeric_distribution(df, 'registered', 'Tittle_', 'xlablel_', 'title_', box_major=100, box_minor=20)
Distribution is right skewed.
Add a new variable: log(registered) and check its distribution.
df['registered_log'] = np.log(df['registered'])
/usr/local/lib/python3.8/dist-packages/pandas/core/arraylike.py:364: RuntimeWarning: divide by zero encountered in log result = getattr(ufunc, method)(*inputs, **kwargs)
plot_numeric_distribution(df, 'registered_log', 'Tittle_', 'xlablel_', 'title_', box_major=2, box_minor=0.5)
Temperature
plot_numeric_distribution(df, 'temp', 'Tittle_', 'xlablel_', 'title_', box_major=10, box_minor=2)
The distribution is bimodal.
Feeling Temperature
plot_numeric_distribution(df, 'atemp', 'Tittle_', 'xlablel_', 'title_', box_major=10, box_minor=2)
The distribution is multi-modal.
Humidity
plot_numeric_distribution(df, 'humidity', 'Tittle_', 'xlablel_', 'title_', box_major=20, box_minor=5)
The distribution is bimodal.
Windspeed
plot_numeric_distribution(df, 'windspeed', 'Tittle_', 'xlablel_', 'title_', box_major=10, box_minor=2)
The distribution is bimodal.
Season
plt_most10_count_plot(plt.subplot(), df, '', 'season', '', 'title_', 'independant_label_', 'dependant_label_', unit_='', highlight_num=None, highlight_color='#DC3535', orientation_='h', agg_func='Count', agg_var=None).show()
Season wise Count
plot_kde_distribution(plt.subplot(), df, 'season', 'count', 'Season wise Count', 'count', 'Density').show()
plot_kde_distribution(plt.subplot(), df, 'season', 'count_log', 'Season wise log(Count)', 'log(count)', 'Density').show()
Log normal distribution for season 1 (spring) is apparently different from the other 3 (summer, fall, winter).
Holiday
plt_most10_count_plot(plt.subplot(), df, '', 'holiday', '', 'title_', 'independant_label_', 'dependant_label_', unit_='', highlight_num=None, highlight_color='#DC3535', orientation_='h', agg_func='Count', agg_var=None).show()
Holiday vs log(Count)
plot_kde_distribution(plt.subplot(), df, 'holiday', 'count_log', 'Holiday vs log(Count)', 'log(count)', 'Density').show()
Log normal distributions for number of bikes rented on holidays or otherwise are apparently different.
Working Day
plt_most10_count_plot(plt.subplot(), df, '', 'workingday', '', 'title_', 'independant_label_', 'dependant_label_', unit_='', highlight_num=None, highlight_color='#DC3535', orientation_='h', agg_func='Count', agg_var=None).show()
Working Day vs log(Count)
plot_kde_distribution(plt.subplot(), df, 'workingday', 'count_log', 'Working Day vs log(Count)', 'log(count)', 'Density').show()
Log normal distributions for number of bikes rented on working days or otherwise are apparently different.
Weather
plt_most10_count_plot(plt.subplot(), df, '', 'weather', '', 'title_', 'independant_label_', 'dependant_label_', unit_='', highlight_num=None, highlight_color='#DC3535', orientation_='h', agg_func='Count', agg_var=None).show()
Weather vs log(Count)
plot_kde_distribution(plt.subplot(), df, 'weather', 'count_log', 'Weather vs log(Count)', 'log(count)', 'Density').show()
/usr/local/lib/python3.8/dist-packages/seaborn/distributions.py:316: UserWarning: Dataset has 0 variance; skipping density estimate. Pass `warn_singular=False` to disable this warning. warnings.warn(msg, UserWarning)
Log normal distributions for the number of bikes rented under a particular weather condition are apparently different from the others.
Null Hypothesis (Ho): Number of bikes rented does not depend on working day or not
Alternate Hypothesis (Ha): Number of bikes rented on working days are different from those rented on non-working days
Test: 2 sample t test, 2 tailed
df1 = df.loc[df['workingday']==0, ['count_log']]
df2 = df.loc[df['workingday']==1, ['count_log']]
Ttest_indResult(statistic=array([1.8913669]), pvalue=array([0.05860191]))
print('df1 var:', np.round(df1.var().count_log, 1))
print('df2 var:', np.round(df2.var().count_log, 1))
df1 var: 1.9 df2 var: 2.4
Variances are in acceptable limit (4:1). We can assume equal variance.
stats.ttest_ind(a=df1, b=df2, equal_var=True)
Ttest_indResult(statistic=array([1.8913669]), pvalue=array([0.05860191]))
Since the p-value is greater than 0.05, we fail to reject the null hypothesis.
It shows that there is no significant evidence that the working day or otherwise have any effect on number of bikes rented.
Null Hypothesis (Ho): Number of bikes rented does not depend on season
Alternate Hypothesis (Ha): Number of bikes rented on different seasons are not same
Test: ANOVA test
df1 = df.loc[df['season']==1, ['count_log']]
df2 = df.loc[df['season']==2, ['count_log']]
df3 = df.loc[df['season']==3, ['count_log']]
df4 = df.loc[df['season']==4, ['count_log']]
stats.f_oneway(df1, df2, df3, df4)
F_onewayResult(statistic=array([192.4476898]), pvalue=array([1.30713646e-121]))
Since the p-value is less than 0.05, we reject the null hypothesis.
It shows that there is significant evidence that the number of bikes rented is different from one season to another.
Null Hypothesis (Ho): Number of bikes rented does not depend on weather
Alternate Hypothesis (Ha): Number of bikes rented on different weather condition are not same
Test: ANOVA test
df1 = df.loc[df['weather']==1, ['count_log']]
df2 = df.loc[df['weather']==2, ['count_log']]
df3 = df.loc[df['weather']==3, ['count_log']]
df4 = df.loc[df['weather']==4, ['count_log']]
stats.f_oneway(df1, df2, df3, df4)
F_onewayResult(statistic=array([49.87947148]), pvalue=array([5.21450012e-32]))
Since the p-value is less than 0.05, we reject the null hypothesis.
It shows that there is significant evidence that the number of bikes rented is different under different weather conditions.
Null Hypothesis (Ho): Weather conditions are independent of season
Alternate Hypothesis (Ha): Weather conditions depend on season
Test: chi-squared test
df_table = pd.crosstab(df['season'], df['weather'])
df_table
| weather | 1 | 2 | 3 | 4 |
|---|---|---|---|---|
| season | ||||
| 1 | 1759 | 715 | 211 | 1 |
| 2 | 1801 | 708 | 224 | 0 |
| 3 | 1930 | 604 | 199 | 0 |
| 4 | 1702 | 807 | 225 | 0 |
chi_stat, p_value, degree_of_freedom, expected_values = stats.chi2_contingency(df_table)
print('chi_stat:', chi_stat)
print('p_value:', p_value)
chi_stat: 49.158655596893624 p_value: 1.549925073686492e-07
Since the p-value is less than 0.05, we reject the null hypothesis.
It shows that there is significant evidence that the weather conditions depend on season.