# Mount Google Drive into the Colab runtime at /content/drive.
from google.colab import drive
drive.mount('/content/drive')
Mounted at /content/drive
# Make the helper module stored on Drive importable, then pull in the plotting/summary helpers.
import sys

path_to_module = '/content/drive/MyDrive/DSML/Custom_Functions'
sys.path.append(path_to_module)

from Data_Analysis_Visualization import (
    custom_get_df_summary,
    custom_plot_hist,
    custom_plot_box,
    custom_plot_numeric_distribution,
    custom_plt_dist_plot,
    custom_plt_most_least10_count_plot,
    custom_plot_kde_distribution,
)
Note:
Upgrade the matplotlib version from 3.2.2 to 3.5.3
Use command
!pip install matplotlib --upgrade
About the Business
The business case is about a well-known hospital chain in India. It was established in 1983 and is renowned as the architect of modern healthcare in India. As the nation's first corporate hospital, it is acclaimed for pioneering the private healthcare revolution in the country.
Business Problem
The company wants to know:
Which variables are significant in predicting the reason for hospitalization for different regions
How well some variables like viral load, smoking, Severity Level describe the hospitalization charges
Concept Used
We will carry out the following steps (Concept Used):
Importing Required Libraries
# !pip install matplotlib==3.5.3
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
from matplotlib.ticker import (MultipleLocator, AutoMinorLocator)
import seaborn as sns
import textwrap
import math
import re
from sklearn.preprocessing import MinMaxScaler
from scipy import stats
# Load the hospital dataset from the Colab working directory.
df = pd.read_csv('/content/Hospitals_data.csv')
# Do not truncate columns when a DataFrame is displayed.
pd.set_option('display.max_columns', None)
# Preview the first four rows.
df.head(4)
| Unnamed: 0 | age | sex | smoker | region | viral load | severity level | hospitalization charges | |
|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 19 | female | yes | southwest | 9.30 | 0 | 42212 |
| 1 | 1 | 18 | male | no | southeast | 11.26 | 1 | 4314 |
| 2 | 2 | 28 | male | no | southeast | 11.00 | 3 | 11124 |
| 3 | 3 | 33 | male | no | northwest | 7.57 | 0 | 54961 |
Column Profiling
Custom function to get the Descriptive Summary of the Dataset
Descriptive Summary
# Build the per-column summary table with the imported custom helper and display it.
# NOTE(review): the helper is defined outside this file; its exact rounding/clipping rules are not visible here.
df_summary = custom_get_df_summary(df, print_summary=False, properties_as_columns=False)
df_summary
RangeIndex: 1338 entries; Data columns (total 8 columns) memory usage: 83.8+ KB
| sex | smoker | region | Unnamed: 0 | age | severity level | hospitalization charges | viral load | |
|---|---|---|---|---|---|---|---|---|
| dtype | object | object | object | int64 | int64 | int64 | int64 | float64 |
| Missing Counts | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| nUniques | 2 | 2 | 4 | 1338 | 47 | 6 | 1320 | 462 |
| Top 10 Unique Values | male (50%), female (49%) | no (79%), yes (20%) | southeast (27%), southwest (24%), northwest (2... | 0.0 (0%), 898.0 (0%), 896.0 (0%), 895.0 (0%), ... | 18.0 (5%), 19.0 (5%), 50.0 (2%), 51.0 (2%), 47... | 0.0 (42%), 1.0 (24%), 2.0 (17%), 3.0 (11%), 4.... | 4593.0 (0%), 35986.0 (0%), 4099.0 (0%), 23618.... | 9.63 (0%), 10.77 (0%), 11.37 (0%), 10.17 (0%),... |
| Bottom 10 Unique Values | female (49%), male (50%) | yes (20%), no (79%) | northeast (24%), southwest (24%), northwest (2... | 0.0 (0%), 895.0 (0%), 894.0 (0%), 893.0 (0%), ... | 64.0 (1%), 61.0 (1%), 63.0 (1%), 60.0 (1%), 62... | 5.0 (1%), 4.0 (1%), 3.0 (11%), 2.0 (17%), 1.0 ... | 42212.0 (0%), 33888.0 (0%), 110507.0 (0%), 260... | 9.3 (0%), 8.37 (0%), 14.47 (0%), 13.07 (0%), 1... |
| min | nan | nan | nan | 0.0 | 18.0 | 0.0 | 2805.0 | 5.3 |
| max | nan | nan | nan | 1337.0 | 64.0 | 5.0 | 159426.0 | 17.7 |
| LW (1.5) | nan | nan | nan | 0.0 | 18.0 | 0.0 | 2805.0 | 5.3 |
| Q1 | nan | nan | nan | 334.2 | 27.0 | 0.0 | 11851.0 | 8.8 |
| Median | nan | nan | nan | 668.5 | 39.0 | 1.0 | 23455.0 | 10.1 |
| Q3 | nan | nan | nan | 1002.8 | 51.0 | 2.0 | 41599.5 | 11.6 |
| UW (1.5) | nan | nan | nan | 1337.0 | 64.0 | 5.0 | 86222.2 | 15.8 |
| Outlier Count (1.5*IQR) | nan | nan | nan | 0 | 0 | 0 | 139 (10.4%) | 9 (0.7%) |
| mean-3*std | nan | nan | nan | 0.0 | 18.0 | 0.0 | 2805.0 | 5.3 |
| mean | nan | nan | nan | 668.5 | 39.2 | 1.1 | 33176.1 | 10.2 |
| std | nan | nan | nan | 386.4 | 14.0 | 1.2 | 30275.0 | 2.0 |
| mean+3*std | nan | nan | nan | 1337.0 | 64.0 | 4.7 | 124001.1 | 16.2 |
| Outlier Count (3*std) | nan | nan | nan | 0 | 0 | 18 (1.3%) | 7 (0.5%) | 4 (0.3%) |
Drop redundant field: 'Unnamed: 0'
# Remove the redundant index-like column that came in with the CSV.
df = df.drop(columns=['Unnamed: 0'])
Convert 'sex', 'smoker', 'region' and 'severity level' variable dtypes to Category
This can lead to a reduction in memory requirements and provide performance benefits
# Cast the low-cardinality columns to pandas' categorical dtype.
# The assignment must be indented so that it forms the loop body.
for col in ['sex', 'smoker', 'region', 'severity level']:
    df[col] = df[col].astype('category')
Outlier Check and Treatment:
We will consider 3*std for the potential outlier checking.
# Rows whose charges fall outside the mean -/+ 3*std band recorded in df_summary.
charges_lower = float(df_summary.loc['mean-3*std', 'hospitalization charges'])
charges_upper = float(df_summary.loc['mean+3*std', 'hospitalization charges'])
df.loc[(df['hospitalization charges'] < charges_lower) | (df['hospitalization charges'] > charges_upper)]
| age | sex | smoker | region | viral load | severity level | hospitalization charges | |
|---|---|---|---|---|---|---|---|
| 34 | 28 | male | yes | southwest | 12.13 | 1 | 127986 |
| 543 | 54 | female | yes | southeast | 15.80 | 0 | 159426 |
| 577 | 31 | female | yes | northeast | 12.70 | 1 | 146428 |
| 819 | 33 | female | yes | northwest | 11.84 | 0 | 137839 |
| 1146 | 60 | male | yes | southwest | 10.93 | 0 | 131477 |
| 1230 | 52 | male | yes | northwest | 11.50 | 3 | 150053 |
| 1300 | 45 | male | yes | southeast | 10.12 | 0 | 156482 |
# Rows whose viral load falls outside the mean -/+ 3*std band recorded in df_summary.
viral_lower = float(df_summary.loc['mean-3*std', 'viral load'])
viral_upper = float(df_summary.loc['mean+3*std', 'viral load'])
df.loc[(df['viral load'] < viral_lower) | (df['viral load'] > viral_upper)]
| age | sex | smoker | region | viral load | severity level | hospitalization charges | |
|---|---|---|---|---|---|---|---|
| 116 | 58 | male | no | southeast | 16.35 | 0 | 28453 |
| 847 | 23 | male | no | southeast | 16.79 | 1 | 6095 |
| 1047 | 22 | male | yes | southeast | 17.53 | 1 | 111253 |
| 1317 | 18 | male | no | southeast | 17.71 | 0 | 2909 |
It should be noted that the records that look like outliers are actual data and were not inserted accidentally. Furthermore, the potential outliers represent a very small portion of the data, and the variation is not large enough to affect our analysis. Therefore, we decide to keep these outliers for further analysis.
Summary of dataframe after the preprocessing operations
# Recompute the summary table now that the redundant column is dropped and dtypes are converted.
df_summary = custom_get_df_summary(df, print_summary=False, properties_as_columns=False)
df_summary
RangeIndex: 1338 entries; Data columns (total 7 columns) memory usage: 37.4+ KB
| age | hospitalization charges | viral load | sex | smoker | region | severity level | |
|---|---|---|---|---|---|---|---|
| dtype | int64 | int64 | float64 | category | category | category | category |
| Missing Counts | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| nUniques | 47 | 1320 | 462 | 2 | 2 | 4 | 6 |
| Top 10 Unique Values | 18.0 (5%), 19.0 (5%), 50.0 (2%), 51.0 (2%), 47... | 4593.0 (0%), 35986.0 (0%), 4099.0 (0%), 23618.... | 9.63 (0%), 10.77 (0%), 11.37 (0%), 10.17 (0%),... | male (50%), female (49%) | no (79%), yes (20%) | southeast (27%), northwest (24%), southwest (2... | 0.0 (42%), 1.0 (24%), 2.0 (17%), 3.0 (11%), 4.... |
| Bottom 10 Unique Values | 64.0 (1%), 61.0 (1%), 63.0 (1%), 60.0 (1%), 62... | 42212.0 (0%), 33888.0 (0%), 110507.0 (0%), 260... | 9.3 (0%), 8.37 (0%), 14.47 (0%), 13.07 (0%), 1... | female (49%), male (50%) | yes (20%), no (79%) | northeast (24%), northwest (24%), southwest (2... | 5.0 (1%), 4.0 (1%), 3.0 (11%), 2.0 (17%), 1.0 ... |
| min | 18.0 | 2805.0 | 5.3 | nan | nan | nan | nan |
| max | 64.0 | 159426.0 | 17.7 | nan | nan | nan | nan |
| LW (1.5) | 18.0 | 2805.0 | 5.3 | nan | nan | nan | nan |
| Q1 | 27.0 | 11851.0 | 8.8 | nan | nan | nan | nan |
| Median | 39.0 | 23455.0 | 10.1 | nan | nan | nan | nan |
| Q3 | 51.0 | 41599.5 | 11.6 | nan | nan | nan | nan |
| UW (1.5) | 64.0 | 86222.2 | 15.8 | nan | nan | nan | nan |
| Outlier Count (1.5*IQR) | 0 | 139 (10.4%) | 9 (0.7%) | nan | nan | nan | nan |
| mean-3*std | 18.0 | 2805.0 | 5.3 | nan | nan | nan | nan |
| mean | 39.2 | 33176.1 | 10.2 | nan | nan | nan | nan |
| std | 14.0 | 30275.0 | 2.0 | nan | nan | nan | nan |
| mean+3*std | 64.0 | 124001.1 | 16.2 | nan | nan | nan | nan |
| Outlier Count (3*std) | 0 | 7 (0.5%) | 4 (0.3%) | nan | nan | nan | nan |
# Preview the first four rows after preprocessing.
df.head(4)
| age | sex | smoker | region | viral load | severity level | hospitalization charges | |
|---|---|---|---|---|---|---|---|
| 0 | 19 | female | yes | southwest | 9.30 | 0 | 42212 |
| 1 | 18 | male | no | southeast | 11.26 | 1 | 4314 |
| 2 | 28 | male | no | southeast | 11.00 | 3 | 11124 |
| 3 | 33 | male | no | northwest | 7.57 | 0 | 54961 |
# NOTE(review): the recorded output below already lists 'hospitalization charges log' and
# 'age log', which are only created further down -- presumably this cell was last run out of order.
df.columns
Index(['age', 'sex', 'smoker', 'region', 'viral load', 'severity level',
'hospitalization charges', 'hospitalization charges log', 'age log'],
dtype='object')
# Inline (scratch) version of the hue-wise KDE plot; kept as a flat script rather than a function.
ax_ = plt.subplot()
df_ = df
type_var_ = 'region'
var_ = 'viral load'
title_ = 'Region wise Viral Load Distribution'
xlabel_ = 'Viral Load'
ylabel_ = 'Density'

# Three hand-picked colours cover up to three hue levels; with more levels
# fall back to seaborn's current default palette.
n_levels = df_[type_var_].nunique()
palette = ['#3A5BA0', '#DC3535', '#18978F']
if n_levels > 3:
    palette = sns.color_palette()
# Always trim to one colour per hue level: a list palette whose length differs
# from the number of levels is rejected (or warned about) by seaborn.
palette = palette[:n_levels]

hue_order = df_[type_var_].unique()
# NOTE(review): the levels are passed reversed while the legend below uses the
# un-reversed order -- presumably to line the labels up with the order in which
# seaborn adds the filled artists; confirm against the seaborn version in use.
sns.kdeplot(data=df_, x=var_, ax=ax_, fill=True,
            hue=type_var_, hue_order=hue_order[::-1], lw=4, palette=palette)

plt.sca(ax_)
plt.title(title_, size=16, color='grey')
plt.xlabel(xlabel_, size=12)
plt.ylabel(ylabel_, size=12)
plt.xticks(size=12)
plt.yticks(size=12)
plt.legend(labels=hue_order, loc='best', fontsize=12)

# Keep only the left/bottom spines and tone them down.
sns.despine(right=True, top=True, ax=ax_)
ax_.spines['left'].set_color('grey')
ax_.spines['bottom'].set_color('grey')

ax_.figure.set_size_inches(16, 8)
ax_.figure.subplots_adjust(top=0.81, right=0.86)
plt.show()
# Distribution of raw hospitalization charges; the placeholder title is replaced with a descriptive one.
custom_plot_numeric_distribution(df, 'hospitalization charges', 'Hospitalization Charges Distribution', box_major=40000, box_minor=10000).show()
Hospitalization Charges
# Distribution of raw hospitalization charges (descriptive title instead of the 'title_' placeholder).
custom_plot_numeric_distribution(df, 'hospitalization charges', 'Hospitalization Charges Distribution', box_major=40000, box_minor=10000).show()
# Natural-log transform of the charges; the summary table shows a minimum of 2805, so the log is defined.
df['hospitalization charges log'] = np.log(df['hospitalization charges'])
custom_plot_numeric_distribution(df, 'hospitalization charges log', 'Log of Hospitalization Charges Distribution', box_major=2, box_minor=0.5).show()
Viral Load
# Distribution of viral load. The original issued this exact call twice; one plot is enough.
custom_plot_numeric_distribution(df, 'viral load', 'Viral Load Distribution', box_major=5, box_minor=1).show()
Age
# Distribution of age (descriptive title instead of the 'title_' placeholder).
custom_plot_numeric_distribution(df, 'age', 'Age Distribution', box_major=10, box_minor=2).show()
# Natural-log transform of age; the summary table shows a minimum age of 18, so the log is defined.
df['age log'] = np.log(df['age'])
custom_plot_numeric_distribution(df, 'age log', 'Log of Age Distribution', box_major=1, box_minor=0.25).show()
# Side-by-side count plots: most frequent ages on the left, least frequent on the right.
fig, (ax1, ax2) = plt.subplots(1, 2)
shared_kwargs = dict(unit_='', highlight_color='#DC3535', orientation_='h', agg_func='Count', agg_var=None)
custom_plt_most_least10_count_plot(
    ax1, df, '', 'age', '', '(Top) Age wise Count', 'Type', 'Count',
    highlight_num=2, order_='top', **shared_kwargs,
)
custom_plt_most_least10_count_plot(
    ax2, df, '', 'age', '', '(Bottom) Age wise Count', 'Type', 'Count',
    highlight_num=5, order_='bottom', **shared_kwargs,
)
plt.show()
Sex
# Count of records per gender.
custom_plt_most_least10_count_plot(
    plt.subplot(), df, '', 'sex', '', 'Gender wise Count', 'Type', 'Count',
    unit_='', highlight_num=None, highlight_color='#DC3535',
    orientation_='h', agg_func='Count', agg_var=None,
).show()
Smoker
# Count of records per smoking habit.
custom_plt_most_least10_count_plot(
    plt.subplot(), df, '', 'smoker', '', 'Smoking habit wise Count', 'Type', 'Count',
    unit_='', highlight_num=None, highlight_color='#DC3535',
    orientation_='h', agg_func='Count', agg_var=None,
).show()
Region
# Count of records per region.
custom_plt_most_least10_count_plot(
    plt.subplot(), df, '', 'region', '', 'Region wise Count', 'Type', 'Count',
    unit_='', highlight_num=None, highlight_color='#DC3535',
    orientation_='h', agg_func='Count', agg_var=None,
).show()
Severity Level
# Count of records per severity level.
custom_plt_most_least10_count_plot(
    plt.subplot(), df, '', 'severity level', '', 'Severity Level wise Count', 'Type', 'Count',
    unit_='', highlight_num=None, highlight_color='#DC3535',
    orientation_='h', agg_func='Count', agg_var=None,
).show()
Hospitalization charges vs age and viral load (and other categorical variables)
# 2x2 grid: rows are raw vs. log charges, columns are viral load vs. age.
fig, ax = plt.subplots(2, 2, figsize=(10, 14))
for r, target in enumerate(['hospitalization charges', 'hospitalization charges log']):
    for c, feature in enumerate(['viral load', 'age']):
        sns.scatterplot(data=df, x=feature, y=target, ax=ax[r, c])
plt.show()
# Charges vs. age, one panel per categorical variable used as hue (row-major panel order).
fig, axs = plt.subplots(nrows=2, ncols=2, figsize=(16, 12))
for panel, hue_var in zip(axs.flat, ['sex', 'severity level', 'smoker', 'region']):
    sns.scatterplot(data=df, x='age', y='hospitalization charges', hue=hue_var, ax=panel)
plt.show()
# Charges vs. viral load, one panel per categorical variable used as hue (row-major panel order).
fig, axs = plt.subplots(nrows=2, ncols=2, figsize=(16, 12))
for panel, hue_var in zip(axs.flat, ['sex', 'severity level', 'smoker', 'region']):
    sns.scatterplot(data=df, x='viral load', y='hospitalization charges', hue=hue_var, ax=panel)
plt.show()
# Correlation heatmap of the numeric features.
# Select numeric columns explicitly: df also holds categorical string columns
# ('sex', 'smoker', 'region'), and DataFrame.corr() raises on those in pandas >= 2.0
# (older versions dropped them silently). select_dtypes works on every version.
numeric_df = df.select_dtypes(include='number')
sns.heatmap(numeric_df.corr(), annot=True, cbar=False, cmap=sns.cubehelix_palette(as_cmap=True))
plt.show()
Assume a significance level ($\alpha$) of 0.05. We also assume that the observations are independent of each other.
It is observed that the 'Hospitalization Charges' feature visually appears to be log-normally distributed, whereas the 'Viral Load' feature is normally distributed. 'Age' is uniformly distributed.
We can assume that the 'Hospitalization Charges' feature is log-normally distributed and the 'Viral Load' feature is normally distributed.
We will check for equal variance using Levene statistical Test
Test 1 (Right Tailed Two Sample t Test)
H0 = Mean of hospitalization charges for smokers and non-smokers are not significantly different
Ha = Mean of hospitalization charges for smokers is significantly greater than that for non-smokers
# KDE of log charges split by smoking status.
# The title and x-label previously read 'Season wise Count' / 'count', which do not describe this plot.
custom_plot_kde_distribution(plt.subplot(), df, 'smoker', 'hospitalization charges log', 'Smoker wise Distribution of Hospitalization Charges (log)', 'Hospitalization Charges (log)', 'Density').show()
Visually, the distributions do not look Gaussian. We will confirm our observation using the Shapiro-Wilk test.
# Shapiro-Wilk normality test on the log charges of smokers.
is_smoker = df['smoker'] == 'yes'
stats.shapiro(df.loc[is_smoker, 'hospitalization charges log'])
ShapiroResult(statistic=0.9288101196289062, pvalue=3.4734257137181146e-10)
# Shapiro-Wilk normality test on the log charges of non-smokers.
is_non_smoker = df['smoker'] == 'no'
stats.shapiro(df.loc[is_non_smoker, 'hospitalization charges log'])
ShapiroResult(statistic=0.9761605858802795, pvalue=3.2253915250440857e-12)
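Since p_value < significance level for both groups, the Shapiro-Wilk test rejects normality. However, both samples are large (274 smokers and 1,064 non-smokers), so by the Central Limit Theorem the sample means are approximately normally distributed and the t-test is reasonably robust to this departure. We will therefore proceed with the t-test, treating the distributions as approximately normal.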
# Levene test for equal variance of log charges between smokers and non-smokers.
log_charges_by_smoker = [
    df.loc[df['smoker'] == status, 'hospitalization charges log']
    for status in ('yes', 'no')
]
stats.levene(*log_charges_by_smoker)
LeveneResult(statistic=89.49584320261768, pvalue=1.332832802389931e-20)
Since p_value < significance level; we observe unequal variances
# Right-tailed Welch t-test (unequal variances): smokers' mean log charges vs. non-smokers'.
smoker_log_charges = df.loc[df['smoker'] == 'yes', 'hospitalization charges log']
non_smoker_log_charges = df.loc[df['smoker'] == 'no', 'hospitalization charges log']
stats.ttest_ind(smoker_log_charges, non_smoker_log_charges, equal_var=False, alternative='greater')
Ttest_indResult(statistic=46.37082591943892, pvalue=1.9600240968860024e-234)
Since p_value < significance level; mean of hospitalization charges for smokers is significantly greater than that for non-smokers
Test 2 (Two Tailed Two Sample t Test)
H0 = Mean of viral load for females and males are not significantly different
Ha = Mean of viral load for females and males are significantly different
# KDE of viral load split by sex.
# The title and x-label previously read 'Season wise Count' / 'count', which do not describe this plot.
custom_plot_kde_distribution(plt.subplot(), df, 'sex', 'viral load', 'Sex wise Distribution of Viral Load', 'Viral Load', 'Density').show()
Visually, both distributions look nearly identical and Gaussian.
# Shapiro-Wilk normality test on the viral load of females.
is_female = df['sex'] == 'female'
stats.shapiro(df.loc[is_female, 'viral load'])
ShapiroResult(statistic=0.9930474162101746, pvalue=0.003624602919444442)
# Shapiro-Wilk normality test on the viral load of males.
is_male = df['sex'] == 'male'
stats.shapiro(df.loc[is_male, 'viral load'])
ShapiroResult(statistic=0.9930650591850281, pvalue=0.003189612179994583)
# Levene test for equal variance of viral load between females and males.
viral_load_by_sex = [
    df.loc[df['sex'] == sex, 'viral load']
    for sex in ('female', 'male')
]
stats.levene(*viral_load_by_sex)
LeveneResult(statistic=0.0038754151966871046, pvalue=0.9503708012456551)
Since p_value > significance level; we can assume equal variances
# Two-sided pooled-variance t-test: mean viral load of females vs. males.
female_viral_load = df.loc[df['sex'] == 'female', 'viral load']
male_viral_load = df.loc[df['sex'] == 'male', 'viral load']
stats.ttest_ind(female_viral_load, male_viral_load, equal_var=True, alternative='two-sided')
Ttest_indResult(statistic=-1.695711164450323, pvalue=0.0901735841670204)
Since p_value > significance level; mean of viral load for females and males are not significantly different
Test 3 (chi squared test)
H0 = Smoking is not significantly dependent on region (the two are independent)
Ha = Smoking is significantly dependent on region
# Contingency table of smoking status (rows) against region (columns).
df_table = pd.crosstab(index=df['smoker'], columns=df['region'])
df_table
| region | northeast | northwest | southeast | southwest |
|---|---|---|---|---|
| smoker | ||||
| no | 257 | 267 | 273 | 267 |
| yes | 67 | 58 | 91 | 58 |
# Chi-square test of independence on the smoker x region contingency table.
chi2_result = stats.chi2_contingency(df_table)
chi_stat, p_value, degree_of_freedom, expected_values = chi2_result
for label, value in (('chi_stat:', chi_stat), ('p_value:', p_value)):
    print(label, value)
chi_stat: 7.34347776140707 p_value: 0.06171954839170547
Since p_value > significance level; smoking is not significantly dependent on region (the two are independent)
Test 4 (One way ANOVA)
H0 = Mean viral load of women with 0 Severity level , 1 Severity level, and 2 Severity level are not significantly different
Ha = At least one of the mean viral loads of women with 0 Severity level, 1 Severity level, and 2 Severity level is significantly different from the other means
# Women with severity level 0, 1 or 2 -- the population the hypothesis is about.
# .copy() so the category clean-up below does not write into a view of df.
df_new = df.loc[
    (df['sex'] == 'female')
    & ((df['severity level'] == 0) | (df['severity level'] == 1) | (df['severity level'] == 2))
].copy()
# Drop severity categories that no longer occur so they cannot show up as empty hue levels.
if isinstance(df_new['severity level'].dtype, pd.CategoricalDtype):
    df_new['severity level'] = df_new['severity level'].cat.remove_unused_categories()

# Plot only the three groups under test. The original built df_new but plotted every
# severity level of all women, and carried the title/x-label 'Season wise Count' / 'count'.
custom_plot_kde_distribution(plt.subplot(), df_new, 'severity level', 'viral load', 'Severity Level wise Distribution of Viral Load (Females)', 'Viral Load', 'Density').show()

# One viral-load sample per severity level, then the one-way ANOVA across the three.
df1, df2, df3 = (
    df_new.loc[df_new['severity level'] == level, 'viral load']
    for level in (0, 1, 2)
)
stats.f_oneway(df1, df2, df3)
F_onewayResult(statistic=0.3355061434584082, pvalue=0.7151189650367746)
Since p_value > significance level; mean viral load of women with 0 Severity level , 1 Severity level, and 2 Severity level are not significantly different