# Mount Google Drive at /content/drive so files stored there can be read by path.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


import sys

# Drive folder added to the import search path; the project helper module
# imported right after this cell is presumably located here.
path_to_module = '/content/drive/MyDrive/DSML/Custom_Functions'
# Guard the append so that re-running this cell does not keep stacking
# duplicate entries onto sys.path.
if path_to_module not in sys.path:
    sys.path.append(path_to_module)


from Data_Analysis_Visualization import custom_get_df_summary, custom_plot_hist, custom_plot_box, custom_plot_numeric_distribution, custom_plt_dist_plot, custom_plt_most_least10_count_plot, custom_plot_kde_distribution


# !pip install matplotlib==3.5.3
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
from matplotlib.ticker import (MultipleLocator, AutoMinorLocator)
import seaborn as sns
import textwrap
import math
import re
from sklearn.preprocessing import MinMaxScaler
from scipy import stats


# Render every column when a DataFrame is displayed (no column truncation).
pd.set_option('display.max_columns', None)
# Load the hospital records and preview the first four rows.
df = pd.read_csv('/content/Hospitals_data.csv')
df.head(4)


# Per-column summary table produced by the project helper; it is returned
# rather than printed, and later cells look up rows of it by label.
df_summary = custom_get_df_summary(
    df,
    print_summary=False,
    properties_as_columns=False,
)
df_summary

RangeIndex: 1338 entries; Data columns (total 8 columns)
memory usage: 83.8+ KB


# Discard the 'Unnamed: 0' column.
df = df.drop(columns='Unnamed: 0')


# Store the discrete attributes with pandas' categorical dtype.
categorical_columns = ['sex', 'smoker', 'region', 'severity level']
df = df.astype({name: 'category' for name in categorical_columns})


# Rows whose hospitalization charges lie outside the 'mean-3*std' /
# 'mean+3*std' bounds stored in the summary table.
charges = df['hospitalization charges']
charges_low = float(df_summary.loc['mean-3*std', 'hospitalization charges'])
charges_high = float(df_summary.loc['mean+3*std', 'hospitalization charges'])
df.loc[(charges < charges_low) | (charges > charges_high)]


# Same bounds check for viral load.
viral_load = df['viral load']
viral_low = float(df_summary.loc['mean-3*std', 'viral load'])
viral_high = float(df_summary.loc['mean+3*std', 'viral load'])
df.loc[(viral_load < viral_low) | (viral_load > viral_high)]


# Rebuild the summary from the current state of df.
df_summary = custom_get_df_summary(
    df,
    print_summary=False,
    properties_as_columns=False,
)
df_summary

RangeIndex: 1338 entries; Data columns (total 7 columns)
memory usage: 37.4+ KB


# Preview the first four rows of the current frame.
df.head(4)


# List the column labels currently present on the frame.
df.columns

Index(['age', 'sex', 'smoker', 'region', 'viral load', 'severity level',
       'hospitalization charges', 'hospitalization charges log', 'age log'],
      dtype='object')


# Scratch, inline body of a would-be custom_plt_dist_plot(): draws one filled
# KDE curve of `var_` per level of `type_var_`, driven by module-level names.
ax_ = plt.subplot()
df_ = df
type_var_ = 'region'
var_ = 'viral load'
title_ = 'title...'
xlabel_ = 'xlll'
ylabel_ = 'ylllll'

# Three fixed colours cover up to three levels; beyond that fall back to
# seaborn's current palette. Either way keep one colour per level.
n_levels = df_[type_var_].nunique()
base_colors = sns.color_palette() if n_levels > 3 else ['#3A5BA0', '#DC3535', '#18978F']
palette = base_colors[:n_levels]


# The curves are requested in reversed level order while the legend below is
# labelled in the original order.
hue_order = df_[type_var_].unique()
sns.kdeplot(
    data=df_,
    x=var_,
    hue=type_var_,
    hue_order=hue_order[::-1],
    palette=palette,
    fill=True,
    lw=4,
    ax=ax_,
)
plt.sca(ax_)
ax_.set_title(title_, size=16, color='grey')
ax_.set_xlabel(xlabel_, size=12)
ax_.set_ylabel(ylabel_, size=12)
plt.xticks(size=12)
plt.yticks(size=12)
ax_.legend(labels=hue_order, loc='best', fontsize=12)

# Keep only the left and bottom spines, drawn in grey.
sns.despine(right=True, top=True, ax=ax_)
for side in ('left', 'bottom'):
    ax_.spines[side].set_color('grey')

figure_ = ax_.figure
figure_.set_size_inches(16, 8)
figure_.subplots_adjust(top=0.81, right=0.86)

plt.show()


# Distribution of the raw hospitalization charges.
charges_steps = {'box_major': 40000, 'box_minor': 10000}
custom_plot_numeric_distribution(df, 'hospitalization charges', 'title_', **charges_steps).show()


# Raw charges again, followed by their natural-log transform for comparison.
custom_plot_numeric_distribution(df, 'hospitalization charges', 'title_', **charges_steps).show()
df['hospitalization charges log'] = np.log(df['hospitalization charges'])
log_charges_steps = {'box_major': 2, 'box_minor': 0.5}
custom_plot_numeric_distribution(df, 'hospitalization charges log', 'title_', **log_charges_steps).show()


# Distribution of viral load; the same chart is drawn by two consecutive cells.
viral_steps = {'box_major': 5, 'box_minor': 1}
custom_plot_numeric_distribution(df, 'viral load', 'title_', **viral_steps).show()


custom_plot_numeric_distribution(df, 'viral load', 'title_', **viral_steps).show()


# Distribution of age, then of its natural-log transform.
age_steps = {'box_major': 10, 'box_minor': 2}
custom_plot_numeric_distribution(df, 'age', 'title_', **age_steps).show()
df['age log'] = np.log(df['age'])
age_log_steps = {'box_major': 1, 'box_minor': 0.25}
custom_plot_numeric_distribution(df, 'age log', 'title_', **age_log_steps).show()


# Side-by-side count charts for age: 'top' ordering on the left axes,
# 'bottom' ordering on the right axes.
fig = plt.figure()
ax1 = fig.add_subplot(1, 2, 1)
ax2 = fig.add_subplot(1, 2, 2)

# Keyword arguments common to both panels.
shared_count_kwargs = dict(
    unit_='',
    highlight_color='#DC3535',
    orientation_='h',
    agg_func='Count',
    agg_var=None,
)

custom_plt_most_least10_count_plot(
    ax1, df, '', 'age', '', '(Top) Age wise Count', 'Type', 'Count',
    highlight_num=2, order_='top', **shared_count_kwargs,
)
custom_plt_most_least10_count_plot(
    ax2, df, '', 'age', '', '(Bottom) Age wise Count', 'Type', 'Count',
    highlight_num=5, order_='bottom', **shared_count_kwargs,
)

plt.show()


# One horizontal count chart per categorical column, each on fresh axes and
# shown immediately.
count_charts = [
    ('sex', 'Gender wise Count'),
    ('smoker', 'Smoking habit wise Count'),
    ('region', 'Region wise Count'),
    ('severity level', 'Severity Level wise Count'),
]
for column_name, chart_title in count_charts:
    custom_plt_most_least10_count_plot(
        plt.subplot(), df, '', column_name, '', chart_title, 'Type', 'Count',
        unit_='',
        highlight_num=None,
        highlight_color='#DC3535',
        orientation_='h',
        agg_func='Count',
        agg_var=None,
    ).show()


# 2x2 scatter grid: rows are raw vs. log charges, columns are viral load vs. age.
fig, ax = plt.subplots(2, 2, figsize=(10, 14))

targets = ['hospitalization charges', 'hospitalization charges log']
features = ['viral load', 'age']
for row_idx, target in enumerate(targets):
    for col_idx, feature in enumerate(features):
        sns.scatterplot(data=df, x=feature, y=target, ax=ax[row_idx, col_idx])
plt.show()


# For each numeric feature, draw a 2x2 grid of charges-vs-feature scatter
# plots, colouring the points by one categorical column per panel.
hue_columns = ['sex', 'severity level', 'smoker', 'region']
for feature in ['age', 'viral load']:
    fig, axs = plt.subplots(nrows=2, ncols=2, figsize=(16, 12))

    # axs.flat walks the panels row by row: [0, 0], [0, 1], [1, 0], [1, 1].
    for panel, hue_column in zip(axs.flat, hue_columns):
        sns.scatterplot(y='hospitalization charges', x=feature, data=df, hue=hue_column, ax=panel)
    plt.show()


# Annotated heatmap of pairwise correlations between the numeric columns.
# The numeric columns are selected explicitly: on pandas 2.x, DataFrame.corr()
# no longer drops non-numeric columns by default and raises on the
# string-valued category columns. select_dtypes works on older pandas too and
# yields the same matrix the implicit drop used to produce.
numeric_df = df.select_dtypes(include='number')
sns.heatmap(numeric_df.corr(), annot=True, cbar=False, cmap=sns.cubehelix_palette(as_cmap=True))
plt.show()


# KDE of log hospitalization charges, one curve per smoking status.
# The title and axis label used to read 'Season wise Count' / 'count', which
# do not describe this data set; they now name the plotted variables.
# NOTE(review): assumes the 5th-7th positional arguments are title, x-label
# and y-label - confirm against the helper in Custom_Functions.
custom_plot_kde_distribution(plt.subplot(), df, 'smoker', 'hospitalization charges log', 'Smoker wise distribution of hospitalization charges (log)', 'hospitalization charges log', 'Density').show()


# Shapiro-Wilk normality test on log charges of the smoker == 'yes' rows.
smoker_rows = df['smoker'] == 'yes'
stats.shapiro(df.loc[smoker_rows, 'hospitalization charges log'])

ShapiroResult(statistic=0.9288101196289062, pvalue=3.4734257137181146e-10)


# Shapiro-Wilk normality test on log charges of the smoker == 'no' rows.
non_smoker_rows = df['smoker'] == 'no'
stats.shapiro(df.loc[non_smoker_rows, 'hospitalization charges log'])

ShapiroResult(statistic=0.9761605858802795, pvalue=3.2253915250440857e-12)


# Levene test for equal variance of log charges between the 'yes' and 'no'
# smoker groups (passed in that order).
log_charges_by_smoker = [
    df.loc[df['smoker'] == flag, 'hospitalization charges log']
    for flag in ('yes', 'no')
]
stats.levene(*log_charges_by_smoker)

LeveneResult(statistic=89.49584320261768, pvalue=1.332832802389931e-20)


# Independent two-sample t-test without the equal-variance assumption
# (equal_var=False), one-sided: is the mean log charge of smokers greater
# than that of non-smokers?
smoker_log_charges = df.loc[df['smoker'] == 'yes', 'hospitalization charges log']
non_smoker_log_charges = df.loc[df['smoker'] == 'no', 'hospitalization charges log']
stats.ttest_ind(
    smoker_log_charges,
    non_smoker_log_charges,
    equal_var=False,
    alternative='greater',
)

Ttest_indResult(statistic=46.37082591943892, pvalue=1.9600240968860024e-234)


# KDE of viral load, one curve per sex.
# The title and axis label used to read 'Season wise Count' / 'count', which
# do not describe this data set; they now name the plotted variables.
# NOTE(review): assumes the 5th-7th positional arguments are title, x-label
# and y-label - confirm against the helper in Custom_Functions.
custom_plot_kde_distribution(plt.subplot(), df, 'sex', 'viral load', 'Gender wise distribution of viral load', 'viral load', 'Density').show()


# Shapiro-Wilk normality test on viral load of the sex == 'female' rows.
female_rows = df['sex'] == 'female'
stats.shapiro(df.loc[female_rows, 'viral load'])

ShapiroResult(statistic=0.9930474162101746, pvalue=0.003624602919444442)


# Shapiro-Wilk normality test on viral load of the sex == 'male' rows.
male_rows = df['sex'] == 'male'
stats.shapiro(df.loc[male_rows, 'viral load'])

ShapiroResult(statistic=0.9930650591850281, pvalue=0.003189612179994583)


# Levene test for equal variance of viral load between the 'female' and
# 'male' groups (passed in that order).
viral_load_by_sex = [
    df.loc[df['sex'] == label, 'viral load']
    for label in ('female', 'male')
]
stats.levene(*viral_load_by_sex)

LeveneResult(statistic=0.0038754151966871046, pvalue=0.9503708012456551)


# Two-sided independent t-test with pooled variance (equal_var=True) comparing
# the viral load of females against males.
female_viral_load = df.loc[df['sex'] == 'female', 'viral load']
male_viral_load = df.loc[df['sex'] == 'male', 'viral load']
stats.ttest_ind(
    female_viral_load,
    male_viral_load,
    equal_var=True,
    alternative='two-sided',
)

Ttest_indResult(statistic=-1.695711164450323, pvalue=0.0901735841670204)


# Contingency table of smoker status (rows) against region (columns).
df_table = pd.crosstab(index=df['smoker'], columns=df['region'])
df_table


# Chi-square test of independence on the contingency table; only the test
# statistic and the p-value are printed.
chi2_result = stats.chi2_contingency(df_table)
chi_stat, p_value, degree_of_freedom, expected_values = chi2_result
for label, value in (('chi_stat:', chi_stat), ('p_value:', p_value)):
    print(label, value)

chi_stat: 7.34347776140707
p_value: 0.06171954839170547


# Female patients with severity level 0, 1 or 2.
df_new = df.loc[(df['sex'] == 'female') & df['severity level'].isin([0, 1, 2])]
# Plot the filtered subset. df_new used to be built and then ignored, so the
# chart drew every severity level of the female rows instead of levels 0-2.
# The title and axis label no longer carry the unrelated 'Season wise Count' /
# 'count' text.
# NOTE(review): assumes the 5th-7th positional arguments are title, x-label
# and y-label - confirm against the helper in Custom_Functions.
custom_plot_kde_distribution(plt.subplot(), df_new, 'severity level', 'viral load', 'Severity level wise distribution of viral load (female)', 'viral load', 'Density').show()


# Viral load of female patients, split into one sample per severity level 0, 1, 2.
female_rows = df['sex'] == 'female'
df1, df2, df3 = [
    df.loc[female_rows & (df['severity level'] == level), 'viral load']
    for level in (0, 1, 2)
]


# One-way ANOVA across the three severity-level samples.
stats.f_oneway(df1, df2, df3)

F_onewayResult(statistic=0.3355061434584082, pvalue=0.7151189650367746)

	sex	smoker	region	Unnamed: 0	age	severity level	hospitalization charges	viral load
dtype	object	object	object	int64	int64	int64	int64	float64
Missing Counts	0	0	0	0	0	0	0	0
nUniques	2	2	4	1338	47	6	1320	462
Top 10 Unique Values	male (50%), female (49%)	no (79%), yes (20%)	southeast (27%), southwest (24%), northwest (2...	0.0 (0%), 898.0 (0%), 896.0 (0%), 895.0 (0%), ...	18.0 (5%), 19.0 (5%), 50.0 (2%), 51.0 (2%), 47...	0.0 (42%), 1.0 (24%), 2.0 (17%), 3.0 (11%), 4....	4593.0 (0%), 35986.0 (0%), 4099.0 (0%), 23618....	9.63 (0%), 10.77 (0%), 11.37 (0%), 10.17 (0%),...
Bottom 10 Unique Values	female (49%), male (50%)	yes (20%), no (79%)	northeast (24%), southwest (24%), northwest (2...	0.0 (0%), 895.0 (0%), 894.0 (0%), 893.0 (0%), ...	64.0 (1%), 61.0 (1%), 63.0 (1%), 60.0 (1%), 62...	5.0 (1%), 4.0 (1%), 3.0 (11%), 2.0 (17%), 1.0 ...	42212.0 (0%), 33888.0 (0%), 110507.0 (0%), 260...	9.3 (0%), 8.37 (0%), 14.47 (0%), 13.07 (0%), 1...
min	nan	nan	nan	0.0	18.0	0.0	2805.0	5.3
max	nan	nan	nan	1337.0	64.0	5.0	159426.0	17.7
LW (1.5)	nan	nan	nan	0.0	18.0	0.0	2805.0	5.3
Q1	nan	nan	nan	334.2	27.0	0.0	11851.0	8.8
Median	nan	nan	nan	668.5	39.0	1.0	23455.0	10.1
Q3	nan	nan	nan	1002.8	51.0	2.0	41599.5	11.6
UW (1.5)	nan	nan	nan	1337.0	64.0	5.0	86222.2	15.8
Outlier Count (1.5*IQR)	nan	nan	nan	0	0	0	139 (10.4%)	9 (0.7%)
mean-3*std	nan	nan	nan	0.0	18.0	0.0	2805.0	5.3
mean	nan	nan	nan	668.5	39.2	1.1	33176.1	10.2
std	nan	nan	nan	386.4	14.0	1.2	30275.0	2.0
mean+3*std	nan	nan	nan	1337.0	64.0	4.7	124001.1	16.2
Outlier Count (3*std)	nan	nan	nan	0	0	18 (1.3%)	7 (0.5%)	4 (0.3%)

	age	hospitalization charges	viral load	sex	smoker	region	severity level
dtype	int64	int64	float64	category	category	category	category
Missing Counts	0	0	0	0	0	0	0
nUniques	47	1320	462	2	2	4	6
Top 10 Unique Values	18.0 (5%), 19.0 (5%), 50.0 (2%), 51.0 (2%), 47...	4593.0 (0%), 35986.0 (0%), 4099.0 (0%), 23618....	9.63 (0%), 10.77 (0%), 11.37 (0%), 10.17 (0%),...	male (50%), female (49%)	no (79%), yes (20%)	southeast (27%), northwest (24%), southwest (2...	0.0 (42%), 1.0 (24%), 2.0 (17%), 3.0 (11%), 4....
Bottom 10 Unique Values	64.0 (1%), 61.0 (1%), 63.0 (1%), 60.0 (1%), 62...	42212.0 (0%), 33888.0 (0%), 110507.0 (0%), 260...	9.3 (0%), 8.37 (0%), 14.47 (0%), 13.07 (0%), 1...	female (49%), male (50%)	yes (20%), no (79%)	northeast (24%), northwest (24%), southwest (2...	5.0 (1%), 4.0 (1%), 3.0 (11%), 2.0 (17%), 1.0 ...
min	18.0	2805.0	5.3	nan	nan	nan	nan
max	64.0	159426.0	17.7	nan	nan	nan	nan
LW (1.5)	18.0	2805.0	5.3	nan	nan	nan	nan
Q1	27.0	11851.0	8.8	nan	nan	nan	nan
Median	39.0	23455.0	10.1	nan	nan	nan	nan
Q3	51.0	41599.5	11.6	nan	nan	nan	nan
UW (1.5)	64.0	86222.2	15.8	nan	nan	nan	nan
Outlier Count (1.5*IQR)	0	139 (10.4%)	9 (0.7%)	nan	nan	nan	nan
mean-3*std	18.0	2805.0	5.3	nan	nan	nan	nan
mean	39.2	33176.1	10.2	nan	nan	nan	nan
std	14.0	30275.0	2.0	nan	nan	nan	nan
mean+3*std	64.0	124001.1	16.2	nan	nan	nan	nan
Outlier Count (3*std)	0	7 (0.5%)	4 (0.3%)	nan	nan	nan	nan

Define Problem Statement and perform Exploratory Data Analysis

Download the dataset and observe a subset of the data

Custom functions to plot custom charts and tables

Univariate and Bi-Variate Analysis

Hypothesis Testing

Insights:

Recommendations:

	Unnamed: 0	age	sex	smoker	region	viral load	severity level	hospitalization charges
0	0	19	female	yes	southwest	9.30	0	42212
1	1	18	male	no	southeast	11.26	1	4314
2	2	28	male	no	southeast	11.00	3	11124
3	3	33	male	no	northwest	7.57	0	54961

	age	sex	smoker	region	viral load	severity level	hospitalization charges
34	28	male	yes	southwest	12.13	1	127986
543	54	female	yes	southeast	15.80	0	159426
577	31	female	yes	northeast	12.70	1	146428
819	33	female	yes	northwest	11.84	0	137839
1146	60	male	yes	southwest	10.93	0	131477
1230	52	male	yes	northwest	11.50	3	150053
1300	45	male	yes	southeast	10.12	0	156482

	age	sex	smoker	region	viral load	severity level	hospitalization charges
116	58	male	no	southeast	16.35	0	28453
847	23	male	no	southeast	16.79	1	6095
1047	22	male	yes	southeast	17.53	1	111253
1317	18	male	no	southeast	17.71	0	2909