# !pip install matplotlib --upgrade
Note:
Upgrade the matplotlib version from 3.2.2 to 3.5.3
Use command
!pip install matplotlib --upgrade
About the Business
This business case is about India’s leading micro-mobility service provider, which offers unique vehicles for the daily commute. Starting off as a mission to eliminate traffic congestion in India, the company provides the safest commute solution through a user-friendly mobile app to enable shared, solo and sustainable commuting.
Special zones are located at all the appropriate locations (including metro stations, bus stands, office spaces, residential areas, corporate offices, etc) to make those first and last-miles smooth, affordable, and convenient!
Business Problem
The company has recently suffered considerable dips in their revenues. They have contracted a consulting company to understand the factors on which the demand for these shared electric cycles depends. Specifically, they want to understand the factors affecting the demand for these shared electric cycles in the American market.
Analysis Approach and Assumptions
Since the significance level to test the hypothesis is not mentioned, we will consider 0.05 as a threshold, as it is most commonly used.
We will carry out the following steps:
Importing Required Libraries
# !pip install matplotlib==3.6.2
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
from matplotlib.ticker import (MultipleLocator, AutoMinorLocator)
import seaborn as sns
import textwrap
import math
from scipy.stats import norm
from scipy import stats
df = pd.read_csv('bike_sharing.csv')
df.head()
| datetime | season | holiday | workingday | weather | temp | atemp | humidity | windspeed | casual | registered | count | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2011-01-01 00:00:00 | 1 | 0 | 0 | 1 | 9.84 | 14.395 | 81 | 0.0 | 3 | 13 | 16 |
| 1 | 2011-01-01 01:00:00 | 1 | 0 | 0 | 1 | 9.02 | 13.635 | 80 | 0.0 | 8 | 32 | 40 |
| 2 | 2011-01-01 02:00:00 | 1 | 0 | 0 | 1 | 9.02 | 13.635 | 80 | 0.0 | 5 | 27 | 32 |
| 3 | 2011-01-01 03:00:00 | 1 | 0 | 0 | 1 | 9.84 | 14.395 | 75 | 0.0 | 3 | 10 | 13 |
| 4 | 2011-01-01 04:00:00 | 1 | 0 | 0 | 1 | 9.84 | 14.395 | 75 | 0.0 | 0 | 1 | 1 |
Custom function to get Descriptive Summary of the Dataset
def get_df_summary(df_, print_summary=True, properties_as_columns=True):
    """Build a descriptive summary of every column of ``df_``.

    For each column the summary holds: dtype, missing-value count, number of
    uniques, the 10 most frequent values (with their percentage share), and —
    for numeric columns — quantiles, Tukey whisker bounds, the mean +/- 3*std
    band and outlier counts under both criteria.

    Parameters
    ----------
    df_ : pd.DataFrame
        Data to summarise; not modified.
    print_summary : bool
        If True, print the resulting summary frame.
    properties_as_columns : bool
        If True, one row per data column and one column per property;
        otherwise the transpose.

    Returns
    -------
    pd.DataFrame
        Summary frame with every cell cast to ``str``.
    """
    # Shape and memory usage of DataFrame
    print(f'RangeIndex: {df_.shape[0]} entries; Data columns (total {df_.shape[1]} columns)')
    memory_used = df_.memory_usage().sum()/1024  # in KB
    if memory_used > 1024*1024:
        memory_used = f'{round(memory_used/1024/1024, 1)}+ GB'
    elif memory_used > 1024:
        memory_used = f'{round(memory_used/1024, 1)}+ MB'
    else:
        memory_used = f'{round(memory_used, 1)}+ KB'
    print(f'memory usage: {memory_used}\n')
    # First (None-named) column holds the property label; the rest are data columns.
    summary_columns = [None] + df_.columns.to_list()

    def _row(label, values):
        # One property row aligned to the summary columns.
        return pd.DataFrame([[label] + values], columns=summary_columns)

    rows = [
        _row('dtype', [df_[clm].dtype for clm in df_.columns]),
        _row('Missing Counts', [df_[clm].isna().sum() for clm in df_.columns]),
        _row('nUniques', [df_[clm].nunique() for clm in df_.columns]),
    ]
    # Top 10 most frequent values per column, each with its share in percent.
    top10 = []
    for clm in df_.columns:
        df1 = df_[clm].value_counts().reset_index()
        # Fix: name the columns explicitly — the column names produced by
        # value_counts().reset_index() changed between pandas 1.x and 2.x.
        df1.columns = ['value', 'counts']
        df1['margin'] = df1['counts']*100/df1['counts'].sum()
        top10.append(', '.join(f"{v} ({math.floor(m)}%)"
                               for v, m in zip(df1['value'].iloc[:10], df1['margin'].iloc[:10])))
    rows.append(_row('Top 10 Unique Values', top10))
    # Fix: DataFrame.append was removed in pandas 2.0 — use pd.concat instead.
    df2 = pd.concat(rows, ignore_index=True)
    # Numeric statistics: align describe() onto all columns (non-numeric -> NaN).
    df4 = df_.describe().reindex(columns=df_.columns.to_list())
    df4 = df4.drop('count').rename({
        '25%': 'Q1',
        '50%': 'Median',
        '75%': 'Q3'
    }).reset_index().set_index('index').round(1)
    df4 = df4.T
    # Tukey whiskers (clipped to observed min/max) and the mean +/- 3*std band.
    df4['LW (1.5)'] = df4.apply(lambda x: max(x['min'], x['Q1'] - 1.5*(x['Q3']-x['Q1'])), axis=1)
    df4['UW (1.5)'] = df4.apply(lambda x: min(x['max'], x['Q3'] + 1.5*(x['Q3']-x['Q1'])), axis=1)
    df4['mean-3*std'] = df4.apply(lambda x: max(x['min'], x['mean'] - 3*x['std']), axis=1)
    df4['mean+3*std'] = df4.apply(lambda x: min(x['max'], x['mean'] + 3*x['std']), axis=1)
    lst_IQR_Outlier = []
    lst_std_Outlier = []
    numeric_columns = df_.describe().columns
    for clm in df4.index:
        if clm in numeric_columns:
            iqr_outlier_count = df_.loc[(df_[clm]<df4.loc[clm,'LW (1.5)']) | (df_[clm]>df4.loc[clm,'UW (1.5)'])].shape[0]
            iqr_outlier_pct = f'({round(iqr_outlier_count * 100.0 / len(df_), 1)}%)' if iqr_outlier_count != 0 else ''
            std_outlier_count = df_.loc[(df_[clm]<df4.loc[clm,'mean-3*std']) | (df_[clm]>df4.loc[clm,'mean+3*std'])].shape[0]
            std_outlier_pct = f'({round(std_outlier_count * 100.0 / len(df_), 1)}%)' if std_outlier_count != 0 else ''
            lst_IQR_Outlier.append(f'{iqr_outlier_count} {iqr_outlier_pct}')
            lst_std_Outlier.append(f'{std_outlier_count} {std_outlier_pct}')
        else:
            # Non-numeric columns get NaN stats.
            lst_IQR_Outlier.append(np.nan)
            lst_std_Outlier.append(np.nan)
    df4['Outlier Count (1.5*IQR)'] = lst_IQR_Outlier
    df4['Outlier Count (3*std)'] = lst_std_Outlier
    df4 = df4.round(1).T.reset_index().rename({'index': None}, axis=1)
    df2 = pd.concat([df2, df4], ignore_index=True)
    # Pivot so properties become columns; sort data columns by dtype name
    # (string sort: object > int64 > float64 > category).
    df2 = df2.set_index(None).T.astype(str).sort_values('dtype', ascending=False)
    df2 = df2[['dtype', 'Missing Counts', 'nUniques', 'Top 10 Unique Values','min','max',
               'LW (1.5)', 'Q1', 'Median', 'Q3', 'UW (1.5)', 'Outlier Count (1.5*IQR)',
               'mean-3*std', 'mean', 'std', 'mean+3*std', 'Outlier Count (3*std)']]
    if not properties_as_columns: df2 = df2.T
    if print_summary: print(df2)
    return df2
Descriptive Summary
df_summary = get_df_summary(df, print_summary=False, properties_as_columns=False)
df_summary
RangeIndex: 10886 entries; Data columns (total 12 columns) memory usage: 1020.7+ KB
| datetime | season | holiday | workingday | weather | humidity | casual | registered | count | temp | atemp | windspeed | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| dtype | object | int64 | int64 | int64 | int64 | int64 | int64 | int64 | int64 | float64 | float64 | float64 |
| Missing Counts | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| nUniques | 10886 | 4 | 2 | 2 | 4 | 89 | 309 | 731 | 822 | 49 | 60 | 28 |
| Top 10 Unique Values | 2011-01-01 00:00:00 (0%), 2012-05-01 21:00:00 ... | 4.0 (25%), 2.0 (25%), 3.0 (25%), 1.0 (24%) | 0.0 (97%), 1.0 (2%) | 1.0 (68%), 0.0 (31%) | 1.0 (66%), 2.0 (26%), 3.0 (7%), 4.0 (0%) | 88.0 (3%), 94.0 (2%), 83.0 (2%), 87.0 (2%), 70... | 0.0 (9%), 1.0 (6%), 2.0 (4%), 3.0 (4%), 4.0 (3... | 3.0 (1%), 4.0 (1%), 5.0 (1%), 6.0 (1%), 2.0 (1... | 5.0 (1%), 4.0 (1%), 3.0 (1%), 6.0 (1%), 2.0 (1... | 14.76 (4%), 26.24 (4%), 28.7 (3%), 13.94 (3%),... | 31.06 (6%), 25.76 (3%), 22.725 (3%), 20.455 (3... | 0.0 (12%), 8.9981 (10%), 11.0014 (9%), 12.998 ... |
| min | nan | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.8 | 0.8 | 0.0 |
| max | nan | 4.0 | 1.0 | 1.0 | 4.0 | 100.0 | 367.0 | 886.0 | 977.0 | 41.0 | 45.5 | 57.0 |
| LW (1.5) | nan | 1.0 | 0.0 | 0.0 | 1.0 | 2.0 | 0.0 | 0.0 | 1.0 | 0.8 | 0.8 | 0.0 |
| Q1 | nan | 2.0 | 0.0 | 0.0 | 1.0 | 47.0 | 4.0 | 36.0 | 42.0 | 13.9 | 16.7 | 7.0 |
| Median | nan | 3.0 | 0.0 | 1.0 | 1.0 | 62.0 | 17.0 | 118.0 | 145.0 | 20.5 | 24.2 | 13.0 |
| Q3 | nan | 4.0 | 0.0 | 1.0 | 2.0 | 77.0 | 49.0 | 222.0 | 284.0 | 26.2 | 31.1 | 17.0 |
| UW (1.5) | nan | 4.0 | 0.0 | 1.0 | 3.5 | 100.0 | 116.5 | 501.0 | 647.0 | 41.0 | 45.5 | 32.0 |
| Outlier Count (1.5*IQR) | nan | 0 | 311 (2.9%) | 0 | 1 (0.0%) | 22 (0.2%) | 749 (6.9%) | 423 (3.9%) | 300 (2.8%) | 0 | 2 (0.0%) | 227 (2.1%) |
| mean-3*std | nan | 1.0 | 0.0 | 0.0 | 1.0 | 4.3 | 0.0 | 0.0 | 1.0 | 0.8 | 0.8 | 0.0 |
| mean | nan | 2.5 | 0.0 | 0.7 | 1.4 | 61.9 | 36.0 | 155.6 | 191.6 | 20.2 | 23.7 | 12.8 |
| std | nan | 1.1 | 0.2 | 0.5 | 0.6 | 19.2 | 50.0 | 151.0 | 181.1 | 7.8 | 8.5 | 8.2 |
| mean+3*std | nan | 4.0 | 0.6 | 1.0 | 3.2 | 100.0 | 186.0 | 608.6 | 734.9 | 41.0 | 45.5 | 37.4 |
| Outlier Count (3*std) | nan | 0 | 311 (2.9%) | 0 | 1 (0.0%) | 22 (0.2%) | 286 (2.6%) | 235 (2.2%) | 147 (1.4%) | 0 | 2 (0.0%) | 67 (0.6%) |
Convert season, holiday, workingday and weather variable dtypes to Category
This can lead to a reduction in memory requirements and provide performance benefits
for col in ['season', 'holiday', 'workingday', 'weather']:
df[col] = df[col].astype('category')
Check potential outliers based on the criteria 1.5*IQR and 3*Std
df_potiential_outliers = df.loc[(df['count']<float(df_summary.loc['LW (1.5)','count'])) | (df['count']>float(df_summary.loc['UW (1.5)','count']))]
df_potiential_outliers
| datetime | season | holiday | workingday | weather | temp | atemp | humidity | windspeed | casual | registered | count | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 6611 | 2012-03-12 18:00:00 | 1 | 0 | 1 | 2 | 24.60 | 31.060 | 43 | 12.9980 | 89 | 623 | 712 |
| 6634 | 2012-03-13 17:00:00 | 1 | 0 | 1 | 1 | 28.70 | 31.820 | 37 | 7.0015 | 62 | 614 | 676 |
| 6635 | 2012-03-13 18:00:00 | 1 | 0 | 1 | 1 | 28.70 | 31.820 | 34 | 19.9995 | 96 | 638 | 734 |
| 6649 | 2012-03-14 08:00:00 | 1 | 0 | 1 | 1 | 18.04 | 21.970 | 82 | 0.0000 | 34 | 628 | 662 |
| 6658 | 2012-03-14 17:00:00 | 1 | 0 | 1 | 1 | 28.70 | 31.820 | 28 | 6.0032 | 140 | 642 | 782 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 10678 | 2012-12-11 08:00:00 | 4 | 0 | 1 | 2 | 13.94 | 15.150 | 61 | 19.9995 | 16 | 708 | 724 |
| 10702 | 2012-12-12 08:00:00 | 4 | 0 | 1 | 2 | 10.66 | 12.880 | 65 | 11.0014 | 18 | 670 | 688 |
| 10726 | 2012-12-13 08:00:00 | 4 | 0 | 1 | 1 | 9.84 | 11.365 | 60 | 12.9980 | 24 | 655 | 679 |
| 10846 | 2012-12-18 08:00:00 | 4 | 0 | 1 | 1 | 15.58 | 19.695 | 94 | 0.0000 | 10 | 652 | 662 |
| 10870 | 2012-12-19 08:00:00 | 4 | 0 | 1 | 1 | 9.84 | 12.880 | 87 | 7.0015 | 13 | 665 | 678 |
300 rows × 12 columns
df_potiential_outliers_summary = get_df_summary(df_potiential_outliers, print_summary=False, properties_as_columns=False)
df_potiential_outliers_summary
RangeIndex: 300 entries; Data columns (total 12 columns) memory usage: 22.9+ KB
| datetime | humidity | casual | registered | count | temp | atemp | windspeed | season | holiday | workingday | weather | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| dtype | object | int64 | int64 | int64 | int64 | float64 | float64 | float64 | category | category | category | category |
| Missing Counts | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| nUniques | 300 | 67 | 165 | 202 | 180 | 35 | 39 | 19 | 4 | 2 | 2 | 3 |
| Top 10 Unique Values | 2012-03-12 18:00:00 (0%), 2012-09-12 08:00:00 ... | 36.0 (5%), 43.0 (4%), 34.0 (3%), 41.0 (3%), 53... | 102.0 (2%), 43.0 (2%), 33.0 (2%), 86.0 (1%), 9... | 670.0 (1%), 677.0 (1%), 665.0 (1%), 697.0 (1%)... | 668.0 (2%), 678.0 (2%), 681.0 (1%), 671.0 (1%)... | 26.24 (8%), 28.7 (8%), 24.6 (7%), 29.52 (7%), ... | 31.06 (18%), 31.82 (8%), 32.575 (6%), 30.305 (... | 11.0014 (12%), 15.0013 (10%), 16.9979 (9%), 8.... | 3.0 (38%), 2.0 (33%), 4.0 (23%), 1.0 (5%) | 0.0 (99%), 1.0 (0%) | 1.0 (83%), 0.0 (16%) | 1.0 (75%), 2.0 (21%), 3.0 (3%), 4.0 (0%) |
| min | nan | 17.0 | 10.0 | 316.0 | 648.0 | 9.0 | 10.6 | 0.0 | nan | nan | nan | nan |
| max | nan | 94.0 | 367.0 | 886.0 | 977.0 | 37.7 | 42.4 | 39.0 | nan | nan | nan | nan |
| LW (1.5) | nan | 17.0 | 10.0 | 450.2 | 648.0 | 13.2 | 16.3 | 0.0 | nan | nan | nan | nan |
| Q1 | nan | 37.8 | 43.8 | 615.8 | 681.8 | 23.0 | 26.5 | 9.0 | nan | nan | nan | nan |
| Median | nan | 51.0 | 90.0 | 665.0 | 731.5 | 27.1 | 31.1 | 13.0 | nan | nan | nan | nan |
| Q3 | nan | 65.0 | 117.0 | 726.2 | 812.0 | 29.5 | 33.3 | 19.0 | nan | nan | nan | nan |
| UW (1.5) | nan | 94.0 | 226.8 | 886.0 | 977.0 | 37.7 | 42.4 | 34.0 | nan | nan | nan | nan |
| Outlier Count (1.5*IQR) | nan | 0 | 43 (14.3%) | 40 (13.3%) | 0 | 12 (4.0%) | 13 (4.3%) | 8 (2.7%) | nan | nan | nan | nan |
| mean-3*std | nan | 17.0 | 10.0 | 316.0 | 648.0 | 10.0 | 12.3 | 0.0 | nan | nan | nan | nan |
| mean | nan | 52.1 | 108.3 | 642.8 | 751.1 | 26.2 | 30.0 | 13.6 | nan | nan | nan | nan |
| std | nan | 17.0 | 85.8 | 122.6 | 77.5 | 5.4 | 5.9 | 7.8 | nan | nan | nan | nan |
| mean+3*std | nan | 94.0 | 365.7 | 886.0 | 977.0 | 37.7 | 42.4 | 37.0 | nan | nan | nan | nan |
| Outlier Count (3*std) | nan | 0 | 1 (0.3%) | 0 | 0 | 5 (1.7%) | 6 (2.0%) | 1 (0.3%) | nan | nan | nan | nan |
It should be noted that the records that looked like outliers is actual data and is not accidentally inserted. Furthermore, the potential outliers represent a very small portion of the data, and the variation is not large enough to affect our analysis. Therefore, we decide to keep these outliers for further analysis.
Create a save point for the cleaned version of dataframe
# Snapshot of the cleaned data. Fix: use .copy() so the save point is a real
# snapshot — a bare assignment is only an alias, and later in-place edits to
# df would silently change df_v01 too.
df_v01 = df.copy()
Rename the old columns and reorder
df = df[['datetime', 'season', 'holiday', 'workingday', 'weather', 'temp', 'atemp', 'humidity', 'windspeed', 'casual', 'registered', 'count']]
df.head()
| datetime | season | holiday | workingday | weather | temp | atemp | humidity | windspeed | casual | registered | count | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2011-01-01 00:00:00 | 1 | 0 | 0 | 1 | 9.84 | 14.395 | 81 | 0.0 | 3 | 13 | 16 |
| 1 | 2011-01-01 01:00:00 | 1 | 0 | 0 | 1 | 9.02 | 13.635 | 80 | 0.0 | 8 | 32 | 40 |
| 2 | 2011-01-01 02:00:00 | 1 | 0 | 0 | 1 | 9.02 | 13.635 | 80 | 0.0 | 5 | 27 | 32 |
| 3 | 2011-01-01 03:00:00 | 1 | 0 | 0 | 1 | 9.84 | 14.395 | 75 | 0.0 | 3 | 10 | 13 |
| 4 | 2011-01-01 04:00:00 | 1 | 0 | 0 | 1 | 9.84 | 14.395 | 75 | 0.0 | 0 | 1 | 1 |
Final Descriptive Summary of the Dataset
df_summary = get_df_summary(df, print_summary=False, properties_as_columns=False)
df_summary
RangeIndex: 10886 entries; Data columns (total 12 columns) memory usage: 723.7+ KB
| datetime | humidity | casual | registered | count | temp | atemp | windspeed | season | holiday | workingday | weather | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| dtype | object | int64 | int64 | int64 | int64 | float64 | float64 | float64 | category | category | category | category |
| Missing Counts | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| nUniques | 10886 | 89 | 309 | 731 | 822 | 49 | 60 | 28 | 4 | 2 | 2 | 4 |
| Top 10 Unique Values | 2011-01-01 00:00:00 (0%), 2012-05-01 21:00:00 ... | 88.0 (3%), 94.0 (2%), 83.0 (2%), 87.0 (2%), 70... | 0.0 (9%), 1.0 (6%), 2.0 (4%), 3.0 (4%), 4.0 (3... | 3.0 (1%), 4.0 (1%), 5.0 (1%), 6.0 (1%), 2.0 (1... | 5.0 (1%), 4.0 (1%), 3.0 (1%), 6.0 (1%), 2.0 (1... | 14.76 (4%), 26.24 (4%), 28.7 (3%), 13.94 (3%),... | 31.06 (6%), 25.76 (3%), 22.725 (3%), 20.455 (3... | 0.0 (12%), 8.9981 (10%), 11.0014 (9%), 12.998 ... | 4.0 (25%), 2.0 (25%), 3.0 (25%), 1.0 (24%) | 0.0 (97%), 1.0 (2%) | 1.0 (68%), 0.0 (31%) | 1.0 (66%), 2.0 (26%), 3.0 (7%), 4.0 (0%) |
| min | nan | 0.0 | 0.0 | 0.0 | 1.0 | 0.8 | 0.8 | 0.0 | nan | nan | nan | nan |
| max | nan | 100.0 | 367.0 | 886.0 | 977.0 | 41.0 | 45.5 | 57.0 | nan | nan | nan | nan |
| LW (1.5) | nan | 2.0 | 0.0 | 0.0 | 1.0 | 0.8 | 0.8 | 0.0 | nan | nan | nan | nan |
| Q1 | nan | 47.0 | 4.0 | 36.0 | 42.0 | 13.9 | 16.7 | 7.0 | nan | nan | nan | nan |
| Median | nan | 62.0 | 17.0 | 118.0 | 145.0 | 20.5 | 24.2 | 13.0 | nan | nan | nan | nan |
| Q3 | nan | 77.0 | 49.0 | 222.0 | 284.0 | 26.2 | 31.1 | 17.0 | nan | nan | nan | nan |
| UW (1.5) | nan | 100.0 | 116.5 | 501.0 | 647.0 | 41.0 | 45.5 | 32.0 | nan | nan | nan | nan |
| Outlier Count (1.5*IQR) | nan | 22 (0.2%) | 749 (6.9%) | 423 (3.9%) | 300 (2.8%) | 0 | 2 (0.0%) | 227 (2.1%) | nan | nan | nan | nan |
| mean-3*std | nan | 4.3 | 0.0 | 0.0 | 1.0 | 0.8 | 0.8 | 0.0 | nan | nan | nan | nan |
| mean | nan | 61.9 | 36.0 | 155.6 | 191.6 | 20.2 | 23.7 | 12.8 | nan | nan | nan | nan |
| std | nan | 19.2 | 50.0 | 151.0 | 181.1 | 7.8 | 8.5 | 8.2 | nan | nan | nan | nan |
| mean+3*std | nan | 100.0 | 186.0 | 608.6 | 734.9 | 41.0 | 45.5 | 37.4 | nan | nan | nan | nan |
| Outlier Count (3*std) | nan | 22 (0.2%) | 286 (2.6%) | 235 (2.2%) | 147 (1.4%) | 0 | 2 (0.0%) | 67 (0.6%) | nan | nan | nan | nan |
def plot_hist_custom(ax_, df_, var_, title_, scale_x_major, scale_x_minor, color_='#DC3535'):
    """Histogram + KDE of df_[var_] on ax_, with dashed vertical guides for
    the mean (red), median (cyan) and Q1/Q3 (black).

    scale_x_minor doubles as the histogram bin width; scale_x_major sets the
    major x-tick spacing. Returns the pyplot module for chaining .show().
    """
    if pd.api.types.is_datetime64_any_dtype(df_[var_]):
        # Fix: plot the year on a local copy instead of overwriting the
        # caller's datetime column in place.
        df_ = df_.assign(**{var_: df_[var_].dt.year})
    m1 = df_[var_].mean()
    q1 = df_[var_].quantile(.25)
    q2 = df_[var_].median()
    q3 = df_[var_].quantile(.75)
    sns.histplot(data=df_, x=var_, kde=True,
                 binwidth=scale_x_minor,
                 color=color_, ax=ax_, linewidth=2)
    # Two-point frames spanning the current y-range draw the vertical guides.
    df_mean = pd.DataFrame({'x': [m1, m1], 'y': ax_.get_ybound()})
    df_q1 = pd.DataFrame({'x': [q1, q1], 'y': ax_.get_ybound()})
    df_q2 = pd.DataFrame({'x': [q2, q2], 'y': ax_.get_ybound()})
    df_q3 = pd.DataFrame({'x': [q3, q3], 'y': ax_.get_ybound()})
    sns.lineplot(data=df_mean, x='x', y='y', color='red', ax=ax_, linestyle='--',
                 estimator=None, linewidth = 2)
    sns.lineplot(data=df_q1, x='x', y='y', color='black', ax=ax_, linestyle='--',
                 estimator=None, linewidth = 1)
    sns.lineplot(data=df_q2, x='x', y='y', color='cyan', ax=ax_, linestyle='--',
                 estimator=None, linewidth = 2)
    sns.lineplot(data=df_q3, x='x', y='y', color='black', ax=ax_, linestyle='--',
                 estimator=None, linewidth = 1)
    plt.sca(ax_)
    plt.title(title_, size=16, color='grey')
    plt.tick_params(
        axis='x',          # changes apply to the x-axis
        which='both',      # both major and minor ticks are affected
        bottom=False,      # ticks along the bottom edge are off
        top=False,         # ticks along the top edge are off
        labelbottom=False) # labels along the bottom edge are off
    plt.xlabel('')
    plt.ylabel('Count', size=12)
    plt.yticks(size=12)
    sns.despine(bottom=True, left=False, trim=True, ax=ax_)
    ax_.spines['left'].set_color('grey')
    ax_.xaxis.set_major_locator(MultipleLocator(scale_x_major))
    ax_.xaxis.set_major_formatter('{x:.0f}')
    ax_.xaxis.set_minor_locator(MultipleLocator(scale_x_minor))
    ax_.figure.set_size_inches(16,11)
    plt.subplots_adjust(top=0.95, right=0.869, hspace=0, wspace=0.1)
    return plt
def plot_box_custom(ax_, df_, var_, xlabel_, scale_x_major, scale_x_minor, color_='#DC3535'):
    """Horizontal box plot of df_[var_] on ax_, annotated with the mean/std
    (red guide) and the three quartiles (black/cyan guides).

    scale_x_major / scale_x_minor set the x-tick spacing. Returns the pyplot
    module for chaining .show().
    """
    if pd.api.types.is_datetime64_any_dtype(df_[var_]):
        # Fix: plot the year on a local copy instead of overwriting the
        # caller's datetime column in place.
        df_ = df_.assign(**{var_: df_[var_].dt.year})
    m1 = df_[var_].mean()
    st1 = df_[var_].std()
    q1 = df_[var_].quantile(.25)
    q2 = df_[var_].median()
    q3 = df_[var_].quantile(.75)
    # Build two-point frames, then shrink them to short guide segments around
    # the single box (y is in data units of the box axis).
    df_mean = pd.DataFrame({'x': [m1, m1], 'y': ax_.get_ybound()})
    df_q1 = pd.DataFrame({'x': [q1, q1], 'y': ax_.get_ybound()})
    df_q2 = pd.DataFrame({'x': [q2, q2], 'y': ax_.get_ybound()})
    df_q3 = pd.DataFrame({'x': [q3, q3], 'y': ax_.get_ybound()})
    df_mean['y'] = [-0.3, 0.2]
    df_q1['y'] = [0.2, 0.25]
    df_q2['y'] = [0.1, 0.25]
    df_q3['y'] = [0.2, 0.25]
    sns.boxplot(data=df_, x=var_, ax=ax_,
                color=color_, showmeans=True,
                flierprops={"marker": "x"}, medianprops={"color": "cyan"},
                width=0.4, fliersize=1, linewidth=2, notch=True)
    sns.lineplot(data=df_mean, x='x', y='y', color='red', ax=ax_, linestyle='--',
                 estimator=None, linewidth = 2)
    text = f' μ={m1:.1f}\n σ={st1:.1f}'
    ax_.annotate(text, xy=(m1, -0.3), rotation=90)
    sns.lineplot(data=df_q1, x='x', y='y', color='black', ax=ax_, linestyle='--',
                 estimator=None, linewidth = 1)
    text = f'Q1={q1:.1f} '
    ax_.annotate(text, xy=(q1-0.1, 0.25), rotation=90, va='top', ha='right')
    sns.lineplot(data=df_q2, x='x', y='y', color='cyan', ax=ax_, linestyle='--',
                 estimator=None, linewidth = 2)
    text = f'med={q2:.1f} '
    ax_.annotate(text, xy=(q2, 0.25), rotation=90, va='top', ha='center')
    sns.lineplot(data=df_q3, x='x', y='y', color='black', ax=ax_, linestyle='--',
                 estimator=None, linewidth = 1)
    text = f'Q3={q3:.1f} '
    ax_.annotate(text, xy=(q3+0.1, 0.25), rotation=90, va='top', ha='left')
    plt.sca(ax_)
    plt.xlabel(xlabel_, size=12)
    plt.ylabel('')
    plt.xticks(size=12)
    sns.despine(bottom=False, left=False, ax=ax_)
    ax_.spines['bottom'].set_color('grey')
    ax_.spines['left'].set_color('grey')
    ax_.xaxis.set_major_locator(MultipleLocator(scale_x_major))
    ax_.xaxis.set_major_formatter('{x:.0f}')
    ax_.xaxis.set_minor_locator(MultipleLocator(scale_x_minor))
    ax_.figure.set_size_inches(16,11)
    plt.subplots_adjust(top=0.95, right=0.869, hspace=0, wspace=0.1)
    return plt
def plt_dist_plot(ax_, df_, type_var_, var_, title_, xlabel_, ylabel_):
    """Overlaid KDE curves of df_[var_] split by the categories of
    df_[type_var_]. Returns the pyplot module for chaining .show()."""
    # Fix: derive the hue order from the df_ argument, not the module-level df.
    hue_order = df_[type_var_].unique()
    # NOTE(review): the palette lists only 3 colors, so type_var_ columns with
    # more than 3 levels would fail — confirm intended usage before reuse.
    sns.kdeplot(data=df_, x=var_, ax=ax_, fill=True,
                hue=type_var_, hue_order=hue_order[::-1], lw=4, palette=['#3A5BA0', '#DC3535', '#18978F'])
    plt.sca(ax_)
    plt.title(title_, size=16, color='grey')
    plt.xlabel(xlabel_, size=12)
    plt.ylabel(ylabel_, size=12)
    plt.xticks(size=12)
    plt.yticks(size=12)
    plt.legend(labels=hue_order, loc='best', fontsize=12)
    sns.despine(right=True, top=True, ax=ax_)
    ax_.spines['left'].set_color('grey')
    ax_.spines['bottom'].set_color('grey')
    ax_.figure.set_size_inches(16,8)
    ax_.figure.subplots_adjust(top=0.81,right=0.86)
    return plt
def plt_most10_count_plot(ax_, df_, type_var_, var_, type_, title_, independant_label_, dependant_label_, unit_='', highlight_num=None, highlight_color='#DC3535', orientation_='v', agg_func='Count', agg_var=None):
    """Bar plot of the 10 most frequent values of df_[var_] (agg_func='Count')
    or the 10 largest sums of df_[agg_var] grouped by var_ (agg_func='Sum').

    The first `highlight_num` bars are drawn in `highlight_color`, the rest in
    grey; when highlight_num is None, bars holding more than 10% of the total
    are highlighted automatically. orientation_ is 'v' or 'h'. Each bar is
    annotated inside with its scaled value (k/m/b) and percentage share.
    Returns the pyplot module for chaining .show().
    """
    agg_func = agg_func.lower()
    if agg_func == 'count':
        total_records_count = df_[var_].count()
        if type_var_ and type_:
            # Restrict to one category of type_var_ before ranking.
            list_most10 = df_.loc[df_[type_var_]==type_, var_].value_counts().iloc[:10].index
            df_ = df_.loc[(df_[type_var_]==type_) & df_[var_].isin(list_most10), [var_]]
        else:
            list_most10 = df_[var_].value_counts().iloc[:10].index
            df_ = df_.loc[df_[var_].isin(list_most10), [var_]]
    elif agg_func == 'sum':
        total_records_count = df_[agg_var].sum()
        if type_var_ and type_:
            # NOTE(review): this branch ranks by frequency rather than by the
            # sum of agg_var, and list_most10 ends up as an Index (no .index
            # attribute for the barplot below) — confirm before using it.
            list_most10 = df_.loc[df_[type_var_]==type_, var_].value_counts().iloc[:10].index
            df_ = df_.loc[(df_[type_var_]==type_) & df_[var_].isin(list_most10), [var_]]
        else:
            # Fix: use the df_ argument, not the module-level df.
            list_most10 = df_[[var_, agg_var]].groupby(var_)[agg_var].sum().sort_values(ascending=False).iloc[:10]
    else:
        pass
    if not highlight_num:
        # Auto-highlight: bars holding more than 10% of the total.
        if agg_func == 'count':
            list_highlight = df_[var_].value_counts()/total_records_count
        elif agg_func == 'sum':
            # Fix: divide the pandas object itself — dividing the plain Python
            # list returned by .to_list() raised a TypeError.
            list_highlight = list_most10/total_records_count
        else:
            pass
        list_highlight = list_highlight[list_highlight > 0.1]
        highlight_num = len(list_highlight)
    custom_palette = [highlight_color for i in range(highlight_num)] + ['grey' for i in range(10-highlight_num)]
    if orientation_ == 'v':
        if agg_func == 'count':
            sns.countplot(data=df_, x=var_, order=list_most10, ax=ax_,
                          palette = custom_palette)
        elif agg_func == 'sum':
            sns.barplot(x=[str(i) for i in list_most10.index.to_list()], y=list_most10.to_list(), ax=ax_,
                        palette = custom_palette)
        else:
            pass
        plt.sca(ax_)
        plt.xlabel(independant_label_)
        plt.ylabel(dependant_label_)
        plt.tick_params(
            axis='y',         # changes apply to the y-axis
            which='both',     # both major and minor ticks are affected
            left=False,       # ticks along the left edge are off
            right=False,      # ticks along the right edge are off
            labelleft=False)  # labels along the left edge are off
        # Wrap long category labels over multiple lines.
        labels = []
        for label in ax_.get_xticklabels():
            text = label.get_text()
            labels.append(textwrap.fill(text, width=10,
                                        break_long_words=False))
        ax_.set_xticklabels(labels, rotation=60)
        bar_labels=[]
        for container in ax_.containers:
            for rect in container:
                # Annotate inside the bar, rotated vertically; the label and
                # font size shrink/vanish for bars too short to hold them
                # (thresholds are relative to the tallest bar, container[0]).
                yloc = 4
                clr = 'white'
                align = 'bottom'
                rotation_ = 90
                count_ = 0 if np.isnan(rect.get_height()) else rect.get_height()
                pct_ = int(count_*100/total_records_count)
                pct_unit = f'({pct_}%) {unit_}'
                label_text = f'{count_/1000000000: .1f} b' if count_ > 1000000000 else f'{count_/1000000: .1f} m' if count_ > 1000000 else f'{count_/1000: .1f} k' if count_ > 1000 else ''
                label_text = f'{label_text} {pct_unit}' if label_text and count_/container[0].get_height() > 0.055+0.023*len(pct_unit) else label_text if count_/container[0].get_height() > 0.055 else ''
                size_ = 10 if count_/container[0].get_height() > 0.1 else 9 if count_/container[0].get_height() > 0.06 else 8 if count_/container[0].get_height() > 0.055 else 7
                xloc = rect.get_x() + rect.get_width() / 2
                ax_.annotate(label_text, xy=(xloc, 0), xytext=(0, yloc),
                             textcoords="offset points", size=size_,
                             ha='center', va=align, rotation=rotation_,
                             color=clr, clip_on=True)
    else:
        if agg_func == 'count':
            sns.countplot(data=df_, y=var_, order=list_most10, ax=ax_,
                          palette = custom_palette)
        elif agg_func == 'sum':
            sns.barplot(y=[str(i) for i in list_most10.index.to_list()], x=list_most10.to_list(), ax=ax_,
                        palette = custom_palette)
        else:
            pass
        plt.sca(ax_)
        plt.xlabel(dependant_label_)
        plt.ylabel(independant_label_)
        plt.tick_params(
            axis='x',           # changes apply to the x-axis
            which='both',       # both major and minor ticks are affected
            bottom=False,       # ticks along the bottom edge are off
            top=False,          # ticks along the top edge are off
            labelbottom=False)  # labels along the bottom edge are off
        # Wrap long category labels over multiple lines.
        labels = []
        for label in ax_.get_yticklabels():
            text = label.get_text()
            labels.append(textwrap.fill(text, width=15,
                                        break_long_words=False))
        ax_.set_yticklabels(labels, rotation=0)
        bar_labels=[]
        for container in ax_.containers:
            for rect in container:
                # Annotate inside the bar; same shrinking rules as the
                # vertical case, using widths instead of heights.
                xloc = 2
                clr = 'white'
                align = 'left'
                count_ = rect.get_width()
                pct_ = int(count_*100/total_records_count)
                pct_unit = f'({pct_}%) {unit_}'
                label_text = f'{count_/1000000000: .1f} b' if count_ > 1000000000 else f'{count_/1000000: .1f} m' if count_ > 1000000 else f'{count_/1000: .1f} k' if count_ > 1000 else ''
                label_text = f'{label_text} {pct_unit}' if label_text and count_/container[0].get_width() > 0.055+0.023*len(pct_unit) else label_text if count_/container[0].get_width() > 0.055 else ''
                size_ = 10 if count_/container[0].get_width() > 0.1 else 9 if count_/container[0].get_width() > 0.06 else 8 if count_/container[0].get_width() > 0.055 else 7
                # Center the text vertically in the bar
                yloc = rect.get_y() + rect.get_height() / 2
                ax_.annotate(label_text, xy=(0, yloc), xytext=(xloc, 0),
                             textcoords="offset points", size=size_,
                             ha=align, va='center',
                             color=clr, clip_on=True)
    sns.despine(left=True, bottom=True, ax=ax_)
    plt.title(title_, size=16, color='grey')
    plt.xticks(size=12)
    plt.yticks(size=12)
    ax_.spines['left'].set_color('grey')
    ax_.spines['bottom'].set_color('grey')
    ax_.figure.set_size_inches(16,8)
    ax_.figure.subplots_adjust(top=0.81,right=0.86)
    return plt
def plot_numeric_distribution(df_, var_, main_title, xlablel_, title_, highligh_num = None, box_major=None, box_minor=None):
    """Show a two-panel figure for df_[var_]: histogram + KDE on top and a
    box plot below, sharing the x axis.

    box_major / box_minor set the tick spacing (box_minor is also the bin
    width). main_title, xlablel_, title_ and highligh_num are currently
    unused; they are kept for interface compatibility with existing calls.
    """
    fig = plt.figure()
    ax1 = plt.subplot(2, 1, 1)
    ax2 = plt.subplot(2, 1, 2, sharex=ax1)
    # Fix: plot the df_ argument instead of the module-level df (every current
    # call site passes df, so rendered output is unchanged).
    plot_hist_custom(ax_=ax1, df_=df_, var_=var_, title_=f'Distribution of {var_}', scale_x_major=box_major, scale_x_minor=box_minor, color_='#18978F')
    plot_box_custom(ax_=ax2, df_=df_, var_=var_, xlabel_=var_, scale_x_major=box_major, scale_x_minor=box_minor, color_='#18978F')
    fig.set_size_inches(8,10)
    plt.subplots_adjust(hspace = 0, wspace=0.25)
    plt.show()
def plot_kde_distribution(ax_, df_, var_, summary_var_, title_, x_label_='', y_label_=''):
    """Overlay KDE curves of df_[summary_var_] for the ten categories of
    df_[var_] with the largest summed summary_var_. Returns the pyplot
    module for chaining .show()."""
    # Rank categories by total summary_var_ and keep the top ten.
    ranked = df_[[var_, summary_var_]].groupby(var_)[summary_var_].sum().sort_values(ascending=False).iloc[:10]
    categories = ranked.index.to_list()
    # One Series of raw values per retained category.
    samples = [df_.loc[df_[var_] == category, summary_var_] for category in categories]
    # Long-form frame: one (Category, value) pair per row.
    long_form = pd.DataFrame(categories, columns=['Category'])
    long_form['values'] = samples
    long_form = long_form.explode('values').reset_index(drop=True)
    sns.kdeplot(data=long_form, x='values', ax=ax_, fill=True, hue='Category',
                hue_order=categories[::-1], lw=4, palette='deep')
    plt.sca(ax_)
    plt.title(title_, size=16, color='grey')
    plt.xlabel(x_label_, size=12)
    plt.ylabel(y_label_, size=12)
    plt.xticks(size=12)
    plt.yticks([])
    plt.legend(labels=categories, loc='upper left', fontsize=12)
    # Leave a 10% margin to the left of the smallest observed value.
    ax_.set_xlim(xmin=np.min([np.min(sample) for sample in samples])*0.9)
    sns.despine(right=True, top=True, left=True, bottom=False, ax=ax_)
    ax_.spines['bottom'].set_color('grey')
    ax_.figure.set_size_inches(16,8)
    ax_.figure.subplots_adjust(top=0.81,right=0.86)
    return plt
Count
plot_numeric_distribution(df, 'count', 'Tittle_', 'xlablel_', 'title_', box_major=100, box_minor=50)
Distribution is right skewed.
Add a new variable: log(count) and check its distribution.
df['count_log'] = np.log(df['count'])
plot_numeric_distribution(df, 'count_log', 'Tittle_', 'xlablel_', 'title_', box_major=2, box_minor=0.5)
The distribution is nearly log normal.
Casual
plot_numeric_distribution(df, 'casual', 'Tittle_', 'xlablel_', 'title_', box_major=100, box_minor=20)
Distribution is right skewed.
Add a new variable: log(casual) and check its distribution.
df['casual_log'] = np.log(df['casual'])
plot_numeric_distribution(df, 'casual_log', 'Tittle_', 'xlablel_', 'title_', box_major=2, box_minor=0.5)
Registered
plot_numeric_distribution(df, 'registered', 'Tittle_', 'xlablel_', 'title_', box_major=100, box_minor=20)
Distribution is right skewed.
Add a new variable: log(registered) and check its distribution.
df['registered_log'] = np.log(df['registered'])
/usr/local/lib/python3.8/dist-packages/pandas/core/arraylike.py:364: RuntimeWarning: divide by zero encountered in log result = getattr(ufunc, method)(*inputs, **kwargs)
plot_numeric_distribution(df, 'registered_log', 'Tittle_', 'xlablel_', 'title_', box_major=2, box_minor=0.5)
Temperature
plot_numeric_distribution(df, 'temp', 'Tittle_', 'xlablel_', 'title_', box_major=10, box_minor=2)
The distribution is bimodal.
Feeling Temperature
plot_numeric_distribution(df, 'atemp', 'Tittle_', 'xlablel_', 'title_', box_major=10, box_minor=2)
The distribution is multi-modal.
Humidity
plot_numeric_distribution(df, 'humidity', 'Tittle_', 'xlablel_', 'title_', box_major=20, box_minor=5)
The distribution is bimodal.
Windspeed
plot_numeric_distribution(df, 'windspeed', 'Tittle_', 'xlablel_', 'title_', box_major=10, box_minor=2)
The distribution is bimodal.
Season
plt_most10_count_plot(plt.subplot(), df, '', 'season', '', 'title_', 'independant_label_', 'dependant_label_', unit_='', highlight_num=None, highlight_color='#DC3535', orientation_='h', agg_func='Count', agg_var=None).show()
Season wise Count
plot_kde_distribution(plt.subplot(), df, 'season', 'count', 'Season wise Count', 'count', 'Density').show()
plot_kde_distribution(plt.subplot(), df, 'season', 'count_log', 'Season wise log(Count)', 'log(count)', 'Density').show()
Log normal distribution for season 1 (spring) is apparently different from the other 3 (summer, fall, winter).
Holiday
plt_most10_count_plot(plt.subplot(), df, '', 'holiday', '', 'title_', 'independant_label_', 'dependant_label_', unit_='', highlight_num=None, highlight_color='#DC3535', orientation_='h', agg_func='Count', agg_var=None).show()
Holiday vs log(Count)
plot_kde_distribution(plt.subplot(), df, 'holiday', 'count_log', 'Holiday vs log(Count)', 'log(count)', 'Density').show()
Log normal distributions for number of bikes rented on holidays or otherwise are apparently different.
Working Day
plt_most10_count_plot(plt.subplot(), df, '', 'workingday', '', 'title_', 'independant_label_', 'dependant_label_', unit_='', highlight_num=None, highlight_color='#DC3535', orientation_='h', agg_func='Count', agg_var=None).show()
Working Day vs log(Count)
plot_kde_distribution(plt.subplot(), df, 'workingday', 'count_log', 'Working Day vs log(Count)', 'log(count)', 'Density').show()
Log normal distributions for number of bikes rented on working days or otherwise are apparently different.
Weather
plt_most10_count_plot(plt.subplot(), df, '', 'weather', '', 'title_', 'independant_label_', 'dependant_label_', unit_='', highlight_num=None, highlight_color='#DC3535', orientation_='h', agg_func='Count', agg_var=None).show()
Weather vs log(Count)
plot_kde_distribution(plt.subplot(), df, 'weather', 'count_log', 'Weather vs log(Count)', 'log(count)', 'Density').show()
/usr/local/lib/python3.8/dist-packages/seaborn/distributions.py:316: UserWarning: Dataset has 0 variance; skipping density estimate. Pass `warn_singular=False` to disable this warning. warnings.warn(msg, UserWarning)
Log normal distributions for the number of bikes rented under a particular weather condition are apparently different from the others.
Null Hypothesis (Ho): Number of bikes rented does not depend on working day or not
Alternate Hypothesis (Ha): Number of bikes rented on working days are different from those rented on non-working days
Test: 2 sample t test, 2 tailed
df1 = df.loc[df['workingday']==0, ['count_log']]
df2 = df.loc[df['workingday']==1, ['count_log']]
Ttest_indResult(statistic=array([1.8913669]), pvalue=array([0.05860191]))
print('df1 var:', np.round(df1.var().count_log, 1))
print('df2 var:', np.round(df2.var().count_log, 1))
df1 var: 1.9 df2 var: 2.4
Variances are in acceptable limit (4:1). We can assume equal variance.
stats.ttest_ind(a=df1, b=df2, equal_var=True)
Ttest_indResult(statistic=array([1.8913669]), pvalue=array([0.05860191]))
Since the p-value is greater than 0.05, we fail to reject the null hypothesis.
It shows that there is no significant evidence that the working day or otherwise have any effect on number of bikes rented.
Null Hypothesis (Ho): Number of bikes rented does not depend on season
Alternate Hypothesis (Ha): Number of bikes rented on different seasons are not same
Test: ANOVA test
df1 = df.loc[df['season']==1, ['count_log']]
df2 = df.loc[df['season']==2, ['count_log']]
df3 = df.loc[df['season']==3, ['count_log']]
df4 = df.loc[df['season']==4, ['count_log']]
stats.f_oneway(df1, df2, df3, df4)
F_onewayResult(statistic=array([192.4476898]), pvalue=array([1.30713646e-121]))
Since the p-value is less than 0.05, we reject the null hypothesis.
It shows that there is significant evidence that the number of bikes rented is different from one season to another.
Null Hypothesis (Ho): Number of bikes rented does not depend on weather
Alternate Hypothesis (Ha): Number of bikes rented on different weather condition are not same
Test: ANOVA test
df1 = df.loc[df['weather']==1, ['count_log']]
df2 = df.loc[df['weather']==2, ['count_log']]
df3 = df.loc[df['weather']==3, ['count_log']]
df4 = df.loc[df['weather']==4, ['count_log']]
stats.f_oneway(df1, df2, df3, df4)
F_onewayResult(statistic=array([49.87947148]), pvalue=array([5.21450012e-32]))
Since the p-value is less than 0.05, we reject the null hypothesis.
It shows that there is significant evidence that the number of bikes rented is different under different weather conditions.
Null Hypothesis (Ho): Weather conditions are independent of season
Alternate Hypothesis (Ha): Weather conditions depend on season
Test: chi-squared test
df_table = pd.crosstab(df['season'], df['weather'])
df_table
| weather | 1 | 2 | 3 | 4 |
|---|---|---|---|---|
| season | ||||
| 1 | 1759 | 715 | 211 | 1 |
| 2 | 1801 | 708 | 224 | 0 |
| 3 | 1930 | 604 | 199 | 0 |
| 4 | 1702 | 807 | 225 | 0 |
chi_stat, p_value, degree_of_freedom, expected_values = stats.chi2_contingency(df_table)
print('chi_stat:', chi_stat)
print('p_value:', p_value)
chi_stat: 49.158655596893624 p_value: 1.549925073686492e-07
Since the p-value is less than 0.05, we reject the null hypothesis.
It shows that there is significant evidence that the weather conditions depend on season.