# !pip install matplotlib --upgrade


# !pip install matplotlib==3.6.2
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
from matplotlib.ticker import (MultipleLocator, AutoMinorLocator)
import seaborn as sns
import textwrap
import math
from scipy.stats import norm


df = pd.read_csv('/content/Sales_data.txt')
df.head()


def get_df_summary(df_, print_summary=True, properties_as_columns=True):
  # Shape and memory usage of DataFrame
  print(f'RangeIndex: {df_.shape[0]} entries; Data columns (total {df_.shape[1]} columns)')
  memory_used = df_.memory_usage().sum()/1024
  if memory_used > 1024*1024:
    memory_used =  f'{round(memory_used/1024/1024, 1)}+ GB'
  elif memory_used > 1024:
    memory_used =  f'{round(memory_used/1024, 1)}+ MB'
  else:
    memory_used =  f'{round(memory_used, 1)}+ KB'
  print(f'memory usage: {memory_used}\n')

  # Create an empty df with column names from original df
  df2 = pd.DataFrame(columns=[None]+df_.columns.to_list())

  # Add dtype
  property_ = ['dtype']
  for clm in df_.columns:
    property_.append(df_[clm].dtype)
  df2 = df2.append(pd.DataFrame([property_], columns=df2.columns))

  # Add Missing Values Counts
  property_ = ['Missing Counts']
  for clm in df_.columns:
    property_.append(df_[clm].isna().sum())
  df2 = df2.append(pd.DataFrame([property_], columns=df2.columns))

  # Add nUniques
  property_ = ['nUniques']
  for clm in df_.columns:
    property_.append(df_[clm].nunique())
  df2 = df2.append(pd.DataFrame([property_], columns=df2.columns))

  # Add unique values
  property_ = ['Top 10 Unique Values']
  for clm in df_.columns:
    df1 = df_[clm].value_counts().reset_index()
    df1['margin'] = df1[clm]*100/ df1[clm].sum()
    property_.append(', '.join([i for i in df1.apply(lambda x:f"{x['index']} ({math.floor(x['margin'])}%)", axis=1).iloc[:10]]))
  df2 = df2.append(pd.DataFrame([property_], columns=df2.columns))

  # Getting Numeric Variables Statistics
  df4 = pd.DataFrame(columns=df_.columns.to_list())
  df4 = df4.append(df_.describe()).drop('count').rename({
      '25%': 'Q1',
      '50%': 'Median',
      '75%': 'Q3'
  }).reset_index().set_index('index').round(1)
  df4 = df4.T
  df4['LW (1.5)'] = df4.apply(lambda x: max(x['min'], x['Q1'] - 1.5*(x['Q3']-x['Q1'])), axis=1)
  df4['UW (1.5)'] = df4.apply(lambda x: min(x['max'], x['Q3'] + 1.5*(x['Q3']-x['Q1'])), axis=1)

  df4['mean-3*std'] = df4.apply(lambda x: max(x['min'], x['mean'] - 3*x['std']), axis=1)
  df4['mean+3*std'] = df4.apply(lambda x: min(x['max'], x['mean'] + 3*x['std']), axis=1)

  lst_IQR_Outlier = []
  lst_std_Outlier = []
  for clm in df4.index:
    if clm in df_.describe().columns:
      iqr_outlier_count = df_.loc[(df_[clm]<df4.loc[clm,'LW (1.5)']) | (df_[clm]>df4.loc[clm,'UW (1.5)'])].shape[0]
      iqr_outlier_pct = f'({round(iqr_outlier_count * 100.0 / df_.__len__(), 1)}%)' if iqr_outlier_count != 0 else ''
      
      std_outlier_count = df_.loc[(df_[clm]<df4.loc[clm,'mean-3*std']) | (df_[clm]>df4.loc[clm,'mean+3*std'])].shape[0]
      std_outlier_pct = f'({round(std_outlier_count * 100.0 / df_.__len__(), 1)}%)' if std_outlier_count != 0 else ''
      
      lst_IQR_Outlier.append(f'{iqr_outlier_count} {iqr_outlier_pct}')
      lst_std_Outlier.append(f'{std_outlier_count} {std_outlier_pct}')
    else:
      lst_IQR_Outlier.append(np.nan)
      lst_std_Outlier.append(np.nan)

  df4['Outlier Count (1.5*IQR)'] = lst_IQR_Outlier
  df4['Outlier Count (3*std)'] = lst_std_Outlier

  df4 = df4.round(1).T.reset_index().rename({'index': None}, axis=1)
  df2 = df2.append(df4)

  # Sort the columns acording to dtype
  df2 = df2.set_index(None).T.astype(str).sort_values('dtype', ascending=False)
  df2 = df2[['dtype', 'Missing Counts', 'nUniques', 'Top 10 Unique Values','min','max', 
  'LW (1.5)', 'Q1', 'Median', 'Q3',   'UW (1.5)', 'Outlier Count (1.5*IQR)', 
  'mean-3*std',  'mean', 'std', 'mean+3*std', 'Outlier Count (3*std)']]

  if not properties_as_columns: df2 = df2.T
  if print_summary: print(df2) 

  return df2


df_summary = get_df_summary(df, print_summary=False, properties_as_columns=False)
df_summary

RangeIndex: 550068 entries; Data columns (total 10 columns)
memory usage: 42.0+ MB


df['Product_Category'].sort_values().unique()

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20])


df['Age'].sort_values().unique()

array(['0-17', '18-25', '26-35', '36-45', '46-50', '51-55', '55+'],
      dtype=object)


df['Occupation'].sort_values().unique()

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20])


for col in ['Product_ID', 'Product_Category', 'User_ID', 'Gender', 'Age', 'Marital_Status', 'Occupation', 'City_Category', 'Stay_In_Current_City_Years']:
  df[col] = df[col].astype('category')


df_potiential_outliers = df.loc[(df['Purchase']<float(df_summary.loc['LW (1.5)','Purchase'])) | (df['Purchase']>float(df_summary.loc['UW (1.5)','Purchase']))]
df_potiential_outliers


df_potiential_outliers_summary = get_df_summary(df_potiential_outliers, print_summary=False, properties_as_columns=False)
df_potiential_outliers_summary

RangeIndex: 2677 entries; Data columns (total 10 columns)
memory usage: 405.4+ KB


df.loc[df['Product_Category'].isin([10, 15, 9])].describe()


df_v01 = df


df = df.rename({
    'Stay_In_Current_City_Years': 'City_Stay'
}, axis=1)
df = df[['Product_ID', 'Product_Category', 'User_ID', 'Gender', 'Age', 'Marital_Status', 'Occupation', 'City_Category', 'City_Stay', 'Purchase']]
df.head()


df_summary = get_df_summary(df, print_summary=False, properties_as_columns=False)
df_summary

RangeIndex: 550068 entries; Data columns (total 10 columns)
memory usage: 10.3+ MB


def plot_hist_custom(ax_, df_, var_, title_, scale_x_major, scale_x_minor, color_='#DC3535'):
  if pd.api.types.is_datetime64_any_dtype(df_[var_]):
    df_[var_] = df_[var_].dt.year

  m1 = df_[var_].mean()
  st1 = df_[var_].std()

  q1 = df_[var_].quantile(.25)
  q2 = df_[var_].median()
  q3 = df_[var_].quantile(.75)

  sns.histplot(data=df_, x=var_, kde=True,
               binwidth=scale_x_minor, 
               color=color_, ax=ax_, linewidth=2)
  df_mean = pd.DataFrame({'x': [m1, m1], 'y': ax_.get_ybound()})
  df_q1 = pd.DataFrame({'x': [q1, q1], 'y': ax_.get_ybound()})
  df_q2 = pd.DataFrame({'x': [q2, q2], 'y': ax_.get_ybound()})
  df_q3 = pd.DataFrame({'x': [q3, q3], 'y': ax_.get_ybound()})
  sns.lineplot(data=df_mean, x='x', y='y', color='red', ax=ax_, linestyle='--',
              estimator=None, linewidth = 2)
  sns.lineplot(data=df_q1, x='x', y='y', color='black', ax=ax_, linestyle='--',
              estimator=None, linewidth = 1)
  sns.lineplot(data=df_q2, x='x', y='y', color='cyan', ax=ax_, linestyle='--',
              estimator=None, linewidth = 2)
  sns.lineplot(data=df_q3, x='x', y='y', color='black', ax=ax_, linestyle='--',
              estimator=None, linewidth = 1)

  plt.sca(ax_)
  plt.title(title_, size=16, color='grey')
  plt.tick_params(
        axis='x',          # changes apply to the x-axis
        which='both',      # both major and minor ticks are affected
        bottom=False,      # ticks along the bottom edge are off
        top=False,         # ticks along the top edge are off
        labelbottom=False) # labels along the bottom edge are off
  plt.xlabel('')
  plt.ylabel('Count', size=12)
  plt.yticks(size=12)
  sns.despine(bottom=True, left=False, trim=True, ax=ax_)
  ax_.spines['left'].set_color('grey')

  ax_.xaxis.set_major_locator(MultipleLocator(scale_x_major))
  ax_.xaxis.set_major_formatter('{x:.0f}')
  ax_.xaxis.set_minor_locator(MultipleLocator(scale_x_minor))

  ax_.figure.set_size_inches(16,11)
  plt.subplots_adjust(top=0.95, right=0.869, hspace=0, wspace=0.1)

  return plt


def plot_box_custom(ax_, df_, var_, xlabel_, scale_x_major, scale_x_minor, color_='#DC3535'):
  if pd.api.types.is_datetime64_any_dtype(df_[var_]):
    df_[var_] = df_[var_].dt.year

  m1 = df_[var_].mean()
  st1 = df_[var_].std()

  q1 = df_[var_].quantile(.25)
  q2 = df_[var_].median()
  q3 = df_[var_].quantile(.75)

  df_mean = pd.DataFrame({'x': [m1, m1], 'y': ax_.get_ybound()})
  df_q1 = pd.DataFrame({'x': [q1, q1], 'y': ax_.get_ybound()})
  df_q2 = pd.DataFrame({'x': [q2, q2], 'y': ax_.get_ybound()})
  df_q3 = pd.DataFrame({'x': [q3, q3], 'y': ax_.get_ybound()})
 
  df_mean['y'] = [-0.3, 0.2]
  df_q1['y'] = [0.2, 0.25]
  df_q2['y'] = [0.1, 0.25]
  df_q3['y'] = [0.2, 0.25]

  sns.boxplot(data=df_, x=var_, ax=ax_,
                    color=color_, showmeans=True,
                    flierprops={"marker": "x"}, medianprops={"color": "cyan"},
                    width=0.4, fliersize=1, linewidth=2, notch=True)
  sns.lineplot(data=df_mean, x='x', y='y', color='red', ax=ax_, linestyle='--',
              estimator=None, linewidth = 2)
  text = f' μ={m1:.1f}\n σ={st1:.1f}'
  ax_.annotate(text, xy=(m1, -0.3), rotation=90)

  sns.lineplot(data=df_q1, x='x', y='y', color='black', ax=ax_, linestyle='--',
              estimator=None, linewidth = 1)
  text = f'Q1={q1:.1f} '
  ax_.annotate(text, xy=(q1-0.1, 0.25), rotation=90, va='top', ha='right')
  sns.lineplot(data=df_q2, x='x', y='y', color='cyan', ax=ax_, linestyle='--',
              estimator=None, linewidth = 2)
  text = f'med={q2:.1f} '
  ax_.annotate(text, xy=(q2, 0.25), rotation=90, va='top', ha='center')
  sns.lineplot(data=df_q3, x='x', y='y', color='black', ax=ax_, linestyle='--',
              estimator=None, linewidth = 1)
  text = f'Q3={q3:.1f} '
  ax_.annotate(text, xy=(q3+0.1, 0.25), rotation=90, va='top', ha='left')

  plt.sca(ax_)
  plt.xlabel(xlabel_, size=12)
  plt.ylabel('')
  plt.xticks(size=12)
  sns.despine(bottom=False, left=False, ax=ax_)
  ax_.spines['bottom'].set_color('grey')
  ax_.spines['left'].set_color('grey')

  ax_.xaxis.set_major_locator(MultipleLocator(scale_x_major))
  ax_.xaxis.set_major_formatter('{x:.0f}')
  ax_.xaxis.set_minor_locator(MultipleLocator(scale_x_minor))

  ax_.figure.set_size_inches(16,11)
  plt.subplots_adjust(top=0.95, right=0.869, hspace=0, wspace=0.1)

  return plt


def plt_dist_plot(ax_, df_, type_var_, var_, title_, xlabel_, ylabel_):
  hue_order = df[type_var_].unique()
  sns.kdeplot(data=df_, x=var_, ax=ax_, fill=True,
                   hue=type_var_, hue_order=hue_order[::-1], lw=4, palette=['#3A5BA0', '#DC3535', '#18978F'])
  plt.sca(ax_)
  plt.title(title_, size=16, color='grey')
  plt.xlabel(xlabel_, size=12)
  plt.ylabel(ylabel_, size=12)
  plt.xticks(size=12)
  plt.yticks(size=12)
  plt.legend(labels=hue_order, loc='best', fontsize=12)

  sns.despine(right=True, top=True, ax=ax_)
  ax_.spines['left'].set_color('grey')
  ax_.spines['bottom'].set_color('grey')

  ax_.figure.set_size_inches(16,8)
  ax_.figure.subplots_adjust(top=0.81,right=0.86) 

  return plt


def plt_most10_count_plot(ax_, df_, type_var_, var_, type_, title_, independant_label_, dependant_label_, unit_='', highlight_num=None, highlight_color='#DC3535', orientation_='v', agg_func='Count', agg_var=None):
  agg_func = agg_func.lower()

  if agg_func == 'count':
    total_records_count = df_[var_].count()

    if type_var_ and type_:
      list_most10 = df_.loc[df_[type_var_]==type_, var_].value_counts().iloc[:10].index
      df_ = df_.loc[(df_[type_var_]==type_) & df_[var_].isin(list_most10), [var_]]
    else:
      list_most10 = df_[var_].value_counts().iloc[:10].index
      df_ = df_.loc[df_[var_].isin(list_most10), [var_]]

  elif agg_func == 'sum':
    total_records_count = df_[agg_var].sum()

    if type_var_ and type_:
      list_most10 = df_.loc[df_[type_var_]==type_, var_].value_counts().iloc[:10].index
      df_ = df_.loc[(df_[type_var_]==type_) & df_[var_].isin(list_most10), [var_]]
    else:
      list_most10 = df[[var_, agg_var]].groupby(var_)[agg_var].sum().sort_values(ascending=False).iloc[:10]

  else:
    pass

  if not highlight_num:
    if agg_func == 'count':
      list_highlight = df_[var_].value_counts()/total_records_count
    elif agg_func == 'sum':
      list_highlight = list_most10.to_list()/total_records_count
    else:
      pass

    list_highlight = list_highlight[list_highlight > 0.1]
    highlight_num = len(list_highlight)

  custom_palette = [highlight_color for i in range(highlight_num)] + ['grey' for i in range(10-highlight_num)]


  if orientation_ == 'v':
    if agg_func == 'count':
      sns.countplot(data=df_, x=var_, order=list_most10, ax=ax_,
                      palette = custom_palette)
    elif agg_func == 'sum':
      sns.barplot(x=[str(i) for i in list_most10.index.to_list()], y=list_most10.to_list(), ax=ax_,
                  palette = custom_palette)
    else:
      pass

    plt.sca(ax_)
    plt.xlabel(independant_label_)
    plt.ylabel(dependant_label_)

    plt.tick_params(
      axis='y',          # changes apply to the y-axis
      which='both',      # both major and minor ticks are affected
      left=False,      # ticks along the left edge are off
      right=False,         # ticks along the right edge are off
      labelleft=False) # labels along the left edge are off

    labels = []
    for label in ax_.get_xticklabels():
        text = label.get_text()
        labels.append(textwrap.fill(text, width=10,
                      break_long_words=False))
    ax_.set_xticklabels(labels, rotation=60)

    bar_labels=[]
    for container in ax_.containers:
      for rect in container:
        # Rectangle widths are already integer-valued but are floating
        # type, so it helps to remove the trailing decimal point and 0 by
        # converting width to int type

        # Shift the text to the left side of the right edge
        yloc = 4
        # White on magenta
        clr = 'white'
        align = 'bottom'
        rotation_ = 90

        count_ = 0 if np.isnan(rect.get_height()) else rect.get_height()
        pct_ = int(count_*100/total_records_count)
        pct_unit = f'({pct_}%) {unit_}'

        label_text = f'{count_/1000000000: .1f} b' if count_ > 1000000000 else f'{count_/1000000: .1f} m' if count_ > 1000000 else f'{count_/1000: .1f} k' if count_ > 1000 else ''
        label_text = f'{label_text} {pct_unit}' if label_text and count_/container[0].get_height() > 0.055+0.023*len(pct_unit) else label_text if count_/container[0].get_height() > 0.055 else ''

        size_ = 10 if count_/container[0].get_height() > 0.1 else 9 if count_/container[0].get_height() > 0.06 else 8 if count_/container[0].get_height() > 0.055 else 7

        xloc = rect.get_x() + rect.get_width() / 2
        ax_.annotate(label_text, xy=(xloc, 0), xytext=(0, yloc),
                            textcoords="offset points", size=size_,
                            ha='center', va=align, rotation=rotation_,
                            color=clr, clip_on=True)

  else:
    if agg_func == 'count':
      sns.countplot(data=df_, y=var_, order=list_most10, ax=ax_,
                      palette = custom_palette)
    elif agg_func == 'sum':
      sns.barplot(y=[str(i) for i in list_most10.index.to_list()], x=list_most10.to_list(), ax=ax_,
                  palette = custom_palette)
    else:
      pass
    
    plt.sca(ax_)
    plt.xlabel(dependant_label_)
    plt.ylabel(independant_label_)

    plt.tick_params(
      axis='x',          # changes apply to the x-axis
      which='both',      # both major and minor ticks are affected
      bottom=False,      # ticks along the bottom edge are off
      top=False,         # ticks along the top edge are off
      labelbottom=False) # labels along the bottom edge are off

    labels = []
    for label in ax_.get_yticklabels():
        text = label.get_text()
        labels.append(textwrap.fill(text, width=15,
                      break_long_words=False))
    ax_.set_yticklabels(labels, rotation=0)

    bar_labels=[]
    for container in ax_.containers:
      for rect in container:
        # Rectangle widths are already integer-valued but are floating
        # type, so it helps to remove the trailing decimal point and 0 by
        # converting width to int type

        # Shift the text to the left side of the right edge
        xloc = 2
        # White on magenta
        clr = 'white'
        align = 'left'

        count_ = rect.get_width()
        pct_ = int(count_*100/total_records_count)
        pct_unit = f'({pct_}%) {unit_}'

        label_text = f'{count_/1000000000: .1f} b' if count_ > 1000000000 else f'{count_/1000000: .1f} m' if count_ > 1000000 else f'{count_/1000: .1f} k' if count_ > 1000 else ''
        label_text = f'{label_text} {pct_unit}' if label_text and count_/container[0].get_width() > 0.055+0.023*len(pct_unit) else label_text if count_/container[0].get_width() > 0.055 else ''

        size_ = 10 if count_/container[0].get_width() > 0.1 else 9 if count_/container[0].get_width() > 0.06 else 8 if count_/container[0].get_width() > 0.055 else 7

        # Center the text vertically in the bar
        yloc = rect.get_y() + rect.get_height() / 2
        ax_.annotate(label_text, xy=(0, yloc), xytext=(xloc, 0),
                            textcoords="offset points", size=size_,
                            ha=align, va='center',
                            color=clr, clip_on=True)

  sns.despine(left=True, bottom=True, ax=ax_)

  plt.title(title_, size=16, color='grey')
  plt.xticks(size=12)
  plt.yticks(size=12)
  
  ax_.spines['left'].set_color('grey')
  ax_.spines['bottom'].set_color('grey')

  ax_.figure.set_size_inches(16,8)
  ax_.figure.subplots_adjust(top=0.81,right=0.86) 

  return plt


def plot_numeric_distribution(df_, var_, main_title, xlablel_, title_, highligh_num = None, box_major=None, box_minor=None):
  fig = plt.figure()

  ax1 = plt.subplot(2, 1, 1)
  ax2 = plt.subplot(2, 1, 2, sharex=ax1)

  plot_hist_custom(ax_=ax1, df_=df, var_=var_, title_=f'Distribution of {var_}', scale_x_major=box_major, scale_x_minor=box_minor, color_='#18978F')
  plot_box_custom(ax_=ax2, df_=df, var_=var_, xlabel_=var_, scale_x_major=box_major, scale_x_minor=box_minor, color_='#18978F')

  fig.set_size_inches(8,10)
  plt.subplots_adjust(hspace = 0, wspace=0.25)
  plt.show()

  fig = plt.figure()
  ax1 = plt.subplot()
  plt_most10_count_plot(ax_=ax1, df_=df_, type_var_ = '', var_=var_, type_='', title_=title_, independant_label_='', dependant_label_='', unit_='', highlight_num=highligh_num, highlight_color='#18978F', orientation_='h')

  fig.set_size_inches(8,4)
  plt.show()


def get_bootstrap_CI(df_, var_, summary_var_, confidence_level, sample_size=None, num_repitions=None):
  list_most10 = df[[var_, summary_var_]].groupby(var_)[summary_var_].sum().sort_values(ascending=False).iloc[:10]
  var_list = list_most10.index.to_list()
  range_list = []
  distribution_list = []

  for var_category in var_list:
    df_cur = df_.loc[df[var_] == var_category, [var_, summary_var_]]

    stats_list = []
    if not sample_size:
      sample_size = df_cur.__len__()
      stats_list.append(df_cur[summary_var_].mean())

    num_repitions = min(max(1000, df_cur.__len__()//2), 10000) if not num_repitions else num_repitions

    for _ in range(num_repitions):
        mean = df_cur.sample(sample_size, replace=True)[summary_var_].mean()
        stats_list.append(mean)

    distribution_list.append(stats_list)

    range_list.append([norm.ppf((100-confidence_level)/200)*np.std(stats_list)+np.mean(stats_list), norm.ppf(1-((100-confidence_level)/200))*np.std(stats_list)+np.mean(stats_list)])

  var_list = [str(i) for i in var_list]
  return [var_list, range_list, distribution_list]


def plot_kde_distribution(ax_, var_list, distribution_list, title_, x_label_='', y_label_=''):
  df_dist = pd.DataFrame(var_list, columns=['Category'])
  df_dist['sample_mean'] = distribution_list
  df_dist = df_dist.explode('sample_mean').reset_index(drop=True)

  sns.kdeplot(data=df_dist, x='sample_mean', ax=ax_, fill=True, hue='Category', hue_order=var_list[::-1], lw=4, palette='deep')

  plt.sca(ax_)
  plt.title(title_, size=16, color='grey')
  plt.xlabel(x_label_, size=12)
  plt.ylabel(y_label_, size=12)
  plt.xticks(size=12)
  plt.yticks([])
  plt.legend(labels=var_list, loc='upper left', fontsize=12)

  ax_.set_xlim(xmin=np.min([np.min(i) for i in distribution_list])*0.9)

  sns.despine(right=True, top=True, left=True, bottom=False, ax=ax_)
  ax_.spines['bottom'].set_color('grey')

  ax_.figure.set_size_inches(16,8)
  ax_.figure.subplots_adjust(top=0.81,right=0.86) 

  return plt


def plot_CI(ax_, var_list, range_list, distribution_list, title_, x_label_='', y_label_=''):
  x = var_list
  y = [i[1] for i in range_list]
  bar1 = sns.barplot(x=y, y=x, color='#18978F', ax=ax_)

  y = [i[0] for i in range_list]
  bar2 = sns.barplot(x=y, y=x, color='white', ax=ax_)
  bar2.set_xlim(xmin=np.min([np.min(i) for i in distribution_list])*0.9)

  plt.sca(ax_)
  plt.xlabel(x_label_)
  plt.ylabel(y_label_)

  plt.tick_params(
      axis='x',          # changes apply to the x-axis
      which='both',      # both major and minor ticks are affected
      bottom=False,      # ticks along the bottom edge are off
      top=False,         # ticks along the top edge are off
      labelbottom=False) # labels along the bottom edge are off

  labels = []
  for label in bar2.get_yticklabels():
      text = label.get_text()
      labels.append(textwrap.fill(text, width=15,
                    break_long_words=False))
  bar2.set_yticklabels(labels, rotation=0)

  bar_labels=[]
  for cur_container_count in range(len(ax_.containers)):
    container = ax_.containers[cur_container_count]
    for rect in container:
      xloc = -2 if cur_container_count==1 else 1

      clr = '#18978F'
      align = 'right' if cur_container_count==1 else 'left'

      count_ = rect.get_width()

      label_text = '' if not count_ else round(count_) if count_ > 0 and count_ < 1000 else f'{count_/1000: .1f} k'
      size_ = 10

      yloc = rect.get_y()
      yloc += rect.get_height() * 0.3 if cur_container_count==1 else rect.get_height() * 0.7
      ax_.annotate(label_text, xy=(rect.get_width(), yloc), xytext=(xloc, 0),
                          textcoords="offset points", size=size_,
                          ha=align, va='center',
                          color=clr, clip_on=True)
  sns.despine(bottom=True)

  plt.title(title_, size=16, color='grey')
  plt.xticks(size=12)
  plt.yticks(size=12)

  ax_.spines['left'].set_color('grey')

  ax_.figure.set_size_inches(16, 0.5 * len(var_list))
  return plt


def get_bootstrap_CI_and_Plot(df_, var_, summary_var_, confidence_level=95, sample_size=None, num_repitions=None):
  [var_list, range_list, distribution_list] = get_bootstrap_CI(df_=df_, var_=var_, summary_var_=summary_var_, confidence_level=confidence_level, sample_size=sample_size, num_repitions=num_repitions)
  mosaic = '''12'''

  fig, axd = plt.subplot_mosaic(mosaic)
  ax1 = axd['1']
  ax2 = axd['2']

  plt_most10_count_plot(ax_=ax1, df_=df_, type_var_ = '', var_=var_, type_='', 
                      title_=f'{var_.replace("_", " ")} wise Most Counts', independant_label_='', dependant_label_='', 
                      unit_='', highlight_num=None, highlight_color='#18978F', 
                      orientation_='h')

  plt_most10_count_plot(ax_=ax2, df_=df_, type_var_ = '', var_=var_, type_='', 
                      title_=f'{var_.replace("_", " ")} wise Most Purchased in $', independant_label_='', dependant_label_='', 
                      unit_='', highlight_num=None, highlight_color='#18978F', 
                      orientation_='h', agg_func='sum', agg_var='Purchase')

  plt.subplots_adjust(hspace = 0.3, wspace=0.1)
  fig.set_size_inches(16, 0.5 * len(var_list))
  plt.show()

  # ------------------------------------------------------------
  fig, (ax1, ax2) = plt.subplots(2, 1, gridspec_kw={'height_ratios': [0.7, 0.3]}, sharex=True)

  plot_kde_distribution(ax_=ax1, var_list=var_list, distribution_list=distribution_list, title_=f'Sample Mean Distribution and {confidence_level}% CI', x_label_='', y_label_='')
  plot_CI(ax_=ax2, var_list=var_list, range_list=range_list, distribution_list=distribution_list, title_='')

  plt.sca(ax1)
  sns.despine(right=True, top=True, left=True, bottom=False, ax=ax1)
  ax1.spines['bottom'].set_color('grey')
  ax1.xaxis.set_tick_params(labelbottom=True)

  ax1.xaxis.set_minor_locator(AutoMinorLocator())
  ax1.tick_params(which='minor', length=2, color='grey')

  plt.subplots_adjust(hspace = (0.35-0.025*len(var_list)), wspace=0.1)
  fig.set_size_inches(16, (1.625-0.0625*len(var_list))*len(var_list))

  plt.show()

  print(f'Average Purchase range in $ with {confidence_level}% CI')
  df_cat = pd.DataFrame(data=var_list, columns=['Category'])
  df_cat['From'] = [round(i[0], 2) for i in range_list]
  df_cat['To'] = [round(i[1], 2) for i in range_list]
  return df_cat.set_index('Category')


plot_numeric_distribution(df_=df, var_='Purchase', main_title='Products Count per Age Years', xlablel_='Purchase in $', title_='Top Purchase Counts', box_major=5000, box_minor=1000)


get_bootstrap_CI_and_Plot(df_=df, var_='Gender', summary_var_='Purchase', confidence_level=95, sample_size=None, num_repitions=None)

Average Purchase range in $ with 95% CI


get_bootstrap_CI_and_Plot(df_=df, var_='Gender', summary_var_='Purchase', confidence_level=95, sample_size=None, num_repitions=1000)

Average Purchase range in $ with 95% CI


get_bootstrap_CI_and_Plot(df_=df, var_='Gender', summary_var_='Purchase', confidence_level=95, sample_size=None, num_repitions=100)

Average Purchase range in $ with 95% CI


get_bootstrap_CI_and_Plot(df_=df, var_='Age', summary_var_='Purchase', confidence_level=95, sample_size=None, num_repitions=None)

Average Purchase range in $ with 95% CI


get_bootstrap_CI_and_Plot(df_=df, var_='Age', summary_var_='Purchase', confidence_level=95, sample_size=None, num_repitions=1000)

Average Purchase range in $ with 95% CI


get_bootstrap_CI_and_Plot(df_=df, var_='Marital_Status', summary_var_='Purchase', confidence_level=95, sample_size=None, num_repitions=1000)

Average Purchase range in $ with 95% CI


get_bootstrap_CI_and_Plot(df_=df, var_='Occupation', summary_var_='Purchase', confidence_level=95, sample_size=None, num_repitions=1000)

Average Purchase range in $ with 95% CI


get_bootstrap_CI_and_Plot(df_=df, var_='City_Category', summary_var_='Purchase', confidence_level=95, sample_size=None, num_repitions=1000)

Average Purchase range in $ with 95% CI


get_bootstrap_CI_and_Plot(df_=df, var_='City_Stay', summary_var_='Purchase', confidence_level=95, sample_size=None, num_repitions=1000)

Average Purchase range in $ with 95% CI


get_bootstrap_CI_and_Plot(df_=df, var_='Product_Category', summary_var_='Purchase', confidence_level=95, sample_size=None, num_repitions=1000)

Average Purchase range in $ with 95% CI

	User_ID	Product_ID	Gender	Age	Occupation	City_Category	Stay_In_Current_City_Years	Product_Category	Purchase
0	1000001	P00069042	F	0-17	10.0	A	2	3.0	8370.0
1	1000001	P00248942	F	0-17	10.0	A	2	1.0	15200.0
2	1000001	P00087842	F	0-17	10.0	A	2	12.0	1422.0
3	1000001	P00085442	F	0-17	10.0	A	2	12.0	1057.0
4	1000002	P00285442	M	55+	16.0	C	4+	8.0	7969.0

	User_ID	Product_ID	Gender	Age	Occupation	City_Category	Stay_In_Current_City_Years	Marital_Status	Product_Category	Purchase
343	1000058	P00117642	M	26-35	2	B	3	0	10	23603
375	1000062	P00119342	F	36-45	3	A	1	0	10	23792
652	1000126	P00087042	M	18-25	9	B	1	0	10	23233
736	1000139	P00159542	F	26-35	20	C	2	0	10	23595
1041	1000175	P00052842	F	26-35	2	B	1	0	10	23341
...	...	...	...	...	...	...	...	...	...	...
544488	1005815	P00116142	M	26-35	20	B	1	0	10	23753
544704	1005847	P00085342	F	18-25	4	B	2	0	10	23724
544743	1005852	P00202242	F	26-35	1	A	0	1	10	23529
545663	1006002	P00116142	M	51-55	0	C	1	1	10	23663
545787	1006018	P00052842	M	36-45	1	C	3	0	10	23496

	From	To
Category
M	9422.14	9452.99
F	8719.98	8749.15

	From	To
Category
M	9421.81	9453.20
F	8719.83	8749.29

	From	To
Category
M	9422.40	9452.56
F	8721.15	8748.57

1. Defining Problem Statement and Analysing basic metrics¶

2. Missing Value & Outlier Detection¶

3. Non-Graphical Analysis: Value counts and unique attributes ¶

Download the dataset and observe a subset of the data¶

3. Business Insights based on Non-Graphical and Visual Analysis¶

4. Answering questions¶

5. Final Insights¶

6. Recommendations¶

Custom functions to plot custom charts and tables¶

Analyze the buying behavior of the customers in relation to the customer characteristics¶

Purchase¶

Gender¶

Age¶

Marital Status¶

Occupation¶

City Category¶

Stay in Current City in Years¶

Product Category¶

Business Recommendations¶

	Product_ID	Gender	Age	City_Category	Stay_In_Current_City_Years	User_ID	Occupation	Marital_Status	Product_Category	Purchase
dtype	object	object	object	object	object	int64	int64	int64	int64	int64
Missing Counts	0	0	0	0	0	0	0	0	0	0
nUniques	3631	2	7	3	5	5891	21	2	20	18105
Top 10 Unique Values	P00265242 (0%), P00025442 (0%), P00110742 (0%)...	M (75%), F (24%)	26-35 (39%), 36-45 (19%), 18-25 (18%), 46-50 (...	B (42%), C (31%), A (26%)	1 (35%), 2 (18%), 3 (17%), 4+ (15%), 0 (13%)	1001680.0 (0%), 1004277.0 (0%), 1001941.0 (0%)...	4.0 (13%), 0.0 (12%), 7.0 (10%), 1.0 (8%), 17....	0.0 (59%), 1.0 (40%)	5.0 (27%), 1.0 (25%), 8.0 (20%), 11.0 (4%), 2....	7011.0 (0%), 7193.0 (0%), 6855.0 (0%), 6891.0 ...
min	nan	nan	nan	nan	nan	1000001.0	0.0	0.0	1.0	12.0
max	nan	nan	nan	nan	nan	1006040.0	20.0	1.0	20.0	23961.0
LW (1.5)	nan	nan	nan	nan	nan	1000001.0	0.0	0.0	1.0	12.0
Q1	nan	nan	nan	nan	nan	1001516.0	2.0	0.0	1.0	5823.0
Median	nan	nan	nan	nan	nan	1003077.0	7.0	0.0	5.0	8047.0
Q3	nan	nan	nan	nan	nan	1004478.0	14.0	1.0	8.0	12054.0
UW (1.5)	nan	nan	nan	nan	nan	1006040.0	20.0	1.0	18.5	21400.5
Outlier Count (1.5*IQR)	nan	nan	nan	nan	nan	0	0	0	4153 (0.8%)	2677 (0.5%)
mean-3*std	nan	nan	nan	nan	nan	1000001.0	0.0	0.0	1.0	12.0
mean	nan	nan	nan	nan	nan	1003028.8	8.1	0.4	5.4	9264.0
std	nan	nan	nan	nan	nan	1727.6	6.5	0.5	3.9	5023.1
mean+3*std	nan	nan	nan	nan	nan	1006040.0	20.0	1.0	17.1	23961.0
Outlier Count (3*std)	nan	nan	nan	nan	nan	0	0	0	7278 (1.3%)	0

	Purchase
count	11825.000000
mean	16928.259366
std	5362.895975
min	4148.000000
25%	13051.000000
50%	17385.000000
75%	21239.000000
max	23961.000000

	From	To
Category
26-35	9231.57	9273.84
36-45	9310.11	9352.47
18-25	9148.48	9190.94
46-50	9187.66	9229.47
51-55	9513.56	9556.27
55+	9315.53	9357.05
0-17	8912.05	8954.93

	From	To
Category
4	9178.14	9251.66
0	9088.60	9159.77
7	9389.61	9462.68
1	8916.48	8987.55
17	9784.61	9858.83
12	9757.49	9834.02
20	8800.81	8871.59
14	9463.45	9538.20
16	9356.06	9430.28
2	8916.56	8986.95

	From	To
Category
1	9228.76	9272.30
2	9298.53	9342.65
3	9263.54	9309.07
4+	9252.61	9297.73
0	9158.69	9201.61

	From	To
Category
1	13583.78	13628.59
5	6229.88	6250.26
8	7488.15	7509.73
6	15817.53	15860.04
2	11233.47	11270.27
3	10082.14	10110.87
16	14742.76	14789.19
11	4676.20	4694.78
10	19652.58	19698.22
15	14753.37	14807.74

	From	To
Category
26-35	9231.57	9273.43
36-45	9310.46	9352.79
18-25	9148.97	9189.93
46-50	9187.99	9228.27
51-55	9514.37	9555.17
55+	9315.63	9357.59
0-17	8912.40	8954.96

	From	To
Category
0	9248.95	9282.93
1	9243.57	9278.07

	From	To
Category
B	9131.81	9171.31
C	9700.23	9741.00
A	8892.04	8933.17

1. Defining Problem Statement and Analysing basic metrics¶

2. Missing Value & Outlier Detection¶

3. Non-Graphical Analysis: Value counts and unique attributes ​¶

Download the dataset and observe a subset of the data¶

3. Business Insights based on Non-Graphical and Visual Analysis¶

4. Answering questions¶

5. Final Insights¶

6. Recommendations¶

Custom functions to plot custom charts and tables¶

Analyze the buying behavior of the customers in relation to the customer characteristics¶

Purchase¶

Gender¶

Age¶

Marital Status¶

Occupation¶

City Category¶

Stay in Current City in Years¶

Product Category¶

Business Recommendations¶

3. Non-Graphical Analysis: Value counts and unique attributes ¶