Note:

  • Upgrade the matplotlib version from 3.2.2 to 3.5.3

    • Use command

      !pip install matplotlib --upgrade
  • Restart the session

Table of Contents¶

  1. Defining Problem Statement and Analysing basic metrics
  2. Non-Graphical Analysis: Value counts and unique attributes
  3. Visual Analysis - Univariate & Bivariate
  4. Missing Value & Outlier Detection
  5. Business Insights based on Non-Graphical and Visual Analysis
  6. Recommendations

1. Defining Problem Statement and Analysing basic metrics¶

  This business case is about a leading brand in the field of fitness equipment. It provides a product range including machines such as treadmills, exercise bikes, gym equipment, and fitness accessories to cater to the needs of all categories of people.

  The market research team at the company wants to identify the characteristics of the target audience for each type of treadmill the company offers, so that it can recommend treadmills to new customers more effectively. The team wants us to investigate whether there are differences across the products with respect to customer characteristics.

  The company collected data on individuals who purchased a treadmill from its stores during the initial three months. The dataset does not include treadmills sold to gyms or fitness centers. It captures the product purchased along with customer characteristics: age, gender, marital status, education in years, income in \$ per year, expected usage per week, self-assessed fitness on a scale of 1 (poor) to 5 (excellent), and the average number of miles the customer plans to walk/run per week. The company's treadmill portfolio comprises 3 products: the KP281, the KP481, and the KP781. The KP281 is an entry-level treadmill that sells for \$1,500. The KP481 is for mid-level runners and costs \$1,750. The KP781 has advanced features and sells for \$2,500.

  The analysis focuses on identifying customer characteristics for each treadmill type, i.e., on customer preference, and does not cover sales or marketing improvements. After the analysis, the company will have a targeted approach to dealing with incoming customers: when a customer enters a store, salespeople can introduce them to the product that best fits their purpose, based on the customer's characteristics. This kind of customer-centric approach builds customer loyalty and brand value.

  The analysis will primarily be descriptive, creating a customer profile for each of the company's treadmill products through appropriate tables and charts. For each treadmill product, we will create two-way contingency tables and compute all conditional and marginal probabilities. We will study the patterns in the data and customer preferences based on their characteristics, and then extract insights and make business recommendations.
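The contingency-table computation described above can be sketched with `pd.crosstab` and `value_counts`; the toy records below are illustrative only, not the actual dataset:

```python
import pandas as pd

# Toy purchase records (illustrative, not the actual dataset)
df_toy = pd.DataFrame({
    'Product': ['KP281', 'KP281', 'KP481', 'KP781', 'KP281', 'KP481'],
    'Gender':  ['Male', 'Female', 'Male', 'Male', 'Female', 'Female'],
})

# Two-way contingency table of raw counts
counts = pd.crosstab(df_toy['Product'], df_toy['Gender'])

# Marginal probabilities: P(Product)
marginal = df_toy['Product'].value_counts(normalize=True)

# Conditional probabilities: P(Gender | Product); each row sums to 1
conditional = pd.crosstab(df_toy['Product'], df_toy['Gender'], normalize='index')
print(conditional)
```

`normalize='index'` is the same option the heatmap helper later in this notebook uses to show conditional probabilities per row.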

We will carry out the following steps:

  • Observe the shape of the data and the data types of all attributes
  • Data cleansing, missing value detection, outlier checking, statistical summarization
  • Conversion of data types
  • Reclassification to get new/merged categories, conversion of categorical attributes to 'category'
  • Non-graphical analysis: Value counts and unique attributes
  • Visual analysis - univariate and bivariate analysis with distplot, countplot, histogram, boxplot, and heatmaps
  • Understand patterns and preferences
  • Generate insights and business recommendations

Download the dataset and observe a subset of the data¶

Importing Required Libraries

In [1]:
# !pip install matplotlib==3.5.3
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
from matplotlib.ticker import (MultipleLocator, AutoMinorLocator)
import seaborn as sns
import textwrap
import math

Load the Dataset

In [2]:
df = pd.read_csv('/content/treadmill_data.txt')
df.head()
Out[2]:
Product Age Gender Education MaritalStatus Usage Fitness Income Miles
0 KP281 18 Male 14 Single 3 4 29562 112
1 KP281 19 Male 15 Single 2 3 31836 75
2 KP281 19 Female 14 Partnered 4 3 30699 66
3 KP281 19 Male 12 Single 3 3 32973 85
4 KP281 20 Male 13 Partnered 4 2 35247 47
In [ ]:
def get_df_summary(df_, print_summary=True, properties_as_columns=True):
  # Shape and memory usage of DataFrame
  print(f'RangeIndex: {df_.shape[0]} entries; Data columns (total {df_.shape[1]} columns)')
  memory_used = df_.memory_usage().sum()/1024
  if memory_used > 1024*1024:
    memory_used =  f'{round(memory_used/1024/1024, 1)}+ GB'
  elif memory_used > 1024:
    memory_used =  f'{round(memory_used/1024, 1)}+ MB'
  else:
    memory_used =  f'{round(memory_used, 1)}+ KB'
  print(f'memory usage: {memory_used}\n')

  # Create an empty df with column names from original df
  df2 = pd.DataFrame(columns=[None]+df_.columns.to_list())

  # Add dtype
  property_ = ['dtype']
  for clm in df_.columns:
    property_.append(df_[clm].dtype)
  df2 = df2.append(pd.DataFrame([property_], columns=df2.columns))

  # Add Missing Values Counts
  property_ = ['Missing Counts']
  for clm in df_.columns:
    property_.append(df_[clm].isna().sum())
  df2 = df2.append(pd.DataFrame([property_], columns=df2.columns))

  # Add nUniques
  property_ = ['nUniques']
  for clm in df_.columns:
    property_.append(df_[clm].nunique())
  df2 = df2.append(pd.DataFrame([property_], columns=df2.columns))

  # Add unique values
  property_ = ['Top 10 Unique Values']
  for clm in df_.columns:
    df1 = df_[clm].value_counts().reset_index()
    df1['margin'] = df1[clm]*100/ df1[clm].sum()
    property_.append(', '.join([i for i in df1.apply(lambda x:f"{x['index']} ({math.floor(x['margin'])}%)", axis=1).iloc[:10]]))
  df2 = df2.append(pd.DataFrame([property_], columns=df2.columns))

  # Getting Numeric Variables Statistics
  df4 = pd.DataFrame(columns=df_.columns.to_list())
  df4 = df4.append(df_.describe()).drop('count').rename({
      '25%': 'Q1',
      '50%': 'Median',
      '75%': 'Q3'
  }).reset_index().set_index('index').round(1)
  df4 = df4.T
  df4['LW (1.5)'] = df4.apply(lambda x: max(x['min'], x['Q1'] - 1.5*(x['Q3']-x['Q1'])), axis=1)
  df4['UW (1.5)'] = df4.apply(lambda x: min(x['max'], x['Q3'] + 1.5*(x['Q3']-x['Q1'])), axis=1)

  df4['mean-3*std'] = df4.apply(lambda x: max(x['min'], x['mean'] - 3*x['std']), axis=1)
  df4['mean+3*std'] = df4.apply(lambda x: min(x['max'], x['mean'] + 3*x['std']), axis=1)

  lst_IQR_Outlier = []
  lst_std_Outlier = []
  for clm in df4.index:
    lst_IQR_Outlier.append(df_.loc[(df_[clm]<df4.loc[clm,'LW (1.5)']) | (df_[clm]>df4.loc[clm,'UW (1.5)'])].shape[0])
    lst_std_Outlier.append(df_.loc[(df_[clm]<df4.loc[clm,'mean-3*std']) | (df_[clm]>df4.loc[clm,'mean+3*std'])].shape[0])

  df4['Outlier Count (1.5*IQR)'] = lst_IQR_Outlier
  df4['Outlier Count (3*std)'] = lst_std_Outlier

  df4 = df4.round(1).T.reset_index().rename({'index': None}, axis=1)
  df2 = df2.append(df4)

  # Sort the columns according to dtype
  df2 = df2.set_index(None).T.astype(str).sort_values('dtype', ascending=False)
  df2 = df2[['dtype', 'Missing Counts', 'nUniques', 'Top 10 Unique Values','min','max', 
  'LW (1.5)', 'Q1', 'Median', 'Q3',   'UW (1.5)', 'Outlier Count (1.5*IQR)', 
  'mean-3*std',  'mean', 'std', 'mean+3*std', 'Outlier Count (3*std)']]

  if not properties_as_columns: df2 = df2.T
  if print_summary: print(df2) 

  return df2

Descriptive Summary

  • Observe the shape of the data and the data types of all attributes
  • Missing value detection, outlier checking, statistical summarization
In [ ]:
df_summary = get_df_summary(df, print_summary=False, properties_as_columns=False)
df_summary
RangeIndex: 180 entries; Data columns (total 9 columns)
memory usage: 12.8+ KB

Out[ ]:
Product Gender MaritalStatus Age Education Usage Fitness Income Miles
dtype object object object int64 int64 int64 int64 int64 int64
Missing Counts 0 0 0 0 0 0 0 0 0
nUniques 3 2 2 32 8 6 5 62 37
Top 10 Unique Values KP281 (44%), KP481 (33%), KP781 (22%) Male (57%), Female (42%) Partnered (59%), Single (40%) 25.0 (13%), 23.0 (10%), 24.0 (6%), 26.0 (6%), ... 16.0 (47%), 14.0 (30%), 18.0 (12%), 15.0 (2%),... 3.0 (38%), 4.0 (28%), 2.0 (18%), 5.0 (9%), 6.0... 3.0 (53%), 5.0 (17%), 2.0 (14%), 4.0 (13%), 1.... 45480.0 (7%), 52302.0 (5%), 46617.0 (4%), 5457... 85.0 (15%), 95.0 (6%), 66.0 (5%), 75.0 (5%), 4...
min nan nan nan 18.0 12.0 2.0 1.0 29562.0 21.0
max nan nan nan 50.0 21.0 7.0 5.0 104581.0 360.0
LW (1.5) nan nan nan 18.0 12.0 2.0 1.5 29562.0 21.0
Q1 nan nan nan 24.0 14.0 3.0 3.0 44058.8 66.0
Median nan nan nan 26.0 16.0 3.0 3.0 50596.5 94.0
Q3 nan nan nan 33.0 16.0 4.0 4.0 58668.0 114.8
UW (1.5) nan nan nan 46.5 19.0 5.5 5.0 80581.8 188.0
Outlier Count (1.5*IQR) 0 0 0 5 4 9 2 19 12
mean-3*std nan nan nan 18.0 12.0 2.0 1.0 29562.0 21.0
mean nan nan nan 28.8 15.6 3.5 3.3 53719.6 103.2
std nan nan nan 6.9 1.6 1.1 1.0 16506.7 51.9
mean+3*std nan nan nan 49.5 20.4 6.8 5.0 103239.7 258.9
Outlier Count (3*std) 0 0 0 1 3 2 0 3 4

Convert Product, Gender, and MaritalStatus variable dtypes

`convert_dtypes()` takes no column list; applied to the whole frame, it converts the object columns to the nullable `string` dtype and the integer columns to `Int64`. In our case it may use a little more memory, but may offer some performance benefits.

In [ ]:
df = df.convert_dtypes()
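If a true `category` dtype is wanted instead (often more memory-efficient for low-cardinality columns such as Product), `astype('category')` is the usual idiom; a minimal sketch on toy values:

```python
import pandas as pd

# Toy product labels (illustrative, not the actual dataset)
s = pd.Series(['KP281', 'KP481', 'KP281', 'KP781'])
cat = s.astype('category')

print(cat.dtype)                    # category
print(cat.cat.categories.tolist())  # ['KP281', 'KP481', 'KP781']
```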

Outlier checking based on the 1.5*IQR and 3*std criteria

In [ ]:
df.loc[(df['Income']<float(df_summary.loc['mean-3*std','Income'])) | (df['Income']>float(df_summary.loc['mean+3*std','Income']))]
Out[ ]:
Product Age Gender Education MaritalStatus Usage Fitness Income Miles
168 KP781 30 Male 18 Partnered 5 4 103336 160
174 KP781 38 Male 18 Partnered 5 5 104581 150
178 KP781 47 Male 18 Partnered 4 5 104581 120
In [ ]:
df.loc[(df['Miles']<float(df_summary.loc['mean-3*std','Miles'])) | (df['Miles']>float(df_summary.loc['mean+3*std','Miles']))]
Out[ ]:
Product Age Gender Education MaritalStatus Usage Fitness Income Miles
166 KP781 29 Male 14 Partnered 7 5 85906 300
167 KP781 30 Female 16 Partnered 6 5 90886 280
170 KP781 31 Male 16 Partnered 6 5 89641 260
173 KP781 35 Male 16 Partnered 4 5 92131 360

It should be noted that the records that look like outliers are genuine customer data, not entry errors. Furthermore, they do not deviate enough to affect our analysis, so we decided to keep these outliers.
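The 1.5·IQR fence used above can also be computed directly for a single column; a minimal sketch (the helper name `iqr_outliers` is our own, not from this notebook):

```python
import pandas as pd

def iqr_outliers(s: pd.Series, k: float = 1.5) -> pd.Series:
    """Boolean mask for values outside the Q1 - k*IQR / Q3 + k*IQR fences."""
    q1, q3 = s.quantile(0.25), s.quantile(0.75)
    iqr = q3 - q1
    return (s < q1 - k * iqr) | (s > q3 + k * iqr)

s = pd.Series([10, 12, 11, 13, 12, 50])
print(s[iqr_outliers(s)].tolist())  # → [50]
```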

Create bins to reduce the number of distinct numerical values

In [ ]:
def get_bin_median(df_, var_, interval):
  bins = [i for i in range(df_[var_].min()//interval*interval, df_[var_].max()+interval, interval)]
  bins[0] = bins[0]-1

  for i in range(1, len(bins)):
    df_.loc[(df_[var_]>bins[i-1])&(df_[var_]<=bins[i]), f'{var_}_Median'] = df_.loc[(df_[var_]>bins[i-1])&(df_[var_]<=bins[i]), var_].median()
  
  df_[f'{var_}_Median'] = df_[f'{var_}_Median'].astype('float')
In [ ]:
get_bin_median(df, 'Income', interval=5000)
get_bin_median(df, 'Miles', interval=10)
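The same bin-median reduction can be expressed more compactly with `pd.cut` and a grouped `transform`; a minimal sketch (the helper name `bin_median` is our own, and it assumes integer-valued columns, as in this dataset):

```python
import numpy as np
import pandas as pd

def bin_median(s: pd.Series, interval: int) -> pd.Series:
    """Replace each value with the median of its fixed-width bin."""
    lo = s.min() // interval * interval
    edges = np.arange(lo, s.max() + interval, interval)
    edges[0] -= 1  # make the lowest bin include its left edge
    bins = pd.cut(s, edges)
    return s.groupby(bins).transform('median').astype(float)

s = pd.Series([21, 23, 29, 31, 38])
print(bin_median(s, 10).tolist())  # → [23.0, 23.0, 23.0, 34.5, 34.5]
```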

Create a save point for the cleaned version of the dataframe

In [ ]:
df_v01 = df.copy()  # copy() ensures later changes to df do not alter the save point

Remove the old columns and reorder

In [ ]:
df = df[['Product', 'Gender', 'MaritalStatus', 'Age', 'Education', 'Income_Median', 
        'Usage', 'Fitness', 'Miles_Median']]
df = df.rename({
    'Income_Median': 'Income',
    'Miles_Median': 'Miles'
}, axis=1)

Final Descriptive Summary of the Dataset

In [ ]:
df_summary = get_df_summary(df, print_summary=False, properties_as_columns=False)
df_summary
RangeIndex: 180 entries; Data columns (total 9 columns)
memory usage: 13.5+ KB

Out[ ]:
Product Gender MaritalStatus Income Miles Age Education Usage Fitness
dtype string string string float64 float64 Int64 Int64 Int64 Int64
Missing Counts 0 0 0 0 0 0 0 0 0
nUniques 3 2 2 16 24 32 8 6 5
Top 10 Unique Values KP281 (44%), KP481 (33%), KP781 (22%) Male (57%), Female (42%) Partnered (59%), Single (40%) 52302.0 (23%), 46617.0 (18%), 36952.5 (10%), 4... 85.0 (15%), 95.0 (15%), 66.0 (8%), 75.0 (7%), ... 25.0 (13%), 23.0 (10%), 24.0 (6%), 26.0 (6%), ... 16.0 (47%), 14.0 (30%), 18.0 (12%), 15.0 (2%),... 3.0 (38%), 4.0 (28%), 2.0 (18%), 5.0 (9%), 6.0... 3.0 (53%), 5.0 (17%), 2.0 (14%), 4.0 (13%), 1....
min nan nan nan 29562.0 21.0 18.0 12.0 2.0 1.0
max nan nan nan 104581.0 360.0 50.0 21.0 7.0 5.0
LW (1.5) nan nan nan 29562.0 21.0 18.0 12.0 2.0 1.5
Q1 nan nan nan 43206.0 66.0 24.0 14.0 3.0 3.0
Median nan nan nan 52302.0 95.0 26.0 16.0 3.0 3.0
Q3 nan nan nan 57987.0 113.0 33.0 16.0 4.0 4.0
UW (1.5) nan nan nan 80158.5 183.5 46.5 19.0 5.5 5.0
Outlier Count (1.5*IQR) 0 0 0 19 13 5 4 9 2
mean-3*std nan nan nan 29562.0 21.0 18.0 12.0 2.0 1.0
mean nan nan nan 53592.4 103.1 28.8 15.6 3.5 3.3
std nan nan nan 16402.2 51.8 6.9 1.6 1.1 1.0
mean+3*std nan nan nan 102799.0 258.5 49.5 20.4 6.8 5.0
Outlier Count (3*std) 0 0 0 3 4 1 3 2 0

It can be observed that Product, Gender, and MaritalStatus are categorical variables, while Age, Education, Income, Usage, Fitness, and Miles are numerical.

There are no missing values. The numerical variables have been reduced by replacing each value with the median of its bin.

Insights:

In general,

  • The KP281 is purchased most often (44%), followed by the KP481 (33%) and the KP781 (22%).
  • There are more male (57%) than female (42%) customers.
  • There are more partnered customers (59%) than single ones (40%).
  • Customers are generally between 23 and 33 years old.
  • Customers are well educated, with 14 to 18 years of education.
  • Most customers earn between \$45,000 and \$55,000 per year.
  • Most customers have an average fitness level.
  • Customers plan to use the treadmill 2 to 4 times per week and expect to run/walk 60 to 100 miles per week.

Recommendations:

  • A general customer prefers the KP281. If not much information is available about the customer, we should recommend the KP281 product to them.

3. Visual Analysis - Univariate & Bivariate¶

Functions to plot custom charts and tables¶

In [ ]:
def plot_hist_custom(ax_, df_, var_, title_, scale_x_major, scale_x_minor, color_='#DC3535'):
  if pd.api.types.is_datetime64_any_dtype(df_[var_]):
    df_[var_] = df_[var_].dt.year

  m1 = df_[var_].mean()
  st1 = df_[var_].std()

  q1 = df_[var_].quantile(.25)
  q2 = df_[var_].median()
  q3 = df_[var_].quantile(.75)

  #
  sns.histplot(data=df_, x=var_,
               binwidth=scale_x_minor, 
               color=color_, ax=ax_, linewidth=2)
  df_mean = pd.DataFrame({'x': [m1, m1], 'y': ax_.get_ybound()})
  df_q1 = pd.DataFrame({'x': [q1, q1], 'y': ax_.get_ybound()})
  df_q2 = pd.DataFrame({'x': [q2, q2], 'y': ax_.get_ybound()})
  df_q3 = pd.DataFrame({'x': [q3, q3], 'y': ax_.get_ybound()})
  sns.lineplot(data=df_mean, x='x', y='y', color='red', ax=ax_, linestyle='--',
              estimator=None, linewidth = 2)
  sns.lineplot(data=df_q1, x='x', y='y', color='black', ax=ax_, linestyle='--',
              estimator=None, linewidth = 1)
  sns.lineplot(data=df_q2, x='x', y='y', color='cyan', ax=ax_, linestyle='--',
              estimator=None, linewidth = 2)
  sns.lineplot(data=df_q3, x='x', y='y', color='black', ax=ax_, linestyle='--',
              estimator=None, linewidth = 1)

  plt.sca(ax_)
  plt.title(title_, size=16, color='grey')
  plt.tick_params(
        axis='x',          # changes apply to the x-axis
        which='both',      # both major and minor ticks are affected
        bottom=False,      # ticks along the bottom edge are off
        top=False,         # ticks along the top edge are off
        labelbottom=False) # labels along the bottom edge are off
  plt.xlabel('')
  plt.ylabel('Count', size=12)
  plt.yticks(size=12)
  sns.despine(bottom=True, left=False, trim=True, ax=ax_)
  ax_.spines['left'].set_color('grey')

  ax_.xaxis.set_major_locator(MultipleLocator(scale_x_major))
  ax_.xaxis.set_major_formatter('{x:.0f}')
  ax_.xaxis.set_minor_locator(MultipleLocator(scale_x_minor))

  ax_.figure.set_size_inches(16,11)
  plt.subplots_adjust(top=0.95, right=0.869, hspace=0, wspace=0.1)

  return plt
In [ ]:
def plot_box_custom(ax_, df_, var_, xlabel_, scale_x_major, scale_x_minor, color_='#DC3535'):
  if pd.api.types.is_datetime64_any_dtype(df_[var_]):
    df_[var_] = df_[var_].dt.year

  m1 = df_[var_].mean()
  st1 = df_[var_].std()

  q1 = df_[var_].quantile(.25)
  q2 = df_[var_].median()
  q3 = df_[var_].quantile(.75)

  # Vertical extents for the marker lines, in data coordinates
  # (the horizontal box sits at y=0)
  df_mean = pd.DataFrame({'x': [m1, m1], 'y': [-0.3, 0.2]})
  df_q1 = pd.DataFrame({'x': [q1, q1], 'y': [0.2, 0.25]})
  df_q2 = pd.DataFrame({'x': [q2, q2], 'y': [0.1, 0.25]})
  df_q3 = pd.DataFrame({'x': [q3, q3], 'y': [0.2, 0.25]})

  sns.boxplot(data=df_, x=var_, ax=ax_,
                    color=color_, showmeans=True,
                    flierprops={"marker": "x"}, medianprops={"color": "cyan"},
                    width=0.4, fliersize=1, linewidth=2, notch=True)
  sns.lineplot(data=df_mean, x='x', y='y', color='red', ax=ax_, linestyle='--',
              estimator=None, linewidth = 2)
  text = f' μ={m1:.1f}\n σ={st1:.1f}'
  ax_.annotate(text, xy=(m1, -0.3), rotation=90)

  sns.lineplot(data=df_q1, x='x', y='y', color='black', ax=ax_, linestyle='--',
              estimator=None, linewidth = 1)
  text = f'Q1={q1:.1f} '
  ax_.annotate(text, xy=(q1-0.1, 0.25), rotation=90, va='top', ha='right')
  sns.lineplot(data=df_q2, x='x', y='y', color='cyan', ax=ax_, linestyle='--',
              estimator=None, linewidth = 2)
  text = f'med={q2:.1f} '
  ax_.annotate(text, xy=(q2, 0.25), rotation=90, va='top', ha='center')
  sns.lineplot(data=df_q3, x='x', y='y', color='black', ax=ax_, linestyle='--',
              estimator=None, linewidth = 1)
  text = f'Q3={q3:.1f} '
  ax_.annotate(text, xy=(q3+0.1, 0.25), rotation=90, va='top', ha='left')

  plt.sca(ax_)
  plt.xlabel(xlabel_, size=12)
  plt.ylabel('')
  plt.xticks(size=12)
  sns.despine(bottom=False, left=False, ax=ax_)
  ax_.spines['bottom'].set_color('grey')
  ax_.spines['left'].set_color('grey')

  ax_.xaxis.set_major_locator(MultipleLocator(scale_x_major))
  ax_.xaxis.set_major_formatter('{x:.0f}')
  ax_.xaxis.set_minor_locator(MultipleLocator(scale_x_minor))

  ax_.figure.set_size_inches(16,11)
  plt.subplots_adjust(top=0.95, right=0.869, hspace=0, wspace=0.1)

  return plt
In [ ]:
def plt_dist_plot(ax_, df_, type_var_, var_, title_, xlabel_, ylabel_):
  hue_order = df_[type_var_].unique()
  sns.kdeplot(data=df_, x=var_, ax=ax_, fill=True,
                   hue=type_var_, hue_order=hue_order[::-1], lw=4, palette=['#3A5BA0', '#DC3535', '#18978F'])
  plt.sca(ax_)
  plt.title(title_, size=16, color='grey')
  plt.xlabel(xlabel_, size=12)
  plt.ylabel(ylabel_, size=12)
  plt.xticks(size=12)
  plt.yticks(size=12)
  plt.legend(labels=hue_order, loc='best', fontsize=12)

  sns.despine(right=True, top=True, ax=ax_)
  ax_.spines['left'].set_color('grey')
  ax_.spines['bottom'].set_color('grey')

  ax_.figure.set_size_inches(16,8)
  ax_.figure.subplots_adjust(top=0.81,right=0.86) 

  return plt
In [ ]:
def plt_most10_count_plot(ax_, df_, type_var_, var_, type_, title_, independant_label_, dependant_label_, unit_='', highlight_num=None, highlight_color='#DC3535', orientation_='v'):
  list_most10 = df_.loc[df_[type_var_]==type_, var_].value_counts().iloc[:10].index
  df_ = df_.loc[(df_[type_var_]==type_) & df_[var_].isin(list_most10), [var_]]

  if not highlight_num:
    list_highlight = df_[var_].value_counts()/df_[var_].count()
    list_highlight = list_highlight[list_highlight > 0.1]
    highlight_num = len(list_highlight)

  custom_palette = [highlight_color for i in range(highlight_num)] + ['grey' for i in range(10-highlight_num)]
  if orientation_ == 'v':
    sns.countplot(data=df_, x=var_, order=list_most10, ax=ax_,
                      palette = custom_palette)
    
    plt.sca(ax_)
    plt.xlabel(independant_label_)
    plt.ylabel(dependant_label_)

    plt.tick_params(
      axis='y',          # changes apply to the y-axis
      which='both',      # both major and minor ticks are affected
      left=False,      # ticks along the left edge are off
      right=False,         # ticks along the right edge are off
      labelleft=False) # labels along the left edge are off

    labels = []
    for label in ax_.get_xticklabels():
        text = label.get_text()
        labels.append(textwrap.fill(text, width=10,
                      break_long_words=False))
    ax_.set_xticklabels(labels, rotation=60)

    bar_labels=[]
    for container in ax_.containers:
      for rect in container:
        # Draw the count label in white near the bar's base,
        # rotated vertically so it fits inside narrow bars
        yloc = 4
        clr = 'white'
        align = 'bottom'
        rotation_ = 90

        count_ = rect.get_height()
        pct_ = int(count_*100/df_[var_].count())
        pct_unit = f'({pct_}%) {unit_}'

        label_text = '' if not count_ else round(count_) if count_ > 0 and count_ < 1000 else f'{count_/1000: .1f} k'
        label_text = f'{label_text} {pct_unit}' if label_text and count_/container[0].get_height() > 0.05+0.022*len(pct_unit) else label_text if count_/container[0].get_height() > 0.05 else ''

        size_ = 12 if count_/container[0].get_height() > 0.1 else 9 if count_/container[0].get_height() > 0.06 else 8 if count_/container[0].get_height() > 0.055 else 7

        xloc = rect.get_x() + rect.get_width() / 2
        ax_.annotate(label_text, xy=(xloc, 0), xytext=(0, yloc),
                            textcoords="offset points", size=size_,
                            ha='center', va=align, rotation=rotation_,
                            color=clr, clip_on=True)

  else:
    sns.countplot(data=df_, y=var_, order=list_most10, ax=ax_,
                      palette = custom_palette)
    plt.sca(ax_)
    plt.xlabel(dependant_label_)
    plt.ylabel(independant_label_)

    plt.tick_params(
      axis='x',          # changes apply to the x-axis
      which='both',      # both major and minor ticks are affected
      bottom=False,      # ticks along the bottom edge are off
      top=False,         # ticks along the top edge are off
      labelbottom=False) # labels along the bottom edge are off

    labels = []
    for label in ax_.get_yticklabels():
        text = label.get_text()
        labels.append(textwrap.fill(text, width=15,
                      break_long_words=False))
    ax_.set_yticklabels(labels, rotation=0)

    bar_labels=[]
    for container in ax_.containers:
      for rect in container:
        # Draw the count label in white just inside the bar's left edge
        xloc = 2
        clr = 'white'
        align = 'left'

        count_ = rect.get_width()
        pct_ = int(count_*100/df_[var_].count())
        pct_unit = f'({pct_}%) {unit_}'

        label_text = '' if not count_ else round(count_) if count_ > 0 and count_ < 1000 else f'{count_/1000: .1f} k'
        label_text = f'{label_text} {pct_unit}' if label_text and count_/container[0].get_width() > 0.05+0.022*len(pct_unit) else label_text if count_/container[0].get_width() > 0.05 else ''

        size_ = 12 if count_/container[0].get_width() > 0.1 else 9 if count_/container[0].get_width() > 0.06 else 8 if count_/container[0].get_width() > 0.055 else 7

        # Center the text vertically in the bar
        yloc = rect.get_y() + rect.get_height() / 2
        ax_.annotate(label_text, xy=(0, yloc), xytext=(xloc, 0),
                            textcoords="offset points", size=size_,
                            ha=align, va='center',
                            color=clr, clip_on=True)

  sns.despine(left=True, bottom=True, ax=ax_)

  plt.title(title_, size=16, color='grey')
  plt.xticks(size=12)
  plt.yticks(size=12)
  
  ax_.spines['left'].set_color('grey')
  ax_.spines['bottom'].set_color('grey')

  ax_.figure.set_size_inches(16,8)
  ax_.figure.subplots_adjust(top=0.81,right=0.86) 

  return plt
In [ ]:
def product_wise_distribution(df_, var_, box_major=[None, None, None], box_minor=[None, None, None]):
  fig = plt.figure(figsize=(16,16))

  ax1 = plt.subplot(2, 3, 1)
  ax4 = plt.subplot(2, 3, 4, sharex=ax1)
  ax2 = plt.subplot(2, 3, 2)
  ax5 = plt.subplot(2, 3, 5, sharex=ax2)
  ax3 = plt.subplot(2, 3, 3)
  ax6 = plt.subplot(2, 3, 6, sharex=ax3)

  plot_hist_custom(ax_=ax1, df_=df_.loc[df_['Product']=='KP281'], var_=var_, title_=f'Distribution of {var_} for KP281', scale_x_major=box_major[0], scale_x_minor=box_minor[0], color_='#18978F')
  plot_box_custom(ax_=ax4, df_=df_.loc[df_['Product']=='KP281'], var_=var_, xlabel_=var_, scale_x_major=box_major[0], scale_x_minor=box_minor[0], color_='#18978F')

  plot_hist_custom(ax_=ax2, df_=df_.loc[df_['Product']=='KP481'], var_=var_, title_=f'Distribution of {var_} for KP481', scale_x_major=box_major[1], scale_x_minor=box_minor[1], color_='#DC3535')
  plot_box_custom(ax_=ax5, df_=df_.loc[df_['Product']=='KP481'], var_=var_, xlabel_=var_, scale_x_major=box_major[1], scale_x_minor=box_minor[1], color_='#DC3535')

  plot_hist_custom(ax_=ax3, df_=df_.loc[df_['Product']=='KP781'], var_=var_, title_=f'Distribution of {var_} for KP781', scale_x_major=box_major[2], scale_x_minor=box_minor[2], color_='#3A5BA0')
  plot_box_custom(ax_=ax6, df_=df_.loc[df_['Product']=='KP781'], var_=var_, xlabel_=var_, scale_x_major=box_major[2], scale_x_minor=box_minor[2], color_='#3A5BA0')

  plt.subplots_adjust(hspace = 0, wspace=0.25)

  plt.show()
In [ ]:
def plot_heatmap(ax_, df_, var_, size=None):
  ax = sns.heatmap(pd.crosstab(index=df_[var_], columns=df_['Product'], dropna=False, normalize='index').round(2).T, ax=ax_, annot=True, cbar=False, cmap=sns.cubehelix_palette(as_cmap=True))
  plt.title('Conditional Probabilities', size=16, color='grey')
  ax_.figure.set_size_inches(size)
In [ ]:
def product_wise_plots(df_, var_, main_title, xlablel_, title_, highligh_num = [None, None, None], box_major=[None, None, None], box_minor=[None, None, None]):
  fig, axd = plt.subplot_mosaic([['upper', 'upper', 'upper'],
                                ['lower left', 'lower middle', 'lower right']])
  ax1 = axd['upper']
  ax2 = axd['lower left']
  ax3 = axd['lower middle']
  ax4 = axd['lower right']

  plt_dist_plot(ax_=ax1, df_=df_, type_var_='Product', var_=var_, title_=main_title, xlabel_=xlablel_, ylabel_='Density')
  plt_most10_count_plot(ax_=ax2, df_=df_, type_var_ = 'Product', var_=var_, type_='KP281', title_=title_, independant_label_='', dependant_label_='', unit_='KP281s', highlight_num=highligh_num[0], highlight_color='#18978F', orientation_='h')
  plt_most10_count_plot(ax_=ax3, df_=df_, type_var_ = 'Product', var_=var_, type_='KP481', title_=title_, independant_label_='', dependant_label_='', unit_='KP481s', highlight_num=highligh_num[1], highlight_color='#DC3535', orientation_='h')
  plt_most10_count_plot(ax_=ax4, df_=df_, type_var_ = 'Product', var_=var_, type_='KP781', title_=title_, independant_label_='', dependant_label_='', unit_='KP781s', highlight_num=highligh_num[2], highlight_color='#3A5BA0', orientation_='h')

  fig.set_size_inches(16,16)
  plt.subplots_adjust(hspace = 0.25, wspace=0.25)

  plt.show()

  product_wise_distribution(df_=df_, var_=var_, box_major=box_major, box_minor=box_minor)

  plot_heatmap(ax_=plt.subplot(), df_=df_, var_=var_, size=(16, 6))

Product wise customer segmentation for each customer characteristic¶

Gender and Marital Status¶

In [ ]:
mosaic = '''123
            456
            789'''
fig, axd = plt.subplot_mosaic(mosaic)
ax1 = axd['1']
ax2 = axd['2']
ax3 = axd['3']
ax4 = axd['4']
ax5 = axd['5']
ax6 = axd['6']
ax7 = axd['7']
ax8 = axd['8']
ax9 = axd['9']

plt_most10_count_plot(ax_=ax1, df_=df, type_var_ = 'Product', var_='Gender', type_='KP281', title_='Gender wise Product Counts', independant_label_='', dependant_label_='', unit_='KP281s', highlight_num=None, highlight_color='#18978F', orientation_='h')
plt_most10_count_plot(ax_=ax2, df_=df, type_var_ = 'Product', var_='Gender', type_='KP481', title_='Gender wise Product Counts', independant_label_='', dependant_label_='', unit_='KP481s', highlight_num=None, highlight_color='#DC3535', orientation_='h')
plt_most10_count_plot(ax_=ax3, df_=df, type_var_ = 'Product', var_='Gender', type_='KP781', title_='Gender wise Product Counts', independant_label_='', dependant_label_='', unit_='KP781s', highlight_num=None, highlight_color='#3A5BA0', orientation_='h')

plt_most10_count_plot(ax_=ax4, df_=df, type_var_ = 'Product', var_='MaritalStatus', type_='KP281', title_='MaritalStatus wise Product Counts', independant_label_='', dependant_label_='', unit_='KP281s', highlight_num=None, highlight_color='#18978F', orientation_='h')
plt_most10_count_plot(ax_=ax5, df_=df, type_var_ = 'Product', var_='MaritalStatus', type_='KP481', title_='MaritalStatus wise Product Counts', independant_label_='', dependant_label_='', unit_='KP481s', highlight_num=None, highlight_color='#DC3535', orientation_='h')
plt_most10_count_plot(ax_=ax6, df_=df, type_var_ = 'Product', var_='MaritalStatus', type_='KP781', title_='MaritalStatus wise Product Counts', independant_label_='', dependant_label_='', unit_='KP781s', highlight_num=None, highlight_color='#3A5BA0', orientation_='h')

df_mod = df.loc[:, ['Product']]
df_mod['Gender_MaritalStatus'] = df.apply(lambda x: f"{x['Gender']} {x['MaritalStatus']}", axis=1)
plt_most10_count_plot(ax_=ax7, df_=df_mod, type_var_ = 'Product', var_='Gender_MaritalStatus', type_='KP281', title_='Combined Counts', independant_label_='', dependant_label_='', unit_='KP281s', highlight_num=None, highlight_color='#18978F', orientation_='h')
plt_most10_count_plot(ax_=ax8, df_=df_mod, type_var_ = 'Product', var_='Gender_MaritalStatus', type_='KP481', title_='Combined Counts', independant_label_='', dependant_label_='', unit_='KP481s', highlight_num=None, highlight_color='#DC3535', orientation_='h')
plt_most10_count_plot(ax_=ax9, df_=df_mod, type_var_ = 'Product', var_='Gender_MaritalStatus', type_='KP781', title_='Combined Counts', independant_label_='', dependant_label_='', unit_='KP781s', highlight_num=None, highlight_color='#3A5BA0', orientation_='h')

plt.subplots_adjust(hspace = 0.30, wspace=0.35)
fig.set_size_inches(18,10)

plt.show()

ax1=plt.subplot(1, 3, 1)
plot_heatmap(ax_=ax1, df_=df, var_='Gender', size=(18, 3))

ax2=plt.subplot(1, 3, 2)
plot_heatmap(ax_=ax2, df_=df, var_='MaritalStatus', size=(18, 3))

ax3=plt.subplot(1, 3, 3)
plot_heatmap(ax_=ax3, df_=df_mod, var_='Gender_MaritalStatus', size=(18, 3))

plt.subplots_adjust(hspace = 0.30, wspace=0.45)
fig.set_size_inches(16,10)

plt.show()

Insights:

  • Customers who purchased the KP281 and KP481 are roughly evenly split between male and female. The KP781, however, was bought predominantly by male customers (81%).
  • Regardless of product, partnered customers made more purchases (~60%) than single customers (~40%).

Recommendation:

  • Women are the least interested in the KP781. We should not recommend it to female customers initially, unless specifically asked for.

Age¶

In [ ]:
product_wise_plots(df_=df, var_='Age', main_title='Products Count per Age Years', xlablel_='Age Years', title_='Age Years wise Purchase', box_major=[5]*3, box_minor=[1]*3)

Insights:

  • Regardless of age, customers most preferred KP281, followed by KP481 and finally KP781. However, customers between the ages of 31 and 35 are more likely to buy KP781 than other products

Recommendation:

  • To a customer between the ages of 31 and 35, we should recommend KP781

Education¶

In [ ]:
product_wise_plots(df_=df, var_='Education', main_title='Products Count per Education Years', xlablel_='Education Years', title_='Education Years wise Purchase', box_major=[5]*3, box_minor=[1]*3)

Insights:

  • Regardless of years of education, customers most preferred KP281, followed by KP481, and finally KP781. However, highly educated customers (with an education level of more than 18 years) were more likely to buy KP781 than other products

Recommendation:

  • To a highly educated customer, we should recommend KP781

Income¶

In [ ]:
product_wise_plots(df_=df, var_='Income', main_title='Products Count per Income in $s per Year', xlablel_='Income in $s per Year', title_='Income wise Purchase', box_major=[10000]*2+[20000], box_minor=[5000]*3)

Insights:

  • Low-income customers (income of less than \$45,000 per year) preferred KP281.
  • Middle-income customers (income between \$45,000 and \$70,000 per year) equally preferred KP281 and KP481.
  • High-income customers (income of more than \$70,000 per year) preferred only KP781.

Recommendation:

  • A low-income customer (income less than \$45,000 per year) should be recommended KP281
  • To high-income customers (income over \$70,000 per year), we should recommend KP781

Fitness¶

In [ ]:
product_wise_plots(df_=df, var_='Fitness', main_title='Products Count per Fitness', xlablel_='Fitness', title_='Fitness wise Purchase', box_major=[2]*3, box_minor=[1]*3)

Insights:

  • Customers with average fitness levels (fitness level less than 4) preferred KP281 and KP481 equally
  • Healthier customers (fitness level 4) had no preference
  • Health-conscious customers who are already in good shape (fitness level 5) preferred KP781 over others

Recommendation:

  • Customers with average fitness levels (fitness level below 4) should be recommended either KP281 or KP481
  • To customers who are already in very good fitness shape (fitness level 5) we should recommend KP781

Usage¶

In [ ]:
product_wise_plots(df_=df, var_='Usage', main_title='Products Count per Usage Times per Week', xlablel_='Usage Times per Week', title_='Usage wise Purchase', box_major=[2]*3, box_minor=[1]*3)

Insights:

  • Customers who plan to use the treadmill less than 4 days per week preferred KP281, followed by the KP481
  • Customers who plan to use the treadmill 4 days per week showed no single preference among the three machines
  • Customers who plan to use the treadmill more than 4 days per week mostly preferred KP781

Recommendation:

  • Customers who plan to use the treadmill less than 4 days per week should be recommended either KP281 or KP481
  • To customers who plan to use the treadmill more than 4 days per week, we should recommend KP781

Miles¶

In [ ]:
product_wise_plots(df_=df, var_='Miles', main_title='Products Count per Miles in a Week', xlablel_='Miles per Week', title_='Weekly Miles wise Purchase', box_major=[20]*3, box_minor=[5]*3)

Insights:

  • Regardless of the miles customers expected to run/walk per week, customers most preferred KP281, followed by KP481
  • However, customers planning to run/walk 160 miles or more per week were more likely to purchase KP781 than other products

Recommendation:

  • For customers planning to walk/run more than 160 miles per week, we should recommend KP781

Business Recommendations¶

  • A general customer prefers the KP281. If not much information is available about the customer, we should recommend the KP281 product to them.
  • Women are the least interested in the KP781. We should not recommend it to female customers initially, unless specifically asked for.
  • To a customer between the ages of 31 and 35, we should recommend KP781
  • To a highly educated customer, we should recommend KP781
  • A low-income customer (income less than \$45,000 per year) should be recommended KP281. For a high-income customer (income over \$70,000 per year) we should recommend KP781
  • Customers with average fitness levels (fitness level below 4) should be recommended either KP281 or KP481. For customers who are already in very good fitness shape (fitness level 5) we should recommend KP781
  • Customers who plan to use the treadmill less than 4 days per week should be recommended either KP281 or KP481. For customers who plan to use the treadmill more than 4 days per week, we should recommend KP781
  • For customers planning to walk/run more than 160 miles per week, we should recommend KP781