# !pip install matplotlib --upgrade
Note:
Upgrade the matplotlib version from 3.2.2 to 3.5.3
Use command
!pip install matplotlib --upgrade
About the Business
This business case is about India's largest B2B & C2C Logistic Courier Service Provider in India. It is the largest and fastest-growing fully integrated player in India by revenue in Fiscal 2021. They aim to build the operating system for commerce, through a combination of world-class infrastructure, logistics operations of the highest quality, and cutting-edge engineering and technology capabilities.
The Data team builds intelligence and capabilities using this data that helps them to widen the gap between the quality, efficiency, and profitability of their business versus their competitors.
Concept Used
The company wants to understand and process the data coming out of data engineering pipelines:
We will carry out the following steps (Concept Used):
Importing Required Libraries
# !pip install matplotlib==3.5.3
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
from matplotlib.ticker import (MultipleLocator, AutoMinorLocator)
import seaborn as sns
import textwrap
import math
import re
from sklearn.preprocessing import MinMaxScaler
from scipy import stats
# Load the raw courier/trip dataset (Google Colab path) into a DataFrame.
df = pd.read_csv('/content/Courier_data.csv')
# Show every column when displaying wide DataFrames in the notebook.
pd.set_option('display.max_columns', None)
# Peek at the first four rows to sanity-check the load.
df.head(4)
| data | trip_creation_time | route_schedule_uuid | route_type | trip_uuid | source_center | source_name | destination_center | destination_name | od_start_time | od_end_time | start_scan_to_end_scan | is_cutoff | cutoff_factor | cutoff_timestamp | actual_distance_to_destination | actual_time | osrm_time | osrm_distance | factor | segment_actual_time | segment_osrm_time | segment_osrm_distance | segment_factor | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | training | 2018-09-20 02:35:36.476840 | thanos::sroute:eb7bfc78-b351-4c0e-a951-fa3d5c3... | Carting | trip-153741093647649320 | IND388121AAA | Anand_VUNagar_DC (Gujarat) | IND388620AAB | Khambhat_MotvdDPP_D (Gujarat) | 2018-09-20 03:21:32.418600 | 2018-09-20 04:47:45.236797 | 86.0 | True | 9.0 | 2018-09-20 04:27:55 | 10.435660 | 14.0 | 11.0 | 11.9653 | 1.272727 | 14.0 | 11.0 | 11.9653 | 1.272727 |
| 1 | training | 2018-09-20 02:35:36.476840 | thanos::sroute:eb7bfc78-b351-4c0e-a951-fa3d5c3... | Carting | trip-153741093647649320 | IND388121AAA | Anand_VUNagar_DC (Gujarat) | IND388620AAB | Khambhat_MotvdDPP_D (Gujarat) | 2018-09-20 03:21:32.418600 | 2018-09-20 04:47:45.236797 | 86.0 | True | 18.0 | 2018-09-20 04:17:55 | 18.936842 | 24.0 | 20.0 | 21.7243 | 1.200000 | 10.0 | 9.0 | 9.7590 | 1.111111 |
| 2 | training | 2018-09-20 02:35:36.476840 | thanos::sroute:eb7bfc78-b351-4c0e-a951-fa3d5c3... | Carting | trip-153741093647649320 | IND388121AAA | Anand_VUNagar_DC (Gujarat) | IND388620AAB | Khambhat_MotvdDPP_D (Gujarat) | 2018-09-20 03:21:32.418600 | 2018-09-20 04:47:45.236797 | 86.0 | True | 27.0 | 2018-09-20 04:01:19.505586 | 27.637279 | 40.0 | 28.0 | 32.5395 | 1.428571 | 16.0 | 7.0 | 10.8152 | 2.285714 |
| 3 | training | 2018-09-20 02:35:36.476840 | thanos::sroute:eb7bfc78-b351-4c0e-a951-fa3d5c3... | Carting | trip-153741093647649320 | IND388121AAA | Anand_VUNagar_DC (Gujarat) | IND388620AAB | Khambhat_MotvdDPP_D (Gujarat) | 2018-09-20 03:21:32.418600 | 2018-09-20 04:47:45.236797 | 86.0 | True | 36.0 | 2018-09-20 03:39:57 | 36.118028 | 62.0 | 40.0 | 45.5620 | 1.550000 | 21.0 | 12.0 | 13.0224 | 1.750000 |
Column Profiling
Custom function to get a Descriptive Summary of the Dataset
def get_df_summary(df_, print_summary=True, properties_as_columns=True):
    """Build a descriptive summary of ``df_`` as a DataFrame.

    For every column the summary contains: dtype, missing-value count,
    number of uniques, the top/bottom 10 unique values with their share,
    and -- for numeric columns -- the describe() statistics plus the IQR
    (Tukey) and 3-sigma whisker bounds and outlier counts.

    Parameters
    ----------
    df_ : pd.DataFrame
        Frame to profile.
    print_summary : bool
        If True, also print the summary frame.
    properties_as_columns : bool
        If True, one row per source column with the properties as columns;
        otherwise the frame is transposed.

    Returns
    -------
    pd.DataFrame
        The summary; all cell values are stringified.
    """
    # Shape and memory usage (mirrors the DataFrame.info() header).
    print(f'RangeIndex: {df_.shape[0]} entries; Data columns (total {df_.shape[1]} columns)')
    memory_used = df_.memory_usage().sum()/1024  # KB
    if memory_used > 1024*1024:
        memory_used = f'{round(memory_used/1024/1024, 1)}+ GB'
    elif memory_used > 1024:
        memory_used = f'{round(memory_used/1024, 1)}+ MB'
    else:
        memory_used = f'{round(memory_used, 1)}+ KB'
    print(f'memory usage: {memory_used}\n')

    columns = df_.columns.to_list()

    def _property_row(name, values):
        # One property per row; the property label lives in the column named None.
        return pd.DataFrame([[name] + values], columns=[None] + columns)

    def _fmt_value_counts(clm):
        # 'value (pct%)' strings ordered by descending frequency.
        vc = df_[clm].value_counts()
        total = vc.sum()
        return [f'{val} ({math.floor(cnt * 100 / total)}%)' for val, cnt in vc.items()]

    # Per-column properties. DataFrame.append() was removed in pandas 2.0 and
    # value_counts().reset_index() column names changed, so the rows are built
    # directly and concatenated once instead.
    df2 = pd.concat([
        _property_row('dtype', [df_[clm].dtype for clm in columns]),
        _property_row('Missing Counts', [df_[clm].isna().sum() for clm in columns]),
        _property_row('nUniques', [df_[clm].nunique() for clm in columns]),
        _property_row('Top 10 Unique Values',
                      [', '.join(_fmt_value_counts(clm)[:10]) for clm in columns]),
        _property_row('Bottom 10 Unique Values',
                      [', '.join(_fmt_value_counts(clm)[-1:-11:-1]) for clm in columns]),
    ], ignore_index=True)

    # Numeric statistics; non-numeric columns become all-NaN via reindex.
    df4 = df_.describe().reindex(columns=columns).drop('count').rename({
        '25%': 'Q1',
        '50%': 'Median',
        '75%': 'Q3'
    }).round(1)
    df4 = df4.T
    # Tukey whiskers, clamped to the observed min/max ...
    df4['LW (1.5)'] = df4.apply(lambda x: max(x['min'], x['Q1'] - 1.5*(x['Q3']-x['Q1'])), axis=1)
    df4['UW (1.5)'] = df4.apply(lambda x: min(x['max'], x['Q3'] + 1.5*(x['Q3']-x['Q1'])), axis=1)
    # ... and the 3-sigma band, clamped the same way.
    df4['mean-3*std'] = df4.apply(lambda x: max(x['min'], x['mean'] - 3*x['std']), axis=1)
    df4['mean+3*std'] = df4.apply(lambda x: min(x['max'], x['mean'] + 3*x['std']), axis=1)
    # Outlier counts per numeric column, formatted as 'count (pct%)'.
    lst_IQR_Outlier = []
    lst_std_Outlier = []
    numeric_cols = df_.describe().columns
    for clm in df4.index:
        if clm in numeric_cols:
            iqr_outlier_count = df_.loc[(df_[clm] < df4.loc[clm, 'LW (1.5)']) | (df_[clm] > df4.loc[clm, 'UW (1.5)'])].shape[0]
            iqr_outlier_pct = f'({round(iqr_outlier_count * 100.0 / len(df_), 1)}%)' if iqr_outlier_count != 0 else ''
            std_outlier_count = df_.loc[(df_[clm] < df4.loc[clm, 'mean-3*std']) | (df_[clm] > df4.loc[clm, 'mean+3*std'])].shape[0]
            std_outlier_pct = f'({round(std_outlier_count * 100.0 / len(df_), 1)}%)' if std_outlier_count != 0 else ''
            lst_IQR_Outlier.append(f'{iqr_outlier_count} {iqr_outlier_pct}')
            lst_std_Outlier.append(f'{std_outlier_count} {std_outlier_pct}')
        else:
            lst_IQR_Outlier.append(np.nan)
            lst_std_Outlier.append(np.nan)
    df4['Outlier Count (1.5*IQR)'] = lst_IQR_Outlier
    df4['Outlier Count (3*std)'] = lst_std_Outlier
    # Back to properties-as-rows and merge with the per-column properties.
    df4 = df4.round(1).T.reset_index().rename({'index': None}, axis=1)
    df2 = pd.concat([df2, df4])
    # Source columns become rows, sorted by dtype name (descending).
    df2 = df2.set_index(None).T.astype(str).sort_values('dtype', ascending=False)
    df2 = df2[['dtype', 'Missing Counts', 'nUniques',
               'Top 10 Unique Values', 'Bottom 10 Unique Values', 'min', 'max',
               'LW (1.5)', 'Q1', 'Median', 'Q3', 'UW (1.5)', 'Outlier Count (1.5*IQR)',
               'mean-3*std', 'mean', 'std', 'mean+3*std', 'Outlier Count (3*std)']]
    if not properties_as_columns:
        df2 = df2.T
    if print_summary:
        print(df2)
    return df2
Descriptive Summary
# Profile the raw data once and keep the result: the IQR whisker values
# ('LW (1.5)' / 'UW (1.5)' rows) are reused later for outlier filtering.
df_summary = get_df_summary(df, print_summary=False, properties_as_columns=False)
df_summary
RangeIndex: 144867 entries; Data columns (total 24 columns) memory usage: 25.6+ MB
| data | destination_center | cutoff_timestamp | trip_creation_time | od_end_time | destination_name | od_start_time | source_name | source_center | trip_uuid | route_type | route_schedule_uuid | cutoff_factor | osrm_distance | segment_osrm_distance | segment_osrm_time | segment_actual_time | factor | segment_factor | osrm_time | actual_time | actual_distance_to_destination | start_scan_to_end_scan | is_cutoff | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| dtype | object | object | object | object | object | object | object | object | object | object | object | object | int64 | float64 | float64 | float64 | float64 | float64 | float64 | float64 | float64 | float64 | float64 | bool |
| Missing Counts | 0 | 0 | 0 | 0 | 0 | 261 | 0 | 293 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| nUniques | 2 | 1481 | 93180 | 14817 | 26369 | 1468 | 26369 | 1498 | 1508 | 14817 | 2 | 1504 | 501 | 138046 | 113799 | 214 | 747 | 45641 | 5675 | 1531 | 3182 | 144515 | 1915 | 2 |
| Top 10 Unique Values | training (72%), test (27%) | IND000000ACB (10%), IND562132AAA (7%), IND4213... | 2018-09-24 05:19:20 (0%), 2018-09-24 05:19:21 ... | 2018-09-28 05:23:15.359220 (0%), 2018-10-02 06... | 2018-09-24 09:59:15.691618 (0%), 2018-10-05 11... | Gurgaon_Bilaspur_HB (Haryana) (10%), Bangalore... | 2018-09-21 18:37:09.322207 (0%), 2018-10-03 04... | Gurgaon_Bilaspur_HB (Haryana) (16%), Bangalore... | IND000000ACB (16%), IND562132AAA (6%), IND4213... | trip-153811219535896559 (0%), trip-15384603530... | FTL (68%), Carting (31%) | thanos::sroute:4029a8a2-6c74-4b7e-a6d8-f9e069f... | 22.0 (9%), 9.0 (8%), 44.0 (5%), 18.0 (5%), 66.... | 48.0394 (0%), 11.23 (0%), 11.3233 (0%), 12.541... | 0.0 (1%), 25.6081 (0%), 22.6267 (0%), 58.2708 ... | 16.0 (7%), 17.0 (7%), 18.0 (6%), 19.0 (4%), 15... | 24.0 (4%), 26.0 (3%), 30.0 (3%), 27.0 (3%), 23... | 2.0 (1%), 1.5 (0%), 1.6666666666666667 (0%), 1... | 2.0 (4%), 1.5 (3%), 1.0 (1%), 1.66666666666666... | 21.0 (1%), 20.0 (1%), 18.0 (1%), 22.0 (1%), 17... | 32.0 (0%), 36.0 (0%), 30.0 (0%), 38.0 (0%), 42... | 100.28289202009655 (0%), 19.122553143991365 (0... | 110.0 (0%), 72.0 (0%), 99.0 (0%), 95.0 (0%), 8... | True (81%), False (18%) |
| Bottom 10 Unique Values | test (27%), training (72%) | IND504215AAA (0%), IND110046AAA (0%), IND40070... | 2018-09-20 16:24:28.436231 (0%), 2018-09-26 05... | 2018-09-14 17:04:32.989471 (0%), 2018-09-19 04... | 2018-09-27 15:47:50.386008 (0%), 2018-09-20 04... | Basta_Central_DPP_1 (Orissa) (0%), Mumbai_Sanp... | 2018-09-27 02:59:59.877566 (0%), 2018-09-20 02... | Faridabad_Old (Haryana) (0%), Krishnanagar_Ana... | IND741101AAB (0%), IND222001AAA (0%), IND24200... | trip-153694467298919626 (0%), trip-15373317447... | Carting (31%), FTL (68%) | thanos::sroute:404cbabf-d2a5-4e46-bf79-8b3c518... | 275.0 (0%), 412.0 (0%), 1149.0 (0%), 734.0 (0%... | 88.7319 (0%), 307.6147 (0%), 28.7254 (0%), 55.... | 8.8088 (0%), 10.8209 (0%), 8.1647 (0%), 0.7952... | 1032.0 (0%), 156.0 (0%), 204.0 (0%), 163.0 (0%... | 1104.0 (0%), 1847.0 (0%), 540.0 (0%), 517.0 (0... | 4.484210526315789 (0%), 1.9186746987951808 (0%... | 29.77777777777778 (0%), 2.6144578313253013 (0%... | 1312.0 (0%), 1250.0 (0%), 1130.0 (0%), 1614.0 ... | 2980.0 (0%), 2870.0 (0%), 2910.0 (0%), 2608.0 ... | 70.03901016351531 (0%), 9.162212958326531 (0%)... | 2706.0 (0%), 3361.0 (0%), 905.0 (0%), 22.0 (0%... | False (18%), True (81%) |
| min | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | 9.0 | 9.0 | 0.0 | 0.0 | -244.0 | 0.1 | -23.4 | 6.0 | 9.0 | 9.0 | 20.0 | nan |
| max | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | 1927.0 | 2326.2 | 2191.4 | 1611.0 | 3051.0 | 77.4 | 574.2 | 1686.0 | 4532.0 | 1927.4 | 7898.0 | nan |
| LW (1.5) | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | 9.0 | 9.0 | 0.0 | 0.0 | -10.0 | 0.7 | -0.1 | 6.0 | 9.0 | 9.0 | 20.0 | nan |
| Q1 | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | 22.0 | 29.9 | 12.1 | 11.0 | 20.0 | 1.6 | 1.3 | 27.0 | 51.0 | 23.4 | 161.0 | nan |
| Median | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | 66.0 | 78.5 | 23.5 | 17.0 | 29.0 | 1.9 | 1.7 | 64.0 | 132.0 | 66.1 | 449.0 | nan |
| Q3 | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | 286.0 | 343.2 | 27.8 | 22.0 | 40.0 | 2.2 | 2.2 | 257.0 | 513.0 | 286.7 | 1634.0 | nan |
| UW (1.5) | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | 682.0 | 813.2 | 51.4 | 38.5 | 70.0 | 3.1 | 3.6 | 602.0 | 1206.0 | 681.7 | 3843.5 | nan |
| Outlier Count (1.5*IQR) | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | 17246 (11.9%) | 17811 (12.3%) | 4328 (3.0%) | 6378 (4.4%) | 9298 (6.4%) | 11769 (8.1%) | 14416 (10.0%) | 17603 (12.2%) | 16633 (11.5%) | 17992 (12.4%) | 373 (0.3%) | nan |
| mean-3*std | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | 9.0 | 9.0 | 0.0 | 0.0 | -124.6 | 0.1 | -12.2 | 6.0 | 9.0 | 9.0 | 20.0 | nan |
| mean | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | 232.9 | 284.8 | 22.8 | 18.5 | 36.2 | 2.1 | 2.2 | 213.9 | 416.9 | 234.1 | 961.3 | nan |
| std | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | 344.8 | 421.1 | 17.9 | 14.8 | 53.6 | 1.7 | 4.8 | 308.0 | 598.1 | 345.0 | 1037.0 | nan |
| mean+3*std | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | 1267.3 | 1548.1 | 76.5 | 62.9 | 197.0 | 7.2 | 16.6 | 1137.9 | 2211.2 | 1269.1 | 4072.3 | nan |
| Outlier Count (3*std) | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | 3428 (2.4%) | 3611 (2.5%) | 1504 (1.0%) | 2113 (1.5%) | 1201 (0.8%) | 1348 (0.9%) | 806 (0.6%) | 3595 (2.5%) | 3526 (2.4%) | 3429 (2.4%) | 350 (0.2%) | nan |
Drop Unknown Fields
df = df.drop(['is_cutoff', 'cutoff_factor', 'cutoff_timestamp', 'factor', 'segment_factor'], axis=1)
Fill missing values in "source_name" and "destination_name" fields
# Replace missing source/destination names with an 'Unknown' sentinel.
# Assigning the result back (instead of fillna(..., inplace=True) on the
# column selection) avoids chained assignment: the inplace form raises a
# FutureWarning on pandas 2.x and is a silent no-op under copy-on-write.
df['source_name'] = df['source_name'].fillna('Unknown')
df['destination_name'] = df['destination_name'].fillna('Unknown')
Create OD Time Diff field and drop OD Start and End Time Fields
# Parse the OD start/end timestamps; unparseable values become NaT.
for _col in ('od_start_time', 'od_end_time'):
    df[_col] = pd.to_datetime(df[_col], errors='coerce')
# OD duration in minutes, then discard the raw timestamp columns.
df['od_time_diff_mins'] = (df['od_end_time'] - df['od_start_time']).dt.total_seconds()/60
df = df.drop(['od_start_time', 'od_end_time'], axis=1)
Group the rows for each 'trip_uuid' using aggregation
# Aggregation plan for collapsing segment-level rows into one row per trip:
# trip-level identifiers/attributes take the first segment's value, the
# destination takes the last segment's value, and all time/distance measures
# are summed across segments.
create_trip_dict = {
    'data' : 'first',
    'trip_creation_time' : 'first',
    'route_schedule_uuid': 'first',
    'route_type': 'first',
    'trip_uuid': 'first',
    'source_center': 'first',
    'source_name': 'first',
    'destination_center': 'last',
    'destination_name': 'last',
    'od_time_diff_mins' : 'sum',
    'start_scan_to_end_scan': 'sum',
    'actual_distance_to_destination': 'sum',
    'actual_time': 'sum',
    'osrm_time': 'sum',
    'osrm_distance': 'sum',
    'segment_actual_time': 'sum',
    'segment_osrm_time': 'sum',
    'segment_osrm_distance': 'sum'
}
# Collapse to trip level; 'trip_uuid' survives via its 'first' aggregation
# above, so the groupby index itself can be discarded.
df = df.groupby('trip_uuid').agg(create_trip_dict).reset_index(drop=True)
Split the Source and Destination name to create separate features for City, Place and State
Drop Source Name and Destination Name fields
def city_place_state_separator(text):
    """Split a location name like 'Anand_VUNagar_DC (Gujarat)' into a
    Series of (city, place, state).

    Falls back to returning the raw text for all three fields when the
    pattern does not match (e.g. the 'Unknown' placeholder).
    """
    # Raw string: '\s' and '\(' in a plain string are invalid escape
    # sequences (a SyntaxWarning on Python 3.12+).
    # NOTE(review): the class [_|\s] also matches a literal '|'; kept as-is
    # to preserve behavior — confirm whether '|' was intended.
    result = re.search(r'(^[A-Za-z]+)[_|\s](.+)(\(.+\))', text)
    if result:
        city = result.group(1)
        place = result.group(2).strip().replace('_', ' ')
        state = result.group(3)[1:-1]  # drop the surrounding parentheses
        return pd.Series([city, place, state])
    # No match: propagate the original text into every field.
    return pd.Series([text, text, text])
# Derive city/place/state columns from the combined name fields, then drop the originals.
df[['source_city', 'source_place', 'source_state']] = df['source_name'].apply(city_place_state_separator)
df[['destination_city', 'destination_place', 'destination_state']] = df['destination_name'].apply(city_place_state_separator)
df = df.drop(['source_name', 'destination_name'], axis=1)
Extract Trip Creation month, year and day from Trip Creation Time field
Drop the Trip_creation_time feature
def timestapm_to_year_month_day(ts):
    """Return a Series of (year, month, day) parsed from timestamp ``ts``.

    (The 'timestapm' typo in the name is kept so existing callers keep working.)
    """
    # Parse once instead of three separate pd.to_datetime() calls.
    dt = pd.to_datetime(ts)
    return pd.Series([dt.year, dt.month, dt.day])
# Expand trip_creation_time into year/month/day features, then drop the raw timestamp.
df[['trip_creation_year', 'trip_creation_month', 'trip_creation_day']] = df['trip_creation_time'].apply(timestapm_to_year_month_day)
df = df.drop('trip_creation_time', axis=1)
Outlier Treatment:
Since there is a huge difference between the median and the mean, and as observed in the dataframe summary, there are extreme outliers
We will remove the outliers using IQR method
# Remove IQR outliers: keep only rows strictly inside the (LW, UW) Tukey
# whiskers taken from df_summary. One loop replaces nine copy-pasted filter
# statements; the column order matches the original so the result is identical.
# NOTE(review): df_summary was computed on the raw segment-level data before
# aggregation (and before od_time_diff_mins existed) — confirm these bounds
# are the intended ones for the trip-level frame.
for _clm in ['segment_osrm_distance', 'segment_osrm_time', 'osrm_distance',
             'osrm_time', 'actual_time', 'actual_distance_to_destination',
             'start_scan_to_end_scan', 'od_time_diff_mins', 'segment_actual_time']:
    _lower = float(df_summary.loc['LW (1.5)', _clm])
    _upper = float(df_summary.loc['UW (1.5)', _clm])
    df = df[(df[_clm] > _lower) & (df[_clm] < _upper)]
Normalize the numerical features
# Min-max scale the numeric features into [0, 1].
# NOTE(review): the scaled frame is stored only in df_num_cols and is never
# written back into df, so downstream steps keep the unscaled values. Also,
# MinMaxScaler returns a fresh 0..n-1 index that would misalign with df's
# filtered index if assigned back directly — confirm whether this is intended.
df_num_cols = df[['segment_osrm_distance', 'segment_osrm_time', 'osrm_distance', 'osrm_time', 'actual_time', 'actual_distance_to_destination', 'start_scan_to_end_scan', 'od_time_diff_mins', 'segment_actual_time']]
df_num_cols = pd.DataFrame(MinMaxScaler().fit_transform(df_num_cols), columns = df_num_cols.columns)
Perform one-hot encoding of categorical variable "route_type"
# One-hot encode route_type into route_type_Carting / route_type_FTL columns,
# then drop the original categorical column.
route_dummies = pd.get_dummies(df['route_type'], prefix='route_type')
df = pd.concat((df, route_dummies), axis=1)
df = df.drop('route_type', axis=1)
Convert the 'Data' feature to categorical data type
df['data'] = df['data'].astype('category')
Summary of dataframe after the preprocessing operations
get_df_summary(df, print_summary=False, properties_as_columns=False)
RangeIndex: 12219 entries; Data columns (total 25 columns) memory usage: 2.2+ MB
| route_type_FTL | route_type_Carting | source_place | trip_uuid | source_center | destination_center | destination_state | destination_place | destination_city | route_schedule_uuid | source_state | source_city | trip_creation_day | trip_creation_month | trip_creation_year | segment_osrm_time | segment_osrm_distance | segment_actual_time | osrm_distance | osrm_time | actual_time | actual_distance_to_destination | start_scan_to_end_scan | od_time_diff_mins | data | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| dtype | uint8 | uint8 | object | object | object | object | object | object | object | object | object | object | int64 | int64 | int64 | float64 | float64 | float64 | float64 | float64 | float64 | float64 | float64 | float64 | category |
| Missing Counts | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| nUniques | 2 | 2 | 684 | 12219 | 823 | 928 | 59 | 784 | 746 | 1316 | 52 | 640 | 22 | 2 | 1 | 373 | 12156 | 715 | 12193 | 932 | 1687 | 12208 | 3062 | 12219 | 2 |
| Top 10 Unique Values | 0.0 (71%), 1.0 (28%) | 1.0 (71%), 0.0 (28%) | Bilaspur HB (5%), Mankoli HB (5%), Hub (4%), N... | trip-153671042288605164 (0%), trip-15379207477... | IND000000ACB (5%), IND421302AAG (5%), IND56213... | IND000000ACB (4%), IND421302AAG (3%), IND56213... | Maharashtra (18%), Karnataka (16%), Haryana (1... | Bilaspur HB (4%), Hub (3%), Mankoli HB (3%), N... | Mumbai (9%), Bengaluru (8%), Gurgaon (4%), Del... | thanos::sroute:a16bfa03-3462-4bce-9c82-5784c7d... | Maharashtra (18%), Karnataka (16%), Haryana (1... | Bengaluru (8%), Mumbai (7%), Gurgaon (5%), Bhi... | 18.0 (5%), 15.0 (5%), 13.0 (5%), 12.0 (4%), 14... | 9.0 (87%), 10.0 (12%) | 2018.0 (100%) | 17.0 (1%), 20.0 (1%), 19.0 (1%), 18.0 (1%), 22... | 27.599800000000002 (0%), 54.7183 (0%), 31.6824... | 47.0 (0%), 41.0 (0%), 60.0 (0%), 35.0 (0%), 55... | 76.5141 (0%), 20.8553 (0%), 33.3202 (0%), 25.4... | 26.0 (1%), 30.0 (1%), 24.0 (1%), 27.0 (1%), 28... | 120.0 (0%), 68.0 (0%), 66.0 (0%), 78.0 (0%), 5... | 52.93246696402012 (0%), 27.080239836318736 (0%... | 396.0 (0%), 444.0 (0%), 300.0 (0%), 270.0 (0%)... | 913.1740793 (0%), 2420.9706173833333 (0%), 252... | training (71%), test (28%) |
| min | 0.0 | 0.0 | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | 1.0 | 9.0 | 2018.0 | 7.0 | 9.1 | 10.0 | 9.3 | 7.0 | 10.0 | 9.0 | 27.0 | 27.6 | nan |
| max | 1.0 | 1.0 | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | 30.0 | 10.0 | 2018.0 | 411.0 | 491.7 | 818.0 | 1390.7 | 1194.0 | 2444.0 | 1083.0 | 6436.0 | 6441.9 | nan |
| LW (1.5) | 0.0 | 0.0 | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | 1.0 | 9.0 | 2018.0 | 7.0 | 9.1 | 10.0 | 9.3 | 7.0 | 10.0 | 9.0 | 27.0 | 27.6 | nan |
| Q1 | 0.0 | 0.0 | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | 14.0 | 9.0 | 2018.0 | 27.0 | 29.0 | 59.0 | 60.4 | 53.0 | 122.0 | 47.2 | 348.0 | 350.6 | nan |
| Median | 0.0 | 1.0 | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | 19.0 | 9.0 | 2018.0 | 51.0 | 51.9 | 107.0 | 119.4 | 113.0 | 251.0 | 90.7 | 720.0 | 724.4 | nan |
| Q3 | 1.0 | 1.0 | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | 25.0 | 9.0 | 2018.0 | 113.5 | 130.1 | 228.0 | 315.1 | 282.0 | 574.0 | 254.3 | 1512.0 | 1514.4 | nan |
| UW (1.5) | 1.0 | 1.0 | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | 30.0 | 9.0 | 2018.0 | 243.2 | 281.8 | 481.5 | 697.2 | 625.5 | 1252.0 | 565.0 | 3258.0 | 3260.1 | nan |
| Outlier Count (1.5*IQR) | 0 | 0 | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | 0 | 1507 (12.3%) | 0 | 521 (4.3%) | 519 (4.2%) | 571 (4.7%) | 914 (7.5%) | 708 (5.8%) | 739 (6.0%) | 891 (7.3%) | 853 (7.0%) | 854 (7.0%) | nan |
| mean-3*std | 0.0 | 0.0 | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | 1.0 | 9.0 | 2018.0 | 7.0 | 9.1 | 10.0 | 9.3 | 7.0 | 10.0 | 9.0 | 27.0 | 27.6 | nan |
| mean | 0.3 | 0.7 | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | 18.4 | 9.1 | 2018.0 | 79.7 | 89.4 | 163.6 | 234.1 | 202.4 | 416.9 | 185.5 | 1141.5 | 1144.2 | nan |
| std | 0.5 | 0.5 | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | 8.0 | 0.3 | 0.0 | 73.5 | 84.9 | 145.4 | 258.2 | 210.9 | 420.6 | 210.3 | 1135.8 | 1137.3 | nan |
| mean+3*std | 1.0 | 1.0 | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | 30.0 | 10.0 | 2018.0 | 300.2 | 344.1 | 599.8 | 1008.7 | 835.1 | 1678.7 | 816.4 | 4548.9 | 4556.1 | nan |
| Outlier Count (3*std) | 0 | 0 | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | 0 | 0 | 0 | 198 (1.6%) | 169 (1.4%) | 224 (1.8%) | 296 (2.4%) | 186 (1.5%) | 222 (1.8%) | 281 (2.3%) | 235 (1.9%) | 235 (1.9%) | nan |
df.head(4)
| data | route_schedule_uuid | trip_uuid | source_center | destination_center | od_time_diff_mins | start_scan_to_end_scan | actual_distance_to_destination | actual_time | osrm_time | osrm_distance | segment_actual_time | segment_osrm_time | segment_osrm_distance | source_city | source_place | source_state | destination_city | destination_place | destination_state | trip_creation_year | trip_creation_month | trip_creation_day | route_type_Carting | route_type_FTL | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | training | thanos::sroute:3a1b0ab2-bb0b-4c53-8c59-eb2a2c0... | trip-153671042288605164 | IND572101AAA | IND562101AAA | 913.174079 | 906.0 | 240.208306 | 399.0 | 210.0 | 269.4308 | 141.0 | 65.0 | 84.1894 | Tumkur | Veersagr I | Karnataka | Chikblapur | ShntiSgr D | Karnataka | 2018 | 9 | 12 | 1 | 0 |
| 3 | training | thanos::sroute:f0176492-a679-4597-8332-bbd1c7f... | trip-153671046011330457 | IND400072AAB | IND401104AAA | 200.989870 | 200.0 | 28.529648 | 82.0 | 24.0 | 31.6475 | 59.0 | 16.0 | 19.8766 | Mumbai | Hub | Maharashtra | Mumbai | MiraRd IP | Maharashtra | 2018 | 9 | 12 | 1 | 0 |
| 4 | training | thanos::sroute:d9f07b12-65e0-4f3b-bec8-df06134... | trip-153671052974046625 | IND583101AAA | IND583101AAA | 1588.710998 | 1586.0 | 239.007304 | 556.0 | 207.0 | 266.2914 | 340.0 | 115.0 | 146.7919 | Bellary | Dc | Karnataka | Bellary | Dc | Karnataka | 2018 | 9 | 12 | 0 | 1 |
| 5 | training | thanos::sroute:9bf03170-d0a2-4a3f-aa4d-9aaab3d... | trip-153671055416136166 | IND600116AAB | IND602105AAB | 251.365798 | 249.0 | 34.407865 | 92.0 | 30.0 | 38.1953 | 60.0 | 23.0 | 28.0647 | Chennai | Porur DPC | Tamil Nadu | Chennai | Sriperumbudur Dc | Tamil Nadu | 2018 | 9 | 12 | 1 | 0 |
def plot_hist_custom(ax_, df_, var_, title_, scale_x_major, scale_x_minor, color_='#DC3535'):
    """Draw a histogram (with KDE) of ``df_[var_]`` on ``ax_`` plus dashed
    vertical reference lines: mean (red), median (cyan), Q1/Q3 (black).

    ``scale_x_minor`` doubles as the histogram bin width; both scale
    arguments drive the x-axis tick locators. Returns the pyplot module.
    """
    if pd.api.types.is_datetime64_any_dtype(df_[var_]):
        # Work on a copy: previously this overwrote the caller's datetime
        # column with bare years as a side effect.
        df_ = df_.copy()
        df_[var_] = df_[var_].dt.year
    # Location statistics for the reference lines.
    m1 = df_[var_].mean()
    q1 = df_[var_].quantile(.25)
    q2 = df_[var_].median()
    q3 = df_[var_].quantile(.75)
    sns.histplot(data=df_, x=var_, kde=True,
                 binwidth=scale_x_minor,
                 color=color_, ax=ax_, linewidth=2)
    # Full-height vertical segments at each statistic (drawn as line plots).
    df_mean = pd.DataFrame({'x': [m1, m1], 'y': ax_.get_ybound()})
    df_q1 = pd.DataFrame({'x': [q1, q1], 'y': ax_.get_ybound()})
    df_q2 = pd.DataFrame({'x': [q2, q2], 'y': ax_.get_ybound()})
    df_q3 = pd.DataFrame({'x': [q3, q3], 'y': ax_.get_ybound()})
    sns.lineplot(data=df_mean, x='x', y='y', color='red', ax=ax_, linestyle='--',
                 estimator=None, linewidth = 2)
    sns.lineplot(data=df_q1, x='x', y='y', color='black', ax=ax_, linestyle='--',
                 estimator=None, linewidth = 1)
    sns.lineplot(data=df_q2, x='x', y='y', color='cyan', ax=ax_, linestyle='--',
                 estimator=None, linewidth = 2)
    sns.lineplot(data=df_q3, x='x', y='y', color='black', ax=ax_, linestyle='--',
                 estimator=None, linewidth = 1)
    plt.sca(ax_)
    plt.title(title_, size=16, color='grey')
    # Hide the x ticks/labels; this plot is meant to sit above a box plot
    # that shares and labels the same x axis.
    plt.tick_params(
        axis='x',          # changes apply to the x-axis
        which='both',      # both major and minor ticks are affected
        bottom=False,      # ticks along the bottom edge are off
        top=False,         # ticks along the top edge are off
        labelbottom=False) # labels along the bottom edge are off
    plt.xlabel('')
    plt.ylabel('Count', size=12)
    plt.yticks(size=12)
    sns.despine(bottom=True, left=False, trim=True, ax=ax_)
    ax_.spines['left'].set_color('grey')
    ax_.xaxis.set_major_locator(MultipleLocator(scale_x_major))
    ax_.xaxis.set_major_formatter('{x:.0f}')
    ax_.xaxis.set_minor_locator(MultipleLocator(scale_x_minor))
    ax_.figure.set_size_inches(16,11)
    plt.subplots_adjust(top=0.95, right=0.869, hspace=0, wspace=0.1)
    return plt
def plot_box_custom(ax_, df_, var_, xlabel_, scale_x_major, scale_x_minor, color_='#DC3535'):
    """Draw a notched box plot of ``df_[var_]`` on ``ax_``, annotated with
    dashed guides and text for the mean/std (red), Q1/Q3 (black) and
    median (cyan). Returns the pyplot module.
    """
    if pd.api.types.is_datetime64_any_dtype(df_[var_]):
        # Work on a copy: previously this overwrote the caller's datetime
        # column with bare years as a side effect.
        df_ = df_.copy()
        df_[var_] = df_[var_].dt.year
    # Statistics to annotate.
    m1 = df_[var_].mean()
    st1 = df_[var_].std()
    q1 = df_[var_].quantile(.25)
    q2 = df_[var_].median()
    q3 = df_[var_].quantile(.75)
    # Short vertical guide segments (y in box-plot axis units). The originals
    # were seeded from ax_.get_ybound() and then immediately overwritten, so
    # the final y values are set directly here.
    df_mean = pd.DataFrame({'x': [m1, m1], 'y': [-0.3, 0.2]})
    df_q1 = pd.DataFrame({'x': [q1, q1], 'y': [0.2, 0.25]})
    df_q2 = pd.DataFrame({'x': [q2, q2], 'y': [0.1, 0.25]})
    df_q3 = pd.DataFrame({'x': [q3, q3], 'y': [0.2, 0.25]})
    sns.boxplot(data=df_, x=var_, ax=ax_,
                color=color_, showmeans=True,
                flierprops={"marker": "x"}, medianprops={"color": "cyan"},
                width=0.4, fliersize=1, linewidth=2, notch=True)
    sns.lineplot(data=df_mean, x='x', y='y', color='red', ax=ax_, linestyle='--',
                 estimator=None, linewidth = 2)
    text = f' μ={m1:.1f}\n σ={st1:.1f}'
    ax_.annotate(text, xy=(m1, -0.3), rotation=90)
    sns.lineplot(data=df_q1, x='x', y='y', color='black', ax=ax_, linestyle='--',
                 estimator=None, linewidth = 1)
    text = f'Q1={q1:.1f} '
    ax_.annotate(text, xy=(q1-0.1, 0.25), rotation=90, va='top', ha='right')
    sns.lineplot(data=df_q2, x='x', y='y', color='cyan', ax=ax_, linestyle='--',
                 estimator=None, linewidth = 2)
    text = f'med={q2:.1f} '
    ax_.annotate(text, xy=(q2, 0.25), rotation=90, va='top', ha='center')
    sns.lineplot(data=df_q3, x='x', y='y', color='black', ax=ax_, linestyle='--',
                 estimator=None, linewidth = 1)
    text = f'Q3={q3:.1f} '
    ax_.annotate(text, xy=(q3+0.1, 0.25), rotation=90, va='top', ha='left')
    plt.sca(ax_)
    plt.xlabel(xlabel_, size=12)
    plt.ylabel('')
    plt.xticks(size=12)
    sns.despine(bottom=False, left=False, ax=ax_)
    ax_.spines['bottom'].set_color('grey')
    ax_.spines['left'].set_color('grey')
    ax_.xaxis.set_major_locator(MultipleLocator(scale_x_major))
    ax_.xaxis.set_major_formatter('{x:.0f}')
    ax_.xaxis.set_minor_locator(MultipleLocator(scale_x_minor))
    ax_.figure.set_size_inches(16,11)
    plt.subplots_adjust(top=0.95, right=0.869, hspace=0, wspace=0.1)
    return plt
def plt_dist_plot(ax_, df_, type_var_, var_, title_, xlabel_, ylabel_):
    """Draw overlapping filled KDE curves of ``var_`` on ``ax_``, one per
    level of ``type_var_``, with a legend. Returns the pyplot module.

    NOTE: the palette has three colors, so ``type_var_`` is expected to
    have at most three levels.
    """
    # Bug fix: derive the hue order from the DataFrame that was passed in
    # (df_), not from the module-level global `df`.
    hue_order = df_[type_var_].unique()
    sns.kdeplot(data=df_, x=var_, ax=ax_, fill=True,
                hue=type_var_, hue_order=hue_order[::-1], lw=4, palette=['#3A5BA0', '#DC3535', '#18978F'])
    plt.sca(ax_)
    plt.title(title_, size=16, color='grey')
    plt.xlabel(xlabel_, size=12)
    plt.ylabel(ylabel_, size=12)
    plt.xticks(size=12)
    plt.yticks(size=12)
    plt.legend(labels=hue_order, loc='best', fontsize=12)
    sns.despine(right=True, top=True, ax=ax_)
    ax_.spines['left'].set_color('grey')
    ax_.spines['bottom'].set_color('grey')
    ax_.figure.set_size_inches(16,8)
    ax_.figure.subplots_adjust(top=0.81,right=0.86)
    return plt
def plt_most10_count_plot(ax_, df_, type_var_, var_, type_, title_, independant_label_, dependant_label_, unit_='', highlight_num=None, highlight_color='#DC3535', orientation_='v', agg_func='Count', agg_var=None):
    """Bar chart of the ten most frequent (or largest-sum) categories of `var_`.

    Parameters
    ----------
    ax_ : matplotlib Axes to draw on.
    df_ : source DataFrame.
    type_var_, type_ : optional column/value pair used to pre-filter rows
        (the filter applies only when both are truthy).
    var_ : categorical column whose top-10 categories are plotted.
    title_ : chart title.
    independant_label_, dependant_label_ : labels for the category axis and
        the count axis respectively.
    unit_ : text appended after the percentage inside each bar label.
    highlight_num : number of leading bars drawn in `highlight_color`; when
        falsy it is derived as the number of categories covering > 10% of
        the total.
    highlight_color : colour for the highlighted bars (rest are grey).
    orientation_ : 'v' for vertical bars, anything else draws horizontally.
    agg_func : 'Count' (frequency) or 'Sum' (sum of `agg_var`); any other
        value silently draws nothing.
    agg_var : numeric column summed when agg_func == 'sum'.

    Returns the `matplotlib.pyplot` module so callers can chain `.show()`.
    """
    agg_func = agg_func.lower()
    if agg_func == 'count':
        # Denominator for the per-bar percentage labels.
        total_records_count = df_[var_].count()
        if type_var_ and type_:
            # Top 10 categories computed within the filtered subset only.
            list_most10 = df_.loc[df_[type_var_]==type_, var_].value_counts().iloc[:10].index
            df_ = df_.loc[(df_[type_var_]==type_) & df_[var_].isin(list_most10), [var_]]
        else:
            list_most10 = df_[var_].value_counts().iloc[:10].index
            df_ = df_.loc[df_[var_].isin(list_most10), [var_]]
    elif agg_func == 'sum':
        total_records_count = df_[agg_var].sum()
        if type_var_ and type_:
            # NOTE(review): this branch ranks by row frequency rather than by
            # the summed `agg_var` — confirm that is intended on the sum path.
            list_most10 = df_.loc[df_[type_var_]==type_, var_].value_counts().iloc[:10].index
            df_ = df_.loc[(df_[type_var_]==type_) & df_[var_].isin(list_most10), [var_]]
        else:
            # NOTE(review): reads the global `df`, not the `df_` argument, and
            # keeps the summed Series (not just its index) — verify.
            list_most10 = df[[var_, agg_var]].groupby(var_)[agg_var].sum().sort_values(ascending=False).iloc[:10]
    else:
        pass  # unsupported agg_func: fall through and draw nothing
    if not highlight_num:
        # Derive highlight_num: categories whose share of the total exceeds 10%.
        if agg_func == 'count':
            list_highlight = df_[var_].value_counts()/total_records_count
        elif agg_func == 'sum':
            # NOTE(review): dividing a plain Python list by a scalar raises
            # TypeError — this path only works if `.to_list()` is dropped so a
            # Series is divided. Confirm before relying on auto-highlighting
            # with agg_func='sum'.
            list_highlight = list_most10.to_list()/total_records_count
        else:
            pass
        list_highlight = list_highlight[list_highlight > 0.1]
        highlight_num = len(list_highlight)
    # First `highlight_num` bars use the highlight colour, remainder are grey.
    custom_palette = [highlight_color for i in range(highlight_num)] + ['grey' for i in range(10-highlight_num)]
    if orientation_ == 'v':
        if agg_func == 'count':
            sns.countplot(data=df_, x=var_, order=list_most10, ax=ax_,
                          palette = custom_palette)
        elif agg_func == 'sum':
            sns.barplot(x=[str(i) for i in list_most10.index.to_list()], y=list_most10.to_list(), ax=ax_,
                        palette = custom_palette)
        else:
            pass
        plt.sca(ax_)
        plt.xlabel(independant_label_)
        plt.ylabel(dependant_label_)
        # Hide the count axis ticks/labels; bars carry their own value labels.
        plt.tick_params(
            axis='y',          # changes apply to the y-axis
            which='both',      # both major and minor ticks are affected
            left=False,        # ticks along the left edge are off
            right=False,       # ticks along the right edge are off
            labelleft=False)   # labels along the left edge are off
        # Wrap long category names so tick labels stay readable.
        labels = []
        for label in ax_.get_xticklabels():
            text = label.get_text()
            labels.append(textwrap.fill(text, width=10,
                                        break_long_words=False))
        ax_.set_xticklabels(labels, rotation=60)
        bar_labels=[]  # NOTE(review): assigned but never used afterwards
        for container in ax_.containers:
            for rect in container:
                # Rectangle widths are already integer-valued but are floating
                # type, so it helps to remove the trailing decimal point and 0 by
                # converting width to int type
                # Shift the text to the left side of the right edge
                yloc = 4
                # White on magenta
                clr = 'white'
                align = 'bottom'
                rotation_ = 90
                # NaN bar heights (categories absent after filtering) count as 0.
                count_ = 0 if np.isnan(rect.get_height()) else rect.get_height()
                pct_ = int(count_*100/total_records_count)
                pct_unit = f'({pct_}%) {unit_}'
                # Abbreviate the count as b/m/k by order of magnitude; bars
                # with count <= 1000 get no text at all.
                label_text = f'{count_/1000000000: .1f} b' if count_ > 1000000000 else f'{count_/1000000: .1f} m' if count_ > 1000000 else f'{count_/1000: .1f} k' if count_ > 1000 else ''
                # Append the percentage only when the bar is tall enough,
                # relative to the first bar, to fit the longer text.
                label_text = f'{label_text} {pct_unit}' if label_text and count_/container[0].get_height() > 0.055+0.023*len(pct_unit) else label_text if count_/container[0].get_height() > 0.055 else ''
                # Font size shrinks with the bar's relative height.
                size_ = 10 if count_/container[0].get_height() > 0.1 else 9 if count_/container[0].get_height() > 0.06 else 8 if count_/container[0].get_height() > 0.055 else 7
                xloc = rect.get_x() + rect.get_width() / 2
                ax_.annotate(label_text, xy=(xloc, 0), xytext=(0, yloc),
                             textcoords="offset points", size=size_,
                             ha='center', va=align, rotation=rotation_,
                             color=clr, clip_on=True)
    else:
        # Horizontal orientation: mirror of the vertical branch with x/y swapped.
        if agg_func == 'count':
            sns.countplot(data=df_, y=var_, order=list_most10, ax=ax_,
                          palette = custom_palette)
        elif agg_func == 'sum':
            sns.barplot(y=[str(i) for i in list_most10.index.to_list()], x=list_most10.to_list(), ax=ax_,
                        palette = custom_palette)
        else:
            pass
        plt.sca(ax_)
        plt.xlabel(dependant_label_)
        plt.ylabel(independant_label_)
        plt.tick_params(
            axis='x',          # changes apply to the x-axis
            which='both',      # both major and minor ticks are affected
            bottom=False,      # ticks along the bottom edge are off
            top=False,         # ticks along the top edge are off
            labelbottom=False) # labels along the bottom edge are off
        labels = []
        for label in ax_.get_yticklabels():
            text = label.get_text()
            labels.append(textwrap.fill(text, width=15,
                                        break_long_words=False))
        ax_.set_yticklabels(labels, rotation=0)
        bar_labels=[]  # NOTE(review): assigned but never used afterwards
        for container in ax_.containers:
            for rect in container:
                # Rectangle widths are already integer-valued but are floating
                # type, so it helps to remove the trailing decimal point and 0 by
                # converting width to int type
                # Shift the text to the left side of the right edge
                xloc = 2
                # White on magenta
                clr = 'white'
                align = 'left'
                # NOTE(review): unlike the vertical branch, NaN widths are not
                # guarded against here — confirm whether that can occur.
                count_ = rect.get_width()
                pct_ = int(count_*100/total_records_count)
                pct_unit = f'({pct_}%) {unit_}'
                label_text = f'{count_/1000000000: .1f} b' if count_ > 1000000000 else f'{count_/1000000: .1f} m' if count_ > 1000000 else f'{count_/1000: .1f} k' if count_ > 1000 else ''
                label_text = f'{label_text} {pct_unit}' if label_text and count_/container[0].get_width() > 0.055+0.023*len(pct_unit) else label_text if count_/container[0].get_width() > 0.055 else ''
                size_ = 10 if count_/container[0].get_width() > 0.1 else 9 if count_/container[0].get_width() > 0.06 else 8 if count_/container[0].get_width() > 0.055 else 7
                # Center the text vertically in the bar
                yloc = rect.get_y() + rect.get_height() / 2
                ax_.annotate(label_text, xy=(0, yloc), xytext=(xloc, 0),
                             textcoords="offset points", size=size_,
                             ha=align, va='center',
                             color=clr, clip_on=True)
    sns.despine(left=True, bottom=True, ax=ax_)
    plt.title(title_, size=16, color='grey')
    plt.xticks(size=12)
    plt.yticks(size=12)
    ax_.spines['left'].set_color('grey')
    ax_.spines['bottom'].set_color('grey')
    ax_.figure.set_size_inches(16,8)
    ax_.figure.subplots_adjust(top=0.81,right=0.86)
    return plt
def plot_numeric_distribution(df_, var_, main_title, xlablel_, title_, highligh_num = None, box_major=None, box_minor=None):
    """Draw a histogram over a box plot of `var_`, sharing one x-axis.

    Parameters
    ----------
    df_ : DataFrame holding the column to plot.
    var_ : numeric column name.
    main_title, xlablel_, title_, highligh_num : accepted for backward
        compatibility but currently unused by this function.
    box_major, box_minor : major/minor x-axis tick spacing passed through to
        the two plot helpers.
    """
    fig = plt.figure()
    ax1 = plt.subplot(2, 1, 1)
    # Shared x-axis keeps the histogram and box plot aligned.
    ax2 = plt.subplot(2, 1, 2, sharex=ax1)
    # BUG FIX: the original passed the global `df` to both helpers; use the
    # `df_` argument so callers can plot any frame, not just the global one.
    plot_hist_custom(ax_=ax1, df_=df_, var_=var_, title_=f'Distribution of {var_}',
                     scale_x_major=box_major, scale_x_minor=box_minor, color_='#18978F')
    plot_box_custom(ax_=ax2, df_=df_, var_=var_, xlabel_=var_,
                    scale_x_major=box_major, scale_x_minor=box_minor, color_='#18978F')
    fig.set_size_inches(8, 10)
    # hspace=0 butts the box plot directly under the histogram.
    plt.subplots_adjust(hspace=0, wspace=0.25)
    plt.show()
def plot_kde_distribution(ax_, df_, var_, summary_var_, title_, x_label_='', y_label_=''):
    """Overlay KDE curves of `summary_var_` for the ten categories of `var_`
    with the largest summed `summary_var_`.

    Returns the `matplotlib.pyplot` module so callers can chain `.show()`.
    """
    # Ten categories ranked by total of the summary variable, largest first.
    top10_totals = df_[[var_, summary_var_]].groupby(var_)[summary_var_].sum().sort_values(ascending=False).iloc[:10]
    categories = top10_totals.index.to_list()
    # One Series of raw values per top category, in ranked order.
    per_category_values = [df_.loc[df_[var_] == cat, summary_var_] for cat in categories]
    # Build a long-form frame: one row per (category, value) pair.
    long_df = pd.DataFrame(categories, columns=['Category'])
    long_df['values'] = per_category_values
    long_df = long_df.explode('values').reset_index(drop=True)
    # Reversed hue order draws the top-ranked category last (on top).
    sns.kdeplot(data=long_df, x='values', ax=ax_, fill=True, hue='Category',
                hue_order=categories[::-1], lw=4, palette='deep')
    plt.sca(ax_)
    plt.title(title_, size=16, color='grey')
    plt.xlabel(x_label_, size=12)
    plt.ylabel(y_label_, size=12)
    plt.xticks(size=12)
    plt.yticks([])
    # Legend in ranked (non-reversed) order.
    plt.legend(labels=categories, loc='upper left', fontsize=12)
    # Start the x-axis slightly below the smallest observed value.
    ax_.set_xlim(xmin=np.min([np.min(s) for s in per_category_values]) * 0.9)
    sns.despine(right=True, top=True, left=True, bottom=False, ax=ax_)
    ax_.spines['bottom'].set_color('grey')
    ax_.figure.set_size_inches(16, 8)
    ax_.figure.subplots_adjust(top=0.81, right=0.86)
    return plt
# For each numeric time/distance field: plot the raw distribution, derive the
# natural-log column, then plot the log-transformed distribution.
dist_specs = [
    ('od_time_diff_mins', 1000, 250),
    ('start_scan_to_end_scan', 1000, 250),
    ('actual_distance_to_destination', 200, 50),
    ('actual_time', 500, 100),
    ('osrm_time', 200, 50),
    ('osrm_distance', 400, 100),
    ('segment_actual_time', 200, 50),
    ('segment_osrm_time', 100, 25),
    ('segment_osrm_distance', 100, 25),
]
for col, major_tick, minor_tick in dist_specs:
    plot_numeric_distribution(df, col, 'Tittle_', 'xlablel_', 'title_',
                              box_major=major_tick, box_minor=minor_tick)
    df[f'{col}_log'] = np.log(df[col])
    plot_numeric_distribution(df, f'{col}_log', 'Tittle_', 'xlablel_', 'title_',
                              box_major=2, box_minor=0.5)
# Horizontal count plot of the train/test split of the dataset.
plt_most10_count_plot(plt.subplot(), df, '', 'data', '', 'Train Test Data Split', 'Type', 'Count', unit_='', highlight_num=1, highlight_color='#DC3535', orientation_='h', agg_func='Count', agg_var=None).show()
Comparison & Visualization of time and distance fields
# Side-by-side scatter plots for each (time, distance) pair:
# raw values on the left, log-log on the right.
scatter_pairs = [
    ('actual_time', 'actual_distance_to_destination'),
    ('osrm_time', 'osrm_distance'),
    ('segment_osrm_time', 'segment_osrm_distance'),
]
for time_col, dist_col in scatter_pairs:
    fig, ax = plt.subplots(1, 2, figsize=(10, 7))
    sns.scatterplot(data=df, x=time_col, y=dist_col, ax=ax[0])
    sns.scatterplot(data=df, x=f'{time_col}_log', y=f'{dist_col}_log', ax=ax[1])
    plt.show()
Assume Significance Level ($\alpha$) as 0.05.
All the numeric features visually appear to be log normally distributed. We can assume, above features are log normally distributed.
Log normal scatter plots look to have equal variances.
Test 1
H0 = Mean of od_time_diff_mins and start_scan_to_end_scan are not significantly different
Ha = Mean of od_time_diff_mins and start_scan_to_end_scan are significantly different
# Stack both log-transformed samples into one long-form frame for a joint KDE.
col_a, col_b = 'od_time_diff_mins_log', 'start_scan_to_end_scan_log'
df_dist = pd.DataFrame({
    'Type': np.repeat([col_a, col_b], [len(df[col_a]), len(df[col_b])]),
    'Values': pd.concat((df[col_a], df[col_b]))
})
plot_kde_distribution(plt.subplot(), df_dist, 'Type', 'Values', 'Season wise Count', 'count', 'Density').show()
Visually, both distributions look identical.
# Levene's test: H0 = the two log-transformed samples have equal variances.
stats.levene(df['od_time_diff_mins_log'], df['start_scan_to_end_scan_log'])
LeveneResult(statistic=0.03117851384397478, pvalue=0.859844178127743)
Since p_value > significance level; we can assume equal variances
# Independent two-sample t-test on the log samples (equal variances assumed,
# consistent with the Levene result above).
stats.ttest_ind(df['od_time_diff_mins_log'], df['start_scan_to_end_scan_log'])
Ttest_indResult(statistic=0.282703024175061, pvalue=0.777406892326873)
Since p_value > significance level, we fail to reject H0; the means of od_time_diff_mins and start_scan_to_end_scan are not significantly different.
Test 2
H0 = Mean of actual_time and osrm_time are not significantly different
Ha = Mean of actual_time and osrm_time are significantly different
# Stack both log-transformed samples into one long-form frame for a joint KDE.
col_a, col_b = 'actual_time_log', 'osrm_time_log'
df_dist = pd.DataFrame({
    'Type': np.repeat([col_a, col_b], [len(df[col_a]), len(df[col_b])]),
    'Values': pd.concat((df[col_a], df[col_b]))
})
plot_kde_distribution(plt.subplot(), df_dist, 'Type', 'Values', 'Season wise Count', 'count', 'Density').show()
Visually, the two distributions look different.
# Levene's test: H0 = the two log-transformed samples have equal variances.
stats.levene(df['actual_time_log'], df['osrm_time_log'])
LeveneResult(statistic=58.45681797951643, pvalue=2.154389317124431e-14)
Since p_value < significance level; we can assume variances are significantly different
We chose to perform non-parametric test
# Two-sample Kolmogorov-Smirnov test (non-parametric, since variances differ).
# NOTE(review): this uses the raw columns, while Tests 4 and 5 use the log
# columns — confirm the inconsistency is intentional.
stats.kstest(df['actual_time'], df['osrm_time'])
KstestResult(statistic=0.27760045830264346, pvalue=0.0)
Since p_value < significance level; mean of actual_time and osrm_time are significantly different.
Test 3
H0 = Mean of actual_time and segment_actual_time are not significantly different
Ha = Mean of actual_time and segment_actual_time are significantly different
# Stack both log-transformed samples into one long-form frame for a joint KDE.
col_a, col_b = 'actual_time_log', 'segment_actual_time_log'
df_dist = pd.DataFrame({
    'Type': np.repeat([col_a, col_b], [len(df[col_a]), len(df[col_b])]),
    'Values': pd.concat((df[col_a], df[col_b]))
})
plot_kde_distribution(plt.subplot(), df_dist, 'Type', 'Values', 'Season wise Count', 'count', 'Density').show()
Visually, the two distributions look different.
# Levene's test: H0 = the two log-transformed samples have equal variances.
stats.levene(df['actual_time_log'], df['segment_actual_time_log'])
LeveneResult(statistic=339.61604946884137, pvalue=2.507577595431446e-75)
Since p_value < significance level; we can assume variances are significantly different
We chose to perform non-parametric test
# Two-sample Kolmogorov-Smirnov test (non-parametric, since variances differ).
# NOTE(review): raw columns here vs. log columns in Tests 4/5 — confirm.
stats.kstest(df['actual_time'], df['segment_actual_time'])
KstestResult(statistic=0.29887879531876593, pvalue=0.0)
Since p_value < significance level; mean of actual_time and segment_actual_time are significantly different
Test 4
H0 = Mean of osrm_distance and segment_osrm_distance are not significantly different
Ha = Mean of osrm_distance and segment_osrm_distance are significantly different
# Stack both log-transformed samples into one long-form frame for a joint KDE.
col_a, col_b = 'osrm_distance_log', 'segment_osrm_distance_log'
df_dist = pd.DataFrame({
    'Type': np.repeat([col_a, col_b], [len(df[col_a]), len(df[col_b])]),
    'Values': pd.concat((df[col_a], df[col_b]))
})
plot_kde_distribution(plt.subplot(), df_dist, 'Type', 'Values', 'Season wise Count', 'count', 'Density').show()
Visually, the two distributions look different.
# Levene's test: H0 = the two log-transformed samples have equal variances.
stats.levene(df['osrm_distance_log'], df['segment_osrm_distance_log'])
LeveneResult(statistic=521.0798471630106, pvalue=3.852435304124549e-114)
Since p_value < significance level; we can assume variances are significantly different
We chose to perform non-parametric test
# Two-sample Kolmogorov-Smirnov test on the log columns (non-parametric,
# since the Levene test above rejected equal variances).
stats.kstest(df['osrm_distance_log'], df['segment_osrm_distance_log'])
KstestResult(statistic=0.31598330468941815, pvalue=0.0)
Since p_value < significance level; mean of osrm_distance and segment_osrm_distance are significantly different
Test 5
H0 = Mean of osrm_time and segment_osrm_time are not significantly different
Ha = Mean of osrm_time and segment_osrm_time are significantly different
# Stack both log-transformed samples into one long-form frame for a joint KDE.
col_a, col_b = 'osrm_time_log', 'segment_osrm_time_log'
df_dist = pd.DataFrame({
    'Type': np.repeat([col_a, col_b], [len(df[col_a]), len(df[col_b])]),
    'Values': pd.concat((df[col_a], df[col_b]))
})
plot_kde_distribution(plt.subplot(), df_dist, 'Type', 'Values', 'Season wise Count', 'count', 'Density').show()
Visually, the two distributions look different.
# Levene's test: H0 = the two log-transformed samples have equal variances.
stats.levene(df['osrm_time_log'], df['segment_osrm_time_log'])
LeveneResult(statistic=517.0434072360317, pvalue=2.790314199321843e-113)
Since p_value < significance level; we can assume variances are significantly different
We chose to perform non-parametric test
# Two-sample Kolmogorov-Smirnov test on the log columns (non-parametric,
# since the Levene test above rejected equal variances).
stats.kstest(df['osrm_time_log'], df['segment_osrm_time_log'])
KstestResult(statistic=0.28954906293477367, pvalue=0.0)
Since p_value < significance level; mean of osrm_time and segment_osrm_time are significantly different
Business Insights
# Top-10 category count plots for business insights.
plt_most10_count_plot(plt.subplot(), df, '', 'source_state', '', 'Source State wise Count', 'Source State', 'Count', unit_='', highlight_num=None, highlight_color='#DC3535', orientation_='h', agg_func='Count', agg_var=None).show()
plt_most10_count_plot(plt.subplot(), df, '', 'route_type_Carting', '', 'Route Type wise Count', 'Route Type', 'Count', unit_='', highlight_num=1, highlight_color='#DC3535', orientation_='h', agg_func='Count', agg_var=None).show()
# Corridor = "source city_destination city".
# PERF FIX: vectorized Series concatenation replaces the row-wise
# apply/lambda, avoiding one Python-level call per row; the result is
# identical for string columns.
df['corridors'] = df['source_city'] + "_" + df['destination_city']
plt_most10_count_plot(plt.subplot(), df, '', 'corridors', '', 'Corridors wise Order', 'Corridor', '', unit_='', highlight_num=5, highlight_color='#DC3535', orientation_='h', agg_func='Count', agg_var=None).show()
# Frequency of each corridor (source city -> destination city), descending.
df['corridors'].value_counts()
Mumbai_Mumbai 600
Bengaluru_Bengaluru 549
Bangalore_Bengaluru 455
Bhiwandi_Mumbai 437
Hyderabad_Hyderabad 371
...
Bhadohi_Gopiganj 1
Renukoot_Robertsganj 1
Madnapalle_Palamaner 1
Kundapura_Goa 1
Hospet (Karnataka)_Bellary 1
Name: corridors, Length: 1399, dtype: int64
# Mean trip duration (minutes) per corridor, slowest corridors first.
df1 = df.groupby('corridors')[['od_time_diff_mins']].mean()
df1['od_time_diff_mins'].sort_values(ascending=False)
corridors
Tumkur_Tumkur 6097.128248
Balasore_Kolkata 6013.796553
Bangana_Chandigarh 5965.512898
Allahabad_Sidhi 5927.420268
Sonipat_Moradabad 5586.723676
...
Varanasi_Varanasi (Uttar Pradesh) 52.177473
Unjha_Patan 50.186318
Jagtial_DhrmpuriTS 48.030439
Sankari_Erode (Tamil Nadu) 43.586897
Manthani_Ramagundam 39.391247
Name: od_time_diff_mins, Length: 1399, dtype: float64
# Mean actual distance to destination per corridor, longest corridors first.
df1 = df.groupby('corridors')[['actual_distance_to_destination']].mean()
df1['actual_distance_to_destination'].sort_values(ascending=False)
corridors
Bangalore_MAA 1077.395161
MAA_Bangalore 1046.609597
Nashik_Nashik 1037.756251
Shikohabad_Delhi 1010.805422
Gurgaon_Jaipur 994.115384
...
Jabalpur (Madhya Pradesh)_Jabalpur 9.237405
Hyderabad_Hyd 9.126716
Delhi_North 9.045083
Salem_Salem 9.040986
Varanasi_Varanasi (Uttar Pradesh) 9.027513
Name: actual_distance_to_destination, Length: 1399, dtype: float64
# Horizontal count plot of trips per creation month (top 10 months).
plt_most10_count_plot(plt.subplot(), df, '', 'trip_creation_month', '', 'Trip per Month', 'Month', 'Count', unit_='', highlight_num=1, highlight_color='#DC3535', orientation_='h', agg_func='Count', agg_var=None).show()