# !pip install matplotlib --upgrade
Note:
Upgrade the matplotlib version from 3.2.2 to 3.5.3
Use command
!pip install matplotlib --upgrade
About the Business
This business case is about India's largest B2B & C2C Logistic Courier Service Provider in India. It is the largest and fastest-growing fully integrated player in India by revenue in Fiscal 2021. They aim to build the operating system for commerce, through a combination of world-class infrastructure, logistics operations of the highest quality, and cutting-edge engineering and technology capabilities.
The Data team builds intelligence and capabilities using this data that helps them to widen the gap between the quality, efficiency, and profitability of their business versus their competitors.
Concept Used
The company wants to understand and process the data coming out of data engineering pipelines:
We will carry out the following steps (Concept Used):
Importing Required Libraries
# !pip install matplotlib==3.5.3
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
from matplotlib.ticker import (MultipleLocator, AutoMinorLocator)
import seaborn as sns
import textwrap
import math
import re
from sklearn.preprocessing import MinMaxScaler
from scipy import stats
# Load the raw courier/trip dataset (Google Colab path) into a DataFrame.
df = pd.read_csv('/content/Courier_data.csv')
# Show every column when displaying wide DataFrames in the notebook.
pd.set_option('display.max_columns', None)
# Peek at the first four rows to sanity-check the load.
df.head(4)
| data | trip_creation_time | route_schedule_uuid | route_type | trip_uuid | source_center | source_name | destination_center | destination_name | od_start_time | od_end_time | start_scan_to_end_scan | is_cutoff | cutoff_factor | cutoff_timestamp | actual_distance_to_destination | actual_time | osrm_time | osrm_distance | factor | segment_actual_time | segment_osrm_time | segment_osrm_distance | segment_factor | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | training | 2018-09-20 02:35:36.476840 | thanos::sroute:eb7bfc78-b351-4c0e-a951-fa3d5c3... | Carting | trip-153741093647649320 | IND388121AAA | Anand_VUNagar_DC (Gujarat) | IND388620AAB | Khambhat_MotvdDPP_D (Gujarat) | 2018-09-20 03:21:32.418600 | 2018-09-20 04:47:45.236797 | 86.0 | True | 9.0 | 2018-09-20 04:27:55 | 10.435660 | 14.0 | 11.0 | 11.9653 | 1.272727 | 14.0 | 11.0 | 11.9653 | 1.272727 |
| 1 | training | 2018-09-20 02:35:36.476840 | thanos::sroute:eb7bfc78-b351-4c0e-a951-fa3d5c3... | Carting | trip-153741093647649320 | IND388121AAA | Anand_VUNagar_DC (Gujarat) | IND388620AAB | Khambhat_MotvdDPP_D (Gujarat) | 2018-09-20 03:21:32.418600 | 2018-09-20 04:47:45.236797 | 86.0 | True | 18.0 | 2018-09-20 04:17:55 | 18.936842 | 24.0 | 20.0 | 21.7243 | 1.200000 | 10.0 | 9.0 | 9.7590 | 1.111111 |
| 2 | training | 2018-09-20 02:35:36.476840 | thanos::sroute:eb7bfc78-b351-4c0e-a951-fa3d5c3... | Carting | trip-153741093647649320 | IND388121AAA | Anand_VUNagar_DC (Gujarat) | IND388620AAB | Khambhat_MotvdDPP_D (Gujarat) | 2018-09-20 03:21:32.418600 | 2018-09-20 04:47:45.236797 | 86.0 | True | 27.0 | 2018-09-20 04:01:19.505586 | 27.637279 | 40.0 | 28.0 | 32.5395 | 1.428571 | 16.0 | 7.0 | 10.8152 | 2.285714 |
| 3 | training | 2018-09-20 02:35:36.476840 | thanos::sroute:eb7bfc78-b351-4c0e-a951-fa3d5c3... | Carting | trip-153741093647649320 | IND388121AAA | Anand_VUNagar_DC (Gujarat) | IND388620AAB | Khambhat_MotvdDPP_D (Gujarat) | 2018-09-20 03:21:32.418600 | 2018-09-20 04:47:45.236797 | 86.0 | True | 36.0 | 2018-09-20 03:39:57 | 36.118028 | 62.0 | 40.0 | 45.5620 | 1.550000 | 21.0 | 12.0 | 13.0224 | 1.750000 |
Column Profiling
Custom function to get a Descriptive Summary of the Dataset
def get_df_summary(df_, print_summary=True, properties_as_columns=True):
    """Build a descriptive summary of ``df_`` as a DataFrame.

    For every column the summary contains: dtype, missing-value count,
    number of uniques, the top/bottom 10 unique values with their share,
    and -- for numeric columns -- the describe() statistics plus the IQR
    (Tukey) and 3-sigma whisker bounds and outlier counts.

    Parameters
    ----------
    df_ : pd.DataFrame
        Frame to profile.
    print_summary : bool
        If True, also print the summary frame.
    properties_as_columns : bool
        If True, one row per source column with the properties as columns;
        otherwise the frame is transposed.

    Returns
    -------
    pd.DataFrame
        The summary; all cell values are stringified.
    """
    # Shape and memory usage (mirrors the DataFrame.info() header).
    print(f'RangeIndex: {df_.shape[0]} entries; Data columns (total {df_.shape[1]} columns)')
    memory_used = df_.memory_usage().sum()/1024  # KB
    if memory_used > 1024*1024:
        memory_used = f'{round(memory_used/1024/1024, 1)}+ GB'
    elif memory_used > 1024:
        memory_used = f'{round(memory_used/1024, 1)}+ MB'
    else:
        memory_used = f'{round(memory_used, 1)}+ KB'
    print(f'memory usage: {memory_used}\n')

    columns = df_.columns.to_list()

    def _property_row(name, values):
        # One property per row; the property label lives in the column named None.
        return pd.DataFrame([[name] + values], columns=[None] + columns)

    def _fmt_value_counts(clm):
        # 'value (pct%)' strings ordered by descending frequency.
        vc = df_[clm].value_counts()
        total = vc.sum()
        return [f'{val} ({math.floor(cnt * 100 / total)}%)' for val, cnt in vc.items()]

    # Per-column properties. DataFrame.append() was removed in pandas 2.0 and
    # value_counts().reset_index() column names changed, so the rows are built
    # directly and concatenated once instead.
    df2 = pd.concat([
        _property_row('dtype', [df_[clm].dtype for clm in columns]),
        _property_row('Missing Counts', [df_[clm].isna().sum() for clm in columns]),
        _property_row('nUniques', [df_[clm].nunique() for clm in columns]),
        _property_row('Top 10 Unique Values',
                      [', '.join(_fmt_value_counts(clm)[:10]) for clm in columns]),
        _property_row('Bottom 10 Unique Values',
                      [', '.join(_fmt_value_counts(clm)[-1:-11:-1]) for clm in columns]),
    ], ignore_index=True)

    # Numeric statistics; non-numeric columns become all-NaN via reindex.
    df4 = df_.describe().reindex(columns=columns).drop('count').rename({
        '25%': 'Q1',
        '50%': 'Median',
        '75%': 'Q3'
    }).round(1)
    df4 = df4.T
    # Tukey whiskers, clamped to the observed min/max ...
    df4['LW (1.5)'] = df4.apply(lambda x: max(x['min'], x['Q1'] - 1.5*(x['Q3']-x['Q1'])), axis=1)
    df4['UW (1.5)'] = df4.apply(lambda x: min(x['max'], x['Q3'] + 1.5*(x['Q3']-x['Q1'])), axis=1)
    # ... and the 3-sigma band, clamped the same way.
    df4['mean-3*std'] = df4.apply(lambda x: max(x['min'], x['mean'] - 3*x['std']), axis=1)
    df4['mean+3*std'] = df4.apply(lambda x: min(x['max'], x['mean'] + 3*x['std']), axis=1)
    # Outlier counts per numeric column, formatted as 'count (pct%)'.
    lst_IQR_Outlier = []
    lst_std_Outlier = []
    numeric_cols = df_.describe().columns
    for clm in df4.index:
        if clm in numeric_cols:
            iqr_outlier_count = df_.loc[(df_[clm] < df4.loc[clm, 'LW (1.5)']) | (df_[clm] > df4.loc[clm, 'UW (1.5)'])].shape[0]
            iqr_outlier_pct = f'({round(iqr_outlier_count * 100.0 / len(df_), 1)}%)' if iqr_outlier_count != 0 else ''
            std_outlier_count = df_.loc[(df_[clm] < df4.loc[clm, 'mean-3*std']) | (df_[clm] > df4.loc[clm, 'mean+3*std'])].shape[0]
            std_outlier_pct = f'({round(std_outlier_count * 100.0 / len(df_), 1)}%)' if std_outlier_count != 0 else ''
            lst_IQR_Outlier.append(f'{iqr_outlier_count} {iqr_outlier_pct}')
            lst_std_Outlier.append(f'{std_outlier_count} {std_outlier_pct}')
        else:
            lst_IQR_Outlier.append(np.nan)
            lst_std_Outlier.append(np.nan)
    df4['Outlier Count (1.5*IQR)'] = lst_IQR_Outlier
    df4['Outlier Count (3*std)'] = lst_std_Outlier
    # Back to properties-as-rows and merge with the per-column properties.
    df4 = df4.round(1).T.reset_index().rename({'index': None}, axis=1)
    df2 = pd.concat([df2, df4])
    # Source columns become rows, sorted by dtype name (descending).
    df2 = df2.set_index(None).T.astype(str).sort_values('dtype', ascending=False)
    df2 = df2[['dtype', 'Missing Counts', 'nUniques',
               'Top 10 Unique Values', 'Bottom 10 Unique Values', 'min', 'max',
               'LW (1.5)', 'Q1', 'Median', 'Q3', 'UW (1.5)', 'Outlier Count (1.5*IQR)',
               'mean-3*std', 'mean', 'std', 'mean+3*std', 'Outlier Count (3*std)']]
    if not properties_as_columns:
        df2 = df2.T
    if print_summary:
        print(df2)
    return df2
Descriptive Summary
# Profile the raw data once and keep the result: the IQR whisker values
# ('LW (1.5)' / 'UW (1.5)' rows) are reused later for outlier filtering.
df_summary = get_df_summary(df, print_summary=False, properties_as_columns=False)
df_summary
RangeIndex: 144867 entries; Data columns (total 24 columns) memory usage: 25.6+ MB
| data | destination_center | cutoff_timestamp | trip_creation_time | od_end_time | destination_name | od_start_time | source_name | source_center | trip_uuid | route_type | route_schedule_uuid | cutoff_factor | osrm_distance | segment_osrm_distance | segment_osrm_time | segment_actual_time | factor | segment_factor | osrm_time | actual_time | actual_distance_to_destination | start_scan_to_end_scan | is_cutoff | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| dtype | object | object | object | object | object | object | object | object | object | object | object | object | int64 | float64 | float64 | float64 | float64 | float64 | float64 | float64 | float64 | float64 | float64 | bool |
| Missing Counts | 0 | 0 | 0 | 0 | 0 | 261 | 0 | 293 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| nUniques | 2 | 1481 | 93180 | 14817 | 26369 | 1468 | 26369 | 1498 | 1508 | 14817 | 2 | 1504 | 501 | 138046 | 113799 | 214 | 747 | 45641 | 5675 | 1531 | 3182 | 144515 | 1915 | 2 |
| Top 10 Unique Values | training (72%), test (27%) | IND000000ACB (10%), IND562132AAA (7%), IND4213... | 2018-09-24 05:19:20 (0%), 2018-09-24 05:19:21 ... | 2018-09-28 05:23:15.359220 (0%), 2018-10-02 06... | 2018-09-24 09:59:15.691618 (0%), 2018-10-05 11... | Gurgaon_Bilaspur_HB (Haryana) (10%), Bangalore... | 2018-09-21 18:37:09.322207 (0%), 2018-10-03 04... | Gurgaon_Bilaspur_HB (Haryana) (16%), Bangalore... | IND000000ACB (16%), IND562132AAA (6%), IND4213... | trip-153811219535896559 (0%), trip-15384603530... | FTL (68%), Carting (31%) | thanos::sroute:4029a8a2-6c74-4b7e-a6d8-f9e069f... | 22.0 (9%), 9.0 (8%), 44.0 (5%), 18.0 (5%), 66.... | 48.0394 (0%), 11.23 (0%), 11.3233 (0%), 12.541... | 0.0 (1%), 25.6081 (0%), 22.6267 (0%), 58.2708 ... | 16.0 (7%), 17.0 (7%), 18.0 (6%), 19.0 (4%), 15... | 24.0 (4%), 26.0 (3%), 30.0 (3%), 27.0 (3%), 23... | 2.0 (1%), 1.5 (0%), 1.6666666666666667 (0%), 1... | 2.0 (4%), 1.5 (3%), 1.0 (1%), 1.66666666666666... | 21.0 (1%), 20.0 (1%), 18.0 (1%), 22.0 (1%), 17... | 32.0 (0%), 36.0 (0%), 30.0 (0%), 38.0 (0%), 42... | 100.28289202009655 (0%), 19.122553143991365 (0... | 110.0 (0%), 72.0 (0%), 99.0 (0%), 95.0 (0%), 8... | True (81%), False (18%) |
| Bottom 10 Unique Values | test (27%), training (72%) | IND504215AAA (0%), IND110046AAA (0%), IND40070... | 2018-09-20 16:24:28.436231 (0%), 2018-09-26 05... | 2018-09-14 17:04:32.989471 (0%), 2018-09-19 04... | 2018-09-27 15:47:50.386008 (0%), 2018-09-20 04... | Basta_Central_DPP_1 (Orissa) (0%), Mumbai_Sanp... | 2018-09-27 02:59:59.877566 (0%), 2018-09-20 02... | Faridabad_Old (Haryana) (0%), Krishnanagar_Ana... | IND741101AAB (0%), IND222001AAA (0%), IND24200... | trip-153694467298919626 (0%), trip-15373317447... | Carting (31%), FTL (68%) | thanos::sroute:404cbabf-d2a5-4e46-bf79-8b3c518... | 275.0 (0%), 412.0 (0%), 1149.0 (0%), 734.0 (0%... | 88.7319 (0%), 307.6147 (0%), 28.7254 (0%), 55.... | 8.8088 (0%), 10.8209 (0%), 8.1647 (0%), 0.7952... | 1032.0 (0%), 156.0 (0%), 204.0 (0%), 163.0 (0%... | 1104.0 (0%), 1847.0 (0%), 540.0 (0%), 517.0 (0... | 4.484210526315789 (0%), 1.9186746987951808 (0%... | 29.77777777777778 (0%), 2.6144578313253013 (0%... | 1312.0 (0%), 1250.0 (0%), 1130.0 (0%), 1614.0 ... | 2980.0 (0%), 2870.0 (0%), 2910.0 (0%), 2608.0 ... | 70.03901016351531 (0%), 9.162212958326531 (0%)... | 2706.0 (0%), 3361.0 (0%), 905.0 (0%), 22.0 (0%... | False (18%), True (81%) |
| min | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | 9.0 | 9.0 | 0.0 | 0.0 | -244.0 | 0.1 | -23.4 | 6.0 | 9.0 | 9.0 | 20.0 | nan |
| max | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | 1927.0 | 2326.2 | 2191.4 | 1611.0 | 3051.0 | 77.4 | 574.2 | 1686.0 | 4532.0 | 1927.4 | 7898.0 | nan |
| LW (1.5) | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | 9.0 | 9.0 | 0.0 | 0.0 | -10.0 | 0.7 | -0.1 | 6.0 | 9.0 | 9.0 | 20.0 | nan |
| Q1 | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | 22.0 | 29.9 | 12.1 | 11.0 | 20.0 | 1.6 | 1.3 | 27.0 | 51.0 | 23.4 | 161.0 | nan |
| Median | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | 66.0 | 78.5 | 23.5 | 17.0 | 29.0 | 1.9 | 1.7 | 64.0 | 132.0 | 66.1 | 449.0 | nan |
| Q3 | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | 286.0 | 343.2 | 27.8 | 22.0 | 40.0 | 2.2 | 2.2 | 257.0 | 513.0 | 286.7 | 1634.0 | nan |
| UW (1.5) | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | 682.0 | 813.2 | 51.4 | 38.5 | 70.0 | 3.1 | 3.6 | 602.0 | 1206.0 | 681.7 | 3843.5 | nan |
| Outlier Count (1.5*IQR) | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | 17246 (11.9%) | 17811 (12.3%) | 4328 (3.0%) | 6378 (4.4%) | 9298 (6.4%) | 11769 (8.1%) | 14416 (10.0%) | 17603 (12.2%) | 16633 (11.5%) | 17992 (12.4%) | 373 (0.3%) | nan |
| mean-3*std | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | 9.0 | 9.0 | 0.0 | 0.0 | -124.6 | 0.1 | -12.2 | 6.0 | 9.0 | 9.0 | 20.0 | nan |
| mean | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | 232.9 | 284.8 | 22.8 | 18.5 | 36.2 | 2.1 | 2.2 | 213.9 | 416.9 | 234.1 | 961.3 | nan |
| std | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | 344.8 | 421.1 | 17.9 | 14.8 | 53.6 | 1.7 | 4.8 | 308.0 | 598.1 | 345.0 | 1037.0 | nan |
| mean+3*std | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | 1267.3 | 1548.1 | 76.5 | 62.9 | 197.0 | 7.2 | 16.6 | 1137.9 | 2211.2 | 1269.1 | 4072.3 | nan |
| Outlier Count (3*std) | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | 3428 (2.4%) | 3611 (2.5%) | 1504 (1.0%) | 2113 (1.5%) | 1201 (0.8%) | 1348 (0.9%) | 806 (0.6%) | 3595 (2.5%) | 3526 (2.4%) | 3429 (2.4%) | 350 (0.2%) | nan |
Drop Unknown Fields
df = df.drop(['is_cutoff', 'cutoff_factor', 'cutoff_timestamp', 'factor', 'segment_factor'], axis=1)
Fill missing values in "source_name" and "destination_name" fields
# Replace missing source/destination names with an 'Unknown' sentinel.
# Assigning the result back (instead of fillna(..., inplace=True) on the
# column selection) avoids chained assignment: the inplace form raises a
# FutureWarning on pandas 2.x and is a silent no-op under copy-on-write.
df['source_name'] = df['source_name'].fillna('Unknown')
df['destination_name'] = df['destination_name'].fillna('Unknown')
Create OD Time Diff field and drop OD Start and End Time Fields
# Parse the OD start/end timestamps; unparseable values become NaT.
for _col in ('od_start_time', 'od_end_time'):
    df[_col] = pd.to_datetime(df[_col], errors='coerce')
# OD duration in minutes, then discard the raw timestamp columns.
df['od_time_diff_mins'] = (df['od_end_time'] - df['od_start_time']).dt.total_seconds()/60
df = df.drop(['od_start_time', 'od_end_time'], axis=1)
Group the rows for each 'trip_uuid' using aggregation
# Aggregation plan for collapsing segment-level rows into one row per trip:
# trip-level identifiers/attributes take the first segment's value, the
# destination takes the last segment's value, and all time/distance measures
# are summed across segments.
create_trip_dict = {
    'data' : 'first',
    'trip_creation_time' : 'first',
    'route_schedule_uuid': 'first',
    'route_type': 'first',
    'trip_uuid': 'first',
    'source_center': 'first',
    'source_name': 'first',
    'destination_center': 'last',
    'destination_name': 'last',
    'od_time_diff_mins' : 'sum',
    'start_scan_to_end_scan': 'sum',
    'actual_distance_to_destination': 'sum',
    'actual_time': 'sum',
    'osrm_time': 'sum',
    'osrm_distance': 'sum',
    'segment_actual_time': 'sum',
    'segment_osrm_time': 'sum',
    'segment_osrm_distance': 'sum'
}
# Collapse to trip level; 'trip_uuid' survives via its 'first' aggregation
# above, so the groupby index itself can be discarded.
df = df.groupby('trip_uuid').agg(create_trip_dict).reset_index(drop=True)
Split the Source and Destination name to create separate features for City, Place and State
Drop Source Name and Destination Name fields
def city_place_state_separator(text):
    """Split a location name like 'Anand_VUNagar_DC (Gujarat)' into a
    Series of (city, place, state).

    Falls back to returning the raw text for all three fields when the
    pattern does not match (e.g. the 'Unknown' placeholder).
    """
    # Raw string: '\s' and '\(' in a plain string are invalid escape
    # sequences (a SyntaxWarning on Python 3.12+).
    # NOTE(review): the class [_|\s] also matches a literal '|'; kept as-is
    # to preserve behavior — confirm whether '|' was intended.
    result = re.search(r'(^[A-Za-z]+)[_|\s](.+)(\(.+\))', text)
    if result:
        city = result.group(1)
        place = result.group(2).strip().replace('_', ' ')
        state = result.group(3)[1:-1]  # drop the surrounding parentheses
        return pd.Series([city, place, state])
    # No match: propagate the original text into every field.
    return pd.Series([text, text, text])
# Derive city/place/state columns from the combined name fields, then drop the originals.
df[['source_city', 'source_place', 'source_state']] = df['source_name'].apply(city_place_state_separator)
df[['destination_city', 'destination_place', 'destination_state']] = df['destination_name'].apply(city_place_state_separator)
df = df.drop(['source_name', 'destination_name'], axis=1)
Extract Trip Creation month, year and day from Trip Creation Time field
Drop the Trip_creation_time feature
def timestapm_to_year_month_day(ts):
    """Return a Series of (year, month, day) parsed from timestamp ``ts``.

    (The 'timestapm' typo in the name is kept so existing callers keep working.)
    """
    # Parse once instead of three separate pd.to_datetime() calls.
    dt = pd.to_datetime(ts)
    return pd.Series([dt.year, dt.month, dt.day])
# Expand trip_creation_time into year/month/day features, then drop the raw timestamp.
df[['trip_creation_year', 'trip_creation_month', 'trip_creation_day']] = df['trip_creation_time'].apply(timestapm_to_year_month_day)
df = df.drop('trip_creation_time', axis=1)
Outlier Treatment:
Since there is a huge difference between the median and the mean, and as observed in the dataframe summary, there are extreme outliers
We will remove the outliers using IQR method
# Remove IQR outliers: keep only rows strictly inside the (LW, UW) Tukey
# whiskers taken from df_summary. One loop replaces nine copy-pasted filter
# statements; the column order matches the original so the result is identical.
# NOTE(review): df_summary was computed on the raw segment-level data before
# aggregation (and before od_time_diff_mins existed) — confirm these bounds
# are the intended ones for the trip-level frame.
for _clm in ['segment_osrm_distance', 'segment_osrm_time', 'osrm_distance',
             'osrm_time', 'actual_time', 'actual_distance_to_destination',
             'start_scan_to_end_scan', 'od_time_diff_mins', 'segment_actual_time']:
    _lower = float(df_summary.loc['LW (1.5)', _clm])
    _upper = float(df_summary.loc['UW (1.5)', _clm])
    df = df[(df[_clm] > _lower) & (df[_clm] < _upper)]
Normalize the numerical features
# Min-max scale the numeric features into [0, 1].
# NOTE(review): the scaled frame is stored only in df_num_cols and is never
# written back into df, so downstream steps keep the unscaled values. Also,
# MinMaxScaler returns a fresh 0..n-1 index that would misalign with df's
# filtered index if assigned back directly — confirm whether this is intended.
df_num_cols = df[['segment_osrm_distance', 'segment_osrm_time', 'osrm_distance', 'osrm_time', 'actual_time', 'actual_distance_to_destination', 'start_scan_to_end_scan', 'od_time_diff_mins', 'segment_actual_time']]
df_num_cols = pd.DataFrame(MinMaxScaler().fit_transform(df_num_cols), columns = df_num_cols.columns)
Perform one-hot encoding of categorical variable "route_type"
# One-hot encode route_type into route_type_Carting / route_type_FTL columns,
# then drop the original categorical column.
route_dummies = pd.get_dummies(df['route_type'], prefix='route_type')
df = pd.concat((df, route_dummies), axis=1)
df = df.drop('route_type', axis=1)
Convert the 'Data' feature to categorical data type
df['data'] = df['data'].astype('category')
Summary of dataframe after the preprocessing operations
get_df_summary(df, print_summary=False, properties_as_columns=False)
RangeIndex: 12219 entries; Data columns (total 25 columns) memory usage: 2.2+ MB
| route_type_FTL | route_type_Carting | source_place | trip_uuid | source_center | destination_center | destination_state | destination_place | destination_city | route_schedule_uuid | source_state | source_city | trip_creation_day | trip_creation_month | trip_creation_year | segment_osrm_time | segment_osrm_distance | segment_actual_time | osrm_distance | osrm_time | actual_time | actual_distance_to_destination | start_scan_to_end_scan | od_time_diff_mins | data | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| dtype | uint8 | uint8 | object | object | object | object | object | object | object | object | object | object | int64 | int64 | int64 | float64 | float64 | float64 | float64 | float64 | float64 | float64 | float64 | float64 | category |
| Missing Counts | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| nUniques | 2 | 2 | 684 | 12219 | 823 | 928 | 59 | 784 | 746 | 1316 | 52 | 640 | 22 | 2 | 1 | 373 | 12156 | 715 | 12193 | 932 | 1687 | 12208 | 3062 | 12219 | 2 |
| Top 10 Unique Values | 0.0 (71%), 1.0 (28%) | 1.0 (71%), 0.0 (28%) | Bilaspur HB (5%), Mankoli HB (5%), Hub (4%), N... | trip-153671042288605164 (0%), trip-15379207477... | IND000000ACB (5%), IND421302AAG (5%), IND56213... | IND000000ACB (4%), IND421302AAG (3%), IND56213... | Maharashtra (18%), Karnataka (16%), Haryana (1... | Bilaspur HB (4%), Hub (3%), Mankoli HB (3%), N... | Mumbai (9%), Bengaluru (8%), Gurgaon (4%), Del... | thanos::sroute:a16bfa03-3462-4bce-9c82-5784c7d... | Maharashtra (18%), Karnataka (16%), Haryana (1... | Bengaluru (8%), Mumbai (7%), Gurgaon (5%), Bhi... | 18.0 (5%), 15.0 (5%), 13.0 (5%), 12.0 (4%), 14... | 9.0 (87%), 10.0 (12%) | 2018.0 (100%) | 17.0 (1%), 20.0 (1%), 19.0 (1%), 18.0 (1%), 22... | 27.599800000000002 (0%), 54.7183 (0%), 31.6824... | 47.0 (0%), 41.0 (0%), 60.0 (0%), 35.0 (0%), 55... | 76.5141 (0%), 20.8553 (0%), 33.3202 (0%), 25.4... | 26.0 (1%), 30.0 (1%), 24.0 (1%), 27.0 (1%), 28... | 120.0 (0%), 68.0 (0%), 66.0 (0%), 78.0 (0%), 5... | 52.93246696402012 (0%), 27.080239836318736 (0%... | 396.0 (0%), 444.0 (0%), 300.0 (0%), 270.0 (0%)... | 913.1740793 (0%), 2420.9706173833333 (0%), 252... | training (71%), test (28%) |
| min | 0.0 | 0.0 | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | 1.0 | 9.0 | 2018.0 | 7.0 | 9.1 | 10.0 | 9.3 | 7.0 | 10.0 | 9.0 | 27.0 | 27.6 | nan |
| max | 1.0 | 1.0 | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | 30.0 | 10.0 | 2018.0 | 411.0 | 491.7 | 818.0 | 1390.7 | 1194.0 | 2444.0 | 1083.0 | 6436.0 | 6441.9 | nan |
| LW (1.5) | 0.0 | 0.0 | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | 1.0 | 9.0 | 2018.0 | 7.0 | 9.1 | 10.0 | 9.3 | 7.0 | 10.0 | 9.0 | 27.0 | 27.6 | nan |
| Q1 | 0.0 | 0.0 | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | 14.0 | 9.0 | 2018.0 | 27.0 | 29.0 | 59.0 | 60.4 | 53.0 | 122.0 | 47.2 | 348.0 | 350.6 | nan |
| Median | 0.0 | 1.0 | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | 19.0 | 9.0 | 2018.0 | 51.0 | 51.9 | 107.0 | 119.4 | 113.0 | 251.0 | 90.7 | 720.0 | 724.4 | nan |
| Q3 | 1.0 | 1.0 | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | 25.0 | 9.0 | 2018.0 | 113.5 | 130.1 | 228.0 | 315.1 | 282.0 | 574.0 | 254.3 | 1512.0 | 1514.4 | nan |
| UW (1.5) | 1.0 | 1.0 | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | 30.0 | 9.0 | 2018.0 | 243.2 | 281.8 | 481.5 | 697.2 | 625.5 | 1252.0 | 565.0 | 3258.0 | 3260.1 | nan |
| Outlier Count (1.5*IQR) | 0 | 0 | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | 0 | 1507 (12.3%) | 0 | 521 (4.3%) | 519 (4.2%) | 571 (4.7%) | 914 (7.5%) | 708 (5.8%) | 739 (6.0%) | 891 (7.3%) | 853 (7.0%) | 854 (7.0%) | nan |
| mean-3*std | 0.0 | 0.0 | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | 1.0 | 9.0 | 2018.0 | 7.0 | 9.1 | 10.0 | 9.3 | 7.0 | 10.0 | 9.0 | 27.0 | 27.6 | nan |
| mean | 0.3 | 0.7 | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | 18.4 | 9.1 | 2018.0 | 79.7 | 89.4 | 163.6 | 234.1 | 202.4 | 416.9 | 185.5 | 1141.5 | 1144.2 | nan |
| std | 0.5 | 0.5 | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | 8.0 | 0.3 | 0.0 | 73.5 | 84.9 | 145.4 | 258.2 | 210.9 | 420.6 | 210.3 | 1135.8 | 1137.3 | nan |
| mean+3*std | 1.0 | 1.0 | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | 30.0 | 10.0 | 2018.0 | 300.2 | 344.1 | 599.8 | 1008.7 | 835.1 | 1678.7 | 816.4 | 4548.9 | 4556.1 | nan |
| Outlier Count (3*std) | 0 | 0 | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | 0 | 0 | 0 | 198 (1.6%) | 169 (1.4%) | 224 (1.8%) | 296 (2.4%) | 186 (1.5%) | 222 (1.8%) | 281 (2.3%) | 235 (1.9%) | 235 (1.9%) | nan |
df.head(4)
| data | route_schedule_uuid | trip_uuid | source_center | destination_center | od_time_diff_mins | start_scan_to_end_scan | actual_distance_to_destination | actual_time | osrm_time | osrm_distance | segment_actual_time | segment_osrm_time | segment_osrm_distance | source_city | source_place | source_state | destination_city | destination_place | destination_state | trip_creation_year | trip_creation_month | trip_creation_day | route_type_Carting | route_type_FTL | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | training | thanos::sroute:3a1b0ab2-bb0b-4c53-8c59-eb2a2c0... | trip-153671042288605164 | IND572101AAA | IND562101AAA | 913.174079 | 906.0 | 240.208306 | 399.0 | 210.0 | 269.4308 | 141.0 | 65.0 | 84.1894 | Tumkur | Veersagr I | Karnataka | Chikblapur | ShntiSgr D | Karnataka | 2018 | 9 | 12 | 1 | 0 |
| 3 | training | thanos::sroute:f0176492-a679-4597-8332-bbd1c7f... | trip-153671046011330457 | IND400072AAB | IND401104AAA | 200.989870 | 200.0 | 28.529648 | 82.0 | 24.0 | 31.6475 | 59.0 | 16.0 | 19.8766 | Mumbai | Hub | Maharashtra | Mumbai | MiraRd IP | Maharashtra | 2018 | 9 | 12 | 1 | 0 |
| 4 | training | thanos::sroute:d9f07b12-65e0-4f3b-bec8-df06134... | trip-153671052974046625 | IND583101AAA | IND583101AAA | 1588.710998 | 1586.0 | 239.007304 | 556.0 | 207.0 | 266.2914 | 340.0 | 115.0 | 146.7919 | Bellary | Dc | Karnataka | Bellary | Dc | Karnataka | 2018 | 9 | 12 | 0 | 1 |
| 5 | training | thanos::sroute:9bf03170-d0a2-4a3f-aa4d-9aaab3d... | trip-153671055416136166 | IND600116AAB | IND602105AAB | 251.365798 | 249.0 | 34.407865 | 92.0 | 30.0 | 38.1953 | 60.0 | 23.0 | 28.0647 | Chennai | Porur DPC | Tamil Nadu | Chennai | Sriperumbudur Dc | Tamil Nadu | 2018 | 9 | 12 | 1 | 0 |
def plot_hist_custom(ax_, df_, var_, title_, scale_x_major, scale_x_minor, color_='#DC3535'):
    """Draw a histogram (with KDE) of ``df_[var_]`` on ``ax_`` plus dashed
    vertical reference lines: mean (red), median (cyan), Q1/Q3 (black).

    ``scale_x_minor`` doubles as the histogram bin width; both scale
    arguments drive the x-axis tick locators. Returns the pyplot module.
    """
    if pd.api.types.is_datetime64_any_dtype(df_[var_]):
        # Work on a copy: previously this overwrote the caller's datetime
        # column with bare years as a side effect.
        df_ = df_.copy()
        df_[var_] = df_[var_].dt.year
    # Location statistics for the reference lines.
    m1 = df_[var_].mean()
    q1 = df_[var_].quantile(.25)
    q2 = df_[var_].median()
    q3 = df_[var_].quantile(.75)
    sns.histplot(data=df_, x=var_, kde=True,
                 binwidth=scale_x_minor,
                 color=color_, ax=ax_, linewidth=2)
    # Full-height vertical segments at each statistic (drawn as line plots).
    df_mean = pd.DataFrame({'x': [m1, m1], 'y': ax_.get_ybound()})
    df_q1 = pd.DataFrame({'x': [q1, q1], 'y': ax_.get_ybound()})
    df_q2 = pd.DataFrame({'x': [q2, q2], 'y': ax_.get_ybound()})
    df_q3 = pd.DataFrame({'x': [q3, q3], 'y': ax_.get_ybound()})
    sns.lineplot(data=df_mean, x='x', y='y', color='red', ax=ax_, linestyle='--',
                 estimator=None, linewidth = 2)
    sns.lineplot(data=df_q1, x='x', y='y', color='black', ax=ax_, linestyle='--',
                 estimator=None, linewidth = 1)
    sns.lineplot(data=df_q2, x='x', y='y', color='cyan', ax=ax_, linestyle='--',
                 estimator=None, linewidth = 2)
    sns.lineplot(data=df_q3, x='x', y='y', color='black', ax=ax_, linestyle='--',
                 estimator=None, linewidth = 1)
    plt.sca(ax_)
    plt.title(title_, size=16, color='grey')
    # Hide the x ticks/labels; this plot is meant to sit above a box plot
    # that shares and labels the same x axis.
    plt.tick_params(
        axis='x',          # changes apply to the x-axis
        which='both',      # both major and minor ticks are affected
        bottom=False,      # ticks along the bottom edge are off
        top=False,         # ticks along the top edge are off
        labelbottom=False) # labels along the bottom edge are off
    plt.xlabel('')
    plt.ylabel('Count', size=12)
    plt.yticks(size=12)
    sns.despine(bottom=True, left=False, trim=True, ax=ax_)
    ax_.spines['left'].set_color('grey')
    ax_.xaxis.set_major_locator(MultipleLocator(scale_x_major))
    ax_.xaxis.set_major_formatter('{x:.0f}')
    ax_.xaxis.set_minor_locator(MultipleLocator(scale_x_minor))
    ax_.figure.set_size_inches(16,11)
    plt.subplots_adjust(top=0.95, right=0.869, hspace=0, wspace=0.1)
    return plt
def plot_box_custom(ax_, df_, var_, xlabel_, scale_x_major, scale_x_minor, color_='#DC3535'):
    """Draw a notched box plot of ``df_[var_]`` on ``ax_``, annotated with
    dashed guides and text for the mean/std (red), Q1/Q3 (black) and
    median (cyan). Returns the pyplot module.
    """
    if pd.api.types.is_datetime64_any_dtype(df_[var_]):
        # Work on a copy: previously this overwrote the caller's datetime
        # column with bare years as a side effect.
        df_ = df_.copy()
        df_[var_] = df_[var_].dt.year
    # Statistics to annotate.
    m1 = df_[var_].mean()
    st1 = df_[var_].std()
    q1 = df_[var_].quantile(.25)
    q2 = df_[var_].median()
    q3 = df_[var_].quantile(.75)
    # Short vertical guide segments (y in box-plot axis units). The originals
    # were seeded from ax_.get_ybound() and then immediately overwritten, so
    # the final y values are set directly here.
    df_mean = pd.DataFrame({'x': [m1, m1], 'y': [-0.3, 0.2]})
    df_q1 = pd.DataFrame({'x': [q1, q1], 'y': [0.2, 0.25]})
    df_q2 = pd.DataFrame({'x': [q2, q2], 'y': [0.1, 0.25]})
    df_q3 = pd.DataFrame({'x': [q3, q3], 'y': [0.2, 0.25]})
    sns.boxplot(data=df_, x=var_, ax=ax_,
                color=color_, showmeans=True,
                flierprops={"marker": "x"}, medianprops={"color": "cyan"},
                width=0.4, fliersize=1, linewidth=2, notch=True)
    sns.lineplot(data=df_mean, x='x', y='y', color='red', ax=ax_, linestyle='--',
                 estimator=None, linewidth = 2)
    text = f' μ={m1:.1f}\n σ={st1:.1f}'
    ax_.annotate(text, xy=(m1, -0.3), rotation=90)
    sns.lineplot(data=df_q1, x='x', y='y', color='black', ax=ax_, linestyle='--',
                 estimator=None, linewidth = 1)
    text = f'Q1={q1:.1f} '
    ax_.annotate(text, xy=(q1-0.1, 0.25), rotation=90, va='top', ha='right')
    sns.lineplot(data=df_q2, x='x', y='y', color='cyan', ax=ax_, linestyle='--',
                 estimator=None, linewidth = 2)
    text = f'med={q2:.1f} '
    ax_.annotate(text, xy=(q2, 0.25), rotation=90, va='top', ha='center')
    sns.lineplot(data=df_q3, x='x', y='y', color='black', ax=ax_, linestyle='--',
                 estimator=None, linewidth = 1)
    text = f'Q3={q3:.1f} '
    ax_.annotate(text, xy=(q3+0.1, 0.25), rotation=90, va='top', ha='left')
    plt.sca(ax_)
    plt.xlabel(xlabel_, size=12)
    plt.ylabel('')
    plt.xticks(size=12)
    sns.despine(bottom=False, left=False, ax=ax_)
    ax_.spines['bottom'].set_color('grey')
    ax_.spines['left'].set_color('grey')
    ax_.xaxis.set_major_locator(MultipleLocator(scale_x_major))
    ax_.xaxis.set_major_formatter('{x:.0f}')
    ax_.xaxis.set_minor_locator(MultipleLocator(scale_x_minor))
    ax_.figure.set_size_inches(16,11)
    plt.subplots_adjust(top=0.95, right=0.869, hspace=0, wspace=0.1)
    return plt
def plt_dist_plot(ax_, df_, type_var_, var_, title_, xlabel_, ylabel_):
    """Draw overlapping filled KDE curves of ``var_`` on ``ax_``, one per
    level of ``type_var_``, with a legend. Returns the pyplot module.

    NOTE: the palette has three colors, so ``type_var_`` is expected to
    have at most three levels.
    """
    # Bug fix: derive the hue order from the DataFrame that was passed in
    # (df_), not from the module-level global `df`.
    hue_order = df_[type_var_].unique()
    sns.kdeplot(data=df_, x=var_, ax=ax_, fill=True,
                hue=type_var_, hue_order=hue_order[::-1], lw=4, palette=['#3A5BA0', '#DC3535', '#18978F'])
    plt.sca(ax_)
    plt.title(title_, size=16, color='grey')
    plt.xlabel(xlabel_, size=12)
    plt.ylabel(ylabel_, size=12)
    plt.xticks(size=12)
    plt.yticks(size=12)
    plt.legend(labels=hue_order, loc='best', fontsize=12)
    sns.despine(right=True, top=True, ax=ax_)
    ax_.spines['left'].set_color('grey')
    ax_.spines['bottom'].set_color('grey')
    ax_.figure.set_size_inches(16,8)
    ax_.figure.subplots_adjust(top=0.81,right=0.86)
    return plt
def plt_most10_count_plot(ax_, df_, type_var_, var_, type_, title_, independant_label_, dependant_label_, unit_='', highlight_num=None, highlight_color='#DC3535', orientation_='v', agg_func='Count', agg_var=None):
    """Bar chart of the ten most frequent (or largest-sum) categories of `var_`.

    Parameters
    ----------
    ax_ : matplotlib Axes to draw on.
    df_ : source DataFrame.
    type_var_, type_ : optional column/value pair used to pre-filter rows
        (the filter applies only when both are truthy).
    var_ : categorical column whose top-10 categories are plotted.
    title_ : chart title.
    independant_label_, dependant_label_ : labels for the category axis and
        the count axis respectively.
    unit_ : text appended after the percentage inside each bar label.
    highlight_num : number of leading bars drawn in `highlight_color`; when
        falsy it is derived as the number of categories covering > 10% of
        the total.
    highlight_color : colour for the highlighted bars (rest are grey).
    orientation_ : 'v' for vertical bars, anything else draws horizontally.
    agg_func : 'Count' (frequency) or 'Sum' (sum of `agg_var`); any other
        value silently draws nothing.
    agg_var : numeric column summed when agg_func == 'sum'.

    Returns the `matplotlib.pyplot` module so callers can chain `.show()`.
    """
    agg_func = agg_func.lower()
    if agg_func == 'count':
        # Denominator for the per-bar percentage labels.
        total_records_count = df_[var_].count()
        if type_var_ and type_:
            # Top 10 categories computed within the filtered subset only.
            list_most10 = df_.loc[df_[type_var_]==type_, var_].value_counts().iloc[:10].index
            df_ = df_.loc[(df_[type_var_]==type_) & df_[var_].isin(list_most10), [var_]]
        else:
            list_most10 = df_[var_].value_counts().iloc[:10].index
            df_ = df_.loc[df_[var_].isin(list_most10), [var_]]
    elif agg_func == 'sum':
        total_records_count = df_[agg_var].sum()
        if type_var_ and type_:
            # NOTE(review): this branch ranks by row frequency rather than by
            # the summed `agg_var` — confirm that is intended on the sum path.
            list_most10 = df_.loc[df_[type_var_]==type_, var_].value_counts().iloc[:10].index
            df_ = df_.loc[(df_[type_var_]==type_) & df_[var_].isin(list_most10), [var_]]
        else:
            # NOTE(review): reads the global `df`, not the `df_` argument, and
            # keeps the summed Series (not just its index) — verify.
            list_most10 = df[[var_, agg_var]].groupby(var_)[agg_var].sum().sort_values(ascending=False).iloc[:10]
    else:
        pass  # unsupported agg_func: fall through and draw nothing
    if not highlight_num:
        # Derive highlight_num: categories whose share of the total exceeds 10%.
        if agg_func == 'count':
            list_highlight = df_[var_].value_counts()/total_records_count
        elif agg_func == 'sum':
            # NOTE(review): dividing a plain Python list by a scalar raises
            # TypeError — this path only works if `.to_list()` is dropped so a
            # Series is divided. Confirm before relying on auto-highlighting
            # with agg_func='sum'.
            list_highlight = list_most10.to_list()/total_records_count
        else:
            pass
        list_highlight = list_highlight[list_highlight > 0.1]
        highlight_num = len(list_highlight)
    # First `highlight_num` bars use the highlight colour, remainder are grey.
    custom_palette = [highlight_color for i in range(highlight_num)] + ['grey' for i in range(10-highlight_num)]
    if orientation_ == 'v':
        if agg_func == 'count':
            sns.countplot(data=df_, x=var_, order=list_most10, ax=ax_,
                          palette = custom_palette)
        elif agg_func == 'sum':
            sns.barplot(x=[str(i) for i in list_most10.index.to_list()], y=list_most10.to_list(), ax=ax_,
                        palette = custom_palette)
        else:
            pass
        plt.sca(ax_)
        plt.xlabel(independant_label_)
        plt.ylabel(dependant_label_)
        # Hide the count axis ticks/labels; bars carry their own value labels.
        plt.tick_params(
            axis='y',          # changes apply to the y-axis
            which='both',      # both major and minor ticks are affected
            left=False,        # ticks along the left edge are off
            right=False,       # ticks along the right edge are off
            labelleft=False)   # labels along the left edge are off
        # Wrap long category names so tick labels stay readable.
        labels = []
        for label in ax_.get_xticklabels():
            text = label.get_text()
            labels.append(textwrap.fill(text, width=10,
                                        break_long_words=False))
        ax_.set_xticklabels(labels, rotation=60)
        bar_labels=[]  # NOTE(review): assigned but never used afterwards
        for container in ax_.containers:
            for rect in container:
                # Rectangle widths are already integer-valued but are floating
                # type, so it helps to remove the trailing decimal point and 0 by
                # converting width to int type
                # Shift the text to the left side of the right edge
                yloc = 4
                # White on magenta
                clr = 'white'
                align = 'bottom'
                rotation_ = 90
                # NaN bar heights (categories absent after filtering) count as 0.
                count_ = 0 if np.isnan(rect.get_height()) else rect.get_height()
                pct_ = int(count_*100/total_records_count)
                pct_unit = f'({pct_}%) {unit_}'
                # Abbreviate the count as b/m/k by order of magnitude; bars
                # with count <= 1000 get no text at all.
                label_text = f'{count_/1000000000: .1f} b' if count_ > 1000000000 else f'{count_/1000000: .1f} m' if count_ > 1000000 else f'{count_/1000: .1f} k' if count_ > 1000 else ''
                # Append the percentage only when the bar is tall enough,
                # relative to the first bar, to fit the longer text.
                label_text = f'{label_text} {pct_unit}' if label_text and count_/container[0].get_height() > 0.055+0.023*len(pct_unit) else label_text if count_/container[0].get_height() > 0.055 else ''
                # Font size shrinks with the bar's relative height.
                size_ = 10 if count_/container[0].get_height() > 0.1 else 9 if count_/container[0].get_height() > 0.06 else 8 if count_/container[0].get_height() > 0.055 else 7
                xloc = rect.get_x() + rect.get_width() / 2
                ax_.annotate(label_text, xy=(xloc, 0), xytext=(0, yloc),
                             textcoords="offset points", size=size_,
                             ha='center', va=align, rotation=rotation_,
                             color=clr, clip_on=True)
    else:
        # Horizontal orientation: mirror of the vertical branch with x/y swapped.
        if agg_func == 'count':
            sns.countplot(data=df_, y=var_, order=list_most10, ax=ax_,
                          palette = custom_palette)
        elif agg_func == 'sum':
            sns.barplot(y=[str(i) for i in list_most10.index.to_list()], x=list_most10.to_list(), ax=ax_,
                        palette = custom_palette)
        else:
            pass
        plt.sca(ax_)
        plt.xlabel(dependant_label_)
        plt.ylabel(independant_label_)
        plt.tick_params(
            axis='x',          # changes apply to the x-axis
            which='both',      # both major and minor ticks are affected
            bottom=False,      # ticks along the bottom edge are off
            top=False,         # ticks along the top edge are off
            labelbottom=False) # labels along the bottom edge are off
        labels = []
        for label in ax_.get_yticklabels():
            text = label.get_text()
            labels.append(textwrap.fill(text, width=15,
                                        break_long_words=False))
        ax_.set_yticklabels(labels, rotation=0)
        bar_labels=[]  # NOTE(review): assigned but never used afterwards
        for container in ax_.containers:
            for rect in container:
                # Rectangle widths are already integer-valued but are floating
                # type, so it helps to remove the trailing decimal point and 0 by
                # converting width to int type
                # Shift the text to the left side of the right edge
                xloc = 2
                # White on magenta
                clr = 'white'
                align = 'left'
                # NOTE(review): unlike the vertical branch, NaN widths are not
                # guarded against here — confirm whether that can occur.
                count_ = rect.get_width()
                pct_ = int(count_*100/total_records_count)
                pct_unit = f'({pct_}%) {unit_}'
                label_text = f'{count_/1000000000: .1f} b' if count_ > 1000000000 else f'{count_/1000000: .1f} m' if count_ > 1000000 else f'{count_/1000: .1f} k' if count_ > 1000 else ''
                label_text = f'{label_text} {pct_unit}' if label_text and count_/container[0].get_width() > 0.055+0.023*len(pct_unit) else label_text if count_/container[0].get_width() > 0.055 else ''
                size_ = 10 if count_/container[0].get_width() > 0.1 else 9 if count_/container[0].get_width() > 0.06 else 8 if count_/container[0].get_width() > 0.055 else 7
                # Center the text vertically in the bar
                yloc = rect.get_y() + rect.get_height() / 2
                ax_.annotate(label_text, xy=(0, yloc), xytext=(xloc, 0),
                             textcoords="offset points", size=size_,
                             ha=align, va='center',
                             color=clr, clip_on=True)
    sns.despine(left=True, bottom=True, ax=ax_)
    plt.title(title_, size=16, color='grey')
    plt.xticks(size=12)
    plt.yticks(size=12)
    ax_.spines['left'].set_color('grey')
    ax_.spines['bottom'].set_color('grey')
    ax_.figure.set_size_inches(16,8)
    ax_.figure.subplots_adjust(top=0.81,right=0.86)
    return plt
def plot_numeric_distribution(df_, var_, main_title, xlablel_, title_, highligh_num = None, box_major=None, box_minor=None):
    """Draw a histogram over a box plot of `var_`, sharing one x-axis.

    Parameters
    ----------
    df_ : DataFrame holding the column to plot.
    var_ : numeric column name.
    main_title, xlablel_, title_, highligh_num : accepted for backward
        compatibility but currently unused by this function.
    box_major, box_minor : major/minor x-axis tick spacing passed through to
        the two plot helpers.
    """
    fig = plt.figure()
    ax1 = plt.subplot(2, 1, 1)
    # Shared x-axis keeps the histogram and box plot aligned.
    ax2 = plt.subplot(2, 1, 2, sharex=ax1)
    # BUG FIX: the original passed the global `df` to both helpers; use the
    # `df_` argument so callers can plot any frame, not just the global one.
    plot_hist_custom(ax_=ax1, df_=df_, var_=var_, title_=f'Distribution of {var_}',
                     scale_x_major=box_major, scale_x_minor=box_minor, color_='#18978F')
    plot_box_custom(ax_=ax2, df_=df_, var_=var_, xlabel_=var_,
                    scale_x_major=box_major, scale_x_minor=box_minor, color_='#18978F')
    fig.set_size_inches(8, 10)
    # hspace=0 butts the box plot directly under the histogram.
    plt.subplots_adjust(hspace=0, wspace=0.25)
    plt.show()
def plot_kde_distribution(ax_, df_, var_, summary_var_, title_, x_label_='', y_label_=''):
    """Overlay KDE curves of `summary_var_` for the ten categories of `var_`
    with the largest summed `summary_var_`.

    Returns the `matplotlib.pyplot` module so callers can chain `.show()`.
    """
    # Ten categories ranked by total of the summary variable, largest first.
    top10_totals = df_[[var_, summary_var_]].groupby(var_)[summary_var_].sum().sort_values(ascending=False).iloc[:10]
    categories = top10_totals.index.to_list()
    # One Series of raw values per top category, in ranked order.
    per_category_values = [df_.loc[df_[var_] == cat, summary_var_] for cat in categories]
    # Build a long-form frame: one row per (category, value) pair.
    long_df = pd.DataFrame(categories, columns=['Category'])
    long_df['values'] = per_category_values
    long_df = long_df.explode('values').reset_index(drop=True)
    # Reversed hue order draws the top-ranked category last (on top).
    sns.kdeplot(data=long_df, x='values', ax=ax_, fill=True, hue='Category',
                hue_order=categories[::-1], lw=4, palette='deep')
    plt.sca(ax_)
    plt.title(title_, size=16, color='grey')
    plt.xlabel(x_label_, size=12)
    plt.ylabel(y_label_, size=12)
    plt.xticks(size=12)
    plt.yticks([])
    # Legend in ranked (non-reversed) order.
    plt.legend(labels=categories, loc='upper left', fontsize=12)
    # Start the x-axis slightly below the smallest observed value.
    ax_.set_xlim(xmin=np.min([np.min(s) for s in per_category_values]) * 0.9)
    sns.despine(right=True, top=True, left=True, bottom=False, ax=ax_)
    ax_.spines['bottom'].set_color('grey')
    ax_.figure.set_size_inches(16, 8)
    ax_.figure.subplots_adjust(top=0.81, right=0.86)
    return plt
# For each numeric time/distance field: plot the raw distribution, derive the
# natural-log column, then plot the log-transformed distribution.
dist_specs = [
    ('od_time_diff_mins', 1000, 250),
    ('start_scan_to_end_scan', 1000, 250),
    ('actual_distance_to_destination', 200, 50),
    ('actual_time', 500, 100),
    ('osrm_time', 200, 50),
    ('osrm_distance', 400, 100),
    ('segment_actual_time', 200, 50),
    ('segment_osrm_time', 100, 25),
    ('segment_osrm_distance', 100, 25),
]
for col, major_tick, minor_tick in dist_specs:
    plot_numeric_distribution(df, col, 'Tittle_', 'xlablel_', 'title_',
                              box_major=major_tick, box_minor=minor_tick)
    df[f'{col}_log'] = np.log(df[col])
    plot_numeric_distribution(df, f'{col}_log', 'Tittle_', 'xlablel_', 'title_',
                              box_major=2, box_minor=0.5)
# Horizontal count plot of the train/test split of the dataset.
plt_most10_count_plot(plt.subplot(), df, '', 'data', '', 'Train Test Data Split', 'Type', 'Count', unit_='', highlight_num=1, highlight_color='#DC3535', orientation_='h', agg_func='Count', agg_var=None).show()
Comparison & Visualization of time and distance fields
# Side-by-side scatter plots for each (time, distance) pair:
# raw values on the left, log-log on the right.
scatter_pairs = [
    ('actual_time', 'actual_distance_to_destination'),
    ('osrm_time', 'osrm_distance'),
    ('segment_osrm_time', 'segment_osrm_distance'),
]
for time_col, dist_col in scatter_pairs:
    fig, ax = plt.subplots(1, 2, figsize=(10, 7))
    sns.scatterplot(data=df, x=time_col, y=dist_col, ax=ax[0])
    sns.scatterplot(data=df, x=f'{time_col}_log', y=f'{dist_col}_log', ax=ax[1])
    plt.show()
Assume Significance Level ($\alpha$) as 0.05.
All the numeric features visually appear to be log normally distributed. We can assume, above features are log normally distributed.
Log normal scatter plots look to have equal variances.
Test 1
H0 = Mean of od_time_diff_mins and start_scan_to_end_scan are not significantly different
Ha = Mean of od_time_diff_mins and start_scan_to_end_scan are significantly different
# Stack both log-transformed samples into one long-form frame for a joint KDE.
col_a, col_b = 'od_time_diff_mins_log', 'start_scan_to_end_scan_log'
df_dist = pd.DataFrame({
    'Type': np.repeat([col_a, col_b], [len(df[col_a]), len(df[col_b])]),
    'Values': pd.concat((df[col_a], df[col_b]))
})
plot_kde_distribution(plt.subplot(), df_dist, 'Type', 'Values', 'Season wise Count', 'count', 'Density').show()
Visually, both distributions look identical.
# Levene's test: H0 = the two log-transformed samples have equal variances.
stats.levene(df['od_time_diff_mins_log'], df['start_scan_to_end_scan_log'])
LeveneResult(statistic=0.03117851384397478, pvalue=0.859844178127743)
Since p_value > significance level; we can assume equal variances
# Independent two-sample t-test on the log samples (equal variances assumed,
# consistent with the Levene result above).
stats.ttest_ind(df['od_time_diff_mins_log'], df['start_scan_to_end_scan_log'])
Ttest_indResult(statistic=0.282703024175061, pvalue=0.777406892326873)
Since p_value > significance level, we fail to reject H0; the means of od_time_diff_mins and start_scan_to_end_scan are not significantly different.
Test 2
H0 = Mean of actual_time and osrm_time are not significantly different
Ha = Mean of actual_time and osrm_time are significantly different
# Stack both log-transformed samples into one long-form frame for a joint KDE.
col_a, col_b = 'actual_time_log', 'osrm_time_log'
df_dist = pd.DataFrame({
    'Type': np.repeat([col_a, col_b], [len(df[col_a]), len(df[col_b])]),
    'Values': pd.concat((df[col_a], df[col_b]))
})
plot_kde_distribution(plt.subplot(), df_dist, 'Type', 'Values', 'Season wise Count', 'count', 'Density').show()
Visually, the two distributions look different.
# Levene's test: H0 = the two log-transformed samples have equal variances.
stats.levene(df['actual_time_log'], df['osrm_time_log'])
LeveneResult(statistic=58.45681797951643, pvalue=2.154389317124431e-14)
Since p_value < significance level; we can assume variances are significantly different
We chose to perform non-parametric test
# Two-sample Kolmogorov-Smirnov test (non-parametric, since variances differ).
# NOTE(review): this uses the raw columns, while Tests 4 and 5 use the log
# columns — confirm the inconsistency is intentional.
stats.kstest(df['actual_time'], df['osrm_time'])
KstestResult(statistic=0.27760045830264346, pvalue=0.0)
Since p_value < significance level; mean of actual_time and osrm_time are significantly different.
Test 3
H0 = Mean of actual_time and segment_actual_time are not significantly different
Ha = Mean of actual_time and segment_actual_time are significantly different
# Stack both log-transformed samples into one long-form frame for a joint KDE.
col_a, col_b = 'actual_time_log', 'segment_actual_time_log'
df_dist = pd.DataFrame({
    'Type': np.repeat([col_a, col_b], [len(df[col_a]), len(df[col_b])]),
    'Values': pd.concat((df[col_a], df[col_b]))
})
plot_kde_distribution(plt.subplot(), df_dist, 'Type', 'Values', 'Season wise Count', 'count', 'Density').show()
Visually, the two distributions look different.
# Levene's test: H0 = the two log-transformed samples have equal variances.
stats.levene(df['actual_time_log'], df['segment_actual_time_log'])
LeveneResult(statistic=339.61604946884137, pvalue=2.507577595431446e-75)
Since p_value < significance level; we can assume variances are significantly different
We chose to perform non-parametric test
# Two-sample Kolmogorov-Smirnov test (non-parametric, since variances differ).
# NOTE(review): raw columns here vs. log columns in Tests 4/5 — confirm.
stats.kstest(df['actual_time'], df['segment_actual_time'])
KstestResult(statistic=0.29887879531876593, pvalue=0.0)
Since p_value < significance level; mean of actual_time and segment_actual_time are significantly different
Test 4
H0 = Mean of osrm_distance and segment_osrm_distance are not significantly different
Ha = Mean of osrm_distance and segment_osrm_distance are significantly different
# Stack both log-transformed samples into one long-form frame for a joint KDE.
col_a, col_b = 'osrm_distance_log', 'segment_osrm_distance_log'
df_dist = pd.DataFrame({
    'Type': np.repeat([col_a, col_b], [len(df[col_a]), len(df[col_b])]),
    'Values': pd.concat((df[col_a], df[col_b]))
})
plot_kde_distribution(plt.subplot(), df_dist, 'Type', 'Values', 'Season wise Count', 'count', 'Density').show()
Visually, the two distributions look different.
# Levene's test: H0 = the two log-transformed samples have equal variances.
stats.levene(df['osrm_distance_log'], df['segment_osrm_distance_log'])
LeveneResult(statistic=521.0798471630106, pvalue=3.852435304124549e-114)
Since p_value < significance level; we can assume variances are significantly different
We chose to perform non-parametric test
# Two-sample Kolmogorov-Smirnov test on the log columns (non-parametric,
# since the Levene test above rejected equal variances).
stats.kstest(df['osrm_distance_log'], df['segment_osrm_distance_log'])
KstestResult(statistic=0.31598330468941815, pvalue=0.0)
Since p_value < significance level; mean of osrm_distance and segment_osrm_distance are significantly different
Test 5
H0 = Mean of osrm_time and segment_osrm_time are not significantly different
Ha = Mean of osrm_time and segment_osrm_time are significantly different
# Stack both log-transformed samples into one long-form frame for a joint KDE.
col_a, col_b = 'osrm_time_log', 'segment_osrm_time_log'
df_dist = pd.DataFrame({
    'Type': np.repeat([col_a, col_b], [len(df[col_a]), len(df[col_b])]),
    'Values': pd.concat((df[col_a], df[col_b]))
})
plot_kde_distribution(plt.subplot(), df_dist, 'Type', 'Values', 'Season wise Count', 'count', 'Density').show()
Visually, the two distributions look different.
# Levene's test: H0 = the two log-transformed samples have equal variances.
stats.levene(df['osrm_time_log'], df['segment_osrm_time_log'])
LeveneResult(statistic=517.0434072360317, pvalue=2.790314199321843e-113)
Since p_value < significance level; we can assume variances are significantly different
We chose to perform non-parametric test
# Two-sample Kolmogorov-Smirnov test on the log columns (non-parametric,
# since the Levene test above rejected equal variances).
stats.kstest(df['osrm_time_log'], df['segment_osrm_time_log'])
KstestResult(statistic=0.28954906293477367, pvalue=0.0)
Since p_value < significance level; mean of osrm_time and segment_osrm_time are significantly different
Business Insights
# Top-10 category count plots for business insights.
plt_most10_count_plot(plt.subplot(), df, '', 'source_state', '', 'Source State wise Count', 'Source State', 'Count', unit_='', highlight_num=None, highlight_color='#DC3535', orientation_='h', agg_func='Count', agg_var=None).show()
plt_most10_count_plot(plt.subplot(), df, '', 'route_type_Carting', '', 'Route Type wise Count', 'Route Type', 'Count', unit_='', highlight_num=1, highlight_color='#DC3535', orientation_='h', agg_func='Count', agg_var=None).show()
# Corridor = "source city_destination city".
# PERF FIX: vectorized Series concatenation replaces the row-wise
# apply/lambda, avoiding one Python-level call per row; the result is
# identical for string columns.
df['corridors'] = df['source_city'] + "_" + df['destination_city']
plt_most10_count_plot(plt.subplot(), df, '', 'corridors', '', 'Corridors wise Order', 'Corridor', '', unit_='', highlight_num=5, highlight_color='#DC3535', orientation_='h', agg_func='Count', agg_var=None).show()
# Frequency of each corridor (source city -> destination city), descending.
df['corridors'].value_counts()
Mumbai_Mumbai 600
Bengaluru_Bengaluru 549
Bangalore_Bengaluru 455
Bhiwandi_Mumbai 437
Hyderabad_Hyderabad 371
...
Bhadohi_Gopiganj 1
Renukoot_Robertsganj 1
Madnapalle_Palamaner 1
Kundapura_Goa 1
Hospet (Karnataka)_Bellary 1
Name: corridors, Length: 1399, dtype: int64
# Mean trip duration (minutes) per corridor, slowest corridors first.
df1 = df.groupby('corridors')[['od_time_diff_mins']].mean()
df1['od_time_diff_mins'].sort_values(ascending=False)
corridors
Tumkur_Tumkur 6097.128248
Balasore_Kolkata 6013.796553
Bangana_Chandigarh 5965.512898
Allahabad_Sidhi 5927.420268
Sonipat_Moradabad 5586.723676
...
Varanasi_Varanasi (Uttar Pradesh) 52.177473
Unjha_Patan 50.186318
Jagtial_DhrmpuriTS 48.030439
Sankari_Erode (Tamil Nadu) 43.586897
Manthani_Ramagundam 39.391247
Name: od_time_diff_mins, Length: 1399, dtype: float64
# Mean actual distance to destination per corridor, longest corridors first.
df1 = df.groupby('corridors')[['actual_distance_to_destination']].mean()
df1['actual_distance_to_destination'].sort_values(ascending=False)
corridors
Bangalore_MAA 1077.395161
MAA_Bangalore 1046.609597
Nashik_Nashik 1037.756251
Shikohabad_Delhi 1010.805422
Gurgaon_Jaipur 994.115384
...
Jabalpur (Madhya Pradesh)_Jabalpur 9.237405
Hyderabad_Hyd 9.126716
Delhi_North 9.045083
Salem_Salem 9.040986
Varanasi_Varanasi (Uttar Pradesh) 9.027513
Name: actual_distance_to_destination, Length: 1399, dtype: float64
# Horizontal count plot of trips per creation month (top 10 months).
plt_most10_count_plot(plt.subplot(), df, '', 'trip_creation_month', '', 'Trip per Month', 'Month', 'Count', unit_='', highlight_num=1, highlight_color='#DC3535', orientation_='h', agg_func='Count', agg_var=None).show()