import pandas as pd
import numpy as np
import datetime as dt
import matplotlib.pyplot as plt
from matplotlib import style
import seaborn as sns
style.use('ggplot')
df = pd.read_csv("D:\\DataSets\\uber\\Uber_Request_Data.csv")
df.head()
def fix_columns(df):
"""
returns column names in standard format
"""
old_col = df.columns
new_col = old_col.str.replace(' ', '_').str.lower()
for old, new in zip(list(old_col), list(new_col)):
df = df.rename(columns={old: new})
return df
# clean and display column names
df = fix_columns(df)
df.columns
df.info()
# Finding number of NA values in each column
print("Number of NA values in each column.")
df.isna().sum()
# Dropping redundant columns
df.drop(['request_id', 'driver_id'], axis=1, inplace=True)
# Converting date-strings to timestamp
df['request_timestamp'] = pd.to_datetime(df['request_timestamp'], dayfirst=True)
df['drop_timestamp'] = pd.to_datetime(df['drop_timestamp'], dayfirst=True)
# Derived metrics
df['request_day'] = df['request_timestamp'].dt.day
df['request_month'] = df['request_timestamp'].dt.month
df['request_hour'] = df['request_timestamp'].dt.hour
df['is_success'] = 0 # Column Indicating whether trip was successfully booked and completed
df.loc[df.status == 'Trip Completed', 'is_success'] = 1
df['time_taken'] = (df['drop_timestamp'] - df['request_timestamp']).astype('timedelta64[m]')
df.head()
print(f"Data is available for:-\nmonth: {df['request_month'].unique()}\ndays: {df['request_day'].unique()}\nfor the year of {df['request_timestamp'].dt.year.unique()[0]}")
print(f"Drop timestamp is not available only when status is: {df[df['drop_timestamp'].isna()]['status'].unique()}")
df['status'].unique()
plt.figure(figsize=(8, 6))
x = ['Trip Completed', 'No Cars Available', 'Cancelled']
y = list(df["status"].value_counts())
plt.bar(x, y, width=0.5, color='k')
plt.title("Count of Trip Statuses", fontsize=14)
plt.xlabel("\nTrip Status", fontsize=12)
plt.ylabel("Frequency", fontsize=12)
plt.show()
def demand_supply_comp(pickup_point):
pickup_df = df[df["pickup_point"] == pickup_point]
plt.figure(figsize=(12, 6))
x = ['11th', '12th', '13th', '14th', '15th']
y = list(pickup_df['request_day'].value_counts())
y2 = list(pickup_df[pickup_df["is_success"] == 1]['request_day'].value_counts())
plt.bar(x, y, width=0.5, alpha=0.6)
plt.bar(x, y2, width=0.2, alpha=0.6)
plt.legend(['demand', 'supply'])
plt.title(f"Demand-Supply Seggregated by day at {pickup_point}", fontsize=14)
plt.ylabel('Requests', fontsize=12)
plt.show()
plt.figure(figsize=(16, 8))
pickup_df['request_hour'].hist(bins=24, alpha=0.6)
pickup_df[(pickup_df['status'] == 'Trip Completed')]['request_hour'].hist(bins=24, alpha=0.6)
plt.xticks(np.arange(0, 24, 1.0))
plt.title(f"\n\nDemand-Supply Seggregated by Hour at {pickup_point}", fontsize=14)
plt.ylabel('Requests', fontsize=12)
plt.legend(['demand', 'supply'])
plt.show()
demand_supply_comp("Airport")
Here we can see that the demand for cabs at the airport surges around 4:30pm and this surge lasts till 11:00pm
demand_supply_comp("City")
For city-to-Airport travel, demand rises around 4:00am, and the subsides by 11:00am
def status_comp(df=df[df["is_success"] == 0]):
plt.figure(figsize=(12, 6))
x = ['11th', '12th', '13th', '14th', '15th']
y = list(df['request_day'].value_counts())
y2 = list(df[df["status"]=='No Cars Available']['request_day'].value_counts())
plt.bar(x, y, width=0.3, color='k', alpha=0.8)
plt.bar(x, y2, width=0.3, color='g', alpha=0.5)
plt.ylabel("Requests", fontsize=12)
plt.title("Seggregated by Day", fontsize=14)
plt.legend(["Cancelled", "No Cars Available"])
for a,b,c in zip(x, y, y2):
plt.text(a, c - 200, str(b), horizontalalignment='center', fontsize=15, color='w')
plt.text(a, b - 150, str(b-c), horizontalalignment='center', fontsize=15, color='w')
plt.tight_layout()
plt.show()
status_comp()
plt.figure(figsize=(16, 6))
df[df['status'] == 'Cancelled']['request_hour'].hist(bins=24, alpha=0.8, color='k')
df[df['status'] == 'No Cars Available']['request_hour'].hist(bins=24, alpha=0.5, color='g')
plt.xticks(np.arange(0, 24, 1.0))
plt.title("Seggregated by Day", fontsize=14)
plt.ylabel("Count")
plt.legend(["Cancelled", "No Cars Available"])
plt.savefig("plot.jpg", orientation="Landscape")
plt.show()
Typically, frequency of cancellations are high between 4:00am and 10:00am, while frequency of a lack of cabs is high between 4:30pm and 11:00pm
Highest demand for travel from City to Airport is between 4:00AM to 11:00AM (Early-Morning to Late-Morning). During this period we see the most number of unfulfilled requests due to cab cancellation by the drivers. This is because although there are a lot of available cabs, no cab driver wants to travel to the airport. This is very likely due to the fact that the driver does not wish to return from airport to city without a customer.
On the other hand, highest demand for travel from Airport to City is between 4:30AM to 11:00PM (Evening to Late-Night). During this period we see most number of unfulfilled requests due to no cabs being available.
# Exporting data to use in Tableau
df.to_csv("D:\\DataSets\\uber\\Uber_Request_Data2.csv")