In [1]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
In [2]:
import pytz
import datetime

timestamp = datetime.datetime.now(pytz.timezone('Europe/Helsinki'))
print("Notebook exported at", timestamp.now(), "(Europe/Helsinki)")
Notebook exported at 2026-04-18 23:02:04.967623 (Europe/Helsinki)

Product Analytics T5-2¶

Preparations¶

In [3]:
def is_statistically_significant(values_1, values_2):
    t_stat, p_value = stats.ttest_ind(values_1, values_2, alternative="two-sided", equal_var=False)
    result = p_value <= 0.05
    print(f"Is the difference statistically significant at the 5% condidence level? {"yes" if result else "no"}")
In [4]:
pool_average_price = 10
express_average_price = 12.5
In [5]:
switchbacks = pd.read_excel("input/uber.xlsx", sheet_name="Switchbacks")
switchbacks.info()
switchbacks.head()
<class 'pandas.DataFrame'>
RangeIndex: 126 entries, 0 to 125
Data columns (total 11 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   city_id               126 non-null    str           
 1   period_start          126 non-null    datetime64[us]
 2   wait_time             126 non-null    str           
 3   treat                 126 non-null    bool          
 4   commute               126 non-null    bool          
 5   trips_pool            126 non-null    int64         
 6   trips_express         126 non-null    int64         
 7   rider_cancellations   126 non-null    int64         
 8   total_driver_payout   126 non-null    float64       
 9   total_matches         126 non-null    int64         
 10  total_double_matches  126 non-null    int64         
dtypes: bool(2), datetime64[us](1), float64(1), int64(5), str(2)
memory usage: 9.2 KB
Out[5]:
city_id period_start wait_time treat commute trips_pool trips_express rider_cancellations total_driver_payout total_matches total_double_matches
0 Boston 2018-02-19 07:00:00 2 mins False True 1415 3245 256 34458.411634 3372 1476
1 Boston 2018-02-19 09:40:00 5 mins True False 1461 2363 203 29764.349821 2288 1275
2 Boston 2018-02-19 12:20:00 2 mins False False 1362 2184 118 27437.367363 2283 962
3 Boston 2018-02-19 15:00:00 5 mins True True 1984 3584 355 44995.452993 4035 2021
4 Boston 2018-02-19 17:40:00 2 mins False False 1371 2580 181 27583.955295 2200 979
In [6]:
data = switchbacks[switchbacks.columns]

# Group information
data.loc[data["treat"] == True, 'group'] = "treatment"
data.loc[data["treat"] == False, 'group'] = "control"
data.loc[data["commute"] == True, 'hours'] = "commute"
data.loc[data["commute"] == False, 'hours'] = "non-commute"

# Calculations
data["trips_all"] = data["trips_pool"] + data["trips_express"]
data["driver_payout_per_trip"] = data["total_driver_payout"] / data["trips_all"]

data["pools_express_difference"] = data["trips_pool"] - data["trips_express"]
data["share_of_pool_trips"] = data["trips_pool"] / data["trips_all"]
data["share_of_express_trips"] = data["trips_express"] / data["trips_all"]

data["pools_revenue"] = data["trips_pool"] * pool_average_price
data["express_revenue"] = data["trips_express"] * express_average_price
data["all_revenue"] = data["pools_revenue"] + data["express_revenue"]

data["pools_revenue_per_trip"] = data["pools_revenue"] / data["trips_pool"]
data["express_revenue_per_trip"] = data["express_revenue"] / data["trips_express"]
data["all_revenue_per_trip"] = data["all_revenue"] / data["trips_all"]

data["profit"] = data["all_revenue"] - data["total_driver_payout"]
data["profit_per_trip"] = data["profit"] / data["trips_all"]

data = data.drop(columns=["treat", "commute", "wait_time", "city_id", "period_start"])

data.info()
data.head()
<class 'pandas.DataFrame'>
RangeIndex: 126 entries, 0 to 125
Data columns (total 21 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   trips_pool                126 non-null    int64  
 1   trips_express             126 non-null    int64  
 2   rider_cancellations       126 non-null    int64  
 3   total_driver_payout       126 non-null    float64
 4   total_matches             126 non-null    int64  
 5   total_double_matches      126 non-null    int64  
 6   group                     126 non-null    str    
 7   hours                     126 non-null    str    
 8   trips_all                 126 non-null    int64  
 9   driver_payout_per_trip    126 non-null    float64
 10  pools_express_difference  126 non-null    int64  
 11  share_of_pool_trips       126 non-null    float64
 12  share_of_express_trips    126 non-null    float64
 13  pools_revenue             126 non-null    int64  
 14  express_revenue           126 non-null    float64
 15  all_revenue               126 non-null    float64
 16  pools_revenue_per_trip    126 non-null    float64
 17  express_revenue_per_trip  126 non-null    float64
 18  all_revenue_per_trip      126 non-null    float64
 19  profit                    126 non-null    float64
 20  profit_per_trip           126 non-null    float64
dtypes: float64(11), int64(8), str(2)
memory usage: 20.8 KB
Out[6]:
trips_pool trips_express rider_cancellations total_driver_payout total_matches total_double_matches group hours trips_all driver_payout_per_trip ... share_of_pool_trips share_of_express_trips pools_revenue express_revenue all_revenue pools_revenue_per_trip express_revenue_per_trip all_revenue_per_trip profit profit_per_trip
0 1415 3245 256 34458.411634 3372 1476 control commute 4660 7.394509 ... 0.303648 0.696352 14150 40562.5 54712.5 10.0 12.5 11.740880 20254.088366 4.346371
1 1461 2363 203 29764.349821 2288 1275 treatment non-commute 3824 7.783564 ... 0.382061 0.617939 14610 29537.5 44147.5 10.0 12.5 11.544848 14383.150179 3.761284
2 1362 2184 118 27437.367363 2283 962 control non-commute 3546 7.737554 ... 0.384095 0.615905 13620 27300.0 40920.0 10.0 12.5 11.539763 13482.632637 3.802209
3 1984 3584 355 44995.452993 4035 2021 treatment commute 5568 8.081080 ... 0.356322 0.643678 19840 44800.0 64640.0 10.0 12.5 11.609195 19644.547007 3.528115
4 1371 2580 181 27583.955295 2200 979 control non-commute 3951 6.981512 ... 0.347001 0.652999 13710 32250.0 45960.0 10.0 12.5 11.632498 18376.044705 4.650986

5 rows × 21 columns

In [7]:
aggregated_data = data[data.columns].sort_values(by=["group", "hours"]).groupby(["group", "hours"], as_index=False).agg(
    trips_pool=('trips_pool', 'sum'),
    trips_express=('trips_express', 'sum'),
    trips_all=('trips_all', 'sum'),
    rider_cancellations=('rider_cancellations', 'sum'),
    total_driver_payout=('total_driver_payout', 'sum'),
    driver_payout_per_trip=('driver_payout_per_trip', 'mean'),
    pools_express_difference=('pools_express_difference', 'sum'),
    mean_pools_express_difference=('pools_express_difference', 'mean'),
    pools_revenue=('pools_revenue', 'sum'),
    pools_revenue_per_trip=('pools_revenue_per_trip', 'mean'),
    express_revenue=('express_revenue', 'sum'),
    express_revenue_per_trip=('express_revenue_per_trip', 'mean'),
    all_revenue=('all_revenue', 'sum'),
    all_revenue_per_trip=('all_revenue_per_trip', 'mean'),
    profit=('profit', 'sum'),
    profit_per_trip=('profit_per_trip', 'mean'),
    share_of_pool_trips=('share_of_pool_trips', 'mean'),
    share_of_express_trips=('share_of_express_trips', 'mean'),
    total_matches=('total_matches', 'sum'),
    total_double_matches=('total_double_matches', 'sum'),
)
aggregated_data.info()
aggregated_data.head()
<class 'pandas.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 22 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   group                          4 non-null      str    
 1   hours                          4 non-null      str    
 2   trips_pool                     4 non-null      int64  
 3   trips_express                  4 non-null      int64  
 4   trips_all                      4 non-null      int64  
 5   rider_cancellations            4 non-null      int64  
 6   total_driver_payout            4 non-null      float64
 7   driver_payout_per_trip         4 non-null      float64
 8   pools_express_difference       4 non-null      int64  
 9   mean_pools_express_difference  4 non-null      float64
 10  pools_revenue                  4 non-null      int64  
 11  pools_revenue_per_trip         4 non-null      float64
 12  express_revenue                4 non-null      float64
 13  express_revenue_per_trip       4 non-null      float64
 14  all_revenue                    4 non-null      float64
 15  all_revenue_per_trip           4 non-null      float64
 16  profit                         4 non-null      float64
 17  profit_per_trip                4 non-null      float64
 18  share_of_pool_trips            4 non-null      float64
 19  share_of_express_trips         4 non-null      float64
 20  total_matches                  4 non-null      int64  
 21  total_double_matches           4 non-null      int64  
dtypes: float64(12), int64(8), str(2)
memory usage: 836.0 bytes
Out[7]:
group hours trips_pool trips_express trips_all rider_cancellations total_driver_payout driver_payout_per_trip pools_express_difference mean_pools_express_difference ... express_revenue express_revenue_per_trip all_revenue all_revenue_per_trip profit profit_per_trip share_of_pool_trips share_of_express_trips total_matches total_double_matches
0 control commute 15185 35275 50460 2469 3.952442e+05 7.812577 -20090 -2009.000000 ... 440937.5 12.5 592787.5 11.745324 197543.273885 3.932747 0.301870 0.698130 37893 17943
1 control non-commute 70200 129260 199460 7948 1.450104e+06 7.280393 -59060 -1114.339623 ... 1615750.0 12.5 2317750.0 11.620021 867646.174041 4.339629 0.351992 0.648008 127999 63148
2 treatment commute 15399 31842 47241 3032 3.574423e+05 7.569325 -16443 -1644.300000 ... 398025.0 12.5 552015.0 11.681179 194572.716895 4.111854 0.327528 0.672472 34744 18078
3 treatment non-commute 76625 120579 197204 8946 1.355099e+06 6.877672 -43954 -829.320755 ... 1507237.5 12.5 2273487.5 11.530828 918388.032552 4.653157 0.387669 0.612331 118869 67459

4 rows × 22 columns

In [8]:
control_commute = data.loc[(data['group'] == 'control') & (data['hours'] == 'commute')]
control_non_commute = data.loc[(data['group'] == 'control') & (data['hours'] == 'non-commute')]

treatment_commute = data.loc[(data['group'] == 'treatment') & (data['hours'] == 'commute')]
treatment_non_commute = data.loc[(data['group'] == 'treatment') & (data['hours'] == 'non-commute')]
In [9]:
control_commute_aggregated = aggregated_data.loc[(aggregated_data['group'] == 'control') & (aggregated_data['hours'] == 'commute')].iloc[0]
control_non_commute_aggregated = aggregated_data.loc[(aggregated_data['group'] == 'control') & (aggregated_data['hours'] == 'non-commute')].iloc[0]

treatment_commute_aggregated = aggregated_data.loc[(aggregated_data['group'] == 'treatment') & (aggregated_data['hours'] == 'commute')].iloc[0]
treatment_non_commute_aggregated = aggregated_data.loc[(aggregated_data['group'] == 'treatment') & (aggregated_data['hours'] == 'non-commute')].iloc[0]

Problem 1¶

Compare commuting hours versus non-commuting hours in the control group (i.e., with 2-minute wait times)

Hint: For this exercise you need to select a subsample of the dataset: the observations in the control group, i.e., the observations where the variable “treat” = FALSE. Then, within the “treat” = FALSE subsample, we can run t-tests between the observations with the variable “commute” = TRUE and observations with the variable “commute” = FALSE.

Question 1¶

In [10]:
print(f"Commuting hours experiencing more trips in total than non-commuting hours: {"yes" if control_commute_aggregated["trips_all"] > control_non_commute_aggregated["trips_all"] else "no"}")
Commuting hours experiencing more trips in total than non-commuting hours: no

Question 2¶

In [11]:
print(f"Total trips in commuting hours is {control_commute_aggregated["trips_all"]} and in non-commuting hours is {control_non_commute_aggregated["trips_all"]}")
print(f"Difference in trips total between commuting and non-commuting hours is {abs(control_commute_aggregated["trips_all"] - control_non_commute_aggregated["trips_all"])}")
Total trips in commuting hours is 50460 and in non-commuting hours is 199460
Difference in trips total between commuting and non-commuting hours is 149000

Question 3¶

In [12]:
is_statistically_significant(control_commute["trips_all"], control_non_commute["trips_all"])
Is the difference statistically significant at the 5% condidence level? yes

Question 4¶

In [13]:
print(f"Commuting hours experiencing more Express trips than non-commuting hours: {"yes" if control_commute_aggregated["share_of_express_trips"] > control_non_commute_aggregated["share_of_express_trips"] else "no"}")
Commuting hours experiencing more Express trips than non-commuting hours: yes

Question 5¶

In [14]:
print(f"Share of Express trips in commuting hours is {control_commute_aggregated["share_of_express_trips"]} and in non-commuting hours is {control_non_commute_aggregated["share_of_express_trips"]}")
print(f"Difference in share of Express trips between commuting and non-commuting hours is {abs(control_commute_aggregated["share_of_express_trips"] - control_non_commute_aggregated["share_of_express_trips"])}")
Share of Express trips in commuting hours is 0.6981296132632429 and in non-commuting hours is 0.6480084408938269
Difference in share of Express trips between commuting and non-commuting hours is 0.050121172369415934

Question 6¶

In [15]:
is_statistically_significant(control_commute["share_of_express_trips"], control_non_commute["share_of_express_trips"])
Is the difference statistically significant at the 5% condidence level? yes

Question 7¶

In [16]:
print(f"Total revenue in commuting hours is {control_commute_aggregated["all_revenue"]} and in non-commuting hours is {control_non_commute_aggregated["all_revenue"]}")
print(f"Difference in total revenue between commuting and non-commuting hours is {abs(control_commute_aggregated["all_revenue"] - control_non_commute_aggregated["all_revenue"])}")
Total revenue in commuting hours is 592787.5 and in non-commuting hours is 2317750.0
Difference in total revenue between commuting and non-commuting hours is 1724962.5

Question 8¶

In [17]:
is_statistically_significant(control_commute["all_revenue"], control_non_commute["all_revenue"])
Is the difference statistically significant at the 5% condidence level? yes

Question 9¶

In [18]:
print(f"Profit per trip in commuting hours is {control_commute_aggregated["profit_per_trip"]} and in non-commuting hours is {control_non_commute_aggregated["profit_per_trip"]}")
print(f"Difference in profit per trip between commuting and non-commuting hours is {abs(control_commute_aggregated["profit_per_trip"] - control_non_commute_aggregated["profit_per_trip"])}")
Profit per trip in commuting hours is 3.932747396337362 and in non-commuting hours is 4.3396285473614356
Difference in profit per trip between commuting and non-commuting hours is 0.4068811510240735

Question 10¶

In [19]:
is_statistically_significant(control_commute["profit_per_trip"], control_non_commute["profit_per_trip"])
Is the difference statistically significant at the 5% condidence level? yes

Problem 2¶

Estimate the effect of extending waiting times from 2 minutes (control group) to 5 minutes (treatment group) separately for commuting and non-commuting hours.

Hint:  For parts 1 to 11 you need to compare treatment and control subsamples only during commuting hours so we limit the observations to “commute” = TRUE. We run t-tests between two subsamples of the observations with “commute” = TRUE: “treat” = TRUE – 5 minute waiting – and “treat” = FALSE – 2 minute waiting.

Question 1¶

In [20]:
print(f"Total trips in commuting hours for control group is {control_commute_aggregated["trips_all"]} and for treatment group is {treatment_commute_aggregated["trips_all"]}")
print(f"Difference in total trips in commuting hours between control and treatment groups is {abs(control_commute_aggregated["trips_all"] - treatment_commute_aggregated["trips_all"])}")
Total trips in commuting hours for control group is 50460 and for treatment group is 47241
Difference in total trips in commuting hours between control and treatment groups is 3219

Question 2¶

In [21]:
is_statistically_significant(control_commute["trips_all"], treatment_commute["trips_all"])
Is the difference statistically significant at the 5% condidence level? no

Question 3¶

In [22]:
print(f"Total rider cancellations in commuting hours for control group is {control_commute_aggregated["rider_cancellations"]} and for treatment group is {treatment_commute_aggregated["rider_cancellations"]}")
print(f"Difference in rider cancellations in commuting hours between control and treatment groups is {abs(control_commute_aggregated["rider_cancellations"] - treatment_commute_aggregated["rider_cancellations"])}")
Total rider cancellations in commuting hours for control group is 2469 and for treatment group is 3032
Difference in rider cancellations in commuting hours between control and treatment groups is 563

Question 4¶

In [23]:
is_statistically_significant(control_commute["rider_cancellations"], treatment_commute["rider_cancellations"])
Is the difference statistically significant at the 5% condidence level? yes

Question 5¶

In [24]:
print(f"Total driver payout per trip in commuting hours for control group is {control_commute_aggregated["driver_payout_per_trip"]} and for treatment group is {treatment_commute_aggregated["driver_payout_per_trip"]}")
print(f"Difference in driver payout per trip in commuting hours between control and treatment groups is {abs(control_commute_aggregated["driver_payout_per_trip"] - treatment_commute_aggregated["driver_payout_per_trip"])}")
Total driver payout per trip in commuting hours for control group is 7.812576636820745 and for treatment group is 7.569324569233842
Difference in driver payout per trip in commuting hours between control and treatment groups is 0.2432520675869032

Question 6¶

In [25]:
is_statistically_significant(control_commute["driver_payout_per_trip"], treatment_commute["driver_payout_per_trip"])
Is the difference statistically significant at the 5% condidence level? no

Question 7¶

In [26]:
print(f"Total matches in commuting hours for control group is {control_commute_aggregated["total_matches"]} and for treatment group is {treatment_commute_aggregated["total_matches"]}")
print(f"Difference in matches in commuting hours between control and treatment groups is {abs(control_commute_aggregated["total_matches"] - treatment_commute_aggregated["total_matches"])}")
Total matches in commuting hours for control group is 37893 and for treatment group is 34744
Difference in matches in commuting hours between control and treatment groups is 3149

Question 8¶

In [27]:
is_statistically_significant(control_commute["total_matches"], treatment_commute["total_matches"])
Is the difference statistically significant at the 5% condidence level? no

Question 9¶

In [28]:
print(f"Total double matches in commuting hours for control group is {control_commute_aggregated["total_double_matches"]} and for treatment group is {treatment_commute_aggregated["total_double_matches"]}")
print(f"Difference in double matches in commuting hours between control and treatment groups is {abs(control_commute_aggregated["total_double_matches"] - treatment_commute_aggregated["total_double_matches"])}")
Total double matches in commuting hours for control group is 17943 and for treatment group is 18078
Difference in double matches in commuting hours between control and treatment groups is 135

Question 10¶

In [29]:
is_statistically_significant(control_commute["total_double_matches"], treatment_commute["total_double_matches"])
Is the difference statistically significant at the 5% condidence level? no

Question 11¶

No, the data provides clear evidence against extending waiting times.

Question 12¶

In [30]:
print(f"Total trips in non-commuting hours for control group is {control_non_commute_aggregated["trips_all"]} and for treatment group is {treatment_non_commute_aggregated["trips_all"]}")
print(f"Difference in total trips in non-commuting hours between control and treatment groups is {abs(control_non_commute_aggregated["trips_all"] - treatment_non_commute_aggregated["trips_all"])}")
Total trips in non-commuting hours for control group is 199460 and for treatment group is 197204
Difference in total trips in non-commuting hours between control and treatment groups is 2256

Question 13¶

In [31]:
is_statistically_significant(control_non_commute["trips_all"], treatment_non_commute["trips_all"])
Is the difference statistically significant at the 5% condidence level? no

Question 14¶

In [32]:
print(f"Total rider cancellations in non-commuting hours for control group is {control_non_commute_aggregated["rider_cancellations"]} and for treatment group is {treatment_non_commute_aggregated["rider_cancellations"]}")
print(f"Difference in rider cancellations in non-commuting hours between control and treatment groups is {abs(control_non_commute_aggregated["rider_cancellations"] - treatment_non_commute_aggregated["rider_cancellations"])}")
Total rider cancellations in non-commuting hours for control group is 7948 and for treatment group is 8946
Difference in rider cancellations in non-commuting hours between control and treatment groups is 998

Question 15¶

In [33]:
is_statistically_significant(control_non_commute["rider_cancellations"], treatment_non_commute["rider_cancellations"])
Is the difference statistically significant at the 5% condidence level? yes

Question 16¶

In [34]:
print(f"Total driver payout per trip in non-commuting hours for control group is {control_non_commute_aggregated["driver_payout_per_trip"]} and for treatment group is {treatment_non_commute_aggregated["driver_payout_per_trip"]}")
print(f"Difference in driver payout per trip in non-commuting hours between control and treatment groups is {abs(control_non_commute_aggregated["driver_payout_per_trip"] - treatment_non_commute_aggregated["driver_payout_per_trip"])}")
Total driver payout per trip in non-commuting hours for control group is 7.280392554873131 and for treatment group is 6.877671503647454
Difference in driver payout per trip in non-commuting hours between control and treatment groups is 0.4027210512256776

Question 17¶

In [35]:
is_statistically_significant(control_non_commute["driver_payout_per_trip"], treatment_non_commute["driver_payout_per_trip"])
Is the difference statistically significant at the 5% condidence level? yes

Question 18¶

In [36]:
print(f"Total matches in non-commuting hours for control group is {control_non_commute_aggregated["total_matches"]} and for treatment group is {treatment_non_commute_aggregated["total_matches"]}")
print(f"Difference in matches in non-commuting hours between control and treatment groups is {abs(control_non_commute_aggregated["total_matches"] - treatment_non_commute_aggregated["total_matches"])}")
Total matches in non-commuting hours for control group is 127999 and for treatment group is 118869
Difference in matches in non-commuting hours between control and treatment groups is 9130

Question 19¶

In [37]:
is_statistically_significant(control_non_commute["total_matches"], treatment_non_commute["total_matches"])
Is the difference statistically significant at the 5% condidence level? yes

Question 20¶

In [38]:
print(f"Total double matches in non-commuting hours for control group is {control_non_commute_aggregated["total_double_matches"]} and for treatment group is {treatment_non_commute_aggregated["total_double_matches"]}")
print(f"Difference in double matches in non-commuting hours between control and treatment groups is {abs(control_non_commute_aggregated["total_double_matches"] - treatment_non_commute_aggregated["total_double_matches"])}")
Total double matches in non-commuting hours for control group is 63148 and for treatment group is 67459
Difference in double matches in non-commuting hours between control and treatment groups is 4311

Question 21¶

In [39]:
is_statistically_significant(control_non_commute["total_double_matches"], treatment_non_commute["total_double_matches"])
Is the difference statistically significant at the 5% condidence level? no

Question 22¶

No, the data provides clear evidence against extending waiting times.

Analyzing the results¶

Reflect on Uber’s approach to managing innovation through the Express POOL development project. Identify specific strategies or practices that contributed significantly to the project’s outcomes, explaining why these choices were effective. Additionally, critically analyze areas where Uber’s innovation strategy fell short or faced significant trade-offs. Propose specific, alternative approaches Uber might have taken to address these shortcomings, considering organizational, operational, and strategic implications.

Uber's approach to managing innovation is based on validation of the data. For example they run different kind of experiments like user-level A/B testing, switchbacks and synthetic control experiments. In Express POOL they did use the switchbacks as their primary experimentation method. Before the implementation, they did run simulations against existing data and run simulation that how the Express scenarios would work. They did use this as a base for Express' matching system, for example to generate needed parameters. After simulation they did run pilots in test markets, in this case markets was Boston and San Francisco. They did it in two steps as enabling it first in partion of both cities and after promising results in limited areas, experiment was expanded to whole city. After realising that for example weather did have affect, I'm still questioning why the weather was not included in the dataset as it does offer interesting factor telling insights from the data.

I understand that they wanted to try switchback in Boston as previous results there was not so promising compared to San Francisco. The dataset shows that shared riding was more popular in non-commute hours and there is clearly increase of more full capacity trips. I see the point why they wanted to expand experimentation quite shortly after first tests as often it is money talking. I think that it is quite a risk to overrule the 5-week freeze they did put on earlier. It does make the efforts towards data worthless. As the company is not facing bankrupcy, I would have chosen to wait. Because the increase of the wait time is so big change and not having any comparable market insights from time before change, it do not tell anything about if the increase of the wait time is actually a good move. There was also mention about that savings was not calculated accurately, but just on back-of-the-envelope state.

Considering the supplementary dataset, what is the effect of extending match wait times from two to five minutes on the total number of shared rides completed (that is, rides taken via both the existing shared rides product — UberPOOL — and the new shared rides product, Express), the proportion of shared rides that were matched, and driver payout per trip?

In [40]:
print(f"Difference in total pool trips between control and treatment groups in commute hours: {control_commute_aggregated["trips_pool"] - treatment_commute_aggregated["trips_pool"]}")
print(f"Difference in total pool trips between control and treatment groups in non-commute hours: {control_non_commute_aggregated["trips_pool"] - treatment_non_commute_aggregated["trips_pool"]}")
print(f"{"-"*25}")
print(f"Difference in total Express trips between control and treatment groups in commute hours: {control_commute_aggregated["trips_express"] - treatment_commute_aggregated["trips_express"]}")
print(f"Difference in total Express trips between control and treatment groups in non-commute hours: {control_non_commute_aggregated["trips_express"] - treatment_non_commute_aggregated["trips_express"]}")
print(f"{"-"*25}")
print(f"Difference in total trips between control and treatment groups in commute hours: {control_commute_aggregated["trips_all"] - treatment_commute_aggregated["trips_all"]}")
print(f"Difference in total trips between control and treatment groups in non-commute hours: {control_non_commute_aggregated["trips_all"] - treatment_non_commute_aggregated["trips_all"]}")
print(f"{"-"*25}")
print(f"Difference in total matches between control and treatment groups in commute hours: {control_commute_aggregated["total_matches"] - treatment_commute_aggregated["total_matches"]}")
print(f"Difference in total matches between control and treatment groups in non-commute hours: {control_non_commute_aggregated["total_matches"] - treatment_non_commute_aggregated["total_matches"]}")
print(f"{"-"*25}")
print(f"Difference in total double matches between control and treatment groups in commute hours: {control_commute_aggregated["total_double_matches"] - treatment_commute_aggregated["total_double_matches"]}")
print(f"Difference in total matches between control and treatment groups in non-commute hours: {control_non_commute_aggregated["total_double_matches"] - treatment_non_commute_aggregated["total_double_matches"]}")
print(f"{"-"*25}")
print(f"Difference in driver payout per trip between control and treatment groups in commute hours: {control_commute_aggregated["driver_payout_per_trip"] - treatment_commute_aggregated["driver_payout_per_trip"]}")
print(f"Difference in driver payout per trip between control and treatment groups in non-commute hours: {control_non_commute_aggregated["driver_payout_per_trip"] - treatment_non_commute_aggregated["driver_payout_per_trip"]}")
Difference in total pool trips between control and treatment groups in commute hours: -214
Difference in total pool trips between control and treatment groups in non-commute hours: -6425
-------------------------
Difference in total Express trips between control and treatment groups in commute hours: 3433
Difference in total Express trips between control and treatment groups in non-commute hours: 8681
-------------------------
Difference in total trips between control and treatment groups in commute hours: 3219
Difference in total trips between control and treatment groups in non-commute hours: 2256
-------------------------
Difference in total matches between control and treatment groups in commute hours: 3149
Difference in total matches between control and treatment groups in non-commute hours: 9130
-------------------------
Difference in total double matches between control and treatment groups in commute hours: -135
Difference in total matches between control and treatment groups in non-commute hours: -4311
-------------------------
Difference in driver payout per trip between control and treatment groups in commute hours: 0.2432520675869032
Difference in driver payout per trip between control and treatment groups in non-commute hours: 0.4027210512256776

Quick look into the data shows that in treatment groups users preferred the new Express product when longer waiting time was active. But still it shows that in total trips, shorter wait time equals more trips and more users. On the side of driver payout, the old pool trips are better from driver perspective as them will payout more money. At the same time it will increase the cost of service for Uber. Longer wait time will increase matches when there are at least 3 riders in the same trip, but decreases quite well in trips with at least 2 riders. However, the thing that dataset does not show is related metrics that affects, like the weather for example. In Boston at time of the experiment was winter and that did affect the results.

Based on the data available to you, other insights from the dataset, and additional qualitative considerations, what would you recommend that Stock do? Should he increase match wait times from two to five minutes in the six treatment cities of the launch experiment? If so, when should he do so?

I would advice against for increasing the wait time. Given dataset clearly shows that almost every valuation point shows that positive effects are not raising and negative effects however are raising. Valuation points in problem 1 tells the story why this experiment was conducted. There is a lot room for improvment compared between non-commute and commute hours. But this is clearly not the way to go. I see the cost saving measurements here, but other metrics are decresing that is not so good thing.

Sources¶

Source codes of the analytics are available at https://github.com/FinThunderstorm/product-analytics-t5-2 and built notebook at https://product-analytics-t5-2.alanen.dev.