Bayesian Logistic Regression Analysis
Description
This script uses Bayesian logistic regression to analyze UFC fight odds data and assess how well betting odds predict fight outcomes.
Libraries
library(tidyverse)
library(psych)
library(corrplot)
library(knitr)
library(Amelia)
library(rstanarm)
Examine Data
Load data.
load("./Datasets/df_master.RData")
Set the minimum number of fights required for a fighter to be included in the analysis (with fight_min = 1, no fighters are excluded).
fight_min = 1
Summarize data.
summary(df_master)
## NAME Date Event City
## Length:6088 Length:6088 Length:6088 Length:6088
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## State Country FightWeightClass Round
## Length:6088 Length:6088 Length:6088 Min. :1.000
## Class :character Class :character Class :character 1st Qu.:1.000
## Mode :character Mode :character Mode :character Median :3.000
## Mean :2.431
## 3rd Qu.:3.000
## Max. :5.000
##
## Method Winner_Odds Loser_Odds Sex
## Length:6088 Length:6088 Length:6088 Length:6088
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## fight_id Result FighterWeight FighterWeightClass
## Min. : 1.0 Length:6088 Min. :115.0 Length:6088
## 1st Qu.: 761.8 Class :character 1st Qu.:135.0 Class :character
## Median :1522.5 Mode :character Median :155.0 Mode :character
## Mean :1522.5 Mean :163.9
## 3rd Qu.:2283.2 3rd Qu.:185.0
## Max. :3044.0 Max. :265.0
##
## REACH SLPM SAPM STRA
## Min. :58.00 Min. : 0.000 Min. : 0.100 Min. :0.0000
## 1st Qu.:69.00 1st Qu.: 2.680 1st Qu.: 2.630 1st Qu.:0.3900
## Median :72.00 Median : 3.440 Median : 3.240 Median :0.4400
## Mean :71.77 Mean : 3.529 Mean : 3.434 Mean :0.4422
## 3rd Qu.:75.00 3rd Qu.: 4.220 3rd Qu.: 4.020 3rd Qu.:0.4900
## Max. :84.00 Max. :19.910 Max. :21.180 Max. :0.8800
## NA's :215
## STRD TD TDA TDD
## Min. :0.0900 Min. :0.000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.5100 1st Qu.:0.560 1st Qu.:0.2600 1st Qu.:0.5100
## Median :0.5600 Median :1.210 Median :0.3700 Median :0.6400
## Mean :0.5516 Mean :1.520 Mean :0.3752 Mean :0.6158
## 3rd Qu.:0.6000 3rd Qu.:2.183 3rd Qu.:0.5000 3rd Qu.:0.7600
## Max. :0.9200 Max. :8.930 Max. :1.0000 Max. :1.0000
##
## SUBA
## Min. : 0.0000
## 1st Qu.: 0.1000
## Median : 0.4000
## Mean : 0.5506
## 3rd Qu.: 0.8000
## Max. :12.1000
##
Redefine variables.
df_master$NAME = as.factor(df_master$NAME)
df_master$Date = as.Date(df_master$Date)
df_master$Event = as.factor(df_master$Event)
df_master$City = as.factor(df_master$City)
df_master$State = as.factor(df_master$State)
df_master$Country = as.factor(df_master$Country)
df_master$FightWeightClass = as.factor(df_master$FightWeightClass)
# keep Round as an integer since it represents time in the match
# df_master$Round = as.factor(df_master$Round)
df_master$Method = as.factor(df_master$Method)
df_master$Winner_Odds = as.numeric(df_master$Winner_Odds)
df_master$Loser_Odds = as.numeric(df_master$Loser_Odds)
df_master$fight_id = as.factor(df_master$fight_id)
df_master$Sex = as.factor(df_master$Sex)
df_master$Result = as.factor(df_master$Result)
df_master$FighterWeightClass = as.factor(df_master$FighterWeightClass)
Summarize again. There are infinite odds and overturned / DQ fight outcomes. These will have to be removed.
summary(df_master)
## NAME Date
## Donald Cerrone : 24 Min. :2013-04-27
## Ovince Saint Preux: 21 1st Qu.:2015-09-05
## Jim Miller : 20 Median :2017-06-17
## Derrick Lewis : 19 Mean :2017-07-11
## Neil Magny : 19 3rd Qu.:2019-05-18
## Andrei Arlovski : 18 Max. :2021-04-17
## (Other) :5967
## Event City
## UFC 259: Blachowicz vs. Adesanya : 30 Las Vegas :1346
## UFC Fight Night: Chiesa vs. Magny : 28 Abu Dhabi : 258
## UFC Fight Night: Poirier vs. Gaethje: 28 Boston : 124
## UFC Fight Night: Whittaker vs. Till : 28 Rio de Janeiro: 124
## UFC 190: Rousey vs Correia : 26 Chicago : 118
## UFC 193: Rousey vs Holm : 26 Newark : 114
## (Other) :5922 (Other) :4004
## State Country FightWeightClass
## Nevada :1346 USA :3564 Lightweight : 998
## Abu Dhabi : 258 Brazil : 534 Welterweight : 992
## Texas : 256 Canada : 378 Bantamweight : 872
## New York : 252 United Arab Emirates: 258 Featherweight: 736
## California: 250 Australia : 236 Middleweight : 666
## Florida : 176 United Kingdom : 184 Flyweight : 508
## (Other) :3550 (Other) : 934 (Other) :1316
## Round Method Winner_Odds Loser_Odds Sex
## Min. :1.000 DQ : 16 Min. :1.06 Min. :1.07 Female: 780
## 1st Qu.:1.000 KO/TKO :1940 1st Qu.:1.42 1st Qu.:1.77 Male :5308
## Median :3.000 M-DEC : 36 Median :1.71 Median :2.38
## Mean :2.431 Overturned: 20 Mean : Inf Mean : Inf
## 3rd Qu.:3.000 S-DEC : 642 3rd Qu.:2.33 3rd Qu.:3.35
## Max. :5.000 SUB :1078 Max. : Inf Max. : Inf
## U-DEC :2356
## fight_id Result FighterWeight FighterWeightClass
## 1 : 2 Loser :3044 Min. :115.0 Welterweight :1018
## 2 : 2 Winner:3044 1st Qu.:135.0 Lightweight : 992
## 3 : 2 Median :155.0 Bantamweight : 821
## 4 : 2 Mean :163.9 Featherweight: 739
## 5 : 2 3rd Qu.:185.0 Middleweight : 667
## 6 : 2 Max. :265.0 Flyweight : 570
## (Other):6076 (Other) :1281
## REACH SLPM SAPM STRA
## Min. :58.00 Min. : 0.000 Min. : 0.100 Min. :0.0000
## 1st Qu.:69.00 1st Qu.: 2.680 1st Qu.: 2.630 1st Qu.:0.3900
## Median :72.00 Median : 3.440 Median : 3.240 Median :0.4400
## Mean :71.77 Mean : 3.529 Mean : 3.434 Mean :0.4422
## 3rd Qu.:75.00 3rd Qu.: 4.220 3rd Qu.: 4.020 3rd Qu.:0.4900
## Max. :84.00 Max. :19.910 Max. :21.180 Max. :0.8800
## NA's :215
## STRD TD TDA TDD
## Min. :0.0900 Min. :0.000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.5100 1st Qu.:0.560 1st Qu.:0.2600 1st Qu.:0.5100
## Median :0.5600 Median :1.210 Median :0.3700 Median :0.6400
## Mean :0.5516 Mean :1.520 Mean :0.3752 Mean :0.6158
## 3rd Qu.:0.6000 3rd Qu.:2.183 3rd Qu.:0.5000 3rd Qu.:0.7600
## Max. :0.9200 Max. :8.930 Max. :1.0000 Max. :1.0000
##
## SUBA
## Min. : 0.0000
## 1st Qu.: 0.1000
## Median : 0.4000
## Mean : 0.5506
## 3rd Qu.: 0.8000
## Max. :12.1000
##
How many events are there in the dataset?
length(unique(df_master$Event))
## [1] 265
How many fights?
length(unique(df_master$fight_id))
## [1] 3044
Over what time frame did these occur?
range(sort(unique(df_master$Date)))
## [1] "2013-04-27" "2021-04-17"
Preprocess Data
Make copy of data frame.
df_stats = df_master
Calculate the number of fights in the dataset for each fighter.
df_stats %>%
dplyr::group_by(NAME) %>%
dplyr::summarise(
Num_Fights = dplyr::n()
) -> df_fight_count
Append fight count to dataframe.
df_stats = merge(df_stats, df_fight_count)
Which fights will we lose due to equal odds?
df_stats %>%
dplyr::filter(Winner_Odds == Loser_Odds) -> df_equal_odds
kable(df_equal_odds)
NAME | Date | Event | City | State | Country | FightWeightClass | Round | Method | Winner_Odds | Loser_Odds | Sex | fight_id | Result | FighterWeight | FighterWeightClass | REACH | SLPM | SAPM | STRA | STRD | TD | TDA | TDD | SUBA | Num_Fights |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Abdul Razak Alhassan | 2021-04-17 | UFC Fight Night: Whittaker vs. Gastelum | Las Vegas | Nevada | USA | Middleweight | 3 | U-DEC | Inf | Inf | Male | 1220 | Loser | 170 | Welterweight | 73 | 3.71 | 4.24 | 0.47 | 0.53 | 0.53 | 0.28 | 0.55 | 0.0 | 7 |
Aisling Daly | 2015-10-24 | UFC Fight Night: Holohan vs Smolka | Dublin | Leinster | Ireland | Strawweight | 3 | U-DEC | 2.00 | 2.00 | Female | 18 | Winner | 115 | Strawweight | 64 | 2.92 | 1.40 | 0.52 | 0.55 | 2.15 | 0.41 | 0.33 | 0.9 | 3 |
Aleksei Oleinik | 2017-11-04 | UFC 217: Bisping vs. St-Pierre | New York City | New York | USA | Heavyweight | 2 | KO/TKO | Inf | Inf | Male | 617 | Loser | 240 | Heavyweight | 80 | 3.47 | 3.85 | 0.50 | 0.44 | 2.38 | 0.46 | 0.33 | 2.4 | 14 |
Aleksei Oleinik | 2017-07-08 | UFC 213: Romero vs. Whittaker | Las Vegas | Nevada | USA | Heavyweight | 2 | SUB | Inf | Inf | Male | 62 | Winner | 240 | Heavyweight | 80 | 3.47 | 3.85 | 0.50 | 0.44 | 2.38 | 0.46 | 0.33 | 2.4 | 14 |
Ali AlQaisi | 2020-10-10 | UFC Fight Night: Moraes vs. Sandhagen | Abu Dhabi | Abu Dhabi | United Arab Emirates | Bantamweight | 3 | U-DEC | Inf | Inf | Male | 2694 | Loser | 135 | Bantamweight | 68 | 2.43 | 1.97 | 0.42 | 0.56 | 3.50 | 0.29 | 0.60 | 1.0 | 2 |
Alvaro Herrera Mendoza | 2018-07-28 | UFC Fight Night: Alvarez vs. Poirier 2 | Calgary | Alberta | Canada | Lightweight | 1 | KO/TKO | 2.00 | 2.00 | Male | 801 | Loser | 155 | Lightweight | 74 | 1.89 | 3.40 | 0.38 | 0.55 | 0.00 | 0.00 | 0.33 | 1.1 | 2 |
Anthony Johnson | 2015-05-23 | UFC 187: Johnson vs Cormier | Las Vegas | Nevada | USA | Light Heavyweight | 3 | SUB | 1.96 | 1.96 | Male | 655 | Loser | 205 | Light Heavyweight | 78 | 3.25 | 1.83 | 0.47 | 0.60 | 2.43 | 0.57 | 0.77 | 0.6 | 8 |
Antonio Carlos Junior | 2017-10-28 | UFC Fight Night: Brunson vs. Machida | Sao Paulo | Sao Paulo | Brazil | Middleweight | 1 | SUB | Inf | Inf | Male | 251 | Winner | 185 | Middleweight | 79 | 1.95 | 2.14 | 0.42 | 0.52 | 3.42 | 0.39 | 0.53 | 0.8 | 9 |
Ariane Lipski | 2019-11-16 | UFC Fight Night: Blachowicz vs. Jacare | Sao Paulo | Sao Paulo | Brazil | Flyweight | 3 | U-DEC | Inf | Inf | Female | 254 | Winner | 125 | Flyweight | 67 | 3.03 | 4.45 | 0.32 | 0.48 | 0.27 | 0.25 | 0.45 | 0.5 | 4 |
Bevon Lewis | 2018-12-29 | UFC 232: Jones vs. Gustafsson 2 | Los Angeles | California | USA | Middleweight | 3 | KO/TKO | Inf | Inf | Male | 2881 | Loser | 185 | Middleweight | 79 | 3.73 | 2.67 | 0.43 | 0.54 | 0.00 | 0.00 | 0.66 | 0.0 | 3 |
Brian Camozzi | 2018-02-18 | UFC Fight Night: Cerrone vs. Medeiros | Austin | Texas | USA | Welterweight | 1 | SUB | Inf | Inf | Male | 1041 | Loser | 170 | Welterweight | 78 | 3.15 | 6.39 | 0.26 | 0.54 | 0.00 | 0.00 | 0.00 | 0.7 | 3 |
Carlos Condit | 2016-01-02 | UFC 195: Lawler vs Condit | Las Vegas | Nevada | USA | Welterweight | 5 | S-DEC | 1.95 | 1.95 | Male | 2336 | Loser | 170 | Welterweight | 75 | 3.63 | 2.49 | 0.39 | 0.56 | 0.62 | 0.54 | 0.39 | 1.0 | 8 |
Carlton Minus | 2020-08-22 | UFC Fight Night: Munhoz vs. Edgar | Las Vegas | Nevada | USA | Welterweight | 3 | U-DEC | Inf | Inf | Male | 2887 | Loser | 170 | Welterweight | 75 | 3.50 | 4.97 | 0.40 | 0.50 | 0.00 | 0.00 | 0.58 | 0.0 | 2 |
Cezar Ferreira | 2018-05-12 | UFC 224: Nunes vs. Pennington | Rio de Janeiro | Rio de Janeiro | Brazil | Middleweight | 1 | SUB | 1.95 | 1.95 | Male | 453 | Winner | 185 | Middleweight | 78 | 1.90 | 2.44 | 0.42 | 0.53 | 2.69 | 0.53 | 0.84 | 0.5 | 13 |
Curtis Blaydes | 2017-11-04 | UFC 217: Bisping vs. St-Pierre | New York City | New York | USA | Heavyweight | 2 | KO/TKO | Inf | Inf | Male | 617 | Winner | 265 | Heavyweight | 80 | 3.59 | 1.70 | 0.53 | 0.57 | 6.64 | 0.54 | 0.33 | 0.0 | 12 |
Da-Un Jung | 2021-04-10 | UFC Fight Night: Vettori vs. Holland | Las Vegas | Nevada | USA | Light Heavyweight | 3 | U-DEC | Inf | Inf | Male | 3039 | Winner | 205 | Light Heavyweight | 78 | 3.95 | 3.90 | 0.45 | 0.54 | 2.79 | 0.61 | 0.88 | 0.3 | 3 |
Daniel Cormier | 2015-05-23 | UFC 187: Johnson vs Cormier | Las Vegas | Nevada | USA | Light Heavyweight | 3 | SUB | 1.96 | 1.96 | Male | 655 | Winner | 235 | Heavyweight | 72 | 4.25 | 2.92 | 0.52 | 0.54 | 1.83 | 0.44 | 0.80 | 0.4 | 10 |
Darrick Minner | 2020-09-19 | UFC Fight Night: Covington vs. Woodley | Las Vegas | Nevada | USA | Featherweight | 1 | SUB | Inf | Inf | Male | 715 | Winner | 145 | Featherweight | 69 | 3.24 | 1.40 | 0.70 | 0.43 | 3.00 | 0.62 | 0.60 | 3.6 | 3 |
Deiveson Figueiredo | 2018-02-03 | UFC Fight Night: Machida vs. Anders | Belem | Para | Brazil | Flyweight | 2 | KO/TKO | 2.20 | 2.20 | Male | 734 | Winner | 125 | Flyweight | 68 | 3.38 | 3.35 | 0.56 | 0.50 | 1.57 | 0.50 | 0.61 | 2.4 | 8 |
Devin Powell | 2018-07-28 | UFC Fight Night: Alvarez vs. Poirier 2 | Calgary | Alberta | Canada | Lightweight | 1 | KO/TKO | 2.00 | 2.00 | Male | 801 | Winner | 155 | Lightweight | 73 | 2.88 | 3.67 | 0.41 | 0.51 | 0.00 | 0.00 | 0.00 | 0.6 | 4 |
Devonte Smith | 2019-02-09 | UFC 234: Adesanya vs. Silva | Melbourne | Victoria | Australia | Lightweight | 1 | KO/TKO | Inf | Inf | Male | 803 | Winner | 155 | Lightweight | 76 | 5.64 | 2.65 | 0.55 | 0.59 | 0.74 | 1.00 | 1.00 | 0.7 | 4 |
Dong Hyun Ma | 2019-02-09 | UFC 234: Adesanya vs. Silva | Melbourne | Victoria | Australia | Lightweight | 1 | KO/TKO | Inf | Inf | Male | 803 | Loser | 155 | Lightweight | 70 | 2.84 | 4.10 | 0.41 | 0.54 | 1.27 | 0.53 | 0.33 | 0.0 | 8 |
Dong Hyun Ma | 2017-09-22 | UFC Fight Night: Saint Preux vs. Okami | Saitama | Saitama | Japan | Lightweight | 1 | KO/TKO | Inf | Inf | Male | 849 | Winner | 155 | Lightweight | 70 | 2.84 | 4.10 | 0.41 | 0.54 | 1.27 | 0.53 | 0.33 | 0.0 | 8 |
Ericka Almeida | 2015-10-24 | UFC Fight Night: Holohan vs Smolka | Dublin | Leinster | Ireland | Strawweight | 3 | U-DEC | 2.00 | 2.00 | Female | 18 | Loser | 115 | Strawweight | NA | 1.07 | 3.33 | 0.39 | 0.43 | 0.50 | 0.33 | 0.37 | 0.0 | 2 |
Felicia Spencer | 2020-02-29 | UFC Fight Night: Benavidez vs. Figueiredo | Norfolk | Virginia | USA | Featherweight | 1 | KO/TKO | Inf | Inf | Female | 974 | Winner | 145 | Featherweight | 68 | 3.02 | 5.57 | 0.45 | 0.43 | 0.64 | 0.10 | 0.14 | 0.3 | 4 |
Felipe Colares | 2019-02-02 | UFC Fight Night: Assuncao vs. Moraes 2 | Fortaleza | Ceara | Brazil | Featherweight | 3 | U-DEC | Inf | Inf | Male | 2915 | Loser | 135 | Bantamweight | 69 | 1.13 | 3.00 | 0.42 | 0.32 | 2.00 | 0.25 | 0.34 | 1.0 | 3 |
Frankie Perez | 2015-01-18 | UFC Fight Night: McGregor vs Siver | Boston | Massachusetts | USA | Lightweight | 3 | KO/TKO | Inf | Inf | Male | 1432 | Loser | 155 | Lightweight | 73 | 1.64 | 2.17 | 0.41 | 0.54 | 1.75 | 0.41 | 0.50 | 0.3 | 4 |
Frankie Saenz | 2018-05-19 | UFC Fight Night: Maia vs. Usman | Santiago | Chile | Chile | Bantamweight | 3 | U-DEC | Inf | Inf | Male | 1021 | Winner | 135 | Bantamweight | 66 | 3.94 | 3.50 | 0.47 | 0.52 | 1.74 | 0.31 | 0.61 | 0.1 | 9 |
Geoff Neal | 2018-02-18 | UFC Fight Night: Cerrone vs. Medeiros | Austin | Texas | USA | Welterweight | 1 | SUB | Inf | Inf | Male | 1041 | Winner | 170 | Welterweight | 75 | 4.94 | 4.93 | 0.49 | 0.61 | 0.50 | 0.50 | 0.92 | 0.2 | 5 |
Geraldo de Freitas | 2019-02-02 | UFC Fight Night: Assuncao vs. Moraes 2 | Fortaleza | Ceara | Brazil | Featherweight | 3 | U-DEC | Inf | Inf | Male | 2915 | Winner | 135 | Bantamweight | 72 | 3.67 | 2.62 | 0.52 | 0.50 | 3.00 | 0.45 | 0.59 | 0.0 | 3 |
Gina Mazany | 2017-11-25 | UFC Fight Night: Bisping vs. Gastelum | Shanghai | Hebei | China | Bantamweight | 3 | U-DEC | 2.25 | 2.25 | Female | 1087 | Winner | 125 | Flyweight | 68 | 3.45 | 3.07 | 0.47 | 0.51 | 4.41 | 0.63 | 0.33 | 0.3 | 6 |
Glover Teixeira | 2015-08-08 | UFC Fight Night: Teixeira vs Saint Preux | Nashville | Tennessee | USA | Light Heavyweight | 3 | SUB | 1.95 | 1.95 | Male | 1093 | Winner | 205 | Light Heavyweight | 76 | 3.75 | 3.84 | 0.47 | 0.54 | 2.04 | 0.40 | 0.60 | 1.0 | 12 |
Henry Briones | 2018-05-19 | UFC Fight Night: Maia vs. Usman | Santiago | Chile | Chile | Bantamweight | 3 | U-DEC | Inf | Inf | Male | 1021 | Loser | 135 | Bantamweight | 69 | 3.47 | 4.68 | 0.42 | 0.53 | 0.00 | 0.00 | 0.52 | 0.6 | 4 |
Ildemar Alcantara | 2015-07-15 | UFC Fight Night: Mir vs Duffee | San Diego | California | USA | Middleweight | 3 | U-DEC | 1.95 | 1.95 | Male | 1624 | Loser | 185 | Middleweight | 78 | 1.93 | 2.63 | 0.38 | 0.50 | 2.00 | 0.68 | 0.81 | 0.9 | 3 |
Isabela de Padua | 2019-11-16 | UFC Fight Night: Blachowicz vs. Jacare | Sao Paulo | Sao Paulo | Brazil | Flyweight | 3 | U-DEC | Inf | Inf | Female | 254 | Loser | 125 | Flyweight | 64 | 1.00 | 2.07 | 0.35 | 0.50 | 2.00 | 0.66 | 0.00 | 1.0 | 1 |
Iuri Alcantara | 2018-02-03 | UFC Fight Night: Machida vs. Anders | Belem | Para | Brazil | Bantamweight | 1 | KO/TKO | 2.00 | 2.00 | Male | 2949 | Winner | 135 | Bantamweight | 71 | 2.72 | 2.79 | 0.45 | 0.49 | 1.44 | 0.62 | 0.60 | 0.8 | 12 |
Jack Marshman | 2017-10-28 | UFC Fight Night: Brunson vs. Machida | Sao Paulo | Sao Paulo | Brazil | Middleweight | 1 | SUB | Inf | Inf | Male | 251 | Loser | 185 | Middleweight | 73 | 2.74 | 4.19 | 0.25 | 0.56 | 0.00 | 0.00 | 0.20 | 0.0 | 6 |
Jacob Malkoun | 2021-04-17 | UFC Fight Night: Whittaker vs. Gastelum | Las Vegas | Nevada | USA | Middleweight | 3 | U-DEC | Inf | Inf | Male | 1220 | Winner | 185 | Middleweight | 73 | 1.76 | 1.83 | 0.47 | 0.51 | 7.84 | 0.33 | 0.00 | 2.0 | 2 |
Jarjis Danho | 2021-04-10 | UFC Fight Night: Vettori vs. Holland | Las Vegas | Nevada | USA | Heavyweight | 1 | KO/TKO | Inf | Inf | Male | 1288 | Winner | 265 | Heavyweight | 74 | 3.38 | 5.25 | 0.50 | 0.49 | 0.51 | 0.16 | 1.00 | 0.0 | 2 |
Joanne Calderwood | 2019-06-08 | UFC 238: Cejudo vs. Moraes | Chicago | Illinois | USA | Flyweight | 3 | U-DEC | 1.98 | 1.98 | Female | 1596 | Loser | 125 | Flyweight | 65 | 6.59 | 4.40 | 0.49 | 0.53 | 1.80 | 0.55 | 0.58 | 0.5 | 9 |
Joe Soto | 2018-02-03 | UFC Fight Night: Machida vs. Anders | Belem | Para | Brazil | Bantamweight | 1 | KO/TKO | 2.00 | 2.00 | Male | 2949 | Loser | 135 | Bantamweight | 65 | 3.36 | 5.37 | 0.41 | 0.67 | 0.85 | 0.21 | 0.70 | 1.9 | 7 |
Johnny Case | 2015-01-18 | UFC Fight Night: McGregor vs Siver | Boston | Massachusetts | USA | Lightweight | 3 | KO/TKO | Inf | Inf | Male | 1432 | Winner | 155 | Lightweight | 72 | 3.95 | 2.66 | 0.38 | 0.61 | 1.70 | 0.45 | 0.72 | 0.2 | 6 |
Johny Hendricks | 2017-11-04 | UFC 217: Bisping vs. St-Pierre | New York City | New York | USA | Middleweight | 2 | KO/TKO | Inf | Inf | Male | 2198 | Loser | 185 | Middleweight | 69 | 3.49 | 3.99 | 0.45 | 0.53 | 3.83 | 0.46 | 0.63 | 0.3 | 9 |
Jorge Masvidal | 2016-05-29 | UFC Fight Night: Almeida vs Garbrandt | Las Vegas | Nevada | USA | Welterweight | 3 | S-DEC | 2.00 | 2.00 | Male | 1750 | Loser | 170 | Welterweight | 74 | 4.20 | 3.00 | 0.47 | 0.65 | 1.57 | 0.59 | 0.77 | 0.4 | 15 |
Joseph Morales | 2018-02-03 | UFC Fight Night: Machida vs. Anders | Belem | Para | Brazil | Flyweight | 2 | KO/TKO | 2.20 | 2.20 | Male | 734 | Loser | 125 | Flyweight | 69 | 1.58 | 1.72 | 0.37 | 0.60 | 0.53 | 0.50 | 0.23 | 2.6 | 2 |
Karl Roberson | 2018-05-12 | UFC 224: Nunes vs. Pennington | Rio de Janeiro | Rio de Janeiro | Brazil | Middleweight | 1 | SUB | 1.95 | 1.95 | Male | 453 | Loser | 185 | Middleweight | 74 | 3.00 | 2.42 | 0.51 | 0.57 | 0.99 | 0.57 | 0.50 | 0.8 | 6 |
Katlyn Chookagian | 2019-06-08 | UFC 238: Cejudo vs. Moraes | Chicago | Illinois | USA | Flyweight | 3 | U-DEC | 1.98 | 1.98 | Female | 1596 | Winner | 125 | Flyweight | 68 | 4.22 | 4.23 | 0.34 | 0.63 | 0.27 | 0.15 | 0.51 | 0.5 | 11 |
KB Bhullar | 2020-10-10 | UFC Fight Night: Moraes vs. Sandhagen | Abu Dhabi | Abu Dhabi | United Arab Emirates | Middleweight | 1 | KO/TKO | Inf | Inf | Male | 2678 | Loser | 185 | Middleweight | 78 | 1.18 | 10.00 | 0.13 | 0.37 | 0.00 | 0.00 | 0.00 | 0.0 | 1 |
Kevin Casey | 2015-07-15 | UFC Fight Night: Mir vs Duffee | San Diego | California | USA | Middleweight | 3 | U-DEC | 1.95 | 1.95 | Male | 1624 | Winner | 185 | Middleweight | 77 | 2.27 | 3.76 | 0.53 | 0.46 | 0.79 | 0.22 | 0.27 | 0.4 | 3 |
Kyle Bochniak | 2019-10-18 | UFC Fight Night: Reyes vs. Weidman | Boston | Massachusetts | USA | Featherweight | 3 | U-DEC | 2.00 | 2.00 | Male | 2486 | Loser | 145 | Featherweight | 70 | 2.63 | 5.11 | 0.31 | 0.58 | 1.14 | 0.15 | 0.62 | 0.0 | 7 |
Lorenz Larkin | 2016-05-29 | UFC Fight Night: Almeida vs Garbrandt | Las Vegas | Nevada | USA | Welterweight | 3 | S-DEC | 2.00 | 2.00 | Male | 1750 | Winner | 170 | Welterweight | 72 | 3.53 | 2.74 | 0.46 | 0.63 | 0.27 | 0.42 | 0.79 | 0.1 | 7 |
Luis Pena | 2020-02-29 | UFC Fight Night: Benavidez vs. Figueiredo | Norfolk | Virginia | USA | Lightweight | 3 | U-DEC | Inf | Inf | Male | 1770 | Winner | 155 | Lightweight | 75 | 3.66 | 3.12 | 0.46 | 0.51 | 1.17 | 0.33 | 0.48 | 1.2 | 6 |
Luke Sanders | 2018-08-25 | UFC Fight Night: Gaethje vs. Vick | Lincoln | Nebraska | USA | Bantamweight | 1 | SUB | 2.00 | 2.00 | Male | 2261 | Loser | 135 | Bantamweight | 67 | 6.21 | 4.11 | 0.51 | 0.53 | 0.31 | 0.20 | 0.66 | 0.3 | 6 |
Magomed Mustafaev | 2019-04-20 | UFC Fight Night: Overeem vs. Oleinik | Saint Petersburg | Saint Petersburg | Russia | Lightweight | 1 | KO/TKO | 2.25 | 2.25 | Male | 1806 | Winner | 155 | Lightweight | 71 | 2.59 | 2.68 | 0.58 | 0.41 | 3.31 | 0.50 | 0.23 | 0.4 | 5 |
Matthew Semelsberger | 2020-08-22 | UFC Fight Night: Munhoz vs. Edgar | Las Vegas | Nevada | USA | Welterweight | 3 | U-DEC | Inf | Inf | Male | 2887 | Winner | 170 | Welterweight | 75 | 7.93 | 5.31 | 0.50 | 0.56 | 1.97 | 1.00 | 1.00 | 1.0 | 1 |
Misha Cirkunov | 2016-12-10 | UFC 206: Holloway vs. Pettis | Toronto | Ontario | Canada | Light Heavyweight | 1 | SUB | 1.88 | 1.88 | Male | 2047 | Winner | 205 | Light Heavyweight | 77 | 4.18 | 3.22 | 0.51 | 0.60 | 4.28 | 0.57 | 0.71 | 2.3 | 9 |
Nikita Krylov | 2016-12-10 | UFC 206: Holloway vs. Pettis | Toronto | Ontario | Canada | Light Heavyweight | 1 | SUB | 1.88 | 1.88 | Male | 2047 | Loser | 205 | Light Heavyweight | 77 | 4.33 | 2.52 | 0.56 | 0.43 | 1.34 | 0.33 | 0.55 | 1.3 | 11 |
Ovince Saint Preux | 2015-08-08 | UFC Fight Night: Teixeira vs Saint Preux | Nashville | Tennessee | USA | Light Heavyweight | 3 | SUB | 1.95 | 1.95 | Male | 1093 | Loser | 205 | Light Heavyweight | 80 | 2.68 | 3.03 | 0.46 | 0.45 | 1.19 | 0.40 | 0.66 | 0.6 | 21 |
Paulo Costa | 2018-07-07 | UFC 226: Miocic vs. Cormier | Las Vegas | Nevada | USA | Middleweight | 2 | KO/TKO | Inf | Inf | Male | 2199 | Winner | 185 | Middleweight | 72 | 7.03 | 6.70 | 0.57 | 0.50 | 0.00 | 0.00 | 0.80 | 0.0 | 5 |
Paulo Costa | 2017-11-04 | UFC 217: Bisping vs. St-Pierre | New York City | New York | USA | Middleweight | 2 | KO/TKO | Inf | Inf | Male | 2198 | Winner | 185 | Middleweight | 72 | 7.03 | 6.70 | 0.57 | 0.50 | 0.00 | 0.00 | 0.80 | 0.0 | 5 |
Rafael Fiziev | 2019-04-20 | UFC Fight Night: Overeem vs. Oleinik | Saint Petersburg | Saint Petersburg | Russia | Lightweight | 1 | KO/TKO | 2.25 | 2.25 | Male | 1806 | Loser | 155 | Lightweight | 71 | 4.67 | 4.17 | 0.57 | 0.55 | 0.84 | 0.50 | 1.00 | 0.0 | 2 |
Rani Yahya | 2018-08-25 | UFC Fight Night: Gaethje vs. Vick | Lincoln | Nebraska | USA | Bantamweight | 1 | SUB | 2.00 | 2.00 | Male | 2261 | Winner | 135 | Bantamweight | 67 | 1.57 | 1.70 | 0.37 | 0.50 | 2.89 | 0.33 | 0.24 | 2.0 | 10 |
Robbie Lawler | 2016-01-02 | UFC 195: Lawler vs Condit | Las Vegas | Nevada | USA | Welterweight | 5 | S-DEC | 1.95 | 1.95 | Male | 2336 | Winner | 170 | Welterweight | 74 | 3.50 | 4.16 | 0.45 | 0.60 | 0.68 | 0.64 | 0.64 | 0.0 | 12 |
Sean Woodson | 2019-10-18 | UFC Fight Night: Reyes vs. Weidman | Boston | Massachusetts | USA | Featherweight | 3 | U-DEC | 2.00 | 2.00 | Male | 2486 | Winner | 145 | Featherweight | 78 | 6.40 | 4.43 | 0.45 | 0.58 | 0.00 | 0.00 | 0.77 | 0.0 | 2 |
Steve Garcia | 2020-02-29 | UFC Fight Night: Benavidez vs. Figueiredo | Norfolk | Virginia | USA | Lightweight | 3 | U-DEC | Inf | Inf | Male | 1770 | Loser | 155 | Lightweight | 75 | 4.31 | 2.52 | 0.55 | 0.36 | 0.61 | 0.25 | 1.00 | 0.6 | 1 |
Takanori Gomi | 2017-09-22 | UFC Fight Night: Saint Preux vs. Okami | Saitama | Saitama | Japan | Lightweight | 1 | KO/TKO | Inf | Inf | Male | 849 | Loser | 155 | Lightweight | 70 | 3.81 | 3.52 | 0.41 | 0.60 | 1.23 | 0.65 | 0.63 | 0.8 | 6 |
TJ Laramie | 2020-09-19 | UFC Fight Night: Covington vs. Woodley | Las Vegas | Nevada | USA | Featherweight | 1 | SUB | Inf | Inf | Male | 715 | Loser | 145 | Featherweight | 66 | 2.73 | 3.24 | 0.51 | 0.26 | 2.56 | 0.50 | 0.00 | 0.0 | 1 |
Tom Breese | 2020-10-10 | UFC Fight Night: Moraes vs. Sandhagen | Abu Dhabi | Abu Dhabi | United Arab Emirates | Middleweight | 1 | KO/TKO | Inf | Inf | Male | 2678 | Winner | 185 | Middleweight | 73 | 3.34 | 2.81 | 0.50 | 0.60 | 0.00 | 0.00 | 0.70 | 1.1 | 8 |
Tony Kelley | 2020-10-10 | UFC Fight Night: Moraes vs. Sandhagen | Abu Dhabi | Abu Dhabi | United Arab Emirates | Bantamweight | 3 | U-DEC | Inf | Inf | Male | 2694 | Winner | 135 | Bantamweight | 70 | 4.57 | 4.77 | 0.47 | 0.43 | 0.00 | 0.00 | 0.50 | 2.0 | 2 |
Travis Browne | 2017-07-08 | UFC 213: Romero vs. Whittaker | Las Vegas | Nevada | USA | Heavyweight | 2 | SUB | Inf | Inf | Male | 62 | Loser | 255 | Heavyweight | 79 | 2.93 | 4.31 | 0.41 | 0.42 | 1.21 | 0.68 | 0.75 | 0.2 | 10 |
Uriah Hall | 2018-07-07 | UFC 226: Miocic vs. Cormier | Las Vegas | Nevada | USA | Middleweight | 2 | KO/TKO | Inf | Inf | Male | 2199 | Loser | 185 | Middleweight | 79 | 3.34 | 3.54 | 0.51 | 0.53 | 0.67 | 0.38 | 0.69 | 0.2 | 14 |
Uriah Hall | 2018-12-29 | UFC 232: Jones vs. Gustafsson 2 | Los Angeles | California | USA | Middleweight | 3 | KO/TKO | Inf | Inf | Male | 2881 | Winner | 185 | Middleweight | 79 | 3.34 | 3.54 | 0.51 | 0.53 | 0.67 | 0.38 | 0.69 | 0.2 | 14 |
William Knight | 2021-04-10 | UFC Fight Night: Vettori vs. Holland | Las Vegas | Nevada | USA | Light Heavyweight | 3 | U-DEC | Inf | Inf | Male | 3039 | Loser | 205 | Light Heavyweight | 73 | 3.56 | 2.56 | 0.74 | 0.34 | 2.56 | 0.53 | 0.40 | 0.3 | 2 |
Wu Yanan | 2017-11-25 | UFC Fight Night: Bisping vs. Gastelum | Shanghai | Hebei | China | Bantamweight | 3 | U-DEC | 2.25 | 2.25 | Female | 1087 | Loser | 125 | Flyweight | 66 | 4.51 | 4.82 | 0.45 | 0.51 | 0.61 | 0.22 | 0.66 | 0.3 | 3 |
Yorgan De Castro | 2021-04-10 | UFC Fight Night: Vettori vs. Holland | Las Vegas | Nevada | USA | Heavyweight | 1 | KO/TKO | Inf | Inf | Male | 1288 | Loser | 250 | Heavyweight | 74 | 2.46 | 3.85 | 0.43 | 0.53 | 0.00 | 0.00 | 0.77 | 0.0 | 3 |
Zarah Fairn | 2020-02-29 | UFC Fight Night: Benavidez vs. Figueiredo | Norfolk | Virginia | USA | Featherweight | 1 | KO/TKO | Inf | Inf | Female | 974 | Loser | 135 | Bantamweight | 72 | 1.98 | 6.61 | 0.45 | 0.39 | 0.00 | 0.00 | 0.50 | 0.0 | 1 |
Filter out controversial results (DQ and Overturned) and equal odds.
df_stats %>%
dplyr::filter(
(Method != "DQ") & (Method != "Overturned")
, Winner_Odds != Loser_Odds
, Num_Fights >= fight_min
) -> df_stats
How many rows do we lose?
nrow(df_master) - nrow(df_stats)
## [1] 112
Also convert infinite odds to NAs.
df_stats %>%
dplyr::mutate(
Winner_Odds = ifelse(is.infinite(Winner_Odds), NA, Winner_Odds)
, Loser_Odds = ifelse(is.infinite(Loser_Odds), NA, Loser_Odds)
) -> df_stats
Get rid of lonely fight ids (i.e. instances where one of the two competitors was removed from the dataset).
df_stats %>%
dplyr::group_by(fight_id) %>%
dplyr::summarise(Count = length(NAME)) %>%
dplyr::filter(Count != 2) -> lonely_ids
idx_lonely_ids = lonely_ids$fight_id
df_stats = df_stats[!(df_stats$fight_id %in% idx_lonely_ids), ]
How many additional rows do we lose?
length(idx_lonely_ids)
## [1] 0
What percentage of fights, from those that were successfully scraped, do we keep?
length(unique(df_stats$fight_id)) / length(unique(df_master$fight_id))
## [1] 0.9816032
How many fights are we left with?
length(unique(df_stats$fight_id))
## [1] 2988
Over what period of time?
range(df_stats$Date)
## [1] "2013-04-27" "2021-04-17"
How many fights per Year?
df_stats %>%
dplyr::mutate(
Year = as.numeric(format(Date, "%Y"))
) %>%
dplyr::group_by(Year) %>%
dplyr::summarise(
count = length(unique(fight_id))
) -> df_year_count
kable(df_year_count)
Year | count |
---|---|
2013 | 113 |
2014 | 305 |
2015 | 444 |
2016 | 479 |
2017 | 367 |
2018 | 396 |
2019 | 399 |
2020 | 403 |
2021 | 82 |
Create additional columns to frame everything relative to the favorite.
df_stats %>%
dplyr::mutate(
Favorite_Won = Winner_Odds < Loser_Odds
, Was_Favorite = (Result == "Winner" & Favorite_Won) |
(Result == "Loser" & !Favorite_Won)
) -> df_stats
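As a sanity check on this logic, here is a minimal sketch using a hypothetical two-row fight (the odds values are made up): the winner at decimal odds of 1.50 is the favorite, so Was_Favorite should be TRUE on the winner's row and FALSE on the loser's.
# hypothetical two-row fight to check the Was_Favorite logic
check <- data.frame(
Result = c("Winner", "Loser")
, Winner_Odds = 1.50
, Loser_Odds = 2.75
)
check$Favorite_Won <- check$Winner_Odds < check$Loser_Odds
check$Was_Favorite <- (check$Result == "Winner" & check$Favorite_Won) |
(check$Result == "Loser" & !check$Favorite_Won)
check # Was_Favorite: TRUE on the Winner row, FALSE on the Loser row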
Get each fighter's decimal odds and transform the implied probability (the reciprocal of the decimal odds) into logit space.
df_stats %>%
dplyr::mutate(
fighter_odds = ifelse(Result == "Winner", Winner_Odds, Loser_Odds)
, implied_prob = 1/fighter_odds
, logit_prob = qlogis(1/fighter_odds)
, Year = as.numeric(format(Date,"%Y"))
) -> df_stats
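As a quick illustration with made-up decimal odds: odds of 1.50 imply a probability of 1/1.5 (about 0.667), which qlogis() maps to log(0.667/0.333), or about 0.69, in logit space.
# hypothetical example: decimal odds -> implied probability -> logit
odds_example <- 1.50
1 / odds_example # implied probability, ~0.667
qlogis(1 / odds_example) # logit of the implied probability, ~0.69 (= log(2))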
Visualize Data
Many of the stats are repeated several times because fighters appear in multiple fights. Ideally we would account for this, but for the purpose of getting an initial look at the data, we won't bother.
Later on, the predictors will be the differences in stats between the two fighters in each bout. Since most fights are not rematches, this repetition should not be a major concern.
df_stats %>%
dplyr::select(
fight_id
, Sex
, Favorite_Won
, Was_Favorite
, Result
, Year
, REACH
, implied_prob
, logit_prob
) -> df_for_graph
Examine correlations among potential predictors. Aside from implied_prob and logit_prob, whose Spearman correlation is exactly 1 because the logit is a monotonic transformation (and Spearman is rank-based), there do not appear to be any notable correlations.
df_cor = cor(df_for_graph[6:9], method = c("spearman"), use = "na.or.complete")
corrplot(df_cor)
df_cor
## Year REACH implied_prob logit_prob
## Year 1.0000000 -0.01396640 -0.02148670 -0.02148670
## REACH -0.0139664 1.00000000 0.03274692 0.03274692
## implied_prob -0.0214867 0.03274692 1.00000000 1.00000000
## logit_prob -0.0214867 0.03274692 1.00000000 1.00000000
Visualize the relationship between the Implied Probabilities of the odds in original and transformed space. The graph also gives a sense of the distribution of the Implied Probabilities and of how values accelerate toward the edges of the transformed space (i.e. distortions close to the limits of the original scale).
df_for_graph %>%
ggplot(aes(x=implied_prob, y=logit_prob))+
geom_point()+
geom_smooth(se = F, method = "lm")+
ylab("Logit Implied Probability")+
xlab("Implied Probability")
Create function to visualize data as boxplots.
boxplot_df_for_graph = function(df = df_for_graph, grouping = "Was_Favorite") {
df$Grouping = df[[grouping]]
df %>%
gather(key = "Metric", value = "Value", REACH:logit_prob) %>%
ggplot(aes(x=Grouping, y=Value, group = Grouping, color = Grouping))+
geom_boxplot()+
labs(color = grouping) +
xlab(grouping)+
facet_wrap(.~Metric, scales = "free", nrow = 2) -> gg
print(gg)
}
Examine distribution of predictors as a function of which fighter was the favorite.
boxplot_df_for_graph()
Examine distribution of predictors as a function of who won.
boxplot_df_for_graph(grouping = "Result")
Examine distribution of predictors as a function of Sex.
boxplot_df_for_graph(grouping = "Sex")
Examine distribution of predictors as a function of Year.
boxplot_df_for_graph(grouping = "Year")
Examine the distributions of the potential predictors. Of course, the distributions of the Implied Probabilities are missing their peaks due to the removal of fights with equal odds.
Reach, meanwhile, is roughly normally distributed.
df_for_graph %>%
gather(key = "Metric", value = "Value", Year:logit_prob) %>%
ggplot(aes(x=Value))+
geom_histogram()+
facet_wrap(.~Metric, scales = "free", nrow = 3)
Compute Differences between Fighters
Create separate data frames for favorites and underdogs, then merge them.
# Favorites
df_stats %>%
dplyr::filter(Was_Favorite) %>%
dplyr::select(
fight_id
, Sex
, Favorite_Won
, Year
, REACH
, implied_prob
, logit_prob
) -> df_favs
# Underdogs
df_stats %>%
dplyr::filter(!Was_Favorite) %>%
dplyr::select(
fight_id
, Sex
, Favorite_Won
, Year
, REACH
, implied_prob
, logit_prob
) -> df_under
# rename
df_under %>%
rename(
U_REACH = REACH
, U_implied_prob = implied_prob
, U_logit_prob = logit_prob
) -> df_under
# Merge
df_both = merge(df_under, df_favs)
Examine new dataframe.
summary(df_both)
## fight_id Sex Favorite_Won Year U_REACH
## 1 : 1 Female: 385 Mode :logical Min. :2013 Min. :58.00
## 2 : 1 Male :2603 FALSE:1057 1st Qu.:2015 1st Qu.:69.00
## 3 : 1 TRUE :1931 Median :2017 Median :72.00
## 4 : 1 Mean :2017 Mean :71.62
## 5 : 1 3rd Qu.:2019 3rd Qu.:75.00
## 6 : 1 Max. :2021 Max. :84.00
## (Other):2982 NA's :159
## U_implied_prob U_logit_prob REACH implied_prob
## Min. :0.07117 Min. :-2.56879 Min. :60.00 Min. :0.4000
## 1st Qu.:0.27548 1st Qu.:-0.96698 1st Qu.:69.00 1st Qu.:0.5780
## Median :0.36101 Median :-0.57098 Median :72.00 Median :0.6410
## Mean :0.34701 Mean :-0.67036 Mean :71.87 Mean :0.6576
## 3rd Qu.:0.42553 3rd Qu.:-0.30010 3rd Qu.:75.00 3rd Qu.:0.7299
## Max. :0.52356 Max. : 0.09431 Max. :84.00 Max. :0.9434
## NA's :54
## logit_prob
## Min. :-0.4055
## 1st Qu.: 0.3147
## Median : 0.5798
## Mean : 0.6948
## 3rd Qu.: 0.9943
## Max. : 2.8134
##
How many fights do we have?
nrow(df_both)
## [1] 2988
How often does the favorite win?
mean(df_both$Favorite_Won)
## [1] 0.6462517
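For context, compare this to the favorites' mean implied probability (about 0.658 per the summary above). The two are close, with favorites winning slightly less often than the raw implied probabilities suggest - consistent with the overround inflating both sides' probabilities.
# favorites' mean implied probability (~0.658, cf. the summary above)
mean(df_both$implied_prob)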
Compute differences in Reach. Also, adjust the implied probabilities for the overround (the bookmaker's margin, which makes the two fighters' implied probabilities sum to more than 1).
df_both %>%
dplyr::group_by(fight_id) %>%
dplyr::summarise(
Favorite_Won=Favorite_Won
, Sex=Sex
, Year=Year
, Delta_REACH = REACH - U_REACH
, Log_Odds = logit_prob
, Implied_Prob = implied_prob
, Adjust_Implied_Prob = implied_prob - (implied_prob + U_implied_prob - 1)/2
) -> df_og_diff
Get Adjusted Log Odds.
df_og_diff %>%
mutate(Adjust_Log_Odds = qlogis(Adjust_Implied_Prob)) -> df_og_diff
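To make the adjustment concrete, here is a hypothetical worked example (the odds are made up): subtracting half the overround from each side forces the pair of implied probabilities to sum to exactly 1.
# hypothetical worked example of the overround adjustment
fav <- 1 / 1.50 # favorite's implied probability, ~0.667
und <- 1 / 2.75 # underdog's implied probability, ~0.364
overround <- fav + und - 1 # bookmaker's margin, ~0.030
fav_adj <- fav - overround / 2 # ~0.652
und_adj <- und - overround / 2 # ~0.348
fav_adj + und_adj # exactly 1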
Examine new dataframe. Notice that Adjusted Implied Probabilities never go under 0.5. Similarly, Adjusted Log Odds never go below 0.
summary(df_og_diff)
## fight_id Favorite_Won Sex Year Delta_REACH
## 1 : 1 Mode :logical Female: 385 Min. :2013 Min. :-10.0000
## 2 : 1 FALSE:1057 Male :2603 1st Qu.:2015 1st Qu.: -2.0000
## 3 : 1 TRUE :1931 Median :2017 Median : 0.0000
## 4 : 1 Mean :2017 Mean : 0.2748
## 5 : 1 3rd Qu.:2019 3rd Qu.: 2.0000
## 6 : 1 Max. :2021 Max. : 12.0000
## (Other):2982 NA's :211
## Log_Odds Implied_Prob Adjust_Implied_Prob Adjust_Log_Odds
## Min. :-0.4055 Min. :0.4000 Min. :0.5012 Min. :0.004782
## 1st Qu.: 0.3147 1st Qu.:0.5780 1st Qu.:0.5778 1st Qu.:0.313771
## Median : 0.5798 Median :0.6410 Median :0.6406 Median :0.577952
## Mean : 0.6948 Mean :0.6576 Mean :0.6553 Mean :0.682173
## 3rd Qu.: 0.9943 3rd Qu.:0.7299 3rd Qu.:0.7261 3rd Qu.:0.974797
## Max. : 2.8134 Max. :0.9434 Max. :0.9327 Max. :2.628868
##
Examine correlations among potential predictors. As before, Log_Odds and Implied_Prob (and likewise their adjusted counterparts) have a Spearman correlation of exactly 1, since one is a monotonic transformation of the other.
df_cor_diff = cor(df_og_diff[5:9], method = c("spearman"), use = "na.or.complete")
corrplot(df_cor_diff)
df_cor_diff
## Delta_REACH Log_Odds Implied_Prob Adjust_Implied_Prob
## Delta_REACH 1.00000000 0.05662437 0.05662437 0.05569811
## Log_Odds 0.05662437 1.00000000 1.00000000 0.99449602
## Implied_Prob 0.05662437 1.00000000 1.00000000 0.99449602
## Adjust_Implied_Prob 0.05569811 0.99449602 0.99449602 1.00000000
## Adjust_Log_Odds 0.05569811 0.99449602 0.99449602 1.00000000
## Adjust_Log_Odds
## Delta_REACH 0.05569811
## Log_Odds 0.99449602
## Implied_Prob 0.99449602
## Adjust_Implied_Prob 1.00000000
## Adjust_Log_Odds 1.00000000
Visualize the relationship between Log Odds and Adjusted Log Odds. The red line is y=x whereas the blue line is the line of best fit.
The adjustments do not appear to vary with respect to Log Odds (i.e. closer contests did not tend to carry more overround than more disparate ones).
df_og_diff %>%
ggplot(aes(x=Log_Odds, y=Adjust_Log_Odds))+
geom_point()+
geom_smooth(se = F, method = "lm")+
geom_abline(slope=1, color = "red")+
ylab("Adjusted Log Odds")+
xlab("Log_Odds")
Plot similar graph for Implied Probabilities.
df_og_diff %>%
ggplot(aes(x=Implied_Prob, y=Adjust_Implied_Prob))+
geom_point()+
geom_smooth(se = F, method = "lm")+
geom_abline(slope=1, color = "red")+
ylab("Adjusted Implied Probability")+
xlab("Implied Probability")
Deal with Missing Values
How many Reach entries are missing? Around 7% (211 of the 2988 fights have no Delta_REACH).
missing_reach <- round(mean(is.na(df_og_diff$Delta_REACH))*100)
df_og_diff %>%
select("Delta_REACH", "fight_id") -> df_for_amelia
missmap(df_for_amelia)
Is there a relationship between the missingness of Reach and the actual values of Reach? Based on the random sample below, the fighters with no recorded Reach do not appear to be well-known, so they are likely NOT strong competitors on average. This is worth keeping in mind, as it could introduce some bias into the dataset / results.
df_master %>%
dplyr::filter(is.na(REACH)) %>%
dplyr::select(
"Date"
, "NAME"
, "FightWeightClass"
) -> df_reach_nas
kable(df_reach_nas[sample(1:nrow(df_reach_nas), nrow(df_reach_nas)/10),])
 | Date | NAME | FightWeightClass |
---|---|---|---|
192 | 2014-01-04 | Shunichi Shimizu | Bantamweight |
159 | 2014-10-04 | Niklas Backstrom | Featherweight |
85 | 2017-01-28 | JC Cottrell | Lightweight |
43 | 2017-10-28 | Christian Colombo | Heavyweight |
4 | 2016-05-29 | Alberto Uda | Middleweight |
158 | 2015-06-20 | Niklas Backstrom | Featherweight |
166 | 2013-11-30 | Peggy Morgan | Bantamweight |
27 | 2015-11-07 | Bruno Rodrigues | Bantamweight |
70 | 2016-04-10 | Filip Pejic | Bantamweight |
123 | 2016-02-21 | Kelly Faszholz | Bantamweight |
106 | 2016-08-06 | Joseph Gigliotti | Middleweight |
69 | 2016-07-08 | Fernando Bruno | Featherweight |
65 | 2013-12-28 | Estevan Payan | Featherweight |
148 | 2016-07-07 | Mehdi Baghdad | Lightweight |
119 | 2017-10-07 | Kalindra Faria | Flyweight |
214 | 2015-09-26 | Yusuke Kasuya | Lightweight |
39 | 2014-11-15 | Chris Heatherly | Welterweight |
154 | 2016-11-19 | Milana Dudieva | Bantamweight |
194 | 2015-08-08 | Sirwan Kakai | Bantamweight |
167 | 2014-01-04 | Quinn Mulhern | Lightweight |
5 | 2017-02-19 | Alex Ricci | Lightweight |
Compare the summaries of the subset of data without NAs to the one with NAs to identify notable differences.
df_og_diff %>%
dplyr::filter(is.na(Delta_REACH)) -> df_nas
df_og_diff %>%
dplyr::filter(!is.na(Delta_REACH)) -> df_nonas
summary(df_nas)
## fight_id Favorite_Won Sex Year Delta_REACH
## 1 : 1 Mode :logical Female: 38 Min. :2013 Min. : NA
## 4 : 1 FALSE:52 Male :173 1st Qu.:2014 1st Qu.: NA
## 39 : 1 TRUE :159 Median :2015 Median : NA
## 48 : 1 Mean :2015 Mean :NaN
## 78 : 1 3rd Qu.:2016 3rd Qu.: NA
## 90 : 1 Max. :2021 Max. : NA
## (Other):205 NA's :211
## Log_Odds Implied_Prob Adjust_Implied_Prob Adjust_Log_Odds
## Min. :-0.07696 Min. :0.4808 Min. :0.5064 Min. :0.02564
## 1st Qu.: 0.37106 1st Qu.:0.5917 1st Qu.:0.5893 1st Qu.:0.36108
## Median : 0.61619 Median :0.6494 Median :0.6504 Median :0.62073
## Mean : 0.73200 Mean :0.6660 Mean :0.6621 Mean :0.71134
## 3rd Qu.: 1.07881 3rd Qu.:0.7463 3rd Qu.:0.7389 3rd Qu.:1.04018
## Max. : 2.40795 Max. :0.9174 Max. :0.9088 Max. :2.29865
##
summary(df_nonas)
## fight_id Favorite_Won Sex Year Delta_REACH
## 2 : 1 Mode :logical Female: 347 Min. :2013 Min. :-10.0000
## 3 : 1 FALSE:1005 Male :2430 1st Qu.:2015 1st Qu.: -2.0000
## 5 : 1 TRUE :1772 Median :2017 Median : 0.0000
## 6 : 1 Mean :2017 Mean : 0.2748
## 7 : 1 3rd Qu.:2019 3rd Qu.: 2.0000
## 8 : 1 Max. :2021 Max. : 12.0000
## (Other):2771
## Log_Odds Implied_Prob Adjust_Implied_Prob Adjust_Log_Odds
## Min. :-0.4055 Min. :0.4000 Min. :0.5012 Min. :0.004782
## 1st Qu.: 0.3147 1st Qu.:0.5780 1st Qu.:0.5773 1st Qu.:0.311586
## Median : 0.5798 Median :0.6410 Median :0.6399 Median :0.574930
## Mean : 0.6919 Mean :0.6570 Mean :0.6548 Mean :0.679957
## 3rd Qu.: 0.9676 3rd Qu.:0.7246 3rd Qu.:0.7248 3rd Qu.:0.968347
## Max. : 2.8134 Max. :0.9434 Max. :0.9327 Max. :2.628868
##
Favorites appear to be more likely to win when Reach is NA despite not being substantially more favored (see above).
mean(df_nas$Favorite_Won) - mean(df_nonas$Favorite_Won)
## [1] 0.1154558
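To quantify "not substantially more favored": per the summaries above, the gap in mean Adjusted Implied Probability between the two subsets is under one percentage point, versus the eleven-point gap in actual win rate.
# gap in how favored the favorites were (~0.007 per the summaries above)
mean(df_nas$Adjust_Implied_Prob) - mean(df_nonas$Adjust_Implied_Prob)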
Using the original dataset, look at whether those without a Reach entry tend to be favored or not. Indeed, they tended to be the underdogs. Combined with the above, it seems that fighters with missing Reach tend to underperform relative to their odds. As such, removing entries with missing Reach data in a future analysis could bias the results.
df_stats %>%
dplyr::filter(is.na(REACH)) -> df_nas_odds
mean(df_nas_odds$implied_prob)
## [1] 0.4031639
mean(df_nas_odds$Was_Favorite)
## [1] 0.2535211
Also, the fights with Reach NAs are a couple of years older on average. This may be due to improved stats collection and tracking over the years.
mean(df_nas$Year) - mean(df_nonas$Year)
## [1] -2.064192
I may consider a simple random imputation for the model with both Reach and Log Odds as predictors, to avoid dropping the cases with higher rates of underperformance.
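For reference, here is a minimal sketch of what such a random imputation could look like, assuming we simply resample the observed Delta_REACH values (illustrative only, not necessarily the final approach):
# sketch: simple random imputation of Delta_REACH,
# drawing from the observed values with replacement
set.seed(1)
observed_reach <- df_og_diff$Delta_REACH[!is.na(df_og_diff$Delta_REACH)]
df_imputed <- df_og_diff
idx_na <- is.na(df_imputed$Delta_REACH)
df_imputed$Delta_REACH[idx_na] <- sample(observed_reach, sum(idx_na), replace = TRUE)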
Visualize Difference Data
Create function to generate boxplots of difference data.
# function for box plot
boxplot_df_og_diff = function(df = df_og_diff, grouping = NULL, do_result = F) {
if (is.null(grouping)) {
if (do_result) {
df %>%
gather(key = "Metric", value = "Value", Delta_REACH:Adjust_Log_Odds) %>%
dplyr::mutate(Value = ifelse(Favorite_Won, Value, -Value)) %>% # flip sign so differences read Winner - Loser
ggplot(aes(x=Metric, y=Value))+
geom_boxplot()+
facet_wrap(.~Metric, scales = "free", nrow = 2)+
ggtitle("Winner - Loser") -> gg
} else {
df %>%
gather(key = "Metric", value = "Value", Delta_REACH:Adjust_Log_Odds) %>%
ggplot(aes(x=Metric, y=Value))+
geom_boxplot()+
facet_wrap(.~Metric, scales = "free", nrow = 2)+
ggtitle("Favorite - Underdog") -> gg
}
} else {
df$Grouping = df[[grouping]]
df %>%
gather(key = "Metric", value = "Value", Delta_REACH:Adjust_Log_Odds) %>%
ggplot(aes(x=Grouping, y=Value, group = Grouping, color = Grouping))+
geom_boxplot()+
labs(color = grouping) +
xlab(grouping)+
ggtitle("Favorite - Underdog")+
facet_wrap(.~Metric, scales = "free", nrow = 2) -> gg
}
print(gg)
}
Generate boxplots for potential predictors. I am including all versions of Implied Probability/Log Odds to compare them. Of course, I will only include one of these as a predictor in the model.
boxplot_df_og_diff()
Compare predictors when the Favorite wins and loses.
boxplot_df_og_diff(grouping = "Favorite_Won")
Compare predictors as a function of Sex.
boxplot_df_og_diff(grouping = "Sex")
Compare predictors as a function of Year.
boxplot_df_og_diff(grouping = "Year")
Modify predictors to look at difference in stats between Winner and Loser (instead of Favorite and Underdog).
boxplot_df_og_diff(do_result = T)
Relationship between Predictors and Outcome
Create function to plot Outcome as a function of Predictors.
# function
plot_against_log_odds = function(df = df_og_diff, variable = "Log_Odds", pred_log_odds = F, num_bin = 20, min_bin_size = 30) {
# extract the chosen column as a numeric dummy variable
df$Dummy = as.numeric(df[[variable]])
# get bins
df$Dummy_Bin = cut(df$Dummy, num_bin)
# get log odds of Favorite victory by bin
df %>%
dplyr::group_by(Dummy_Bin) %>%
dplyr::summarise(
Prop_of_Victory = mean(Favorite_Won)
, Log_Odds_Victory = logit(Prop_of_Victory)
, Size_of_Bin = length(Favorite_Won)
, Dummy = mean(Dummy)
) -> fav_perf
# extract bins
fav_labs <- as.character(fav_perf$Dummy_Bin)
fav_bins = as.data.frame(
cbind(
lower = as.numeric( sub("\\((.+),.*", "\\1", fav_labs) )
, upper = as.numeric( sub("[^,]*,([^]]*)\\]", "\\1", fav_labs) )
)
)
# get value in middle of bin
fav_bins %>% dplyr::mutate(mid_bin = (lower + upper)/2 ) -> fav_bins
# add mid bin column
fav_perf$Mid_Bin = fav_bins$mid_bin
if (pred_log_odds) {
fav_perf %>%
dplyr::filter(Size_of_Bin >= min_bin_size) %>%
ggplot(aes(x=Dummy, y=Log_Odds_Victory))+
geom_point()+
geom_smooth(se=F , method="lm")+
geom_abline(slope = 1, color = "red")+
# geom_smooth()+
ylab("Log Odds that Favorite Won")+
xlab(sprintf("Mean %s", variable))->gg
print(gg)
} else {
fav_perf %>%
dplyr::filter(Size_of_Bin >= min_bin_size) %>%
ggplot(aes(x=Dummy, y=Log_Odds_Victory))+
geom_point()+
geom_smooth(se=F , method="lm")+
# geom_smooth()+
ylab("Log Odds that Favorite Won")+
xlab(sprintf("Mean %s", variable))->gg
print(gg)
}
}
Unsurprisingly, the log odds of the implied probabilities are good linear predictors of the actual log odds of victory. Importantly, the Adjusted Log Odds appear to outperform the non-adjusted Log Odds.
plot_against_log_odds(pred_log_odds = T)
plot_against_log_odds(variable = "Adjust_Log_Odds", pred_log_odds = T)
The linear fit is not quite as nice using the Implied Probability metrics. With that said, Implied Probability could likely still be used very successfully as a predictor - the fit is still solid. Theoretically, we would have expected issues at the limits of Implied Probability (90%+). However, in practice, Implied Probabilities seldom reach those limits.
plot_against_log_odds(variable = "Implied_Prob")
plot_against_log_odds(variable = "Adjust_Implied_Prob")
Note the y-axis scale: there is virtually no difference in the Log Odds of victory between the sexes.
plot_against_log_odds(variable = "Sex")
Favorites may be less likely to win in recent years. However, this may simply be an artifact of an unstable estimate for year 2013. (That point appears to have a lot of leverage.)
plot_against_log_odds(variable = "Year")
There may be a positive effect of Reach on Log Odds of victory.
plot_against_log_odds(variable = "Delta_REACH")
Assumptions
Here are the assumptions of logistic regression:
- First, binary logistic regression requires the dependent variable to be binary and ordinal logistic regression requires the dependent variable to be ordinal.
- Second, logistic regression requires the observations to be independent of each other. In other words, the observations should not come from repeated measurements or matched data.
- Third, logistic regression requires there to be little or no multicollinearity among the independent variables. This means that the independent variables should not be too highly correlated with each other.
- Fourth, logistic regression assumes linearity of independent variables and log odds. Although this analysis does not require the dependent and independent variables to be related linearly, it does require that the independent variables be linearly related to the log odds.
In our case, (1) is met; (2) should be met, though there could be random effects of Fighter or Event, etc., which would likely not impact prediction substantially but could influence an explanatory analysis (see the sketch below); (3) seems satisfactorily met (there are no major correlations); and (4) appears to be met (regardless of whether we use Log Odds or Implied Probability).
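Regarding (2), here is a minimal sketch of how a varying intercept per event could be fit with rstanarm's stan_glmer(). This is hypothetical - Event was not carried through into df_og_diff, so the call is shown commented out:
# hypothetical sketch, assuming an Event column were retained in df_og_diff:
# fit_re <- stan_glmer(
# Favorite_Won ~ Adjust_Log_Odds + (1 | Event)
# , family = binomial(link = "logit")
# , data = df_og_diff
# )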
Adjusted Log Odds of Implied Probability as Single Predictor
fit_1 <- stan_glm(Favorite_Won ~ Adjust_Log_Odds, family=binomial(link="logit"), data = df_og_diff)
##
## SAMPLING FOR MODEL 'bernoulli' NOW (CHAIN 1).
## Chain 1:
## Chain 1: Gradient evaluation took 0.000151 seconds
## Chain 1: 1000 transitions using 10 leapfrog steps per transition would take 1.51 seconds.
## Chain 1: Adjust your expectations accordingly!
## Chain 1:
## Chain 1:
## Chain 1: Iteration: 1 / 2000 [ 0%] (Warmup)
## Chain 1: Iteration: 200 / 2000 [ 10%] (Warmup)
## Chain 1: Iteration: 400 / 2000 [ 20%] (Warmup)
## Chain 1: Iteration: 600 / 2000 [ 30%] (Warmup)
## Chain 1: Iteration: 800 / 2000 [ 40%] (Warmup)
## Chain 1: Iteration: 1000 / 2000 [ 50%] (Warmup)
## Chain 1: Iteration: 1001 / 2000 [ 50%] (Sampling)
## Chain 1: Iteration: 1200 / 2000 [ 60%] (Sampling)
## Chain 1: Iteration: 1400 / 2000 [ 70%] (Sampling)
## Chain 1: Iteration: 1600 / 2000 [ 80%] (Sampling)
## Chain 1: Iteration: 1800 / 2000 [ 90%] (Sampling)
## Chain 1: Iteration: 2000 / 2000 [100%] (Sampling)
## Chain 1:
## Chain 1: Elapsed Time: 0.607679 seconds (Warm-up)
## Chain 1: 0.709751 seconds (Sampling)
## Chain 1: 1.31743 seconds (Total)
## Chain 1:
##
## SAMPLING FOR MODEL 'bernoulli' NOW (CHAIN 2).
## Chain 2:
## Chain 2: Gradient evaluation took 0.000102 seconds
## Chain 2: 1000 transitions using 10 leapfrog steps per transition would take 1.02 seconds.
## Chain 2: Adjust your expectations accordingly!
## Chain 2:
## Chain 2:
## Chain 2: Iteration: 1 / 2000 [ 0%] (Warmup)
## Chain 2: Iteration: 200 / 2000 [ 10%] (Warmup)
## Chain 2: Iteration: 400 / 2000 [ 20%] (Warmup)
## Chain 2: Iteration: 600 / 2000 [ 30%] (Warmup)
## Chain 2: Iteration: 800 / 2000 [ 40%] (Warmup)
## Chain 2: Iteration: 1000 / 2000 [ 50%] (Warmup)
## Chain 2: Iteration: 1001 / 2000 [ 50%] (Sampling)
## Chain 2: Iteration: 1200 / 2000 [ 60%] (Sampling)
## Chain 2: Iteration: 1400 / 2000 [ 70%] (Sampling)
## Chain 2: Iteration: 1600 / 2000 [ 80%] (Sampling)
## Chain 2: Iteration: 1800 / 2000 [ 90%] (Sampling)
## Chain 2: Iteration: 2000 / 2000 [100%] (Sampling)
## Chain 2:
## Chain 2: Elapsed Time: 0.580237 seconds (Warm-up)
## Chain 2: 0.631632 seconds (Sampling)
## Chain 2: 1.21187 seconds (Total)
## Chain 2:
##
## SAMPLING FOR MODEL 'bernoulli' NOW (CHAIN 3).
## Chain 3:
## Chain 3: Gradient evaluation took 9.8e-05 seconds
## Chain 3: 1000 transitions using 10 leapfrog steps per transition would take 0.98 seconds.
## Chain 3: Adjust your expectations accordingly!
## Chain 3:
## Chain 3:
## Chain 3: Iteration: 1 / 2000 [ 0%] (Warmup)
## Chain 3: Iteration: 200 / 2000 [ 10%] (Warmup)
## Chain 3: Iteration: 400 / 2000 [ 20%] (Warmup)
## Chain 3: Iteration: 600 / 2000 [ 30%] (Warmup)
## Chain 3: Iteration: 800 / 2000 [ 40%] (Warmup)
## Chain 3: Iteration: 1000 / 2000 [ 50%] (Warmup)
## Chain 3: Iteration: 1001 / 2000 [ 50%] (Sampling)
## Chain 3: Iteration: 1200 / 2000 [ 60%] (Sampling)
## Chain 3: Iteration: 1400 / 2000 [ 70%] (Sampling)
## Chain 3: Iteration: 1600 / 2000 [ 80%] (Sampling)
## Chain 3: Iteration: 1800 / 2000 [ 90%] (Sampling)
## Chain 3: Iteration: 2000 / 2000 [100%] (Sampling)
## Chain 3:
## Chain 3: Elapsed Time: 0.582118 seconds (Warm-up)
## Chain 3: 0.678056 seconds (Sampling)
## Chain 3: 1.26017 seconds (Total)
## Chain 3:
##
## SAMPLING FOR MODEL 'bernoulli' NOW (CHAIN 4).
## Chain 4:
## Chain 4: Gradient evaluation took 9.6e-05 seconds
## Chain 4: 1000 transitions using 10 leapfrog steps per transition would take 0.96 seconds.
## Chain 4: Adjust your expectations accordingly!
## Chain 4:
## Chain 4:
## Chain 4: Iteration: 1 / 2000 [ 0%] (Warmup)
## Chain 4: Iteration: 200 / 2000 [ 10%] (Warmup)
## Chain 4: Iteration: 400 / 2000 [ 20%] (Warmup)
## Chain 4: Iteration: 600 / 2000 [ 30%] (Warmup)
## Chain 4: Iteration: 800 / 2000 [ 40%] (Warmup)
## Chain 4: Iteration: 1000 / 2000 [ 50%] (Warmup)
## Chain 4: Iteration: 1001 / 2000 [ 50%] (Sampling)
## Chain 4: Iteration: 1200 / 2000 [ 60%] (Sampling)
## Chain 4: Iteration: 1400 / 2000 [ 70%] (Sampling)
## Chain 4: Iteration: 1600 / 2000 [ 80%] (Sampling)
## Chain 4: Iteration: 1800 / 2000 [ 90%] (Sampling)
## Chain 4: Iteration: 2000 / 2000 [100%] (Sampling)
## Chain 4:
## Chain 4: Elapsed Time: 0.534413 seconds (Warm-up)
## Chain 4: 0.592038 seconds (Sampling)
## Chain 4: 1.12645 seconds (Total)
## Chain 4:
print(fit_1)
## stan_glm
## family: binomial [logit]
## formula: Favorite_Won ~ Adjust_Log_Odds
## observations: 2988
## predictors: 2
## ------
## Median MAD_SD
## (Intercept) -0.2 0.1
## Adjust_Log_Odds 1.3 0.1
##
## ------
## * For help interpreting the printed output see ?print.stanreg
## * For info on the priors used see ?prior_summary.stanreg
Display uncertainty in the parameters.
sims_1 <- as.matrix(fit_1)
n_sims <- nrow(sims_1)
draws_1 <- sample(n_sims, 20)
curve(invlogit(sims_1[draws_1[1],1] + sims_1[draws_1[1],2]*x)
, from = 0
, to = 3
, col = "gray"
, lwd=0.5
, xlab="Adjusted Log Odds of Favorite"
, ylab = "Probability of Favorite Winning"
, main = "Random Draws from Parameter Simulations"
)
for (j in draws_1[2:20]) {
curve(invlogit(sims_1[j,1] + sims_1[j,2]*x)
, col = "gray"
, lwd=0.5
, add=TRUE
)
}
Now do the same with the backtransformed predictors (i.e. Adjusted Log Odds to Implied Probabilities). Here we can see that the negative intercept allows the model to capture the overperformance of large favorites and the underperformance of mild ones.
curve(invlogit(sims_1[draws_1[1],1] + sims_1[draws_1[1],2]*logit(x))
, from = 0.5
, to = 1
, col = "gray"
, lwd=0.5
, xlab="Adjusted Implied Probability of Favorite"
, ylab = "Probability of Favorite Winning"
, main = "Random Draws from Parameter Simulations"
)
for (j in draws_1[2:20]) {
curve(invlogit(sims_1[j,1] + sims_1[j,2]*logit(x))
, col = "gray"
, lwd=0.5
, add=TRUE
)
}
abline(a=0, b=1, col = "red")
Evaluate the intercept (i.e. with Adjusted Log Odds equal to zero). Note: an Implied Probability of 50% is technically just outside the range of the data.
b1 <- fit_1$coefficients[[1]]
b2 <- fit_1$coefficients[[2]]
invlogit(0)
## [1] 0.5
invlogit(b1 + b2 * 0)
## [1] 0.4525427
Now, with Adjusted Log Odds of 1/2 which is equivalent to about 62% Adjusted Implied Probability.
invlogit(0.5)
## [1] 0.6224593
invlogit(b1 + b2 * 0.5)
## [1] 0.607421
Now, with Adjusted Log Odds of 1 which is equivalent to about 73% Adjusted Implied Probability.
invlogit(1)
## [1] 0.7310586
invlogit(b1 + b2 * 1)
## [1] 0.7433343
Now, with Adjusted Log Odds of 3/2 which is equivalent to about 82% Adjusted Implied Probability.
invlogit(3/2)
## [1] 0.8175745
invlogit(b1 + b2 * 3/2)
## [1] 0.8442581
With the intercept, the model captures the fact that mild favorites underperform whereas large favorites overperform.
Now, we'll try the divide-by-4 rule. Indeed, the actual change in the Probability of the Favorite Winning (moving from 0 to 1 in Adjusted Log Odds) is just under the coefficient divided by 4.
b2/4
## [1] 0.3134434
invlogit(b1 + b2 * 1) - invlogit(b1 + b2 * 0)
## [1] 0.2907916
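The rule works because the logistic curve is steepest at its midpoint: the derivative of invlogit(x) is invlogit(x) * (1 - invlogit(x)), which peaks at 0.25 when x = 0, so a one-unit change in the predictor shifts the probability by at most b2/4.
# the logistic density at 0 gives the curve's maximum slope, 0.25
dlogis(0)
b2 * dlogis(0) # identical to b2 / 4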
Get point predictions using predict() and compare them to the Adjusted Log Odds to again get a sense of how the model accounts for underperformances and overperformances.
newx = seq(0, 3, 0.5)
new <- data.frame(Adjust_Log_Odds=newx)
pred <- predict(fit_1, type = "response", newdata = new)
new$BackTrans_Implied_Prob <- invlogit(newx)
new$Point_Pred <- pred
Get expected outcome with uncertainty.
epred <- posterior_epred(fit_1, newdata=new)
new$Means <- apply(epred, 2, mean)
new$SDs <- apply(epred, 2, sd)
Predictive distribution for a new observation. Taking the mean of the predictions gives results similar to those above.
postpred <- posterior_predict(fit_1, newdata=new)
new$Mean_Pred <- apply(postpred, 2, mean)
kable(new)
Adjust_Log_Odds | BackTrans_Implied_Prob | Point_Pred | Means | SDs | Mean_Pred |
---|---|---|---|---|---|
0.0 | 0.5000000 | 0.4523726 | 0.4523726 | 0.0168735 | 0.45050 |
0.5 | 0.6224593 | 0.6070073 | 0.6070073 | 0.0094670 | 0.61050 |
1.0 | 0.7310586 | 0.7427698 | 0.7427698 | 0.0104895 | 0.73500 |
1.5 | 0.8175745 | 0.8434727 | 0.8434727 | 0.0125003 | 0.84300 |
2.0 | 0.8807971 | 0.9093447 | 0.9093447 | 0.0115072 | 0.90775 |
2.5 | 0.9241418 | 0.9490398 | 0.9490398 | 0.0090143 | 0.95425 |
3.0 | 0.9525741 | 0.9718231 | 0.9718231 | 0.0064209 | 0.97575 |
Calculate log score for pure chance.
logscore_chance <- log(0.5) * length(fit_1$fitted.values)
logscore_chance
## [1] -2071.124
Calculate the log score for simply following the adjusted implied probabilities of the odds.
y <- fit_1$data$Favorite_Won
x <- fit_1$data$Adjust_Implied_Prob
logscore_bestodds <- sum(y * log(x) + (1-y)*log(1-x))
logscore_bestodds
## [1] -1844.482
Calculate log score for model.
predp_1 <- predict(fit_1, type = "response")
logscore_1 <- sum(y * log(predp_1) + (1-y)*log(1-predp_1))
logscore_1
## [1] -1840.294
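Since the same Bernoulli log score appears repeatedly, it can be wrapped in a small helper (a convenience sketch, equivalent to the sums above):
# helper encapsulating the Bernoulli log score used above
log_score <- function(y, p) sum(y * log(p) + (1 - y) * log(1 - p))
log_score(y, predp_1) # reproduces logscore_1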
Run leave-one-out cross-validation. There is about a two-point difference between the elpd_loo estimate and the within-sample log score, which makes sense since the fitted model has two parameters.
loo_1 <- loo(fit_1)
print(loo_1)
##
## Computed from 4000 by 2988 log-likelihood matrix
##
## Estimate SE
## elpd_loo -1842.3 19.8
## p_loo 2.0 0.1
## looic 3684.5 39.6
## ------
## Monte Carlo SE of elpd_loo is 0.0.
##
## All Pareto k estimates are good (k < 0.5).
## See help('pareto-k-diagnostic') for details.
Adjusted Implied Probability as Single Predictor
fit_2 <- stan_glm(Favorite_Won ~ Adjust_Implied_Prob, family=binomial(link="logit"), data = df_og_diff)
##
## SAMPLING FOR MODEL 'bernoulli' NOW (CHAIN 1).
## Chain 1:
## Chain 1: Gradient evaluation took 9.6e-05 seconds
## Chain 1: 1000 transitions using 10 leapfrog steps per transition would take 0.96 seconds.
## Chain 1: Adjust your expectations accordingly!
## Chain 1:
## Chain 1:
## Chain 1: Iteration: 1 / 2000 [ 0%] (Warmup)
## Chain 1: Iteration: 200 / 2000 [ 10%] (Warmup)
## Chain 1: Iteration: 400 / 2000 [ 20%] (Warmup)
## Chain 1: Iteration: 600 / 2000 [ 30%] (Warmup)
## Chain 1: Iteration: 800 / 2000 [ 40%] (Warmup)
## Chain 1: Iteration: 1000 / 2000 [ 50%] (Warmup)
## Chain 1: Iteration: 1001 / 2000 [ 50%] (Sampling)
## Chain 1: Iteration: 1200 / 2000 [ 60%] (Sampling)
## Chain 1: Iteration: 1400 / 2000 [ 70%] (Sampling)
## Chain 1: Iteration: 1600 / 2000 [ 80%] (Sampling)
## Chain 1: Iteration: 1800 / 2000 [ 90%] (Sampling)
## Chain 1: Iteration: 2000 / 2000 [100%] (Sampling)
## Chain 1:
## Chain 1: Elapsed Time: 0.537421 seconds (Warm-up)
## Chain 1: 0.608507 seconds (Sampling)
## Chain 1: 1.14593 seconds (Total)
## Chain 1:
##
## SAMPLING FOR MODEL 'bernoulli' NOW (CHAIN 2).
## Chain 2:
## Chain 2: Gradient evaluation took 8.8e-05 seconds
## Chain 2: 1000 transitions using 10 leapfrog steps per transition would take 0.88 seconds.
## Chain 2: Adjust your expectations accordingly!
## Chain 2:
## Chain 2:
## Chain 2: Iteration: 1 / 2000 [ 0%] (Warmup)
## Chain 2: Iteration: 200 / 2000 [ 10%] (Warmup)
## Chain 2: Iteration: 400 / 2000 [ 20%] (Warmup)
## Chain 2: Iteration: 600 / 2000 [ 30%] (Warmup)
## Chain 2: Iteration: 800 / 2000 [ 40%] (Warmup)
## Chain 2: Iteration: 1000 / 2000 [ 50%] (Warmup)
## Chain 2: Iteration: 1001 / 2000 [ 50%] (Sampling)
## Chain 2: Iteration: 1200 / 2000 [ 60%] (Sampling)
## Chain 2: Iteration: 1400 / 2000 [ 70%] (Sampling)
## Chain 2: Iteration: 1600 / 2000 [ 80%] (Sampling)
## Chain 2: Iteration: 1800 / 2000 [ 90%] (Sampling)
## Chain 2: Iteration: 2000 / 2000 [100%] (Sampling)
## Chain 2:
## Chain 2: Elapsed Time: 0.492744 seconds (Warm-up)
## Chain 2: 0.591973 seconds (Sampling)
## Chain 2: 1.08472 seconds (Total)
## Chain 2:
##
## SAMPLING FOR MODEL 'bernoulli' NOW (CHAIN 3).
## Chain 3:
## Chain 3: Gradient evaluation took 9.2e-05 seconds
## Chain 3: 1000 transitions using 10 leapfrog steps per transition would take 0.92 seconds.
## Chain 3: Adjust your expectations accordingly!
## Chain 3:
## Chain 3:
## Chain 3: Iteration: 1 / 2000 [ 0%] (Warmup)
## Chain 3: Iteration: 200 / 2000 [ 10%] (Warmup)
## Chain 3: Iteration: 400 / 2000 [ 20%] (Warmup)
## Chain 3: Iteration: 600 / 2000 [ 30%] (Warmup)
## Chain 3: Iteration: 800 / 2000 [ 40%] (Warmup)
## Chain 3: Iteration: 1000 / 2000 [ 50%] (Warmup)
## Chain 3: Iteration: 1001 / 2000 [ 50%] (Sampling)
## Chain 3: Iteration: 1200 / 2000 [ 60%] (Sampling)
## Chain 3: Iteration: 1400 / 2000 [ 70%] (Sampling)
## Chain 3: Iteration: 1600 / 2000 [ 80%] (Sampling)
## Chain 3: Iteration: 1800 / 2000 [ 90%] (Sampling)
## Chain 3: Iteration: 2000 / 2000 [100%] (Sampling)
## Chain 3:
## Chain 3: Elapsed Time: 0.526684 seconds (Warm-up)
## Chain 3: 0.577734 seconds (Sampling)
## Chain 3: 1.10442 seconds (Total)
## Chain 3:
##
## SAMPLING FOR MODEL 'bernoulli' NOW (CHAIN 4).
## Chain 4:
## Chain 4: Gradient evaluation took 9.1e-05 seconds
## Chain 4: 1000 transitions using 10 leapfrog steps per transition would take 0.91 seconds.
## Chain 4: Adjust your expectations accordingly!
## Chain 4:
## Chain 4:
## Chain 4: Iteration: 1 / 2000 [ 0%] (Warmup)
## Chain 4: Iteration: 200 / 2000 [ 10%] (Warmup)
## Chain 4: Iteration: 400 / 2000 [ 20%] (Warmup)
## Chain 4: Iteration: 600 / 2000 [ 30%] (Warmup)
## Chain 4: Iteration: 800 / 2000 [ 40%] (Warmup)
## Chain 4: Iteration: 1000 / 2000 [ 50%] (Warmup)
## Chain 4: Iteration: 1001 / 2000 [ 50%] (Sampling)
## Chain 4: Iteration: 1200 / 2000 [ 60%] (Sampling)
## Chain 4: Iteration: 1400 / 2000 [ 70%] (Sampling)
## Chain 4: Iteration: 1600 / 2000 [ 80%] (Sampling)
## Chain 4: Iteration: 1800 / 2000 [ 90%] (Sampling)
## Chain 4: Iteration: 2000 / 2000 [100%] (Sampling)
## Chain 4:
## Chain 4: Elapsed Time: 0.51957 seconds (Warm-up)
## Chain 4: 0.610287 seconds (Sampling)
## Chain 4: 1.12986 seconds (Total)
## Chain 4:
print(fit_2)
## stan_glm
## family: binomial [logit]
## formula: Favorite_Won ~ Adjust_Implied_Prob
## observations: 2988
## predictors: 2
## ------
## Median MAD_SD
## (Intercept) -3.2 0.3
## Adjust_Implied_Prob 5.9 0.4
##
## ------
## * For help interpreting the printed output see ?print.stanreg
## * For info on the priors used see ?prior_summary.stanreg
Display uncertainty in the parameters.
sims_2 <- as.matrix(fit_2)
n_sims_2 <- nrow(sims_2)
draws_2 <- sample(n_sims_2, 20)
curve(invlogit(sims_2[draws_2[1],1] + sims_2[draws_2[1],2]*x)
, from = 0.5
, to = 1
, col = "gray"
, lwd=0.5
, xlab="Adjusted Implied Probability of Favorite"
, ylab = "Probability of Favorite Winning"
, main = "Random Draws from Parameter Simulations"
)
for (j in draws_2[2:20]) {
curve(invlogit(sims_2[j,1] + sims_2[j,2]*x)
, col = "gray"
, lwd=0.5
, add=TRUE
)
}
abline(a=0, b=1, col = "red")
Get point predictions using predict() and compare them to the Adjusted Implied Probabilities to get a sense of how the model manages to account for underperformances and overperformances.
newx_2 = seq(0.5, 1, 0.1)
new_2 <- data.frame(Adjust_Implied_Prob=newx_2)
new_2$Point_Pred <- predict(fit_2, type = "response", newdata = new_2)
Get expected outcome with uncertainty.
epred_2 <- posterior_epred(fit_2, newdata=new_2)
new_2$Means <- apply(epred_2, 2, mean)
new_2$SDs <- apply(epred_2, 2, sd)
Predictive distribution for a new observation. Taking the mean of the predictions gives similar results to those above.
postpred_2 <- posterior_predict(fit_2, newdata=new_2)
new_2$Mean_Pred <- apply(postpred_2, 2, mean)
kable(new_2)
Adjust_Implied_Prob | Point_Pred | Means | SDs | Mean_Pred |
---|---|---|---|---|
0.5 | 0.4346352 | 0.4346352 | 0.0178472 | 0.43400 |
0.6 | 0.5812958 | 0.5812958 | 0.0106102 | 0.57450 |
0.7 | 0.7148974 | 0.7148974 | 0.0100209 | 0.72175 |
0.8 | 0.8189411 | 0.8189411 | 0.0122480 | 0.80925 |
0.9 | 0.8906131 | 0.8906131 | 0.0119773 | 0.88875 |
1.0 | 0.9359914 | 0.9359914 | 0.0099081 | 0.93300 |
Calculate log score for the second model (i.e. with Adjusted Implied Probabilities). The log score is marginally worse than the one using the Adjusted Log Odds (-1840.29).
predp_2 <- predict(fit_2, type = "response")
logscore_2 <- sum(y * log(predp_2) + (1-y)*log(1-predp_2))
logscore_2
## [1] -1841.472
Run leave-one-out cross-validation. Similarly, the elpd_loo is marginally worse than the one using the Adjusted Log Odds (-1842.27).
loo_2 <- loo(fit_2)
print(loo_2)
##
## Computed from 4000 by 2988 log-likelihood matrix
##
## Estimate SE
## elpd_loo -1843.5 19.7
## p_loo 2.0 0.1
## looic 3687.0 39.5
## ------
## Monte Carlo SE of elpd_loo is 0.0.
##
## All Pareto k estimates are good (k < 0.5).
## See help('pareto-k-diagnostic') for details.
The difference between using the Adjusted Log Odds and the Adjusted Implied Probabilities as a predictor is fairly small. However, using the Adjusted Log Odds makes more sense in principle and leads to marginally better performance. Also, it is trivially easy to back-transform the predictor for interpretability. Therefore, we will use the Adjusted Log Odds.
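For reference, the two predictor scales are linked by the same logit()/invlogit() helpers used in the plots above; a quick check on the first few rows:
invlogit(df_og_diff$Adjust_Log_Odds[1:5]) # Adjusted Log Odds -> Adjusted Implied Probability
logit(df_og_diff$Adjust_Implied_Prob[1:5]) # Adjusted Implied Probability -> Adjusted Log Odds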
Adjusted Log Odds with Squared Term
df_og_diff$Adjust_Log_Odds_sq = (df_og_diff$Adjust_Log_Odds)^2
fit_3 <- stan_glm(Favorite_Won ~ Adjust_Log_Odds + Adjust_Log_Odds_sq, family=binomial(link="logit"), data = df_og_diff)
##
## SAMPLING FOR MODEL 'bernoulli' NOW (CHAIN 1).
## Chain 1:
## Chain 1: Gradient evaluation took 9.6e-05 seconds
## Chain 1: 1000 transitions using 10 leapfrog steps per transition would take 0.96 seconds.
## Chain 1: Adjust your expectations accordingly!
## Chain 1:
## Chain 1:
## Chain 1: Iteration: 1 / 2000 [ 0%] (Warmup)
## Chain 1: Iteration: 200 / 2000 [ 10%] (Warmup)
## Chain 1: Iteration: 400 / 2000 [ 20%] (Warmup)
## Chain 1: Iteration: 600 / 2000 [ 30%] (Warmup)
## Chain 1: Iteration: 800 / 2000 [ 40%] (Warmup)
## Chain 1: Iteration: 1000 / 2000 [ 50%] (Warmup)
## Chain 1: Iteration: 1001 / 2000 [ 50%] (Sampling)
## Chain 1: Iteration: 1200 / 2000 [ 60%] (Sampling)
## Chain 1: Iteration: 1400 / 2000 [ 70%] (Sampling)
## Chain 1: Iteration: 1600 / 2000 [ 80%] (Sampling)
## Chain 1: Iteration: 1800 / 2000 [ 90%] (Sampling)
## Chain 1: Iteration: 2000 / 2000 [100%] (Sampling)
## Chain 1:
## Chain 1: Elapsed Time: 1.46779 seconds (Warm-up)
## Chain 1: 1.4207 seconds (Sampling)
## Chain 1: 2.88849 seconds (Total)
## Chain 1:
##
## SAMPLING FOR MODEL 'bernoulli' NOW (CHAIN 2).
## Chain 2:
## Chain 2: Gradient evaluation took 9.6e-05 seconds
## Chain 2: 1000 transitions using 10 leapfrog steps per transition would take 0.96 seconds.
## Chain 2: Adjust your expectations accordingly!
## Chain 2:
## Chain 2:
## Chain 2: Iteration: 1 / 2000 [ 0%] (Warmup)
## Chain 2: Iteration: 200 / 2000 [ 10%] (Warmup)
## Chain 2: Iteration: 400 / 2000 [ 20%] (Warmup)
## Chain 2: Iteration: 600 / 2000 [ 30%] (Warmup)
## Chain 2: Iteration: 800 / 2000 [ 40%] (Warmup)
## Chain 2: Iteration: 1000 / 2000 [ 50%] (Warmup)
## Chain 2: Iteration: 1001 / 2000 [ 50%] (Sampling)
## Chain 2: Iteration: 1200 / 2000 [ 60%] (Sampling)
## Chain 2: Iteration: 1400 / 2000 [ 70%] (Sampling)
## Chain 2: Iteration: 1600 / 2000 [ 80%] (Sampling)
## Chain 2: Iteration: 1800 / 2000 [ 90%] (Sampling)
## Chain 2: Iteration: 2000 / 2000 [100%] (Sampling)
## Chain 2:
## Chain 2: Elapsed Time: 1.3878 seconds (Warm-up)
## Chain 2: 1.53574 seconds (Sampling)
## Chain 2: 2.92354 seconds (Total)
## Chain 2:
##
## SAMPLING FOR MODEL 'bernoulli' NOW (CHAIN 3).
## Chain 3:
## Chain 3: Gradient evaluation took 9.4e-05 seconds
## Chain 3: 1000 transitions using 10 leapfrog steps per transition would take 0.94 seconds.
## Chain 3: Adjust your expectations accordingly!
## Chain 3:
## Chain 3:
## Chain 3: Iteration: 1 / 2000 [ 0%] (Warmup)
## Chain 3: Iteration: 200 / 2000 [ 10%] (Warmup)
## Chain 3: Iteration: 400 / 2000 [ 20%] (Warmup)
## Chain 3: Iteration: 600 / 2000 [ 30%] (Warmup)
## Chain 3: Iteration: 800 / 2000 [ 40%] (Warmup)
## Chain 3: Iteration: 1000 / 2000 [ 50%] (Warmup)
## Chain 3: Iteration: 1001 / 2000 [ 50%] (Sampling)
## Chain 3: Iteration: 1200 / 2000 [ 60%] (Sampling)
## Chain 3: Iteration: 1400 / 2000 [ 70%] (Sampling)
## Chain 3: Iteration: 1600 / 2000 [ 80%] (Sampling)
## Chain 3: Iteration: 1800 / 2000 [ 90%] (Sampling)
## Chain 3: Iteration: 2000 / 2000 [100%] (Sampling)
## Chain 3:
## Chain 3: Elapsed Time: 1.48475 seconds (Warm-up)
## Chain 3: 1.52879 seconds (Sampling)
## Chain 3: 3.01354 seconds (Total)
## Chain 3:
##
## SAMPLING FOR MODEL 'bernoulli' NOW (CHAIN 4).
## Chain 4:
## Chain 4: Gradient evaluation took 9.9e-05 seconds
## Chain 4: 1000 transitions using 10 leapfrog steps per transition would take 0.99 seconds.
## Chain 4: Adjust your expectations accordingly!
## Chain 4:
## Chain 4:
## Chain 4: Iteration: 1 / 2000 [ 0%] (Warmup)
## Chain 4: Iteration: 200 / 2000 [ 10%] (Warmup)
## Chain 4: Iteration: 400 / 2000 [ 20%] (Warmup)
## Chain 4: Iteration: 600 / 2000 [ 30%] (Warmup)
## Chain 4: Iteration: 800 / 2000 [ 40%] (Warmup)
## Chain 4: Iteration: 1000 / 2000 [ 50%] (Warmup)
## Chain 4: Iteration: 1001 / 2000 [ 50%] (Sampling)
## Chain 4: Iteration: 1200 / 2000 [ 60%] (Sampling)
## Chain 4: Iteration: 1400 / 2000 [ 70%] (Sampling)
## Chain 4: Iteration: 1600 / 2000 [ 80%] (Sampling)
## Chain 4: Iteration: 1800 / 2000 [ 90%] (Sampling)
## Chain 4: Iteration: 2000 / 2000 [100%] (Sampling)
## Chain 4:
## Chain 4: Elapsed Time: 1.42941 seconds (Warm-up)
## Chain 4: 1.57456 seconds (Sampling)
## Chain 4: 3.00396 seconds (Total)
## Chain 4:
print(fit_3)
## stan_glm
## family: binomial [logit]
## formula: Favorite_Won ~ Adjust_Log_Odds + Adjust_Log_Odds_sq
## observations: 2988
## predictors: 3
## ------
## Median MAD_SD
## (Intercept) -0.2 0.1
## Adjust_Log_Odds 1.1 0.3
## Adjust_Log_Odds_sq 0.1 0.2
##
## ------
## * For help interpreting the printed output see ?print.stanreg
## * For info on the priors used see ?prior_summary.stanreg
Display uncertainty in the parameters. Take 40 draws to capture the strange behavior of some of the draws on the implied probability scale.
sims_3 <- as.matrix(fit_3)
n_sims_3 <- nrow(sims_3)
draws_3 <- sample(n_sims_3, 40)
curve(invlogit(sims_3[draws_3[1],1] + sims_3[draws_3[1],2]*x + sims_3[draws_3[1],3]*(x^2))
, from = 0
, to = 3
, col = "gray"
, lwd=0.5
, xlab="Adjusted Log Odds of Favorite"
, ylab = "Probability of Favorite Winning"
, main = "Random Draws from Parameter Simulations"
)
for (j in draws_3[2:40]) {
curve(invlogit(sims_3[j,1] + sims_3[j,2]*x + sims_3[j,3]*(x^2))
, col = "gray"
, lwd=0.5
, add=TRUE
)
}
Now do the same with the back-transformed predictor (i.e. Adjusted Log Odds mapped back to Implied Probabilities). Some of the draws exhibit strange behavior toward the upper limit of the implied probabilities (i.e. close to 1), whereby they dip dramatically toward lower outcome values.
curve(invlogit(sims_3[draws_3[1],1] + sims_3[draws_3[1],2]*logit(x) + sims_3[draws_3[1],3]*(logit(x)^2))
, from = 0.5
, to = 1
, col = "gray"
, lwd=0.5
, xlab="Adjusted Implied Probability of Favorite"
, ylab = "Probability of Favorite Winning"
, main = "Random Draws from Parameter Simulations"
)
for (j in draws_3[2:40]) {
curve(invlogit(sims_3[j,1] + sims_3[j,2]*logit(x) + sims_3[j,3]*(logit(x)^2))
, col = "gray"
, lwd=0.5
, add=TRUE
)
}
abline(a=0, b=1, col = "red")
Run leave-one-out cross-validation. The model with the squared term performs about as well as the model with the Adjusted Implied Probabilities. Therefore, we will stick with the basic single-predictor model using the Adjusted Log Odds.
loo_3 <- loo(fit_3)
print(loo_3)
##
## Computed from 4000 by 2988 log-likelihood matrix
##
## Estimate SE
## elpd_loo -1843.6 19.9
## p_loo 3.5 0.4
## looic 3687.1 39.9
## ------
## Monte Carlo SE of elpd_loo is 0.0.
##
## All Pareto k estimates are good (k < 0.5).
## See help('pareto-k-diagnostic') for details.
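The loo package's loo_compare() formalizes these pairwise comparisons by reporting the elpd difference and its standard error; a minimal sketch (output not shown here):
loo_compare(loo_1, loo_2, loo_3)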
Reach as Lone Predictor
We run the model with just Reach as a predictor. There are missing Reach values that we have not replaced; indeed, the number of observations is reduced compared to the models above (2777 vs. 2988).
fit_4 <- stan_glm(Favorite_Won ~ Delta_REACH, family=binomial(link="logit"), data = df_og_diff)
##
## SAMPLING FOR MODEL 'bernoulli' NOW (CHAIN 1).
## Chain 1:
## Chain 1: Gradient evaluation took 8.6e-05 seconds
## Chain 1: 1000 transitions using 10 leapfrog steps per transition would take 0.86 seconds.
## Chain 1: Adjust your expectations accordingly!
## Chain 1:
## Chain 1:
## Chain 1: Iteration: 1 / 2000 [ 0%] (Warmup)
## Chain 1: Iteration: 200 / 2000 [ 10%] (Warmup)
## Chain 1: Iteration: 400 / 2000 [ 20%] (Warmup)
## Chain 1: Iteration: 600 / 2000 [ 30%] (Warmup)
## Chain 1: Iteration: 800 / 2000 [ 40%] (Warmup)
## Chain 1: Iteration: 1000 / 2000 [ 50%] (Warmup)
## Chain 1: Iteration: 1001 / 2000 [ 50%] (Sampling)
## Chain 1: Iteration: 1200 / 2000 [ 60%] (Sampling)
## Chain 1: Iteration: 1400 / 2000 [ 70%] (Sampling)
## Chain 1: Iteration: 1600 / 2000 [ 80%] (Sampling)
## Chain 1: Iteration: 1800 / 2000 [ 90%] (Sampling)
## Chain 1: Iteration: 2000 / 2000 [100%] (Sampling)
## Chain 1:
## Chain 1: Elapsed Time: 0.470981 seconds (Warm-up)
## Chain 1: 0.51835 seconds (Sampling)
## Chain 1: 0.989331 seconds (Total)
## Chain 1:
##
## SAMPLING FOR MODEL 'bernoulli' NOW (CHAIN 2).
## Chain 2:
## Chain 2: Gradient evaluation took 8.9e-05 seconds
## Chain 2: 1000 transitions using 10 leapfrog steps per transition would take 0.89 seconds.
## Chain 2: Adjust your expectations accordingly!
## Chain 2:
## Chain 2:
## Chain 2: Iteration: 1 / 2000 [ 0%] (Warmup)
## Chain 2: Iteration: 200 / 2000 [ 10%] (Warmup)
## Chain 2: Iteration: 400 / 2000 [ 20%] (Warmup)
## Chain 2: Iteration: 600 / 2000 [ 30%] (Warmup)
## Chain 2: Iteration: 800 / 2000 [ 40%] (Warmup)
## Chain 2: Iteration: 1000 / 2000 [ 50%] (Warmup)
## Chain 2: Iteration: 1001 / 2000 [ 50%] (Sampling)
## Chain 2: Iteration: 1200 / 2000 [ 60%] (Sampling)
## Chain 2: Iteration: 1400 / 2000 [ 70%] (Sampling)
## Chain 2: Iteration: 1600 / 2000 [ 80%] (Sampling)
## Chain 2: Iteration: 1800 / 2000 [ 90%] (Sampling)
## Chain 2: Iteration: 2000 / 2000 [100%] (Sampling)
## Chain 2:
## Chain 2: Elapsed Time: 0.459119 seconds (Warm-up)
## Chain 2: 0.506495 seconds (Sampling)
## Chain 2: 0.965614 seconds (Total)
## Chain 2:
##
## SAMPLING FOR MODEL 'bernoulli' NOW (CHAIN 3).
## Chain 3:
## Chain 3: Gradient evaluation took 8.4e-05 seconds
## Chain 3: 1000 transitions using 10 leapfrog steps per transition would take 0.84 seconds.
## Chain 3: Adjust your expectations accordingly!
## Chain 3:
## Chain 3:
## Chain 3: Iteration: 1 / 2000 [ 0%] (Warmup)
## Chain 3: Iteration: 200 / 2000 [ 10%] (Warmup)
## Chain 3: Iteration: 400 / 2000 [ 20%] (Warmup)
## Chain 3: Iteration: 600 / 2000 [ 30%] (Warmup)
## Chain 3: Iteration: 800 / 2000 [ 40%] (Warmup)
## Chain 3: Iteration: 1000 / 2000 [ 50%] (Warmup)
## Chain 3: Iteration: 1001 / 2000 [ 50%] (Sampling)
## Chain 3: Iteration: 1200 / 2000 [ 60%] (Sampling)
## Chain 3: Iteration: 1400 / 2000 [ 70%] (Sampling)
## Chain 3: Iteration: 1600 / 2000 [ 80%] (Sampling)
## Chain 3: Iteration: 1800 / 2000 [ 90%] (Sampling)
## Chain 3: Iteration: 2000 / 2000 [100%] (Sampling)
## Chain 3:
## Chain 3: Elapsed Time: 0.458516 seconds (Warm-up)
## Chain 3: 0.478289 seconds (Sampling)
## Chain 3: 0.936805 seconds (Total)
## Chain 3:
##
## SAMPLING FOR MODEL 'bernoulli' NOW (CHAIN 4).
## Chain 4:
## Chain 4: Gradient evaluation took 9.1e-05 seconds
## Chain 4: 1000 transitions using 10 leapfrog steps per transition would take 0.91 seconds.
## Chain 4: Adjust your expectations accordingly!
## Chain 4:
## Chain 4:
## Chain 4: Iteration: 1 / 2000 [ 0%] (Warmup)
## Chain 4: Iteration: 200 / 2000 [ 10%] (Warmup)
## Chain 4: Iteration: 400 / 2000 [ 20%] (Warmup)
## Chain 4: Iteration: 600 / 2000 [ 30%] (Warmup)
## Chain 4: Iteration: 800 / 2000 [ 40%] (Warmup)
## Chain 4: Iteration: 1000 / 2000 [ 50%] (Warmup)
## Chain 4: Iteration: 1001 / 2000 [ 50%] (Sampling)
## Chain 4: Iteration: 1200 / 2000 [ 60%] (Sampling)
## Chain 4: Iteration: 1400 / 2000 [ 70%] (Sampling)
## Chain 4: Iteration: 1600 / 2000 [ 80%] (Sampling)
## Chain 4: Iteration: 1800 / 2000 [ 90%] (Sampling)
## Chain 4: Iteration: 2000 / 2000 [100%] (Sampling)
## Chain 4:
## Chain 4: Elapsed Time: 0.4651 seconds (Warm-up)
## Chain 4: 0.484291 seconds (Sampling)
## Chain 4: 0.949391 seconds (Total)
## Chain 4:
print(fit_4)
## stan_glm
## family: binomial [logit]
## formula: Favorite_Won ~ Delta_REACH
## observations: 2777
## predictors: 2
## ------
## Median MAD_SD
## (Intercept) 0.6 0.0
## Delta_REACH 0.0 0.0
##
## ------
## * For help interpreting the printed output see ?print.stanreg
## * For info on the priors used see ?prior_summary.stanreg
We need additional decimal places for the coefficients and SEs. It looks like there is a positive effect of Reach: fighters with longer reaches than their opponents have a greater probability of victory.
fit_4$coefficients
## (Intercept) Delta_REACH
## 0.56153475 0.03062914
fit_4$ses
## (Intercept) Delta_REACH
## 0.03856440 0.01229028
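A posterior interval gives another view of the uncertainty in the Reach effect; a minimal sketch using rstanarm's posterior_interval() (output not shown here):
posterior_interval(fit_4, prob = 0.95, pars = "Delta_REACH")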
Display uncertainty in the parameters. The graph below gives a sense of the uncertainty as well as the magnitude of the effect. Absent any other information, the effect of Reach does appear to be positive.
sims_4 <- as.matrix(fit_4)
n_sims_4 <- nrow(sims_4)
draws_4 <- sample(n_sims_4, 20)
curve(invlogit(sims_4[draws_4[1],1] + sims_4[draws_4[1],2]*x)
, from = -15
, to = 15
, col = "gray"
, lwd=0.5
, xlab="Difference in Reach (Favorite - Underdog; inches)"
, ylab = "Probability of Favorite Winning"
, main = "Random Draws from Parameter Simulations"
)
for (j in draws_4[2:20]) {
curve(invlogit(sims_4[j,1] + sims_4[j,2]*x)
, col = "gray"
, lwd=0.5
, add=TRUE
)
}
Evaluate intercept (i.e. with Difference in Reach equal to zero). This should approximately represent the average Probability of Victory of the favorite, although not exactly since Favorites have slightly greater reaches than their opponents on average (a little over a quarter inch).
b1_4 <- fit_4$coefficients[[1]]
b2_4 <- fit_4$coefficients[[2]]
invlogit(b1_4 + b2_4 * 0)
## [1] 0.6368076
mean(fit_4$data$Delta_REACH, na.rm = T)
## [1] 0.2747569
Now, we’ll try the divide-by-4 rule (dividing a logistic regression coefficient by 4 gives an upper bound on the change in probability per unit change in the predictor) for reach differences ranging from 1 to 5 inches.
b2_4/4 * c(1:5)
## [1] 0.007657285 0.015314570 0.022971854 0.030629139 0.038286424
b2_4_5 <- round(b2_4/4 * 5 * 100)
b2_4_1 <- round(b2_4/4 * 1 * 100, 1)
It looks like those with a 5-inch reach advantage may have about a 4-percentage-point greater probability of victory than those with no reach advantage; each one-inch reach advantage corresponds to approximately a 0.8-percentage-point increase in probability of victory.
Get point predictions using predict(). As with the divide-by-4 rule, a 5-inch Reach advantage is associated with almost a 4-percentage-point increase in the Probability of the Favorite Winning.
newx_4 = seq(-10, 10, 1)
new_4 <- data.frame(Delta_REACH=newx_4)
new_4$Point_Pred <- predict(fit_4, type = "response", newdata = new_4)
min_point_pred <- round(min(new_4$Point_Pred)*100)
max_point_pred <- round(max(new_4$Point_Pred)*100)
diff_point_pred <- max_point_pred - min_point_pred
With the most extreme comparison, from a -10 inch to a +10 inch Difference in Reach, the Probability of Victory jumps from around 56% to 70% - a 14-percentage-point difference. However, as we could see from the parameter draws, there is a lot of uncertainty around those more extreme estimates.
Get expected outcome with uncertainty. The uncertainty in the estimate increases with larger Reach Difference values, as expected. However, for Reach Differences within 5 inches, the SDs for the Probability of Victory are below 2 percentage points.
epred_4 <- posterior_epred(fit_4, newdata=new_4)
new_4$Means <- apply(epred_4, 2, mean)
new_4$SDs <- apply(epred_4, 2, sd)
Predictive distribution for a new observation. Taking the mean of the predictions gives similar results to those above.
postpred_4 <- posterior_predict(fit_4, newdata=new_4)
new_4$Mean_Pred <- apply(postpred_4, 2, mean)
kable(new_4)
Delta_REACH | Point_Pred | Means | SDs | Mean_Pred |
---|---|---|---|---|
-10 | 0.5625262 | 0.5625262 | 0.0321655 | 0.56400 |
-9 | 0.5701204 | 0.5701204 | 0.0292140 | 0.57350 |
-8 | 0.5776871 | 0.5776871 | 0.0263011 | 0.58275 |
-7 | 0.5852210 | 0.5852210 | 0.0234433 | 0.57700 |
-6 | 0.5927171 | 0.5927171 | 0.0206623 | 0.59700 |
-5 | 0.6001704 | 0.6001704 | 0.0179895 | 0.59175 |
-4 | 0.6075762 | 0.6075762 | 0.0154719 | 0.60250 |
-3 | 0.6149296 | 0.6149296 | 0.0131836 | 0.61700 |
-2 | 0.6222262 | 0.6222262 | 0.0112430 | 0.62250 |
-1 | 0.6294616 | 0.6294616 | 0.0098272 | 0.62050 |
0 | 0.6366315 | 0.6366315 | 0.0091447 | 0.62875 |
1 | 0.6437321 | 0.6437321 | 0.0093161 | 0.63800 |
2 | 0.6507594 | 0.6507594 | 0.0102582 | 0.64225 |
3 | 0.6577100 | 0.6577100 | 0.0117494 | 0.65525 |
4 | 0.6645804 | 0.6645804 | 0.0135759 | 0.66275 |
5 | 0.6713676 | 0.6713676 | 0.0155891 | 0.66475 |
6 | 0.6780687 | 0.6780687 | 0.0176971 | 0.68675 |
7 | 0.6846809 | 0.6846809 | 0.0198441 | 0.67525 |
8 | 0.6912019 | 0.6912019 | 0.0219955 | 0.69250 |
9 | 0.6976294 | 0.6976294 | 0.0241289 | 0.69825 |
10 | 0.7039615 | 0.7039615 | 0.0262296 | 0.70975 |
Calculate log score for pure chance.
logscore_chance_4 <- log(0.5) * length(fit_4$fitted.values)
logscore_chance_4
## [1] -1924.87
Calculate log score for model. The simple model with Reach clearly outperforms chance.
y_4 <- fit_4$model$Favorite_Won
predp_4 <- predict(fit_4, type = "response")
logscore_4 <- sum(y_4 * log(predp_4) + (1-y_4)*log(1-predp_4))
logscore_4
## [1] -1814.43
However, does the model outperform picking the favorite at the base rate of victory (i.e. intercept only)?
base_rate_4 <- mean(fit_4$model$Favorite_Won)
logscore_inter_4 <- sum(y_4 * log(base_rate_4) + (1-y_4)*log(1-base_rate_4))
logscore_inter_4
## [1] -1817.558
score_diff_inter_4 <- round(logscore_4 - logscore_inter_4,2)
It does, but not by much - only by 3.13 log-score points - which suggests adding Reach as a predictor only minimally improves the predictions.
Run leave-one-out cross-validation. We don’t have another model to compare it to, but we can see that elpd_loo is about two points more negative than the log-score calculation, which is not surprising given that the model has two parameters.
loo_4 <- loo(fit_4)
print(loo_4)
##
## Computed from 4000 by 2777 log-likelihood matrix
##
## Estimate SE
## elpd_loo -1816.4 14.6
## p_loo 2.0 0.0
## looic 3632.8 29.2
## ------
## Monte Carlo SE of elpd_loo is 0.0.
##
## All Pareto k estimates are good (k < 0.5).
## See help('pareto-k-diagnostic') for details.
Impute Missing Reach Values for Two Predictor Model
Do simple random imputation.
We want to avoid removing cases with missing Reach values since, as we saw earlier, those cases overrepresent underperformances and underdogs. Therefore, if we were to remove them, we might bias the results and distort the relationship between the Adjusted Fight Odds and the outcome variable (Probability of the Favorite Winning).
NOTE: With a more complex model, with multiple predictors and missing values across them, we may consider multiple imputation.
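Not run - a hypothetical sketch of what multiple imputation with Amelia (loaded above) might look like in that setting, fitting the model once per imputed dataset. The idvars and noms arguments mark ID columns to carry along unchanged and nominal variables, respectively:
# a_out <- amelia(df_og_diff, m = 5, idvars = "fight_id", noms = "Sex")
# fits <- lapply(a_out$imputations, function(d)
#   stan_glm(Favorite_Won ~ Adjust_Log_Odds + Delta_REACH,
#            family = binomial(link = "logit"), data = d))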
Based on the summary below, the distribution of the imputed Difference in Reach values is virtually the same as the original one (i.e. with NAs excluded). This is as expected, especially since we only replaced approximately 7% of Reach entries (211 NAs).
random_imp <- function(a) {
  missing <- is.na(a)                           # flag missing entries
  n_missing <- sum(missing)
  a_obs <- a[!missing]                          # pool of observed values
  imputed <- a
  imputed[missing] <- sample(a_obs, n_missing)  # fill NAs with random draws from the observed values
  return(imputed)
}
df_og_diff$Delta_REACH_imp <- random_imp(df_og_diff$Delta_REACH)
summary(df_og_diff)
## fight_id Favorite_Won Sex Year Delta_REACH
## 1 : 1 Mode :logical Female: 385 Min. :2013 Min. :-10.0000
## 2 : 1 FALSE:1057 Male :2603 1st Qu.:2015 1st Qu.: -2.0000
## 3 : 1 TRUE :1931 Median :2017 Median : 0.0000
## 4 : 1 Mean :2017 Mean : 0.2748
## 5 : 1 3rd Qu.:2019 3rd Qu.: 2.0000
## 6 : 1 Max. :2021 Max. : 12.0000
## (Other):2982 NA's :211
## Log_Odds Implied_Prob Adjust_Implied_Prob Adjust_Log_Odds
## Min. :-0.4055 Min. :0.4000 Min. :0.5012 Min. :0.004782
## 1st Qu.: 0.3147 1st Qu.:0.5780 1st Qu.:0.5778 1st Qu.:0.313771
## Median : 0.5798 Median :0.6410 Median :0.6406 Median :0.577952
## Mean : 0.6948 Mean :0.6576 Mean :0.6553 Mean :0.682173
## 3rd Qu.: 0.9943 3rd Qu.:0.7299 3rd Qu.:0.7261 3rd Qu.:0.974797
## Max. : 2.8134 Max. :0.9434 Max. :0.9327 Max. :2.628868
##
## Adjust_Log_Odds_sq Delta_REACH_imp
## Min. :0.000023 Min. :-10.0000
## 1st Qu.:0.098452 1st Qu.: -2.0000
## Median :0.334029 Median : 0.0000
## Mean :0.699743 Mean : 0.2734
## 3rd Qu.:0.950229 3rd Qu.: 2.0000
## Max. :6.910949 Max. : 12.0000
##
Two Predictor Model: Imputed Reach and Adjusted Log Odds
fit_5 <- stan_glm(Favorite_Won ~ Adjust_Log_Odds + Delta_REACH_imp
, family=binomial(link="logit")
, data = df_og_diff
)
##
## SAMPLING FOR MODEL 'bernoulli' NOW (CHAIN 1).
## Chain 1:
## Chain 1: Gradient evaluation took 0.000108 seconds
## Chain 1: 1000 transitions using 10 leapfrog steps per transition would take 1.08 seconds.
## Chain 1: Adjust your expectations accordingly!
## Chain 1:
## Chain 1:
## Chain 1: Iteration: 1 / 2000 [ 0%] (Warmup)
## Chain 1: Iteration: 200 / 2000 [ 10%] (Warmup)
## Chain 1: Iteration: 400 / 2000 [ 20%] (Warmup)
## Chain 1: Iteration: 600 / 2000 [ 30%] (Warmup)
## Chain 1: Iteration: 800 / 2000 [ 40%] (Warmup)
## Chain 1: Iteration: 1000 / 2000 [ 50%] (Warmup)
## Chain 1: Iteration: 1001 / 2000 [ 50%] (Sampling)
## Chain 1: Iteration: 1200 / 2000 [ 60%] (Sampling)
## Chain 1: Iteration: 1400 / 2000 [ 70%] (Sampling)
## Chain 1: Iteration: 1600 / 2000 [ 80%] (Sampling)
## Chain 1: Iteration: 1800 / 2000 [ 90%] (Sampling)
## Chain 1: Iteration: 2000 / 2000 [100%] (Sampling)
## Chain 1:
## Chain 1: Elapsed Time: 0.618832 seconds (Warm-up)
## Chain 1: 0.706744 seconds (Sampling)
## Chain 1: 1.32558 seconds (Total)
## Chain 1:
##
## SAMPLING FOR MODEL 'bernoulli' NOW (CHAIN 2).
## Chain 2:
## Chain 2: Gradient evaluation took 0.000102 seconds
## Chain 2: 1000 transitions using 10 leapfrog steps per transition would take 1.02 seconds.
## Chain 2: Adjust your expectations accordingly!
## Chain 2:
## Chain 2:
## Chain 2: Iteration: 1 / 2000 [ 0%] (Warmup)
## Chain 2: Iteration: 200 / 2000 [ 10%] (Warmup)
## Chain 2: Iteration: 400 / 2000 [ 20%] (Warmup)
## Chain 2: Iteration: 600 / 2000 [ 30%] (Warmup)
## Chain 2: Iteration: 800 / 2000 [ 40%] (Warmup)
## Chain 2: Iteration: 1000 / 2000 [ 50%] (Warmup)
## Chain 2: Iteration: 1001 / 2000 [ 50%] (Sampling)
## Chain 2: Iteration: 1200 / 2000 [ 60%] (Sampling)
## Chain 2: Iteration: 1400 / 2000 [ 70%] (Sampling)
## Chain 2: Iteration: 1600 / 2000 [ 80%] (Sampling)
## Chain 2: Iteration: 1800 / 2000 [ 90%] (Sampling)
## Chain 2: Iteration: 2000 / 2000 [100%] (Sampling)
## Chain 2:
## Chain 2: Elapsed Time: 0.662914 seconds (Warm-up)
## Chain 2: 0.699454 seconds (Sampling)
## Chain 2: 1.36237 seconds (Total)
## Chain 2:
##
## SAMPLING FOR MODEL 'bernoulli' NOW (CHAIN 3).
## Chain 3:
## Chain 3: Gradient evaluation took 0.000103 seconds
## Chain 3: 1000 transitions using 10 leapfrog steps per transition would take 1.03 seconds.
## Chain 3: Adjust your expectations accordingly!
## Chain 3:
## Chain 3:
## Chain 3: Iteration: 1 / 2000 [ 0%] (Warmup)
## Chain 3: Iteration: 200 / 2000 [ 10%] (Warmup)
## Chain 3: Iteration: 400 / 2000 [ 20%] (Warmup)
## Chain 3: Iteration: 600 / 2000 [ 30%] (Warmup)
## Chain 3: Iteration: 800 / 2000 [ 40%] (Warmup)
## Chain 3: Iteration: 1000 / 2000 [ 50%] (Warmup)
## Chain 3: Iteration: 1001 / 2000 [ 50%] (Sampling)
## Chain 3: Iteration: 1200 / 2000 [ 60%] (Sampling)
## Chain 3: Iteration: 1400 / 2000 [ 70%] (Sampling)
## Chain 3: Iteration: 1600 / 2000 [ 80%] (Sampling)
## Chain 3: Iteration: 1800 / 2000 [ 90%] (Sampling)
## Chain 3: Iteration: 2000 / 2000 [100%] (Sampling)
## Chain 3:
## Chain 3: Elapsed Time: 0.61107 seconds (Warm-up)
## Chain 3: 0.70918 seconds (Sampling)
## Chain 3: 1.32025 seconds (Total)
## Chain 3:
##
## SAMPLING FOR MODEL 'bernoulli' NOW (CHAIN 4).
## Chain 4:
## Chain 4: Gradient evaluation took 9.4e-05 seconds
## Chain 4: 1000 transitions using 10 leapfrog steps per transition would take 0.94 seconds.
## Chain 4: Adjust your expectations accordingly!
## Chain 4:
## Chain 4:
## Chain 4: Iteration: 1 / 2000 [ 0%] (Warmup)
## Chain 4: Iteration: 200 / 2000 [ 10%] (Warmup)
## Chain 4: Iteration: 400 / 2000 [ 20%] (Warmup)
## Chain 4: Iteration: 600 / 2000 [ 30%] (Warmup)
## Chain 4: Iteration: 800 / 2000 [ 40%] (Warmup)
## Chain 4: Iteration: 1000 / 2000 [ 50%] (Warmup)
## Chain 4: Iteration: 1001 / 2000 [ 50%] (Sampling)
## Chain 4: Iteration: 1200 / 2000 [ 60%] (Sampling)
## Chain 4: Iteration: 1400 / 2000 [ 70%] (Sampling)
## Chain 4: Iteration: 1600 / 2000 [ 80%] (Sampling)
## Chain 4: Iteration: 1800 / 2000 [ 90%] (Sampling)
## Chain 4: Iteration: 2000 / 2000 [100%] (Sampling)
## Chain 4:
## Chain 4: Elapsed Time: 0.615998 seconds (Warm-up)
## Chain 4: 0.691082 seconds (Sampling)
## Chain 4: 1.30708 seconds (Total)
## Chain 4:
print(fit_5)
## stan_glm
## family: binomial [logit]
## formula: Favorite_Won ~ Adjust_Log_Odds + Delta_REACH_imp
## observations: 2988
## predictors: 3
## ------
## Median MAD_SD
## (Intercept) -0.2 0.1
## Adjust_Log_Odds 1.2 0.1
## Delta_REACH_imp 0.0 0.0
##
## ------
## * For help interpreting the printed output see ?print.stanreg
## * For info on the priors used see ?prior_summary.stanreg
Get a better sense of the coefficients. Unsurprisingly, Adjusted Log Odds still has a large effect. The (imputed) Difference in Reach still seems to have an effect; its magnitude decreased a bit, but its standard error stayed about the same compared to the model with just Reach.
fit_5$coefficients
## (Intercept) Adjust_Log_Odds Delta_REACH_imp
## -0.19006516 1.24198698 0.02357245
fit_5$ses
## (Intercept) Adjust_Log_Odds Delta_REACH_imp
## 0.06827189 0.09365881 0.01213586
Display uncertainty in the parameters. The graph below plots random draws from the parameter simulations for 5 values of Difference in Reach: -10 inches (green), -5 inches (red), 0 inches (black), +5 inches (blue), and +10 inches (yellow). We see that the Difference in Reach essentially just shifts the curves (and therefore the Probability of the Favorite Winning) up or down.
sims_5 <- as.matrix(fit_5)
n_sims_5 <- nrow(sims_5)
draws_5 <- sample(n_sims_5, 20)
curve(invlogit(sims_5[draws_5[1],1] + sims_5[draws_5[1],2]*x + sims_5[draws_5[1],3]*(0))
, from = 0
, to = 3
, col = "black"
, lwd=0.0 # make invisible, just used to set up plot.
, xlab="Adjusted Log Odds of Favorite"
, ylab = "Probability of Favorite Winning"
, main = "Random Draws from Parameter Simulations"
)
for (j in draws_5) {
curve(invlogit(sims_5[j,1] + sims_5[j,2]*x + sims_5[j,3]*(-10))
, col = "green"
, lwd=0.3
, add=TRUE
)
curve(invlogit(sims_5[j,1] + sims_5[j,2]*x + sims_5[j,3]*(-5))
, col = "red"
, lwd=0.3
, add=TRUE
)
curve(invlogit(sims_5[j,1] + sims_5[j,2]*x + sims_5[j,3]*(0))
, col = "black"
, lwd=0.3
, add=TRUE
)
curve(invlogit(sims_5[j,1] + sims_5[j,2]*x + sims_5[j,3]*(+5))
, col = "blue"
, lwd=0.3
, add=TRUE
)
curve(invlogit(sims_5[j,1] + sims_5[j,2]*x + sims_5[j,3]*(+10))
, col = "yellow"
, lwd=0.3
, add=TRUE
)
}
Now do the same with the back-transformed predictor (i.e. Adjusted Log Odds mapped back to Implied Probabilities). It appears that when there is no Reach Difference (i.e. the black lines), the model accounts for some of the underperformance of mild favorites / overperformance of large favorites. Otherwise, the Reach Difference simply shifts the curve.
curve(invlogit(sims_5[draws_5[1],1] + sims_5[draws_5[1],2]*logit(x) + sims_5[draws_5[1],3]*(0))
, from = 0.5
, to = 1
, col = "black"
, lwd=0.0 # make invisible, just used to set up plot.
, xlab="Adjusted Implied Probability of Favorite"
, ylab = "Probability of Favorite Winning"
, main = "Random Draws from Parameter Simulations"
)
for (j in draws_5) {
curve(invlogit(sims_5[j,1] + sims_5[j,2]*logit(x) + sims_5[j,3]*(-10))
, col = "green"
, lwd=0.3
, add=TRUE
)
curve(invlogit(sims_5[j,1] + sims_5[j,2]*logit(x) + sims_5[j,3]*(-5))
, col = "red"
, lwd=0.3
, add=TRUE
)
curve(invlogit(sims_5[j,1] + sims_5[j,2]*logit(x) + sims_5[j,3]*(0))
, col = "black"
, lwd=0.3
, add=TRUE
)
curve(invlogit(sims_5[j,1] + sims_5[j,2]*logit(x) + sims_5[j,3]*(+5))
, col = "blue"
, lwd=0.3
, add=TRUE
)
curve(invlogit(sims_5[j,1] + sims_5[j,2]*logit(x) + sims_5[j,3]*(+10))
, col = "yellow"
, lwd=0.3
, add=TRUE
)
}
abline(a=0, b=1, col = "black", lwd = 2, lty = 3)
Get point predictions and compare them to the Adjusted Log Odds back-transformed to Implied Probabilities.
new_5 <- data.frame(Adjust_Log_Odds=rep(newx, each = 21), Delta_REACH_imp=newx_4) # newx_4 (21 values) recycles across the odds values to form a full grid
new_5$Point_Pred <- predict(fit_5, type = "response", newdata = new_5)
new_5$BackTrans_Implied_Prob <- invlogit(new_5$Adjust_Log_Odds)
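Equivalently, the grid could be built with base R's expand.grid(), which varies its first argument fastest and so yields the same row order (with the columns swapped); a sketch:
# new_5_alt <- expand.grid(Delta_REACH_imp = newx_4, Adjust_Log_Odds = newx)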
Get expected outcome with uncertainty.
epred_5 <- posterior_epred(fit_5, newdata=new_5)
new_5$Means <- apply(epred_5, 2, mean)
new_5$SDs <- apply(epred_5, 2, sd)
Predictive distribution for a new observation.
postpred_5 <- posterior_predict(fit_5, newdata=new_5)
new_5$Pred_Ms <- apply(postpred_5, 2, mean)
kable(new_5)
Adjust_Log_Odds | Delta_REACH_imp | Point_Pred | BackTrans_Implied_Prob | Means | SDs | Pred_Ms |
---|---|---|---|---|---|---|
0.0 | -10 | 0.3955847 | 0.5000000 | 0.3955847 | 0.0331463 | 0.38500 |
0.0 | -9 | 0.4011308 | 0.5000000 | 0.4011308 | 0.0308774 | 0.38350 |
0.0 | -8 | 0.4067158 | 0.5000000 | 0.4067158 | 0.0286508 | 0.42050 |
0.0 | -7 | 0.4123376 | 0.5000000 | 0.4123376 | 0.0264902 | 0.40600 |
0.0 | -6 | 0.4179938 | 0.5000000 | 0.4179938 | 0.0244261 | 0.42275 |
0.0 | -5 | 0.4236821 | 0.5000000 | 0.4236821 | 0.0224986 | 0.43050 |
0.0 | -4 | 0.4294000 | 0.5000000 | 0.4294000 | 0.0207595 | 0.42225 |
0.0 | -3 | 0.4351451 | 0.5000000 | 0.4351451 | 0.0192731 | 0.43850 |
0.0 | -2 | 0.4409146 | 0.5000000 | 0.4409146 | 0.0181143 | 0.43250 |
0.0 | -1 | 0.4467061 | 0.5000000 | 0.4467061 | 0.0173604 | 0.45100 |
0.0 | 0 | 0.4525167 | 0.5000000 | 0.4525167 | 0.0170750 | 0.45050 |
0.0 | 1 | 0.4583438 | 0.5000000 | 0.4583438 | 0.0172891 | 0.44925 |
0.0 | 2 | 0.4641845 | 0.5000000 | 0.4641845 | 0.0179906 | 0.46575 |
0.0 | 3 | 0.4700360 | 0.5000000 | 0.4700360 | 0.0191293 | 0.47025 |
0.0 | 4 | 0.4758956 | 0.5000000 | 0.4758956 | 0.0206345 | 0.47350 |
0.0 | 5 | 0.4817602 | 0.5000000 | 0.4817602 | 0.0224321 | 0.47825 |
0.0 | 6 | 0.4876271 | 0.5000000 | 0.4876271 | 0.0244561 | 0.48450 |
0.0 | 7 | 0.4934935 | 0.5000000 | 0.4934935 | 0.0266519 | 0.48300 |
0.0 | 8 | 0.4993563 | 0.5000000 | 0.4993563 | 0.0289766 | 0.51300 |
0.0 | 9 | 0.5052129 | 0.5000000 | 0.5052129 | 0.0313968 | 0.51175 |
0.0 | 10 | 0.5110603 | 0.5000000 | 0.5110603 | 0.0338866 | 0.49875 |
0.5 | -10 | 0.5486025 | 0.6224593 | 0.5486025 | 0.0317580 | 0.54125 |
0.5 | -9 | 0.5544326 | 0.6224593 | 0.5544326 | 0.0289347 | 0.54700 |
0.5 | -8 | 0.5602515 | 0.6224593 | 0.5602515 | 0.0261492 | 0.56875 |
0.5 | -7 | 0.5660565 | 0.6224593 | 0.5660565 | 0.0234171 | 0.56700 |
0.5 | -6 | 0.5718448 | 0.6224593 | 0.5718448 | 0.0207599 | 0.56750 |
0.5 | -5 | 0.5776135 | 0.6224593 | 0.5776135 | 0.0182088 | 0.59150 |
0.5 | -4 | 0.5833601 | 0.6224593 | 0.5833601 | 0.0158114 | 0.58625 |
0.5 | -3 | 0.5890818 | 0.6224593 | 0.5890818 | 0.0136417 | 0.59100 |
0.5 | -2 | 0.5947759 | 0.6224593 | 0.5947759 | 0.0118143 | 0.60325 |
0.5 | -1 | 0.6004400 | 0.6224593 | 0.6004400 | 0.0104937 | 0.60325 |
0.5 | 0 | 0.6060714 | 0.6224593 | 0.6060714 | 0.0098651 | 0.60500 |
0.5 | 1 | 0.6116679 | 0.6224593 | 0.6116679 | 0.0100368 | 0.61775 |
0.5 | 2 | 0.6172270 | 0.6224593 | 0.6172270 | 0.0109481 | 0.63375 |
0.5 | 3 | 0.6227465 | 0.6224593 | 0.6227465 | 0.0124154 | 0.61325 |
0.5 | 4 | 0.6282242 | 0.6224593 | 0.6282242 | 0.0142472 | 0.64125 |
0.5 | 5 | 0.6336580 | 0.6224593 | 0.6336580 | 0.0163018 | 0.63700 |
0.5 | 6 | 0.6390460 | 0.6224593 | 0.6390460 | 0.0184870 | 0.63600 |
0.5 | 7 | 0.6443862 | 0.6224593 | 0.6443862 | 0.0207450 | 0.65350 |
0.5 | 8 | 0.6496768 | 0.6224593 | 0.6496768 | 0.0230388 | 0.64625 |
0.5 | 9 | 0.6549163 | 0.6224593 | 0.6549163 | 0.0253441 | 0.64875 |
0.5 | 10 | 0.6601029 | 0.6224593 | 0.6601029 | 0.0276447 | 0.67225 |
1.0 | -10 | 0.6929719 | 0.7310586 | 0.6929719 | 0.0286974 | 0.70400 |
1.0 | -9 | 0.6980489 | 0.7310586 | 0.6980489 | 0.0261868 | 0.69100 |
1.0 | -8 | 0.7030720 | 0.7310586 | 0.7030720 | 0.0237609 | 0.70900 |
1.0 | -7 | 0.7080398 | 0.7310586 | 0.7080398 | 0.0214350 | 0.70825 |
1.0 | -6 | 0.7129508 | 0.7310586 | 0.7129508 | 0.0192294 | 0.71500 |
1.0 | -5 | 0.7178035 | 0.7310586 | 0.7178035 | 0.0171717 | 0.71375 |
1.0 | -4 | 0.7225969 | 0.7310586 | 0.7225969 | 0.0153000 | 0.71550 |
1.0 | -3 | 0.7273296 | 0.7310586 | 0.7273296 | 0.0136655 | 0.72325 |
1.0 | -2 | 0.7320006 | 0.7310586 | 0.7320006 | 0.0123344 | 0.72950 |
1.0 | -1 | 0.7366091 | 0.7310586 | 0.7366091 | 0.0113822 | 0.74575 |
1.0 | 0 | 0.7411541 | 0.7310586 | 0.7411541 | 0.0108759 | 0.73850 |
1.0 | 1 | 0.7456350 | 0.7310586 | 0.7456350 | 0.0108450 | 0.74275 |
1.0 | 2 | 0.7500511 | 0.7310586 | 0.7500511 | 0.0112620 | 0.75225 |
1.0 | 3 | 0.7544019 | 0.7310586 | 0.7544019 | 0.0120515 | 0.75175 |
1.0 | 4 | 0.7586870 | 0.7310586 | 0.7586870 | 0.0131203 | 0.76750 |
1.0 | 5 | 0.7629059 | 0.7310586 | 0.7629059 | 0.0143828 | 0.76125 |
1.0 | 6 | 0.7670586 | 0.7310586 | 0.7670586 | 0.0157720 | 0.76100 |
1.0 | 7 | 0.7711447 | 0.7310586 | 0.7711447 | 0.0172385 | 0.77025 |
1.0 | 8 | 0.7751642 | 0.7310586 | 0.7751642 | 0.0187480 | 0.78125 |
1.0 | 9 | 0.7791172 | 0.7310586 | 0.7791172 | 0.0202760 | 0.77825 |
1.0 | 10 | 0.7830037 | 0.7310586 | 0.7830037 | 0.0218057 | 0.79525 |
1.5 | -10 | 0.8071793 | 0.8175745 | 0.8071793 | 0.0243405 | 0.81125 |
1.5 | -9 | 0.8109322 | 0.8175745 | 0.8109322 | 0.0225683 | 0.79875 |
1.5 | -8 | 0.8146189 | 0.8175745 | 0.8146189 | 0.0209012 | 0.82650 |
1.5 | -7 | 0.8182394 | 0.8175745 | 0.8182394 | 0.0193490 | 0.81250 |
1.5 | -6 | 0.8217940 | 0.8175745 | 0.8217940 | 0.0179231 | 0.82250 |
1.5 | -5 | 0.8252827 | 0.8175745 | 0.8252827 | 0.0166367 | 0.82250 |
1.5 | -4 | 0.8287060 | 0.8175745 | 0.8287060 | 0.0155044 | 0.82800 |
1.5 | -3 | 0.8320641 | 0.8175745 | 0.8320641 | 0.0145417 | 0.83475 |
1.5 | -2 | 0.8353575 | 0.8175745 | 0.8353575 | 0.0137631 | 0.84275 |
1.5 | -1 | 0.8385867 | 0.8175745 | 0.8385867 | 0.0131801 | 0.83550 |
1.5 | 0 | 0.8417522 | 0.8175745 | 0.8417522 | 0.0127987 | 0.83950 |
1.5 | 1 | 0.8448545 | 0.8175745 | 0.8448545 | 0.0126172 | 0.84275 |
1.5 | 2 | 0.8478944 | 0.8175745 | 0.8478944 | 0.0126252 | 0.85175 |
1.5 | 3 | 0.8508724 | 0.8175745 | 0.8508724 | 0.0128045 | 0.85075 |
1.5 | 4 | 0.8537893 | 0.8175745 | 0.8537893 | 0.0131319 | 0.84950 |
1.5 | 5 | 0.8566458 | 0.8175745 | 0.8566458 | 0.0135817 | 0.85500 |
1.5 | 6 | 0.8594427 | 0.8175745 | 0.8594427 | 0.0141289 | 0.86475 |
1.5 | 7 | 0.8621808 | 0.8175745 | 0.8621808 | 0.0147506 | 0.85900 |
1.5 | 8 | 0.8648609 | 0.8175745 | 0.8648609 | 0.0154270 | 0.87425 |
1.5 | 9 | 0.8674839 | 0.8175745 | 0.8674839 | 0.0161418 | 0.86575 |
1.5 | 10 | 0.8700507 | 0.8175745 | 0.8700507 | 0.0168815 | 0.87025 |
2.0 | -10 | 0.8856899 | 0.8807971 | 0.8856899 | 0.0190148 | 0.88825 |
2.0 | -9 | 0.8881492 | 0.8807971 | 0.8881492 | 0.0178911 | 0.88950 |
2.0 | -8 | 0.8905530 | 0.8807971 | 0.8905530 | 0.0168491 | 0.88725 |
2.0 | -7 | 0.8929021 | 0.8807971 | 0.8929021 | 0.0158915 | 0.89925 |
2.0 | -6 | 0.8951974 | 0.8807971 | 0.8951974 | 0.0150211 | 0.89600 |
2.0 | -5 | 0.8974398 | 0.8807971 | 0.8974398 | 0.0142407 | 0.89150 |
2.0 | -4 | 0.8996301 | 0.8807971 | 0.8996301 | 0.0135527 | 0.89625 |
2.0 | -3 | 0.9017694 | 0.8807971 | 0.9017694 | 0.0129592 | 0.89500 |
2.0 | -2 | 0.9038586 | 0.8807971 | 0.9038586 | 0.0124610 | 0.91075 |
2.0 | -1 | 0.9058985 | 0.8807971 | 0.9058985 | 0.0120582 | 0.90650 |
2.0 | 0 | 0.9078902 | 0.8807971 | 0.9078902 | 0.0117490 | 0.91000 |
2.0 | 1 | 0.9098346 | 0.8807971 | 0.9098346 | 0.0115300 | 0.90800 |
2.0 | 2 | 0.9117326 | 0.8807971 | 0.9117326 | 0.0113964 | 0.91300 |
2.0 | 3 | 0.9135852 | 0.8807971 | 0.9135852 | 0.0113417 | 0.91300 |
2.0 | 4 | 0.9153933 | 0.8807971 | 0.9153933 | 0.0113582 | 0.91175 |
2.0 | 5 | 0.9171579 | 0.8807971 | 0.9171579 | 0.0114377 | 0.91550 |
2.0 | 6 | 0.9188800 | 0.8807971 | 0.9188800 | 0.0115716 | 0.92350 |
2.0 | 7 | 0.9205603 | 0.8807971 | 0.9205603 | 0.0117515 | 0.92375 |
2.0 | 8 | 0.9222000 | 0.8807971 | 0.9222000 | 0.0119694 | 0.92825 |
2.0 | 9 | 0.9237998 | 0.8807971 | 0.9237998 | 0.0122181 | 0.92650 |
2.0 | 10 | 0.9253607 | 0.8807971 | 0.9253607 | 0.0124909 | 0.92050 |
2.5 | -10 | 0.9346760 | 0.9241418 | 0.9346760 | 0.0137536 | 0.93425 |
2.5 | -9 | 0.9361667 | 0.9241418 | 0.9361667 | 0.0130704 | 0.94075 |
2.5 | -8 | 0.9376190 | 0.9241418 | 0.9376190 | 0.0124389 | 0.94225 |
2.5 | -7 | 0.9390339 | 0.9241418 | 0.9390339 | 0.0118589 | 0.93850 |
2.5 | -6 | 0.9404122 | 0.9241418 | 0.9404122 | 0.0113303 | 0.93375 |
2.5 | -5 | 0.9417548 | 0.9241418 | 0.9417548 | 0.0108527 | 0.94900 |
2.5 | -4 | 0.9430626 | 0.9241418 | 0.9430626 | 0.0104257 | 0.93700 |
2.5 | -3 | 0.9443364 | 0.9241418 | 0.9443364 | 0.0100485 | 0.93825 |
2.5 | -2 | 0.9455771 | 0.9241418 | 0.9455771 | 0.0097201 | 0.94475 |
2.5 | -1 | 0.9467855 | 0.9241418 | 0.9467855 | 0.0094391 | 0.94225 |
2.5 | 0 | 0.9479624 | 0.9241418 | 0.9479624 | 0.0092038 | 0.94800 |
2.5 | 1 | 0.9491087 | 0.9241418 | 0.9491087 | 0.0090120 | 0.95275 |
2.5 | 2 | 0.9502250 | 0.9241418 | 0.9502250 | 0.0088611 | 0.95325 |
2.5 | 3 | 0.9513122 | 0.9241418 | 0.9513122 | 0.0087485 | 0.94825 |
2.5 | 4 | 0.9523711 | 0.9241418 | 0.9523711 | 0.0086711 | 0.94850 |
2.5 | 5 | 0.9534024 | 0.9241418 | 0.9534024 | 0.0086256 | 0.95575 |
2.5 | 6 | 0.9544067 | 0.9241418 | 0.9544067 | 0.0086088 | 0.95100 |
2.5 | 7 | 0.9553849 | 0.9241418 | 0.9553849 | 0.0086173 | 0.95750 |
2.5 | 8 | 0.9563376 | 0.9241418 | 0.9563376 | 0.0086480 | 0.95400 |
2.5 | 9 | 0.9572654 | 0.9241418 | 0.9572654 | 0.0086978 | 0.95825 |
2.5 | 10 | 0.9581692 | 0.9241418 | 0.9581692 | 0.0087639 | 0.96100 |
3.0 | -10 | 0.9634555 | 0.9525741 | 0.9634555 | 0.0093830 | 0.96250 |
3.0 | -9 | 0.9643181 | 0.9525741 | 0.9643181 | 0.0089749 | 0.96650 |
3.0 | -8 | 0.9651569 | 0.9525741 | 0.9651569 | 0.0085967 | 0.97000 |
3.0 | -7 | 0.9659725 | 0.9525741 | 0.9659725 | 0.0082478 | 0.97250 |
3.0 | -6 | 0.9667656 | 0.9525741 | 0.9667656 | 0.0079276 | 0.96500 |
3.0 | -5 | 0.9675369 | 0.9525741 | 0.9675369 | 0.0076352 | 0.96475 |
3.0 | -4 | 0.9682869 | 0.9525741 | 0.9682869 | 0.0073699 | 0.96750 |
3.0 | -3 | 0.9690162 | 0.9525741 | 0.9690162 | 0.0071308 | 0.96850 |
3.0 | -2 | 0.9697255 | 0.9525741 | 0.9697255 | 0.0069170 | 0.96900 |
3.0 | -1 | 0.9704153 | 0.9525741 | 0.9704153 | 0.0067275 | 0.96800 |
3.0 | 0 | 0.9710861 | 0.9525741 | 0.9710861 | 0.0065611 | 0.97175 |
3.0 | 1 | 0.9717386 | 0.9525741 | 0.9717386 | 0.0064167 | 0.96825 |
3.0 | 2 | 0.9723732 | 0.9525741 | 0.9723732 | 0.0062929 | 0.97700 |
3.0 | 3 | 0.9729904 | 0.9525741 | 0.9729904 | 0.0061885 | 0.97750 |
3.0 | 4 | 0.9735908 | 0.9525741 | 0.9735908 | 0.0061021 | 0.97400 |
3.0 | 5 | 0.9741749 | 0.9525741 | 0.9741749 | 0.0060322 | 0.97375 |
3.0 | 6 | 0.9747430 | 0.9525741 | 0.9747430 | 0.0059775 | 0.97275 |
3.0 | 7 | 0.9752957 | 0.9525741 | 0.9752957 | 0.0059365 | 0.97225 |
3.0 | 8 | 0.9758335 | 0.9525741 | 0.9758335 | 0.0059079 | 0.97400 |
3.0 | 9 | 0.9763567 | 0.9525741 | 0.9763567 | 0.0058904 | 0.97375 |
3.0 | 10 | 0.9768658 | 0.9525741 | 0.9768658 | 0.0058827 | 0.97500 |
Calculate log score for model.
predp_5 <- predict(fit_5, type = "response")
logscore_5 <- sum(y * log(predp_5) + (1-y)*log(1-predp_5))
logscore_5
## [1] -1838.467
score_diff_5_m_1 <- round(logscore_5 - logscore_1,2)
The model’s log score is only 1.83 points better than that of the model with just the Adjusted Log Odds as a predictor. This advantage may shrink or disappear with the LOO estimate.
Run leave-one-out cross-validation.
loo_5 <- loo(fit_5)
print(loo_5)
##
## Computed from 4000 by 2988 log-likelihood matrix
##
## Estimate SE
## elpd_loo -1841.5 19.9
## p_loo 3.0 0.1
## looic 3682.9 39.8
## ------
## Monte Carlo SE of elpd_loo is 0.0.
##
## All Pareto k estimates are good (k < 0.5).
## See help('pareto-k-diagnostic') for details.
The model with the Reach term (-1841.46) is basically just as good as the model with just the Adjusted Log Odds (-1842.27). Nonetheless, I will keep the Reach term since it makes sense a priori to include it. (Reach is typically considered an advantage in UFC fights by commentators and analysts.)
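As before, loo_compare() can quantify this difference along with its standard error; a minimal sketch (output not shown here):
loo_compare(loo_1, loo_5)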
Plot Coefficients for Best Model
The sign of the intercept is almost certainly negative.
plot(fit_5, plotfun = "areas", prob = 0.95,
pars = "(Intercept)")
The Adjusted Log Odds coefficient is clearly positive. It is also almost certainly greater than 1, which means the slope is steeper than what you would expect if the Fight Odds perfectly tracked the Probability of the Favorite Winning. Indeed, this steep slope, along with the negative intercept, manifests in the clear trend whereby the Adjusted Odds overestimate mild Favorites but underestimate large ones.
plot(fit_5, plotfun = "areas", prob = 0.95,
pars = "Adjust_Log_Odds")
prob_reach_neg = round(mean(sims_5[, 3] < 0) * 100)
By analyzing the coefficient simulations, we can see that there is approximately a 2% probability that the effect of Reach is negative.
plot(fit_5, plotfun = "areas", prob = 0.95,
pars = "Delta_REACH_imp")
Save Best Model
Save best model - the one with two predictors: Adjusted Log Odds and (imputed) Difference in Fighter Reach.
save(fit_5, file = "./Models/bayesian_logistic_regression_two_predictor_model.RData")
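To reuse the fitted model in a later session, a matching load() call (same path assumed) restores the fit_5 object:
# load("./Models/bayesian_logistic_regression_two_predictor_model.RData")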
Examine Predictions
Extract data frame from model object.
df <- fit_5$data
Extract fitted values.
df$predicted_prob <- fit_5$fitted.values
Predicted Outcomes
Compute predicted outcomes (i.e. the Favorite is predicted to win if the predicted probability of victory is greater than 50%).
df$predicted_fav_win <- df$predicted_prob > 0.5
Examine results table.
pred_table <- table(df$predicted_fav_win, df$Favorite_Won)
pred_table
##
## FALSE TRUE
## FALSE 182 182
## TRUE 875 1749
Compute accuracy of the model.
right = (pred_table[1,1] + pred_table[2,2]) # underdog won and you predicted it plus the favorite won and you predicted it
wrong = (pred_table[2,1] + pred_table[1,2]) # underdog won and you did not predict it plus favorite won and you did not predict it
accuracy = right/(right + wrong)
accuracy
## [1] 0.6462517
Compute accuracy if you always pick favorite.
mean(df$Favorite_Won)
## [1] 0.6462517
Compute accuracy boost afforded by the model.
accuracy - mean(df$Favorite_Won)
## [1] 0
Predicted Bets
Compute predicted bets (note: this is different from predicted outcomes - with predicted bets, we check whether the predicted probability of victory exceeds the adjusted implied probability of victory from the odds).
df$predicted_bet_fav <- df$predicted_prob > df$Adjust_Implied_Prob
Get results table for predicted bets.
pred_table_bet <- table(df$predicted_bet_fav, df$Favorite_Won)
pred_table_bet
##
## FALSE TRUE
## FALSE 763 1018
## TRUE 294 913
Get accuracy for predicted bets. Note: it is greater than chance (50%).
right_bet = (pred_table_bet[1,1] + pred_table_bet[2,2]) # underdog won and you bet on it plus the favorite won and you bet on it
wrong_bet = (pred_table_bet[2,1] + pred_table_bet[1,2]) # underdog won and you did not bet on it plus favorite won and you did not bet on it
accuracy_bet = right_bet/(right_bet + wrong_bet)
accuracy_bet
## [1] 0.5609103
In-sample returns on predicted bets
Compute Potential Profit by converting the Adjusted Implied Probabilities into decimal odds, with a standard 3-percentage-point overround penalty added to each side (an estimate, since the overall overround is usually around 5% in total). A no-penalty variant is sketched below the code.
df$Potential_Profit <- ifelse(df$Favorite_Won, 1/(df$Adjust_Implied_Prob + 0.03) - 1, 1/(1-df$Adjust_Implied_Prob + 0.03) - 1)
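For reference, a minimal sketch of the no-penalty variant mentioned above (hypothetical column name Potential_Profit_NoPen; it is not used in the calculations below):
df$Potential_Profit_NoPen <- ifelse(df$Favorite_Won, 1/df$Adjust_Implied_Prob - 1, 1/(1-df$Adjust_Implied_Prob) - 1)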
Compute actual profit.
df$Profit <- ifelse(
(df$predicted_bet_fav & df$Favorite_Won) | (!(df$predicted_bet_fav) & !(df$Favorite_Won))
, df$Potential_Profit
, -1
)
Compute ROI based on in-sample predicted bets (expressed per unit staked; -0.017 corresponds to roughly -1.7%).
mean(df$Profit)
## [1] -0.01736408
Compute actual profit for just picking Favorites, and for just picking Underdogs.
df$Profit_Fav <- ifelse(
df$Favorite_Won
, df$Potential_Profit
, -1
)
df$Profit_Dog <- ifelse(
!(df$Favorite_Won)
, df$Potential_Profit
, -1
)
Compute ROI (per unit staked) for the Favorite-only betting strategy.
mean(df$Profit_Fav)
## [1] -0.06242702
Compute ROI (per unit staked) for the Underdog-only betting strategy.
mean(df$Profit_Dog)
## [1] -0.08220306
Conclusion
From a betting standpoint, the model cannot even overcome a modest average overround. Moreover, that is just for in-sample estimates - performance would be even worse with cross-validation estimates. On the other hand, the model does appear to substantially outperform simple, blind strategies (such as always picking Favorites or always picking Underdogs). Practically, overcoming most of the overround (assuming 6% in total, and assuming the out-of-sample benefits are similar) may still confer some benefit as part of a larger betting strategy.