Description

This script scrapes fight odds information from the www.betmma.tips website.

The script is adapted from:

https://github.com/jasonchanhku/UFC-MMA-Predictor/blob/master/UFC%20MMA%20Predictor%20Workflow.ipynb


Libraries

library(rvest)
library(dplyr)
library(tidyr)
library(stringr)
library(data.table)
library(dtplyr)
library(ggplot2)
library(lubridate)


Web Scrape Odds Data


MMA Handicappers Page

Read html.

# Fetch and parse the handicappers performance page.
url <- "http://www.betmma.tips/past_mma_handicapper_performance_all.php" %>%
  read_html()

Extract the relative address (href) of each event linked on the page.

# href attribute of every anchor matched by the selector.
links <- html_attr(html_nodes(url, "td td td a"), "href")

Append the addresses to the website base.

# Prefix each scraped href with the site root to form a full URL.
links <- paste0("http://www.betmma.tips/", links) 

Read titles to identify and select exclusively UFC events.

# Link text of the same anchors the hrefs were taken from, so titles and
# links line up element by element.
titles <- html_text(html_nodes(url, "td td td a"))

# Keep only the links whose (trimmed) title contains "UFC".
UFC_indices <- grepl("UFC", trimws(titles))
links <- links[UFC_indices]


Underdogs vs. Favorites Page

Repeat the process for the Favorites vs. Underdogs page, keeping only the events whose titles do not already appear on the handicappers page.

# Fetch and parse the favorites-vs-underdogs listing.
url2 <- read_html(
  "http://www.betmma.tips/mma_betting_favorites_vs_underdogs.php?Org=1"
)

# Full URLs and link text for every anchor matched by the selector.
links2 <- paste0(
  "http://www.betmma.tips/",
  html_attr(html_nodes(url2, "td td td td a"), "href")
)
titles2 <- html_text(html_nodes(url2, "td td td td a"))

# Drop every link whose trimmed title already appears among the titles
# scraped from the first page.
links2 <- links2[!is.element(trimws(titles2), trimws(titles))]


Initialization

Combine the links from both pages, then create empty vectors to accumulate the odds data, plus counters and event lists that record scraping mismatches.

# Single list of event pages to visit: first-page links, then second-page ones.
links <- c(links, links2)

# Per-fight accumulators, extended inside the scraping loop.
# NULL is exactly what c() returns, so they start out empty.
event <- datez <- NULL
fighter1 <- fighter2 <- NULL
fighter1_odds <- fighter2_odds <- NULL
win <- NULL

# How many events triggered each kind of length mismatch.
wins_mismatch <- fighters_mismatch <- 0
fighter1_mismatch <- fighter2_mismatch <- 0
win_included <- 0

# Names of the events behind each of the counters above.
wins_mismatch_event <- fighters_mismatch_event <- NULL
fighter1_mismatch_event <- fighter2_mismatch_event <- NULL
win_included_event <- NULL


Loop

# start the clock
ptm <- proc.time()

# Visit every event page and append its fighters, odds and winners to the
# accumulator vectors. Events whose scraped pieces do not line up are recorded
# in the *_mismatch counters / event lists and are not appended.
# seq_along() means an empty `links` runs zero iterations
# (1:length(links) would iterate over c(1, 0)).
for (i in seq_along(links)) {

  # # print iteration
  # print(sprintf("%d of %d", i, length(links)))

  # One unreachable page should not abort the whole (long) scrape:
  # warn, skip that event and carry on.
  sub_link <- tryCatch(
    read_html(links[i]),
    error = function(e) {
      warning(
        "Skipping ", links[i], ": ", conditionMessage(e),
        call. = FALSE
      )
      NULL
    }
  )
  if (is.null(sub_link)) {
    next
  }

  # fighter info: drop the " vs " separators and empty strings, after which
  # the names alternate fighter 1, fighter 2, fighter 1, ...
  fighters_t <- sub_link %>% html_nodes("td td td a+ a") %>% html_text()
  fighters_t <- fighters_t[fighters_t != " vs "]
  fighters_t <- fighters_t[fighters_t != ""]

  fighter1_t <- fighters_t[c(TRUE, FALSE)]
  fighter2_t <- fighters_t[c(FALSE, TRUE)]

  # event info
  event_t <- sub_link %>% html_nodes("td h1") %>% html_text()

  # The date is the text after the first ";" in the first <h2>.
  # Guard against pages with no <h2>, where strsplit()[[1]] would error.
  date_t <- sub_link %>% html_nodes("h2") %>% html_text()
  if (length(date_t) > 0) {
    date_t <- strsplit(date_t, ";")[[1]][2]
  } else {
    date_t <- NA_character_
  }
  date_t <- as.character(parse_date_time(date_t, orders = "dmy"))
  # print(date_t)

  # odds info: strip surrounding whitespace and the "@" marker
  label_t <- sub_link %>%
    html_nodes("td td td td tr~ tr+ tr td") %>%
    html_text()
  label_cleansed <- gsub("@", "", trimws(label_t))

  # fight outcome info
  win_t <- sub_link %>% html_nodes("td td td td br+ a") %>% html_text()

  # odds alternate as well: odd positions belong to fighter 1 ...
  fighter1_odds_t <- label_cleansed[c(TRUE, FALSE)]

  # ... and even positions to fighter 2
  fighter2_odds_t <- label_cleansed[c(FALSE, TRUE)]


  # QUALITY CONTROL:
  # how many wins do we have?
  lwins <- length(win_t)
  # how many odds do we have for f1?
  l1odds <- length(fighter1_odds_t)
  # how many odds do we have for f2?
  l2odds <- length(fighter2_odds_t)
  # how long is the list of fighter 1s?
  l1 <- length(fighter1_t)
  # how long is the list of fighter 2s?
  l2 <- length(fighter2_t)

  skip_this <- FALSE

  if (l1odds != l1) {
    fighter1_mismatch <- fighter1_mismatch + 1
    fighter1_mismatch_event <- c(fighter1_mismatch_event, event_t)
    # print("fighter 1 mismatch")
    skip_this <- TRUE
  }
  if (l2odds != l2) {
    fighter2_mismatch <- fighter2_mismatch + 1
    fighter2_mismatch_event <- c(fighter2_mismatch_event, event_t)
    # print("fighter 2 mismatch")
    skip_this <- TRUE
  }
  if (l1 != l2) {
    fighters_mismatch <- fighters_mismatch + 1
    fighters_mismatch_event <- c(fighters_mismatch_event, event_t)
    # print("mismatch between fighters 1 and 2")
    skip_this <- TRUE
  }
  if (l1 != lwins) {
    wins_mismatch <- wins_mismatch + 1
    wins_mismatch_event <- c(wins_mismatch_event, event_t)
    # print("wins mismatch with fighter 1")

    # If the number of winners differs from the number of fights but the
    # fighter and odds lists all agree with each other (i.e. the event has
    # not been flagged above), keep only the fights for which one of the
    # two fighters is listed as a winner. The same indices are applied to
    # the odds, which is why all of the lists must have matching lengths.
    if (!skip_this) {
      keep_indices1_t <- which(fighter1_t %in% win_t)
      keep_indices2_t <- which(fighter2_t %in% win_t)
      keep_indices_t <- sort(c(keep_indices1_t, keep_indices2_t))

      if (length(keep_indices_t) == lwins) {
        fighter1_t <- fighter1_t[keep_indices_t]
        fighter2_t <- fighter2_t[keep_indices_t]

        fighter1_odds_t <- fighter1_odds_t[keep_indices_t]
        fighter2_odds_t <- fighter2_odds_t[keep_indices_t]

        win_included <- win_included + 1
        win_included_event <- c(win_included_event, event_t)
      } else {
        # Winners still cannot be paired one-to-one with fights. Appending
        # would leave `win` a different length from the other columns and
        # make the final data.frame() call fail, so drop the event.
        warning(
          "Skipping ", links[i],
          ": winners could not be matched to fights",
          call. = FALSE
        )
        skip_this <- TRUE
      }
    }
  }

  if (!skip_this) {
    n_fights <- length(fighter1_t)

    # APPEND LONG LISTS
    fighter1 <- c(fighter1, fighter1_t)
    fighter2 <- c(fighter2, fighter2_t)
    fighter1_odds <- c(fighter1_odds, fighter1_odds_t)
    fighter2_odds <- c(fighter2_odds, fighter2_odds_t)
    win <- c(win, win_t)
    # Repeat the event name and date once per fight. rep() is used instead
    # of replicate() because replicate(0, x) returns list(), which would
    # silently turn the accumulator into a list; event_t[1] keeps exactly
    # one name per fight even if the selector matched several headings.
    event <- c(event, rep(event_t[1], n_fights))
    datez <- c(datez, rep(date_t, n_fights))
  }
}

# stop the clock
print(proc.time() - ptm)

##     user   system  elapsed 
##  177.329    4.770 1085.158


Quality Control

Identify which events were excluded from the analysis.

What proportion of the candidate events were excluded?

NOTE: some events may not be listed on betmma.tips at all (e.g. UFC Vegas 9 was not listed as of 2020-09-13).

# Distinct names of events flagged by any of the three length checks that
# cause an event to be skipped.
excluded_events <- unique(c(
  fighter1_mismatch_event,
  fighter2_mismatch_event,
  fighters_mismatch_event
))

The following events were excluded from the analysis.

# Auto-print the names of the excluded events.
excluded_events

##  [1] "UFC FN"                               
##  [2] "UFC 256"                              
##  [3] "UFC Vegas 13"                         
##  [4] "UFC 251"                              
##  [5] "UFC Fight Night 162"                  
##  [6] "UFC on ESPN+ 19"                      
##  [7] "UFC 243"                              
##  [8] "UFC 239"                              
##  [9] "UFC on ESPN 3"                        
## [10] "UFC Fight Night 145"                  
## [11] "UFC on ESPN 1"                        
## [12] "UFC on ESPN+ 1"                       
## [13] "UFC Fight Night Sao Paulo"            
## [14] "UFC 227"                              
## [15] "UFC on Fox 27"                        
## [16] "UFC Fight Night 114 Pettis vs. Moreno"
## [17] "UFC Fight Night 113"                  
## [18] "UFC 212 Aldo vs Holloway"

Proportion of events excluded from analysis:

# Number of distinct excluded event names divided by the number of
# candidate event links.
length(excluded_events) / length(links)

## [1] 0.0647482


Save Data

# Assemble one row per fight from the accumulator vectors filled by the
# scraping loop. The odds columns hold the scraped text with surrounding
# whitespace and "@" removed; they are not converted to numbers here.
fight_odds <- data.frame(
  Date = datez,
  Events = event,
  Fighter1 = fighter1,
  Fighter1_Decimal_Odds = fighter1_odds,
  Fighter2 = fighter2,
  Fighter2_Decimal_Odds = fighter2_odds,
  Winner = win
)

# save file for master script
# Create the output folder first so a missing ./Datasets directory does not
# make save() fail after the long scrape; no-op when it already exists.
dir.create("./Datasets", showWarnings = FALSE, recursive = TRUE)
save(fight_odds, file = "./Datasets/fight_odds.RData")