Get Current Fighter Stats
Description
This script scrapes up-to-date UFC fighter information from the http://www.fightmetric.com website and appends it to a fighter stats database which stores fighter stats as they were at the time of the actual UFC events.
The script is adapted from:
https://github.com/jasonchanhku/UFC-MMA-Predictor/blob/master/UFC%20MMA%20Predictor%20Workflow.ipynb
Libraries
library(rvest)
library(dplyr)
library(tidyr)
library(stringr)
Web Scrape Fighter Stats
Identify fighters on upcoming card
read_html("http://ufcstats.com/statistics/events/completed") %>%
html_nodes(".b-link") %>%
html_attr("href") -> url_event_list
url_upcoming = url_event_list[1]
read_html(url_upcoming) %>%
html_nodes(".l-page_align_left .b-link_style_black") %>%
html_attr("href") -> url_list
read_html(url_upcoming) %>%
html_nodes(".b-list__box-list-item:nth-child(1)") %>%
html_text() -> rough_date
clean_date2 = trimws(strsplit(trimws(rough_date), ":")[[1]][2])
clean_date <- as.Date(clean_date2, format = "%B %d, %Y")
Load fighter stats database
load("./Datasets/current_fighter_stats.RData")
master_temp = current_fighter_stats
rm(current_fighter_stats)
most_recent_date = max(unique(master_temp$Event_Date))
Initialization
Create data structures to store the following fighter stats:
LEGEND:
SLpM - Significant Strikes Landed per Minute
Str. Acc. - Significant Striking Accuracy
SApM - Significant Strikes Absorbed per Minute
Str. Def. - Significant Strike Defence (the % of opponents strikes that
did not land) TD Avg. - Average Takedowns Landed per 15 minutes
TD Acc. - Takedown Accuracy
TD Def. - Takedown Defense (the % of opponents TD attempts that did not
land)
Sub. Avg. - Average Submissions Attempted per 15 minutes
name <- character(length(url_list))
weightclass <- numeric(length(url_list))
reach <- numeric(length(url_list))
slpm <- numeric(length(url_list))
td <- numeric(length(url_list))
tda <- numeric(length(url_list))
tdd <- numeric(length(url_list))
stra <- numeric(length(url_list))
strd <- numeric(length(url_list))
suba <- numeric(length(url_list))
sapm <- numeric(length(url_list))
num_fights <- numeric(length(url_list))
Loop
# start the clock
ptm <- proc.time()
for(i in 1:length(url_list)) {
# # print iteration
# print(i)
# # start the clock
# ptm <- proc.time()
# fighter url
fighter_url <- read_html(url_list[i])
# fighter name
name_t <- fighter_url %>% html_nodes(".b-content__title-highlight") %>% html_text()
name_t <- gsub("\n", "", name_t)
name[i] <- as.character(trimws(name_t))
#weightclass
weightclass_t <- fighter_url %>% html_nodes(".b-list__info-box_style_small-width .b-list__box-list-item_type_block:nth-child(2)") %>% html_text()
weightclass_t <- gsub(" ", "", weightclass_t)
weightclass_t <- gsub("\n", "", weightclass_t)
weightclass_t <- strsplit(weightclass_t, ":")
weightclass_t <- weightclass_t[[1]][2]
weightclass_t <- strsplit(weightclass_t, "l")
weightclass[i] <- as.numeric(as.character(weightclass_t[[1]][1]))
#reach
reach_t <- fighter_url %>% html_nodes(".b-list__info-box_style_small-width .b-list__box-list-item_type_block:nth-child(3)") %>% html_text()
reach_t <- gsub(" ", "", reach_t)
reach_t <- gsub("\n", "", reach_t)
reach_t <- strsplit(reach_t, ":")
reach_t <- reach_t[[1]][2]
reach_t <- strsplit(reach_t, "\"")
reach[i] <- as.numeric(as.character(reach_t[[1]][1]))
#feature 1: slpm
slpm_t <- fighter_url %>% html_nodes(".b-list__info-box-left .b-list__info-box-left .b-list__box-list-item_type_block:nth-child(1)") %>% html_text()
slpm_t <- gsub(" ", "", slpm_t)
slpm_t <- gsub("\n", "", slpm_t)
slpm_t <- strsplit(slpm_t, ":")
slpm_t <- slpm_t[[1]][2]
slpm[i] <- as.numeric(slpm_t)
#feature 2: takedown avg
td_t <- fighter_url %>% html_nodes(".b-list__info-box_style-margin-right .b-list__box-list-item_type_block:nth-child(2)") %>% html_text()
td_t<- gsub(" ", "", td_t)
td_t <- gsub("\n", "", td_t)
td_t <- strsplit(td_t, ":")
td_t <- td_t[[1]][2]
td[i] <- as.numeric(td_t)
#feature 3: significant striking accuracy
stra_t <- fighter_url %>% html_nodes(".b-list__info-box-left .b-list__info-box-left .b-list__box-list-item_type_block:nth-child(2)") %>% html_text()
stra_t <- gsub(" ", "", stra_t)
stra_t <- gsub("\n", "", stra_t)
stra_t <- strsplit(stra_t, ":")
stra_t <- stra_t[[1]][2]
stra_t <- strsplit(stra_t, "%")
stra[i] <- as.numeric(stra_t) / 100
#feature 4: takedown accuracy
tda_t <- fighter_url %>% html_nodes(".b-list__info-box_style-margin-right .b-list__box-list-item_type_block:nth-child(3)") %>% html_text()
tda_t <- gsub(" ", "", tda_t)
tda_t <- gsub("\n", "", tda_t)
tda_t <- strsplit(tda_t, ":")
tda_t <- tda_t[[1]][2]
tda_t <- strsplit(tda_t, "%")
tda[i] <- as.numeric(tda_t) / 100
#feature 5: significant absorbed stras per minute
sapm_t <- fighter_url %>% html_nodes(".b-list__info-box-left .b-list__info-box-left .b-list__box-list-item_type_block:nth-child(3)") %>% html_text()
sapm_t <- gsub(" ", "", sapm_t)
sapm_t <- gsub("\n", "", sapm_t)
sapm_t <- strsplit(sapm_t, ":")
sapm_t <- sapm_t[[1]][2]
sapm[i] <- as.numeric(sapm_t)
#feature 6: takedown def
tdd_t <- fighter_url %>% html_nodes(".b-list__info-box_style-margin-right .b-list__box-list-item_type_block:nth-child(4)") %>% html_text()
tdd_t <- gsub(" ", "", tdd_t)
tdd_t <- gsub("\n", "", tdd_t)
tdd_t <- strsplit(tdd_t, ":")
tdd_t <- tdd_t[[1]][2]
tdd_t <- strsplit(tdd_t, "%")
tdd[i] <- as.numeric(tdd_t) / 100
#feature 7: striking def
strd_t <- fighter_url %>% html_nodes(".b-list__info-box-left .b-list__info-box-left .b-list__box-list-item_type_block:nth-child(4)") %>% html_text()
strd_t <- gsub(" ", "", strd_t)
strd_t <- gsub("\n", "", strd_t)
strd_t <- strsplit(strd_t, ":")
strd_t <- strd_t[[1]][2]
strd_t <- strsplit(strd_t, "%")
strd[i] <- as.numeric(strd_t) / 100
#feature 8: submission average
suba_t <- fighter_url %>% html_nodes(".b-list__box-list_margin-top .b-list__box-list-item_type_block:nth-child(5)") %>% html_text()
suba_t <- gsub(" ", "", suba_t)
suba_t <- gsub("\n", "", suba_t)
suba_t <- strsplit(suba_t, ":")
suba_t <- suba_t[[1]][2]
suba[i] <- as.numeric(suba_t)
# number of fights on record
num_fights_t <- fighter_url %>% html_nodes(".b-flag__text") %>% html_text()
num_fights[i] <- length(num_fights_t)
# # stop the clock
# print(proc.time() - ptm)
}
# stop the clock
print(proc.time() - ptm)
## user system elapsed
## 3.188 0.060 9.976
Manipulate Data Frame
Create data frame and add columns describing weightclass and listing date data was collected.
data_date = Sys.Date()
database <- data.frame(
NAME = name
, Weight = weightclass
, REACH = reach
, SLPM = slpm
, SAPM = sapm
, STRA = stra
, STRD = strd
, TD = td
, TDA = tda
, TDD = tdd
, SUBA = suba
, Num_Fights = num_fights
, Event_Date = clean_date
, Data_Date = data_date
)
data_add_wc <- mutate(
database
, WeightClass = ifelse(Weight == 115, "strawweight"
, ifelse(Weight == 125, "flyweight"
, ifelse(Weight == 135, "bantamweight"
, ifelse(Weight == 145, "featherweight"
, ifelse(Weight == 155, "lightweight"
, ifelse(Weight == 170, "welterweight"
, ifelse(Weight == 185, "middleweight"
, ifelse(Weight == 205, "lightheavyweight"
, ifelse(Weight > 205, "heavyweight"
, "catchweight"))))))))))
The following fighters did not have their weights listed.
data_add_wc[is.na(data_add_wc$Weight), ]
## [1] NAME Weight REACH SLPM SAPM STRA
## [7] STRD TD TDA TDD SUBA Num_Fights
## [13] Event_Date Data_Date WeightClass
## <0 rows> (or 0-length row.names)
Get rid of fighters with no listed weight.
data_clean <- data_add_wc[!is.na(data_add_wc$Weight), ]
The following fighters are listed at catchweight.
data_clean %>% filter(WeightClass == "catchweight")
## [1] NAME Weight REACH SLPM SAPM STRA
## [7] STRD TD TDA TDD SUBA Num_Fights
## [13] Event_Date Data_Date WeightClass
## <0 rows> (or 0-length row.names)
Examine data frame.
summary(data_clean)
## NAME Weight REACH SLPM
## Length:28 Min. :125.0 Min. :64.00 Min. :0.000
## Class :character 1st Qu.:125.0 1st Qu.:69.00 1st Qu.:2.000
## Mode :character Median :140.0 Median :72.00 Median :3.140
## Mean :151.8 Mean :71.57 Mean :2.902
## 3rd Qu.:170.0 3rd Qu.:73.00 3rd Qu.:3.833
## Max. :225.0 Max. :80.00 Max. :6.530
## NA's :5
## SAPM STRA STRD TD
## Min. :0.000 Min. :0.0000 Min. :0.0000 Min. :0.000
## 1st Qu.:1.877 1st Qu.:0.3175 1st Qu.:0.3450 1st Qu.:0.000
## Median :2.690 Median :0.4000 Median :0.5200 Median :0.715
## Mean :3.087 Mean :0.3500 Mean :0.4143 Mean :1.395
## 3rd Qu.:4.440 3rd Qu.:0.5025 3rd Qu.:0.5725 3rd Qu.:2.572
## Max. :9.000 Max. :0.6000 Max. :0.6500 Max. :6.610
##
## TDA TDD SUBA Num_Fights
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. : 1.000
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.: 2.000
## Median :0.2950 Median :0.5600 Median :0.0000 Median : 4.000
## Mean :0.2961 Mean :0.4329 Mean :0.3714 Mean : 5.821
## 3rd Qu.:0.5000 3rd Qu.:0.7275 3rd Qu.:0.4750 3rd Qu.: 8.000
## Max. :1.0000 Max. :1.0000 Max. :1.7000 Max. :24.000
##
## Event_Date Data_Date WeightClass
## Min. :2021-01-20 Min. :2021-01-19 Length:28
## 1st Qu.:2021-01-20 1st Qu.:2021-01-19 Class :character
## Median :2021-01-20 Median :2021-01-19 Mode :character
## Mean :2021-01-20 Mean :2021-01-19
## 3rd Qu.:2021-01-20 3rd Qu.:2021-01-19
## Max. :2021-01-20 Max. :2021-01-19
##
Append Fighter Stats Database
if (most_recent_date < clean_date) {
print(sprintf("We are updating the database to include event on %s", clean_date))
} else if(most_recent_date == clean_date) {
# most recent addition
most_recent_add = max(unique(master_temp$Data_Date))
print(sprintf("Info from %s will be replaced by that from %s", most_recent_add, data_date))
# get rid of past addition for more up-to-date addition
master_temp %>%
dplyr::filter(Event_Date != clean_date) -> master_temp
} else {
print("ERROR: The next event date may be greater than what is recorded in this data set...")
}
## [1] "Info from 2021-01-19 will be replaced by that from 2021-01-19"
# combine data
current_fighter_stats = rbind(master_temp, data_clean)
Save Data
save(current_fighter_stats, file = "./Datasets/current_fighter_stats.RData")