Get Event Info
Description
This script scrapes UFC event information from the http://ufcstats.com website.
The script is adapted from:
https://github.com/jasonchanhku/UFC-MMA-Predictor/blob/master/UFC%20MMA%20Predictor%20Workflow.ipynb
Libraries
library(rvest)
library(dplyr)
library(tidyr)
library(stringr)
library(data.table)
library(dtplyr)
Web Scrape Event Info
Get the URLs of all completed UFC events.
# Download the index page that lists every completed event.
webpage <- read_html(
  "http://ufcstats.com/statistics/events/completed?page=all"
)
# Each event link on the index page carries the event-detail URL in its href.
urls <- html_attr(html_nodes(webpage, ".b-link_style_black"), "href")
Initialization
Create empty vectors that will accumulate one entry per fight across all events.
# Empty accumulators, one per output column (NULL is what c() returns).
# Event-level columns:
date <- event <- city <- state <- country <- NULL
# Fight-level columns:
winner <- loser <- wc <- method <- round <- NULL
Loop
# start the clock
ptm <- proc.time()

# Trimmed text of every node on `page` matched by the CSS selector `css`.
.node_text <- function(page, css) {
  trimws(html_text(html_nodes(page, css)))
}

# Value of the first "Label: value" node matched by `css`: the trimmed text
# between the first and second ":" of that node. Returns NA when no node
# matches or the text has no ":".
.label_value <- function(page, css) {
  raw <- html_text(html_nodes(page, css))
  if (length(raw) == 0L) {
    return(NA_character_)
  }
  trimws(strsplit(trimws(raw), ":")[[1]][2])
}

# Scrape a single event page.
#
# Returns a named list with the per-fight vectors (winner, loser, wc, method,
# round) and the event-level values (date, event, city, state, country)
# repeated once per fight, so every element has one entry per fight.
.scrape_event <- function(url) {
  page <- read_html(url)

  fights <- list(
    winner = .node_text(
      page,
      ".b-fight-details__table-text:nth-child(1) .b-link_style_black"
    ),
    loser = .node_text(
      page,
      ".b-fight-details__table-text+ .b-fight-details__table-text .b-link_style_black"
    ),
    wc = .node_text(
      page,
      ".l-page_align_left:nth-child(7) .b-fight-details__table-text"
    ),
    method = .node_text(
      page,
      ".l-page_align_left+ .l-page_align_left .b-fight-details__table-text:nth-child(1)"
    ),
    round = as.integer(.node_text(
      page,
      ".b-fight-details__table-col:nth-child(9) .b-fight-details__table-text"
    ))
  )
  # The number of winners defines the number of fights on the card.
  n_fights <- length(fights$winner)
  if (any(lengths(fights) != n_fights)) {
    # Mismatched columns would otherwise only surface later as an opaque
    # data.frame() error, with no hint of which event caused it.
    warning("Per-fight fields differ in length for ", url, call. = FALSE)
  }

  # date, stored as an ISO "YYYY-MM-DD" string
  event_date <- as.character(as.Date(
    .label_value(page, ".b-list__box-list-item:nth-child(1)"),
    format = "%B %d, %Y"
  ))

  # event title (first match only, so the value is always a scalar)
  event_name <- .node_text(page, ".b-content__title-highlight")[1]

  # location: "City, State, Country" or "City, Country"
  place <- trimws(strsplit(
    .label_value(page, ".b-list__box-list-item:nth-child(2)"),
    ","
  )[[1]])
  # With only two parts the second one is used as both state and country.
  event_country <- if (is.na(place[3])) place[2] else place[3]

  # rep() (unlike replicate()) yields a zero-length vector for zero fights
  # instead of an empty list, so the accumulators stay atomic vectors.
  c(
    fights,
    list(
      date = rep(event_date, n_fights),
      event = rep(event_name, n_fights),
      city = rep(place[1], n_fights),
      state = rep(place[2], n_fights),
      country = rep(event_country, n_fights)
    )
  )
}

# Scrape every event into a preallocated list; seq_along() makes the loop a
# no-op when `urls` is empty, and results scraped so far survive an error.
events <- vector("list", length(urls))
for (i in seq_along(urls)) {
  # # print iteration
  # print(sprintf("%d of %d", i, length(urls)))
  events[[i]] <- .scrape_event(urls[i])
}

# Concatenate one field across all events, in event order.
.collect <- function(field) {
  unlist(lapply(events, `[[`, field), use.names = FALSE)
}

# Append to the accumulators once, instead of growing them on every iteration.
date <- c(date, .collect("date"))
event <- c(event, .collect("event"))
city <- c(city, .collect("city"))
state <- c(state, .collect("state"))
country <- c(country, .collect("country"))
winner <- c(winner, .collect("winner"))
loser <- c(loser, .collect("loser"))
wc <- c(wc, .collect("wc"))
method <- c(method, .collect("method"))
round <- c(round, .collect("round"))

# stop the clock
print(proc.time() - ptm)
## user system elapsed
## 45.345 0.878 305.929
Create and Save Data Frame
Create data frame.
# Assemble the scraped vectors into one row per fight.
event_info <- data.frame(
  Date = date,
  Event = event,
  City = city,
  State = state,
  Country = country,
  Winner = winner,
  Loser = loser,
  WeightClass = wc,
  Round = round,
  Method = method
)
Examine event data.
# Print a per-column summary of the scraped data as a quick sanity check.
summary(event_info)
## Date Event City State
## Length:5927 Length:5927 Length:5927 Length:5927
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## Country Winner Loser WeightClass
## Length:5927 Length:5927 Length:5927 Length:5927
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## Round Method
## Min. :1.000 Length:5927
## 1st Qu.:1.000 Class :character
## Median :3.000 Mode :character
## Mean :2.309
## 3rd Qu.:3.000
## Max. :5.000
Save data frame.
# save() does not create missing directories, so make sure the output folder
# exists first; otherwise the write fails after the long scrape has finished.
dir.create("./Datasets", showWarnings = FALSE, recursive = TRUE)
# Persist the data frame under its own name (`event_info`) for later load().
save(event_info, file = "./Datasets/event_info.RData")