Combine Scraped Data
Description
This script combines fight data that has been scraped from various sites.
Libraries
library(dplyr)
library(tidyr)
library(stringr)
library(ggplot2)
Combine Databases Events & Odds
Pre-process
Load data.
load("./Datasets/fight_odds.RData")
load("./Datasets/event_info.RData")
Ensure that dates are characters.
event_info$Date = as.character(event_info$Date)
Add loser column.
fight_odds %>%
dplyr::mutate(Loser = ifelse(Winner == Fighter1, Fighter2, Fighter1)) -> fight_odds
Identify odds by Winner/Loser label.
fight_odds %>%
dplyr::mutate(
Winner_Odds = ifelse(Winner == Fighter1, Fighter1_Decimal_Odds, Fighter2_Decimal_Odds)
, Loser_Odds = ifelse(Loser == Fighter1, Fighter1_Decimal_Odds, Fighter2_Decimal_Odds)
) -> fight_odds
Only keep columns of interest.
fight_odds %>% dplyr::select(
-c(
"Events"
, "Fighter1"
, "Fighter1_Decimal_Odds"
, "Fighter2"
, "Fighter2_Decimal_Odds"
)
) -> fight_odds_clean
Identify date discrepancies as odds website appears to have some of the dates wrong if compared to UFC website.
setdiff(fight_odds_clean$Date, event_info$Date)
## [1] "2020-12-06" "2020-11-29" "2020-10-04" "2020-05-14" "2020-05-10"
## [6] "2020-03-08" "2020-02-09" "2019-12-15" "2018-02-04" "2017-12-31"
## [11] "2016-12-31" "2013-12-07"
Correct manually for these discrepancies
fight_odds_clean$Date = dplyr::recode(
fight_odds_clean$Date
, "2020-12-06" = "2020-12-05"
, "2020-11-29" = "2020-11-28"
, "2020-10-04" = "2020-10-03"
, "2020-05-14" = "2020-05-13"
, "2020-05-10" = "2020-05-09"
, "2020-03-08" = "2020-03-07"
, "2020-02-09" = "2020-02-08"
, "2019-12-15" = "2019-12-14"
, "2018-02-04" = "2018-02-03"
, "2017-12-31" = "2017-12-30"
, "2017-12-02" = "2017-12-01"
, "2016-12-31" = "2016-12-30"
, "2013-12-07" = "2013-12-06"
)
Get rid of either the Loser or Winner column to merge below.
fight_odds_clean %>% dplyr::select(
-c("Loser")
) -> fight_odds_clean_win
fight_odds_clean %>% dplyr::select(
-c("Winner")
) -> fight_odds_clean_lose
Merge by (either Winner or Loser) and Date
df_fight_win = merge(event_info, fight_odds_clean_win, by = c("Winner", "Date"))
df_fight_lose = merge(event_info, fight_odds_clean_lose, by = c("Loser", "Date"))
Add these databases together.
df_fight_bind = rbind(df_fight_win, df_fight_lose)
Remove duplicates.
df_fight_bind %>% distinct() -> event_and_odds
Quality Control
How much data (# rows) do we lose if compared to fight_odds_clean data frame?
nrow(fight_odds_clean) - nrow(event_and_odds )
## [1] 52
Which fight dates, if any, did we lose entirely?
lost_fight_dates = unique(fight_odds_clean$Date)[!(unique(fight_odds_clean$Date) %in% unique(event_and_odds$Date))]
Which events do these dates correspond to?
event_info %>%
dplyr::filter(Date %in% lost_fight_dates) %>%
group_by(Date) %>%
dplyr::summarise(Events = unique(Event))
## # A tibble: 1 x 2
## Date Events
## <chr> <chr>
## 1 2017-12-01 The Ultimate Fighter: A New World Champion Finale
Which Winners and Losers were lost during merging?
lost_winners = unique(fight_odds_clean$Winner)[!(unique(fight_odds_clean$Winner) %in% unique(event_and_odds$Winner))]
lost_losers = unique(fight_odds_clean$Loser)[!(unique(fight_odds_clean$Loser) %in% unique(event_and_odds$Loser))]
What other data did we lose (based on Odds and Date)?
The below code identifies the discrepancies between the fight_oods_clean and event_and_odds dataframes.
fight_odds_clean %>%
dplyr::select(
Date
, Winner_Odds
, Loser_Odds
) -> fight_odds_clean_odds
event_and_odds %>%
dplyr::select(
Date
, Winner_Odds
, Loser_Odds
) -> event_and_odds_odds
fight_odds_clean_odds$Coder = "b"
event_and_odds_odds$ Coder = "d"
df_compare = rbind(event_and_odds_odds, fight_odds_clean_odds)
dupsBetweenGroups <- function (df, idcol) {
# df: the data frame
# idcol: the column which identifies the group each row belongs to
# Get the data columns to use for finding matches
datacols <- setdiff(names(df), idcol)
# Sort by idcol, then datacols. Save order so we can undo the sorting later.
sortorder <- do.call(order, df)
df <- df[sortorder,]
# Find duplicates within each id group (first copy not marked)
dupWithin <- duplicated(df)
# With duplicates within each group filtered out, find duplicates between groups.
# Need to scan up and down with duplicated() because first copy is not marked.
dupBetween = rep(NA, nrow(df))
dupBetween[!dupWithin] <- duplicated(df[!dupWithin,datacols])
dupBetween[!dupWithin] <- duplicated(df[!dupWithin,datacols], fromLast=TRUE) | dupBetween[!dupWithin]
# ============= Replace NA's with previous non-NA value ==============
# This is why we sorted earlier - it was necessary to do this part efficiently
# Get indexes of non-NA's
goodIdx <- !is.na(dupBetween)
# These are the non-NA values from x only
# Add a leading NA for later use when we index into this vector
goodVals <- c(NA, dupBetween[goodIdx])
# Fill the indices of the output vector with the indices pulled from
# these offsets of goodVals. Add 1 to avoid indexing to zero.
fillIdx <- cumsum(goodIdx)+1
# The original vector, now with gaps filled
dupBetween <- goodVals[fillIdx]
# Undo the original sort
dupBetween[sortorder] <- dupBetween
# Return the vector of which entries are duplicated across groups
return(dupBetween)
}
dupRows <- dupsBetweenGroups(df_compare, "Coder")
df_dups = cbind(df_compare, dupRows)
lost_fight_odds = df_dups[df_dups$dupRows == F,]
lost_fights = merge(lost_fight_odds, fight_odds_clean)
The following fight data were lost.
Unsure of exact reason for these lost fights but finding out could be tedious. They could be cases where both fight names were mismatches.
lost_fights[,c(1,6,7)]
## Date Winner Loser
## 1 2013-04-27 Pat Healy Jim Miller
## 2 2014-07-26 Brian Ortega Mike de la Torre
## 3 2014-08-23 Ning Guangyou Yang Jianping
## 4 2015-05-16 Jon delos Reyes Roldan Sangcha-an
## 5 2015-11-07 Matheus Nicolau Bruno Korea
## 6 2015-11-07 Gleison Tibau Abel Trujillo
## 7 2015-11-21 Alvaro Herrera Vernon Ramos
## 8 2015-11-28 Freddy Serrano Zhuikui Yao
## 9 2015-12-12 Tatsuya Kawajiri Jason Knight
## 10 2015-12-12 Ryan LaFlare Mike Pierce
## 11 2015-12-12 Evan Dunham Joe Lauzon
## 12 2015-12-12 Tony Ferguson Edson Barboza
## 13 2015-12-12 Chris Gruetzemacher Abner Lloveras
## 14 2015-12-12 Frankie Edgar Chad Mendes
## 15 2015-12-12 Geane Herrera Joby Sanchez
## 16 2015-12-12 Julian Erosa Marcin Wrzosek
## 17 2015-12-12 Gabriel Gonzaga Konstantin Erokhin
## 18 2015-12-12 Ryan Hall Artem Lobov
## 19 2016-11-19 Darren Stewart Francimar Barroso
## 20 2017-02-04 Curtis Blaydes Adam Milstead
## 21 2017-02-04 Niko Price Alex Morono
## 22 2017-03-11 Kelvin Gastelum Vitor Belfort
## 23 2017-07-29 Jon Jones Daniel Cormier
## 24 2017-11-25 Yadong Song Bharat Khandare
## 25 2017-12-01 Amanda Cooper Angela Magana
## 26 2017-12-01 Dominick Reyes Jeremy Kimball
## 27 2017-12-01 Henry Cejudo Sergio Pettis
## 28 2017-12-01 Abdul Razak Alhassan Sabah Homasi
## 29 2017-12-01 Max Holloway Jose Aldo
## 30 2017-12-01 Justin Willis Allen Crowder
## 31 2017-12-01 Francis Ngannou Alistair Overeem
## 32 2017-12-01 Tecia Torres Michelle Waterson
## 33 2017-12-01 David Teymur Drakkar Klose
## 34 2017-12-01 Felice Herrig Cortney Casey
## 35 2017-12-01 Paul Felder Charles Oliveira
## 36 2017-12-01 Eddie Alvarez Justin Gaethje
## 37 2017-12-01 Yancy Medeiros Alex Oliveira
## 38 2017-12-30 Michal Oleksiejczuk Khalil Rountree
## 39 2018-06-09 Mike Jackson Phil Brooks
## 40 2018-11-10 Bobby Moffett Chas Skelly
## 41 2018-11-24 Yadong Song Vincent Morales
## 42 2018-12-29 Walt Harris Andrei Arlovski
## 43 2019-03-16 Saparbek Safarov Nick Negumereanu
## 44 2019-07-13 John Allan Mike Rodriguez
## 45 2019-08-10 Alex da Silva Rodrigo Vargas
## 46 2019-08-31 Mizuki Inoue Yanan Wu
## 47 2020-02-08 Journey Newson Domingo Pilarte
## 48 2020-05-16 Rodrigo Nascimento Ferreira Don'tale Mayes
## 49 2020-07-25 Jesse Ronson Nicolas Dalby
## 50 2020-08-22 Trevin Jones Timur Valiev
## 51 2020-09-12 Kevin Croom Roosevelt Roberts
Save Data
save(event_and_odds, file = "./Datasets/event_and_odds.RData")
Add Fighter Stats Database
NOTE: the following fighter stats are dated to when the data were scrapped from the UFC website. Therefore, once merged with the events and odds database, the stats will not necessarily be representative of what they were the night a given fighter was fighting.
Pre-process
Load data:
load("./Datasets/fighter_stats.RData")
Redefine weight classes.
fighter_stats$WeightClass = dplyr::recode(
fighter_stats$WeightClass
, "lightweight" = "Lightweight"
, "heavyweight" = "Heavyweight"
, "featherweight" = "Featherweight"
, "welterweight" = "Welterweight"
, "middleweight" = "Middleweight"
, "lightheavyweight" = "Light Heavyweight"
, "catchweight" = "Catch Weight"
, "flyweight" = "Flyweight"
, "strawweight" = "Strawweight"
, "bantamweight" = "Bantamweight"
)
Rename columns.
fighter_stats %>%
dplyr::rename(
"FighterWeightClass" = "WeightClass"
, "FighterWeight" = "Weight"
) -> fighter_stats
event_and_odds %>%
dplyr::rename(
"FightWeightClass" = "WeightClass"
) -> event_and_odds
Add Sex category.
event_and_odds %>%
dplyr::mutate(
Sex = ifelse(grepl("Women's ", FightWeightClass), "Female", "Male")
) -> event_and_odds
Remove mention of Sex from Weight Class category.
event_and_odds$FightWeightClass = gsub("Women's ", "", event_and_odds$FightWeightClass)
Check that Weight Class labels make sense.
unique(event_and_odds$FightWeightClass)
## [1] "Welterweight" "Light Heavyweight" "Lightweight"
## [4] "Middleweight" "Heavyweight" "Bantamweight"
## [7] "Strawweight" "Featherweight" "Flyweight"
## [10] "Catch Weight"
Create unique fight id for each fight (i.e. each row).
event_and_odds$fight_id = 1:nrow(event_and_odds)
Gather data frame by fighter (i.e. go from short to long format).
event_and_odds %>%
gather(Result, NAME, c("Winner", "Loser")) -> event_and_odds_long
Merge databases.
df_master = merge(event_and_odds_long, fighter_stats, by = c("NAME"))
Quality Control
Examine difference in data frame length between df_master and event_and_odds long.
nrow(df_master) - nrow(event_and_odds_long)
## [1] 7
Are there any duplicate rows in df_master?
df_master %>% distinct() -> df_master
Where are the additional rows coming from?
sum(duplicated(fighter_stats$NAME))
## [1] 4
Who are these duplicate fighters?
fighter_stats[duplicated(fighter_stats$NAME),]
## NAME FighterWeight FighterWeightClass REACH SLPM SAPM STRA
## 1132 Joey Gomez 155 Lightweight 71 3.73 3.33 0.49
## 1528 Tony Johnson 265 Heavyweight NA 2.00 4.73 0.53
## 2029 Michael McDonald 135 Bantamweight 70 2.69 2.76 0.42
## 2990 Bruno Silva 185 Middleweight NA 0.00 0.00 0.00
## STRD TD TDA TDD SUBA
## 1132 0.50 2.00 0.28 0.00 0.0
## 1528 0.31 2.00 0.22 0.00 0.0
## 2029 0.57 1.09 0.66 0.52 1.4
## 2990 0.00 0.00 0.00 0.00 0.0
What are the duplicate entries?
fighter_stats[fighter_stats$NAME %in% fighter_stats$NAME[duplicated(fighter_stats$NAME)], ]
## NAME FighterWeight FighterWeightClass REACH SLPM SAPM STRA
## 1130 Joey Gomez 135 Bantamweight 73 2.44 4.46 0.28
## 1132 Joey Gomez 155 Lightweight 71 3.73 3.33 0.49
## 1520 Tony Johnson 205 Light Heavyweight 76 4.00 3.67 0.92
## 1528 Tony Johnson 265 Heavyweight NA 2.00 4.73 0.53
## 2027 Michael McDonald 205 Light Heavyweight NA 0.00 0.40 0.00
## 2029 Michael McDonald 135 Bantamweight 70 2.69 2.76 0.42
## 2989 Bruno Silva 125 Flyweight 65 2.60 3.51 0.42
## 2990 Bruno Silva 185 Middleweight NA 0.00 0.00 0.00
## STRD TD TDA TDD SUBA
## 1130 0.55 0.62 1.00 0.50 0.0
## 1132 0.50 2.00 0.28 0.00 0.0
## 1520 0.22 0.00 0.00 0.90 0.0
## 1528 0.31 2.00 0.22 0.00 0.0
## 2027 0.50 0.00 0.00 0.00 0.0
## 2029 0.57 1.09 0.66 0.52 1.4
## 2989 0.55 3.14 0.30 0.61 0.0
## 2990 0.00 0.00 0.00 0.00 0.0
Remove these entries if they have Weight Class mismatches.
dup_names = fighter_stats$NAME[duplicated(fighter_stats$NAME)]
df_master %>%
dplyr::filter(
!((NAME %in% dup_names) & (FighterWeightClass != FightWeightClass))
) -> df_master
Remove fight IDs without pair.
df_master %>%
group_by(fight_id) %>%
summarize(count = length(Sex)) %>%
dplyr::filter(count != 2) -> to_remove_from_master
df_master[df_master$fight_id %in% to_remove_from_master$fight_id,]
## [1] NAME Date Event City
## [5] State Country FightWeightClass Round
## [9] Method Winner_Odds Loser_Odds Sex
## [13] fight_id Result FighterWeight FighterWeightClass
## [17] REACH SLPM SAPM STRA
## [21] STRD TD TDA TDD
## [25] SUBA
## <0 rows> (or 0-length row.names)
df_master = df_master[!(df_master$fight_id %in% to_remove_from_master$fight_id),]
Save File
save(df_master, file = "./Datasets/df_master.RData")
Instead, Add CURRENT Fighter Stats Database
NOTE: unlike the previously fighter stats database that was added, these fighter stats are as they were at the time of the actual UFC events (hence “current”). This current fighter stats database will be added to the events and odds database in the same manner as the previous fighter stats database.
Pre-process
Load data.
load("./Datasets/current_fighter_stats.RData")
Redefine weight classes.
current_fighter_stats$WeightClass = dplyr::recode(
current_fighter_stats$WeightClass
, "lightweight" = "Lightweight"
, "heavyweight" = "Heavyweight"
, "featherweight" = "Featherweight"
, "welterweight" = "Welterweight"
, "middleweight" = "Middleweight"
, "lightheavyweight" = "Light Heavyweight"
, "catchweight" = "Catch Weight"
, "flyweight" = "Flyweight"
, "strawweight" = "Strawweight"
, "bantamweight" = "Bantamweight"
)
Rename columns.
current_fighter_stats %>%
dplyr::rename(
"FighterWeightClass" = "WeightClass"
, "FighterWeight" = "Weight"
) -> current_fighter_stats
current_fighter_stats %>%
dplyr::rename(
"Date" = "Event_Date"
) -> current_fighter_stats
Convert Date to character type.
current_fighter_stats$Date = as.character(current_fighter_stats$Date)
Merge current fighter database with events and odds database.
df_current_master = merge(event_and_odds_long, current_fighter_stats, by = c("NAME", "Date"))
Get distinct entries.
df_current_master %>% distinct() -> df_current_master
Get rid of fight IDs without pair.
df_current_master %>%
group_by(fight_id) %>%
summarize(count = length(Sex)) %>%
dplyr::filter(count != 2) -> to_remove_from_master
df_current_master[df_current_master$fight_id %in% to_remove_from_master$fight_id,]
## NAME Date Event City
## 9 Alex Caceres 2020-08-29 UFC Fight Night: Smith vs. Rakic Las Vegas
## 50 Brian Kelleher 2020-09-05 UFC Fight Night: Overeem vs. Sakai Las Vegas
## 233 Mirsad Bektic 2020-09-19 UFC Fight Night: Covington vs. Woodley Las Vegas
## State Country FightWeightClass Round Method Winner_Odds Loser_Odds Sex
## 9 Nevada USA Featherweight 1 SUB 1.54 2.80 Male
## 50 Nevada USA Featherweight 1 SUB 1.28 4.25 Male
## 233 Nevada USA Featherweight 3 SUB 4.05 1.30 Male
## fight_id Result FighterWeight REACH SLPM SAPM STRA STRD TD TDA TDD SUBA
## 9 73 Winner 145 73 4.14 2.90 0.49 0.65 0.55 0.76 0.59 0.7
## 50 372 Winner 145 66 4.48 6.00 0.40 0.53 1.21 0.25 0.76 0.6
## 233 623 Loser 145 70 2.59 1.89 0.41 0.58 3.32 0.47 0.92 0.5
## Num_Fights Data_Date FighterWeightClass
## 9 22 2020-08-24 Featherweight
## 50 9 2020-08-29 Featherweight
## 233 10 2020-09-18 Featherweight
df_current_master = df_current_master[!(df_current_master$fight_id %in% to_remove_from_master$fight_id),]
Save File
save(df_current_master, file = "./Datasets/df_current_master.RData")