{xgboost}

With {xgboost}

Author

Jessica Helmer

Published

March 27, 2026

Code
library(tidyverse)
library(tidymodels)
library(finetune)
library(vip)
Code
# Load the cleaned item-level responses and the startup-cost lookup
# that were serialized by an earlier preprocessing step.
v5_dat <- here::here("Data", "v5_dat.rds") |> readRDS()

startup_costs <- here::here("Data", "startup_costs.rds") |> readRDS()
Code
# Load helper code from Scripts/get_startup.R into the global environment.
# NOTE(review): presumably defines a get_startup() helper used by the
# analysis — confirm against the script's contents.
source(here::here("Scripts", "get_startup.R"))
Code
v5_dat_wide <- v5_dat |>
  # Condense the two denominator-neglect tasks (dn.c / dn.s) to one score
  # per subject-item pair by averaging within the pair.
  mutate(.by = c(subject_id, item),
         score = ifelse(task == "dn.c" | task == "dn.s",
                        mean(score),
                        score)) |>
  # Drop the columns that still distinguish the two rows of a pair BEFORE
  # de-duplicating: with `time`/`task` retained, the rows of an averaged
  # dn.c/dn.s pair are not identical, so they would never collapse and
  # pivot_wider() would see duplicate values per (subject_id, item) cell.
  select(-c(time, task)) |>
  distinct() |>
  pivot_wider(names_from = "item", values_from = "score",
              id_cols = c(subject_id, sscore)) |>
  # CAUTION: listwise deletion — a subject missing even one item is dropped
  # entirely. Confirm this is acceptable before modeling.
  drop_na()

Trying {xgboost}

Splitting data into train and test.

Code
# Reproducible train/test split: initial_split() is stochastic, and the
# set.seed() used later for tuning does not cover this earlier call.
set.seed(123)

# subject_id is an identifier, not a predictor, so drop it before splitting.
v5_dat_split <- v5_dat_wide |>
  select(-subject_id) |>
  initial_split()

v5_dat_train <- training(v5_dat_split)
v5_dat_test <- testing(v5_dat_split)

# Five cross-validation folds from the training set, used for tuning.
v5_dat_folds <- vfold_cv(v5_dat_train, v = 5)

Specifying model (S-score predicted by everything else) and normalizing all predictors.

Code
# Model formula: S-score predicted by every remaining column; center and
# scale all numeric predictors. all_numeric_predictors() is the current
# recipes idiom for the older `all_numeric(), -all_outcomes()` pair and
# leaves the outcome untouched by construction.
v5_dat_recipe <- recipe(sscore ~ ., data = v5_dat_train) |>
  step_normalize(all_numeric_predictors())

Specifying hyperparameters and which are going to be tuned.

Code
# xgboost regression specification: the ensemble size is fixed at 1000
# trees; the structural parameters and the learning rate are marked for
# tuning. Pipes normalized to the native |> used elsewhere in this file.
xgb_spec <- boost_tree(
    trees = 1000,
    tree_depth = tune(),
    min_n = tune(),
    mtry = tune(),
    sample_size = tune(),
    learn_rate = tune()
  ) |>
  set_engine("xgboost") |>
  set_mode("regression")

# Bundle the preprocessing recipe and the model spec into one workflow.
xgb_wf <- workflow(v5_dat_recipe, xgb_spec)

Hyperparameter Tuning

Defining grid of candidate hyperparameter combinations.

Code
# Space-filling design of 100 candidate hyperparameter combinations.
# sample_size is named explicitly so the grid column is guaranteed to match
# the tune() id in the spec (for xgboost it is tuned on the proportion
# scale, hence sample_prop()).
# NOTE(review): the mtry upper bound (90) is hard-coded to the item count —
# consider finalize(mtry(), <training predictors>) so it tracks the data.
xgb_grid <- grid_space_filling(
    tree_depth(c(5L, 10L)),
    min_n(c(1L, 40L)),
    mtry(c(1L, 90L)),
    sample_size = sample_prop(c(0.5, 1.0)),
    learn_rate(c(-3, -1)),  # log10 scale: 0.001 to 0.1
    size = 100)

xgb_grid
# A tibble: 100 × 5
   tree_depth min_n  mtry sample_size learn_rate
        <int> <int> <int>       <dbl>      <dbl>
 1          5    12    43       0.808    0.00266
 2          5    32    34       0.833    0.0236 
 3          5    17    29       0.576    0.0248 
 4          5    27     9       0.672    0.00443
 5          5     9    21       0.899    0.0260 
 6          5     3    46       0.697    0.0129 
 7          5    16    76       0.646    0.0343 
 8          5    21    82       0.859    0.0102 
 9          5    37    61       0.687    0.00811
10          5    10    68       0.884    0.0413 
# ℹ 90 more rows

Five-fold cross-validation for tuning.

Code
# Register a parallel backend for tune's resampling loops.
doParallel::registerDoParallel()

# ANOVA racing: after each resample, candidates that are statistically
# worse than the current best are eliminated, so the full grid is rarely
# evaluated on every fold.
set.seed(234)
xgb_rs <- tune_race_anova(
    xgb_wf,
    v5_dat_folds,
    grid = xgb_grid,
    control = control_race(verbose_elim = TRUE))

# Cache the (expensive) tuning results. dir.exists() — not file.exists() —
# is the correct test for a directory, and recursive = TRUE also creates
# "Data" if it is missing.
if (!dir.exists(here::here("Data", "xgboost"))) {
  dir.create(here::here("Data", "xgboost"), recursive = TRUE)
}
saveRDS(xgb_rs, here::here("Data", "xgboost", "xgb_rs.rds"))
Code
# Reload the cached racing results and visualize how candidate
# configurations were eliminated across resamples.
rs_path <- here::here("Data", "xgboost", "xgb_rs.rds")
xgb_rs <- readRDS(rs_path)

plot_race(xgb_rs) +
  theme_classic(base_size = 20)

Highest-performing hyperparameter combinations

Code
# Top five configurations ranked by cross-validated RMSE.
xgb_rs |> show_best(metric = "rmse")
# A tibble: 5 × 11
   mtry min_n tree_depth learn_rate sample_size .metric .estimator  mean     n
  <int> <int>      <int>      <dbl>       <dbl> <chr>   <chr>      <dbl> <int>
1     1    20          8    0.0123        0.924 rmse    standard   0.568     5
2     2    10          6    0.00774       0.667 rmse    standard   0.571     5
3     7    20          7    0.00850       0.510 rmse    standard   0.572     5
4    46     3          5    0.0129        0.697 rmse    standard   0.573     5
5     5    18          5    0.00559       0.909 rmse    standard   0.574     5
# ℹ 2 more variables: std_err <dbl>, .config <chr>

Fitting Final Model

Code
# Finalize the workflow with the lowest-RMSE hyperparameters, then do a
# last fit: train once on the full training set and evaluate once on the
# held-out test set. Pipes normalized to the native |> used elsewhere.
xgb_last <- xgb_wf |>
  finalize_workflow(select_best(xgb_rs, metric = "rmse")) |>
  last_fit(v5_dat_split)

Importance of items

Code
# Extract the fitted parsnip model from the finalized workflow ONCE,
# instead of repeating the extract_workflow()/extract_fit_parsnip() chain.
xgb_fit <- extract_workflow(xgb_last) |>
  extract_fit_parsnip()

# Persist the variable-importance table alongside the tuning results.
xgb_fit |>
  vi() |>
  saveRDS(here::here("Data", "xgboost", "xgb_item-importance.rds"))

# Dot plot of per-item importance for up to 100 features; tall aspect
# ratio and faint vertical gridlines make the ranking easier to read.
vip(xgb_fit, geom = "point", num_features = 100) +
  theme_classic(base_size = 16) +
  theme(aspect.ratio = 4,
        panel.grid.major.x = element_line(linewidth = 0.2))