Support Vector Machines for Regression

Nonlinear Regression Models

Background

This case study examines a large number of models that predict a property of concrete mixtures, as outlined in Chapter 10 of APM. The analysis here is similar to the APM approach but illustrates two techniques: using workflow sets to launch many models at once and efficiently triaging models with racing methods, following TMWR Chapter 15. There are two versions of the data. Like APM, we’ll use the version where the concrete ingredients are represented as proportions in a mixture. There are some replicated mixtures, so we create a distinct set of mixtures and average the outcome data across replicates (a sketch of this step is shown after the data are loaded).

data(concrete, package = "modeldata")
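
The replicate averaging mentioned above is not shown here; a minimal sketch of that step, following TMWR Chapter 15 (and assuming the loaded concrete data frame has one row per measurement), could look like this:

concrete <- 
  concrete %>% 
  # one group per distinct combination of ingredient values
  group_by(across(-compressive_strength)) %>% 
  # average the outcome over any replicated mixtures
  summarize(compressive_strength = mean(compressive_strength),
            .groups = "drop")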

We split the data into training and test sets and set up 10-fold cross-validation with 5 repeats.

set.seed(1001)
concrete_split <- initial_split(concrete, strata = compressive_strength)
concrete_train <- training(concrete_split)
concrete_test  <- testing(concrete_split)
set.seed(1002)
concrete_folds <- vfold_cv(concrete_train, strata = compressive_strength, 
                           repeats = 5)

Preprocessing and Model Specification

SVMs only require that the predictors be normalized (centered and scaled), so the recipe doesn’t do much else here.

normalized_rec <- 
  recipe(compressive_strength ~ ., data = concrete_train) %>% 
  step_normalize(all_numeric_predictors()) 

Here, we specify our two SVM models, plus K-nearest neighbors, MARS, random forest, and boosted tree models for comparison.

library(kernlab)
svm_r_spec <- 
  svm_rbf(cost = tune(), rbf_sigma = tune()) %>% 
  set_engine("kernlab") %>% 
  set_mode("regression")
svm_p_spec <- 
  svm_poly(cost = tune(), degree = tune()) %>% 
  set_engine("kernlab") %>% 
  set_mode("regression")

library(kknn)
knn_spec <- 
  nearest_neighbor(neighbors = tune(), dist_power = tune(), weight_func = tune()) %>% 
  set_engine("kknn") %>% 
  set_mode("regression")
library(earth)
mars_spec <- 
  mars(prod_degree = tune()) %>%  #<- use GCV to choose terms
  set_engine("earth") %>% 
  set_mode("regression")
library(ranger)
rf_spec <- 
  rand_forest(mtry = tune(), min_n = tune(), trees = 1000) %>% 
  set_engine("ranger") %>% 
  set_mode("regression")
library(xgboost)
xgb_spec <- 
  boost_tree(tree_depth = tune(), learn_rate = tune(), loss_reduction = tune(), 
             min_n = tune(), sample_size = tune(), trees = tune()) %>% 
  set_engine("xgboost") %>% 
  set_mode("regression")
no_pre_proc <- 
  workflow_set(
    preproc = list(simple = compressive_strength ~ .), 
    models = list(MARS = mars_spec, RF = rf_spec, 
                  boosting = xgb_spec)
  )

normalized <- 
  workflow_set(
    preproc = list(normalized = normalized_rec), 
    models = list(svm_radial = svm_r_spec, svm_poly = svm_p_spec, 
                  KNN = knn_spec)
  )

all_workflows <- 
  bind_rows(no_pre_proc, normalized)

When using the workflow_map() function, the same tuning function is applied to every workflow in the set. The seed argument controls the random number stream so that each workflow is tuned with the same random number seed.

library(doParallel)
library(finetune)
cores <- parallel::detectCores()
cl <- makePSOCKcluster(cores)
registerDoParallel(cl)
  
workflow_fits <- 
  all_workflows %>%
  workflow_map(fn = "tune_grid", seed = 1003, verbose = TRUE,
               resamples = concrete_folds, grid = 15,
               control = control_grid(parallel_over = "everything")
               )
## i 1 of 6 tuning:     simple_MARS
## ✔ 1 of 6 tuning:     simple_MARS (14.2s)
## i 2 of 6 tuning:     simple_RF
## i Creating pre-processing data to finalize unknown parameter: mtry
## ✔ 2 of 6 tuning:     simple_RF (2m 15.7s)
## i 3 of 6 tuning:     simple_boosting
## ✔ 3 of 6 tuning:     simple_boosting (3m 42s)
## i 4 of 6 tuning:     normalized_svm_radial
## ✔ 4 of 6 tuning:     normalized_svm_radial (1m 43.4s)
## i 5 of 6 tuning:     normalized_svm_poly
## ✔ 5 of 6 tuning:     normalized_svm_poly (11m 59.8s)
## i 6 of 6 tuning:     normalized_KNN
## ✔ 6 of 6 tuning:     normalized_KNN (1m 43.3s)
stopCluster(cl)

Alternatively, racing via finetune::tune_race_anova() could be used. Rather than performing the entire cross-validation process for every parameter combination, this tuning function eliminates tuning parameter combinations whenever it becomes clear that they are unlikely to produce the best results (see TMWR Chapter 15.4).
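
As a rough sketch (not evaluated here), the same mapping call can simply point at the racing function; the seed, grid size, and parallel settings below are carried over from the tune_grid() call above rather than being recommendations:

race_fits <- 
  all_workflows %>%
  # control_race() takes the place of control_grid() when racing
  workflow_map(fn = "tune_race_anova", seed = 1003, verbose = TRUE,
               resamples = concrete_folds, grid = 15,
               control = control_race(parallel_over = "everything"))

To get a quick assessment of the results, the autoplot() function can be used: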

autoplot(workflow_fits, rank_metric = "rmse")

autoplot(workflow_fits, rank_metric = "rmse", select_best = TRUE)

Here it looks like the boosted tree model is performing the best, and it’s not particularly close.
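
A tabular companion to these plots is workflowsets::rank_results(); this sketch (the column selection is only for readability) is an alternative to the collect_metrics() summary that follows:

workflow_fits %>%
  # keep the single best configuration per workflow, ranked by RMSE
  rank_results(rank_metric = "rmse", select_best = TRUE) %>%
  filter(.metric == "rmse") %>%
  select(wflow_id, model, mean, std_err, rank)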

library(kableExtra)
workflow_fits %>% 
  collect_metrics() %>%
  group_by(wflow_id) %>%
  filter(.metric == "rmse") %>%
  slice_min(order_by = mean) %>%
  ungroup() %>%
  arrange(mean) %>%
  select(model, mean, n, std_err) %>%
  kable()
model              mean       n   std_err
boost_tree         4.133672   50  0.0978921
rand_forest        5.062955   50  0.0932092
mars               6.227203   50  0.1115937
svm_rbf            6.338072   50  0.0878039
svm_poly           7.025577   50  0.2169813
nearest_neighbor   7.995286   50  0.1099770
boosting_res <- workflow_fits %>%
  extract_workflow_set_result("simple_boosting")
boosting_res %>% 
  show_best(metric="rmse") %>%
  select(-.config) %>%
  kable() # for nice output, e.g., for a project
trees  min_n  tree_depth  learn_rate  loss_reduction  sample_size  .metric  .estimator  mean      n   std_err
  860      5           5   0.0169087       0.0002203    0.7646908  rmse     standard    4.133672  50  0.0978921
 1924     22          12   0.1823157       0.0331872    0.9843885  rmse     standard    4.138362  50  0.0896586
 1745     27           5   0.2490691       0.0015410    0.3977150  rmse     standard    4.184375  50  0.0902776
  623     24           9   0.0256900       0.0000005    0.7522490  rmse     standard    4.312496  50  0.0929550
  339     10          10   0.0476567       0.0167030    0.2984359  rmse     standard    4.347723  50  0.0958753
best_boost <- boosting_res %>% select_best(metric="rmse")
final_fit <- workflow_fits %>%
  extract_workflow("simple_boosting") %>%
  finalize_workflow(
    parameters = best_boost
  ) %>%
  fit(data=concrete_train)

final_test_res <- final_fit %>%
  augment(new_data=concrete_test)

rmse(final_test_res, truth=compressive_strength, estimate=.pred)
## # A tibble: 1 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rmse    standard        4.49