Nonlinear Regression Models

Background

This case study examines a large number of models that predict a property of concrete mixtures, as outlined in Chapter 10 of APM. The analysis here will be similar to the APM approach but will illustrate two techniques: using workflow sets to launch models and efficient triage of models using race methods, following TMWR Chapter 15. There are two versions of the data. Like APM, we’ll use the version where the concrete ingredients are represented as proportions in a mixture. There are some replicated mixtures so we create a distinct set of mixtures and average the outcome data across replicates.

data(concrete, package = "modeldata")

We split our data and set up a 10 fold CV set with 5 repeats.

set.seed(1001)
concrete_split <- initial_split(concrete, strata = compressive_strength)
concrete_train <- training(concrete_split)
concrete_test  <- testing(concrete_split)
set.seed(1002)
concrete_folds <- vfold_cv(concrete_train, strata = compressive_strength, 
                           repeats = 5)

Preprocessing and Model Specification

SVMs only require normalized parameters, so we won’t do much else here.

normalized_rec <- 
  recipe(compressive_strength ~ ., data = concrete_train) %>% 
  step_normalize(all_numeric_predictors())

Here, we specify our SVM models, plus a KNN for comparison.

library(kernlab)

## Warning: package 'kernlab' was built under R version 4.0.5

## 
## Attaching package: 'kernlab'

## The following object is masked from 'package:purrr':
## 
##     cross

## The following object is masked from 'package:ggplot2':
## 
##     alpha

## The following object is masked from 'package:scales':
## 
##     alpha

svm_r_spec <- 
  svm_rbf(cost = tune(), rbf_sigma = tune()) %>% 
  set_engine("kernlab") %>% 
  set_mode("regression")
svm_p_spec <- 
  svm_poly(cost = tune(), degree = tune()) %>% 
  set_engine("kernlab") %>% 
  set_mode("regression")

library(kknn)
knn_spec <- 
  nearest_neighbor(neighbors = tune(), dist_power = tune(), weight_func = tune()) %>% 
  set_engine("kknn") %>% 
  set_mode("regression")
library("earth")

## Loading required package: Formula

## Loading required package: plotmo

## Loading required package: plotrix

## 
## Attaching package: 'plotrix'

## The following object is masked from 'package:scales':
## 
##     rescale

## Loading required package: TeachingDemos

mars_spec <- 
  mars(prod_degree = tune()) %>%  #<- use GCV to choose terms
  set_engine("earth") %>% 
  set_mode("regression")
library(ranger)
rf_spec <- 
  rand_forest(mtry = tune(), min_n = tune(), trees = 1000) %>% 
  set_engine("ranger") %>% 
  set_mode("regression")
library(xgboost)

## 
## Attaching package: 'xgboost'

## The following object is masked from 'package:dplyr':
## 
##     slice

xgb_spec <- 
  boost_tree(tree_depth = tune(), learn_rate = tune(), loss_reduction = tune(), 
             min_n = tune(), sample_size = tune(), trees = tune()) %>% 
  set_engine("xgboost") %>% 
  set_mode("regression")

no_pre_proc <- 
  workflow_set(
    preproc = list(simple = compressive_strength ~ .), 
    models = list(MARS = mars_spec, RF = rf_spec, 
                  boosting = xgb_spec)
  )

normalized <- 
  workflow_set(
    preproc = list(normalized = normalized_rec), 
    models = list(svm_radial = svm_r_spec, svm_poly = svm_p_spec, 
                  KNN = knn_spec)
  )

all_workflows <- 
  bind_rows(no_pre_proc, normalized)

When using the workflow_map() function, the same function is applied to the workflows of interest.

The seed argument controls the random number stream so that each model with is processed with this random number seed.

library(doParallel)

## Warning: package 'doParallel' was built under R version 4.0.5

## Loading required package: foreach

## Warning: package 'foreach' was built under R version 4.0.5

## 
## Attaching package: 'foreach'

## The following objects are masked from 'package:purrr':
## 
##     accumulate, when

## Loading required package: iterators

## Warning: package 'iterators' was built under R version 4.0.5

## Loading required package: parallel

library(finetune)
cores <- parallel::detectCores()
cl <- makePSOCKcluster(cores)
registerDoParallel(cl)
  
workflow_fits <- 
  all_workflows %>%
  workflow_map(fn = "tune_grid", seed = 1003, verbose = TRUE,
               resamples = concrete_folds, grid = 15,
               control = control_grid(parallel_over = "everything")
               )

## i 1 of 6 tuning:     simple_MARS

## ✔ 1 of 6 tuning:     simple_MARS (14.2s)

## i 2 of 6 tuning:     simple_RF

## i Creating pre-processing data to finalize unknown parameter: mtry

## ✔ 2 of 6 tuning:     simple_RF (2m 15.7s)

## i 3 of 6 tuning:     simple_boosting

## ✔ 3 of 6 tuning:     simple_boosting (3m 42s)

## i 4 of 6 tuning:     normalized_svm_radial

## ✔ 4 of 6 tuning:     normalized_svm_radial (1m 43.4s)

## i 5 of 6 tuning:     normalized_svm_poly

## ✔ 5 of 6 tuning:     normalized_svm_poly (11m 59.8s)

## i 6 of 6 tuning:     normalized_KNN

## ✔ 6 of 6 tuning:     normalized_KNN (1m 43.3s)

stopCluster(cl)

Alternatively, the finetune::tune_race_anova() could be used. Rather than performing the entire CV process for each parameter combination, this tuning function eliminates some tuning parameter combinations whenever it becomes clear that they are unlikely to produce the best results. See TMWR Ch15.4. To get a quick assessment of the results, the autoplot() function can be used:

autoplot(workflow_fits, rank_metric = "rmse")

autoplot(workflow_fits, rank_metric = "rmse", select_best=TRUE)

Here it looks like the boosted tree model is performing the best, and it’s not particularly close.

library(kableExtra)

## 
## Attaching package: 'kableExtra'

## The following object is masked from 'package:dplyr':
## 
##     group_rows

workflow_fits %>% collect_metrics(select_best=TRUE) %>%
  group_by(wflow_id) %>%
  filter(.metric=="rmse") %>%
  slice_min(order_by = mean) %>%
  ungroup() %>%
  arrange(mean) %>%
  select(model, mean, n, std_err) %>%
  kable()

model	mean	n	std_err
boost_tree	4.133672	50	0.0978921
rand_forest	5.062955	50	0.0932092
mars	6.227203	50	0.1115937
svm_rbf	6.338072	50	0.0878039
svm_poly	7.025577	50	0.2169813
nearest_neighbor	7.995286	50	0.1099770

boosting_res <- workflow_fits %>%
  extract_workflow_set_result("simple_boosting")
boosting_res %>% 
  show_best(metric="rmse") %>%
  select(-.config) %>%
  kable() # for nice output, e.g., for a project

trees	min_n	tree_depth	learn_rate	loss_reduction	sample_size	.metric	.estimator	mean	n	std_err
860	5	5	0.0169087	0.0002203	0.7646908	rmse	standard	4.133672	50	0.0978921
1924	22	12	0.1823157	0.0331872	0.9843885	rmse	standard	4.138362	50	0.0896586
1745	27	5	0.2490691	0.0015410	0.3977150	rmse	standard	4.184375	50	0.0902776
623	24	9	0.0256900	0.0000005	0.7522490	rmse	standard	4.312496	50	0.0929550
339	10	10	0.0476567	0.0167030	0.2984359	rmse	standard	4.347723	50	0.0958753

best_boost <- boosting_res %>% select_best(metric="rmse")
final_fit <- workflow_fits %>%
  extract_workflow("simple_boosting") %>%
  finalize_workflow(
    parameters = best_boost
  ) %>%
  fit(data=concrete_train)

final_test_res <- final_fit %>%
  augment(new_data=concrete_test)

rmse(final_test_res, truth=compressive_strength, estimate=.pred)

## # A tibble: 1 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rmse    standard        4.49

Support Vector Machines for Classification

Nonlinear Regression Models

Background

Preprocessing and Model Specification