knitr::opts_chunk$set(echo = TRUE)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.0 ✔ purrr 1.0.1
## ✔ tibble 3.1.8 ✔ dplyr 1.0.10
## ✔ tidyr 1.2.1 ✔ stringr 1.5.0
## ✔ readr 2.1.3 ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(janitor)
##
## Attaching package: 'janitor'
##
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.0.0 ──
## ✔ broom 1.0.2 ✔ rsample 1.1.1
## ✔ dials 1.1.0 ✔ tune 1.0.1
## ✔ infer 1.0.4 ✔ workflows 1.1.2
## ✔ modeldata 1.0.1 ✔ workflowsets 1.0.0
## ✔ parsnip 1.0.3 ✔ yardstick 1.1.0
## ✔ recipes 1.0.4
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter() masks stats::filter()
## ✖ recipes::fixed() masks stringr::fixed()
## ✖ dplyr::lag() masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step() masks stats::step()
## • Use tidymodels_prefer() to resolve common conflicts.
The Data
We’ll work again with the patients.csv data, which contains information about different tumors that we are trying to classify as benign (0) or cancerous (1).
patients <- read.csv("breast-cancer.csv") %>%
  clean_names() %>%
  mutate(class = as.factor(class))
To get a sense for how logistic regression works, consider the following plot:
ggplot(patients, aes(x=bland_chromatin, y=class)) +
geom_point(position = position_jitter(w = 0.25, h = 0), alpha=.2)
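To actually see the S-shaped probability curve that logistic regression fits, one option (a quick sketch, not part of the original chunk) is to recode class as 0/1 and let geom_smooth() fit a binomial GLM directly:
# Sketch: overlay a logistic curve fit by geom_smooth();
# class01 is a helper column added here to recode the factor to 0/1.
patients %>%
  mutate(class01 = as.numeric(class) - 1) %>%
  ggplot(aes(x = bland_chromatin, y = class01)) +
  geom_point(position = position_jitter(w = 0.25, h = 0), alpha = .2) +
  geom_smooth(method = "glm", method.args = list(family = "binomial"), se = FALSE)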
patients_split <- initial_split(patients, prop=.8)

patients_train <- training(patients_split)
patients_test  <- testing(patients_split)

patients_folds <- vfold_cv(patients_train, v=10)
Logistic Regression
Let’s create a logistic regression model specification.
logistic_model <- logistic_reg() %>%
  set_engine("glm") %>%        # the default, stands for "generalized linear models"
  set_mode("classification")   # also the default
What kinds of preprocessing should we use? Check out the appendix in TMWR.
logistic_recipe <- recipe(class ~ ., data=patients_train) %>%
  # update_role(id, new_role="id") %>%
  step_normalize(all_numeric_predictors())
logistic_wf <- workflow() %>%
  add_model(logistic_model) %>%
  add_recipe(logistic_recipe)
Cross-validate the model to estimate performance:
logistic_wf %>%
  fit_resamples(resamples=patients_folds) %>%
  collect_metrics()
## # A tibble: 2 × 6
## .metric .estimator mean n std_err .config
## <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 accuracy binary 0.974 10 0.00780 Preprocessor1_Model1
## 2 roc_auc binary 0.996 10 0.00156 Preprocessor1_Model1
logistic_results <- logistic_wf %>%
  fit(data=patients_train) %>%
  augment(new_data=patients_test)
my_metrics <- metric_set(accuracy, sens, spec)

accuracy(logistic_results, truth=class, estimate=.pred_class)
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 accuracy binary 0.956
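Since my_metrics bundles accuracy, sensitivity, and specificity, you could also apply the whole set to the logistic results in one call, just as we do for the penalized model below (output not shown):
# Apply the full metric set to the held-out logistic predictions
my_metrics(logistic_results, truth=class, estimate=.pred_class)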
# (Optional) visualize the fit: observed 0/1 class and predicted P(class = 1) vs. bland_chromatin
# logistic_results %>%
#   mutate(class = as.numeric(class)-1) %>%
#   ggplot(aes(x=bland_chromatin)) +
#   geom_point(aes(y=as.numeric(class)), alpha=.5) +
#   geom_point(aes(y=.pred_1), color="blue")
Your turn
We will continue to work on the titanic activity (KNN). Once you finish the specified steps, complete the following:

1. Build, cross-validate, and train a logistic regression model. Is there any difference in performance between the two methods?
2. Use the tune() and tune_grid() functions to choose the value of \(k\) that maximizes the accuracy of your KNN model (a sketch of the pattern follows this list).
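Here is one way the tuning pattern could look. It is only a sketch: titanic_recipe and titanic_folds stand in for whatever you named your recipe and resampling folds in the titanic activity, and it assumes the kknn package is installed.
# Sketch: tune the number of neighbors for a KNN classifier.
# titanic_recipe and titanic_folds are placeholder names from the titanic activity.
knn_model <- nearest_neighbor(neighbors = tune()) %>%
  set_engine("kknn") %>%
  set_mode("classification")

knn_wf <- workflow() %>%
  add_model(knn_model) %>%
  add_recipe(titanic_recipe)

knn_wf %>%
  tune_grid(
    resamples = titanic_folds,
    grid = tibble(neighbors = 1:20),
    metrics = metric_set(accuracy)
  ) %>%
  show_best(n = 5)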
Penalized Regression
We can implement a penalized regression model (linear or logistic) using the glmnet package. Make sure you install this package before you run!
penalized_model <- logistic_reg(penalty=tune()) %>%
  set_engine("glmnet") %>%     # glmnet fits penalized (regularized) GLMs
  set_mode("classification")   # also the default
penalized_wf <- workflow() %>%
  add_model(penalized_model) %>%
  add_recipe(logistic_recipe)
Cross-validate the model to tune parameters:
penalized_wf %>%
  tune_grid(
    resamples=patients_folds,
    grid=10,
    metrics = metric_set(accuracy)
  ) %>%
  show_best(n=5)
## # A tibble: 5 × 7
## penalty .metric .estimator mean n std_err .config
## <dbl> <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 3.41e-10 accuracy binary 0.974 10 0.00780 Preprocessor1_Model01
## 2 1.34e- 9 accuracy binary 0.974 10 0.00780 Preprocessor1_Model02
## 3 1.43e- 8 accuracy binary 0.974 10 0.00780 Preprocessor1_Model03
## 4 6.96e- 7 accuracy binary 0.974 10 0.00780 Preprocessor1_Model04
## 5 4.61e- 6 accuracy binary 0.974 10 0.00780 Preprocessor1_Model05
Smaller values of the penalty mean the regularization has less influence. I would interpret the results above to mean that penalization is actually hurting performance! This is likely because I gave you a good set of predictors. In this case, I would probably stick with a penalty of 0.
penalized_results <- penalized_wf %>%
  finalize_workflow(parameters = tibble(penalty=0)) %>%
  fit(data=patients_train) %>%
  augment(new_data=patients_test)
my_metrics(penalized_results, truth=class, estimate=.pred_class)
## # A tibble: 3 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 accuracy binary 0.956
## 2 sens binary 0.958
## 3 spec binary 0.951
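If you did want to carry a tuned penalty forward instead of hard-coding 0, one option (a sketch; penalized_tune is a new object name, not from the chunks above) is to save the tuning results and let select_best() pick the value:
# Sketch: save the tuning results, then finalize with the best penalty by accuracy.
penalized_tune <- penalized_wf %>%
  tune_grid(resamples=patients_folds, grid=10, metrics = metric_set(accuracy))

penalized_wf %>%
  finalize_workflow(select_best(penalized_tune, metric = "accuracy")) %>%
  fit(data=patients_train) %>%
  augment(new_data=patients_test) %>%
  my_metrics(truth=class, estimate=.pred_class)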