knitr::opts_chunk$set(echo = TRUE, eval = FALSE)
library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.0.0 ──
## ✔ broom 1.0.2 ✔ recipes 1.0.4
## ✔ dials 1.1.0 ✔ rsample 1.1.1
## ✔ dplyr 1.0.10 ✔ tibble 3.1.8
## ✔ ggplot2 3.4.0 ✔ tidyr 1.2.1
## ✔ infer 1.0.4 ✔ tune 1.0.1
## ✔ modeldata 1.0.1 ✔ workflows 1.1.2
## ✔ parsnip 1.0.3 ✔ workflowsets 1.0.0
## ✔ purrr 1.0.1 ✔ yardstick 1.1.0
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ purrr::discard() masks scales::discard()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ recipes::step() masks stats::step()
## • Use tidymodels_prefer() to resolve common conflicts.
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ readr   2.1.3     ✔ forcats 0.5.2
## ✔ stringr 1.5.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ readr::col_factor() masks scales::col_factor()
## ✖ purrr::discard() masks scales::discard()
## ✖ dplyr::filter() masks stats::filter()
## ✖ stringr::fixed() masks recipes::fixed()
## ✖ dplyr::lag() masks stats::lag()
## ✖ readr::spec() masks yardstick::spec()
library(ggridges)
library(skimr)
library(GGally)
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
General Process
Train/Test Split
We are going to consider using a linear model to predict the wait time between Old Faithful eruptions from the duration of the eruption. Since we want to know whether our model will work well on data we haven’t yet seen, a good way to understand its performance is to randomly split the data into two parts: a training set and a test set.
In tidymodels, the split object stores how the split is performed.
library(tidymodels)
faithful <- read_csv("faithful.csv") %>%
  mutate(long_wait = if_else(duration > 3, TRUE, FALSE))

# Use a seed for reproducibility
# Otherwise your code will run differently than mine (which is ok!)
set.seed(2023) # comment this out if you want the code to generate a true random split

faithful_split <- initial_split(faithful, prop = 0.80)
faithful_split

faithful_train <- training(faithful_split)
faithful_test  <- testing(faithful_split)
The output tells us how many observations are in each set.
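If you want those counts explicitly, you can also ask the training and test sets directly; a quick check using base nrow() on the objects created above:
# count the rows in each piece of the split
nrow(faithful_train)
nrow(faithful_test)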
Specifying the model
# Our first model
faithful_tidymodel1 <- linear_reg() %>%
  set_engine("lm")
Notice that the model isn’t yet “trained”; we’ve only specified the type of model and which engine (algorithm) we want to use for the training.
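Printing the specification confirms this, and parsnip’s translate() shows the template of the lm() call that will be made at fit time; a quick sketch:
# the untrained specification -- no coefficients yet
faithful_tidymodel1

# the underlying lm() call parsnip will construct when we fit
translate(faithful_tidymodel1)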
Fitting the model
# fit the model using training data
faithful_fit1 <- fit(
  faithful_tidymodel1,
  wait ~ duration,
  data = faithful_train
)
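If you want to see the estimated coefficients right away, broom’s tidy() works on the fitted parsnip object; a quick check on the training fit (nothing here touches the test set):
tidy(faithful_fit1)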
Making predictions
Now that we have the model fit to the training data, we see what the model would predict for data in the test set:
# Use the model to make predictions on the test set
faithful_results1 <- predict(
  faithful_fit1,
  new_data = faithful_test %>% select(-wait)
) %>%
  bind_cols(faithful_test %>% select(wait))
faithful_results1
Model Evaluation
Let’s see how the model performed:
# make a set of metrics you want to use
# For linear regression, we're interested in R^2 and
# Root mean square error
metrics <- metric_set(rmse, rsq)

# apply those metrics to the results
metrics(faithful_results1, truth = wait, estimate = .pred)
Comparing with a second model: overfitting
Repeat these steps with our full model using the long_wait variable and compare the metrics.
# Notice that here we specified the model
# and fit the model in one step!
faithful_tidymodel2 <- linear_reg() %>%
  set_engine("lm") %>%
  fit(wait ~ duration * long_wait, data = faithful_train)

# Use the model to make predictions on the test set
faithful_results2 <- predict(
  faithful_tidymodel2,
  new_data = faithful_test %>% select(-wait)
) %>%
  bind_cols(faithful_test %>% select(wait))
glance(faithful_tidymodel2)
metrics(faithful_results2, truth = wait, estimate = .pred)
What does overfitting look like? Go back and fit the model using the formula wait ~ poly(duration, 15). Compare the fit (R^2) and accuracy (RMSE) on the training data to the test data. What do you notice?
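One way to set up that comparison (a sketch reusing the objects defined above; the training-set metrics come from augmenting the training data itself):
# a very flexible polynomial model, fit on the training data only
faithful_overfit <- linear_reg() %>%
  set_engine("lm") %>%
  fit(wait ~ poly(duration, 15), data = faithful_train)

# compare metrics on the training data vs. the test data
metrics(augment(faithful_overfit, new_data = faithful_train),
        truth = wait, estimate = .pred)
metrics(augment(faithful_overfit, new_data = faithful_test),
        truth = wait, estimate = .pred)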
ggplot(faithful_train, aes(duration, wait)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE, formula = y ~ poly(x, 15))
Tidymodels with Recipes and Workflows
Here we will focus on predicting fuel efficiency (mpg) from a US Department of Energy data set of real cars from 2018. Many methods require categorical variables to be designated as factors (as they should be), and you want to make sure that happens before any data splitting.
cars2018 <- read_csv("cars2018.csv") %>%
  mutate_if(is.character, as.factor)
glimpse(cars2018)
EDA
# Plot a histogram
ggplot(cars2018, aes(x = mpg, fill = drive)) +
  geom_histogram(bins = 25) +
  labs(x = "Fuel efficiency (mpg)",
       y = "Number of cars")

ggplot(cars2018, aes(x = mpg, y = drive)) +
  geom_density_ridges(aes(fill = drive), alpha = .3) +
  labs(x = "Fuel efficiency (mpg)",
       y = "Drive")
# Engine displacement is the measure of the cylinder volume swept by all of the
# pistons of a piston engine, excluding the combustion chambers. It is commonly
# used as an expression of an engine's size, and by extension as a loose indicator
# of the power an engine might be capable of producing and the amount of fuel it
# should be expected to consume.
# Scatterplot
ggplot(cars2018, aes(x = displacement, y = mpg, color = drive)) +
  geom_point() +
  labs(y = "Fuel efficiency (mpg)",
       x = "Displacement",
       color = "Drive")

# Scatterplot with transformation
ggplot(cars2018, aes(x = displacement, y = log(mpg), color = drive)) +
  geom_point() +
  geom_smooth() +
  labs(y = "Log Fuel efficiency (mpg)",
       x = "Displacement",
       color = "Drive")
cars2018 <- cars2018 %>% mutate(lnmpg = log(mpg))
ggplot(cars2018, aes(x = lnmpg, y = drive)) +
  geom_density_ridges(aes(fill = drive), alpha = .3) +
  labs(x = "Log Fuel efficiency (mpg)",
       y = "Drive")
# Deselect the model and model_index columns to create car_vars
# These columns are individual identifiers for each car and it would not make sense to include them in modeling
# All other variables are potential predictors.
set.seed(2023) # this is a random split, so setting the seed lets us reproduce results
car_vars <- cars2018 %>%
  select(-model, -model_index, -mpg) # we are using lnmpg as our outcome now
skim(car_vars)
# ggpairs(car_vars)
Initial Train/Test Split
An initial split is a single train/test split without cross-validation (we’ll talk about CV soon, but we keep it simple here).
set.seed(2023) # this is a random split, so setting the seed lets us reproduce results
?initial_split

car_split <- car_vars %>%
  initial_split(prop = 0.8,
                strata = transmission)
car_split

car_train <- training(car_split)
car_test  <- testing(car_split)
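Since we stratified on transmission, the proportions of each transmission type should be roughly the same in both sets; a quick sketch of that check with dplyr:
# compare the transmission mix in the training and test sets
car_train %>% count(transmission) %>% mutate(prop = n / sum(n))
car_test %>% count(transmission) %>% mutate(prop = n / sum(n))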
Model Fitting
Let’s specify the model. We only know one kind so far!
# a linear regression model specification
# there is no data referenced here!
lm_mod <- linear_reg() %>% # this is the type of model
  set_engine("lm")
Below I am fitting the model to the data. Since I am using the standard lm engine, the fitting is done via least squares, i.e., the coefficients are chosen to minimize the sum of squared residuals.
# an example with 2 predictors
# note that the first argument of the "fit" function is the model specification
lm_fit <- lm_mod %>%
  fit(lnmpg ~ transmission + cylinders, # there are lots of ways to designate "formulas" in R
      data = car_train)

lm_fit$fit
# all the predictors
lm_fit <- lm_mod %>%
  fit(lnmpg ~ .,
      data = car_train)

lm_fit$fit # note the parameter values (the betas) are different
# Print the summary of the model fit
# You can save these as their own tibbles
tidy(lm_fit)
glance(lm_fit)
Predicting
The row order of the predictions is always the same as the original data.
predict(lm_fit, new_data = car_test)
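Because that row order matches car_test, you could attach the predictions yourself with a column bind; a minimal sketch:
# bind the .pred column onto the test set by position
bind_cols(car_test, predict(lm_fit, new_data = car_test))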
# Notice that this is slightly different from what we tried above
# The augment function will do the prediction and add the columns so you don't need a separate call
car_test_results <- augment(lm_fit, new_data = car_test)

metrics(car_test_results, truth = lnmpg, estimate = .pred)
ggplot(car_test_results, aes(x = lnmpg, y = .pred)) +
  geom_point() +
  geom_abline(slope = 1, intercept = 0, color = "green")
# plot(lm_fit$fit) # the 'lm' function has a plotting method just for its own results -- it's not the best, but you can see what it does!
ggplot(car_test_results, aes(x = .pred, y = .resid)) +
  geom_point()

ggplot(car_test_results, aes(x = .resid)) +
  geom_histogram()
Feature Engineering
Let’s take a step back: what have we done at this point?
- Imported data from csv
- Mutated character variables to factors (preprocessing)
- EDA
- Took the log of mpg (preprocessing)
Let’s repeat some of these steps using recipes.
# Re-importing the data to remove the preprocessing step and splitting the data
cars2018 <- read_csv("cars2018.csv") %>% mutate(lnmpg = log(mpg))

set.seed(2023)

car_split <- cars2018 %>%
  initial_split(prop = 0.8,
                strata = transmission)

car_train <- training(car_split)
car_test  <- testing(car_split)
Why did we already transform the outcome variable here? Why not in the recipe (the next step)? If the transformation lived in the recipe, we would get an error when trying to predict new data where mpg is unknown. So transform outcome variables outside of recipes.
(There are decisions to make about which steps to do once, up front, and which to put in a recipe.)
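For contrast, here is a sketch of the recipe we are deliberately not writing; step_log() is a standard recipes step, and the issue (as noted above) is that applying it to the outcome would require a known mpg when baking new data:
# illustration only -- NOT what we want: the outcome transformation is baked
# into the recipe, so new data without mpg could not be processed for prediction
bad_recipe <- recipe(mpg ~ ., data = car_train) %>%
  step_log(mpg)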
# the first step here uses the formula to identify roles
lm_recipe <- recipe(lnmpg ~ ., data = car_train) %>%
  # now we will convert strings to factors
  step_string2factor(where(is_character)) %>%
  # now we remove some variables
  step_rm(model, model_index, mpg)
Ok, now we have a recipe. Did anything actually happen? What can we do with a recipe?
Once the recipe is specified, we prep it, then bake the data.
tidy(lm_recipe)
prep(lm_recipe)
baked_train <- bake(prep(lm_recipe), new_data = NULL)
baked_train
We can also add a new step to the recipe and recompute.
lm_recipe <- lm_recipe %>%
  step_dummy(all_nominal_predictors()) # add a new step

baked_train <- bake(prep(lm_recipe), new_data = NULL)
lm_fit <- linear_reg() %>%
  set_engine("lm") %>%
  fit(lnmpg ~ ., data = baked_train)

baked_test <- bake(prep(lm_recipe), new_data = car_test)

cars_results <- predict(lm_fit, new_data = baked_test) %>%
  bind_cols(baked_test %>% select(lnmpg))
# we specified these metrics earlier so this is not strictly necessary
metrics <- metric_set(rmse, rsq)
metrics(cars_results, truth = lnmpg, estimate = .pred)
Workflows
Now we can do this with workflows, which will take care of all the baking and prepping!
# first, create a model
# we actually already did this, but we'll do it again just for clarity
lm_mod <- linear_reg() %>% # this is the type of model
  set_engine("lm")

# second, add it to a workflow ALONG WITH the recipe for preprocessing
lm_wkflow <- workflow() %>%
  add_recipe(lm_recipe) %>%
  add_model(lm_mod)

lm_wkflow
Model Fitting Round 2
lm_fit <- fit(lm_wkflow, data = car_train)

tidy(lm_fit)
glance(lm_fit)
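If you want to inspect the pieces inside the fitted workflow, the workflows package provides extractor helpers; a quick sketch that pulls out the underlying parsnip fit and the prepped recipe:
# the fitted parsnip/lm model and the trained recipe from the workflow
extract_fit_parsnip(lm_fit)
extract_recipe(lm_fit)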
Let’s take our model and predict the test data as before. This should look exactly the same as the first time we ran it.
# the workflow will do all the preprocessing steps for the test data
predict(lm_fit, new_data = car_test)

# the augment function is an easy way to add the predictions to the data set
car_test_results <- augment(lm_fit, new_data = car_test)
ggplot(car_test_results, aes(x = lnmpg, y = .pred)) +
  geom_point() +
  geom_abline(slope = 1, intercept = 0, color = "green")
Model Evaluation
metrics <- metric_set(rmse, rsq)
metrics(car_test_results, truth = lnmpg, estimate = .pred)