Lecture: K Nearest Neighbors

2023-01-10

KNN Demonstration
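These notes assume the following packages are loaded (clean_names() comes from janitor, skim() from skimr, and the "kknn" engine requires the kknn package to be installed):

library(tidyverse)
library(tidymodels)
library(janitor)
library(skimr)
library(kknn)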

patients <- read.csv("breast-cancer.csv") %>% clean_names() %>% mutate(class = factor(class))
glimpse(patients)
## Rows: 683
## Columns: 11
## $ id                          <int> 1000025, 1002945, 1015425, 1016277, 101702…
## $ clump_thickness             <int> 5, 5, 3, 6, 4, 8, 1, 2, 2, 4, 1, 2, 5, 1, …
## $ uniformity_of_cell_size     <int> 1, 4, 1, 8, 1, 10, 1, 1, 1, 2, 1, 1, 3, 1,…
## $ uniformity_of_cell_shape    <int> 1, 4, 1, 8, 1, 10, 1, 2, 1, 1, 1, 1, 3, 1,…
## $ marginal_adhesion           <int> 1, 5, 1, 1, 3, 8, 1, 1, 1, 1, 1, 1, 3, 1, …
## $ single_epithelial_cell_size <int> 2, 7, 2, 3, 2, 7, 2, 2, 2, 2, 1, 2, 2, 2, …
## $ bare_nuclei                 <int> 1, 10, 2, 4, 1, 10, 10, 1, 1, 1, 1, 1, 3, …
## $ bland_chromatin             <int> 3, 3, 3, 3, 3, 9, 3, 3, 1, 2, 3, 2, 4, 3, …
## $ normal_nucleoli             <int> 1, 2, 1, 7, 1, 7, 1, 1, 1, 1, 1, 1, 4, 1, …
## $ mitoses                     <int> 1, 1, 1, 1, 1, 1, 1, 1, 5, 1, 1, 1, 1, 1, …
## $ class                       <fct> 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, …
patients %>% group_by(class) %>% summarise(n=n())
## # A tibble: 2 × 2
##   class     n
##   <fct> <int>
## 1 0       444
## 2 1       239
skim(patients)
## ── Data Summary ────────────────────────
##                            Values
## Name                       patients
## Number of rows             683
## Number of columns          11
## Column type frequency:
##   factor                   1
##   numeric                  10
## Group variables            None
## 
## ── Variable type: factor ───────────────────────────────────────────────
##   skim_variable n_missing complete_rate ordered n_unique top_counts
## 1 class                 0             1 FALSE          2 0: 444, 1: 239
## 
## ── Variable type: numeric ──────────────────────────────────────────────
##    skim_variable               n_missing complete_rate       mean        sd    p0    p25     p50     p75     p100 hist
##  1 id                                  0             1 1076720.23 620644.05 63375 877617 1171795 1238705 13454352 ▇▁▁▁▁
##  2 clump_thickness                     0             1       4.44      2.82     1      2       4       6       10 ▇▇▇▃▃
##  3 uniformity_of_cell_size             0             1       3.15      3.07     1      1       1       5       10 ▇▂▁▁▂
##  4 uniformity_of_cell_shape            0             1       3.22      2.99     1      1       1       5       10 ▇▂▁▁▁
##  5 marginal_adhesion                   0             1       2.83      2.86     1      1       1       4       10 ▇▂▁▁▁
##  6 single_epithelial_cell_size         0             1       3.23      2.22     1      2       2       4       10 ▇▂▂▁▁
##  7 bare_nuclei                         0             1       3.54      3.64     1      1       1       6       10 ▇▁▁▁▂
##  8 bland_chromatin                     0             1       3.45      2.45     1      2       3       5       10 ▇▅▁▂▁
##  9 normal_nucleoli                     0             1       2.87      3.05     1      1       1       4       10 ▇▁▁▁▁
## 10 mitoses                             0             1       1.60      1.73     1      1       1       1       10 ▇▁▁▁▁
ggplot(data=patients, aes(x=bland_chromatin, y=single_epithelial_cell_size, color=class)) +
  geom_point(position="jitter")

These two quantities look nicely separated, and could be useful for prediction! Let's use them to build a k nearest neighbors (KNN) model.

What does KNN do?

KNN classifies a new observation by a majority vote among its k closest points in the training set. Suppose we are diagnosing a new patient, and we get readings on bland_chromatin and single_epithelial_cell_size, say 3 and 5, respectively.

patients_split <- initial_split(patients, prop = 0.80, strata = class)
patients_train <- training(patients_split)
patients_test <- testing(patients_split)
patients_train %>% 
  mutate(
    dist = sqrt((bland_chromatin-3)^2+(single_epithelial_cell_size-5)^2)
    ) %>%
  slice_min(dist, n=5, with_ties=FALSE) # There are a bunch of ties!
##        id clump_thickness uniformity_of_cell_size uniformity_of_cell_shape
## 1  242970               5                       7                        7
## 2  718641               1                       1                        1
## 3 1116132               6                       3                        4
## 4 1171845               8                       6                        4
## 5  832226               3                       4                        4
##   marginal_adhesion single_epithelial_cell_size bare_nuclei bland_chromatin
## 1                 1                           5           8               3
## 2                 1                           5           1               3
## 3                 1                           5           2               3
## 4                 3                           5           9               3
## 5                10                           5           1               3
##   normal_nucleoli mitoses class dist
## 1               4       1     0    0
## 2               1       1     0    0
## 3               9       1     1    0
## 4               1       1     1    0
## 5               3       1     1    0

What class should we predict here? What if we change the number of neighbors? What if we change the point?
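One way to answer the first question: tally the classes among the k nearest training points and predict the most common one. A minimal sketch of that vote, reusing the distance computation above (count() and slice_max() are standard dplyr):

k <- 5
patients_train %>%
  mutate(dist = sqrt((bland_chromatin-3)^2+(single_epithelial_cell_size-5)^2)) %>%
  slice_min(dist, n=k, with_ties=FALSE) %>%
  count(class) %>%    # votes per class among the k neighbors
  slice_max(n, n=1)   # the majority class is our prediction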

Building a KNN Model

Specify the model:

# weight_func = "rectangular" means an unweighted majority vote among the neighbors
knn_model <- nearest_neighbor(weight_func = "rectangular", neighbors = 3) %>% 
  set_engine("kknn") %>% 
  set_mode("classification")

Fit the model to the training data:

knn_fit <- knn_model %>% 
  fit(class ~ bland_chromatin + single_epithelial_cell_size, data = patients_train)

Evaluate the model on the test set (recall that we may also use augment here):

patients_pred <- patients_test %>% 
  bind_cols(predict(knn_fit, new_data=patients_test))
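# Equivalently: patients_pred <- augment(knn_fit, new_data=patients_test)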
# Some performance metrics for classification
conf_mat(patients_pred, truth = class, estimate = .pred_class)
##           Truth
## Prediction  0  1
##          0 83  6
##          1  6 42
my_metrics <- metric_set(sens, spec, accuracy)
my_metrics(patients_pred, truth = class, estimate = .pred_class)
## # A tibble: 3 × 3
##   .metric  .estimator .estimate
##   <chr>    <chr>          <dbl>
## 1 sens     binary         0.933
## 2 spec     binary         0.875
## 3 accuracy binary         0.912
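# These match the confusion matrix above: yardstick treats the first factor
# level (0) as the event by default, so sens = 83/(83+6) ≈ 0.933,
# spec = 42/(42+6) = 0.875, and accuracy = (83+42)/137 ≈ 0.912.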
patients_pred <- patients_pred %>% mutate(misclassified = class != .pred_class)
ggplot(data=patients_pred, aes(x=bland_chromatin, y=single_epithelial_cell_size)) + 
  geom_point(
    data=patients_train, 
    aes(x=bland_chromatin, y=single_epithelial_cell_size, shape=class), 
    position="jitter")+
  geom_point(
    aes(color=misclassified, shape=class), 
    position="jitter") 

patients_pred %>% filter(class != .pred_class) %>% head()
##        id clump_thickness uniformity_of_cell_size uniformity_of_cell_shape
## 1 1113906               9                       5                        5
## 2 1126417              10                       6                        4
## 3 1177399               8                       3                        5
## 4 1177512               1                       1                        1
## 5  145447               8                       4                        4
## 6   76389              10                       4                        7
##   marginal_adhesion single_epithelial_cell_size bare_nuclei bland_chromatin
## 1                 2                           2           2               5
## 2                 1                           3           4               3
## 3                 4                           5          10               1
## 4                 1                          10           1               1
## 5                 1                           2           9               3
## 6                 2                           2           8               6
##   normal_nucleoli mitoses class .pred_class misclassified
## 1               1       1     1           0          TRUE
## 2               2       3     1           0          TRUE
## 3               6       2     1           0          TRUE
## 4               1       1     0           1          TRUE
## 5               3       1     1           0          TRUE
## 6               1       1     1           0          TRUE

How many neighbors should we use?

knn_fit <- knn_model %>%
  # change this to whatever you want and compare with your neighbors
  set_args(neighbors=5) %>%
  # Can we try fitting on all the predictors?
  fit(class ~ . -id, data = patients_train)

patients_pred <- augment(knn_fit, new_data=patients_test)

#conf_mat(patients_pred, truth = class, estimate = .pred_class)
my_metrics(patients_pred, truth = class, estimate = .pred_class)
## # A tibble: 3 × 3
##   .metric  .estimator .estimate
##   <chr>    <chr>          <dbl>
## 1 sens     binary         0.978
## 2 spec     binary         0.958
## 3 accuracy binary         0.971

Weighted KNN

Should all the nearest neighbors get an equal vote? Shouldn’t the closest neighbors have a bigger say? This information can be incorporated using weight functions, which give more weight to closer observations and less to ones further away. See this paper for more details.
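To see the idea concretely, here is a toy sketch in base R (illustrative only, with made-up distances; this is not kknn's exact internal computation):

# Hypothetical distances from a new point to its 5 nearest neighbors
d <- c(0.2, 0.5, 1.0, 1.5, 2.0)
# A gaussian-style kernel: weight decays smoothly with distance
w <- exp(-d^2)
round(w / sum(w), 3) # normalized vote weights; the closest neighbor counts most
## [1] 0.431 0.349 0.165 0.047 0.008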

knn_recipe <- recipe(class ~ ., data=patients_train) %>%
  update_role(id, new_role="id") %>%
  step_normalize(all_numeric_predictors())

knn_model <- nearest_neighbor(neighbors=5, weight_func="gaussian") %>% # What are the default arguments?
  set_engine("kknn") %>% 
  set_mode("classification") 

knn_wf <- workflow() %>% 
  add_recipe(knn_recipe) %>%
  add_model(knn_model)

knn_fit <- knn_wf %>% fit(data = patients_train)

patients_pred <- augment(knn_fit, new_data=patients_test)

conf_mat(patients_pred, truth = class, estimate = .pred_class)
##           Truth
## Prediction  0  1
##          0 87  2
##          1  2 46
my_metrics(patients_pred, truth = class, estimate = .pred_class)
## # A tibble: 3 × 3
##   .metric  .estimator .estimate
##   <chr>    <chr>          <dbl>
## 1 sens     binary         0.978
## 2 spec     binary         0.958
## 3 accuracy binary         0.971
patients_pred <- patients_pred %>% mutate(misclassified = class != .pred_class)
ggplot(data=patients_pred, aes(x=bland_chromatin, y=single_epithelial_cell_size))+ 
  geom_point(
    data=patients_train, 
    aes(x=bland_chromatin, y=single_epithelial_cell_size, shape=class), 
    position="jitter") +
  geom_point(
    aes(color=misclassified, shape=class), 
    position="jitter") 

Resampling

V-fold Cross-validation

The function vfold_cv is used to create the cross-validation folds. What are the default arguments? (By default, v = 10, repeats = 1, and strata = NULL, i.e. no stratification.)

patients_folds <- vfold_cv(patients_train, v = 10)
# to actually return a particular fold
#patients_folds$splits[[4]] %>% analysis()
#patients_folds$splits[[4]] %>% assessment()

Repeated V-fold Cross-validation

We can also set up the folds to repeat many times. How many estimates do we have now?

patients_folds <- vfold_cv(patients_train, v = 10, repeats = 5)
# to actually return a particular fold:
# patients_folds$splits[[1]] %>% analysis()
# patients_folds$splits[[1]] %>% assessment()
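(With v = 10 and repeats = 5 we now have 10 × 5 = 50 performance estimates per metric; the n = 50 column in collect_metrics() below confirms this.)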

Fitting the resamples

Now our workflow object can fit the model to each analysis set and compute metrics on each assessment set. You can also pass fit_resamples a metric set; otherwise it automatically chooses defaults (accuracy and roc_auc for classification, as in the output below).
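For example, to compute our own metrics during resampling (a sketch reusing my_metrics from earlier; not run here):

# knn_wf %>% fit_resamples(resamples = patients_folds, metrics = my_metrics)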

# This might take a while!
patients_res <- knn_wf %>% fit_resamples(resamples = patients_folds)
patients_res
## # Resampling results
## # 10-fold cross-validation repeated 5 times 
## # A tibble: 50 × 5
##    splits           id      id2    .metrics         .notes          
##    <list>           <chr>   <chr>  <list>           <list>          
##  1 <split [491/55]> Repeat1 Fold01 <tibble [2 × 4]> <tibble [0 × 3]>
##  2 <split [491/55]> Repeat1 Fold02 <tibble [2 × 4]> <tibble [0 × 3]>
##  3 <split [491/55]> Repeat1 Fold03 <tibble [2 × 4]> <tibble [0 × 3]>
##  4 <split [491/55]> Repeat1 Fold04 <tibble [2 × 4]> <tibble [0 × 3]>
##  5 <split [491/55]> Repeat1 Fold05 <tibble [2 × 4]> <tibble [0 × 3]>
##  6 <split [491/55]> Repeat1 Fold06 <tibble [2 × 4]> <tibble [0 × 3]>
##  7 <split [492/54]> Repeat1 Fold07 <tibble [2 × 4]> <tibble [0 × 3]>
##  8 <split [492/54]> Repeat1 Fold08 <tibble [2 × 4]> <tibble [0 × 3]>
##  9 <split [492/54]> Repeat1 Fold09 <tibble [2 × 4]> <tibble [0 × 3]>
## 10 <split [492/54]> Repeat1 Fold10 <tibble [2 × 4]> <tibble [0 × 3]>
## # … with 40 more rows
collect_metrics(patients_res)
## # A tibble: 2 × 6
##   .metric  .estimator  mean     n std_err .config             
##   <chr>    <chr>      <dbl> <int>   <dbl> <chr>               
## 1 accuracy binary     0.970    50 0.00318 Preprocessor1_Model1
## 2 roc_auc  binary     0.985    50 0.00247 Preprocessor1_Model1

Tuning hyperparameters

Parameters that we want to mark for tuning can take the tune() tag as an argument.

(Do you remember what dist_power is?)
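It is the power p in the Minkowski distance, d(x, y) = (sum_i |x_i - y_i|^p)^(1/p): p = 1 gives Manhattan distance and p = 2 gives the usual Euclidean distance.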

knn_model <- knn_model %>%
  set_args(neighbors=tune(), dist_power=tune())

knn_wf <- workflow() %>%
  add_recipe(knn_recipe) %>%
  add_model(knn_model)

We can take a look at the default parameter sets below:

knn_param <- knn_wf %>% extract_parameter_set_dials()
knn_param %>% extract_parameter_dials("neighbors")
## Nearest Neighbors (quantitative)
## Range: [1, 15]
knn_param %>% extract_parameter_dials("dist_power")
## Minkowski Distance Order (quantitative)
## Range: [0.1, 2]

In our example, we said we were interested in dist powers between 1 and 2, and in 3, 5, 7, or 9 nearest neighbors. Here are two ways to create a regular grid like this:

# Way 1: Using the crossing function
# Creates every combination (the Cartesian product) of the supplied values
my_grid <- crossing(
  neighbors = c(3, 5, 7, 9),
  dist_power = c(1, 1.25, 1.5, 1.75, 2)
  )

ggplot(my_grid, aes(neighbors, dist_power)) + geom_point()

# Way 2: Using the grid_regular function
knn_param <- knn_param %>% update(
  neighbors = neighbors(c(3,9)),
  dist_power = dist_power(c(1,2))
)

my_grid <- grid_regular(
  # the parameter set:
  knn_param, 
  # how many levels to use for each parameter; could also be a single integer to use the same number of levels for every parameter
  levels = c(neighbors=4, dist_power=5) 
  )
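Both ways produce the same 4 × 5 = 20 candidate combinations here: grid_regular spaces its levels evenly over each range, giving neighbors 3, 5, 7, 9 and dist_power 1, 1.25, 1.5, 1.75, 2.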

We use the tune_grid function in a similar way to how we used fit_resamples.

# Will take a while!
knn_tune <- knn_wf %>%
  tune_grid(
    patients_folds, # the CV set
    grid=my_grid, # the grid of candidate parameter combinations
    metrics=metric_set(accuracy) # the metrics you'd like to compute
    )

The book lists a bunch of ways to examine the different parameter combinations using plots! Here we'll just use the show_best() function to see the best-performing combinations, where "best" is quantified according to the specified metric.

knn_tune %>% show_best()
## # A tibble: 5 × 8
##   neighbors dist_power .metric  .estimator  mean     n std_err .config          
##       <int>      <dbl> <chr>    <chr>      <dbl> <int>   <dbl> <chr>            
## 1         5       1.25 accuracy binary     0.971    50 0.00332 Preprocessor1_Mo…
## 2         5       2    accuracy binary     0.970    50 0.00318 Preprocessor1_Mo…
## 3         5       1    accuracy binary     0.970    50 0.00326 Preprocessor1_Mo…
## 4         9       1.5  accuracy binary     0.970    50 0.00295 Preprocessor1_Mo…
## 5         5       1.5  accuracy binary     0.969    50 0.00333 Preprocessor1_Mo…

(This probably indicates that we should perform a second round of CV using neighbors values close to 5.)

Now that we have a best model, we can update our workflow and fit it to the training set. We can use select_* to get the best performing parameter set:

select_best(knn_tune, metric="accuracy")
## # A tibble: 1 × 3
##   neighbors dist_power .config              
##       <int>      <dbl> <chr>                
## 1         5       1.25 Preprocessor1_Model06
#select_by_one_std_err(knn_tune, metric="accuracy", desc(neighbors))
knn_wf <- knn_wf %>%
  finalize_workflow(
    parameters = select_best(knn_tune, metric="accuracy")
    # could also set the parameters "by hand" here, e.g.
    # parameters = list(neighbors = 6, dist_power = 2)
  )

final_results <- knn_wf %>%
  fit(patients_train) %>%
  augment(new_data=patients_test)

my_metrics(final_results, truth=class, estimate=.pred_class)
## # A tibble: 3 × 3
##   .metric  .estimator .estimate
##   <chr>    <chr>          <dbl>
## 1 sens     binary         0.989
## 2 spec     binary         0.938
## 3 accuracy binary         0.971

Should we always choose the best performing model?

Without specifying a grid:

knn_tune <- knn_wf %>%
  tune_grid(
    patients_folds, # the CV set
    grid=4, # the number of candidate parameter combinations to generate
    metrics=metric_set(accuracy) # the metrics you'd like to compute
    )
## Warning: No tuning parameters have been detected, performance will be evaluated
## using the resamples with no tuning. Did you want to [tune()] parameters?

(We get this warning because we finalized knn_wf above: its tune() placeholders have been replaced with concrete values, so there is nothing left to tune.)

Specifying other types of grids

Here are some other types of grids. (When grid is an integer, tune_grid creates a space-filling design by default.)

my_grid <- grid_latin_hypercube(
  # the parameter set:
  knn_param, 
  # how many points in the grid
  size = 13
  )
ggplot(my_grid, aes(neighbors, dist_power)) + geom_point()

my_grid <- grid_max_entropy(
  # the parameter set:
  knn_param, 
  # how many points in the grid
  size = 13
  )
ggplot(my_grid, aes(neighbors, dist_power)) + geom_point()