Machine Learning

Load Packages

library(tidymodels)
── Attaching packages ────────────────────────────────────── tidymodels 1.0.0 ──
✔ broom        1.0.2      ✔ recipes      1.0.4 
✔ dials        1.1.0      ✔ rsample      1.1.1 
✔ dplyr        1.0.10     ✔ tibble       3.1.8 
✔ ggplot2      3.4.0      ✔ tidyr        1.2.1 
✔ infer        1.0.4      ✔ tune         1.0.1 
✔ modeldata    1.0.1      ✔ workflows    1.1.2 
✔ parsnip      1.0.3      ✔ workflowsets 1.0.0 
✔ purrr        1.0.1      ✔ yardstick    1.1.0 
── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
✖ purrr::discard() masks scales::discard()
✖ dplyr::filter()  masks stats::filter()
✖ dplyr::lag()     masks stats::lag()
✖ recipes::step()  masks stats::step()
• Use tidymodels_prefer() to resolve common conflicts.
library(tidyverse)
── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
✔ readr   2.1.3     ✔ forcats 0.5.2
✔ stringr 1.5.0     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ readr::col_factor() masks scales::col_factor()
✖ purrr::discard()    masks scales::discard()
✖ dplyr::filter()     masks stats::filter()
✖ stringr::fixed()    masks recipes::fixed()
✖ dplyr::lag()        masks stats::lag()
✖ readr::spec()       masks yardstick::spec()
library(rpart)
Warning: package 'rpart' was built under R version 4.2.3

Attaching package: 'rpart'

The following object is masked from 'package:dials':

    prune
library(ranger)
Warning: package 'ranger' was built under R version 4.2.3
library(glmnet)
Warning: package 'glmnet' was built under R version 4.2.3
Loading required package: Matrix

Attaching package: 'Matrix'

The following objects are masked from 'package:tidyr':

    expand, pack, unpack

Loaded glmnet 4.1-7
library(rpart.plot)
Warning: package 'rpart.plot' was built under R version 4.2.3
library(vip)
Warning: package 'vip' was built under R version 4.2.3

Attaching package: 'vip'

The following object is masked from 'package:utils':

    vi

Read in Data

#load in data set
flu_ml<-readRDS("../data/flu_ml.rds")

Setup

#setting seed for reproducibility
set.seed(123)
#Split data with 70% in the training set stratifying on BodyTemp
flu_split <- initial_split(data=flu_ml, strata = BodyTemp, prop=7/10)

#Create data frames for the two sets:
train_data <- training(flu_split)
test_data  <- testing(flu_split)
#Cross-validation
resample_object<-vfold_cv(data=train_data, v=5, repeats=5, strata=BodyTemp)
#Recipe
flu_ml_rec <- 
  recipe(BodyTemp ~ ., data = train_data) %>% step_dummy(all_nominal(), -all_outcomes())
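
As a quick sanity check (a sketch, not part of the original output, using only the objects created above), the split proportion and the dummy-coded training data can be inspected:

#Sketch: confirm the 70/30 split and preview the predictors after dummy coding
nrow(train_data) / nrow(flu_ml)
flu_ml_rec %>% prep() %>% bake(new_data = NULL) %>% glimpse()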

Null Model

#Null model recipe
null_recipe <- recipe(BodyTemp ~1, train_data) %>% step_dummy(all_nominal(), -all_outcomes())
#regression model
ln_mod <- linear_reg() %>% set_engine("lm") %>% set_mode("regression")
#Workflow
null_flow <- workflow() %>% add_model(ln_mod) %>% add_recipe(null_recipe)
#Evaluate the null model with cross-validation (fit_resamples() refits the workflow on each resample, so a separate fit() is not needed)
null_fit <- null_flow %>% fit_resamples(resamples=resample_object)
! Fold1, Repeat1: internal: A correlation computation is required, but `estimate` is constant and ha...
(the same warning is repeated for all 25 resamples: the null model predicts a constant value, so R-squared cannot be computed and is reported as NaN below)
null_metrics<- collect_metrics(null_fit)
null_metrics
# A tibble: 2 × 6
  .metric .estimator   mean     n std_err .config             
  <chr>   <chr>       <dbl> <int>   <dbl> <chr>               
1 rmse    standard     1.21    25  0.0177 Preprocessor1_Model1
2 rsq     standard   NaN        0 NA      Preprocessor1_Model1
#Null model RMSE = 1.21; this is the baseline the tuned models need to beat
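
As an optional check (a sketch, not part of the original analysis), the null model simply predicts the mean training BodyTemp, so its RMSE can be reproduced by hand and should land close to the cross-validated 1.21:

#Sketch: RMSE of always predicting the mean training body temperature
mean_temp <- mean(train_data$BodyTemp)
sqrt(mean((train_data$BodyTemp - mean_temp)^2))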

Fitting a Tree

#Model Specification
tune_spec <- 
  decision_tree(
    cost_complexity = tune(),
    tree_depth = tune()
  ) %>% 
  set_engine("rpart") %>% 
  set_mode("regression")
#Grid specification
tree_grid <- grid_regular(cost_complexity(),
                          tree_depth(),
                          levels = 5)
#create workflow
tree_wf <- workflow() %>%
  add_model(tune_spec) %>%
  add_recipe(flu_ml_rec)
#Tuning grid cross validation
tree_res <- 
  tree_wf %>% 
  tune_grid(
    resamples = resample_object,
    grid = tree_grid
    )
! Fold1, Repeat1: internal: A correlation computation is required, but `estimate` is constant and ha...
(repeated across the 25 resamples: some tuning-parameter combinations prune the tree back to a single node, so its predictions are constant and R-squared is undefined)
#get metrics
tree_res %>% 
  collect_metrics()
# A tibble: 50 × 8
   cost_complexity tree_depth .metric .estimator     mean     n  std_err .config
             <dbl>      <int> <chr>   <chr>         <dbl> <int>    <dbl> <chr>  
 1    0.0000000001          1 rmse    standard     1.19      25  0.0181  Prepro…
 2    0.0000000001          1 rsq     standard     0.0361    25  0.00422 Prepro…
 3    0.0000000178          1 rmse    standard     1.19      25  0.0181  Prepro…
 4    0.0000000178          1 rsq     standard     0.0361    25  0.00422 Prepro…
 5    0.00000316            1 rmse    standard     1.19      25  0.0181  Prepro…
 6    0.00000316            1 rsq     standard     0.0361    25  0.00422 Prepro…
 7    0.000562              1 rmse    standard     1.19      25  0.0181  Prepro…
 8    0.000562              1 rsq     standard     0.0361    25  0.00422 Prepro…
 9    0.1                   1 rmse    standard     1.21      25  0.0177  Prepro…
10    0.1                   1 rsq     standard   NaN          0 NA       Prepro…
# … with 40 more rows
#Plotting
tree_res %>% autoplot()

#Find and show best
tree_res %>%
  show_best(metric = "rmse", n = 1)
# A tibble: 1 × 8
  cost_complexity tree_depth .metric .estimator  mean     n std_err .config     
            <dbl>      <int> <chr>   <chr>      <dbl> <int>   <dbl> <chr>       
1    0.0000000001          1 rmse    standard    1.19    25  0.0181 Preprocesso…
best_tree <- tree_res %>%
  select_best(metric = "rmse")
#Best tree RMSE = 1.19, only marginally better than the null model (1.21)
#Finalize workflow
final_wf <- 
  tree_wf %>% 
  finalize_workflow(best_tree)
#Final fit
final_fit <- 
  final_wf %>%
  last_fit(flu_split) 
#Final fit metrics
final_fit %>%
  collect_metrics()
# A tibble: 2 × 4
  .metric .estimator .estimate .config             
  <chr>   <chr>          <dbl> <chr>               
1 rmse    standard    1.19     Preprocessor1_Model1
2 rsq     standard    0.000889 Preprocessor1_Model1
#Plot of final fit
rpart.plot(extract_fit_parsnip(final_fit)$fit)
Warning: Cannot retrieve the data used to build the model (so cannot determine roundint and is.binary for the variables).
To silence this warning:
    Call rpart.plot with roundint=FALSE,
    or rebuild the rpart model with model=TRUE.
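
The warning can be silenced as it suggests; a minimal sketch passing rpart.plot's documented roundint argument:

#Sketch: same tree plot without the roundint warning
rpart.plot(extract_fit_parsnip(final_fit)$fit, roundint = FALSE)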

Fitting a LASSO

#Build the model
lasso_mod <- 
  linear_reg(penalty = tune(), mixture = 1) %>% 
  set_engine("glmnet")
#Pull earlier recipe
flu_ml_rec
Recipe

Inputs:

      role #variables
   outcome          1
 predictor         25

Operations:

Dummy variables from all_nominal(), -all_outcomes()
#Create workflow
lasso_workflow <- 
  workflow() %>% 
  add_model(lasso_mod) %>% 
  add_recipe(flu_ml_rec)
#Create tune grid
lasso_grid <- tibble(penalty = 10^seq(-4, -1, length.out = 30))
#Find lowest penalty values
lasso_grid %>% top_n(-5)
Selecting by penalty
# A tibble: 5 × 1
   penalty
     <dbl>
1 0.0001  
2 0.000127
3 0.000161
4 0.000204
5 0.000259
#Highest penalty values
lasso_grid %>% top_n(5)
Selecting by penalty
# A tibble: 5 × 1
  penalty
    <dbl>
1  0.0386
2  0.0489
3  0.0621
4  0.0788
5  0.1   
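The same 30-value grid, log-spaced between 10^-4 and 10^-1, could also be built with dials (an equivalent sketch, not used in the original analysis):

#Sketch: equivalent penalty grid via dials
grid_regular(penalty(range = c(-4, -1)), levels = 30)
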
#Train and tune model
lasso_res <- 
  lasso_workflow %>% 
  tune_grid(resample_object,
            grid = lasso_grid,
            control = control_grid(save_pred = TRUE),
            metrics = metric_set(rmse))
lasso_res %>% collect_metrics()
# A tibble: 30 × 7
    penalty .metric .estimator  mean     n std_err .config              
      <dbl> <chr>   <chr>      <dbl> <int>   <dbl> <chr>                
 1 0.0001   rmse    standard    1.18    25  0.0167 Preprocessor1_Model01
 2 0.000127 rmse    standard    1.18    25  0.0167 Preprocessor1_Model02
 3 0.000161 rmse    standard    1.18    25  0.0167 Preprocessor1_Model03
 4 0.000204 rmse    standard    1.18    25  0.0167 Preprocessor1_Model04
 5 0.000259 rmse    standard    1.18    25  0.0167 Preprocessor1_Model05
 6 0.000329 rmse    standard    1.18    25  0.0167 Preprocessor1_Model06
 7 0.000418 rmse    standard    1.18    25  0.0167 Preprocessor1_Model07
 8 0.000530 rmse    standard    1.18    25  0.0167 Preprocessor1_Model08
 9 0.000672 rmse    standard    1.18    25  0.0167 Preprocessor1_Model09
10 0.000853 rmse    standard    1.18    25  0.0167 Preprocessor1_Model10
# … with 20 more rows
#Plot
lasso_res %>% autoplot()

#Select best performing model
lasso_res %>% show_best()
# A tibble: 5 × 7
  penalty .metric .estimator  mean     n std_err .config              
    <dbl> <chr>   <chr>      <dbl> <int>   <dbl> <chr>                
1  0.0489 rmse    standard    1.15    25  0.0170 Preprocessor1_Model27
2  0.0621 rmse    standard    1.15    25  0.0170 Preprocessor1_Model28
3  0.0386 rmse    standard    1.16    25  0.0170 Preprocessor1_Model26
4  0.0304 rmse    standard    1.16    25  0.0169 Preprocessor1_Model25
5  0.0788 rmse    standard    1.16    25  0.0172 Preprocessor1_Model29
best_lasso <- lasso_res %>% select_best(metric = "rmse")
best_lasso
# A tibble: 1 × 2
  penalty .config              
    <dbl> <chr>                
1  0.0489 Preprocessor1_Model27
#Final workflow
lasso_final<-lasso_workflow %>% finalize_workflow(best_lasso)
#Final fit
lasso_final_fit<-lasso_final %>% fit(train_data)
#Plot the coefficient paths against log(lambda)
x <- extract_fit_engine(lasso_final_fit)
plot(x, xvar = "lambda")

Fitting a Random Forest

#Build model
cores <- parallel::detectCores()
cores
[1] 4
rf_mod <- 
  rand_forest(mtry = tune(), min_n = tune(), trees = 1000) %>% 
  set_engine("ranger", num.threads = cores, importance="impurity") %>% 
  set_mode("regression")
#Create workflow
rf_workflow <- 
  workflow() %>% 
  add_model(rf_mod) %>% 
  add_recipe(flu_ml_rec)
#Parameters for tuning
extract_parameter_set_dials(rf_mod)
Collection of 2 parameters for tuning

 identifier  type    object
       mtry  mtry nparam[?]
      min_n min_n nparam[+]

Model parameters needing finalization:
   # Randomly Selected Predictors ('mtry')

See `?dials::finalize` or `?dials::update.parameters` for more information.
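mtry has no upper bound until the number of predictors is known, which is why it is flagged for finalization; tune_grid() handles this automatically below, but here is a sketch (not part of the original analysis) of doing it explicitly with dials::finalize():

#Sketch: finalize mtry's range against the dummy-coded predictors
rf_params <- extract_parameter_set_dials(rf_mod) %>%
  finalize(flu_ml_rec %>% prep() %>% bake(new_data = NULL) %>% select(-BodyTemp))
rf_params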
#Tune grid
rf_res <- 
  rf_workflow %>% 
  tune_grid(resample_object,
            grid = 25,
            control = control_grid(save_pred = TRUE),
            metrics = NULL)
i Creating pre-processing data to finalize unknown parameter: mtry
#Find and show best
rf_res %>% show_best(metric = "rmse")
# A tibble: 5 × 8
   mtry min_n .metric .estimator  mean     n std_err .config              
  <int> <int> <chr>   <chr>      <dbl> <int>   <dbl> <chr>                
1     5    27 rmse    standard    1.16    25  0.0168 Preprocessor1_Model18
2     3    19 rmse    standard    1.16    25  0.0167 Preprocessor1_Model09
3     8    24 rmse    standard    1.17    25  0.0167 Preprocessor1_Model04
4    11    36 rmse    standard    1.17    25  0.0168 Preprocessor1_Model13
5     6    15 rmse    standard    1.17    25  0.0167 Preprocessor1_Model22
rf_best <- rf_res %>% select_best(metric = "rmse")
#Final workflow
rf_final_wf<- rf_workflow%>% finalize_workflow(rf_best)
#Final fit
rf_final <- rf_final_wf %>% fit(train_data)
#Variable importance from the parsnip fit (showing up to 28 dummy-coded predictors)
rf_final %>% extract_fit_parsnip() %>% vip(num_features=28)

#The same impurity-based importance scores, pulled directly from the underlying ranger object
fx <- extract_fit_engine(rf_final)
vip(fx)
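
Before choosing a final model, the cross-validated results above can be lined up; a sketch (not part of the original output) comparing each model's best RMSE:

#Sketch: best cross-validated RMSE for each tuned model
bind_rows(
  tree  = show_best(tree_res,  metric = "rmse", n = 1),
  lasso = show_best(lasso_res, metric = "rmse", n = 1),
  rf    = show_best(rf_res,    metric = "rmse", n = 1),
  .id = "model"
) %>% select(model, mean, std_err)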

Fitting Test Data

#I'm using the LASSO fit as the final model because it had the lowest cross-validated RMSE (~1.15, vs. ~1.16 for the random forest and ~1.19 for the tree)
final_test <- lasso_final %>% last_fit(flu_split)
final_test %>% collect_metrics()
# A tibble: 2 × 4
  .metric .estimator .estimate .config             
  <chr>   <chr>          <dbl> <chr>               
1 rmse    standard      1.16   Preprocessor1_Model1
2 rsq     standard      0.0299 Preprocessor1_Model1
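
As a final diagnostic (a sketch, not part of the original output), the test-set predictions saved by last_fit() can be plotted against the observed body temperatures:

#Sketch: observed vs predicted body temperature on the test set
final_test %>%
  collect_predictions() %>%
  ggplot(aes(x = BodyTemp, y = .pred)) +
  geom_point(alpha = 0.5) +
  geom_abline(linetype = "dashed") +
  labs(x = "Observed body temperature", y = "Predicted body temperature")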