4 min read

Standard errors for cross-validated precision macro

Simulation function – the log-odds of the response are a quadratic function of a continuous predictor (plus Gaussian noise)

# Simulate binary outcomes whose log-odds are quadratic in one continuous
# predictor, with extra Gaussian noise added to the linear predictor.
#
# Args:
#   sample_size: number of observations to draw (default 25)
# Returns:
#   data.frame with columns:
#     y.factor - factor with levels "COPD" (y = 1) first, then "control" (y = 0)
#     x        - continuous predictor drawn from N(0, 1)
#     y        - the raw 0/1 outcome
sim_quadratic_logistic_data <- function(sample_size = 25) {
  x <- rnorm(n = sample_size)
  # noisy quadratic linear predictor
  eta <- -1.5 + 0.5 * x + x^2 + rnorm(sample_size, sd = 2)
  prob <- 1 / (1 + exp(-eta))
  y <- rbinom(n = sample_size, size = 1, prob = prob)
  data.frame(
    y.factor = factor(y, levels = 1:0, labels = c("COPD", "control")),
    x = x,
    y = y
  )
}

Fit the model once to see what the resulting predicted probability curve looks like

# Simulate one data set of size 500 and check the class balance
set.seed(42)
example_data <- sim_quadratic_logistic_data(sample_size = 500)
table(example_data$y.factor)
## 
##    COPD control 
##     209     291
# Fit the correctly specified quadratic logistic model to the 0/1 outcome
fit_glm <- glm(y ~ x + I(x^2), data = example_data, family = binomial)
summary(fit_glm)
## 
## Call:
## glm(formula = y ~ x + I(x^2), family = binomial, data = example_data)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.0555  -0.8862  -0.8084   1.2208   1.6041  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -0.9058     0.1255  -7.217 5.30e-13 ***
## x             0.3904     0.1118   3.491 0.000482 ***
## I(x^2)        0.6619     0.1009   6.558 5.45e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 679.64  on 499  degrees of freedom
## Residual deviance: 609.91  on 497  degrees of freedom
## AIC: 615.91
## 
## Number of Fisher Scoring iterations: 4
# Plot the 0/1 data with the fitted and "true" probability curves overlaid
plot(y ~ x, data = example_data,
     pch = 20, ylab = "Estimated Probability",
     main = "Logistic Regression, Quadratic Relationship")
grid()
# fitted probability curve; curve() supplies x, data.frame(x) names it "x"
# to match the model's predictor
curve(predict(fit_glm, data.frame(x), type = "response"),
      add = TRUE, col = "dodgerblue", lty = 2)
# inverse-logit of the noise-free linear predictor from the simulator.
# NOTE(review): the simulator adds N(0, sd = 2) noise to eta, so this is the
# conditional probability at zero noise, not the marginal P(y = 1 | x) —
# worth confirming that is the intended "true" curve.
curve(boot::inv.logit(-1.5 + 0.5 * x + x ^ 2),
      add = TRUE, col = "darkorange", lty = 1)
legend("bottomleft", c("True Probability", "Estimated Probability", "Data"), lty = c(1, 2, 0),
       pch = c(NA, NA, 20), lwd = 2, col = c("darkorange", "dodgerblue", "black"))

Run cross-validation experiment once

# Load helper packages; pacman::p_load installs anything missing first.
if (!requireNamespace("pacman")) install.packages("pacman")
## Loading required namespace: pacman
# One p_load call covers everything used below. ggplot2 is listed explicitly
# because ggplot() is called later in this script; previously it was only
# available as a side effect of caret attaching ggplot2 via its Depends.
pacman::p_load(caret, MLmetrics, dplyr, tidyr, ggplot2)

# caret summaryFunction returning PPV, NPV, their macro average, and a
# model-based variance estimate for that average.
#
# Args:
#   data:  data.frame with factor columns `pred` and `obs`, supplied by caret
#          once per resample; the positive class is the first factor level
#   lev:   character vector of class levels, positive class first
#   model: unused (required by caret's summaryFunction signature)
# Returns:
#   named numeric vector: PPV, NPV, Macro = (PPV + NPV) / 2 (JH calls this
#   ave.success12), and SE2_Macro = variance of Macro treating PPV and NPV
#   as independent binomial proportions (JH calls this var.ave.success)
MySummary <- function (data, lev = NULL, model = NULL) {

  # fall back to the observed factor levels when `lev` is not supplied
  if (is.null(lev)) lev <- levels(data$obs)

  # confusion table: rows = predicted class, columns = observed class.
  # Index by level NAME, not row position, so the denominators cannot
  # silently pair with the wrong class if the level order ever changes.
  tt <- table(data$pred, data$obs)
  n_call_pos <- sum(tt[lev[1], ]) # number of 'positives' called by the model
  n_call_neg <- sum(tt[lev[2], ]) # number of 'negatives' called by the model

  # PPV = TP / (TP + FP), NPV = TN / (TN + FN); algebraically identical to
  # caret::posPredValue / caret::negPredValue with the default (observed)
  # prevalence. NOTE: NaN if the model calls no positives (or no negatives).
  PPV <- tt[lev[1], lev[1]] / n_call_pos
  NPV <- tt[lev[2], lev[2]] / n_call_neg

  c(PPV = PPV,
    NPV = NPV,
    Macro = (PPV + NPV) / 2,
    # variance of the average of two independent proportions
    SE2_Macro = (1/2^2) * (PPV * (1 - PPV) / n_call_pos +
                             NPV * (1 - NPV) / n_call_neg)
  )
}
# Single cross-validation run: simulate one data set, fit the model with
# K-fold CV, and inspect the per-fold metrics produced by MySummary.
set.seed(42)

# sample size
N <- 500

# number of CV folds
K <- 5

# simulate one data set of size N
df <- sim_quadratic_logistic_data(sample_size = N)

# fit the model with cross-validated metrics; savePredictions keeps the
# per-fold predictions and summaryFunction swaps in the PPV/NPV metrics
fit <- caret::train(y.factor ~ x + I(x^2),
                    data = df,
                    method = "glm",
                    family = "binomial",
                    trControl = trainControl(method = "cv",
                                             number = K,
                                             savePredictions = TRUE,
                                             summaryFunction = MySummary,
                                             classProbs = TRUE))

# one row per fold: PPV, NPV, Macro, and its model-based variance
fit$resample
##         PPV       NPV     Macro   SE2_Macro Resample
## 1 0.7307692 0.6891892 0.7099792 0.002615458    Fold1
## 2 0.5000000 0.6000000 0.5500000 0.003875000    Fold2
## 3 0.5652174 0.6282051 0.5967113 0.003419760    Fold3
## 4 0.8400000 0.7297297 0.7848649 0.002010298    Fold4
## 5 0.5909091 0.6282051 0.6095571 0.003495596    Fold5

Run CV experiment many times

set.seed(42)
n.samples <- 400 # number of replications

# sample size
N <- 500

# number of CV folds
K <- 5

# Repeat the whole simulate-then-cross-validate experiment n.samples times,
# collecting one row of summary statistics per replication.
res.list <- lapply(seq_len(n.samples), function(rep_id) {

  # fresh simulated data set of size N for this replication
  sim_df <- sim_quadratic_logistic_data(sample_size = N)

  # fit the model with cross-validated metrics
  cv_fit <- caret::train(y.factor ~ x + I(x^2),
                         data = sim_df,
                         method = "glm",
                         family = "binomial",
                         trControl = trainControl(method = "cv",
                                                  number = K,
                                                  savePredictions = TRUE,
                                                  summaryFunction = MySummary,
                                                  classProbs = TRUE))

  data.frame(
    SE_pbar = sqrt((1/K) * mean(cv_fit$resample$SE2_Macro)), # model based (what the reviewer suggested)
    SEM = sd(cv_fit$resample$Macro) / sqrt(K), # what we did aka the Standard error of the mean
    corr_ppv_npv = cor(cv_fit$resample$PPV, cv_fit$resample$NPV)
  )

})


# stack the per-replication rows and tag each with its replicate index
res <- do.call(rbind, res.list) %>% 
  mutate(replicate = 1:n())

# Histogram of the fold-level correlation between PPV and NPV, one value
# per replication, with the grand mean reported in the title.
hist(res$corr_ppv_npv,
     main = sprintf("Average correlation(PPV, NPV) over %i samples = %0.2f",n.samples, mean(res$corr_ppv_npv)),
     xlab = "corr(PPV, NPV)",
     col = "lightblue")

# Compare the two standard-error estimates side by side across replications.
res %>% 
  dplyr::select(replicate, SE_pbar, SEM) %>% 
  tidyr::pivot_longer(cols = c(SE_pbar, SEM), names_to = "SE_type") %>% 
  ggplot(aes(x = SE_type, y = value)) +
  geom_boxplot() + 
  labs(title = "Distribution of standard errors for SEM vs. Average of \nK fold model based variances over many replications")

# Per-replication ratio: values above 1 mean the empirical SEM exceeds
# the model-based estimate.
boxplot(res$SEM / res$SE_pbar, main = "Ratio of SEM to SE_pbar")