Paper Example 2: MCAR missingness and custom criterion callbacks

This vignette reproduces the MCAR missingness component of Schroeders and Gnambs (2025) Example 2 as far as irtsim’s current API permits. The paper’s full Example 2 uses a bivariate latent trait (theta + external criterion) with a TAM latent regression — features irtsim does not yet support. What we can reproduce is the impact of MCAR missingness on item parameter recovery, which is one of the paper’s key design variables. For the full gap analysis, see vignette("paper-reproduction-gaps").

We also demonstrate irtsim’s custom criterion callback (criterion_fn), which lets users compute arbitrary item-level metrics on top of the built-in bias/MSE/RMSE/coverage.

Background

A researcher is developing a 30-item personality scale administered online. Due to respondent burden, some participants see only a random subset of items (MCAR at 30%). We compare complete data (0% missing) to 30% MCAR, asking: how much does MCAR missingness degrade item parameter recovery, and how large a sample is needed to compensate?
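Before running the full study, the MCAR mechanism itself can be sketched in a few lines of base R (independent of irtsim): each response is dropped with the same probability, regardless of the item, the person, or the unobserved response value. That independence is what makes the mechanism "completely at random".

```r
set.seed(1)
n_persons <- 500
n_items   <- 30
miss_rate <- 0.30

# TRUE = observed, FALSE = missing; the mask depends on nothing
# but an independent coin flip per cell (the definition of MCAR)
observed <- matrix(runif(n_persons * n_items) > miss_rate,
                   nrow = n_persons, ncol = n_items)

# Empirical missing rate should be close to the nominal 30%
mean(!observed)
```

irtsim applies the same kind of mask internally when `missing = "mcar"` is requested; the sketch above is only meant to make the mechanism concrete.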

Study Design

library(irtsim)
library(ggplot2)

set.seed(2024)
n_items <- 30

params <- irt_params_2pl(
  n_items = n_items,
  a_dist  = "lnorm",
  a_mean  = 0.2,
  a_sd    = 0.3,
  b_mean  = 0,
  b_sd    = 1,
  seed    = 2024
)

design <- irt_design(
  model       = "2PL",
  n_items     = n_items,
  item_params = params,
  theta_dist  = "normal"
)

sample_sizes <- seq(200, 1000, by = 200)

study_complete <- irt_study(design, sample_sizes = sample_sizes)

study_mcar30 <- irt_study(
  design,
  sample_sizes = sample_sizes,
  missing      = "mcar",
  missing_rate = 0.30
)

res_complete <- irt_simulate(study_complete, iterations = 500,
                             seed = 2024, parallel = TRUE)
res_mcar30   <- irt_simulate(study_mcar30,   iterations = 500,
                             seed = 2024, parallel = TRUE)

Note on reproducibility. The results in this vignette were precomputed with parallel = TRUE. See ?irt_simulate for the serial/parallel reproducibility contract.

Proportion of Items Meeting RMSE Threshold

At each sample size, what fraction of items achieve acceptable RMSE? This view captures the full picture: both items that reach the threshold quickly and items that never reach it at any tested N, which a single max(recommended_n) would hide.

#' Compute proportion of items meeting a criterion threshold at each N
prop_meeting <- function(res, criterion, threshold, param = NULL) {
  s <- summary(res, criterion = criterion, param = param)
  df <- s$item_summary
  cfg <- irtsim:::get_criterion_config(criterion)
  vals <- df[[criterion]]
  if (cfg$use_abs) vals <- abs(vals)
  if (cfg$direction == "higher_is_better") {
    df$meets <- !is.na(vals) & vals >= threshold
  } else {
    df$meets <- !is.na(vals) & vals <= threshold
  }
  agg <- aggregate(meets ~ sample_size, data = df, FUN = mean)
  names(agg)[2] <- "prop_meeting"
  agg
}
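The core of the helper — flag each item, then average the flags within each sample size — can be checked on a hand-made toy data frame (hypothetical values; the column names mirror the item_summary used above):

```r
# Two sample sizes, three items each; rmse values are made up
toy <- data.frame(
  sample_size = rep(c(200, 400), each = 3),
  rmse        = c(0.25, 0.18, 0.30, 0.15, 0.19, 0.22)
)

# "lower is better" branch of prop_meeting, threshold 0.20
toy$meets <- !is.na(toy$rmse) & toy$rmse <= 0.20
agg <- aggregate(meets ~ sample_size, data = toy, FUN = mean)
agg
#> At N = 200, 1 of 3 items meet the threshold (1/3);
#> at N = 400, 2 of 3 do (2/3)
```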

Discrimination (a)

prop_a_complete <- prop_meeting(res_complete, "rmse", 0.20, param = "a")
prop_a_complete$condition <- "Complete"

prop_a_mcar <- prop_meeting(res_mcar30, "rmse", 0.20, param = "a")
prop_a_mcar$condition <- "30% MCAR"

prop_a <- rbind(prop_a_complete, prop_a_mcar)

ggplot(prop_a, aes(x = sample_size, y = prop_meeting, colour = condition)) +
  geom_line(linewidth = 0.9) +
  geom_point(size = 2.5) +
  scale_y_continuous(labels = scales::percent_format(), limits = c(0, 1)) +
  labs(
    title = "Proportion of Items with RMSE(a) \u2264 0.20",
    x = "Sample size (N)", y = "Proportion meeting threshold",
    colour = NULL
  ) +
  theme_minimal(base_size = 12)

Difficulty (b)

prop_b_complete <- prop_meeting(res_complete, "rmse", 0.20, param = "b")
prop_b_complete$condition <- "Complete"

prop_b_mcar <- prop_meeting(res_mcar30, "rmse", 0.20, param = "b")
prop_b_mcar$condition <- "30% MCAR"

prop_b <- rbind(prop_b_complete, prop_b_mcar)

ggplot(prop_b, aes(x = sample_size, y = prop_meeting, colour = condition)) +
  geom_line(linewidth = 0.9) +
  geom_point(size = 2.5) +
  scale_y_continuous(labels = scales::percent_format(), limits = c(0, 1)) +
  labs(
    title = "Proportion of Items with RMSE(b) \u2264 0.20",
    x = "Sample size (N)", y = "Proportion meeting threshold",
    colour = NULL
  ) +
  theme_minimal(base_size = 12)

Aggregate RMSE Trajectories

The ribbon shows the range (min to max) across items; the line is the mean.

make_agg <- function(res, param, label) {
  s <- summary(res, criterion = "rmse", param = param)
  agg <- aggregate(
    rmse ~ sample_size,
    data = s$item_summary,
    FUN = function(x) c(mean = mean(x), min = min(x), max = max(x))
  )
  agg <- do.call(data.frame, agg)
  names(agg) <- c("sample_size", "mean_rmse", "min_rmse", "max_rmse")
  agg$condition <- label
  agg
}
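The do.call(data.frame, agg) step in make_agg deserves a brief note: aggregate() with a multi-value FUN returns a single matrix-valued column, and wrapping the result in do.call(data.frame, …) splits that matrix into ordinary columns. A minimal illustration with toy data (nothing irtsim-specific):

```r
d <- data.frame(g = c("a", "a", "b", "b"), y = c(1, 3, 2, 6))
agg <- aggregate(y ~ g, data = d,
                 FUN = function(x) c(mean = mean(x), min = min(x), max = max(x)))
# agg$y is a single matrix-valued column at this point
flat <- do.call(data.frame, agg)
names(flat)
#> "g" "y.mean" "y.min" "y.max"
```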

agg_all <- rbind(
  make_agg(res_complete, "a", "Complete"),
  make_agg(res_mcar30,   "a", "30% MCAR")
)

ggplot(agg_all, aes(x = sample_size, colour = condition, fill = condition)) +
  geom_ribbon(aes(ymin = min_rmse, ymax = max_rmse), alpha = 0.15, colour = NA) +
  geom_line(aes(y = mean_rmse), linewidth = 0.9) +
  geom_point(aes(y = mean_rmse), size = 2) +
  geom_hline(yintercept = 0.20, linetype = "dashed", colour = "grey40") +
  labs(
    title    = "RMSE(a) \u2014 Complete vs. 30% MCAR",
    subtitle = "Line = mean across items; ribbon = min\u2013max range",
    x = "Sample size (N)", y = "RMSE(a)", colour = NULL, fill = NULL
  ) +
  theme_minimal(base_size = 12)

Custom Criterion: Standard Error of Measurement

irtsim’s criterion_fn argument to summary() lets users define arbitrary item-level criteria. Here we compute the average standard error across replications, derived from the CI width.

#' Average standard error per item, recovered from the 95% CI width
sem_at_theta <- function(estimates, true_value, ci_lower, ci_upper, converged, ...) {
  valid <- !is.na(ci_lower) & !is.na(ci_upper)
  if (!any(valid)) return(c(sem = NA_real_))
  # A Wald 95% CI spans 2 * 1.96 = 3.92 standard errors
  se_vec <- (ci_upper[valid] - ci_lower[valid]) / 3.92
  c(sem = mean(se_vec))
}

sum_complete_custom <- summary(res_complete, criterion_fn = sem_at_theta)
sum_mcar30_custom   <- summary(res_mcar30,   criterion_fn = sem_at_theta)

Any function with the signature f(estimates, true_value, ci_lower, ci_upper, converged, ...) -> named numeric can be passed to summary(). This lets users compute domain-specific metrics without modifying irtsim internals.
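The conversion inside sem_at_theta is simply the Wald 95% CI identity run in reverse: an interval of estimate ± 1.96 × SE has width 3.92 × SE, so dividing the width by 3.92 recovers the SE. A quick round trip with toy numbers:

```r
se_true  <- 0.15
estimate <- 1.2

# Construct a Wald 95% CI from a known SE ...
ci_lower <- estimate - 1.96 * se_true
ci_upper <- estimate + 1.96 * se_true

# ... and recover that SE from the interval width
se_back <- (ci_upper - ci_lower) / 3.92
se_back
#> 0.15
```

Note this is exact only for symmetric Wald-type intervals; for profile or bootstrap intervals the recovered value is an approximation.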

Theta Recovery

sum_complete <- summary(res_complete)
sum_mcar30   <- summary(res_mcar30)

cat("=== Theta Recovery ===\n\n")
#> === Theta Recovery ===
cat("Complete data:\n")
#> Complete data:
print(sum_complete$theta_summary[, c("sample_size", "mean_cor", "mean_rmse")])
#>   sample_size  mean_cor mean_rmse
#> 1         200 0.9349194 0.3604117
#> 2         400 0.9364936 0.3549195
#> 3         600 0.9366280 0.3535303
#> 4         800 0.9366737 0.3521547
#> 5        1000 0.9369620 0.3511869
cat("\n30% MCAR:\n")
#> 
#> 30% MCAR:
print(sum_mcar30$theta_summary[, c("sample_size", "mean_cor", "mean_rmse")])
#>   sample_size  mean_cor mean_rmse
#> 1         200 0.9382227 0.3522203
#> 2         400 0.9387196 0.3481327
#> 3         600 0.9390215 0.3463959
#> 4         800 0.9388607 0.3465829
#> 5        1000 0.9392179 0.3448338

Summary

rec_a_complete <- recommended_n(sum_complete, criterion = "rmse",
                                threshold = 0.20, param = "a")
rec_a_mcar     <- recommended_n(sum_mcar30,   criterion = "rmse",
                                threshold = 0.20, param = "a")

n_items <- nrow(rec_a_complete)
n_na_complete <- sum(is.na(rec_a_complete$recommended_n))
n_na_mcar     <- sum(is.na(rec_a_mcar$recommended_n))

cat("Items tested:", n_items, "\n")
#> Items tested: 30
cat("Items reaching RMSE(a) <= 0.20:\n")
#> Items reaching RMSE(a) <= 0.20:
cat("  Complete:", n_items - n_na_complete, "of", n_items, "\n")
#>   Complete: 30 of 30
cat("  30% MCAR:", n_items - n_na_mcar, "of", n_items, "\n")
#>   30% MCAR: 1 of 30

MCAR missingness reduces the number of observed responses per item, which lowers the information available for each item parameter: RMSE rises and sample-size requirements inflate. The proportion-meeting-threshold plots show this at each N: fewer items achieve acceptable precision under MCAR than under complete data. The magnitude of the penalty depends on the missingness rate, the number of items, and the discrimination structure.
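A back-of-envelope check on the size of the penalty (an approximation, not an irtsim result): under 30% MCAR each item is answered by roughly N × 0.70 examinees, and standard errors scale roughly with 1/sqrt(N), so matching complete-data precision requires about 1/0.70 ≈ 1.43 times the sample. The simulated penalty can differ, since item and person parameters are estimated jointly, but the heuristic gives the right order of magnitude.

```r
miss_rate <- 0.30

# Responses per item under MCAR, relative to complete data
info_ratio <- 1 - miss_rate   # 0.70

# Approximate sample-size inflation needed to match complete-data SEs
n_inflation <- 1 / info_ratio
n_inflation
#> ~1.43: e.g. N = 700 with 30% MCAR carries roughly as much
#> per-item information as N = 490 complete cases
```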

The criterion_fn callback demonstrated here allows users to define domain-specific metrics without modifying irtsim internals.

For the paper’s full Example 2 — which uses the SE of a theta-criterion correlation from a TAM latent regression — see vignette("paper-reproduction-gaps") and Obj 30 in the project backlog.

References

Morris, T. P., White, I. R., & Crowther, M. J. (2019). Using simulation studies to evaluate statistical methods. Statistics in Medicine, 38(11), 2074–2102.

Schroeders, U., & Gnambs, T. (2025). Sample-size planning in item response theory: A tutorial. Advances in Methods and Practices in Psychological Science, 8(1).