## ----setup, include = FALSE--------------------------------------------------- knitr::opts_chunk$set( collapse = TRUE, comment = "#>", fig.width = 7, fig.height = 4 ) ## ----simulate----------------------------------------------------------------- library(mixedsubjectsirt) library(ggplot2) set.seed(2026) n_human <- 400 n_generated <- 1200 n_items <- 8 n_good <- 4 # items where LLM predicts well true_pars <- data.frame( item = paste0("Item", seq_len(n_items)), a = seq(0.8, 1.6, length.out = n_items), d = seq(-1.1, 1.1, length.out = n_items) ) true_pars$b <- -true_pars$d / true_pars$a theta_human <- rnorm(n_human) observed <- simulate_2pl(theta_human, true_pars) # LLM: good for items 1–4 (same DGP), poor for 5–8 (random noise) llm_pars_good <- true_pars llm_pars_poor <- true_pars llm_pars_poor$a <- pmax(0.05, rnorm(n_items, 0, 0.1)) # near-random llm_pars_poor$d <- rnorm(n_items, 0, 2) llm_pars_poor$b <- -llm_pars_poor$d / llm_pars_poor$a # Build predicted (same subjects as human) predicted <- observed # F = Y for first 4 items predicted[, 5:8] <- simulate_2pl(theta_human, llm_pars_poor)[, 5:8] # Build generated generated_good <- simulate_2pl(rnorm(n_generated), true_pars) generated_poor <- simulate_2pl(rnorm(n_generated), llm_pars_poor) generated <- cbind(generated_good[, 1:4], generated_poor[, 5:8]) colnames(generated) <- true_pars$item ## ----baseline----------------------------------------------------------------- human_pars <- fit_2pl(observed, technical = list(NCYCLES = 500))$pars global_tuned <- tune_lambda_ability_risk( lambda_grid = seq(0, 1, by = 0.1), observed = observed, predicted = predicted, generated = generated, initial_pars = human_pars, fit_fn = fit_mixed_subjects_mml, n_quad = 11, control = list(maxit = 200) ) cat("Global scalar best lambda:", global_tuned$best_lambda, "\n") ## ----ppi-item----------------------------------------------------------------- ppi_item <- tune_lambda_ppi_score_item( observed = observed, predicted = predicted, item_pars = human_pars, n_generated = n_generated, n_quad = 11 ) cat("Per-item PPI++ lambda:\n") print(data.frame(item = ppi_item$item, lambda = round(ppi_item$lambda, 3))) cat("N/(n+N) upper bound:", round(n_generated / (n_human + n_generated), 3), "\n") ## ----item-risk, cache = FALSE------------------------------------------------- item_tuned <- tune_lambda_ability_risk_item( lambda_grid = seq(0, 1, by = 0.25), observed = observed, predicted = predicted, generated = generated, initial_pars = human_pars, init_lambda = global_tuned$best_lambda, # start from global best n_quad = 11, n_pass = 1, control = list(maxit = 200) ) cat("Per-item ability-risk lambda:\n") print(data.frame(item = item_tuned$item, lambda = round(item_tuned$lambda, 3))) ## ----compare------------------------------------------------------------------ fit_scalar <- global_tuned$best_fit fit_per_item <- item_tuned$final_fit rmse <- function(x, y) sqrt(mean((x - y)^2)) comparison <- data.frame( item = true_pars$item, true_a = round(true_pars$a, 3), human_a = round(human_pars$a, 3), scalar_a = round(fit_scalar$item_pars$a, 3), item_a = if (is.null(fit_per_item)) NA_real_ else round(fit_per_item$item_pars$a, 3) ) knitr::kable(comparison, row.names = FALSE, caption = "Discrimination recovery: scalar lambda vs. per-item lambda") cat("RMSE(a) human-only: ", round(rmse(human_pars$a, true_pars$a), 4), "\n") cat("RMSE(a) scalar MML: ", round(rmse(fit_scalar$item_pars$a, true_pars$a), 4), "\n") if (!is.null(fit_per_item)) { cat("RMSE(a) per-item MML: ", round(rmse(fit_per_item$item_pars$a, true_pars$a), 4), "\n") }