## ----setup, include = FALSE--------------------------------------------------- knitr::opts_chunk$set( collapse = TRUE, comment = "#>", message = FALSE, warning = FALSE, fig.width = 7, fig.height = 4.4, dpi = 150, fig.align = "center" ) set.seed(2026) ## ----libs--------------------------------------------------------------------- library(mixqr) library(ggplot2) library(quantreg) ## ----theme, include = FALSE--------------------------------------------------- # A clean, consistent look used throughout. theme_mixqr <- function() { theme_minimal(base_size = 12) + theme(panel.grid.minor = element_blank(), plot.title = element_text(face = "bold"), plot.subtitle = element_text(colour = "grey35"), legend.position = "top") } pal_seq <- c("#1b6ca8", "#e07b39", "#5aae61", "#9b59b6") ## ----simulate----------------------------------------------------------------- simulate_schools <- function(n = 600, pi_A = 0.6) { ses <- rnorm(n) # standardized SES regime <- ifelse(runif(n) < pi_A, "A", "B") err <- exp(rnorm(n, 0, 0.45)) - 1 # right-skewed, median 0 spread <- ifelse(regime == "A", 6, 9) * exp(0.30 * ses) # grows with SES score <- ifelse(regime == "A", 56 + 8.0 * ses, # supportive schools 46 + 3.0 * ses) + spread * err data.frame(score = score, ses = ses, regime = factor(regime)) } schools <- simulate_schools() head(schools) ## ----raw, fig.height = 4.2, fig.alt = "Scatter of score against SES with a single pooled median line."---- single <- rq(score ~ ses, tau = 0.5, data = schools) ggplot(schools, aes(ses, score)) + geom_point(alpha = 0.35, colour = "grey30") + geom_abline(intercept = coef(single)[1], slope = coef(single)[2], linewidth = 1.1, linetype = "dashed") + labs(x = "Family SES (standardized)", y = "Math score", title = "One median line hides the subgroups", subtitle = "A single quantile regression averages two regimes together") + theme_mixqr() ## ----fit---------------------------------------------------------------------- fit <- mixqr(score ~ ses, data = schools, tau = 0.5, m = 2, nstart = 25, variance = "stochEM", vcontrol = mixqr_vcontrol(B = 200, seed = 1)) fit ## ----summary------------------------------------------------------------------ summary(fit) ## ----est-plot, fig.alt = "Score against SES, points coloured by recovered regime, with the two component median lines."---- # Components are ordered by SES slope, so component 1 is the shallower-gradient # (under-resourced) regime and component 2 the steeper (supportive) one. comp_label <- c("Under-resourced schools", "Supportive schools") pal2 <- c("Under-resourced schools" = "#e07b39", "Supportive schools" = "#1b6ca8") schools$regime_hat <- factor(comp_label[predict(fit, type = "class")], levels = comp_label) xseq <- seq(min(schools$ses), max(schools$ses), length.out = 100) lines_df <- do.call(rbind, lapply(1:2, function(j) { data.frame(ses = xseq, score = as.numeric(cbind(1, xseq) %*% fit$beta[, j]), regime = factor(comp_label[j], levels = comp_label)) })) ggplot(schools, aes(ses, score)) + geom_point(aes(colour = regime_hat), alpha = 0.45, size = 1.8) + geom_line(data = lines_df, aes(colour = regime), linewidth = 1.3) + scale_colour_manual(values = pal2, name = "Recovered regime") + labs(x = "Family SES (standardized)", y = "Math score", title = "Two SES–achievement gradients, recovered jointly", subtitle = "Component median regressions from mixqr") + theme_mixqr() ## ----coef-plot, fig.height = 3.6, fig.alt = "Coefficient estimates with 95% confidence intervals for each regime, by term."---- ci <- confint(fit) # Wald intervals from stochastic-EM SEs keep <- grep("^comp", rownames(ci)) coef_df <- data.frame( label = rownames(ci)[keep], est = c(as.numeric(fit$beta)), lo = ci[keep, 1], hi = ci[keep, 2] ) comp_idx <- as.integer(sub("comp(\\d+):.*", "\\1", coef_df$label)) coef_df$regime <- factor(comp_label[comp_idx], levels = comp_label) coef_df$term <- sub("comp\\d+:", "", coef_df$label) coef_df$term <- factor(coef_df$term, labels = c("Intercept", "SES slope")) ggplot(coef_df, aes(est, regime, colour = regime)) + geom_pointrange(aes(xmin = lo, xmax = hi), linewidth = 0.9, size = 0.7) + facet_wrap(~ term, scales = "free_x") + scale_colour_manual(values = pal2, guide = "none") + labs(x = "Estimate (95% CI)", y = NULL, title = "Regime-specific effects with uncertainty", subtitle = "The SES gradient is far steeper in the supportive regime") + theme_mixqr() ## ----posterior, fig.height = 3.6, fig.alt = "Histogram of the probability of belonging to regime A."---- post <- predict(fit, type = "posterior") schools$p_supportive <- post[, 2] # component 2 = supportive (steeper) regime ggplot(schools, aes(p_supportive)) + geom_histogram(bins = 30, fill = "#1b6ca8", colour = "white") + labs(x = "Posterior probability of the supportive regime", y = "Students", title = "Most students are classified with confidence", subtitle = "Mass near 0 and 1 means confident assignments") + theme_mixqr() ## ----overlap------------------------------------------------------------------ fit$diagnostics$overlap ## ----densities, fig.height = 3.8, fig.alt = "Estimated component error densities with the constraint marker at zero."---- fit_kd <- mixqr(score ~ ses, data = schools, tau = 0.5, m = 2, engine = "kdEM", init = "ald", nstart = 15) rng <- range(vapply(fit_kd$density, function(d) range(d$grid), numeric(2))) tt <- seq(rng[1], rng[2], length.out = 400) dens_df <- do.call(rbind, lapply(1:2, function(j) { data.frame(resid = tt, density = fit_kd$density[[j]]$eval(tt), regime = factor(comp_label[j], levels = comp_label)) })) ggplot(dens_df, aes(resid, density, colour = regime)) + geom_line(linewidth = 1.1) + geom_vline(xintercept = 0, linetype = 3, colour = "grey40") + scale_colour_manual(values = pal2, name = "Regime") + labs(x = "Residual", y = "Density", title = "Each regime has its own (skewed) error distribution", subtitle = "Estimated by the Wu & Yao kernel-density EM; tau-quantile fixed at 0") + theme_mixqr() ## ----engines------------------------------------------------------------------ rbind(ALD = as.numeric(fit$beta), kdEM = as.numeric(fit_kd$beta)) ## ----confint------------------------------------------------------------------ confint(fit) ## ----multitau, fig.height = 4.6, fig.alt = "Conditional quantile lines at tau = 0.1, 0.5, 0.9 for each regime."---- taus <- c(0.1, 0.5, 0.9) multi <- do.call(rbind, lapply(taus, function(tt) { f <- mixqr(score ~ ses, data = schools, tau = tt, m = 2, nstart = 15) do.call(rbind, lapply(1:2, function(j) { data.frame(ses = xseq, score = as.numeric(cbind(1, xseq) %*% f$beta[, j]), regime = factor(comp_label[j], levels = comp_label), tau = tt) })) })) ggplot(schools, aes(ses, score)) + geom_point(alpha = 0.18, colour = "grey45", size = 1.2) + geom_line(data = multi, aes(colour = regime, group = interaction(regime, tau), linewidth = factor(tau))) + scale_colour_manual(values = pal2, name = "Regime") + scale_linewidth_manual(values = c("0.1" = 0.5, "0.5" = 1.2, "0.9" = 0.5), name = "Quantile") + labs(x = "Family SES (standardized)", y = "Math score", title = "Conditional quantile bands by regime", subtitle = "Wider bands at high SES reveal growing within-regime inequality") + theme_mixqr() ## ----select, fig.height = 3.6, fig.alt = "Cross-validated score against number of components."---- sel <- mixqr_select(score ~ ses, data = schools, tau = 0.5, m = 1:4, criterion = "cv", folds = 5, nstart = 8, control = mixqr_control(seed = 1)) sel$table ggplot(sel$table, aes(m, cv_negloglik)) + geom_line(colour = "#1b6ca8") + geom_point(size = 3, colour = "#1b6ca8") + geom_point(data = sel$table[sel$table$m == 2, ], size = 5, shape = 21, fill = "#e07b39", colour = "#e07b39") + labs(x = "Number of components (m)", y = "CV held-out negative log-likelihood", title = "Choosing the number of regimes", subtitle = "Lower is better; the elbow at two regimes is the parsimonious choice") + theme_mixqr() ## ----report, eval = FALSE----------------------------------------------------- # # everything needed to reproduce the headline fit # fit <- mixqr(score ~ ses, data = schools, tau = 0.5, m = 2, # nstart = 25, variance = "stochEM", # vcontrol = mixqr_vcontrol(B = 200, seed = 1)) # summary(fit) ## ----cite, eval = FALSE------------------------------------------------------- # citation("mixqr")