## ----setup, include = FALSE---------------------------------------------------
knitr::opts_chunk$set(
  collapse = TRUE, comment = "#>", message = FALSE, warning = FALSE,
  fig.width = 7, fig.height = 4.4, dpi = 150, fig.align = "center"
)
set.seed(2026)

## ----libs---------------------------------------------------------------------
library(mixqr)
library(ggplot2)
library(quantreg)

## ----theme, include = FALSE---------------------------------------------------
# A clean, consistent look used throughout.
theme_mixqr <- function() {
  theme_minimal(base_size = 12) +
    theme(panel.grid.minor = element_blank(),
          plot.title = element_text(face = "bold"),
          plot.subtitle = element_text(colour = "grey35"),
          legend.position = "top")
}
pal_seq <- c("#1b6ca8", "#e07b39", "#5aae61", "#9b59b6")

## ----simulate-----------------------------------------------------------------
simulate_schools <- function(n = 600, pi_A = 0.6) {
  ses <- rnorm(n)                                   # standardized SES
  regime <- ifelse(runif(n) < pi_A, "A", "B")
  err <- exp(rnorm(n, 0, 0.45)) - 1                 # right-skewed, median 0
  spread <- ifelse(regime == "A", 6, 9) * exp(0.30 * ses)  # grows with SES
  score <- ifelse(regime == "A",
                  56 + 8.0 * ses,                   # supportive schools
                  46 + 3.0 * ses) + spread * err
  data.frame(score = score, ses = ses, regime = factor(regime))
}

schools <- simulate_schools()
head(schools)

## ----raw, fig.height = 4.2, fig.alt = "Scatter of score against SES with a single pooled median line."----
single <- rq(score ~ ses, tau = 0.5, data = schools)

ggplot(schools, aes(ses, score)) +
  geom_point(alpha = 0.35, colour = "grey30") +
  geom_abline(intercept = coef(single)[1], slope = coef(single)[2],
              linewidth = 1.1, linetype = "dashed") +
  labs(x = "Family SES (standardized)", y = "Math score",
       title = "One median line hides the subgroups",
       subtitle = "A single quantile regression averages two regimes together") +
  theme_mixqr()

## ----fit----------------------------------------------------------------------
fit <- mixqr(score ~ ses, data = schools, tau = 0.5, m = 2,
             nstart = 25, variance = "stochEM",
             vcontrol = mixqr_vcontrol(B = 200, seed = 1))
fit

## ----summary------------------------------------------------------------------
summary(fit)

## ----est-plot, fig.alt = "Score against SES, points coloured by recovered regime, with the two component median lines."----
# Components are ordered by SES slope, so component 1 is the shallower-gradient
# (under-resourced) regime and component 2 the steeper (supportive) one.
comp_label <- c("Under-resourced schools", "Supportive schools")
pal2 <- c("Under-resourced schools" = "#e07b39",
          "Supportive schools"      = "#1b6ca8")

schools$regime_hat <- factor(comp_label[predict(fit, type = "class")],
                             levels = comp_label)

xseq <- seq(min(schools$ses), max(schools$ses), length.out = 100)
lines_df <- do.call(rbind, lapply(1:2, function(j) {
  data.frame(ses = xseq,
             score = as.numeric(cbind(1, xseq) %*% fit$beta[, j]),
             regime = factor(comp_label[j], levels = comp_label))
}))

ggplot(schools, aes(ses, score)) +
  geom_point(aes(colour = regime_hat), alpha = 0.45, size = 1.8) +
  geom_line(data = lines_df, aes(colour = regime), linewidth = 1.3) +
  scale_colour_manual(values = pal2, name = "Recovered regime") +
  labs(x = "Family SES (standardized)", y = "Math score",
       title = "Two SES–achievement gradients, recovered jointly",
       subtitle = "Component median regressions from mixqr") +
  theme_mixqr()

## ----coef-plot, fig.height = 3.6, fig.alt = "Coefficient estimates with 95% confidence intervals for each regime, by term."----
ci <- confint(fit)                      # Wald intervals from stochastic-EM SEs
keep <- grep("^comp", rownames(ci))
coef_df <- data.frame(
  label  = rownames(ci)[keep],
  est    = c(as.numeric(fit$beta)),
  lo     = ci[keep, 1], hi = ci[keep, 2]
)
comp_idx <- as.integer(sub("comp(\\d+):.*", "\\1", coef_df$label))
coef_df$regime <- factor(comp_label[comp_idx], levels = comp_label)
coef_df$term <- sub("comp\\d+:", "", coef_df$label)
coef_df$term <- factor(coef_df$term, labels = c("Intercept", "SES slope"))

ggplot(coef_df, aes(est, regime, colour = regime)) +
  geom_pointrange(aes(xmin = lo, xmax = hi), linewidth = 0.9, size = 0.7) +
  facet_wrap(~ term, scales = "free_x") +
  scale_colour_manual(values = pal2, guide = "none") +
  labs(x = "Estimate (95% CI)", y = NULL,
       title = "Regime-specific effects with uncertainty",
       subtitle = "The SES gradient is far steeper in the supportive regime") +
  theme_mixqr()

## ----posterior, fig.height = 3.6, fig.alt = "Histogram of the probability of belonging to regime A."----
post <- predict(fit, type = "posterior")
schools$p_supportive <- post[, 2]       # component 2 = supportive (steeper) regime

ggplot(schools, aes(p_supportive)) +
  geom_histogram(bins = 30, fill = "#1b6ca8", colour = "white") +
  labs(x = "Posterior probability of the supportive regime",
       y = "Students",
       title = "Most students are classified with confidence",
       subtitle = "Mass near 0 and 1 means confident assignments") +
  theme_mixqr()

## ----overlap------------------------------------------------------------------
fit$diagnostics$overlap

## ----densities, fig.height = 3.8, fig.alt = "Estimated component error densities with the constraint marker at zero."----
fit_kd <- mixqr(score ~ ses, data = schools, tau = 0.5, m = 2,
                engine = "kdEM", init = "ald", nstart = 15)

rng <- range(vapply(fit_kd$density, function(d) range(d$grid), numeric(2)))
tt <- seq(rng[1], rng[2], length.out = 400)
dens_df <- do.call(rbind, lapply(1:2, function(j) {
  data.frame(resid = tt, density = fit_kd$density[[j]]$eval(tt),
             regime = factor(comp_label[j], levels = comp_label))
}))

ggplot(dens_df, aes(resid, density, colour = regime)) +
  geom_line(linewidth = 1.1) +
  geom_vline(xintercept = 0, linetype = 3, colour = "grey40") +
  scale_colour_manual(values = pal2, name = "Regime") +
  labs(x = "Residual", y = "Density",
       title = "Each regime has its own (skewed) error distribution",
       subtitle = "Estimated by the Wu & Yao kernel-density EM; tau-quantile fixed at 0") +
  theme_mixqr()

## ----engines------------------------------------------------------------------
rbind(ALD = as.numeric(fit$beta), kdEM = as.numeric(fit_kd$beta))

## ----confint------------------------------------------------------------------
confint(fit)

## ----multitau, fig.height = 4.6, fig.alt = "Conditional quantile lines at tau = 0.1, 0.5, 0.9 for each regime."----
taus <- c(0.1, 0.5, 0.9)
multi <- do.call(rbind, lapply(taus, function(tt) {
  f <- mixqr(score ~ ses, data = schools, tau = tt, m = 2, nstart = 15)
  do.call(rbind, lapply(1:2, function(j) {
    data.frame(ses = xseq,
               score = as.numeric(cbind(1, xseq) %*% f$beta[, j]),
               regime = factor(comp_label[j], levels = comp_label), tau = tt)
  }))
}))

ggplot(schools, aes(ses, score)) +
  geom_point(alpha = 0.18, colour = "grey45", size = 1.2) +
  geom_line(data = multi,
            aes(colour = regime, group = interaction(regime, tau),
                linewidth = factor(tau))) +
  scale_colour_manual(values = pal2, name = "Regime") +
  scale_linewidth_manual(values = c("0.1" = 0.5, "0.5" = 1.2, "0.9" = 0.5),
                         name = "Quantile") +
  labs(x = "Family SES (standardized)", y = "Math score",
       title = "Conditional quantile bands by regime",
       subtitle = "Wider bands at high SES reveal growing within-regime inequality") +
  theme_mixqr()

## ----select, fig.height = 3.6, fig.alt = "Cross-validated score against number of components."----
sel <- mixqr_select(score ~ ses, data = schools, tau = 0.5, m = 1:4,
                    criterion = "cv", folds = 5, nstart = 8,
                    control = mixqr_control(seed = 1))
sel$table

ggplot(sel$table, aes(m, cv_negloglik)) +
  geom_line(colour = "#1b6ca8") +
  geom_point(size = 3, colour = "#1b6ca8") +
  geom_point(data = sel$table[sel$table$m == 2, ],
             size = 5, shape = 21, fill = "#e07b39", colour = "#e07b39") +
  labs(x = "Number of components (m)", y = "CV held-out negative log-likelihood",
       title = "Choosing the number of regimes",
       subtitle = "Lower is better; the elbow at two regimes is the parsimonious choice") +
  theme_mixqr()

## ----report, eval = FALSE-----------------------------------------------------
# # everything needed to reproduce the headline fit
# fit <- mixqr(score ~ ses, data = schools, tau = 0.5, m = 2,
#              nstart = 25, variance = "stochEM",
#              vcontrol = mixqr_vcontrol(B = 200, seed = 1))
# summary(fit)

## ----cite, eval = FALSE-------------------------------------------------------
# citation("mixqr")