## ----setup, include=FALSE----------------------------------------------------- knitr::opts_chunk$set( echo = TRUE, collapse = TRUE, comment = "#>", fig.width = 7, fig.height = 5, warning = TRUE, message = FALSE ) library(riskutility) ## ----intro-hook, eval=FALSE--------------------------------------------------- # library(riskutility) # pair <- synth_pair(original, synthetic, # key_vars = c("age", "sex", "region"), # target_var = "income") # report <- disclosure_report(pair) # print(report) ## ----table1-scope, echo=FALSE------------------------------------------------- scope <- data.frame( Category = c("Privacy models", "Attribution (CAP)", "ML-based (RAPID)", "Distance-based", "Record linkage", "Membership inference", "Utility measures", "Frameworks"), Functions = c(8, 4, 4, 6, "1 (8 methods)", 6, 15, 3), Paradigm = c("Equivalence class", "Matching", "Prediction", "Nearest neighbor", "Linkage", "Attack simulation", "Various", "Composite"), `Applicable to` = c(rep("Both", 8)), check.names = FALSE ) knitr::kable(scope, caption = "Package scope at a glance. Both = applicable to traditionally anonymized and synthetic data.") ## ----table2-threats, echo=FALSE----------------------------------------------- threats <- data.frame( Threat = c("Identity", "Attribute", "Membership", "Memorization"), Definition = c( "Attacker links record to individual", "Attacker learns sensitive value via linkage", "Attacker determines if individual is in dataset", "Generator reproduces training records" ), `Key measures` = c( "recordLinkage, kanonymity, individual_risk", "dcap, tcap, weap, disco, rapid", "mia_classifier, domias, nnaa, singling_out", "ims, dcr, nndr" ), check.names = FALSE ) knitr::kable(threats, caption = "Disclosure threat taxonomy.") ## ----table3-comparison, echo=FALSE-------------------------------------------- comp <- data.frame( Measure = c("k-Anonymity", "l-Diversity", "t-Closeness", "DCAP/TCAP", "RAPID", "DCR/NNDR", "Record linkage", "MIA / GDPR criteria", "pMSE / SPECKS", "R-U map"), sdcMicro = c("freq()", "ldiversity()", "--", "--", "--", "dRisk()", "--", "--", "--", "--"), synthpop = c("--", "--", "--", "disclosure()", "--", "--", "--", "--", "utility.gen()", "--"), `SDMetrics (Py)` = c("--", "--", "--", "--", "--", "Yes", "--", "--", "--", "--"), `Anonymeter (Py)` = c("--", "--", "--", "--", "--", "--", "--", "Yes", "--", "--"), riskutility = c("kanonymity()", "ldiversity()", "tcloseness()", "dcap(), tcap()", "rapid()", "dcr(), nndr()", "recordLinkage()", "mia_classifier(), singling_out()", "propscore(), specks()", "rumap()"), check.names = FALSE ) knitr::kable(comp, caption = "Risk/utility measures across R and Python packages.") ## ----synth-pair-demo, eval=FALSE---------------------------------------------- # pair <- synth_pair(original, synthetic, # key_vars = c("age", "gender", "region"), # target_var = "income", # holdout = holdout_data) ## ----synth-pair-dispatch, eval=FALSE------------------------------------------ # # All functions accept synth_pair --- no parameter repetition: # dcap(pair) # Attribution risk # rapid(pair, model_type = "rf") # ML-based risk # propscore(pair) # Propensity score utility # disclosure_report(pair) # Full risk report # rumap(pair) # Risk-Utility map ## ----s3-pattern, eval=FALSE--------------------------------------------------- # # 1. Two equivalent calling conventions: # result <- dcap(pair) # synth_pair method # result <- dcap(X, Y, key_vars = ..., target_var = ...) # default method # # # 2. Inspection: # print(result) # One-screen summary with key statistic # s <- summary(result) # Detailed statistics (returns summary.dcap) # print(s) # Formatted multi-line output # # # 3. Visualization: # plot(result, which = 1) # Plot type 1 # plot(result, which = 1:2) # Multiple plot types ## ----integration, eval=FALSE-------------------------------------------------- # # From synthpop: pass synds object + original data # pair <- from_synthpop(synds_object, original_data, # key_vars = c("age", "sex"), # target_var = "income") # # # From simPop: original data extracted automatically from simPopObj # pair <- from_simPop(simPopObj, # key_vars = c("age", "sex"), # target_var = "income") # # # From sdcMicro: variable roles extracted from sdcMicroObj # pair <- from_sdcMicro(sdcMicroObj) ## ----running-data------------------------------------------------------------- set.seed(42) n <- 500 original <- data.frame( age = sample(18:85, n, replace = TRUE), sex = factor(sample(c("M", "F"), n, replace = TRUE)), education = factor(sample(c("Primary", "Secondary", "Tertiary"), n, replace = TRUE, prob = c(0.3, 0.5, 0.2))), region = factor(sample(paste0("R", 1:5), n, replace = TRUE)), income = round(rlnorm(n, log(40000), 0.5)) ) # Synthetic: independent draws (low risk expected) synthetic <- data.frame( age = sample(18:85, n, replace = TRUE), sex = factor(sample(c("M", "F"), n, replace = TRUE)), education = factor(sample(c("Primary", "Secondary", "Tertiary"), n, replace = TRUE, prob = c(0.3, 0.5, 0.2))), region = factor(sample(paste0("R", 1:5), n, replace = TRUE)), income = round(rlnorm(n, log(40000), 0.5)) ) key_vars <- c("age", "sex", "education", "region") target_var <- "income" pair <- synth_pair(original, synthetic, key_vars = key_vars, target_var = target_var) # Train/holdout split for distance-based metrics set.seed(123) train_idx <- sample(n, size = floor(0.7 * n)) train_data <- original[train_idx, ] holdout_data <- original[-train_idx, ] ## ----privacy-models----------------------------------------------------------- # k-Anonymity: minimum equivalence class size k_res <- riskutility::kanonymity(synthetic, key_vars = key_vars) k_res # l-Diversity: sensitive attribute diversity per EC l_res <- riskutility::ldiversity(synthetic, key_vars = key_vars, sensitive_var = target_var) print(l_res) # t-Closeness: EMD between EC and overall distribution t_res <- riskutility::tcloseness(synthetic, key_vars = key_vars, sensitive_var = target_var) t_res ## ----table5-privacy, echo=FALSE----------------------------------------------- privacy <- data.frame( Function = c("kanonymity()", "ldiversity()", "tcloseness()", "suda()", "individual_risk()", "population_uniqueness()", "attacker_risk()", "epsilon_identifiability()"), Input = rep("Single dataset", 8), `Key output` = c("Min EC size", "Min distinct values per EC", "Max EMD across ECs", "SUDA scores", "Per-record frequency risk", "Estimated pop. uniques", "Scenario-based risk", "Identifiability fraction"), `Threats` = c("Identity", "Attribute", "Attribute", "Identity", "Identity", "Identity", "Identity", "Identity"), check.names = FALSE ) knitr::kable(privacy, caption = "Privacy models overview.") ## ----cap-demo----------------------------------------------------------------- # TCAP: per-record risk (most informative member of CAP family) tcap_res <- tcap(pair) summary(tcap_res) plot(tcap_res) ## ----cap-table, echo=FALSE---------------------------------------------------- cap <- data.frame( Metric = c("DCAP", "TCAP", "WEAP", "DiSCO"), `Requires original?` = c("Yes", "Yes", "No", "Yes"), `Per-record?` = c("No", "Yes", "Yes", "Yes"), `Measures` = c("Mean attribution probability", "Individual attribution risk", "Within-EC homogeneity", "Correct + confident attribution"), `Low risk` = c("ratio < 1.5", "< 0.1 per record", "< 0.1", "< 5%"), check.names = FALSE ) knitr::kable(cap, caption = "CAP family comparison with interpretation thresholds.") ## ----rapid-demo, warning=FALSE------------------------------------------------ rapid_res <- rapid(pair, model_type = "lm") summary(rapid_res) plot(rapid_res, which = c(1, 3)) ## ----rapid-models, echo=FALSE------------------------------------------------- models <- data.frame( Model = c("lm", "rf", "cart", "gbm", "logit"), Package = c("stats", "ranger", "rpart", "xgboost", "stats"), Numeric = c("Yes", "Yes", "Yes", "Yes", "No"), Categorical = c("No", "Yes", "Yes", "Yes", "Yes"), Interactions = c("Manual", "Automatic", "Automatic", "Automatic", "Manual"), check.names = FALSE ) knitr::kable(models, caption = "RAPID model backends.") ## ----distance-demo, warning=FALSE--------------------------------------------- dcr_res <- dcr(pair, holdout_fraction = 0.2) summary(dcr_res) plot(dcr_res, which = 1) ## ----distance-table, echo=FALSE----------------------------------------------- dist <- data.frame( Metric = c("DCR", "NNDR", "IMS", "RF proximity", "dRisk", "Hitting rate", "Epsilon ID", "Delta-presence"), Holdout = c("Yes", "Yes", "No", "Yes", "No", "No", "No", "No"), Detects = c("Memorization", "Memorization", "Exact copies", "Memorization (non-linear)", "Close records", "Close records", "Identifiability", "Membership bounds"), `Low risk` = c("share < 0.55", "share < 0.55", "< 0.01", "ratio near 1", "< 0.05", "< 0.05", "< 0.01", "> 0.5"), check.names = FALSE ) knitr::kable(dist, caption = "Distance-based and proximity risk measures.") ## ----recordlinkage-table, echo=FALSE------------------------------------------ rl <- data.frame( Method = c("Deterministic", "Probabilistic", "PRAM", "Predictive", "RF", "RBRL", "Mahalanobis", "Embedding"), Distance = c("Gower", "Fellegi-Sunter", "Transition prob.", "Propensity", "RF proximity", "Rank-based", "Mahalanobis", "Autoencoder"), `Mixed types` = c("Yes", "Yes", "Categorical", "Yes", "Yes", "Yes", "Numeric", "Yes"), Matching = c("All 3", "All 3", "All 3", "All 3", "All 3", "Independent", "All 3", "All 3"), check.names = FALSE ) knitr::kable(rl, caption = "Record linkage methods. All 3 = independent, bijective, OT.") ## ----nnaa-demo---------------------------------------------------------------- nnaa_res <- nnaa(train_data, synthetic, holdout = holdout_data, method = "gower", seed = 42) print(nnaa_res) ## ----membership-demo---------------------------------------------------------- so_res <- singling_out(original, synthetic, n_attacks = 500, n_cols = 3, mode = "multivariate", seed = 42) print(so_res) link_res <- linkability(original, synthetic, n_attacks = 500, n_neighbors = 1, seed = 42) print(link_res) ## ----membership-table, echo=FALSE--------------------------------------------- mia <- data.frame( Metric = c("MIA classifier", "DOMIAS", "NNAA", "Singling out", "Linkability", "delta-Presence"), `Attack type` = c("Shadow model", "Density overfitting", "Nearest neighbor", "Predicate-based", "Record linkage", "Membership bounds"), Holdout = c("Yes", "Yes", "Yes", "Yes", "Yes", "No"), `GDPR criterion` = c("--", "--", "--", "Art. 29 WP", "Art. 29 WP", "--"), `Low risk` = c("< 0.55", "< 0.6", "< 0.05", "< 0.1", "< 0.1", "> 0.5"), check.names = FALSE ) knitr::kable(mia, caption = "Membership inference and GDPR measures.") ## ----rosetta------------------------------------------------------------------ # Near-copy: original + small noise (high risk expected) set.seed(99) near_copy <- original near_copy$age <- near_copy$age + sample(-1:1, n, replace = TRUE) near_copy$income <- near_copy$income + round(rnorm(n, 0, 500)) pair_risky <- synth_pair(original, near_copy, key_vars = key_vars, target_var = target_var) # Compare key metrics across the two datasets comparison <- data.frame( Metric = c("DCAP", "RAPID (lm)", "IMS"), Safe = c( dcap(pair)$dcap, rapid(pair, model_type = "lm", verbose = FALSE)$rapid, ims(pair)$ims ), Risky = c( dcap(pair_risky)$dcap, rapid(pair_risky, model_type = "lm", verbose = FALSE)$rapid, ims(pair_risky)$ims ) ) comparison$Safe <- round(comparison$Safe, 4) comparison$Risky <- round(comparison$Risky, 4) knitr::kable(comparison, caption = "Cross-family comparison: safe vs. risky synthetic data.") ## ----utility-quick, warning=FALSE--------------------------------------------- prop_res <- propscore(pair) summary(prop_res) ## ----utility-univariate------------------------------------------------------- # Hellinger distance for categorical variables h_res <- hellinger(original, synthetic, vars = c("sex", "education")) print(h_res) # CI proximity: confidence interval overlap for means cip_res <- ci_proximity(original, synthetic, vars = c("age", "income")) print(cip_res) ## ----utility-structural------------------------------------------------------- e_res <- energy_distance(original[, c("age", "income")], synthetic[, c("age", "income")], seed = 42) print(e_res) ## ----mmd-demo----------------------------------------------------------------- mmd_res <- mmd(original[, c("age", "income")], synthetic[, c("age", "income")], kernel = "gaussian", method = "rff", n_features = 500, seed = 42) print(mmd_res) ## ----fidelity-demo------------------------------------------------------------ cop_res <- copula_fidelity(original, synthetic, vars = c("age", "income")) print(cop_res) ctf_res <- contingency_fidelity(original, synthetic, vars = c("sex", "education", "region")) print(ctf_res) ## ----tstr-demo, warning=FALSE, eval=requireNamespace("ranger", quietly=TRUE)---- set.seed(42) tstr_res <- tstr(pair, target_var = "income", model = "rf", test_fraction = 0.3, seed = 42) print(tstr_res) ## ----regression-demo---------------------------------------------------------- reg_res <- regression_fidelity(original, synthetic, formula = income ~ age + sex + education) summary(reg_res) plot(reg_res, which = 1) ## ----tail-demo---------------------------------------------------------------- tail_res <- tail_fidelity(original, synthetic, vars = c("age", "income"), percentile = 95, tails = "both") print(tail_res) ## ----subgroup-demo------------------------------------------------------------ su_res <- subgroup_utility(original, synthetic, group_var = "region", utility_fun = energy_distance, threshold = 0.5, seed = 42) print(su_res) ## ----table7-utility, echo=FALSE----------------------------------------------- util <- data.frame( `Use case` = c(rep("Quick assessment", 2), rep("Univariate", 3), rep("Multivariate", 4), rep("Predictive", 3), "Subgroup"), Function = c("propscore()", "specks()", "compare_wasserstein()", "hellinger()", "ci_proximity()", "energy_distance()", "mmd()", "copula_fidelity()", "contingency_fidelity()", "tstr()", "regression_fidelity()", "compare_feature_importance()", "subgroup_utility()"), `Data type` = c("Mixed", "Mixed", "Numeric", "Categorical", "Numeric", "Numeric", "Numeric", "Numeric", "Categorical", "Mixed", "Mixed", "Mixed", "Mixed"), Interpretation = c("< 0.1: good", "< 0.05: good", "Lower = better", "< 0.1: good", "> 0.8: good", "Lower = better", "Lower = better", "< 0.1: good", "< 0.05: good", "ratio near 1: good", "overlap > 0.8: good", "High corr: good", "min > 0.5: good"), check.names = FALSE ) knitr::kable(util, caption = "Utility measures by use case.") ## ----case-data---------------------------------------------------------------- set.seed(123) N <- 1000 edu_levels <- c("Primary", "Secondary", "Tertiary") age_groups <- c("20-29", "30-39", "40-49", "50-59", "60-69") orig <- data.frame( age_group = factor(sample(age_groups, N, replace = TRUE)), sex = factor(sample(c("M", "F"), N, replace = TRUE)), education = factor(sample(edu_levels, N, replace = TRUE, prob = c(0.25, 0.50, 0.25))), region = factor(sample(paste0("R", 1:4), N, replace = TRUE)) ) edu_effect <- c(Primary = 0, Secondary = 0.3, Tertiary = 0.7) age_effect <- c("20-29" = 0, "30-39" = 0.15, "40-49" = 0.3, "50-59" = 0.4, "60-69" = 0.35) orig$income <- round(exp( 10 + age_effect[as.character(orig$age_group)] + edu_effect[as.character(orig$education)] + rnorm(N, 0, 0.4) )) qi <- c("age_group", "sex", "education", "region") sens <- "income" ## ----case-synthesis----------------------------------------------------------- set.seed(456) # Method A: Independent marginals (safest, but destroys correlations) synA <- data.frame( age_group = factor(sample(age_groups, N, replace = TRUE)), sex = factor(sample(c("M", "F"), N, replace = TRUE)), education = factor(sample(edu_levels, N, replace = TRUE, prob = c(0.25, 0.50, 0.25))), region = factor(sample(paste0("R", 1:4), N, replace = TRUE)), income = sample(orig$income, N, replace = TRUE) ) # Method B: Category-preserving bootstrap with income noise idx_B <- sample(N, N, replace = TRUE) synB <- orig[idx_B, ] rownames(synB) <- NULL synB$income <- round(synB$income * exp(rnorm(N, 0, 0.15))) swap_idx <- sample(N, round(0.2 * N)) synB$age_group[swap_idx] <- factor(sample(age_groups, length(swap_idx), replace = TRUE)) # Method C: Near-copy with minimal perturbation (risky) synC <- orig synC$income <- round(synC$income * exp(rnorm(N, 0, 0.03))) ## ----case-report, warning=FALSE----------------------------------------------- pair_A <- synth_pair(orig, synA, key_vars = qi, target_var = sens) pair_B <- synth_pair(orig, synB, key_vars = qi, target_var = sens) pair_C <- synth_pair(orig, synC, key_vars = qi, target_var = sens) rep_A <- disclosure_report(pair_A, compute = c("attribution", "privacy"), seed = 42, verbose = FALSE) rep_B <- disclosure_report(pair_B, compute = c("attribution", "privacy"), seed = 42, verbose = FALSE) rep_C <- disclosure_report(pair_C, compute = c("attribution", "privacy"), seed = 42, verbose = FALSE) verdicts <- data.frame( Method = c("A: Independent", "B: Bootstrap+noise", "C: Near-copy"), Overall = c(rep_A$overall_risk, rep_B$overall_risk, rep_C$overall_risk), Pass = c(rep_A$n_pass, rep_B$n_pass, rep_C$n_pass), Warn = c(rep_A$n_warn, rep_B$n_warn, rep_C$n_warn) ) knitr::kable(verdicts, caption = "Quick risk screening across three methods.") ## ----case-rumap, warning=FALSE------------------------------------------------ set.seed(42) ru <- rumap(orig, list("A: Independent" = synA, "B: Bootstrap+noise" = synB, "C: Near-copy" = synC), risk_measures = c("dcap", "tcap", "ims"), utility_measures = c("pmse", "wasserstein"), key_vars = qi, target_var = sens, seed = 42) print(ru) ## ----case-rumap-scatter, fig.width=8, fig.height=6---------------------------- plot(ru, which = 1) # R-U scatterplot with Pareto front ## ----case-rumap-heatmap, fig.width=8, fig.height=5---------------------------- plot(ru, which = 2) # Heatmap of individual measures ## ----scalability-table, echo=FALSE-------------------------------------------- scale_df <- data.frame( Metric = c("dcap()", "dcr()", "kanonymity()", "energy_distance()", "mmd(method='rff')", "propscore()", "rumap()"), `n=1000` = c("< 1 s", "< 1 s", "< 1 s", "< 1 s", "< 1 s", "~1 s", "~10 s"), `n=10000` = c("~5 s", "~10 s", "~1 s", "~2 s", "~1 s", "~5 s", "~60 s"), `n=100000` = c("~60 s", "~5 min", "~5 s", "~30 s", "~5 s", "~30 s", "depends"), Scaling = c("O(n*k)", "O(n^2)", "O(n log n)", "O(n^2)", "O(n*D)", "O(n*p)", "Sum of components"), check.names = FALSE ) knitr::kable(scale_df, caption = "Approximate runtimes for key metrics.") ## ----session-info------------------------------------------------------------- sessionInfo()