# Example code for performance tuning with taxify, one section per `## ----`
# chunk header. The layout matches knitr::purl() output (presumably generated
# from a vignette -- if so, edit the source document, not this file).
# The setup chunk sets `eval = FALSE`, so every later chunk is emitted
# commented out: lines starting with `# ` are example code, and lines starting
# with `# #>` show the output that code is expected to print.

## ----setup, include = FALSE---------------------------------------------------
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  eval = FALSE
)

## -----------------------------------------------------------------------------
# library(taxify)

## -----------------------------------------------------------------------------
# taxify_data_dir()
# #> [1] "C:/Users/jane/AppData/Local/R/taxify"

## -----------------------------------------------------------------------------
# # Assume `species_list` is a character vector of 10,000 plant names
#
# # Exact + fuzzy (default)
# t_fuzzy <- system.time(
#   result_fuzzy <- taxify(species_list, backend = "wfo", fuzzy = TRUE)
# )
#
# # Exact only
# t_exact <- system.time(
#   result_exact <- taxify(species_list, backend = "wfo", fuzzy = FALSE)
# )
#
# t_fuzzy["elapsed"]
# #> elapsed
# #>    18.4
#
# t_exact["elapsed"]
# #> elapsed
# #>     2.1

## -----------------------------------------------------------------------------
# # Pass 1: exact only
# result <- taxify(species_list, backend = "wfo", fuzzy = FALSE)
#
# # How many names remain unmatched?
# n_unmatched <- sum(result$match_type == "none")
# message(n_unmatched, " names unmatched after exact pass")
#
# # Pass 2: fuzzy only on the unmatched subset
# if (n_unmatched > 0) {
#   unmatched_names <- result$input_name[result$match_type == "none"]
#   fuzzy_result <- taxify(unmatched_names, backend = "wfo", fuzzy = TRUE)
#
#   # Merge back. Look each still-unmatched row of `result` up in the fuzzy
#   # hits, so an input name that occurs more than once is updated on every
#   # row (match() from the hits into `result` only reaches the first one).
#   hits <- fuzzy_result[fuzzy_result$match_type != "none", ]
#   open_rows <- which(result$match_type == "none")
#   src <- match(result$input_name[open_rows], hits$input_name)
#   found <- !is.na(src)
#   if (any(found)) {
#     result[open_rows[found], ] <- hits[src[found], ]
#   }
# }

## -----------------------------------------------------------------------------
# # 8,000 names: ~6,000 plants, ~1,500 fish, ~500 invertebrates
# t_wfo_first <- system.time(
#   result_a <- taxify(survey_names,
#                      backend = c("wfo", "col"),
#                      fuzzy = TRUE)
# )
# t_wfo_first["elapsed"]
# #> elapsed
# #>    25.3
#
# # Reversed order: COL first, WFO second
# t_col_first <- system.time(
#   result_b <- taxify(survey_names,
#                      backend = c("col", "wfo"),
#                      fuzzy = TRUE)
# )
# t_col_first["elapsed"]
# #> elapsed
# #>    41.7

## -----------------------------------------------------------------------------
# # Match names
# result <- taxify(species_list, backend = "gbif")
#
# # Save result
# saveRDS(result, "matched_names.rds")
#
# # Free the backbone from memory
# taxify_clear_cache()
#
# # Now ~800 MB of RAM is available for downstream work
# gc()

## -----------------------------------------------------------------------------
# # 300,000 names from a herbarium digitization project
# all_names <- readLines("herbarium_names.txt")
# chunk_size <- 50000
#
# # Split into chunks
# chunks <- split(all_names,
#                 ceiling(seq_along(all_names) / chunk_size))
#
# # Process each chunk
# results <- lapply(seq_along(chunks), function(i) {
#   message(sprintf("Chunk %d/%d (%d names)...",
#                   i, length(chunks), length(chunks[[i]])))
#   taxify(chunks[[i]], backend = "wfo", fuzzy = TRUE, verbose = FALSE)
# })
#
# # Combine
# result <- do.call(rbind, results)
# nrow(result)
# #> [1] 300000

## -----------------------------------------------------------------------------
# output_dir <- "results"
# dir.create(output_dir, showWarnings = FALSE)
#
# for (i in seq_along(chunks)) {
#   message(sprintf("Chunk %d/%d", i, length(chunks)))
#   res <- taxify(chunks[[i]], backend = "wfo",
#                 fuzzy = TRUE, verbose = FALSE)
#   saveRDS(res, file.path(output_dir,
#                          sprintf("chunk_%04d.rds", i)))
# }
#
# # Combine when needed
# all_files <- list.files(output_dir, pattern = "\\.rds$",
#                         full.names = TRUE)
# result <- do.call(rbind, lapply(all_files, readRDS))

## -----------------------------------------------------------------------------
# for (i in seq_along(chunks)) {
#   out_file <- file.path(output_dir, sprintf("chunk_%04d.rds", i))
#   if (file.exists(out_file)) next
#   message(sprintf("Chunk %d/%d", i, length(chunks)))
#   res <- taxify(chunks[[i]], backend = "wfo",
#                 fuzzy = TRUE, verbose = FALSE)
#   saveRDS(res, out_file)
# }

## -----------------------------------------------------------------------------
# # After finishing all matching work
# taxify_clear_cache()
# gc()

## -----------------------------------------------------------------------------
# taxify_refresh_manifest()

## -----------------------------------------------------------------------------
# # Total size of all backbones and enrichments
# data_dir <- taxify_data_dir()
# files <- list.files(data_dir, recursive = TRUE, full.names = TRUE)
# total_mb <- sum(file.size(files), na.rm = TRUE) / 1048576
# message(sprintf("taxify data: %.0f MB across %d files",
#                 total_mb, length(files)))

## -----------------------------------------------------------------------------
# # Remove GBIF backbone (frees ~500-700 MB)
# unlink(file.path(taxify_data_dir(), "gbif"), recursive = TRUE)
#
# # Clear the session cache so taxify() doesn't try to use the old path
# taxify_clear_cache()

## -----------------------------------------------------------------------------
# # Pre-download everything needed for a multi-kingdom analysis
# # with conservation status and trait enrichments
#
# # Backbones
# taxify_download_vtr(c("wfo", "col"))
#
# # Enrichments
# taxify_download_enrichment(c(
#   "conservation_status",
#   "woodiness",
#   "eive",
#   "elton_traits"
# ))
#
# # Now the analysis can run fully offline
# result <- taxify(species_list, backend = c("wfo", "col"))
# result <- add_conservation_status(result)
# result <- add_woodiness(result)

## -----------------------------------------------------------------------------
# list_enrichments()
# #>                  name version    nrow static
# #> 1 conservation_status 2026.04   59583  FALSE
# #> 2               griis 2026.04   98131  FALSE
# #> 3                wcvp 2026.04 1973234  FALSE
# #> 4                eive     1.0   14835   TRUE
# #> 5        elton_traits     1.0   15394   TRUE
# #> 6              avonet     1.0   11009   TRUE
# #> ...

## -----------------------------------------------------------------------------
# # Pattern for 500,000+ names with enrichments
# for (i in seq_along(chunks)) {
#   res <- taxify(chunks[[i]], backend = "wfo",
#                 fuzzy = TRUE, verbose = FALSE)
#   res <- add_conservation_status(res, verbose = FALSE)
#   res <- add_woodiness(res, verbose = FALSE)
#   saveRDS(res, sprintf("results/chunk_%04d.rds", i))
# }

## -----------------------------------------------------------------------------
# # Tight threshold for clean input
# result <- taxify(clean_names, backend = "wfo",
#                  fuzzy = TRUE, fuzzy_threshold = 0.1)
#
# # Integer threshold: at most 2 edits, period
# result <- taxify(noisy_names, backend = "wfo",
#                  fuzzy = TRUE, fuzzy_threshold = 2L)