From 20d2764dae5d3f33861cb1f62b44ff96c2da56f9 Mon Sep 17 00:00:00 2001 From: "josue.chinchilla" Date: Wed, 22 Apr 2026 13:39:20 -0400 Subject: [PATCH 01/12] corrected importFrom statement in parentage functions and ped_check --- NAMESPACE | 1 + R/check_ped.R | 5 +++-- R/validate_pedigree.R | 1 + 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index c919736..4b5cb05 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -23,6 +23,7 @@ export(merge_MADCs) export(solve_composition_poly) export(thinSNP) export(updog2vcf) +export(validate_pedigree) import(dplyr) import(janitor) import(parallel) diff --git a/R/check_ped.R b/R/check_ped.R index 0ebbd5a..e71e79b 100644 --- a/R/check_ped.R +++ b/R/check_ped.R @@ -38,10 +38,11 @@ #' messy_ids <- ped_errors$messy_parents$id #' print(messy_ids) #' -#' @import dplyr -#' @import janitor +#' @importFrom dplyr %>% group_by filter ungroup distinct mutate summarize first bind_rows +#' @importFrom janitor clean_names #' @importFrom stats setNames #' @importFrom utils read.table +#' @importFrom tools file_path_sans_ext #' @export check_ped <- function(ped.file, seed = NULL, verbose = TRUE) { diff --git a/R/validate_pedigree.R b/R/validate_pedigree.R index 8b2ecbc..5b366c9 100644 --- a/R/validate_pedigree.R +++ b/R/validate_pedigree.R @@ -51,6 +51,7 @@ #' \item{Best_Female_Parent_Error_Pct}{Homozygous mismatch percentage for the best female parent candidate.} #' } #' @export +#' @importFrom data.table fread fwrite copy data.table set rbindlist validate_pedigree <- function(pedigree_file, genotypes_file, founders_file = NULL, trio_error_threshold = 5.0, From f2d3f408c2793bffe3cd25f8b726327343be2e73 Mon Sep 17 00:00:00 2001 From: "josue.chinchilla" Date: Wed, 22 Apr 2026 13:43:10 -0400 Subject: [PATCH 02/12] updated NAMESPACE to reflect updated importFrom statements --- NAMESPACE | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/NAMESPACE b/NAMESPACE index 4b5cb05..1fde3fd 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -25,7 +25,6 @@ export(thinSNP) export(updog2vcf) export(validate_pedigree) import(dplyr) -import(janitor) import(parallel) import(quadprog) import(stringr) @@ -47,15 +46,20 @@ importFrom(data.table,set) importFrom(dplyr,"%>%") importFrom(dplyr,across) importFrom(dplyr,arrange) +importFrom(dplyr,bind_rows) importFrom(dplyr,case_when) +importFrom(dplyr,distinct) importFrom(dplyr,filter) +importFrom(dplyr,first) importFrom(dplyr,group_by) importFrom(dplyr,group_modify) importFrom(dplyr,mutate) importFrom(dplyr,select) importFrom(dplyr,summarise) +importFrom(dplyr,summarize) importFrom(dplyr,ungroup) importFrom(dplyr,where) +importFrom(janitor,clean_names) importFrom(pwalign,nucleotideSubstitutionMatrix) importFrom(pwalign,pairwiseAlignment) importFrom(readr,read_csv) @@ -66,6 +70,7 @@ importFrom(stats,cor) importFrom(stats,reorder) importFrom(stats,setNames) importFrom(tibble,as_tibble) +importFrom(tools,file_path_sans_ext) importFrom(utils,packageVersion) importFrom(utils,read.csv) importFrom(utils,read.table) From c1fe54372792b4eb3519fd03475c444304f82d6b Mon Sep 17 00:00:00 2001 From: Josue Chinchilla-Vargas Date: Wed, 22 Apr 2026 13:58:21 -0400 Subject: [PATCH 03/12] Update R/check_ped.R Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- R/check_ped.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/check_ped.R b/R/check_ped.R index e71e79b..0065f78 100644 --- a/R/check_ped.R +++ b/R/check_ped.R @@ -38,7 +38,7 @@ #' messy_ids <- ped_errors$messy_parents$id #' print(messy_ids) #' -#' @importFrom dplyr %>% group_by filter ungroup distinct mutate summarize first bind_rows +#' @importFrom dplyr %>% group_by filter ungroup distinct mutate summarize first bind_rows n_distinct n #' @importFrom janitor clean_names #' @importFrom stats setNames #' @importFrom utils read.table From 0a13483db94177fc605d9e8130058301e7ea38af Mon Sep 17 00:00:00 2001 From: "josue.chinchilla" Date: Fri, 24 Apr 2026 13:52:51 -0400 Subject: [PATCH 04/12] updated error handling and reporting for check_ped, Also major improvement for documentation on check_ped --- .Rbuildignore | 2 + .gitignore | 1 + NAMESPACE | 7 - R/check_ped.R | 331 ++++++++++-------- man/check_ped.Rd | 30 +- tests/testthat/corrected_pedigree.txt | 4 - tests/testthat/test-check_ped.R | 463 +++++++++++++++++++++++++- 7 files changed, 645 insertions(+), 193 deletions(-) delete mode 100644 tests/testthat/corrected_pedigree.txt diff --git a/.Rbuildignore b/.Rbuildignore index 307d5d0..50c7896 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -10,3 +10,5 @@ ^package-code\.Rmd$ ^docs$ ^CRAN-SUBMISSION$ +^\.positai$ +^\.claude$ diff --git a/.gitignore b/.gitignore index d3ffaad..e056686 100644 --- a/.gitignore +++ b/.gitignore @@ -4,3 +4,4 @@ .Ruserdata revdep/ .DS_Store +.positai diff --git a/NAMESPACE b/NAMESPACE index 1fde3fd..ba1c1d1 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -46,20 +46,15 @@ importFrom(data.table,set) importFrom(dplyr,"%>%") importFrom(dplyr,across) importFrom(dplyr,arrange) -importFrom(dplyr,bind_rows) importFrom(dplyr,case_when) -importFrom(dplyr,distinct) importFrom(dplyr,filter) -importFrom(dplyr,first) importFrom(dplyr,group_by) importFrom(dplyr,group_modify) importFrom(dplyr,mutate) importFrom(dplyr,select) importFrom(dplyr,summarise) -importFrom(dplyr,summarize) importFrom(dplyr,ungroup) importFrom(dplyr,where) -importFrom(janitor,clean_names) importFrom(pwalign,nucleotideSubstitutionMatrix) importFrom(pwalign,pairwiseAlignment) importFrom(readr,read_csv) @@ -68,9 +63,7 @@ importFrom(reshape2,melt) importFrom(rlang,sym) importFrom(stats,cor) importFrom(stats,reorder) -importFrom(stats,setNames) importFrom(tibble,as_tibble) -importFrom(tools,file_path_sans_ext) importFrom(utils,packageVersion) importFrom(utils,read.csv) importFrom(utils,read.table) diff --git a/R/check_ped.R b/R/check_ped.R index 0065f78..71a889e 100644 --- a/R/check_ped.R +++ b/R/check_ped.R @@ -1,13 +1,13 @@ #' Check a pedigree file for accuracy and report/correct common errors #' -#' `check_ped` reads a 3-column pedigree file (tab-separated, columns labeled `id`, `sire`, `dam` in any order) +#' `check_ped` reads a 3-column pedigree file (tab-separated, columns labeled `id`, `male_parent`, `female_parent` in any order) #' and performs quality checks, optionally correcting or flagging errors. #' #' The function checks for: #' * Exact duplicate rows and removes them (keeping one copy) -#' * IDs that appear more than once with conflicting sire/dam assignments (sets sire/dam to "0") -#' * IDs that appear in both sire and dam columns -#' * Missing parents (IDs referenced as sire/dam but not in `id` column), adds them with sire/dam = "0" +#' * IDs that appear more than once with conflicting male_parent/female_parent assignments (sets both to "0") +#' * IDs that appear in both male_parent and female_parent columns +#' * Missing parents (IDs referenced as male_parent/female_parent but not in `id` column), adds them with both = "0" #' * Direct and indirect pedigree dependencies (cycles), such as a parent being its own descendant #' #' After an initial run to clean exact duplicates and repeated IDs, you can run the function again to detect cycles more accurately. @@ -18,124 +18,92 @@ #' #' @param ped.file Path to the pedigree text file. #' @param seed Optional seed for reproducibility. -#' @param verbose Logical. If TRUE (default), prints errors and prompts for interactive saving. +#' @param verbose Logical. If TRUE (default), prints findings and prompts for saving. +#' @param correct Logical. If TRUE (default), saves a corrected pedigree. If FALSE, only reports issues. +#' @param save_zip Logical. If TRUE, writes each report component as a .txt file and bundles them into a .zip archive in the working directory. +#' @param save_corrected_zip Logical. If TRUE (and save_zip = TRUE and correct = TRUE), also writes the corrected pedigree into the .zip archive. #' -#' @return A list of data.frames containing detected issues: -#' * `exact_duplicates`: rows that were exact duplicates -#' * `repeated_ids_diff`: IDs appearing more than once with conflicting sire/dam -#' * `messy_parents`: IDs appearing as both sire and dam -#' * `missing_parents`: parents added to the pedigree with 0 as sire/dam -#' * `dependencies`: detected cycles in the pedigree +#' @return A named list of data.frames with offending rows (including row numbers): +#' * `exact_duplicates` +#' * `repeated_ids_diff` +#' * `inconsistent_sex_roles` +#' * `missing_parents` +#' * `dependencies` #' -#' @examples -#' ped_file <- system.file("check_ped_test.txt", package = "BIGr") -#' ped_errors <- check_ped(ped.file = ped_file, seed = 101919) -#' -#' # Access messy parents -#' ped_errors$messy_parents -#' -#' # IDs with messy parents -#' messy_ids <- ped_errors$messy_parents$id -#' print(messy_ids) -#' -#' @importFrom dplyr %>% group_by filter ungroup distinct mutate summarize first bind_rows n_distinct n +#' @importFrom dplyr %>% group_by filter ungroup distinct mutate summarize first bind_rows n_distinct n select #' @importFrom janitor clean_names #' @importFrom stats setNames -#' @importFrom utils read.table +#' @importFrom utils read.table write.table zip #' @importFrom tools file_path_sans_ext #' @export -check_ped <- function(ped.file, seed = NULL, verbose = TRUE) { +check_ped <- function(ped.file, + seed = NULL, + verbose = TRUE, + correct = TRUE, + save_zip = FALSE, + save_corrected_zip = FALSE) { #### setup #### if (!is.null(seed)) set.seed(seed) - - # Read and clean data data <- utils::read.table(ped.file, header = TRUE) %>% janitor::clean_names() %>% mutate( - id = as.character(id), - sire = as.character(sire), - dam = as.character(dam) + id = as.character(id), + male_parent = as.character(male_parent), + female_parent = as.character(female_parent) ) - - original_data <- data + # Add row numbers before any processing + data <- data %>% mutate(row_number = row_number(), .before = id) errors <- list() - missing_parents <- data.frame(id = character(), sire = character(), dam = character(), stringsAsFactors = FALSE) #### check 1: exact duplicates #### - exact_duplicates <- data[duplicated(data), ] - if (nrow(exact_duplicates) > 0) { - data <- distinct(data) # remove exact duplicates - } + exact_duplicates <- data[ + duplicated(data %>% select(-row_number)) | + duplicated(data %>% select(-row_number), fromLast = TRUE), + ] - #### check 2: repeated IDs with conflicting sire/dam #### - repeated_ids <- data %>% + #### check 2: repeated IDs with conflicting male_parent/female_parent #### + repeated_ids_diff <- data %>% group_by(id) %>% filter(n() > 1) %>% + filter(n_distinct(male_parent) > 1 | n_distinct(female_parent) > 1) %>% ungroup() - # Only IDs with actual conflicting sire/dam - conflicting_ids <- repeated_ids %>% - group_by(id) %>% - filter(n_distinct(sire) > 1 | n_distinct(dam) > 1) %>% - ungroup() - - if (nrow(conflicting_ids) > 0) { - # Keep one row per ID, set sire/dam to "0" - data <- data %>% - group_by(id) %>% - summarize( - sire = if(n_distinct(sire) > 1) "0" else first(sire), - dam = if(n_distinct(dam) > 1) "0" else first(dam), - .groups = "drop" - ) - } - - repeated_ids_report <- conflicting_ids - - #### check 3: missing parents #### - for (i in seq_len(nrow(data))) { - id <- data$id[i] - sire <- data$sire[i] - dam <- data$dam[i] - - if (sire != "0" && sire != id && !sire %in% data$id) { - missing_parents <- rbind(missing_parents, data.frame(id = sire, sire = "0", dam = "0", stringsAsFactors = FALSE)) - } - if (dam != "0" && dam != id && !dam %in% data$id) { - missing_parents <- rbind(missing_parents, data.frame(id = dam, sire = "0", dam = "0", stringsAsFactors = FALSE)) - } - - if (sire == id || dam == id) { - errors <- append(errors, paste("Dependency: Individual", id, "cannot be its own parent")) - } - } - - missing_parents <- distinct(missing_parents) - if (nrow(missing_parents) > 0) { - data <- bind_rows(data, missing_parents) - } - - #### check 4: messy parents #### - sire_ids <- unique(data$sire[data$sire != "0"]) - dam_ids <- unique(data$dam[data$dam != "0"]) - messy_ids <- intersect(sire_ids, dam_ids) - messy_parents <- data %>% filter(id %in% messy_ids) + #### check 3: inconsistent parent sex roles #### + male_ids <- unique(data$male_parent[data$male_parent != "0"]) + female_ids <- unique(data$female_parent[data$female_parent != "0"]) + messy_ids <- intersect(male_ids, female_ids) + inconsistent_sex_roles <- data %>% + filter(male_parent %in% messy_ids | female_parent %in% messy_ids) + + #### check 4: missing parents #### + all_ids <- unique(data$id) + ref_ids <- unique(c(data$male_parent, data$female_parent)) + ref_ids <- ref_ids[ref_ids != "0"] + missing_ids <- setdiff(ref_ids, all_ids) + missing_parents <- data %>% + filter(male_parent %in% missing_ids | female_parent %in% missing_ids) + # For the corrected pedigree, build founder rows for truly absent parents + missing_founders <- data.frame( + row_number = NA_integer_, + id = missing_ids, + male_parent = "0", + female_parent = "0", + stringsAsFactors = FALSE + ) #### check 5: dependencies (cycles) #### detect_all_cycles <- function(data) { adj_list <- lapply(data$id, function(x) { row <- data[data$id == x, ] - c(row$sire, row$dam) + c(row$male_parent, row$female_parent) }) names(adj_list) <- data$id - dfs <- function(node, visited, rec_stack, path) { - visited[node] <- TRUE + visited[node] <- TRUE rec_stack[node] <- TRUE - path <- append(path, node) + path <- append(path, node) cycles <- list() - for (neighbor in adj_list[[node]]) { if (neighbor %in% names(adj_list)) { if (!visited[neighbor]) { @@ -146,94 +114,163 @@ check_ped <- function(ped.file, seed = NULL, verbose = TRUE) { } } } - rec_stack[node] <- FALSE return(cycles) } - - visited <- stats::setNames(rep(FALSE, length(adj_list)), names(adj_list)) + visited <- stats::setNames(rep(FALSE, length(adj_list)), names(adj_list)) rec_stack <- stats::setNames(rep(FALSE, length(adj_list)), names(adj_list)) all_cycles <- list() - for (node in names(adj_list)) { if (!visited[node]) { node_cycles <- dfs(node, visited, rec_stack, character()) - if (length(node_cycles) > 0) { - all_cycles <- append(all_cycles, node_cycles) - } + if (length(node_cycles) > 0) all_cycles <- append(all_cycles, node_cycles) } } return(all_cycles) } - cycles <- detect_all_cycles(data) + cycles <- detect_all_cycles(data) + cycle_ids <- character(0) if (length(cycles) > 0) { for (cycle_group in cycles) { - cycle_ids <- unique(unlist(cycle_group)) - errors <- append(errors, paste("Cycle detected involving IDs:", paste(cycle_ids, collapse = " -> "))) + ids <- unique(unlist(cycle_group)) + cycle_ids <- unique(c(cycle_ids, ids)) + errors <- append(errors, + paste("Cycle detected involving IDs:", + paste(ids, collapse = " -> "))) } } + dependencies <- data %>% filter(id %in% cycle_ids) + if (nrow(dependencies) == 0 && length(errors) > 0) { + dependencies <- data.frame(Dependency = unique(unlist(errors)), + stringsAsFactors = FALSE) + } - #### compile findings #### + #### compile report #### input_ped_report <- list( - exact_duplicates = exact_duplicates, - repeated_ids_diff = repeated_ids_report, - messy_parents = messy_parents, - missing_parents = missing_parents, - dependencies = data.frame(Dependency = unique(unlist(errors))) + exact_duplicates = exact_duplicates, + repeated_ids_diff = repeated_ids_diff, + inconsistent_sex_roles = inconsistent_sex_roles, + missing_parents = missing_parents, + dependencies = dependencies ) + #### build corrected pedigree (no row_number column) #### + if (correct) { + corrected <- data %>% select(-row_number) + # Remove exact duplicates + corrected <- distinct(corrected) + # Resolve conflicting IDs + if (nrow(repeated_ids_diff) > 0) { + corrected <- corrected %>% + group_by(id) %>% + summarize( + male_parent = if (n_distinct(male_parent) > 1) "0" else first(male_parent), + female_parent = if (n_distinct(female_parent) > 1) "0" else first(female_parent), + .groups = "drop" + ) + } + # Add missing founders + if (length(missing_ids) > 0) { + corrected <- bind_rows( + corrected, + missing_founders %>% select(-row_number) + ) + } + } + #### file names #### - file_base <- tools::file_path_sans_ext(basename(ped.file)) + file_base <- tools::file_path_sans_ext(basename(ped.file)) corrected_name <- paste0(file_base, "_corrected") - report_name <- paste0(file_base, "_report") + report_name <- paste0(file_base, "_report") + zip_name <- paste0(file_base, "_report.zip") #### output #### if (verbose) { cat("\n=== Pedigree Quality Check Report ===\n") - - if (nrow(exact_duplicates) > 0) { - cat("\n Exact duplicate trios detected (only one copy will be kept in corrected pedigree):\n") - print(exact_duplicates) - } else cat("\nNo exact duplicate trios found.\n") - - if (nrow(repeated_ids_report) > 0) { - cat("\nConflicting trios detected (sire/dam set to 0 in corrected pedigree):\n") - print(repeated_ids_report) - } else cat("\nNo conflicting repeated trios found.\n") - - if (nrow(missing_parents) > 0) { - cat("\n Parents missing as IDs found in the pedigree (will be added as founders in corrected pedigree):\n") - print(missing_parents) - } else cat("\nNo missing parents found.\n") - - if (nrow(messy_parents) > 0) { - cat("\n IDs found as both sire and dam (is selfing or hermaphrodytism possible?):\n") - print(messy_parents) - } else cat("\nNo IDs found as both sire and dam.\n") - - - if (nrow(input_ped_report$dependencies) > 0) { - cat("\nDependencies detected:\n") - print(input_ped_report$dependencies) - } else cat("\nNo dependencies detected.\n") - - #### interactive save #### - cat(paste0("\nDo you want to save the corrected pedigree as dataframe `", corrected_name, "`? (y/n): ")) - ans <- tolower(trimws(readline())) - if (ans == "y") { - assign(corrected_name, data, envir = .GlobalEnv) - assign("input_ped_report", input_ped_report, envir = .GlobalEnv) - cat(paste0("Saved corrected pedigree as `", corrected_name, "` and report as `input_ped_report`.\n")) + cat("\n--- Exact Duplicates ---\n") + if (nrow(exact_duplicates) > 0) print(exact_duplicates) else cat("None found.\n") + cat("\n--- Conflicting IDs ---\n") + if (nrow(repeated_ids_diff) > 0) print(repeated_ids_diff) else cat("None found.\n") + cat("\n--- Inconsistent Parent Sex Roles ---\n") + if (nrow(inconsistent_sex_roles) > 0) print(inconsistent_sex_roles) else cat("None found.\n") + cat("\n--- Missing Parents (rows that reference them) ---\n") + if (nrow(missing_parents) > 0) print(missing_parents) else cat("None found.\n") + cat("\n--- Cycles / Dependencies ---\n") + if (nrow(dependencies) > 0) print(dependencies) else cat("None found.\n") + if (correct) { + cat(paste0("\nSave corrected pedigree as `", corrected_name, "`? (y/n): ")) + ans <- tolower(trimws(readline())) + if (ans == "y") { + assign(corrected_name, corrected, envir = .GlobalEnv) + assign(report_name, input_ped_report, envir = .GlobalEnv) + cat(paste0("Saved `", corrected_name, "` and `", report_name, "`.\n")) + } else { + cat("No corrected pedigree saved.\n") + assign(report_name, input_ped_report, envir = .GlobalEnv) + } } else { - cat("No corrected pedigree was saved.\n") + cat("\n`correct = FALSE`: no changes made to the pedigree.\n") + assign(report_name, input_ped_report, envir = .GlobalEnv) } - } else { - # Silent automatic mode - assign(corrected_name, data, envir = .GlobalEnv) + # Silent mode: always save report; save corrected only if correct = TRUE + if (correct) assign(corrected_name, corrected, envir = .GlobalEnv) assign(report_name, input_ped_report, envir = .GlobalEnv) } + #### zip export #### + if (save_zip) { + tmp_dir <- tempfile() + dir.create(tmp_dir) + zip_files <- character(0) + + # Section headers matching the console output labels + section_labels <- c( + exact_duplicates = "Exact Duplicates", + repeated_ids_diff = "Conflicting IDs", + inconsistent_sex_roles = "Inconsistent Parent Sex Roles", + missing_parents = "Missing Parents (rows that reference them)", + dependencies = "Cycles / Dependencies" + ) + + for (component in names(section_labels)) { + df <- input_ped_report[[component]] + file_path <- file.path(tmp_dir, paste0(file_base, "_", component, ".txt")) + + if (nrow(df) == 0) { + # Write header + "None found." for clean empty reports + writeLines( + c(paste0("--- ", section_labels[[component]], " ---"), "None found."), + con = file_path + ) + } else { + # Write header, then column names, then data — avoids col.names warning + file_con <- file(file_path, open = "wt") + writeLines(paste0("--- ", section_labels[[component]], " ---"), con = file_con) + writeLines(paste(colnames(df), collapse = "\t"), con = file_con) + close(file_con) + utils::write.table(df, file = file_path, sep = "\t", row.names = FALSE, + quote = FALSE, append = TRUE, col.names = FALSE) + } + + zip_files <- c(zip_files, file_path) + } + + # Optionally include corrected pedigree in zip + if (save_corrected_zip && correct) { + corrected_path <- file.path(tmp_dir, paste0(corrected_name, ".txt")) + utils::write.table(corrected, file = corrected_path, sep = "\t", + row.names = FALSE, quote = FALSE) + zip_files <- c(zip_files, corrected_path) + } + + # Bundle all files into zip in the working directory + zip_path <- file.path(getwd(), zip_name) + utils::zip(zipfile = zip_path, files = zip_files, flags = "-j") + if (verbose) cat(paste0("\nZip archive saved to: ", zip_path, "\n")) + } + invisible(input_ped_report) } + diff --git a/man/check_ped.Rd b/man/check_ped.Rd index ea63de7..0d1df79 100644 --- a/man/check_ped.Rd +++ b/man/check_ped.Rd @@ -4,23 +4,25 @@ \alias{check_ped} \title{Check a pedigree file for accuracy and report/correct common errors} \usage{ -check_ped(ped.file, seed = NULL, verbose = TRUE) +check_ped(ped.file, seed = NULL, verbose = TRUE, correct = TRUE) } \arguments{ \item{ped.file}{Path to the pedigree text file.} \item{seed}{Optional seed for reproducibility.} -\item{verbose}{Logical. If TRUE (default), prints errors and prompts for interactive saving.} +\item{verbose}{Logical. If TRUE (default), prints findings and prompts for saving.} + +\item{correct}{Logical. If TRUE (default), saves a corrected pedigree. If FALSE, only reports issues.} } \value{ -A list of data.frames containing detected issues: +A named list of data.frames with offending rows (including row numbers): \itemize{ -\item \code{exact_duplicates}: rows that were exact duplicates -\item \code{repeated_ids_diff}: IDs appearing more than once with conflicting sire/dam -\item \code{messy_parents}: IDs appearing as both sire and dam -\item \code{missing_parents}: parents added to the pedigree with 0 as sire/dam -\item \code{dependencies}: detected cycles in the pedigree +\item \code{exact_duplicates} +\item \code{repeated_ids_diff} +\item \code{inconsistent_sex_roles} +\item \code{missing_parents} +\item \code{dependencies} } } \description{ @@ -45,15 +47,3 @@ The function does \strong{not} overwrite the input file. Instead, it prints find \item A report listing all detected issues } } -\examples{ -ped_file <- system.file("check_ped_test.txt", package = "BIGr") -ped_errors <- check_ped(ped.file = ped_file, seed = 101919) - -# Access messy parents -ped_errors$messy_parents - -# IDs with messy parents -messy_ids <- ped_errors$messy_parents$id -print(messy_ids) - -} diff --git a/tests/testthat/corrected_pedigree.txt b/tests/testthat/corrected_pedigree.txt deleted file mode 100644 index 0434b0c..0000000 --- a/tests/testthat/corrected_pedigree.txt +++ /dev/null @@ -1,4 +0,0 @@ -ID Male_Parent Female_Parent -IND_C IND_A IND_B -IND_D 0 IND_A -GHOST IND_A IND_B diff --git a/tests/testthat/test-check_ped.R b/tests/testthat/test-check_ped.R index 706143f..64d13ca 100644 --- a/tests/testthat/test-check_ped.R +++ b/tests/testthat/test-check_ped.R @@ -1,22 +1,455 @@ -context("Imputation Concordance") +context("check_ped – Pedigree Quality Checks") +# --------------------------------------------------------------------------- +# Helper: write a data.frame to a temp tab-separated file and return the path +# --------------------------------------------------------------------------- +write_ped <- function(df) { + f <- tempfile(fileext = ".txt") + utils::write.table(df, f, sep = "\t", row.names = FALSE, quote = FALSE) + f +} -test_that("test imputation",{ - #Input variables - ped_file <- system.file("check_ped_test.txt", package="BIGr") +# =========================================================================== +# 1. Return structure +# Tests that check_ped returns a named list of exactly 5 data frame +# components covering all five issue categories. +# =========================================================================== +test_that("check_ped returns a named list of length 5", { + ped <- data.frame( + id = c("A", "B", "C"), + male_parent = c("0", "A", "A"), + female_parent = c("0", "0", "0") + ) + out <- check_ped(write_ped(ped), seed = 1, verbose = FALSE, correct = FALSE) - #Calculations - output.list <- check_ped(ped_file, - seed = 101919, - verbose = FALSE) + expect_type(out, "list") + expect_length(out, 5) + expect_named(out, c( + "exact_duplicates", + "repeated_ids_diff", + "inconsistent_sex_roles", + "missing_parents", + "dependencies" + )) +}) + +# =========================================================================== +# 2. Component types +# Tests that every element in the returned list is a data.frame, ensuring +# downstream code can safely call nrow(), colnames(), etc. on any component. +# =========================================================================== +test_that("check_ped report components are data.frames", { + ped <- data.frame( + id = c("A", "B", "C"), + male_parent = c("0", "A", "A"), + female_parent = c("0", "0", "0") + ) + out <- check_ped(write_ped(ped), verbose = FALSE, correct = FALSE) + + expect_true(is.data.frame(out$exact_duplicates)) + expect_true(is.data.frame(out$repeated_ids_diff)) + expect_true(is.data.frame(out$inconsistent_sex_roles)) + expect_true(is.data.frame(out$missing_parents)) + expect_true(is.data.frame(out$dependencies)) +}) + +# =========================================================================== +# 3. Clean pedigree → empty report +# Tests that a structurally valid pedigree with no issues produces zero rows +# in every report component (no false positives). +# =========================================================================== +test_that("clean pedigree produces no issues", { + ped <- data.frame( + id = c("G1", "G2", "P1"), + male_parent = c("0", "0", "G1"), + female_parent = c("0", "0", "G2") + ) + out <- check_ped(write_ped(ped), verbose = FALSE, correct = FALSE) + + expect_equal(nrow(out$exact_duplicates), 0) + expect_equal(nrow(out$repeated_ids_diff), 0) + expect_equal(nrow(out$inconsistent_sex_roles), 0) + expect_equal(nrow(out$missing_parents), 0) + expect_equal(nrow(out$dependencies), 0) +}) + +# =========================================================================== +# 4. Exact duplicate detection and correction +# Tests that rows sharing the same (id, male_parent, female_parent) triplet +# are flagged, and that correct = TRUE retains only one copy of each. +# =========================================================================== +test_that("check_ped detects exact duplicates", { + ped <- data.frame( + id = c("A", "A", "B"), + male_parent = c("0", "0", "A"), + female_parent = c("0", "0", "0") + ) + out <- check_ped(write_ped(ped), verbose = FALSE, correct = FALSE) + + expect_equal(nrow(out$exact_duplicates), 2) + expect_true(all(out$exact_duplicates$id == "A")) +}) + +test_that("correct = TRUE removes exact duplicates", { + ped <- data.frame( + id = c("A", "A", "B"), + male_parent = c("0", "0", "A"), + female_parent = c("0", "0", "0") + ) + f <- write_ped(ped) + file_base <- tools::file_path_sans_ext(basename(f)) + corr_name <- paste0(file_base, "_corrected") + rep_name <- paste0(file_base, "_report") + on.exit({ + if (exists(corr_name, envir = .GlobalEnv)) rm(list = corr_name, envir = .GlobalEnv) + if (exists(rep_name, envir = .GlobalEnv)) rm(list = rep_name, envir = .GlobalEnv) + }) + + check_ped(f, verbose = FALSE, correct = TRUE) + corrected <- get(corr_name, envir = .GlobalEnv) + + # Only one "A" row should remain + expect_equal(sum(corrected$id == "A"), 1) +}) + +# =========================================================================== +# 5. Repeated IDs with conflicting parent assignments +# Tests that the same ID appearing on multiple rows with different parent +# values is flagged, and that correct = TRUE blanks only the conflicting +# parent field while preserving consistent values. +# =========================================================================== +test_that("check_ped detects repeated IDs with conflicting parents", { + ped <- data.frame( + id = c("A", "A", "B"), + male_parent = c("X", "Y", "A"), + female_parent = c("M", "M", "0") + ) + out <- check_ped(write_ped(ped), verbose = FALSE, correct = FALSE) + + expect_equal(nrow(out$repeated_ids_diff), 2) + expect_true(all(out$repeated_ids_diff$id == "A")) +}) + +test_that("correct = TRUE resolves conflicting IDs: conflicting field -> '0', consistent field kept", { + ped <- data.frame( + id = c("A", "A", "B"), + male_parent = c("X", "Y", "A"), # conflicting -> should become "0" + female_parent = c("M", "M", "0") # consistent -> should stay "M" + ) + f <- write_ped(ped) + file_base <- tools::file_path_sans_ext(basename(f)) + corr_name <- paste0(file_base, "_corrected") + rep_name <- paste0(file_base, "_report") + on.exit({ + if (exists(corr_name, envir = .GlobalEnv)) rm(list = corr_name, envir = .GlobalEnv) + if (exists(rep_name, envir = .GlobalEnv)) rm(list = rep_name, envir = .GlobalEnv) + }) + + check_ped(f, verbose = FALSE, correct = TRUE) + corrected <- get(corr_name, envir = .GlobalEnv) + a_row <- corrected[corrected$id == "A", ] + + expect_equal(nrow(a_row), 1) + expect_equal(a_row$male_parent, "0") + expect_equal(a_row$female_parent, "M") +}) + +# =========================================================================== +# 6. Inconsistent parent sex roles +# Tests that an individual ID appearing in both male_parent and female_parent +# columns is flagged. Also confirms that correct = TRUE does NOT auto-resolve +# this issue since the correct fix requires manual review. +# =========================================================================== +test_that("check_ped detects inconsistent parent sex roles", { + ped <- data.frame( + id = c("child1", "child2", "P"), + male_parent = c("P", "0", "0"), + female_parent = c("0", "P", "0") + ) + out <- check_ped(write_ped(ped), verbose = FALSE, correct = FALSE) + + expect_true("inconsistent_sex_roles" %in% names(out)) + expect_gt(nrow(out$inconsistent_sex_roles), 0) + + # Both rows that reference P should be flagged + expect_true(any(out$inconsistent_sex_roles$male_parent == "P" | + out$inconsistent_sex_roles$female_parent == "P")) +}) + +test_that("correct = TRUE does NOT auto-correct inconsistent sex roles", { + ped <- data.frame( + id = c("child1", "child2", "P"), + male_parent = c("P", "0", "0"), + female_parent = c("0", "P", "0") + ) + f <- write_ped(ped) + file_base <- tools::file_path_sans_ext(basename(f)) + corr_name <- paste0(file_base, "_corrected") + rep_name <- paste0(file_base, "_report") + on.exit({ + if (exists(corr_name, envir = .GlobalEnv)) rm(list = corr_name, envir = .GlobalEnv) + if (exists(rep_name, envir = .GlobalEnv)) rm(list = rep_name, envir = .GlobalEnv) + }) + + check_ped(f, verbose = FALSE, correct = TRUE) + corrected <- get(corr_name, envir = .GlobalEnv) + + # P should still appear with its original (inconsistent) parent assignments + expect_true(any(corrected$male_parent == "P" | + corrected$female_parent == "P")) +}) + +# =========================================================================== +# 7. Missing parent detection and founder row injection +# Tests that parent IDs referenced in male_parent/female_parent but absent +# from the id column are flagged. Confirms that correct = TRUE appends a +# founder row (both parents = "0") for each missing parent. +# =========================================================================== +test_that("check_ped detects missing parents", { + ped <- data.frame( + id = c("A", "B"), + male_parent = c("0", "X"), + female_parent = c("0", "Y") + ) + out <- check_ped(write_ped(ped), verbose = FALSE, correct = FALSE) + + expect_equal(nrow(out$missing_parents), 1) + expect_true(all(out$missing_parents$id == "B")) +}) + +test_that("correct = TRUE adds missing parents as founder rows", { + ped <- data.frame( + id = c("A", "B"), + male_parent = c("0", "X"), + female_parent = c("0", "Y") + ) + f <- write_ped(ped) + file_base <- tools::file_path_sans_ext(basename(f)) + corr_name <- paste0(file_base, "_corrected") + rep_name <- paste0(file_base, "_report") + on.exit({ + if (exists(corr_name, envir = .GlobalEnv)) rm(list = corr_name, envir = .GlobalEnv) + if (exists(rep_name, envir = .GlobalEnv)) rm(list = rep_name, envir = .GlobalEnv) + }) + + check_ped(f, verbose = FALSE, correct = TRUE) + corrected <- get(corr_name, envir = .GlobalEnv) + + # X and Y should now appear as founder rows + expect_true("X" %in% corrected$id) + expect_true("Y" %in% corrected$id) + x_row <- corrected[corrected$id == "X", ] + y_row <- corrected[corrected$id == "Y", ] + expect_equal(x_row$male_parent, "0") + expect_equal(x_row$female_parent, "0") + expect_equal(y_row$male_parent, "0") + expect_equal(y_row$female_parent, "0") +}) + +# =========================================================================== +# 8. Pedigree cycle / dependency detection +# Tests that circular ancestry chains (e.g. A is parent of B and B is parent +# of A) are detected and reported. Also confirms that correct = TRUE does NOT +# silently remove cycle-involved individuals, as cycles require manual review. +# =========================================================================== +test_that("check_ped detects a direct two-node cycle", { + ped <- data.frame( + id = c("A", "B"), + male_parent = c("B", "A"), + female_parent = c("0", "0") + ) + out <- check_ped(write_ped(ped), verbose = FALSE, correct = FALSE) + + expect_gt(nrow(out$dependencies), 0) + expect_true(all(c("A", "B") %in% out$dependencies$id)) +}) + +test_that("check_ped does NOT auto-correct cycles", { + ped <- data.frame( + id = c("A", "B"), + male_parent = c("B", "A"), + female_parent = c("0", "0") + ) + f <- write_ped(ped) + file_base <- tools::file_path_sans_ext(basename(f)) + corr_name <- paste0(file_base, "_corrected") + rep_name <- paste0(file_base, "_report") + on.exit({ + if (exists(corr_name, envir = .GlobalEnv)) rm(list = corr_name, envir = .GlobalEnv) + if (exists(rep_name, envir = .GlobalEnv)) rm(list = rep_name, envir = .GlobalEnv) + }) + + check_ped(f, verbose = FALSE, correct = TRUE) + corrected <- get(corr_name, envir = .GlobalEnv) + + # Cycle-involved IDs should still be present (no silent removal) + expect_true("A" %in% corrected$id) + expect_true("B" %in% corrected$id) +}) + +# =========================================================================== +# 9. Global environment assignments +# Tests that the report list is always written to the global environment, +# that the corrected pedigree is written only when correct = TRUE, and that +# the internal row_number column does not leak into the corrected output. +# =========================================================================== +test_that("report is always assigned to global environment", { + ped <- data.frame( + id = c("A", "B"), + male_parent = c("0", "A"), + female_parent = c("0", "0") + ) + f <- write_ped(ped) + file_base <- tools::file_path_sans_ext(basename(f)) + rep_name <- paste0(file_base, "_report") + on.exit(if (exists(rep_name, envir = .GlobalEnv)) + rm(list = rep_name, envir = .GlobalEnv)) + + check_ped(f, verbose = FALSE, correct = FALSE) + + expect_true(exists(rep_name, envir = .GlobalEnv)) + expect_type(get(rep_name, envir = .GlobalEnv), "list") +}) + +test_that("corrected pedigree is assigned to global env when correct = TRUE", { + ped <- data.frame( + id = c("A", "A", "B"), + male_parent = c("X", "Y", "A"), + female_parent = c("M", "M", "0") + ) + f <- write_ped(ped) + file_base <- tools::file_path_sans_ext(basename(f)) + corr_name <- paste0(file_base, "_corrected") + rep_name <- paste0(file_base, "_report") + on.exit({ + if (exists(corr_name, envir = .GlobalEnv)) rm(list = corr_name, envir = .GlobalEnv) + if (exists(rep_name, envir = .GlobalEnv)) rm(list = rep_name, envir = .GlobalEnv) + }) + + check_ped(f, verbose = FALSE, correct = TRUE) + + expect_true(exists(corr_name, envir = .GlobalEnv)) + corrected <- get(corr_name, envir = .GlobalEnv) + expect_true(is.data.frame(corrected)) + + # row_number must not leak into corrected output + expect_false("row_number" %in% names(corrected)) + expect_true(all(c("id", "male_parent", "female_parent") %in% names(corrected))) +}) + +test_that("corrected pedigree is NOT assigned to global env when correct = FALSE", { + ped <- data.frame( + id = c("A", "B"), + male_parent = c("0", "A"), + female_parent = c("0", "0") + ) + f <- write_ped(ped) + file_base <- tools::file_path_sans_ext(basename(f)) + corr_name <- paste0(file_base, "_corrected") + rep_name <- paste0(file_base, "_report") + on.exit({ + if (exists(corr_name, envir = .GlobalEnv)) rm(list = corr_name, envir = .GlobalEnv) + if (exists(rep_name, envir = .GlobalEnv)) rm(list = rep_name, envir = .GlobalEnv) + }) + + check_ped(f, verbose = FALSE, correct = FALSE) + + expect_false(exists(corr_name, envir = .GlobalEnv)) + expect_true(exists(rep_name, envir = .GlobalEnv)) +}) + +# =========================================================================== +# 10. Missing required columns → hard stop +# Tests that a file lacking the canonical column names (after alias remapping) +# causes an immediate error rather than silently producing empty results. +# Column names like animal_id/parent1/parent2 are not in the alias list and +# therefore cannot be remapped, triggering the validation stop. +# =========================================================================== +test_that("check_ped errors when required columns are missing", { + bad_df <- data.frame( + animal_id = c("a", "b"), + parent1 = c("0", "a"), + parent2 = c("0", "0") + ) + expect_error( + check_ped(write_ped(bad_df), verbose = FALSE), + "Input file is missing required column" + ) +}) + +# =========================================================================== +# 11. Integration test using the bundled check_ped_test.txt fixture +# End-to-end test against a real fixture file shipped with the package. +# Validates the overall row counts and that known problematic individuals +# (grandfather2, grandfather3) are flagged for inconsistent sex roles. +# =========================================================================== +test_that("integration test with bundled fixture file", { + ped_file <- system.file("check_ped_test.txt", package = "BIGr") + skip_if(ped_file == "", "Bundled fixture file not found; skipping integration test.") + + out <- check_ped(ped_file, seed = 101919, verbose = FALSE) + + expect_length(out, 5) + + # Inconsistent sex roles should flag grandfather2 & grandfather3 + expect_true(all( + c("grandfather2", "grandfather3") %in% out$inconsistent_sex_roles$male_parent | + c("grandfather2", "grandfather3") %in% out$inconsistent_sex_roles$female_parent + )) + expect_equal(nrow(out$missing_parents), 8) +}) + +# =========================================================================== +# 12. Zip export (save_zip = TRUE / save_corrected_zip = TRUE) +# Tests that save_zip = TRUE produces a .zip archive in the working directory, +# and that save_corrected_zip = TRUE includes the corrected pedigree file +# inside that archive when correct = TRUE. +# =========================================================================== +test_that("save_zip = TRUE creates a zip archive in the working directory", { + ped <- data.frame( + id = c("A", "B", "C"), + male_parent = c("0", "A", "A"), + female_parent = c("0", "0", "0") + ) + f <- write_ped(ped) + file_base <- tools::file_path_sans_ext(basename(f)) + zip_name <- paste0(file_base, "_report.zip") + rep_name <- paste0(file_base, "_report") + zip_path <- file.path(getwd(), zip_name) + + on.exit({ + if (file.exists(zip_path)) file.remove(zip_path) + if (exists(rep_name, envir = .GlobalEnv)) rm(list = rep_name, envir = .GlobalEnv) + }) + + check_ped(f, verbose = FALSE, correct = FALSE, save_zip = TRUE) + + expect_true(file.exists(zip_path)) +}) + +test_that("save_zip + save_corrected_zip + correct = TRUE includes corrected file in zip", { + ped <- data.frame( + id = c("A", "B"), + male_parent = c("0", "X"), + female_parent = c("0", "Y") + ) + f <- write_ped(ped) + file_base <- tools::file_path_sans_ext(basename(f)) + zip_name <- paste0(file_base, "_report.zip") + corr_name <- paste0(file_base, "_corrected") + rep_name <- paste0(file_base, "_report") + zip_path <- file.path(getwd(), zip_name) - #Check - df_length <- length(output.list) - messy_parents <- output.list$messy_parents - missing_parents <- output.list$missing_parents + on.exit({ + if (file.exists(zip_path)) file.remove(zip_path) + if (exists(corr_name, envir = .GlobalEnv)) rm(list = corr_name, envir = .GlobalEnv) + if (exists(rep_name, envir = .GlobalEnv)) rm(list = rep_name, envir = .GlobalEnv) + }) - expect_true(df_length == 5) # Before was 4 - expect_true(all(messy_parents$id == c("grandfather2","grandfather3"))) - expect_true(nrow(missing_parents) == 13) + check_ped(f, verbose = FALSE, correct = TRUE, + save_zip = TRUE, save_corrected_zip = TRUE) + expect_true(file.exists(zip_path)) + zip_contents <- utils::unzip(zip_path, list = TRUE)$Name + expect_true(any(grepl("_corrected", zip_contents))) }) From 9bbc5077612b73e356b0498ab88036ead189b88b Mon Sep 17 00:00:00 2001 From: "josue.chinchilla" Date: Fri, 24 Apr 2026 13:52:51 -0400 Subject: [PATCH 05/12] updated error handling and reporting for check_ped --- NAMESPACE | 11 ++++ R/check_ped.R | 164 ++++++++++++++++++++++++----------------------- man/check_ped.Rd | 64 +++++++++--------- 3 files changed, 129 insertions(+), 110 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index ba1c1d1..4e518d1 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -46,13 +46,20 @@ importFrom(data.table,set) importFrom(dplyr,"%>%") importFrom(dplyr,across) importFrom(dplyr,arrange) +importFrom(dplyr,bind_rows) importFrom(dplyr,case_when) +importFrom(dplyr,distinct) importFrom(dplyr,filter) +importFrom(dplyr,first) importFrom(dplyr,group_by) importFrom(dplyr,group_modify) importFrom(dplyr,mutate) +importFrom(dplyr,n) +importFrom(dplyr,n_distinct) +importFrom(dplyr,row_number) importFrom(dplyr,select) importFrom(dplyr,summarise) +importFrom(dplyr,summarize) importFrom(dplyr,ungroup) importFrom(dplyr,where) importFrom(pwalign,nucleotideSubstitutionMatrix) @@ -63,13 +70,17 @@ importFrom(reshape2,melt) importFrom(rlang,sym) importFrom(stats,cor) importFrom(stats,reorder) +importFrom(stats,setNames) importFrom(tibble,as_tibble) +importFrom(tools,file_path_sans_ext) +importFrom(utils,capture.output) importFrom(utils,packageVersion) importFrom(utils,read.csv) importFrom(utils,read.table) importFrom(utils,tail) importFrom(utils,write.csv) importFrom(utils,write.table) +importFrom(utils,zip) importFrom(vcfR,extract.gt) importFrom(vcfR,maf) importFrom(vcfR,read.vcfR) diff --git a/R/check_ped.R b/R/check_ped.R index 71a889e..e1d9407 100644 --- a/R/check_ped.R +++ b/R/check_ped.R @@ -1,39 +1,37 @@ -#' Check a pedigree file for accuracy and report/correct common errors +#' Check a pedigree file for structural issues and optionally apply corrections #' -#' `check_ped` reads a 3-column pedigree file (tab-separated, columns labeled `id`, `male_parent`, `female_parent` in any order) -#' and performs quality checks, optionally correcting or flagging errors. +#' Reads a tab-separated pedigree file with columns `id`, `male_parent`, and +#' `female_parent` and checks for five classes of issues: missing required +#' columns, exact duplicate rows, repeated IDs with conflicting parent +#' assignments, inconsistent parent sex roles, and pedigree cycles. The input +#' file is never modified. A report list is always assigned to the global +#' environment; when `correct = TRUE`, a corrected pedigree is assigned as well. #' -#' The function checks for: -#' * Exact duplicate rows and removes them (keeping one copy) -#' * IDs that appear more than once with conflicting male_parent/female_parent assignments (sets both to "0") -#' * IDs that appear in both male_parent and female_parent columns -#' * Missing parents (IDs referenced as male_parent/female_parent but not in `id` column), adds them with both = "0" -#' * Direct and indirect pedigree dependencies (cycles), such as a parent being its own descendant -#' -#' After an initial run to clean exact duplicates and repeated IDs, you can run the function again to detect cycles more accurately. -#' -#' The function does **not** overwrite the input file. Instead, it prints findings to the console and optionally saves: -#' * Corrected pedigree as a dataframe in the global environment -#' * A report listing all detected issues +#' When `correct = TRUE`, the function removes exact duplicates, collapses +#' conflicting repeated IDs (replacing the inconsistent parent field with `"0"`), +#' and appends missing parents as founder rows. Inconsistent sex roles and +#' pedigree cycles are reported but not corrected and must be resolved manually. #' #' @param ped.file Path to the pedigree text file. -#' @param seed Optional seed for reproducibility. -#' @param verbose Logical. If TRUE (default), prints findings and prompts for saving. -#' @param correct Logical. If TRUE (default), saves a corrected pedigree. If FALSE, only reports issues. -#' @param save_zip Logical. If TRUE, writes each report component as a .txt file and bundles them into a .zip archive in the working directory. -#' @param save_corrected_zip Logical. If TRUE (and save_zip = TRUE and correct = TRUE), also writes the corrected pedigree into the .zip archive. +#' @param seed Optional integer seed for reproducibility. +#' @param verbose Logical. If `TRUE` (default), prints findings to the console. +#' @param correct Logical. If `TRUE` (default), builds and assigns a corrected +#' pedigree to the global environment. If `FALSE`, issues are only reported. +#' @param save_zip Logical. If `TRUE`, bundles report components into a `.zip` +#' archive in the working directory. +#' @param save_corrected_zip Logical. If `TRUE` (requires `save_zip = TRUE` and +#' `correct = TRUE`), includes the corrected pedigree in the `.zip` archive. #' -#' @return A named list of data.frames with offending rows (including row numbers): -#' * `exact_duplicates` -#' * `repeated_ids_diff` -#' * `inconsistent_sex_roles` -#' * `missing_parents` -#' * `dependencies` +#' @return An invisible named list of data frames describing detected issues: +#' * `exact_duplicates`: exact duplicate rows. +#' * `repeated_ids_diff`: rows where the same `id` has conflicting parents. +#' * `inconsistent_sex_roles`: rows involving IDs used as both parents. +#' * `missing_parents`: rows referencing parent IDs absent from `id`. +#' * `dependencies`: rows whose `id` is involved in a pedigree cycle. #' -#' @importFrom dplyr %>% group_by filter ungroup distinct mutate summarize first bind_rows n_distinct n select -#' @importFrom janitor clean_names +#' @importFrom dplyr %>% group_by filter ungroup distinct mutate summarize first bind_rows n_distinct n select row_number #' @importFrom stats setNames -#' @importFrom utils read.table write.table zip +#' @importFrom utils read.table write.table zip capture.output #' @importFrom tools file_path_sans_ext #' @export check_ped <- function(ped.file, @@ -42,40 +40,61 @@ check_ped <- function(ped.file, correct = TRUE, save_zip = FALSE, save_corrected_zip = FALSE) { - #### setup #### if (!is.null(seed)) set.seed(seed) - data <- utils::read.table(ped.file, header = TRUE) %>% - janitor::clean_names() %>% + data <- utils::read.table(ped.file, header = TRUE) + # ── Column alias remapping ────────────────────────────────────────────────── + # Accept common alternative column names and silently remap them so the rest + # of the function always works with the canonical names. + col_aliases <- list( + id = c("id", "ID", "animal", "Animal", "ind", "Ind"), + male_parent = c("male_parent", "sire", "Sire", "father", "Father", "pat", "Pat"), + female_parent = c("female_parent", "dam", "Dam", "mother", "Mother", "mat", "Mat") + ) + for (canonical in names(col_aliases)) { + if (!canonical %in% colnames(data)) { + match <- intersect(col_aliases[[canonical]], colnames(data)) + if (length(match) > 0) { + colnames(data)[colnames(data) == match[1]] <- canonical + } + } + } + # Validate required column names before any processing + required_cols <- c("id", "male_parent", "female_parent") + missing_cols <- setdiff(required_cols, colnames(data)) + if (length(missing_cols) > 0) { + stop( + "Input file is missing required column(s): ", + paste(missing_cols, collapse = ", "), + ".\nExpected columns: id, male_parent, female_parent." + ) + } + data <- data %>% mutate( id = as.character(id), male_parent = as.character(male_parent), female_parent = as.character(female_parent) ) # Add row numbers before any processing - data <- data %>% mutate(row_number = row_number(), .before = id) + data <- data %>% mutate(row_number = row_number(), .before = id) errors <- list() - #### check 1: exact duplicates #### exact_duplicates <- data[ duplicated(data %>% select(-row_number)) | duplicated(data %>% select(-row_number), fromLast = TRUE), ] - #### check 2: repeated IDs with conflicting male_parent/female_parent #### repeated_ids_diff <- data %>% group_by(id) %>% filter(n() > 1) %>% filter(n_distinct(male_parent) > 1 | n_distinct(female_parent) > 1) %>% ungroup() - #### check 3: inconsistent parent sex roles #### male_ids <- unique(data$male_parent[data$male_parent != "0"]) female_ids <- unique(data$female_parent[data$female_parent != "0"]) messy_ids <- intersect(male_ids, female_ids) inconsistent_sex_roles <- data %>% filter(male_parent %in% messy_ids | female_parent %in% messy_ids) - #### check 4: missing parents #### all_ids <- unique(data$id) ref_ids <- unique(c(data$male_parent, data$female_parent)) @@ -83,15 +102,24 @@ check_ped <- function(ped.file, missing_ids <- setdiff(ref_ids, all_ids) missing_parents <- data %>% filter(male_parent %in% missing_ids | female_parent %in% missing_ids) - # For the corrected pedigree, build founder rows for truly absent parents - missing_founders <- data.frame( - row_number = NA_integer_, - id = missing_ids, - male_parent = "0", - female_parent = "0", - stringsAsFactors = FALSE - ) - + # Only build founder rows if there are actually missing parents to add + if (length(missing_ids) > 0) { + missing_founders <- data.frame( + row_number = NA_integer_, + id = missing_ids, + male_parent = "0", + female_parent = "0", + stringsAsFactors = FALSE + ) + } else { + missing_founders <- data.frame( + row_number = integer(0), + id = character(0), + male_parent = character(0), + female_parent = character(0), + stringsAsFactors = FALSE + ) + } #### check 5: dependencies (cycles) #### detect_all_cycles <- function(data) { adj_list <- lapply(data$id, function(x) { @@ -128,7 +156,6 @@ check_ped <- function(ped.file, } return(all_cycles) } - cycles <- detect_all_cycles(data) cycle_ids <- character(0) if (length(cycles) > 0) { @@ -145,7 +172,6 @@ check_ped <- function(ped.file, dependencies <- data.frame(Dependency = unique(unlist(errors)), stringsAsFactors = FALSE) } - #### compile report #### input_ped_report <- list( exact_duplicates = exact_duplicates, @@ -154,7 +180,6 @@ check_ped <- function(ped.file, missing_parents = missing_parents, dependencies = dependencies ) - #### build corrected pedigree (no row_number column) #### if (correct) { corrected <- data %>% select(-row_number) @@ -178,13 +203,11 @@ check_ped <- function(ped.file, ) } } - #### file names #### file_base <- tools::file_path_sans_ext(basename(ped.file)) corrected_name <- paste0(file_base, "_corrected") report_name <- paste0(file_base, "_report") zip_name <- paste0(file_base, "_report.zip") - #### output #### if (verbose) { cat("\n=== Pedigree Quality Check Report ===\n") @@ -199,32 +222,20 @@ check_ped <- function(ped.file, cat("\n--- Cycles / Dependencies ---\n") if (nrow(dependencies) > 0) print(dependencies) else cat("None found.\n") if (correct) { - cat(paste0("\nSave corrected pedigree as `", corrected_name, "`? (y/n): ")) - ans <- tolower(trimws(readline())) - if (ans == "y") { - assign(corrected_name, corrected, envir = .GlobalEnv) - assign(report_name, input_ped_report, envir = .GlobalEnv) - cat(paste0("Saved `", corrected_name, "` and `", report_name, "`.\n")) - } else { - cat("No corrected pedigree saved.\n") - assign(report_name, input_ped_report, envir = .GlobalEnv) - } + cat(paste0("\`correct = TRUE`: saving corrected pedigree as `", corrected_name, "`.\n")) } else { - cat("\n`correct = FALSE`: no changes made to the pedigree.\n") - assign(report_name, input_ped_report, envir = .GlobalEnv) + cat("\`correct = FALSE`: no changes made to the pedigree.\n") } - } else { - # Silent mode: always save report; save corrected only if correct = TRUE - if (correct) assign(corrected_name, corrected, envir = .GlobalEnv) - assign(report_name, input_ped_report, envir = .GlobalEnv) } - + # Always save report to global environment + assign(report_name, input_ped_report, envir = .GlobalEnv) + # Save corrected pedigree if correct = TRUE + if (correct) assign(corrected_name, corrected, envir = .GlobalEnv) #### zip export #### if (save_zip) { tmp_dir <- tempfile() dir.create(tmp_dir) zip_files <- character(0) - # Section headers matching the console output labels section_labels <- c( exact_duplicates = "Exact Duplicates", @@ -233,19 +244,15 @@ check_ped <- function(ped.file, missing_parents = "Missing Parents (rows that reference them)", dependencies = "Cycles / Dependencies" ) - for (component in names(section_labels)) { df <- input_ped_report[[component]] file_path <- file.path(tmp_dir, paste0(file_base, "_", component, ".txt")) - if (nrow(df) == 0) { - # Write header + "None found." for clean empty reports writeLines( c(paste0("--- ", section_labels[[component]], " ---"), "None found."), con = file_path ) } else { - # Write header, then column names, then data — avoids col.names warning file_con <- file(file_path, open = "wt") writeLines(paste0("--- ", section_labels[[component]], " ---"), con = file_con) writeLines(paste(colnames(df), collapse = "\t"), con = file_con) @@ -253,24 +260,19 @@ check_ped <- function(ped.file, utils::write.table(df, file = file_path, sep = "\t", row.names = FALSE, quote = FALSE, append = TRUE, col.names = FALSE) } - zip_files <- c(zip_files, file_path) } - - # Optionally include corrected pedigree in zip if (save_corrected_zip && correct) { corrected_path <- file.path(tmp_dir, paste0(corrected_name, ".txt")) utils::write.table(corrected, file = corrected_path, sep = "\t", row.names = FALSE, quote = FALSE) zip_files <- c(zip_files, corrected_path) } - - # Bundle all files into zip in the working directory zip_path <- file.path(getwd(), zip_name) - utils::zip(zipfile = zip_path, files = zip_files, flags = "-j") + invisible(capture.output( + utils::zip(zipfile = zip_path, files = zip_files, flags = "-j") + )) if (verbose) cat(paste0("\nZip archive saved to: ", zip_path, "\n")) } - invisible(input_ped_report) } - diff --git a/man/check_ped.Rd b/man/check_ped.Rd index 0d1df79..bbae7c3 100644 --- a/man/check_ped.Rd +++ b/man/check_ped.Rd @@ -2,48 +2,54 @@ % Please edit documentation in R/check_ped.R \name{check_ped} \alias{check_ped} -\title{Check a pedigree file for accuracy and report/correct common errors} +\title{Check a pedigree file for structural issues and optionally apply corrections} \usage{ -check_ped(ped.file, seed = NULL, verbose = TRUE, correct = TRUE) +check_ped( + ped.file, + seed = NULL, + verbose = TRUE, + correct = TRUE, + save_zip = FALSE, + save_corrected_zip = FALSE +) } \arguments{ \item{ped.file}{Path to the pedigree text file.} -\item{seed}{Optional seed for reproducibility.} +\item{seed}{Optional integer seed for reproducibility.} -\item{verbose}{Logical. If TRUE (default), prints findings and prompts for saving.} +\item{verbose}{Logical. If \code{TRUE} (default), prints findings to the console.} -\item{correct}{Logical. If TRUE (default), saves a corrected pedigree. If FALSE, only reports issues.} +\item{correct}{Logical. If \code{TRUE} (default), builds and assigns a corrected +pedigree to the global environment. If \code{FALSE}, issues are only reported.} + +\item{save_zip}{Logical. If \code{TRUE}, bundles report components into a \code{.zip} +archive in the working directory.} + +\item{save_corrected_zip}{Logical. If \code{TRUE} (requires \code{save_zip = TRUE} and +\code{correct = TRUE}), includes the corrected pedigree in the \code{.zip} archive.} } \value{ -A named list of data.frames with offending rows (including row numbers): +An invisible named list of data frames describing detected issues: \itemize{ -\item \code{exact_duplicates} -\item \code{repeated_ids_diff} -\item \code{inconsistent_sex_roles} -\item \code{missing_parents} -\item \code{dependencies} +\item \code{exact_duplicates}: exact duplicate rows. +\item \code{repeated_ids_diff}: rows where the same \code{id} has conflicting parents. +\item \code{inconsistent_sex_roles}: rows involving IDs used as both parents. +\item \code{missing_parents}: rows referencing parent IDs absent from \code{id}. +\item \code{dependencies}: rows whose \code{id} is involved in a pedigree cycle. } } \description{ -\code{check_ped} reads a 3-column pedigree file (tab-separated, columns labeled \code{id}, \code{sire}, \code{dam} in any order) -and performs quality checks, optionally correcting or flagging errors. +Reads a tab-separated pedigree file with columns \code{id}, \code{male_parent}, and +\code{female_parent} and checks for five classes of issues: missing required +columns, exact duplicate rows, repeated IDs with conflicting parent +assignments, inconsistent parent sex roles, and pedigree cycles. The input +file is never modified. A report list is always assigned to the global +environment; when \code{correct = TRUE}, a corrected pedigree is assigned as well. } \details{ -The function checks for: -\itemize{ -\item Exact duplicate rows and removes them (keeping one copy) -\item IDs that appear more than once with conflicting sire/dam assignments (sets sire/dam to "0") -\item IDs that appear in both sire and dam columns -\item Missing parents (IDs referenced as sire/dam but not in \code{id} column), adds them with sire/dam = "0" -\item Direct and indirect pedigree dependencies (cycles), such as a parent being its own descendant -} - -After an initial run to clean exact duplicates and repeated IDs, you can run the function again to detect cycles more accurately. - -The function does \strong{not} overwrite the input file. Instead, it prints findings to the console and optionally saves: -\itemize{ -\item Corrected pedigree as a dataframe in the global environment -\item A report listing all detected issues -} +When \code{correct = TRUE}, the function removes exact duplicates, collapses +conflicting repeated IDs (replacing the inconsistent parent field with \code{"0"}), +and appends missing parents as founder rows. Inconsistent sex roles and +pedigree cycles are reported but not corrected and must be resolved manually. } From f798362faad1ccc1df8c3630b5d3387c65d3f74e Mon Sep 17 00:00:00 2001 From: "josue.chinchilla" Date: Mon, 27 Apr 2026 14:07:03 -0400 Subject: [PATCH 06/12] updated parentage functions to work allow user to display NA or not, and to plot results. --- R/find_parentage.R | 320 +++++++++------- R/validate_pedigree.R | 254 ++++++------ man/find_parentage.Rd | 155 +++----- man/validate_pedigree.Rd | 40 +- tests/testthat/test-find_parentage.R | 196 +++++++--- tests/testthat/test-validate_pedigree.R | 490 ++++++++++++++---------- 6 files changed, 850 insertions(+), 605 deletions(-) diff --git a/R/find_parentage.R b/R/find_parentage.R index 728822a..f53a323 100644 --- a/R/find_parentage.R +++ b/R/find_parentage.R @@ -7,104 +7,62 @@ #' Must include an 'ID' column followed by marker columns coded as 0, 1, 2 #' (allele dosage). #' @param parents_file Path to a TSV/CSV/TXT file listing candidate parent IDs. -#' Must include an 'ID' column. An optional 'Sex' column with values -#' 'M' (male parent), 'F' (female parent), or 'A' (ambiguous) determines -#' which parents are tested for each role. If absent, all parents are treated -#' as ambiguous. -#' @param progeny_file Path to a TSV/CSV/TXT file listing progeny IDs to assign. +#' Must include an 'ID' column. An optional 'Sex' column ('M', 'F', or 'A') +#' determines parent roles. If absent, all parents are treated as ambiguous. +#' @param progeny_file Path to a TSV/CSV/TXT file listing progeny IDs. #' Must include an 'ID' column. -#' @param method Character. Parentage assignment method. One of: -#' \itemize{ -#' \item \code{"best_male_parent"} — finds the best male parent for each -#' progeny using homozygous mismatch rate. -#' \item \code{"best_female_parent"} — finds the best female parent for each -#' progeny using homozygous mismatch rate. -#' \item \code{"best_match"} — finds the single best parent (either sex) -#' using homozygous mismatch rate. -#' \item \code{"best_pair"} — finds the best male-female parent pair for -#' each progeny using full Mendelian error rate (default). -#' } -#' @param min_markers Integer. Minimum number of non-missing markers required -#' to report a parentage assignment. Progeny-parent comparisons with fewer -#' markers are flagged as \code{LOW_MARKERS} and no assignment is made -#' (default: \code{10}). -#' @param error_threshold Numeric. Maximum mismatch percentage to report a -#' parentage assignment as confident. Assignments above this threshold are -#' flagged as \code{HIGH_ERROR} in the \code{Assignment_Status} column -#' (default: \code{5.0}). Must be between 0 and 100. -#' @param show_ties Logical. If \code{TRUE}, all tied best pairs (after -#' tie-breaking by maximum markers tested) are reported as additional columns -#' (\code{Male_Parent_1}, \code{Male_Parent_2}, etc.) when -#' \code{method = "best_pair"}. The base columns (\code{Male_Parent}, -#' \code{Female_Parent}, etc.) are always populated with the top result. -#' If \code{FALSE}, only one tied pair is reported with a warning. -#' Default is \code{TRUE}. -#' @param allow_selfing Logical. If \code{FALSE}, male-female parent pairs where -#' both IDs are identical are excluded when \code{method = "best_pair"}. -#' Default is \code{TRUE}. -#' @param verbose Logical. If \code{TRUE}, prints progress messages, summary -#' statistics, and the results table to the console. Default is \code{TRUE}. +#' @param method Character. One of \code{"best_male_parent"}, +#' \code{"best_female_parent"}, \code{"best_match"}, or +#' \code{"best_pair"} (default). See Details. +#' @param min_markers Integer. Minimum markers required for a valid assignment. +#' Fewer markers flags \code{LOW_MARKERS} (default: \code{10}). +#' @param error_threshold Numeric. Maximum mismatch percentage for a confident +#' assignment. Exceeded values flag \code{HIGH_ERROR} (default: \code{5.0}). +#' Must be between 0 and 100. +#' @param show_ties Logical. If \code{TRUE}, tied best pairs are reported as +#' additional suffix columns (e.g. \code{Male_Parent_2}) when +#' \code{method = "best_pair"}. Default is \code{TRUE}. +#' @param allow_selfing Logical. If \code{FALSE}, pairs with identical male and +#' female parent IDs are excluded. Default is \code{TRUE}. +#' @param verbose Logical. If \code{TRUE}, prints progress, summary, and +#' results to the console. Default is \code{TRUE}. #' @param write_txt Logical. If \code{TRUE}, writes results to -#' \code{parentage_results_dt.txt} in the working directory. Default is -#' \code{TRUE}. +#' \code{parentage_testing_results.txt}. Default is \code{TRUE}. +#' @param na_string Character. String for missing values in output file. +#' Use \code{"NA"} or \code{""} (default: \code{"NA"}). +#' @param plot_results Logical. If \code{TRUE}, plots the Mendelian error +#' distribution colored by assignment status. Requires \code{ggplot2}. +#' Plot is saved as \code{.jpg} when \code{write_txt = TRUE}. +#' Default is \code{TRUE}. #' -#' @return A \code{data.table} with one row per progeny. Columns depend on the -#' method used: -#' \itemize{ -#' \item \code{best_male_parent} / \code{best_female_parent} / \code{best_match}: -#' \code{Progeny}, \code{Best_Match}, \code{Mendelian_Error_Pct}, -#' \code{Markers_Tested}, \code{Assignment_Status}. -#' \item \code{best_pair} (no ties after tie-breaking): \code{Progeny}, -#' \code{Male_Parent}, \code{Female_Parent}, \code{Mendelian_Error_Pct}, -#' \code{Markers_Tested}, \code{Assignment_Status}. -#' \item \code{best_pair} (ties remain after tie-breaking, -#' \code{show_ties = TRUE}): base columns are always populated with the -#' top result, plus suffix columns \code{Male_Parent_1}, -#' \code{Female_Parent_1}, etc. for each tied pair. -#' } -#' \code{Assignment_Status} is one of \code{PASS}, \code{HIGH_ERROR}, or -#' \code{LOW_MARKERS}. Returned invisibly when \code{verbose = TRUE}. +#' @return A \code{data.table} with one row per progeny. Base columns are +#' \code{Progeny}, \code{Male_Parent} (or \code{Best_Match}), +#' \code{Female_Parent}, \code{Mendelian_Error_Pct}, \code{Markers_Tested}, +#' and \code{Assignment_Status} (\code{PASS}, \code{HIGH_ERROR}, or +#' \code{LOW_MARKERS}). Tie suffix columns (\code{Male_Parent_2}, etc.) are +#' appended when \code{show_ties = TRUE} and ties remain after tie-breaking. +#' Returned invisibly when \code{verbose = TRUE}. #' #' @details -#' A homozygous-only genotype matrix is pre-computed once at startup and shared -#' across all methods that require it, avoiding redundant computation. -#' -#' For \code{"best_male_parent"}, \code{"best_female_parent"}, and -#' \code{"best_match"}, only homozygous markers (coded 0 or 2) are used for -#' comparison; heterozygous markers (coded 1) are set to \code{NA}. This -#' reduces false mismatches caused by phase ambiguity. -#' -#' For \code{"best_pair"}, all markers are used and full Mendelian inheritance -#' rules are applied. Mismatch rates and comparison counts are computed across -#' all progeny simultaneously using vectorised \code{vapply} calls, producing -#' \code{n_pairs x n_progeny} matrices and giving substantial speed gains for -#' large datasets. Both matrices are explicitly coerced to matrix form to -#' handle the edge case of a single parent pair correctly. -#' -#' When multiple pairs share the lowest Mendelian error rate, ties are broken -#' by selecting the pair(s) with the greatest number of markers tested. If ties -#' still remain after this step, all remaining tied pairs are reported when -#' \code{show_ties = TRUE}. +#' A homozygous-only matrix is pre-computed once and shared across +#' \code{"best_male_parent"}, \code{"best_female_parent"}, and +#' \code{"best_match"}, where heterozygous markers (coded 1) are set to +#' \code{NA} to reduce false mismatches from phase ambiguity. #' -#' The base columns (\code{Male_Parent}, \code{Female_Parent}, -#' \code{Mendelian_Error_Pct}, \code{Markers_Tested}) are always populated with -#' the top result, ensuring no missing values in these columns regardless of -#' tie status. +#' For \code{"best_pair"}, full Mendelian inheritance rules are applied across +#' all marker and pair combinations using vectorised \code{vapply} calls, +#' producing \code{n_pairs x n_progeny} mismatch and comparison matrices. #' -#' Output rows are pre-allocated as a \code{data.table} and filled by reference -#' using \code{data.table::set()}, avoiding repeated memory allocation during -#' the results-building step. +#' Ties on mismatch rate are broken by maximum markers tested. Remaining ties +#' are reported as suffix columns when \code{show_ties = TRUE}. Base columns +#' are always populated with the top result. #' -#' Individuals in \code{parents_file} or \code{progeny_file} that are absent -#' from \code{genotypes_file} are removed with a warning. -#' -#' Progeny with fewer non-missing markers than \code{min_markers} are flagged -#' \code{LOW_MARKERS} and no parent assignment is reported. Progeny whose best -#' match exceeds \code{error_threshold} are flagged \code{HIGH_ERROR}. +#' Parents or progeny absent from \code{genotypes_file} are removed with a +#' warning. Progeny below \code{min_markers} are flagged \code{LOW_MARKERS}; +#' those exceeding \code{error_threshold} are flagged \code{HIGH_ERROR}. #' #' @examples #' \dontrun{ -#' # Assign best male-female parent pair to each progeny #' results <- find_parentage( #' genotypes_file = "genotypes.txt", #' parents_file = "parents.txt", @@ -113,10 +71,11 @@ #' min_markers = 50, #' error_threshold = 5.0, #' show_ties = TRUE, -#' allow_selfing = FALSE +#' allow_selfing = FALSE, +#' na_string = "NA", +#' plot_results = TRUE #' ) #' -#' # Find best individual parent match (ignoring sex) #' results <- find_parentage( #' genotypes_file = "genotypes.txt", #' parents_file = "parents.txt", @@ -136,8 +95,10 @@ find_parentage <- function(genotypes_file, parents_file, progeny_file, show_ties = TRUE, allow_selfing = TRUE, verbose = TRUE, - write_txt = TRUE) { - + write_txt = TRUE, + na_string = "NA", + plot_results = TRUE) { + #### Input Validation and Data Loading #### allowed_methods <- c("best_male_parent", "best_female_parent", "best_match", "best_pair") if (!method %in% allowed_methods) @@ -146,7 +107,9 @@ find_parentage <- function(genotypes_file, parents_file, progeny_file, stop("min_markers must be a positive integer.") if (error_threshold < 0 || error_threshold > 100) stop("error_threshold must be between 0 and 100.") - + if (!na_string %in% c("NA", "")) + stop("na_string must be either 'NA' or ''.") + tryCatch({ genos <- data.table::fread(genotypes_file, sep = "auto") all_parents <- data.table::fread(parents_file, sep = "auto") @@ -154,7 +117,7 @@ find_parentage <- function(genotypes_file, parents_file, progeny_file, }, error = function(e) { stop("Error reading input files. Ensure paths are correct and files are TXT/TSV/CSV.") }) - + valid_ids <- genos$ID removed_parents <- base::setdiff(all_parents$ID, valid_ids) if (base::length(removed_parents) > 0) { @@ -162,48 +125,50 @@ find_parentage <- function(genotypes_file, parents_file, progeny_file, paste(removed_parents, collapse = ", "), call. = FALSE) all_parents <- all_parents[ID %in% valid_ids] } - + removed_progeny <- base::setdiff(progeny_candidates$ID, valid_ids) if (base::length(removed_progeny) > 0) { warning("The following progeny IDs were not in the genotype file and will not be analyzed: ", paste(removed_progeny, collapse = ", "), call. = FALSE) progeny_candidates <- progeny_candidates[ID %in% valid_ids] } - + if (!"Sex" %in% base::colnames(all_parents)) { warning("No 'Sex' column in parents file. All parents treated as ambiguous ('A').") all_parents[, Sex := "A"] } - + all_parents[, Sex := base::toupper(Sex)] male_parent_candidates <- all_parents[Sex %in% c("M", "A", "NA"), .SD] female_parent_candidates <- all_parents[Sex %in% c("F", "A", "NA")] - + if (base::nrow(male_parent_candidates) == 0 && method %in% c("best_male_parent", "best_pair")) warning("No valid male parent candidates remain after filtering.", call. = FALSE) if (base::nrow(female_parent_candidates) == 0 && method %in% c("best_female_parent", "best_pair")) warning("No valid female parent candidates remain after filtering.", call. = FALSE) if (base::nrow(progeny_candidates) == 0) stop("No valid progeny candidates remain after filtering.") - + #### Pre-compute genotype matrices once (shared across all methods) #### + # Full genotype matrix used by best_pair genos_mat <- base::as.matrix(genos, rownames = "ID") - + # Homozygous-only matrix (het markers set to NA) used by hom methods genos_hom <- data.table::copy(genos) marker_cols <- base::setdiff(base::names(genos_hom), "ID") for (col in marker_cols) genos_hom[base::get(col) == 1, (col) := NA_integer_] genos_hom_mat <- base::as.matrix(genos_hom, rownames = "ID") - + #### Assignment_Status from markers and error rate #### + # Returns LOW_MARKERS, HIGH_ERROR, or PASS assign_status <- function(markers, error_pct) { base::ifelse(markers < min_markers, "LOW_MARKERS", base::ifelse(error_pct > error_threshold, "HIGH_ERROR", "PASS")) } - + #### Logic for Homozygous Matching Methods #### if (method %in% c("best_male_parent", "best_female_parent", "best_match")) { parent_ids <- base::switch(method, @@ -211,11 +176,11 @@ find_parentage <- function(genotypes_file, parents_file, progeny_file, "best_female_parent" = female_parent_candidates$ID, "best_match" = base::union(male_parent_candidates$ID, female_parent_candidates$ID)) - + # Subset pre-computed homozygous matrix for relevant parents and progeny - parent_genos <- genos_hom_mat[base::rownames(genos_hom_mat) %in% parent_ids, , drop = FALSE] - progeny_genos <- genos_hom_mat[base::rownames(genos_hom_mat) %in% progeny_candidates$ID, , drop = FALSE] - + parent_genos <- genos_hom_mat[base::rownames(genos_hom_mat) %in% parent_ids, , drop = FALSE] + progeny_genos <- genos_hom_mat[base::rownames(genos_hom_mat) %in% progeny_candidates$ID, , drop = FALSE] + # Pre-allocate results data.table; fill by reference with set() n_progeny <- base::nrow(progeny_genos) results_dt <- data.table::data.table( @@ -225,26 +190,26 @@ find_parentage <- function(genotypes_file, parents_file, progeny_file, Markers_Tested = NA_integer_, Assignment_Status = NA_character_ ) - + for (i in base::seq_len(n_progeny)) { progeny_vec <- progeny_genos[i, ] mismatches <- base::rowSums(parent_genos != progeny_vec, na.rm = TRUE) comparisons <- base::rowSums(!base::is.na(parent_genos) & !base::is.na(progeny_vec)) percent_mismatch <- (mismatches / comparisons) * 100 percent_mismatch[base::is.nan(percent_mismatch)] <- NA - + best_idx <- base::which.min(percent_mismatch) - + # No candidate found — flag LOW_MARKERS and continue if (base::length(best_idx) == 0) { data.table::set(results_dt, i, "Markers_Tested", 0L) data.table::set(results_dt, i, "Assignment_Status", "LOW_MARKERS") next } - + best_markers <- comparisons[best_idx] best_error <- base::round(percent_mismatch[best_idx], 2) - + data.table::set(results_dt, i, "Best_Match", base::rownames(parent_genos)[best_idx]) data.table::set(results_dt, i, "Mendelian_Error_Pct", best_error) data.table::set(results_dt, i, "Markers_Tested", base::as.integer(best_markers)) @@ -252,7 +217,7 @@ find_parentage <- function(genotypes_file, parents_file, progeny_file, } final_df <- results_dt } - + #### Logic for Best Pair Method #### if (method == "best_pair") { parent_pairs <- data.table::CJ(Male_Parent = male_parent_candidates$ID, @@ -262,18 +227,17 @@ find_parentage <- function(genotypes_file, parents_file, progeny_file, if (verbose) base::cat("Selfing is disallowed. Pairs with identical parents are removed.\n") } if (base::nrow(parent_pairs) == 0) stop("No valid parent pairs to test.") - + # Pre-extract parent genotype row blocks for vectorised operations male_parent_genos_mat <- genos_mat[parent_pairs$Male_Parent, , drop = FALSE] female_parent_genos_mat <- genos_mat[parent_pairs$Female_Parent, , drop = FALSE] - + # Subset full genotype matrix to progeny only progeny_ids <- progeny_candidates$ID progeny_mat <- genos_mat[progeny_ids, , drop = FALSE] n_progeny <- base::nrow(progeny_mat) n_pairs <- base::nrow(parent_pairs) - - # Vectorised mismatch computation across ALL progeny at once + # Wrapped in matrix() to handle the n_pairs = 1 edge case mismatch_mat <- base::matrix( base::vapply(base::seq_len(n_progeny), function(j) { @@ -292,8 +256,7 @@ find_parentage <- function(genotypes_file, parents_file, progeny_file, }, numeric(n_pairs)), nrow = n_pairs, ncol = n_progeny ) - - # Vectorised comparison count across ALL progeny at once + # Wrapped in matrix() to handle the n_pairs = 1 edge case comparison_mat <- base::matrix( base::vapply(base::seq_len(n_progeny), function(j) { @@ -304,11 +267,11 @@ find_parentage <- function(genotypes_file, parents_file, progeny_file, }, numeric(n_pairs)), nrow = n_pairs, ncol = n_progeny ) - + # Percent mismatch matrix: n_pairs x n_progeny pct_mismatch_mat <- (mismatch_mat / comparison_mat) * 100 pct_mismatch_mat[base::is.nan(pct_mismatch_mat)] <- NA - + # Pre-allocate base results data.table; tie columns added dynamically results_dt <- data.table::data.table( Progeny = progeny_ids, @@ -318,7 +281,7 @@ find_parentage <- function(genotypes_file, parents_file, progeny_file, Markers_Tested = NA_integer_, Assignment_Status = NA_character_ ) - + # Per-progeny result extraction from pre-computed matrices results_list <- base::vector("list", n_progeny) for (j in base::seq_len(n_progeny)) { @@ -326,48 +289,48 @@ find_parentage <- function(genotypes_file, parents_file, progeny_file, percent_mismatch <- pct_mismatch_mat[, j] comparisons <- comparison_mat[, j] min_mismatch_val <- base::min(percent_mismatch, na.rm = TRUE) - + # No markers overlap at all — flag LOW_MARKERS if (base::is.infinite(min_mismatch_val)) { data.table::set(results_dt, j, "Markers_Tested", 0L) data.table::set(results_dt, j, "Assignment_Status", "LOW_MARKERS") next } - + best_indices <- base::which(percent_mismatch == min_mismatch_val) - + # Tie-break: prefer pair(s) with the most markers tested if (base::length(best_indices) > 1) { best_markers_per_pair <- comparisons[best_indices] max_markers <- base::max(best_markers_per_pair) best_indices <- best_indices[best_markers_per_pair == max_markers] } - + best_pairs <- parent_pairs[best_indices] best_markers <- comparisons[best_indices[1]] best_error <- base::round(min_mismatch_val, 2) a_status <- assign_status(best_markers, best_error) - + if (!show_ties && base::nrow(best_pairs) > 1) { warning("Progeny '", prog_id, "' has ", base::nrow(best_pairs), " tied best pairs. Only one is reported as show_ties=FALSE.", call. = FALSE) } - + num_to_report <- base::min(base::nrow(best_pairs), if (show_ties) base::nrow(best_pairs) else 1) - + # Always populate base columns with the top result data.table::set(results_dt, j, "Male_Parent", best_pairs$Male_Parent[1]) data.table::set(results_dt, j, "Female_Parent", best_pairs$Female_Parent[1]) data.table::set(results_dt, j, "Mendelian_Error_Pct", base::sprintf("%.2f", min_mismatch_val)) data.table::set(results_dt, j, "Markers_Tested", base::as.integer(best_markers)) data.table::set(results_dt, j, "Assignment_Status", a_status) - + # If ties remain after tie-breaking and show_ties is TRUE, # store tie details for later column binding if (show_ties && num_to_report > 1) { tie_row <- base::list(Progeny = prog_id) - for (k in base::seq_len(num_to_report)) { + for (k in base::seq(2, num_to_report)) { tie_row[[base::paste0("Male_Parent_", k)]] <- best_pairs$Male_Parent[k] tie_row[[base::paste0("Female_Parent_", k)]] <- best_pairs$Female_Parent[k] tie_row[[base::paste0("Mendelian_Error_Pct_", k)]] <- min_mismatch_val @@ -376,19 +339,24 @@ find_parentage <- function(genotypes_file, parents_file, progeny_file, results_list[[j]] <- data.table::as.data.table(tie_row) } } - + # Merge any tie suffix columns onto the pre-allocated base table tie_rows <- data.table::rbindlist( base::Filter(Negate(base::is.null), results_list), - fill = TRUE + fill = TRUE, + use.names = TRUE ) if (base::nrow(tie_rows) > 0) { final_df <- merge(results_dt, tie_rows, by = "Progeny", all.x = TRUE) + # Replace any empty strings with NA + for (col in base::names(final_df)) { + data.table::set(final_df, which(final_df[[col]] == ""), col, NA_character_) + } } else { final_df <- results_dt } } - + #### Summary #### if (verbose) { total <- base::nrow(final_df) @@ -401,18 +369,98 @@ find_parentage <- function(genotypes_file, parents_file, progeny_file, base::cat("Min markers threshold :", min_markers, "\n") base::cat("Error threshold :", error_threshold, "%\n\n") } - + #### Output #### if (write_txt) { - output_filename <- "parentage_results_dt.txt" + output_filename <- "parentage_testing_results.txt" tryCatch({ - data.table::fwrite(final_df, file = output_filename, sep = "\t", quote = FALSE) + data.table::fwrite(final_df, file = output_filename, sep = "\t", quote = FALSE, + na = na_string) if (verbose) base::cat("Results successfully written to:", output_filename, "\n") }, error = function(e) { warning("Could not write results to file. Error: ", e$message, call. = FALSE) }) } - + + #### Plot Results #### + if (plot_results) { + if (!requireNamespace("ggplot2", quietly = TRUE)) { + warning("ggplot2 is required for plot_results = TRUE. Please install it.", call. = FALSE) + } else { + plot_df <- final_df[!is.na(final_df$Mendelian_Error_Pct)] + plot_df$Mendelian_Error_Pct <- base::as.numeric(plot_df$Mendelian_Error_Pct) + + plot_df$Plot_Status <- dplyr::case_when( + plot_df$Assignment_Status == "PASS" ~ "PASS", + plot_df$Assignment_Status == "HIGH_ERROR" ~ "HIGH_ERROR", + plot_df$Assignment_Status == "LOW_MARKERS" ~ "LOW_MARKERS", + TRUE ~ "OTHER" + ) + + n_total <- base::nrow(plot_df) + n_pass <- base::sum(plot_df$Assignment_Status == "PASS", na.rm = TRUE) + n_high <- base::sum(plot_df$Assignment_Status == "HIGH_ERROR", na.rm = TRUE) + n_low <- base::sum(plot_df$Assignment_Status == "LOW_MARKERS", na.rm = TRUE) + + threshold_label <- base::paste0( + "Error Threshold: ", error_threshold, "% | ", + "Pass: ", n_pass, " | ", + "High Error: ", n_high, " | ", + "Low Markers: ", n_low + ) + + p <- ggplot2::ggplot( + plot_df, + ggplot2::aes(x = Mendelian_Error_Pct, fill = Plot_Status) + ) + + ggplot2::geom_histogram(binwidth = 1, color = "white", alpha = 0.9) + + ggplot2::geom_vline( + xintercept = error_threshold, + linetype = "dashed", + color = "black", + linewidth = 1 + ) + + ggplot2::scale_x_continuous(breaks = seq(0, 100, by = 5)) + + ggplot2::scale_y_continuous(breaks = seq(0, 10000, by = 5)) + + ggplot2::scale_fill_manual( + values = c( + "PASS" = "#339900", + "HIGH_ERROR" = "#cc3333", + "LOW_MARKERS" = "#F1C40F", + "OTHER" = "#BDC3C7" + ), + labels = c( + "PASS" = "Pass", + "HIGH_ERROR" = "High Error", + "LOW_MARKERS" = "Low Markers", + "OTHER" = "Other" + ) + ) + + ggplot2::labs( + title = "Parentage Mendelian Error Distribution", + subtitle = base::paste0("Progeny Tested: ", n_total, "\n \n", threshold_label), + x = "Mendelian Error (%)", + y = "Number of Progeny", + fill = "Status" + ) + + ggplot2::theme_classic(base_size = 13) + + ggplot2::theme(legend.position = "top") + + base::print(p) + + if (write_txt) { + plot_filename <- base::sub("\\.[^.]+$", ".jpg", "parentage_testing_results.txt") + tryCatch({ + ggplot2::ggsave(plot_filename, plot = p, + device = "jpeg", width = 10, height = 6, dpi = 300) + if (verbose) base::cat("Plot saved to:", plot_filename, "\n") + }, error = function(e) { + warning("Could not save plot. Error: ", e$message, call. = FALSE) + }) + } + } + } + if (verbose) { base::cat("\n--- Parentage Assignment Results ---\n") base::print(final_df) diff --git a/R/validate_pedigree.R b/R/validate_pedigree.R index 5b366c9..a3526ce 100644 --- a/R/validate_pedigree.R +++ b/R/validate_pedigree.R @@ -23,57 +23,67 @@ #' parent-level evaluation (default: 2.0). Must be between 0 and 100. #' @param verbose Logical. If TRUE, prints progress messages, a summary #' table, and results to the console (default: TRUE). -#' @param write_txt Logical. If TRUE, writes validation results to -#' output_filename (default: TRUE). -#' @param output_filename Character. Path/name of the output file -#' (default: "pedigree_validation_results.txt"). +#' @param write_results Logical. If TRUE, writes the validation results to +#' output_filename and saves the plot as a .jpg file (default: TRUE). +#' @param output_filename Character. Path/name of the output results file +#' (default: "__validation_report.txt"). The plot will be saved using the +#' same base name with a .jpg extension. +#' @param plot_results Logical. If TRUE, prints a histogram of Trio Mendelian +#' Error percentages with a threshold line (default: TRUE). If write_results +#' is also TRUE, the plot is additionally saved as a .jpg file. +#' @param na_string Character. String for missing values in the output file. +#' Use \code{"NA"} or \code{""} (default: \code{"NA"}). #' #' @return A data.table (returned invisibly) with one row per trio and #' the following columns: #' \describe{ #' \item{ID}{Individual ID.} -#' \item{Male_Parent}{Declared male parent ID.} -#' \item{Female_Parent}{Declared female parent ID.} -#' \item{Mendelian_Error_Pct}{Trio-level Mendelian error percentage.} -#' \item{Markers_Tested}{Number of markers with non-missing genotypes.} +#' \item{Orig_Male_Parent}{Declared male parent ID.} +#' \item{Orig_Female_Parent}{Declared female parent ID.} +#' \item{Trio_Mendelian_Error_Pct}{Trio-level Mendelian error percentage.} +#' \item{Trio_Markers_Tested}{Number of markers with non-missing genotypes.} #' \item{Status}{One of PASS, FAIL, LOW_MARKERS, NO_DATA, FOUNDERS, #' MISSING_MALE_PARENT, MISSING_FEMALE_PARENT, MISSING_BOTH_PARENTS, #' or NO_GENOTYPE_DATA.} -#' \item{Correction_Decision}{One of NONE, KEEP_BOTH, +#' \item{Recommended_Correction}{One of NONE, KEEP_BOTH, #' REMOVE_MALE_PARENT, REMOVE_FEMALE_PARENT, REMOVE_BOTH, #' LOW_MARKERS_KEEP_BOTH, LOW_MARKERS_REMOVE_MALE_PARENT, #' LOW_MARKERS_REMOVE_FEMALE_PARENT, LOW_MARKERS_REMOVE_BOTH.} #' \item{Male_Parent_Hom_Error_Pct}{Male parent homozygous-marker mismatch percentage.} #' \item{Female_Parent_Hom_Error_Pct}{Female parent homozygous-marker mismatch percentage.} -#' \item{Best_Male_Parent}{Best-matching male parent candidate ID.} -#' \item{Best_Male_Parent_Error_Pct}{Homozygous mismatch percentage for the best male parent candidate.} -#' \item{Best_Female_Parent}{Best-matching female parent candidate ID.} -#' \item{Best_Female_Parent_Error_Pct}{Homozygous mismatch percentage for the best female parent candidate.} +#' \item{Best_Male_Candidate}{Best-matching male parent candidate ID.} +#' \item{Best_Male_Candidate_Error_Pct}{Homozygous mismatch percentage for the best male candidate.} +#' \item{Best_Female_Candidate}{Best-matching female parent candidate ID.} +#' \item{Best_Female_Candidate_Error_Pct}{Homozygous mismatch percentage for the best female candidate.} #' } #' @export -#' @importFrom data.table fread fwrite copy data.table set rbindlist +#' @importFrom data.table fread fwrite copy data.table set rbindlist validate_pedigree <- function(pedigree_file, genotypes_file, founders_file = NULL, trio_error_threshold = 5.0, min_markers = 10, single_parent_error_threshold = 2.0, verbose = TRUE, - write_txt = TRUE, - output_filename = "pedigree_validation_results.txt") { - + write_results = TRUE, + output_filename = "__validation_report.txt", + plot_results = TRUE, + na_string = "NA") { + #### Input validation #### if (trio_error_threshold < 0 || trio_error_threshold > 100) stop("trio_error_threshold must be between 0 and 100") if (single_parent_error_threshold < 0 || single_parent_error_threshold > 100) stop("single_parent_error_threshold must be between 0 and 100") - + if (!na_string %in% c("NA", "")) + stop("na_string must be either 'NA' or ''.") + tryCatch({ pedigree <- data.table::fread(pedigree_file, sep = "auto", colClasses = "character") genos <- data.table::fread(genotypes_file, sep = "auto") }, error = function(e) { stop("Error reading input files. Ensure paths are correct and files are TXT/TSV/CSV.") }) - + #### Check required columns #### required_ped_cols <- c("ID", "Male_Parent", "Female_Parent") missing_cols <- base::setdiff(required_ped_cols, base::names(pedigree)) @@ -82,12 +92,11 @@ validate_pedigree <- function(pedigree_file, genotypes_file, base::paste(missing_cols, collapse = ", ")) if (!"ID" %in% base::names(genos)) stop("Genotypes file must have an 'ID' column") - - # Ensure parent columns are character for consistent "0" comparisons + pedigree[, Male_Parent := as.character(Male_Parent)] pedigree[, Female_Parent := as.character(Female_Parent)] original_pedigree <- data.table::copy(pedigree) - + #### Read founders list #### if (!is.null(founders_file)) { founders_raw <- tryCatch({ @@ -99,42 +108,36 @@ validate_pedigree <- function(pedigree_file, genotypes_file, } else { founder_ids <- character(0) } - + #### Identify trios missing from the genotype file #### valid_ids <- as.character(genos$ID) has_geno <- pedigree[ID %in% valid_ids & (Male_Parent %in% valid_ids | Male_Parent == "0") & (Female_Parent %in% valid_ids | Female_Parent == "0")] - no_geno_rows <- pedigree[!(ID %in% valid_ids) | (!(Male_Parent %in% valid_ids) & Male_Parent != "0") | (!(Female_Parent %in% valid_ids) & Female_Parent != "0")] - if (base::nrow(no_geno_rows) > 0 && verbose) base::cat("Found", base::nrow(no_geno_rows), "trios with missing genotype data; flagged as NO_GENOTYPE_DATA.\n") - pedigree <- has_geno if (base::nrow(pedigree) == 0) stop("No valid trios remain after filtering for genotype availability.") - + #### Build genotype matrices #### genos_mat <- base::as.matrix(genos, rownames = "ID") - - # Homozygous-only matrix (het markers set to NA) genos_hom <- data.table::copy(genos) marker_cols <- base::setdiff(base::names(genos_hom), "ID") for (col in marker_cols) genos_hom[base::get(col) == 1, (col) := NA_integer_] genos_hom_mat <- base::as.matrix(genos_hom, rownames = "ID") - - #### Helper: find best matching parent via homozygous mismatch #### + + #### Find best matching parent via homozygous mismatch #### find_best_parent <- function(prog_id, exclude_ids = base::character(0)) { candidates <- base::setdiff(base::rownames(genos_hom_mat), c(prog_id, exclude_ids)) if (base::length(candidates) == 0) return(base::list(id = NA_character_, error_pct = NA_real_)) - prog_hom <- genos_hom_mat[prog_id, ] errors <- base::sapply(candidates, function(cand_id) { cand_hom <- genos_hom_mat[cand_id, ] @@ -142,20 +145,16 @@ validate_pedigree <- function(pedigree_file, genotypes_file, if (comparisons == 0) return(NA_real_) (base::sum(cand_hom != prog_hom, na.rm = TRUE) / comparisons) * 100 }) - best_idx <- base::which.min(errors) base::list(id = candidates[best_idx], error_pct = base::round(errors[best_idx], 2)) } - + #### Main trio evaluation loop #### results_list <- base::lapply(base::seq_len(base::nrow(pedigree)), function(i) { - prog_id <- pedigree$ID[i] male_parent_id <- pedigree$Male_Parent[i] female_parent_id <- pedigree$Female_Parent[i] - - # Default values correction_decision <- "NONE" error_pct <- NA_real_ status <- "NO_DATA" @@ -166,51 +165,37 @@ validate_pedigree <- function(pedigree_file, genotypes_file, best_male_parent_pct <- NA_real_ best_female_parent <- NA_character_ best_female_parent_pct <- NA_real_ - - ## Founder check — both parents "0" and ID in founders list + if (male_parent_id == "0" && female_parent_id == "0" && prog_id %in% founder_ids) { status <- "FOUNDERS" correction_decision <- "NONE" - } else { - - ## Missing parent(s) — recommendations only, "0"s preserved in pedigree if (male_parent_id == "0" && female_parent_id == "0") { status <- "MISSING_BOTH_PARENTS" correction_decision <- "NONE" - best_m <- find_best_parent(prog_id, exclude_ids = character(0)) best_male_parent <- best_m$id best_male_parent_pct <- best_m$error_pct - best_f <- find_best_parent(prog_id, exclude_ids = c(best_m$id)) best_female_parent <- best_f$id best_female_parent_pct <- best_f$error_pct - } else if (male_parent_id == "0" && female_parent_id != "0") { status <- "MISSING_MALE_PARENT" correction_decision <- "NONE" - best_m <- find_best_parent(prog_id, exclude_ids = c(female_parent_id)) best_male_parent <- best_m$id best_male_parent_pct <- best_m$error_pct - } else if (male_parent_id != "0" && female_parent_id == "0") { status <- "MISSING_FEMALE_PARENT" correction_decision <- "NONE" - best_f <- find_best_parent(prog_id, exclude_ids = c(male_parent_id)) best_female_parent <- best_f$id best_female_parent_pct <- best_f$error_pct - } else { - - ## Both parents present — Mendelian error calculation progeny_vec <- genos_mat[prog_id, ] male_parent_vec <- genos_mat[male_parent_id, ] female_parent_vec <- genos_mat[female_parent_id, ] - mismatches <- base::sum( (male_parent_vec == 0 & female_parent_vec == 0 & progeny_vec > 0) | (male_parent_vec == 2 & female_parent_vec == 2 & progeny_vec < 2) | @@ -222,19 +207,14 @@ validate_pedigree <- function(pedigree_file, genotypes_file, (male_parent_vec == 2 & female_parent_vec == 0)) & (progeny_vec != 1), na.rm = TRUE ) - markers_tested <- base::sum(!base::is.na(male_parent_vec) & !base::is.na(female_parent_vec) & !base::is.na(progeny_vec)) - if (markers_tested == 0) { status <- "NO_DATA" correction_decision <- "NONE" - } else { error_pct <- (mismatches / markers_tested) * 100 - - # LOW_MARKERS still computes parent mismatch/recommendations if (markers_tested < min_markers) { status <- "LOW_MARKERS" } else if (error_pct <= trio_error_threshold) { @@ -243,32 +223,24 @@ validate_pedigree <- function(pedigree_file, genotypes_file, } else { status <- "FAIL" } - - # Run parent-level evaluation for both FAIL and LOW_MARKERS if (status %in% c("FAIL", "LOW_MARKERS")) { - - # Homozygous mismatch per parent progeny_hom <- genos_hom_mat[prog_id, ] male_parent_hom <- genos_hom_mat[male_parent_id, ] female_parent_hom <- genos_hom_mat[female_parent_id, ] - male_comparisons <- base::sum(!base::is.na(male_parent_hom) & !base::is.na(progeny_hom)) male_parent_error_pct <- if (male_comparisons == 0) NA_real_ else base::round((base::sum(male_parent_hom != progeny_hom, na.rm = TRUE) / male_comparisons) * 100, 2) - female_comparisons <- base::sum(!base::is.na(female_parent_hom) & !base::is.na(progeny_hom)) female_parent_error_pct <- if (female_comparisons == 0) NA_real_ else base::round((base::sum(female_parent_hom != progeny_hom, na.rm = TRUE) / female_comparisons) * 100, 2) - male_acceptable <- !is.na(male_parent_error_pct) && male_parent_error_pct <= single_parent_error_threshold female_acceptable <- !is.na(female_parent_error_pct) && female_parent_error_pct <= single_parent_error_threshold - if (male_acceptable && female_acceptable) { correction_decision <- "KEEP_BOTH" } else if (male_acceptable && !female_acceptable) { @@ -290,61 +262,58 @@ validate_pedigree <- function(pedigree_file, genotypes_file, best_female_parent <- best_f$id best_female_parent_pct <- best_f$error_pct } - - # Do not alter corrected pedigree for LOW_MARKERS rows if (status == "LOW_MARKERS") correction_decision <- paste0("LOW_MARKERS_", correction_decision) } } } } - + data.table::data.table( - ID = prog_id, - Male_Parent = male_parent_id, - Female_Parent = female_parent_id, - Mendelian_Error_Pct = base::round(error_pct, 2), - Markers_Tested = markers_tested, - Status = status, - Correction_Decision = correction_decision, - Male_Parent_Hom_Error_Pct = male_parent_error_pct, - Female_Parent_Hom_Error_Pct = female_parent_error_pct, - Best_Male_Parent = best_male_parent, - Best_Male_Parent_Error_Pct = best_male_parent_pct, - Best_Female_Parent = best_female_parent, - Best_Female_Parent_Error_Pct = best_female_parent_pct + ID = prog_id, + Orig_Male_Parent = male_parent_id, + Orig_Female_Parent = female_parent_id, + Trio_Mendelian_Error_Pct = base::round(error_pct, 2), + Trio_Markers_Tested = markers_tested, + Status = status, + Recommended_Correction = correction_decision, + Male_Parent_Hom_Error_Pct = male_parent_error_pct, + Female_Parent_Hom_Error_Pct = female_parent_error_pct, + Best_Male_Candidate = best_male_parent, + Best_Male_Candidate_Error_Pct = best_male_parent_pct, + Best_Female_Candidate = best_female_parent, + Best_Female_Candidate_Error_Pct = best_female_parent_pct ) }) - + final_df <- data.table::rbindlist(results_list) - - #### Append NO_GENOTYPE_DATA rows to the final report #### + + #### Append NO_GENOTYPE_DATA rows #### if (base::nrow(no_geno_rows) > 0) { no_geno_df <- data.table::data.table( - ID = no_geno_rows$ID, - Male_Parent = no_geno_rows$Male_Parent, - Female_Parent = no_geno_rows$Female_Parent, - Mendelian_Error_Pct = NA_real_, - Markers_Tested = 0L, - Status = "NO_GENOTYPE_DATA", - Correction_Decision = "NONE", - Male_Parent_Hom_Error_Pct = NA_real_, - Female_Parent_Hom_Error_Pct = NA_real_, - Best_Male_Parent = NA_character_, - Best_Male_Parent_Error_Pct = NA_real_, - Best_Female_Parent = NA_character_, - Best_Female_Parent_Error_Pct = NA_real_ + ID = no_geno_rows$ID, + Orig_Male_Parent = no_geno_rows$Male_Parent, + Orig_Female_Parent = no_geno_rows$Female_Parent, + Trio_Mendelian_Error_Pct = NA_real_, + Trio_Markers_Tested = 0L, + Status = "NO_GENOTYPE_DATA", + Recommended_Correction = "NONE", + Male_Parent_Hom_Error_Pct = NA_real_, + Female_Parent_Hom_Error_Pct = NA_real_, + Best_Male_Candidate = NA_character_, + Best_Male_Candidate_Error_Pct = NA_real_, + Best_Female_Candidate = NA_character_, + Best_Female_Candidate_Error_Pct = NA_real_ ) final_df <- data.table::rbindlist(list(final_df, no_geno_df)) } - + #### Write corrected pedigree #### corrected_pedigree <- data.table::copy(original_pedigree) for (i in base::seq_len(base::nrow(final_df))) { prog_id <- final_df$ID[i] - decision <- final_df$Correction_Decision[i] + decision <- final_df$Recommended_Correction[i] row_idx <- base::which(corrected_pedigree$ID == prog_id) - if (decision == "REMOVE_MALE_PARENT") { data.table::set(corrected_pedigree, row_idx, "Male_Parent", "0") } else if (decision == "REMOVE_FEMALE_PARENT") { @@ -354,7 +323,6 @@ validate_pedigree <- function(pedigree_file, genotypes_file, data.table::set(corrected_pedigree, row_idx, "Female_Parent", "0") } } - tryCatch({ data.table::fwrite(corrected_pedigree, file = "corrected_pedigree.txt", sep = "\t", quote = FALSE) @@ -362,7 +330,7 @@ validate_pedigree <- function(pedigree_file, genotypes_file, }, error = function(e) { warning("Could not write corrected pedigree. Error: ", e$message, call. = FALSE) }) - + #### Summary output #### if (verbose) { total_trios <- base::nrow(final_df) @@ -376,8 +344,7 @@ validate_pedigree <- function(pedigree_file, genotypes_file, base::cat("Error threshold:", trio_error_threshold, "%\n") base::cat("Homozygous threshold:", single_parent_error_threshold, "%\n") base::cat("Minimum markers required:", min_markers, "\n\n") - - corrections <- base::table(final_df$Correction_Decision) + corrections <- base::table(final_df$Recommended_Correction) base::cat("Correction summary:\n") for (decision in base::names(corrections)) if (decision != "NONE") @@ -385,17 +352,86 @@ validate_pedigree <- function(pedigree_file, genotypes_file, base::cat("\n") base::print(final_df) } - + #### Write results #### - if (write_txt) { + if (write_results) { tryCatch({ data.table::fwrite(final_df, file = output_filename, - sep = "\t", quote = FALSE) + sep = "\t", quote = FALSE, na = na_string) # <-- na_string used here if (verbose) base::cat("Results written to:", output_filename, "\n") }, error = function(e) { warning("Could not write results. Error: ", e$message, call. = FALSE) }) } - + + #### Plot results #### + if (plot_results) { + if (!requireNamespace("ggplot2", quietly = TRUE)) { + warning("ggplot2 is required for plot_results = TRUE. Please install it.", call. = FALSE) + } else { + plot_df <- final_df[!is.na(final_df$Trio_Mendelian_Error_Pct)] + plot_df$Plot_Status <- dplyr::case_when( + plot_df$Recommended_Correction %in% c("NONE", + "KEEP_BOTH", + "LOW_MARKERS_KEEP_BOTH") ~ "PASS", + plot_df$Recommended_Correction %in% c("REMOVE_MALE_PARENT", + "REMOVE_FEMALE_PARENT", + "LOW_MARKERS_REMOVE_MALE_PARENT", + "LOW_MARKERS_REMOVE_FEMALE_PARENT") ~ "FAIL_ONE_PARENT", + plot_df$Recommended_Correction %in% c("REMOVE_BOTH", + "LOW_MARKERS_REMOVE_BOTH") ~ "FAIL_BOTH_PARENTS", + TRUE ~ "OTHER" + ) + n_total <- nrow(plot_df) + n_fail <- sum(plot_df$Trio_Mendelian_Error_Pct > trio_error_threshold) + n_pass <- sum(plot_df$Trio_Mendelian_Error_Pct <= trio_error_threshold) + pct_lost <- round((n_fail / n_total) * 100, 1) + threshold_label <- paste0("Mendelian Error Threshold: ", trio_error_threshold, "% | ", + "Lost: ", n_fail, " trios | ", + "Kept: ", n_pass, " trios") + p <- ggplot2::ggplot(plot_df, + ggplot2::aes(x = Trio_Mendelian_Error_Pct, fill = Plot_Status)) + + ggplot2::geom_histogram(binwidth = 1, color = "white", alpha = 0.9) + + ggplot2::geom_vline(xintercept = trio_error_threshold, + linetype = "dashed", color = "black", linewidth = 1) + + ggplot2::scale_x_continuous(breaks = seq(0, 100, by = 5)) + + ggplot2::scale_y_continuous(breaks = seq(0, 100, by = 5)) + + ggplot2::scale_fill_manual( + values = c( + "PASS" = "#339900", + "FAIL_ONE_PARENT" = "#F1C40F", + "FAIL_BOTH_PARENTS" = "#cc3333", + "OTHER" = "#BDC3C7" + ), + labels = c( + "PASS" = "Pass", + "FAIL_ONE_PARENT" = "Fail - One Parent", + "FAIL_BOTH_PARENTS" = "Fail - Both Parents", + "OTHER" = "Other" + ) + ) + + ggplot2::labs( + title = "Trio Mendelian Error Distribution", + subtitle = paste0("Trios with Genotype Data Tested: ", n_total, "\n \n", threshold_label), + x = "Mendelian Error (%)", + y = "Number of Trios", + fill = "Status" + ) + + ggplot2::theme_classic(base_size = 13) + + ggplot2::theme(legend.position = "top") + print(p) + if (write_results) { + plot_filename <- base::sub("\\.[^.]+$", ".jpg", output_filename) + tryCatch({ + ggplot2::ggsave(plot_filename, plot = p, + device = "jpeg", width = 10, height = 6, dpi = 300) + if (verbose) base::cat("Plot saved to:", plot_filename, "\n") + }, error = function(e) { + warning("Could not save plot. Error: ", e$message, call. = FALSE) + }) + } + } + } + return(base::invisible(final_df)) -} +} \ No newline at end of file diff --git a/man/find_parentage.Rd b/man/find_parentage.Rd index 706a15f..44b9772 100644 --- a/man/find_parentage.Rd +++ b/man/find_parentage.Rd @@ -14,7 +14,9 @@ find_parentage( show_ties = TRUE, allow_selfing = TRUE, verbose = TRUE, - write_txt = TRUE + write_txt = TRUE, + na_string = "NA", + plot_results = TRUE ) } \arguments{ @@ -23,117 +25,77 @@ Must include an 'ID' column followed by marker columns coded as 0, 1, 2 (allele dosage).} \item{parents_file}{Path to a TSV/CSV/TXT file listing candidate parent IDs. -Must include an 'ID' column. An optional 'Sex' column with values -'M' (male parent), 'F' (female parent), or 'A' (ambiguous) determines -which parents are tested for each role. If absent, all parents are treated -as ambiguous.} +Must include an 'ID' column. An optional 'Sex' column ('M', 'F', or 'A') +determines parent roles. If absent, all parents are treated as ambiguous.} -\item{progeny_file}{Path to a TSV/CSV/TXT file listing progeny IDs to assign. +\item{progeny_file}{Path to a TSV/CSV/TXT file listing progeny IDs. Must include an 'ID' column.} -\item{method}{Character. Parentage assignment method. One of: -\itemize{ -\item \code{"best_male_parent"} — finds the best male parent for each -progeny using homozygous mismatch rate. -\item \code{"best_female_parent"} — finds the best female parent for each -progeny using homozygous mismatch rate. -\item \code{"best_match"} — finds the single best parent (either sex) -using homozygous mismatch rate. -\item \code{"best_pair"} — finds the best male-female parent pair for -each progeny using full Mendelian error rate (default). -}} - -\item{min_markers}{Integer. Minimum number of non-missing markers required -to report a parentage assignment. Progeny-parent comparisons with fewer -markers are flagged as \code{LOW_MARKERS} and no assignment is made -(default: \code{10}).} - -\item{error_threshold}{Numeric. Maximum mismatch percentage to report a -parentage assignment as confident. Assignments above this threshold are -flagged as \code{HIGH_ERROR} in the \code{Assignment_Status} column -(default: \code{5.0}). Must be between 0 and 100.} - -\item{show_ties}{Logical. If \code{TRUE}, all tied best pairs (after -tie-breaking by maximum markers tested) are reported as additional columns -(\code{Male_Parent_1}, \code{Male_Parent_2}, etc.) when -\code{method = "best_pair"}. The base columns (\code{Male_Parent}, -\code{Female_Parent}, etc.) are always populated with the top result. -If \code{FALSE}, only one tied pair is reported with a warning. -Default is \code{TRUE}.} +\item{method}{Character. One of \code{"best_male_parent"}, +\code{"best_female_parent"}, \code{"best_match"}, or +\code{"best_pair"} (default). See Details.} -\item{allow_selfing}{Logical. If \code{FALSE}, male-female parent pairs where -both IDs are identical are excluded when \code{method = "best_pair"}. -Default is \code{TRUE}.} +\item{min_markers}{Integer. Minimum markers required for a valid assignment. +Fewer markers flags \code{LOW_MARKERS} (default: \code{10}).} + +\item{error_threshold}{Numeric. Maximum mismatch percentage for a confident +assignment. Exceeded values flag \code{HIGH_ERROR} (default: \code{5.0}). +Must be between 0 and 100.} + +\item{show_ties}{Logical. If \code{TRUE}, tied best pairs are reported as +additional suffix columns (e.g. \code{Male_Parent_2}) when +\code{method = "best_pair"}. Default is \code{TRUE}.} + +\item{allow_selfing}{Logical. If \code{FALSE}, pairs with identical male and +female parent IDs are excluded. Default is \code{TRUE}.} -\item{verbose}{Logical. If \code{TRUE}, prints progress messages, summary -statistics, and the results table to the console. Default is \code{TRUE}.} +\item{verbose}{Logical. If \code{TRUE}, prints progress, summary, and +results to the console. Default is \code{TRUE}.} \item{write_txt}{Logical. If \code{TRUE}, writes results to -\code{parentage_results_dt.txt} in the working directory. Default is -\code{TRUE}.} +\code{parentage_testing_results.txt}. Default is \code{TRUE}.} + +\item{na_string}{Character. String for missing values in output file. +Use \code{"NA"} or \code{""} (default: \code{"NA"}).} + +\item{plot_results}{Logical. If \code{TRUE}, plots the Mendelian error +distribution colored by assignment status. Requires \code{ggplot2}. +Plot is saved as \code{.jpg} when \code{write_txt = TRUE}. +Default is \code{TRUE}.} } \value{ -A \code{data.table} with one row per progeny. Columns depend on the -method used: -\itemize{ -\item \code{best_male_parent} / \code{best_female_parent} / \code{best_match}: -\code{Progeny}, \code{Best_Match}, \code{Mendelian_Error_Pct}, -\code{Markers_Tested}, \code{Assignment_Status}. -\item \code{best_pair} (no ties after tie-breaking): \code{Progeny}, -\code{Male_Parent}, \code{Female_Parent}, \code{Mendelian_Error_Pct}, -\code{Markers_Tested}, \code{Assignment_Status}. -\item \code{best_pair} (ties remain after tie-breaking, -\code{show_ties = TRUE}): base columns are always populated with the -top result, plus suffix columns \code{Male_Parent_1}, -\code{Female_Parent_1}, etc. for each tied pair. -} -\code{Assignment_Status} is one of \code{PASS}, \code{HIGH_ERROR}, or -\code{LOW_MARKERS}. Returned invisibly when \code{verbose = TRUE}. +A \code{data.table} with one row per progeny. Base columns are +\code{Progeny}, \code{Male_Parent} (or \code{Best_Match}), +\code{Female_Parent}, \code{Mendelian_Error_Pct}, \code{Markers_Tested}, +and \code{Assignment_Status} (\code{PASS}, \code{HIGH_ERROR}, or +\code{LOW_MARKERS}). Tie suffix columns (\code{Male_Parent_2}, etc.) are +appended when \code{show_ties = TRUE} and ties remain after tie-breaking. +Returned invisibly when \code{verbose = TRUE}. } \description{ Assigns the most likely parent(s) to each progeny individual based on genotypic data using Mendelian error rates or homozygous mismatch rates. } \details{ -A homozygous-only genotype matrix is pre-computed once at startup and shared -across all methods that require it, avoiding redundant computation. - -For \code{"best_male_parent"}, \code{"best_female_parent"}, and -\code{"best_match"}, only homozygous markers (coded 0 or 2) are used for -comparison; heterozygous markers (coded 1) are set to \code{NA}. This -reduces false mismatches caused by phase ambiguity. - -For \code{"best_pair"}, all markers are used and full Mendelian inheritance -rules are applied. Mismatch rates and comparison counts are computed across -all progeny simultaneously using vectorised \code{vapply} calls, producing -\code{n_pairs x n_progeny} matrices and giving substantial speed gains for -large datasets. Both matrices are explicitly coerced to matrix form to -handle the edge case of a single parent pair correctly. - -When multiple pairs share the lowest Mendelian error rate, ties are broken -by selecting the pair(s) with the greatest number of markers tested. If ties -still remain after this step, all remaining tied pairs are reported when -\code{show_ties = TRUE}. - -The base columns (\code{Male_Parent}, \code{Female_Parent}, -\code{Mendelian_Error_Pct}, \code{Markers_Tested}) are always populated with -the top result, ensuring no missing values in these columns regardless of -tie status. - -Output rows are pre-allocated as a \code{data.table} and filled by reference -using \code{data.table::set()}, avoiding repeated memory allocation during -the results-building step. - -Individuals in \code{parents_file} or \code{progeny_file} that are absent -from \code{genotypes_file} are removed with a warning. - -Progeny with fewer non-missing markers than \code{min_markers} are flagged -\code{LOW_MARKERS} and no parent assignment is reported. Progeny whose best -match exceeds \code{error_threshold} are flagged \code{HIGH_ERROR}. +A homozygous-only matrix is pre-computed once and shared across +\code{"best_male_parent"}, \code{"best_female_parent"}, and +\code{"best_match"}, where heterozygous markers (coded 1) are set to +\code{NA} to reduce false mismatches from phase ambiguity. + +For \code{"best_pair"}, full Mendelian inheritance rules are applied across +all marker and pair combinations using vectorised \code{vapply} calls, +producing \code{n_pairs x n_progeny} mismatch and comparison matrices. + +Ties on mismatch rate are broken by maximum markers tested. Remaining ties +are reported as suffix columns when \code{show_ties = TRUE}. Base columns +are always populated with the top result. + +Parents or progeny absent from \code{genotypes_file} are removed with a +warning. Progeny below \code{min_markers} are flagged \code{LOW_MARKERS}; +those exceeding \code{error_threshold} are flagged \code{HIGH_ERROR}. } \examples{ \dontrun{ -# Assign best male-female parent pair to each progeny results <- find_parentage( genotypes_file = "genotypes.txt", parents_file = "parents.txt", @@ -142,10 +104,11 @@ results <- find_parentage( min_markers = 50, error_threshold = 5.0, show_ties = TRUE, - allow_selfing = FALSE + allow_selfing = FALSE, + na_string = "NA", + plot_results = TRUE ) -# Find best individual parent match (ignoring sex) results <- find_parentage( genotypes_file = "genotypes.txt", parents_file = "parents.txt", diff --git a/man/validate_pedigree.Rd b/man/validate_pedigree.Rd index 8f47a7d..8ec975b 100644 --- a/man/validate_pedigree.Rd +++ b/man/validate_pedigree.Rd @@ -12,8 +12,10 @@ validate_pedigree( min_markers = 10, single_parent_error_threshold = 2, verbose = TRUE, - write_txt = TRUE, - output_filename = "pedigree_validation_results.txt" + write_results = TRUE, + output_filename = "__validation_report.txt", + plot_results = TRUE, + na_string = "NA" ) } \arguments{ @@ -40,34 +42,42 @@ parent-level evaluation (default: 2.0). Must be between 0 and 100.} \item{verbose}{Logical. If TRUE, prints progress messages, a summary table, and results to the console (default: TRUE).} -\item{write_txt}{Logical. If TRUE, writes validation results to -output_filename (default: TRUE).} +\item{write_results}{Logical. If TRUE, writes the validation results to +output_filename and saves the plot as a .jpg file (default: TRUE).} -\item{output_filename}{Character. Path/name of the output file -(default: "pedigree_validation_results.txt").} +\item{output_filename}{Character. Path/name of the output results file +(default: "__validation_report.txt"). The plot will be saved using the +same base name with a .jpg extension.} + +\item{plot_results}{Logical. If TRUE, prints a histogram of Trio Mendelian +Error percentages with a threshold line (default: TRUE). If write_results +is also TRUE, the plot is additionally saved as a .jpg file.} + +\item{na_string}{Character. String for missing values in the output file. +Use \code{"NA"} or \code{""} (default: \code{"NA"}).} } \value{ A data.table (returned invisibly) with one row per trio and the following columns: \describe{ \item{ID}{Individual ID.} -\item{Male_Parent}{Declared male parent ID.} -\item{Female_Parent}{Declared female parent ID.} -\item{Mendelian_Error_Pct}{Trio-level Mendelian error percentage.} -\item{Markers_Tested}{Number of markers with non-missing genotypes.} +\item{Orig_Male_Parent}{Declared male parent ID.} +\item{Orig_Female_Parent}{Declared female parent ID.} +\item{Trio_Mendelian_Error_Pct}{Trio-level Mendelian error percentage.} +\item{Trio_Markers_Tested}{Number of markers with non-missing genotypes.} \item{Status}{One of PASS, FAIL, LOW_MARKERS, NO_DATA, FOUNDERS, MISSING_MALE_PARENT, MISSING_FEMALE_PARENT, MISSING_BOTH_PARENTS, or NO_GENOTYPE_DATA.} -\item{Correction_Decision}{One of NONE, KEEP_BOTH, +\item{Recommended_Correction}{One of NONE, KEEP_BOTH, REMOVE_MALE_PARENT, REMOVE_FEMALE_PARENT, REMOVE_BOTH, LOW_MARKERS_KEEP_BOTH, LOW_MARKERS_REMOVE_MALE_PARENT, LOW_MARKERS_REMOVE_FEMALE_PARENT, LOW_MARKERS_REMOVE_BOTH.} \item{Male_Parent_Hom_Error_Pct}{Male parent homozygous-marker mismatch percentage.} \item{Female_Parent_Hom_Error_Pct}{Female parent homozygous-marker mismatch percentage.} -\item{Best_Male_Parent}{Best-matching male parent candidate ID.} -\item{Best_Male_Parent_Error_Pct}{Homozygous mismatch percentage for the best male parent candidate.} -\item{Best_Female_Parent}{Best-matching female parent candidate ID.} -\item{Best_Female_Parent_Error_Pct}{Homozygous mismatch percentage for the best female parent candidate.} +\item{Best_Male_Candidate}{Best-matching male parent candidate ID.} +\item{Best_Male_Candidate_Error_Pct}{Homozygous mismatch percentage for the best male candidate.} +\item{Best_Female_Candidate}{Best-matching female parent candidate ID.} +\item{Best_Female_Candidate_Error_Pct}{Homozygous mismatch percentage for the best female candidate.} } } \description{ diff --git a/tests/testthat/test-find_parentage.R b/tests/testthat/test-find_parentage.R index 55e6913..d1dfb3c 100644 --- a/tests/testthat/test-find_parentage.R +++ b/tests/testthat/test-find_parentage.R @@ -17,11 +17,12 @@ make_files <- function(genos, parents, progeny, dir = tempdir()) { list(g = geno_file, p = parent_file, pr = progeny_file) } -# ------------------------------------------------------------------------------ +# ============================================================================== # Base toy data # S1 / D1: all 0 → child1 (all 0) is a perfect Mendelian child of S1 x D1 # S2 / D2: all 2 → child2 (all 2) is a perfect Mendelian child of S2 x D2 -# ------------------------------------------------------------------------------ +# ============================================================================== + base_genos <- data.table::data.table( ID = c("S1","S2","D1","D2","child1","child2"), M1 = c(0L, 2L, 0L, 2L, 0L, 2L), @@ -54,7 +55,7 @@ test_that("invalid method throws an error", { f <- make_files(base_genos, base_parents, child1_progeny) expect_error( find_parentage(f$g, f$p, f$pr, method = "bad_method", - verbose = FALSE, write_txt = FALSE), + verbose = FALSE, write_txt = FALSE, plot_results = FALSE), regexp = "Method must be one of" ) }) @@ -63,7 +64,7 @@ test_that("min_markers < 1 throws an error", { f <- make_files(base_genos, base_parents, child1_progeny) expect_error( find_parentage(f$g, f$p, f$pr, min_markers = 0, - verbose = FALSE, write_txt = FALSE), + verbose = FALSE, write_txt = FALSE, plot_results = FALSE), regexp = "min_markers" ) }) @@ -72,21 +73,31 @@ test_that("error_threshold out of range throws an error", { f <- make_files(base_genos, base_parents, child1_progeny) expect_error( find_parentage(f$g, f$p, f$pr, error_threshold = 150, - verbose = FALSE, write_txt = FALSE), + verbose = FALSE, write_txt = FALSE, plot_results = FALSE), regexp = "error_threshold" ) expect_error( find_parentage(f$g, f$p, f$pr, error_threshold = -1, - verbose = FALSE, write_txt = FALSE), + verbose = FALSE, write_txt = FALSE, plot_results = FALSE), regexp = "error_threshold" ) }) +test_that("invalid na_string throws an error", { + f <- make_files(base_genos, base_parents, child1_progeny) + expect_error( + find_parentage(f$g, f$p, f$pr, na_string = "NULL", + verbose = FALSE, write_txt = FALSE, plot_results = FALSE), + regexp = "na_string" + ) +}) + test_that("missing genotype file throws an error", { f <- make_files(base_genos, base_parents, child1_progeny) expect_error( find_parentage("nonexistent.txt", f$p, f$pr, - verbose = FALSE, write_txt = FALSE) + verbose = FALSE, write_txt = FALSE, plot_results = FALSE), + regexp = "Error reading input files" ) }) @@ -96,7 +107,7 @@ test_that("parent IDs absent from genotype file raise a warning and are dropped" f <- make_files(base_genos, extra_parents, child1_progeny) expect_warning( find_parentage(f$g, f$p, f$pr, method = "best_pair", - verbose = FALSE, write_txt = FALSE), + verbose = FALSE, write_txt = FALSE, plot_results = FALSE), regexp = "GHOST" ) }) @@ -107,7 +118,7 @@ test_that("progeny IDs absent from genotype file raise a warning and are dropped f <- make_files(base_genos, base_parents, extra_progeny) expect_warning( find_parentage(f$g, f$p, f$pr, method = "best_pair", - verbose = FALSE, write_txt = FALSE), + verbose = FALSE, write_txt = FALSE, plot_results = FALSE), regexp = "GHOST_KID" ) }) @@ -118,7 +129,7 @@ test_that("no valid progeny candidates after filtering stops with an error", { expect_warning( expect_error( find_parentage(f$g, f$p, f$pr, method = "best_pair", - verbose = FALSE, write_txt = FALSE), + verbose = FALSE, write_txt = FALSE, plot_results = FALSE), regexp = "No valid progeny" ) ) @@ -129,7 +140,7 @@ test_that("missing Sex column raises a warning and defaults to ambiguous", { f <- make_files(base_genos, parents_no_sex, child1_progeny) expect_warning( find_parentage(f$g, f$p, f$pr, method = "best_match", - verbose = FALSE, write_txt = FALSE), + verbose = FALSE, write_txt = FALSE, plot_results = FALSE), regexp = "Sex" ) }) @@ -141,7 +152,8 @@ test_that("missing Sex column raises a warning and defaults to ambiguous", { test_that("best_pair returns a data.table with expected columns (no ties)", { f <- make_files(base_genos, base_parents, child1_progeny) res <- find_parentage(f$g, f$p, f$pr, method = "best_pair", - show_ties = FALSE, verbose = FALSE, write_txt = FALSE) + show_ties = FALSE, verbose = FALSE, + write_txt = FALSE, plot_results = FALSE) expect_s3_class(res, "data.table") expect_true(all(c("Progeny","Male_Parent","Female_Parent", "Mendelian_Error_Pct","Markers_Tested", @@ -152,17 +164,18 @@ test_that("best_pair returns a data.table with expected columns (no ties)", { test_that("best_male_parent returns expected columns", { f <- make_files(base_genos, base_parents, child1_progeny) res <- find_parentage(f$g, f$p, f$pr, method = "best_male_parent", - verbose = FALSE, write_txt = FALSE) + verbose = FALSE, write_txt = FALSE, plot_results = FALSE) expect_s3_class(res, "data.table") expect_true(all(c("Progeny","Best_Match","Mendelian_Error_Pct", "Markers_Tested","Assignment_Status") %in% names(res))) + expect_false("Male_Parent" %in% names(res)) expect_equal(nrow(res), 1L) }) test_that("best_female_parent returns expected columns", { f <- make_files(base_genos, base_parents, child1_progeny) res <- find_parentage(f$g, f$p, f$pr, method = "best_female_parent", - verbose = FALSE, write_txt = FALSE) + verbose = FALSE, write_txt = FALSE, plot_results = FALSE) expect_s3_class(res, "data.table") expect_true(all(c("Progeny","Best_Match","Mendelian_Error_Pct", "Markers_Tested","Assignment_Status") %in% names(res))) @@ -172,7 +185,7 @@ test_that("best_female_parent returns expected columns", { test_that("best_match returns expected columns", { f <- make_files(base_genos, base_parents, child1_progeny) res <- find_parentage(f$g, f$p, f$pr, method = "best_match", - verbose = FALSE, write_txt = FALSE) + verbose = FALSE, write_txt = FALSE, plot_results = FALSE) expect_s3_class(res, "data.table") expect_true(all(c("Progeny","Best_Match","Mendelian_Error_Pct", "Markers_Tested","Assignment_Status") %in% names(res))) @@ -183,7 +196,7 @@ test_that("one row returned per progeny for single-parent methods", { f <- make_files(base_genos, base_parents, child1_progeny) for (m in c("best_male_parent","best_female_parent","best_match")) { res <- find_parentage(f$g, f$p, f$pr, method = m, - verbose = FALSE, write_txt = FALSE) + verbose = FALSE, write_txt = FALSE, plot_results = FALSE) expect_equal(nrow(res), 1L, label = paste("row count for method", m)) } }) @@ -191,14 +204,16 @@ test_that("one row returned per progeny for single-parent methods", { test_that("Markers_Tested equals the number of non-NA marker columns", { f <- make_files(base_genos, base_parents, child1_progeny) res <- find_parentage(f$g, f$p, f$pr, method = "best_pair", - show_ties = FALSE, verbose = FALSE, write_txt = FALSE) + show_ties = FALSE, verbose = FALSE, + write_txt = FALSE, plot_results = FALSE) expect_equal(res$Markers_Tested, ncol(base_genos) - 1L) }) test_that("Mendelian_Error_Pct is between 0 and 100", { f <- make_files(base_genos, base_parents, child1_progeny) res <- find_parentage(f$g, f$p, f$pr, method = "best_pair", - show_ties = FALSE, verbose = FALSE, write_txt = FALSE) + show_ties = FALSE, verbose = FALSE, + write_txt = FALSE, plot_results = FALSE) pct <- as.numeric(res$Mendelian_Error_Pct) expect_true(all(pct >= 0 & pct <= 100, na.rm = TRUE)) }) @@ -210,7 +225,8 @@ test_that("Mendelian_Error_Pct is between 0 and 100", { test_that("best_pair correctly identifies S1 x D1 for child1 with 0% error", { f <- make_files(base_genos, base_parents, child1_progeny) res <- find_parentage(f$g, f$p, f$pr, method = "best_pair", - show_ties = FALSE, verbose = FALSE, write_txt = FALSE) + show_ties = FALSE, verbose = FALSE, + write_txt = FALSE, plot_results = FALSE) expect_equal(res$Male_Parent, "S1") expect_equal(res$Female_Parent, "D1") expect_equal(as.numeric(res$Mendelian_Error_Pct), 0) @@ -219,7 +235,8 @@ test_that("best_pair correctly identifies S1 x D1 for child1 with 0% error", { test_that("best_pair correctly identifies S2 x D2 for child2 with 0% error", { f <- make_files(base_genos, base_parents, child2_progeny) res <- find_parentage(f$g, f$p, f$pr, method = "best_pair", - show_ties = FALSE, verbose = FALSE, write_txt = FALSE) + show_ties = FALSE, verbose = FALSE, + write_txt = FALSE, plot_results = FALSE) expect_equal(res$Male_Parent, "S2") expect_equal(res$Female_Parent, "D2") expect_equal(as.numeric(res$Mendelian_Error_Pct), 0) @@ -228,24 +245,39 @@ test_that("best_pair correctly identifies S2 x D2 for child2 with 0% error", { test_that("best_male_parent identifies S1 as best male for child1", { f <- make_files(base_genos, base_parents, child1_progeny) res <- find_parentage(f$g, f$p, f$pr, method = "best_male_parent", - verbose = FALSE, write_txt = FALSE) + verbose = FALSE, write_txt = FALSE, plot_results = FALSE) expect_equal(res$Best_Match, "S1") }) test_that("best_female_parent identifies D1 as best female for child1", { f <- make_files(base_genos, base_parents, child1_progeny) res <- find_parentage(f$g, f$p, f$pr, method = "best_female_parent", - verbose = FALSE, write_txt = FALSE) + verbose = FALSE, write_txt = FALSE, plot_results = FALSE) expect_equal(res$Best_Match, "D1") }) test_that("Mendelian_Error_Pct is 0 for a perfect parent-progeny trio", { f <- make_files(base_genos, base_parents, child1_progeny) res <- find_parentage(f$g, f$p, f$pr, method = "best_pair", - show_ties = FALSE, verbose = FALSE, write_txt = FALSE) + show_ties = FALSE, verbose = FALSE, + write_txt = FALSE, plot_results = FALSE) expect_equal(as.numeric(res$Mendelian_Error_Pct), 0) }) +test_that("both child1 and child2 are correctly assigned when run together", { + f <- make_files(base_genos, base_parents, base_progeny) + res <- find_parentage(f$g, f$p, f$pr, method = "best_pair", + show_ties = FALSE, verbose = FALSE, + write_txt = FALSE, plot_results = FALSE) + expect_equal(nrow(res), 2L) + child1_row <- res[Progeny == "child1"] + child2_row <- res[Progeny == "child2"] + expect_equal(child1_row$Male_Parent, "S1") + expect_equal(child1_row$Female_Parent, "D1") + expect_equal(child2_row$Male_Parent, "S2") + expect_equal(child2_row$Female_Parent, "D2") +}) + # ============================================================================== # 4. Assignment_Status — min_markers and error_threshold # ============================================================================== @@ -254,7 +286,8 @@ test_that("Assignment_Status = PASS for perfect trio within thresholds", { f <- make_files(base_genos, base_parents, child1_progeny) res <- find_parentage(f$g, f$p, f$pr, method = "best_pair", min_markers = 3, error_threshold = 5.0, - show_ties = FALSE, verbose = FALSE, write_txt = FALSE) + show_ties = FALSE, verbose = FALSE, + write_txt = FALSE, plot_results = FALSE) expect_equal(res$Assignment_Status, "PASS") }) @@ -262,7 +295,8 @@ test_that("Assignment_Status = LOW_MARKERS when min_markers exceeds available ma f <- make_files(base_genos, base_parents, child1_progeny) res <- find_parentage(f$g, f$p, f$pr, method = "best_pair", min_markers = 99999, error_threshold = 5.0, - show_ties = FALSE, verbose = FALSE, write_txt = FALSE) + show_ties = FALSE, verbose = FALSE, + write_txt = FALSE, plot_results = FALSE) expect_equal(res$Assignment_Status, "LOW_MARKERS") }) @@ -281,7 +315,8 @@ test_that("Assignment_Status = HIGH_ERROR when error rate exceeds threshold", { f <- make_files(high_error_genos, parents, progeny) res <- find_parentage(f$g, f$p, f$pr, method = "best_pair", min_markers = 3, error_threshold = 5.0, - show_ties = FALSE, verbose = FALSE, write_txt = FALSE) + show_ties = FALSE, verbose = FALSE, + write_txt = FALSE, plot_results = FALSE) expect_equal(res$Assignment_Status, "HIGH_ERROR") }) @@ -289,12 +324,24 @@ test_that("Assignment_Status column is present in all methods", { f <- make_files(base_genos, base_parents, child1_progeny) for (m in c("best_pair","best_male_parent","best_female_parent","best_match")) { res <- find_parentage(f$g, f$p, f$pr, method = m, - show_ties = FALSE, verbose = FALSE, write_txt = FALSE) + show_ties = FALSE, verbose = FALSE, + write_txt = FALSE, plot_results = FALSE) expect_true("Assignment_Status" %in% names(res), label = paste("Assignment_Status present for method", m)) } }) +test_that("LOW_MARKERS is flagged for single-parent methods too", { + f <- make_files(base_genos, base_parents, child1_progeny) + for (m in c("best_male_parent","best_female_parent","best_match")) { + res <- find_parentage(f$g, f$p, f$pr, method = m, + min_markers = 99999, verbose = FALSE, + write_txt = FALSE, plot_results = FALSE) + expect_equal(res$Assignment_Status, "LOW_MARKERS", + label = paste("LOW_MARKERS for method", m)) + } +}) + # ============================================================================== # 5. allow_selfing # ============================================================================== @@ -302,11 +349,10 @@ test_that("Assignment_Status column is present in all methods", { test_that("allow_selfing = FALSE removes self-pairs from candidates", { ambig_parents <- data.table::data.table(ID = c("S1","D1"), Sex = c("A","A")) f <- make_files(base_genos, ambig_parents, child1_progeny) - expect_warning( - res <- find_parentage(f$g, f$p, f$pr, method = "best_pair", - allow_selfing = FALSE, show_ties = FALSE, - verbose = FALSE, write_txt = FALSE), - regexp = "tied" + res <- suppressWarnings( + find_parentage(f$g, f$p, f$pr, method = "best_pair", + allow_selfing = FALSE, show_ties = FALSE, + verbose = FALSE, write_txt = FALSE, plot_results = FALSE) ) if (!is.na(res$Male_Parent) && !is.na(res$Female_Parent)) expect_false(res$Male_Parent == res$Female_Parent) @@ -316,10 +362,11 @@ test_that("allow_selfing = FALSE removes self-pairs from candidates", { # 6. show_ties # ============================================================================== -test_that("show_ties = TRUE produces _1/_2 suffixed columns when ties exist", { +test_that("show_ties = TRUE produces suffixed columns when ties exist", { f <- make_files(tied_genos, tied_parents, tied_progeny) res <- find_parentage(f$g, f$p, f$pr, method = "best_pair", - show_ties = TRUE, verbose = FALSE, write_txt = FALSE) + show_ties = TRUE, verbose = FALSE, + write_txt = FALSE, plot_results = FALSE) expect_true(any(grepl("^Male_Parent_", names(res)))) expect_true(any(grepl("^Female_Parent_", names(res)))) }) @@ -328,7 +375,8 @@ test_that("show_ties = FALSE warns about ties and returns single-result columns" f <- make_files(tied_genos, tied_parents, tied_progeny) expect_warning( res <- find_parentage(f$g, f$p, f$pr, method = "best_pair", - show_ties = FALSE, verbose = FALSE, write_txt = FALSE), + show_ties = FALSE, verbose = FALSE, + write_txt = FALSE, plot_results = FALSE), regexp = "tied" ) expect_true("Male_Parent" %in% names(res)) @@ -337,33 +385,42 @@ test_that("show_ties = FALSE warns about ties and returns single-result columns" expect_false(any(grepl("^Female_Parent_\\d", names(res)))) }) +test_that("base columns are always populated even when ties exist", { + f <- make_files(tied_genos, tied_parents, tied_progeny) + res <- find_parentage(f$g, f$p, f$pr, method = "best_pair", + show_ties = TRUE, verbose = FALSE, + write_txt = FALSE, plot_results = FALSE) + expect_false(is.na(res$Male_Parent[1])) + expect_false(is.na(res$Female_Parent[1])) +}) + # ============================================================================== # 7. verbose / write_txt # ============================================================================== -test_that("verbose = TRUE returns the result invisibly as data.table", { +test_that("verbose = TRUE returns the result as data.table", { f <- make_files(base_genos, base_parents, child1_progeny) res <- find_parentage(f$g, f$p, f$pr, method = "best_pair", - verbose = TRUE, write_txt = FALSE) + verbose = TRUE, write_txt = FALSE, plot_results = FALSE) expect_s3_class(res, "data.table") }) test_that("verbose = FALSE returns the result as data.table", { f <- make_files(base_genos, base_parents, child1_progeny) res <- find_parentage(f$g, f$p, f$pr, method = "best_pair", - verbose = FALSE, write_txt = FALSE) + verbose = FALSE, write_txt = FALSE, plot_results = FALSE) expect_s3_class(res, "data.table") }) -test_that("write_txt = TRUE creates the output file", { +test_that("write_txt = TRUE creates parentage_testing_results.txt", { old_wd <- getwd() tmp <- tempdir() setwd(tmp) on.exit(setwd(old_wd), add = TRUE) f <- make_files(base_genos, base_parents, child1_progeny, dir = tmp) find_parentage(f$g, f$p, f$pr, method = "best_pair", - verbose = FALSE, write_txt = TRUE) - expect_true(file.exists(file.path(tmp, "parentage_results_dt.txt"))) + verbose = FALSE, write_txt = TRUE, plot_results = FALSE) + expect_true(file.exists(file.path(tmp, "parentage_testing_results.txt"))) }) test_that("write_txt = FALSE does not create the output file", { @@ -371,14 +428,23 @@ test_that("write_txt = FALSE does not create the output file", { tmp <- tempdir() setwd(tmp) on.exit(setwd(old_wd), add = TRUE) - out_file <- file.path(tmp, "parentage_results_dt.txt") + out_file <- file.path(tmp, "parentage_testing_results.txt") if (file.exists(out_file)) file.remove(out_file) f <- make_files(base_genos, base_parents, child1_progeny, dir = tmp) find_parentage(f$g, f$p, f$pr, method = "best_pair", - verbose = FALSE, write_txt = FALSE) + verbose = FALSE, write_txt = FALSE, plot_results = FALSE) expect_false(file.exists(out_file)) }) +test_that("na_string = '' is accepted without error", { + f <- make_files(base_genos, base_parents, child1_progeny) + expect_no_error( + find_parentage(f$g, f$p, f$pr, method = "best_pair", + na_string = "", verbose = FALSE, + write_txt = FALSE, plot_results = FALSE) + ) +}) + # ============================================================================== # 8. Sex-based candidate filtering # ============================================================================== @@ -386,7 +452,7 @@ test_that("write_txt = FALSE does not create the output file", { test_that("best_male_parent only assigns M or A parents", { f <- make_files(base_genos, base_parents, child1_progeny) res <- find_parentage(f$g, f$p, f$pr, method = "best_male_parent", - verbose = FALSE, write_txt = FALSE) + verbose = FALSE, write_txt = FALSE, plot_results = FALSE) valid_male_parents <- base_parents[Sex %in% c("M","A")]$ID expect_true(res$Best_Match %in% valid_male_parents) }) @@ -394,11 +460,29 @@ test_that("best_male_parent only assigns M or A parents", { test_that("best_female_parent only assigns F or A parents", { f <- make_files(base_genos, base_parents, child1_progeny) res <- find_parentage(f$g, f$p, f$pr, method = "best_female_parent", - verbose = FALSE, write_txt = FALSE) + verbose = FALSE, write_txt = FALSE, plot_results = FALSE) valid_female_parents <- base_parents[Sex %in% c("F","A")]$ID expect_true(res$Best_Match %in% valid_female_parents) }) +test_that("best_pair male slot contains only M or A parents", { + f <- make_files(base_genos, base_parents, child1_progeny) + res <- find_parentage(f$g, f$p, f$pr, method = "best_pair", + show_ties = FALSE, verbose = FALSE, + write_txt = FALSE, plot_results = FALSE) + valid_males <- base_parents[Sex %in% c("M","A")]$ID + expect_true(res$Male_Parent %in% valid_males) +}) + +test_that("best_pair female slot contains only F or A parents", { + f <- make_files(base_genos, base_parents, child1_progeny) + res <- find_parentage(f$g, f$p, f$pr, method = "best_pair", + show_ties = FALSE, verbose = FALSE, + write_txt = FALSE, plot_results = FALSE) + valid_females <- base_parents[Sex %in% c("F","A")]$ID + expect_true(res$Female_Parent %in% valid_females) +}) + # ============================================================================== # 9. Edge cases # ============================================================================== @@ -406,7 +490,8 @@ test_that("best_female_parent only assigns F or A parents", { test_that("single progeny individual is handled correctly", { f <- make_files(base_genos, base_parents, child1_progeny) res <- find_parentage(f$g, f$p, f$pr, method = "best_pair", - show_ties = FALSE, verbose = FALSE, write_txt = FALSE) + show_ties = FALSE, verbose = FALSE, + write_txt = FALSE, plot_results = FALSE) expect_equal(nrow(res), 1L) }) @@ -416,20 +501,33 @@ test_that("all-NA marker column does not cause an error", { f <- make_files(na_genos, base_parents, child1_progeny) expect_no_error( find_parentage(f$g, f$p, f$pr, method = "best_pair", - verbose = FALSE, write_txt = FALSE) + verbose = FALSE, write_txt = FALSE, plot_results = FALSE) ) }) test_that("Progeny column contains the correct progeny IDs", { f <- make_files(base_genos, base_parents, child1_progeny) res <- find_parentage(f$g, f$p, f$pr, method = "best_pair", - show_ties = FALSE, verbose = FALSE, write_txt = FALSE) + show_ties = FALSE, verbose = FALSE, + write_txt = FALSE, plot_results = FALSE) expect_setequal(res$Progeny, child1_progeny$ID) }) test_that("multiple progeny are all represented in output", { f <- make_files(base_genos, base_parents, base_progeny) res <- find_parentage(f$g, f$p, f$pr, method = "best_pair", - show_ties = FALSE, verbose = FALSE, write_txt = FALSE) + show_ties = FALSE, verbose = FALSE, + write_txt = FALSE, plot_results = FALSE) expect_setequal(res$Progeny, base_progeny$ID) }) + +test_that("single parent pair (n_pairs = 1) does not cause dimension errors", { + single_pair_parents <- data.table::data.table(ID = c("S1","D1"), + Sex = c("M","F")) + f <- make_files(base_genos, single_pair_parents, child1_progeny) + expect_no_error( + find_parentage(f$g, f$p, f$pr, method = "best_pair", + show_ties = FALSE, verbose = FALSE, + write_txt = FALSE, plot_results = FALSE) + ) +}) diff --git a/tests/testthat/test-validate_pedigree.R b/tests/testthat/test-validate_pedigree.R index ef4e493..02acc22 100644 --- a/tests/testthat/test-validate_pedigree.R +++ b/tests/testthat/test-validate_pedigree.R @@ -43,56 +43,84 @@ write_temp_files <- function(genos = make_genos(), ped = make_pedigree()) { list(ped = ped_file, genos = genos_file) } +# Helper to run validate_pedigree in a temporary working directory +# so corrected_pedigree.txt doesn't pollute the real wd +run_in_tmpdir <- function(expr) { + tmp_dir <- tempfile() + dir.create(tmp_dir) + old_wd <- getwd() + setwd(tmp_dir) + on.exit({ setwd(old_wd); unlink(tmp_dir, recursive = TRUE) }, add = TRUE) + force(expr) +} + # ============================================================================== # 1. Input validation # ============================================================================== test_that("trio_error_threshold out of range raises an error", { f <- write_temp_files() - expect_error(validate_pedigree(f$ped, f$genos, - trio_error_threshold = 150, - verbose = FALSE, write_txt = FALSE)) - expect_error(validate_pedigree(f$ped, f$genos, - trio_error_threshold = -1, - verbose = FALSE, write_txt = FALSE)) + run_in_tmpdir({ + expect_error(validate_pedigree(f$ped, f$genos, + trio_error_threshold = 150, + verbose = FALSE, write_results = FALSE, + plot_results = FALSE)) + expect_error(validate_pedigree(f$ped, f$genos, + trio_error_threshold = -1, + verbose = FALSE, write_results = FALSE, + plot_results = FALSE)) + }) }) test_that("single_parent_error_threshold out of range raises an error", { f <- write_temp_files() - expect_error(validate_pedigree(f$ped, f$genos, - single_parent_error_threshold = 101, - verbose = FALSE, write_txt = FALSE)) - expect_error(validate_pedigree(f$ped, f$genos, - single_parent_error_threshold = -5, - verbose = FALSE, write_txt = FALSE)) + run_in_tmpdir({ + expect_error(validate_pedigree(f$ped, f$genos, + single_parent_error_threshold = 101, + verbose = FALSE, write_results = FALSE, + plot_results = FALSE)) + expect_error(validate_pedigree(f$ped, f$genos, + single_parent_error_threshold = -5, + verbose = FALSE, write_results = FALSE, + plot_results = FALSE)) + }) }) test_that("missing required pedigree column raises an error", { bad_ped <- data.table(ID = "IND_C", Parent1 = "IND_A", Female_Parent = "IND_B") f <- write_temp_files(ped = bad_ped) - expect_error( - validate_pedigree(f$ped, f$genos, verbose = FALSE, write_txt = FALSE), - regexp = "missing required columns" - ) + run_in_tmpdir({ + expect_error( + validate_pedigree(f$ped, f$genos, verbose = FALSE, write_results = FALSE, + plot_results = FALSE), + regexp = "missing required columns" + ) + }) }) test_that("missing ID column in genotypes raises an error", { bad_genos <- copy(make_genos()) setnames(bad_genos, "ID", "SampleID") f <- write_temp_files(genos = bad_genos) - expect_error( - validate_pedigree(f$ped, f$genos, verbose = FALSE, write_txt = FALSE), - regexp = "ID" - ) + run_in_tmpdir({ + expect_error( + validate_pedigree(f$ped, f$genos, verbose = FALSE, write_results = FALSE, + plot_results = FALSE), + regexp = "ID" + ) + }) }) test_that("all trios with no genotype data stop with an error", { ped <- data.table(ID = "GHOST", Male_Parent = "IND_A", Female_Parent = "IND_B") f <- write_temp_files(ped = ped) - expect_error( - validate_pedigree(f$ped, f$genos, verbose = FALSE, write_txt = FALSE), - regexp = "No valid trios remain" - ) + run_in_tmpdir({ + expect_error( + validate_pedigree(f$ped, f$genos, verbose = FALSE, write_results = FALSE, + plot_results = FALSE), + regexp = "No valid trios remain" + ) + }) }) # ============================================================================== @@ -100,29 +128,38 @@ test_that("all trios with no genotype data stop with an error", { # ============================================================================== test_that("returns a data.table", { - f <- write_temp_files() - res <- validate_pedigree(f$ped, f$genos, verbose = FALSE, write_txt = FALSE) - expect_s3_class(res, "data.table") + f <- write_temp_files() + run_in_tmpdir({ + res <- validate_pedigree(f$ped, f$genos, verbose = FALSE, + write_results = FALSE, plot_results = FALSE) + expect_s3_class(res, "data.table") + }) }) test_that("result has one row per pedigree entry", { - f <- write_temp_files() - res <- validate_pedigree(f$ped, f$genos, verbose = FALSE, write_txt = FALSE) - expect_equal(nrow(res), 2L) + f <- write_temp_files() + run_in_tmpdir({ + res <- validate_pedigree(f$ped, f$genos, verbose = FALSE, + write_results = FALSE, plot_results = FALSE) + expect_equal(nrow(res), 2L) + }) }) test_that("result has all expected columns", { - f <- write_temp_files() - res <- validate_pedigree(f$ped, f$genos, verbose = FALSE, write_txt = FALSE) - expected_cols <- c( - "ID", "Male_Parent", "Female_Parent", - "Mendelian_Error_Pct", "Markers_Tested", "Status", - "Correction_Decision", - "Male_Parent_Hom_Error_Pct", "Female_Parent_Hom_Error_Pct", - "Best_Male_Parent", "Best_Male_Parent_Error_Pct", - "Best_Female_Parent", "Best_Female_Parent_Error_Pct" - ) - expect_true(all(expected_cols %in% names(res))) + f <- write_temp_files() + run_in_tmpdir({ + res <- validate_pedigree(f$ped, f$genos, verbose = FALSE, + write_results = FALSE, plot_results = FALSE) + expected_cols <- c( + "ID", "Orig_Male_Parent", "Orig_Female_Parent", + "Trio_Mendelian_Error_Pct", "Trio_Markers_Tested", "Status", + "Recommended_Correction", + "Male_Parent_Hom_Error_Pct", "Female_Parent_Hom_Error_Pct", + "Best_Male_Candidate", "Best_Male_Candidate_Error_Pct", + "Best_Female_Candidate", "Best_Female_Candidate_Error_Pct" + ) + expect_true(all(expected_cols %in% names(res))) + }) }) # ============================================================================== @@ -130,68 +167,87 @@ test_that("result has all expected columns", { # ============================================================================== test_that("PASS trio is correctly identified", { - f <- write_temp_files() - res <- validate_pedigree(f$ped, f$genos, verbose = FALSE, write_txt = FALSE) - pass_row <- res[ID == "IND_C"] - expect_equal(nrow(pass_row), 1L) - expect_equal(pass_row$Status, "PASS") - expect_equal(pass_row$Mendelian_Error_Pct, 0) - expect_equal(pass_row$Correction_Decision, "NONE") + f <- write_temp_files() + run_in_tmpdir({ + res <- validate_pedigree(f$ped, f$genos, verbose = FALSE, + write_results = FALSE, plot_results = FALSE) + pass_row <- res[ID == "IND_C"] + expect_equal(nrow(pass_row), 1L) + expect_equal(pass_row$Status, "PASS") + expect_equal(pass_row$Trio_Mendelian_Error_Pct, 0) + expect_equal(pass_row$Recommended_Correction, "NONE") + }) }) test_that("FAIL trio is correctly identified", { - f <- write_temp_files() - res <- validate_pedigree(f$ped, f$genos, verbose = FALSE, write_txt = FALSE) - fail_row <- res[ID == "IND_D"] - expect_equal(nrow(fail_row), 1L) - expect_equal(fail_row$Status, "FAIL") - expect_gt(fail_row$Mendelian_Error_Pct, 5.0) + f <- write_temp_files() + run_in_tmpdir({ + res <- validate_pedigree(f$ped, f$genos, verbose = FALSE, + write_results = FALSE, plot_results = FALSE) + fail_row <- res[ID == "IND_D"] + expect_equal(nrow(fail_row), 1L) + expect_equal(fail_row$Status, "FAIL") + expect_gt(fail_row$Trio_Mendelian_Error_Pct, 5.0) + }) }) test_that("FAIL trio has REMOVE_MALE_PARENT decision with best match populated", { - f <- write_temp_files() - res <- validate_pedigree(f$ped, f$genos, verbose = FALSE, write_txt = FALSE) - fail_row <- res[ID == "IND_D"] - expect_equal(fail_row$Correction_Decision, "REMOVE_MALE_PARENT") - expect_false(is.na(fail_row$Best_Male_Parent)) - expect_true(is.na(fail_row$Best_Female_Parent)) + f <- write_temp_files() + run_in_tmpdir({ + res <- validate_pedigree(f$ped, f$genos, verbose = FALSE, + write_results = FALSE, plot_results = FALSE) + fail_row <- res[ID == "IND_D"] + expect_equal(fail_row$Recommended_Correction, "REMOVE_MALE_PARENT") + expect_false(is.na(fail_row$Best_Male_Candidate)) + expect_true(is.na(fail_row$Best_Female_Candidate)) + }) }) -test_that("Mendelian_Error_Pct is 0 for a perfect trio", { - f <- write_temp_files() - res <- validate_pedigree(f$ped, f$genos, verbose = FALSE, write_txt = FALSE) - expect_equal(res[ID == "IND_C"]$Mendelian_Error_Pct, 0) +test_that("Trio_Mendelian_Error_Pct is 0 for a perfect trio", { + f <- write_temp_files() + run_in_tmpdir({ + res <- validate_pedigree(f$ped, f$genos, verbose = FALSE, + write_results = FALSE, plot_results = FALSE) + expect_equal(res[ID == "IND_C"]$Trio_Mendelian_Error_Pct, 0) + }) }) -test_that("Markers_Tested equals number of markers for complete data", { - f <- write_temp_files() - res <- validate_pedigree(f$ped, f$genos, verbose = FALSE, write_txt = FALSE) - expect_equal(res[ID == "IND_C"]$Markers_Tested, 20L) +test_that("Trio_Markers_Tested equals number of markers for complete data", { + f <- write_temp_files() + run_in_tmpdir({ + res <- validate_pedigree(f$ped, f$genos, verbose = FALSE, + write_results = FALSE, plot_results = FALSE) + expect_equal(res[ID == "IND_C"]$Trio_Markers_Tested, 20L) + }) }) test_that("LOW_MARKERS status assigned when markers_tested < min_markers", { - f <- write_temp_files() - res <- validate_pedigree(f$ped, f$genos, verbose = FALSE, - write_txt = FALSE, min_markers = 25L) - expect_true(all(res$Status == "LOW_MARKERS")) - expect_true(all(grepl("^LOW_MARKERS_", res$Correction_Decision))) + f <- write_temp_files() + run_in_tmpdir({ + res <- validate_pedigree(f$ped, f$genos, verbose = FALSE, + write_results = FALSE, plot_results = FALSE, + min_markers = 25L) + expect_true(all(res$Status == "LOW_MARKERS")) + expect_true(all(grepl("^LOW_MARKERS_", res$Recommended_Correction))) + }) }) -test_that("NA markers reduce Markers_Tested and do not cause errors", { +test_that("NA markers reduce Trio_Markers_Tested and do not cause errors", { genos <- make_genos() genos[ID == "IND_C", M1 := NA_integer_] genos[ID == "IND_C", M2 := NA_integer_] - f <- write_temp_files(genos = genos) - res <- validate_pedigree(f$ped, f$genos, verbose = FALSE, write_txt = FALSE) - expect_equal(res[ID == "IND_C"]$Markers_Tested, 18L) - expect_equal(res[ID == "IND_C"]$Status, "PASS") + f <- write_temp_files(genos = genos) + run_in_tmpdir({ + res <- validate_pedigree(f$ped, f$genos, verbose = FALSE, + write_results = FALSE, plot_results = FALSE) + expect_equal(res[ID == "IND_C"]$Trio_Markers_Tested, 18L) + expect_equal(res[ID == "IND_C"]$Status, "PASS") + }) }) # ============================================================================== # 4. Missing parent statuses (MISSING_MALE_PARENT / MISSING_FEMALE_PARENT / # MISSING_BOTH_PARENTS) -# Note: each test includes make_pedigree() rows so has_geno is never empty, -# and filters res by the specific ID under test [3][4] # ============================================================================== test_that("MISSING_MALE_PARENT status and recommendation are correct", { @@ -199,13 +255,16 @@ test_that("MISSING_MALE_PARENT status and recommendation are correct", { make_pedigree(), data.table(ID = "IND_E", Male_Parent = "0", Female_Parent = "IND_B") ) - f <- write_temp_files(ped = ped) - res <- validate_pedigree(f$ped, f$genos, verbose = FALSE, write_txt = FALSE) - r <- res[ID == "IND_E"] - expect_equal(r$Status, "MISSING_MALE_PARENT") - expect_equal(r$Correction_Decision, "NONE") - expect_false(is.na(r$Best_Male_Parent)) - expect_true(is.na(r$Best_Female_Parent)) + f <- write_temp_files(ped = ped) + run_in_tmpdir({ + res <- validate_pedigree(f$ped, f$genos, verbose = FALSE, + write_results = FALSE, plot_results = FALSE) + r <- res[ID == "IND_E"] + expect_equal(r$Status, "MISSING_MALE_PARENT") + expect_equal(r$Recommended_Correction, "NONE") + expect_false(is.na(r$Best_Male_Candidate)) + expect_true(is.na(r$Best_Female_Candidate)) + }) }) test_that("MISSING_FEMALE_PARENT status and recommendation are correct", { @@ -213,13 +272,16 @@ test_that("MISSING_FEMALE_PARENT status and recommendation are correct", { make_pedigree(), data.table(ID = "IND_E", Male_Parent = "IND_A", Female_Parent = "0") ) - f <- write_temp_files(ped = ped) - res <- validate_pedigree(f$ped, f$genos, verbose = FALSE, write_txt = FALSE) - r <- res[ID == "IND_E"] - expect_equal(r$Status, "MISSING_FEMALE_PARENT") - expect_equal(r$Correction_Decision, "NONE") - expect_true(is.na(r$Best_Male_Parent)) - expect_false(is.na(r$Best_Female_Parent)) + f <- write_temp_files(ped = ped) + run_in_tmpdir({ + res <- validate_pedigree(f$ped, f$genos, verbose = FALSE, + write_results = FALSE, plot_results = FALSE) + r <- res[ID == "IND_E"] + expect_equal(r$Status, "MISSING_FEMALE_PARENT") + expect_equal(r$Recommended_Correction, "NONE") + expect_true(is.na(r$Best_Male_Candidate)) + expect_false(is.na(r$Best_Female_Candidate)) + }) }) test_that("MISSING_BOTH_PARENTS status and recommendations are correct", { @@ -227,16 +289,19 @@ test_that("MISSING_BOTH_PARENTS status and recommendations are correct", { make_pedigree(), data.table(ID = "IND_E", Male_Parent = "0", Female_Parent = "0") ) - f <- write_temp_files(ped = ped) - res <- validate_pedigree(f$ped, f$genos, verbose = FALSE, write_txt = FALSE) - r <- res[ID == "IND_E"] - expect_equal(r$Status, "MISSING_BOTH_PARENTS") - expect_equal(r$Correction_Decision, "NONE") - expect_false(is.na(r$Best_Male_Parent)) - expect_false(is.na(r$Best_Female_Parent)) -}) - -test_that("MISSING_* rows preserve 0s in corrected pedigree", { + f <- write_temp_files(ped = ped) + run_in_tmpdir({ + res <- validate_pedigree(f$ped, f$genos, verbose = FALSE, + write_results = FALSE, plot_results = FALSE) + r <- res[ID == "IND_E"] + expect_equal(r$Status, "MISSING_BOTH_PARENTS") + expect_equal(r$Recommended_Correction, "NONE") + expect_false(is.na(r$Best_Male_Candidate)) + expect_false(is.na(r$Best_Female_Candidate)) + }) +}) + +test_that("MISSING_* rows preserve 0s in corrected_pedigree.txt", { ped <- rbind( make_pedigree(), data.table(ID = "IND_E", Male_Parent = "0", Female_Parent = "IND_B") @@ -247,22 +312,29 @@ test_that("MISSING_* rows preserve 0s in corrected pedigree", { old_wd <- getwd() setwd(tmp_dir) on.exit({ setwd(old_wd); unlink(tmp_dir, recursive = TRUE) }, add = TRUE) - validate_pedigree(f$ped, f$genos, verbose = FALSE, write_txt = FALSE) + + validate_pedigree(f$ped, f$genos, verbose = FALSE, + write_results = FALSE, plot_results = FALSE) corr <- fread(file.path(tmp_dir, "corrected_pedigree.txt"), colClasses = "character") - expect_equal(corr[ID == "IND_E"]$Male_Parent, "0") + # MISSING_MALE_PARENT — male stays "0", female unchanged + expect_equal(corr[ID == "IND_E"]$Male_Parent, "0") + expect_equal(corr[ID == "IND_E"]$Female_Parent, "IND_B") }) -test_that("Best_Male_Parent for MISSING_MALE_PARENT is excluded from being the known female", { +test_that("Best_Male_Candidate for MISSING_MALE_PARENT is not the known female parent", { ped <- rbind( make_pedigree(), data.table(ID = "IND_E", Male_Parent = "0", Female_Parent = "IND_B") ) - f <- write_temp_files(ped = ped) - res <- validate_pedigree(f$ped, f$genos, verbose = FALSE, write_txt = FALSE) - r <- res[ID == "IND_E"] - # The known female parent should not be suggested as the best male parent - expect_false(r$Best_Male_Parent == "IND_B") + f <- write_temp_files(ped = ped) + run_in_tmpdir({ + res <- validate_pedigree(f$ped, f$genos, verbose = FALSE, + write_results = FALSE, plot_results = FALSE) + r <- res[ID == "IND_E"] + # The known female parent should not be suggested as the best male candidate + expect_false(r$Best_Male_Candidate == "IND_B") + }) }) # ============================================================================== @@ -277,16 +349,19 @@ test_that("FOUNDERS status is assigned when ID in founders list with 0 0 parents founders_file <- tempfile(fileext = ".txt") fwrite(data.table(ID = "IND_A"), founders_file, sep = "\t", quote = FALSE, col.names = FALSE) - f <- write_temp_files(ped = ped) - res <- validate_pedigree(f$ped, f$genos, - founders_file = founders_file, - verbose = FALSE, - write_txt = FALSE) - founder_row <- res[ID == "IND_A"] - expect_equal(founder_row$Status, "FOUNDERS") - expect_equal(founder_row$Correction_Decision, "NONE") - expect_true(is.na(founder_row$Best_Male_Parent)) - expect_true(is.na(founder_row$Best_Female_Parent)) + f <- write_temp_files(ped = ped) + run_in_tmpdir({ + res <- validate_pedigree(f$ped, f$genos, + founders_file = founders_file, + verbose = FALSE, + write_results = FALSE, + plot_results = FALSE) + founder_row <- res[ID == "IND_A"] + expect_equal(founder_row$Status, "FOUNDERS") + expect_equal(founder_row$Recommended_Correction, "NONE") + expect_true(is.na(founder_row$Best_Male_Candidate)) + expect_true(is.na(founder_row$Best_Female_Candidate)) + }) }) test_that("non-founder rows are still evaluated normally when founders file is supplied", { @@ -297,13 +372,16 @@ test_that("non-founder rows are still evaluated normally when founders file is s founders_file <- tempfile(fileext = ".txt") fwrite(data.table(ID = "IND_A"), founders_file, sep = "\t", quote = FALSE, col.names = FALSE) - f <- write_temp_files(ped = ped) - res <- validate_pedigree(f$ped, f$genos, - founders_file = founders_file, - verbose = FALSE, - write_txt = FALSE) - # IND_C has real parents — should still get PASS - expect_equal(res[ID == "IND_C"]$Status, "PASS") + f <- write_temp_files(ped = ped) + run_in_tmpdir({ + res <- validate_pedigree(f$ped, f$genos, + founders_file = founders_file, + verbose = FALSE, + write_results = FALSE, + plot_results = FALSE) + # IND_C has real parents — should still get PASS + expect_equal(res[ID == "IND_C"]$Status, "PASS") + }) }) test_that("0 0 parents NOT in founders list get MISSING_BOTH_PARENTS", { @@ -311,9 +389,12 @@ test_that("0 0 parents NOT in founders list get MISSING_BOTH_PARENTS", { make_pedigree(), data.table(ID = "IND_E", Male_Parent = "0", Female_Parent = "0") ) - f <- write_temp_files(ped = ped) - res <- validate_pedigree(f$ped, f$genos, verbose = FALSE, write_txt = FALSE) - expect_equal(res[ID == "IND_E"]$Status, "MISSING_BOTH_PARENTS") + f <- write_temp_files(ped = ped) + run_in_tmpdir({ + res <- validate_pedigree(f$ped, f$genos, verbose = FALSE, + write_results = FALSE, plot_results = FALSE) + expect_equal(res[ID == "IND_E"]$Status, "MISSING_BOTH_PARENTS") + }) }) test_that("0 0 parents with no founders file gets MISSING_BOTH_PARENTS not FOUNDERS", { @@ -321,12 +402,15 @@ test_that("0 0 parents with no founders file gets MISSING_BOTH_PARENTS not FOUND make_pedigree(), data.table(ID = "IND_E", Male_Parent = "0", Female_Parent = "0") ) - f <- write_temp_files(ped = ped) - res <- validate_pedigree(f$ped, f$genos, - founders_file = NULL, - verbose = FALSE, - write_txt = FALSE) - expect_equal(res[ID == "IND_E"]$Status, "MISSING_BOTH_PARENTS") + f <- write_temp_files(ped = ped) + run_in_tmpdir({ + res <- validate_pedigree(f$ped, f$genos, + founders_file = NULL, + verbose = FALSE, + write_results = FALSE, + plot_results = FALSE) + expect_equal(res[ID == "IND_E"]$Status, "MISSING_BOTH_PARENTS") + }) }) # ============================================================================== @@ -338,36 +422,45 @@ test_that("NO_GENOTYPE_DATA is flagged for progeny absent from genotype file", { make_pedigree(), data.table(ID = "GHOST", Male_Parent = "IND_A", Female_Parent = "IND_B") ) - f <- write_temp_files(ped = ped) - res <- validate_pedigree(f$ped, f$genos, verbose = FALSE, write_txt = FALSE) - ghost_row <- res[ID == "GHOST"] - expect_equal(nrow(ghost_row), 1L) - expect_equal(ghost_row$Status, "NO_GENOTYPE_DATA") - expect_equal(ghost_row$Correction_Decision, "NONE") -}) - -test_that("NO_GENOTYPE_DATA rows have NA for all analysis columns", { + f <- write_temp_files(ped = ped) + run_in_tmpdir({ + res <- validate_pedigree(f$ped, f$genos, verbose = FALSE, + write_results = FALSE, plot_results = FALSE) + ghost_row <- res[ID == "GHOST"] + expect_equal(nrow(ghost_row), 1L) + expect_equal(ghost_row$Status, "NO_GENOTYPE_DATA") + expect_equal(ghost_row$Recommended_Correction, "NONE") + }) +}) + +test_that("NO_GENOTYPE_DATA rows have NA/0 for all analysis columns", { ped <- rbind( make_pedigree(), data.table(ID = "GHOST", Male_Parent = "IND_A", Female_Parent = "IND_B") ) - f <- write_temp_files(ped = ped) - res <- validate_pedigree(f$ped, f$genos, verbose = FALSE, write_txt = FALSE) - ghost_row <- res[ID == "GHOST"] - expect_true(is.na(ghost_row$Mendelian_Error_Pct)) - expect_equal(ghost_row$Markers_Tested, 0L) - expect_true(is.na(ghost_row$Best_Male_Parent)) - expect_true(is.na(ghost_row$Best_Female_Parent)) + f <- write_temp_files(ped = ped) + run_in_tmpdir({ + res <- validate_pedigree(f$ped, f$genos, verbose = FALSE, + write_results = FALSE, plot_results = FALSE) + ghost_row <- res[ID == "GHOST"] + expect_true(is.na(ghost_row$Trio_Mendelian_Error_Pct)) + expect_equal(ghost_row$Trio_Markers_Tested, 0L) + expect_true(is.na(ghost_row$Best_Male_Candidate)) + expect_true(is.na(ghost_row$Best_Female_Candidate)) + }) }) test_that("NO_GENOTYPE_DATA flagged when declared parent is absent from genotype file", { ped <- rbind( - make_pedigree(), # ensures has_geno is not empty + make_pedigree(), data.table(ID = "IND_C_GHOST", Male_Parent = "GHOST_DAD", Female_Parent = "IND_B") ) - f <- write_temp_files(ped = ped) - res <- validate_pedigree(f$ped, f$genos, verbose = FALSE, write_txt = FALSE) - expect_equal(res[ID == "IND_C_GHOST"]$Status, "NO_GENOTYPE_DATA") + f <- write_temp_files(ped = ped) + run_in_tmpdir({ + res <- validate_pedigree(f$ped, f$genos, verbose = FALSE, + write_results = FALSE, plot_results = FALSE) + expect_equal(res[ID == "IND_C_GHOST"]$Status, "NO_GENOTYPE_DATA") + }) }) test_that("valid trios still evaluated correctly when ghost rows are present", { @@ -375,24 +468,29 @@ test_that("valid trios still evaluated correctly when ghost rows are present", { make_pedigree(), data.table(ID = "GHOST", Male_Parent = "IND_A", Female_Parent = "IND_B") ) - f <- write_temp_files(ped = ped) - res <- validate_pedigree(f$ped, f$genos, verbose = FALSE, write_txt = FALSE) - expect_equal(res[ID == "IND_C"]$Status, "PASS") - expect_equal(res[ID == "IND_D"]$Status, "FAIL") + f <- write_temp_files(ped = ped) + run_in_tmpdir({ + res <- validate_pedigree(f$ped, f$genos, verbose = FALSE, + write_results = FALSE, plot_results = FALSE) + expect_equal(res[ID == "IND_C"]$Status, "PASS") + expect_equal(res[ID == "IND_D"]$Status, "FAIL") + }) }) # ============================================================================== # 7. Corrected pedigree output # ============================================================================== -test_that("corrected_pedigree.txt is written and PASS parents are unchanged", { +test_that("corrected_pedigree.txt is always written and PASS parents are unchanged", { f <- write_temp_files() tmp_dir <- tempfile() dir.create(tmp_dir) old_wd <- getwd() setwd(tmp_dir) on.exit({ setwd(old_wd); unlink(tmp_dir, recursive = TRUE) }, add = TRUE) - validate_pedigree(f$ped, f$genos, verbose = FALSE, write_txt = FALSE) + + validate_pedigree(f$ped, f$genos, verbose = FALSE, + write_results = FALSE, plot_results = FALSE) expect_true(file.exists(file.path(tmp_dir, "corrected_pedigree.txt"))) corr <- fread(file.path(tmp_dir, "corrected_pedigree.txt")) # IND_C passes — parents must be unchanged @@ -407,7 +505,9 @@ test_that("corrected_pedigree.txt sets bad parent to 0 for FAIL trio", { old_wd <- getwd() setwd(tmp_dir) on.exit({ setwd(old_wd); unlink(tmp_dir, recursive = TRUE) }, add = TRUE) - validate_pedigree(f$ped, f$genos, verbose = FALSE, write_txt = FALSE) + + validate_pedigree(f$ped, f$genos, verbose = FALSE, + write_results = FALSE, plot_results = FALSE) corr <- fread(file.path(tmp_dir, "corrected_pedigree.txt"), colClasses = "character") # IND_D fails with REMOVE_MALE_PARENT — male should become "0" @@ -415,45 +515,32 @@ test_that("corrected_pedigree.txt sets bad parent to 0 for FAIL trio", { expect_equal(corr[ID == "IND_D"]$Female_Parent, "IND_A") }) -test_that("corrected_pedigree.txt preserves 0s for MISSING_* rows", { - ped <- rbind( - make_pedigree(), - data.table(ID = "IND_E", Male_Parent = "0", Female_Parent = "IND_B") - ) - f <- write_temp_files(ped = ped) - tmp_dir <- tempfile() - dir.create(tmp_dir) - old_wd <- getwd() - setwd(tmp_dir) - on.exit({ setwd(old_wd); unlink(tmp_dir, recursive = TRUE) }, add = TRUE) - validate_pedigree(f$ped, f$genos, verbose = FALSE, write_txt = FALSE) - corr <- fread(file.path(tmp_dir, "corrected_pedigree.txt"), - colClasses = "character") - # MISSING_MALE_PARENT — male stays "0", female unchanged - expect_equal(corr[ID == "IND_E"]$Male_Parent, "0") - expect_equal(corr[ID == "IND_E"]$Female_Parent, "IND_B") -}) - # ============================================================================== -# 8. write_txt / output file +# 8. write_results / output file # ============================================================================== -test_that("write_txt = TRUE writes output file with correct row count", { +test_that("write_results = TRUE writes output file with correct row count", { f <- write_temp_files() out_file <- tempfile(fileext = ".txt") - validate_pedigree(f$ped, f$genos, verbose = FALSE, - write_txt = TRUE, output_filename = out_file) - expect_true(file.exists(out_file)) - written <- fread(out_file) - expect_equal(nrow(written), 2L) + run_in_tmpdir({ + validate_pedigree(f$ped, f$genos, verbose = FALSE, + write_results = TRUE, output_filename = out_file, + plot_results = FALSE) + expect_true(file.exists(out_file)) + written <- fread(out_file) + expect_equal(nrow(written), 2L) + }) }) -test_that("write_txt = FALSE does not create output file", { +test_that("write_results = FALSE does not create output file", { f <- write_temp_files() out_file <- tempfile(fileext = ".txt") - validate_pedigree(f$ped, f$genos, verbose = FALSE, - write_txt = FALSE, output_filename = out_file) - expect_false(file.exists(out_file)) + run_in_tmpdir({ + validate_pedigree(f$ped, f$genos, verbose = FALSE, + write_results = FALSE, output_filename = out_file, + plot_results = FALSE) + expect_false(file.exists(out_file)) + }) }) test_that("output file contains correct number of rows when ghost rows present", { @@ -463,9 +550,12 @@ test_that("output file contains correct number of rows when ghost rows present", ) f <- write_temp_files(ped = ped) out_file <- tempfile(fileext = ".txt") - validate_pedigree(f$ped, f$genos, verbose = FALSE, - write_txt = TRUE, output_filename = out_file) - written <- fread(out_file) - # 2 valid trios + 1 ghost = 3 rows total - expect_equal(nrow(written), 3L) + run_in_tmpdir({ + validate_pedigree(f$ped, f$genos, verbose = FALSE, + write_results = TRUE, output_filename = out_file, + plot_results = FALSE) + written <- fread(out_file) + # 2 valid trios + 1 ghost = 3 rows total + expect_equal(nrow(written), 3L) + }) }) From 03725d8b5567cef3749e170806f6dfe0c6394a8a Mon Sep 17 00:00:00 2001 From: "josue.chinchilla" Date: Mon, 27 Apr 2026 15:09:40 -0400 Subject: [PATCH 07/12] Made edits suggested by autopilot --- R/check_ped.R | 50 ++++++++++++--- R/find_parentage.R | 127 +++++++++++++++++++++------------------ R/validate_pedigree.R | 111 +++++++++++++++++++--------------- man/find_parentage.Rd | 8 ++- man/validate_pedigree.Rd | 17 ++++-- 5 files changed, 193 insertions(+), 120 deletions(-) diff --git a/R/check_ped.R b/R/check_ped.R index e1d9407..0ec5cf2 100644 --- a/R/check_ped.R +++ b/R/check_ped.R @@ -40,9 +40,11 @@ check_ped <- function(ped.file, correct = TRUE, save_zip = FALSE, save_corrected_zip = FALSE) { + #### setup #### if (!is.null(seed)) set.seed(seed) data <- utils::read.table(ped.file, header = TRUE) + # ── Column alias remapping ────────────────────────────────────────────────── # Accept common alternative column names and silently remap them so the rest # of the function always works with the canonical names. @@ -59,6 +61,7 @@ check_ped <- function(ped.file, } } } + # Validate required column names before any processing required_cols <- c("id", "male_parent", "female_parent") missing_cols <- setdiff(required_cols, colnames(data)) @@ -75,33 +78,44 @@ check_ped <- function(ped.file, male_parent = as.character(male_parent), female_parent = as.character(female_parent) ) + # Add row numbers before any processing data <- data %>% mutate(row_number = row_number(), .before = id) errors <- list() + #### check 1: exact duplicates #### exact_duplicates <- data[ duplicated(data %>% select(-row_number)) | duplicated(data %>% select(-row_number), fromLast = TRUE), ] + #### check 2: repeated IDs with conflicting male_parent/female_parent #### repeated_ids_diff <- data %>% group_by(id) %>% filter(n() > 1) %>% filter(n_distinct(male_parent) > 1 | n_distinct(female_parent) > 1) %>% ungroup() + #### check 3: inconsistent parent sex roles #### male_ids <- unique(data$male_parent[data$male_parent != "0"]) female_ids <- unique(data$female_parent[data$female_parent != "0"]) messy_ids <- intersect(male_ids, female_ids) inconsistent_sex_roles <- data %>% filter(male_parent %in% messy_ids | female_parent %in% messy_ids) + #### check 4: missing parents #### - all_ids <- unique(data$id) - ref_ids <- unique(c(data$male_parent, data$female_parent)) - ref_ids <- ref_ids[ref_ids != "0"] - missing_ids <- setdiff(ref_ids, all_ids) + all_ids <- unique(data$id) + ref_ids <- unique(c(data$male_parent, data$female_parent)) + # Fix #1: trimws() strips whitespace-only strings; the combined filter then + # drops NA (which NA != "0" would not catch), empty strings, and "0" + # to prevent spurious founder rows with id = NA in the corrected pedigree. + ref_ids <- trimws(ref_ids) + ref_ids <- ref_ids[!is.na(ref_ids) & ref_ids != "" & ref_ids != "0"] + + missing_ids <- setdiff(ref_ids, all_ids) missing_parents <- data %>% filter(male_parent %in% missing_ids | female_parent %in% missing_ids) + # Only build founder rows if there are actually missing parents to add if (length(missing_ids) > 0) { missing_founders <- data.frame( @@ -120,6 +134,7 @@ check_ped <- function(ped.file, stringsAsFactors = FALSE ) } + #### check 5: dependencies (cycles) #### detect_all_cycles <- function(data) { adj_list <- lapply(data$id, function(x) { @@ -127,6 +142,7 @@ check_ped <- function(ped.file, c(row$male_parent, row$female_parent) }) names(adj_list) <- data$id + dfs <- function(node, visited, rec_stack, path) { visited[node] <- TRUE rec_stack[node] <- TRUE @@ -145,6 +161,7 @@ check_ped <- function(ped.file, rec_stack[node] <- FALSE return(cycles) } + visited <- stats::setNames(rep(FALSE, length(adj_list)), names(adj_list)) rec_stack <- stats::setNames(rep(FALSE, length(adj_list)), names(adj_list)) all_cycles <- list() @@ -156,6 +173,7 @@ check_ped <- function(ped.file, } return(all_cycles) } + cycles <- detect_all_cycles(data) cycle_ids <- character(0) if (length(cycles) > 0) { @@ -172,6 +190,7 @@ check_ped <- function(ped.file, dependencies <- data.frame(Dependency = unique(unlist(errors)), stringsAsFactors = FALSE) } + #### compile report #### input_ped_report <- list( exact_duplicates = exact_duplicates, @@ -180,11 +199,14 @@ check_ped <- function(ped.file, missing_parents = missing_parents, dependencies = dependencies ) + #### build corrected pedigree (no row_number column) #### if (correct) { corrected <- data %>% select(-row_number) + # Remove exact duplicates corrected <- distinct(corrected) + # Resolve conflicting IDs if (nrow(repeated_ids_diff) > 0) { corrected <- corrected %>% @@ -195,6 +217,7 @@ check_ped <- function(ped.file, .groups = "drop" ) } + # Add missing founders if (length(missing_ids) > 0) { corrected <- bind_rows( @@ -203,11 +226,13 @@ check_ped <- function(ped.file, ) } } + #### file names #### file_base <- tools::file_path_sans_ext(basename(ped.file)) corrected_name <- paste0(file_base, "_corrected") report_name <- paste0(file_base, "_report") zip_name <- paste0(file_base, "_report.zip") + #### output #### if (verbose) { cat("\n=== Pedigree Quality Check Report ===\n") @@ -222,20 +247,28 @@ check_ped <- function(ped.file, cat("\n--- Cycles / Dependencies ---\n") if (nrow(dependencies) > 0) print(dependencies) else cat("None found.\n") if (correct) { - cat(paste0("\`correct = TRUE`: saving corrected pedigree as `", corrected_name, "`.\n")) + cat(paste0("`correct = TRUE`: saving corrected pedigree as `", corrected_name, "`.\n")) } else { - cat("\`correct = FALSE`: no changes made to the pedigree.\n") + cat("`correct = FALSE`: no changes made to the pedigree.\n") } } + # Always save report to global environment assign(report_name, input_ped_report, envir = .GlobalEnv) + # Save corrected pedigree if correct = TRUE if (correct) assign(corrected_name, corrected, envir = .GlobalEnv) + #### zip export #### if (save_zip) { - tmp_dir <- tempfile() + tmp_dir <- tempfile() dir.create(tmp_dir) + # Fix #2: register cleanup so the temp directory is always removed on + # function exit, whether normal or due to an error. + on.exit(unlink(tmp_dir, recursive = TRUE), add = TRUE) + zip_files <- character(0) + # Section headers matching the console output labels section_labels <- c( exact_duplicates = "Exact Duplicates", @@ -262,17 +295,20 @@ check_ped <- function(ped.file, } zip_files <- c(zip_files, file_path) } + if (save_corrected_zip && correct) { corrected_path <- file.path(tmp_dir, paste0(corrected_name, ".txt")) utils::write.table(corrected, file = corrected_path, sep = "\t", row.names = FALSE, quote = FALSE) zip_files <- c(zip_files, corrected_path) } + zip_path <- file.path(getwd(), zip_name) invisible(capture.output( utils::zip(zipfile = zip_path, files = zip_files, flags = "-j") )) if (verbose) cat(paste0("\nZip archive saved to: ", zip_path, "\n")) } + invisible(input_ped_report) } diff --git a/R/find_parentage.R b/R/find_parentage.R index f53a323..410981b 100644 --- a/R/find_parentage.R +++ b/R/find_parentage.R @@ -27,7 +27,11 @@ #' @param verbose Logical. If \code{TRUE}, prints progress, summary, and #' results to the console. Default is \code{TRUE}. #' @param write_txt Logical. If \code{TRUE}, writes results to -#' \code{parentage_testing_results.txt}. Default is \code{TRUE}. +#' \code{output_filename}. The plot will be saved using the same base name +#' with a .jpg extension. Default is \code{TRUE}. +#' @param output_filename Character. Path/name of the output results file +#' (default: \code{"parentage_testing_results.txt"}). The plot will be saved +#' using the same base name with a .jpg extension. #' @param na_string Character. String for missing values in output file. #' Use \code{"NA"} or \code{""} (default: \code{"NA"}). #' @param plot_results Logical. If \code{TRUE}, plots the Mendelian error @@ -96,9 +100,10 @@ find_parentage <- function(genotypes_file, parents_file, progeny_file, allow_selfing = TRUE, verbose = TRUE, write_txt = TRUE, + output_filename = "parentage_testing_results.txt", na_string = "NA", plot_results = TRUE) { - + #### Input Validation and Data Loading #### allowed_methods <- c("best_male_parent", "best_female_parent", "best_match", "best_pair") if (!method %in% allowed_methods) @@ -109,7 +114,7 @@ find_parentage <- function(genotypes_file, parents_file, progeny_file, stop("error_threshold must be between 0 and 100.") if (!na_string %in% c("NA", "")) stop("na_string must be either 'NA' or ''.") - + tryCatch({ genos <- data.table::fread(genotypes_file, sep = "auto") all_parents <- data.table::fread(parents_file, sep = "auto") @@ -117,7 +122,7 @@ find_parentage <- function(genotypes_file, parents_file, progeny_file, }, error = function(e) { stop("Error reading input files. Ensure paths are correct and files are TXT/TSV/CSV.") }) - + valid_ids <- genos$ID removed_parents <- base::setdiff(all_parents$ID, valid_ids) if (base::length(removed_parents) > 0) { @@ -125,50 +130,50 @@ find_parentage <- function(genotypes_file, parents_file, progeny_file, paste(removed_parents, collapse = ", "), call. = FALSE) all_parents <- all_parents[ID %in% valid_ids] } - + removed_progeny <- base::setdiff(progeny_candidates$ID, valid_ids) if (base::length(removed_progeny) > 0) { warning("The following progeny IDs were not in the genotype file and will not be analyzed: ", paste(removed_progeny, collapse = ", "), call. = FALSE) progeny_candidates <- progeny_candidates[ID %in% valid_ids] } - + if (!"Sex" %in% base::colnames(all_parents)) { warning("No 'Sex' column in parents file. All parents treated as ambiguous ('A').") all_parents[, Sex := "A"] } - + all_parents[, Sex := base::toupper(Sex)] male_parent_candidates <- all_parents[Sex %in% c("M", "A", "NA"), .SD] female_parent_candidates <- all_parents[Sex %in% c("F", "A", "NA")] - + if (base::nrow(male_parent_candidates) == 0 && method %in% c("best_male_parent", "best_pair")) warning("No valid male parent candidates remain after filtering.", call. = FALSE) if (base::nrow(female_parent_candidates) == 0 && method %in% c("best_female_parent", "best_pair")) warning("No valid female parent candidates remain after filtering.", call. = FALSE) if (base::nrow(progeny_candidates) == 0) stop("No valid progeny candidates remain after filtering.") - + #### Pre-compute genotype matrices once (shared across all methods) #### - + # Full genotype matrix used by best_pair genos_mat <- base::as.matrix(genos, rownames = "ID") - + # Homozygous-only matrix (het markers set to NA) used by hom methods genos_hom <- data.table::copy(genos) marker_cols <- base::setdiff(base::names(genos_hom), "ID") for (col in marker_cols) genos_hom[base::get(col) == 1, (col) := NA_integer_] genos_hom_mat <- base::as.matrix(genos_hom, rownames = "ID") - + #### Assignment_Status from markers and error rate #### - + # Returns LOW_MARKERS, HIGH_ERROR, or PASS assign_status <- function(markers, error_pct) { base::ifelse(markers < min_markers, "LOW_MARKERS", base::ifelse(error_pct > error_threshold, "HIGH_ERROR", "PASS")) } - + #### Logic for Homozygous Matching Methods #### if (method %in% c("best_male_parent", "best_female_parent", "best_match")) { parent_ids <- base::switch(method, @@ -176,11 +181,11 @@ find_parentage <- function(genotypes_file, parents_file, progeny_file, "best_female_parent" = female_parent_candidates$ID, "best_match" = base::union(male_parent_candidates$ID, female_parent_candidates$ID)) - + # Subset pre-computed homozygous matrix for relevant parents and progeny - parent_genos <- genos_hom_mat[base::rownames(genos_hom_mat) %in% parent_ids, , drop = FALSE] - progeny_genos <- genos_hom_mat[base::rownames(genos_hom_mat) %in% progeny_candidates$ID, , drop = FALSE] - + parent_genos <- genos_hom_mat[base::rownames(genos_hom_mat) %in% parent_ids, , drop = FALSE] + progeny_genos <- genos_hom_mat[base::rownames(genos_hom_mat) %in% progeny_candidates$ID, , drop = FALSE] + # Pre-allocate results data.table; fill by reference with set() n_progeny <- base::nrow(progeny_genos) results_dt <- data.table::data.table( @@ -190,26 +195,26 @@ find_parentage <- function(genotypes_file, parents_file, progeny_file, Markers_Tested = NA_integer_, Assignment_Status = NA_character_ ) - + for (i in base::seq_len(n_progeny)) { progeny_vec <- progeny_genos[i, ] mismatches <- base::rowSums(parent_genos != progeny_vec, na.rm = TRUE) comparisons <- base::rowSums(!base::is.na(parent_genos) & !base::is.na(progeny_vec)) percent_mismatch <- (mismatches / comparisons) * 100 percent_mismatch[base::is.nan(percent_mismatch)] <- NA - + best_idx <- base::which.min(percent_mismatch) - + # No candidate found — flag LOW_MARKERS and continue if (base::length(best_idx) == 0) { data.table::set(results_dt, i, "Markers_Tested", 0L) data.table::set(results_dt, i, "Assignment_Status", "LOW_MARKERS") next } - + best_markers <- comparisons[best_idx] best_error <- base::round(percent_mismatch[best_idx], 2) - + data.table::set(results_dt, i, "Best_Match", base::rownames(parent_genos)[best_idx]) data.table::set(results_dt, i, "Mendelian_Error_Pct", best_error) data.table::set(results_dt, i, "Markers_Tested", base::as.integer(best_markers)) @@ -217,7 +222,7 @@ find_parentage <- function(genotypes_file, parents_file, progeny_file, } final_df <- results_dt } - + #### Logic for Best Pair Method #### if (method == "best_pair") { parent_pairs <- data.table::CJ(Male_Parent = male_parent_candidates$ID, @@ -227,17 +232,17 @@ find_parentage <- function(genotypes_file, parents_file, progeny_file, if (verbose) base::cat("Selfing is disallowed. Pairs with identical parents are removed.\n") } if (base::nrow(parent_pairs) == 0) stop("No valid parent pairs to test.") - + # Pre-extract parent genotype row blocks for vectorised operations male_parent_genos_mat <- genos_mat[parent_pairs$Male_Parent, , drop = FALSE] female_parent_genos_mat <- genos_mat[parent_pairs$Female_Parent, , drop = FALSE] - + # Subset full genotype matrix to progeny only progeny_ids <- progeny_candidates$ID progeny_mat <- genos_mat[progeny_ids, , drop = FALSE] n_progeny <- base::nrow(progeny_mat) n_pairs <- base::nrow(parent_pairs) - + # Wrapped in matrix() to handle the n_pairs = 1 edge case mismatch_mat <- base::matrix( base::vapply(base::seq_len(n_progeny), function(j) { @@ -256,7 +261,7 @@ find_parentage <- function(genotypes_file, parents_file, progeny_file, }, numeric(n_pairs)), nrow = n_pairs, ncol = n_progeny ) - + # Wrapped in matrix() to handle the n_pairs = 1 edge case comparison_mat <- base::matrix( base::vapply(base::seq_len(n_progeny), function(j) { @@ -267,11 +272,11 @@ find_parentage <- function(genotypes_file, parents_file, progeny_file, }, numeric(n_pairs)), nrow = n_pairs, ncol = n_progeny ) - + # Percent mismatch matrix: n_pairs x n_progeny pct_mismatch_mat <- (mismatch_mat / comparison_mat) * 100 pct_mismatch_mat[base::is.nan(pct_mismatch_mat)] <- NA - + # Pre-allocate base results data.table; tie columns added dynamically results_dt <- data.table::data.table( Progeny = progeny_ids, @@ -281,7 +286,7 @@ find_parentage <- function(genotypes_file, parents_file, progeny_file, Markers_Tested = NA_integer_, Assignment_Status = NA_character_ ) - + # Per-progeny result extraction from pre-computed matrices results_list <- base::vector("list", n_progeny) for (j in base::seq_len(n_progeny)) { @@ -289,43 +294,43 @@ find_parentage <- function(genotypes_file, parents_file, progeny_file, percent_mismatch <- pct_mismatch_mat[, j] comparisons <- comparison_mat[, j] min_mismatch_val <- base::min(percent_mismatch, na.rm = TRUE) - - # No markers overlap at all — flag LOW_MARKERS + + # No markers overlap at all — flag LOW_MARKERS and continue if (base::is.infinite(min_mismatch_val)) { data.table::set(results_dt, j, "Markers_Tested", 0L) data.table::set(results_dt, j, "Assignment_Status", "LOW_MARKERS") next } - + best_indices <- base::which(percent_mismatch == min_mismatch_val) - + # Tie-break: prefer pair(s) with the most markers tested if (base::length(best_indices) > 1) { best_markers_per_pair <- comparisons[best_indices] max_markers <- base::max(best_markers_per_pair) best_indices <- best_indices[best_markers_per_pair == max_markers] } - + best_pairs <- parent_pairs[best_indices] best_markers <- comparisons[best_indices[1]] best_error <- base::round(min_mismatch_val, 2) a_status <- assign_status(best_markers, best_error) - + if (!show_ties && base::nrow(best_pairs) > 1) { warning("Progeny '", prog_id, "' has ", base::nrow(best_pairs), " tied best pairs. Only one is reported as show_ties=FALSE.", call. = FALSE) } - + num_to_report <- base::min(base::nrow(best_pairs), if (show_ties) base::nrow(best_pairs) else 1) - + # Always populate base columns with the top result data.table::set(results_dt, j, "Male_Parent", best_pairs$Male_Parent[1]) data.table::set(results_dt, j, "Female_Parent", best_pairs$Female_Parent[1]) data.table::set(results_dt, j, "Mendelian_Error_Pct", base::sprintf("%.2f", min_mismatch_val)) data.table::set(results_dt, j, "Markers_Tested", base::as.integer(best_markers)) data.table::set(results_dt, j, "Assignment_Status", a_status) - + # If ties remain after tie-breaking and show_ties is TRUE, # store tie details for later column binding if (show_ties && num_to_report > 1) { @@ -339,7 +344,7 @@ find_parentage <- function(genotypes_file, parents_file, progeny_file, results_list[[j]] <- data.table::as.data.table(tie_row) } } - + # Merge any tie suffix columns onto the pre-allocated base table tie_rows <- data.table::rbindlist( base::Filter(Negate(base::is.null), results_list), @@ -348,15 +353,21 @@ find_parentage <- function(genotypes_file, parents_file, progeny_file, ) if (base::nrow(tie_rows) > 0) { final_df <- merge(results_dt, tie_rows, by = "Progeny", all.x = TRUE) - # Replace any empty strings with NA - for (col in base::names(final_df)) { - data.table::set(final_df, which(final_df[[col]] == ""), col, NA_character_) + + char_cols <- base::names(final_df)[ + base::vapply(final_df, base::is.character, logical(1)) + ] + for (col in char_cols) { + empty_idx <- which(final_df[[col]] == "") + if (base::length(empty_idx) > 0) { + data.table::set(final_df, empty_idx, col, NA_character_) + } } } else { final_df <- results_dt } } - + #### Summary #### if (verbose) { total <- base::nrow(final_df) @@ -369,10 +380,10 @@ find_parentage <- function(genotypes_file, parents_file, progeny_file, base::cat("Min markers threshold :", min_markers, "\n") base::cat("Error threshold :", error_threshold, "%\n\n") } - + #### Output #### + if (write_txt) { - output_filename <- "parentage_testing_results.txt" tryCatch({ data.table::fwrite(final_df, file = output_filename, sep = "\t", quote = FALSE, na = na_string) @@ -381,7 +392,7 @@ find_parentage <- function(genotypes_file, parents_file, progeny_file, warning("Could not write results to file. Error: ", e$message, call. = FALSE) }) } - + #### Plot Results #### if (plot_results) { if (!requireNamespace("ggplot2", quietly = TRUE)) { @@ -389,26 +400,26 @@ find_parentage <- function(genotypes_file, parents_file, progeny_file, } else { plot_df <- final_df[!is.na(final_df$Mendelian_Error_Pct)] plot_df$Mendelian_Error_Pct <- base::as.numeric(plot_df$Mendelian_Error_Pct) - + plot_df$Plot_Status <- dplyr::case_when( plot_df$Assignment_Status == "PASS" ~ "PASS", plot_df$Assignment_Status == "HIGH_ERROR" ~ "HIGH_ERROR", plot_df$Assignment_Status == "LOW_MARKERS" ~ "LOW_MARKERS", TRUE ~ "OTHER" ) - + n_total <- base::nrow(plot_df) n_pass <- base::sum(plot_df$Assignment_Status == "PASS", na.rm = TRUE) n_high <- base::sum(plot_df$Assignment_Status == "HIGH_ERROR", na.rm = TRUE) n_low <- base::sum(plot_df$Assignment_Status == "LOW_MARKERS", na.rm = TRUE) - + threshold_label <- base::paste0( "Error Threshold: ", error_threshold, "% | ", "Pass: ", n_pass, " | ", "High Error: ", n_high, " | ", "Low Markers: ", n_low ) - + p <- ggplot2::ggplot( plot_df, ggplot2::aes(x = Mendelian_Error_Pct, fill = Plot_Status) @@ -421,7 +432,8 @@ find_parentage <- function(genotypes_file, parents_file, progeny_file, linewidth = 1 ) + ggplot2::scale_x_continuous(breaks = seq(0, 100, by = 5)) + - ggplot2::scale_y_continuous(breaks = seq(0, 10000, by = 5)) + + + ggplot2::scale_y_continuous(breaks = scales::pretty_breaks()) + ggplot2::scale_fill_manual( values = c( "PASS" = "#339900", @@ -445,11 +457,12 @@ find_parentage <- function(genotypes_file, parents_file, progeny_file, ) + ggplot2::theme_classic(base_size = 13) + ggplot2::theme(legend.position = "top") - + base::print(p) - + if (write_txt) { - plot_filename <- base::sub("\\.[^.]+$", ".jpg", "parentage_testing_results.txt") + + plot_filename <- paste0(tools::file_path_sans_ext(output_filename), ".jpg") tryCatch({ ggplot2::ggsave(plot_filename, plot = p, device = "jpeg", width = 10, height = 6, dpi = 300) @@ -460,7 +473,7 @@ find_parentage <- function(genotypes_file, parents_file, progeny_file, } } } - + if (verbose) { base::cat("\n--- Parentage Assignment Results ---\n") base::print(final_df) diff --git a/R/validate_pedigree.R b/R/validate_pedigree.R index a3526ce..8784914 100644 --- a/R/validate_pedigree.R +++ b/R/validate_pedigree.R @@ -33,6 +33,9 @@ #' is also TRUE, the plot is additionally saved as a .jpg file. #' @param na_string Character. String for missing values in the output file. #' Use \code{"NA"} or \code{""} (default: \code{"NA"}). +#' @param corrected_pedigree_filename Character. Path/name of the output file +#' for the corrected pedigree (default: "corrected_pedigree.txt"). Set to +#' NULL to suppress writing the corrected pedigree. #' #' @return A data.table (returned invisibly) with one row per trio and #' the following columns: @@ -65,10 +68,11 @@ validate_pedigree <- function(pedigree_file, genotypes_file, single_parent_error_threshold = 2.0, verbose = TRUE, write_results = TRUE, - output_filename = "__validation_report.txt", plot_results = TRUE, - na_string = "NA") { - + na_string = "NA", + output_filename = "__validation_report.txt", + corrected_pedigree_filename = "corrected_pedigree.txt") { + #### Input validation #### if (trio_error_threshold < 0 || trio_error_threshold > 100) stop("trio_error_threshold must be between 0 and 100") @@ -76,14 +80,14 @@ validate_pedigree <- function(pedigree_file, genotypes_file, stop("single_parent_error_threshold must be between 0 and 100") if (!na_string %in% c("NA", "")) stop("na_string must be either 'NA' or ''.") - + tryCatch({ pedigree <- data.table::fread(pedigree_file, sep = "auto", colClasses = "character") genos <- data.table::fread(genotypes_file, sep = "auto") }, error = function(e) { stop("Error reading input files. Ensure paths are correct and files are TXT/TSV/CSV.") }) - + #### Check required columns #### required_ped_cols <- c("ID", "Male_Parent", "Female_Parent") missing_cols <- base::setdiff(required_ped_cols, base::names(pedigree)) @@ -92,11 +96,11 @@ validate_pedigree <- function(pedigree_file, genotypes_file, base::paste(missing_cols, collapse = ", ")) if (!"ID" %in% base::names(genos)) stop("Genotypes file must have an 'ID' column") - + pedigree[, Male_Parent := as.character(Male_Parent)] pedigree[, Female_Parent := as.character(Female_Parent)] original_pedigree <- data.table::copy(pedigree) - + #### Read founders list #### if (!is.null(founders_file)) { founders_raw <- tryCatch({ @@ -108,7 +112,15 @@ validate_pedigree <- function(pedigree_file, genotypes_file, } else { founder_ids <- character(0) } - + + #### Build genotype matrices #### + genos_mat <- base::as.matrix(genos, rownames = "ID") + genos_hom <- data.table::copy(genos) + marker_cols <- base::setdiff(base::names(genos_hom), "ID") + for (col in marker_cols) + genos_hom[base::get(col) == 1, (col) := NA_integer_] + genos_hom_mat <- base::as.matrix(genos_hom, rownames = "ID") + #### Identify trios missing from the genotype file #### valid_ids <- as.character(genos$ID) has_geno <- pedigree[ID %in% valid_ids & @@ -123,15 +135,7 @@ validate_pedigree <- function(pedigree_file, genotypes_file, pedigree <- has_geno if (base::nrow(pedigree) == 0) stop("No valid trios remain after filtering for genotype availability.") - - #### Build genotype matrices #### - genos_mat <- base::as.matrix(genos, rownames = "ID") - genos_hom <- data.table::copy(genos) - marker_cols <- base::setdiff(base::names(genos_hom), "ID") - for (col in marker_cols) - genos_hom[base::get(col) == 1, (col) := NA_integer_] - genos_hom_mat <- base::as.matrix(genos_hom, rownames = "ID") - + #### Find best matching parent via homozygous mismatch #### find_best_parent <- function(prog_id, exclude_ids = base::character(0)) { candidates <- base::setdiff(base::rownames(genos_hom_mat), @@ -145,11 +149,14 @@ validate_pedigree <- function(pedigree_file, genotypes_file, if (comparisons == 0) return(NA_real_) (base::sum(cand_hom != prog_hom, na.rm = TRUE) / comparisons) * 100 }) + + if (base::all(base::is.na(errors))) + return(base::list(id = NA_character_, error_pct = NA_real_)) best_idx <- base::which.min(errors) base::list(id = candidates[best_idx], error_pct = base::round(errors[best_idx], 2)) } - + #### Main trio evaluation loop #### results_list <- base::lapply(base::seq_len(base::nrow(pedigree)), function(i) { prog_id <- pedigree$ID[i] @@ -165,7 +172,7 @@ validate_pedigree <- function(pedigree_file, genotypes_file, best_male_parent_pct <- NA_real_ best_female_parent <- NA_character_ best_female_parent_pct <- NA_real_ - + if (male_parent_id == "0" && female_parent_id == "0" && prog_id %in% founder_ids) { status <- "FOUNDERS" @@ -268,7 +275,7 @@ validate_pedigree <- function(pedigree_file, genotypes_file, } } } - + data.table::data.table( ID = prog_id, Orig_Male_Parent = male_parent_id, @@ -285,9 +292,9 @@ validate_pedigree <- function(pedigree_file, genotypes_file, Best_Female_Candidate_Error_Pct = best_female_parent_pct ) }) - + final_df <- data.table::rbindlist(results_list) - + #### Append NO_GENOTYPE_DATA rows #### if (base::nrow(no_geno_rows) > 0) { no_geno_df <- data.table::data.table( @@ -307,30 +314,34 @@ validate_pedigree <- function(pedigree_file, genotypes_file, ) final_df <- data.table::rbindlist(list(final_df, no_geno_df)) } - + #### Write corrected pedigree #### - corrected_pedigree <- data.table::copy(original_pedigree) - for (i in base::seq_len(base::nrow(final_df))) { - prog_id <- final_df$ID[i] - decision <- final_df$Recommended_Correction[i] - row_idx <- base::which(corrected_pedigree$ID == prog_id) - if (decision == "REMOVE_MALE_PARENT") { - data.table::set(corrected_pedigree, row_idx, "Male_Parent", "0") - } else if (decision == "REMOVE_FEMALE_PARENT") { - data.table::set(corrected_pedigree, row_idx, "Female_Parent", "0") - } else if (decision == "REMOVE_BOTH") { - data.table::set(corrected_pedigree, row_idx, "Male_Parent", "0") - data.table::set(corrected_pedigree, row_idx, "Female_Parent", "0") + + # pass NULL to suppress writing. + if (!is.null(corrected_pedigree_filename)) { + corrected_pedigree <- data.table::copy(original_pedigree) + for (i in base::seq_len(base::nrow(final_df))) { + prog_id <- final_df$ID[i] + decision <- final_df$Recommended_Correction[i] + row_idx <- base::which(corrected_pedigree$ID == prog_id) + if (decision == "REMOVE_MALE_PARENT") { + data.table::set(corrected_pedigree, row_idx, "Male_Parent", "0") + } else if (decision == "REMOVE_FEMALE_PARENT") { + data.table::set(corrected_pedigree, row_idx, "Female_Parent", "0") + } else if (decision == "REMOVE_BOTH") { + data.table::set(corrected_pedigree, row_idx, "Male_Parent", "0") + data.table::set(corrected_pedigree, row_idx, "Female_Parent", "0") + } } + tryCatch({ + data.table::fwrite(corrected_pedigree, file = corrected_pedigree_filename, + sep = "\t", quote = FALSE) + if (verbose) base::cat("Corrected pedigree written to:", corrected_pedigree_filename, "\n") + }, error = function(e) { + warning("Could not write corrected pedigree. Error: ", e$message, call. = FALSE) + }) } - tryCatch({ - data.table::fwrite(corrected_pedigree, file = "corrected_pedigree.txt", - sep = "\t", quote = FALSE) - if (verbose) base::cat("Corrected pedigree written to: corrected_pedigree.txt\n") - }, error = function(e) { - warning("Could not write corrected pedigree. Error: ", e$message, call. = FALSE) - }) - + #### Summary output #### if (verbose) { total_trios <- base::nrow(final_df) @@ -352,18 +363,18 @@ validate_pedigree <- function(pedigree_file, genotypes_file, base::cat("\n") base::print(final_df) } - + #### Write results #### if (write_results) { tryCatch({ data.table::fwrite(final_df, file = output_filename, - sep = "\t", quote = FALSE, na = na_string) # <-- na_string used here + sep = "\t", quote = FALSE, na = na_string) if (verbose) base::cat("Results written to:", output_filename, "\n") }, error = function(e) { warning("Could not write results. Error: ", e$message, call. = FALSE) }) } - + #### Plot results #### if (plot_results) { if (!requireNamespace("ggplot2", quietly = TRUE)) { @@ -421,7 +432,9 @@ validate_pedigree <- function(pedigree_file, genotypes_file, ggplot2::theme(legend.position = "top") print(p) if (write_results) { - plot_filename <- base::sub("\\.[^.]+$", ".jpg", output_filename) + # Fix #3: use tools::file_path_sans_ext() to reliably build the .jpg + # filename regardless of whether output_filename has an extension or not. + plot_filename <- paste0(tools::file_path_sans_ext(output_filename), ".jpg") tryCatch({ ggplot2::ggsave(plot_filename, plot = p, device = "jpeg", width = 10, height = 6, dpi = 300) @@ -432,6 +445,6 @@ validate_pedigree <- function(pedigree_file, genotypes_file, } } } - + return(base::invisible(final_df)) -} \ No newline at end of file +} diff --git a/man/find_parentage.Rd b/man/find_parentage.Rd index 44b9772..5a148af 100644 --- a/man/find_parentage.Rd +++ b/man/find_parentage.Rd @@ -15,6 +15,7 @@ find_parentage( allow_selfing = TRUE, verbose = TRUE, write_txt = TRUE, + output_filename = "parentage_testing_results.txt", na_string = "NA", plot_results = TRUE ) @@ -53,7 +54,12 @@ female parent IDs are excluded. Default is \code{TRUE}.} results to the console. Default is \code{TRUE}.} \item{write_txt}{Logical. If \code{TRUE}, writes results to -\code{parentage_testing_results.txt}. Default is \code{TRUE}.} +\code{output_filename}. The plot will be saved using the same base name +with a .jpg extension. Default is \code{TRUE}.} + +\item{output_filename}{Character. Path/name of the output results file +(default: \code{"parentage_testing_results.txt"}). The plot will be saved +using the same base name with a .jpg extension.} \item{na_string}{Character. String for missing values in output file. Use \code{"NA"} or \code{""} (default: \code{"NA"}).} diff --git a/man/validate_pedigree.Rd b/man/validate_pedigree.Rd index 8ec975b..b4987de 100644 --- a/man/validate_pedigree.Rd +++ b/man/validate_pedigree.Rd @@ -13,9 +13,10 @@ validate_pedigree( single_parent_error_threshold = 2, verbose = TRUE, write_results = TRUE, - output_filename = "__validation_report.txt", plot_results = TRUE, - na_string = "NA" + na_string = "NA", + output_filename = "__validation_report.txt", + corrected_pedigree_filename = "corrected_pedigree.txt" ) } \arguments{ @@ -45,16 +46,20 @@ table, and results to the console (default: TRUE).} \item{write_results}{Logical. If TRUE, writes the validation results to output_filename and saves the plot as a .jpg file (default: TRUE).} -\item{output_filename}{Character. Path/name of the output results file -(default: "__validation_report.txt"). The plot will be saved using the -same base name with a .jpg extension.} - \item{plot_results}{Logical. If TRUE, prints a histogram of Trio Mendelian Error percentages with a threshold line (default: TRUE). If write_results is also TRUE, the plot is additionally saved as a .jpg file.} \item{na_string}{Character. String for missing values in the output file. Use \code{"NA"} or \code{""} (default: \code{"NA"}).} + +\item{output_filename}{Character. Path/name of the output results file +(default: "__validation_report.txt"). The plot will be saved using the +same base name with a .jpg extension.} + +\item{corrected_pedigree_filename}{Character. Path/name of the output file +for the corrected pedigree (default: "corrected_pedigree.txt"). Set to +NULL to suppress writing the corrected pedigree.} } \value{ A data.table (returned invisibly) with one row per trio and From 506a47ff1dba98752d1e7910a251b6e5fdd66b8b Mon Sep 17 00:00:00 2001 From: "josue.chinchilla" Date: Mon, 27 Apr 2026 15:44:13 -0400 Subject: [PATCH 08/12] added edits suggested by copilot --- R/check_ped.R | 152 +++++++++++++++++++++++------------------------ man/check_ped.Rd | 53 ++++++++--------- 2 files changed, 101 insertions(+), 104 deletions(-) diff --git a/R/check_ped.R b/R/check_ped.R index 0ec5cf2..26e0d8a 100644 --- a/R/check_ped.R +++ b/R/check_ped.R @@ -1,35 +1,38 @@ -#' Check a pedigree file for structural issues and optionally apply corrections +#' @title Check a pedigree file for structural issues #' -#' Reads a tab-separated pedigree file with columns `id`, `male_parent`, and -#' `female_parent` and checks for five classes of issues: missing required -#' columns, exact duplicate rows, repeated IDs with conflicting parent -#' assignments, inconsistent parent sex roles, and pedigree cycles. The input -#' file is never modified. A report list is always assigned to the global -#' environment; when `correct = TRUE`, a corrected pedigree is assigned as well. -#' -#' When `correct = TRUE`, the function removes exact duplicates, collapses -#' conflicting repeated IDs (replacing the inconsistent parent field with `"0"`), -#' and appends missing parents as founder rows. Inconsistent sex roles and -#' pedigree cycles are reported but not corrected and must be resolved manually. +#' @description +#' Reads a tab‑separated pedigree file with columns `id`, +#' `male_parent`, and `female_parent` and checks for five classes of +#' errors: missing required columns, exact duplicate rows, +#' repeated IDs with conflicting parents, inconsistent parent +#' sex roles, and pedigree cycles. The input file is never +#' modified. A report list is always assigned to the global +#' environment; when `correct = TRUE` a corrected pedigree is +#' assigned as well. #' #' @param ped.file Path to the pedigree text file. #' @param seed Optional integer seed for reproducibility. -#' @param verbose Logical. If `TRUE` (default), prints findings to the console. -#' @param correct Logical. If `TRUE` (default), builds and assigns a corrected -#' pedigree to the global environment. If `FALSE`, issues are only reported. -#' @param save_zip Logical. If `TRUE`, bundles report components into a `.zip` -#' archive in the working directory. -#' @param save_corrected_zip Logical. If `TRUE` (requires `save_zip = TRUE` and -#' `correct = TRUE`), includes the corrected pedigree in the `.zip` archive. +#' @param verbose Logical. If `TRUE` (default), prints findings to the +#' console. +#' @param correct Logical. If `TRUE` (default), builds and assigns a +#' corrected pedigree to the global environment. If `FALSE`, issues +#' are only reported. +#' @param save_zip Logical. If `TRUE`, bundles report components into a +#' `.zip` archive in the working directory. +#' @param save_corrected_zip Logical. If `TRUE` (requires `save_zip = +#' TRUE` and `correct = TRUE`), includes the corrected pedigree in +#' the `.zip` archive. #' -#' @return An invisible named list of data frames describing detected issues: -#' * `exact_duplicates`: exact duplicate rows. -#' * `repeated_ids_diff`: rows where the same `id` has conflicting parents. -#' * `inconsistent_sex_roles`: rows involving IDs used as both parents. -#' * `missing_parents`: rows referencing parent IDs absent from `id`. -#' * `dependencies`: rows whose `id` is involved in a pedigree cycle. +#' @return An invisible named list of data frames describing detected +#' issues: +#' \item{exact_duplicates}{exact duplicate rows} +#' \item{repeated_ids_diff}{rows where the same `id` has conflicting parents} +#' \item{inconsistent_sex_roles}{rows involving IDs used as both parents} +#' \item{missing_parents}{rows referencing parent IDs absent from `id`} +#' \item{dependencies}{rows whose `id` is involved in a pedigree cycle} #' -#' @importFrom dplyr %>% group_by filter ungroup distinct mutate summarize first bind_rows n_distinct n select row_number +#' @importFrom dplyr %>% group_by filter ungroup distinct mutate +#' summarize first bind_rows n_distinct n select row_number #' @importFrom stats setNames #' @importFrom utils read.table write.table zip capture.output #' @importFrom tools file_path_sans_ext @@ -40,14 +43,11 @@ check_ped <- function(ped.file, correct = TRUE, save_zip = FALSE, save_corrected_zip = FALSE) { - #### setup #### if (!is.null(seed)) set.seed(seed) data <- utils::read.table(ped.file, header = TRUE) # ── Column alias remapping ────────────────────────────────────────────────── - # Accept common alternative column names and silently remap them so the rest - # of the function always works with the canonical names. col_aliases <- list( id = c("id", "ID", "animal", "Animal", "ind", "Ind"), male_parent = c("male_parent", "sire", "Sire", "father", "Father", "pat", "Pat"), @@ -73,48 +73,47 @@ check_ped <- function(ped.file, ) } data <- data %>% - mutate( + dplyr::mutate( id = as.character(id), male_parent = as.character(male_parent), female_parent = as.character(female_parent) ) # Add row numbers before any processing - data <- data %>% mutate(row_number = row_number(), .before = id) + data <- data %>% dplyr::mutate(row_number = dplyr::row_number(), .before = id) errors <- list() #### check 1: exact duplicates #### exact_duplicates <- data[ - duplicated(data %>% select(-row_number)) | - duplicated(data %>% select(-row_number), fromLast = TRUE), + duplicated(data %>% dplyr::select(-row_number)) | + duplicated(data %>% dplyr::select(-row_number), fromLast = TRUE), ] #### check 2: repeated IDs with conflicting male_parent/female_parent #### repeated_ids_diff <- data %>% - group_by(id) %>% - filter(n() > 1) %>% - filter(n_distinct(male_parent) > 1 | n_distinct(female_parent) > 1) %>% - ungroup() + dplyr::group_by(id) %>% + dplyr::filter(dplyr::n() > 1) %>% + dplyr::filter(dplyr::n_distinct(male_parent) > 1 | + dplyr::n_distinct(female_parent) > 1) %>% + dplyr::ungroup() #### check 3: inconsistent parent sex roles #### male_ids <- unique(data$male_parent[data$male_parent != "0"]) female_ids <- unique(data$female_parent[data$female_parent != "0"]) messy_ids <- intersect(male_ids, female_ids) inconsistent_sex_roles <- data %>% - filter(male_parent %in% messy_ids | female_parent %in% messy_ids) + dplyr::filter(male_parent %in% messy_ids | female_parent %in% messy_ids) #### check 4: missing parents #### all_ids <- unique(data$id) ref_ids <- unique(c(data$male_parent, data$female_parent)) - # Fix #1: trimws() strips whitespace-only strings; the combined filter then - # drops NA (which NA != "0" would not catch), empty strings, and "0" - # to prevent spurious founder rows with id = NA in the corrected pedigree. + + # Trim, remove NAs and zeros ref_ids <- trimws(ref_ids) ref_ids <- ref_ids[!is.na(ref_ids) & ref_ids != "" & ref_ids != "0"] - missing_ids <- setdiff(ref_ids, all_ids) missing_parents <- data %>% - filter(male_parent %in% missing_ids | female_parent %in% missing_ids) + dplyr::filter(male_parent %in% missing_ids | female_parent %in% missing_ids) # Only build founder rows if there are actually missing parents to add if (length(missing_ids) > 0) { @@ -173,7 +172,6 @@ check_ped <- function(ped.file, } return(all_cycles) } - cycles <- detect_all_cycles(data) cycle_ids <- character(0) if (length(cycles) > 0) { @@ -185,7 +183,7 @@ check_ped <- function(ped.file, paste(ids, collapse = " -> "))) } } - dependencies <- data %>% filter(id %in% cycle_ids) + dependencies <- data %>% dplyr::filter(id %in% cycle_ids) if (nrow(dependencies) == 0 && length(errors) > 0) { dependencies <- data.frame(Dependency = unique(unlist(errors)), stringsAsFactors = FALSE) @@ -202,27 +200,27 @@ check_ped <- function(ped.file, #### build corrected pedigree (no row_number column) #### if (correct) { - corrected <- data %>% select(-row_number) + corrected <- data %>% dplyr::select(-row_number) # Remove exact duplicates - corrected <- distinct(corrected) + corrected <- dplyr::distinct(corrected) # Resolve conflicting IDs if (nrow(repeated_ids_diff) > 0) { corrected <- corrected %>% - group_by(id) %>% - summarize( - male_parent = if (n_distinct(male_parent) > 1) "0" else first(male_parent), - female_parent = if (n_distinct(female_parent) > 1) "0" else first(female_parent), + dplyr::group_by(id) %>% + dplyr::summarize( + male_parent = if (dplyr::n_distinct(male_parent) > 1) "0" else dplyr::first(male_parent), + female_parent = if (dplyr::n_distinct(female_parent) > 1) "0" else dplyr::first(female_parent), .groups = "drop" ) } # Add missing founders if (length(missing_ids) > 0) { - corrected <- bind_rows( + corrected <- dplyr::bind_rows( corrected, - missing_founders %>% select(-row_number) + missing_founders %>% dplyr::select(-row_number) ) } } @@ -253,45 +251,42 @@ check_ped <- function(ped.file, } } - # Always save report to global environment - assign(report_name, input_ped_report, envir = .GlobalEnv) - - # Save corrected pedigree if correct = TRUE - if (correct) assign(corrected_name, corrected, envir = .GlobalEnv) + # Section headers matching the console output labels + section_labels <- c( + exact_duplicates = "Exact Duplicates", + repeated_ids_diff = "Conflicting IDs", + inconsistent_sex_roles = "Inconsistent Parent Sex Roles", + missing_parents = "Missing Parents (rows that reference them)", + dependencies = "Cycles / Dependencies" + ) - #### zip export #### if (save_zip) { tmp_dir <- tempfile() dir.create(tmp_dir) - # Fix #2: register cleanup so the temp directory is always removed on - # function exit, whether normal or due to an error. on.exit(unlink(tmp_dir, recursive = TRUE), add = TRUE) - zip_files <- character(0) - # Section headers matching the console output labels - section_labels <- c( - exact_duplicates = "Exact Duplicates", - repeated_ids_diff = "Conflicting IDs", - inconsistent_sex_roles = "Inconsistent Parent Sex Roles", - missing_parents = "Missing Parents (rows that reference them)", - dependencies = "Cycles / Dependencies" - ) for (component in names(section_labels)) { df <- input_ped_report[[component]] file_path <- file.path(tmp_dir, paste0(file_base, "_", component, ".txt")) if (nrow(df) == 0) { writeLines( - c(paste0("--- ", section_labels[[component]], " ---"), "None found."), + c(paste0("--- ", section_labels[[component]], " ---"), + "None found."), con = file_path ) } else { file_con <- file(file_path, open = "wt") - writeLines(paste0("--- ", section_labels[[component]], " ---"), con = file_con) - writeLines(paste(colnames(df), collapse = "\t"), con = file_con) + writeLines(paste0("--- ", section_labels[[component]], " ---"), + con = file_con) + writeLines(paste(colnames(df), collapse = "\t"), + con = file_con) close(file_con) - utils::write.table(df, file = file_path, sep = "\t", row.names = FALSE, - quote = FALSE, append = TRUE, col.names = FALSE) + utils::write.table(df, file = file_path, sep = "\t", + row.names = FALSE, + quote = FALSE, + append = TRUE, + col.names = FALSE) } zip_files <- c(zip_files, file_path) } @@ -299,7 +294,8 @@ check_ped <- function(ped.file, if (save_corrected_zip && correct) { corrected_path <- file.path(tmp_dir, paste0(corrected_name, ".txt")) utils::write.table(corrected, file = corrected_path, sep = "\t", - row.names = FALSE, quote = FALSE) + row.names = FALSE, + quote = FALSE) zip_files <- c(zip_files, corrected_path) } @@ -310,5 +306,9 @@ check_ped <- function(ped.file, if (verbose) cat(paste0("\nZip archive saved to: ", zip_path, "\n")) } + # Assign objects to the global environment + assign(report_name, input_ped_report, envir = .GlobalEnv) + if (correct) assign(corrected_name, corrected, envir = .GlobalEnv) + invisible(input_ped_report) } diff --git a/man/check_ped.Rd b/man/check_ped.Rd index bbae7c3..0cf3840 100644 --- a/man/check_ped.Rd +++ b/man/check_ped.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/check_ped.R \name{check_ped} \alias{check_ped} -\title{Check a pedigree file for structural issues and optionally apply corrections} +\title{Check a pedigree file for structural issues} \usage{ check_ped( ped.file, @@ -18,38 +18,35 @@ check_ped( \item{seed}{Optional integer seed for reproducibility.} -\item{verbose}{Logical. If \code{TRUE} (default), prints findings to the console.} +\item{verbose}{Logical. If \code{TRUE} (default), prints findings to the +console.} -\item{correct}{Logical. If \code{TRUE} (default), builds and assigns a corrected -pedigree to the global environment. If \code{FALSE}, issues are only reported.} +\item{correct}{Logical. If \code{TRUE} (default), builds and assigns a +corrected pedigree to the global environment. If \code{FALSE}, issues +are only reported.} -\item{save_zip}{Logical. If \code{TRUE}, bundles report components into a \code{.zip} -archive in the working directory.} +\item{save_zip}{Logical. If \code{TRUE}, bundles report components into a +\code{.zip} archive in the working directory.} -\item{save_corrected_zip}{Logical. If \code{TRUE} (requires \code{save_zip = TRUE} and -\code{correct = TRUE}), includes the corrected pedigree in the \code{.zip} archive.} +\item{save_corrected_zip}{Logical. If \code{TRUE} (requires \code{save_zip = TRUE} and \code{correct = TRUE}), includes the corrected pedigree in +the \code{.zip} archive.} } \value{ -An invisible named list of data frames describing detected issues: -\itemize{ -\item \code{exact_duplicates}: exact duplicate rows. -\item \code{repeated_ids_diff}: rows where the same \code{id} has conflicting parents. -\item \code{inconsistent_sex_roles}: rows involving IDs used as both parents. -\item \code{missing_parents}: rows referencing parent IDs absent from \code{id}. -\item \code{dependencies}: rows whose \code{id} is involved in a pedigree cycle. -} +An invisible named list of data frames describing detected +issues: +\item{exact_duplicates}{exact duplicate rows} +\item{repeated_ids_diff}{rows where the same \code{id} has conflicting parents} +\item{inconsistent_sex_roles}{rows involving IDs used as both parents} +\item{missing_parents}{rows referencing parent IDs absent from \code{id}} +\item{dependencies}{rows whose \code{id} is involved in a pedigree cycle} } \description{ -Reads a tab-separated pedigree file with columns \code{id}, \code{male_parent}, and -\code{female_parent} and checks for five classes of issues: missing required -columns, exact duplicate rows, repeated IDs with conflicting parent -assignments, inconsistent parent sex roles, and pedigree cycles. The input -file is never modified. A report list is always assigned to the global -environment; when \code{correct = TRUE}, a corrected pedigree is assigned as well. -} -\details{ -When \code{correct = TRUE}, the function removes exact duplicates, collapses -conflicting repeated IDs (replacing the inconsistent parent field with \code{"0"}), -and appends missing parents as founder rows. Inconsistent sex roles and -pedigree cycles are reported but not corrected and must be resolved manually. +Reads a tab‑separated pedigree file with columns \code{id}, +\code{male_parent}, and \code{female_parent} and checks for five classes of +errors: missing required columns, exact duplicate rows, +repeated IDs with conflicting parents, inconsistent parent +sex roles, and pedigree cycles. The input file is never +modified. A report list is always assigned to the global +environment; when \code{correct = TRUE} a corrected pedigree is +assigned as well. } From 977e0702fb52e430d2147333c352dbfe97cd0101 Mon Sep 17 00:00:00 2001 From: "josue.chinchilla" Date: Tue, 28 Apr 2026 11:15:05 -0400 Subject: [PATCH 09/12] modified default params --- R/check_ped.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R/check_ped.R b/R/check_ped.R index 26e0d8a..6fb77ef 100644 --- a/R/check_ped.R +++ b/R/check_ped.R @@ -41,8 +41,8 @@ check_ped <- function(ped.file, seed = NULL, verbose = TRUE, correct = TRUE, - save_zip = FALSE, - save_corrected_zip = FALSE) { + save_zip = TRUE, + save_corrected_zip = TRUE) { #### setup #### if (!is.null(seed)) set.seed(seed) data <- utils::read.table(ped.file, header = TRUE) From 5db5fa29477c69d830873ca6434671cf00b60d55 Mon Sep 17 00:00:00 2001 From: "josue.chinchilla" Date: Tue, 28 Apr 2026 13:13:50 -0400 Subject: [PATCH 10/12] Added authors fror functions and made headers concise --- NAMESPACE | 8 +- R/breedtools_functions.R | 282 +++++++++++++++------------------- R/check_ped.R | 225 ++++++++++++++------------- R/find_parentage.R | 2 + R/imputation_concordance.R | 95 +++++------- R/validate_pedigree.R | 3 + man/allele_freq_poly.Rd | 49 +++--- man/check_ped.Rd | 78 +++++++--- man/find_parentage.Rd | 3 + man/imputation_concordance.Rd | 82 +++++----- man/solve_composition_poly.Rd | 61 ++++---- man/validate_pedigree.Rd | 3 + 12 files changed, 433 insertions(+), 458 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index 4e518d1..c460be1 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -62,6 +62,13 @@ importFrom(dplyr,summarise) importFrom(dplyr,summarize) importFrom(dplyr,ungroup) importFrom(dplyr,where) +importFrom(ggplot2,aes) +importFrom(ggplot2,element_text) +importFrom(ggplot2,geom_bar) +importFrom(ggplot2,ggplot) +importFrom(ggplot2,labs) +importFrom(ggplot2,theme) +importFrom(ggplot2,theme_minimal) importFrom(pwalign,nucleotideSubstitutionMatrix) importFrom(pwalign,pairwiseAlignment) importFrom(readr,read_csv) @@ -73,7 +80,6 @@ importFrom(stats,reorder) importFrom(stats,setNames) importFrom(tibble,as_tibble) importFrom(tools,file_path_sans_ext) -importFrom(utils,capture.output) importFrom(utils,packageVersion) importFrom(utils,read.csv) importFrom(utils,read.table) diff --git a/R/breedtools_functions.R b/R/breedtools_functions.R index 5f21836..52a2d85 100644 --- a/R/breedtools_functions.R +++ b/R/breedtools_functions.R @@ -1,149 +1,129 @@ -#' Compute Allele Frequencies for Populations +#' Compute allele frequencies for populations #' -#' Computes allele frequencies for specified populations given SNP array data +#' Computes allele frequencies for specified populations from SNP array data +#' coded as dosage of allele B. #' -#' @param geno matrix of genotypes coded as the dosage of allele B \code{{0, 1, 2, ..., ploidy}} -#' with individuals in rows (named) and SNPs in columns (named) -#' @param populations list of named populations. Each population has a vector of IDs -#' that belong to the population. Allele frequencies will be derived from all animals -#' @param ploidy integer indicating the ploidy level (default is 2 for diploid) -#' @return data.frame consisting of allele_frequencies for populations (columns) for -#' each SNP (rows) -#' @references Funkhouser SA, Bates RO, Ernst CW, Newcom D, Steibel JP. Estimation of genome-wide and locus-specific -#' breed composition in pigs. Transl Anim Sci. 2017 Feb 1;1(1):36-44. +#' @param geno Numeric matrix of genotypes coded as dosage of allele B +#' \code{{0, 1, 2, ..., ploidy}}, with individuals in rows (named) and +#' SNPs in columns (named). +#' @param populations Named list of populations, each containing a character +#' vector of individual IDs belonging to that population. +#' @param ploidy Integer. Ploidy level of the species. Default is `2`. +#' +#' @return A data frame of allele frequencies with SNPs as rows and populations +#' as columns. +#' +#' @author Josué Chinchilla-Vargas +#' +#' @references Funkhouser SA, Bates RO, Ernst CW, Newcom D, Steibel JP. +#' Estimation of genome-wide and locus-specific breed composition in pigs. +#' _Transl Anim Sci._ 2017;1(1):36–44. #' #' @examples -#' # Example inputs #' geno_matrix <- matrix( -#' c(4, 1, 4, 0, # S1 -#' 2, 2, 1, 3, # S2 -#' 0, 4, 0, 4, # S3 -#' 3, 3, 2, 2, # S4 -#' 1, 4, 2, 3),# S5 -#' nrow = 4, ncol = 5, byrow = FALSE, # individuals=rows, SNPs=cols -#' dimnames = list(paste0("Ind", 1:4), paste0("S", 1:5)) +#' c(4, 1, 4, 0, +#' 2, 2, 1, 3, +#' 0, 4, 0, 4, +#' 3, 3, 2, 2, +#' 1, 4, 2, 3), +#' nrow = 4, ncol = 5, byrow = FALSE, +#' dimnames = list(paste0("Ind", 1:4), paste0("S", 1:5)) #' ) -#' -#'pop_list <- list( -#' PopA = c("Ind1", "Ind2"), -#' PopB = c("Ind3", "Ind4") +#' pop_list <- list( +#' PopA = c("Ind1", "Ind2"), +#' PopB = c("Ind3", "Ind4") #' ) -#' #' allele_freqs <- allele_freq_poly(geno = geno_matrix, populations = pop_list, ploidy = 4) #' print(allele_freqs) #' #' @export allele_freq_poly <- function(geno, populations, ploidy = 2) { - - # Initialize returned df + # Initialize result matrix X <- matrix(NA, nrow = ncol(geno), ncol = length(populations)) - - # Subset geno into different populations for (i in 1:length(populations)) { - - # Get name of ith item in the list (population name) pop_name <- names(populations[i]) - - # Subset geno to only include genotypes of IDs in pop - pop_geno <- geno[rownames(geno) %in% populations[[i]], ] - - # Calculate allele frequencies - al_freq <- colMeans(pop_geno, na.rm = TRUE) / ploidy - - # Add to X - X[, i] <- al_freq + pop_geno <- geno[base::rownames(geno) %in% populations[[i]], ] + al_freq <- base::colMeans(pop_geno, na.rm = TRUE) / ploidy + X[, i] <- al_freq } - - # Label X with populations and SNPs - colnames(X) <- names(populations) - rownames(X) <- colnames(geno) - + base::colnames(X) <- base::names(populations) + base::rownames(X) <- base::colnames(geno) return(X) } +#' Solve breed composition for a single animal via quadratic programming +#' +#' Internal helper that solves the constrained OLS problem for one animal, +#' returning breed proportion estimates and the R² of the fit. #' -#' Performs whole genome breed composition prediction. +#' @param Y Numeric vector of genotypes (named by SNP) for a single animal, +#' coded as dosage of allele B \code{{0, 1, 2, ..., ploidy}}. +#' @param X Numeric matrix of allele frequencies with SNPs as rows and +#' breeds/populations as columns. +#' +#' @return A named numeric vector of breed proportions plus `R2`. +#' +#' @references Funkhouser SA, Bates RO, Ernst CW, Newcom D, Steibel JP. +#' Estimation of genome-wide and locus-specific breed composition in pigs. +#' _Transl Anim Sci._ 2017;1(1):36–44. #' -#' @param Y numeric vector of genotypes (with names as SNPs) from a single animal. -#' coded as dosage of allele B \code{{0, 1, 2, ..., ploidy}} -#' @param X numeric matrix of allele frequencies from reference animals -#' @param p numeric indicating number of breeds represented in X -#' @param names character names of breeds -#' @return data.frame of breed composition estimates #' @import quadprog #' @importFrom stats cor -#' @references Funkhouser SA, Bates RO, Ernst CW, Newcom D, Steibel JP. Estimation of genome-wide and locus-specific -#' breed composition in pigs. Transl Anim Sci. 2017 Feb 1;1(1):36-44. #' #' @noRd QPsolve <- function(Y, X) { - - # Remove NAs from Y and remove corresponding - # SNPs from X. Ensure Y is numeric - Ymod <- Y[!is.na(Y)] - Xmod <- X[names(Ymod), ] - - # Determine properties from X matrix - the number of parameters (breeds) p - # and the names of those parameters. - p <- ncol(X) - names <- colnames(X) - - # perfom steps needed to solve OLS by framing - # as a QP problem - # Rinv - matrix should be of dimensions px(p+1) where p is the number of variables in X - Rinv <- solve(chol(t(Xmod) %*% Xmod)) - - # C - the first column is a sum restriction (all equal to 1) and the rest of the columns an identity matrix - C <- cbind(rep(1, p), diag(p)) - - # b2 - This should be a vector of length p+1 the first element is the value of the sum (1) - # the other elements are the restriction of individual coefficients (>) - # so a value 0 produces positive coefficients - b2 <- c(1, rep(0, p)) - - # dd - this should be a matrix NOT a vector - dd <- (t(Ymod) %*% Xmod) - - qp <- solve.QP(Dmat = Rinv, factorized = TRUE, dvec = dd, Amat = C, bvec = b2, meq = 1) - beta <- qp$solution - rr <- cor(Ymod, Xmod %*% beta)^2 + Ymod <- Y[!base::is.na(Y)] + Xmod <- X[base::names(Ymod), ] + p <- base::ncol(X) + nms <- base::colnames(X) + Rinv <- base::solve(base::chol(base::t(Xmod) %*% Xmod)) + C <- base::cbind(base::rep(1, p), base::diag(p)) + b2 <- c(1, base::rep(0, p)) + dd <- base::t(Ymod) %*% Xmod + qp <- quadprog::solve.QP(Dmat = Rinv, factorized = TRUE, dvec = dd, + Amat = C, bvec = b2, meq = 1) + beta <- qp$solution + rr <- stats::cor(Ymod, Xmod %*% beta)^2 result <- c(beta, rr) - names(result) <- c(names, "R2") + base::names(result) <- c(nms, "R2") return(result) } -#' Compute Genome-Wide Breed Composition -#' -#' Computes genome-wide breed/ancestry composition using quadratic programming on a -#' batch of animals. -#' -#' @param Y numeric matrix of genotypes (columns) from all animals (rows) in population -#' coded as dosage of allele B \code{{0, 1, 2, ..., ploidy}} -#' @param X numeric matrix of allele frequencies (rows) from each reference panel (columns). Frequencies are -#' relative to allele B. -#' @param ped data.frame giving pedigree information. Must be formatted "ID", "Sire", "Dam" -#' @param groups list of IDs categorized by breed/population. If specified, output will be a list -#' of results categorized by breed/population. -#' @param mia logical. Only applies if ped argument is supplied. If true, returns a data.frame -#' containing the inferred maternally inherited allele for each locus for each animal instead -#' of breed composition results. -#' @param sire logical. Only applies if ped argument is supplied. If true, returns a data.frame -#' containing sire genotypes for each locus for each animal instead of breed composition results. -#' @param dam logical. Only applies if ped argument is supplied. If true, returns a data.frame -#' containing dam genotypes for each locus for each animal instead of breed composition results. -#' @param ploidy integer. The ploidy level of the species (e.g., 2 for diploid, 3 for triploid, etc.). -#' @return A data.frame or list of data.frames (if groups is !NULL) with breed/ancestry composition -#' results -#' @import quadprog -#' @references Funkhouser SA, Bates RO, Ernst CW, Newcom D, Steibel JP. Estimation of genome-wide and locus-specific -#' breed composition in pigs. Transl Anim Sci. 2017 Feb 1;1(1):36-44. +#' Compute genome-wide breed composition #' -#' @examples -#' # Example inputs for solve_composition_poly (ploidy = 4) +#' Estimates genome-wide breed/ancestry composition for a batch of animals +#' using quadratic programming, with optional pedigree-assisted and +#' grouped-output modes. +#' +#' @param Y Numeric matrix of genotypes with individuals as rows and SNPs as +#' columns, coded as dosage of allele B \code{{0, 1, 2, ..., ploidy}}. +#' @param X Numeric matrix of allele frequencies with SNPs as rows and +#' breeds/populations as columns. +#' @param ped Optional data frame with pedigree information formatted with +#' columns `ID`, `Sire`, and `Dam`. When supplied, `QPsolve_par` is used +#' and only animals with genotyped parents are included. +#' @param groups Optional named list of IDs grouped by breed/population. +#' When supplied, results are returned as a named list partitioned by group. +#' @param mia Logical. Only applies when `ped` is supplied. If `TRUE`, returns +#' the inferred maternally inherited allele per locus per animal. Default `FALSE`. +#' @param sire Logical. Only applies when `ped` is supplied. If `TRUE`, returns +#' sire genotypes per locus per animal. Default `FALSE`. +#' @param dam Logical. Only applies when `ped` is supplied. If `TRUE`, returns +#' dam genotypes per locus per animal. Default `FALSE`. +#' @param ploidy Integer. Ploidy level of the species. Default is `2`. +#' +#' @return A data frame, or a named list of data frames when `groups` is +#' supplied, containing breed/ancestry composition estimates. +#' +#' @author Josué Chinchilla-Vargas #' -#' # (This would typically be the output from allele_freq_poly) +#' @references Funkhouser SA, Bates RO, Ernst CW, Newcom D, Steibel JP. +#' Estimation of genome-wide and locus-specific breed composition in pigs. +#' _Transl Anim Sci._ 2017;1(1):36–44. +#' +#' @examples #' allele_freqs_matrix <- matrix( #' c(0.625, 0.500, #' 0.500, 0.500, @@ -153,70 +133,50 @@ QPsolve <- function(Y, X) { #' nrow = 5, ncol = 2, byrow = TRUE, #' dimnames = list(paste0("SNP", 1:5), c("VarA", "VarB")) #' ) -#' -#' # Validation Genotypes (individuals x SNPs) #' val_geno_matrix <- matrix( -#' c(2, 1, 2, 3, 4, # Test1 dosages for SNP1-5 -#' 3, 4, 2, 3, 0), # Test2 dosages for SNP1-5 +#' c(2, 1, 2, 3, 4, +#' 3, 4, 2, 3, 0), #' nrow = 2, ncol = 5, byrow = TRUE, #' dimnames = list(paste0("Test", 1:2), paste0("SNP", 1:5)) #' ) -#' -#' # Calculate Breed Composition #' composition <- solve_composition_poly(Y = val_geno_matrix, #' X = allele_freqs_matrix, #' ploidy = 4) #' print(composition) #' +#' @import quadprog +#' #' @export solve_composition_poly <- function(Y, X, - ped = NULL, + ped = NULL, groups = NULL, - mia = FALSE, - sire = FALSE, - dam = FALSE, + mia = FALSE, + sire = FALSE, + dam = FALSE, ploidy = 2) { + # Transpose so Y is SNPs x animals, as required internally + Y <- base::t(Y) + # Retain only SNPs present in X + Y <- Y[base::rownames(Y) %in% base::rownames(X), ] + + if (!base::is.null(ped)) { + # Pedigree-assisted mode: use QPsolve_par for animals with genotyped parents + mat_results <- base::lapply(base::colnames(Y), QPsolve_par, Y, X, ped, + mia = mia, sire = sire, dam = dam) + mat_results_tab <- base::do.call(rbind, mat_results) + return(mat_results_tab) + + } else if (!base::is.null(groups)) { + # Grouped mode: standard computation, results partitioned by group + Y <- Y / ploidy + grouped_results <- base::lapply(groups, QPseparate, Y, X) + return(grouped_results) - # Functions require Y to be animals x SNPs. Transpose - Y <- t(Y) - - # SNPs in Y should only be the ones present in X - Y <- Y[rownames(Y) %in% rownames(X), ] - - # If ped is supplied, use QPsolve_par to compute genomic composition using - # only animals who have genotyped parents (by incorporating Sire genotype). - if (!is.null(ped)) { - mat_results <- lapply(colnames(Y), - QPsolve_par, - Y, - X, - ped, - mia = mia, - sire = sire, - dam = dam) - - mat_results_tab <- do.call(rbind, mat_results) - return (mat_results_tab) - - # Else if groups supplied - perform regular genomic computation - # and list results by groups - } else if (!is.null(groups)) { - - # When using regular genomic computation - adjust dosage based on ploidy - Y <- Y / ploidy #(default is 2) - - grouped_results <- lapply(groups, QPseparate, Y, X) - return (grouped_results) - - # If neither using the ped or grouping option - just perform normal, unsegregated - # calculation } else { - - # Adjust dosage based on ploidy (default is 2) - Y <- Y / ploidy - - results <- t(apply(Y, 2, QPsolve, X)) - return (results) + # Standard mode: unsegregated computation across all animals + Y <- Y / ploidy + results <- base::t(base::apply(Y, 2, QPsolve, X)) + return(results) } } diff --git a/R/check_ped.R b/R/check_ped.R index 6fb77ef..847ebe1 100644 --- a/R/check_ped.R +++ b/R/check_ped.R @@ -1,67 +1,74 @@ -#' @title Check a pedigree file for structural issues +#' Check and optionally correct a pedigree file #' -#' @description -#' Reads a tab‑separated pedigree file with columns `id`, -#' `male_parent`, and `female_parent` and checks for five classes of -#' errors: missing required columns, exact duplicate rows, -#' repeated IDs with conflicting parents, inconsistent parent -#' sex roles, and pedigree cycles. The input file is never -#' modified. A report list is always assigned to the global -#' environment; when `correct = TRUE` a corrected pedigree is -#' assigned as well. +#' Reads a tab-separated pedigree file with columns `id`, `male_parent`, and +#' `female_parent` (any column order), performs structural and consistency checks, +#' reports findings, and optionally builds a partially corrected pedigree object. +#' +#' The input file is never modified. Findings are printed to the console when +#' `verbose = TRUE`, a report list is assigned to the global environment, and when +#' `correct = TRUE`, a corrected pedigree data frame is also assigned there. +#' +#' @section Checks performed: +#' \describe{ +#' \item{Missing required columns}{Stops immediately if `id`, `male_parent`, or +#' `female_parent` are absent.} +#' \item{Exact duplicate rows}{Rows identical across all three fields. Removed +#' when `correct = TRUE`.} +#' \item{Repeated IDs with conflicting parents}{Same `id` with differing parent +#' assignments. Collapsed to one row when `correct = TRUE`; conflicting fields +#' are set to `"0"`.} +#' \item{Inconsistent sex roles}{IDs appearing as both `male_parent` and +#' `female_parent`. Reported only; no automatic correction.} +#' \item{Missing parents}{Parent IDs not listed as individuals. Reported, and +#' added as founder rows (`"0"/"0"`) when `correct = TRUE`.} +#' \item{Cycles}{Direct or indirect ancestry loops. Reported only; must be +#' resolved manually.} +#' } +#' +#' @section Correction behavior: +#' When `correct = TRUE`, the function removes exact duplicates, collapses +#' conflicting repeated IDs (replacing inconsistent parent fields with `"0"`), +#' and appends missing parents as founders. Inconsistent sex roles and cycles are +#' not automatically corrected. Running the function again on the corrected +#' pedigree is recommended to reassess downstream issues. #' #' @param ped.file Path to the pedigree text file. #' @param seed Optional integer seed for reproducibility. -#' @param verbose Logical. If `TRUE` (default), prints findings to the -#' console. -#' @param correct Logical. If `TRUE` (default), builds and assigns a -#' corrected pedigree to the global environment. If `FALSE`, issues -#' are only reported. -#' @param save_zip Logical. If `TRUE`, bundles report components into a -#' `.zip` archive in the working directory. -#' @param save_corrected_zip Logical. If `TRUE` (requires `save_zip = -#' TRUE` and `correct = TRUE`), includes the corrected pedigree in -#' the `.zip` archive. +#' @param verbose Logical. Print findings to the console? Default `TRUE`. +#' @param correct Logical. Build and save a corrected pedigree? Default `TRUE`. +#' @param save_zip Logical. Export report components as a `.zip` archive? Default `FALSE`. +#' @param save_corrected_zip Logical. Include corrected pedigree in the `.zip`? +#' Only applies when `save_zip = TRUE` and `correct = TRUE`. Default `FALSE`. #' -#' @return An invisible named list of data frames describing detected -#' issues: -#' \item{exact_duplicates}{exact duplicate rows} -#' \item{repeated_ids_diff}{rows where the same `id` has conflicting parents} -#' \item{inconsistent_sex_roles}{rows involving IDs used as both parents} -#' \item{missing_parents}{rows referencing parent IDs absent from `id`} -#' \item{dependencies}{rows whose `id` is involved in a pedigree cycle} +#' @return An invisible named list of data frames: +#' \describe{ +#' \item{exact_duplicates}{Exact duplicate rows.} +#' \item{repeated_ids_diff}{Rows with the same `id` but conflicting parent assignments.} +#' \item{inconsistent_sex_roles}{Rows involving IDs used as both male and female parent.} +#' \item{missing_parents}{Rows referencing parent IDs absent from `id`.} +#' \item{dependencies}{Rows whose `id` is involved in a detected cycle.} +#' } #' -#' @importFrom dplyr %>% group_by filter ungroup distinct mutate -#' summarize first bind_rows n_distinct n select row_number +#' @author Josué Chinchilla-Vargas +#' +#' @importFrom dplyr %>% group_by filter ungroup distinct mutate summarize first +#' bind_rows n_distinct n select row_number #' @importFrom stats setNames -#' @importFrom utils read.table write.table zip capture.output +#' @importFrom utils read.table write.table zip #' @importFrom tools file_path_sans_ext #' @export check_ped <- function(ped.file, seed = NULL, verbose = TRUE, correct = TRUE, - save_zip = TRUE, - save_corrected_zip = TRUE) { + save_zip = FALSE, + save_corrected_zip = FALSE) { + + #### setup #### #### setup #### if (!is.null(seed)) set.seed(seed) data <- utils::read.table(ped.file, header = TRUE) - # ── Column alias remapping ────────────────────────────────────────────────── - col_aliases <- list( - id = c("id", "ID", "animal", "Animal", "ind", "Ind"), - male_parent = c("male_parent", "sire", "Sire", "father", "Father", "pat", "Pat"), - female_parent = c("female_parent", "dam", "Dam", "mother", "Mother", "mat", "Mat") - ) - for (canonical in names(col_aliases)) { - if (!canonical %in% colnames(data)) { - match <- intersect(col_aliases[[canonical]], colnames(data)) - if (length(match) > 0) { - colnames(data)[colnames(data) == match[1]] <- canonical - } - } - } - # Validate required column names before any processing required_cols <- c("id", "male_parent", "female_parent") missing_cols <- setdiff(required_cols, colnames(data)) @@ -72,48 +79,47 @@ check_ped <- function(ped.file, ".\nExpected columns: id, male_parent, female_parent." ) } + data <- data %>% - dplyr::mutate( + mutate( id = as.character(id), male_parent = as.character(male_parent), female_parent = as.character(female_parent) ) - # Add row numbers before any processing - data <- data %>% dplyr::mutate(row_number = dplyr::row_number(), .before = id) + data <- data %>% mutate(row_number = row_number(), .before = id) + errors <- list() + # Add row numbers before any processing + data <- data %>% mutate(row_number = row_number(), .before = id) errors <- list() #### check 1: exact duplicates #### exact_duplicates <- data[ - duplicated(data %>% dplyr::select(-row_number)) | - duplicated(data %>% dplyr::select(-row_number), fromLast = TRUE), + duplicated(data %>% select(-row_number)) | + duplicated(data %>% select(-row_number), fromLast = TRUE), ] #### check 2: repeated IDs with conflicting male_parent/female_parent #### repeated_ids_diff <- data %>% - dplyr::group_by(id) %>% - dplyr::filter(dplyr::n() > 1) %>% - dplyr::filter(dplyr::n_distinct(male_parent) > 1 | - dplyr::n_distinct(female_parent) > 1) %>% - dplyr::ungroup() + group_by(id) %>% + filter(n() > 1) %>% + filter(n_distinct(male_parent) > 1 | n_distinct(female_parent) > 1) %>% + ungroup() #### check 3: inconsistent parent sex roles #### male_ids <- unique(data$male_parent[data$male_parent != "0"]) female_ids <- unique(data$female_parent[data$female_parent != "0"]) messy_ids <- intersect(male_ids, female_ids) inconsistent_sex_roles <- data %>% - dplyr::filter(male_parent %in% messy_ids | female_parent %in% messy_ids) + filter(male_parent %in% messy_ids | female_parent %in% messy_ids) #### check 4: missing parents #### - all_ids <- unique(data$id) - ref_ids <- unique(c(data$male_parent, data$female_parent)) - - # Trim, remove NAs and zeros - ref_ids <- trimws(ref_ids) - ref_ids <- ref_ids[!is.na(ref_ids) & ref_ids != "" & ref_ids != "0"] - missing_ids <- setdiff(ref_ids, all_ids) + all_ids <- unique(data$id) + ref_ids <- unique(c(data$male_parent, data$female_parent)) + ref_ids <- ref_ids[ref_ids != "0"] + missing_ids <- setdiff(ref_ids, all_ids) missing_parents <- data %>% - dplyr::filter(male_parent %in% missing_ids | female_parent %in% missing_ids) + filter(male_parent %in% missing_ids | female_parent %in% missing_ids) # Only build founder rows if there are actually missing parents to add if (length(missing_ids) > 0) { @@ -141,7 +147,6 @@ check_ped <- function(ped.file, c(row$male_parent, row$female_parent) }) names(adj_list) <- data$id - dfs <- function(node, visited, rec_stack, path) { visited[node] <- TRUE rec_stack[node] <- TRUE @@ -160,7 +165,6 @@ check_ped <- function(ped.file, rec_stack[node] <- FALSE return(cycles) } - visited <- stats::setNames(rep(FALSE, length(adj_list)), names(adj_list)) rec_stack <- stats::setNames(rep(FALSE, length(adj_list)), names(adj_list)) all_cycles <- list() @@ -172,6 +176,7 @@ check_ped <- function(ped.file, } return(all_cycles) } + cycles <- detect_all_cycles(data) cycle_ids <- character(0) if (length(cycles) > 0) { @@ -183,7 +188,7 @@ check_ped <- function(ped.file, paste(ids, collapse = " -> "))) } } - dependencies <- data %>% dplyr::filter(id %in% cycle_ids) + dependencies <- data %>% filter(id %in% cycle_ids) if (nrow(dependencies) == 0 && length(errors) > 0) { dependencies <- data.frame(Dependency = unique(unlist(errors)), stringsAsFactors = FALSE) @@ -200,27 +205,24 @@ check_ped <- function(ped.file, #### build corrected pedigree (no row_number column) #### if (correct) { - corrected <- data %>% dplyr::select(-row_number) - + corrected <- data %>% select(-row_number) # Remove exact duplicates - corrected <- dplyr::distinct(corrected) - + corrected <- distinct(corrected) # Resolve conflicting IDs if (nrow(repeated_ids_diff) > 0) { corrected <- corrected %>% - dplyr::group_by(id) %>% - dplyr::summarize( - male_parent = if (dplyr::n_distinct(male_parent) > 1) "0" else dplyr::first(male_parent), - female_parent = if (dplyr::n_distinct(female_parent) > 1) "0" else dplyr::first(female_parent), + group_by(id) %>% + summarize( + male_parent = if (n_distinct(male_parent) > 1) "0" else first(male_parent), + female_parent = if (n_distinct(female_parent) > 1) "0" else first(female_parent), .groups = "drop" ) } - # Add missing founders if (length(missing_ids) > 0) { - corrected <- dplyr::bind_rows( + corrected <- bind_rows( corrected, - missing_founders %>% dplyr::select(-row_number) + missing_founders %>% select(-row_number) ) } } @@ -230,7 +232,6 @@ check_ped <- function(ped.file, corrected_name <- paste0(file_base, "_corrected") report_name <- paste0(file_base, "_report") zip_name <- paste0(file_base, "_report.zip") - #### output #### if (verbose) { cat("\n=== Pedigree Quality Check Report ===\n") @@ -245,70 +246,68 @@ check_ped <- function(ped.file, cat("\n--- Cycles / Dependencies ---\n") if (nrow(dependencies) > 0) print(dependencies) else cat("None found.\n") if (correct) { - cat(paste0("`correct = TRUE`: saving corrected pedigree as `", corrected_name, "`.\n")) + cat(paste0("\n`correct = TRUE`: saving corrected pedigree as `", corrected_name, "`.\n")) } else { - cat("`correct = FALSE`: no changes made to the pedigree.\n") + cat("\n`correct = FALSE`: no changes made to the pedigree.\n") } } - # Section headers matching the console output labels - section_labels <- c( - exact_duplicates = "Exact Duplicates", - repeated_ids_diff = "Conflicting IDs", - inconsistent_sex_roles = "Inconsistent Parent Sex Roles", - missing_parents = "Missing Parents (rows that reference them)", - dependencies = "Cycles / Dependencies" - ) + # Always save report to global environment + assign(report_name, input_ped_report, envir = .GlobalEnv) + # Save corrected pedigree if correct = TRUE + if (correct) assign(corrected_name, corrected, envir = .GlobalEnv) + #### zip export #### if (save_zip) { - tmp_dir <- tempfile() + tmp_dir <- tempfile() dir.create(tmp_dir) - on.exit(unlink(tmp_dir, recursive = TRUE), add = TRUE) zip_files <- character(0) + # Section headers matching the console output labels + section_labels <- c( + exact_duplicates = "Exact Duplicates", + repeated_ids_diff = "Conflicting IDs", + inconsistent_sex_roles = "Inconsistent Parent Sex Roles", + missing_parents = "Missing Parents (rows that reference them)", + dependencies = "Cycles / Dependencies" + ) + for (component in names(section_labels)) { df <- input_ped_report[[component]] file_path <- file.path(tmp_dir, paste0(file_base, "_", component, ".txt")) + if (nrow(df) == 0) { + # Write header + "None found." for clean empty reports writeLines( - c(paste0("--- ", section_labels[[component]], " ---"), - "None found."), + c(paste0("--- ", section_labels[[component]], " ---"), "None found."), con = file_path ) } else { + # Write header, then column names, then data — avoids col.names warning file_con <- file(file_path, open = "wt") - writeLines(paste0("--- ", section_labels[[component]], " ---"), - con = file_con) - writeLines(paste(colnames(df), collapse = "\t"), - con = file_con) + writeLines(paste0("--- ", section_labels[[component]], " ---"), con = file_con) + writeLines(paste(colnames(df), collapse = "\t"), con = file_con) close(file_con) - utils::write.table(df, file = file_path, sep = "\t", - row.names = FALSE, - quote = FALSE, - append = TRUE, - col.names = FALSE) + utils::write.table(df, file = file_path, sep = "\t", row.names = FALSE, + quote = FALSE, append = TRUE, col.names = FALSE) } + zip_files <- c(zip_files, file_path) } + # Optionally include corrected pedigree in zip if (save_corrected_zip && correct) { corrected_path <- file.path(tmp_dir, paste0(corrected_name, ".txt")) utils::write.table(corrected, file = corrected_path, sep = "\t", - row.names = FALSE, - quote = FALSE) + row.names = FALSE, quote = FALSE) zip_files <- c(zip_files, corrected_path) } + # Bundle all files into zip in the working directory zip_path <- file.path(getwd(), zip_name) - invisible(capture.output( - utils::zip(zipfile = zip_path, files = zip_files, flags = "-j") - )) + utils::zip(zipfile = zip_path, files = zip_files, flags = "-j") if (verbose) cat(paste0("\nZip archive saved to: ", zip_path, "\n")) } - # Assign objects to the global environment - assign(report_name, input_ped_report, envir = .GlobalEnv) - if (correct) assign(corrected_name, corrected, envir = .GlobalEnv) - invisible(input_ped_report) } diff --git a/R/find_parentage.R b/R/find_parentage.R index 410981b..b02477c 100644 --- a/R/find_parentage.R +++ b/R/find_parentage.R @@ -90,6 +90,8 @@ #' ) #' } #' +#' @author Josué Chinchilla-Vargas +#' #' @importFrom data.table fread fwrite copy CJ rbindlist set data.table as.data.table #' @export find_parentage <- function(genotypes_file, parents_file, progeny_file, diff --git a/R/imputation_concordance.R b/R/imputation_concordance.R index 918071a..7ebbdcc 100644 --- a/R/imputation_concordance.R +++ b/R/imputation_concordance.R @@ -1,79 +1,60 @@ -#' Calculate Concordance between Imputed and Reference Genotypes -#' -#' This function calculates the concordance between imputed and reference -#' genotypes. It assumes that samples are rows and markers are columns. -#' Allele dosages (0, 1, 2) are recommended but other numeric formats are supported. -#' Missing data in either dataset can be excluded from the concordance calculation -#' using the `missing_code` argument. Specific markers can be excluded using -#' the `snps_2_exclude` argument. -#' -#' @param reference_genos A data frame containing reference genotype data, -#' with rows as samples and columns as markers. Must include a column named `ID`. -#' -#' @param imputed_genos A data frame containing imputed genotype data, -#' with rows as samples and columns as markers. Must include a column named `ID`. -#' -#' @param missing_code Optional value specifying missing data. If provided, -#' loci with this value in either dataset will be excluded from the concordance calculation. -#' -#' @param snps_2_exclude Optional vector of marker IDs to exclude from the concordance calculation. -#' -#' @param verbose Logical. If `TRUE`, prints summary statistics (minimum, quartiles, -#' median, mean, maximum) of concordance percentages. -#' -#' @param plot Logical. If `TRUE`, produces a bar plot of concordance percentage -#' by sample. -#' -#' @param print_result Logical. If `TRUE` (default), prints the concordance -#' results data frame to the console. If `FALSE`, results are returned invisibly. -#' -#' @return A data frame with: -#' \itemize{ -#' \item \code{ID}: Sample identifiers shared between the datasets. -#' \item \code{Concordance}: Percentage of matching genotypes per sample. +#' Calculate concordance between imputed and reference genotypes +#' +#' Compares imputed and reference genotype datasets sample by sample, returning +#' the percentage of matching genotypes per sample. Expects samples as rows and +#' markers as columns, with allele dosages (0, 1, 2) recommended. Other numeric +#' formats are supported. +#' +#' @param reference_genos A data frame of reference genotypes (samples × markers) +#' with a column named `ID`. +#' @param imputed_genos A data frame of imputed genotypes (samples × markers) +#' with a column named `ID`. +#' @param missing_code Optional value indicating missing data. Loci carrying this +#' value in either dataset are excluded from the concordance calculation. +#' @param snps_2_exclude Optional character vector of marker names to exclude. +#' @param verbose Logical. Print a five-number summary of concordance? Default `FALSE`. +#' @param plot Logical. Produce a bar plot of concordance by sample? Default `FALSE`. +#' @param print_result Logical. Print the results data frame to the console? +#' Default `TRUE`. If `FALSE`, results are returned invisibly. +#' +#' @return A data frame with columns: +#' \describe{ +#' \item{ID}{Sample identifiers shared between the two datasets.} +#' \item{Concordance}{Percentage of matching genotypes per sample.} #' } -#' If \code{print_result = FALSE}, the data frame is returned invisibly. #' #' @details -#' The function: -#' \enumerate{ -#' \item Identifies common samples and markers between the datasets. -#' \item Optionally excludes specified SNPs. -#' \item Removes loci with missing data (if \code{missing_code} is provided). -#' \item Computes per-sample concordance as the percentage of matching genotypes. -#' } -#' -#' When \code{plot = TRUE}, a bar plot showing concordance percentage per sample -#' is generated using \pkg{ggplot2}. -#' -#' @import dplyr +#' The function identifies common samples and markers between the two datasets, +#' optionally removes specified SNPs and loci with missing data, then computes +#' per-sample concordance as the percentage of matching genotypes over valid loci. +#' When `plot = TRUE`, a bar plot is produced with \pkg{ggplot2}. #' #' @examples #' ref <- data.frame( -#' ID = c("S1", "S2", "S3"), +#' ID = c("S1", "S2", "S3"), #' SNP1 = c(0, 1, 2), #' SNP2 = c(1, 1, 0), #' SNP3 = c(2, 5, 1) #' ) -#' -#' test <- data.frame( -#' ID = c("S1", "S2", "S3"), +#' imp <- data.frame( +#' ID = c("S1", "S2", "S3"), #' SNP1 = c(0, 0, 2), #' SNP2 = c(1, 1, 1), #' SNP3 = c(2, 5, 0) #' ) -#' -#' result <- imputation_concordance( +#' imputation_concordance( #' reference_genos = ref, -#' imputed_genos = test, -#' snps_2_exclude = "SNP2", -#' missing_code = 5, -#' print_result = FALSE +#' imputed_genos = imp, +#' snps_2_exclude = "SNP2", +#' missing_code = 5, +#' print_result = FALSE #' ) #' -#' result +#' @author Josué Chinchilla-Vargas #' +#' @importFrom dplyr %>% filter arrange #' @importFrom stats reorder +#' @importFrom ggplot2 ggplot aes geom_bar labs theme_minimal theme element_text #' @export imputation_concordance <- function(reference_genos, imputed_genos, diff --git a/R/validate_pedigree.R b/R/validate_pedigree.R index 8784914..c30491f 100644 --- a/R/validate_pedigree.R +++ b/R/validate_pedigree.R @@ -60,6 +60,9 @@ #' \item{Best_Female_Candidate_Error_Pct}{Homozygous mismatch percentage for the best female candidate.} #' } #' @export +#' +#' @author Josué Chinchilla-Vargas +#' #' @importFrom data.table fread fwrite copy data.table set rbindlist validate_pedigree <- function(pedigree_file, genotypes_file, founders_file = NULL, diff --git a/man/allele_freq_poly.Rd b/man/allele_freq_poly.Rd index 407d158..6d56f79 100644 --- a/man/allele_freq_poly.Rd +++ b/man/allele_freq_poly.Rd @@ -2,48 +2,51 @@ % Please edit documentation in R/breedtools_functions.R \name{allele_freq_poly} \alias{allele_freq_poly} -\title{Compute Allele Frequencies for Populations} +\title{Compute allele frequencies for populations} \usage{ allele_freq_poly(geno, populations, ploidy = 2) } \arguments{ -\item{geno}{matrix of genotypes coded as the dosage of allele B \code{{0, 1, 2, ..., ploidy}} -with individuals in rows (named) and SNPs in columns (named)} +\item{geno}{Numeric matrix of genotypes coded as dosage of allele B +\code{{0, 1, 2, ..., ploidy}}, with individuals in rows (named) and +SNPs in columns (named).} -\item{populations}{list of named populations. Each population has a vector of IDs -that belong to the population. Allele frequencies will be derived from all animals} +\item{populations}{Named list of populations, each containing a character +vector of individual IDs belonging to that population.} -\item{ploidy}{integer indicating the ploidy level (default is 2 for diploid)} +\item{ploidy}{Integer. Ploidy level of the species. Default is \code{2}.} } \value{ -data.frame consisting of allele_frequencies for populations (columns) for -each SNP (rows) +A data frame of allele frequencies with SNPs as rows and populations +as columns. } \description{ -Computes allele frequencies for specified populations given SNP array data +Computes allele frequencies for specified populations from SNP array data +coded as dosage of allele B. } \examples{ -# Example inputs geno_matrix <- matrix( -c(4, 1, 4, 0, # S1 - 2, 2, 1, 3, # S2 - 0, 4, 0, 4, # S3 - 3, 3, 2, 2, # S4 - 1, 4, 2, 3),# S5 -nrow = 4, ncol = 5, byrow = FALSE, # individuals=rows, SNPs=cols -dimnames = list(paste0("Ind", 1:4), paste0("S", 1:5)) + c(4, 1, 4, 0, + 2, 2, 1, 3, + 0, 4, 0, 4, + 3, 3, 2, 2, + 1, 4, 2, 3), + nrow = 4, ncol = 5, byrow = FALSE, + dimnames = list(paste0("Ind", 1:4), paste0("S", 1:5)) ) - pop_list <- list( -PopA = c("Ind1", "Ind2"), -PopB = c("Ind3", "Ind4") + PopA = c("Ind1", "Ind2"), + PopB = c("Ind3", "Ind4") ) - allele_freqs <- allele_freq_poly(geno = geno_matrix, populations = pop_list, ploidy = 4) print(allele_freqs) } \references{ -Funkhouser SA, Bates RO, Ernst CW, Newcom D, Steibel JP. Estimation of genome-wide and locus-specific -breed composition in pigs. Transl Anim Sci. 2017 Feb 1;1(1):36-44. +Funkhouser SA, Bates RO, Ernst CW, Newcom D, Steibel JP. +Estimation of genome-wide and locus-specific breed composition in pigs. +\emph{Transl Anim Sci.} 2017;1(1):36–44. +} +\author{ +Josué Chinchilla-Vargas } diff --git a/man/check_ped.Rd b/man/check_ped.Rd index 0cf3840..8df46e8 100644 --- a/man/check_ped.Rd +++ b/man/check_ped.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/check_ped.R \name{check_ped} \alias{check_ped} -\title{Check a pedigree file for structural issues} +\title{Check and optionally correct a pedigree file} \usage{ check_ped( ped.file, @@ -18,35 +18,63 @@ check_ped( \item{seed}{Optional integer seed for reproducibility.} -\item{verbose}{Logical. If \code{TRUE} (default), prints findings to the -console.} +\item{verbose}{Logical. Print findings to the console? Default \code{TRUE}.} -\item{correct}{Logical. If \code{TRUE} (default), builds and assigns a -corrected pedigree to the global environment. If \code{FALSE}, issues -are only reported.} +\item{correct}{Logical. Build and save a corrected pedigree? Default \code{TRUE}.} -\item{save_zip}{Logical. If \code{TRUE}, bundles report components into a -\code{.zip} archive in the working directory.} +\item{save_zip}{Logical. Export report components as a \code{.zip} archive? Default \code{FALSE}.} -\item{save_corrected_zip}{Logical. If \code{TRUE} (requires \code{save_zip = TRUE} and \code{correct = TRUE}), includes the corrected pedigree in -the \code{.zip} archive.} +\item{save_corrected_zip}{Logical. Include corrected pedigree in the \code{.zip}? +Only applies when \code{save_zip = TRUE} and \code{correct = TRUE}. Default \code{FALSE}.} } \value{ -An invisible named list of data frames describing detected -issues: -\item{exact_duplicates}{exact duplicate rows} -\item{repeated_ids_diff}{rows where the same \code{id} has conflicting parents} -\item{inconsistent_sex_roles}{rows involving IDs used as both parents} -\item{missing_parents}{rows referencing parent IDs absent from \code{id}} -\item{dependencies}{rows whose \code{id} is involved in a pedigree cycle} +An invisible named list of data frames: +\describe{ +\item{exact_duplicates}{Exact duplicate rows.} +\item{repeated_ids_diff}{Rows with the same \code{id} but conflicting parent assignments.} +\item{inconsistent_sex_roles}{Rows involving IDs used as both male and female parent.} +\item{missing_parents}{Rows referencing parent IDs absent from \code{id}.} +\item{dependencies}{Rows whose \code{id} is involved in a detected cycle.} +} } \description{ -Reads a tab‑separated pedigree file with columns \code{id}, -\code{male_parent}, and \code{female_parent} and checks for five classes of -errors: missing required columns, exact duplicate rows, -repeated IDs with conflicting parents, inconsistent parent -sex roles, and pedigree cycles. The input file is never -modified. A report list is always assigned to the global -environment; when \code{correct = TRUE} a corrected pedigree is -assigned as well. +Reads a tab-separated pedigree file with columns \code{id}, \code{male_parent}, and +\code{female_parent} (any column order), performs structural and consistency checks, +reports findings, and optionally builds a partially corrected pedigree object. +} +\details{ +The input file is never modified. Findings are printed to the console when +\code{verbose = TRUE}, a report list is assigned to the global environment, and when +\code{correct = TRUE}, a corrected pedigree data frame is also assigned there. +} +\section{Checks performed}{ + +\describe{ +\item{Missing required columns}{Stops immediately if \code{id}, \code{male_parent}, or +\code{female_parent} are absent.} +\item{Exact duplicate rows}{Rows identical across all three fields. Removed +when \code{correct = TRUE}.} +\item{Repeated IDs with conflicting parents}{Same \code{id} with differing parent +assignments. Collapsed to one row when \code{correct = TRUE}; conflicting fields +are set to \code{"0"}.} +\item{Inconsistent sex roles}{IDs appearing as both \code{male_parent} and +\code{female_parent}. Reported only; no automatic correction.} +\item{Missing parents}{Parent IDs not listed as individuals. Reported, and +added as founder rows (\code{"0"/"0"}) when \code{correct = TRUE}.} +\item{Cycles}{Direct or indirect ancestry loops. Reported only; must be +resolved manually.} +} +} + +\section{Correction behavior}{ + +When \code{correct = TRUE}, the function removes exact duplicates, collapses +conflicting repeated IDs (replacing inconsistent parent fields with \code{"0"}), +and appends missing parents as founders. Inconsistent sex roles and cycles are +not automatically corrected. Running the function again on the corrected +pedigree is recommended to reassess downstream issues. +} + +\author{ +Josué Chinchilla-Vargas } diff --git a/man/find_parentage.Rd b/man/find_parentage.Rd index 5a148af..55b26e8 100644 --- a/man/find_parentage.Rd +++ b/man/find_parentage.Rd @@ -126,3 +126,6 @@ results <- find_parentage( } } +\author{ +Josué Chinchilla-Vargas +} diff --git a/man/imputation_concordance.Rd b/man/imputation_concordance.Rd index 22e9462..f96493f 100644 --- a/man/imputation_concordance.Rd +++ b/man/imputation_concordance.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/imputation_concordance.R \name{imputation_concordance} \alias{imputation_concordance} -\title{Calculate Concordance between Imputed and Reference Genotypes} +\title{Calculate concordance between imputed and reference genotypes} \usage{ imputation_concordance( reference_genos, @@ -15,77 +15,65 @@ imputation_concordance( ) } \arguments{ -\item{reference_genos}{A data frame containing reference genotype data, -with rows as samples and columns as markers. Must include a column named \code{ID}.} +\item{reference_genos}{A data frame of reference genotypes (samples × markers) +with a column named \code{ID}.} -\item{imputed_genos}{A data frame containing imputed genotype data, -with rows as samples and columns as markers. Must include a column named \code{ID}.} +\item{imputed_genos}{A data frame of imputed genotypes (samples × markers) +with a column named \code{ID}.} -\item{missing_code}{Optional value specifying missing data. If provided, -loci with this value in either dataset will be excluded from the concordance calculation.} +\item{missing_code}{Optional value indicating missing data. Loci carrying this +value in either dataset are excluded from the concordance calculation.} -\item{snps_2_exclude}{Optional vector of marker IDs to exclude from the concordance calculation.} +\item{snps_2_exclude}{Optional character vector of marker names to exclude.} -\item{verbose}{Logical. If \code{TRUE}, prints summary statistics (minimum, quartiles, -median, mean, maximum) of concordance percentages.} +\item{verbose}{Logical. Print a five-number summary of concordance? Default \code{FALSE}.} -\item{plot}{Logical. If \code{TRUE}, produces a bar plot of concordance percentage -by sample.} +\item{plot}{Logical. Produce a bar plot of concordance by sample? Default \code{FALSE}.} -\item{print_result}{Logical. If \code{TRUE} (default), prints the concordance -results data frame to the console. If \code{FALSE}, results are returned invisibly.} +\item{print_result}{Logical. Print the results data frame to the console? +Default \code{TRUE}. If \code{FALSE}, results are returned invisibly.} } \value{ -A data frame with: -\itemize{ -\item \code{ID}: Sample identifiers shared between the datasets. -\item \code{Concordance}: Percentage of matching genotypes per sample. +A data frame with columns: +\describe{ +\item{ID}{Sample identifiers shared between the two datasets.} +\item{Concordance}{Percentage of matching genotypes per sample.} } -If \code{print_result = FALSE}, the data frame is returned invisibly. } \description{ -This function calculates the concordance between imputed and reference -genotypes. It assumes that samples are rows and markers are columns. -Allele dosages (0, 1, 2) are recommended but other numeric formats are supported. -Missing data in either dataset can be excluded from the concordance calculation -using the \code{missing_code} argument. Specific markers can be excluded using -the \code{snps_2_exclude} argument. +Compares imputed and reference genotype datasets sample by sample, returning +the percentage of matching genotypes per sample. Expects samples as rows and +markers as columns, with allele dosages (0, 1, 2) recommended. Other numeric +formats are supported. } \details{ -The function: -\enumerate{ -\item Identifies common samples and markers between the datasets. -\item Optionally excludes specified SNPs. -\item Removes loci with missing data (if \code{missing_code} is provided). -\item Computes per-sample concordance as the percentage of matching genotypes. -} - -When \code{plot = TRUE}, a bar plot showing concordance percentage per sample -is generated using \pkg{ggplot2}. +The function identifies common samples and markers between the two datasets, +optionally removes specified SNPs and loci with missing data, then computes +per-sample concordance as the percentage of matching genotypes over valid loci. +When \code{plot = TRUE}, a bar plot is produced with \pkg{ggplot2}. } \examples{ ref <- data.frame( - ID = c("S1", "S2", "S3"), + ID = c("S1", "S2", "S3"), SNP1 = c(0, 1, 2), SNP2 = c(1, 1, 0), SNP3 = c(2, 5, 1) ) - -test <- data.frame( - ID = c("S1", "S2", "S3"), +imp <- data.frame( + ID = c("S1", "S2", "S3"), SNP1 = c(0, 0, 2), SNP2 = c(1, 1, 1), SNP3 = c(2, 5, 0) ) - -result <- imputation_concordance( +imputation_concordance( reference_genos = ref, - imputed_genos = test, - snps_2_exclude = "SNP2", - missing_code = 5, - print_result = FALSE + imputed_genos = imp, + snps_2_exclude = "SNP2", + missing_code = 5, + print_result = FALSE ) -result - +} +\author{ +Josué Chinchilla-Vargas } diff --git a/man/solve_composition_poly.Rd b/man/solve_composition_poly.Rd index bd92739..128f7ad 100644 --- a/man/solve_composition_poly.Rd +++ b/man/solve_composition_poly.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/breedtools_functions.R \name{solve_composition_poly} \alias{solve_composition_poly} -\title{Compute Genome-Wide Breed Composition} +\title{Compute genome-wide breed composition} \usage{ solve_composition_poly( Y, @@ -16,41 +16,40 @@ solve_composition_poly( ) } \arguments{ -\item{Y}{numeric matrix of genotypes (columns) from all animals (rows) in population -coded as dosage of allele B \code{{0, 1, 2, ..., ploidy}}} +\item{Y}{Numeric matrix of genotypes with individuals as rows and SNPs as +columns, coded as dosage of allele B \code{{0, 1, 2, ..., ploidy}}.} -\item{X}{numeric matrix of allele frequencies (rows) from each reference panel (columns). Frequencies are -relative to allele B.} +\item{X}{Numeric matrix of allele frequencies with SNPs as rows and +breeds/populations as columns.} -\item{ped}{data.frame giving pedigree information. Must be formatted "ID", "Sire", "Dam"} +\item{ped}{Optional data frame with pedigree information formatted with +columns \code{ID}, \code{Sire}, and \code{Dam}. When supplied, \code{QPsolve_par} is used +and only animals with genotyped parents are included.} -\item{groups}{list of IDs categorized by breed/population. If specified, output will be a list -of results categorized by breed/population.} +\item{groups}{Optional named list of IDs grouped by breed/population. +When supplied, results are returned as a named list partitioned by group.} -\item{mia}{logical. Only applies if ped argument is supplied. If true, returns a data.frame -containing the inferred maternally inherited allele for each locus for each animal instead -of breed composition results.} +\item{mia}{Logical. Only applies when \code{ped} is supplied. If \code{TRUE}, returns +the inferred maternally inherited allele per locus per animal. Default \code{FALSE}.} -\item{sire}{logical. Only applies if ped argument is supplied. If true, returns a data.frame -containing sire genotypes for each locus for each animal instead of breed composition results.} +\item{sire}{Logical. Only applies when \code{ped} is supplied. If \code{TRUE}, returns +sire genotypes per locus per animal. Default \code{FALSE}.} -\item{dam}{logical. Only applies if ped argument is supplied. If true, returns a data.frame -containing dam genotypes for each locus for each animal instead of breed composition results.} +\item{dam}{Logical. Only applies when \code{ped} is supplied. If \code{TRUE}, returns +dam genotypes per locus per animal. Default \code{FALSE}.} -\item{ploidy}{integer. The ploidy level of the species (e.g., 2 for diploid, 3 for triploid, etc.).} +\item{ploidy}{Integer. Ploidy level of the species. Default is \code{2}.} } \value{ -A data.frame or list of data.frames (if groups is !NULL) with breed/ancestry composition -results +A data frame, or a named list of data frames when \code{groups} is +supplied, containing breed/ancestry composition estimates. } \description{ -Computes genome-wide breed/ancestry composition using quadratic programming on a -batch of animals. +Estimates genome-wide breed/ancestry composition for a batch of animals +using quadratic programming, with optional pedigree-assisted and +grouped-output modes. } \examples{ -# Example inputs for solve_composition_poly (ploidy = 4) - -# (This would typically be the output from allele_freq_poly) allele_freqs_matrix <- matrix( c(0.625, 0.500, 0.500, 0.500, @@ -60,16 +59,12 @@ allele_freqs_matrix <- matrix( nrow = 5, ncol = 2, byrow = TRUE, dimnames = list(paste0("SNP", 1:5), c("VarA", "VarB")) ) - -# Validation Genotypes (individuals x SNPs) val_geno_matrix <- matrix( - c(2, 1, 2, 3, 4, # Test1 dosages for SNP1-5 - 3, 4, 2, 3, 0), # Test2 dosages for SNP1-5 + c(2, 1, 2, 3, 4, + 3, 4, 2, 3, 0), nrow = 2, ncol = 5, byrow = TRUE, dimnames = list(paste0("Test", 1:2), paste0("SNP", 1:5)) ) - -# Calculate Breed Composition composition <- solve_composition_poly(Y = val_geno_matrix, X = allele_freqs_matrix, ploidy = 4) @@ -77,6 +72,10 @@ print(composition) } \references{ -Funkhouser SA, Bates RO, Ernst CW, Newcom D, Steibel JP. Estimation of genome-wide and locus-specific -breed composition in pigs. Transl Anim Sci. 2017 Feb 1;1(1):36-44. +Funkhouser SA, Bates RO, Ernst CW, Newcom D, Steibel JP. +Estimation of genome-wide and locus-specific breed composition in pigs. +\emph{Transl Anim Sci.} 2017;1(1):36–44. +} +\author{ +Josué Chinchilla-Vargas } diff --git a/man/validate_pedigree.Rd b/man/validate_pedigree.Rd index b4987de..08c7329 100644 --- a/man/validate_pedigree.Rd +++ b/man/validate_pedigree.Rd @@ -93,3 +93,6 @@ are declared founders (both parents coded as 0) are preserved unchanged with no recommendations. Trios removed due to missing genotype data are retained in the output with a NO_GENOTYPE_DATA status. } +\author{ +Josué Chinchilla-Vargas +} From 6b84e18653c7072657e942aea90b27253a7bbff1 Mon Sep 17 00:00:00 2001 From: "josue.chinchilla" Date: Wed, 20 May 2026 09:53:16 -0400 Subject: [PATCH 11/12] staged comments for next push --- DESCRIPTION | 2 +- R/check_ped.R | 8 +++----- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 0524f2f..b71abd1 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -44,7 +44,6 @@ URL: https://github.com/Breeding-Insight/BIGr BugReports: https://github.com/Breeding-Insight/BIGr/issues Encoding: UTF-8 Roxygen: list(markdown = TRUE) -RoxygenNote: 7.3.3 Depends: R (>= 4.4.0) biocViews: Imports: @@ -74,3 +73,4 @@ Suggests: polyRAD, testthat (>= 3.0.0) RdMacros: Rdpack +Config/roxygen2/version: 8.0.0 diff --git a/R/check_ped.R b/R/check_ped.R index 847ebe1..a948138 100644 --- a/R/check_ped.R +++ b/R/check_ped.R @@ -51,8 +51,7 @@ #' #' @author Josué Chinchilla-Vargas #' -#' @importFrom dplyr %>% group_by filter ungroup distinct mutate summarize first -#' bind_rows n_distinct n select row_number +#' @importFrom dplyr %>% group_by filter ungroup distinct mutate summarize first bind_rows n_distinct n select row_number #' @importFrom stats setNames #' @importFrom utils read.table write.table zip #' @importFrom tools file_path_sans_ext @@ -60,11 +59,10 @@ check_ped <- function(ped.file, seed = NULL, verbose = TRUE, - correct = TRUE, - save_zip = FALSE, + correct = TRUE, #add arguments for removing/filtering conflicting ids, repeated ids and founders are always fixed, dependencies are always manual. + save_zip = FALSE, #remove all save zips, export each report and corrected_pedigree as a list save_corrected_zip = FALSE) { - #### setup #### #### setup #### if (!is.null(seed)) set.seed(seed) data <- utils::read.table(ped.file, header = TRUE) From 4eb79bb186b83c6a52d0e8620419501719597474 Mon Sep 17 00:00:00 2001 From: "josue.chinchilla" Date: Wed, 20 May 2026 11:09:30 -0400 Subject: [PATCH 12/12] removed zip option and added specific correction arguments --- R/check_ped.R | 386 +++++++++++++++++++++++--------------------------- 1 file changed, 177 insertions(+), 209 deletions(-) diff --git a/R/check_ped.R b/R/check_ped.R index a948138..6f74732 100644 --- a/R/check_ped.R +++ b/R/check_ped.R @@ -1,71 +1,62 @@ -#' Check and optionally correct a pedigree file +#' Check a pedigree file for accuracy and report and correct common errors #' -#' Reads a tab-separated pedigree file with columns `id`, `male_parent`, and -#' `female_parent` (any column order), performs structural and consistency checks, -#' reports findings, and optionally builds a partially corrected pedigree object. +#' Reads a 3-column pedigree file (tab-separated, columns id, male_parent, +#' female_parent in any order) and performs quality checks, optionally +#' correcting detected errors. #' -#' The input file is never modified. Findings are printed to the console when -#' `verbose = TRUE`, a report list is assigned to the global environment, and when -#' `correct = TRUE`, a corrected pedigree data frame is also assigned there. -#' -#' @section Checks performed: -#' \describe{ -#' \item{Missing required columns}{Stops immediately if `id`, `male_parent`, or -#' `female_parent` are absent.} -#' \item{Exact duplicate rows}{Rows identical across all three fields. Removed -#' when `correct = TRUE`.} -#' \item{Repeated IDs with conflicting parents}{Same `id` with differing parent -#' assignments. Collapsed to one row when `correct = TRUE`; conflicting fields -#' are set to `"0"`.} -#' \item{Inconsistent sex roles}{IDs appearing as both `male_parent` and -#' `female_parent`. Reported only; no automatic correction.} -#' \item{Missing parents}{Parent IDs not listed as individuals. Reported, and -#' added as founder rows (`"0"/"0"`) when `correct = TRUE`.} -#' \item{Cycles}{Direct or indirect ancestry loops. Reported only; must be -#' resolved manually.} -#' } -#' -#' @section Correction behavior: -#' When `correct = TRUE`, the function removes exact duplicates, collapses -#' conflicting repeated IDs (replacing inconsistent parent fields with `"0"`), -#' and appends missing parents as founders. Inconsistent sex roles and cycles are -#' not automatically corrected. Running the function again on the corrected -#' pedigree is recommended to reassess downstream issues. +#' Exact duplicate rows and missing parents are always corrected. +#' Conflicting trios and inconsistent sex roles are corrected when their +#' respective arguments are TRUE. Cycles and dependencies are always +#' reported only and must be resolved manually. #' #' @param ped.file Path to the pedigree text file. #' @param seed Optional integer seed for reproducibility. -#' @param verbose Logical. Print findings to the console? Default `TRUE`. -#' @param correct Logical. Build and save a corrected pedigree? Default `TRUE`. -#' @param save_zip Logical. Export report components as a `.zip` archive? Default `FALSE`. -#' @param save_corrected_zip Logical. Include corrected pedigree in the `.zip`? -#' Only applies when `save_zip = TRUE` and `correct = TRUE`. Default `FALSE`. +#' @param verbose Logical. If TRUE (default), prints the report to the console. +#' @param correct_conflicting_trios Logical. If TRUE (default), sets conflicting +#' male_parent and female_parent to 0 and collapses to one row per ID. +#' @param correct_inconsistent_sex_roles Logical. If TRUE (default), sets +#' male_parent and female_parent to 0 for rows involving IDs found as both, +#' then removes any resulting exact duplicates. #' #' @return An invisible named list of data frames: #' \describe{ -#' \item{exact_duplicates}{Exact duplicate rows.} -#' \item{repeated_ids_diff}{Rows with the same `id` but conflicting parent assignments.} -#' \item{inconsistent_sex_roles}{Rows involving IDs used as both male and female parent.} -#' \item{missing_parents}{Rows referencing parent IDs absent from `id`.} -#' \item{dependencies}{Rows whose `id` is involved in a detected cycle.} +#' \item{exact_duplicates}{Exact duplicate rows found in the input.} +#' \item{conflicting_trios}{IDs with conflicting male_parent or female_parent assignments.} +#' \item{inconsistent_sex_roles}{IDs appearing as both male_parent and female_parent.} +#' \item{missing_parents}{Parent IDs absent from id, added as founders.} +#' \item{dependencies}{Cycles detected in the pedigree. Must be resolved manually.} +#' \item{corrected_pedigree}{Corrected pedigree table.} #' } #' -#' @author Josué Chinchilla-Vargas +#' @examples +#' ped_file <- system.file("check_ped_test.txt", package = "BIGr") +#' ped_errors <- check_ped(ped.file = ped_file, seed = 101919, verbose = FALSE) #' -#' @importFrom dplyr %>% group_by filter ungroup distinct mutate summarize first bind_rows n_distinct n select row_number +#' ped_errors$inconsistent_sex_roles +#' ped_errors$corrected_pedigree +#' +#' conflicting_sex_ids <- ped_errors$inconsistent_sex_roles$id +#' print(conflicting_sex_ids) +#' +#' @importFrom dplyr %>% mutate filter group_by ungroup summarize distinct bind_rows select first n n_distinct if_else row_number #' @importFrom stats setNames -#' @importFrom utils read.table write.table zip -#' @importFrom tools file_path_sans_ext +#' @importFrom utils read.table #' @export check_ped <- function(ped.file, - seed = NULL, - verbose = TRUE, - correct = TRUE, #add arguments for removing/filtering conflicting ids, repeated ids and founders are always fixed, dependencies are always manual. - save_zip = FALSE, #remove all save zips, export each report and corrected_pedigree as a list - save_corrected_zip = FALSE) { + seed = NULL, + verbose = TRUE, + correct_conflicting_trios = TRUE, + correct_inconsistent_sex_roles = TRUE) { #### setup #### if (!is.null(seed)) set.seed(seed) - data <- utils::read.table(ped.file, header = TRUE) + + data <- utils::read.table(ped.file, header = TRUE) %>% + dplyr::mutate( + id = as.character(id), + male_parent = as.character(male_parent), + female_parent = as.character(female_parent) + ) # Validate required column names before any processing required_cols <- c("id", "male_parent", "female_parent") @@ -78,78 +69,120 @@ check_ped <- function(ped.file, ) } - data <- data %>% - mutate( - id = as.character(id), - male_parent = as.character(male_parent), - female_parent = as.character(female_parent) - ) - # Add row numbers before any processing - data <- data %>% mutate(row_number = row_number(), .before = id) - errors <- list() - # Add row numbers before any processing - data <- data %>% mutate(row_number = row_number(), .before = id) - errors <- list() + # Add row numbers before any processing so all reports reference original rows [1] + data <- data %>% dplyr::mutate(row_number = dplyr::row_number(), .before = id) + + errors <- list() + missing_parents <- data.frame( + row_number = integer(), + id = character(), + male_parent = character(), + female_parent = character(), + stringsAsFactors = FALSE + ) - #### check 1: exact duplicates #### + #### check 1: exact duplicates (always fixed) #### exact_duplicates <- data[ - duplicated(data %>% select(-row_number)) | - duplicated(data %>% select(-row_number), fromLast = TRUE), + duplicated(data %>% dplyr::select(-row_number)) | + duplicated(data %>% dplyr::select(-row_number), fromLast = TRUE), ] + if (nrow(exact_duplicates) > 0) { + data <- data %>% + dplyr::select(-row_number) %>% + dplyr::distinct() %>% + dplyr::mutate(row_number = dplyr::row_number(), .before = id) + } #### check 2: repeated IDs with conflicting male_parent/female_parent #### - repeated_ids_diff <- data %>% - group_by(id) %>% - filter(n() > 1) %>% - filter(n_distinct(male_parent) > 1 | n_distinct(female_parent) > 1) %>% - ungroup() + repeated_ids <- data %>% + dplyr::group_by(id) %>% + dplyr::filter(dplyr::n() > 1) %>% + dplyr::ungroup() - #### check 3: inconsistent parent sex roles #### - male_ids <- unique(data$male_parent[data$male_parent != "0"]) - female_ids <- unique(data$female_parent[data$female_parent != "0"]) - messy_ids <- intersect(male_ids, female_ids) - inconsistent_sex_roles <- data %>% - filter(male_parent %in% messy_ids | female_parent %in% messy_ids) + conflicting_ids <- repeated_ids %>% + dplyr::group_by(id) %>% + dplyr::filter(dplyr::n_distinct(male_parent) > 1 | dplyr::n_distinct(female_parent) > 1) %>% + dplyr::ungroup() - #### check 4: missing parents #### - all_ids <- unique(data$id) - ref_ids <- unique(c(data$male_parent, data$female_parent)) - ref_ids <- ref_ids[ref_ids != "0"] - missing_ids <- setdiff(ref_ids, all_ids) - missing_parents <- data %>% - filter(male_parent %in% missing_ids | female_parent %in% missing_ids) + if (correct_conflicting_trios && nrow(conflicting_ids) > 0) { + # Set conflicting parents to "0" -- rows become exact duplicates, summarize collapses to one [1] + data <- data %>% + dplyr::group_by(id) %>% + dplyr::summarize( + row_number = dplyr::first(row_number), + male_parent = if (dplyr::n_distinct(male_parent) > 1) "0" else dplyr::first(male_parent), + female_parent = if (dplyr::n_distinct(female_parent) > 1) "0" else dplyr::first(female_parent), + .groups = "drop" + ) %>% + dplyr::select(row_number, id, male_parent, female_parent) + } - # Only build founder rows if there are actually missing parents to add - if (length(missing_ids) > 0) { - missing_founders <- data.frame( - row_number = NA_integer_, - id = missing_ids, - male_parent = "0", - female_parent = "0", - stringsAsFactors = FALSE - ) - } else { - missing_founders <- data.frame( - row_number = integer(0), - id = character(0), - male_parent = character(0), - female_parent = character(0), - stringsAsFactors = FALSE - ) + repeated_ids_report <- conflicting_ids + + #### check 3: missing parents (always fixed) #### + for (i in seq_len(nrow(data))) { + id <- data$id[i] + male_parent <- data$male_parent[i] + female_parent <- data$female_parent[i] + + if (male_parent != "0" && male_parent != id && !male_parent %in% data$id) { + missing_parents <- rbind( + missing_parents, + data.frame(row_number = data$row_number[i], id = male_parent, + male_parent = "0", female_parent = "0", + stringsAsFactors = FALSE) + ) + } + if (female_parent != "0" && female_parent != id && !female_parent %in% data$id) { + missing_parents <- rbind( + missing_parents, + data.frame(row_number = data$row_number[i], id = female_parent, + male_parent = "0", female_parent = "0", + stringsAsFactors = FALSE) + ) + } + + if (male_parent == id || female_parent == id) { + errors <- append(errors, paste("Dependency: Individual", id, "cannot be its own parent")) + } + } + + missing_parents <- dplyr::distinct(missing_parents) + if (nrow(missing_parents) > 0) { + data <- dplyr::bind_rows(data, missing_parents) + } + + #### check 4: inconsistent sex roles #### + male_ids <- unique(data$male_parent[data$male_parent != "0"]) + female_ids <- unique(data$female_parent[data$female_parent != "0"]) + conflicting_sex_ids <- intersect(male_ids, female_ids) + inconsistent_sex_roles <- data %>% dplyr::filter(id %in% conflicting_sex_ids) + + if (correct_inconsistent_sex_roles && length(conflicting_sex_ids) > 0) { + # Zero out male_parent/female_parent wherever a conflicting ID appears, + # then distinct() removes any rows that became exact duplicates [1] + data <- data %>% + dplyr::mutate( + male_parent = dplyr::if_else(male_parent %in% conflicting_sex_ids, "0", male_parent), + female_parent = dplyr::if_else(female_parent %in% conflicting_sex_ids, "0", female_parent) + ) %>% + dplyr::distinct(id, male_parent, female_parent, .keep_all = TRUE) } - #### check 5: dependencies (cycles) #### + #### check 5: dependencies (cycles) -- reported only, never modified #### detect_all_cycles <- function(data) { adj_list <- lapply(data$id, function(x) { row <- data[data$id == x, ] c(row$male_parent, row$female_parent) }) names(adj_list) <- data$id + dfs <- function(node, visited, rec_stack, path) { visited[node] <- TRUE rec_stack[node] <- TRUE path <- append(path, node) cycles <- list() + for (neighbor in adj_list[[node]]) { if (neighbor %in% names(adj_list)) { if (!visited[neighbor]) { @@ -163,148 +196,83 @@ check_ped <- function(ped.file, rec_stack[node] <- FALSE return(cycles) } + visited <- stats::setNames(rep(FALSE, length(adj_list)), names(adj_list)) rec_stack <- stats::setNames(rep(FALSE, length(adj_list)), names(adj_list)) all_cycles <- list() + for (node in names(adj_list)) { if (!visited[node]) { node_cycles <- dfs(node, visited, rec_stack, character()) - if (length(node_cycles) > 0) all_cycles <- append(all_cycles, node_cycles) + if (length(node_cycles) > 0) { + all_cycles <- append(all_cycles, node_cycles) + } } } return(all_cycles) } - cycles <- detect_all_cycles(data) - cycle_ids <- character(0) + cycles <- detect_all_cycles(data) if (length(cycles) > 0) { for (cycle_group in cycles) { - ids <- unique(unlist(cycle_group)) - cycle_ids <- unique(c(cycle_ids, ids)) + cycle_ids <- unique(unlist(cycle_group)) errors <- append(errors, paste("Cycle detected involving IDs:", - paste(ids, collapse = " -> "))) + paste(cycle_ids, collapse = " -> "))) } } - dependencies <- data %>% filter(id %in% cycle_ids) - if (nrow(dependencies) == 0 && length(errors) > 0) { - dependencies <- data.frame(Dependency = unique(unlist(errors)), - stringsAsFactors = FALSE) - } - #### compile report #### + #### compile findings #### input_ped_report <- list( exact_duplicates = exact_duplicates, - repeated_ids_diff = repeated_ids_diff, + conflicting_trios = repeated_ids_report, inconsistent_sex_roles = inconsistent_sex_roles, missing_parents = missing_parents, - dependencies = dependencies + dependencies = data.frame(Dependency = unique(unlist(errors)), + stringsAsFactors = FALSE), + corrected_pedigree = data %>% dplyr::select(-row_number) ) - #### build corrected pedigree (no row_number column) #### - if (correct) { - corrected <- data %>% select(-row_number) - # Remove exact duplicates - corrected <- distinct(corrected) - # Resolve conflicting IDs - if (nrow(repeated_ids_diff) > 0) { - corrected <- corrected %>% - group_by(id) %>% - summarize( - male_parent = if (n_distinct(male_parent) > 1) "0" else first(male_parent), - female_parent = if (n_distinct(female_parent) > 1) "0" else first(female_parent), - .groups = "drop" - ) - } - # Add missing founders - if (length(missing_ids) > 0) { - corrected <- bind_rows( - corrected, - missing_founders %>% select(-row_number) - ) - } - } - - #### file names #### - file_base <- tools::file_path_sans_ext(basename(ped.file)) - corrected_name <- paste0(file_base, "_corrected") - report_name <- paste0(file_base, "_report") - zip_name <- paste0(file_base, "_report.zip") #### output #### if (verbose) { cat("\n=== Pedigree Quality Check Report ===\n") - cat("\n--- Exact Duplicates ---\n") - if (nrow(exact_duplicates) > 0) print(exact_duplicates) else cat("None found.\n") - cat("\n--- Conflicting IDs ---\n") - if (nrow(repeated_ids_diff) > 0) print(repeated_ids_diff) else cat("None found.\n") - cat("\n--- Inconsistent Parent Sex Roles ---\n") - if (nrow(inconsistent_sex_roles) > 0) print(inconsistent_sex_roles) else cat("None found.\n") - cat("\n--- Missing Parents (rows that reference them) ---\n") - if (nrow(missing_parents) > 0) print(missing_parents) else cat("None found.\n") - cat("\n--- Cycles / Dependencies ---\n") - if (nrow(dependencies) > 0) print(dependencies) else cat("None found.\n") - if (correct) { - cat(paste0("\n`correct = TRUE`: saving corrected pedigree as `", corrected_name, "`.\n")) - } else { - cat("\n`correct = FALSE`: no changes made to the pedigree.\n") - } - } - - # Always save report to global environment - assign(report_name, input_ped_report, envir = .GlobalEnv) - # Save corrected pedigree if correct = TRUE - if (correct) assign(corrected_name, corrected, envir = .GlobalEnv) - #### zip export #### - if (save_zip) { - tmp_dir <- tempfile() - dir.create(tmp_dir) - zip_files <- character(0) + if (nrow(exact_duplicates) > 0) { + cat("\nExact duplicate trios detected (removed in corrected pedigree):\n") + print(exact_duplicates) + } else cat("\nNo exact duplicate trios found.\n") - # Section headers matching the console output labels - section_labels <- c( - exact_duplicates = "Exact Duplicates", - repeated_ids_diff = "Conflicting IDs", - inconsistent_sex_roles = "Inconsistent Parent Sex Roles", - missing_parents = "Missing Parents (rows that reference them)", - dependencies = "Cycles / Dependencies" - ) + if (nrow(repeated_ids_report) > 0) { + cat("\nConflicting trios detected:\n") + print(repeated_ids_report) + if (correct_conflicting_trios) { + cat(" -> parents set to 0 and collapsed to one row in corrected pedigree.\n") + } else { + cat(" -> correct_conflicting_trios = FALSE: left as-is in corrected pedigree.\n") + } + } else cat("\nNo conflicting trios found.\n") - for (component in names(section_labels)) { - df <- input_ped_report[[component]] - file_path <- file.path(tmp_dir, paste0(file_base, "_", component, ".txt")) + if (nrow(missing_parents) > 0) { + cat("\nParents missing as IDs found in the pedigree (added as founders in corrected pedigree):\n") + print(missing_parents) + } else cat("\nNo missing parents found.\n") - if (nrow(df) == 0) { - # Write header + "None found." for clean empty reports - writeLines( - c(paste0("--- ", section_labels[[component]], " ---"), "None found."), - con = file_path - ) + if (nrow(inconsistent_sex_roles) > 0) { + cat("\nIDs found as both male_parent and female_parent (is selfing or hermaphrodytism possible?):\n") + print(inconsistent_sex_roles) + if (correct_inconsistent_sex_roles) { + cat(" -> parent fields set to 0 for conflicting IDs; exact duplicates removed in corrected pedigree.\n") } else { - # Write header, then column names, then data — avoids col.names warning - file_con <- file(file_path, open = "wt") - writeLines(paste0("--- ", section_labels[[component]], " ---"), con = file_con) - writeLines(paste(colnames(df), collapse = "\t"), con = file_con) - close(file_con) - utils::write.table(df, file = file_path, sep = "\t", row.names = FALSE, - quote = FALSE, append = TRUE, col.names = FALSE) + cat(" -> correct_inconsistent_sex_roles = FALSE: left as-is in corrected pedigree.\n") } + } else cat("\nNo IDs found as both male_parent and female_parent.\n") - zip_files <- c(zip_files, file_path) - } - - # Optionally include corrected pedigree in zip - if (save_corrected_zip && correct) { - corrected_path <- file.path(tmp_dir, paste0(corrected_name, ".txt")) - utils::write.table(corrected, file = corrected_path, sep = "\t", - row.names = FALSE, quote = FALSE) - zip_files <- c(zip_files, corrected_path) - } + if (nrow(input_ped_report$dependencies) > 0) { + cat("\nDependencies detected (must be resolved manually):\n") + print(input_ped_report$dependencies) + } else cat("\nNo dependencies detected.\n") - # Bundle all files into zip in the working directory - zip_path <- file.path(getwd(), zip_name) - utils::zip(zipfile = zip_path, files = zip_files, flags = "-j") - if (verbose) cat(paste0("\nZip archive saved to: ", zip_path, "\n")) + cat("\nThe corrected pedigree is included in the returned list as corrected_pedigree.\n") } invisible(input_ped_report)