From 778aefaeedcf121f2243cdbecb1124f5d50af639 Mon Sep 17 00:00:00 2001 From: Cristianetaniguti Date: Fri, 3 Oct 2025 15:11:40 -0400 Subject: [PATCH 01/80] indels support for madc2vcf_targets --- NAMESPACE | 1 + R/check_madc_sanity.R | 105 ++++++++ R/madc2vcf_targets.R | 345 +++++++++++++++--------- man/check_madc_sanity.Rd | 43 +++ man/madc2vcf_targets.Rd | 118 ++++++-- tests/testthat/test-check_madc_sanity.R | 10 + 6 files changed, 474 insertions(+), 148 deletions(-) create mode 100644 R/check_madc_sanity.R create mode 100644 man/check_madc_sanity.Rd create mode 100644 tests/testthat/test-check_madc_sanity.R diff --git a/NAMESPACE b/NAMESPACE index d47d95d..443314d 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -4,6 +4,7 @@ export(allele_freq_poly) export(calculate_Het) export(calculate_MAF) export(check_homozygous_trios) +export(check_madc_sanity) export(check_ped) export(check_replicates) export(dosage2vcf) diff --git a/R/check_madc_sanity.R b/R/check_madc_sanity.R new file mode 100644 index 0000000..19d3ff6 --- /dev/null +++ b/R/check_madc_sanity.R @@ -0,0 +1,105 @@ +#' Run basic sanity checks on a MADC-style allele report +#' +#' @description +#' Performs five quick validations on an allele report: +#' 1) **Columns** – required columns are present (`CloneID`, `AlleleID`, `AlleleSequence`); +#' 2) **FixAlleleIDs** – first column’s first up-to-6 rows are not all blank or "*" +#' *and* both `_0001` and `_0002` appear in `AlleleID`; +#' 3) **IUPACcodes** – presence of non-ATCG characters in `AlleleSequence`; +#' 4) **LowerCase** – presence of lowercase a/t/c/g in `AlleleSequence`; +#' 5) **Indels** – reference/alternate allele lengths differ for the same `CloneID`. +#' +#' @param report A `data.frame` with at least the columns +#' `CloneID`, `AlleleID`, and `AlleleSequence`. The first column is also +#' used in the “FixAlleleIDs” check to inspect its first up to six entries. +#' +#' @details +#' - **IUPAC check:** Flags any character outside `ATCG` (case-insensitive), +#' which will include ambiguity codes (`N`, `R`, `Y`, etc.) and symbols like `-`. +#' - **Indels:** Rows are split by `AlleleID` containing `"Ref_0001"` vs `"Alt_0002"`, +#' merged by `CloneID`, and the lengths of `AlleleSequence` are compared. +#' - If required columns are missing, only **Columns** is evaluated (`FALSE`) and +#' `indel_clone_ids` is returned as `NULL`. +#' +#' @return A list with: +#' \describe{ +#' \item{checks}{Named logical vector with entries +#' `Columns`, `FixAlleleIDs`, `IUPACcodes`, `LowerCase`, `Indels`.} +#' \item{indel_clone_ids}{Character vector of `CloneID`s where ref/alt lengths differ. +#' Returns `character(0)` if none, or `NULL` when required columns are missing.} +#' } +#' +#' +#' @export +check_madc_sanity <- function(report) { + + # Initialize + checks <- c(Columns = NA, FixAlleleIDs = NA, IUPACcodes = NA, LowerCase = NA, Indels = NA, ChromPos = NA) + messages <- list(Columns = NA, FixAlleleIDs = NA, IUPACcodes = NA, LowerCase = NA, Indels = NA, ChromPos = NA) + + # Validate required columns + required <- c("CloneID", "AlleleID", "AlleleSequence") + missing_cols <- setdiff(required, names(report)) + checks["Columns"] <- length(missing_cols) == 0 + + if(checks[["Columns"]]){ + # ---- FixAlleleIDs ---- + # Check if first up-to-6 entries in the *first column* are all "" or "*" + n <- nrow(report) + idx <- seq_len(min(6L, n)) + first_col_vals <- report[[1]][idx] + all_blank_or_star <- all(first_col_vals %in% c("", "*"), na.rm = TRUE) + # Also require that both _0001 and _0002 appear in AlleleID + has_0001 <- any(grepl("_0001", report$AlleleID, fixed = TRUE), na.rm = TRUE) + has_0002 <- any(grepl("_0002", report$AlleleID, fixed = TRUE), na.rm = TRUE) + checks["FixAlleleIDs"] <- (!all_blank_or_star) & has_0001 & has_0002 + + # ---- IUPACcodes ---- + iu <- grepl("[^ATCG]", report$AlleleSequence, ignore.case = TRUE) + checks["IUPACcodes"] <- any(iu, na.rm = TRUE) + + # ---- LowerCase ---- + lc <- grepl("[atcg]", report$AlleleSequence) + checks["LowerCase"] <- any(lc, na.rm = TRUE) + + # ---- Indels ---- + refs <- subset(report, grepl("Ref_0001", AlleleID, fixed = TRUE), + select = c(CloneID, AlleleID, AlleleSequence)) + alts <- subset(report, grepl("Alt_0002", AlleleID, fixed = TRUE), + select = c(CloneID, AlleleID, AlleleSequence)) + + merged <- merge(refs, alts, by = "CloneID", suffixes = c("_ref", "_alt"), all = FALSE) + + if (nrow(merged) > 0) { + ref_len <- nchar(merged$AlleleSequence_ref, keepNA = TRUE) + alt_len <- nchar(merged$AlleleSequence_alt, keepNA = TRUE) + cmp_ok <- !is.na(ref_len) & !is.na(alt_len) + indel_mask <- cmp_ok & (ref_len != alt_len) + checks["Indels"] <- any(indel_mask) + indels <- if (any(indel_mask)) merged$CloneID[indel_mask] else character(0) + } else { + checks["Indels"] <- FALSE + indels <- character(0) + } + + # ---- Chrom Pos ---- + pos <- strsplit(report[,2], "_") + checks["ChromPos"] <- all(sapply(pos, length) == 2) + + } else indels <- NULL + + messages[["Columns"]] <- c("Required columns are present\n", + "One or more required columns missing. Verify if your file has columns: CloneID, AlleleID, AlleleSequence\n") + messages[["FixAlleleIDs"]] <- c("Fixed Allele IDs look good\n", + "MADC not processed by BI. Please contact us to assign allele IDs to your MADC according to the specie haplotype dabatase. This guarantee reproducibility between diferent datasets\n") + messages[["IUPACcodes"]] <- c("IUPAC (non-ATCG) codes found in AlleleSequence. This codes are not currently supported\n", + "No IUPAC (non-ATCG) codes found in AlleleSequence\n") + messages[["LowerCase"]] <- c("Lowercase bases found in AlleleSequence\n", + "No lowercase bases found in AlleleSequence\n") + messages[["Indels"]] <- c(paste("Indels found (ref/alt lengths differ) for the CloneIDs:",paste(indels, collapse = " ")), + "No indels found (ref/alt lengths match) for all CloneIDs\n") + messages[["ChromPos"]] <- c("Chromosome and Position format in CloneID look good\n", + "CloneID does not have the expected Chromosome_Position format. Please check your CloneIDs or provide a file with this information\n") + + list(checks = checks, messages = messages, indel_clone_ids = indels) +} diff --git a/R/madc2vcf_targets.R b/R/madc2vcf_targets.R index 1b02c31..fc022f9 100644 --- a/R/madc2vcf_targets.R +++ b/R/madc2vcf_targets.R @@ -1,45 +1,121 @@ #' Format MADC Target Loci Read Counts Into VCF #' -#' This function will extract the read count information from a MADC file target markers and convert to VCF file format. +#' Convert DArTag MADC target read counts to a VCF #' -#' The DArTag MADC file format is not commonly supported through existing tools. This function -#' will extract the read count information from a MADC file for the target markers and convert it to a VCF file format for the -#' genotyping panel target markers only +#' @description +#' Parses a DArTag **MADC** report and writes a **VCF v4.3** containing per-target +#' read counts for the panel’s target loci. This is useful because MADC is not +#' widely supported by general-purpose tools, while VCF is. #' -#' @param madc_file Path to MADC file -#' @param output.file output file name and path -#' @param botloci_file A string specifying the path to the file containing the target IDs designed in the bottom strand. -#' @param get_REF_ALT if TRUE recovers the reference and alternative bases by comparing the sequences. If more than one polymorphism are found for a tag, it is discarded. +#' @details +#' **What this function does** +#' - Runs basic sanity checks on the MADC file (column presence, fixed allele IDs, +#' IUPAC/ambiguous bases, lowercase bases, indels). +#' - Extracts reference and total read counts per sample and target. +#' - Derives `AD` (ref,alt) by subtraction (alt = total − ref). +#' - If `get_REF_ALT = TRUE`, attempts to recover true REF/ALT bases by comparing +#' the Ref/Alt probe sequences; targets with >1 polymorphism are discarded. +#' - Optionally accepts a `markers_info` CSV to supply `CHROM`, `POS`, `REF`, `ALT` +#' (and `Type`, `Indel_pos` when indels are present), bypassing sequence-based +#' inference. +#' +#' **Output VCF layout** +#' - `INFO` fields: +#' * `DP` — total depth across all samples for the locus +#' * `ADS` — total counts across samples in the order `ref,alt` +#' - `FORMAT` fields (per sample): +#' * `DP` — total reads (ref + alt) +#' * `RA` — reads supporting the reference allele +#' * `AD` — `"ref,alt"` counts +#' +#' **Strand handling** +#' If a target ID appears in `botloci_file`, its probe sequences are reverse- +#' complemented prior to base comparison so that REF/ALT are reported in the +#' top-strand genomic orientation. +#' +#' **Sanity check behavior** +#' - If required columns or fixed IDs are missing, the function `stop()`s. +#' - If IUPAC/lowercase/indels are detected and `markers_info` is **not** +#' provided, the function `stop()`s with a diagnostic message explaining what to fix. +#' +#' @param madc_file character. Path to the input MADC CSV file. +#' @param output.file character. Path to the output VCF file to write. +#' @param botloci_file character. Path to a plain-text file listing target IDs +#' designed on the **bottom** strand (one ID per line). Required when +#' `get_REF_ALT = TRUE` and `markers_info` is not provided. +#' @param markers_info character or `NULL`. Optional path to a CSV providing target +#' metadata. Required columns: `BI_markerID, Chr, Pos, Ref, Alt`. If indels are +#' present, also require `Type, Indel_pos`. When supplied, these values populate +#' `#CHROM, POS, REF, ALT` in the VCF directly. +#' @param get_REF_ALT logical (default `FALSE`). If `TRUE`, attempts to infer REF/ALT +#' bases from the Ref/Alt probe sequences in the MADC file (with strand correction +#' using `botloci_file`). Targets with more than one difference between Ref/Alt +#' sequences are removed. +#' +#' @return (Invisibly) returns the path to `output.file`. The side effect is a +#' **VCF v4.3** written to disk containing one row per target and columns for all +#' samples in the MADC file. +#' +#' @section Dependencies: +#' Uses **dplyr**, **tidyr**, **tibble**, **reshape2**, **Biostrings** and base +#' **utils**. Helper functions expected in this package: `check_madc_sanity()`, +#' `get_countsMADC()`, `get_counts()`, and `check_botloci()`. +#' +#' @examples +#' # Example files shipped with the package +#' madc_file <- system.file("example_MADC_FixedAlleleID.csv", package = "BIGr") +#' bot_file <- system.file("example_SNPs_DArTag-probe-design_f180bp.botloci", +#' package = "BIGr") +#' out_vcf <- tempfile(fileext = ".vcf") +#' +#' # Convert MADC to VCF (attempting to recover REF/ALT from probe sequences) +#' \dontrun{ +#' madc2vcf_targets( +#' madc_file = madc_file, +#' output.file = out_vcf, +#' botloci_file = bot_file, +#' get_REF_ALT = TRUE +#' ) +#' } +#' +#' # Clean up (example) +#' unlink(out_vcf) +#' +#' @seealso +#' `check_madc_sanity()`, `get_countsMADC()`, `check_botloci()` #' -#' @return A VCF file v4.3 with the target marker read count information #' @import dplyr #' @import tidyr #' @import tibble -#' @importFrom Rdpack reprompt #' @importFrom reshape2 melt dcast #' @importFrom utils write.table #' @importFrom Biostrings DNAString reverseComplement -#' @return A VCF file v4.3 with the target marker read count information -#' -#' @examples -#' # Load example files -#' madc_file <- system.file("example_MADC_FixedAlleleID.csv", package="BIGr") -#' bot_file <- system.file("example_SNPs_DArTag-probe-design_f180bp.botloci", package="BIGr") -#' -#' #Temp location (only for example) -#' output_file <- tempfile() -#' -#' # Convert MADC to VCF -#' madc2vcf_targets(madc_file = madc_file, -#' output.file = output_file, -#' get_REF_ALT = TRUE, -#' botloci_file = bot_file) -#' -#' rm(output_file) #' #' @export -madc2vcf_targets <- function(madc_file, output.file, botloci_file, get_REF_ALT = FALSE) { - #Making the VCF (This is highly dependent on snps being in a format where the SNP IDs are the CHR_POS) +madc2vcf_targets <- function(madc_file, + output.file, + botloci_file, + markers_info = NULL, + get_REF_ALT = FALSE) { + + # MADC checks + report <- read.csv(madc_file) + checks <- check_madc_sanity(report) + + messages_results <- mapply(function(check, message) { + if (check) message[1] else message[2] + }, checks$checks, checks$messages) + + if(any(!(checks$checks[c("Columns", "FixAlleleIDs")]))){ + idx <- which(!(checks$checks[c("Columns", "FixAlleleIDs")])) + stop(paste("The MADC file does not pass the sanity checks:\n", + paste(messages_results[c("Columns", "FixAlleleIDs")[idx]], collapse = "\n"))) + } + + if(any(checks$checks[c("IUPACcodes", "LowerCase", "Indels")])){ + idx <- which((checks$checks[c("IUPACcodes", "LowerCase", "Indels")])) + if(is.null(markers_info)) stop(paste(messages_results[c("IUPACcodes", "LowerCase", "Indels")[idx]], collapse = "\n")) + } matrices <- get_countsMADC(madc_file) ref_df <- data.frame(matrices[[1]], check.names = FALSE) @@ -56,18 +132,124 @@ madc2vcf_targets <- function(madc_file, output.file, botloci_file, get_REF_ALT = row.names(ad_df) <- row.names(ref_df) #Obtaining Chr and Pos information from the row_names - new_df <- size_df %>% - rownames_to_column(var = "row_name") %>% - separate(row_name, into = c("CHROM", "POS"), sep = "_") %>% - select(CHROM, POS) + if(is.null(markers_info)){ + new_df <- size_df %>% + rownames_to_column(var = "row_name") %>% + separate(row_name, into = c("CHROM", "POS"), sep = "_") %>% + select(CHROM, POS) + + # Remove leading zeros from the POS column + new_df$POS <- sub("^0+", "", new_df$POS) + + #Get read count sums + new_df$TotalRef <- rowSums(ref_df) + new_df$TotalAlt <- rowSums(alt_df) + new_df$TotalSize <- rowSums(size_df) + + # Get REF and ALT + if(get_REF_ALT){ + if(is.null(botloci_file)) stop("Please provide the botloci file to recover the reference and alternative bases.") + csv <- get_counts(madc_file) + # Keep only the ones that have alt and ref + csv <- csv[which(csv$CloneID %in% rownames(ad_df)),] + + # Get reverse complement the tag is present in botloci + botloci <- read.table(botloci_file, header = FALSE) + + # Check if the botloci file marker IDs match with the MADC file + checked_botloci <- check_botloci(botloci, csv) + botloci <- checked_botloci[[1]] + csv <- checked_botloci[[2]] + + # FIXED: Store original sequences before any transformation + csv$OriginalAlleleSequence <- csv$AlleleSequence + + # Apply reverse complement to sequences for bottom strand markers + idx <- which(csv$CloneID %in% botloci[,1]) + csv$AlleleSequence[idx] <- sapply(csv$AlleleSequence[idx], function(sequence) as.character(reverseComplement(DNAString(sequence)))) + + ref_seq <- csv$AlleleSequence[grep("\\|Ref.*", csv$AlleleID)] + ref_ord <- csv$CloneID[grep("\\|Ref.*", csv$AlleleID)] + alt_seq <- csv$AlleleSequence[grep("\\|Alt.*", csv$AlleleID)] + alt_ord <- csv$CloneID[grep("\\|Alt.*", csv$AlleleID)] + + # FIXED: Get original sequences for SNP calling + orig_ref_seq <- csv$OriginalAlleleSequence[grep("\\|Ref.*", csv$AlleleID)] + orig_alt_seq <- csv$OriginalAlleleSequence[grep("\\|Alt.*", csv$AlleleID)] - # Remove leading zeros from the POS column - new_df$POS <- sub("^0+", "", new_df$POS) + if(all(sort(ref_ord) == sort(alt_ord))){ + # Order sequences consistently + ref_seq <- ref_seq[order(ref_ord)] + alt_seq <- alt_seq[order(alt_ord)] + orig_ref_seq <- orig_ref_seq[order(ref_ord)] + orig_alt_seq <- orig_alt_seq[order(alt_ord)] + ordered_clone_ids <- sort(ref_ord) - #Get read count sums - new_df$TotalRef <- rowSums(ref_df) - new_df$TotalAlt <- rowSums(alt_df) - new_df$TotalSize <- rowSums(size_df) + ref_base <- alt_base <- vector() + for(i in 1:length(orig_ref_seq)){ + # FIXED: Use original sequences for SNP calling + temp_list <- strsplit(c(orig_ref_seq[i], orig_alt_seq[i]), "") + idx_diff <- which(temp_list[[1]] != temp_list[[2]]) + + if(length(idx_diff) > 1) { # If finds more than one polymorphism between Ref and Alt sequences + ref_base[i] <- NA + alt_base[i] <- NA + } else if(length(idx_diff) == 1) { + orig_ref_base <- temp_list[[1]][idx_diff] + orig_alt_base <- temp_list[[2]][idx_diff] + + # FIXED: Apply reverse complement to bases only if marker is in botloci + if(ordered_clone_ids[i] %in% botloci[,1]) { + ref_base[i] <- as.character(reverseComplement(DNAString(orig_ref_base))) + alt_base[i] <- as.character(reverseComplement(DNAString(orig_alt_base))) + } else { + ref_base[i] <- orig_ref_base + alt_base[i] <- orig_alt_base + } + } else { + # No differences found + ref_base[i] <- NA + alt_base[i] <- NA + } + } + } else { + warning("There are missing reference or alternative sequence, the SNP bases could not be recovery.") + ref_base <- "." + alt_base <- "." + } + + } else { + ref_base <- "." + alt_base <- "." + } + } else { + # Verify markers_info file + df <- read.csv(markers_info) + if(checks$checks["Indels"]){ + if(!all(c("BI_markerID","Chr","Pos","Ref","Alt","Type", "Indel_pos") %in% colnames(df))) + stop("The markers_info dataframe must contain the following columns: BI_markerID, CHROM, POS, REF, ALT, Type, Indel_pos") + } + if(!all(c("BI_markerID","Chr","Pos","Ref","Alt") %in% colnames(df))) + stop("The markers_info dataframe must contain the following columns: BI_markerID, CHROM, POS, REF, ALT") + + if(!all(rownames(ad_df)%in% df$BI_markerID)) + warning("Not all MADC CloneID was found in the markers_info file. These markers will be removed.") + + matched <- df[match(rownames(ad_df), df$BI_markerID),] + + new_df <- data.frame( + CHROM = matched$Chr, + POS = matched$Pos + ) + + #Get read count sums + new_df$TotalRef <- rowSums(ref_df) + new_df$TotalAlt <- rowSums(alt_df) + new_df$TotalSize <- rowSums(size_df) + + ref_base <- matched$Ref + alt_base <- matched$Alt + } #Make a header separate from the dataframe vcf_header <- c( @@ -82,83 +264,6 @@ madc2vcf_targets <- function(madc_file, output.file, botloci_file, get_REF_ALT = '##FORMAT=' ) - # Get REF and ALT - if(get_REF_ALT){ - if(is.null(botloci_file)) stop("Please provide the botloci file to recover the reference and alternative bases.") - csv <- get_counts(madc_file) - # Keep only the ones that have alt and ref - csv <- csv[which(csv$CloneID %in% rownames(ad_df)),] - - # Get reverse complement the tag is present in botloci - botloci <- read.table(botloci_file, header = FALSE) - - # Check if the botloci file marker IDs match with the MADC file - checked_botloci <- check_botloci(botloci, csv) - botloci <- checked_botloci[[1]] - csv <- checked_botloci[[2]] - - # FIXED: Store original sequences before any transformation - csv$OriginalAlleleSequence <- csv$AlleleSequence - - # Apply reverse complement to sequences for bottom strand markers - idx <- which(csv$CloneID %in% botloci[,1]) - csv$AlleleSequence[idx] <- sapply(csv$AlleleSequence[idx], function(sequence) as.character(reverseComplement(DNAString(sequence)))) - - ref_seq <- csv$AlleleSequence[grep("\\|Ref.*", csv$AlleleID)] - ref_ord <- csv$CloneID[grep("\\|Ref.*", csv$AlleleID)] - alt_seq <- csv$AlleleSequence[grep("\\|Alt.*", csv$AlleleID)] - alt_ord <- csv$CloneID[grep("\\|Alt.*", csv$AlleleID)] - - # FIXED: Get original sequences for SNP calling - orig_ref_seq <- csv$OriginalAlleleSequence[grep("\\|Ref.*", csv$AlleleID)] - orig_alt_seq <- csv$OriginalAlleleSequence[grep("\\|Alt.*", csv$AlleleID)] - - if(all(sort(ref_ord) == sort(alt_ord))){ - # Order sequences consistently - ref_seq <- ref_seq[order(ref_ord)] - alt_seq <- alt_seq[order(alt_ord)] - orig_ref_seq <- orig_ref_seq[order(ref_ord)] - orig_alt_seq <- orig_alt_seq[order(alt_ord)] - ordered_clone_ids <- sort(ref_ord) - - ref_base <- alt_base <- vector() - for(i in 1:length(orig_ref_seq)){ - # FIXED: Use original sequences for SNP calling - temp_list <- strsplit(c(orig_ref_seq[i], orig_alt_seq[i]), "") - idx_diff <- which(temp_list[[1]] != temp_list[[2]]) - - if(length(idx_diff) > 1) { # If finds more than one polymorphism between Ref and Alt sequences - ref_base[i] <- NA - alt_base[i] <- NA - } else if(length(idx_diff) == 1) { - orig_ref_base <- temp_list[[1]][idx_diff] - orig_alt_base <- temp_list[[2]][idx_diff] - - # FIXED: Apply reverse complement to bases only if marker is in botloci - if(ordered_clone_ids[i] %in% botloci[,1]) { - ref_base[i] <- as.character(reverseComplement(DNAString(orig_ref_base))) - alt_base[i] <- as.character(reverseComplement(DNAString(orig_alt_base))) - } else { - ref_base[i] <- orig_ref_base - alt_base[i] <- orig_alt_base - } - } else { - # No differences found - ref_base[i] <- NA - alt_base[i] <- NA - } - } - } else { - warning("There are missing reference or alternative sequence, the SNP bases could not be recovery.") - ref_base <- "." - alt_base <- "." - } - - } else { - ref_base <- "." - alt_base <- "." - } - #Make the header#Make the VCF df vcf_df <- data.frame( CHROM = new_df$CHROM, @@ -233,12 +338,4 @@ madc2vcf_targets <- function(madc_file, output.file, botloci_file, get_REF_ALT = suppressWarnings( write.table(vcf_df, file = output.file, sep = "\t", quote = FALSE, row.names = FALSE, col.names = TRUE, append = TRUE) ) - #Unload all items from memory - rm(matrices) - rm(ref_df) - rm(alt_df) - rm(size_df) - rm(ad_df) - rm(vcf_df) - rm(geno_df) } diff --git a/man/check_madc_sanity.Rd b/man/check_madc_sanity.Rd new file mode 100644 index 0000000..494145e --- /dev/null +++ b/man/check_madc_sanity.Rd @@ -0,0 +1,43 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/check_madc_sanity.R +\name{check_madc_sanity} +\alias{check_madc_sanity} +\title{Run basic sanity checks on a MADC-style allele report} +\usage{ +check_madc_sanity(report) +} +\arguments{ +\item{report}{A \code{data.frame} with at least the columns +\code{CloneID}, \code{AlleleID}, and \code{AlleleSequence}. The first column is also +used in the “FixAlleleIDs” check to inspect its first up to six entries.} +} +\value{ +A list with: +\describe{ +\item{checks}{Named logical vector with entries +\code{Columns}, \code{FixAlleleIDs}, \code{IUPACcodes}, \code{LowerCase}, \code{Indels}.} +\item{indel_clone_ids}{Character vector of \code{CloneID}s where ref/alt lengths differ. +Returns \code{character(0)} if none, or \code{NULL} when required columns are missing.} +} +} +\description{ +Performs five quick validations on an allele report: +\enumerate{ +\item \strong{Columns} – required columns are present (\code{CloneID}, \code{AlleleID}, \code{AlleleSequence}); +\item \strong{FixAlleleIDs} – first column’s first up-to-6 rows are not all blank or "*" +\emph{and} both \verb{_0001} and \verb{_0002} appear in \code{AlleleID}; +\item \strong{IUPACcodes} – presence of non-ATCG characters in \code{AlleleSequence}; +\item \strong{LowerCase} – presence of lowercase a/t/c/g in \code{AlleleSequence}; +\item \strong{Indels} – reference/alternate allele lengths differ for the same \code{CloneID}. +} +} +\details{ +\itemize{ +\item \strong{IUPAC check:} Flags any character outside \code{ATCG} (case-insensitive), +which will include ambiguity codes (\code{N}, \code{R}, \code{Y}, etc.) and symbols like \code{-}. +\item \strong{Indels:} Rows are split by \code{AlleleID} containing \code{"Ref_0001"} vs \code{"Alt_0002"}, +merged by \code{CloneID}, and the lengths of \code{AlleleSequence} are compared. +\item If required columns are missing, only \strong{Columns} is evaluated (\code{FALSE}) and +\code{indel_clone_ids} is returned as \code{NULL}. +} +} diff --git a/man/madc2vcf_targets.Rd b/man/madc2vcf_targets.Rd index a790460..fad847c 100644 --- a/man/madc2vcf_targets.Rd +++ b/man/madc2vcf_targets.Rd @@ -4,44 +4,114 @@ \alias{madc2vcf_targets} \title{Format MADC Target Loci Read Counts Into VCF} \usage{ -madc2vcf_targets(madc_file, output.file, botloci_file, get_REF_ALT = FALSE) +madc2vcf_targets( + madc_file, + output.file, + botloci_file, + markers_info = NULL, + get_REF_ALT = FALSE +) } \arguments{ -\item{madc_file}{Path to MADC file} +\item{madc_file}{character. Path to the input MADC CSV file.} -\item{output.file}{output file name and path} +\item{output.file}{character. Path to the output VCF file to write.} -\item{botloci_file}{A string specifying the path to the file containing the target IDs designed in the bottom strand.} +\item{botloci_file}{character. Path to a plain-text file listing target IDs +designed on the \strong{bottom} strand (one ID per line). Required when +\code{get_REF_ALT = TRUE} and \code{markers_info} is not provided.} -\item{get_REF_ALT}{if TRUE recovers the reference and alternative bases by comparing the sequences. If more than one polymorphism are found for a tag, it is discarded.} +\item{markers_info}{character or \code{NULL}. Optional path to a CSV providing target +metadata. Required columns: \verb{BI_markerID, Chr, Pos, Ref, Alt}. If indels are +present, also require \verb{Type, Indel_pos}. When supplied, these values populate +\verb{#CHROM, POS, REF, ALT} in the VCF directly.} + +\item{get_REF_ALT}{logical (default \code{FALSE}). If \code{TRUE}, attempts to infer REF/ALT +bases from the Ref/Alt probe sequences in the MADC file (with strand correction +using \code{botloci_file}). Targets with more than one difference between Ref/Alt +sequences are removed.} } \value{ -A VCF file v4.3 with the target marker read count information - -A VCF file v4.3 with the target marker read count information +(Invisibly) returns the path to \code{output.file}. The side effect is a +\strong{VCF v4.3} written to disk containing one row per target and columns for all +samples in the MADC file. } \description{ -This function will extract the read count information from a MADC file target markers and convert to VCF file format. +Parses a DArTag \strong{MADC} report and writes a \strong{VCF v4.3} containing per-target +read counts for the panel’s target loci. This is useful because MADC is not +widely supported by general-purpose tools, while VCF is. } \details{ -The DArTag MADC file format is not commonly supported through existing tools. This function -will extract the read count information from a MADC file for the target markers and convert it to a VCF file format for the -genotyping panel target markers only +Convert DArTag MADC target read counts to a VCF + +\strong{What this function does} +\itemize{ +\item Runs basic sanity checks on the MADC file (column presence, fixed allele IDs, +IUPAC/ambiguous bases, lowercase bases, indels). +\item Extracts reference and total read counts per sample and target. +\item Derives \code{AD} (ref,alt) by subtraction (alt = total − ref). +\item If \code{get_REF_ALT = TRUE}, attempts to recover true REF/ALT bases by comparing +the Ref/Alt probe sequences; targets with >1 polymorphism are discarded. +\item Optionally accepts a \code{markers_info} CSV to supply \code{CHROM}, \code{POS}, \code{REF}, \code{ALT} +(and \code{Type}, \code{Indel_pos} when indels are present), bypassing sequence-based +inference. } -\examples{ -# Load example files -madc_file <- system.file("example_MADC_FixedAlleleID.csv", package="BIGr") -bot_file <- system.file("example_SNPs_DArTag-probe-design_f180bp.botloci", package="BIGr") -#Temp location (only for example) -output_file <- tempfile() +\strong{Output VCF layout} +\itemize{ +\item \code{INFO} fields: +\itemize{ +\item \code{DP} — total depth across all samples for the locus +\item \code{ADS} — total counts across samples in the order \verb{ref,alt} +} +\item \code{FORMAT} fields (per sample): +\itemize{ +\item \code{DP} — total reads (ref + alt) +\item \code{RA} — reads supporting the reference allele +\item \code{AD} — \code{"ref,alt"} counts +} +} -# Convert MADC to VCF -madc2vcf_targets(madc_file = madc_file, - output.file = output_file, - get_REF_ALT = TRUE, - botloci_file = bot_file) +\strong{Strand handling} +If a target ID appears in \code{botloci_file}, its probe sequences are reverse- +complemented prior to base comparison so that REF/ALT are reported in the +top-strand genomic orientation. -rm(output_file) +\strong{Sanity check behavior} +\itemize{ +\item If required columns or fixed IDs are missing, the function \code{stop()}s. +\item If IUPAC/lowercase/indels are detected and \code{markers_info} is \strong{not} +provided, the function \code{stop()}s with a diagnostic message explaining what to fix. +} +} +\section{Dependencies}{ +Uses \strong{dplyr}, \strong{tidyr}, \strong{tibble}, \strong{reshape2}, \strong{Biostrings} and base +\strong{utils}. Helper functions expected in this package: \code{check_madc_sanity()}, +\code{get_countsMADC()}, \code{get_counts()}, and \code{check_botloci()}. +} + +\examples{ +# Example files shipped with the package +madc_file <- system.file("example_MADC_FixedAlleleID.csv", package = "BIGr") +bot_file <- system.file("example_SNPs_DArTag-probe-design_f180bp.botloci", + package = "BIGr") +out_vcf <- tempfile(fileext = ".vcf") + +# Convert MADC to VCF (attempting to recover REF/ALT from probe sequences) +\dontrun{ +madc2vcf_targets( + madc_file = madc_file, + output.file = out_vcf, + botloci_file = bot_file, + get_REF_ALT = TRUE +) +} + +# Clean up (example) +unlink(out_vcf) + +} +\seealso{ +\code{check_madc_sanity()}, \code{get_countsMADC()}, \code{check_botloci()} } diff --git a/tests/testthat/test-check_madc_sanity.R b/tests/testthat/test-check_madc_sanity.R new file mode 100644 index 0000000..5053e55 --- /dev/null +++ b/tests/testthat/test-check_madc_sanity.R @@ -0,0 +1,10 @@ +test_that("check madc",{ + madc_file <- system.file("example_MADC_FixedAlleleID.csv", package="BIGr") + report <- read.csv(madc_file, check.names = FALSE) + + check_madc_sanity(report) + + +}) + + From 1b761b92586c5eca1521b52cb7e745edb7e173dc Mon Sep 17 00:00:00 2001 From: josuechinchilla Date: Tue, 4 Nov 2025 16:06:12 -0500 Subject: [PATCH 02/80] updated check_ped to save corrected dataframe and report --- .DS_Store | Bin 0 -> 8196 bytes BIGr.Rproj | 1 + DESCRIPTION | 2 +- R/check_ped.R | 267 +++++++++++++++++++++++++++++--------------------- 4 files changed, 157 insertions(+), 113 deletions(-) create mode 100644 .DS_Store diff --git a/.DS_Store b/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..3b9d8426aa5a73e270c1816e6de3b764b9deb853 GIT binary patch literal 8196 zcmeHMU2GLa6rOKeV3!TB)B*)rduxSaxrVktrJ``TZ6iM|f48MS{M5a77rJ4)Tkh_? zr8d?2qyaVYMdSY)Q6EsFDEjJuVtgP-l$eMy@rf6W;Dd?r%+6k*Eq&Aw*hyy2oHOUl z^ql?9%$+G?4C#W|$XFF)Os2}Irc!g0!pC`?R-{N$P84L%{Nb$QWr;hR9iFBg3*rdG z5r`uYM<9+s9D!RQ0(55cBF}K{i_y4^BM?X6j*I}`9}-kKO+|D<&~WRZCcFX=O19Ce0V?}Tsw(I&v=O>yYtb#ENM*eYHl z$}-tu;myU8H_BX9ou~Ko_Vw>SFraA#-`ZLfdX=Gu}aB>AwwzfD<#(rFA=klK8?Dl*+usyf0H?WQee2h;tX{fi~_wqq&I9abv z^9!vCzQ(R$ZtU-~@;*^TIX!<#^?j=zXnOGBRQtBhJ?9r_3oF!0b#YJ8_U)`=b&uF) zFxY4KIm5REEQFQnW?AXYc|$|wj4xzqy-n7>mJjiTI}AVA zHEg-1KPjzmmF0ea(8yc6MhJ2xiw;>1KO@PxNxNwWUOtS*&Kh0s<5O)hP0J*wmsKsT z&@+0steqnp&3216ftYNCHR-399fU6aRu`tH9sRqGUmc$X|hnmhBSek8T zhuAP1WhdE->`nF|JHtL=U$F1k1@;sBjs4F4U{}~x_BYBg7nN9!6{y21+>a)##dirPV^%O6NhmG4hksZI7aaVp2jmcfs=Rvui#a@jyG@$@8UhYk5BO#KF62%3K#G* zF5wqk#uZ$}Uy>qKNXsQ%x<{&$R!Q~JYH5?SS=uV?k@}?pX;63|l@jgXWY45g;-hVY z5-oc6lTP0}(W)Qax^4T8j$1a-ew@!!>#oX0iHDhe5O>?=J2$Y&vWgXd(~=f7A1^eSJn|bo03JatCQ8$nnXD& z*oMYxRV$+mR6^VJNn#N(Ef1-xR!+Gn#y081vWQ8y(lk7?`G+2X={FCXD}s zGRRnh1XdzRnBIgG;kpg&gzMediwwGO5CbrfC6qgGF^pr#V+`YX5)*ieF#ar_!}E9v zFB8sRBcz|gTX-Aq-~*h&$2f~mLd1WI?{N{oPQh^JWDMia>liL(@~-2#M@Uja{AA_Z zk}DHlRq^|O`^@kEx25l~Q*i|12#5%vye-p~rv6NOAHQoSs6IxO7p^xUXlO#sq=x|F kul>W2`Uz6yHmQhC2uczv|NBG0^dH*s`5&MEcij2^FAa5Kvj6}9 literal 0 HcmV?d00001 diff --git a/BIGr.Rproj b/BIGr.Rproj index 69fafd4..5638e2e 100644 --- a/BIGr.Rproj +++ b/BIGr.Rproj @@ -1,4 +1,5 @@ Version: 1.0 +ProjectId: 0eeaab63-2615-4da7-b10a-927160fc78a3 RestoreWorkspace: No SaveWorkspace: No diff --git a/DESCRIPTION b/DESCRIPTION index 31ad1e1..cba831e 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -44,7 +44,7 @@ URL: https://github.com/Breeding-Insight/BIGr BugReports: https://github.com/Breeding-Insight/BIGr/issues Encoding: UTF-8 Roxygen: list(markdown = TRUE) -RoxygenNote: 7.3.2 +RoxygenNote: 7.3.3 Depends: R (>= 4.4.0) biocViews: Imports: diff --git a/R/check_ped.R b/R/check_ped.R index 3f4831b..35b0ba9 100644 --- a/R/check_ped.R +++ b/R/check_ped.R @@ -1,99 +1,142 @@ -#' Evaluate Pedigree File for Accuracy +#' Check a pedigree file for accuracy and report/correct common errors #' -#' Check a pedigree file for accuracy and output suspected errors +#' `check_ped` reads a 3-column pedigree file (tab-separated, columns labeled `id`, `sire`, `dam` in any order) +#' and performs quality checks, optionally correcting or flagging errors. #' -#'check_ped takes a 3-column pedigree tab separated file with columns labeled as id sire dam in any order and checks for: -#'* Ids that appear more than once in the id column -#'* Ids that appear in both sire and dam columns -#'* Direct (e.g. parent is a offspring of his own daughter) and indirect (e.g. a great grandparent is son of its grandchild) dependencies within the pedigree. -#'* Individuals included in the pedigree as sire or dam but not on the id column and reports them back with unknown parents (0). +#' The function checks for: +#' * Exact duplicate rows and removes them (keeping one copy) +#' * IDs that appear more than once with conflicting sire/dam assignments (sets sire/dam to "0") +#' * IDs that appear in both sire and dam columns +#' * Missing parents (IDs referenced as sire/dam but not in `id` column), adds them with sire/dam = "0" +#' * Direct and indirect pedigree dependencies (cycles), such as a parent being its own descendant #' -#'When using check_ped, do a first run to check for repeated ids and parents that appear as sire and dam. -#'Once these errors are cleaned run the function again to check for dependencies as this will provide the most accurate report. +#' After an initial run to clean exact duplicates and repeated IDs, you can run the function again to detect cycles more accurately. #' -#'Note: This function does not change the input file but prints any errors found in the console. +#' The function does **not** overwrite the input file. Instead, it prints findings to the console and optionally saves: +#' * Corrected pedigree as a dataframe in the global environment +#' * A report listing all detected issues +#' +#' @param ped.file Path to the pedigree text file. +#' @param seed Optional seed for reproducibility. +#' @param verbose Logical. If TRUE (default), prints errors and prompts for interactive saving. +#' +#' @return A list of data.frames containing detected issues: +#' * `exact_duplicates`: rows that were exact duplicates +#' * `repeated_ids_diff`: IDs appearing more than once with conflicting sire/dam +#' * `messy_parents`: IDs appearing as both sire and dam +#' * `missing_parents`: parents added to the pedigree with 0 as sire/dam +#' * `dependencies`: detected cycles in the pedigree #' -#' @param ped.file path to pedigree text file. The pedigree file is a -#' 3-column pedigree tab separated file with columns labeled as id sire dam in any order -#' @param seed Optional seed for reproducibility -#' @param verbose Logical. If TRUE, print the errors to the console. -#' @return A list of data.frames of error types, and the output printed to the console #' @examples -#' ##Get list with a dataframe for each error type -#' ped_file <- system.file("check_ped_test.txt", package="BIGr") -#' ped_errors <- check_ped(ped.file = ped_file, -#' seed = 101919) +#' ped_file <- system.file("check_ped_test.txt", package = "BIGr") +#' ped_errors <- check_ped(ped.file = ped_file, seed = 101919) #' -#' ##Access the "messy parents" dataframe result +#' # Access messy parents #' ped_errors$messy_parents #' -#' ##Get list of sample IDs with messy parents error -#' messy_parent_ids <- ped_errors$messy_parents$id -#' print(messy_parent_ids) +#' # IDs with messy parents +#' messy_ids <- ped_errors$messy_parents$id +#' print(messy_ids) +#' #' @import dplyr #' @import janitor #' @importFrom stats setNames #' @importFrom utils read.table #' @export check_ped <- function(ped.file, seed = NULL, verbose = TRUE) { - #### Function to check for hierarchical errors missing parents and repeated ids #### - if(!is.null(seed)){ - set.seed(seed) - } - #### read in data #### - data = utils::read.table(ped.file, header = T) - data <- data %>% + + #### setup #### + if (!is.null(seed)) set.seed(seed) + + # Read and clean data + data <- utils::read.table(ped.file, header = TRUE) %>% janitor::clean_names() %>% mutate( id = as.character(id), sire = as.character(sire), dam = as.character(dam) - ) - #Missing parents dataframe initialize - missing_parents <- data.frame(id = character(), sire = character(), dam = character(), stringsAsFactors = FALSE) + ) + + original_data <- data errors <- list() - # repeated id checks - n_occur <- data.frame(table(data$id)) - repeated_ids = n_occur[n_occur$Freq > 1,] %>% - rename(id = Var1) - # Check for ids that appear as both sire and dam ###This is possible for plants so maybe do not control for this or do not delete these rows just print them - messy_parents <- as.data.frame(intersect(data$sire, data$dam)) %>% - rename(id = 1) %>% - filter(id != 0) - # Missing parents check + missing_parents <- data.frame(id = character(), sire = character(), dam = character(), stringsAsFactors = FALSE) + + #### check 1: exact duplicates #### + exact_duplicates <- data[duplicated(data), ] + if (nrow(exact_duplicates) > 0) { + data <- distinct(data) # remove exact duplicates + } + + #### check 2: repeated IDs with conflicting sire/dam #### + repeated_ids <- data %>% + group_by(id) %>% + filter(n() > 1) %>% + ungroup() + + # Only IDs with actual conflicting sire/dam + conflicting_ids <- repeated_ids %>% + group_by(id) %>% + filter(n_distinct(sire) > 1 | n_distinct(dam) > 1) %>% + ungroup() + + if (nrow(conflicting_ids) > 0) { + # Keep one row per ID, set sire/dam to "0" + data <- data %>% + group_by(id) %>% + summarize( + sire = if(n_distinct(sire) > 1) "0" else first(sire), + dam = if(n_distinct(dam) > 1) "0" else first(dam), + .groups = "drop" + ) + } + + repeated_ids_report <- conflicting_ids + + #### check 3: missing parents #### for (i in 1:nrow(data)) { id <- data$id[i] sire <- data$sire[i] dam <- data$dam[i] + if (sire != "0" && sire != id && !sire %in% data$id) { missing_parents <- rbind(missing_parents, data.frame(id = sire, sire = "0", dam = "0", stringsAsFactors = FALSE)) } if (dam != "0" && dam != id && !dam %in% data$id) { missing_parents <- rbind(missing_parents, data.frame(id = dam, sire = "0", dam = "0", stringsAsFactors = FALSE)) } + if (sire == id || dam == id) { errors <- append(errors, paste("Dependency: Individual", id, "cannot be its own parent")) } } - # Remove duplicates + missing_parents <- distinct(missing_parents) - # Combine original data with missing parents - corrected_data <- bind_rows(data, missing_parents) - # Function to detect cycles in the pedigree graph and identify the nodes involved + if (nrow(missing_parents) > 0) { + data <- bind_rows(data, missing_parents) + } + + #### check 4: messy parents #### + sire_ids <- unique(data$sire[data$sire != "0"]) + dam_ids <- unique(data$dam[data$dam != "0"]) + messy_ids <- intersect(sire_ids, dam_ids) + messy_parents <- data %>% filter(id %in% messy_ids) + + #### check 5: dependencies (cycles) #### detect_all_cycles <- function(data) { - # Create an adjacency list - adj_list <- list() - for (i in 1:nrow(data)) { - adj_list[[data$id[i]]] <- c(data$sire[i], data$dam[i]) - } - # Helper function to perform DFS and detect cycles + adj_list <- lapply(data$id, function(x) { + row <- data[data$id == x, ] + c(row$sire, row$dam) + }) + names(adj_list) <- data$id + dfs <- function(node, visited, rec_stack, path) { visited[node] <- TRUE rec_stack[node] <- TRUE path <- append(path, node) cycles <- list() + for (neighbor in adj_list[[node]]) { - if (neighbor != "0") { + if (neighbor %in% names(adj_list)) { if (!visited[neighbor]) { cycles <- append(cycles, dfs(neighbor, visited, rec_stack, path)) } else if (rec_stack[neighbor]) { @@ -102,14 +145,15 @@ check_ped <- function(ped.file, seed = NULL, verbose = TRUE) { } } } + rec_stack[node] <- FALSE return(cycles) } - # Initialize visited and recursion stack + visited <- stats::setNames(rep(FALSE, length(adj_list)), names(adj_list)) rec_stack <- stats::setNames(rep(FALSE, length(adj_list)), names(adj_list)) all_cycles <- list() - # Check for cycles in the graph and return the nodes involved + for (node in names(adj_list)) { if (!visited[node]) { node_cycles <- dfs(node, visited, rec_stack, character()) @@ -120,75 +164,74 @@ check_ped <- function(ped.file, seed = NULL, verbose = TRUE) { } return(all_cycles) } - # Check for cycles in the corrected pedigree data - cycles <- detect_all_cycles(corrected_data) + + cycles <- detect_all_cycles(data) if (length(cycles) > 0) { - cycle_number <- 1 for (cycle_group in cycles) { cycle_ids <- unique(unlist(cycle_group)) errors <- append(errors, paste("Cycle detected involving nodes:", paste(cycle_ids, collapse = " -> "))) } } - results <- list(missing_parents = missing_parents, dependencies = data.frame(Dependency = unlist(errors)), repeated_ids = repeated_ids, messy_parents = messy_parents) - repeated_ids <- results$repeated_ids - missing_parents <- results$missing_parents - messy_parents <- results$messy_parents - errors <- results$dependencies - # Adding the dataframes as an output list - output.results <- list() - #### Print errors and cycles #### - # Print repeated ids if any - if (nrow(repeated_ids) > 0) { - if (verbose) { - cat("Repeated ids found:\n") - message(repeated_ids) - } - output.results$repeated_ids <- repeated_ids - } else { - if (verbose) { - cat("No repeated ids found.\n") - } - } - #Print parents that appear as male and female - if (nrow(messy_parents) > 0) { - if (verbose) { - cat("Ids found as male and female parent:\n") - message(messy_parents) - } - output.results$messy_parents <- messy_parents + #### compile findings #### + input_ped_report <- list( + exact_duplicates = exact_duplicates, + repeated_ids_diff = repeated_ids_report, + messy_parents = messy_parents, + missing_parents = missing_parents, + dependencies = data.frame(Dependency = unique(unlist(errors))) + ) - } else { - if (verbose) { - cat("No ids found as male and female parent.\n") - } - } - # Print missing parents if any - if (nrow(missing_parents) > 0) { - if (verbose) { - cat("Missing parents found:\n") - message(missing_parents) - } - output.results$missing_parents <- missing_parents + #### file names #### + file_base <- tools::file_path_sans_ext(basename(ped.file)) + corrected_name <- paste0(file_base, "_corrected") + report_name <- paste0(file_base, "_report") - } else { - if (verbose) { - cat("No missing parents found.\n") - } - } - # Print errors if any - if (nrow(errors) > 0) { - if (verbose) { - cat("Dependencies found:\n") - message(unique(errors$Dependency)) + #### output #### + if (verbose) { + cat("\n=== Pedigree Quality Check Report ===\n") + + if (nrow(exact_duplicates) > 0) { + cat("\nExact duplicate rows detected and removed (only one copy kept):\n") + print(exact_duplicates) + } else cat("\nNo exact duplicate rows found.\n") + + if (nrow(repeated_ids_report) > 0) { + cat("\nRepeated IDs with conflicting sire/dam (sire/dam set to 0 in corrected pedigree):\n") + print(repeated_ids_report) + } else cat("\nNo conflicting repeated IDs found.\n") + + if (nrow(messy_parents) > 0) { + cat("\nIDs found as both sire and dam:\n") + print(messy_parents) + } else cat("\nNo IDs found as both sire and dam.\n") + + if (nrow(missing_parents) > 0) { + cat("\nMissing parents were added to the pedigree with 0 as sire/dam:\n") + print(missing_parents) + } else cat("\nNo missing parents found.\n") + + if (nrow(input_ped_report$dependencies) > 0) { + cat("\nDependencies detected:\n") + print(input_ped_report$dependencies) + } else cat("\nNo dependencies detected.\n") + + #### interactive save #### + cat(paste0("\nDo you want to save the corrected pedigree as dataframe `", corrected_name, "`? (y/n): ")) + ans <- tolower(trimws(readline())) + if (ans == "y") { + assign(corrected_name, data, envir = .GlobalEnv) + assign("input_ped_report", input_ped_report, envir = .GlobalEnv) + cat(paste0("Saved corrected pedigree as `", corrected_name, "` and report as `input_ped_report`.\n")) + } else { + cat("No corrected pedigree was saved.\n") } - output.results$dependencies <- data.frame(Dependency = unlist(errors)) } else { - if (verbose) { - cat("No dependencies found.\n") - } + # Silent automatic mode + assign(corrected_name, data, envir = .GlobalEnv) + assign(report_name, input_ped_report, envir = .GlobalEnv) } - return(results) + invisible(input_ped_report) } From 743043a0753f35def5d52c441d077f07800adb1d Mon Sep 17 00:00:00 2001 From: josuechinchilla Date: Tue, 4 Nov 2025 16:23:14 -0500 Subject: [PATCH 03/80] reorganized report and fixed language --- R/check_ped.R | 21 +++++++++--------- man/check_ped.Rd | 56 ++++++++++++++++++++++++++++-------------------- 2 files changed, 44 insertions(+), 33 deletions(-) diff --git a/R/check_ped.R b/R/check_ped.R index 35b0ba9..a2277bc 100644 --- a/R/check_ped.R +++ b/R/check_ped.R @@ -169,7 +169,7 @@ check_ped <- function(ped.file, seed = NULL, verbose = TRUE) { if (length(cycles) > 0) { for (cycle_group in cycles) { cycle_ids <- unique(unlist(cycle_group)) - errors <- append(errors, paste("Cycle detected involving nodes:", paste(cycle_ids, collapse = " -> "))) + errors <- append(errors, paste("Cycle detected involving IDs:", paste(cycle_ids, collapse = " -> "))) } } @@ -192,24 +192,25 @@ check_ped <- function(ped.file, seed = NULL, verbose = TRUE) { cat("\n=== Pedigree Quality Check Report ===\n") if (nrow(exact_duplicates) > 0) { - cat("\nExact duplicate rows detected and removed (only one copy kept):\n") + cat("\n Exact duplicate trios detected (only one copy will be kept in corrected pedigree):\n") print(exact_duplicates) - } else cat("\nNo exact duplicate rows found.\n") + } else cat("\nNo exact duplicate trios found.\n") if (nrow(repeated_ids_report) > 0) { - cat("\nRepeated IDs with conflicting sire/dam (sire/dam set to 0 in corrected pedigree):\n") + cat("\nConflicting trios detected (sire/dam set to 0 in corrected pedigree):\n") print(repeated_ids_report) - } else cat("\nNo conflicting repeated IDs found.\n") + } else cat("\nNo conflicting repeated trios found.\n") + + if (nrow(missing_parents) > 0) { + cat("\n Parents missing as IDs found in the pedigree (will be added as founders in corrected pedigree):\n") + print(missing_parents) + } else cat("\nNo missing parents found.\n") if (nrow(messy_parents) > 0) { - cat("\nIDs found as both sire and dam:\n") + cat("\n IDs found as both sire and dam (is selfing or hermaphrodytism possible?):\n") print(messy_parents) } else cat("\nNo IDs found as both sire and dam.\n") - if (nrow(missing_parents) > 0) { - cat("\nMissing parents were added to the pedigree with 0 as sire/dam:\n") - print(missing_parents) - } else cat("\nNo missing parents found.\n") if (nrow(input_ped_report$dependencies) > 0) { cat("\nDependencies detected:\n") diff --git a/man/check_ped.Rd b/man/check_ped.Rd index 693bfe0..ea63de7 100644 --- a/man/check_ped.Rd +++ b/man/check_ped.Rd @@ -2,48 +2,58 @@ % Please edit documentation in R/check_ped.R \name{check_ped} \alias{check_ped} -\title{Evaluate Pedigree File for Accuracy} +\title{Check a pedigree file for accuracy and report/correct common errors} \usage{ check_ped(ped.file, seed = NULL, verbose = TRUE) } \arguments{ -\item{ped.file}{path to pedigree text file. The pedigree file is a -3-column pedigree tab separated file with columns labeled as id sire dam in any order} +\item{ped.file}{Path to the pedigree text file.} -\item{seed}{Optional seed for reproducibility} +\item{seed}{Optional seed for reproducibility.} -\item{verbose}{Logical. If TRUE, print the errors to the console.} +\item{verbose}{Logical. If TRUE (default), prints errors and prompts for interactive saving.} } \value{ -A list of data.frames of error types, and the output printed to the console +A list of data.frames containing detected issues: +\itemize{ +\item \code{exact_duplicates}: rows that were exact duplicates +\item \code{repeated_ids_diff}: IDs appearing more than once with conflicting sire/dam +\item \code{messy_parents}: IDs appearing as both sire and dam +\item \code{missing_parents}: parents added to the pedigree with 0 as sire/dam +\item \code{dependencies}: detected cycles in the pedigree +} } \description{ -Check a pedigree file for accuracy and output suspected errors +\code{check_ped} reads a 3-column pedigree file (tab-separated, columns labeled \code{id}, \code{sire}, \code{dam} in any order) +and performs quality checks, optionally correcting or flagging errors. } \details{ -check_ped takes a 3-column pedigree tab separated file with columns labeled as id sire dam in any order and checks for: +The function checks for: \itemize{ -\item Ids that appear more than once in the id column -\item Ids that appear in both sire and dam columns -\item Direct (e.g. parent is a offspring of his own daughter) and indirect (e.g. a great grandparent is son of its grandchild) dependencies within the pedigree. -\item Individuals included in the pedigree as sire or dam but not on the id column and reports them back with unknown parents (0). +\item Exact duplicate rows and removes them (keeping one copy) +\item IDs that appear more than once with conflicting sire/dam assignments (sets sire/dam to "0") +\item IDs that appear in both sire and dam columns +\item Missing parents (IDs referenced as sire/dam but not in \code{id} column), adds them with sire/dam = "0" +\item Direct and indirect pedigree dependencies (cycles), such as a parent being its own descendant } -When using check_ped, do a first run to check for repeated ids and parents that appear as sire and dam. -Once these errors are cleaned run the function again to check for dependencies as this will provide the most accurate report. +After an initial run to clean exact duplicates and repeated IDs, you can run the function again to detect cycles more accurately. -Note: This function does not change the input file but prints any errors found in the console. +The function does \strong{not} overwrite the input file. Instead, it prints findings to the console and optionally saves: +\itemize{ +\item Corrected pedigree as a dataframe in the global environment +\item A report listing all detected issues +} } \examples{ -##Get list with a dataframe for each error type -ped_file <- system.file("check_ped_test.txt", package="BIGr") -ped_errors <- check_ped(ped.file = ped_file, - seed = 101919) +ped_file <- system.file("check_ped_test.txt", package = "BIGr") +ped_errors <- check_ped(ped.file = ped_file, seed = 101919) -##Access the "messy parents" dataframe result +# Access messy parents ped_errors$messy_parents -##Get list of sample IDs with messy parents error -messy_parent_ids <- ped_errors$messy_parents$id -print(messy_parent_ids) +# IDs with messy parents +messy_ids <- ped_errors$messy_parents$id +print(messy_ids) + } From 0b97b46434afcd7f441cb2b9705df2a9323232bf Mon Sep 17 00:00:00 2001 From: Cristianetaniguti Date: Fri, 14 Nov 2025 12:59:33 -0500 Subject: [PATCH 04/80] bugfix - if hapDB padding is not matching report --- R/madc2vcf_all.R | 16 ++++++++++++++-- R/utils.R | 1 + 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/R/madc2vcf_all.R b/R/madc2vcf_all.R index 329ac5a..125d540 100644 --- a/R/madc2vcf_all.R +++ b/R/madc2vcf_all.R @@ -135,7 +135,7 @@ madc2vcf_all <- function(madc = NULL, loop_though_dartag_report <- function(report, botloci, hap_seq, n.cores=1, alignment_score_thr=40, verbose = TRUE){ if(!is.null(hap_seq)){ - hap_seq <- get_ref_alt_hap_seq(hap_seq) + hap_seq <- get_ref_alt_hap_seq(hap_seq, botloci) } nsamples <- ncol(report) - 3 @@ -376,7 +376,8 @@ compare <- function(one_tag, botloci, alignment_score_thr = 40){ #' @param hap_seq haplotype db #' #' @noRd -get_ref_alt_hap_seq <- function(hap_seq){ +get_ref_alt_hap_seq <- function(hap_seq, botloci){ + headers <- hap_seq$V1[grep(">",hap_seq$V1)] headers <- gsub(">", "", headers) @@ -394,6 +395,17 @@ get_ref_alt_hap_seq <- function(hap_seq){ seqs <- sapply(seqs, function(x) paste0(x, collapse = "")) hap_seq <- data.frame(AlleleID = headers, AlleleSequence = seqs) + + # Check padding + hap_cloneID <- sapply(strsplit(hap_seq$AlleleID, "[|]"), function(x) x[1]) + botloci_cloneID <- botloci$V1 + + pad_hap <- unique(nchar(sub(".*_", "", hap_cloneID))) + pad_botloci <- unique(nchar(sub(".*_", "", botloci_cloneID))) + + if(length(pad_hap) > 1) stop("Check marker IDs in haplotype DB file. They should have the same padding.") + if(pad_hap != pad_botloci) stop("Check marker IDs padding in haplotype DB file. They should match the botloci file.") + return(hap_seq) } diff --git a/R/utils.R b/R/utils.R index c280ad2..2399560 100644 --- a/R/utils.R +++ b/R/utils.R @@ -56,6 +56,7 @@ check_botloci <- function(botloci, report, verbose=TRUE){ report$CloneID <- paste0(sub("_(.*)", "", report$CloneID), "_", sprintf(paste0("%0", pad_botloci, "d"), as.integer(sub(".*_", "", report$CloneID))) ) + report$AlleleID <- paste0(report$CloneID, "|", sapply(strsplit(report$AlleleID, "[|]"), "[[",2)) } else { botloci$V1 <- paste0(sub("_(.*)", "", botloci$V1), "_", sprintf(paste0("%0", pad_madc, "d"), as.integer(sub(".*_", "", botloci$V1))) From bccf0acb289597b284836e23bfb364e20e98674c Mon Sep 17 00:00:00 2001 From: Cristianetaniguti Date: Thu, 22 Jan 2026 17:13:05 -0500 Subject: [PATCH 05/80] add indel exception --- R/madc2vcf_all.R | 26 +++++++++++++++++++------- 1 file changed, 19 insertions(+), 7 deletions(-) diff --git a/R/madc2vcf_all.R b/R/madc2vcf_all.R index 329ac5a..919fdb2 100644 --- a/R/madc2vcf_all.R +++ b/R/madc2vcf_all.R @@ -147,7 +147,7 @@ loop_though_dartag_report <- function(report, botloci, hap_seq, n.cores=1, align by_cloneID <- split.data.frame(new.file, new.file$CloneID) clust <- makeCluster(n.cores) - #clusterExport(clust, c("hap_seq","add_ref_alt", "nsamples")) + clusterExport(clust, c("hap_seq","add_ref_alt", "nsamples")) add_ref_alt_results <- parLapply(clust, by_cloneID, function(x) add_ref_alt(x, hap_seq, nsamples, verbose = verbose)) stopCluster(clust) @@ -168,8 +168,8 @@ loop_though_dartag_report <- function(report, botloci, hap_seq, n.cores=1, align } clust <- makeCluster(n.cores) - #clusterExport(clust, c("botloci", "compare", "nucleotideSubstitutionMatrix", "pairwiseAlignment", "DNAString", "reverseComplement")) - #clusterExport(clust, c("botloci", "alignment_score_thr")) + clusterExport(clust, c("botloci", "compare", "nucleotideSubstitutionMatrix", "pairwiseAlignment", "DNAString", "reverseComplement")) + clusterExport(clust, c("botloci", "alignment_score_thr")) compare_results <- parLapply(clust, updated_by_cloneID, function(x) compare(x, botloci, alignment_score_thr)) stopCluster(clust) @@ -180,10 +180,13 @@ loop_though_dartag_report <- function(report, botloci, hap_seq, n.cores=1, align rm_score <- unlist(rm_score) rm_N <- sapply(compare_results, "[[", 3) rm_N <- unlist(rm_N) + rm_indels <- sapply(compare_results, "[[", 4) + rm_indels <- unlist(rm_indels) if(verbose){ cat("Number of tags removed because of low alignment score:", length(rm_score),"tags\n") cat("Number of tags removed because of N in the alternative sequence:", length(rm_N),"tags\n") + cat("Number of tags removed because of indels as targets (not yet supported):", length(rm_indels),"tags\n") } rownames(my_results_csv) <- NULL @@ -301,6 +304,13 @@ compare <- function(one_tag, botloci, alignment_score_thr = 40){ # The score is a bit different from the python script despite same weights if(align@score > alignment_score_thr){ # if score for the target sequence is smaller than the threshold, the tag will be discarted pos_target_idx <- align@pattern@mismatch@unlistData + if(length(align@pattern@mismatch@unlistData) == 0){ + #No polymorphisms found between ref and alt sequences - or just indels + return(list(update_tag = NULL, + rm_score = NULL, + rm_N = NULL, + rm_indels= cloneID)) + } ref_base <- substring(ref_seq, align@pattern@mismatch@unlistData, align@pattern@mismatch@unlistData) alt_base <- substring(alt_seq, align@subject@mismatch@unlistData, align@subject@mismatch@unlistData) @@ -356,18 +366,20 @@ compare <- function(one_tag, botloci, alignment_score_thr = 40){ } return(list(update_tag = update_tag, # updated data.frame, NULL if discarted rm_score = NULL, # cloneID if removed because of low alignment score, NULL if kept - rm_N = NULL)) # cloneID if removed because of N in the target alternative, NULL if kept + rm_N = NULL, + rm_indels = NULL)) # cloneID if removed because of N in the target alternative, NULL if kept } else { return(list(update_tag = NULL, rm_score = NULL, - rm_N = cloneID)) + rm_N = cloneID, + rm_indels = NULL)) } } else{ return(list(update_tag = NULL, rm_score = cloneID, - rm_N = NULL)) + rm_N = NULL, + rm_indels = NULL)) } - } #' Converts the fasta to a data.frame with first column the AlleleID and second the AlleleSequence From 25caf0a12dfd179f85a87b1bc5adc8481a04d7f2 Mon Sep 17 00:00:00 2001 From: Cristianetaniguti Date: Thu, 22 Jan 2026 17:14:14 -0500 Subject: [PATCH 06/80] bugfix --- R/madc2vcf_all.R | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/R/madc2vcf_all.R b/R/madc2vcf_all.R index 919fdb2..2dc4fb5 100644 --- a/R/madc2vcf_all.R +++ b/R/madc2vcf_all.R @@ -147,7 +147,7 @@ loop_though_dartag_report <- function(report, botloci, hap_seq, n.cores=1, align by_cloneID <- split.data.frame(new.file, new.file$CloneID) clust <- makeCluster(n.cores) - clusterExport(clust, c("hap_seq","add_ref_alt", "nsamples")) + #clusterExport(clust, c("hap_seq","add_ref_alt", "nsamples")) add_ref_alt_results <- parLapply(clust, by_cloneID, function(x) add_ref_alt(x, hap_seq, nsamples, verbose = verbose)) stopCluster(clust) @@ -168,8 +168,8 @@ loop_though_dartag_report <- function(report, botloci, hap_seq, n.cores=1, align } clust <- makeCluster(n.cores) - clusterExport(clust, c("botloci", "compare", "nucleotideSubstitutionMatrix", "pairwiseAlignment", "DNAString", "reverseComplement")) - clusterExport(clust, c("botloci", "alignment_score_thr")) + #clusterExport(clust, c("botloci", "compare", "nucleotideSubstitutionMatrix", "pairwiseAlignment", "DNAString", "reverseComplement")) + #clusterExport(clust, c("botloci", "alignment_score_thr")) compare_results <- parLapply(clust, updated_by_cloneID, function(x) compare(x, botloci, alignment_score_thr)) stopCluster(clust) From 4f30e528d7176269b2e502883e1d43552347f014 Mon Sep 17 00:00:00 2001 From: Cristianetaniguti Date: Thu, 22 Jan 2026 17:18:39 -0500 Subject: [PATCH 07/80] up version --- DESCRIPTION | 2 +- NEWS.md | 4 ++++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index 31ad1e1..0cef1b8 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: BIGr Title: Breeding Insight Genomics Functions for Polyploid and Diploid Species -Version: 0.6.2 +Version: 0.6.3 Authors@R: c(person(given='Alexander M.', family='Sandercock', email='sandercock.alex@gmail.com', diff --git a/NEWS.md b/NEWS.md index b089e67..19d2509 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,7 @@ +# BIGr 0.6.3 + +- Ignore tags when targets are indels + # BIGr 0.6.2 - Fixed the doi and name list in the CITATION file From 82279af97ef82a077a0dcdfd264f961b9d2abf2c Mon Sep 17 00:00:00 2001 From: "josue.chinchilla" Date: Thu, 26 Feb 2026 15:12:09 -0500 Subject: [PATCH 08/80] added option to print plot or list to imputation_concordance --- R/imputation_concordance.R | 155 ++++++++++++++++++++++++++----------- 1 file changed, 108 insertions(+), 47 deletions(-) diff --git a/R/imputation_concordance.R b/R/imputation_concordance.R index 6ab2eba..ac501bb 100644 --- a/R/imputation_concordance.R +++ b/R/imputation_concordance.R @@ -1,89 +1,121 @@ #' Calculate Concordance between Imputed and Reference Genotypes #' -#' This function calculates the concordance between imputed and reference genotypes. It assumes that samples are rows and markers are columns. -#' It is recommended to use allele dosages (0, 1, 2) but will work with other formats. Missing data in reference or imputed genotypes -#' will not be considered for concordance if the `missing_code` argument is used. If a specific subset of markers should be excluded, -#' it can be provided using the `snps_2_exclude` argument. +#' This function calculates the concordance between imputed and reference +#' genotypes. It assumes that samples are rows and markers are columns. +#' Allele dosages (0, 1, 2) are recommended but other numeric formats are supported. +#' Missing data in either dataset can be excluded from the concordance calculation +#' using the `missing_code` argument. Specific markers can be excluded using +#' the `snps_2_exclude` argument. #' -#' @param reference_genos A data frame containing reference genotype data, with rows as samples and columns as markers. Dosage format (0, 1, 2) is recommended. -#' @param imputed_genos A data frame containing imputed genotype data, with rows as samples and columns as markers. Dosage format (0, 1, 2) is recommended. -#' @param missing_code An optional value to specify missing data. If provided, loci with this value in either dataset will be excluded from the concordance calculation. -#' @param snps_2_exclude An optional vector of marker IDs to exclude from the concordance calculation. -#' @param verbose A logical value indicating whether to print a summary of the concordance results. Default is FALSE. +#' @param reference_genos A data frame containing reference genotype data, +#' with rows as samples and columns as markers. Must include a column named `ID`. #' -#' @return A list with two elements: -#' \itemize{ -#' \item \code{result_df}: A data frame with sample IDs and their concordance percentages. -#' \item \code{summary_concordance}: A summary of concordance percentages, including minimum, maximum, mean, and quartiles. -#' } +#' @param imputed_genos A data frame containing imputed genotype data, +#' with rows as samples and columns as markers. Must include a column named `ID`. #' -#' @details The function identifies common samples and markers between the reference and imputed genotype datasets. It calculates the percentage of matching genotypes for each sample, excluding missing data and specified markers. The concordance is reported as a percentage for each sample, along with a summary of the overall concordance distribution. +#' @param missing_code Optional value specifying missing data. If provided, +#' loci with this value in either dataset will be excluded from the concordance calculation. #' -#' @import dplyr +#' @param snps_2_exclude Optional vector of marker IDs to exclude from the concordance calculation. #' -#' @examples +#' @param verbose Logical. If `TRUE`, prints summary statistics (minimum, quartiles, +#' median, mean, maximum) of concordance percentages. #' -#' # Example Input variables -#' ignore_file <- system.file("imputation_ignore.txt", package="BIGr") -#' ref_file <- system.file("imputation_reference.txt", package="BIGr") -#' test_file <- system.file("imputation_test.txt", package="BIGr") +#' @param plot Logical. If `TRUE`, produces a bar plot of concordance percentage +#' by sample. #' -#' # Import files -#' snps = read.table(ignore_file, header = TRUE) -#' ref = read.table(ref_file, header = TRUE) -#' test = read.table(test_file, header = TRUE) +#' @param print_result Logical. If `TRUE` (default), prints the concordance +#' results data frame to the console. If `FALSE`, results are returned invisibly. #' -#' #Calculations -#' result <- imputation_concordance(reference_genos = ref, -#' imputed_genos = test, -#' snps_2_exclude = snps, -#' missing_code = 5, -#' verbose = FALSE) +#' @return A data frame with: +#' \itemize{ +#' \item \code{ID}: Sample identifiers shared between the datasets. +#' \item \code{Concordance}: Percentage of matching genotypes per sample. +#' } +#' If \code{print_result = FALSE}, the data frame is returned invisibly. #' +#' @details +#' The function: +#' \enumerate{ +#' \item Identifies common samples and markers between the datasets. +#' \item Optionally excludes specified SNPs. +#' \item Removes loci with missing data (if \code{missing_code} is provided). +#' \item Computes per-sample concordance as the percentage of matching genotypes. +#' } #' +#' When \code{plot = TRUE}, a bar plot showing concordance percentage per sample +#' is generated using \pkg{ggplot2}. #' -#' @export +#' @import dplyr +#' @import ggplot2 +#' +#' @examples +#' result <- imputation_concordance( +#' reference_genos = ref, +#' imputed_genos = test, +#' snps_2_exclude = snps, +#' missing_code = 5, +#' verbose = TRUE, +#' plot = TRUE +#' ) #' +#' @export imputation_concordance <- function(reference_genos, imputed_genos, missing_code = NULL, snps_2_exclude = NULL, - verbose = FALSE) { + verbose = FALSE, + plot = FALSE, + print_result = TRUE) { # Find common IDs common_ids <- intersect(imputed_genos$ID, reference_genos$ID) - imputed_genos <- imputed_genos %>% filter(ID %in% common_ids) %>% arrange(ID) - reference_genos <- reference_genos %>% filter(ID %in% common_ids) %>% arrange(ID) + imputed_genos <- imputed_genos %>% + filter(ID %in% common_ids) %>% + arrange(ID) + + reference_genos <- reference_genos %>% + filter(ID %in% common_ids) %>% + arrange(ID) - # Find common SNPs, excluding those in snps_2_exclude if provided + # Find common SNPs common_snps <- setdiff( intersect(colnames(imputed_genos), colnames(reference_genos)), as.vector(unlist(snps_2_exclude)) ) - # Subset and convert to matrices for faster computation + # Remove ID column if present + common_snps <- setdiff(common_snps, "ID") + + # Convert to matrices imputed_matrix <- as.matrix(imputed_genos[, common_snps]) reference_matrix <- as.matrix(reference_genos[, common_snps]) - # Identify valid SNPs that are not missing in either dataset + # Identify valid SNPs if (!is.null(missing_code)) { - valid_snps <- (imputed_matrix != missing_code) & (reference_matrix != missing_code) + valid_snps <- (imputed_matrix != missing_code) & + (reference_matrix != missing_code) } else { - valid_snps <- matrix(TRUE, nrow = nrow(imputed_matrix), ncol = ncol(imputed_matrix)) + valid_snps <- matrix(TRUE, + nrow = nrow(imputed_matrix), + ncol = ncol(imputed_matrix)) } - # Compute concordance (row-wise percentage of matching SNPs) + # Compute concordance matches <- (imputed_matrix == reference_matrix) & valid_snps - percentage_match <- rowSums(matches, na.rm = TRUE) / rowSums(valid_snps, na.rm = TRUE) + percentage_match <- rowSums(matches, na.rm = TRUE) / + rowSums(valid_snps, na.rm = TRUE) + + percentage_match[is.nan(percentage_match)] <- NA - # Create output data frame + # Output data frame (original structure preserved) result_df <- data.frame( - ID = common_ids, + ID = imputed_genos$ID, Concordance = paste0(round(percentage_match * 100, 2), "%") ) - # Print mean concordance + # Summary statistics summary_concordance <- summary(percentage_match, na.rm = TRUE) * 100 names(summary_concordance) <- c("Min", "1st Qu.", "Median", "Mean", "3rd Qu.", "Max") @@ -94,6 +126,35 @@ imputation_concordance <- function(reference_genos, } } - return(result_df) -} + # Print results to console (NEW OPTION) + if (print_result) { + print(result_df) + } + + # Optional plot + if (plot) { + + plot_df <- data.frame( + ID = imputed_genos$ID, + Concordance = percentage_match * 100 + ) + + concordance_plot <- ggplot(plot_df, + aes(x = reorder(ID, Concordance), + y = Concordance)) + + geom_bar(stat = "identity") + + labs(title = "Imputation Concordance by Sample", + x = "Sample ID", + y = "Concordance (%)") + + theme_minimal() + + theme(axis.text.x = element_text(angle = 90, hjust = 1)) + print(concordance_plot) + } + + if (print_result) { + return(result_df) + } else { + invisible(result_df) + } +} From 6b81982330c37127c9c0c6125cc62f6063d5fad3 Mon Sep 17 00:00:00 2001 From: "josue.chinchilla" Date: Tue, 3 Mar 2026 09:05:31 -0500 Subject: [PATCH 09/80] ignore DS_STore --- .DS_Store | Bin 8196 -> 8196 bytes .gitignore | 1 + 2 files changed, 1 insertion(+) diff --git a/.DS_Store b/.DS_Store index 3b9d8426aa5a73e270c1816e6de3b764b9deb853..55d74ff50dbdf39668d23a3bcd564e8fac996bea 100644 GIT binary patch literal 8196 zcmeHMTWl0n7(U;$z!^HwDHIswvWr!Tu)+eRf`DvqXc1`BZRrJM*~XSj@EgECgS0yM9MvbUZ5`&@-J}6!=#s{Oucz-ZH`QU%%%o5tdlg5b7N#^|L|Ih!Q zIp;gyKeK0+F@}~>VHINrV~o=!q+UbSMT+Pb*QBC^mTHoqc;@D)buu?>Tb@73J2FHF zLaI(ikzf>s4q?F+d;o$?}oe`c;5Zs-{j|c7y zDJi2mLLfq5G6M4K-klrqhI96X=l5jt6DU>HQ>ISi(|8TPHND>*N&8v9kcR4qw7~E6Y<71O@|Mwd``p44c(ao_K@Qqk+Vnm1Yk2JtmXmI!Mbg)+$xt+ zwN`$LNG3n8EJdAOqs=gSdUy6|0~vi*ndaO%>|3tm?KF!X$&6m>(b_uluG3XCMSox3 zwM#>em(lC;Svy~{v%cAE+iB|#lWetKzUbO^m+M(#n1O!ZJmibMNnQPeo?G-=gTXSo zUh>Rtlg6^jW*d#@^;>jH5m-NASx`^RoVRev)yq~kuHV$MW$TW!wR+twtzMhkSGGJW zXPdnvR>9vtkoEGWQ?Q(Ug9NVQTen$eQLRHl?Qyb0W)Nk}Hx_Wcf%lhk*1?kH-!8{m zY{XUlWxl&W6WQ(-L}WqwFE*~?e3!7d7EP+9SzBsctLjd|`I1e_D>>{MSVEJy?x!Bmw`G6R#xlo{SX6bdcon3}O z+h93mi&#xtC&{gU$T@GQY(Yb6nQS(kvQGuA086zYl?+`tTf|ngHnxrJWkoj1j__%H00mPp6*Z{G9L$4(C0K^#xDG3@0c~i<7Ib45c4H4RC}0Q< zhH(&O+>J5Zi~Ddt9>Jq{43Fa(Jd5XW3@_q!yn#3I7CykI_zdUpHGaa+_(g#-U8zwt zWv&uaRx6FlTBS+3UP&n(O1ILd^vjtml{6X*R}u;(OAVPu;S)hQaq%>Y-E?!uR&Cp5 zlSb!eio}^+KWFZ|xX7k;o0`wlPO`K@%nJ!0aC}^51RM`|Fp;0e{3{kLT%^T%DGAI& z6VtMiuKF~Ql>DslUUH=t)2C3HNcEbfq)w$2k?Qh9ET$`znNm%zifMWk<((SDu1%1N zP+5P2rs>s`fwJ$$MWnJ&X=$ZlL_Ym(%s$7yV&Ab}*&oF0nV3y{UWz1|u#tGY16@d? z2ff7Qy~rUS@Yy9c7vbSf+(nGOhd6x%NAVyY!ozqHPvPl+*T--i&*KHWf>&`8uiBY^6*bXyB8u=0~&;#$R7J5JYLxGPV45=L_R}qs6<(QOY Zq4JM^2#DVQ0`|Z5%Qkxd3k)yo@+aYeOHcp+ literal 8196 zcmeHMU2GLa6rOKeV3!TB)B*)rduxSaxrVktrJ``TZ6iM|f48MS{M5a77rJ4)Tkh_? zr8d?2qyaVYMdSY)Q6EsFDEjJuVtgP-l$eMy@rf6W;Dd?r%+6k*Eq&Aw*hyy2oHOUl z^ql?9%$+G?4C#W|$XFF)Os2}Irc!g0!pC`?R-{N$P84L%{Nb$QWr;hR9iFBg3*rdG z5r`uYM<9+s9D!RQ0(55cBF}K{i_y4^BM?X6j*I}`9}-kKO+|D<&~WRZCcFX=O19Ce0V?}Tsw(I&v=O>yYtb#ENM*eYHl z$}-tu;myU8H_BX9ou~Ko_Vw>SFraA#-`ZLfdX=Gu}aB>AwwzfD<#(rFA=klK8?Dl*+usyf0H?WQee2h;tX{fi~_wqq&I9abv z^9!vCzQ(R$ZtU-~@;*^TIX!<#^?j=zXnOGBRQtBhJ?9r_3oF!0b#YJ8_U)`=b&uF) zFxY4KIm5REEQFQnW?AXYc|$|wj4xzqy-n7>mJjiTI}AVA zHEg-1KPjzmmF0ea(8yc6MhJ2xiw;>1KO@PxNxNwWUOtS*&Kh0s<5O)hP0J*wmsKsT z&@+0steqnp&3216ftYNCHR-399fU6aRu`tH9sRqGUmc$X|hnmhBSek8T zhuAP1WhdE->`nF|JHtL=U$F1k1@;sBjs4F4U{}~x_BYBg7nN9!6{y21+>a)##dirPV^%O6NhmG4hksZI7aaVp2jmcfs=Rvui#a@jyG@$@8UhYk5BO#KF62%3K#G* zF5wqk#uZ$}Uy>qKNXsQ%x<{&$R!Q~JYH5?SS=uV?k@}?pX;63|l@jgXWY45g;-hVY z5-oc6lTP0}(W)Qax^4T8j$1a-ew@!!>#oX0iHDhe5O>?=J2$Y&vWgXd(~=f7A1^eSJn|bo03JatCQ8$nnXD& z*oMYxRV$+mR6^VJNn#N(Ef1-xR!+Gn#y081vWQ8y(lk7?`G+2X={FCXD}s zGRRnh1XdzRnBIgG;kpg&gzMediwwGO5CbrfC6qgGF^pr#V+`YX5)*ieF#ar_!}E9v zFB8sRBcz|gTX-Aq-~*h&$2f~mLd1WI?{N{oPQh^JWDMia>liL(@~-2#M@Uja{AA_Z zk}DHlRq^|O`^@kEx25l~Q*i|12#5%vye-p~rv6NOAHQoSs6IxO7p^xUXlO#sq=x|F kul>W2`Uz6yHmQhC2uczv|NBG0^dH*s`5&MEcij2^FAa5Kvj6}9 diff --git a/.gitignore b/.gitignore index b64a99f..d3ffaad 100644 --- a/.gitignore +++ b/.gitignore @@ -3,3 +3,4 @@ .RData .Ruserdata revdep/ +.DS_Store From 8205e4ec01d2de3ca53eb909e24e70cccc07e032 Mon Sep 17 00:00:00 2001 From: "josue.chinchilla" Date: Tue, 3 Mar 2026 10:24:31 -0500 Subject: [PATCH 10/80] added option to print pre-filtering depth and genotyping rate --- R/filterVCF.R | 292 ++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 247 insertions(+), 45 deletions(-) diff --git a/R/filterVCF.R b/R/filterVCF.R index a54e32e..6170697 100644 --- a/R/filterVCF.R +++ b/R/filterVCF.R @@ -26,42 +26,84 @@ #' @examples #' ## Use file paths for each file on the local system #' -#' #Temp location (only for example) -#' output_file <- tempfile() #' -#' filterVCF(vcf.file = system.file("iris_DArT_VCF.vcf.gz", package = "BIGr"), -#' filter.OD = 0.5, -#' filter.MAF = 0.05, -#' ploidy = 2, -#' output.file = output_file) -#' -#' # Removing the output for the example -#' rm(output_file) +#' #filterVCF(vcf.file = "example_dart_Dosage_Report.csv", +#' # filter.OD = 0.5, +#' # ploidy = 2, +#' # output.file = "name_for_vcf") #' #' ##The function will output the filtered VCF to the current working directory #' #' @export filterVCF <- function(vcf.file, - filter.OD = NULL, - filter.BIAS.min = NULL, - filter.BIAS.max = NULL, - filter.DP = NULL, - filter.MPP = NULL, - filter.PMC = NULL, - filter.MAF = NULL, - filter.SAMPLE.miss = NULL, - filter.SNP.miss = NULL, - ploidy, - output.file = NULL) { + quality.rates = F, + filter.OD = NULL, + filter.BIAS.min = NULL, + filter.BIAS.max = NULL, + filter.DP = NULL, + filter.MPP = NULL, + filter.PMC = NULL, + filter.MAF = NULL, + filter.SAMPLE.miss = NULL, + filter.SNP.miss = NULL, + ploidy, + output.file = NULL) { #Should allow for any INFO field to be entered to be filtered - # Import VCF (can be .vcf or .vcf.gz) - if (!inherits(vcf.file, "vcfR")) { - vcf <- read.vcfR(vcf.file, verbose = FALSE) + + + # Read VCF (can be .vcf or .vcf.gz) + + if (class(vcf.file) != "vcfR") { + vcf <- read.vcfR(vcf.file) } else { vcf <- vcf.file - #rm(vcf.file) + } + + # Keep original VCF for pre‑filter statistics + vcf_orig <- vcf + + + # pre‑filtering quality rates + + if (quality.rates) { + gt_orig <- extract.gt(vcf_orig, element = "GT", as.numeric = FALSE) + + dfmt <- strsplit(vcf_orig@gt[1, "FORMAT"], ":")[[1]] + if ("DP" %in% dfmt) { + dp_orig <- extract.gt(vcf_orig, element = "DP", as.numeric = TRUE) + } else { + dp_orig <- matrix(NA_real_, nrow = nrow(gt_orig), ncol = ncol(gt_orig), + dimnames = dimnames(gt_orig)) + } + + # Per‑marker + mean_depth_marker <- rowMeans(dp_orig, na.rm = TRUE) + genotype_present <- !is.na(gt_orig) + genotyping_rate_marker <- rowMeans(genotype_present) + markers_df <- data.frame( + marker = vcf_orig@fix[, "ID"], + mean_depth = round(mean_depth_marker,2), + genotyping_rate = round(genotyping_rate_marker,2), + stringsAsFactors = FALSE + ) + + # Per‑sample + mean_depth_sample <- colMeans(dp_orig, na.rm = TRUE) + genotyping_rate_sample <- colMeans(genotype_present) + samples_df <- data.frame( + sample = colnames(gt_orig), + mean_depth = round(mean_depth_sample,2), + genotyping_rate = round(genotyping_rate_sample,2), + stringsAsFactors = FALSE + ) + + base_name <- if (!is.null(output.file)) output.file else "pre_filter" + write.csv(markers_df, paste0(base_name, "_marker_stats.csv"), + row.names = FALSE, quote = FALSE) + write.csv(samples_df, paste0(base_name, "_sample_stats.csv"), + row.names = FALSE, quote = FALSE) } #Update header based on user filtering parameters @@ -102,7 +144,7 @@ filterVCF <- function(vcf.file, # Extract the DP values if ("DP" %in% format_fields && !is.null(filter.DP)) { - message("Filtering by DP\n") + cat("Filtering by DP\n") dp <- extract.gt(vcf, element = "DP", as.numeric = TRUE) # Identify cells to modify based on the DP threshold threshold <- as.numeric(filter.DP) @@ -116,7 +158,7 @@ filterVCF <- function(vcf.file, #Filter if the MPP field is present if ("MPP" %in% format_fields && !is.null(filter.MPP)) { - message("Filtering by MPP\n") + cat("Filtering by MPP\n") # Extract the MPP values mpp <- extract.gt(vcf, element = "MPP", as.numeric = TRUE) # Identify cells to modify based on the DP threshold @@ -156,13 +198,13 @@ filterVCF <- function(vcf.file, # Filtering by OD if ("OD" %in% info_ids && !is.null(filter.OD)) { info <- vcf@fix[, "INFO"] #Need to get after each filter.. - message("Filtering by OD\n") + cat("Filtering by OD\n") od_values <- extract_info_value(info, "OD") # Ensure no NA values before filtering if (!all(is.na(od_values))) { vcf <- vcf[od_values < as.numeric(filter.OD), ] } else { - warning("No valid OD values found.\n") + cat("No valid OD values found.\n") } } @@ -171,26 +213,26 @@ filterVCF <- function(vcf.file, # Filtering by BIAS if ("BIAS" %in% info_ids && !is.null(filter.BIAS.min) && !is.null(filter.BIAS.max)) { info <- vcf@fix[, "INFO"] #Need to get after each filter.. - message("Filtering by BIAS\n") + cat("Filtering by BIAS\n") bias_values <- extract_info_value(info, "BIAS") # Ensure no NA values before filtering if (!all(is.na(bias_values))) { vcf <- vcf[bias_values > as.numeric(filter.BIAS.min) & bias_values < as.numeric(filter.BIAS.max), ] } else { - warning("No valid BIAS values found.\n") + cat("No valid BIAS values found.\n") } } # Filtering by PMC if ("PMC" %in% info_ids && !is.null(filter.PMC)) { info <- vcf@fix[, "INFO"] #Need to get after each filter.. - message("Filtering by PMC\n") + cat("Filtering by PMC\n") pmc_values <- extract_info_value(info, "PMC") # Ensure no NA values before filtering if (!all(is.na(pmc_values))) { vcf <- vcf[pmc_values < as.numeric(filter.PMC), ] } else { - warning("No valid PMC values found.\n") + cat("No valid PMC values found.\n") } } @@ -200,14 +242,14 @@ filterVCF <- function(vcf.file, gt_matrix <- extract.gt(vcf, element = "GT", as.numeric = FALSE)#as.matrix(vcfR2genlight(vcf)) if (!is.null(filter.SNP.miss)) { - message("Filtering by SNP missing data\n") + cat("Filtering by SNP missing data\n") snp_missing_data <- rowMeans(is.na(gt_matrix)) vcf <- vcf[snp_missing_data < as.numeric(filter.SNP.miss), ] gt_matrix <- extract.gt(vcf, element = "GT", as.numeric = FALSE) } if (!is.null(filter.SAMPLE.miss)) { - message("Filtering by Sample missing data\n") + cat("Filtering by Sample missing data\n") # Calculate the proportion of missing data for each sample sample_missing_data <- colMeans(is.na(gt_matrix)) # Identify samples to keep based on the missing data threshold @@ -222,30 +264,112 @@ filterVCF <- function(vcf.file, rm(gt_matrix) } + ##Convert GT to dosage + #gt_matrix <- extract.gt(vcf, element = "GT", as.numeric = FALSE)#as.matrix(vcfR2genlight(vcf)) + + # Function to determine the ploidy level from a genotype string + #determine_ploidy <- function(gt) { + # if (is.na(gt)) { + # return(NA) + # } + # return(length(strsplit(gt, "[|/]")[[1]])) + #} + + # Function to find a non-NA example genotype to determine ploidy + #find_example_gt <- function(matrix) { + # for (i in seq_len(nrow(matrix))) { + # for (j in seq_len(ncol(matrix))) { + # if (!is.na(matrix[i, j])) { + # return(matrix[i, j]) + # } + # } + # } + # return(NA) # Return NA if no non-NA genotype is found + #} + + # Find a non-NA example genotype + #example_gt <- find_example_gt(gt_matrix) + + # Determine the ploidy level + #if (!is.na(example_gt)) { + # ploidy <- determine_ploidy(example_gt) + #} else { + # stop("No non-NA genotype found to determine ploidy.") + #} + + # Generate lookup table for genotypes to dosage conversion + #generate_lookup_table <- function(ploidy) { + # possible_alleles <- 0:ploidy + # genotypes <- expand.grid(rep(list(possible_alleles), ploidy)) + # genotypes <- apply(genotypes, 1, function(x) paste(x, collapse = "/")) + # dosage_values <- rowSums(expand.grid(rep(list(possible_alleles), ploidy))) + # lookup_table <- setNames(dosage_values, genotypes) + # return(lookup_table) + #} + + # Generate the lookup table + #lookup_table <- generate_lookup_table(ploidy) + + # Function to convert genotype to dosage using the lookup table + #genotype_to_dosage <- function(gt, lookup_table) { + # if (is.na(gt)) { + # return(NA) + # } + # return(lookup_table[[gt]]) + #} + + # Function to convert genotype matrix to dosage matrix using vectorized operations + #convert_genotypes_to_dosage <- function(gt_matrix, lookup_table) { + # unique_gts <- unique(gt_matrix) + # gt_to_dosage <- setNames(rep(NA, length(unique_gts)), unique_gts) + # valid_gts <- unique_gts[unique_gts %in% names(lookup_table)] + # gt_to_dosage[valid_gts] <- lookup_table[valid_gts] + # dosage_matrix <- gt_to_dosage[gt_matrix] + #colnames(dosage_matrix) <- colnames(gt_matrix) + #row.names(dosage_matrix) <- row.names(gt_matrix) + # return(matrix(as.numeric(dosage_matrix), nrow = nrow(gt_matrix), ncol = ncol(gt_matrix))) + #} + + # Convert the genotype matrix to dosage matrix + #dosage_matrix <- convert_genotypes_to_dosage(gt_matrix, lookup_table) + ##MAF filter + #Compare my lengthy process to estimate MAF with vcfR::maf() function + #The BIGr::calculate_MAF(dosage_matrix, ploidy) is the exact same as the vcfR::maf() calculations + #The step where I extract UD and calculate MAF is different... + #if ("UD" %in% format_fields) { + # maf_df <- BIGr::calculate_MAF(extract.gt(vcf, element = "UD", as.numeric = TRUE), ploidy = ploidy) + #} else { + #convert genotypes to dosage and filter + # maf_df <- BIGr::calculate_MAF(dosage_matrix, ploidy) + #} + #Need to confirm that vcfR::maf will work with any ploidy...if not, use my code if (!is.null(filter.MAF)) { - message("Filtering by MAF\n") + cat("Filtering by MAF\n") maf_df <- data.frame(vcfR::maf(vcf, element = 2)) vcf <- vcf[maf_df$Frequency > as.numeric(filter.MAF), ] } ### Export the modified VCF file (this exports as a .vcf.gz, so make sure to have the name end in .vcf.gz) - message("Exporting VCF\n") - if (!inherits(vcf.file, "vcfR")) { - if (!is.null(output.file)) { - output_name <- paste0(output.file, ".vcf.gz") + cat("Exporting VCF\n") + if (!class(vcf.file) == "vcfR"){ + if (!is.null(output.file)){ + output_name <- paste0(output.file,".vcf.gz") vcfR::write.vcf(vcf, file = output_name) - } else { + }else{ return(vcf) } - } else { - if (!is.null(output.file)) { - output_name <- paste0(output.file, "_filtered.vcf.gz") + }else{ + if (!is.null(output.file)){ + output_name <- paste0(output.file,"_filtered.vcf.gz") vcfR::write.vcf(vcf, file = output_name) - } else { + }else{ return(vcf) } } + #Message that includes the output vcf stats + print(vcf) + #Message samples_removed <- starting_samples - (ncol(vcf@gt)-1) SNPs_removed <- starting_snps - nrow(vcf) @@ -253,3 +377,81 @@ filterVCF <- function(vcf.file, message("SNPs removed due to filtering: ",SNPs_removed) message("Complete!") } +#This is not reliable, so no longer use this shortcut to get dosage matrix +#test2 <- vcfR2genlight(vcf) + + +#####Testing custom VCF reading function###### +# Open the gzipped VCF file +#con <- gzfile("/Users/ams866/Desktop/output.vcf", "rt") + +# Read in the entire file +#lines <- readLines(con) +#close(con) +# Read in the entire file +#lines <- readLines("/Users/ams866/Desktop/output.vcf") +# Filter out lines that start with ## +#filtered_lines <- lines[!grepl("^##", lines)] +# Create a temporary file to write the filtered lines +#temp_file <- tempfile() +#writeLines(filtered_lines, temp_file) +# Read in the filtered data using read.table or read.csv +#vcf_data <- read.table(temp_file, header = TRUE, sep = "\t", comment.char = "", check.names = FALSE) +# Clean up the temporary file +#unlink(temp_file) + +##Extract INFO column and Filter SNPs by those values +#Update the filtering options by the items present in the INFO column? + +# Load required library +#library(dplyr) + +# Split INFO column into key-value pairs +#vcf_data_parsed <- vcf_data %>% +# mutate(INFO_PARSED = strsplit(INFO, ";")) %>% +# unnest(INFO_PARSED) %>% +# separate(INFO_PARSED, into = c("KEY", "VALUE"), sep = "=") %>% +# spread(KEY, VALUE) + +#Filter by DP +#filtered_vcf_data <- vcf_data_parsed %>% +# filter(as.numeric(DP) > 10) + +# View the filtered dataframe +#print(filtered_vcf_data) + +##Extracting and filtering by FORMAT column +# Identify the columns that are not sample columns +#non_sample_cols <- c("#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO", "FORMAT") +# Identify the sample columns +#sample_cols <- setdiff(names(vcf_data), non_sample_cols) +# Extract FORMAT keys +#format_keys <- strsplit(as.character(vcf_data$FORMAT[1]), ":")[[1]] +# Split SAMPLE columns based on FORMAT +#vcf_data_samples <- vcf_data %>% +# mutate(across(all_of(sample_cols), ~strsplit(as.character(.), ":"))) %>% +# mutate(across(all_of(sample_cols), ~map(., ~setNames(as.list(.), format_keys)))) %>% +# unnest_wider(all_of(sample_cols), names_sep = "_") + +# View the parsed dataframe +#print(head(vcf_data_samples)) + +# Create separate dataframes for each FORMAT variable +#format_dfs <- lapply(format_keys, function(format_key) { +# vcf_data_samples %>% +# select(ID, ends_with(paste0("_", format_key))) %>% +# column_to_rownames("ID") +#}) + +# Assign names to the list elements +#names(format_dfs) <- format_keys + +# Access the separate dataframes +#gt_df <- format_dfs$GT # Genotype dataframe +#ad_df <- format_dfs$AD # Allelic depths dataframe + +#*I think the above method is okay if you only need to filter at the INFO level, +#*But I think if you want to filter for FORMAT, that vcfR is probably best, +#*Will need to explore further if I can easily just filter for MPP by checking if it is above a +#*threshold, and then converting the GT and UD values to NA if so... +#*If that is efficient and works, then I will just use this custom VCF method... From 31248e3d443c9196533a0ef4f71b5201e3965784 Mon Sep 17 00:00:00 2001 From: "josue.chinchilla" Date: Wed, 4 Mar 2026 15:19:19 -0500 Subject: [PATCH 11/80] added calculation for Ho --- .DS_Store | Bin 8196 -> 8196 bytes R/filterVCF.R | 44 +++++++++++++++++++++++++++++++++++++------- 2 files changed, 37 insertions(+), 7 deletions(-) diff --git a/.DS_Store b/.DS_Store index 55d74ff50dbdf39668d23a3bcd564e8fac996bea..dd2adc3630aaafc6fd7700efa91c96db437ecab5 100644 GIT binary patch delta 239 zcmZp1XmOYj%eZ4>+9h2n*3b`e!t4%G9bQDYs3?|POkY?*xy3`Ptt3_c7A3`Gpp48069 z7#2=W5T47JF!`UbFOxSmL!OA35n+gxSTp12&HKbW88@>@d}rDGPeg_UUG9tk^@%=r delta 270 zcmZp1XmOYj%eZr6+os;DR3V9@}tBnoJOmq~?Oe`kP5|C!>p1e&!p0Q{0H33~F zp2*2QBGoeS0s^JQsX?hZi6xn3sV<2nsl^eQ$@zJ~nN_Kq=LjxlRNz#0FkpZH4hC5U zBL)`+ABF^mB8F;)UWOSAt0%_`&t;65{8!kQiN}YqnU6)xh%nPqteNrk=DlK`jGNgd PzOiinBO*(JE;mL1vIt69 diff --git a/R/filterVCF.R b/R/filterVCF.R index 6170697..550468c 100644 --- a/R/filterVCF.R +++ b/R/filterVCF.R @@ -68,37 +68,67 @@ filterVCF <- function(vcf.file, # pre‑filtering quality rates if (quality.rates) { + ## Extract genotypes, depth and DP matrix gt_orig <- extract.gt(vcf_orig, element = "GT", as.numeric = FALSE) dfmt <- strsplit(vcf_orig@gt[1, "FORMAT"], ":")[[1]] if ("DP" %in% dfmt) { dp_orig <- extract.gt(vcf_orig, element = "DP", as.numeric = TRUE) } else { - dp_orig <- matrix(NA_real_, nrow = nrow(gt_orig), ncol = ncol(gt_orig), + dp_orig <- matrix(NA_real_, + nrow = nrow(gt_orig), ncol = ncol(gt_orig), dimnames = dimnames(gt_orig)) } - # Per‑marker + + # 1. Observed heterozygosity (per‑marker & per‑sample) + + # Helper: TRUE if a genotype is heterozygous (any two different + # alleles, excluding missing "./.") + is_het <- function(g) { + if (is.na(g) || g == "./.") return(FALSE) + alleles <- strsplit(g, split = "[/|]")[[1]] + return(length(unique(alleles)) > 1) + } + #matrix of heterozygous calls + het_mat <- apply(gt_orig, c(1, 2), is_het) + + #Observed heterozygosity per marker and per sample + obs_het_marker <- rowMeans(het_mat, na.rm = TRUE) + obs_het_sample <- colMeans(het_mat, na.rm = TRUE) + + + #Per‑marker stats + mean_depth_marker <- rowMeans(dp_orig, na.rm = TRUE) genotype_present <- !is.na(gt_orig) genotyping_rate_marker <- rowMeans(genotype_present) + markers_df <- data.frame( marker = vcf_orig@fix[, "ID"], - mean_depth = round(mean_depth_marker,2), - genotyping_rate = round(genotyping_rate_marker,2), + mean_depth = round(mean_depth_marker, 2), + genotyping_rate = round(genotyping_rate_marker, 2), + obs_het = round(obs_het_marker, 2), stringsAsFactors = FALSE ) - # Per‑sample + + #Per‑sample stats + mean_depth_sample <- colMeans(dp_orig, na.rm = TRUE) genotyping_rate_sample <- colMeans(genotype_present) + samples_df <- data.frame( sample = colnames(gt_orig), - mean_depth = round(mean_depth_sample,2), - genotyping_rate = round(genotyping_rate_sample,2), + mean_depth = round(mean_depth_sample, 2), + genotyping_rate = round(genotyping_rate_sample, 2), + obs_het = round(obs_het_sample, 2), stringsAsFactors = FALSE ) + + #Write CSV + base_name <- if (!is.null(output.file)) output.file else "pre_filter" write.csv(markers_df, paste0(base_name, "_marker_stats.csv"), row.names = FALSE, quote = FALSE) From 757b01c69e989096c00c81144bf74704db5de413 Mon Sep 17 00:00:00 2001 From: Cristianetaniguti Date: Fri, 13 Mar 2026 10:11:38 -0400 Subject: [PATCH 12/80] up version --- DESCRIPTION | 6 +++--- NEWS.md | 7 +++++++ R/madc2vcf_all.R | 20 ++++++++++++++++++++ 3 files changed, 30 insertions(+), 3 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 31ad1e1..9a18709 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: BIGr Title: Breeding Insight Genomics Functions for Polyploid and Diploid Species -Version: 0.6.2 +Version: 0.6.3 Authors@R: c(person(given='Alexander M.', family='Sandercock', email='sandercock.alex@gmail.com', @@ -23,7 +23,7 @@ Authors@R: c(person(given='Alexander M.', person(given='Dongyan', family='Zhao', role='ctb'), - person('Cornell', 'University', + person('University', "of Florida", role=c('cph'), comment = "Breeding Insight")) Maintainer: Alexander M. Sandercock @@ -44,7 +44,7 @@ URL: https://github.com/Breeding-Insight/BIGr BugReports: https://github.com/Breeding-Insight/BIGr/issues Encoding: UTF-8 Roxygen: list(markdown = TRUE) -RoxygenNote: 7.3.2 +RoxygenNote: 7.3.3 Depends: R (>= 4.4.0) biocViews: Imports: diff --git a/NEWS.md b/NEWS.md index b089e67..1a0d64d 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,10 @@ +# BIGr 0.6.3 + +- New function to check MADC file: check_madc_sanity. By now, it checks presence of required columns, if fixed allele IDs were assigned, presence of IUPAC codes, lower case sequences bases, indels, and chromosome and position information. +- Add new arguments markers_info for users to add CSV file with marker information such as CHROM, POS, Marker Type and position of indels. For BI species, these information is at https://github.com/Breeding-Insight/BIGapp-PanelHub +- Check inputs for madc2vcf_all +- Update affiliation on DESCRIPTION + # BIGr 0.6.2 - Fixed the doi and name list in the CITATION file diff --git a/R/madc2vcf_all.R b/R/madc2vcf_all.R index 125d540..1413c8e 100644 --- a/R/madc2vcf_all.R +++ b/R/madc2vcf_all.R @@ -62,6 +62,26 @@ madc2vcf_all <- function(madc = NULL, out_vcf = NULL, verbose = TRUE){ + # Input checks + if(!is.null(madc) & !file.exists(madc)) stop("MADC file not found. Please provide a valid path.") + if(!is.null(botloci_file) & !file.exists(botloci_file)) stop("Botloci file not found. Please provide a valid path.") + if(!is.null(hap_seq_file) & !file.exists(hap_seq_file)) stop("Haplotype sequence file not found. Please provide a valid path.") + + ## n.cores as integer + if(!is.numeric(n.cores) | n.cores < 1) stop("n.cores should be a positive integer.") + + ## alignment_score_thr, multiallelic_SNP_dp_thr, multiallelic_SNP_sample_thr as numeric + if(!is.numeric(alignment_score_thr)) stop("alignment_score_thr should be numeric.") + if(!is.numeric(multiallelic_SNP_dp_thr)) stop("multiallelic_SNP_dp_thr should be numeric.") + if(!is.numeric(multiallelic_SNP_sample_thr)) stop("multiallelic_SNP_sample_thr should be numeric.") + + ## out_vcf as string + if(!is.null(out_vcf) & !is.character(out_vcf)) stop("out_vcf should be a string specifying the output file name.") + + ## rm_multiallelic_SNP and verbose as logical + if(!is.logical(rm_multiallelic_SNP)) stop("rm_multiallelic_SNP should be logical.") + if(!is.logical(verbose)) stop("verbose should be logical.") + bigr_meta <- paste0('##BIGrCommandLine.madc2vcf_all=%") +importFrom(dplyr,across) +importFrom(dplyr,group_by) +importFrom(dplyr,mutate) +importFrom(dplyr,select) +importFrom(dplyr,summarise) +importFrom(dplyr,where) importFrom(pwalign,nucleotideSubstitutionMatrix) importFrom(pwalign,pairwiseAlignment) importFrom(readr,read_csv) diff --git a/NEWS.md b/NEWS.md index 1a0d64d..928265a 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,11 @@ +# BIGr 0.6.4 + +- Add function `vmsg` to organize messages printed on the console +- Add metadata to VCF header from madc2vcf_targets +- Add argument `madc_object` to `get_countsMADC` to avoid reading the MADC file twice and to get directly the MADC fixed padding output from `check_botloci` +- Organize messages from `madc2vcf_targets` checks +- Add argument `collapse_matches_counts` and `verbose` to `madc2vcf_targets` function + # BIGr 0.6.3 - New function to check MADC file: check_madc_sanity. By now, it checks presence of required columns, if fixed allele IDs were assigned, presence of IUPAC codes, lower case sequences bases, indels, and chromosome and position information. diff --git a/R/check_madc_sanity.R b/R/check_madc_sanity.R index 19d3ff6..1455606 100644 --- a/R/check_madc_sanity.R +++ b/R/check_madc_sanity.R @@ -88,18 +88,18 @@ check_madc_sanity <- function(report) { } else indels <- NULL - messages[["Columns"]] <- c("Required columns are present\n", - "One or more required columns missing. Verify if your file has columns: CloneID, AlleleID, AlleleSequence\n") - messages[["FixAlleleIDs"]] <- c("Fixed Allele IDs look good\n", - "MADC not processed by BI. Please contact us to assign allele IDs to your MADC according to the specie haplotype dabatase. This guarantee reproducibility between diferent datasets\n") - messages[["IUPACcodes"]] <- c("IUPAC (non-ATCG) codes found in AlleleSequence. This codes are not currently supported\n", - "No IUPAC (non-ATCG) codes found in AlleleSequence\n") - messages[["LowerCase"]] <- c("Lowercase bases found in AlleleSequence\n", - "No lowercase bases found in AlleleSequence\n") + messages[["Columns"]] <- c("Required columns are present", + "One or more required columns missing. Verify if your file has columns: CloneID, AlleleID, AlleleSequence") + messages[["FixAlleleIDs"]] <- c("Fixed Allele IDs look good", + "MADC not processed by BI. Please contact us to assign allele IDs to your MADC according to the specie haplotype dabatase. This guarantee reproducibility between diferent datasets") + messages[["IUPACcodes"]] <- c("IUPAC (non-ATCG) codes found in AlleleSequence. This codes are not currently supported", + "No IUPAC (non-ATCG) codes found in AlleleSequence") + messages[["LowerCase"]] <- c("Lowercase bases found in AlleleSequence", + "No lowercase bases found in AlleleSequence") messages[["Indels"]] <- c(paste("Indels found (ref/alt lengths differ) for the CloneIDs:",paste(indels, collapse = " ")), - "No indels found (ref/alt lengths match) for all CloneIDs\n") - messages[["ChromPos"]] <- c("Chromosome and Position format in CloneID look good\n", - "CloneID does not have the expected Chromosome_Position format. Please check your CloneIDs or provide a file with this information\n") + "No indels found (ref/alt lengths match) for all CloneIDs") + messages[["ChromPos"]] <- c("Chromosome and Position format in CloneID look good", + "CloneID does not have the expected Chromosome_Position format. Please check your CloneIDs or provide a file with this information") list(checks = checks, messages = messages, indel_clone_ids = indels) } diff --git a/R/get_countsMADC.R b/R/get_countsMADC.R index 8be1cda..2123be1 100644 --- a/R/get_countsMADC.R +++ b/R/get_countsMADC.R @@ -1,13 +1,38 @@ #' Obtain Read Counts from MADC File #' -#' This function takes the MADC file as input and retrieves the ref and alt counts for each sample, -#' and converts them to ref, alt, and size(total count) matrices for dosage calling tools. At the moment, -#' only the read counts for the Ref and Alt target loci are obtained while the additional loci are ignored. +#' Reads a DArTag MADC report and returns reference and total read count matrices +#' per marker and sample. Only `Ref` and `Alt` target loci are retained; +#' `|AltMatch` / `|RefMatch` rows are either discarded or collapsed depending on +#' `collapse_matches_counts`. #' +#' @details +#' Either `madc_file` or `madc_object` must be provided (not both `NULL`). +#' When `madc_object` is supplied it is passed directly to `get_counts()`, +#' skipping file I/O. The function constructs: +#' - `ref_matrix` — per-sample reference allele counts. +#' - `size_matrix` — per-sample total counts (ref + alt). +#' +#' Markers whose `CloneID` appears only in the `Ref` or only in the `Alt` rows +#' are removed with a warning. A summary of the proportion of zero-count +#' data points (missing data) is reported via `vmsg()`. +#' +#' @param madc_file character or `NULL`. Path to the input MADC CSV file. +#' At least one of `madc_file` or `madc_object` must be provided. +#' @param madc_object data frame or `NULL`. A pre-read MADC data frame +#' (e.g., as returned by `check_botloci()`). When supplied, file reading is +#' skipped. At least one of `madc_file` or `madc_object` must be provided. +#' @param collapse_matches_counts logical. If `TRUE`, counts for `|AltMatch` +#' and `|RefMatch` rows are summed into their corresponding `|Ref` and `|Alt` +#' rows. If `FALSE` (default), `|AltMatch` and `|RefMatch` rows are discarded. +#' @param verbose logical. Whether to print progress messages. Default is `TRUE`. +#' +#' @return A named list with two numeric matrices, both with markers as rows +#' and samples as columns: +#' \describe{ +#' \item{`ref_matrix`}{Reference allele read counts.} +#' \item{`size_matrix`}{Total read counts (reference + alternative).} +#' } #' -#' @param madc_file Path to MADC file -#' @import dplyr -#' @return A list of read count matrices for reference, alternate, and total read count values #' @examples #' # Get the path to the MADC file #' madc_path <- system.file("iris_DArT_MADC.csv", package = "BIGr") @@ -15,17 +40,35 @@ #' # Extract the read count matrices #' counts_matrices <- get_countsMADC(madc_path) #' -#' # Access the reference, alternate, and size matrices -#' -#' # ref_matrix <- counts_matrices$ref_matrix -#' # alt_matrix <- counts_matrices$alt_matrix +#' # Access the reference and size matrices +#' # ref_matrix <- counts_matrices$ref_matrix #' # size_matrix <- counts_matrices$size_matrix #' #' rm(counts_matrices) +#' +#' @seealso [get_counts()], [check_madc_sanity()] +#' +#' @import dplyr #' @export -get_countsMADC <- function(madc_file) { +get_countsMADC <- function(madc_file = NULL, madc_object = NULL, collapse_matches_counts = FALSE, verbose = TRUE) { + + # Add check inputs + if(is.null(madc_file) && is.null(madc_object)) stop("Please provide either madc_file or madc_object.") + if(!is.null(madc_file) && !file.exists(madc_file)) stop("MADC file not found. Please provide a valid path.") + if(!is.null(madc_object) && !is.data.frame(madc_object)) stop("madc_object must be a data frame.") + + vmsg(paste0("Extracting read counts from ", ifelse(!is.null(madc_file), paste0("MADC file: ", madc_file), "madc_object")), verbose = verbose, level = 0, type = ">>") + vmsg(ifelse(collapse_matches_counts, + "|AltMatch and |RefMatch counts will be collapsed into their respective |Ref and |Alt alleles.", + "|AltMatch and |RefMatch rows will be discarded (collapse_matches_counts = FALSE)."), + verbose = verbose, level = 1, type = ">>") + # This function takes the MADC file as input and generates a Ref and Alt counts dataframe as output - update_df <- get_counts(madc_file) + if (is.null(madc_object)) { + update_df <- get_counts(madc_file = madc_file, collapse_matches_counts = collapse_matches_counts, verbose = verbose) + } else { + update_df <- get_counts(madc_object = madc_object, collapse_matches_counts = collapse_matches_counts, verbose = verbose) + } # Filter rows where 'AlleleID' ends with 'Ref' ref_df <- subset(update_df, grepl("Ref$", AlleleID)) @@ -43,9 +86,16 @@ get_countsMADC <- function(madc_file) { #Retain only the rows in common if they are not identical and provide warning if (same == FALSE) { - warning("Mismatch between Ref and Alt Markers. Markers without a Ref or Alt match removed.") # Find the common CloneIDs between the two dataframes + all_mks <- unique(c(rownames(ref_df), rownames(alt_df))) common_ids <- intersect(rownames(ref_df), rownames(alt_df)) + n_singles <- length(all_mks) - length(common_ids) + + vmsg(paste("There are", n_singles,"Ref tags without corresponding Alt tags, or vice versa"), verbose = verbose, level = 2, type = ">>") + vmsg("Only the markers with both Ref and Alt tags will be retained for the conversion", verbose = verbose, level = 1, type = ">>") + + warning(paste("There are", n_singles,"Ref tags without corresponding Alt tags, or vice versa. Only the markers with both Ref and Alt tags will be retained for the conversion")) + # Subset both dataframes to retain only the common rows ref_df <- ref_df[common_ids, ] alt_df <- alt_df[common_ids, ] @@ -77,7 +127,7 @@ get_countsMADC <- function(madc_file) { # Print the result ratio_missing_data <- count_zeros / length(size_matrix) - message("Ratio of missing data =", ratio_missing_data, "\n") + vmsg(paste0("Percentage of missing data (datapoints with 0 total count): ", round(ratio_missing_data * 100, 2), "%"), verbose = verbose, level = 2, type = ">>") # Return the ref and alt matrices as a list matrices_list <- list(ref_matrix = ref_matrix, size_matrix = size_matrix) @@ -86,41 +136,109 @@ get_countsMADC <- function(madc_file) { } -get_counts <- function(madc_file) { +#' Read and Pre-process a MADC File +#' +#' Reads a DArTag MADC CSV file (or accepts a pre-read data frame), detects the +#' file format, and returns a filtered data frame containing only `Ref` and `Alt` +#' haplotype rows ready for count-matrix construction. +#' +#' @details +#' **Input**: either `madc_file` (path to CSV) or `madc_object` (pre-read data +#' frame) must be supplied; at least one is required. +#' +#' **Format detection** (applied to file or object alike): the first seven rows +#' of the first column are inspected: +#' - **Standard format**: all entries are blank or `"*"` — the first 7 rows are +#' treated as DArT placeholder rows and skipped. +#' - **Fixed-allele-ID format**: no filler rows — data are used as-is. +#' +#' **`|AltMatch` / `|RefMatch` handling** (controlled by `collapse_matches_counts`): +#' - `FALSE` (default): these rows are simply discarded. +#' - `TRUE`: their counts are summed into the corresponding `|Ref` or `|Alt` +#' row for the same `CloneID`. +#' +#' In all cases, trailing suffixes on `AlleleID` (e.g., `|Ref_001`, `|Alt_002`) +#' are stripped to the canonical `|Ref` / `|Alt` form. +#' +#' @param madc_file character or `NULL`. Path to the input MADC CSV file. +#' At least one of `madc_file` or `madc_object` must be provided. +#' @param madc_object data frame or `NULL`. A pre-read MADC data frame +#' (e.g., from `check_botloci()`). When supplied, file reading is skipped. +#' At least one of `madc_file` or `madc_object` must be provided. +#' @param collapse_matches_counts logical. If `TRUE`, counts for `|AltMatch` +#' and `|RefMatch` rows are summed into their corresponding `|Ref` and `|Alt` +#' rows. If `FALSE` (default), those rows are discarded. +#' @param verbose logical. Whether to print progress messages. Default is `TRUE`. +#' +#' @return A data frame with one row per `Ref` or `Alt` allele entry, retaining +#' all original columns (`AlleleID`, `CloneID`, `AlleleSequence`, sample +#' count columns, etc.). +#' +#' @importFrom dplyr mutate group_by summarise across where select +#' @importFrom dplyr %>% +#' +#' @keywords internal +get_counts <- function(madc_file = NULL, madc_object = NULL, collapse_matches_counts = FALSE, verbose = TRUE) { + + # Add check inputs + if(is.null(madc_file) && is.null(madc_object)) stop("Please provide either madc_file or madc_object.") + if(!is.null(madc_file) && !file.exists(madc_file)) stop("MADC file not found. Please provide a valid path.") + if(!is.null(madc_object) && !is.data.frame(madc_object)) stop("madc_object must be a data frame.") + # Read the MADC file + + if(!is.null(madc_file)){ #Read only the first column for the first seven rows first_seven_rows <- read.csv(madc_file, header = FALSE, nrows = 7, colClasses = c(NA, "NULL")) #Check if all entries in the first column are either blank or "*" check_entries <- all(first_seven_rows[, 1] %in% c("", "*")) + } else { + check_entries <- all(madc_object[1:min(7L, nrow(madc_object)), 1] %in% c("", "*")) + } + #Check if the MADC file has the filler rows or is processed from updated fixed allele ID pipeline if (check_entries) { #Note: This assumes that the first 7 rows are placeholder info from DArT processing - #Read the madc file - madc_df <- read.csv(madc_file, sep = ',', skip = 7, check.names = FALSE) - - #Retain only the Ref and Alt haplotypes - filtered_df <- madc_df[!grepl("\\|AltMatch|\\|RefMatch", madc_df$AlleleID), ] - - #Remove extra text after Ref and Alt (_001 or _002) - filtered_df$AlleleID <- sub("\\|Ref.*", "|Ref", filtered_df$AlleleID) - filtered_df$AlleleID <- sub("\\|Alt.*", "|Alt", filtered_df$AlleleID) - + vmsg("Detected raw MADC format with 7-row header. Reading file while skipping the first 7 rows.", verbose = verbose, level = 1, type = ">>") + if(!is.null(madc_file)){ + madc_df <- read.csv(madc_file, sep = ',', skip = 7, check.names = FALSE) + } else { + madc_df <- madc_object[-(1:7), ] + } } else { #Read the madc file - madc_df <- read.csv(madc_file, sep = ',', check.names = FALSE) - - # Retain only the Ref and Alt haplotypes - filtered_df <- madc_df[!grepl("\\|AltMatch|\\|RefMatch", madc_df$AlleleID), ] + vmsg("Detected fixed allele IDs MADC format", verbose = verbose, level = 1, type = ">>") + if(!is.null(madc_file)){ + madc_df <- read.csv(madc_file, sep = ',', check.names = FALSE) + } else { + madc_df <- madc_object + } + } - #Remove extra text after Ref and Alt (_001 or _002) - filtered_df$AlleleID <- sub("\\|Ref.*", "|Ref", filtered_df$AlleleID) - filtered_df$AlleleID <- sub("\\|Alt.*", "|Alt", filtered_df$AlleleID) + if(collapse_matches_counts){ + filtered_df <- madc_df[order(madc_df$AlleleID),] %>% + mutate(Type = ifelse(grepl("Alt", AlleleID), "Alt", "Ref")) %>% + group_by(CloneID, Type) %>% + summarise( + AlleleID = paste0(unique(CloneID), "|", unique(Type)), + AlleleSequence = first(AlleleSequence), + across(where(is.numeric), sum), + .groups = "drop" + ) %>% + select(AlleleID, CloneID, AlleleSequence, everything(), -Type) + } else { + #Retain only the Ref and Alt haplotypes + filtered_df <- madc_df[!grepl("\\|AltMatch|\\|RefMatch", madc_df$AlleleID), ] } + #Remove extra text after Ref and Alt (_001 or _002) + filtered_df$AlleleID <- sub("\\|Ref.*", "|Ref", filtered_df$AlleleID) + filtered_df$AlleleID <- sub("\\|Alt.*", "|Alt", filtered_df$AlleleID) + return(filtered_df) } diff --git a/R/madc2vcf_all.R b/R/madc2vcf_all.R index 1413c8e..13227d3 100644 --- a/R/madc2vcf_all.R +++ b/R/madc2vcf_all.R @@ -60,8 +60,11 @@ madc2vcf_all <- function(madc = NULL, multiallelic_SNP_sample_thr = 0, alignment_score_thr = 40, out_vcf = NULL, + markers_info = NULL, verbose = TRUE){ + vmsg("Checking inputs", verbose = verbose, level = 0, type = ">>") + # Input checks if(!is.null(madc) & !file.exists(madc)) stop("MADC file not found. Please provide a valid path.") if(!is.null(botloci_file) & !file.exists(botloci_file)) stop("Botloci file not found. Please provide a valid path.") @@ -85,8 +88,8 @@ madc2vcf_all <- function(madc = NULL, bigr_meta <- paste0('##BIGrCommandLine.madc2vcf_all=') - if(!is.null(madc)) report <- read.csv(madc, check.names = FALSE) else stop("Please provide a MADC file") + report <- read.csv(madc, check.names = FALSE) + checks <- check_madc_sanity(report) + + messages_results <- mapply(function(check, message) { + if (check) message[1] else message[2] + }, checks$checks, checks$messages) + + if(any(!(checks$checks[c("Columns", "FixAlleleIDs")]))){ + idx <- which(!(checks$checks[c("Columns", "FixAlleleIDs")])) + stop(paste("The MADC file does not pass the sanity checks:\n", + paste(messages_results[c("Columns", "FixAlleleIDs")[idx]], collapse = "\n"))) + } + + if(any(checks$checks[c("IUPACcodes", "LowerCase", "Indels")])){ + idx <- which((checks$checks[c("IUPACcodes", "LowerCase", "Indels")])) + if(is.null(markers_info)) stop(paste(messages_results[c("IUPACcodes", "LowerCase", "Indels")[idx]], collapse = "\n")) + } + if(!is.null(botloci_file)) botloci <- read.csv(botloci_file, header = F) else stop("Please provide a botloci file") if(!is.null(hap_seq_file)) hap_seq <- read.table(hap_seq_file, header = F) else hap_seq <- NULL diff --git a/R/madc2vcf_targets.R b/R/madc2vcf_targets.R index fc022f9..92bb8e6 100644 --- a/R/madc2vcf_targets.R +++ b/R/madc2vcf_targets.R @@ -44,13 +44,15 @@ #' designed on the **bottom** strand (one ID per line). Required when #' `get_REF_ALT = TRUE` and `markers_info` is not provided. #' @param markers_info character or `NULL`. Optional path to a CSV providing target -#' metadata. Required columns: `BI_markerID, Chr, Pos, Ref, Alt`. If indels are -#' present, also require `Type, Indel_pos`. When supplied, these values populate -#' `#CHROM, POS, REF, ALT` in the VCF directly. +#' metadata. Required columns: `CloneID, Chr, Pos, Ref, Alt`. This file is required in +#' case your MADC CloneID column doesn't have the format CHR_POS. If indels are +#' present, columns `Type, Indel_pos` are also required. #' @param get_REF_ALT logical (default `FALSE`). If `TRUE`, attempts to infer REF/ALT #' bases from the Ref/Alt probe sequences in the MADC file (with strand correction #' using `botloci_file`). Targets with more than one difference between Ref/Alt #' sequences are removed. +#' @param collapse_matches_counts logical (default `FALSE`). If `TRUE`, counts for targets with identical `CHROM_POS` are summed together. This is useful when the MADC file contains multiple rows per target (e.g., due to multiple alleles or technical replicates) and you want to aggregate them into a single entry per unique target. +#' @param verbose logical (default `FALSE`). If `TRUE`, prints detailed messages about #' #' @return (Invisibly) returns the path to `output.file`. The side effect is a #' **VCF v4.3** written to disk containing one row per target and columns for all @@ -96,7 +98,31 @@ madc2vcf_targets <- function(madc_file, output.file, botloci_file, markers_info = NULL, - get_REF_ALT = FALSE) { + get_REF_ALT = FALSE, + collapse_matches_counts = FALSE, + verbose = FALSE) { + + vmsg("Checking inputs", verbose = verbose, level = 0, type = ">>") + + # Input checks + if(!file.exists(madc_file)) stop("The MADC file does not exist.") + if(!is.character(output.file)) stop("output.file must be a character string.") + if(get_REF_ALT && is.null(botloci_file)) stop("Please provide the botloci file to recover the reference and alternative bases.") + if(get_REF_ALT && !file.exists(botloci_file)) stop("The botloci file does not exist.") + if(!is.null(markers_info) && !file.exists(markers_info)) stop("The markers_info file does not exist.") + if(!is.null(markers_info) && !is.character(markers_info)) stop("markers_info must be a character string or NULL.") + if(!is.logical(get_REF_ALT)) stop("get_REF_ALT must be a logical value (TRUE or FALSE).") + if(!is.logical(verbose)) stop("verbose must be a logical value (TRUE or FALSE).") + + # Create a VCF header line with metadata about the command and its parameters + bigr_meta <- paste0('##BIGrCommandLine.madc2vcf_targets=') # MADC checks report <- read.csv(madc_file) @@ -106,6 +132,9 @@ madc2vcf_targets <- function(madc_file, if (check) message[1] else message[2] }, checks$checks, checks$messages) + for(i in 1:length(messages_results)) + vmsg(messages_results[i], verbose = verbose, level = 1, type = ">>") + if(any(!(checks$checks[c("Columns", "FixAlleleIDs")]))){ idx <- which(!(checks$checks[c("Columns", "FixAlleleIDs")])) stop(paste("The MADC file does not pass the sanity checks:\n", @@ -117,7 +146,19 @@ madc2vcf_targets <- function(madc_file, if(is.null(markers_info)) stop(paste(messages_results[c("IUPACcodes", "LowerCase", "Indels")[idx]], collapse = "\n")) } - matrices <- get_countsMADC(madc_file) + # Check marker names compatibility between MADC and botloci + if(!is.null(botloci_file)){ + botloci <- read.csv(botloci_file, header = F) + checked_botloci <- check_botloci(botloci, report) + botloci <- checked_botloci[[1]] + report <- checked_botloci[[2]] + } + + vmsg("Input checks done", verbose = verbose, level = 1, type = ">>") + + vmsg("Extracting depth information", verbose = verbose, level = 0, type = ">>") + + matrices <- get_countsMADC(madc_object = report, collapse_matches_counts = collapse_matches_counts) ref_df <- data.frame(matrices[[1]], check.names = FALSE) alt_df <- data.frame(matrices[[2]]-matrices[[1]], check.names = FALSE) size_df <- data.frame(matrices[[2]], check.names = FALSE) @@ -148,8 +189,7 @@ madc2vcf_targets <- function(madc_file, # Get REF and ALT if(get_REF_ALT){ - if(is.null(botloci_file)) stop("Please provide the botloci file to recover the reference and alternative bases.") - csv <- get_counts(madc_file) + csv <- get_counts(madc_object = report, collapse_matches_counts = collapse_matches_counts) # Keep only the ones that have alt and ref csv <- csv[which(csv$CloneID %in% rownames(ad_df)),] @@ -261,7 +301,8 @@ madc2vcf_targets <- function(madc_file, '##INFO=', '##FORMAT=', '##FORMAT=', - '##FORMAT=' + '##FORMAT=', + bigr_meta ) #Make the header#Make the VCF df diff --git a/R/utils.R b/R/utils.R index 2399560..92fac46 100644 --- a/R/utils.R +++ b/R/utils.R @@ -43,7 +43,7 @@ convert_to_dosage <- function(gt) { #' @noRd check_botloci <- function(botloci, report, verbose=TRUE){ if(!any(botloci$V1 %in% report$CloneID)) { - if(verbose) message("None of the botloci markers could be found in the MADC file. Checking padding match...\n") + vmsg("No botloci markers found in MADC file. Checking for padding mismatch...", verbose = verbose, level = 1, type = ">>") pad_madc <- unique(nchar(sub(".*_", "", report$CloneID))) pad_botloci <- unique(nchar(sub(".*_", "", botloci$V1))) @@ -51,7 +51,7 @@ check_botloci <- function(botloci, report, verbose=TRUE){ if(length(pad_madc) > 1 | length(pad_botloci) > 1) stop("Check marker IDs in both MADC and botloci files. They should be the same.") if(pad_madc != pad_botloci) { - if(verbose) message("Padding between MADC and botloci files do not match. Markers ID modified to match longest padding.\n") + vmsg("Padding between MADC and botloci files do not match. Markers ID modified to match longest padding.", verbose = verbose, level = 1, type = ">>") if (pad_madc < pad_botloci) { report$CloneID <- paste0(sub("_(.*)", "", report$CloneID), "_", sprintf(paste0("%0", pad_botloci, "d"), as.integer(sub(".*_", "", report$CloneID))) @@ -69,3 +69,46 @@ check_botloci <- function(botloci, report, verbose=TRUE){ } return(list(botloci, report)) } + +##' Verbose Message Utility +##' +##' Prints a formatted verbose message with timestamp, indentation, and type label, if verbose is TRUE. +##' +##' @param text Character string, the message to print (supports sprintf formatting). +##' @param verbose Logical. If TRUE, prints the message; if FALSE, suppresses output. +##' @param level Integer, indentation level (0=header, 1=main step, 2=detail, 3=sub-detail). +##' @param type Character string, message type (e.g., "INFO", "WARN", "ERROR"). Only shown for level 0. +##' @param ... Additional arguments passed to sprintf for formatting. +##' +##' @details Use the verbose argument to control message output. Typically, pass the function's verbose parameter to vmsg. +##' +##' @return No return value, called for side effects. +##' @export +vmsg <- function(text, verbose = FALSE, level = 1, type = ">>", ...) { + if (!verbose) return(invisible()) + # Format timestamp + timestamp <- format(Sys.time(), "[%H:%M:%S]") + + # Create indentation based on level + indent <- switch(as.character(level), + "0" = "", # Section headers + "1" = " ∙ ", # Main steps (medium bullet) + "2" = " - ", # Details + "3" = " > ", # Sub-details + paste0(paste(rep(" ", level), collapse = ""), "• ") # Fallback for level > 3 + ) + + # Format type label (only show for level 0) + type_label <- if (level == 0) sprintf("%-1s ", type) else "" + + # Format message text + dots <- list(...) + if(length(dots) == 0) { + msg_text <- text + } else { + msg_text <- sprintf(text, ...) + } + # Combine everything + formatted_msg <- sprintf("%s %s%s%s", timestamp, type_label, indent, msg_text) + message(formatted_msg) +} diff --git a/man/filterVCF.Rd b/man/filterVCF.Rd index 676ef7f..39d7264 100644 --- a/man/filterVCF.Rd +++ b/man/filterVCF.Rd @@ -6,6 +6,7 @@ \usage{ filterVCF( vcf.file, + quality.rates = F, filter.OD = NULL, filter.BIAS.min = NULL, filter.BIAS.max = NULL, @@ -58,17 +59,11 @@ The VCF format is v4.3 \examples{ ## Use file paths for each file on the local system -#Temp location (only for example) -output_file <- tempfile() -filterVCF(vcf.file = system.file("iris_DArT_VCF.vcf.gz", package = "BIGr"), - filter.OD = 0.5, - filter.MAF = 0.05, - ploidy = 2, - output.file = output_file) - -# Removing the output for the example -rm(output_file) +#filterVCF(vcf.file = "example_dart_Dosage_Report.csv", + # filter.OD = 0.5, + # ploidy = 2, + # output.file = "name_for_vcf") ##The function will output the filtered VCF to the current working directory diff --git a/man/get_counts.Rd b/man/get_counts.Rd new file mode 100644 index 0000000..1879e07 --- /dev/null +++ b/man/get_counts.Rd @@ -0,0 +1,60 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/get_countsMADC.R +\name{get_counts} +\alias{get_counts} +\title{Read and Pre-process a MADC File} +\usage{ +get_counts( + madc_file = NULL, + madc_object = NULL, + collapse_matches_counts = FALSE, + verbose = TRUE +) +} +\arguments{ +\item{madc_file}{character or \code{NULL}. Path to the input MADC CSV file. +At least one of \code{madc_file} or \code{madc_object} must be provided.} + +\item{madc_object}{data frame or \code{NULL}. A pre-read MADC data frame +(e.g., from \code{check_botloci()}). When supplied, file reading is skipped. +At least one of \code{madc_file} or \code{madc_object} must be provided.} + +\item{collapse_matches_counts}{logical. If \code{TRUE}, counts for \verb{|AltMatch} +and \verb{|RefMatch} rows are summed into their corresponding \verb{|Ref} and \verb{|Alt} +rows. If \code{FALSE} (default), those rows are discarded.} + +\item{verbose}{logical. Whether to print progress messages. Default is \code{TRUE}.} +} +\value{ +A data frame with one row per \code{Ref} or \code{Alt} allele entry, retaining +all original columns (\code{AlleleID}, \code{CloneID}, \code{AlleleSequence}, sample +count columns, etc.). +} +\description{ +Reads a DArTag MADC CSV file (or accepts a pre-read data frame), detects the +file format, and returns a filtered data frame containing only \code{Ref} and \code{Alt} +haplotype rows ready for count-matrix construction. +} +\details{ +\strong{Input}: either \code{madc_file} (path to CSV) or \code{madc_object} (pre-read data +frame) must be supplied; at least one is required. + +\strong{Format detection} (applied to file or object alike): the first seven rows +of the first column are inspected: +\itemize{ +\item \strong{Standard format}: all entries are blank or \code{"*"} — the first 7 rows are +treated as DArT placeholder rows and skipped. +\item \strong{Fixed-allele-ID format}: no filler rows — data are used as-is. +} + +\strong{\verb{|AltMatch} / \verb{|RefMatch} handling} (controlled by \code{collapse_matches_counts}): +\itemize{ +\item \code{FALSE} (default): these rows are simply discarded. +\item \code{TRUE}: their counts are summed into the corresponding \verb{|Ref} or \verb{|Alt} +row for the same \code{CloneID}. +} + +In all cases, trailing suffixes on \code{AlleleID} (e.g., \verb{|Ref_001}, \verb{|Alt_002}) +are stripped to the canonical \verb{|Ref} / \verb{|Alt} form. +} +\keyword{internal} diff --git a/man/get_countsMADC.Rd b/man/get_countsMADC.Rd index 66c5708..28fca1e 100644 --- a/man/get_countsMADC.Rd +++ b/man/get_countsMADC.Rd @@ -4,18 +4,53 @@ \alias{get_countsMADC} \title{Obtain Read Counts from MADC File} \usage{ -get_countsMADC(madc_file) +get_countsMADC( + madc_file = NULL, + madc_object = NULL, + collapse_matches_counts = FALSE, + verbose = TRUE +) } \arguments{ -\item{madc_file}{Path to MADC file} +\item{madc_file}{character or \code{NULL}. Path to the input MADC CSV file. +At least one of \code{madc_file} or \code{madc_object} must be provided.} + +\item{madc_object}{data frame or \code{NULL}. A pre-read MADC data frame +(e.g., as returned by \code{check_botloci()}). When supplied, file reading is +skipped. At least one of \code{madc_file} or \code{madc_object} must be provided.} + +\item{collapse_matches_counts}{logical. If \code{TRUE}, counts for \verb{|AltMatch} +and \verb{|RefMatch} rows are summed into their corresponding \verb{|Ref} and \verb{|Alt} +rows. If \code{FALSE} (default), \verb{|AltMatch} and \verb{|RefMatch} rows are discarded.} + +\item{verbose}{logical. Whether to print progress messages. Default is \code{TRUE}.} } \value{ -A list of read count matrices for reference, alternate, and total read count values +A named list with two numeric matrices, both with markers as rows +and samples as columns: +\describe{ +\item{\code{ref_matrix}}{Reference allele read counts.} +\item{\code{size_matrix}}{Total read counts (reference + alternative).} +} } \description{ -This function takes the MADC file as input and retrieves the ref and alt counts for each sample, -and converts them to ref, alt, and size(total count) matrices for dosage calling tools. At the moment, -only the read counts for the Ref and Alt target loci are obtained while the additional loci are ignored. +Reads a DArTag MADC report and returns reference and total read count matrices +per marker and sample. Only \code{Ref} and \code{Alt} target loci are retained; +\verb{|AltMatch} / \verb{|RefMatch} rows are either discarded or collapsed depending on +\code{collapse_matches_counts}. +} +\details{ +Either \code{madc_file} or \code{madc_object} must be provided (not both \code{NULL}). +When \code{madc_object} is supplied it is passed directly to \code{get_counts()}, +skipping file I/O. The function constructs: +\itemize{ +\item \code{ref_matrix} — per-sample reference allele counts. +\item \code{size_matrix} — per-sample total counts (ref + alt). +} + +Markers whose \code{CloneID} appears only in the \code{Ref} or only in the \code{Alt} rows +are removed with a warning. A summary of the proportion of zero-count +data points (missing data) is reported via \code{vmsg()}. } \examples{ # Get the path to the MADC file @@ -24,11 +59,13 @@ madc_path <- system.file("iris_DArT_MADC.csv", package = "BIGr") # Extract the read count matrices counts_matrices <- get_countsMADC(madc_path) -# Access the reference, alternate, and size matrices - -# ref_matrix <- counts_matrices$ref_matrix -# alt_matrix <- counts_matrices$alt_matrix +# Access the reference and size matrices +# ref_matrix <- counts_matrices$ref_matrix # size_matrix <- counts_matrices$size_matrix rm(counts_matrices) + +} +\seealso{ +\code{\link[=get_counts]{get_counts()}}, \code{\link[=check_madc_sanity]{check_madc_sanity()}} } diff --git a/man/imputation_concordance.Rd b/man/imputation_concordance.Rd index 0c06134..dc1cc65 100644 --- a/man/imputation_concordance.Rd +++ b/man/imputation_concordance.Rd @@ -9,55 +9,68 @@ imputation_concordance( imputed_genos, missing_code = NULL, snps_2_exclude = NULL, - verbose = FALSE + verbose = FALSE, + plot = FALSE, + print_result = TRUE ) } \arguments{ -\item{reference_genos}{A data frame containing reference genotype data, with rows as samples and columns as markers. Dosage format (0, 1, 2) is recommended.} +\item{reference_genos}{A data frame containing reference genotype data, +with rows as samples and columns as markers. Must include a column named \code{ID}.} -\item{imputed_genos}{A data frame containing imputed genotype data, with rows as samples and columns as markers. Dosage format (0, 1, 2) is recommended.} +\item{imputed_genos}{A data frame containing imputed genotype data, +with rows as samples and columns as markers. Must include a column named \code{ID}.} -\item{missing_code}{An optional value to specify missing data. If provided, loci with this value in either dataset will be excluded from the concordance calculation.} +\item{missing_code}{Optional value specifying missing data. If provided, +loci with this value in either dataset will be excluded from the concordance calculation.} -\item{snps_2_exclude}{An optional vector of marker IDs to exclude from the concordance calculation.} +\item{snps_2_exclude}{Optional vector of marker IDs to exclude from the concordance calculation.} -\item{verbose}{A logical value indicating whether to print a summary of the concordance results. Default is FALSE.} +\item{verbose}{Logical. If \code{TRUE}, prints summary statistics (minimum, quartiles, +median, mean, maximum) of concordance percentages.} + +\item{plot}{Logical. If \code{TRUE}, produces a bar plot of concordance percentage +by sample.} + +\item{print_result}{Logical. If \code{TRUE} (default), prints the concordance +results data frame to the console. If \code{FALSE}, results are returned invisibly.} } \value{ -A list with two elements: +A data frame with: \itemize{ -\item \code{result_df}: A data frame with sample IDs and their concordance percentages. -\item \code{summary_concordance}: A summary of concordance percentages, including minimum, maximum, mean, and quartiles. +\item \code{ID}: Sample identifiers shared between the datasets. +\item \code{Concordance}: Percentage of matching genotypes per sample. } +If \code{print_result = FALSE}, the data frame is returned invisibly. } \description{ -This function calculates the concordance between imputed and reference genotypes. It assumes that samples are rows and markers are columns. -It is recommended to use allele dosages (0, 1, 2) but will work with other formats. Missing data in reference or imputed genotypes -will not be considered for concordance if the \code{missing_code} argument is used. If a specific subset of markers should be excluded, -it can be provided using the \code{snps_2_exclude} argument. +This function calculates the concordance between imputed and reference +genotypes. It assumes that samples are rows and markers are columns. +Allele dosages (0, 1, 2) are recommended but other numeric formats are supported. +Missing data in either dataset can be excluded from the concordance calculation +using the \code{missing_code} argument. Specific markers can be excluded using +the \code{snps_2_exclude} argument. } \details{ -The function identifies common samples and markers between the reference and imputed genotype datasets. It calculates the percentage of matching genotypes for each sample, excluding missing data and specified markers. The concordance is reported as a percentage for each sample, along with a summary of the overall concordance distribution. +The function: +\enumerate{ +\item Identifies common samples and markers between the datasets. +\item Optionally excludes specified SNPs. +\item Removes loci with missing data (if \code{missing_code} is provided). +\item Computes per-sample concordance as the percentage of matching genotypes. } -\examples{ - -# Example Input variables -ignore_file <- system.file("imputation_ignore.txt", package="BIGr") -ref_file <- system.file("imputation_reference.txt", package="BIGr") -test_file <- system.file("imputation_test.txt", package="BIGr") - -# Import files -snps = read.table(ignore_file, header = TRUE) -ref = read.table(ref_file, header = TRUE) -test = read.table(test_file, header = TRUE) - -#Calculations -result <- imputation_concordance(reference_genos = ref, - imputed_genos = test, - snps_2_exclude = snps, - missing_code = 5, - verbose = FALSE) - +When \code{plot = TRUE}, a bar plot showing concordance percentage per sample +is generated using \pkg{ggplot2}. +} +\examples{ +result <- imputation_concordance( + reference_genos = ref, + imputed_genos = test, + snps_2_exclude = snps, + missing_code = 5, + verbose = TRUE, + plot = TRUE +) } diff --git a/man/madc2vcf_all.Rd b/man/madc2vcf_all.Rd index 6fe7f11..58cfcb3 100644 --- a/man/madc2vcf_all.Rd +++ b/man/madc2vcf_all.Rd @@ -14,6 +14,7 @@ madc2vcf_all( multiallelic_SNP_sample_thr = 0, alignment_score_thr = 40, out_vcf = NULL, + markers_info = NULL, verbose = TRUE ) } diff --git a/man/madc2vcf_targets.Rd b/man/madc2vcf_targets.Rd index fad847c..5d62c87 100644 --- a/man/madc2vcf_targets.Rd +++ b/man/madc2vcf_targets.Rd @@ -9,7 +9,9 @@ madc2vcf_targets( output.file, botloci_file, markers_info = NULL, - get_REF_ALT = FALSE + get_REF_ALT = FALSE, + collapse_matches_counts = FALSE, + verbose = FALSE ) } \arguments{ @@ -22,14 +24,18 @@ designed on the \strong{bottom} strand (one ID per line). Required when \code{get_REF_ALT = TRUE} and \code{markers_info} is not provided.} \item{markers_info}{character or \code{NULL}. Optional path to a CSV providing target -metadata. Required columns: \verb{BI_markerID, Chr, Pos, Ref, Alt}. If indels are -present, also require \verb{Type, Indel_pos}. When supplied, these values populate -\verb{#CHROM, POS, REF, ALT} in the VCF directly.} +metadata. Required columns: \verb{CloneID, Chr, Pos, Ref, Alt}. This file is required in +case your MADC CloneID column doesn't have the format CHR_POS. If indels are +present, columns \verb{Type, Indel_pos} are also required.} \item{get_REF_ALT}{logical (default \code{FALSE}). If \code{TRUE}, attempts to infer REF/ALT bases from the Ref/Alt probe sequences in the MADC file (with strand correction using \code{botloci_file}). Targets with more than one difference between Ref/Alt sequences are removed.} + +\item{collapse_matches_counts}{logical (default \code{FALSE}). If \code{TRUE}, counts for targets with identical \code{CHROM_POS} are summed together. This is useful when the MADC file contains multiple rows per target (e.g., due to multiple alleles or technical replicates) and you want to aggregate them into a single entry per unique target.} + +\item{verbose}{logical (default \code{FALSE}). If \code{TRUE}, prints detailed messages about} } \value{ (Invisibly) returns the path to \code{output.file}. The side effect is a diff --git a/man/vmsg.Rd b/man/vmsg.Rd new file mode 100644 index 0000000..abcc768 --- /dev/null +++ b/man/vmsg.Rd @@ -0,0 +1,28 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utils.R +\name{vmsg} +\alias{vmsg} +\title{Verbose Message Utility} +\usage{ +vmsg(text, verbose = FALSE, level = 1, type = ">>", ...) +} +\arguments{ +\item{text}{Character string, the message to print (supports sprintf formatting).} + +\item{verbose}{Logical. If TRUE, prints the message; if FALSE, suppresses output.} + +\item{level}{Integer, indentation level (0=header, 1=main step, 2=detail, 3=sub-detail).} + +\item{type}{Character string, message type (e.g., "INFO", "WARN", "ERROR"). Only shown for level 0.} + +\item{...}{Additional arguments passed to sprintf for formatting.} +} +\value{ +No return value, called for side effects. +} +\description{ +Prints a formatted verbose message with timestamp, indentation, and type label, if verbose is TRUE. +} +\details{ +Use the verbose argument to control message output. Typically, pass the function's verbose parameter to vmsg. +} From 9afb26545efa2bdabcef702e0673f7b7e50a7961 Mon Sep 17 00:00:00 2001 From: Cristianetaniguti Date: Fri, 13 Mar 2026 21:31:26 -0400 Subject: [PATCH 15/80] messages ok --- R/madc2vcf_targets.R | 64 +++++++++++++++++++++++++++++++++----------- 1 file changed, 49 insertions(+), 15 deletions(-) diff --git a/R/madc2vcf_targets.R b/R/madc2vcf_targets.R index 92bb8e6..d595ae5 100644 --- a/R/madc2vcf_targets.R +++ b/R/madc2vcf_targets.R @@ -100,7 +100,7 @@ madc2vcf_targets <- function(madc_file, markers_info = NULL, get_REF_ALT = FALSE, collapse_matches_counts = FALSE, - verbose = FALSE) { + verbose = TRUE) { vmsg("Checking inputs", verbose = verbose, level = 0, type = ">>") @@ -132,7 +132,7 @@ madc2vcf_targets <- function(madc_file, if (check) message[1] else message[2] }, checks$checks, checks$messages) - for(i in 1:length(messages_results)) + for(i in seq_along(messages_results)) vmsg(messages_results[i], verbose = verbose, level = 1, type = ">>") if(any(!(checks$checks[c("Columns", "FixAlleleIDs")]))){ @@ -143,7 +143,9 @@ madc2vcf_targets <- function(madc_file, if(any(checks$checks[c("IUPACcodes", "LowerCase", "Indels")])){ idx <- which((checks$checks[c("IUPACcodes", "LowerCase", "Indels")])) - if(is.null(markers_info)) stop(paste(messages_results[c("IUPACcodes", "LowerCase", "Indels")[idx]], collapse = "\n")) + if(is.null(markers_info)) stop("Please provide a markers_info file to proceed. The MADC file does not pass the sanity checks:\n", + paste(messages_results[c("IUPACcodes", "LowerCase", "Indels")[idx]], collapse = "\n")) + else vmsg("MADC file has some issues (IUPAC codes, lowercase bases, indels), but a markers_info file is provided, so proceeding with VCF generation.", verbose = verbose, level = 1, type = ">>") } # Check marker names compatibility between MADC and botloci @@ -158,7 +160,7 @@ madc2vcf_targets <- function(madc_file, vmsg("Extracting depth information", verbose = verbose, level = 0, type = ">>") - matrices <- get_countsMADC(madc_object = report, collapse_matches_counts = collapse_matches_counts) + matrices <- get_countsMADC(madc_object = report, collapse_matches_counts = collapse_matches_counts, verbose = verbose) ref_df <- data.frame(matrices[[1]], check.names = FALSE) alt_df <- data.frame(matrices[[2]]-matrices[[1]], check.names = FALSE) size_df <- data.frame(matrices[[2]], check.names = FALSE) @@ -172,8 +174,11 @@ madc2vcf_targets <- function(madc_file, ) row.names(ad_df) <- row.names(ref_df) + vmsg("Depth information extracted", verbose = verbose, level = 1, type = ">>") + #Obtaining Chr and Pos information from the row_names if(is.null(markers_info)){ + vmsg("No markers_info file provided. Attempting to recover CHROM and POS from CloneID...", verbose = verbose, level = 0, type = ">>") new_df <- size_df %>% rownames_to_column(var = "row_name") %>% separate(row_name, into = c("CHROM", "POS"), sep = "_") %>% @@ -187,9 +192,11 @@ madc2vcf_targets <- function(madc_file, new_df$TotalAlt <- rowSums(alt_df) new_df$TotalSize <- rowSums(size_df) + vmsg("CHROM and POS recovered from CloneID", verbose = verbose, level = 1, type = ">>") # Get REF and ALT if(get_REF_ALT){ - csv <- get_counts(madc_object = report, collapse_matches_counts = collapse_matches_counts) + vmsg("get_REF_ALT = TRUE. Attempting to recover REF and ALT bases from probe sequences...", verbose = verbose, level = 0, type = ">>") + csv <- get_counts(madc_object = report, collapse_matches_counts = collapse_matches_counts, verbose = FALSE) # Keep only the ones that have alt and ref csv <- csv[which(csv$CloneID %in% rownames(ad_df)),] @@ -225,8 +232,9 @@ madc2vcf_targets <- function(madc_file, orig_alt_seq <- orig_alt_seq[order(alt_ord)] ordered_clone_ids <- sort(ref_ord) + more_poly <- no_diff <- 0 ref_base <- alt_base <- vector() - for(i in 1:length(orig_ref_seq)){ + for(i in seq_along(orig_ref_seq)){ # FIXED: Use original sequences for SNP calling temp_list <- strsplit(c(orig_ref_seq[i], orig_alt_seq[i]), "") idx_diff <- which(temp_list[[1]] != temp_list[[2]]) @@ -234,6 +242,7 @@ madc2vcf_targets <- function(madc_file, if(length(idx_diff) > 1) { # If finds more than one polymorphism between Ref and Alt sequences ref_base[i] <- NA alt_base[i] <- NA + more_poly <- more_poly + 1 } else if(length(idx_diff) == 1) { orig_ref_base <- temp_list[[1]][idx_diff] orig_alt_base <- temp_list[[2]][idx_diff] @@ -250,32 +259,50 @@ madc2vcf_targets <- function(madc_file, # No differences found ref_base[i] <- NA alt_base[i] <- NA + no_diff <- no_diff + 1 } } + if(more_poly > 0) vmsg(paste(more_poly, "markers removed because more than one polymorphism was found between Ref and Alt sequences"), verbose = verbose, level = 2, type = ">>") + if(no_diff > 0) vmsg(paste(no_diff, "markers removed because no differences were found between Ref and Alt sequences"), verbose = verbose, level = 2, type = ">>") + } else { - warning("There are missing reference or alternative sequence, the SNP bases could not be recovery.") ref_base <- "." alt_base <- "." + vmsg(paste("REF and ALT bases could not be recovered because of missing reference or alternative sequences"), verbose = verbose, level = 1, type = ">>") } - } else { ref_base <- "." alt_base <- "." + vmsg(paste("REF and ALT bases not recovered because get_REF_ALT = FALSE"), verbose = verbose, level = 1, type = ">>") } } else { + vmsg("markers_info file provided. Using CHROM, POS, REF and ALT from the file.", verbose = verbose, level = 0, type = ">>") # Verify markers_info file df <- read.csv(markers_info) + + # Accept either CloneID or BI_markerID as the marker ID column + if ("BI_markerID" %in% colnames(df)) { + id_col <- "BI_markerID" + } else if ("CloneID" %in% colnames(df)) { + id_col <- "CloneID" + } else { + stop("The markers_info file must contain a marker ID column named either 'CloneID' or 'BI_markerID'.") + } + if(checks$checks["Indels"]){ - if(!all(c("BI_markerID","Chr","Pos","Ref","Alt","Type", "Indel_pos") %in% colnames(df))) - stop("The markers_info dataframe must contain the following columns: BI_markerID, CHROM, POS, REF, ALT, Type, Indel_pos") + vmsg("Indels detected in MADC file. Checking for required columns in markers_info...", verbose = verbose, level = 1, type = ">>") + if(!all(c(id_col,"Chr","Pos","Ref","Alt","Type", "Indel_pos") %in% colnames(df))) + stop(paste0("The markers_info dataframe must contain the following columns: ", id_col, ", Chr, Pos, Ref, Alt, Type, Indel_pos")) } - if(!all(c("BI_markerID","Chr","Pos","Ref","Alt") %in% colnames(df))) - stop("The markers_info dataframe must contain the following columns: BI_markerID, CHROM, POS, REF, ALT") + if(!all(c(id_col,"Chr","Pos","Ref","Alt") %in% colnames(df))) + stop(paste0("The markers_info dataframe must contain the following columns: ", id_col, ", Chr, Pos, Ref, Alt")) - if(!all(rownames(ad_df)%in% df$BI_markerID)) + if(!all(rownames(ad_df) %in% df[[id_col]])){ + miss_CloneIDs <- rownames(ad_df)[!rownames(ad_df) %in% df[[id_col]]] + vmsg(paste("Not all MADC CloneID was found in the markers_info file. These markers will be removed:", paste(miss_CloneIDs, collapse = " ")), verbose = verbose, level = 2, type = ">>") warning("Not all MADC CloneID was found in the markers_info file. These markers will be removed.") - - matched <- df[match(rownames(ad_df), df$BI_markerID),] + } + matched <- df[match(rownames(ad_df), df[[id_col]]),] new_df <- data.frame( CHROM = matched$Chr, @@ -291,6 +318,9 @@ madc2vcf_targets <- function(madc_file, alt_base <- matched$Alt } + vmsg("CHROM, POS, REF and ALT columns prepared", verbose = verbose, level = 1, type = ">>") + + vmsg("Preparing VCF dataframe", verbose = verbose, level = 0, type = ">>") #Make a header separate from the dataframe vcf_header <- c( "##fileformat=VCFv4.3", @@ -351,6 +381,7 @@ madc2vcf_targets <- function(madc_file, rownames(combined_wide) <- combined_wide$Row combined_wide$Row <- NULL colnames(combined_wide) <- colnames(matrices[[1]]) + vmsg("Sample columns formatted for VCF", verbose = verbose, level = 1, type = ">>") return(as.matrix(combined_wide)) } @@ -360,6 +391,7 @@ madc2vcf_targets <- function(madc_file, #Combine the dataframes together vcf_df <- cbind(vcf_df,geno_df) + vmsg("VCF dataframe prepared", verbose = verbose, level = 1, type = ">>") # Add # to the CHROM column name colnames(vcf_df)[1] <- "#CHROM" @@ -368,6 +400,7 @@ madc2vcf_targets <- function(madc_file, vcf_df <- vcf_df[order(vcf_df[,1],as.numeric(as.character(vcf_df[,2]))),] if(sum(is.na(vcf_df$REF)) >1) { + vmsg(paste(sum(is.na(vcf_df$REF)), "markers removed because of presence of more than one polymorphism between ref and alt sequences."), verbose = verbose, level = 1, type = ">>") warning(paste("Markers removed because of presence of more than one polymorphism between ref and alt sequences:",sum(is.na(vcf_df$REF)))) vcf_df <- vcf_df[-which(is.na(vcf_df$REF)),] } @@ -379,4 +412,5 @@ madc2vcf_targets <- function(madc_file, suppressWarnings( write.table(vcf_df, file = output.file, sep = "\t", quote = FALSE, row.names = FALSE, col.names = TRUE, append = TRUE) ) + vmsg(paste("VCF file written to", output.file), verbose = verbose, level = 0, type = ">>") } From c31118d8a7336f31b580cadabecd31c80985578b Mon Sep 17 00:00:00 2001 From: Cristianetaniguti Date: Wed, 25 Mar 2026 15:29:49 -0400 Subject: [PATCH 16/80] targets okay --- R/check_madc_sanity.R | 205 +++++++-- R/get_countsMADC.R | 2 + R/madc2vcf_targets.R | 362 ++++++++------- R/utils.R | 52 +-- man/madc2vcf_targets.Rd | 22 +- tests/testthat/test-check_madc_sanity.R | 67 ++- tests/testthat/test-madc2vcf_targets.R | 588 +++++++++++++++++++++++- 7 files changed, 1018 insertions(+), 280 deletions(-) diff --git a/R/check_madc_sanity.R b/R/check_madc_sanity.R index 1455606..301cd74 100644 --- a/R/check_madc_sanity.R +++ b/R/check_madc_sanity.R @@ -1,41 +1,80 @@ #' Run basic sanity checks on a MADC-style allele report #' #' @description -#' Performs five quick validations on an allele report: -#' 1) **Columns** – required columns are present (`CloneID`, `AlleleID`, `AlleleSequence`); -#' 2) **FixAlleleIDs** – first column’s first up-to-6 rows are not all blank or "*" +#' Performs eight quick validations on an allele report: +#' 1) **Columns** - required columns are present (`CloneID`, `AlleleID`, `AlleleSequence`); +#' 2) **FixAlleleIDs** - first column's first up-to-6 rows are not all blank or `"*"` #' *and* both `_0001` and `_0002` appear in `AlleleID`; -#' 3) **IUPACcodes** – presence of non-ATCG characters in `AlleleSequence`; -#' 4) **LowerCase** – presence of lowercase a/t/c/g in `AlleleSequence`; -#' 5) **Indels** – reference/alternate allele lengths differ for the same `CloneID`. +#' 3) **IUPACcodes** - presence of non-ATCG characters in `AlleleSequence`; +#' 4) **LowerCase** - presence of lowercase a/t/c/g in `AlleleSequence`; +#' 5) **Indels** - reference/alternate allele lengths differ for the same `CloneID`, +#' or a `"-"` character is present in `AlleleSequence`; +#' 6) **ChromPos** - all `CloneID` values follow the `Chr_Position` format +#' (prefix matches `"chr"` case-insensitively, suffix is a positive integer); +#' 7) **allNAcol** - at least one column contains only `NA` values; +#' 8) **allNArow** - at least one row contains only `NA` values. #' #' @param report A `data.frame` with at least the columns #' `CloneID`, `AlleleID`, and `AlleleSequence`. The first column is also -#' used in the “FixAlleleIDs” check to inspect its first up to six entries. +#' used in the `FixAlleleIDs` check to inspect its first up to six entries. +#' If `FixAlleleIDs` is `FALSE` (raw DArT format), the first 7 rows are +#' treated as header filler and skipped before further checks are run. #' #' @details -#' - **IUPAC check:** Flags any character outside `ATCG` (case-insensitive), -#' which will include ambiguity codes (`N`, `R`, `Y`, etc.) and symbols like `-`. -#' - **Indels:** Rows are split by `AlleleID` containing `"Ref_0001"` vs `"Alt_0002"`, -#' merged by `CloneID`, and the lengths of `AlleleSequence` are compared. -#' - If required columns are missing, only **Columns** is evaluated (`FALSE`) and +#' - **FixAlleleIDs:** When the check fails (raw DArT format), row 7 is used as +#' the column header and the first 7 rows are dropped before subsequent checks. +#' - **IUPAC check:** Flags any character outside `A`, `T`, `C`, `G` and `"-"` +#' (case-insensitive), which includes ambiguity codes (`N`, `R`, `Y`, etc.). +#' - **Indels:** Rows are split by `AlleleID` containing `"Ref_0001"` vs +#' `"Alt_0002"`, merged by `CloneID`, and the lengths of `AlleleSequence` +#' are compared. A `"-"` anywhere in `AlleleSequence` is also treated as +#' evidence of an indel. +#' - **ChromPos:** Each `CloneID` is split on `"_"` into exactly two parts; the +#' first part must match `"Chr"` (case-insensitive) and the second must be a +#' positive integer. Returns `FALSE` when any `CloneID` is `NA`. +#' - **allNAcol / allNArow:** Detected via `apply()` over columns/rows +#' respectively; useful for flagging empty or corrupt entries. +#' - If required columns are missing (`Columns = FALSE`), only `Columns` and +#' `FixAlleleIDs` are evaluated; all other checks remain `NA` and #' `indel_clone_ids` is returned as `NULL`. #' -#' @return A list with: +#' @return A named list with three elements: #' \describe{ -#' \item{checks}{Named logical vector with entries -#' `Columns`, `FixAlleleIDs`, `IUPACcodes`, `LowerCase`, `Indels`.} -#' \item{indel_clone_ids}{Character vector of `CloneID`s where ref/alt lengths differ. -#' Returns `character(0)` if none, or `NULL` when required columns are missing.} +#' \item{checks}{Named logical vector with eight entries: +#' `Columns`, `FixAlleleIDs`, `IUPACcodes`, `LowerCase`, `Indels`, +#' `ChromPos`, `allNAcol`, `allNArow`. +#' `TRUE` means the condition was detected (or passed for `Columns` and +#' `FixAlleleIDs`); `NA` means the check was skipped.} +#' \item{messages}{Named list of length-2 character vectors, one per check. +#' Element `[[1]]` is the message when the check is `TRUE`, element `[[2]]` +#' when it is `FALSE`. Indexed by the same names as `checks`.} +#' \item{indel_clone_ids}{Character vector of `CloneID`s where ref/alt +#' lengths differ. Returns `character(0)` if none are found, or `NULL` +#' when required columns are missing.} #' } #' -#' #' @export check_madc_sanity <- function(report) { # Initialize - checks <- c(Columns = NA, FixAlleleIDs = NA, IUPACcodes = NA, LowerCase = NA, Indels = NA, ChromPos = NA) - messages <- list(Columns = NA, FixAlleleIDs = NA, IUPACcodes = NA, LowerCase = NA, Indels = NA, ChromPos = NA) + checks <- c(Columns = NA, FixAlleleIDs = NA, IUPACcodes = NA, LowerCase = NA, Indels = NA, ChromPos = NA, allNAcol = NA, allNArow = NA) + messages <- list(Columns = NA, FixAlleleIDs = NA, IUPACcodes = NA, LowerCase = NA, Indels = NA, ChromPos = NA, allNAcol = NA, allNArow = NA) + + # ---- FixAlleleIDs ---- + # Check if first up-to-6 entries in the *first column* are all "" or "*" + n <- nrow(report) + idx <- seq_len(min(6L, n)) + first_col_vals <- report[[1]][idx] + all_blank_or_star <- all(first_col_vals %in% c("", "*"), na.rm = TRUE) + # Also require that both _0001 and _0002 appear in AlleleID + has_0001 <- any(grepl("_0001", report$AlleleID, fixed = TRUE), na.rm = TRUE) + has_0002 <- any(grepl("_0002", report$AlleleID, fixed = TRUE), na.rm = TRUE) + checks["FixAlleleIDs"] <- (!all_blank_or_star) & has_0001 & has_0002 + + if(!checks["FixAlleleIDs"]){ + colnames(report) <- report[7,] + report <- report[-c(1:7),] + } # Validate required columns required <- c("CloneID", "AlleleID", "AlleleSequence") @@ -43,19 +82,8 @@ check_madc_sanity <- function(report) { checks["Columns"] <- length(missing_cols) == 0 if(checks[["Columns"]]){ - # ---- FixAlleleIDs ---- - # Check if first up-to-6 entries in the *first column* are all "" or "*" - n <- nrow(report) - idx <- seq_len(min(6L, n)) - first_col_vals <- report[[1]][idx] - all_blank_or_star <- all(first_col_vals %in% c("", "*"), na.rm = TRUE) - # Also require that both _0001 and _0002 appear in AlleleID - has_0001 <- any(grepl("_0001", report$AlleleID, fixed = TRUE), na.rm = TRUE) - has_0002 <- any(grepl("_0002", report$AlleleID, fixed = TRUE), na.rm = TRUE) - checks["FixAlleleIDs"] <- (!all_blank_or_star) & has_0001 & has_0002 - # ---- IUPACcodes ---- - iu <- grepl("[^ATCG]", report$AlleleSequence, ignore.case = TRUE) + iu <- grepl("[^ATCG-]", report$AlleleSequence, ignore.case = TRUE) checks["IUPACcodes"] <- any(iu, na.rm = TRUE) # ---- LowerCase ---- @@ -75,31 +103,126 @@ check_madc_sanity <- function(report) { alt_len <- nchar(merged$AlleleSequence_alt, keepNA = TRUE) cmp_ok <- !is.na(ref_len) & !is.na(alt_len) indel_mask <- cmp_ok & (ref_len != alt_len) - checks["Indels"] <- any(indel_mask) + checks["Indels"] <- any(indel_mask) | any(grepl("-", report$AlleleSequence)) indels <- if (any(indel_mask)) merged$CloneID[indel_mask] else character(0) } else { checks["Indels"] <- FALSE indels <- character(0) } + # --- All NA ---- + checks["allNArow"] <- any(apply(report, 1, function(x) all(is.na(x)))) + checks["allNAcol"] <- any(apply(report, 2, function(x) all(is.na(x)))) + # ---- Chrom Pos ---- - pos <- strsplit(report[,2], "_") - checks["ChromPos"] <- all(sapply(pos, length) == 2) + if(!any(is.na(report$CloneID))) { + pos <- strsplit(report$CloneID, "_") + format <- all(sapply(pos, length) == 2) + first <- all(grepl("Chr", sapply(pos, "[", 1), ignore.case = TRUE)) + second <- suppressWarnings(all(sapply(pos, function(x) as.numeric(x[2])) > 0)) + checks["ChromPos"] <- all(format, first, second) + } else checks["ChromPos"] <- FALSE } else indels <- NULL messages[["Columns"]] <- c("Required columns are present", - "One or more required columns missing. Verify if your file has columns: CloneID, AlleleID, AlleleSequence") + "One or more required columns missing. Verify if your file has columns: CloneID, AlleleID, AlleleSequence") messages[["FixAlleleIDs"]] <- c("Fixed Allele IDs look good", - "MADC not processed by BI. Please contact us to assign allele IDs to your MADC according to the specie haplotype dabatase. This guarantee reproducibility between diferent datasets") + "MADC not processed by HapApp. Please, run the MADC through HapApp to fix Allele IDs before using it in BIGr/BIGapp.") messages[["IUPACcodes"]] <- c("IUPAC (non-ATCG) codes found in AlleleSequence. This codes are not currently supported", - "No IUPAC (non-ATCG) codes found in AlleleSequence") + "No IUPAC (non-ATCG) codes found in AlleleSequence") messages[["LowerCase"]] <- c("Lowercase bases found in AlleleSequence", - "No lowercase bases found in AlleleSequence") + "No lowercase bases found in AlleleSequence") messages[["Indels"]] <- c(paste("Indels found (ref/alt lengths differ) for the CloneIDs:",paste(indels, collapse = " ")), - "No indels found (ref/alt lengths match) for all CloneIDs") + "No indels found (ref/alt lengths match) for all CloneIDs") messages[["ChromPos"]] <- c("Chromosome and Position format in CloneID look good", - "CloneID does not have the expected Chromosome_Position format. Please check your CloneIDs or provide a file with this information") + "CloneID does not have the expected Chromosome_Position format. Please check your CloneIDs or provide a file with this information") + messages[["allNArow"]] <- c("One or more rows contain all NA values.", + "No rows with all NA values") + messages[["allNAcol"]] <- c("One or more columns contain all NA values.", + "No columns with all NA values") list(checks = checks, messages = messages, indel_clone_ids = indels) } + +#' Check and Adjust Botloci and MADC Marker Compatibility +#' +#' This internal function checks the compatibility between botloci and MADC markers. It ensures that the marker IDs in the botloci file match those in the MADC file. If discrepancies are found, such as mismatched padding, the function attempts to adjust the IDs to ensure compatibility. +#' +#' @param botloci A data frame containing the botloci markers. +#' @param report A data frame containing the MADC markers. +#' @param ChromPos logical value indicating whether the CloneID in the MADC file contains chromosome and position information in the format "Chr_Pos". Default is TRUE +#' @param mi_df A data frame containing marker information with columns CloneID, Chr, and Pos. Required if `ChromPos` is FALSE. +#' @param verbose A logical value indicating whether to print detailed messages about the adjustments. Default is TRUE. Required if `ChromPos` is FALSE. +#' +#' @return A list containing the adjusted botloci and MADC data frames. +#' +#' @details +#' The function checks if the marker IDs in the botloci file are present in the MADC file. If no matches are found, it examines the padding (number of digits) in the marker IDs and adjusts them to match the longest padding. If the IDs still do not match after adjustment, an error is raised. This function is intended for internal use and helps ensure that the botloci and MADC files are compatible for downstream analysis. +#' +#' @keywords internal +#' @noRd +check_botloci <- function(botloci, report, ChromPos=TRUE, mi_df = NULL, verbose=TRUE){ + + # Check inputs + if(!ChromPos) { + if(is.null(mi_df)) stop("When MADC CloneID don't follow the format Chr_Pos, a marker_info file with CloneID, Chr and Pos columns must be provided.") + # if exists, it must contain CloneID or BI_markerID that matches the report$CloneID, and Chr and Pos columns + if(!any(mi_df$CloneID %in% report$CloneID) & !any(mi_df$BI_markerID %in% report$CloneID)) { + stop("None of the MADC CloneID could be found in the markers_info CloneID or BI_markerID. Please make sure they match.") + } else { + use_col <- if(any(mi_df$CloneID %in% report$CloneID)) "CloneID" else "BI_markerID" + vmsg(paste("Using", use_col, "column in marker_info to match MADC CloneID"), verbose = verbose, level = 1, type = ">>") + } + if(is.null(mi_df$Chr) | is.null(mi_df$Pos)) stop("When MADC CloneID don't follow the format Chr_Pos, Chr and Pos columns must be provided in the markers_info file.") + } + + if(!any(botloci$V1 %in% report$CloneID)) { # First check if any botloci markers are found in MADC file. If not, check for padding mismatch. + vmsg("No botloci markers found in MADC file. Checking for padding mismatch...", verbose = verbose, level = 1, type = ">>") + + pad_madc <- unique(nchar(sub(".*_", "", report$CloneID))) + pad_botloci <- unique(nchar(sub(".*_", "", botloci$V1))) + + if(length(pad_madc) > 1 | length(pad_botloci) > 1) stop("Check marker IDs in both MADC and botloci files. They should be the same.") + + if(pad_madc != pad_botloci) { + vmsg("Padding between MADC and botloci files do not match. Markers ID modified to match longest padding.", verbose = verbose, level = 1, type = ">>") + if (pad_madc < pad_botloci) { + report$CloneID <- paste0(sub("_(.*)", "", report$CloneID), "_", + sprintf(paste0("%0", pad_botloci, "d"), as.integer(sub(".*_", "", report$CloneID))) + ) + report$AlleleID <- paste0(report$CloneID, "|", sapply(strsplit(report$AlleleID, "[|]"), "[[",2)) + } else { + botloci$V1 <- paste0(sub("_(.*)", "", botloci$V1), "_", + sprintf(paste0("%0", pad_madc, "d"), as.integer(sub(".*_", "", botloci$V1))) + ) + if(!any(botloci$V1 %in% report$CloneID)) stop("After matching padding, botloci markers still not found in MADC file. Check marker IDs.\n") + } + } else if (!(is.null(mi_df$Chr) | is.null(mi_df$Pos))){ + vmsg("It is not a padding mismatch issue.", verbose = verbose, level = 1, type = ">>") + vmsg("Checking if jointing provided Chromosome and Position information in marker_file solve the issue", verbose = verbose, level = 1, type = ">>") + if(!any(mi_df$CloneID %in% report$CloneID) & !any(mi_df$BI_markerID %in% report$CloneID)) { + stop("None of the MADC CloneID could be found in the markers_info CloneID or BI_markerID. Please make sure they match.") + } else { + use_col <- if(any(mi_df$CloneID %in% report$CloneID)) "CloneID" else "BI_markerID" + vmsg(paste("Using", use_col, "column in marker_info to match MADC CloneID"), verbose = verbose, level = 1, type = ">>") + } + mk_info_CloneID <- paste0(mi_df$Chr, "_", sprintf(paste0("%0",pad_botloci, "d"), as.integer(mi_df$Pos))) + + if(!any(botloci$V1 %in% mk_info_CloneID)){ + vmsg("It is not a padding mismatch issue.", verbose = verbose, level = 1, type = ">>") + vmsg("Chromosome and Position information in marker_file don't solve the issue.", verbose = verbose, level = 1, type = ">>") + stop("Check marker IDs in both MADC and botloci files. They should be the same.") + } else { + vmsg("Chromosome and Position information in marker_file solve the issue.", verbose = verbose, level = 1, type = ">>") + vmsg("Using this information to modify MADC CloneIDs to match botloci markers.", verbose = verbose, level = 1, type = ">>") + report$CloneID <- mk_info_CloneID[match(report$CloneID, mi_df[[use_col]])] + } + } else { + vmsg("It is not a padding mismatch issue.", verbose = verbose, level = 1, type = ">>") + vmsg("Chromosome and Position information in marker_file not provided.", verbose = verbose, level = 1, type = ">>") + stop("Check marker IDs in both MADC and botloci files. They should be the same.") + } + } + return(list(botloci, report)) +} diff --git a/R/get_countsMADC.R b/R/get_countsMADC.R index 2123be1..8cce70b 100644 --- a/R/get_countsMADC.R +++ b/R/get_countsMADC.R @@ -69,6 +69,8 @@ get_countsMADC <- function(madc_file = NULL, madc_object = NULL, collapse_matche } else { update_df <- get_counts(madc_object = madc_object, collapse_matches_counts = collapse_matches_counts, verbose = verbose) } + # Ensure plain data.frame so row.names<- does not trigger tibble deprecation warning + update_df <- as.data.frame(update_df) # Filter rows where 'AlleleID' ends with 'Ref' ref_df <- subset(update_df, grepl("Ref$", AlleleID)) diff --git a/R/madc2vcf_targets.R b/R/madc2vcf_targets.R index d595ae5..c1c4c71 100644 --- a/R/madc2vcf_targets.R +++ b/R/madc2vcf_targets.R @@ -16,8 +16,7 @@ #' - If `get_REF_ALT = TRUE`, attempts to recover true REF/ALT bases by comparing #' the Ref/Alt probe sequences; targets with >1 polymorphism are discarded. #' - Optionally accepts a `markers_info` CSV to supply `CHROM`, `POS`, `REF`, `ALT` -#' (and `Type`, `Indel_pos` when indels are present), bypassing sequence-based -#' inference. +#' bypassing sequence-based inference. #' #' **Output VCF layout** #' - `INFO` fields: @@ -40,19 +39,20 @@ #' #' @param madc_file character. Path to the input MADC CSV file. #' @param output.file character. Path to the output VCF file to write. -#' @param botloci_file character. Path to a plain-text file listing target IDs -#' designed on the **bottom** strand (one ID per line). Required when -#' `get_REF_ALT = TRUE` and `markers_info` is not provided. +#' @param botloci_file character or `NULL` (default `NULL`). Path to a plain-text +#' file listing target IDs designed on the **bottom** strand (one ID per line). +#' Required only when `get_REF_ALT = TRUE` and `markers_info` does not supply +#' `Ref` and `Alt` columns. #' @param markers_info character or `NULL`. Optional path to a CSV providing target -#' metadata. Required columns: `CloneID, Chr, Pos, Ref, Alt`. This file is required in -#' case your MADC CloneID column doesn't have the format CHR_POS. If indels are -#' present, columns `Type, Indel_pos` are also required. +#' metadata. Minimum required columns: `CloneID` (or `BI_markerID`), `Chr`, `Pos`. +#' When `get_REF_ALT = TRUE`, also requires `Ref` and `Alt` (replaces probe-sequence +#' inference). `Type` and `Indel_pos` are never required by this function. #' @param get_REF_ALT logical (default `FALSE`). If `TRUE`, attempts to infer REF/ALT #' bases from the Ref/Alt probe sequences in the MADC file (with strand correction #' using `botloci_file`). Targets with more than one difference between Ref/Alt #' sequences are removed. #' @param collapse_matches_counts logical (default `FALSE`). If `TRUE`, counts for targets with identical `CHROM_POS` are summed together. This is useful when the MADC file contains multiple rows per target (e.g., due to multiple alleles or technical replicates) and you want to aggregate them into a single entry per unique target. -#' @param verbose logical (default `FALSE`). If `TRUE`, prints detailed messages about +#' @param verbose logical (default `TRUE`). If `TRUE`, prints detailed progress messages about each processing step. #' #' @return (Invisibly) returns the path to `output.file`. The side effect is a #' **VCF v4.3** written to disk containing one row per target and columns for all @@ -96,7 +96,7 @@ #' @export madc2vcf_targets <- function(madc_file, output.file, - botloci_file, + botloci_file = NULL, markers_info = NULL, get_REF_ALT = FALSE, collapse_matches_counts = FALSE, @@ -105,19 +105,17 @@ madc2vcf_targets <- function(madc_file, vmsg("Checking inputs", verbose = verbose, level = 0, type = ">>") # Input checks - if(!file.exists(madc_file)) stop("The MADC file does not exist.") + if(!(file.exists(madc_file) | url_exists(madc_file))) stop("The MADC file does not exist.") if(!is.character(output.file)) stop("output.file must be a character string.") - if(get_REF_ALT && is.null(botloci_file)) stop("Please provide the botloci file to recover the reference and alternative bases.") - if(get_REF_ALT && !file.exists(botloci_file)) stop("The botloci file does not exist.") - if(!is.null(markers_info) && !file.exists(markers_info)) stop("The markers_info file does not exist.") if(!is.null(markers_info) && !is.character(markers_info)) stop("markers_info must be a character string or NULL.") + if(!is.null(markers_info) && !file.exists(markers_info) && !url_exists(markers_info)) stop("The markers_info file does not exist.") if(!is.logical(get_REF_ALT)) stop("get_REF_ALT must be a logical value (TRUE or FALSE).") if(!is.logical(verbose)) stop("verbose must be a logical value (TRUE or FALSE).") # Create a VCF header line with metadata about the command and its parameters bigr_meta <- paste0('##BIGrCommandLine.madc2vcf_targets= 0) stop(paste("The MADC file does not pass the sanity checks:\n", paste(messages_results[c("Columns", "FixAlleleIDs")[idx]], collapse = "\n"))) } - if(any(checks$checks[c("IUPACcodes", "LowerCase", "Indels")])){ - idx <- which((checks$checks[c("IUPACcodes", "LowerCase", "Indels")])) - if(is.null(markers_info)) stop("Please provide a markers_info file to proceed. The MADC file does not pass the sanity checks:\n", - paste(messages_results[c("IUPACcodes", "LowerCase", "Indels")[idx]], collapse = "\n")) - else vmsg("MADC file has some issues (IUPAC codes, lowercase bases, indels), but a markers_info file is provided, so proceeding with VCF generation.", verbose = verbose, level = 1, type = ">>") + if(any(checks$checks[c("IUPACcodes", "Indels")]) && get_REF_ALT){ + idx <- which((checks$checks[c("IUPACcodes", "Indels")])) + if(is.null(markers_info)) stop(paste("Please provide a markers_info file to proceed. The MADC file does not pass the sanity checks:", + paste(messages_results[c("IUPACcodes", "Indels")[idx]], collapse = "\n"))) + else vmsg("MADC file has some issues (IUPAC codes, indels), but a markers_info file is provided, so proceeding with VCF generation.", verbose = verbose, level = 1, type = ">>") } - # Check marker names compatibility between MADC and botloci - if(!is.null(botloci_file)){ - botloci <- read.csv(botloci_file, header = F) - checked_botloci <- check_botloci(botloci, report) - botloci <- checked_botloci[[1]] - report <- checked_botloci[[2]] + if(checks$checks["LowerCase"]){ + vmsg("MADC Allele Sequences presented lower case characters. They were converted to upper case.", verbose = verbose, level = 1) + report$AlleleSequence <- toupper(report$AlleleSequence) } - vmsg("Input checks done", verbose = verbose, level = 1, type = ">>") + # ---- Validate botloci and pre-process CloneIDs based on get_REF_ALT logic ---- + mi_df <- NULL # markers_info data frame (loaded once, reused below) + mi_has_ref_alt <- FALSE # TRUE when markers_info provides Ref and Alt columns + botloci <- NULL # botloci data frame (set when needed) + + # Check whether markers_info is present and contains Ref + Alt columns + if(!is.null(markers_info)) { + mi_df <- read.csv(markers_info) + mi_has_ref_alt <- all(c("Ref", "Alt") %in% colnames(mi_df)) + } + + if(!isTRUE(checks$checks["ChromPos"])) { + if(is.null(markers_info)){ + stop("CloneID column does not follow the 'Chr_Pos'. ", + "Please provide a markers_info file with at least 'CloneID'/'BI_markerID', ", + "'Chr', and 'Pos' columns.") + } else { + + if(!all(c("Chr", "Pos") %in% colnames(mi_df))) + stop("CloneID column does not follow the 'Chr_Pos' format. ", + "markers_info must contain at least 'Chr' and 'Pos' columns to remap marker IDs.") + + } + } + + if(get_REF_ALT) { + + if(mi_has_ref_alt) { + # markers_info supplies REF and ALT — no botloci required + vmsg("markers_info contains Ref and Alt columns. REF and ALT will be taken from markers_info.", + verbose = verbose, level = 1, type = ">>") + + } else { + if(checks$checks["Indels"]) + stop("Indels detected in MADC file. Since get_REF_ALT = TRUE, a markers_info file with REF/ALT information is required.") + + # REF/ALT must be extracted from probe sequences — botloci is required + if(is.null(botloci_file) || (!file.exists(botloci_file) && !url_exists(botloci_file))) + stop("get_REF_ALT = TRUE but no markers_info file with Ref and Alt columns was provided neither a botloci_file to extrat REF/ALT from probe sequences. Please provide one of the these files or set get_REF_ALT to FALSE.") + + # Validate that CloneIDs match botloci marker names (after any remapping) + botloci <- read.table(botloci_file, header = FALSE) + checked_botloci <- check_botloci(botloci, report, ChromPos = checks$checks["ChromPos"], mi_df = mi_df, verbose = verbose) + botloci <- checked_botloci[[1]] + report <- checked_botloci[[2]] + + } + } + + vmsg("Input checks done!", verbose = verbose, level = 1, type = ">>") vmsg("Extracting depth information", verbose = verbose, level = 0, type = ">>") @@ -176,79 +221,133 @@ madc2vcf_targets <- function(madc_file, vmsg("Depth information extracted", verbose = verbose, level = 1, type = ">>") - #Obtaining Chr and Pos information from the row_names - if(is.null(markers_info)){ - vmsg("No markers_info file provided. Attempting to recover CHROM and POS from CloneID...", verbose = verbose, level = 0, type = ">>") - new_df <- size_df %>% - rownames_to_column(var = "row_name") %>% - separate(row_name, into = c("CHROM", "POS"), sep = "_") %>% - select(CHROM, POS) + if(get_REF_ALT && mi_has_ref_alt) { + vmsg("Using markers_info for CHROM, POS, REF and ALT.", verbose = verbose, level = 0, type = ">>") - # Remove leading zeros from the POS column - new_df$POS <- sub("^0+", "", new_df$POS) + if(is.null(mi_df)) mi_df <- read.csv(markers_info) + id_col <- if ("BI_markerID" %in% colnames(mi_df)) "BI_markerID" else + if ("CloneID" %in% colnames(mi_df)) "CloneID" else + stop("The markers_info file must contain a marker ID column named either 'CloneID' or 'BI_markerID'.") - #Get read count sums - new_df$TotalRef <- rowSums(ref_df) - new_df$TotalAlt <- rowSums(alt_df) + if(checks$checks["Indels"]) + vmsg("Indels detected in MADC file. Since Ref and Alt are provided in markers_info, Type and Indel_pos are not required.", + verbose = verbose, level = 1, type = ">>") + + if(!all(c(id_col, "Chr", "Pos", "Ref", "Alt") %in% colnames(mi_df))) + stop(paste0("The markers_info dataframe must contain the following columns: ", + id_col, ", Chr, Pos, Ref, Alt")) + + if(!all(rownames(ad_df) %in% mi_df[[id_col]])) { + miss_CloneIDs <- rownames(ad_df)[!rownames(ad_df) %in% mi_df[[id_col]]] + if(length(miss_CloneIDs) == nrow(ad_df)) stop("None of the MADC CloneID could be found in the markers_info CloneID or BI_markerID. Please make sure they match.") + vmsg(paste("Not all MADC CloneID was found in the markers_info file. These markers will be removed:", + paste(miss_CloneIDs, collapse = " ")), verbose = verbose, level = 2, type = ">>") + warning("Not all MADC CloneID was found in the markers_info file. These markers will be removed.") + } + matched <- mi_df[match(rownames(ad_df), mi_df[[id_col]]), ] + + new_df <- data.frame(CHROM = matched$Chr, POS = matched$Pos) + new_df$TotalRef <- rowSums(ref_df) + new_df$TotalAlt <- rowSums(alt_df) new_df$TotalSize <- rowSums(size_df) - vmsg("CHROM and POS recovered from CloneID", verbose = verbose, level = 1, type = ">>") - # Get REF and ALT - if(get_REF_ALT){ - vmsg("get_REF_ALT = TRUE. Attempting to recover REF and ALT bases from probe sequences...", verbose = verbose, level = 0, type = ">>") - csv <- get_counts(madc_object = report, collapse_matches_counts = collapse_matches_counts, verbose = FALSE) - # Keep only the ones that have alt and ref - csv <- csv[which(csv$CloneID %in% rownames(ad_df)),] + ref_base <- matched$Ref + alt_base <- matched$Alt - # Get reverse complement the tag is present in botloci - botloci <- read.table(botloci_file, header = FALSE) + } else if(!is.null(markers_info) && !get_REF_ALT) { + vmsg("markers_info file provided. Using CHROM and POS from the file.", verbose = verbose, level = 0, type = ">>") - # Check if the botloci file marker IDs match with the MADC file - checked_botloci <- check_botloci(botloci, csv) - botloci <- checked_botloci[[1]] - csv <- checked_botloci[[2]] + if(is.null(mi_df)) mi_df <- read.csv(markers_info) + id_col <- if ("BI_markerID" %in% colnames(mi_df)) "BI_markerID" else + if ("CloneID" %in% colnames(mi_df)) "CloneID" else + stop("The markers_info file must contain a marker ID column named either 'CloneID' or 'BI_markerID'.") - # FIXED: Store original sequences before any transformation - csv$OriginalAlleleSequence <- csv$AlleleSequence + if(checks$checks["Indels"]) + vmsg("Indels detected in MADC file. Since get_REF_ALT = FALSE, Type and Indel_pos are not required in markers_info.", + verbose = verbose, level = 1, type = ">>") - # Apply reverse complement to sequences for bottom strand markers - idx <- which(csv$CloneID %in% botloci[,1]) - csv$AlleleSequence[idx] <- sapply(csv$AlleleSequence[idx], function(sequence) as.character(reverseComplement(DNAString(sequence)))) + if(!all(c(id_col, "Chr", "Pos") %in% colnames(mi_df))) + stop(paste0("The markers_info dataframe must contain the following columns: ", id_col, ", Chr, Pos")) - ref_seq <- csv$AlleleSequence[grep("\\|Ref.*", csv$AlleleID)] - ref_ord <- csv$CloneID[grep("\\|Ref.*", csv$AlleleID)] - alt_seq <- csv$AlleleSequence[grep("\\|Alt.*", csv$AlleleID)] - alt_ord <- csv$CloneID[grep("\\|Alt.*", csv$AlleleID)] + if(!all(rownames(ad_df) %in% mi_df[[id_col]])) { + miss_CloneIDs <- rownames(ad_df)[!rownames(ad_df) %in% mi_df[[id_col]]] + vmsg(paste("Not all MADC CloneID was found in the markers_info file. These markers will be removed:", + paste(miss_CloneIDs, collapse = " ")), verbose = verbose, level = 2, type = ">>") + warning("Not all MADC CloneID was found in the markers_info file. These markers will be removed.") + } + matched <- mi_df[match(rownames(ad_df), mi_df[[id_col]]), ] - # FIXED: Get original sequences for SNP calling - orig_ref_seq <- csv$OriginalAlleleSequence[grep("\\|Ref.*", csv$AlleleID)] - orig_alt_seq <- csv$OriginalAlleleSequence[grep("\\|Alt.*", csv$AlleleID)] + new_df <- data.frame(CHROM = matched$Chr, POS = matched$Pos) + new_df$TotalRef <- rowSums(ref_df) + new_df$TotalAlt <- rowSums(alt_df) + new_df$TotalSize <- rowSums(size_df) - if(all(sort(ref_ord) == sort(alt_ord))){ - # Order sequences consistently - ref_seq <- ref_seq[order(ref_ord)] - alt_seq <- alt_seq[order(alt_ord)] - orig_ref_seq <- orig_ref_seq[order(ref_ord)] - orig_alt_seq <- orig_alt_seq[order(alt_ord)] - ordered_clone_ids <- sort(ref_ord) + ref_base <- "." + alt_base <- "." + vmsg("REF and ALT not recovered (get_REF_ALT = FALSE).", verbose = verbose, level = 1, type = ">>") + + } else { + vmsg(ifelse(get_REF_ALT, + "Recovering CHROM and POS from CloneID for probe-sequence REF/ALT extraction...", + "No markers_info file provided. Recovering CHROM and POS from CloneID..."), + verbose = verbose, level = 0, type = ">>") + + # Split on the last underscore to handle chromosome names containing underscores + # (e.g. Chr_01_000123456). When ChromPos was FALSE, check_botloci already + # remapped CloneIDs to Chr_PaddedPos, so this split is always valid. + new_df <- size_df %>% + rownames_to_column(var = "row_name") %>% + separate(row_name, into = c("CHROM", "POS"), sep = "_(?=[^_]*$)") %>% + select(CHROM, POS) + new_df$POS <- sub("^0+", "", new_df$POS) + vmsg("CHROM and POS recovered from CloneID.", verbose = verbose, level = 1, type = ">>") + + new_df$TotalRef <- rowSums(ref_df) + new_df$TotalAlt <- rowSums(alt_df) + new_df$TotalSize <- rowSums(size_df) + + if(get_REF_ALT) { + vmsg("get_REF_ALT = TRUE. Attempting to recover REF and ALT bases from probe sequences...", + verbose = verbose, level = 0, type = ">>") + + csv <- get_counts(madc_object = report, collapse_matches_counts = collapse_matches_counts, verbose = FALSE) + csv <- csv[which(csv$CloneID %in% rownames(ad_df)), ] + + ref_ord <- csv$CloneID[grep("\\|Ref.*", csv$AlleleID)] + alt_ord <- csv$CloneID[grep("\\|Alt.*", csv$AlleleID)] + orig_ref_seq <- csv$AlleleSequence[grep("\\|Ref.*", csv$AlleleID)] + orig_alt_seq <- csv$AlleleSequence[grep("\\|Alt.*", csv$AlleleID)] + + if(all(sort(ref_ord) == sort(alt_ord))) { + # Key sequences by CloneID, then reorder to MADC row order so that + # loop index i always corresponds to rownames(size_df)[i]. + ref_seq_by_id <- setNames(orig_ref_seq, ref_ord) + alt_seq_by_id <- setNames(orig_alt_seq, alt_ord) + madc_ids <- rownames(size_df) + orig_ref_seq <- ref_seq_by_id[madc_ids] + orig_alt_seq <- alt_seq_by_id[madc_ids] more_poly <- no_diff <- 0 - ref_base <- alt_base <- vector() - for(i in seq_along(orig_ref_seq)){ - # FIXED: Use original sequences for SNP calling + ref_base <- alt_base <- rep(NA_character_, length(madc_ids)) + names(ref_base) <- names(alt_base) <- madc_ids + for(i in seq_along(madc_ids)) { + if(is.na(orig_ref_seq[i]) || is.na(orig_alt_seq[i])) next temp_list <- strsplit(c(orig_ref_seq[i], orig_alt_seq[i]), "") - idx_diff <- which(temp_list[[1]] != temp_list[[2]]) - - if(length(idx_diff) > 1) { # If finds more than one polymorphism between Ref and Alt sequences + if(length(temp_list[[1]]) != length(temp_list[[2]])) + stop(paste0("Marker '", madc_ids[i], "' has Ref and Alt probe sequences of different lengths ", + "(", length(temp_list[[1]]), " vs ", length(temp_list[[2]]), "). ", + "This should not happen for SNP markers. ", + "If this is an indel, please provide a markers_info file with Ref and Alt columns.")) + idx_diff <- which(temp_list[[1]] != temp_list[[2]]) + + if(length(idx_diff) > 1) { ref_base[i] <- NA alt_base[i] <- NA more_poly <- more_poly + 1 } else if(length(idx_diff) == 1) { orig_ref_base <- temp_list[[1]][idx_diff] orig_alt_base <- temp_list[[2]][idx_diff] - - # FIXED: Apply reverse complement to bases only if marker is in botloci - if(ordered_clone_ids[i] %in% botloci[,1]) { + if(madc_ids[i] %in% botloci[, 1]) { ref_base[i] <- as.character(reverseComplement(DNAString(orig_ref_base))) alt_base[i] <- as.character(reverseComplement(DNAString(orig_alt_base))) } else { @@ -256,66 +355,27 @@ madc2vcf_targets <- function(madc_file, alt_base[i] <- orig_alt_base } } else { - # No differences found ref_base[i] <- NA alt_base[i] <- NA no_diff <- no_diff + 1 } } - if(more_poly > 0) vmsg(paste(more_poly, "markers removed because more than one polymorphism was found between Ref and Alt sequences"), verbose = verbose, level = 2, type = ">>") - if(no_diff > 0) vmsg(paste(no_diff, "markers removed because no differences were found between Ref and Alt sequences"), verbose = verbose, level = 2, type = ">>") + if(more_poly > 0) vmsg(paste(more_poly, "markers removed: more than one polymorphism between Ref and Alt sequences."), verbose = verbose, level = 2, type = ">>") + if(no_diff > 0) vmsg(paste(no_diff, "markers removed: no differences found between Ref and Alt sequences."), verbose = verbose, level = 2, type = ">>") } else { ref_base <- "." alt_base <- "." - vmsg(paste("REF and ALT bases could not be recovered because of missing reference or alternative sequences"), verbose = verbose, level = 1, type = ">>") + vmsg("REF and ALT bases could not be recovered: missing reference or alternative sequences.", + verbose = verbose, level = 1, type = ">>") } + } else { + # ── get_REF_ALT = FALSE, no markers_info ───────────────────────── ref_base <- "." alt_base <- "." - vmsg(paste("REF and ALT bases not recovered because get_REF_ALT = FALSE"), verbose = verbose, level = 1, type = ">>") - } - } else { - vmsg("markers_info file provided. Using CHROM, POS, REF and ALT from the file.", verbose = verbose, level = 0, type = ">>") - # Verify markers_info file - df <- read.csv(markers_info) - - # Accept either CloneID or BI_markerID as the marker ID column - if ("BI_markerID" %in% colnames(df)) { - id_col <- "BI_markerID" - } else if ("CloneID" %in% colnames(df)) { - id_col <- "CloneID" - } else { - stop("The markers_info file must contain a marker ID column named either 'CloneID' or 'BI_markerID'.") - } - - if(checks$checks["Indels"]){ - vmsg("Indels detected in MADC file. Checking for required columns in markers_info...", verbose = verbose, level = 1, type = ">>") - if(!all(c(id_col,"Chr","Pos","Ref","Alt","Type", "Indel_pos") %in% colnames(df))) - stop(paste0("The markers_info dataframe must contain the following columns: ", id_col, ", Chr, Pos, Ref, Alt, Type, Indel_pos")) + vmsg("REF and ALT not recovered (get_REF_ALT = FALSE).", verbose = verbose, level = 1, type = ">>") } - if(!all(c(id_col,"Chr","Pos","Ref","Alt") %in% colnames(df))) - stop(paste0("The markers_info dataframe must contain the following columns: ", id_col, ", Chr, Pos, Ref, Alt")) - - if(!all(rownames(ad_df) %in% df[[id_col]])){ - miss_CloneIDs <- rownames(ad_df)[!rownames(ad_df) %in% df[[id_col]]] - vmsg(paste("Not all MADC CloneID was found in the markers_info file. These markers will be removed:", paste(miss_CloneIDs, collapse = " ")), verbose = verbose, level = 2, type = ">>") - warning("Not all MADC CloneID was found in the markers_info file. These markers will be removed.") - } - matched <- df[match(rownames(ad_df), df[[id_col]]),] - - new_df <- data.frame( - CHROM = matched$Chr, - POS = matched$Pos - ) - - #Get read count sums - new_df$TotalRef <- rowSums(ref_df) - new_df$TotalAlt <- rowSums(alt_df) - new_df$TotalSize <- rowSums(size_df) - - ref_base <- matched$Ref - alt_base <- matched$Alt } vmsg("CHROM, POS, REF and ALT columns prepared", verbose = verbose, level = 1, type = ">>") @@ -356,38 +416,20 @@ madc2vcf_targets <- function(madc_file, vcf_df$FORMAT <- paste("DP","RA","AD",sep=":") #Combine info from the matrices to form the VCF information for each sample - # Combine the matrices into a single matrix with elements separated by ":" - make_vcf_format <- function(..., separator = ":") { - matrices <- list(...) - n <- length(matrices) - - # Convert matrices to long form - long_forms <- lapply(matrices, function(mat) { - suppressMessages(reshape2::melt(mat, varnames = c("Row", "Col"), value.name = "Value")) - }) - - # Concatenate the elements - combined_long <- long_forms[[1]] - combined_long$Combined <- combined_long$Value - - for (i in 2:n) { - combined_long$Combined <- paste(combined_long$Combined, long_forms[[i]]$Value, sep = separator) - } + m_size <- melt(as.matrix(size_df), varnames = c("Row", "Col"), value.name = "Value") + m_ref <- melt(as.matrix(ref_df), varnames = c("Row", "Col"), value.name = "Value") + m_ad <- melt(as.matrix(ad_df), varnames = c("Row", "Col"), value.name = "Value") - # Convert back to wide form - combined_wide <- suppressMessages(reshape2::dcast(combined_long, Row ~ Col, value.var = "Combined")) + combined_long <- m_size + combined_long$Combined <- paste(m_size$Value, m_ref$Value, m_ad$Value, sep = ":") - # Restore row and column names - rownames(combined_wide) <- combined_wide$Row - combined_wide$Row <- NULL - colnames(combined_wide) <- colnames(matrices[[1]]) - vmsg("Sample columns formatted for VCF", verbose = verbose, level = 1, type = ">>") + combined_wide <- suppressMessages(dcast(combined_long, Row ~ Col, value.var = "Combined")) + rownames(combined_wide) <- combined_wide$Row + combined_wide$Row <- NULL + colnames(combined_wide) <- colnames(size_df) - return(as.matrix(combined_wide)) - } - - # Combine the matrices - geno_df <- make_vcf_format(as.matrix(size_df), as.matrix(ref_df), as.matrix(ad_df)) + geno_df <- as.matrix(combined_wide) + vmsg("Sample columns formatted for VCF", verbose = verbose, level = 1, type = ">>") #Combine the dataframes together vcf_df <- cbind(vcf_df,geno_df) @@ -399,7 +441,15 @@ madc2vcf_targets <- function(madc_file, # Sort vcf_df <- vcf_df[order(vcf_df[,1],as.numeric(as.character(vcf_df[,2]))),] - if(sum(is.na(vcf_df$REF)) >1) { + # Remove markers with NA CHROM/POS (unmatched in markers_info, Case 3) + na_coord <- is.na(vcf_df[, 1]) | is.na(vcf_df$POS) + if(any(na_coord)) { + vmsg(paste(sum(na_coord), "markers removed: no matching entry found in markers_info."), verbose = verbose, level = 1, type = ">>") + warning(paste(sum(na_coord), "markers removed: no matching entry found in markers_info.")) + vcf_df <- vcf_df[!na_coord, ] + } + + if(sum(is.na(vcf_df$REF)) > 0) { vmsg(paste(sum(is.na(vcf_df$REF)), "markers removed because of presence of more than one polymorphism between ref and alt sequences."), verbose = verbose, level = 1, type = ">>") warning(paste("Markers removed because of presence of more than one polymorphism between ref and alt sequences:",sum(is.na(vcf_df$REF)))) vcf_df <- vcf_df[-which(is.na(vcf_df$REF)),] diff --git a/R/utils.R b/R/utils.R index 92fac46..a270570 100644 --- a/R/utils.R +++ b/R/utils.R @@ -26,50 +26,6 @@ convert_to_dosage <- function(gt) { }) } -#' Check and Adjust Botloci and MADC Marker Compatibility -#' -#' This internal function checks the compatibility between botloci and MADC markers. It ensures that the marker IDs in the botloci file match those in the MADC file. If discrepancies are found, such as mismatched padding, the function attempts to adjust the IDs to ensure compatibility. -#' -#' @param botloci A data frame containing the botloci markers. -#' @param report A data frame containing the MADC markers. -#' @param verbose A logical value indicating whether to print detailed messages about the adjustments. Default is TRUE. -#' -#' @return A list containing the adjusted botloci and MADC data frames. -#' -#' @details -#' The function checks if the marker IDs in the botloci file are present in the MADC file. If no matches are found, it examines the padding (number of digits) in the marker IDs and adjusts them to match the longest padding. If the IDs still do not match after adjustment, an error is raised. This function is intended for internal use and helps ensure that the botloci and MADC files are compatible for downstream analysis. -#' -#' @keywords internal -#' @noRd -check_botloci <- function(botloci, report, verbose=TRUE){ - if(!any(botloci$V1 %in% report$CloneID)) { - vmsg("No botloci markers found in MADC file. Checking for padding mismatch...", verbose = verbose, level = 1, type = ">>") - - pad_madc <- unique(nchar(sub(".*_", "", report$CloneID))) - pad_botloci <- unique(nchar(sub(".*_", "", botloci$V1))) - - if(length(pad_madc) > 1 | length(pad_botloci) > 1) stop("Check marker IDs in both MADC and botloci files. They should be the same.") - - if(pad_madc != pad_botloci) { - vmsg("Padding between MADC and botloci files do not match. Markers ID modified to match longest padding.", verbose = verbose, level = 1, type = ">>") - if (pad_madc < pad_botloci) { - report$CloneID <- paste0(sub("_(.*)", "", report$CloneID), "_", - sprintf(paste0("%0", pad_botloci, "d"), as.integer(sub(".*_", "", report$CloneID))) - ) - report$AlleleID <- paste0(report$CloneID, "|", sapply(strsplit(report$AlleleID, "[|]"), "[[",2)) - } else { - botloci$V1 <- paste0(sub("_(.*)", "", botloci$V1), "_", - sprintf(paste0("%0", pad_madc, "d"), as.integer(sub(".*_", "", botloci$V1))) - ) - if(!any(botloci$V1 %in% report$CloneID)) stop("After matching padding, botloci markers still not found in MADC file. Check marker IDs.\n") - } - } else { - stop("Check marker IDs in both MADC and botloci files. They should be the same.") - } - } - return(list(botloci, report)) -} - ##' Verbose Message Utility ##' ##' Prints a formatted verbose message with timestamp, indentation, and type label, if verbose is TRUE. @@ -112,3 +68,11 @@ vmsg <- function(text, verbose = FALSE, level = 1, type = ">>", ...) { formatted_msg <- sprintf("%s %s%s%s", timestamp, type_label, indent, msg_text) message(formatted_msg) } + +url_exists <- function(u) { + tryCatch({ + con <- url(u, open = "rb") + close(con) + TRUE + }, error = function(e) FALSE, warning = function(w) FALSE) +} diff --git a/man/madc2vcf_targets.Rd b/man/madc2vcf_targets.Rd index 5d62c87..1da55d8 100644 --- a/man/madc2vcf_targets.Rd +++ b/man/madc2vcf_targets.Rd @@ -7,11 +7,11 @@ madc2vcf_targets( madc_file, output.file, - botloci_file, + botloci_file = NULL, markers_info = NULL, get_REF_ALT = FALSE, collapse_matches_counts = FALSE, - verbose = FALSE + verbose = TRUE ) } \arguments{ @@ -19,14 +19,15 @@ madc2vcf_targets( \item{output.file}{character. Path to the output VCF file to write.} -\item{botloci_file}{character. Path to a plain-text file listing target IDs -designed on the \strong{bottom} strand (one ID per line). Required when -\code{get_REF_ALT = TRUE} and \code{markers_info} is not provided.} +\item{botloci_file}{character or \code{NULL} (default \code{NULL}). Path to a plain-text +file listing target IDs designed on the \strong{bottom} strand (one ID per line). +Required only when \code{get_REF_ALT = TRUE} and \code{markers_info} does not supply +\code{Ref} and \code{Alt} columns.} \item{markers_info}{character or \code{NULL}. Optional path to a CSV providing target -metadata. Required columns: \verb{CloneID, Chr, Pos, Ref, Alt}. This file is required in -case your MADC CloneID column doesn't have the format CHR_POS. If indels are -present, columns \verb{Type, Indel_pos} are also required.} +metadata. Minimum required columns: \code{CloneID} (or \code{BI_markerID}), \code{Chr}, \code{Pos}. +When \code{get_REF_ALT = TRUE}, also requires \code{Ref} and \code{Alt} (replaces probe-sequence +inference). \code{Type} and \code{Indel_pos} are never required by this function.} \item{get_REF_ALT}{logical (default \code{FALSE}). If \code{TRUE}, attempts to infer REF/ALT bases from the Ref/Alt probe sequences in the MADC file (with strand correction @@ -35,7 +36,7 @@ sequences are removed.} \item{collapse_matches_counts}{logical (default \code{FALSE}). If \code{TRUE}, counts for targets with identical \code{CHROM_POS} are summed together. This is useful when the MADC file contains multiple rows per target (e.g., due to multiple alleles or technical replicates) and you want to aggregate them into a single entry per unique target.} -\item{verbose}{logical (default \code{FALSE}). If \code{TRUE}, prints detailed messages about} +\item{verbose}{logical (default \code{TRUE}). If \code{TRUE}, prints detailed progress messages about each processing step.} } \value{ (Invisibly) returns the path to \code{output.file}. The side effect is a @@ -59,8 +60,7 @@ IUPAC/ambiguous bases, lowercase bases, indels). \item If \code{get_REF_ALT = TRUE}, attempts to recover true REF/ALT bases by comparing the Ref/Alt probe sequences; targets with >1 polymorphism are discarded. \item Optionally accepts a \code{markers_info} CSV to supply \code{CHROM}, \code{POS}, \code{REF}, \code{ALT} -(and \code{Type}, \code{Indel_pos} when indels are present), bypassing sequence-based -inference. +bypassing sequence-based inference. } \strong{Output VCF layout} diff --git a/tests/testthat/test-check_madc_sanity.R b/tests/testthat/test-check_madc_sanity.R index 5053e55..91e5493 100644 --- a/tests/testthat/test-check_madc_sanity.R +++ b/tests/testthat/test-check_madc_sanity.R @@ -1,10 +1,71 @@ test_that("check madc",{ - madc_file <- system.file("example_MADC_FixedAlleleID.csv", package="BIGr") - report <- read.csv(madc_file, check.names = FALSE) - check_madc_sanity(report) + github_path <- "https://raw.githubusercontent.com/Breeding-Insight/BIGapp-PanelHub/refs/heads/long_seq/test_madcs/" + names <- c("Columns", "FixAlleleIDs", "IUPACcodes", "LowerCase", "Indels", "ChromPos", "allNAcol", "allNArow") + # raw madc + report <- read.csv(paste0(github_path,"/alfalfa_madc_raw.csv")) + res <- check_madc_sanity(report) + exp <- c(TRUE, FALSE, FALSE, FALSE, FALSE, TRUE, TRUE, FALSE) + names(exp) <- names + expect_equal(res$checks, exp) + + # test lower case + report <- read.csv(paste0(github_path,"/alfalfa_lowercase.csv")) + + res <- check_madc_sanity(report) + exp <- c(TRUE, TRUE, FALSE, TRUE, FALSE, TRUE, FALSE, FALSE) + names(exp) <- names + expect_equal(res$checks, exp) + + # test IUPAC + report <- read.csv(paste0(github_path,"/alfalfa_IUPAC.csv")) + + res <- check_madc_sanity(report) + exp <- c(TRUE, TRUE, TRUE, FALSE, FALSE, TRUE, FALSE, FALSE) + names(exp) <- names + expect_equal(res$checks, exp) + + # clean alfalfa madc (fixed allele IDs, Chr_Pos CloneIDs, no issues) + report <- read.csv(paste0(github_path,"/alfalfa_madc.csv")) + + res <- check_madc_sanity(report) + exp <- c(TRUE, TRUE, FALSE, FALSE, FALSE, TRUE, FALSE, FALSE) + names(exp) <- names + expect_equal(res$checks, exp) + + # potato indel madc (ChromPos FALSE because IDs are not Chr_Pos) + report <- read.csv(paste0(github_path,"/potato_indel_madc.csv")) + + res <- check_madc_sanity(report) + exp <- c(TRUE, TRUE, FALSE, FALSE, TRUE, FALSE, FALSE, FALSE) + names(exp) <- names + expect_equal(res$checks, exp) + + # potato indel IUPAC (IUPAC codes + lowercase + indels) + report <- read.csv(paste0(github_path,"/potato_indel_IUPAC.csv")) + + res <- check_madc_sanity(report) + exp <- c(TRUE, TRUE, TRUE, TRUE, TRUE, FALSE, FALSE, FALSE) + names(exp) <- names + expect_equal(res$checks, exp) + + # potato indel lowercase (lowercase + indels) + report <- read.csv(paste0(github_path,"/potato_indel_lowercase.csv")) + + res <- check_madc_sanity(report) + exp <- c(TRUE, TRUE, FALSE, TRUE, TRUE, FALSE, FALSE, FALSE) + names(exp) <- names + expect_equal(res$checks, exp) + + # potato more indels madc ChromPosFALSE + report <- read.csv(paste0(github_path,"/potato_more_indels_madc_ChromPosFALSE.csv")) + + res <- check_madc_sanity(report) + exp <- c(TRUE, TRUE, FALSE, FALSE, TRUE, FALSE, FALSE, FALSE) + names(exp) <- names + expect_equal(res$checks, exp) }) diff --git a/tests/testthat/test-madc2vcf_targets.R b/tests/testthat/test-madc2vcf_targets.R index 524b687..5e21c7c 100644 --- a/tests/testthat/test-madc2vcf_targets.R +++ b/tests/testthat/test-madc2vcf_targets.R @@ -69,33 +69,10 @@ test_that("bottom strand markers have correct REF/ALT", { # Get results from both functions suppressWarnings( madc2vcf_targets(madc_file = madc_file, output.file = temp_targets, - get_REF_ALT = TRUE, botloci_file = bot_file) - ) - - suppressWarnings( - madc2vcf_all(madc = madc_file, botloci_file = bot_file, - hap_seq_file = NULL, out_vcf = temp_all, verbose = FALSE) + get_REF_ALT = TRUE, botloci_file = bot_file) ) vcf_targets <- read.vcfR(temp_targets, verbose = FALSE) - vcf_all <- read.vcfR(temp_all, verbose = FALSE) - - # Find common markers between both outputs - common_markers <- intersect(vcf_targets@fix[,"ID"], vcf_all@fix[,"ID"]) - - if(length(common_markers) > 0) { - # Compare REF/ALT for common markers - targets_subset <- vcf_targets@fix[vcf_targets@fix[,"ID"] %in% common_markers,] - all_subset <- vcf_all@fix[vcf_all@fix[,"ID"] %in% common_markers,] - - # Sort both by ID for comparison - targets_subset <- targets_subset[order(targets_subset[,"ID"]),] - all_subset <- all_subset[order(all_subset[,"ID"]),] - - # Check that REF/ALT match between the two functions - expect_equal(targets_subset[,"REF"], all_subset[,"REF"]) - expect_equal(targets_subset[,"ALT"], all_subset[,"ALT"]) - } # Validate that all REF/ALT are valid nucleotides expect_true(all(vcf_targets@fix[,"REF"] %in% c("A", "T", "G", "C", "."))) @@ -107,5 +84,566 @@ test_that("bottom strand markers have correct REF/ALT", { expect_true(all(vcf_targets@fix[valid_snps,"REF"] != vcf_targets@fix[valid_snps,"ALT"])) } - rm(vcf_targets, vcf_all, temp_targets, temp_all) + rm(vcf_targets, temp_targets) +}) + + +# ======================================================================= +# Using Breeding-Insight/BIGapp-PanelHub test files +# ======================================================================= + +test_that("simu alfalfa",{ + + github_path <- "https://raw.githubusercontent.com/Breeding-Insight/BIGapp-PanelHub/refs/heads/long_seq/" + + # External alfalfa test files + alfalfa_madc <- paste0(github_path, "test_madcs/alfalfa_madc.csv") + alfalfa_madc_wrongID <- paste0(github_path, "test_madcs/alfalfa_madc_wrongID.csv") + alfalfa_madc_raw <- paste0(github_path, "test_madcs/alfalfa_madc_raw.csv") # raw DArT format (7-row header) + alfalfa_iupac <- paste0(github_path, "test_madcs/alfalfa_IUPAC.csv") + alfalfa_lowercase <- paste0(github_path, "test_madcs/alfalfa_lowercase.csv") + alfalfa_botloci <- paste0(github_path, "alfalfa/20201030-BI-Alfalfa_SNPs_DArTag-probe-design_f180bp.botloci") # botloci for alfalfa + alfalfa_markers_info <- paste0(github_path, "alfalfa/20201030-BI-Alfalfa_SNPs_DArTag-probe-design_snpID_lut.csv") # markers_info: CloneID/BI_markerID, Chr, Pos, Ref, Alt + alfalfa_markers_info_ChromPos <- paste0(github_path, "test_madcs/alfalfa_marker_info_ChromPos.csv") # markers_info: CloneID/BI_markerID, Chr, Pos + + + # External potato test files + potato_indel_madc <- paste0(github_path, "test_madcs/potato_indel_madc.csv") + potato_indel_iupac <- paste0(github_path, "test_madcs/potato_indel_IUPAC.csv") + potato_indel_lowercase <- paste0(github_path, "test_madcs/potato_indel_lowercase.csv") + potato_more_indels_chrompos_false <- paste0(github_path, "test_madcs/potato_more_indels_madc_ChromPosFALSE.csv") + potato_botloci <- paste0(github_path, "potato/potato_dartag_v2_3915markers_rm7dupTags_6traitMarkers_f150bp_ref_alt.botloci") + potato_markers_info <- paste0(github_path, "potato/potato_dartag_v2_3915markers_rm7dupTags_6traitMarkers_rm1dup_snpID_lut.csv") # CloneID/BI_markerID, Chr, Pos, Ref, Alt + potato_markers_info_ChromPos <- paste0(github_path, "test_madcs/potato_marker_info_chrompos.csv") # markers_info: CloneID/BI_markerID, Chr, Pos + + skip_if_offline("raw.githubusercontent.com") + + test_that("ALFALFA — clean fixed allele ID MADC", { + out <- tempfile(fileext = ".vcf") + expect_no_error( + madc2vcf_targets(madc_file = alfalfa_madc, + output.file = out, + get_REF_ALT = FALSE, + verbose = FALSE) + ) + vcf <- read.vcfR(out, verbose = FALSE) + expect_s4_class(vcf, "vcfR") + expect_true(all(is.na(vcf@fix[, "REF"]))) + expect_true(all(is.na(vcf@fix[, "ALT"]))) + DP <- extract.gt(vcf, "DP", as.numeric = TRUE) + expect_equal(sum(DP[1,]), 4139) + expect_equal(sum(DP[,5]), 42869) + unlink(out) + + expect_no_error( + madc2vcf_targets(madc_file = alfalfa_madc, + output.file = out, + get_REF_ALT = FALSE, + collapse_matches_counts = TRUE, + verbose = FALSE) + ) + vcf <- read.vcfR(out, verbose = FALSE) + expect_s4_class(vcf, "vcfR") + expect_true(all(is.na(vcf@fix[, "REF"]))) + expect_true(all(is.na(vcf@fix[, "ALT"]))) + DP <- extract.gt(vcf, "DP", as.numeric = TRUE) + expect_equal(sum(DP[1,]), 4534) + expect_equal(sum(DP[,5]), 56547) + + # Test error when get_REF_ALT = TRUE but no markers_info or botloci provided to extract REF/ALT + expect_error( + madc2vcf_targets(madc_file = alfalfa_madc, + output.file = out, + get_REF_ALT = TRUE, + verbose = FALSE), + regexp = "get_REF_ALT = TRUE but no markers_info file with Ref and Alt columns was provided neither a botloci_file to extrat REF/ALT from probe sequences. Please provide one of the these files or set get_REF_ALT to FALSE." + ) + + # Test that it works when get_REF_ALT = TRUE and botloci is provided (REF/ALT recovered from probe sequences) + madc2vcf_targets(madc_file = alfalfa_madc, + output.file = out, + get_REF_ALT = TRUE, + botloci_file = alfalfa_botloci, + verbose = FALSE) + + vcf <- read.vcfR(out, verbose = FALSE) + lut <- read.csv(alfalfa_markers_info) + vcf_infos <- vcf@fix[,c(1:5)] + lut_infos <- lut[match(vcf@fix[,3],lut$BI_markerID),c(2:6)] + check <- cbind(vcf_infos,lut_infos) + expect_equal(check$REF, check$Ref) + expect_equal(check$ALT, check$Alt) + expect_equal(as.numeric(check$POS), check$Pos) + dp <- extract.gt(vcf, "DP", as.numeric = TRUE) + expect_equal(sum(dp[,10]), 43691) + + # Test that it also works when markers_info is provided together with botloci (should give same result as above but just to confirm no interference between the two) + madc2vcf_targets(madc_file = alfalfa_madc, + output.file = out, + get_REF_ALT = TRUE, + botloci_file = alfalfa_botloci, + markers_info = alfalfa_markers_info, + verbose = FALSE) + + vcf <- read.vcfR(out, verbose = FALSE) + lut <- read.csv(alfalfa_markers_info) + vcf_infos <- vcf@fix[,c(1:5)] + lut_infos <- lut[match(vcf@fix[,3],lut$BI_markerID),c(2:6)] + check <- cbind(vcf_infos,lut_infos) + expect_equal(check$REF, check$Ref) + expect_equal(check$ALT, check$Alt) + expect_equal(as.numeric(check$POS), check$Pos) + dp <- extract.gt(vcf, "DP", as.numeric = TRUE) + expect_equal(sum(dp[,10]), 43691) + + }) + + test_that("ALFALFA — clean fixed allele ID MADC wrong CloneID", { + out <- tempfile(fileext = ".vcf") + # Test error when botloci provided but no matching CloneID between botloci and MADC (even after trying to fix potential padding mismatch with ChromPos info in markers_info) + expect_error( + madc2vcf_targets(madc_file = alfalfa_madc_wrongID, + output.file = out, + get_REF_ALT = TRUE, + botloci_file = alfalfa_botloci, + verbose = FALSE), + regexp = "Check marker IDs in both MADC and botloci files. They should be the same." + ) + + # Test error when no matching CloneID between markers_info and MADC to fix the botloci mismatch issue (even if botloci file is not used, the function should still check that the provided markers_info can match with MADC CloneIDs to be able to use the ChromPos info to fix potential padding mismatch) + expect_error( + madc2vcf_targets(madc_file = alfalfa_madc_wrongID, + output.file = out, + get_REF_ALT = TRUE, + botloci_file = alfalfa_botloci, + markers_info = alfalfa_markers_info, + verbose = FALSE), + "None of the MADC CloneID could be found in the markers_info CloneID or BI_markerID. Please make sure they match." + ) + + # Test that it works when the function can find a matching ID in markers_info to fix the botloci mismatch issue + # (even if botloci file is not used, the function should still be able to use the ChromPos info in markers_info to + # fix potential padding mismatch and find matching IDs between MADC and botloci) + madc2vcf_targets(madc_file = alfalfa_madc_wrongID, + output.file = out, + get_REF_ALT = TRUE, + botloci_file = alfalfa_botloci, + markers_info = alfalfa_markers_info_ChromPos, + verbose = FALSE) + + vcf <- read.vcfR(out, verbose = FALSE) + lut <- read.csv(alfalfa_markers_info) + vcf_infos <- vcf@fix[,c(1:5)] + lut_infos <- lut[match(vcf@fix[,3],lut$BI_markerID),c(2:6)] + check <- cbind(vcf_infos,lut_infos) + expect_equal(check$REF, check$Ref) + expect_equal(check$ALT, check$Alt) + expect_equal(as.numeric(check$POS), check$Pos) + dp <- extract.gt(vcf, "DP", as.numeric = TRUE) + expect_equal(sum(dp[,10]), 43691) + }) + + test_that("alfalfa lower case fixed MADC", { + out <- tempfile(fileext = ".vcf") + madc2vcf_targets(madc_file = alfalfa_lowercase, + output.file = out, + get_REF_ALT = TRUE, + botloci_file = alfalfa_botloci, + verbose = FALSE) + + vcf <- read.vcfR(out, verbose = FALSE) + lut <- read.csv(alfalfa_markers_info) + vcf_infos <- vcf@fix[,c(1:5)] + lut_infos <- lut[match(vcf@fix[,3],lut$BI_markerID),c(2:6)] + check <- cbind(vcf_infos,lut_infos) + expect_equal(check$REF, check$Ref) + expect_equal(check$ALT, check$Alt) + expect_equal(as.numeric(check$POS), check$Pos) + dp <- extract.gt(vcf, "DP", as.numeric = TRUE) + expect_equal(sum(dp[,10]), 43691) + + madc2vcf_targets(madc_file = alfalfa_lowercase, + output.file = out, + get_REF_ALT = TRUE, + botloci_file = alfalfa_botloci, + markers_info = alfalfa_markers_info, + verbose = FALSE) + + vcf <- read.vcfR(out, verbose = FALSE) + lut <- read.csv(alfalfa_markers_info) + vcf_infos <- vcf@fix[,c(1:5)] + lut_infos <- lut[match(vcf@fix[,3],lut$BI_markerID),c(2:6)] + check <- cbind(vcf_infos,lut_infos) + expect_equal(check$REF, check$Ref) + expect_equal(check$ALT, check$Alt) + expect_equal(as.numeric(check$POS), check$Pos) + dp <- extract.gt(vcf, "DP", as.numeric = TRUE) + expect_equal(sum(dp[,10]), 43691) + + madc2vcf_targets(madc_file = alfalfa_lowercase, + output.file = out, + get_REF_ALT = FALSE, + botloci_file = alfalfa_botloci, + markers_info = alfalfa_markers_info, + verbose = FALSE) + + vcf <- read.vcfR(out, verbose = FALSE) + lut <- read.csv(alfalfa_markers_info) + vcf_infos <- vcf@fix[,c(1:5)] + lut_infos <- lut[match(vcf@fix[,3],lut$BI_markerID),c(2:6)] + check <- cbind(vcf_infos,lut_infos) + expect_equal(as.numeric(check$POS), check$Pos) + dp <- extract.gt(vcf, "DP", as.numeric = TRUE) + expect_equal(sum(dp[,10]), 43691) + }) + + test_that("alfalfa IUPAC code", { + out <- tempfile(fileext = ".vcf") + expect_error( + madc2vcf_targets(madc_file = alfalfa_iupac, + output.file = out, + get_REF_ALT = TRUE, + botloci_file = alfalfa_botloci, + verbose = FALSE) + ) + + madc2vcf_targets(madc_file = alfalfa_iupac, + output.file = out, + get_REF_ALT = TRUE, + markers_info = alfalfa_markers_info, + verbose = FALSE) + + vcf <- read.vcfR(out, verbose = FALSE) + lut <- read.csv(alfalfa_markers_info) + vcf_infos <- vcf@fix[,c(1:5)] + lut_infos <- lut[match(vcf@fix[,3],lut$BI_markerID),c(2:6)] + check <- cbind(vcf_infos,lut_infos) + expect_equal(as.numeric(check$POS), check$Pos) + dp <- extract.gt(vcf, "DP", as.numeric = TRUE) + expect_equal(sum(dp[,10]), 43691) + expect_equal(check$REF, check$Ref) + expect_equal(check$ALT, check$Alt) + + madc2vcf_targets(madc_file = alfalfa_iupac, + output.file = out, + get_REF_ALT = TRUE, + markers_info = alfalfa_markers_info, + collapse_matches_counts = TRUE, + verbose = FALSE) + + vcf <- read.vcfR(out, verbose = FALSE) + lut <- read.csv(alfalfa_markers_info) + vcf_infos <- vcf@fix[,c(1:5)] + lut_infos <- lut[match(vcf@fix[,3],lut$BI_markerID),c(2:6)] + check <- cbind(vcf_infos,lut_infos) + expect_equal(as.numeric(check$POS), check$Pos) + dp <- extract.gt(vcf, "DP", as.numeric = TRUE) + expect_equal(sum(dp[1,]), 4534) + expect_equal(sum(dp[,5]), 56547) + expect_equal(check$REF, check$Ref) + expect_equal(check$ALT, check$Alt) + + madc2vcf_targets(madc_file = alfalfa_iupac, + output.file = out, + get_REF_ALT = FALSE, + botloci_file = alfalfa_botloci, + verbose = FALSE) + + vcf <- read.vcfR(out, verbose = FALSE) + lut <- read.csv(alfalfa_markers_info) + vcf_infos <- vcf@fix[,c(1:5)] + lut_infos <- lut[match(vcf@fix[,3],lut$BI_markerID),c(2:6)] + check <- cbind(vcf_infos,lut_infos) + expect_equal(as.numeric(check$POS), check$Pos) + dp <- extract.gt(vcf, "DP", as.numeric = TRUE) + expect_equal(sum(dp[,10]), 43691) + + }) + + test_that("potato indel madc chrompos=FALSE", { + out <- tempfile(fileext = ".vcf") + expect_error( + madc2vcf_targets(madc_file = potato_indel_madc, + output.file = out, + get_REF_ALT = TRUE, + botloci_file = potato_botloci, + verbose = FALSE) + ) + + madc2vcf_targets(madc_file = potato_indel_madc, + output.file = out, + get_REF_ALT = TRUE, + markers_info = potato_markers_info, + verbose = FALSE) + + vcf <- read.vcfR(out, verbose = FALSE) + lut <- read.csv(potato_markers_info) + vcf_infos <- vcf@fix[,c(1:5)] + lut_infos <- lut[match(vcf@fix[,3],lut$BI_markerID),c(2:6)] + check <- cbind(vcf_infos,lut_infos) + expect_equal(as.numeric(check$POS), check$Pos) + dp <- extract.gt(vcf, "DP", as.numeric = TRUE) + expect_equal(sum(dp[,10]), 41656) + expect_equal(check$REF, check$Ref) + expect_equal(check$ALT, check$Alt) + + madc2vcf_targets(madc_file = potato_indel_madc, + output.file = out, + get_REF_ALT = TRUE, + markers_info = potato_markers_info, + collapse_matches_counts = TRUE, + verbose = FALSE) + + vcf <- read.vcfR(out, verbose = FALSE) + lut <- read.csv(potato_markers_info) + vcf_infos <- vcf@fix[,c(1:5)] + lut_infos <- lut[match(vcf@fix[,3],lut$BI_markerID),c(2:6)] + check <- cbind(vcf_infos,lut_infos) + expect_equal(as.numeric(check$POS), check$Pos) + dp <- extract.gt(vcf, "DP", as.numeric = TRUE) + expect_equal(sum(dp[1,]), 5163) + expect_equal(sum(dp[,5]), 58927) + expect_equal(check$REF, check$Ref) + expect_equal(check$ALT, check$Alt) + + expect_error( + madc2vcf_targets(madc_file = potato_indel_madc, + output.file = out, + get_REF_ALT = FALSE, + botloci_file = potato_botloci, + verbose = FALSE) + ) + + madc2vcf_targets(madc_file = potato_indel_madc, + output.file = out, + get_REF_ALT = FALSE, + markers_info = potato_markers_info, + verbose = FALSE) + + vcf <- read.vcfR(out, verbose = FALSE) + lut <- read.csv(potato_markers_info) + vcf_infos <- vcf@fix[,c(1:5)] + lut_infos <- lut[match(vcf@fix[,3],lut$BI_markerID),c(2:6)] + check <- cbind(vcf_infos,lut_infos) + expect_equal(as.numeric(check$POS), check$Pos) + dp <- extract.gt(vcf, "DP", as.numeric = TRUE) + expect_equal(sum(dp[,10]), 41656) + }) + + test_that("potato indel chromposFALSE", { + out <- tempfile(fileext = ".vcf") + expect_error( + madc2vcf_targets(madc_file = potato_more_indels_chrompos_false, + output.file = out, + get_REF_ALT = TRUE, + botloci_file = potato_botloci, + verbose = FALSE) + ) + + madc2vcf_targets(madc_file = potato_more_indels_chrompos_false, + output.file = out, + get_REF_ALT = TRUE, + markers_info = potato_markers_info, + verbose = FALSE) + + vcf <- read.vcfR(out, verbose = FALSE) + lut <- read.csv(potato_markers_info) + vcf_infos <- vcf@fix[,c(1:5)] + lut_infos <- lut[match(vcf@fix[,3],lut$BI_markerID),c(2:6)] + check <- cbind(vcf_infos,lut_infos) + expect_equal(as.numeric(check$POS), check$Pos) + dp <- extract.gt(vcf, "DP", as.numeric = TRUE) + expect_equal(sum(dp[,10]), 41755) + expect_equal(check$REF, check$Ref) + expect_equal(check$ALT, check$Alt) + + madc2vcf_targets(madc_file = potato_more_indels_chrompos_false, + output.file = out, + get_REF_ALT = TRUE, + markers_info = potato_markers_info, + collapse_matches_counts = TRUE, + verbose = FALSE) + + vcf <- read.vcfR(out, verbose = FALSE) + lut <- read.csv(potato_markers_info) + vcf_infos <- vcf@fix[,c(1:5)] + lut_infos <- lut[match(vcf@fix[,3],lut$BI_markerID),c(2:6)] + check <- cbind(vcf_infos,lut_infos) + expect_equal(as.numeric(check$POS), check$Pos) + dp <- extract.gt(vcf, "DP", as.numeric = TRUE) + expect_equal(sum(dp[1,]), 6301) + expect_equal(sum(dp[,5]), 53613) + expect_equal(check$REF, check$Ref) + expect_equal(check$ALT, check$Alt) + + expect_error( + madc2vcf_targets(madc_file = potato_more_indels_chrompos_false, + output.file = out, + get_REF_ALT = FALSE, + botloci_file = potato_botloci, + verbose = FALSE) + ) + + madc2vcf_targets(madc_file = potato_more_indels_chrompos_false, + output.file = out, + get_REF_ALT = FALSE, + markers_info = potato_markers_info, + verbose = FALSE) + + vcf <- read.vcfR(out, verbose = FALSE) + lut <- read.csv(potato_markers_info) + vcf_infos <- vcf@fix[,c(1:5)] + lut_infos <- lut[match(vcf@fix[,3],lut$BI_markerID),c(2:6)] + check <- cbind(vcf_infos,lut_infos) + expect_equal(as.numeric(check$POS), check$Pos) + dp <- extract.gt(vcf, "DP", as.numeric = TRUE) + expect_equal(sum(dp[,10]), 41755) + }) + + test_that("potato lowercase", { + out <- tempfile(fileext = ".vcf") + madc2vcf_targets(madc_file = potato_indel_lowercase, + output.file = out, + get_REF_ALT = TRUE, + markers_info = potato_markers_info, + verbose = FALSE) + + vcf <- read.vcfR(out, verbose = FALSE) + lut <- read.csv(potato_markers_info) + vcf_infos <- vcf@fix[,c(1:5)] + lut_infos <- lut[match(vcf@fix[,3],lut$BI_markerID),c(2:6)] + check <- cbind(vcf_infos,lut_infos) + expect_equal(as.numeric(check$POS), check$Pos) + dp <- extract.gt(vcf, "DP", as.numeric = TRUE) + expect_equal(sum(dp[,10]), 41755) + expect_equal(check$REF, check$Ref) + expect_equal(check$ALT, check$Alt) + + expect_error( + madc2vcf_targets(madc_file = potato_indel_lowercase, + output.file = out, + get_REF_ALT = TRUE, + markers_info = potato_markers_info_ChromPos, + botloci_file = potato_botloci, + verbose = FALSE) + ) + + madc2vcf_targets(madc_file = potato_indel_lowercase, + output.file = out, + get_REF_ALT = TRUE, + markers_info = potato_markers_info, + botloci_file = potato_botloci, + verbose = FALSE) + + vcf <- read.vcfR(out, verbose = FALSE) + lut <- read.csv(potato_markers_info) + vcf_infos <- vcf@fix[,c(1:5)] + lut_infos <- lut[match(vcf@fix[,3],lut$BI_markerID),c(2:6)] + check <- cbind(vcf_infos,lut_infos) + expect_equal(as.numeric(check$POS), check$Pos) + dp <- extract.gt(vcf, "DP", as.numeric = TRUE) + expect_equal(sum(dp[,10]), 41755) + expect_equal(check$REF, check$Ref) + expect_equal(check$ALT, check$Alt) + + madc2vcf_targets(madc_file = potato_indel_lowercase, + output.file = out, + get_REF_ALT = TRUE, + markers_info = potato_markers_info, + collapse_matches_counts = TRUE, + verbose = FALSE) + + vcf <- read.vcfR(out, verbose = FALSE) + lut <- read.csv(potato_markers_info) + vcf_infos <- vcf@fix[,c(1:5)] + lut_infos <- lut[match(vcf@fix[,3],lut$BI_markerID),c(2:6)] + check <- cbind(vcf_infos,lut_infos) + expect_equal(as.numeric(check$POS), check$Pos) + dp <- extract.gt(vcf, "DP", as.numeric = TRUE) + expect_equal(sum(dp[1,]), 6301) + expect_equal(sum(dp[,5]), 53613) + expect_equal(check$REF, check$Ref) + expect_equal(check$ALT, check$Alt) + + madc2vcf_targets(madc_file = potato_indel_lowercase, + output.file = out, + get_REF_ALT = FALSE, + markers_info = potato_markers_info, + verbose = FALSE) + + vcf <- read.vcfR(out, verbose = FALSE) + lut <- read.csv(potato_markers_info) + vcf_infos <- vcf@fix[,c(1:5)] + lut_infos <- lut[match(vcf@fix[,3],lut$BI_markerID),c(2:6)] + check <- cbind(vcf_infos,lut_infos) + expect_equal(as.numeric(check$POS), check$Pos) + dp <- extract.gt(vcf, "DP", as.numeric = TRUE) + expect_equal(sum(dp[,10]), 41755) + }) + + + test_that("potato IUPAC", { + out <- tempfile(fileext = ".vcf") + madc2vcf_targets(madc_file = potato_indel_iupac, + output.file = out, + get_REF_ALT = TRUE, + markers_info = potato_markers_info, + verbose = FALSE) + + vcf <- read.vcfR(out, verbose = FALSE) + lut <- read.csv(potato_markers_info) + vcf_infos <- vcf@fix[,c(1:5)] + lut_infos <- lut[match(vcf@fix[,3],lut$BI_markerID),c(2:6)] + check <- cbind(vcf_infos,lut_infos) + expect_equal(as.numeric(check$POS), check$Pos) + dp <- extract.gt(vcf, "DP", as.numeric = TRUE) + expect_equal(sum(dp[,10]), 41755) + expect_equal(check$REF, check$Ref) + expect_equal(check$ALT, check$Alt) + + madc2vcf_targets(madc_file = potato_indel_iupac, + output.file = out, + get_REF_ALT = TRUE, + markers_info = potato_markers_info, + collapse_matches_counts = TRUE, + verbose = FALSE) + + vcf <- read.vcfR(out, verbose = FALSE) + lut <- read.csv(potato_markers_info) + vcf_infos <- vcf@fix[,c(1:5)] + lut_infos <- lut[match(vcf@fix[,3],lut$BI_markerID),c(2:6)] + check <- cbind(vcf_infos,lut_infos) + expect_equal(as.numeric(check$POS), check$Pos) + dp <- extract.gt(vcf, "DP", as.numeric = TRUE) + expect_equal(sum(dp[1,]), 6301) + expect_equal(sum(dp[,5]), 53613) + expect_equal(check$REF, check$Ref) + expect_equal(check$ALT, check$Alt) + + madc2vcf_targets(madc_file = potato_indel_iupac, + output.file = out, + get_REF_ALT = FALSE, + markers_info = potato_markers_info, + verbose = FALSE) + + vcf <- read.vcfR(out, verbose = FALSE) + lut <- read.csv(potato_markers_info) + vcf_infos <- vcf@fix[,c(1:5)] + lut_infos <- lut[match(vcf@fix[,3],lut$BI_markerID),c(2:6)] + check <- cbind(vcf_infos,lut_infos) + expect_equal(as.numeric(check$POS), check$Pos) + dp <- extract.gt(vcf, "DP", as.numeric = TRUE) + expect_equal(sum(dp[,10]), 41755) + }) + + test_that("alfalfa raw MADC format (7-row header)", { + out <- tempfile(fileext = ".vcf") + # get_REF_ALT = FALSE: same counts as alfalfa_madc + expect_error( + madc2vcf_targets(madc_file = alfalfa_madc_raw, + output.file = out, + get_REF_ALT = FALSE, + verbose = FALSE) + ) + }) }) From 5d54f0de55a89258da46b220ae740d2e346b5e44 Mon Sep 17 00:00:00 2001 From: Cristianetaniguti Date: Fri, 27 Mar 2026 02:26:48 -0400 Subject: [PATCH 17/80] targets ok --- R/check_madc_sanity.R | 122 +++++++++++++++++------ R/madc2vcf_all.R | 37 ++++++- R/madc2vcf_targets.R | 125 +++++++++++++++++++----- man/check_madc_sanity.Rd | 58 +++++++---- tests/testthat/test-check_madc_sanity.R | 18 ++-- tests/testthat/test-madc2vcf_targets.R | 75 +++++++++++++- 6 files changed, 348 insertions(+), 87 deletions(-) diff --git a/R/check_madc_sanity.R b/R/check_madc_sanity.R index 301cd74..759fce3 100644 --- a/R/check_madc_sanity.R +++ b/R/check_madc_sanity.R @@ -1,7 +1,7 @@ #' Run basic sanity checks on a MADC-style allele report #' #' @description -#' Performs eight quick validations on an allele report: +#' Performs nine quick validations on an allele report: #' 1) **Columns** - required columns are present (`CloneID`, `AlleleID`, `AlleleSequence`); #' 2) **FixAlleleIDs** - first column's first up-to-6 rows are not all blank or `"*"` #' *and* both `_0001` and `_0002` appear in `AlleleID`; @@ -11,8 +11,9 @@ #' or a `"-"` character is present in `AlleleSequence`; #' 6) **ChromPos** - all `CloneID` values follow the `Chr_Position` format #' (prefix matches `"chr"` case-insensitively, suffix is a positive integer); -#' 7) **allNAcol** - at least one column contains only `NA` values; -#' 8) **allNArow** - at least one row contains only `NA` values. +#' 7) **allNAcol** - at least one column contains only `NA` or empty values; +#' 8) **allNArow** - at least one row contains only `NA` or empty values; +#' 9) **RefAltSeqs** - every `CloneID` has at least one `Ref` and one `Alt` allele row. #' #' @param report A `data.frame` with at least the columns #' `CloneID`, `AlleleID`, and `AlleleSequence`. The first column is also @@ -21,44 +22,64 @@ #' treated as header filler and skipped before further checks are run. #' #' @details -#' - **FixAlleleIDs:** When the check fails (raw DArT format), row 7 is used as -#' the column header and the first 7 rows are dropped before subsequent checks. +#' - **FixAlleleIDs:** When the first six rows of the first column are all blank +#' or `"*"` (raw DArT format), row 7 is promoted to column headers and the +#' first 7 rows are dropped before subsequent checks are run. The check is +#' `TRUE` when the file has already been processed by HapApp (fixed IDs with +#' `_0001`/`_0002` suffixes). #' - **IUPAC check:** Flags any character outside `A`, `T`, `C`, `G` and `"-"` #' (case-insensitive), which includes ambiguity codes (`N`, `R`, `Y`, etc.). #' - **Indels:** Rows are split by `AlleleID` containing `"Ref_0001"` vs -#' `"Alt_0002"`, merged by `CloneID`, and the lengths of `AlleleSequence` -#' are compared. A `"-"` anywhere in `AlleleSequence` is also treated as -#' evidence of an indel. +#' `"Alt_0002"`, merged by `CloneID`, and flagged as indels if either (a) the +#' lengths of `AlleleSequence` differ, (b) the sequences have the same length +#' but more than one character differs between them (complex indel / local +#' rearrangement), or (c) a `"-"` character is present anywhere in +#' `AlleleSequence`. #' - **ChromPos:** Each `CloneID` is split on `"_"` into exactly two parts; the #' first part must match `"Chr"` (case-insensitive) and the second must be a #' positive integer. Returns `FALSE` when any `CloneID` is `NA`. #' - **allNAcol / allNArow:** Detected via `apply()` over columns/rows -#' respectively; useful for flagging empty or corrupt entries. +#' respectively; a cell is treated as missing when it is `NA` or an empty +#' string (`""`). Useful for flagging empty or corrupt entries. +#' - **RefAltSeqs:** For each unique `CloneID`, checks whether at least one row +#' with a `Ref` (`|Ref_` when `FixAlleleIDs = TRUE`, `|Ref$` otherwise) and +#' one row with an `Alt` (`|Alt_` / `|Alt$`) allele exist. `CloneID`s that +#' lack a `Ref` row are stored in `missRef`; those lacking an `Alt` row in +#' `missAlt`. The check is `TRUE` when both sets are empty. #' - If required columns are missing (`Columns = FALSE`), only `Columns` and #' `FixAlleleIDs` are evaluated; all other checks remain `NA` and -#' `indel_clone_ids` is returned as `NULL`. +#' `indel_clone_ids`, `missRef`, and `missAlt` are returned as `NULL`. #' -#' @return A named list with three elements: +#' @return A named list with five elements: #' \describe{ -#' \item{checks}{Named logical vector with eight entries: +#' \item{checks}{Named logical vector with nine entries: #' `Columns`, `FixAlleleIDs`, `IUPACcodes`, `LowerCase`, `Indels`, -#' `ChromPos`, `allNAcol`, `allNArow`. -#' `TRUE` means the condition was detected (or passed for `Columns` and -#' `FixAlleleIDs`); `NA` means the check was skipped.} +#' `ChromPos`, `allNAcol`, `allNArow`, `RefAltSeqs`. +#' `TRUE` means the condition was detected (or passed for `Columns`, +#' `FixAlleleIDs`, `ChromPos`, and `RefAltSeqs`); `NA` means the check +#' was skipped.} #' \item{messages}{Named list of length-2 character vectors, one per check. #' Element `[[1]]` is the message when the check is `TRUE`, element `[[2]]` #' when it is `FALSE`. Indexed by the same names as `checks`.} #' \item{indel_clone_ids}{Character vector of `CloneID`s where ref/alt #' lengths differ. Returns `character(0)` if none are found, or `NULL` #' when required columns are missing.} +#' \item{missRef}{Character vector of `CloneID`s that have no `Ref` allele +#' row. Returns `character(0)` if all `CloneID`s have a `Ref` row, or +#' `NULL` when required columns are missing.} +#' \item{missAlt}{Character vector of `CloneID`s that have no `Alt` allele +#' row. Returns `character(0)` if all `CloneID`s have an `Alt` row, or +#' `NULL` when required columns are missing.} #' } #' #' @export check_madc_sanity <- function(report) { # Initialize - checks <- c(Columns = NA, FixAlleleIDs = NA, IUPACcodes = NA, LowerCase = NA, Indels = NA, ChromPos = NA, allNAcol = NA, allNArow = NA) - messages <- list(Columns = NA, FixAlleleIDs = NA, IUPACcodes = NA, LowerCase = NA, Indels = NA, ChromPos = NA, allNAcol = NA, allNArow = NA) + checks <- c(Columns = NA, FixAlleleIDs = NA, IUPACcodes = NA, LowerCase = NA, Indels = NA, + ChromPos = NA, allNAcol = NA, allNArow = NA, RefAltSeqs = NA, OtherAlleles = NA) + messages <- list(Columns = NA, FixAlleleIDs = NA, IUPACcodes = NA, LowerCase = NA, Indels = NA, + ChromPos = NA, allNAcol = NA, allNArow = NA, RefAltSeqs = NA, OtherAlleles = NA) # ---- FixAlleleIDs ---- # Check if first up-to-6 entries in the *first column* are all "" or "*" @@ -67,14 +88,13 @@ check_madc_sanity <- function(report) { first_col_vals <- report[[1]][idx] all_blank_or_star <- all(first_col_vals %in% c("", "*"), na.rm = TRUE) # Also require that both _0001 and _0002 appear in AlleleID - has_0001 <- any(grepl("_0001", report$AlleleID, fixed = TRUE), na.rm = TRUE) - has_0002 <- any(grepl("_0002", report$AlleleID, fixed = TRUE), na.rm = TRUE) - checks["FixAlleleIDs"] <- (!all_blank_or_star) & has_0001 & has_0002 - - if(!checks["FixAlleleIDs"]){ + if(all_blank_or_star) { colnames(report) <- report[7,] report <- report[-c(1:7),] } + has_0001 <- any(grepl("_0001", report$AlleleID, fixed = TRUE), na.rm = TRUE) + has_0002 <- any(grepl("_0002", report$AlleleID, fixed = TRUE), na.rm = TRUE) + checks["FixAlleleIDs"] <- (!all_blank_or_star) & has_0001 & has_0002 # Validate required columns required <- c("CloneID", "AlleleID", "AlleleSequence") @@ -102,47 +122,89 @@ check_madc_sanity <- function(report) { ref_len <- nchar(merged$AlleleSequence_ref, keepNA = TRUE) alt_len <- nchar(merged$AlleleSequence_alt, keepNA = TRUE) cmp_ok <- !is.na(ref_len) & !is.na(alt_len) + + # Classic indel: different lengths indel_mask <- cmp_ok & (ref_len != alt_len) + + # Complex indel: same length but >1 character difference between sequences + same_len <- cmp_ok & (ref_len == alt_len) + if (any(same_len)) { + n_diffs <- mapply(function(r, a) { + r_chars <- strsplit(r, "")[[1]] + a_chars <- strsplit(a, "")[[1]] + sum(r_chars != a_chars) + }, merged$AlleleSequence_ref[same_len], merged$AlleleSequence_alt[same_len]) + indel_mask[same_len] <- n_diffs > 1 + } + checks["Indels"] <- any(indel_mask) | any(grepl("-", report$AlleleSequence)) indels <- if (any(indel_mask)) merged$CloneID[indel_mask] else character(0) + } else { checks["Indels"] <- FALSE indels <- character(0) } # --- All NA ---- - checks["allNArow"] <- any(apply(report, 1, function(x) all(is.na(x)))) - checks["allNAcol"] <- any(apply(report, 2, function(x) all(is.na(x)))) + checks["allNArow"] <- any(apply(report, 1, function(x) all(is.na(x) | x == ""))) + checks["allNAcol"] <- any(apply(report, 2, function(x) all(is.na(x)) | x == "")) # ---- Chrom Pos ---- if(!any(is.na(report$CloneID))) { pos <- strsplit(report$CloneID, "_") format <- all(sapply(pos, length) == 2) - first <- all(grepl("Chr", sapply(pos, "[", 1), ignore.case = TRUE)) + first <- all(grepl("^[A-Za-z]", sapply(pos, "[", 1))) second <- suppressWarnings(all(sapply(pos, function(x) as.numeric(x[2])) > 0)) checks["ChromPos"] <- all(format, first, second) } else checks["ChromPos"] <- FALSE - } else indels <- NULL + # ---- RefAltSeqs ---- + all_clones <- unique(report$CloneID) + if (isTRUE(checks["FixAlleleIDs"])) { + has_ref <- unique(report$CloneID[grepl("\\|Ref_", report$AlleleID)]) + has_alt <- unique(report$CloneID[grepl("\\|Alt_", report$AlleleID)]) + } else { + has_ref <- unique(report$CloneID[grepl("\\|Ref$", report$AlleleID)]) + has_alt <- unique(report$CloneID[grepl("\\|Alt$", report$AlleleID)]) + } + missRef <- setdiff(all_clones, has_ref) + missAlt <- setdiff(all_clones, has_alt) + checks["RefAltSeqs"] <- length(missRef) == 0 & length(missAlt) == 0 + + # ---- OtherAlleles ---- + checks["OtherAlleles"] <- any(grepl("[|]Other", report$AlleleID)) + + } else { + indels <- NULL + missRef <- NULL + missAlt <- NULL + } messages[["Columns"]] <- c("Required columns are present", "One or more required columns missing. Verify if your file has columns: CloneID, AlleleID, AlleleSequence") messages[["FixAlleleIDs"]] <- c("Fixed Allele IDs look good", - "MADC not processed by HapApp. Please, run the MADC through HapApp to fix Allele IDs before using it in BIGr/BIGapp.") + "MADC not processed by HapApp.") messages[["IUPACcodes"]] <- c("IUPAC (non-ATCG) codes found in AlleleSequence. This codes are not currently supported", "No IUPAC (non-ATCG) codes found in AlleleSequence") messages[["LowerCase"]] <- c("Lowercase bases found in AlleleSequence", "No lowercase bases found in AlleleSequence") - messages[["Indels"]] <- c(paste("Indels found (ref/alt lengths differ) for the CloneIDs:",paste(indels, collapse = " ")), - "No indels found (ref/alt lengths match) for all CloneIDs") + messages[["Indels"]] <- c(paste("Indels found (ref/alt lengths differ or >1 mismatch between same-length sequences) for the CloneIDs:",paste(indels, collapse = " ")), + "No indels found (ref/alt lengths match and at most 1 mismatch) for all CloneIDs") messages[["ChromPos"]] <- c("Chromosome and Position format in CloneID look good", "CloneID does not have the expected Chromosome_Position format. Please check your CloneIDs or provide a file with this information") messages[["allNArow"]] <- c("One or more rows contain all NA values.", "No rows with all NA values") messages[["allNAcol"]] <- c("One or more columns contain all NA values.", "No columns with all NA values") + messages[["RefAltSeqs"]] <- c("All CloneIDs have both Ref and Alt alleles", + paste0("Some CloneIDs are missing Ref and/or Alt alleles. ", + "Missing Ref: ", paste(missRef, collapse = " "), ". ", + "Missing Alt: ", paste(missAlt, collapse = " "), ".")) + messages[["OtherAlleles"]] <- c("Alleles other than Ref and Alt were found in AlleleID.", + "No alleles other than Ref and Alt found in AlleleID.") - list(checks = checks, messages = messages, indel_clone_ids = indels) + list(checks = checks, messages = messages, indel_clone_ids = indels, + missRef = missRef, missAlt = missAlt) } #' Check and Adjust Botloci and MADC Marker Compatibility diff --git a/R/madc2vcf_all.R b/R/madc2vcf_all.R index 13227d3..22edcc6 100644 --- a/R/madc2vcf_all.R +++ b/R/madc2vcf_all.R @@ -98,7 +98,7 @@ madc2vcf_all <- function(madc = NULL, "out_vcf= ", out_vcf, ', ', "verbose= ", verbose,')">') - report <- read.csv(madc, check.names = FALSE) + report <- read.csv(madc, check.names = FALSE) checks <- check_madc_sanity(report) messages_results <- mapply(function(check, message) { @@ -111,9 +111,38 @@ madc2vcf_all <- function(madc = NULL, paste(messages_results[c("Columns", "FixAlleleIDs")[idx]], collapse = "\n"))) } - if(any(checks$checks[c("IUPACcodes", "LowerCase", "Indels")])){ - idx <- which((checks$checks[c("IUPACcodes", "LowerCase", "Indels")])) - if(is.null(markers_info)) stop(paste(messages_results[c("IUPACcodes", "LowerCase", "Indels")[idx]], collapse = "\n")) + if(any(checks$checks[c("IUPACcodes")])){ + idx <- which((checks$checks[c("IUPACcodes")])) + stop(paste(messages_results[c("IUPACcodes")[idx]], collapse = "\n")) + } + + if(any(!checks$checks[c("ChromPos")])){ + if(is.null(markers_info)) { + stop(paste(messages_results[c("ChromPos")], collapse = "\n")) + } else { + mi_df <- read.csv(markers_info) + if(!all(c("Chr", "Pos") %in% colnames(mi_df))) + stop("ChromPos check failed: CloneID values do not follow the Chr_Position format. ", + "The markers_info file must contain 'Chr' and 'Pos' columns to supply CHROM and POS.") + } + } + + if(any(checks$checks[c("Indels")])){ + idx <- which((checks$checks[c("Indels")])) + if(is.null(markers_info)) { + stop(paste(messages_results[c("Indels")[idx]], collapse = "\n")) + } else { + mi_df <- read.csv(markers_info) + if(checks$checks["Indels"] && + !all(c("Ref", "Alt", "Indel_pos") %in% colnames(mi_df))) + stop("Indels detected in MADC file. ", + "The markers_info file must contain 'Ref', 'Alt', and 'Indel_pos' columns.") + } + } + + if(checks$checks["LowerCase"]){ + vmsg("MADC Allele Sequences presented lower case characters. They were converted to upper case.", verbose = verbose, level = 1) + report$AlleleSequence <- toupper(report$AlleleSequence) } if(!is.null(botloci_file)) botloci <- read.csv(botloci_file, header = F) else stop("Please provide a botloci file") diff --git a/R/madc2vcf_targets.R b/R/madc2vcf_targets.R index c1c4c71..1416044 100644 --- a/R/madc2vcf_targets.R +++ b/R/madc2vcf_targets.R @@ -9,14 +9,17 @@ #' #' @details #' **What this function does** -#' - Runs basic sanity checks on the MADC file (column presence, fixed allele IDs, -#' IUPAC/ambiguous bases, lowercase bases, indels). +#' - Runs basic sanity checks on the MADC file via `check_madc_sanity()` (column +#' presence, fixed allele IDs, IUPAC/ambiguous bases, lowercase bases, indels, +#' chromosome/position format, all-NA rows/columns, Ref/Alt sequence presence). #' - Extracts reference and total read counts per sample and target. #' - Derives `AD` (ref,alt) by subtraction (alt = total − ref). -#' - If `get_REF_ALT = TRUE`, attempts to recover true REF/ALT bases by comparing -#' the Ref/Alt probe sequences; targets with >1 polymorphism are discarded. -#' - Optionally accepts a `markers_info` CSV to supply `CHROM`, `POS`, `REF`, `ALT` -#' bypassing sequence-based inference. +#' - If `get_REF_ALT = TRUE`, recovers REF/ALT bases either from `markers_info` +#' (when `Ref`/`Alt` columns are present) or by comparing the Ref/Alt probe +#' sequences in the MADC file (with strand correction via `botloci_file`). +#' Targets with >1 polymorphism between sequences are discarded. +#' - Optionally accepts a `markers_info` CSV to supply `CHROM`, `POS`, `REF`, +#' `ALT`, bypassing sequence-based inference. #' #' **Output VCF layout** #' - `INFO` fields: @@ -32,27 +35,58 @@ #' complemented prior to base comparison so that REF/ALT are reported in the #' top-strand genomic orientation. #' -#' **Sanity check behavior** -#' - If required columns or fixed IDs are missing, the function `stop()`s. -#' - If IUPAC/lowercase/indels are detected and `markers_info` is **not** -#' provided, the function `stop()`s with a diagnostic message explaining what to fix. +#' **Sanity check behaviour and requirements** +#' +#' The function always stops if required columns (`CloneID`, `AlleleID`, +#' `AlleleSequence`) are missing. +#' +#' For the remaining checks the required inputs depend on the combination of +#' check result and `get_REF_ALT`: +#' +#' | Check | Status | `get_REF_ALT` | Required | +#' |---|---|---|---| +#' | **IUPAC codes** | detected | `TRUE` | `markers_info` with `Ref`/`Alt` | +#' | | detected | `FALSE` | — | +#' | | not detected | `TRUE` | `botloci_file` **or** `markers_info` with `Ref`/`Alt` | +#' | | not detected | `FALSE` | — | +#' | **Indels** | detected | `TRUE` | `markers_info` with `Ref`/`Alt` | +#' | | detected | `FALSE` | — | +#' | | not detected | `TRUE` | `botloci_file` **or** `markers_info` with `Ref`/`Alt` | +#' | | not detected | `FALSE` | — | +#' | **ChromPos** | valid | `TRUE` | `botloci_file` **or** `markers_info` with `Ref`/`Alt` | +#' | | valid | `FALSE` | — | +#' | | invalid | `TRUE` | `markers_info` with `Chr`/`Pos`/`Ref`/`Alt` **or** `markers_info` with `Chr`/`Pos` + `botloci_file` | +#' | | invalid | `FALSE` | `markers_info` with `Chr`/`Pos` | +#' | **FixAlleleIDs** | fixed | `TRUE` | `botloci_file` **or** `markers_info` with `Ref`/`Alt` | +#' | | fixed | `FALSE` | — | +#' | | not fixed | `TRUE` | `markers_info` with `Ref`/`Alt` | +#' | | not fixed | `FALSE` | — | #' #' @param madc_file character. Path to the input MADC CSV file. #' @param output.file character. Path to the output VCF file to write. #' @param botloci_file character or `NULL` (default `NULL`). Path to a plain-text #' file listing target IDs designed on the **bottom** strand (one ID per line). -#' Required only when `get_REF_ALT = TRUE` and `markers_info` does not supply -#' `Ref` and `Alt` columns. +#' Used for strand-correcting probe sequences when `get_REF_ALT = TRUE` and +#' `markers_info` does not supply `Ref` and `Alt` columns. Also required when +#' `ChromPos` is invalid and `markers_info` does not provide `Ref`/`Alt`. #' @param markers_info character or `NULL`. Optional path to a CSV providing target -#' metadata. Minimum required columns: `CloneID` (or `BI_markerID`), `Chr`, `Pos`. -#' When `get_REF_ALT = TRUE`, also requires `Ref` and `Alt` (replaces probe-sequence -#' inference). `Type` and `Indel_pos` are never required by this function. -#' @param get_REF_ALT logical (default `FALSE`). If `TRUE`, attempts to infer REF/ALT -#' bases from the Ref/Alt probe sequences in the MADC file (with strand correction -#' using `botloci_file`). Targets with more than one difference between Ref/Alt -#' sequences are removed. -#' @param collapse_matches_counts logical (default `FALSE`). If `TRUE`, counts for targets with identical `CHROM_POS` are summed together. This is useful when the MADC file contains multiple rows per target (e.g., due to multiple alleles or technical replicates) and you want to aggregate them into a single entry per unique target. -#' @param verbose logical (default `TRUE`). If `TRUE`, prints detailed progress messages about each processing step. +#' metadata. Accepted columns: +#' - `CloneID` or `BI_markerID` (required as marker identifier); +#' - `Chr`, `Pos` — required when `CloneID` does not follow the `Chr_Pos` format; +#' - `Ref`, `Alt` — required when `get_REF_ALT = TRUE` and probe-sequence +#' inference is not possible (IUPAC codes, indels, or unfixed allele IDs). +#' @param get_REF_ALT logical (default `FALSE`). If `TRUE`, attempts to recover +#' REF/ALT bases. The source is chosen automatically: `markers_info` `Ref`/`Alt` +#' columns take priority; otherwise probe sequences from the MADC are compared +#' (with `botloci_file` for strand correction). Targets with more than one +#' difference between Ref/Alt sequences are removed. When `FALSE`, REF and ALT +#' are set to `"."` in the output VCF. +#' @param collapse_matches_counts logical (default `FALSE`). If `TRUE`, counts for +#' `|AltMatch` and `|RefMatch` rows are summed into their corresponding `|Ref` +#' and `|Alt` rows before building the matrices. Useful when the MADC contains +#' multiple allele rows per target that should be aggregated. +#' @param verbose logical (default `TRUE`). If `TRUE`, prints detailed progress +#' messages about each processing step. #' #' @return (Invisibly) returns the path to `output.file`. The side effect is a #' **VCF v4.3** written to disk containing one row per target and columns for all @@ -133,18 +167,18 @@ madc2vcf_targets <- function(madc_file, for(i in seq_along(messages_results)) vmsg(messages_results[i], verbose = verbose, level = 1, type = ">>") - if(any(!(checks$checks[c("Columns", "FixAlleleIDs")]))){ - idx <- which(!(checks$checks[c("Columns", "FixAlleleIDs")])) + if(any(!(checks$checks[c("Columns")]))){ + idx <- which(!(checks$checks[c("Columns")])) if(length(idx) > 0) stop(paste("The MADC file does not pass the sanity checks:\n", - paste(messages_results[c("Columns", "FixAlleleIDs")[idx]], collapse = "\n"))) + paste(messages_results[c("Columns")[idx]], collapse = "\n"))) } if(any(checks$checks[c("IUPACcodes", "Indels")]) && get_REF_ALT){ idx <- which((checks$checks[c("IUPACcodes", "Indels")])) if(is.null(markers_info)) stop(paste("Please provide a markers_info file to proceed. The MADC file does not pass the sanity checks:", paste(messages_results[c("IUPACcodes", "Indels")[idx]], collapse = "\n"))) - else vmsg("MADC file has some issues (IUPAC codes, indels), but a markers_info file is provided, so proceeding with VCF generation.", verbose = verbose, level = 1, type = ">>") + else vmsg("MADC file has IUPAC codes and/or indels, but a markers_info file is provided, so proceeding with VCF generation.", verbose = verbose, level = 1, type = ">>") } if(checks$checks["LowerCase"]){ @@ -163,6 +197,35 @@ madc2vcf_targets <- function(madc_file, mi_has_ref_alt <- all(c("Ref", "Alt") %in% colnames(mi_df)) } + if(!checks$checks["FixAlleleIDs"]){ + vmsg("MADC file has not been processed by HapApp.", verbose = verbose, level = 1) + if(get_REF_ALT){ + if(!mi_has_ref_alt) stop("MADC file has not been processed by HapApp. BIGr only provide results if get_REF_ALT is set to FALSE or if is TRUE but a marker_info with REF and ALT information is provided.") + } + # The check points to FALSE if the 6 initial rows exist or if there are no fixed allele ID (aka _0001, _0002, etc) + n <- nrow(report) + idx <- seq_len(min(6L, n)) + first_col_vals <- report[[1]][idx] + all_blank_or_star <- all(first_col_vals %in% c("", "*"), na.rm = TRUE) + # Also require that both _0001 and _0002 appear in AlleleID + if(all_blank_or_star) { + colnames(report) <- report[7,] + report <- report[-c(1:7),] + } + } + + if(checks$checks["allNArow"]){ + idx <- apply(report, 1, function(x) all(is.na(x) | x == "")) + report <- report[!idx, ] + vmsg("MADC contains rows with all NA values. Rows %s will be removed.", verbose = verbose, level = 1, type = ">>", paste(which(idx), collapse = ", ")) + } + + if(checks$checks["allNAcol"]){ + idx <- apply(report, 2, function(x) all(is.na(x) | x == "")) + report <- report[, !idx] + vmsg("MADC contains columns with all NA values. Columns %s will be removed.", verbose = verbose, level = 1, type = ">>", paste0(which(idx), collapse = ",")) + } + if(!isTRUE(checks$checks["ChromPos"])) { if(is.null(markers_info)){ stop("CloneID column does not follow the 'Chr_Pos'. ", @@ -201,11 +264,21 @@ madc2vcf_targets <- function(madc_file, } } + # Throw message if OtherAlleles are present + if(checks$checks["OtherAlleles"]) { + vmsg("AlleleID contains alleles other than Ref and Alt. These will be ignored in the VCF output. Use function madc2vcf_all to include them.", verbose = verbose, level = 1, type = ">>") + } + + # Make sure counts are numeric + count_cols <- setdiff(colnames(report), c("CloneID", "AlleleID", "AlleleSequence")) + report[count_cols] <- lapply(report[count_cols], function(x) as.numeric(as.character(x))) + vmsg("Input checks done!", verbose = verbose, level = 1, type = ">>") vmsg("Extracting depth information", verbose = verbose, level = 0, type = ">>") matrices <- get_countsMADC(madc_object = report, collapse_matches_counts = collapse_matches_counts, verbose = verbose) + ref_df <- data.frame(matrices[[1]], check.names = FALSE) alt_df <- data.frame(matrices[[2]]-matrices[[1]], check.names = FALSE) size_df <- data.frame(matrices[[2]], check.names = FALSE) @@ -230,7 +303,7 @@ madc2vcf_targets <- function(madc_file, stop("The markers_info file must contain a marker ID column named either 'CloneID' or 'BI_markerID'.") if(checks$checks["Indels"]) - vmsg("Indels detected in MADC file. Since Ref and Alt are provided in markers_info, Type and Indel_pos are not required.", + vmsg("Indels detected in MADC file. But it is okay because Ref and Alt are provided in markers_info.", verbose = verbose, level = 1, type = ">>") if(!all(c(id_col, "Chr", "Pos", "Ref", "Alt") %in% colnames(mi_df))) diff --git a/man/check_madc_sanity.Rd b/man/check_madc_sanity.Rd index 494145e..5328a4f 100644 --- a/man/check_madc_sanity.Rd +++ b/man/check_madc_sanity.Rd @@ -9,35 +9,59 @@ check_madc_sanity(report) \arguments{ \item{report}{A \code{data.frame} with at least the columns \code{CloneID}, \code{AlleleID}, and \code{AlleleSequence}. The first column is also -used in the “FixAlleleIDs” check to inspect its first up to six entries.} +used in the \code{FixAlleleIDs} check to inspect its first up to six entries. +If \code{FixAlleleIDs} is \code{FALSE} (raw DArT format), the first 7 rows are +treated as header filler and skipped before further checks are run.} } \value{ -A list with: +A named list with three elements: \describe{ -\item{checks}{Named logical vector with entries -\code{Columns}, \code{FixAlleleIDs}, \code{IUPACcodes}, \code{LowerCase}, \code{Indels}.} -\item{indel_clone_ids}{Character vector of \code{CloneID}s where ref/alt lengths differ. -Returns \code{character(0)} if none, or \code{NULL} when required columns are missing.} +\item{checks}{Named logical vector with eight entries: +\code{Columns}, \code{FixAlleleIDs}, \code{IUPACcodes}, \code{LowerCase}, \code{Indels}, +\code{ChromPos}, \code{allNAcol}, \code{allNArow}. +\code{TRUE} means the condition was detected (or passed for \code{Columns} and +\code{FixAlleleIDs}); \code{NA} means the check was skipped.} +\item{messages}{Named list of length-2 character vectors, one per check. +Element \verb{[[1]]} is the message when the check is \code{TRUE}, element \verb{[[2]]} +when it is \code{FALSE}. Indexed by the same names as \code{checks}.} +\item{indel_clone_ids}{Character vector of \code{CloneID}s where ref/alt +lengths differ. Returns \code{character(0)} if none are found, or \code{NULL} +when required columns are missing.} } } \description{ -Performs five quick validations on an allele report: +Performs eight quick validations on an allele report: \enumerate{ -\item \strong{Columns} – required columns are present (\code{CloneID}, \code{AlleleID}, \code{AlleleSequence}); -\item \strong{FixAlleleIDs} – first column’s first up-to-6 rows are not all blank or "*" +\item \strong{Columns} - required columns are present (\code{CloneID}, \code{AlleleID}, \code{AlleleSequence}); +\item \strong{FixAlleleIDs} - first column's first up-to-6 rows are not all blank or \code{"*"} \emph{and} both \verb{_0001} and \verb{_0002} appear in \code{AlleleID}; -\item \strong{IUPACcodes} – presence of non-ATCG characters in \code{AlleleSequence}; -\item \strong{LowerCase} – presence of lowercase a/t/c/g in \code{AlleleSequence}; -\item \strong{Indels} – reference/alternate allele lengths differ for the same \code{CloneID}. +\item \strong{IUPACcodes} - presence of non-ATCG characters in \code{AlleleSequence}; +\item \strong{LowerCase} - presence of lowercase a/t/c/g in \code{AlleleSequence}; +\item \strong{Indels} - reference/alternate allele lengths differ for the same \code{CloneID}, +or a \code{"-"} character is present in \code{AlleleSequence}; +\item \strong{ChromPos} - all \code{CloneID} values follow the \code{Chr_Position} format +(prefix matches \code{"chr"} case-insensitively, suffix is a positive integer); +\item \strong{allNAcol} - at least one column contains only \code{NA} values; +\item \strong{allNArow} - at least one row contains only \code{NA} values. } } \details{ \itemize{ -\item \strong{IUPAC check:} Flags any character outside \code{ATCG} (case-insensitive), -which will include ambiguity codes (\code{N}, \code{R}, \code{Y}, etc.) and symbols like \code{-}. -\item \strong{Indels:} Rows are split by \code{AlleleID} containing \code{"Ref_0001"} vs \code{"Alt_0002"}, -merged by \code{CloneID}, and the lengths of \code{AlleleSequence} are compared. -\item If required columns are missing, only \strong{Columns} is evaluated (\code{FALSE}) and +\item \strong{FixAlleleIDs:} When the check fails (raw DArT format), row 7 is used as +the column header and the first 7 rows are dropped before subsequent checks. +\item \strong{IUPAC check:} Flags any character outside \code{A}, \code{T}, \code{C}, \code{G} and \code{"-"} +(case-insensitive), which includes ambiguity codes (\code{N}, \code{R}, \code{Y}, etc.). +\item \strong{Indels:} Rows are split by \code{AlleleID} containing \code{"Ref_0001"} vs +\code{"Alt_0002"}, merged by \code{CloneID}, and the lengths of \code{AlleleSequence} +are compared. A \code{"-"} anywhere in \code{AlleleSequence} is also treated as +evidence of an indel. +\item \strong{ChromPos:} Each \code{CloneID} is split on \code{"_"} into exactly two parts; the +first part must match \code{"Chr"} (case-insensitive) and the second must be a +positive integer. Returns \code{FALSE} when any \code{CloneID} is \code{NA}. +\item \strong{allNAcol / allNArow:} Detected via \code{apply()} over columns/rows +respectively; useful for flagging empty or corrupt entries. +\item If required columns are missing (\code{Columns = FALSE}), only \code{Columns} and +\code{FixAlleleIDs} are evaluated; all other checks remain \code{NA} and \code{indel_clone_ids} is returned as \code{NULL}. } } diff --git a/tests/testthat/test-check_madc_sanity.R b/tests/testthat/test-check_madc_sanity.R index 91e5493..0fbe774 100644 --- a/tests/testthat/test-check_madc_sanity.R +++ b/tests/testthat/test-check_madc_sanity.R @@ -1,13 +1,13 @@ test_that("check madc",{ github_path <- "https://raw.githubusercontent.com/Breeding-Insight/BIGapp-PanelHub/refs/heads/long_seq/test_madcs/" - names <- c("Columns", "FixAlleleIDs", "IUPACcodes", "LowerCase", "Indels", "ChromPos", "allNAcol", "allNArow") + names <- c("Columns", "FixAlleleIDs", "IUPACcodes", "LowerCase", "Indels", "ChromPos", "allNAcol", "allNArow", "RefAltSeqs") # raw madc report <- read.csv(paste0(github_path,"/alfalfa_madc_raw.csv")) res <- check_madc_sanity(report) - exp <- c(TRUE, FALSE, FALSE, FALSE, FALSE, TRUE, TRUE, FALSE) + exp <- c(TRUE, FALSE, FALSE, FALSE, FALSE, TRUE, TRUE, FALSE, TRUE) names(exp) <- names expect_equal(res$checks, exp) @@ -15,7 +15,7 @@ test_that("check madc",{ report <- read.csv(paste0(github_path,"/alfalfa_lowercase.csv")) res <- check_madc_sanity(report) - exp <- c(TRUE, TRUE, FALSE, TRUE, FALSE, TRUE, FALSE, FALSE) + exp <- c(TRUE, TRUE, FALSE, TRUE, FALSE, TRUE, FALSE, FALSE, TRUE) names(exp) <- names expect_equal(res$checks, exp) @@ -23,7 +23,7 @@ test_that("check madc",{ report <- read.csv(paste0(github_path,"/alfalfa_IUPAC.csv")) res <- check_madc_sanity(report) - exp <- c(TRUE, TRUE, TRUE, FALSE, FALSE, TRUE, FALSE, FALSE) + exp <- c(TRUE, TRUE, TRUE, FALSE, FALSE, TRUE, FALSE, FALSE,TRUE) names(exp) <- names expect_equal(res$checks, exp) @@ -31,7 +31,7 @@ test_that("check madc",{ report <- read.csv(paste0(github_path,"/alfalfa_madc.csv")) res <- check_madc_sanity(report) - exp <- c(TRUE, TRUE, FALSE, FALSE, FALSE, TRUE, FALSE, FALSE) + exp <- c(TRUE, TRUE, FALSE, FALSE, FALSE, TRUE, FALSE, FALSE,TRUE) names(exp) <- names expect_equal(res$checks, exp) @@ -39,7 +39,7 @@ test_that("check madc",{ report <- read.csv(paste0(github_path,"/potato_indel_madc.csv")) res <- check_madc_sanity(report) - exp <- c(TRUE, TRUE, FALSE, FALSE, TRUE, FALSE, FALSE, FALSE) + exp <- c(TRUE, TRUE, FALSE, FALSE, TRUE, FALSE, FALSE, FALSE,TRUE) names(exp) <- names expect_equal(res$checks, exp) @@ -47,7 +47,7 @@ test_that("check madc",{ report <- read.csv(paste0(github_path,"/potato_indel_IUPAC.csv")) res <- check_madc_sanity(report) - exp <- c(TRUE, TRUE, TRUE, TRUE, TRUE, FALSE, FALSE, FALSE) + exp <- c(TRUE, TRUE, TRUE, FALSE, TRUE, FALSE, FALSE, FALSE,TRUE) names(exp) <- names expect_equal(res$checks, exp) @@ -55,7 +55,7 @@ test_that("check madc",{ report <- read.csv(paste0(github_path,"/potato_indel_lowercase.csv")) res <- check_madc_sanity(report) - exp <- c(TRUE, TRUE, FALSE, TRUE, TRUE, FALSE, FALSE, FALSE) + exp <- c(TRUE, TRUE, FALSE, TRUE, TRUE, FALSE, FALSE, FALSE,TRUE) names(exp) <- names expect_equal(res$checks, exp) @@ -63,7 +63,7 @@ test_that("check madc",{ report <- read.csv(paste0(github_path,"/potato_more_indels_madc_ChromPosFALSE.csv")) res <- check_madc_sanity(report) - exp <- c(TRUE, TRUE, FALSE, FALSE, TRUE, FALSE, FALSE, FALSE) + exp <- c(TRUE, TRUE, FALSE, FALSE, TRUE, FALSE, FALSE, FALSE,TRUE) names(exp) <- names expect_equal(res$checks, exp) }) diff --git a/tests/testthat/test-madc2vcf_targets.R b/tests/testthat/test-madc2vcf_targets.R index 5e21c7c..ce2ab8e 100644 --- a/tests/testthat/test-madc2vcf_targets.R +++ b/tests/testthat/test-madc2vcf_targets.R @@ -642,8 +642,81 @@ test_that("simu alfalfa",{ expect_error( madc2vcf_targets(madc_file = alfalfa_madc_raw, output.file = out, - get_REF_ALT = FALSE, + get_REF_ALT = TRUE, + verbose = FALSE) + ) + + expect_error( + madc2vcf_targets(madc_file = alfalfa_madc_raw, + output.file = out, + get_REF_ALT = TRUE, + botloci_file = alfalfa_botloci, verbose = FALSE) ) + + madc2vcf_targets(madc_file = alfalfa_madc_raw, + output.file = out, + get_REF_ALT = FALSE, + verbose = FALSE) + + vcf <- read.vcfR(out, verbose = FALSE) + lut <- read.csv(alfalfa_markers_info) + vcf_infos <- vcf@fix[,c(1:5)] + lut_infos <- lut[match(vcf@fix[,3],lut$BI_markerID),c(2:6)] + check <- cbind(vcf_infos,lut_infos) + expect_equal(as.numeric(check$POS), check$Pos) + dp <- extract.gt(vcf, "DP", as.numeric = TRUE) + expect_equal(sum(dp[,10]), 43691) + + madc2vcf_targets(madc_file = alfalfa_madc_raw, + output.file = out, + get_REF_ALT = FALSE, + collapse_matches_counts = TRUE, + verbose = FALSE) + + vcf <- read.vcfR(out, verbose = FALSE) + expect_s4_class(vcf, "vcfR") + expect_true(all(is.na(vcf@fix[, "REF"]))) + expect_true(all(is.na(vcf@fix[, "ALT"]))) + DP <- extract.gt(vcf, "DP", as.numeric = TRUE) + expect_equal(sum(DP[1,]), 4534) + expect_equal(sum(DP[,5]), 56547) + + madc2vcf_targets(madc_file = alfalfa_madc_raw, + output.file = out, + get_REF_ALT = TRUE, + markers_info = alfalfa_markers_info, + collapse_matches_counts = FALSE, + verbose = FALSE) + + vcf <- read.vcfR(out, verbose = FALSE) + lut <- read.csv(alfalfa_markers_info) + vcf_infos <- vcf@fix[,c(1:5)] + lut_infos <- lut[match(vcf@fix[,3],lut$BI_markerID),c(2:6)] + check <- cbind(vcf_infos,lut_infos) + expect_equal(check$REF, check$Ref) + expect_equal(check$ALT, check$Alt) + expect_equal(as.numeric(check$POS), check$Pos) + dp <- extract.gt(vcf, "DP", as.numeric = TRUE) + expect_equal(sum(dp[,10]), 43691) + + madc2vcf_targets(madc_file = alfalfa_madc_raw, + output.file = out, + get_REF_ALT = TRUE, + markers_info = alfalfa_markers_info, + collapse_matches_counts = TRUE, + verbose = FALSE) + + vcf <- read.vcfR(out, verbose = FALSE) + lut <- read.csv(alfalfa_markers_info) + vcf_infos <- vcf@fix[,c(1:5)] + lut_infos <- lut[match(vcf@fix[,3],lut$BI_markerID),c(2:6)] + check <- cbind(vcf_infos,lut_infos) + expect_equal(check$REF, check$Ref) + expect_equal(check$ALT, check$Alt) + expect_equal(as.numeric(check$POS), check$Pos) + dp <- extract.gt(vcf, "DP", as.numeric = TRUE) + expect_equal(sum(DP[1,]), 4534) + expect_equal(sum(DP[,5]), 56547) }) }) From ee509819f866fc241d8002177e2bc2999cc4e6d9 Mon Sep 17 00:00:00 2001 From: Alexander Sandercock <39815775+alex-sandercock@users.noreply.github.com> Date: Fri, 27 Mar 2026 09:02:21 -0400 Subject: [PATCH 18/80] Potential fix for pull request finding Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com> --- NEWS.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/NEWS.md b/NEWS.md index 1a0d64d..84f1c79 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,9 +1,9 @@ # BIGr 0.6.3 -- New function to check MADC file: check_madc_sanity. By now, it checks presence of required columns, if fixed allele IDs were assigned, presence of IUPAC codes, lower case sequences bases, indels, and chromosome and position information. -- Add new arguments markers_info for users to add CSV file with marker information such as CHROM, POS, Marker Type and position of indels. For BI species, these information is at https://github.com/Breeding-Insight/BIGapp-PanelHub -- Check inputs for madc2vcf_all -- Update affiliation on DESCRIPTION +- New function to check MADC files: `check_madc_sanity`. Currently, it checks for the presence of required columns, whether fixed allele IDs were assigned, the presence of IUPAC codes, lowercase sequence bases, indels, and chromosome and position information. +- Added new argument `markers_info`, which allows users to provide a CSV file with marker information such as CHROM, POS, marker type, and position of indels. For BI species, this information is available from [PanelHub](https://github.com/Breeding-Insight/BIGapp-PanelHub). +- Checked inputs for `madc2vcf_all`. +- Updated affiliation in `DESCRIPTION`. # BIGr 0.6.2 From d3a40610f6a862d187086f220d70581dc6a1b0f1 Mon Sep 17 00:00:00 2001 From: Alexander Sandercock <39815775+alex-sandercock@users.noreply.github.com> Date: Fri, 27 Mar 2026 09:03:19 -0400 Subject: [PATCH 19/80] Potential fix for pull request finding Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com> --- R/check_madc_sanity.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R/check_madc_sanity.R b/R/check_madc_sanity.R index 19d3ff6..692f81b 100644 --- a/R/check_madc_sanity.R +++ b/R/check_madc_sanity.R @@ -91,8 +91,8 @@ check_madc_sanity <- function(report) { messages[["Columns"]] <- c("Required columns are present\n", "One or more required columns missing. Verify if your file has columns: CloneID, AlleleID, AlleleSequence\n") messages[["FixAlleleIDs"]] <- c("Fixed Allele IDs look good\n", - "MADC not processed by BI. Please contact us to assign allele IDs to your MADC according to the specie haplotype dabatase. This guarantee reproducibility between diferent datasets\n") - messages[["IUPACcodes"]] <- c("IUPAC (non-ATCG) codes found in AlleleSequence. This codes are not currently supported\n", + "MADC was not processed by BI. Please contact us to assign allele IDs to your MADC using the species haplotype database. This guarantees reproducibility across different datasets.\n") + messages[["IUPACcodes"]] <- c("IUPAC (non-ATCG) codes found in AlleleSequence. These codes are not currently supported\n", "No IUPAC (non-ATCG) codes found in AlleleSequence\n") messages[["LowerCase"]] <- c("Lowercase bases found in AlleleSequence\n", "No lowercase bases found in AlleleSequence\n") From f765c7cc8bbb79d6a78e48683aa9b1dcdc08802d Mon Sep 17 00:00:00 2001 From: Alexander Sandercock <39815775+alex-sandercock@users.noreply.github.com> Date: Fri, 27 Mar 2026 09:05:24 -0400 Subject: [PATCH 20/80] Potential fix for pull request finding Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com> --- R/filterVCF.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/filterVCF.R b/R/filterVCF.R index 550468c..14f94c0 100644 --- a/R/filterVCF.R +++ b/R/filterVCF.R @@ -36,7 +36,7 @@ #' #' @export filterVCF <- function(vcf.file, - quality.rates = F, + quality.rates = FALSE, filter.OD = NULL, filter.BIAS.min = NULL, filter.BIAS.max = NULL, From 87bb1fc75464759629057450e77fa925eae1f21a Mon Sep 17 00:00:00 2001 From: Alexander Sandercock <39815775+alex-sandercock@users.noreply.github.com> Date: Fri, 27 Mar 2026 09:08:14 -0400 Subject: [PATCH 21/80] Potential fix for pull request finding Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com> --- R/check_ped.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/check_ped.R b/R/check_ped.R index a2277bc..0ebbd5a 100644 --- a/R/check_ped.R +++ b/R/check_ped.R @@ -93,7 +93,7 @@ check_ped <- function(ped.file, seed = NULL, verbose = TRUE) { repeated_ids_report <- conflicting_ids #### check 3: missing parents #### - for (i in 1:nrow(data)) { + for (i in seq_len(nrow(data))) { id <- data$id[i] sire <- data$sire[i] dam <- data$dam[i] From 6059c108f31dfb2e041aba9ac221625c372b86d8 Mon Sep 17 00:00:00 2001 From: Cris Taniguti Date: Fri, 27 Mar 2026 10:33:10 -0400 Subject: [PATCH 22/80] Update R/madc2vcf_targets.R Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- R/madc2vcf_targets.R | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/R/madc2vcf_targets.R b/R/madc2vcf_targets.R index 1416044..c7fd269 100644 --- a/R/madc2vcf_targets.R +++ b/R/madc2vcf_targets.R @@ -523,8 +523,21 @@ madc2vcf_targets <- function(madc_file, } if(sum(is.na(vcf_df$REF)) > 0) { - vmsg(paste(sum(is.na(vcf_df$REF)), "markers removed because of presence of more than one polymorphism between ref and alt sequences."), verbose = verbose, level = 1, type = ">>") - warning(paste("Markers removed because of presence of more than one polymorphism between ref and alt sequences:",sum(is.na(vcf_df$REF)))) + vmsg( + paste( + sum(is.na(vcf_df$REF)), + "markers removed because REF could not be reliably determined (e.g., multiple polymorphisms or no difference between probe sequences)." + ), + verbose = verbose, + level = 1, + type = ">>" + ) + warning( + paste( + "Markers removed because REF could not be reliably determined (e.g., multiple polymorphisms or no difference between probe sequences):", + sum(is.na(vcf_df$REF)) + ) + ) vcf_df <- vcf_df[-which(is.na(vcf_df$REF)),] } From b09b0c1faba04150d51acd4716e53fa1afeff094 Mon Sep 17 00:00:00 2001 From: Cris Taniguti Date: Fri, 27 Mar 2026 10:38:45 -0400 Subject: [PATCH 23/80] Update R/check_madc_sanity.R Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- R/check_madc_sanity.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/check_madc_sanity.R b/R/check_madc_sanity.R index 759fce3..472c9ab 100644 --- a/R/check_madc_sanity.R +++ b/R/check_madc_sanity.R @@ -137,7 +137,7 @@ check_madc_sanity <- function(report) { indel_mask[same_len] <- n_diffs > 1 } - checks["Indels"] <- any(indel_mask) | any(grepl("-", report$AlleleSequence)) + checks["Indels"] <- any(indel_mask) | any(grepl("-", report$AlleleSequence), na.rm = TRUE) indels <- if (any(indel_mask)) merged$CloneID[indel_mask] else character(0) } else { From 409dbd33ff80e49b3e9dee39a2a16fe92cfb7728 Mon Sep 17 00:00:00 2001 From: Cris Taniguti Date: Fri, 27 Mar 2026 10:40:18 -0400 Subject: [PATCH 24/80] Update R/get_countsMADC.R Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- R/get_countsMADC.R | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/R/get_countsMADC.R b/R/get_countsMADC.R index 8cce70b..3854559 100644 --- a/R/get_countsMADC.R +++ b/R/get_countsMADC.R @@ -223,7 +223,14 @@ get_counts <- function(madc_file = NULL, madc_object = NULL, collapse_matches_co if(collapse_matches_counts){ filtered_df <- madc_df[order(madc_df$AlleleID),] %>% - mutate(Type = ifelse(grepl("Alt", AlleleID), "Alt", "Ref")) %>% + # Keep only Ref/Alt alleles and their Match variants; drop other allele types + dplyr::filter(grepl("\\|(Ref|Alt)(Match)?\\b", AlleleID)) %>% + mutate( + Type = dplyr::case_when( + grepl("\\|Alt(Match)?\\b", AlleleID) ~ "Alt", + grepl("\\|Ref(Match)?\\b", AlleleID) ~ "Ref" + ) + ) %>% group_by(CloneID, Type) %>% summarise( AlleleID = paste0(unique(CloneID), "|", unique(Type)), From e6fce1953e1a1f3dc94a9c920efed49c3f62fdb7 Mon Sep 17 00:00:00 2001 From: Cris Taniguti Date: Fri, 27 Mar 2026 10:43:21 -0400 Subject: [PATCH 25/80] Update R/get_countsMADC.R Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- R/get_countsMADC.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/get_countsMADC.R b/R/get_countsMADC.R index 3854559..58a2496 100644 --- a/R/get_countsMADC.R +++ b/R/get_countsMADC.R @@ -235,7 +235,7 @@ get_counts <- function(madc_file = NULL, madc_object = NULL, collapse_matches_co summarise( AlleleID = paste0(unique(CloneID), "|", unique(Type)), AlleleSequence = first(AlleleSequence), - across(where(is.numeric), sum), + across(where(is.numeric), ~ sum(.x, na.rm = TRUE)), .groups = "drop" ) %>% select(AlleleID, CloneID, AlleleSequence, everything(), -Type) From 669ac4eb77ace06ded03a69fbfab29cc1013c680 Mon Sep 17 00:00:00 2001 From: Cris Taniguti Date: Fri, 27 Mar 2026 10:49:14 -0400 Subject: [PATCH 26/80] Update R/check_madc_sanity.R Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- R/check_madc_sanity.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/check_madc_sanity.R b/R/check_madc_sanity.R index 472c9ab..87df09b 100644 --- a/R/check_madc_sanity.R +++ b/R/check_madc_sanity.R @@ -147,7 +147,7 @@ check_madc_sanity <- function(report) { # --- All NA ---- checks["allNArow"] <- any(apply(report, 1, function(x) all(is.na(x) | x == ""))) - checks["allNAcol"] <- any(apply(report, 2, function(x) all(is.na(x)) | x == "")) + checks["allNAcol"] <- any(apply(report, 2, function(x) all(is.na(x) | x == ""))) # ---- Chrom Pos ---- if(!any(is.na(report$CloneID))) { From bbfbee226439300609186c21b0fb047c54c8a33d Mon Sep 17 00:00:00 2001 From: Cristianetaniguti Date: Fri, 27 Mar 2026 10:50:24 -0400 Subject: [PATCH 27/80] fix tests --- tests/testthat/test-check_madc_sanity.R | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/tests/testthat/test-check_madc_sanity.R b/tests/testthat/test-check_madc_sanity.R index 0fbe774..a185997 100644 --- a/tests/testthat/test-check_madc_sanity.R +++ b/tests/testthat/test-check_madc_sanity.R @@ -1,13 +1,13 @@ test_that("check madc",{ github_path <- "https://raw.githubusercontent.com/Breeding-Insight/BIGapp-PanelHub/refs/heads/long_seq/test_madcs/" - names <- c("Columns", "FixAlleleIDs", "IUPACcodes", "LowerCase", "Indels", "ChromPos", "allNAcol", "allNArow", "RefAltSeqs") + names <- c("Columns", "FixAlleleIDs", "IUPACcodes", "LowerCase", "Indels", "ChromPos", "allNAcol", "allNArow", "RefAltSeqs", "OtherAlleles") # raw madc report <- read.csv(paste0(github_path,"/alfalfa_madc_raw.csv")) res <- check_madc_sanity(report) - exp <- c(TRUE, FALSE, FALSE, FALSE, FALSE, TRUE, TRUE, FALSE, TRUE) + exp <- c(TRUE, FALSE, FALSE, FALSE, FALSE, TRUE, TRUE, FALSE, TRUE, FALSE) names(exp) <- names expect_equal(res$checks, exp) @@ -15,7 +15,7 @@ test_that("check madc",{ report <- read.csv(paste0(github_path,"/alfalfa_lowercase.csv")) res <- check_madc_sanity(report) - exp <- c(TRUE, TRUE, FALSE, TRUE, FALSE, TRUE, FALSE, FALSE, TRUE) + exp <- c(TRUE, TRUE, FALSE, TRUE, FALSE, TRUE, FALSE, FALSE, TRUE, FALSE) names(exp) <- names expect_equal(res$checks, exp) @@ -23,7 +23,7 @@ test_that("check madc",{ report <- read.csv(paste0(github_path,"/alfalfa_IUPAC.csv")) res <- check_madc_sanity(report) - exp <- c(TRUE, TRUE, TRUE, FALSE, FALSE, TRUE, FALSE, FALSE,TRUE) + exp <- c(TRUE, TRUE, TRUE, FALSE, FALSE, TRUE, FALSE, FALSE,TRUE, FALSE) names(exp) <- names expect_equal(res$checks, exp) @@ -31,7 +31,7 @@ test_that("check madc",{ report <- read.csv(paste0(github_path,"/alfalfa_madc.csv")) res <- check_madc_sanity(report) - exp <- c(TRUE, TRUE, FALSE, FALSE, FALSE, TRUE, FALSE, FALSE,TRUE) + exp <- c(TRUE, TRUE, FALSE, FALSE, FALSE, TRUE, FALSE, FALSE,TRUE, FALSE) names(exp) <- names expect_equal(res$checks, exp) @@ -39,7 +39,7 @@ test_that("check madc",{ report <- read.csv(paste0(github_path,"/potato_indel_madc.csv")) res <- check_madc_sanity(report) - exp <- c(TRUE, TRUE, FALSE, FALSE, TRUE, FALSE, FALSE, FALSE,TRUE) + exp <- c(TRUE, TRUE, FALSE, FALSE, TRUE, FALSE, FALSE, FALSE,TRUE, FALSE) names(exp) <- names expect_equal(res$checks, exp) @@ -47,7 +47,7 @@ test_that("check madc",{ report <- read.csv(paste0(github_path,"/potato_indel_IUPAC.csv")) res <- check_madc_sanity(report) - exp <- c(TRUE, TRUE, TRUE, FALSE, TRUE, FALSE, FALSE, FALSE,TRUE) + exp <- c(TRUE, TRUE, TRUE, FALSE, TRUE, FALSE, FALSE, FALSE,TRUE, FALSE) names(exp) <- names expect_equal(res$checks, exp) @@ -55,7 +55,7 @@ test_that("check madc",{ report <- read.csv(paste0(github_path,"/potato_indel_lowercase.csv")) res <- check_madc_sanity(report) - exp <- c(TRUE, TRUE, FALSE, TRUE, TRUE, FALSE, FALSE, FALSE,TRUE) + exp <- c(TRUE, TRUE, FALSE, TRUE, TRUE, FALSE, FALSE, FALSE,TRUE, FALSE) names(exp) <- names expect_equal(res$checks, exp) @@ -63,7 +63,7 @@ test_that("check madc",{ report <- read.csv(paste0(github_path,"/potato_more_indels_madc_ChromPosFALSE.csv")) res <- check_madc_sanity(report) - exp <- c(TRUE, TRUE, FALSE, FALSE, TRUE, FALSE, FALSE, FALSE,TRUE) + exp <- c(TRUE, TRUE, FALSE, FALSE, TRUE, FALSE, FALSE, FALSE,TRUE, FALSE) names(exp) <- names expect_equal(res$checks, exp) }) From 55ee61a91afafbb429011346f93c77a906334ef3 Mon Sep 17 00:00:00 2001 From: Cristianetaniguti Date: Fri, 27 Mar 2026 17:02:56 -0400 Subject: [PATCH 28/80] madc2vcf_all indels support okay --- NAMESPACE | 1 + R/madc2vcf_all.R | 152 +++++-- R/utils.R | 14 + man/check_madc_sanity.Rd | 49 ++- man/filterVCF.Rd | 2 +- man/madc2vcf_all.Rd | 8 +- man/madc2vcf_targets.Rd | 77 +++- tests/testthat/test-madc2vcf_all.R | 634 +++++++++++++++++++++++++++++ 8 files changed, 859 insertions(+), 78 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index 43c0559..6ecaef4 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -21,6 +21,7 @@ export(merge_MADCs) export(solve_composition_poly) export(thinSNP) export(updog2vcf) +export(url_exists) export(vmsg) import(dplyr) import(ggplot2) diff --git a/R/madc2vcf_all.R b/R/madc2vcf_all.R index 3cab307..08300f6 100644 --- a/R/madc2vcf_all.R +++ b/R/madc2vcf_all.R @@ -2,8 +2,8 @@ #' #' This function processes a MADC file to generate a VCF file containing both target and off-target SNPs. It includes options for filtering multiallelic SNPs and parallel processing to improve performance. #' -#' @param madc A string specifying the path to the MADC file. -#' @param botloci_file A string specifying the path to the file containing the target IDs designed in the bottom strand. +#' @param madc Required. A string specifying the path or URL to the MADC file. +#' @param botloci_file Required. A string specifying the path or URL to the file containing the target IDs designed in the bottom strand. #' @param hap_seq_file A string specifying the path to the haplotype database fasta file. #' @param rm_multiallelic_SNP A logical value. If TRUE, SNPs with more than one alternative base are removed. If FALSE, the thresholds specified by `multiallelic_SNP_dp_thr` and `multiallelic_SNP_sample_thr` are used to filter low-frequency SNP alleles. Default is FALSE. #' @param multiallelic_SNP_dp_thr A numeric value specifying the minimum depth by tag threshold for filtering low-frequency SNP alleles when `rm_multiallelic_SNP` is FALSE. Default is 0. @@ -51,8 +51,8 @@ #' @import vcfR #' #' @export -madc2vcf_all <- function(madc = NULL, - botloci_file = NULL, +madc2vcf_all <- function(madc, + botloci_file, hap_seq_file = NULL, n.cores = 1, rm_multiallelic_SNP = FALSE, @@ -66,9 +66,9 @@ madc2vcf_all <- function(madc = NULL, vmsg("Checking inputs", verbose = verbose, level = 0, type = ">>") # Input checks - if(!is.null(madc) & !file.exists(madc)) stop("MADC file not found. Please provide a valid path.") - if(!is.null(botloci_file) & !file.exists(botloci_file)) stop("Botloci file not found. Please provide a valid path.") - if(!is.null(hap_seq_file) & !file.exists(hap_seq_file)) stop("Haplotype sequence file not found. Please provide a valid path.") + if(!(file.exists(madc) | url_exists(madc))) stop("MADC file not found. Please provide a valid path or URL.") + if(!(file.exists(botloci_file) | url_exists(botloci_file))) stop("Botloci file not found. Please provide a valid path or URL.") + if(!is.null(hap_seq_file) & !(file.exists(hap_seq_file) | url_exists(hap_seq_file))) stop("Haplotype sequence file not found. Please provide a valid path or URL.") ## n.cores as integer if(!is.numeric(n.cores) | n.cores < 1) stop("n.cores should be a positive integer.") @@ -105,6 +105,9 @@ madc2vcf_all <- function(madc = NULL, if (check) message[1] else message[2] }, checks$checks, checks$messages) + for(i in seq_along(messages_results)) + vmsg(messages_results[i], verbose = verbose, level = 1, type = ">>") + if(any(!(checks$checks[c("Columns", "FixAlleleIDs")]))){ idx <- which(!(checks$checks[c("Columns", "FixAlleleIDs")])) stop(paste("The MADC file does not pass the sanity checks:\n", @@ -116,11 +119,31 @@ madc2vcf_all <- function(madc = NULL, stop(paste(messages_results[c("IUPACcodes")[idx]], collapse = "\n")) } + # Check whether markers_info is present and contains Ref + Alt columns + if(!is.null(markers_info)) { + mi_df <- read.csv(markers_info) + # Standardize marker ID column to CloneID + if(!"CloneID" %in% colnames(mi_df) && "BI_markerID" %in% colnames(mi_df)) { + colnames(mi_df)[colnames(mi_df) == "BI_markerID"] <- "CloneID" + vmsg("markers_info: 'BI_markerID' column renamed to 'CloneID' for internal use.", verbose = verbose, level = 1) + } else if(!"CloneID" %in% colnames(mi_df) && !"BI_markerID" %in% colnames(mi_df)) { + stop("markers_info must contain a marker ID column named either 'CloneID' or 'BI_markerID'.") + } + # Validate CloneID values + if(any(is.na(mi_df$CloneID) | mi_df$CloneID == "")) + stop("markers_info CloneID column contains empty or NA values. Please check your markers_info file.") + if(!any(mi_df$CloneID %in% report$CloneID)) + stop("None of the markers_info CloneID values match the MADC CloneID column. Please make sure they use the same marker IDs.") + n_match <- sum(mi_df$CloneID %in% report$CloneID) + n_total <- length(unique(report$CloneID)) + if(n_match < n_total) + vmsg("%s of %s MADC CloneIDs found in markers_info. Unmatched markers will be removed.", verbose = verbose, level = 1, n_match, n_total) + } else mi_df <- NULL + if(any(!checks$checks[c("ChromPos")])){ if(is.null(markers_info)) { stop(paste(messages_results[c("ChromPos")], collapse = "\n")) } else { - mi_df <- read.csv(markers_info) if(!all(c("Chr", "Pos") %in% colnames(mi_df))) stop("ChromPos check failed: CloneID values do not follow the Chr_Position format. ", "The markers_info file must contain 'Chr' and 'Pos' columns to supply CHROM and POS.") @@ -130,13 +153,20 @@ madc2vcf_all <- function(madc = NULL, if(any(checks$checks[c("Indels")])){ idx <- which((checks$checks[c("Indels")])) if(is.null(markers_info)) { - stop(paste(messages_results[c("Indels")[idx]], collapse = "\n")) + vmsg("The MADC file contains indels and markers_info file is not provided. Tags with indels as targets will be flagged with warnings and removed from the analysis. Provide markers_info with REF/ALT/Indel_pos if you want to include the targets indels.",verbose = verbose, level = 1, type = ">>>") } else { - mi_df <- read.csv(markers_info) if(checks$checks["Indels"] && !all(c("Ref", "Alt", "Indel_pos") %in% colnames(mi_df))) stop("Indels detected in MADC file. ", "The markers_info file must contain 'Ref', 'Alt', and 'Indel_pos' columns.") + if(!"Type" %in% colnames(mi_df)) { + mi_df$Type <- ifelse(nchar(as.character(mi_df$Ref)) > 1 | nchar(as.character(mi_df$Alt)) > 1, + "Indel", "SNP") + vmsg("markers_info: 'Type' column not found. Derived from Ref/Alt lengths (%s SNPs, %s Indels).", + verbose = verbose, level = 1, + sum(mi_df$Type == "SNP"), sum(mi_df$Type == "Indel")) + } + vmsg("The MADC file contains indels and markers_info file was provided with all required columns. Target indels will be exported, but no off-targets are extracted from these tags due to higher likelihood of pairwise alignment errors.",verbose = verbose, level = 1, type = ">>>") } } @@ -145,21 +175,34 @@ madc2vcf_all <- function(madc = NULL, report$AlleleSequence <- toupper(report$AlleleSequence) } - if(!is.null(botloci_file)) botloci <- read.csv(botloci_file, header = F) else stop("Please provide a botloci file") + if(!checks$checks["RefAltSeqs"] && is.null(hap_seq_file)){ + vmsg("Not all Ref sequences have a corresponding Alt or vice-verse. Provide hap_seq_file for this function to recover the missing tags or tags with missing pairs will be discarded.", verbose = verbose, level = 1) + } + + botloci <- read.csv(botloci_file, header = F) if(!is.null(hap_seq_file)) hap_seq <- read.table(hap_seq_file, header = F) else hap_seq <- NULL # Check marker names compatibility between MADC and botloci - checked_botloci <- check_botloci(botloci, report) + checked_botloci <- check_botloci(botloci, report, ChromPos = checks$checks["ChromPos"], mi_df = mi_df, verbose = verbose) botloci <- checked_botloci[[1]] report <- checked_botloci[[2]] + vmsg("Input checks done!", verbose = verbose, level = 1, type = ">>") + + vmsg("Starting conversion...", verbose = verbose, level = 0, type = ">>") + my_results_csv <- loop_though_dartag_report(report, botloci, hap_seq, n.cores=n.cores, alignment_score_thr = alignment_score_thr, + mi_df = mi_df, verbose = verbose) + vmsg("All information gathered!", verbose = verbose, level = 1, type = ">>") + + vmsg("Adding information to a VCF body...", verbose = verbose, level = 0, type = ">>") + vcf_body <- create_VCF_body(csv = my_results_csv, n.cores = n.cores, rm_multiallelic_SNP = rm_multiallelic_SNP, @@ -201,9 +244,9 @@ madc2vcf_all <- function(madc = NULL, #' @import parallel #' #' @noRd -loop_though_dartag_report <- function(report, botloci, hap_seq, n.cores=1, alignment_score_thr=40, verbose = TRUE){ +loop_though_dartag_report <- function(report, botloci, hap_seq, n.cores=1, alignment_score_thr=40, checks = NULL, mi_df = NULL, verbose = TRUE){ - if(!is.null(hap_seq)){ + if(!is.null(hap_seq) & (is.null(checks) | !isTRUE(checks$checks["RefAltSeqs"]))){ hap_seq <- get_ref_alt_hap_seq(hap_seq, botloci) } @@ -217,7 +260,7 @@ loop_though_dartag_report <- function(report, botloci, hap_seq, n.cores=1, align clust <- makeCluster(n.cores) #clusterExport(clust, c("hap_seq","add_ref_alt", "nsamples")) - add_ref_alt_results <- parLapply(clust, by_cloneID, function(x) add_ref_alt(x, hap_seq, nsamples, verbose = verbose)) + add_ref_alt_results <- parLapply(clust, by_cloneID, function(x) add_ref_alt(x, hap_seq, nsamples, verbose = FALSE)) stopCluster(clust) ref_index <- sapply(add_ref_alt_results, "[[",2) @@ -229,19 +272,25 @@ loop_though_dartag_report <- function(report, botloci, hap_seq, n.cores=1, align updated_by_cloneID <- lapply(add_ref_alt_results, "[[",1) - if(verbose){ - cat("The Ref_0001 sequence had to be added for:", sum(ref_index==1),"tags\n") - cat("The Alt_0002 sequence had to be added for:", sum(alt_index==1),"tags\n") - cat("Tags discarded due to lack of Ref_0001 sequence:", sum(ref_index==-1),"tags\n") - cat("Tags discarded due to lack of Alt_0002 sequence:", sum(alt_index==-1),"tags\n") + if(!is.null(hap_seq)){ + vmsg("The haplotype database was provided and used to recover missing Ref_0001 and Alt_0002 sequences.", verbose = verbose, level = 1) + } else { + vmsg("The haplotype database was not provided. Tags with missing Ref_0001 or Alt_0002 sequences were flagged with warnings and removed from the analysis.", verbose = verbose, level = 1) } + vmsg("The Ref_0001 sequence were added for: %s tags", verbose = verbose, level = 2, type = ">>>", sum(ref_index==1)) + vmsg("The Alt_0002 sequence were added for: %s tags", verbose = verbose, level = 2, type = ">>>", sum(alt_index==1)) + vmsg("Tags discarded due to lack of Ref_0001 sequence: %s tags", verbose = verbose, level = 2, type = ">>>", sum(ref_index==-1)) + vmsg("Tags discarded due to lack of Alt_0002 sequence: %s tags", verbose = verbose, level = 2, type = ">>>", sum(alt_index==-1)) + vmsg("Pairwise alignments of sequences to recover SNP position, reference and alternative bases...", verbose = verbose, level = 1) clust <- makeCluster(n.cores) #clusterExport(clust, c("botloci", "compare", "nucleotideSubstitutionMatrix", "pairwiseAlignment", "DNAString", "reverseComplement")) - #clusterExport(clust, c("botloci", "alignment_score_thr")) - compare_results <- parLapply(clust, updated_by_cloneID, function(x) compare(x, botloci, alignment_score_thr)) + #clusterExport(clust, c("botloci", "alignment_score_thr", "mi_df")) + compare_results <- parLapply(clust, updated_by_cloneID, function(x) compare(x, botloci, alignment_score_thr, mi_df, verbose = FALSE)) stopCluster(clust) + vmsg("Pairwise alignments concluded.", verbose = verbose, level = 1) + my_results_csv <- lapply(compare_results, "[[", 1) my_results_csv <- do.call(rbind, my_results_csv) @@ -252,11 +301,9 @@ loop_though_dartag_report <- function(report, botloci, hap_seq, n.cores=1, align rm_indels <- sapply(compare_results, "[[", 4) rm_indels <- unlist(rm_indels) - if(verbose){ - cat("Number of tags removed because of low alignment score:", length(rm_score),"tags\n") - cat("Number of tags removed because of N in the alternative sequence:", length(rm_N),"tags\n") - cat("Number of tags removed because of indels as targets (not yet supported):", length(rm_indels),"tags\n") - } + vmsg("Number of tags removed because of low alignment score: %s tags", verbose = verbose, level = 2, type = ">>>", length(rm_score)) + vmsg("Number of tags removed because of N in the alternative sequence: %s tags", verbose = verbose, level = 2, type = ">>>", length(rm_N)) + vmsg("Number of tags removed because of indels as targets (not yet supported): %s tags", verbose = verbose, level = 2, type = ">>>", length(rm_indels)) rownames(my_results_csv) <- NULL return(my_results_csv) @@ -351,10 +398,44 @@ add_ref_alt <- function(one_tag, hap_seq, nsamples, verbose = TRUE) { #' @importFrom pwalign pairwiseAlignment nucleotideSubstitutionMatrix #' #' @noRd -compare <- function(one_tag, botloci, alignment_score_thr = 40){ +compare <- function(one_tag, botloci, alignment_score_thr = 40, mi_df= NULL, verbose = FALSE){ + # for(i in 1507:length(updated_by_cloneID)){ + # one_tag <- updated_by_cloneID[[i]] + cloneID <- one_tag$CloneID[1] + isBotLoci <- cloneID %in% botloci[,1] - # If marker is present in the botloc list, get the reverse complement sequence + if(!is.null(mi_df)) { + one_mi_df <- mi_df[which(mi_df$CloneID %in% cloneID), ] + # Handle duplicate CloneIDs in markers_info + if(nrow(one_mi_df) > 1) { + key_cols <- intersect(c("CloneID", "Chr", "Pos", "Ref", "Alt", "Type"), colnames(one_mi_df)) + if(nrow(unique(one_mi_df[, key_cols])) == 1) { + warning("Duplicate CloneID '", cloneID, "' found in markers_info with identical key columns. Keeping the first entry.") + one_mi_df <- one_mi_df[1, ] + } else { + stop("Duplicate CloneID '", cloneID, "' found in markers_info with differing values in key columns (CloneID, Chr, Pos, Ref, Alt, Type). Please resolve the conflict in your markers_info file.") + } + } + isIndel <- tolower(one_mi_df$Type) == "indel" + } else { + isIndel <- FALSE + } + + if(isIndel){ + update_tag <- one_tag[grep("Ref_0001$",one_tag$AlleleID),] + update_tag[,2:5] <- c(one_mi_df$Chr, one_mi_df$Pos, one_mi_df$Ref, one_mi_df$Alt) + update_tag_temp <- one_tag[grep("Alt_0002$",one_tag$AlleleID),] + update_tag_temp[,2:5] <- c(one_mi_df$Chr, one_mi_df$Pos, one_mi_df$Ref, one_mi_df$Alt) + update_tag <- rbind(update_tag, update_tag_temp) + + return(list(update_tag = update_tag, + rm_score = NULL, + rm_N = NULL, + rm_indels = NULL)) + } + + # If marker is present in the botloci list, get the reverse complement sequence if(isBotLoci) one_tag$AlleleSequence <- sapply(one_tag$AlleleSequence, function(x) as.character(reverseComplement(DNAString(x)))) chr <- sapply(strsplit(cloneID, "_"), function(x) x[-length(x)]) @@ -368,7 +449,10 @@ compare <- function(one_tag, botloci, alignment_score_thr = 40){ sigma <- nucleotideSubstitutionMatrix(match = 1, mismatch = -0.5, baseOnly = FALSE) # baseOnly = FALSE avoid breaking when N is present (they will be filtered after)) align <- pairwiseAlignment(ref_seq, alt_seq, - substitutionMatrix = sigma,gapOpening=-1.4, gapExtension=-0.1, type = "global") + substitutionMatrix = sigma, + gapOpening=-1.4, + gapExtension=-0.1, + type = "global") # The score is a bit different from the python script despite same weights if(align@score > alignment_score_thr){ # if score for the target sequence is smaller than the threshold, the tag will be discarted @@ -529,9 +613,7 @@ create_VCF_body <- function(csv, vcf_tag_list1 <- lapply(vcf_tag_list, "[[", 1) rm_mks <- sapply(vcf_tag_list, "[[" ,2) - if(verbose){ - print(paste("SNP removed because presented more than one allele:", sum(rm_mks))) - } + vmsg("SNP removed because presented more than one allele: %s", verbose = verbose, level = 2, type = ">>>",sum(rm_mks)) for(i in seq_along(vcf_tag_list1)) { if(is.vector(vcf_tag_list1[[i]])) { @@ -555,9 +637,7 @@ create_VCF_body <- function(csv, if(length(which(duplicated(vcf_body[,3]))) > 0){ repeated <- vcf_body[which(duplicated(vcf_body[,3])), 4] - if(verbose){ - print(paste("Different primers pair capture same SNP positions in", length(repeated), "locations. The repeated were discarded.")) - } + vmsg("Different primers pair capture same SNP positions in %s locations. The repeated were discarded.", verbose = verbose, level = 2, length(repeated)) repeated_tab <- vcf_body[which(vcf_body[,4] %in% repeated),] vcf_body_new <- vcf_body[-which(vcf_body[,4] %in% repeated),] diff --git a/R/utils.R b/R/utils.R index a270570..cf3cc6a 100644 --- a/R/utils.R +++ b/R/utils.R @@ -69,6 +69,20 @@ vmsg <- function(text, verbose = FALSE, level = 1, type = ">>", ...) { message(formatted_msg) } + +#' Check Whether a URL Is Accessible +#' +#' Attempts to open a connection to the given URL and returns `TRUE` if +#' successful, `FALSE` otherwise. Errors and warnings are both treated as +#' inaccessible. +#' +#' @param u character. The URL to test. +#' +#' @return A single logical: `TRUE` if the URL can be opened, `FALSE` if not. +#' +#' @keywords internal +#' @noRd +#' @export url_exists <- function(u) { tryCatch({ con <- url(u, open = "rb") diff --git a/man/check_madc_sanity.Rd b/man/check_madc_sanity.Rd index 5328a4f..0398625 100644 --- a/man/check_madc_sanity.Rd +++ b/man/check_madc_sanity.Rd @@ -14,23 +14,30 @@ If \code{FixAlleleIDs} is \code{FALSE} (raw DArT format), the first 7 rows are treated as header filler and skipped before further checks are run.} } \value{ -A named list with three elements: +A named list with five elements: \describe{ -\item{checks}{Named logical vector with eight entries: +\item{checks}{Named logical vector with nine entries: \code{Columns}, \code{FixAlleleIDs}, \code{IUPACcodes}, \code{LowerCase}, \code{Indels}, -\code{ChromPos}, \code{allNAcol}, \code{allNArow}. -\code{TRUE} means the condition was detected (or passed for \code{Columns} and -\code{FixAlleleIDs}); \code{NA} means the check was skipped.} +\code{ChromPos}, \code{allNAcol}, \code{allNArow}, \code{RefAltSeqs}. +\code{TRUE} means the condition was detected (or passed for \code{Columns}, +\code{FixAlleleIDs}, \code{ChromPos}, and \code{RefAltSeqs}); \code{NA} means the check +was skipped.} \item{messages}{Named list of length-2 character vectors, one per check. Element \verb{[[1]]} is the message when the check is \code{TRUE}, element \verb{[[2]]} when it is \code{FALSE}. Indexed by the same names as \code{checks}.} \item{indel_clone_ids}{Character vector of \code{CloneID}s where ref/alt lengths differ. Returns \code{character(0)} if none are found, or \code{NULL} when required columns are missing.} +\item{missRef}{Character vector of \code{CloneID}s that have no \code{Ref} allele +row. Returns \code{character(0)} if all \code{CloneID}s have a \code{Ref} row, or +\code{NULL} when required columns are missing.} +\item{missAlt}{Character vector of \code{CloneID}s that have no \code{Alt} allele +row. Returns \code{character(0)} if all \code{CloneID}s have an \code{Alt} row, or +\code{NULL} when required columns are missing.} } } \description{ -Performs eight quick validations on an allele report: +Performs nine quick validations on an allele report: \enumerate{ \item \strong{Columns} - required columns are present (\code{CloneID}, \code{AlleleID}, \code{AlleleSequence}); \item \strong{FixAlleleIDs} - first column's first up-to-6 rows are not all blank or \code{"*"} @@ -41,27 +48,39 @@ Performs eight quick validations on an allele report: or a \code{"-"} character is present in \code{AlleleSequence}; \item \strong{ChromPos} - all \code{CloneID} values follow the \code{Chr_Position} format (prefix matches \code{"chr"} case-insensitively, suffix is a positive integer); -\item \strong{allNAcol} - at least one column contains only \code{NA} values; -\item \strong{allNArow} - at least one row contains only \code{NA} values. +\item \strong{allNAcol} - at least one column contains only \code{NA} or empty values; +\item \strong{allNArow} - at least one row contains only \code{NA} or empty values; +\item \strong{RefAltSeqs} - every \code{CloneID} has at least one \code{Ref} and one \code{Alt} allele row. } } \details{ \itemize{ -\item \strong{FixAlleleIDs:} When the check fails (raw DArT format), row 7 is used as -the column header and the first 7 rows are dropped before subsequent checks. +\item \strong{FixAlleleIDs:} When the first six rows of the first column are all blank +or \code{"*"} (raw DArT format), row 7 is promoted to column headers and the +first 7 rows are dropped before subsequent checks are run. The check is +\code{TRUE} when the file has already been processed by HapApp (fixed IDs with +\verb{_0001}/\verb{_0002} suffixes). \item \strong{IUPAC check:} Flags any character outside \code{A}, \code{T}, \code{C}, \code{G} and \code{"-"} (case-insensitive), which includes ambiguity codes (\code{N}, \code{R}, \code{Y}, etc.). \item \strong{Indels:} Rows are split by \code{AlleleID} containing \code{"Ref_0001"} vs -\code{"Alt_0002"}, merged by \code{CloneID}, and the lengths of \code{AlleleSequence} -are compared. A \code{"-"} anywhere in \code{AlleleSequence} is also treated as -evidence of an indel. +\code{"Alt_0002"}, merged by \code{CloneID}, and flagged as indels if either (a) the +lengths of \code{AlleleSequence} differ, (b) the sequences have the same length +but more than one character differs between them (complex indel / local +rearrangement), or (c) a \code{"-"} character is present anywhere in +\code{AlleleSequence}. \item \strong{ChromPos:} Each \code{CloneID} is split on \code{"_"} into exactly two parts; the first part must match \code{"Chr"} (case-insensitive) and the second must be a positive integer. Returns \code{FALSE} when any \code{CloneID} is \code{NA}. \item \strong{allNAcol / allNArow:} Detected via \code{apply()} over columns/rows -respectively; useful for flagging empty or corrupt entries. +respectively; a cell is treated as missing when it is \code{NA} or an empty +string (\code{""}). Useful for flagging empty or corrupt entries. +\item \strong{RefAltSeqs:} For each unique \code{CloneID}, checks whether at least one row +with a \code{Ref} (\verb{|Ref_} when \code{FixAlleleIDs = TRUE}, \verb{|Ref$} otherwise) and +one row with an \code{Alt} (\verb{|Alt_} / \verb{|Alt$}) allele exist. \code{CloneID}s that +lack a \code{Ref} row are stored in \code{missRef}; those lacking an \code{Alt} row in +\code{missAlt}. The check is \code{TRUE} when both sets are empty. \item If required columns are missing (\code{Columns = FALSE}), only \code{Columns} and \code{FixAlleleIDs} are evaluated; all other checks remain \code{NA} and -\code{indel_clone_ids} is returned as \code{NULL}. +\code{indel_clone_ids}, \code{missRef}, and \code{missAlt} are returned as \code{NULL}. } } diff --git a/man/filterVCF.Rd b/man/filterVCF.Rd index 39d7264..2a3ab62 100644 --- a/man/filterVCF.Rd +++ b/man/filterVCF.Rd @@ -6,7 +6,7 @@ \usage{ filterVCF( vcf.file, - quality.rates = F, + quality.rates = FALSE, filter.OD = NULL, filter.BIAS.min = NULL, filter.BIAS.max = NULL, diff --git a/man/madc2vcf_all.Rd b/man/madc2vcf_all.Rd index 58cfcb3..ac2de3f 100644 --- a/man/madc2vcf_all.Rd +++ b/man/madc2vcf_all.Rd @@ -5,8 +5,8 @@ \title{Converts MADC file to VCF recovering target and off-target SNPs} \usage{ madc2vcf_all( - madc = NULL, - botloci_file = NULL, + madc, + botloci_file, hap_seq_file = NULL, n.cores = 1, rm_multiallelic_SNP = FALSE, @@ -19,9 +19,9 @@ madc2vcf_all( ) } \arguments{ -\item{madc}{A string specifying the path to the MADC file.} +\item{madc}{Required. A string specifying the path or URL to the MADC file.} -\item{botloci_file}{A string specifying the path to the file containing the target IDs designed in the bottom strand.} +\item{botloci_file}{Required. A string specifying the path or URL to the file containing the target IDs designed in the bottom strand.} \item{hap_seq_file}{A string specifying the path to the haplotype database fasta file.} diff --git a/man/madc2vcf_targets.Rd b/man/madc2vcf_targets.Rd index 1da55d8..30363a6 100644 --- a/man/madc2vcf_targets.Rd +++ b/man/madc2vcf_targets.Rd @@ -21,22 +21,33 @@ madc2vcf_targets( \item{botloci_file}{character or \code{NULL} (default \code{NULL}). Path to a plain-text file listing target IDs designed on the \strong{bottom} strand (one ID per line). -Required only when \code{get_REF_ALT = TRUE} and \code{markers_info} does not supply -\code{Ref} and \code{Alt} columns.} +Used for strand-correcting probe sequences when \code{get_REF_ALT = TRUE} and +\code{markers_info} does not supply \code{Ref} and \code{Alt} columns. Also required when +\code{ChromPos} is invalid and \code{markers_info} does not provide \code{Ref}/\code{Alt}.} \item{markers_info}{character or \code{NULL}. Optional path to a CSV providing target -metadata. Minimum required columns: \code{CloneID} (or \code{BI_markerID}), \code{Chr}, \code{Pos}. -When \code{get_REF_ALT = TRUE}, also requires \code{Ref} and \code{Alt} (replaces probe-sequence -inference). \code{Type} and \code{Indel_pos} are never required by this function.} +metadata. Accepted columns: +\itemize{ +\item \code{CloneID} or \code{BI_markerID} (required as marker identifier); +\item \code{Chr}, \code{Pos} — required when \code{CloneID} does not follow the \code{Chr_Pos} format; +\item \code{Ref}, \code{Alt} — required when \code{get_REF_ALT = TRUE} and probe-sequence +inference is not possible (IUPAC codes, indels, or unfixed allele IDs). +}} -\item{get_REF_ALT}{logical (default \code{FALSE}). If \code{TRUE}, attempts to infer REF/ALT -bases from the Ref/Alt probe sequences in the MADC file (with strand correction -using \code{botloci_file}). Targets with more than one difference between Ref/Alt -sequences are removed.} +\item{get_REF_ALT}{logical (default \code{FALSE}). If \code{TRUE}, attempts to recover +REF/ALT bases. The source is chosen automatically: \code{markers_info} \code{Ref}/\code{Alt} +columns take priority; otherwise probe sequences from the MADC are compared +(with \code{botloci_file} for strand correction). Targets with more than one +difference between Ref/Alt sequences are removed. When \code{FALSE}, REF and ALT +are set to \code{"."} in the output VCF.} -\item{collapse_matches_counts}{logical (default \code{FALSE}). If \code{TRUE}, counts for targets with identical \code{CHROM_POS} are summed together. This is useful when the MADC file contains multiple rows per target (e.g., due to multiple alleles or technical replicates) and you want to aggregate them into a single entry per unique target.} +\item{collapse_matches_counts}{logical (default \code{FALSE}). If \code{TRUE}, counts for +\verb{|AltMatch} and \verb{|RefMatch} rows are summed into their corresponding \verb{|Ref} +and \verb{|Alt} rows before building the matrices. Useful when the MADC contains +multiple allele rows per target that should be aggregated.} -\item{verbose}{logical (default \code{TRUE}). If \code{TRUE}, prints detailed progress messages about each processing step.} +\item{verbose}{logical (default \code{TRUE}). If \code{TRUE}, prints detailed progress +messages about each processing step.} } \value{ (Invisibly) returns the path to \code{output.file}. The side effect is a @@ -53,14 +64,17 @@ Convert DArTag MADC target read counts to a VCF \strong{What this function does} \itemize{ -\item Runs basic sanity checks on the MADC file (column presence, fixed allele IDs, -IUPAC/ambiguous bases, lowercase bases, indels). +\item Runs basic sanity checks on the MADC file via \code{check_madc_sanity()} (column +presence, fixed allele IDs, IUPAC/ambiguous bases, lowercase bases, indels, +chromosome/position format, all-NA rows/columns, Ref/Alt sequence presence). \item Extracts reference and total read counts per sample and target. \item Derives \code{AD} (ref,alt) by subtraction (alt = total − ref). -\item If \code{get_REF_ALT = TRUE}, attempts to recover true REF/ALT bases by comparing -the Ref/Alt probe sequences; targets with >1 polymorphism are discarded. -\item Optionally accepts a \code{markers_info} CSV to supply \code{CHROM}, \code{POS}, \code{REF}, \code{ALT} -bypassing sequence-based inference. +\item If \code{get_REF_ALT = TRUE}, recovers REF/ALT bases either from \code{markers_info} +(when \code{Ref}/\code{Alt} columns are present) or by comparing the Ref/Alt probe +sequences in the MADC file (with strand correction via \code{botloci_file}). +Targets with >1 polymorphism between sequences are discarded. +\item Optionally accepts a \code{markers_info} CSV to supply \code{CHROM}, \code{POS}, \code{REF}, +\code{ALT}, bypassing sequence-based inference. } \strong{Output VCF layout} @@ -83,11 +97,30 @@ If a target ID appears in \code{botloci_file}, its probe sequences are reverse- complemented prior to base comparison so that REF/ALT are reported in the top-strand genomic orientation. -\strong{Sanity check behavior} -\itemize{ -\item If required columns or fixed IDs are missing, the function \code{stop()}s. -\item If IUPAC/lowercase/indels are detected and \code{markers_info} is \strong{not} -provided, the function \code{stop()}s with a diagnostic message explaining what to fix. +\strong{Sanity check behaviour and requirements} + +The function always stops if required columns (\code{CloneID}, \code{AlleleID}, +\code{AlleleSequence}) are missing. + +For the remaining checks the required inputs depend on the combination of +check result and \code{get_REF_ALT}:\tabular{llll}{ + Check \tab Status \tab \code{get_REF_ALT} \tab Required \cr + \strong{IUPAC codes} \tab detected \tab \code{TRUE} \tab \code{markers_info} with \code{Ref}/\code{Alt} \cr + \tab detected \tab \code{FALSE} \tab — \cr + \tab not detected \tab \code{TRUE} \tab \code{botloci_file} \strong{or} \code{markers_info} with \code{Ref}/\code{Alt} \cr + \tab not detected \tab \code{FALSE} \tab — \cr + \strong{Indels} \tab detected \tab \code{TRUE} \tab \code{markers_info} with \code{Ref}/\code{Alt} \cr + \tab detected \tab \code{FALSE} \tab — \cr + \tab not detected \tab \code{TRUE} \tab \code{botloci_file} \strong{or} \code{markers_info} with \code{Ref}/\code{Alt} \cr + \tab not detected \tab \code{FALSE} \tab — \cr + \strong{ChromPos} \tab valid \tab \code{TRUE} \tab \code{botloci_file} \strong{or} \code{markers_info} with \code{Ref}/\code{Alt} \cr + \tab valid \tab \code{FALSE} \tab — \cr + \tab invalid \tab \code{TRUE} \tab \code{markers_info} with \code{Chr}/\code{Pos}/\code{Ref}/\code{Alt} \strong{or} \code{markers_info} with \code{Chr}/\code{Pos} + \code{botloci_file} \cr + \tab invalid \tab \code{FALSE} \tab \code{markers_info} with \code{Chr}/\code{Pos} \cr + \strong{FixAlleleIDs} \tab fixed \tab \code{TRUE} \tab \code{botloci_file} \strong{or} \code{markers_info} with \code{Ref}/\code{Alt} \cr + \tab fixed \tab \code{FALSE} \tab — \cr + \tab not fixed \tab \code{TRUE} \tab \code{markers_info} with \code{Ref}/\code{Alt} \cr + \tab not fixed \tab \code{FALSE} \tab — \cr } } \section{Dependencies}{ diff --git a/tests/testthat/test-madc2vcf_all.R b/tests/testthat/test-madc2vcf_all.R index f2095c5..7904035 100644 --- a/tests/testthat/test-madc2vcf_all.R +++ b/tests/testthat/test-madc2vcf_all.R @@ -65,3 +65,637 @@ test_that("test madc offtargets",{ rm(vcf) }) + +# ======================================================================= +# Using Breeding-Insight/BIGapp-PanelHub test files +# ======================================================================= + +test_that("simu alfalfa",{ + + github_path <- "https://raw.githubusercontent.com/Breeding-Insight/BIGapp-PanelHub/refs/heads/long_seq/" + + # External alfalfa test files + alfalfa_madc <- paste0(github_path, "test_madcs/alfalfa_madc.csv") + alfalfa_madc_wrongID <- paste0(github_path, "test_madcs/alfalfa_madc_wrongID.csv") + alfalfa_madc_raw <- paste0(github_path, "test_madcs/alfalfa_madc_raw.csv") # raw DArT format (7-row header) + alfalfa_iupac <- paste0(github_path, "test_madcs/alfalfa_IUPAC.csv") + alfalfa_lowercase <- paste0(github_path, "test_madcs/alfalfa_lowercase.csv") + alfalfa_botloci <- paste0(github_path, "alfalfa/20201030-BI-Alfalfa_SNPs_DArTag-probe-design_f180bp.botloci") # botloci for alfalfa + alfalfa_markers_info <- paste0(github_path, "alfalfa/20201030-BI-Alfalfa_SNPs_DArTag-probe-design_snpID_lut.csv") # markers_info: CloneID/BI_markerID, Chr, Pos, Ref, Alt + alfalfa_markers_info_ChromPos <- paste0(github_path, "test_madcs/alfalfa_marker_info_ChromPos.csv") # markers_info: CloneID/BI_markerID, Chr, Pos + alfalfa_microhapDB <- paste0(github_path, "alfalfa/alfalfa_allele_db_v001.fa") + + # External potato test files + potato_indel_madc <- paste0(github_path, "test_madcs/potato_indel_madc.csv") + potato_indel_iupac <- paste0(github_path, "test_madcs/potato_indel_IUPAC.csv") + potato_indel_lowercase <- paste0(github_path, "test_madcs/potato_indel_lowercase.csv") + potato_more_indels_chrompos_false <- paste0(github_path, "test_madcs/potato_more_indels_madc_ChromPosFALSE.csv") + potato_botloci <- paste0(github_path, "potato/potato_dartag_v2_3915markers_rm7dupTags_6traitMarkers_f150bp_ref_alt.botloci") + potato_markers_info <- paste0(github_path, "potato/potato_dartag_v2_3915markers_rm7dupTags_6traitMarkers_rm1dup_snpID_lut.csv") # CloneID/BI_markerID, Chr, Pos, Ref, Alt + potato_markers_info_ChromPos <- paste0(github_path, "test_madcs/potato_marker_info_chrompos.csv") # markers_info: CloneID/BI_markerID, Chr, Pos + potato_microhapDB <- paste0(github_path, "potato/potato_allele_db_v001.fa") + + skip_if_offline("raw.githubusercontent.com") + + test_that("ALFALFA — clean fixed allele ID MADC", { + out <- tempfile(fileext = ".vcf") + expect_no_error( + madc2vcf_all(madc = alfalfa_madc, + botloci_file = alfalfa_botloci, + hap_seq_file = alfalfa_microhapDB, + n.cores = 2, + rm_multiallelic_SNP = TRUE, + multiallelic_SNP_sample_thr = 0, + multiallelic_SNP_dp_thr = 0, + alignment_score_thr = 40, + out_vcf = out, + verbose = TRUE) + ) + vcf <- read.vcfR(out, verbose = FALSE) + expect_s4_class(vcf, "vcfR") + expect_true(all(!is.na(vcf@fix[, "REF"]))) + expect_true(all(!is.na(vcf@fix[, "ALT"]))) + DP <- extract.gt(vcf, "DP", as.numeric = TRUE) + expect_equal(sum(DP[1,]), 4534) + expect_equal(sum(DP[,5]), 233482) + unlink(out) + + expect_no_error( + madc2vcf_all(madc = alfalfa_madc, + botloci_file = alfalfa_botloci, + hap_seq_file = NULL, + n.cores = 1, + out_vcf = out, + verbose = FALSE) + ) + vcf <- read.vcfR(out, verbose = FALSE) + expect_s4_class(vcf, "vcfR") + expect_true(all(is.na(vcf@fix[, "REF"]))) + expect_true(all(is.na(vcf@fix[, "ALT"]))) + DP <- extract.gt(vcf, "DP", as.numeric = TRUE) + expect_equal(sum(DP[1,]), 4534) + expect_equal(sum(DP[,5]), 56547) + + # Test error when botloci_file is NULL + expect_error( + madc2vcf_all(madc = alfalfa_madc, + botloci_file = NULL, + hap_seq_file = alfalfa_microhapDB, + n.cores = 1, + out_vcf = out, + verbose = FALSE) + ) + + # Test that it works when hap_seq_file is provided (REF/ALT recovered from probe sequences) + madc2vcf_all(madc = alfalfa_madc, + botloci_file = alfalfa_botloci, + hap_seq_file = alfalfa_microhapDB, + n.cores = 1, + out_vcf = out, + verbose = FALSE) + + vcf <- read.vcfR(out, verbose = FALSE) + lut <- read.csv(alfalfa_markers_info) + vcf_infos <- vcf@fix[,c(1:5)] + lut_infos <- lut[match(vcf@fix[,3],lut$BI_markerID),c(2:6)] + check <- cbind(vcf_infos,lut_infos) + expect_equal(check$REF, check$Ref) + expect_equal(check$ALT, check$Alt) + expect_equal(as.numeric(check$POS), check$Pos) + dp <- extract.gt(vcf, "DP", as.numeric = TRUE) + expect_equal(sum(dp[,10]), 43691) + + # Test that it also works when markers_info is provided together with botloci + madc2vcf_all(madc = alfalfa_madc, + botloci_file = alfalfa_botloci, + hap_seq_file = alfalfa_microhapDB, + n.cores = 1, + markers_info = alfalfa_markers_info, + out_vcf = out, + verbose = FALSE) + + vcf <- read.vcfR(out, verbose = FALSE) + lut <- read.csv(alfalfa_markers_info) + vcf_infos <- vcf@fix[,c(1:5)] + lut_infos <- lut[match(vcf@fix[,3],lut$BI_markerID),c(2:6)] + check <- cbind(vcf_infos,lut_infos) + expect_equal(check$REF, check$Ref) + expect_equal(check$ALT, check$Alt) + expect_equal(as.numeric(check$POS), check$Pos) + dp <- extract.gt(vcf, "DP", as.numeric = TRUE) + expect_equal(sum(dp[,10]), 43691) + + }) + + test_that("ALFALFA — clean fixed allele ID MADC wrong CloneID", { + out <- tempfile(fileext = ".vcf") + # Test error when botloci provided but no matching CloneID between botloci and MADC + expect_error( + madc2vcf_all(madc = alfalfa_madc_wrongID, + botloci_file = alfalfa_botloci, + hap_seq_file = alfalfa_microhapDB, + n.cores = 1, + out_vcf = out, + verbose = FALSE), + regexp = "Check marker IDs in both MADC and botloci files. They should be the same." + ) + + # Test error when markers_info does not match MADC CloneIDs + expect_error( + madc2vcf_all(madc = alfalfa_madc_wrongID, + botloci_file = alfalfa_botloci, + hap_seq_file = alfalfa_microhapDB, + n.cores = 1, + markers_info = alfalfa_markers_info, + out_vcf = out, + verbose = FALSE) + ) + + # Test error when markers_info_ChromPos is provided but IDs still don't match botloci + expect_error( + madc2vcf_all(madc = alfalfa_madc_wrongID, + botloci_file = alfalfa_botloci, + hap_seq_file = alfalfa_microhapDB, + n.cores = 1, + markers_info = alfalfa_markers_info_ChromPos, + out_vcf = out, + verbose = FALSE) + ) + }) + + test_that("alfalfa lower case fixed MADC", { + out <- tempfile(fileext = ".vcf") + madc2vcf_all(madc = alfalfa_lowercase, + botloci_file = alfalfa_botloci, + hap_seq_file = alfalfa_microhapDB, + n.cores = 1, + out_vcf = out, + verbose = FALSE) + + vcf <- read.vcfR(out, verbose = FALSE) + lut <- read.csv(alfalfa_markers_info) + vcf_infos <- vcf@fix[,c(1:5)] + lut_infos <- lut[match(vcf@fix[,3],lut$BI_markerID),c(2:6)] + check <- cbind(vcf_infos,lut_infos) + expect_equal(check$REF, check$Ref) + expect_equal(check$ALT, check$Alt) + expect_equal(as.numeric(check$POS), check$Pos) + dp <- extract.gt(vcf, "DP", as.numeric = TRUE) + expect_equal(sum(dp[,10]), 43691) + + madc2vcf_all(madc = alfalfa_lowercase, + botloci_file = alfalfa_botloci, + hap_seq_file = alfalfa_microhapDB, + n.cores = 1, + markers_info = alfalfa_markers_info, + out_vcf = out, + verbose = FALSE) + + vcf <- read.vcfR(out, verbose = FALSE) + lut <- read.csv(alfalfa_markers_info) + vcf_infos <- vcf@fix[,c(1:5)] + lut_infos <- lut[match(vcf@fix[,3],lut$BI_markerID),c(2:6)] + check <- cbind(vcf_infos,lut_infos) + expect_equal(check$REF, check$Ref) + expect_equal(check$ALT, check$Alt) + expect_equal(as.numeric(check$POS), check$Pos) + dp <- extract.gt(vcf, "DP", as.numeric = TRUE) + expect_equal(sum(dp[,10]), 43691) + + madc2vcf_all(madc = alfalfa_lowercase, + botloci_file = alfalfa_botloci, + hap_seq_file = NULL, + n.cores = 1, + markers_info = alfalfa_markers_info, + out_vcf = out, + verbose = FALSE) + + vcf <- read.vcfR(out, verbose = FALSE) + lut <- read.csv(alfalfa_markers_info) + vcf_infos <- vcf@fix[,c(1:5)] + lut_infos <- lut[match(vcf@fix[,3],lut$BI_markerID),c(2:6)] + check <- cbind(vcf_infos,lut_infos) + expect_equal(as.numeric(check$POS), check$Pos) + dp <- extract.gt(vcf, "DP", as.numeric = TRUE) + expect_equal(sum(dp[,10]), 43691) + }) + + test_that("alfalfa IUPAC code", { + out <- tempfile(fileext = ".vcf") + # IUPAC codes cause a stop in madc2vcf_all + expect_error( + madc2vcf_all(madc = alfalfa_iupac, + botloci_file = alfalfa_botloci, + hap_seq_file = alfalfa_microhapDB, + n.cores = 1, + out_vcf = out, + verbose = FALSE) + ) + + madc2vcf_all(madc = alfalfa_iupac, + botloci_file = alfalfa_botloci, + hap_seq_file = alfalfa_microhapDB, + n.cores = 1, + markers_info = alfalfa_markers_info, + out_vcf = out, + verbose = FALSE) + + vcf <- read.vcfR(out, verbose = FALSE) + lut <- read.csv(alfalfa_markers_info) + vcf_infos <- vcf@fix[,c(1:5)] + lut_infos <- lut[match(vcf@fix[,3],lut$BI_markerID),c(2:6)] + check <- cbind(vcf_infos,lut_infos) + expect_equal(as.numeric(check$POS), check$Pos) + dp <- extract.gt(vcf, "DP", as.numeric = TRUE) + expect_equal(sum(dp[,10]), 43691) + expect_equal(check$REF, check$Ref) + expect_equal(check$ALT, check$Alt) + + madc2vcf_all(madc = alfalfa_iupac, + botloci_file = alfalfa_botloci, + hap_seq_file = NULL, + n.cores = 1, + markers_info = alfalfa_markers_info, + out_vcf = out, + verbose = FALSE) + + vcf <- read.vcfR(out, verbose = FALSE) + lut <- read.csv(alfalfa_markers_info) + vcf_infos <- vcf@fix[,c(1:5)] + lut_infos <- lut[match(vcf@fix[,3],lut$BI_markerID),c(2:6)] + check <- cbind(vcf_infos,lut_infos) + expect_equal(as.numeric(check$POS), check$Pos) + dp <- extract.gt(vcf, "DP", as.numeric = TRUE) + expect_equal(sum(dp[1,]), 4534) + expect_equal(sum(dp[,5]), 56547) + expect_equal(check$REF, check$Ref) + expect_equal(check$ALT, check$Alt) + + madc2vcf_all(madc = alfalfa_iupac, + botloci_file = alfalfa_botloci, + hap_seq_file = NULL, + n.cores = 1, + out_vcf = out, + verbose = FALSE) + + vcf <- read.vcfR(out, verbose = FALSE) + lut <- read.csv(alfalfa_markers_info) + vcf_infos <- vcf@fix[,c(1:5)] + lut_infos <- lut[match(vcf@fix[,3],lut$BI_markerID),c(2:6)] + check <- cbind(vcf_infos,lut_infos) + expect_equal(as.numeric(check$POS), check$Pos) + dp <- extract.gt(vcf, "DP", as.numeric = TRUE) + expect_equal(sum(dp[,10]), 43691) + + }) + + test_that("potato indel madc chrompos=FALSE", { + out <- tempfile(fileext = ".vcf") + # Indels detected, no markers_info with Ref/Alt/Indel_pos -> error + expect_error( + madc2vcf_all(madc = potato_indel_madc, + botloci_file = potato_botloci, + hap_seq_file = potato_microhapDB, + n.cores = 1, + out_vcf = out, + verbose = FALSE) + ) + + madc2vcf_all(madc = potato_indel_madc, + botloci_file = potato_botloci, + hap_seq_file = potato_microhapDB, + n.cores = 1, + markers_info = potato_markers_info, + out_vcf = out, + verbose = FALSE) + + vcf <- read.vcfR(out, verbose = FALSE) + lut <- read.csv(potato_markers_info) + vcf_infos <- vcf@fix[,c(1:5)] + lut_infos <- lut[match(vcf@fix[,3],lut$BI_markerID),c(2:6)] + check <- cbind(vcf_infos,lut_infos) + expect_equal(as.numeric(check$POS), check$Pos) + dp <- extract.gt(vcf, "DP", as.numeric = TRUE) + expect_equal(sum(dp[,10]), 41656) + expect_equal(check$REF, check$Ref) + expect_equal(check$ALT, check$Alt) + + madc2vcf_all(madc = potato_indel_madc, + botloci_file = potato_botloci, + hap_seq_file = NULL, + n.cores = 1, + markers_info = potato_markers_info, + out_vcf = out, + verbose = FALSE) + + vcf <- read.vcfR(out, verbose = FALSE) + lut <- read.csv(potato_markers_info) + vcf_infos <- vcf@fix[,c(1:5)] + lut_infos <- lut[match(vcf@fix[,3],lut$BI_markerID),c(2:6)] + check <- cbind(vcf_infos,lut_infos) + expect_equal(as.numeric(check$POS), check$Pos) + dp <- extract.gt(vcf, "DP", as.numeric = TRUE) + expect_equal(sum(dp[1,]), 5163) + expect_equal(sum(dp[,5]), 58927) + expect_equal(check$REF, check$Ref) + expect_equal(check$ALT, check$Alt) + + # ChromPos=FALSE and no markers_info -> error + expect_error( + madc2vcf_all(madc = potato_indel_madc, + botloci_file = potato_botloci, + hap_seq_file = NULL, + n.cores = 1, + out_vcf = out, + verbose = FALSE) + ) + + madc2vcf_all(madc = potato_indel_madc, + botloci_file = potato_botloci, + hap_seq_file = NULL, + n.cores = 1, + markers_info = potato_markers_info, + out_vcf = out, + verbose = FALSE) + + vcf <- read.vcfR(out, verbose = FALSE) + lut <- read.csv(potato_markers_info) + vcf_infos <- vcf@fix[,c(1:5)] + lut_infos <- lut[match(vcf@fix[,3],lut$BI_markerID),c(2:6)] + check <- cbind(vcf_infos,lut_infos) + expect_equal(as.numeric(check$POS), check$Pos) + dp <- extract.gt(vcf, "DP", as.numeric = TRUE) + expect_equal(sum(dp[,10]), 41656) + }) + + test_that("potato indel chromposFALSE", { + out <- tempfile(fileext = ".vcf") + # Indels detected, no markers_info with Ref/Alt/Indel_pos -> error + expect_error( + madc2vcf_all(madc = potato_more_indels_chrompos_false, + botloci_file = potato_botloci, + hap_seq_file = potato_microhapDB, + n.cores = 1, + out_vcf = out, + verbose = FALSE) + ) + + madc2vcf_all(madc = potato_more_indels_chrompos_false, + botloci_file = potato_botloci, + hap_seq_file = potato_microhapDB, + n.cores = 1, + markers_info = potato_markers_info, + out_vcf = out, + verbose = FALSE) + + vcf <- read.vcfR(out, verbose = FALSE) + lut <- read.csv(potato_markers_info) + vcf_infos <- vcf@fix[,c(1:5)] + lut_infos <- lut[match(vcf@fix[,3],lut$BI_markerID),c(2:6)] + check <- cbind(vcf_infos,lut_infos) + expect_equal(as.numeric(check$POS), check$Pos) + dp <- extract.gt(vcf, "DP", as.numeric = TRUE) + expect_equal(sum(dp[,10]), 41755) + expect_equal(check$REF, check$Ref) + expect_equal(check$ALT, check$Alt) + + madc2vcf_all(madc = potato_more_indels_chrompos_false, + botloci_file = potato_botloci, + hap_seq_file = NULL, + n.cores = 1, + markers_info = potato_markers_info, + out_vcf = out, + verbose = FALSE) + + vcf <- read.vcfR(out, verbose = FALSE) + lut <- read.csv(potato_markers_info) + vcf_infos <- vcf@fix[,c(1:5)] + lut_infos <- lut[match(vcf@fix[,3],lut$BI_markerID),c(2:6)] + check <- cbind(vcf_infos,lut_infos) + expect_equal(as.numeric(check$POS), check$Pos) + dp <- extract.gt(vcf, "DP", as.numeric = TRUE) + expect_equal(sum(dp[1,]), 6301) + expect_equal(sum(dp[,5]), 53613) + expect_equal(check$REF, check$Ref) + expect_equal(check$ALT, check$Alt) + + # ChromPos=FALSE and no markers_info -> error + expect_error( + madc2vcf_all(madc = potato_more_indels_chrompos_false, + botloci_file = potato_botloci, + hap_seq_file = NULL, + n.cores = 1, + out_vcf = out, + verbose = FALSE) + ) + + madc2vcf_all(madc = potato_more_indels_chrompos_false, + botloci_file = potato_botloci, + hap_seq_file = NULL, + n.cores = 1, + markers_info = potato_markers_info, + out_vcf = out, + verbose = FALSE) + + vcf <- read.vcfR(out, verbose = FALSE) + lut <- read.csv(potato_markers_info) + vcf_infos <- vcf@fix[,c(1:5)] + lut_infos <- lut[match(vcf@fix[,3],lut$BI_markerID),c(2:6)] + check <- cbind(vcf_infos,lut_infos) + expect_equal(as.numeric(check$POS), check$Pos) + dp <- extract.gt(vcf, "DP", as.numeric = TRUE) + expect_equal(sum(dp[,10]), 41755) + }) + + test_that("potato lowercase", { + out <- tempfile(fileext = ".vcf") + madc2vcf_all(madc = potato_indel_lowercase, + botloci_file = potato_botloci, + hap_seq_file = potato_microhapDB, + n.cores = 1, + markers_info = potato_markers_info, + out_vcf = out, + verbose = FALSE) + + vcf <- read.vcfR(out, verbose = FALSE) + lut <- read.csv(potato_markers_info) + vcf_infos <- vcf@fix[,c(1:5)] + lut_infos <- lut[match(vcf@fix[,3],lut$BI_markerID),c(2:6)] + check <- cbind(vcf_infos,lut_infos) + expect_equal(as.numeric(check$POS), check$Pos) + dp <- extract.gt(vcf, "DP", as.numeric = TRUE) + expect_equal(sum(dp[,10]), 41755) + expect_equal(check$REF, check$Ref) + expect_equal(check$ALT, check$Alt) + + # markers_info without Ref/Alt/Indel_pos while indels present -> error + expect_error( + madc2vcf_all(madc = potato_indel_lowercase, + botloci_file = potato_botloci, + hap_seq_file = potato_microhapDB, + n.cores = 1, + markers_info = potato_markers_info_ChromPos, + out_vcf = out, + verbose = FALSE) + ) + + madc2vcf_all(madc = potato_indel_lowercase, + botloci_file = potato_botloci, + hap_seq_file = potato_microhapDB, + n.cores = 1, + markers_info = potato_markers_info, + out_vcf = out, + verbose = FALSE) + + vcf <- read.vcfR(out, verbose = FALSE) + lut <- read.csv(potato_markers_info) + vcf_infos <- vcf@fix[,c(1:5)] + lut_infos <- lut[match(vcf@fix[,3],lut$BI_markerID),c(2:6)] + check <- cbind(vcf_infos,lut_infos) + expect_equal(as.numeric(check$POS), check$Pos) + dp <- extract.gt(vcf, "DP", as.numeric = TRUE) + expect_equal(sum(dp[,10]), 41755) + expect_equal(check$REF, check$Ref) + expect_equal(check$ALT, check$Alt) + + madc2vcf_all(madc = potato_indel_lowercase, + botloci_file = potato_botloci, + hap_seq_file = NULL, + n.cores = 1, + markers_info = potato_markers_info, + out_vcf = out, + verbose = FALSE) + + vcf <- read.vcfR(out, verbose = FALSE) + lut <- read.csv(potato_markers_info) + vcf_infos <- vcf@fix[,c(1:5)] + lut_infos <- lut[match(vcf@fix[,3],lut$BI_markerID),c(2:6)] + check <- cbind(vcf_infos,lut_infos) + expect_equal(as.numeric(check$POS), check$Pos) + dp <- extract.gt(vcf, "DP", as.numeric = TRUE) + expect_equal(sum(dp[1,]), 6301) + expect_equal(sum(dp[,5]), 53613) + expect_equal(check$REF, check$Ref) + expect_equal(check$ALT, check$Alt) + + madc2vcf_all(madc = potato_indel_lowercase, + botloci_file = potato_botloci, + hap_seq_file = NULL, + n.cores = 1, + markers_info = potato_markers_info, + out_vcf = out, + verbose = FALSE) + + vcf <- read.vcfR(out, verbose = FALSE) + lut <- read.csv(potato_markers_info) + vcf_infos <- vcf@fix[,c(1:5)] + lut_infos <- lut[match(vcf@fix[,3],lut$BI_markerID),c(2:6)] + check <- cbind(vcf_infos,lut_infos) + expect_equal(as.numeric(check$POS), check$Pos) + dp <- extract.gt(vcf, "DP", as.numeric = TRUE) + expect_equal(sum(dp[,10]), 41755) + }) + + + test_that("potato IUPAC", { + out <- tempfile(fileext = ".vcf") + madc2vcf_all(madc = potato_indel_iupac, + botloci_file = potato_botloci, + hap_seq_file = potato_microhapDB, + n.cores = 1, + markers_info = potato_markers_info, + out_vcf = out, + verbose = FALSE) + + vcf <- read.vcfR(out, verbose = FALSE) + lut <- read.csv(potato_markers_info) + vcf_infos <- vcf@fix[,c(1:5)] + lut_infos <- lut[match(vcf@fix[,3],lut$BI_markerID),c(2:6)] + check <- cbind(vcf_infos,lut_infos) + expect_equal(as.numeric(check$POS), check$Pos) + dp <- extract.gt(vcf, "DP", as.numeric = TRUE) + expect_equal(sum(dp[,10]), 41755) + expect_equal(check$REF, check$Ref) + expect_equal(check$ALT, check$Alt) + + madc2vcf_all(madc = potato_indel_iupac, + botloci_file = potato_botloci, + hap_seq_file = NULL, + n.cores = 1, + markers_info = potato_markers_info, + out_vcf = out, + verbose = FALSE) + + vcf <- read.vcfR(out, verbose = FALSE) + lut <- read.csv(potato_markers_info) + vcf_infos <- vcf@fix[,c(1:5)] + lut_infos <- lut[match(vcf@fix[,3],lut$BI_markerID),c(2:6)] + check <- cbind(vcf_infos,lut_infos) + expect_equal(as.numeric(check$POS), check$Pos) + dp <- extract.gt(vcf, "DP", as.numeric = TRUE) + expect_equal(sum(dp[1,]), 6301) + expect_equal(sum(dp[,5]), 53613) + expect_equal(check$REF, check$Ref) + expect_equal(check$ALT, check$Alt) + + madc2vcf_all(madc = potato_indel_iupac, + botloci_file = potato_botloci, + hap_seq_file = NULL, + n.cores = 1, + markers_info = potato_markers_info, + out_vcf = out, + verbose = FALSE) + + vcf <- read.vcfR(out, verbose = FALSE) + lut <- read.csv(potato_markers_info) + vcf_infos <- vcf@fix[,c(1:5)] + lut_infos <- lut[match(vcf@fix[,3],lut$BI_markerID),c(2:6)] + check <- cbind(vcf_infos,lut_infos) + expect_equal(as.numeric(check$POS), check$Pos) + dp <- extract.gt(vcf, "DP", as.numeric = TRUE) + expect_equal(sum(dp[,10]), 41755) + }) + + test_that("alfalfa raw MADC format (7-row header)", { + out <- tempfile(fileext = ".vcf") + # Raw format fails FixAlleleIDs check -> madc2vcf_all stops + expect_error( + madc2vcf_all(madc = alfalfa_madc_raw, + botloci_file = alfalfa_botloci, + hap_seq_file = alfalfa_microhapDB, + n.cores = 1, + out_vcf = out, + verbose = FALSE) + ) + + expect_error( + madc2vcf_all(madc = alfalfa_madc_raw, + botloci_file = alfalfa_botloci, + hap_seq_file = NULL, + n.cores = 1, + out_vcf = out, + verbose = FALSE) + ) + + expect_error( + madc2vcf_all(madc = alfalfa_madc_raw, + botloci_file = alfalfa_botloci, + hap_seq_file = NULL, + n.cores = 1, + markers_info = alfalfa_markers_info, + out_vcf = out, + verbose = FALSE) + ) + + expect_error( + madc2vcf_all(madc = alfalfa_madc_raw, + botloci_file = alfalfa_botloci, + hap_seq_file = alfalfa_microhapDB, + n.cores = 1, + markers_info = alfalfa_markers_info, + out_vcf = out, + verbose = FALSE) + ) + }) +}) + From bf5ff4c48889f0ff1153d0b1c0ea0f8f71f6e502 Mon Sep 17 00:00:00 2001 From: Cristianetaniguti Date: Tue, 31 Mar 2026 09:56:21 -0400 Subject: [PATCH 29/80] madc2vcf_all support indel --- DESCRIPTION | 2 +- NEWS.md | 41 +++ R/check_madc_sanity.R | 44 +-- R/madc2vcf_all.R | 121 +++++--- R/madc2vcf_targets.R | 1 + tests/testthat/test-madc2vcf_all.R | 412 ++++++++----------------- tests/testthat/test-madc2vcf_targets.R | 35 ++- 7 files changed, 292 insertions(+), 364 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 9a18709..bf782ad 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: BIGr Title: Breeding Insight Genomics Functions for Polyploid and Diploid Species -Version: 0.6.3 +Version: 0.6.5 Authors@R: c(person(given='Alexander M.', family='Sandercock', email='sandercock.alex@gmail.com', diff --git a/NEWS.md b/NEWS.md index fce76f4..8037725 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,44 @@ +# BIGr 0.6.5 + +# Updates on madc2vcf functions +Details: + +- both functions targets and all (targets + off-targets) markers now have `check_madc_sanity` function implemented. It tests: + - [Columns] If MADC has the expected columns + - [allNArow | allNAcol] Presence of columns and rows with all NA (happens often when people open the MADC in excel before loading in R) + - [IUPACcodes] Presence of IUPAC codes on AlleleSequence + - [LowerCase] Presence of lower case bases on AlleleSequence + - [Indels] Presence of Indels + - [ChromPos] If CloneID follows the format Chr_Pos + - [RefAltSeqs] If all Ref Allele has corresponding Alt and vice-versa + - [OtherAlleles] If "Other" exists in the MADC AlleleID + +- Better messages if `verbose = TRUE` in `madc2vcf_all` +- `madc2vcf_all` support for Indels - markers_info with Indels position is required; only the target indel is extracted, off-targets are ignored for the tag +- `madc2vcf_targets` doesn’t run if: + - MADC Column names are not correct + - Ignore Other alleles - but inform the user if they exist or not and direct them to `madc2vcf_all` in case they want to extract them as well +- See the table for madc2vcf_targets requirements accordingly to MADC content: + +  | check status | get_REF_ALT | Requires +-- | -- | -- | -- +IUPAC | TRUE | TRUE | markers_info REF/ALT +  | TRUE | FALSE | - +  | FALSE | TRUE | botloci or markers_info REF/ALT +  | FALSE | FALSE | - +Indels | TRUE | TRUE | markers_info REF/ALT +  | TRUE | FALSE | - +  | FALSE | TRUE | botloci or markers_info REF/ALT +  | FALSE | FALSE | - +ChromPos | TRUE | TRUE | botloci or markers_info REF/ALT +  | TRUE | FALSE | - +  | FALSE | TRUE | markers_info CHR/POS/REF/ALT or markers_info CHR/POS/ + botloci +  | FALSE | FALSE | markers_info CHR/POS +FixAlleleIDs | TRUE | TRUE | botloci or markers_info REF/ALT +  | TRUE | FALSE | - +  | FALSE | TRUE | markers_info REF/ALT +  | FALSE | FALSE | - + # BIGr 0.6.4 - Add function `vmsg` to organize messages printed on the console diff --git a/R/check_madc_sanity.R b/R/check_madc_sanity.R index 87df09b..c36a92e 100644 --- a/R/check_madc_sanity.R +++ b/R/check_madc_sanity.R @@ -132,7 +132,7 @@ check_madc_sanity <- function(report) { n_diffs <- mapply(function(r, a) { r_chars <- strsplit(r, "")[[1]] a_chars <- strsplit(a, "")[[1]] - sum(r_chars != a_chars) + sum(toupper(r_chars) != toupper(a_chars)) }, merged$AlleleSequence_ref[same_len], merged$AlleleSequence_alt[same_len]) indel_mask[same_len] <- n_diffs > 1 } @@ -183,8 +183,8 @@ check_madc_sanity <- function(report) { messages[["Columns"]] <- c("Required columns are present", "One or more required columns missing. Verify if your file has columns: CloneID, AlleleID, AlleleSequence") messages[["FixAlleleIDs"]] <- c("Fixed Allele IDs look good", - "MADC not processed by HapApp.") - messages[["IUPACcodes"]] <- c("IUPAC (non-ATCG) codes found in AlleleSequence. This codes are not currently supported", + "MADC not processed by HapApp") + messages[["IUPACcodes"]] <- c("IUPAC (non-ATCG) codes found in AlleleSequence. This codes are not currently supported by BIGr/BIGapp. Run HapApp to replace them", "No IUPAC (non-ATCG) codes found in AlleleSequence") messages[["LowerCase"]] <- c("Lowercase bases found in AlleleSequence", "No lowercase bases found in AlleleSequence") @@ -192,16 +192,16 @@ check_madc_sanity <- function(report) { "No indels found (ref/alt lengths match and at most 1 mismatch) for all CloneIDs") messages[["ChromPos"]] <- c("Chromosome and Position format in CloneID look good", "CloneID does not have the expected Chromosome_Position format. Please check your CloneIDs or provide a file with this information") - messages[["allNArow"]] <- c("One or more rows contain all NA values.", + messages[["allNArow"]] <- c("One or more rows contain all NA values", "No rows with all NA values") - messages[["allNAcol"]] <- c("One or more columns contain all NA values.", + messages[["allNAcol"]] <- c("One or more columns contain all NA values", "No columns with all NA values") messages[["RefAltSeqs"]] <- c("All CloneIDs have both Ref and Alt alleles", - paste0("Some CloneIDs are missing Ref and/or Alt alleles. ", + paste0("Some CloneIDs are missing Ref and/or Alt alleles ", "Missing Ref: ", paste(missRef, collapse = " "), ". ", "Missing Alt: ", paste(missAlt, collapse = " "), ".")) - messages[["OtherAlleles"]] <- c("Alleles other than Ref and Alt were found in AlleleID.", - "No alleles other than Ref and Alt found in AlleleID.") + messages[["OtherAlleles"]] <- c("Alleles other than Ref and Alt were found in AlleleID", + "No alleles other than Ref and Alt found in AlleleID") list(checks = checks, messages = messages, indel_clone_ids = indels, missRef = missRef, missAlt = missAlt) @@ -248,12 +248,17 @@ check_botloci <- function(botloci, report, ChromPos=TRUE, mi_df = NULL, verbose= if(length(pad_madc) > 1 | length(pad_botloci) > 1) stop("Check marker IDs in both MADC and botloci files. They should be the same.") if(pad_madc != pad_botloci) { - vmsg("Padding between MADC and botloci files do not match. Markers ID modified to match longest padding.", verbose = verbose, level = 1, type = ">>") + vmsg("Padding between MADC and botloci files do not match. Markers ID modified to match longest padding.", verbose = verbose, level = 2, type = ">>") if (pad_madc < pad_botloci) { report$CloneID <- paste0(sub("_(.*)", "", report$CloneID), "_", sprintf(paste0("%0", pad_botloci, "d"), as.integer(sub(".*_", "", report$CloneID))) ) report$AlleleID <- paste0(report$CloneID, "|", sapply(strsplit(report$AlleleID, "[|]"), "[[",2)) + if(!is.null(mi_df)) { + mi_df$CloneID <- paste0(sub("_(.*)", "", mi_df$CloneID), "_", + sprintf(paste0("%0", pad_botloci, "d"), as.integer(sub(".*_", "", mi_df$CloneID))) + ) + } } else { botloci$V1 <- paste0(sub("_(.*)", "", botloci$V1), "_", sprintf(paste0("%0", pad_madc, "d"), as.integer(sub(".*_", "", botloci$V1))) @@ -261,30 +266,31 @@ check_botloci <- function(botloci, report, ChromPos=TRUE, mi_df = NULL, verbose= if(!any(botloci$V1 %in% report$CloneID)) stop("After matching padding, botloci markers still not found in MADC file. Check marker IDs.\n") } } else if (!(is.null(mi_df$Chr) | is.null(mi_df$Pos))){ - vmsg("It is not a padding mismatch issue.", verbose = verbose, level = 1, type = ">>") - vmsg("Checking if jointing provided Chromosome and Position information in marker_file solve the issue", verbose = verbose, level = 1, type = ">>") + vmsg("It is not a padding mismatch issue.", verbose = verbose, level = 2, type = ">>") + vmsg("Checking if jointing provided Chromosome and Position information in marker_file solve the issue", verbose = verbose, level = 2, type = ">>") if(!any(mi_df$CloneID %in% report$CloneID) & !any(mi_df$BI_markerID %in% report$CloneID)) { stop("None of the MADC CloneID could be found in the markers_info CloneID or BI_markerID. Please make sure they match.") } else { use_col <- if(any(mi_df$CloneID %in% report$CloneID)) "CloneID" else "BI_markerID" - vmsg(paste("Using", use_col, "column in marker_info to match MADC CloneID"), verbose = verbose, level = 1, type = ">>") + vmsg(paste("Using", use_col, "column in marker_info to match MADC CloneID"), verbose = verbose, level = 2, type = ">>") } mk_info_CloneID <- paste0(mi_df$Chr, "_", sprintf(paste0("%0",pad_botloci, "d"), as.integer(mi_df$Pos))) if(!any(botloci$V1 %in% mk_info_CloneID)){ - vmsg("It is not a padding mismatch issue.", verbose = verbose, level = 1, type = ">>") - vmsg("Chromosome and Position information in marker_file don't solve the issue.", verbose = verbose, level = 1, type = ">>") + vmsg("It is not a padding mismatch issue.", verbose = verbose, level = 2, type = ">>") + vmsg("Chromosome and Position information in marker_file don't solve the issue.", verbose = verbose, level = 2, type = ">>") stop("Check marker IDs in both MADC and botloci files. They should be the same.") } else { - vmsg("Chromosome and Position information in marker_file solve the issue.", verbose = verbose, level = 1, type = ">>") - vmsg("Using this information to modify MADC CloneIDs to match botloci markers.", verbose = verbose, level = 1, type = ">>") + vmsg("Chromosome and Position information in marker_file solve the issue.", verbose = verbose, level = 2, type = ">>") + vmsg("Using this information to modify MADC CloneIDs to match botloci markers.", verbose = verbose, level = 2, type = ">>") report$CloneID <- mk_info_CloneID[match(report$CloneID, mi_df[[use_col]])] + mi_df$CloneID <- mk_info_CloneID } } else { - vmsg("It is not a padding mismatch issue.", verbose = verbose, level = 1, type = ">>") - vmsg("Chromosome and Position information in marker_file not provided.", verbose = verbose, level = 1, type = ">>") + vmsg("It is not a padding mismatch issue.", verbose = verbose, level = 2, type = ">>") + vmsg("Chromosome and Position information in marker_file not provided.", verbose = verbose, level = 2, type = ">>") stop("Check marker IDs in both MADC and botloci files. They should be the same.") } } - return(list(botloci, report)) + return(list(botloci, report, mi_df)) } diff --git a/R/madc2vcf_all.R b/R/madc2vcf_all.R index 08300f6..3ec95d4 100644 --- a/R/madc2vcf_all.R +++ b/R/madc2vcf_all.R @@ -66,9 +66,9 @@ madc2vcf_all <- function(madc, vmsg("Checking inputs", verbose = verbose, level = 0, type = ">>") # Input checks - if(!(file.exists(madc) | url_exists(madc))) stop("MADC file not found. Please provide a valid path or URL.") - if(!(file.exists(botloci_file) | url_exists(botloci_file))) stop("Botloci file not found. Please provide a valid path or URL.") - if(!is.null(hap_seq_file) & !(file.exists(hap_seq_file) | url_exists(hap_seq_file))) stop("Haplotype sequence file not found. Please provide a valid path or URL.") + if(is.null(madc) || !(file.exists(madc) | url_exists(madc))) stop("MADC file not found. Please provide a valid path or URL.") + if(is.null(botloci_file) || !(file.exists(botloci_file) | url_exists(botloci_file))) stop("Botloci file not found. Please provide a valid path or URL.") + if(!is.null(hap_seq_file) && !(file.exists(hap_seq_file) | url_exists(hap_seq_file))) stop("Haplotype sequence file not found. Please provide a valid path or URL.") ## n.cores as integer if(!is.numeric(n.cores) | n.cores < 1) stop("n.cores should be a positive integer.") @@ -153,7 +153,7 @@ madc2vcf_all <- function(madc, if(any(checks$checks[c("Indels")])){ idx <- which((checks$checks[c("Indels")])) if(is.null(markers_info)) { - vmsg("The MADC file contains indels and markers_info file is not provided. Tags with indels as targets will be flagged with warnings and removed from the analysis. Provide markers_info with REF/ALT/Indel_pos if you want to include the targets indels.",verbose = verbose, level = 1, type = ">>>") + vmsg("The MADC file contains indels and markers_info file is not provided. Tags with indels as targets will be flagged with warnings and removed from the analysis. Provide markers_info with REF/ALT/Indel_pos if you want to include the targets indels.",verbose = verbose, level = 1, type = ">>") } else { if(checks$checks["Indels"] && !all(c("Ref", "Alt", "Indel_pos") %in% colnames(mi_df))) @@ -166,7 +166,7 @@ madc2vcf_all <- function(madc, verbose = verbose, level = 1, sum(mi_df$Type == "SNP"), sum(mi_df$Type == "Indel")) } - vmsg("The MADC file contains indels and markers_info file was provided with all required columns. Target indels will be exported, but no off-targets are extracted from these tags due to higher likelihood of pairwise alignment errors.",verbose = verbose, level = 1, type = ">>>") + vmsg("The MADC file contains indels and markers_info file was provided with all required columns. Target indels will be exported, but no off-targets are extracted from these tags due to higher likelihood of pairwise alignment errors.",verbose = verbose, level = 1, type = ">>") } } @@ -186,28 +186,35 @@ madc2vcf_all <- function(madc, checked_botloci <- check_botloci(botloci, report, ChromPos = checks$checks["ChromPos"], mi_df = mi_df, verbose = verbose) botloci <- checked_botloci[[1]] report <- checked_botloci[[2]] + mi_df <- checked_botloci[[3]] + + # Derive position padding width from CloneIDs in the original report + pad_width <- unique(nchar(sub(".*_", "", unique(report$CloneID)))) + if(length(pad_width) != 1) warning("CloneIDs in the MADC report have inconsistent position padding widths. IDs in the VCF may be inconsistent.") + pad_width <- pad_width[1] vmsg("Input checks done!", verbose = verbose, level = 1, type = ">>") - vmsg("Starting conversion...", verbose = verbose, level = 0, type = ">>") + vmsg("Initial filters and inputs adjustments...", verbose = verbose, level = 0, type = ">>") my_results_csv <- loop_though_dartag_report(report, botloci, hap_seq, n.cores=n.cores, alignment_score_thr = alignment_score_thr, + checks = checks, mi_df = mi_df, + pad_width = pad_width, verbose = verbose) - vmsg("All information gathered!", verbose = verbose, level = 1, type = ">>") - - vmsg("Adding information to a VCF body...", verbose = verbose, level = 0, type = ">>") + vmsg("All information gathered!", verbose = verbose, level = 0, type = ">>") vcf_body <- create_VCF_body(csv = my_results_csv, n.cores = n.cores, rm_multiallelic_SNP = rm_multiallelic_SNP, multiallelic_SNP_dp_thr = multiallelic_SNP_dp_thr, multiallelic_SNP_sample_thr = multiallelic_SNP_sample_thr, + pad_width = pad_width, verbose = verbose) #Make a header separate from the dataframe @@ -226,6 +233,8 @@ madc2vcf_all <- function(madc, vcf_term <- sapply(strsplit(out_vcf, "[.]"), function(x) x[length(x)]) if(length(vcf_term) != 0) if(vcf_term != "vcf") out_vcf <- paste0(out_vcf,".vcf") + vmsg("VCF ready! Output written to: %s", verbose = verbose, level = 0, type = ">>", out_vcf) + writeLines(vcf_header, con = out_vcf) suppressWarnings( write.table(vcf_body, file = out_vcf, sep = "\t", quote = FALSE, row.names = FALSE, col.names = TRUE, append = TRUE) @@ -244,7 +253,8 @@ madc2vcf_all <- function(madc, #' @import parallel #' #' @noRd -loop_though_dartag_report <- function(report, botloci, hap_seq, n.cores=1, alignment_score_thr=40, checks = NULL, mi_df = NULL, verbose = TRUE){ +loop_though_dartag_report <- function(report, botloci, hap_seq, n.cores=1, alignment_score_thr=40, + checks = NULL, mi_df = NULL, pad_width = NULL,verbose = TRUE){ if(!is.null(hap_seq) & (is.null(checks) | !isTRUE(checks$checks["RefAltSeqs"]))){ hap_seq <- get_ref_alt_hap_seq(hap_seq, botloci) @@ -274,23 +284,21 @@ loop_though_dartag_report <- function(report, botloci, hap_seq, n.cores=1, align if(!is.null(hap_seq)){ vmsg("The haplotype database was provided and used to recover missing Ref_0001 and Alt_0002 sequences.", verbose = verbose, level = 1) + vmsg("The Ref_0001 sequence were added for: %s tags", verbose = verbose, level = 2, type = ">>", sum(ref_index==1)) + vmsg("The Alt_0002 sequence were added for: %s tags", verbose = verbose, level = 2, type = ">>", sum(alt_index==1)) } else { vmsg("The haplotype database was not provided. Tags with missing Ref_0001 or Alt_0002 sequences were flagged with warnings and removed from the analysis.", verbose = verbose, level = 1) } - vmsg("The Ref_0001 sequence were added for: %s tags", verbose = verbose, level = 2, type = ">>>", sum(ref_index==1)) - vmsg("The Alt_0002 sequence were added for: %s tags", verbose = verbose, level = 2, type = ">>>", sum(alt_index==1)) - vmsg("Tags discarded due to lack of Ref_0001 sequence: %s tags", verbose = verbose, level = 2, type = ">>>", sum(ref_index==-1)) - vmsg("Tags discarded due to lack of Alt_0002 sequence: %s tags", verbose = verbose, level = 2, type = ">>>", sum(alt_index==-1)) + vmsg("Tags discarded due to lack of Ref_0001 sequence: %s tags", verbose = verbose, level = 2, type = ">>", sum(ref_index==-1)) + vmsg("Tags discarded due to lack of Alt_0002 sequence: %s tags", verbose = verbose, level = 2, type = ">>", sum(alt_index==-1)) - vmsg("Pairwise alignments of sequences to recover SNP position, reference and alternative bases...", verbose = verbose, level = 1) + vmsg("Pairwise alignments of sequences to recover SNP position, reference and alternative bases...", verbose = verbose, level = 0) clust <- makeCluster(n.cores) #clusterExport(clust, c("botloci", "compare", "nucleotideSubstitutionMatrix", "pairwiseAlignment", "DNAString", "reverseComplement")) #clusterExport(clust, c("botloci", "alignment_score_thr", "mi_df")) compare_results <- parLapply(clust, updated_by_cloneID, function(x) compare(x, botloci, alignment_score_thr, mi_df, verbose = FALSE)) stopCluster(clust) - vmsg("Pairwise alignments concluded.", verbose = verbose, level = 1) - my_results_csv <- lapply(compare_results, "[[", 1) my_results_csv <- do.call(rbind, my_results_csv) @@ -301,9 +309,19 @@ loop_though_dartag_report <- function(report, botloci, hap_seq, n.cores=1, align rm_indels <- sapply(compare_results, "[[", 4) rm_indels <- unlist(rm_indels) - vmsg("Number of tags removed because of low alignment score: %s tags", verbose = verbose, level = 2, type = ">>>", length(rm_score)) - vmsg("Number of tags removed because of N in the alternative sequence: %s tags", verbose = verbose, level = 2, type = ">>>", length(rm_N)) - vmsg("Number of tags removed because of indels as targets (not yet supported): %s tags", verbose = verbose, level = 2, type = ">>>", length(rm_indels)) + vmsg("Number of tags removed because of low alignment score (threshold = %s): %s tags", verbose = verbose, level = 2, type = ">>", alignment_score_thr, length(rm_score)) + vmsg("Number of tags removed because of N in the alternative sequence: %s tags", verbose = verbose, level = 2, type = ">>", length(rm_N)) + if(length(rm_indels) > 0) { + if(!is.null(mi_df) && all(c("Ref", "Alt", "Indel_pos") %in% colnames(mi_df))) { + vmsg("Number of tags with indels as targets: %s tags (markers_info provided with required columns; targets exported, off-targets skipped)", verbose = verbose, level = 2, type = ">>", length(rm_indels)) + } else { + vmsg("Number of tags removed because of indels as targets: %s tags (no markers_info with Ref/Alt/Indel_pos provided; tags discarded)", verbose = verbose, level = 2, type = ">>", length(rm_indels)) + } + } else { + vmsg("Number of tags removed because of indels as targets: 0 tags", verbose = verbose, level = 2, type = ">>") + } + + vmsg("Pairwise alignments concluded!", verbose = verbose, level = 1) rownames(my_results_csv) <- NULL return(my_results_csv) @@ -417,7 +435,7 @@ compare <- function(one_tag, botloci, alignment_score_thr = 40, mi_df= NULL, ver stop("Duplicate CloneID '", cloneID, "' found in markers_info with differing values in key columns (CloneID, Chr, Pos, Ref, Alt, Type). Please resolve the conflict in your markers_info file.") } } - isIndel <- tolower(one_mi_df$Type) == "indel" + isIndel <- !is.null(one_mi_df$Type) && !is.na(one_mi_df$Type) && tolower(one_mi_df$Type) == "indel" } else { isIndel <- FALSE } @@ -597,6 +615,7 @@ create_VCF_body <- function(csv, multiallelic_SNP_dp_thr = 2, multiallelic_SNP_sample_thr = 10, n.cores = 1, + pad_width = NULL, verbose = TRUE){ # Make sure counts are numeric @@ -607,13 +626,31 @@ create_VCF_body <- function(csv, clust <- makeCluster(n.cores) #clusterExport(clust, c("merge_counts","rm_multiallelic_SNP", "multiallelic_SNP_dp_thr", "multiallelic_SNP_sample_thr")) - vcf_tag_list <- parLapply(clust, cloneID, function(x) merge_counts(x, rm_multiallelic_SNP, multiallelic_SNP_dp_thr, multiallelic_SNP_sample_thr)) + vcf_tag_list <- parLapply(clust, cloneID, function(x) merge_counts(x, rm_multiallelic_SNP, multiallelic_SNP_dp_thr, multiallelic_SNP_sample_thr, pad_width)) stopCluster(clust) vcf_tag_list1 <- lapply(vcf_tag_list, "[[", 1) - rm_mks <- sapply(vcf_tag_list, "[[" ,2) - - vmsg("SNP removed because presented more than one allele: %s", verbose = verbose, level = 2, type = ">>>",sum(rm_mks)) + rm_mks <- sapply(vcf_tag_list, "[[", 2) # total removed + total_mks <- sapply(vcf_tag_list, "[[", 3) # total multiallelic found + rm_setting <- sapply(vcf_tag_list, "[[", 4) # removed by rm_multiallelic_SNP=TRUE + rm_filter <- sapply(vcf_tag_list, "[[", 5) # removed because empty after filtering + kept_multi <- sapply(vcf_tag_list, "[[", 6) # kept as multiallelic + simplified <- sapply(vcf_tag_list, "[[", 7) # simplified to biallelic + + vmsg("Performing final filterings", verbose = verbose, level = 0, type = ">>") + + vmsg("Multiallelic off-target SNPs found: %s", verbose = verbose, level = 2, type = ">>", sum(total_mks)) + if(rm_multiallelic_SNP) { + vmsg("Removed (rm_multiallelic_SNP = TRUE): %s", verbose = verbose, level = 3, type = ">>", sum(rm_setting)) + } else if(multiallelic_SNP_dp_thr > 0 & multiallelic_SNP_sample_thr > 0) { + vmsg("Removed (empty after filtering; depth thr = %s, sample thr = %s): %s", + verbose = verbose, level = 3, type = ">>", + multiallelic_SNP_dp_thr, multiallelic_SNP_sample_thr, sum(rm_filter)) + vmsg("Kept as multiallelic after filtering: %s", verbose = verbose, level = 3, type = ">>", sum(kept_multi)) + vmsg("Simplified to biallelic after filtering: %s", verbose = verbose, level = 3, type = ">>", sum(simplified)) + } else { + vmsg("All kept (rm_multiallelic_SNP = FALSE, no thresholds set): %s", verbose = verbose, level = 3, type = ">>", sum(kept_multi)) + } for(i in seq_along(vcf_tag_list1)) { if(is.vector(vcf_tag_list1[[i]])) { @@ -626,12 +663,6 @@ create_VCF_body <- function(csv, vcf_body$V3 <- as.numeric(vcf_body$V3) rownames(vcf_body) <- NULL - # Remove padding - sp <- strsplit(vcf_body$target, "_") - pos <- sapply(sp, function(x) x[length(x)]) - chr <- sapply(sp, function(x) paste0(x[-length(x)], collapse = "_")) - vcf_body$target <- paste0(chr, "_",as.numeric(pos)) - # Dealing with repeated positions # discard the ones that are not the target and keep only the first if all are off-targets if(length(which(duplicated(vcf_body[,3]))) > 0){ @@ -655,6 +686,10 @@ create_VCF_body <- function(csv, vcf_body_new <- rbind(vcf_body_new, repeated_tab_stay) } else vcf_body_new <- vcf_body + vmsg("Filters finished", verbose = verbose, level = 1, type = ">>") + + vmsg("Preparing VCF...", verbose = verbose, level = 0, type = ">>") + vcf_body_new <- vcf_body_new[,-1] colnames(vcf_body_new) <- c("#CHROM", "POS", "ID", "REF", "ALT","QUAL", "FILTER", "INFO","FORMAT", colnames(csv)[-c(1:7)]) @@ -676,18 +711,24 @@ create_VCF_body <- function(csv, #' aspect of the marker, the marker is discarded. This is likely to happen to paralogous sites. #' #' @noRd -merge_counts <- function(cloneID_unit, rm_multiallelic_SNP = FALSE, multiallelic_SNP_dp_thr = 0, multiallelic_SNP_sample_thr = 0){ +merge_counts <- function(cloneID_unit, rm_multiallelic_SNP = FALSE, multiallelic_SNP_dp_thr = 0, multiallelic_SNP_sample_thr = 0, pad_width = NULL){ #Get counts for target SNP - rm <- 0 + rm_by_setting <- 0 # removed because rm_multiallelic_SNP = TRUE + rm_by_filter <- 0 # removed because empty after threshold filtering + kept_multiallelic <- 0 # kept as-is (still multiallelic after filtering or no filter) + simplified <- 0 # simplified from multiallelic to biallelic by filtering + total_multiallelic <- 0 RefTag <- apply(cloneID_unit[which(grepl("Ref", cloneID_unit$AlleleID) & !duplicated(cloneID_unit$AlleleID)),-c(1:7)], 2, sum) AltTag <- apply(cloneID_unit[which(grepl("Alt", cloneID_unit$AlleleID) & !duplicated(cloneID_unit$AlleleID)),-c(1:7)], 2, sum) tab_counts <- paste0(RefTag + AltTag, ":", RefTag, ":", RefTag, ",", AltTag) + cloneID <- cloneID_unit$CloneID[1] + if(is.null(pad_width)) pad_width <- nchar(sub(".*_", "", cloneID)) info <- cloneID_unit[grep("Ref_", cloneID_unit$AlleleID),] info <- c(info$Chromosome, info$SNP_position_in_Genome, - paste0(info$Chromosome, "_", info$SNP_position_in_Genome), + cloneID, info$Ref, info$Alt, ".", ".", paste0("DP=", sum(c(RefTag, AltTag)),";", "ADS=",sum(RefTag),",",sum(AltTag)), "DP:RA:AD") @@ -704,19 +745,21 @@ merge_counts <- function(cloneID_unit, rm_multiallelic_SNP = FALSE, multiallelic alleles <- unique(by_pos[[i]]$AlleleID) if(length(unique(by_pos[[i]]$Alt)) > 1){ # If SNP is multiallelic + total_multiallelic <- total_multiallelic + 1 if(rm_multiallelic_SNP){ # option to remove multiallelics - rm <- rm + 1 + rm_by_setting <- rm_by_setting + 1 next() } else if(multiallelic_SNP_dp_thr > 0 & multiallelic_SNP_sample_thr > 0){ # If not removed, user can set threshold to remove low frequency alleles rm.idx <- which(apply(by_pos[[i]][,-c(1:7)], 1, function(x) sum(x > multiallelic_SNP_dp_thr) < multiallelic_SNP_sample_thr)) if(length(rm.idx)) up_by_pos <- by_pos[[i]][-rm.idx,] else up_by_pos <- by_pos[[i]] if(length(unique(up_by_pos$Alt)) == 0) { # If after applied filter all tags are gone - rm <- rm + 1 + rm_by_filter <- rm_by_filter + 1 next() } else if (length(unique(up_by_pos$Alt)) > 1 ){ # If after applied filter the SNP remains multiallelic + kept_multiallelic <- kept_multiallelic + 1 by_alt <- split.data.frame(up_by_pos, up_by_pos$Alt) by_alt_counts <- lapply(by_alt, function(x) apply(x[,-c(1:7)], 2, sum)) total_counts <- sapply(by_alt_counts, sum) @@ -731,12 +774,14 @@ merge_counts <- function(cloneID_unit, rm_multiallelic_SNP = FALSE, multiallelic info <- unique.data.frame(info) } else { # If after applied filter, only one alternative remains + simplified <- simplified + 1 alt <- apply(up_by_pos[,-c(1:7)], 2, sum) total_alt <- alt info <- unique.data.frame(up_by_pos[,2:5]) } } else { # If rm_multiallelic_SNP set to FALSE and threshold is 0, keep all multiallelics + kept_multiallelic <- kept_multiallelic + 1 by_alt <- split.data.frame(by_pos[[i]], by_pos[[i]]$Alt) by_alt_counts <- lapply(by_alt, function(x) apply(x[,-c(1:7)], 2, sum)) total_counts <- sapply(by_alt_counts, sum) @@ -763,7 +808,7 @@ merge_counts <- function(cloneID_unit, rm_multiallelic_SNP = FALSE, multiallelic info <- c(info$Chromosome, info$SNP_position_in_Genome, - paste0(info$Chromosome, "_", info$SNP_position_in_Genome), + paste0(info$Chromosome, "_", formatC(as.integer(as.numeric(info$SNP_position_in_Genome)), width = pad_width, flag = "0", format = "d")), info$Ref, info$Alt, ".", ".", paste0("DP=", sum(c(ref, total_alt)),";", "ADS=",sum(ref),",",sum(total_alt)), "DP:RA:AD") @@ -773,5 +818,5 @@ merge_counts <- function(cloneID_unit, rm_multiallelic_SNP = FALSE, multiallelic } } - return(list(vcf_tag, rm)) + return(list(vcf_tag, rm_by_setting + rm_by_filter, total_multiallelic, rm_by_setting, rm_by_filter, kept_multiallelic, simplified)) } diff --git a/R/madc2vcf_targets.R b/R/madc2vcf_targets.R index c7fd269..80af70f 100644 --- a/R/madc2vcf_targets.R +++ b/R/madc2vcf_targets.R @@ -260,6 +260,7 @@ madc2vcf_targets <- function(madc_file, checked_botloci <- check_botloci(botloci, report, ChromPos = checks$checks["ChromPos"], mi_df = mi_df, verbose = verbose) botloci <- checked_botloci[[1]] report <- checked_botloci[[2]] + mi_df <- checked_botloci[[3]] } } diff --git a/tests/testthat/test-madc2vcf_all.R b/tests/testthat/test-madc2vcf_all.R index 7904035..f88adc9 100644 --- a/tests/testthat/test-madc2vcf_all.R +++ b/tests/testthat/test-madc2vcf_all.R @@ -99,12 +99,14 @@ test_that("simu alfalfa",{ test_that("ALFALFA — clean fixed allele ID MADC", { out <- tempfile(fileext = ".vcf") + #out <- "test.vcf" + # Default parameters expect_no_error( madc2vcf_all(madc = alfalfa_madc, botloci_file = alfalfa_botloci, hap_seq_file = alfalfa_microhapDB, n.cores = 2, - rm_multiallelic_SNP = TRUE, + rm_multiallelic_SNP = FALSE, multiallelic_SNP_sample_thr = 0, multiallelic_SNP_dp_thr = 0, alignment_score_thr = 40, @@ -117,7 +119,10 @@ test_that("simu alfalfa",{ expect_true(all(!is.na(vcf@fix[, "ALT"]))) DP <- extract.gt(vcf, "DP", as.numeric = TRUE) expect_equal(sum(DP[1,]), 4534) - expect_equal(sum(DP[,5]), 233482) + expect_equal(sum(DP[,5]), 235217) + multi <- grepl(",", vcf@fix[,5]) + expect_true(any(multi)) # It has multiallelics + expect_equal(sum(multi), 9) unlink(out) expect_no_error( @@ -126,15 +131,18 @@ test_that("simu alfalfa",{ hap_seq_file = NULL, n.cores = 1, out_vcf = out, - verbose = FALSE) + verbose = TRUE) ) vcf <- read.vcfR(out, verbose = FALSE) expect_s4_class(vcf, "vcfR") - expect_true(all(is.na(vcf@fix[, "REF"]))) - expect_true(all(is.na(vcf@fix[, "ALT"]))) + expect_true(all(!is.na(vcf@fix[, "REF"]))) + expect_true(all(!is.na(vcf@fix[, "ALT"]))) DP <- extract.gt(vcf, "DP", as.numeric = TRUE) expect_equal(sum(DP[1,]), 4534) - expect_equal(sum(DP[,5]), 56547) + expect_equal(sum(DP[,5]), 235217) + multi <- grepl(",", vcf@fix[,5]) + expect_true(any(multi)) # It has multiallelics + expect_equal(sum(multi), 9) # Test error when botloci_file is NULL expect_error( @@ -146,45 +154,48 @@ test_that("simu alfalfa",{ verbose = FALSE) ) - # Test that it works when hap_seq_file is provided (REF/ALT recovered from probe sequences) + # Test that it also works when markers_info is provided together with botloci madc2vcf_all(madc = alfalfa_madc, botloci_file = alfalfa_botloci, hap_seq_file = alfalfa_microhapDB, + multiallelic_SNP_dp_thr = 80, + multiallelic_SNP_sample_thr = 2, n.cores = 1, + markers_info = alfalfa_markers_info, out_vcf = out, - verbose = FALSE) + verbose = TRUE) vcf <- read.vcfR(out, verbose = FALSE) - lut <- read.csv(alfalfa_markers_info) - vcf_infos <- vcf@fix[,c(1:5)] - lut_infos <- lut[match(vcf@fix[,3],lut$BI_markerID),c(2:6)] - check <- cbind(vcf_infos,lut_infos) - expect_equal(check$REF, check$Ref) - expect_equal(check$ALT, check$Alt) - expect_equal(as.numeric(check$POS), check$Pos) - dp <- extract.gt(vcf, "DP", as.numeric = TRUE) - expect_equal(sum(dp[,10]), 43691) + expect_s4_class(vcf, "vcfR") + expect_true(all(!is.na(vcf@fix[, "REF"]))) + expect_true(all(!is.na(vcf@fix[, "ALT"]))) + DP <- extract.gt(vcf, "DP", as.numeric = TRUE) + expect_equal(sum(DP[1,]), 4534) + expect_equal(sum(DP[,5]), 234777) + multi <- grepl(",", vcf@fix[,5]) + expect_true(any(multi)) # It has multiallelics + expect_equal(sum(multi), 3) - # Test that it also works when markers_info is provided together with botloci + # Remove multiallelics madc2vcf_all(madc = alfalfa_madc, botloci_file = alfalfa_botloci, hap_seq_file = alfalfa_microhapDB, + rm_multiallelic_SNP = TRUE, n.cores = 1, markers_info = alfalfa_markers_info, out_vcf = out, - verbose = FALSE) + verbose = TRUE) vcf <- read.vcfR(out, verbose = FALSE) - lut <- read.csv(alfalfa_markers_info) - vcf_infos <- vcf@fix[,c(1:5)] - lut_infos <- lut[match(vcf@fix[,3],lut$BI_markerID),c(2:6)] - check <- cbind(vcf_infos,lut_infos) - expect_equal(check$REF, check$Ref) - expect_equal(check$ALT, check$Alt) - expect_equal(as.numeric(check$POS), check$Pos) - dp <- extract.gt(vcf, "DP", as.numeric = TRUE) - expect_equal(sum(dp[,10]), 43691) - + expect_s4_class(vcf, "vcfR") + expect_true(all(!is.na(vcf@fix[, "REF"]))) + expect_true(all(!is.na(vcf@fix[, "ALT"]))) + DP <- extract.gt(vcf, "DP", as.numeric = TRUE) + expect_equal(sum(DP[1,]), 4534) + expect_equal(sum(DP[,5]), 233482) + multi <- grepl(",", vcf@fix[,5]) + expect_false(any(multi)) # It has multiallelics + expect_equal(sum(multi), 0) }) test_that("ALFALFA — clean fixed allele ID MADC wrong CloneID", { @@ -208,145 +219,113 @@ test_that("simu alfalfa",{ n.cores = 1, markers_info = alfalfa_markers_info, out_vcf = out, - verbose = FALSE) + verbose = FALSE), + regexp = "None of the markers_info CloneID values match the MADC CloneID column. Please make sure they use the same marker IDs." ) # Test error when markers_info_ChromPos is provided but IDs still don't match botloci - expect_error( - madc2vcf_all(madc = alfalfa_madc_wrongID, - botloci_file = alfalfa_botloci, - hap_seq_file = alfalfa_microhapDB, - n.cores = 1, - markers_info = alfalfa_markers_info_ChromPos, - out_vcf = out, - verbose = FALSE) - ) - }) - - test_that("alfalfa lower case fixed MADC", { - out <- tempfile(fileext = ".vcf") - madc2vcf_all(madc = alfalfa_lowercase, + madc2vcf_all(madc = alfalfa_madc_wrongID, botloci_file = alfalfa_botloci, hap_seq_file = alfalfa_microhapDB, n.cores = 1, + markers_info = alfalfa_markers_info_ChromPos, out_vcf = out, - verbose = FALSE) + verbose = TRUE) vcf <- read.vcfR(out, verbose = FALSE) + expect_s4_class(vcf, "vcfR") lut <- read.csv(alfalfa_markers_info) vcf_infos <- vcf@fix[,c(1:5)] lut_infos <- lut[match(vcf@fix[,3],lut$BI_markerID),c(2:6)] check <- cbind(vcf_infos,lut_infos) + check <- check[-which(is.na(check$Pos)),] expect_equal(check$REF, check$Ref) expect_equal(check$ALT, check$Alt) expect_equal(as.numeric(check$POS), check$Pos) - dp <- extract.gt(vcf, "DP", as.numeric = TRUE) - expect_equal(sum(dp[,10]), 43691) + DP <- extract.gt(vcf, "DP", as.numeric = TRUE) + expect_equal(sum(DP[1,]), 4534) + expect_equal(sum(DP[,5]), 235217) + multi <- grepl(",", vcf@fix[,5]) + }) + + test_that("alfalfa lower case missing 3 ref and 1 alt fixed MADC", { + out <- tempfile(fileext = ".vcf") madc2vcf_all(madc = alfalfa_lowercase, botloci_file = alfalfa_botloci, hap_seq_file = alfalfa_microhapDB, n.cores = 1, - markers_info = alfalfa_markers_info, out_vcf = out, - verbose = FALSE) + verbose = TRUE) vcf <- read.vcfR(out, verbose = FALSE) lut <- read.csv(alfalfa_markers_info) vcf_infos <- vcf@fix[,c(1:5)] lut_infos <- lut[match(vcf@fix[,3],lut$BI_markerID),c(2:6)] check <- cbind(vcf_infos,lut_infos) + check <- check[-which(is.na(check$Pos)),] expect_equal(check$REF, check$Ref) expect_equal(check$ALT, check$Alt) expect_equal(as.numeric(check$POS), check$Pos) dp <- extract.gt(vcf, "DP", as.numeric = TRUE) - expect_equal(sum(dp[,10]), 43691) + expect_equal(sum(dp[1,]), 4534) + expect_equal(sum(dp[,5]), 233719) madc2vcf_all(madc = alfalfa_lowercase, botloci_file = alfalfa_botloci, hap_seq_file = NULL, n.cores = 1, - markers_info = alfalfa_markers_info, out_vcf = out, - verbose = FALSE) + verbose = TRUE) vcf <- read.vcfR(out, verbose = FALSE) lut <- read.csv(alfalfa_markers_info) vcf_infos <- vcf@fix[,c(1:5)] lut_infos <- lut[match(vcf@fix[,3],lut$BI_markerID),c(2:6)] check <- cbind(vcf_infos,lut_infos) + check <- check[-which(is.na(check$Pos)),] + expect_equal(check$REF, check$Ref) + expect_equal(check$ALT, check$Alt) expect_equal(as.numeric(check$POS), check$Pos) dp <- extract.gt(vcf, "DP", as.numeric = TRUE) - expect_equal(sum(dp[,10]), 43691) - }) - - test_that("alfalfa IUPAC code", { - out <- tempfile(fileext = ".vcf") - # IUPAC codes cause a stop in madc2vcf_all - expect_error( - madc2vcf_all(madc = alfalfa_iupac, - botloci_file = alfalfa_botloci, - hap_seq_file = alfalfa_microhapDB, - n.cores = 1, - out_vcf = out, - verbose = FALSE) - ) + expect_equal(sum(dp[1,]), 4534) + expect_equal(sum(dp[,5]), 230415) - madc2vcf_all(madc = alfalfa_iupac, + madc2vcf_all(madc = alfalfa_lowercase, botloci_file = alfalfa_botloci, - hap_seq_file = alfalfa_microhapDB, + hap_seq_file = NULL, n.cores = 1, markers_info = alfalfa_markers_info, out_vcf = out, - verbose = FALSE) + verbose = TRUE) vcf <- read.vcfR(out, verbose = FALSE) lut <- read.csv(alfalfa_markers_info) vcf_infos <- vcf@fix[,c(1:5)] lut_infos <- lut[match(vcf@fix[,3],lut$BI_markerID),c(2:6)] check <- cbind(vcf_infos,lut_infos) - expect_equal(as.numeric(check$POS), check$Pos) - dp <- extract.gt(vcf, "DP", as.numeric = TRUE) - expect_equal(sum(dp[,10]), 43691) + check <- check[-which(is.na(check$Pos)),] expect_equal(check$REF, check$Ref) expect_equal(check$ALT, check$Alt) - - madc2vcf_all(madc = alfalfa_iupac, - botloci_file = alfalfa_botloci, - hap_seq_file = NULL, - n.cores = 1, - markers_info = alfalfa_markers_info, - out_vcf = out, - verbose = FALSE) - - vcf <- read.vcfR(out, verbose = FALSE) - lut <- read.csv(alfalfa_markers_info) - vcf_infos <- vcf@fix[,c(1:5)] - lut_infos <- lut[match(vcf@fix[,3],lut$BI_markerID),c(2:6)] - check <- cbind(vcf_infos,lut_infos) expect_equal(as.numeric(check$POS), check$Pos) dp <- extract.gt(vcf, "DP", as.numeric = TRUE) expect_equal(sum(dp[1,]), 4534) - expect_equal(sum(dp[,5]), 56547) - expect_equal(check$REF, check$Ref) - expect_equal(check$ALT, check$Alt) - - madc2vcf_all(madc = alfalfa_iupac, - botloci_file = alfalfa_botloci, - hap_seq_file = NULL, - n.cores = 1, - out_vcf = out, - verbose = FALSE) + expect_equal(sum(dp[,5]), 230415) - vcf <- read.vcfR(out, verbose = FALSE) - lut <- read.csv(alfalfa_markers_info) - vcf_infos <- vcf@fix[,c(1:5)] - lut_infos <- lut[match(vcf@fix[,3],lut$BI_markerID),c(2:6)] - check <- cbind(vcf_infos,lut_infos) - expect_equal(as.numeric(check$POS), check$Pos) - dp <- extract.gt(vcf, "DP", as.numeric = TRUE) - expect_equal(sum(dp[,10]), 43691) + }) + test_that("alfalfa IUPAC code", { + out <- tempfile(fileext = ".vcf") + # IUPAC codes cause a stop in madc2vcf_all + expect_error( + madc2vcf_all(madc = alfalfa_iupac, + botloci_file = alfalfa_botloci, + hap_seq_file = alfalfa_microhapDB, + n.cores = 1, + out_vcf = out, + verbose = FALSE), + regexp = "IUPAC \\(non-ATCG\\) codes found in AlleleSequence\\. This codes are not currently supported by BIGr/BIGapp\\. Run HapApp to replace them" + ) }) test_that("potato indel madc chrompos=FALSE", { @@ -358,7 +337,9 @@ test_that("simu alfalfa",{ hap_seq_file = potato_microhapDB, n.cores = 1, out_vcf = out, - verbose = FALSE) + verbose = FALSE), + regexp = "CloneID does not have the expected Chromosome_Position format. Please check your CloneIDs or provide a file with this information" + ) madc2vcf_all(madc = potato_indel_madc, @@ -367,16 +348,18 @@ test_that("simu alfalfa",{ n.cores = 1, markers_info = potato_markers_info, out_vcf = out, - verbose = FALSE) + verbose = TRUE) vcf <- read.vcfR(out, verbose = FALSE) lut <- read.csv(potato_markers_info) vcf_infos <- vcf@fix[,c(1:5)] lut_infos <- lut[match(vcf@fix[,3],lut$BI_markerID),c(2:6)] check <- cbind(vcf_infos,lut_infos) + check <- check[-which(is.na(check$Ref)),] expect_equal(as.numeric(check$POS), check$Pos) dp <- extract.gt(vcf, "DP", as.numeric = TRUE) - expect_equal(sum(dp[,10]), 41656) + expect_equal(sum(dp[,10]), 43017) + expect_equal(sum(dp[3,]), 5073) expect_equal(check$REF, check$Ref) expect_equal(check$ALT, check$Alt) @@ -393,10 +376,11 @@ test_that("simu alfalfa",{ vcf_infos <- vcf@fix[,c(1:5)] lut_infos <- lut[match(vcf@fix[,3],lut$BI_markerID),c(2:6)] check <- cbind(vcf_infos,lut_infos) + check <- check[-which(is.na(check$Ref)),] expect_equal(as.numeric(check$POS), check$Pos) dp <- extract.gt(vcf, "DP", as.numeric = TRUE) - expect_equal(sum(dp[1,]), 5163) - expect_equal(sum(dp[,5]), 58927) + expect_equal(sum(dp[1,]), 3937) + expect_equal(sum(dp[,5]), 248571) expect_equal(check$REF, check$Ref) expect_equal(check$ALT, check$Alt) @@ -407,25 +391,10 @@ test_that("simu alfalfa",{ hap_seq_file = NULL, n.cores = 1, out_vcf = out, - verbose = FALSE) + verbose = FALSE), + regexp = "CloneID does not have the expected Chromosome_Position format. Please check your CloneIDs or provide a file with this information" ) - madc2vcf_all(madc = potato_indel_madc, - botloci_file = potato_botloci, - hap_seq_file = NULL, - n.cores = 1, - markers_info = potato_markers_info, - out_vcf = out, - verbose = FALSE) - - vcf <- read.vcfR(out, verbose = FALSE) - lut <- read.csv(potato_markers_info) - vcf_infos <- vcf@fix[,c(1:5)] - lut_infos <- lut[match(vcf@fix[,3],lut$BI_markerID),c(2:6)] - check <- cbind(vcf_infos,lut_infos) - expect_equal(as.numeric(check$POS), check$Pos) - dp <- extract.gt(vcf, "DP", as.numeric = TRUE) - expect_equal(sum(dp[,10]), 41656) }) test_that("potato indel chromposFALSE", { @@ -437,7 +406,8 @@ test_that("simu alfalfa",{ hap_seq_file = potato_microhapDB, n.cores = 1, out_vcf = out, - verbose = FALSE) + verbose = FALSE), + regexp = "CloneID does not have the expected Chromosome_Position format. Please check your CloneIDs or provide a file with this information" ) madc2vcf_all(madc = potato_more_indels_chrompos_false, @@ -453,9 +423,11 @@ test_that("simu alfalfa",{ vcf_infos <- vcf@fix[,c(1:5)] lut_infos <- lut[match(vcf@fix[,3],lut$BI_markerID),c(2:6)] check <- cbind(vcf_infos,lut_infos) + check <- check[-which(is.na(check$Ref)),] expect_equal(as.numeric(check$POS), check$Pos) dp <- extract.gt(vcf, "DP", as.numeric = TRUE) - expect_equal(sum(dp[,10]), 41755) + expect_equal(sum(dp[1,]), 5397) + expect_equal(sum(dp[,5]), 215070) expect_equal(check$REF, check$Ref) expect_equal(check$ALT, check$Alt) @@ -472,10 +444,11 @@ test_that("simu alfalfa",{ vcf_infos <- vcf@fix[,c(1:5)] lut_infos <- lut[match(vcf@fix[,3],lut$BI_markerID),c(2:6)] check <- cbind(vcf_infos,lut_infos) + check <- check[-which(is.na(check$Ref)),] expect_equal(as.numeric(check$POS), check$Pos) dp <- extract.gt(vcf, "DP", as.numeric = TRUE) - expect_equal(sum(dp[1,]), 6301) - expect_equal(sum(dp[,5]), 53613) + expect_equal(sum(dp[1,]), 5397) + expect_equal(sum(dp[,5]), 215070) expect_equal(check$REF, check$Ref) expect_equal(check$ALT, check$Alt) @@ -486,25 +459,9 @@ test_that("simu alfalfa",{ hap_seq_file = NULL, n.cores = 1, out_vcf = out, - verbose = FALSE) + verbose = FALSE), + regexp = "CloneID does not have the expected Chromosome_Position format. Please check your CloneIDs or provide a file with this information" ) - - madc2vcf_all(madc = potato_more_indels_chrompos_false, - botloci_file = potato_botloci, - hap_seq_file = NULL, - n.cores = 1, - markers_info = potato_markers_info, - out_vcf = out, - verbose = FALSE) - - vcf <- read.vcfR(out, verbose = FALSE) - lut <- read.csv(potato_markers_info) - vcf_infos <- vcf@fix[,c(1:5)] - lut_infos <- lut[match(vcf@fix[,3],lut$BI_markerID),c(2:6)] - check <- cbind(vcf_infos,lut_infos) - expect_equal(as.numeric(check$POS), check$Pos) - dp <- extract.gt(vcf, "DP", as.numeric = TRUE) - expect_equal(sum(dp[,10]), 41755) }) test_that("potato lowercase", { @@ -515,13 +472,15 @@ test_that("simu alfalfa",{ n.cores = 1, markers_info = potato_markers_info, out_vcf = out, - verbose = FALSE) + verbose = TRUE) vcf <- read.vcfR(out, verbose = FALSE) lut <- read.csv(potato_markers_info) vcf_infos <- vcf@fix[,c(1:5)] lut_infos <- lut[match(vcf@fix[,3],lut$BI_markerID),c(2:6)] check <- cbind(vcf_infos,lut_infos) + check <- check[-which(is.na(check$Ref)),] + expect_equal(as.numeric(check$POS), check$Pos) dp <- extract.gt(vcf, "DP", as.numeric = TRUE) expect_equal(sum(dp[,10]), 41755) @@ -536,165 +495,36 @@ test_that("simu alfalfa",{ n.cores = 1, markers_info = potato_markers_info_ChromPos, out_vcf = out, - verbose = FALSE) + verbose = FALSE), + regexp = "Indels detected in MADC file. The markers_info file must contain 'Ref', 'Alt', and 'Indel_pos' columns." ) - - madc2vcf_all(madc = potato_indel_lowercase, - botloci_file = potato_botloci, - hap_seq_file = potato_microhapDB, - n.cores = 1, - markers_info = potato_markers_info, - out_vcf = out, - verbose = FALSE) - - vcf <- read.vcfR(out, verbose = FALSE) - lut <- read.csv(potato_markers_info) - vcf_infos <- vcf@fix[,c(1:5)] - lut_infos <- lut[match(vcf@fix[,3],lut$BI_markerID),c(2:6)] - check <- cbind(vcf_infos,lut_infos) - expect_equal(as.numeric(check$POS), check$Pos) - dp <- extract.gt(vcf, "DP", as.numeric = TRUE) - expect_equal(sum(dp[,10]), 41755) - expect_equal(check$REF, check$Ref) - expect_equal(check$ALT, check$Alt) - - madc2vcf_all(madc = potato_indel_lowercase, - botloci_file = potato_botloci, - hap_seq_file = NULL, - n.cores = 1, - markers_info = potato_markers_info, - out_vcf = out, - verbose = FALSE) - - vcf <- read.vcfR(out, verbose = FALSE) - lut <- read.csv(potato_markers_info) - vcf_infos <- vcf@fix[,c(1:5)] - lut_infos <- lut[match(vcf@fix[,3],lut$BI_markerID),c(2:6)] - check <- cbind(vcf_infos,lut_infos) - expect_equal(as.numeric(check$POS), check$Pos) - dp <- extract.gt(vcf, "DP", as.numeric = TRUE) - expect_equal(sum(dp[1,]), 6301) - expect_equal(sum(dp[,5]), 53613) - expect_equal(check$REF, check$Ref) - expect_equal(check$ALT, check$Alt) - - madc2vcf_all(madc = potato_indel_lowercase, - botloci_file = potato_botloci, - hap_seq_file = NULL, - n.cores = 1, - markers_info = potato_markers_info, - out_vcf = out, - verbose = FALSE) - - vcf <- read.vcfR(out, verbose = FALSE) - lut <- read.csv(potato_markers_info) - vcf_infos <- vcf@fix[,c(1:5)] - lut_infos <- lut[match(vcf@fix[,3],lut$BI_markerID),c(2:6)] - check <- cbind(vcf_infos,lut_infos) - expect_equal(as.numeric(check$POS), check$Pos) - dp <- extract.gt(vcf, "DP", as.numeric = TRUE) - expect_equal(sum(dp[,10]), 41755) }) - test_that("potato IUPAC", { out <- tempfile(fileext = ".vcf") - madc2vcf_all(madc = potato_indel_iupac, - botloci_file = potato_botloci, - hap_seq_file = potato_microhapDB, - n.cores = 1, - markers_info = potato_markers_info, - out_vcf = out, - verbose = FALSE) - - vcf <- read.vcfR(out, verbose = FALSE) - lut <- read.csv(potato_markers_info) - vcf_infos <- vcf@fix[,c(1:5)] - lut_infos <- lut[match(vcf@fix[,3],lut$BI_markerID),c(2:6)] - check <- cbind(vcf_infos,lut_infos) - expect_equal(as.numeric(check$POS), check$Pos) - dp <- extract.gt(vcf, "DP", as.numeric = TRUE) - expect_equal(sum(dp[,10]), 41755) - expect_equal(check$REF, check$Ref) - expect_equal(check$ALT, check$Alt) - - madc2vcf_all(madc = potato_indel_iupac, - botloci_file = potato_botloci, - hap_seq_file = NULL, - n.cores = 1, - markers_info = potato_markers_info, - out_vcf = out, - verbose = FALSE) - - vcf <- read.vcfR(out, verbose = FALSE) - lut <- read.csv(potato_markers_info) - vcf_infos <- vcf@fix[,c(1:5)] - lut_infos <- lut[match(vcf@fix[,3],lut$BI_markerID),c(2:6)] - check <- cbind(vcf_infos,lut_infos) - expect_equal(as.numeric(check$POS), check$Pos) - dp <- extract.gt(vcf, "DP", as.numeric = TRUE) - expect_equal(sum(dp[1,]), 6301) - expect_equal(sum(dp[,5]), 53613) - expect_equal(check$REF, check$Ref) - expect_equal(check$ALT, check$Alt) - - madc2vcf_all(madc = potato_indel_iupac, - botloci_file = potato_botloci, - hap_seq_file = NULL, - n.cores = 1, - markers_info = potato_markers_info, - out_vcf = out, - verbose = FALSE) - - vcf <- read.vcfR(out, verbose = FALSE) - lut <- read.csv(potato_markers_info) - vcf_infos <- vcf@fix[,c(1:5)] - lut_infos <- lut[match(vcf@fix[,3],lut$BI_markerID),c(2:6)] - check <- cbind(vcf_infos,lut_infos) - expect_equal(as.numeric(check$POS), check$Pos) - dp <- extract.gt(vcf, "DP", as.numeric = TRUE) - expect_equal(sum(dp[,10]), 41755) - }) - - test_that("alfalfa raw MADC format (7-row header)", { - out <- tempfile(fileext = ".vcf") - # Raw format fails FixAlleleIDs check -> madc2vcf_all stops - expect_error( - madc2vcf_all(madc = alfalfa_madc_raw, - botloci_file = alfalfa_botloci, - hap_seq_file = alfalfa_microhapDB, - n.cores = 1, - out_vcf = out, - verbose = FALSE) - ) expect_error( - madc2vcf_all(madc = alfalfa_madc_raw, - botloci_file = alfalfa_botloci, - hap_seq_file = NULL, - n.cores = 1, - out_vcf = out, - verbose = FALSE) - ) - - expect_error( - madc2vcf_all(madc = alfalfa_madc_raw, - botloci_file = alfalfa_botloci, - hap_seq_file = NULL, + madc2vcf_all(madc = potato_indel_iupac, + botloci_file = potato_botloci, + hap_seq_file = potato_microhapDB, n.cores = 1, - markers_info = alfalfa_markers_info, + markers_info = potato_markers_info, out_vcf = out, - verbose = FALSE) + verbose = TRUE), + regexp = "IUPAC \\(non-ATCG\\) codes found in AlleleSequence. This codes are not currently supported by BIGr/BIGapp. Run HapApp to replace them" ) + }) + test_that("alfalfa raw MADC format (7-row header)", { + out <- tempfile(fileext = ".vcf") + # Raw format fails FixAlleleIDs check -> madc2vcf_all stops expect_error( madc2vcf_all(madc = alfalfa_madc_raw, botloci_file = alfalfa_botloci, hap_seq_file = alfalfa_microhapDB, n.cores = 1, - markers_info = alfalfa_markers_info, out_vcf = out, - verbose = FALSE) + verbose = FALSE), regexp = "MADC not processed by HapApp" ) }) }) diff --git a/tests/testthat/test-madc2vcf_targets.R b/tests/testthat/test-madc2vcf_targets.R index ce2ab8e..9899e17 100644 --- a/tests/testthat/test-madc2vcf_targets.R +++ b/tests/testthat/test-madc2vcf_targets.R @@ -245,11 +245,13 @@ test_that("simu alfalfa",{ test_that("alfalfa lower case fixed MADC", { out <- tempfile(fileext = ".vcf") - madc2vcf_targets(madc_file = alfalfa_lowercase, - output.file = out, - get_REF_ALT = TRUE, - botloci_file = alfalfa_botloci, - verbose = FALSE) + expect_warning( + madc2vcf_targets(madc_file = alfalfa_lowercase, + output.file = out, + get_REF_ALT = TRUE, + botloci_file = alfalfa_botloci, + verbose = TRUE) + ) vcf <- read.vcfR(out, verbose = FALSE) lut <- read.csv(alfalfa_markers_info) @@ -260,14 +262,16 @@ test_that("simu alfalfa",{ expect_equal(check$ALT, check$Alt) expect_equal(as.numeric(check$POS), check$Pos) dp <- extract.gt(vcf, "DP", as.numeric = TRUE) - expect_equal(sum(dp[,10]), 43691) + expect_equal(sum(dp[,10]), 43017) - madc2vcf_targets(madc_file = alfalfa_lowercase, - output.file = out, - get_REF_ALT = TRUE, - botloci_file = alfalfa_botloci, - markers_info = alfalfa_markers_info, - verbose = FALSE) + expect_warning( + madc2vcf_targets(madc_file = alfalfa_lowercase, + output.file = out, + get_REF_ALT = TRUE, + botloci_file = alfalfa_botloci, + markers_info = alfalfa_markers_info, + verbose = FALSE) + ) vcf <- read.vcfR(out, verbose = FALSE) lut <- read.csv(alfalfa_markers_info) @@ -278,15 +282,16 @@ test_that("simu alfalfa",{ expect_equal(check$ALT, check$Alt) expect_equal(as.numeric(check$POS), check$Pos) dp <- extract.gt(vcf, "DP", as.numeric = TRUE) - expect_equal(sum(dp[,10]), 43691) + expect_equal(sum(dp[,10]), 43017) + expect_warning( madc2vcf_targets(madc_file = alfalfa_lowercase, output.file = out, get_REF_ALT = FALSE, botloci_file = alfalfa_botloci, markers_info = alfalfa_markers_info, verbose = FALSE) - + ) vcf <- read.vcfR(out, verbose = FALSE) lut <- read.csv(alfalfa_markers_info) vcf_infos <- vcf@fix[,c(1:5)] @@ -294,7 +299,7 @@ test_that("simu alfalfa",{ check <- cbind(vcf_infos,lut_infos) expect_equal(as.numeric(check$POS), check$Pos) dp <- extract.gt(vcf, "DP", as.numeric = TRUE) - expect_equal(sum(dp[,10]), 43691) + expect_equal(sum(dp[,10]), 43017) }) test_that("alfalfa IUPAC code", { From 291ae8e66a9d5cebb65fc2dac15bfae908adb652 Mon Sep 17 00:00:00 2001 From: Cristianetaniguti Date: Wed, 1 Apr 2026 10:38:45 -0400 Subject: [PATCH 30/80] add support for Others --- R/madc2vcf_all.R | 265 ++++++++++++++++++++++++++++++++++++-------- man/madc2vcf_all.Rd | 11 ++ 2 files changed, 230 insertions(+), 46 deletions(-) diff --git a/R/madc2vcf_all.R b/R/madc2vcf_all.R index 3ec95d4..ac20835 100644 --- a/R/madc2vcf_all.R +++ b/R/madc2vcf_all.R @@ -11,6 +11,10 @@ #' @param alignment_score_thr A numeric value specifying the minimum alignment score threshold. Default is 40. #' @param n.cores An integer specifying the number of cores to use for parallel processing. Default is 1. #' @param out_vcf A string specifying the name of the output VCF file. If the file extension is not `.vcf`, it will be appended automatically. +#' @param markers_info A string specifying the path to a CSV file with marker information (CloneID/BI_markerID, Chr, Pos, Ref, Alt, Type, Indel_pos columns as needed). +#' @param add_others A logical value. If TRUE, alleles labeled "Other" in the MADC file are included in off-target SNP extraction. Default is TRUE. +#' @param others_max_snps An integer or NULL. If not NULL, Other alleles with more than this many SNP differences versus the Ref sequence (as detected by pairwise alignment) are discarded. Default is NULL (no limit). +#' @param others_rm_with_indels A logical value. If TRUE, Other alleles that contain insertions or deletions relative to the Ref sequence (as detected by pairwise alignment) are discarded. Default is TRUE. #' @param verbose A logical value indicating whether to print metrics and progress to the console. Default is TRUE. #' #' @return This function does not return an R object. It writes the processed VCF file v4.3 to the specified `out_vcf` path. @@ -61,6 +65,9 @@ madc2vcf_all <- function(madc, alignment_score_thr = 40, out_vcf = NULL, markers_info = NULL, + add_others = TRUE, + others_max_snps = 5, + others_rm_with_indels = TRUE, verbose = TRUE){ vmsg("Checking inputs", verbose = verbose, level = 0, type = ">>") @@ -81,8 +88,11 @@ madc2vcf_all <- function(madc, ## out_vcf as string if(!is.null(out_vcf) & !is.character(out_vcf)) stop("out_vcf should be a string specifying the output file name.") - ## rm_multiallelic_SNP and verbose as logical + ## rm_multiallelic_SNP, add_others and verbose as logical if(!is.logical(rm_multiallelic_SNP)) stop("rm_multiallelic_SNP should be logical.") + if(!is.logical(add_others)) stop("add_others should be logical.") + if(!is.null(others_max_snps) && (!is.numeric(others_max_snps) || others_max_snps < 1)) stop("others_max_snps should be a positive integer or NULL.") + if(!is.logical(others_rm_with_indels)) stop("others_rm_with_indels should be logical.") if(!is.logical(verbose)) stop("verbose should be logical.") bigr_meta <- paste0('##BIGrCommandLine.madc2vcf_all= 1 | nchar(as.character(mi_df$Alt)) > 1, "Indel", "SNP") - vmsg("markers_info: 'Type' column not found. Derived from Ref/Alt lengths (%s SNPs, %s Indels).", + vmsg("markers_info: 'Type' column not found. Derived from Ref/Alt lengths (%s SNPs, %s Indels)", verbose = verbose, level = 1, sum(mi_df$Type == "SNP"), sum(mi_df$Type == "Indel")) } - vmsg("The MADC file contains indels and markers_info file was provided with all required columns. Target indels will be exported, but no off-targets are extracted from these tags due to higher likelihood of pairwise alignment errors.",verbose = verbose, level = 1, type = ">>") + vmsg("The MADC file contains indels and markers_info file was provided with all required columns",verbose = verbose, level = 1, type = ">>") + vmsg("Target indels will be exported, but no off-targets are extracted from these tags due to higher likelihood of pairwise alignment errors",verbose = verbose, level = 2, type = ">>") } } + vmsg("Inputs checks done!", verbose = verbose, level = 1, type = ">>") + + vmsg("Initial filters and inputs adjustments...", verbose = verbose, level = 0, type = ">>") if(checks$checks["LowerCase"]){ - vmsg("MADC Allele Sequences presented lower case characters. They were converted to upper case.", verbose = verbose, level = 1) + vmsg("MADC Allele Sequences presented lower case characters. They were converted to upper case", verbose = verbose, level = 1) report$AlleleSequence <- toupper(report$AlleleSequence) } if(!checks$checks["RefAltSeqs"] && is.null(hap_seq_file)){ - vmsg("Not all Ref sequences have a corresponding Alt or vice-verse. Provide hap_seq_file for this function to recover the missing tags or tags with missing pairs will be discarded.", verbose = verbose, level = 1) + vmsg("Not all Ref sequences have a corresponding Alt or vice-verse. Provide hap_seq_file for this function to recover the missing tags or tags with missing pairs will be discarded", verbose = verbose, level = 1) } botloci <- read.csv(botloci_file, header = F) @@ -193,10 +210,6 @@ madc2vcf_all <- function(madc, if(length(pad_width) != 1) warning("CloneIDs in the MADC report have inconsistent position padding widths. IDs in the VCF may be inconsistent.") pad_width <- pad_width[1] - vmsg("Input checks done!", verbose = verbose, level = 1, type = ">>") - - vmsg("Initial filters and inputs adjustments...", verbose = verbose, level = 0, type = ">>") - my_results_csv <- loop_though_dartag_report(report, botloci, hap_seq, @@ -205,6 +218,9 @@ madc2vcf_all <- function(madc, checks = checks, mi_df = mi_df, pad_width = pad_width, + add_others = add_others, + others_max_snps = others_max_snps, + others_rm_with_indels = others_rm_with_indels, verbose = verbose) vmsg("All information gathered!", verbose = verbose, level = 0, type = ">>") @@ -254,7 +270,9 @@ madc2vcf_all <- function(madc, #' #' @noRd loop_though_dartag_report <- function(report, botloci, hap_seq, n.cores=1, alignment_score_thr=40, - checks = NULL, mi_df = NULL, pad_width = NULL,verbose = TRUE){ + checks = NULL, mi_df = NULL, pad_width = NULL, + add_others = TRUE, others_max_snps = NULL, others_rm_with_indels = TRUE, + verbose = TRUE){ if(!is.null(hap_seq) & (is.null(checks) | !isTRUE(checks$checks["RefAltSeqs"]))){ hap_seq <- get_ref_alt_hap_seq(hap_seq, botloci) @@ -283,11 +301,11 @@ loop_though_dartag_report <- function(report, botloci, hap_seq, n.cores=1, align updated_by_cloneID <- lapply(add_ref_alt_results, "[[",1) if(!is.null(hap_seq)){ - vmsg("The haplotype database was provided and used to recover missing Ref_0001 and Alt_0002 sequences.", verbose = verbose, level = 1) + vmsg("The haplotype database was provided and used to recover missing Ref_0001 and Alt_0002 sequences", verbose = verbose, level = 1) vmsg("The Ref_0001 sequence were added for: %s tags", verbose = verbose, level = 2, type = ">>", sum(ref_index==1)) vmsg("The Alt_0002 sequence were added for: %s tags", verbose = verbose, level = 2, type = ">>", sum(alt_index==1)) } else { - vmsg("The haplotype database was not provided. Tags with missing Ref_0001 or Alt_0002 sequences were flagged with warnings and removed from the analysis.", verbose = verbose, level = 1) + vmsg("The haplotype database was not provided. Tags with missing Ref_0001 or Alt_0002 sequences were flagged with warnings and removed from the analysis", verbose = verbose, level = 1) } vmsg("Tags discarded due to lack of Ref_0001 sequence: %s tags", verbose = verbose, level = 2, type = ">>", sum(ref_index==-1)) vmsg("Tags discarded due to lack of Alt_0002 sequence: %s tags", verbose = verbose, level = 2, type = ">>", sum(alt_index==-1)) @@ -295,8 +313,10 @@ loop_though_dartag_report <- function(report, botloci, hap_seq, n.cores=1, align vmsg("Pairwise alignments of sequences to recover SNP position, reference and alternative bases...", verbose = verbose, level = 0) clust <- makeCluster(n.cores) #clusterExport(clust, c("botloci", "compare", "nucleotideSubstitutionMatrix", "pairwiseAlignment", "DNAString", "reverseComplement")) - #clusterExport(clust, c("botloci", "alignment_score_thr", "mi_df")) - compare_results <- parLapply(clust, updated_by_cloneID, function(x) compare(x, botloci, alignment_score_thr, mi_df, verbose = FALSE)) + #clusterExport(clust, c("botloci", "alignment_score_thr", "mi_df", "add_others", "others_max_snps", "others_rm_with_indels")) + compare_results <- parLapply(clust, updated_by_cloneID, function(x) compare(x, botloci, alignment_score_thr, mi_df, + add_others = add_others, others_max_snps = others_max_snps, + others_rm_with_indels = others_rm_with_indels, verbose = FALSE)) stopCluster(clust) my_results_csv <- lapply(compare_results, "[[", 1) @@ -308,6 +328,8 @@ loop_though_dartag_report <- function(report, botloci, hap_seq, n.cores=1, align rm_N <- unlist(rm_N) rm_indels <- sapply(compare_results, "[[", 4) rm_indels <- unlist(rm_indels) + n_rm_others_indels <- sum(sapply(compare_results, "[[", 5)) + n_rm_others_maxsnps <- sum(sapply(compare_results, "[[", 6)) vmsg("Number of tags removed because of low alignment score (threshold = %s): %s tags", verbose = verbose, level = 2, type = ">>", alignment_score_thr, length(rm_score)) vmsg("Number of tags removed because of N in the alternative sequence: %s tags", verbose = verbose, level = 2, type = ">>", length(rm_N)) @@ -320,6 +342,22 @@ loop_though_dartag_report <- function(report, botloci, hap_seq, n.cores=1, align } else { vmsg("Number of tags removed because of indels as targets: 0 tags", verbose = verbose, level = 2, type = ">>") } + n_others_total <- sum(sapply(compare_results, "[[", 7)) + n_others_kept <- n_others_total - n_rm_others_indels - n_rm_others_maxsnps + others_added_info <- unlist(lapply(compare_results, "[[", 8)) + if(add_others) { + vmsg("Number of Other alleles found: %s (%s kept after filters, %s discarded)", verbose = verbose, level = 2, type = ">>", n_others_total, n_others_kept, n_others_total - n_others_kept) + if(others_rm_with_indels) + vmsg("Number of Other alleles discarded due to indels vs Ref: %s", verbose = verbose, level = 2, type = ">>", n_rm_others_indels) + if(!is.null(others_max_snps)) + vmsg("Number of Other alleles discarded due to exceeding max SNPs (%s): %s", verbose = verbose, level = 2, type = ">>", others_max_snps, n_rm_others_maxsnps) + # if(length(others_added_info) > 0) { + # vmsg("Others tags added:", verbose = verbose, level = 3, type = ">>") + # for(msg in others_added_info) vmsg(" %s", verbose = verbose, level = 3, type = ">>", msg) + # } + } else { + vmsg("Number of Other alleles found: %s (not processed, add_others = FALSE)", verbose = verbose, level = 2, type = ">>", n_others_total) + } vmsg("Pairwise alignments concluded!", verbose = verbose, level = 1) @@ -416,10 +454,10 @@ add_ref_alt <- function(one_tag, hap_seq, nsamples, verbose = TRUE) { #' @importFrom pwalign pairwiseAlignment nucleotideSubstitutionMatrix #' #' @noRd -compare <- function(one_tag, botloci, alignment_score_thr = 40, mi_df= NULL, verbose = FALSE){ - # for(i in 1507:length(updated_by_cloneID)){ - # one_tag <- updated_by_cloneID[[i]] +compare <- function(one_tag, botloci, alignment_score_thr = 40, mi_df = NULL, add_others = TRUE, others_max_snps = NULL, others_rm_with_indels = TRUE, verbose = FALSE){ + #idx <- which(names(updated_by_cloneID) == "Ra01_020534029") + #one_tag <- updated_by_cloneID[[idx]] cloneID <- one_tag$CloneID[1] isBotLoci <- cloneID %in% botloci[,1] @@ -450,7 +488,11 @@ compare <- function(one_tag, botloci, alignment_score_thr = 40, mi_df= NULL, ver return(list(update_tag = update_tag, rm_score = NULL, rm_N = NULL, - rm_indels = NULL)) + rm_indels = NULL, + n_rm_others_indels = 0L, + n_rm_others_maxsnps = 0L, + n_others_total = 0L, + others_added_info = character(0))) } # If marker is present in the botloci list, get the reverse complement sequence @@ -480,7 +522,11 @@ compare <- function(one_tag, botloci, alignment_score_thr = 40, mi_df= NULL, ver return(list(update_tag = NULL, rm_score = NULL, rm_N = NULL, - rm_indels= cloneID)) + rm_indels = cloneID, + n_rm_others_indels = 0L, + n_rm_others_maxsnps = 0L, + n_others_total = 0L, + others_added_info = character(0))) } ref_base <- substring(ref_seq, align@pattern@mismatch@unlistData, align@pattern@mismatch@unlistData) alt_base <- substring(alt_seq, align@subject@mismatch@unlistData, align@subject@mismatch@unlistData) @@ -506,50 +552,126 @@ compare <- function(one_tag, botloci, alignment_score_thr = 40, mi_df= NULL, ver if(length(rm_target) >0) pos_ref_idx <- pos_ref_idx[-rm_target] # Cases found where the AltMatch is another alternative for the target SNP - they are discarted if(length(pos_ref_idx) >0){ - ref_base <- substring(ref_seq, pos_ref_idx, pos_ref_idx) + ref_base_match <- substring(ref_seq, pos_ref_idx, pos_ref_idx) pos_alt_idx <- align@subject@mismatch@unlistData # If there are indels, the position in the alternative is not the same as the reference if(length(rm_target) >0) pos_alt_idx <- pos_alt_idx[-rm_target] # remove target position when is AltMatch - but the order in the sequence is the same - alt_base <- substring(Match_seq[j,]$AlleleSequence, pos_alt_idx, pos_alt_idx) + alt_base_match <- substring(Match_seq[j,]$AlleleSequence, pos_alt_idx, pos_alt_idx) + + # If Match sequences have N, do not consider as polymorphism + if(any(!alt_base_match %in% c("A", "T", "C", "G"))) { + ref_base_match <- ref_base_match[-which(!alt_base_match %in% c("A", "T", "C", "G"))] + pos_ref_idx <- pos_ref_idx[-which(!alt_base_match %in% c("A", "T", "C", "G"))] + alt_base_match <- alt_base_match[-which(!alt_base_match %in% c("A", "T", "C", "G"))] + } + + if(length(alt_base_match) >0){ # If the N is the only polymorphis found, the Match tag will be discarted + # The reported position is always on reference + pos <- pos_target - (pos_target_idx - pos_ref_idx) + + # Sometimes there are more than one polymorphism in the sequence, we need to add rows to the table + update_tag_temp <- one_tag[grep("Match",one_tag$AlleleID)[j],][rep(1, length(alt_base_match)), ] + + update_tag_temp$Chromosome <- chr + update_tag_temp$SNP_position_in_Genome <- pos + update_tag_temp$Ref <- ref_base_match + update_tag_temp$Alt <- alt_base_match + + update_tag <- rbind(update_tag, update_tag_temp) + } + } + } + } + others_seq <- one_tag[grep("Other",one_tag$AlleleID),] + n_others_total <- nrow(others_seq) + n_rm_others_indels <- 0L + n_rm_others_maxsnps <- 0L + others_added_info <- character(0) + + if(add_others && nrow(others_seq) > 0){ + for(j in seq_len(nrow(others_seq))){ + align <- pairwiseAlignment(ref_seq, # Align with the reference + others_seq[j,]$AlleleSequence, + substitutionMatrix = sigma,gapOpening=-1.4, gapExtension=-0.1, type = "global") + # Filter: discard Others with indels relative to Ref + if(others_rm_with_indels && + (length(align@pattern@indel@unlistData) > 0 | length(align@subject@indel@unlistData) > 0)) { + n_rm_others_indels <- n_rm_others_indels + 1L + next + } + pos_ref_idx <- align@pattern@mismatch@unlistData + pos_alt_idx <- align@subject@mismatch@unlistData + # Filter: discard Others with too many SNPs vs Ref (count before removing target position) + if(!is.null(others_max_snps) && length(pos_ref_idx) > others_max_snps) { + n_rm_others_maxsnps <- n_rm_others_maxsnps + 1L + next + } + rm_target_other <- which(pos_ref_idx == pos_target_idx) # remove target position if base is the same as Ref or Alt + if(length(rm_target_other) > 0) { + other_tag_base <- substring(others_seq[j,]$AlleleSequence, pos_target_idx, pos_target_idx) + if(other_tag_base == ref_base | other_tag_base == alt_base){ # If Other has same base as REF and ALT, it won't be considered in their counts + pos_ref_idx <- pos_ref_idx[-rm_target_other] + pos_alt_idx <- pos_alt_idx[-rm_target_other] + } + } + other_ref_base <- substring(ref_seq, pos_ref_idx, pos_ref_idx) + other_alt_base <- substring(others_seq[j,]$AlleleSequence, pos_alt_idx, pos_alt_idx) + # Cases found where the AltMatch is another alternative for the target SNP - they are discarted + if(length(pos_ref_idx) >0){ # If Match sequences have N, do not consider as polymorphism - if(any(!alt_base %in% c("A", "T", "C", "G"))) { - ref_base <- ref_base[-which(!alt_base %in% c("A", "T", "C", "G"))] - pos_ref_idx <- pos_ref_idx[-which(!alt_base %in% c("A", "T", "C", "G"))] - alt_base <- alt_base[-which(!alt_base %in% c("A", "T", "C", "G"))] + if(any(!other_alt_base %in% c("A", "T", "C", "G"))) { + other_ref_base <- other_ref_base[-which(!other_alt_base %in% c("A", "T", "C", "G"))] + pos_ref_idx <- pos_ref_idx[-which(!other_alt_base %in% c("A", "T", "C", "G"))] + other_alt_base <- other_alt_base[-which(!other_alt_base %in% c("A", "T", "C", "G"))] } - if(length(alt_base) >0){ # If the N is the only polymorphis found, the Match tag will be discarted + if(length(other_alt_base) >0){ # If the N is the only polymorphis found, the Match tag will be discarted # The reported position is always on reference pos <- pos_target - (pos_target_idx - pos_ref_idx) # Sometimes there are more than one polymorphism in the sequence, we need to add rows to the table - update_tag_temp <- one_tag[grep("Match",one_tag$AlleleID)[j],][rep(1, length(alt_base)), ] + update_tag_temp <- one_tag[grep("Other",one_tag$AlleleID)[j],][rep(1, length(other_alt_base)), ] update_tag_temp$Chromosome <- chr update_tag_temp$SNP_position_in_Genome <- pos - update_tag_temp$Ref <- ref_base - update_tag_temp$Alt <- alt_base + update_tag_temp$Ref <- other_ref_base + update_tag_temp$Alt <- other_alt_base update_tag <- rbind(update_tag, update_tag_temp) + others_added_info <- c(others_added_info, + paste0(others_seq[j,]$AlleleID, " -> position(s): ", paste(pos, collapse = ", "))) } } } } + return(list(update_tag = update_tag, # updated data.frame, NULL if discarted rm_score = NULL, # cloneID if removed because of low alignment score, NULL if kept rm_N = NULL, - rm_indels = NULL)) # cloneID if removed because of N in the target alternative, NULL if kept + rm_indels = NULL, + n_rm_others_indels = n_rm_others_indels, + n_rm_others_maxsnps = n_rm_others_maxsnps, + n_others_total = n_others_total, + others_added_info = others_added_info)) } else { return(list(update_tag = NULL, rm_score = NULL, rm_N = cloneID, - rm_indels = NULL)) + rm_indels = NULL, + n_rm_others_indels = 0L, + n_rm_others_maxsnps = 0L, + n_others_total = 0L, + others_added_info = character(0))) } } else{ return(list(update_tag = NULL, rm_score = cloneID, rm_N = NULL, - rm_indels = NULL)) + rm_indels = NULL, + n_rm_others_indels = 0L, + n_rm_others_maxsnps = 0L, + n_others_total = 0L, + others_added_info = character(0))) } } @@ -636,10 +758,12 @@ create_VCF_body <- function(csv, rm_filter <- sapply(vcf_tag_list, "[[", 5) # removed because empty after filtering kept_multi <- sapply(vcf_tag_list, "[[", 6) # kept as multiallelic simplified <- sapply(vcf_tag_list, "[[", 7) # simplified to biallelic + multi_others_target <- sapply(vcf_tag_list, "[[", 8) # multiallelic target from Others vmsg("Performing final filterings", verbose = verbose, level = 0, type = ">>") vmsg("Multiallelic off-target SNPs found: %s", verbose = verbose, level = 2, type = ">>", sum(total_mks)) + vmsg("Multiallelic target SNPs with a 3rd allele from Others: %s", verbose = verbose, level = 2, type = ">>", sum(multi_others_target)) if(rm_multiallelic_SNP) { vmsg("Removed (rm_multiallelic_SNP = TRUE): %s", verbose = verbose, level = 3, type = ">>", sum(rm_setting)) } else if(multiallelic_SNP_dp_thr > 0 & multiallelic_SNP_sample_thr > 0) { @@ -668,7 +792,7 @@ create_VCF_body <- function(csv, if(length(which(duplicated(vcf_body[,3]))) > 0){ repeated <- vcf_body[which(duplicated(vcf_body[,3])), 4] - vmsg("Different primers pair capture same SNP positions in %s locations. The repeated were discarded.", verbose = verbose, level = 2, length(repeated)) + vmsg("Different primers pair capture same SNP positions in %s locations. The repeated were discarded", verbose = verbose, level = 2, type = ">>", length(repeated)) repeated_tab <- vcf_body[which(vcf_body[,4] %in% repeated),] vcf_body_new <- vcf_body[-which(vcf_body[,4] %in% repeated),] @@ -711,31 +835,80 @@ create_VCF_body <- function(csv, #' aspect of the marker, the marker is discarded. This is likely to happen to paralogous sites. #' #' @noRd -merge_counts <- function(cloneID_unit, rm_multiallelic_SNP = FALSE, multiallelic_SNP_dp_thr = 0, multiallelic_SNP_sample_thr = 0, pad_width = NULL){ - +merge_counts <- function(cloneID_unit, rm_multiallelic_SNP = FALSE, multiallelic_SNP_dp_thr = 0, + multiallelic_SNP_sample_thr = 0, pad_width = NULL){ + #cloneID_unit <- cloneID[[250]] #Get counts for target SNP rm_by_setting <- 0 # removed because rm_multiallelic_SNP = TRUE rm_by_filter <- 0 # removed because empty after threshold filtering kept_multiallelic <- 0 # kept as-is (still multiallelic after filtering or no filter) simplified <- 0 # simplified from multiallelic to biallelic by filtering total_multiallelic <- 0 - RefTag <- apply(cloneID_unit[which(grepl("Ref", cloneID_unit$AlleleID) & !duplicated(cloneID_unit$AlleleID)),-c(1:7)], 2, sum) - AltTag <- apply(cloneID_unit[which(grepl("Alt", cloneID_unit$AlleleID) & !duplicated(cloneID_unit$AlleleID)),-c(1:7)], 2, sum) - tab_counts <- paste0(RefTag + AltTag, ":", RefTag, ":", RefTag, ",", AltTag) + multiallelic_others_target <- 0 # target SNPs with a 3rd allele from Others + + # Target marker + RefTag <- apply(cloneID_unit[which((grepl("Ref_0001$", cloneID_unit$AlleleID) | grepl("RefMatch", cloneID_unit$AlleleID)) & !duplicated(cloneID_unit$AlleleID)), -c(1:7)], 2, sum) + AltTag <- apply(cloneID_unit[which((grepl("Alt_0002$", cloneID_unit$AlleleID) | grepl("AltMatch", cloneID_unit$AlleleID)) & !duplicated(cloneID_unit$AlleleID)), -c(1:7)], 2, sum) cloneID <- cloneID_unit$CloneID[1] if(is.null(pad_width)) pad_width <- nchar(sub(".*_", "", cloneID)) - info <- cloneID_unit[grep("Ref_", cloneID_unit$AlleleID),] + info <- cloneID_unit[grep("Ref_0001$", cloneID_unit$AlleleID),] + + # In case there are Others that add multiallelics to targets + others_target <- cloneID_unit[,3] %in% cloneID_unit[grep("Ref_0001$", cloneID_unit$AlleleID),3] + if(sum(others_target) > 2 & !rm_multiallelic_SNP){ # If target is multiallelic + multiallelic_others_target <- 1 + idx_other <- which(others_target & !grepl("Ref_0001$", cloneID_unit$AlleleID) & !grepl("Alt_0002$", cloneID_unit$AlleleID)) + other_alts <- unique(cloneID_unit[idx_other,5]) + other_alts_info <- cloneID_unit[idx_other,] + OtherTag_list <- list() + total <- rep(0, length(RefTag)) + ads <- vector() + tab_counts <- paste0(RefTag + AltTag + total, ":", RefTag, ":", RefTag, ",", AltTag) + for(j in 1:length(other_alts)){ + temp_other <- which(other_alts_info[,5] == other_alts[j]) + OtherTag_list[[j]] <- apply(other_alts_info[temp_other, -c(1:7)], 2, sum) + total_temp <- OtherTag_list[[j]] + + if(multiallelic_SNP_dp_thr > 0 & multiallelic_SNP_sample_thr > 0){ # If not removed, user can set threshold to remove low frequency alleles + if(sum(total_temp > multiallelic_SNP_dp_thr) < multiallelic_SNP_sample_thr) next() + } + total <- total + total_temp + tab_counts <- paste0(tab_counts, ",",OtherTag_list[[j]]) + ads_temp <- sum(OtherTag_list[[j]]) + ads <- paste0(ads, ",", ads_temp) + } + alts <- paste0(info$Alt, ",", paste0(other_alts, collapse = ",")) + info_mk <- paste0("DP=", sum(c(RefTag, AltTag,total)),";", + "ADS=",sum(RefTag),",",sum(AltTag), ads) + } else { + tab_counts <- paste0(RefTag + AltTag, ":", RefTag, ":", RefTag, AltTag) + alts <- info$Alt + info_mk <- paste0("DP=", sum(c(RefTag, AltTag)),";", + "ADS=",sum(RefTag),",",sum(AltTag)) + } + info <- c(info$Chromosome, info$SNP_position_in_Genome, cloneID, - info$Ref, info$Alt, ".", ".", paste0("DP=", sum(c(RefTag, AltTag)),";", - "ADS=",sum(RefTag),",",sum(AltTag)), "DP:RA:AD") + info$Ref, + alts, + ".", + ".", + info_mk, + "DP:RA:AD") vcf_tag <- c(info, tab_counts) # Check if there are more than one alternative allele by loci - off_tag <- cloneID_unit[-which(grepl("Ref_", cloneID_unit$AlleleID) | grepl("Alt_", cloneID_unit$AlleleID)),] + rm_tags <- which(grepl("Ref_0001$", cloneID_unit$AlleleID) | grepl("Alt_0002$", cloneID_unit$AlleleID)) + if(sum(others_target) > 2){ + idx_other <- which(others_target & !grepl("Ref_0001$", cloneID_unit$AlleleID) & !grepl("Alt_0002$", cloneID_unit$AlleleID)) + off_tag <- cloneID_unit[-c(rm_tags,idx_other),] + } else { + off_tag <- cloneID_unit[-rm_tags,] + } + if(nrow(off_tag)){ # If there are off target SNP by_pos <- split.data.frame(off_tag, off_tag$SNP_position_in_Genome) @@ -818,5 +991,5 @@ merge_counts <- function(cloneID_unit, rm_multiallelic_SNP = FALSE, multiallelic } } - return(list(vcf_tag, rm_by_setting + rm_by_filter, total_multiallelic, rm_by_setting, rm_by_filter, kept_multiallelic, simplified)) + return(list(vcf_tag, rm_by_setting + rm_by_filter, total_multiallelic, rm_by_setting, rm_by_filter, kept_multiallelic, simplified, multiallelic_others_target)) } diff --git a/man/madc2vcf_all.Rd b/man/madc2vcf_all.Rd index ac2de3f..a1e8c92 100644 --- a/man/madc2vcf_all.Rd +++ b/man/madc2vcf_all.Rd @@ -15,6 +15,9 @@ madc2vcf_all( alignment_score_thr = 40, out_vcf = NULL, markers_info = NULL, + add_others = TRUE, + others_max_snps = 5, + others_rm_with_indels = TRUE, verbose = TRUE ) } @@ -37,6 +40,14 @@ madc2vcf_all( \item{out_vcf}{A string specifying the name of the output VCF file. If the file extension is not \code{.vcf}, it will be appended automatically.} +\item{markers_info}{A string specifying the path to a CSV file with marker information (CloneID/BI_markerID, Chr, Pos, Ref, Alt, Type, Indel_pos columns as needed).} + +\item{add_others}{A logical value. If TRUE, alleles labeled "Other" in the MADC file are included in off-target SNP extraction. Default is TRUE.} + +\item{others_max_snps}{An integer or NULL. If not NULL, Other alleles with more than this many SNP differences versus the Ref sequence (as detected by pairwise alignment) are discarded. Default is NULL (no limit).} + +\item{others_rm_with_indels}{A logical value. If TRUE, Other alleles that contain insertions or deletions relative to the Ref sequence (as detected by pairwise alignment) are discarded. Default is TRUE.} + \item{verbose}{A logical value indicating whether to print metrics and progress to the console. Default is TRUE.} } \value{ From 84852dabc10214a50ffc8840a6999db1e4c1dab1 Mon Sep 17 00:00:00 2001 From: Cristianetaniguti Date: Wed, 1 Apr 2026 10:44:36 -0400 Subject: [PATCH 31/80] up version --- DESCRIPTION | 2 +- NEWS.md | 14 ++++++++++++++ 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index bf782ad..f395fbb 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: BIGr Title: Breeding Insight Genomics Functions for Polyploid and Diploid Species -Version: 0.6.5 +Version: 0.6.6 Authors@R: c(person(given='Alexander M.', family='Sandercock', email='sandercock.alex@gmail.com', diff --git a/NEWS.md b/NEWS.md index 8037725..6167003 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,17 @@ +# BIGr 0.6.6 + +# Updates on `madc2vcf_all` + +- New arguments for controlling processing of `Other` alleles: + - `add_others`: if `TRUE` (default), alleles labeled "Other" in the MADC are included in off-target SNP extraction + - `others_max_snps`: discards Other alleles with more than this many SNP differences relative to the Ref sequence (default: 5) + - `others_rm_with_indels`: discards Other alleles containing insertions or deletions relative to the Ref sequence (default: `TRUE`) +- Others alleles that carry a different base at the target SNP position are now reported as a 3rd allele in the VCF instead of being silently dropped +- Target position is now correctly removed from Others alignments, preventing duplicate VCF positions and marker IDs +- Fixed a bug where Others alleles with "Ref_" or "Alt_" in their AlleleID would corrupt the target SNP REF/ALT fields and read depth counts in `merge_counts` +- Improved verbose messages throughout: counts of Other alleles found, kept, and discarded (by indel filter and by max SNP filter) are now reported; multiallelic target SNPs with a 3rd allele from Others are counted and reported +- Debug-level message (level 3) listing each Other allele added and its genomic position + # BIGr 0.6.5 # Updates on madc2vcf functions From 96a4ed14a62efa8f0cfe3f925d8c0bda0a216ccc Mon Sep 17 00:00:00 2001 From: Cristianetaniguti Date: Wed, 1 Apr 2026 14:31:23 -0400 Subject: [PATCH 32/80] add madc2vcf_multi --- DESCRIPTION | 3 +- NAMESPACE | 7 + NEWS.md | 18 +++ R/check_madc_sanity.R | 5 +- R/get_countsMADC.R | 10 +- R/madc2vcf_multi.R | 173 ++++++++++++++++++++++++ man/madc2vcf_multi.Rd | 59 ++++++++ tests/testthat/test-check_madc_sanity.R | 2 +- tests/testthat/test-check_ped.R | 2 +- tests/testthat/test-madc2vcf_all.R | 34 ++--- tests/testthat/test-madc2vcf_multi.R | 144 ++++++++++++++++++++ tests/testthat/test-madc2vcf_targets.R | 6 +- 12 files changed, 433 insertions(+), 30 deletions(-) create mode 100644 R/madc2vcf_multi.R create mode 100644 man/madc2vcf_multi.Rd create mode 100644 tests/testthat/test-madc2vcf_multi.R diff --git a/DESCRIPTION b/DESCRIPTION index f395fbb..255ec4f 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: BIGr Title: Breeding Insight Genomics Functions for Polyploid and Diploid Species -Version: 0.6.6 +Version: 0.6.7 Authors@R: c(person(given='Alexander M.', family='Sandercock', email='sandercock.alex@gmail.com', @@ -69,5 +69,6 @@ Suggests: rmdformats, knitr (>= 1.10), rmarkdown, + polyRAD, testthat (>= 3.0.0) RdMacros: Rdpack diff --git a/NAMESPACE b/NAMESPACE index 6ecaef4..f3a2e76 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -16,6 +16,7 @@ export(get_countsMADC) export(imputation_concordance) export(madc2gmat) export(madc2vcf_all) +export(madc2vcf_multi) export(madc2vcf_targets) export(merge_MADCs) export(solve_composition_poly) @@ -39,11 +40,17 @@ importFrom(Rdpack,reprompt) importFrom(Rsamtools,bgzip) importFrom(dplyr,"%>%") importFrom(dplyr,across) +importFrom(dplyr,case_when) +importFrom(dplyr,filter) importFrom(dplyr,group_by) importFrom(dplyr,mutate) importFrom(dplyr,select) importFrom(dplyr,summarise) importFrom(dplyr,where) +importFrom(polyRAD,IterateHWE) +importFrom(polyRAD,RADdata2VCF) +importFrom(polyRAD,TestOverdispersion) +importFrom(polyRAD,readDArTag) importFrom(pwalign,nucleotideSubstitutionMatrix) importFrom(pwalign,pairwiseAlignment) importFrom(readr,read_csv) diff --git a/NEWS.md b/NEWS.md index 6167003..0eff4c3 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,21 @@ +# BIGr 0.6.7 + +# New function `madc2vcf_multi` + +- New function `madc2vcf_multi` to convert a DArTag MADC file to a VCF using the polyRAD pipeline for multiallelic genotyping +- Runs `check_madc_sanity` before loading the data and stops with informative errors if: + - Required columns are missing + - IUPAC (non-ATCG) codes are present in AlleleSequence + - Ref/Alt sequences are unpaired (`RefAltSeqs = FALSE`) + - Allele IDs have not been fixed by HapApp (`FixAlleleIDs = FALSE`) + - CloneIDs do not follow `Chr_Pos` format and no `markers_info` is provided +- New argument `markers_info`: optional path or URL to a CSV with `CloneID`/`BI_markerID`, `Chr`, and `Pos` columns; required when CloneIDs do not follow the `Chr_Pos` format +- Runs `check_botloci` to validate and reconcile CloneIDs between the MADC and botloci file, automatically fixing padding mismatches +- A corrected temp file is written and passed to `readDArTag` only when needed (all-NA rows/columns detected, CloneIDs remapped by `check_botloci`, or botloci IDs remapped) +- Accepts paths or URLs for `madc_file`, `botloci_file`, and `markers_info` +- Estimates overdispersion with `polyRAD::TestOverdispersion`, iterates priors with `polyRAD::IterateHWE`, and exports the result with `polyRAD::RADdata2VCF` +- `polyRAD` is a soft dependency (listed under `Suggests`); an informative error is raised if it is not installed + # BIGr 0.6.6 # Updates on `madc2vcf_all` diff --git a/R/check_madc_sanity.R b/R/check_madc_sanity.R index c36a92e..2248779 100644 --- a/R/check_madc_sanity.R +++ b/R/check_madc_sanity.R @@ -129,10 +129,11 @@ check_madc_sanity <- function(report) { # Complex indel: same length but >1 character difference between sequences same_len <- cmp_ok & (ref_len == alt_len) if (any(same_len)) { - n_diffs <- mapply(function(r, a) { + n_diffs <- mapply(function(r, a) { r_chars <- strsplit(r, "")[[1]] a_chars <- strsplit(a, "")[[1]] - sum(toupper(r_chars) != toupper(a_chars)) + standard <- toupper(r_chars) %in% c("A","T","C","G") & toupper(a_chars) %in% c("A","T","C","G") + sum(toupper(r_chars[standard]) != toupper(a_chars[standard])) }, merged$AlleleSequence_ref[same_len], merged$AlleleSequence_alt[same_len]) indel_mask[same_len] <- n_diffs > 1 } diff --git a/R/get_countsMADC.R b/R/get_countsMADC.R index 58a2496..396004a 100644 --- a/R/get_countsMADC.R +++ b/R/get_countsMADC.R @@ -177,7 +177,7 @@ get_countsMADC <- function(madc_file = NULL, madc_object = NULL, collapse_matche #' count columns, etc.). #' #' @importFrom dplyr mutate group_by summarise across where select -#' @importFrom dplyr %>% +#' @importFrom dplyr %>% filter case_when #' #' @keywords internal get_counts <- function(madc_file = NULL, madc_object = NULL, collapse_matches_counts = FALSE, verbose = TRUE) { @@ -224,11 +224,11 @@ get_counts <- function(madc_file = NULL, madc_object = NULL, collapse_matches_co if(collapse_matches_counts){ filtered_df <- madc_df[order(madc_df$AlleleID),] %>% # Keep only Ref/Alt alleles and their Match variants; drop other allele types - dplyr::filter(grepl("\\|(Ref|Alt)(Match)?\\b", AlleleID)) %>% + filter(grepl("\\|(Ref|Alt)(Match)?(_|$)", AlleleID)) %>% mutate( - Type = dplyr::case_when( - grepl("\\|Alt(Match)?\\b", AlleleID) ~ "Alt", - grepl("\\|Ref(Match)?\\b", AlleleID) ~ "Ref" + Type = case_when( + grepl("\\|Alt(Match)?(_|$)", AlleleID) ~ "Alt", + grepl("\\|Ref(Match)?(_|$)", AlleleID) ~ "Ref" ) ) %>% group_by(CloneID, Type) %>% diff --git a/R/madc2vcf_multi.R b/R/madc2vcf_multi.R new file mode 100644 index 0000000..6e86b45 --- /dev/null +++ b/R/madc2vcf_multi.R @@ -0,0 +1,173 @@ +#' Convert MADC file to VCF using polyRAD for multiallelic genotyping +#' +#' This function converts a DArTag MADC file to a VCF using the polyRAD package's +#' `readDArTag` and `RADdata2VCF` pipeline. It runs `check_madc_sanity` before +#' loading the data, applies corrections for lowercase sequences and all-NA +#' rows/columns, and sets `n.header.rows` automatically based on whether the +#' MADC file follows the raw DArT format (6 header rows) or the fixed allele ID +#' format (no header rows). +#' +#' @param madc_file character. Path or URL to the input MADC CSV file. +#' @param botloci_file character. Path or URL to the botloci file listing target +#' IDs designed on the bottom strand. +#' @param outfile character. Path for the output VCF file. +#' @param markers_info character or NULL. Optional path or URL to a CSV file +#' with marker metadata. Required when CloneIDs do not follow the +#' \code{Chr_Pos} format; must contain \code{CloneID} (or +#' \code{BI_markerID}), \code{Chr}, and \code{Pos} columns. +#' @param ploidy integer. Ploidy level of the samples passed to \code{taxaPloidy}. +#' Default is 2. +#' @param verbose logical. Whether to print progress messages. Default is TRUE. +#' +#' @return Invisible NULL. Writes a VCF file to \code{outfile}. +#' +#' @details +#' The function performs the following steps: +#' \enumerate{ +#' \item Reads the MADC file and runs \code{check_madc_sanity}. +#' \item Validates the botloci file against MADC CloneIDs using +#' \code{check_botloci}, fixing any padding mismatches automatically. +#' \item Converts lowercase bases in AlleleSequence to uppercase if detected. +#' \item Removes all-NA rows and columns if detected. +#' \item Writes the corrected data to a temporary file and passes it to +#' \code{polyRAD::readDArTag}. +#' \item Estimates overdispersion with \code{polyRAD::TestOverdispersion} and +#' calls \code{polyRAD::IterateHWE}, then exports the result with +#' \code{polyRAD::RADdata2VCF}. +#' } +#' +#' @importFrom utils read.csv write.csv read.table +#' +#' @export +madc2vcf_multi <- function(madc_file, + botloci_file, + outfile, + markers_info = NULL, + ploidy = 2L, + verbose = TRUE) { + + vmsg("Checking inputs", verbose = verbose, level = 0, type = ">>") + + if (!(file.exists(madc_file) | url_exists(madc_file))) stop("MADC file not found. Please provide a valid path or URL.") + if (!(file.exists(botloci_file) | url_exists(botloci_file))) stop("Botloci file not found. Please provide a valid path or URL.") + if (!is.null(markers_info) && !(file.exists(markers_info) | url_exists(markers_info))) stop("markers_info file not found. Please provide a valid path or URL.") + if (!is.numeric(ploidy) || ploidy < 1) stop("ploidy must be a positive integer.") + + # ---- Load markers_info if provided ---- + mi_df <- if (!is.null(markers_info)) read.csv(markers_info) else NULL + + # ---- Read and sanity-check ---- + report <- read.csv(madc_file, check.names = FALSE) + checks <- check_madc_sanity(report) + + messages_results <- mapply(function(check, message) { + if (check) message[1] else message[2] + }, checks$checks, checks$messages) + + for (i in seq_along(messages_results)) + vmsg(messages_results[i], verbose = verbose, level = 1, type = ">>") + + if (!checks$checks["Columns"]) + stop("The MADC file is missing required columns (CloneID, AlleleID, AlleleSequence)") + + if (checks$checks["IUPACcodes"]) + stop("MADC Allele Sequences contain IUPAC (non-ATCG) codes. Please run HapApp to clean MADC file before using this function.") + + if (!isTRUE(checks$checks["RefAltSeqs"])) + stop("Not all Ref sequences have a corresponding Alt or vice versa. Please provide a complete MADC file before using this function.") + + if (!isTRUE(checks$checks["FixAlleleIDs"])) + stop("The MADC file does not have fixed AlleleIDs. Please process the MADC file through HapApp before using this function.") + + if (!isTRUE(checks$checks["ChromPos"])) { + if (is.null(markers_info)) + stop("CloneID column does not follow the 'Chr_Pos' format. ", + "Please provide a markers_info file with at least 'CloneID'/'BI_markerID', ", + "'Chr', and 'Pos' columns.") + if (!all(c("Chr", "Pos") %in% colnames(mi_df))) + stop("CloneID column does not follow the 'Chr_Pos' format. ", + "markers_info must contain at least 'Chr' and 'Pos' columns to remap marker IDs.") + } + + # ---- Check botloci vs MADC CloneIDs ---- + vmsg("Checking botloci file", verbose = verbose, level = 0, type = ">>") + cloneids_before <- report$CloneID + botloci_df <- read.table(botloci_file, header = FALSE) + botloci_before <- botloci_df$V1 + checked_botloci <- check_botloci(botloci_df, report, ChromPos = checks$checks["ChromPos"], mi_df = mi_df, verbose = verbose) + botloci_df <- checked_botloci[[1]] + report <- checked_botloci[[2]] + mi_df <- checked_botloci[[3]] + cloneid_changed <- !identical(report$CloneID, cloneids_before) + botloci_changed <- !identical(botloci_df$V1, botloci_before) + + # ---- Botloci temp file (if IDs were remapped) ---- + if (botloci_changed) { + tmp_botloci <- tempfile() + on.exit(unlink(tmp_botloci), add = TRUE) + write.table(botloci_df, tmp_botloci, row.names = FALSE, col.names = FALSE, quote = FALSE) + botloci_input <- tmp_botloci + } else { + botloci_input <- botloci_file + } + + # ---- Corrections: only create a temp file if needed ---- + need_temp <- isTRUE(checks$checks["allNArow"]) || isTRUE(checks$checks["allNAcol"]) || cloneid_changed + + if (need_temp) { + if (checks$checks["LowerCase"]) { + vmsg("MADC Allele Sequences contain lowercase characters. Converting to uppercase", + verbose = verbose, level = 1, type = ">>") + report$AlleleSequence <- toupper(report$AlleleSequence) + } + + if (checks$checks["allNArow"]) { + idx <- apply(report, 1, function(x) all(is.na(x) | x == "")) + vmsg("Removing %s all-NA row(s)", verbose = verbose, level = 1, type = ">>", sum(idx)) + report <- report[!idx, ] + } + + if (checks$checks["allNAcol"]) { + idx <- apply(report, 2, function(x) all(is.na(x) | x == "")) + vmsg("Removing %s all-NA column(s)", verbose = verbose, level = 1, type = ">>", sum(idx)) + report <- report[, !idx] + } + + tmp_madc <- tempfile(fileext = ".csv") + on.exit(unlink(tmp_madc), add = TRUE) + write.csv(report, tmp_madc, row.names = FALSE, quote = TRUE) + input_file <- tmp_madc + } else { + if (checks$checks["LowerCase"]) + vmsg("MADC Allele Sequences contain lowercase characters. polyRAD will handle them", + verbose = verbose, level = 1, type = ">>") + input_file <- madc_file + } + + vmsg("Loading MADC into polyRAD", verbose = verbose, level = 0, type = ">>") + + raddat <- polyRAD::readDArTag( + file = input_file, + botloci = botloci_input, + n.header.rows = 0L, + sample.name.row = 1, + trim.sample.names = "", + taxaPloidy = as.integer(ploidy) + ) + + overdispersionP <- polyRAD::TestOverdispersion(raddat) + my_ovdisp <- overdispersionP$optimal + + vmsg("Running HWE iteration (overdispersion = %s)", verbose = verbose, level = 0, type = ">>", my_ovdisp) + + raddat_hwe <- polyRAD::IterateHWE(raddat, overdispersion = my_ovdisp) + + vmsg("Writing VCF to %s", verbose = verbose, level = 0, type = ">>", outfile) + + polyRAD::RADdata2VCF(raddat_hwe, file = outfile, asSNPs = FALSE, hindhe = FALSE) + + vmsg("Done!", verbose = verbose, level = 0, type = ">>") + + invisible(NULL) +} + diff --git a/man/madc2vcf_multi.Rd b/man/madc2vcf_multi.Rd new file mode 100644 index 0000000..5c3a777 --- /dev/null +++ b/man/madc2vcf_multi.Rd @@ -0,0 +1,59 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/madc2vcf_multi.R +\name{madc2vcf_multi} +\alias{madc2vcf_multi} +\title{Convert MADC file to VCF using polyRAD for multiallelic genotyping} +\usage{ +madc2vcf_multi( + madc_file, + botloci_file, + outfile, + markers_info = NULL, + ploidy = 2L, + verbose = TRUE +) +} +\arguments{ +\item{madc_file}{character. Path or URL to the input MADC CSV file.} + +\item{botloci_file}{character. Path or URL to the botloci file listing target +IDs designed on the bottom strand.} + +\item{outfile}{character. Path for the output VCF file.} + +\item{markers_info}{character or NULL. Optional path or URL to a CSV file +with marker metadata. Required when CloneIDs do not follow the +\code{Chr_Pos} format; must contain \code{CloneID} (or +\code{BI_markerID}), \code{Chr}, and \code{Pos} columns.} + +\item{ploidy}{integer. Ploidy level of the samples passed to \code{taxaPloidy}. +Default is 2.} + +\item{verbose}{logical. Whether to print progress messages. Default is TRUE.} +} +\value{ +Invisible NULL. Writes a VCF file to \code{outfile}. +} +\description{ +This function converts a DArTag MADC file to a VCF using the polyRAD package's +\code{readDArTag} and \code{RADdata2VCF} pipeline. It runs \code{check_madc_sanity} before +loading the data, applies corrections for lowercase sequences and all-NA +rows/columns, and sets \code{n.header.rows} automatically based on whether the +MADC file follows the raw DArT format (6 header rows) or the fixed allele ID +format (no header rows). +} +\details{ +The function performs the following steps: +\enumerate{ +\item Reads the MADC file and runs \code{check_madc_sanity}. +\item Validates the botloci file against MADC CloneIDs using +\code{check_botloci}, fixing any padding mismatches automatically. +\item Converts lowercase bases in AlleleSequence to uppercase if detected. +\item Removes all-NA rows and columns if detected. +\item Writes the corrected data to a temporary file and passes it to +\code{polyRAD::readDArTag}. +\item Estimates overdispersion with \code{polyRAD::TestOverdispersion} and +calls \code{polyRAD::IterateHWE}, then exports the result with +\code{polyRAD::RADdata2VCF}. +} +} diff --git a/tests/testthat/test-check_madc_sanity.R b/tests/testthat/test-check_madc_sanity.R index a185997..d0c45cf 100644 --- a/tests/testthat/test-check_madc_sanity.R +++ b/tests/testthat/test-check_madc_sanity.R @@ -15,7 +15,7 @@ test_that("check madc",{ report <- read.csv(paste0(github_path,"/alfalfa_lowercase.csv")) res <- check_madc_sanity(report) - exp <- c(TRUE, TRUE, FALSE, TRUE, FALSE, TRUE, FALSE, FALSE, TRUE, FALSE) + exp <- c(TRUE, TRUE, FALSE, TRUE, FALSE, TRUE, FALSE, FALSE, FALSE, FALSE) names(exp) <- names expect_equal(res$checks, exp) diff --git a/tests/testthat/test-check_ped.R b/tests/testthat/test-check_ped.R index fdf69e0..706143f 100644 --- a/tests/testthat/test-check_ped.R +++ b/tests/testthat/test-check_ped.R @@ -15,7 +15,7 @@ test_that("test imputation",{ messy_parents <- output.list$messy_parents missing_parents <- output.list$missing_parents - expect_true(df_length == 4) + expect_true(df_length == 5) # Before was 4 expect_true(all(messy_parents$id == c("grandfather2","grandfather3"))) expect_true(nrow(missing_parents) == 13) diff --git a/tests/testthat/test-madc2vcf_all.R b/tests/testthat/test-madc2vcf_all.R index f88adc9..c8c860f 100644 --- a/tests/testthat/test-madc2vcf_all.R +++ b/tests/testthat/test-madc2vcf_all.R @@ -22,7 +22,7 @@ test_that("test madc offtargets",{ multiallelic_SNP_sample_thr = 0, alignment_score_thr = 40, out_vcf = temp, - verbose = TRUE) + verbose = FALSE) set.seed(456) madc2vcf_all(madc = madc_file, @@ -34,7 +34,7 @@ test_that("test madc offtargets",{ multiallelic_SNP_sample_thr = 0, alignment_score_thr = 40, out_vcf = temp_multi, - verbose = TRUE) + verbose = FALSE) vcf <- read.vcfR(temp) vcf_multi <- read.vcfR(temp_multi) @@ -56,7 +56,7 @@ test_that("test madc offtargets",{ multiallelic_SNP_dp_thr = 0, multiallelic_SNP_sample_thr = 0, out_vcf = temp, - verbose = TRUE) + verbose = FALSE) vcf <- read.vcfR(temp) @@ -111,7 +111,7 @@ test_that("simu alfalfa",{ multiallelic_SNP_dp_thr = 0, alignment_score_thr = 40, out_vcf = out, - verbose = TRUE) + verbose = FALSE) ) vcf <- read.vcfR(out, verbose = FALSE) expect_s4_class(vcf, "vcfR") @@ -131,7 +131,7 @@ test_that("simu alfalfa",{ hap_seq_file = NULL, n.cores = 1, out_vcf = out, - verbose = TRUE) + verbose = FALSE) ) vcf <- read.vcfR(out, verbose = FALSE) expect_s4_class(vcf, "vcfR") @@ -163,7 +163,7 @@ test_that("simu alfalfa",{ n.cores = 1, markers_info = alfalfa_markers_info, out_vcf = out, - verbose = TRUE) + verbose = FALSE) vcf <- read.vcfR(out, verbose = FALSE) expect_s4_class(vcf, "vcfR") @@ -184,7 +184,7 @@ test_that("simu alfalfa",{ n.cores = 1, markers_info = alfalfa_markers_info, out_vcf = out, - verbose = TRUE) + verbose = FALSE) vcf <- read.vcfR(out, verbose = FALSE) expect_s4_class(vcf, "vcfR") @@ -230,7 +230,7 @@ test_that("simu alfalfa",{ n.cores = 1, markers_info = alfalfa_markers_info_ChromPos, out_vcf = out, - verbose = TRUE) + verbose = FALSE) vcf <- read.vcfR(out, verbose = FALSE) expect_s4_class(vcf, "vcfR") @@ -256,7 +256,7 @@ test_that("simu alfalfa",{ hap_seq_file = alfalfa_microhapDB, n.cores = 1, out_vcf = out, - verbose = TRUE) + verbose = FALSE) vcf <- read.vcfR(out, verbose = FALSE) lut <- read.csv(alfalfa_markers_info) @@ -276,7 +276,7 @@ test_that("simu alfalfa",{ hap_seq_file = NULL, n.cores = 1, out_vcf = out, - verbose = TRUE) + verbose = FALSE) vcf <- read.vcfR(out, verbose = FALSE) lut <- read.csv(alfalfa_markers_info) @@ -297,7 +297,7 @@ test_that("simu alfalfa",{ n.cores = 1, markers_info = alfalfa_markers_info, out_vcf = out, - verbose = TRUE) + verbose = FALSE) vcf <- read.vcfR(out, verbose = FALSE) lut <- read.csv(alfalfa_markers_info) @@ -348,7 +348,7 @@ test_that("simu alfalfa",{ n.cores = 1, markers_info = potato_markers_info, out_vcf = out, - verbose = TRUE) + verbose = FALSE) vcf <- read.vcfR(out, verbose = FALSE) lut <- read.csv(potato_markers_info) @@ -358,8 +358,8 @@ test_that("simu alfalfa",{ check <- check[-which(is.na(check$Ref)),] expect_equal(as.numeric(check$POS), check$Pos) dp <- extract.gt(vcf, "DP", as.numeric = TRUE) - expect_equal(sum(dp[,10]), 43017) - expect_equal(sum(dp[3,]), 5073) + expect_equal(sum(dp[,10]), 226838) + expect_equal(sum(dp[3,]), 3996) expect_equal(check$REF, check$Ref) expect_equal(check$ALT, check$Alt) @@ -472,7 +472,7 @@ test_that("simu alfalfa",{ n.cores = 1, markers_info = potato_markers_info, out_vcf = out, - verbose = TRUE) + verbose = FALSE) vcf <- read.vcfR(out, verbose = FALSE) lut <- read.csv(potato_markers_info) @@ -483,7 +483,7 @@ test_that("simu alfalfa",{ expect_equal(as.numeric(check$POS), check$Pos) dp <- extract.gt(vcf, "DP", as.numeric = TRUE) - expect_equal(sum(dp[,10]), 41755) + expect_equal(sum(dp[,10]), 219742) expect_equal(check$REF, check$Ref) expect_equal(check$ALT, check$Alt) @@ -510,7 +510,7 @@ test_that("simu alfalfa",{ n.cores = 1, markers_info = potato_markers_info, out_vcf = out, - verbose = TRUE), + verbose = FALSE), regexp = "IUPAC \\(non-ATCG\\) codes found in AlleleSequence. This codes are not currently supported by BIGr/BIGapp. Run HapApp to replace them" ) }) diff --git a/tests/testthat/test-madc2vcf_multi.R b/tests/testthat/test-madc2vcf_multi.R new file mode 100644 index 0000000..414d160 --- /dev/null +++ b/tests/testthat/test-madc2vcf_multi.R @@ -0,0 +1,144 @@ +context("MADC to VCF via polyRAD") + +# ======================================================================= +# Using Breeding-Insight/BIGapp-PanelHub test files +# ======================================================================= + +test_that("madc2vcf_multi — alfalfa (BIGapp-PanelHub)", { + + skip_if_not_installed("polyRAD") + + github_path <- "https://raw.githubusercontent.com/Breeding-Insight/BIGapp-PanelHub/refs/heads/long_seq/" + + # External alfalfa test files + alfalfa_madc <- paste0(github_path, "test_madcs/alfalfa_madc.csv") + alfalfa_madc_wrongID <- paste0(github_path, "test_madcs/alfalfa_madc_wrongID.csv") + alfalfa_madc_raw <- paste0(github_path, "test_madcs/alfalfa_madc_raw.csv") # raw DArT format (7-row header) + alfalfa_iupac <- paste0(github_path, "test_madcs/alfalfa_IUPAC.csv") + alfalfa_lowercase <- paste0(github_path, "test_madcs/alfalfa_lowercase.csv") + alfalfa_botloci <- paste0(github_path, "alfalfa/20201030-BI-Alfalfa_SNPs_DArTag-probe-design_f180bp.botloci") # botloci for alfalfa + alfalfa_markers_info <- paste0(github_path, "alfalfa/20201030-BI-Alfalfa_SNPs_DArTag-probe-design_snpID_lut.csv") # markers_info: CloneID/BI_markerID, Chr, Pos, Ref, Alt + alfalfa_markers_info_ChromPos <- paste0(github_path, "test_madcs/alfalfa_marker_info_ChromPos.csv") # markers_info: CloneID/BI_markerID, Chr, Pos + alfalfa_microhapDB <- paste0(github_path, "alfalfa/alfalfa_allele_db_v001.fa") + + # External potato test files + potato_indel_madc <- paste0(github_path, "test_madcs/potato_indel_madc.csv") + potato_indel_iupac <- paste0(github_path, "test_madcs/potato_indel_IUPAC.csv") + potato_indel_lowercase <- paste0(github_path, "test_madcs/potato_indel_lowercase.csv") + potato_more_indels_chrompos_false <- paste0(github_path, "test_madcs/potato_more_indels_madc_ChromPosFALSE.csv") + potato_botloci <- paste0(github_path, "potato/potato_dartag_v2_3915markers_rm7dupTags_6traitMarkers_f150bp_ref_alt.botloci") + potato_markers_info <- paste0(github_path, "potato/potato_dartag_v2_3915markers_rm7dupTags_6traitMarkers_rm1dup_snpID_lut.csv") # CloneID/BI_markerID, Chr, Pos, Ref, Alt + potato_markers_info_ChromPos <- paste0(github_path, "test_madcs/potato_marker_info_chrompos.csv") # markers_info: CloneID/BI_markerID, Chr, Pos + potato_microhapDB <- paste0(github_path, "potato/potato_allele_db_v001.fa") + + skip_if_offline("raw.githubusercontent.com") + + out <- tempfile(fileext = ".vcf") + on.exit(unlink(out), add = TRUE) + + # Fixed allele ID format + expect_no_error( + madc2vcf_multi( + madc_file = alfalfa_madc, + botloci_file = alfalfa_botloci, + outfile = out, + ploidy = 4L, + verbose = TRUE + ) + ) + + vcf <- read.vcfR(out, verbose = FALSE) + expect_s4_class(vcf, "vcfR") + expect_equal(sum(grepl(",", vcf@fix[,5])), 281) + GT <- extract.gt(vcf) + expect_equal(GT[3,5],"0/0/0/3") + + # Don't allow raw MADC + expect_error( + madc2vcf_multi( + madc_file = alfalfa_madc_raw, + botloci_file = alfalfa_botloci, + outfile = out, + ploidy = 4L, + verbose = FALSE + ), regexp = "The MADC file does not have fixed AlleleIDs. Please process the MADC file through HapApp before using this function." + ) + + expect_no_error( + madc2vcf_multi( + madc_file = alfalfa_madc, + botloci_file = alfalfa_botloci, + outfile = out, + ploidy = 4L, + verbose = TRUE + ) + ) + + # Wrong IDs + expect_error( + madc2vcf_multi( + madc_file = alfalfa_madc_wrongID, + botloci_file = alfalfa_botloci, + outfile = out, + ploidy = 4L, + verbose = TRUE + ), regexp = "Check marker IDs in both MADC and botloci files. They should be the same." + ) + + expect_no_error( + madc2vcf_multi( + madc_file = alfalfa_madc_wrongID, + botloci_file = alfalfa_botloci, + outfile = out, + markers_info = alfalfa_markers_info_ChromPos, + ploidy = 4L, + verbose = TRUE + ) + ) + + vcf <- read.vcfR(out, verbose = FALSE) + expect_s4_class(vcf, "vcfR") + expect_equal(sum(grepl(",", vcf@fix[,5])), 281) + GT <- extract.gt(vcf) + expect_equal(GT[3,5],"0/0/0/3") + + ### Avoid IUPAC codes + expect_error( + madc2vcf_multi( + madc_file = alfalfa_iupac, + botloci_file = alfalfa_botloci, + outfile = out, + ploidy = 4L, + verbose = TRUE + ), regexp = "MADC Allele Sequences contain IUPAC \\(non-ATCG\\) codes. Please run HapApp to clean MADC file before using this function." + ) + + expect_error( + madc2vcf_multi( + madc_file = alfalfa_lowercase, + botloci_file = alfalfa_botloci, + outfile = out, + ploidy = 4L, + verbose = TRUE + ), regexp = "Not all Ref sequences have a corresponding Alt or vice versa. Please provide a complete MADC file before using this function." + ) + + expect_no_error( + madc2vcf_multi( + madc_file = potato_indel_madc, + botloci_file = potato_botloci, + outfile = out, + markers_info = potato_markers_info_ChromPos, + ploidy = 4L, + verbose = TRUE + ) + ) + + vcf <- read.vcfR(out, verbose = FALSE) + expect_s4_class(vcf, "vcfR") + expect_equal(sum(grepl(",", vcf@fix[,5])), 277) + GT <- extract.gt(vcf) + expect_equal(GT[3,5],"0/1/1/6") + +}) + diff --git a/tests/testthat/test-madc2vcf_targets.R b/tests/testthat/test-madc2vcf_targets.R index 9899e17..a64da34 100644 --- a/tests/testthat/test-madc2vcf_targets.R +++ b/tests/testthat/test-madc2vcf_targets.R @@ -140,7 +140,7 @@ test_that("simu alfalfa",{ output.file = out, get_REF_ALT = FALSE, collapse_matches_counts = TRUE, - verbose = FALSE) + verbose = TRUE) ) vcf <- read.vcfR(out, verbose = FALSE) expect_s4_class(vcf, "vcfR") @@ -250,7 +250,7 @@ test_that("simu alfalfa",{ output.file = out, get_REF_ALT = TRUE, botloci_file = alfalfa_botloci, - verbose = TRUE) + verbose = FALSE) ) vcf <- read.vcfR(out, verbose = FALSE) @@ -677,7 +677,7 @@ test_that("simu alfalfa",{ output.file = out, get_REF_ALT = FALSE, collapse_matches_counts = TRUE, - verbose = FALSE) + verbose = TRUE) vcf <- read.vcfR(out, verbose = FALSE) expect_s4_class(vcf, "vcfR") From cec168d8d12799dee04094b17b0253a0f21859ce Mon Sep 17 00:00:00 2001 From: Cristianetaniguti Date: Wed, 1 Apr 2026 15:06:54 -0400 Subject: [PATCH 33/80] fix checks --- DESCRIPTION | 3 ++- NAMESPACE | 6 +----- NEWS.md | 6 +++--- R/filterVCF.R | 5 +++-- R/imputation_concordance.R | 3 +++ R/utils.R | 11 ++++++----- man/filterVCF.Rd | 2 ++ man/imputation_concordance.Rd | 2 ++ tests/testthat/test-imputation_concordance.R | 1 - 9 files changed, 22 insertions(+), 17 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 255ec4f..fa60f51 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -62,7 +62,8 @@ Imports: janitor, quadprog, tibble, - stringr + stringr, + ggplot2 Suggests: covr, spelling, diff --git a/NAMESPACE b/NAMESPACE index f3a2e76..ae09080 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -22,7 +22,6 @@ export(merge_MADCs) export(solve_composition_poly) export(thinSNP) export(updog2vcf) -export(url_exists) export(vmsg) import(dplyr) import(ggplot2) @@ -47,16 +46,13 @@ importFrom(dplyr,mutate) importFrom(dplyr,select) importFrom(dplyr,summarise) importFrom(dplyr,where) -importFrom(polyRAD,IterateHWE) -importFrom(polyRAD,RADdata2VCF) -importFrom(polyRAD,TestOverdispersion) -importFrom(polyRAD,readDArTag) importFrom(pwalign,nucleotideSubstitutionMatrix) importFrom(pwalign,pairwiseAlignment) importFrom(readr,read_csv) importFrom(reshape2,dcast) importFrom(reshape2,melt) importFrom(stats,cor) +importFrom(stats,reorder) importFrom(stats,setNames) importFrom(utils,packageVersion) importFrom(utils,read.csv) diff --git a/NEWS.md b/NEWS.md index 0eff4c3..dd2f630 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,6 +1,6 @@ # BIGr 0.6.7 -# New function `madc2vcf_multi` +## New function `madc2vcf_multi` - New function `madc2vcf_multi` to convert a DArTag MADC file to a VCF using the polyRAD pipeline for multiallelic genotyping - Runs `check_madc_sanity` before loading the data and stops with informative errors if: @@ -18,7 +18,7 @@ # BIGr 0.6.6 -# Updates on `madc2vcf_all` +## Updates on `madc2vcf_all` - New arguments for controlling processing of `Other` alleles: - `add_others`: if `TRUE` (default), alleles labeled "Other" in the MADC are included in off-target SNP extraction @@ -32,7 +32,7 @@ # BIGr 0.6.5 -# Updates on madc2vcf functions +## Updates on madc2vcf functions Details: - both functions targets and all (targets + off-targets) markers now have `check_madc_sanity` function implemented. It tests: diff --git a/R/filterVCF.R b/R/filterVCF.R index 14f94c0..b9bca78 100644 --- a/R/filterVCF.R +++ b/R/filterVCF.R @@ -17,6 +17,7 @@ #' @param filter.SAMPLE.miss Sample missing data filter #' @param filter.SNP.miss SNP missing data filter #' @param ploidy The ploidy of the species being analyzed +#' @param quality.rates Logical. If TRUE, calculates and outputs CSV files with quality metrics for each marker and sample before filtering (mean depth, genotyping rate, observed heterozygosity). #' @param output.file output file name (optional). If no output.file name provided, then a vcfR object will be returned. #' @return A gzipped vcf file #' @importFrom vcfR read.vcfR @@ -55,7 +56,7 @@ filterVCF <- function(vcf.file, # Read VCF (can be .vcf or .vcf.gz) - if (class(vcf.file) != "vcfR") { + if (!inherits(vcf.file, "vcfR")) { vcf <- read.vcfR(vcf.file) } else { vcf <- vcf.file @@ -381,7 +382,7 @@ filterVCF <- function(vcf.file, } ### Export the modified VCF file (this exports as a .vcf.gz, so make sure to have the name end in .vcf.gz) cat("Exporting VCF\n") - if (!class(vcf.file) == "vcfR"){ + if (!inherits(vcf.file, "vcfR")){ if (!is.null(output.file)){ output_name <- paste0(output.file,".vcf.gz") vcfR::write.vcf(vcf, file = output_name) diff --git a/R/imputation_concordance.R b/R/imputation_concordance.R index ac501bb..1eb441a 100644 --- a/R/imputation_concordance.R +++ b/R/imputation_concordance.R @@ -50,6 +50,7 @@ #' @import ggplot2 #' #' @examples +#' \dontrun{ #' result <- imputation_concordance( #' reference_genos = ref, #' imputed_genos = test, @@ -58,7 +59,9 @@ #' verbose = TRUE, #' plot = TRUE #' ) +#' } #' +#' @importFrom stats reorder #' @export imputation_concordance <- function(reference_genos, imputed_genos, diff --git a/R/utils.R b/R/utils.R index cf3cc6a..59e5563 100644 --- a/R/utils.R +++ b/R/utils.R @@ -1,8 +1,9 @@ #Internal Functions utils::globalVariables(c( - "ALT", "AlleleID", "CHROM", "Data", "ID", "MarkerName", "POS", - "QPseparate", "QPsolve_par", "REF", "Var1", "Variant", "geno", + "ALT", "AlleleID", "AlleleSequence", "CHROM", "Concordance", "Data", "ID", + "MarkerName", "POS", + "QPseparate", "QPsolve_par", "REF", "Type", "Var1", "Variant", "geno", "ind", "ref", "row_name", "size", "snp", "CloneID", "Count", "qualifying_sites_count", "MarkerID", "SampleID", "Dosage", @@ -48,10 +49,10 @@ vmsg <- function(text, verbose = FALSE, level = 1, type = ">>", ...) { # Create indentation based on level indent <- switch(as.character(level), "0" = "", # Section headers - "1" = " ∙ ", # Main steps (medium bullet) + "1" = " \u2219 ", # Main steps (medium bullet) "2" = " - ", # Details "3" = " > ", # Sub-details - paste0(paste(rep(" ", level), collapse = ""), "• ") # Fallback for level > 3 + paste0(paste(rep(" ", level), collapse = ""), "\u2022 ") # Fallback for level > 3 ) # Format type label (only show for level 0) @@ -82,7 +83,7 @@ vmsg <- function(text, verbose = FALSE, level = 1, type = ">>", ...) { #' #' @keywords internal #' @noRd -#' @export +#' url_exists <- function(u) { tryCatch({ con <- url(u, open = "rb") diff --git a/man/filterVCF.Rd b/man/filterVCF.Rd index 2a3ab62..0342fe1 100644 --- a/man/filterVCF.Rd +++ b/man/filterVCF.Rd @@ -23,6 +23,8 @@ filterVCF( \arguments{ \item{vcf.file}{vcfR object or path to VCF file. Can be unzipped (.vcf) or gzipped (.vcf.gz).} +\item{quality.rates}{Logical. If TRUE, calculates and outputs CSV files with quality metrics for each marker and sample before filtering (mean depth, genotyping rate, observed heterozygosity).} + \item{filter.OD}{Updog filter} \item{filter.BIAS.min}{Updog filter (requires a value for both BIAS.min and BIAS.max)} diff --git a/man/imputation_concordance.Rd b/man/imputation_concordance.Rd index dc1cc65..31f54a8 100644 --- a/man/imputation_concordance.Rd +++ b/man/imputation_concordance.Rd @@ -64,6 +64,7 @@ When \code{plot = TRUE}, a bar plot showing concordance percentage per sample is generated using \pkg{ggplot2}. } \examples{ +\dontrun{ result <- imputation_concordance( reference_genos = ref, imputed_genos = test, @@ -72,5 +73,6 @@ result <- imputation_concordance( verbose = TRUE, plot = TRUE ) +} } diff --git a/tests/testthat/test-imputation_concordance.R b/tests/testthat/test-imputation_concordance.R index f1fb421..459998c 100644 --- a/tests/testthat/test-imputation_concordance.R +++ b/tests/testthat/test-imputation_concordance.R @@ -1,6 +1,5 @@ context("Imputation Concordance") - test_that("test imputation",{ #Input variables ignore_file <- system.file("imputation_ignore.txt", package="BIGr") From 0be2e0fb06e3accde737195790a868b450ee0ff3 Mon Sep 17 00:00:00 2001 From: Cristianetaniguti Date: Wed, 1 Apr 2026 15:30:37 -0400 Subject: [PATCH 34/80] fix checks 2 --- tests/testthat/test-madc2vcf_multi.R | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/tests/testthat/test-madc2vcf_multi.R b/tests/testthat/test-madc2vcf_multi.R index 414d160..40ececd 100644 --- a/tests/testthat/test-madc2vcf_multi.R +++ b/tests/testthat/test-madc2vcf_multi.R @@ -34,8 +34,6 @@ test_that("madc2vcf_multi — alfalfa (BIGapp-PanelHub)", { skip_if_offline("raw.githubusercontent.com") out <- tempfile(fileext = ".vcf") - on.exit(unlink(out), add = TRUE) - # Fixed allele ID format expect_no_error( madc2vcf_multi( @@ -54,6 +52,7 @@ test_that("madc2vcf_multi — alfalfa (BIGapp-PanelHub)", { expect_equal(GT[3,5],"0/0/0/3") # Don't allow raw MADC + out <- tempfile(fileext = ".vcf") expect_error( madc2vcf_multi( madc_file = alfalfa_madc_raw, @@ -64,6 +63,7 @@ test_that("madc2vcf_multi — alfalfa (BIGapp-PanelHub)", { ), regexp = "The MADC file does not have fixed AlleleIDs. Please process the MADC file through HapApp before using this function." ) + out <- tempfile(fileext = ".vcf") expect_no_error( madc2vcf_multi( madc_file = alfalfa_madc, @@ -75,6 +75,7 @@ test_that("madc2vcf_multi — alfalfa (BIGapp-PanelHub)", { ) # Wrong IDs + out <- tempfile(fileext = ".vcf") expect_error( madc2vcf_multi( madc_file = alfalfa_madc_wrongID, @@ -85,6 +86,7 @@ test_that("madc2vcf_multi — alfalfa (BIGapp-PanelHub)", { ), regexp = "Check marker IDs in both MADC and botloci files. They should be the same." ) + out <- tempfile(fileext = ".vcf") expect_no_error( madc2vcf_multi( madc_file = alfalfa_madc_wrongID, @@ -103,6 +105,7 @@ test_that("madc2vcf_multi — alfalfa (BIGapp-PanelHub)", { expect_equal(GT[3,5],"0/0/0/3") ### Avoid IUPAC codes + out <- tempfile(fileext = ".vcf") expect_error( madc2vcf_multi( madc_file = alfalfa_iupac, @@ -113,6 +116,7 @@ test_that("madc2vcf_multi — alfalfa (BIGapp-PanelHub)", { ), regexp = "MADC Allele Sequences contain IUPAC \\(non-ATCG\\) codes. Please run HapApp to clean MADC file before using this function." ) + out <- tempfile(fileext = ".vcf") expect_error( madc2vcf_multi( madc_file = alfalfa_lowercase, @@ -123,6 +127,7 @@ test_that("madc2vcf_multi — alfalfa (BIGapp-PanelHub)", { ), regexp = "Not all Ref sequences have a corresponding Alt or vice versa. Please provide a complete MADC file before using this function." ) + out <- tempfile(fileext = ".vcf") expect_no_error( madc2vcf_multi( madc_file = potato_indel_madc, From 33fc87c10aaedbb788b518f1f5b391446fea1cdd Mon Sep 17 00:00:00 2001 From: Cristianetaniguti Date: Wed, 1 Apr 2026 15:49:45 -0400 Subject: [PATCH 35/80] add VariantAnnotation to test env --- .github/workflows/R-CMD-check.yaml | 2 ++ tests/testthat/test-madc2vcf_multi.R | 1 + 2 files changed, 3 insertions(+) diff --git a/.github/workflows/R-CMD-check.yaml b/.github/workflows/R-CMD-check.yaml index 969e02d..f7c1b98 100644 --- a/.github/workflows/R-CMD-check.yaml +++ b/.github/workflows/R-CMD-check.yaml @@ -43,6 +43,8 @@ jobs: extra-packages: | any::rcmdcheck any::covr + any::polyRAD + bioc::VariantAnnotation needs: check - uses: r-lib/actions/check-r-package@v2 diff --git a/tests/testthat/test-madc2vcf_multi.R b/tests/testthat/test-madc2vcf_multi.R index 40ececd..adca090 100644 --- a/tests/testthat/test-madc2vcf_multi.R +++ b/tests/testthat/test-madc2vcf_multi.R @@ -7,6 +7,7 @@ context("MADC to VCF via polyRAD") test_that("madc2vcf_multi — alfalfa (BIGapp-PanelHub)", { skip_if_not_installed("polyRAD") + skip_if_not_installed("VariantAnnotation") github_path <- "https://raw.githubusercontent.com/Breeding-Insight/BIGapp-PanelHub/refs/heads/long_seq/" From 77107ba5c638874ec56e23744a0f8dfbf5fa9017 Mon Sep 17 00:00:00 2001 From: Cristianetaniguti Date: Wed, 1 Apr 2026 16:32:52 -0400 Subject: [PATCH 36/80] ignore madc2vcf_multi tests in actions --- .github/workflows/R-CMD-check.yaml | 2 -- 1 file changed, 2 deletions(-) diff --git a/.github/workflows/R-CMD-check.yaml b/.github/workflows/R-CMD-check.yaml index f7c1b98..969e02d 100644 --- a/.github/workflows/R-CMD-check.yaml +++ b/.github/workflows/R-CMD-check.yaml @@ -43,8 +43,6 @@ jobs: extra-packages: | any::rcmdcheck any::covr - any::polyRAD - bioc::VariantAnnotation needs: check - uses: r-lib/actions/check-r-package@v2 From ccf9e776a7051cd4f893064c77ccdc8cb193ca99 Mon Sep 17 00:00:00 2001 From: Cristianetaniguti Date: Thu, 2 Apr 2026 09:00:47 -0400 Subject: [PATCH 37/80] more messages and tests --- .github/workflows/R-CMD-check.yaml | 5 +++++ R/madc2vcf_all.R | 14 ++++++++++++++ R/madc2vcf_multi.R | 7 +++++++ R/madc2vcf_targets.R | 8 ++++++++ 4 files changed, 34 insertions(+) diff --git a/.github/workflows/R-CMD-check.yaml b/.github/workflows/R-CMD-check.yaml index 969e02d..587188c 100644 --- a/.github/workflows/R-CMD-check.yaml +++ b/.github/workflows/R-CMD-check.yaml @@ -43,7 +43,12 @@ jobs: extra-packages: | any::rcmdcheck any::covr + any::polyRAD needs: check + + - name: Install VariantAnnotation (no Suggests) + run: pak::pkg_install("bioc::VariantAnnotation", dependencies = c("Depends", "Imports", "LinkingTo")) + shell: Rscript {0} - uses: r-lib/actions/check-r-package@v2 - name: Generate test coverage report diff --git a/R/madc2vcf_all.R b/R/madc2vcf_all.R index ac20835..17490f3 100644 --- a/R/madc2vcf_all.R +++ b/R/madc2vcf_all.R @@ -70,6 +70,20 @@ madc2vcf_all <- function(madc, others_rm_with_indels = TRUE, verbose = TRUE){ + vmsg("Running BIGr madc2vcf_all", verbose = verbose, level = 0, type = ">>") + vmsg("madc : %s", verbose = verbose, level = 1, madc) + vmsg("botloci_file : %s", verbose = verbose, level = 1, botloci_file) + vmsg("hap_seq_file : %s", verbose = verbose, level = 1, if (is.null(hap_seq_file)) "NULL" else hap_seq_file) + vmsg("markers_info : %s", verbose = verbose, level = 1, if (is.null(markers_info)) "NULL" else markers_info) + vmsg("n.cores : %s", verbose = verbose, level = 1, n.cores) + vmsg("alignment_score_thr : %s", verbose = verbose, level = 1, alignment_score_thr) + vmsg("rm_multiallelic_SNP : %s", verbose = verbose, level = 1, rm_multiallelic_SNP) + vmsg("multiallelic_SNP_dp_thr : %s", verbose = verbose, level = 1, multiallelic_SNP_dp_thr) + vmsg("multiallelic_SNP_sample_thr: %s", verbose = verbose, level = 1, multiallelic_SNP_sample_thr) + vmsg("add_others : %s", verbose = verbose, level = 1, add_others) + vmsg("others_max_snps : %s", verbose = verbose, level = 1, if (is.null(others_max_snps)) "NULL" else others_max_snps) + vmsg("others_rm_with_indels : %s", verbose = verbose, level = 1, others_rm_with_indels) + vmsg("out_vcf : %s", verbose = verbose, level = 1, if (is.null(out_vcf)) "NULL" else out_vcf) vmsg("Checking inputs", verbose = verbose, level = 0, type = ">>") # Input checks diff --git a/R/madc2vcf_multi.R b/R/madc2vcf_multi.R index 6e86b45..0b67f3f 100644 --- a/R/madc2vcf_multi.R +++ b/R/madc2vcf_multi.R @@ -46,6 +46,13 @@ madc2vcf_multi <- function(madc_file, ploidy = 2L, verbose = TRUE) { + vmsg("Running BIGr madc2vcf_multi", verbose = verbose, level = 0, type = ">>") + vmsg("madc_file : %s", verbose = verbose, level = 1, madc_file) + vmsg("botloci_file : %s", verbose = verbose, level = 1, botloci_file) + vmsg("markers_info : %s", verbose = verbose, level = 1, if (is.null(markers_info)) "NULL" else markers_info) + vmsg("outfile : %s", verbose = verbose, level = 1, outfile) + vmsg("ploidy : %s", verbose = verbose, level = 1, ploidy) + vmsg("Checking inputs", verbose = verbose, level = 0, type = ">>") if (!(file.exists(madc_file) | url_exists(madc_file))) stop("MADC file not found. Please provide a valid path or URL.") diff --git a/R/madc2vcf_targets.R b/R/madc2vcf_targets.R index 80af70f..19c486e 100644 --- a/R/madc2vcf_targets.R +++ b/R/madc2vcf_targets.R @@ -136,6 +136,14 @@ madc2vcf_targets <- function(madc_file, collapse_matches_counts = FALSE, verbose = TRUE) { + vmsg("Running BIGr madc2vcf_targets", verbose = verbose, level = 0, type = ">>") + vmsg("madc_file : %s", verbose = verbose, level = 1, madc_file) + vmsg("output.file : %s", verbose = verbose, level = 1, output.file) + vmsg("botloci_file : %s", verbose = verbose, level = 1, if (is.null(botloci_file)) "NULL" else botloci_file) + vmsg("markers_info : %s", verbose = verbose, level = 1, if (is.null(markers_info)) "NULL" else markers_info) + vmsg("get_REF_ALT : %s", verbose = verbose, level = 1, get_REF_ALT) + vmsg("collapse_matches_counts : %s", verbose = verbose, level = 1, collapse_matches_counts) + vmsg("Checking inputs", verbose = verbose, level = 0, type = ">>") # Input checks From 8a00c9e1b0f69e462e185867e46082eb7c85ee50 Mon Sep 17 00:00:00 2001 From: Cristianetaniguti Date: Thu, 2 Apr 2026 09:03:42 -0400 Subject: [PATCH 38/80] bugfix --- R/madc2vcf_all.R | 26 +++++++++++++------------- R/madc2vcf_multi.R | 10 +++++----- R/madc2vcf_targets.R | 12 ++++++------ 3 files changed, 24 insertions(+), 24 deletions(-) diff --git a/R/madc2vcf_all.R b/R/madc2vcf_all.R index 17490f3..b25e6da 100644 --- a/R/madc2vcf_all.R +++ b/R/madc2vcf_all.R @@ -71,19 +71,19 @@ madc2vcf_all <- function(madc, verbose = TRUE){ vmsg("Running BIGr madc2vcf_all", verbose = verbose, level = 0, type = ">>") - vmsg("madc : %s", verbose = verbose, level = 1, madc) - vmsg("botloci_file : %s", verbose = verbose, level = 1, botloci_file) - vmsg("hap_seq_file : %s", verbose = verbose, level = 1, if (is.null(hap_seq_file)) "NULL" else hap_seq_file) - vmsg("markers_info : %s", verbose = verbose, level = 1, if (is.null(markers_info)) "NULL" else markers_info) - vmsg("n.cores : %s", verbose = verbose, level = 1, n.cores) - vmsg("alignment_score_thr : %s", verbose = verbose, level = 1, alignment_score_thr) - vmsg("rm_multiallelic_SNP : %s", verbose = verbose, level = 1, rm_multiallelic_SNP) - vmsg("multiallelic_SNP_dp_thr : %s", verbose = verbose, level = 1, multiallelic_SNP_dp_thr) - vmsg("multiallelic_SNP_sample_thr: %s", verbose = verbose, level = 1, multiallelic_SNP_sample_thr) - vmsg("add_others : %s", verbose = verbose, level = 1, add_others) - vmsg("others_max_snps : %s", verbose = verbose, level = 1, if (is.null(others_max_snps)) "NULL" else others_max_snps) - vmsg("others_rm_with_indels : %s", verbose = verbose, level = 1, others_rm_with_indels) - vmsg("out_vcf : %s", verbose = verbose, level = 1, if (is.null(out_vcf)) "NULL" else out_vcf) + vmsg("madc : %s", verbose = verbose, level = 1, type = ">>", madc) + vmsg("botloci_file : %s", verbose = verbose, level = 1, type = ">>", if (is.null(botloci_file)) "NULL" else botloci_file) + vmsg("hap_seq_file : %s", verbose = verbose, level = 1, type = ">>", if (is.null(hap_seq_file)) "NULL" else hap_seq_file) + vmsg("markers_info : %s", verbose = verbose, level = 1, type = ">>", if (is.null(markers_info)) "NULL" else markers_info) + vmsg("n.cores : %s", verbose = verbose, level = 1, type = ">>", n.cores) + vmsg("alignment_score_thr : %s", verbose = verbose, level = 1, type = ">>", alignment_score_thr) + vmsg("rm_multiallelic_SNP : %s", verbose = verbose, level = 1, type = ">>", rm_multiallelic_SNP) + vmsg("multiallelic_SNP_dp_thr : %s", verbose = verbose, level = 1, type = ">>", multiallelic_SNP_dp_thr) + vmsg("multiallelic_SNP_sample_thr: %s", verbose = verbose, level = 1, type = ">>", multiallelic_SNP_sample_thr) + vmsg("add_others : %s", verbose = verbose, level = 1, type = ">>", add_others) + vmsg("others_max_snps : %s", verbose = verbose, level = 1, type = ">>", if (is.null(others_max_snps)) "NULL" else others_max_snps) + vmsg("others_rm_with_indels : %s", verbose = verbose, level = 1, type = ">>", others_rm_with_indels) + vmsg("out_vcf : %s", verbose = verbose, level = 1, type = ">>", if (is.null(out_vcf)) "NULL" else out_vcf) vmsg("Checking inputs", verbose = verbose, level = 0, type = ">>") # Input checks diff --git a/R/madc2vcf_multi.R b/R/madc2vcf_multi.R index 0b67f3f..46c4fb3 100644 --- a/R/madc2vcf_multi.R +++ b/R/madc2vcf_multi.R @@ -47,11 +47,11 @@ madc2vcf_multi <- function(madc_file, verbose = TRUE) { vmsg("Running BIGr madc2vcf_multi", verbose = verbose, level = 0, type = ">>") - vmsg("madc_file : %s", verbose = verbose, level = 1, madc_file) - vmsg("botloci_file : %s", verbose = verbose, level = 1, botloci_file) - vmsg("markers_info : %s", verbose = verbose, level = 1, if (is.null(markers_info)) "NULL" else markers_info) - vmsg("outfile : %s", verbose = verbose, level = 1, outfile) - vmsg("ploidy : %s", verbose = verbose, level = 1, ploidy) + vmsg("madc_file : %s", verbose = verbose, level = 1, type = ">>", madc_file) + vmsg("botloci_file : %s", verbose = verbose, level = 1, type = ">>", botloci_file) + vmsg("markers_info : %s", verbose = verbose, level = 1, type = ">>", if (is.null(markers_info)) "NULL" else markers_info) + vmsg("outfile : %s", verbose = verbose, level = 1, type = ">>", outfile) + vmsg("ploidy : %s", verbose = verbose, level = 1, type = ">>", ploidy) vmsg("Checking inputs", verbose = verbose, level = 0, type = ">>") diff --git a/R/madc2vcf_targets.R b/R/madc2vcf_targets.R index 19c486e..888c445 100644 --- a/R/madc2vcf_targets.R +++ b/R/madc2vcf_targets.R @@ -137,12 +137,12 @@ madc2vcf_targets <- function(madc_file, verbose = TRUE) { vmsg("Running BIGr madc2vcf_targets", verbose = verbose, level = 0, type = ">>") - vmsg("madc_file : %s", verbose = verbose, level = 1, madc_file) - vmsg("output.file : %s", verbose = verbose, level = 1, output.file) - vmsg("botloci_file : %s", verbose = verbose, level = 1, if (is.null(botloci_file)) "NULL" else botloci_file) - vmsg("markers_info : %s", verbose = verbose, level = 1, if (is.null(markers_info)) "NULL" else markers_info) - vmsg("get_REF_ALT : %s", verbose = verbose, level = 1, get_REF_ALT) - vmsg("collapse_matches_counts : %s", verbose = verbose, level = 1, collapse_matches_counts) + vmsg("madc_file : %s", verbose = verbose, level = 1, type = ">>", madc_file) + vmsg("output.file : %s", verbose = verbose, level = 1, type = ">>", output.file) + vmsg("botloci_file : %s", verbose = verbose, level = 1, type = ">>", if (is.null(botloci_file)) "NULL" else botloci_file) + vmsg("markers_info : %s", verbose = verbose, level = 1, type = ">>", if (is.null(markers_info)) "NULL" else markers_info) + vmsg("get_REF_ALT : %s", verbose = verbose, level = 1, type = ">>", get_REF_ALT) + vmsg("collapse_matches_counts : %s", verbose = verbose, level = 1, type = ">>", collapse_matches_counts) vmsg("Checking inputs", verbose = verbose, level = 0, type = ">>") From f2013e3d9d37be0fc98c61da0831ceb92b8d5593 Mon Sep 17 00:00:00 2001 From: Cristianetaniguti Date: Thu, 2 Apr 2026 11:18:11 -0400 Subject: [PATCH 39/80] update man --- R/get_countsMADC.R | 3 ++- R/madc2vcf_all.R | 14 ++++++++++++++ R/madc2vcf_multi.R | 13 +++++++++++++ man/madc2vcf_all.Rd | 10 ++++++++++ man/madc2vcf_multi.Rd | 12 ++++++++++++ 5 files changed, 51 insertions(+), 1 deletion(-) diff --git a/R/get_countsMADC.R b/R/get_countsMADC.R index 396004a..3a9bc2b 100644 --- a/R/get_countsMADC.R +++ b/R/get_countsMADC.R @@ -95,8 +95,9 @@ get_countsMADC <- function(madc_file = NULL, madc_object = NULL, collapse_matche vmsg(paste("There are", n_singles,"Ref tags without corresponding Alt tags, or vice versa"), verbose = verbose, level = 2, type = ">>") vmsg("Only the markers with both Ref and Alt tags will be retained for the conversion", verbose = verbose, level = 1, type = ">>") + vmsg("Consider providing a haplotype database file to resolve unpaired Ref/Alt sequences", verbose = verbose, level = 1, type = ">>") - warning(paste("There are", n_singles,"Ref tags without corresponding Alt tags, or vice versa. Only the markers with both Ref and Alt tags will be retained for the conversion")) + warning(paste("There are", n_singles,"Ref tags without corresponding Alt tags, or vice versa. Only the markers with both Ref and Alt tags will be retained for the conversion. Consider providing a haplotype database file to resolve unpaired Ref/Alt sequences.")) # Subset both dataframes to retain only the common rows ref_df <- ref_df[common_ids, ] diff --git a/R/madc2vcf_all.R b/R/madc2vcf_all.R index b25e6da..63c031d 100644 --- a/R/madc2vcf_all.R +++ b/R/madc2vcf_all.R @@ -22,6 +22,18 @@ #' @details #' The function processes a MADC file to generate a VCF file containing both target and off-target SNPs. It uses parallel processing to improve performance and provides options to filter multiallelic SNPs based on user-defined thresholds. The alignment score threshold can be adjusted using the `alignment_score_thr` parameter. The generated VCF file includes metadata about the processing parameters and the BIGr package version. If the `alignment_score_thr` is not met, the corresponding SNPs are discarded. #' +#' **Sanity check behaviour and requirements** +#' +#' | Check | Status | Required | +#' |---|---|---| +#' | **Indels** | detected | `markers_info` with `Ref`/`Alt`/`Indel_pos`/`Indel_length` + `botloci_file` | +#' | | not detected | `botloci_file` | +#' | **ChromPos** | valid | `botloci_file` | +#' | | invalid | `markers_info` with `Chr`/`Pos` + `botloci_file` | +#' | **RefAltSeqs** | all paired | `botloci_file` | +#' | | unpaired | `botloci_file` + `hap_seq_file` (microhaplotype DB) | +#' +#' #' @examples #' # Example usage: #' @@ -323,6 +335,8 @@ loop_though_dartag_report <- function(report, botloci, hap_seq, n.cores=1, align } vmsg("Tags discarded due to lack of Ref_0001 sequence: %s tags", verbose = verbose, level = 2, type = ">>", sum(ref_index==-1)) vmsg("Tags discarded due to lack of Alt_0002 sequence: %s tags", verbose = verbose, level = 2, type = ">>", sum(alt_index==-1)) + if(sum(ref_index==-1) > 0) warning(sprintf("%s tags discarded due to lack of Ref_0001 sequence. Consider providing the haplotype database file to recover these tags", sum(ref_index==-1))) + if(sum(alt_index==-1) > 0) warning(sprintf("%s tags discarded due to lack of Alt_0002 sequence. Consider providing the haplotype database file to recover these tags", sum(alt_index==-1))) vmsg("Pairwise alignments of sequences to recover SNP position, reference and alternative bases...", verbose = verbose, level = 0) clust <- makeCluster(n.cores) diff --git a/R/madc2vcf_multi.R b/R/madc2vcf_multi.R index 46c4fb3..bcbae02 100644 --- a/R/madc2vcf_multi.R +++ b/R/madc2vcf_multi.R @@ -36,6 +36,19 @@ #' \code{polyRAD::RADdata2VCF}. #' } #' +#' **Sanity check behaviour and requirements** +#' +#' The function always stops if IUPAC codes, unpaired Ref/Alt sequences, or +#' unfixed AlleleIDs are detected (see \code{check_madc_sanity}). For the +#' remaining checks the required inputs are: +#' +#' | Check | Status | Required | +#' |---|---|---| +#' | **Indels** | detected | `botloci_file` | +#' | | not detected | `botloci_file` | +#' | **ChromPos** | valid | `botloci_file` | +#' | | invalid | `markers_info` with `Chr`/`Pos` + `botloci_file` | +#' #' @importFrom utils read.csv write.csv read.table #' #' @export diff --git a/man/madc2vcf_all.Rd b/man/madc2vcf_all.Rd index a1e8c92..c15e69b 100644 --- a/man/madc2vcf_all.Rd +++ b/man/madc2vcf_all.Rd @@ -58,6 +58,16 @@ This function processes a MADC file to generate a VCF file containing both targe } \details{ The function processes a MADC file to generate a VCF file containing both target and off-target SNPs. It uses parallel processing to improve performance and provides options to filter multiallelic SNPs based on user-defined thresholds. The alignment score threshold can be adjusted using the \code{alignment_score_thr} parameter. The generated VCF file includes metadata about the processing parameters and the BIGr package version. If the \code{alignment_score_thr} is not met, the corresponding SNPs are discarded. + +\strong{Sanity check behaviour and requirements}\tabular{lll}{ + Check \tab Status \tab Required \cr + \strong{Indels} \tab detected \tab \code{markers_info} with \code{Ref}/\code{Alt}/\code{Indel_pos}/\code{Indel_length} + \code{botloci_file} \cr + \tab not detected \tab \code{botloci_file} \cr + \strong{ChromPos} \tab valid \tab \code{botloci_file} \cr + \tab invalid \tab \code{markers_info} with \code{Chr}/\code{Pos} + \code{botloci_file} \cr + \strong{RefAltSeqs} \tab all paired \tab \code{botloci_file} \cr + \tab unpaired \tab \code{botloci_file} + \code{hap_seq_file} (microhaplotype DB) \cr +} } \examples{ # Example usage: diff --git a/man/madc2vcf_multi.Rd b/man/madc2vcf_multi.Rd index 5c3a777..70bc59d 100644 --- a/man/madc2vcf_multi.Rd +++ b/man/madc2vcf_multi.Rd @@ -56,4 +56,16 @@ The function performs the following steps: calls \code{polyRAD::IterateHWE}, then exports the result with \code{polyRAD::RADdata2VCF}. } + +\strong{Sanity check behaviour and requirements} + +The function always stops if IUPAC codes, unpaired Ref/Alt sequences, or +unfixed AlleleIDs are detected (see \code{check_madc_sanity}). For the +remaining checks the required inputs are:\tabular{lll}{ + Check \tab Status \tab Required \cr + \strong{Indels} \tab detected \tab \code{botloci_file} \cr + \tab not detected \tab \code{botloci_file} \cr + \strong{ChromPos} \tab valid \tab \code{botloci_file} \cr + \tab invalid \tab \code{markers_info} with \code{Chr}/\code{Pos} + \code{botloci_file} \cr +} } From b01c12ba1aaf5b62bafab31f410a241363bf8018 Mon Sep 17 00:00:00 2001 From: Cristianetaniguti Date: Thu, 2 Apr 2026 14:17:33 -0400 Subject: [PATCH 40/80] minor version up --- DESCRIPTION | 2 +- NEWS.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index fa60f51..48ede7c 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: BIGr Title: Breeding Insight Genomics Functions for Polyploid and Diploid Species -Version: 0.6.7 +Version: 0.7.0 Authors@R: c(person(given='Alexander M.', family='Sandercock', email='sandercock.alex@gmail.com', diff --git a/NEWS.md b/NEWS.md index dd2f630..1b9559b 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,4 +1,4 @@ -# BIGr 0.6.7 +# BIGr 0.7.0 ## New function `madc2vcf_multi` From d0e02e2b4f61778fdf71f73f03462b4b01f8e8eb Mon Sep 17 00:00:00 2001 From: "josue.chinchilla" Date: Thu, 2 Apr 2026 15:03:54 -0400 Subject: [PATCH 41/80] added v1 of parentage function --- R/assign_parentage.R | 245 +++++++++++++++++++++++++ tests/testthat/test-assign_parentage.R | 184 +++++++++++++++++++ 2 files changed, 429 insertions(+) create mode 100644 R/assign_parentage.R create mode 100644 tests/testthat/test-assign_parentage.R diff --git a/R/assign_parentage.R b/R/assign_parentage.R new file mode 100644 index 0000000..bf9716d --- /dev/null +++ b/R/assign_parentage.R @@ -0,0 +1,245 @@ +#' Perform Parentage Assignment from Genotypic Data +#' +#' @description +#' Assigns parents to progeny based on genetic marker data coded as 0, 1, or 2 +#' representing allele counts. The function supports several methods, including +#' finding the best single parent (sire, dam, or best match) based on +#' homozygous loci mismatches, or identifying the best parent pair by +#' minimizing Mendelian inheritance errors. +#' +#' @details +#' The function operates in one of two main modes depending on the `method` argument: +#' \itemize{ +#' \item \strong{Homozygous Mismatch Methods} (`"best.sire"`, `"best.dam"`, `"best.match"`): +#' These methods work by considering only homozygous loci (coded as 0 or 2). +#' They calculate the percentage of mismatching homozygous loci between each +#' progeny and the potential parents. Heterozygous loci (coded as 1) are +#' ignored in this comparison. +#' \item \strong{Best Pair Method} (`"best.pair"`): This method evaluates all +#' possible sire-dam pairs for each progeny. It counts the number of loci +#' that show a Mendelian error given the parental pair's genotypes (e.g., +#' two parents with genotype 0 cannot produce a progeny with genotype 1 or 2). +#' The pair(s) with the minimum error percentage is/are reported as the best match. +#' } +#' +#' @param genotypes_file A character string. Path to the tab-separated file +#' containing genotypic data for all individuals. Must have an 'ID' column +#' followed by marker columns. +#' @param parents_file A character string. Path to the tab-separated file listing +#' potential parent individuals. Must have an 'ID' column and optionally a +#' 'Sex' column ('M' for male, 'F' for female). +#' @param progeny_file A character string. Path to the tab-separated file listing +#' the progeny individuals to be analyzed. Must have an 'ID' column. +#' @param method A character string specifying the assignment method. Must be one +#' of `"best.pair"` (default), `"best.sire"`, `"best.dam"`, or `"best.match"`. +#' @param show.ties A logical value. If `TRUE` (default), all tied best matches +#' for a progeny are reported in wide format. If `FALSE`, only the first +#' tied match is reported and a warning is issued. +#' @param allow.selfing A logical value. If `TRUE` (default), an individual can +#' be assigned as both sire and dam. Only applicable when `method = "best.pair"`. +#' @param verbose A logical value. If `TRUE` (default), progress messages and the +#' final results table are printed to the console. +#' @param write.txt A logical value. If `TRUE` (default), the results table is +#' written to a file named "parentage_results.txt" in the current directory. +#' +#' @return A `tibble` (data frame) containing the parentage assignment results. +#' If `verbose = TRUE`, the function prints the results to the console and +#' returns the `tibble` invisibly. +#' +#' @importFrom dplyr filter mutate across bind_rows +#' @importFrom tibble column_to_rownames tibble as_tibble +#' +#' @export +#' +find_parentage <- function(genotypes_file, parents_file, progeny_file, + method = "best.pair", + show.ties = TRUE, + allow.selfing = TRUE, + verbose = TRUE, + write.txt = TRUE) { + #### Input Validation and Data Loading #### + allowed_methods <- c("best.sire", "best.dam", "best.match", "best.pair") + if (!method %in% allowed_methods) { + stop("Method must be one of: ", paste(allowed_methods, collapse = ", ")) + } + + tryCatch({ + genos <- read.table(genotypes_file, header = TRUE, stringsAsFactors = FALSE) + all_parents <- read.table(parents_file, header = TRUE, stringsAsFactors = FALSE) + progeny_candidates <- read.table(progeny_file, header = TRUE, stringsAsFactors = FALSE) + }, error = function(e) { + stop("Error reading input files. Please ensure paths are correct and files are properly formatted.") + }) + + valid_ids <- genos$ID + removed_parents <- setdiff(all_parents$ID, valid_ids) + if (length(removed_parents) > 0) { + warning("The following parent IDs were not in the genotype file and will not be analyzed: ", + paste(removed_parents, collapse = ", "), call. = FALSE) + all_parents <- all_parents %>% filter(ID %in% valid_ids) + } + + removed_progeny <- setdiff(progeny_candidates$ID, valid_ids) + if (length(removed_progeny) > 0) { + warning("The following progeny IDs were not in the genotype file and will not be analyzed: ", + paste(removed_progeny, collapse = ", "), call. = FALSE) + progeny_candidates <- progeny_candidates %>% filter(ID %in% valid_ids) + } + + if (!"Sex" %in% colnames(all_parents)) { + warning("No 'Sex' column in parents file. All parents treated as ambiguous ('A').") + all_parents$Sex <- "A" + } + + all_parents <- all_parents %>% mutate(Sex = toupper(Sex)) + sire_candidates <- all_parents %>% filter(Sex %in% c("M", "A")) + dam_candidates <- all_parents %>% filter(Sex %in% c("F", "A")) + + if (nrow(sire_candidates) == 0 && method %in% c("best.sire", "best.pair")) { + warning("No valid sire candidates remain after filtering.", call. = FALSE) + } + if (nrow(dam_candidates) == 0 && method %in% c("best.dam", "best.pair")) { + warning("No valid dam candidates remain after filtering.", call. = FALSE) + } + if (nrow(progeny_candidates) == 0) { + stop("No valid progeny candidates remain after filtering.") + } + + #### Logic for Homozygous Matching Methods #### + if (method %in% c("best.sire", "best.dam", "best.match")) { + genos_hom <- genos %>% mutate(across(-ID, ~ ifelse(.x == 1, NA, .x))) + parent_ids <- switch(method, + "best.sire" = sire_candidates$ID, + "best.dam" = dam_candidates$ID, + "best.match" = union(sire_candidates$ID, dam_candidates$ID) + ) + + parent_genos <- genos_hom %>% filter(ID %in% parent_ids) %>% column_to_rownames("ID") + progeny_genos <- genos_hom %>% filter(ID %in% progeny_candidates$ID) %>% column_to_rownames("ID") + results_list <- list() + + for (i in seq_len(nrow(progeny_genos))) { + progeny_id <- rownames(progeny_genos)[i] + progeny_vec <- as.numeric(progeny_genos[i, ]) + + mismatches <- rowSums(parent_genos != progeny_vec, na.rm = TRUE) + comparisons <- rowSums(!is.na(parent_genos) & !is.na(progeny_vec)) + percent_mismatch <- (mismatches / comparisons) * 100 + percent_mismatch[is.nan(percent_mismatch)] <- NA + + best_idx <- which.min(percent_mismatch) + if (length(best_idx) == 0) { + best_parent_id <- NA + min_percent <- NA + markers_tested <- NA + } else { + best_parent_id <- rownames(parent_genos)[best_idx] + min_percent <- percent_mismatch[best_idx] + markers_tested <- comparisons[best_idx] + } + + results_list[[progeny_id]] <- tibble( + Progeny = progeny_id, + Best_Match = best_parent_id, + Mendelian_Error_Pct = round(min_percent,2), + Markers_Tested = markers_tested + ) + } + final_df <- bind_rows(results_list) + } + + #### Logic for Best Pair Method #### + if (method == "best.pair") { + genos_mat <- genos %>% column_to_rownames("ID") %>% as.matrix() + parent_pairs <- expand.grid(Sire = sire_candidates$ID, Dam = dam_candidates$ID, stringsAsFactors = FALSE) + + if (!allow.selfing) { + parent_pairs <- parent_pairs %>% filter(Sire != Dam) + if(verbose) cat("Selfing is disallowed. Pairs with identical parents are removed.\n") + } + if (nrow(parent_pairs) == 0) stop("No valid parent pairs to test.") + + sire_genos_mat <- genos_mat[parent_pairs$Sire, , drop = FALSE] + dam_genos_mat <- genos_mat[parent_pairs$Dam, , drop = FALSE] + results_list <- list() + + for (prog_id in progeny_candidates$ID) { + progeny_vec <- genos_mat[prog_id, ] + + mismatches <- rowSums( + (sire_genos_mat == 0 & dam_genos_mat == 0 & progeny_vec > 0) | + (sire_genos_mat == 2 & dam_genos_mat == 2 & progeny_vec < 2) | + ((sire_genos_mat == 0 & dam_genos_mat == 1) | (sire_genos_mat == 1 & dam_genos_mat == 0)) & (progeny_vec == 2) | + ((sire_genos_mat == 2 & dam_genos_mat == 1) | (sire_genos_mat == 1 & dam_genos_mat == 2)) & (progeny_vec == 0) | + ((sire_genos_mat == 0 & dam_genos_mat == 2) | (sire_genos_mat == 2 & dam_genos_mat == 0)) & (progeny_vec != 1), + na.rm = TRUE + ) + + comparisons <- rowSums(!is.na(sire_genos_mat) & !is.na(dam_genos_mat) & !is.na(progeny_vec)) + percent_mismatch <- (mismatches / comparisons) * 100 + percent_mismatch[is.nan(percent_mismatch)] <- NA + + min_mismatch_val <- min(percent_mismatch, na.rm = TRUE) + + if (is.infinite(min_mismatch_val)) { + results_list[[prog_id]] <- tibble(Progeny = prog_id, Markers_Tested = 0) + next + } + + best_indices <- which(percent_mismatch == min_mismatch_val) + best_pairs <- parent_pairs[best_indices, ] + + if (!show.ties && nrow(best_pairs) > 1) { + warning("Progeny '", prog_id, "' has ", nrow(best_pairs), " tied best pairs. Only one is reported as show.ties=FALSE.", call. = FALSE) + } + + num_to_report <- if (show.ties) nrow(best_pairs) else 1 + num_to_report <- min(nrow(best_pairs), num_to_report) + + result_row <- list(Progeny = prog_id) + + if (num_to_report == 1) { + result_row[['Sire']] <- best_pairs$Sire[1] + result_row[['Dam']] <- best_pairs$Dam[1] + result_row[['Mendelian_Error_Pct']] <- sprintf("%.2f", min_mismatch_val) + result_row[['Markers_Tested']] <- comparisons[best_indices[1]] + } else if (num_to_report > 1) { + for (k in 1:num_to_report) { + result_row[[paste0("Sire_", k)]] <- best_pairs$Sire[k] + result_row[[paste0("Dam_", k)]] <- best_pairs$Dam[k] + result_row[[paste0("Mendelian_Error_Pct_", k)]] <- min_mismatch_val + result_row[[paste0("Markers_Tested_", k)]] <- comparisons[best_indices[k]] + } + } + results_list[[prog_id]] <- as_tibble(result_row) + } + final_df <- bind_rows(results_list) + } + + #### Output #### + # write .txt output + if (write.txt) { + output_filename <- "parentage_results.txt" + tryCatch({ + write.table(final_df, + file = output_filename, + sep = "\t", + quote = FALSE, + row.names = FALSE) + if (verbose) { + cat("\nResults successfully written to:", output_filename, "\n") + } + }, error = function(e) { + warning("Could not write results to file. Error: ", e$message, call. = FALSE) + }) + } + + if (verbose) { + cat("\n--- Parentage Assignment Results ---\n") + print(final_df) + return(invisible(final_df)) + } else { + return(final_df) + } +} + diff --git a/tests/testthat/test-assign_parentage.R b/tests/testthat/test-assign_parentage.R new file mode 100644 index 0000000..b855351 --- /dev/null +++ b/tests/testthat/test-assign_parentage.R @@ -0,0 +1,184 @@ +# tests/testthat/test_find_parentage.R +# Test suite for find_parentage() (assign_parentage.R) [1] +# ------------------------------------------------------------------ + +# 1. Load the function ----------------------------------------------- +# Assuming the function is in the current package: +# You may need to adjust the path if it is in a different location. +source("R/assign_parentage.R") # [1] + +# 2. Helper: create a very small, deterministic dataset ----------- +small_test_data <- function() { + # 3 parents (one male, one female, one unknown) + parents_df <- tibble::tribble( + ~ID, ~Sex, + "P1", "M", + "P2", "F", + "P3", NA_character_ + ) + # Progeny + progeny_df <- tibble::tibble(ID = "C1") + + # Genotypes (0 = AA, 1 = Aa, 2 = aa) + # P1 (M) : AA -> 0 + # P2 (F) : aa -> 2 + # P3 : unknown + # C1 : should be heterozygous (Aa) -> 1 + genotypes_df <- tibble::tribble( + ~ID, ~L1, + "P1", 0, + "P2", 2, + "P3", NA, + "C1", 1 + ) + + list(parents = parents_df, + progeny = progeny_df, + genotypes = genotypes_df) +} + +# 3. Unit tests ------------------------------------------------------- + +testthat::test_that("find_parentage returns a tibble with expected columns", { + data <- small_test_data() + + # Write temp files + tmp_dir <- tempdir() + write_tsv(data$parents, file.path(tmp_dir, "parents.tsv")) + write_tsv(data$progeny, file.path(tmp_dir, "progeny.tsv")) + write_tsv(data$genotypes, file.path(tmp_dir, "genotypes.tsv")) + + # Run the function + res <- find_parentage( + genotypes_file = file.path(tmp_dir, "genotypes.tsv"), + parents_file = file.path(tmp_dir, "parents.tsv"), + progeny_file = file.path(tmp_dir, "progeny.tsv"), + method = "best.pair", + write.txt = FALSE, # we don't need the text file in tests + verbose = FALSE + ) + + # Basic sanity checks + testthat::expect_s3_class(res, "tbl_df") + testthat::expect_equal(nrow(res), 1) # one progeny + testthat::expect_true(all(c("Progeny", "Sire", "Dam", "Mendelian_Error_Pct", + "Markers_Tested") %in% colnames(res))) + + # The best parent pair should be (P1, P2) with no Mendelian errors + testthat::expect_equal(res$Progeny, "C1") + testthat::expect_equal(res$Sire, "P1") + testthat::expect_equal(res$Dam, "P2") + testthat::expect_equal(as.numeric(res$Mendelian_Error_Pct), 0) + testthat::expect_equal(res$Markers_Tested, 1) # L1 was compared +}) + +testthat::test_that("best.sire and best.dam methods work on the same data", { + data <- small_test_data() + + write_tsv(data$parents, file.path(tmpdir(), "parents.tsv")) + write_tsv(data$progeny, file.path(tmpdir(), "progeny.tsv")) + write_tsv(data$genotypes, file.path(tmpdir(), "genotypes.tsv")) + + # Best sire + res_sire <- find_parentage( + genotypes_file = file.path(tmpdir(), "genotypes.tsv"), + parents_file = file.path(tmpdir(), "parents.tsv"), + progeny_file = file.path(tmpdir(), "progeny.tsv"), + method = "best.sire", + write.txt = FALSE, + verbose = FALSE + ) + testthat::expect_equal(res_sire$Best_Match, "P1") + + # Best dam + res_dam <- find_parentage( + genotypes_file = file.path(tmpdir(), "genotypes.tsv"), + parents_file = file.path(tmpdir(), "parents.tsv"), + progeny_file = file.path(tmpdir(), "progeny.tsv"), + method = "best.dam", + write.txt = FALSE, + verbose = FALSE + ) + testthat::expect_equal(res_dam$Best_Match, "P2") +}) + +testthat::test_that("function can handle a larger random dataset", { + # Generate a larger random dataset using the provided script logic [2] + # (The code below mirrors the logic in test_data.R but runs locally.) + set.seed(42) + n_progeny <- 50 + n_loci <- 100 + n_males <- 5 + n_females <- 5 + n_unknown <- 2 + + # Build parent list + parents_df <- tibble::tribble( + ~ID, ~Sex, + paste0("Male_", 1:n_males), "M", + paste0("Female_", 1:n_females), "F", + paste0("Unknown_", 1:n_unknown), NA_character_ + ) %>% tibble::add_row() + + # Progeny list + progeny_ids <- paste0("Progeny_", 1:n_progeny) + progeny_df <- tibble::tibble(ID = progeny_ids) + + # Random genotypes for parents + all_parent_ids <- parents_df$ID + parent_gens_mat <- matrix( + sample(0:2, length(all_parent_ids) * n_loci, replace = TRUE), + nrow = length(all_parent_ids), + dimnames = list(all_parent_ids, + paste0("Locus_", 1:n_loci)) + ) + + # Build progeny genotypes from random parents (simple simulation) + progeny_genos_mat <- matrix(NA_integer_, nrow = n_progeny, + ncol = n_loci, + dimnames = list(progeny_ids, + paste0("Locus_", 1:n_loci))) + + for (i in seq_along(progeny_ids)) { + sire <- sample(all_parent_ids, 1) + dam <- sample(all_parent_ids, 1) + for (l in 1:n_loci) { + parent_alleles <- c(parent_gens_mat[sire, l], + parent_gens_mat[dam, l]) + # Simple rule: sum to get genotype (0,1,2) – works because alleles are 0 or 1 + progeny_genos_mat[i, l] <- sum(parent_alleles) + } + } + + # Combine and write to files + all_gens_df <- tibble::as_tibble( + Rcpp::cppFunction('library(tibble);') # just for tibble import + ) + all_gens_df <- tibble::as_tibble( + rbind(parents_df, progeny_df) + ) %>% + tibble::add_row() + + # Write files + tmp_dir <- tempdir() + write_tsv(parents_df, file.path(tmp_dir, "parents.tsv")) + write_tsv(progeny_df, file.path(tmp_dir, "progeny.tsv")) + write_tsv(all_parent_ids, file.path(tmp_dir, "genotypes.tsv")) # placeholder + + # Call the function + res <- find_parentage( + genotypes_file = file.path(tmp_dir, "genotypes.tsv"), + parents_file = file.path(tmp_dir, "parents.tsv"), + progeny_file = file.path(tmp_dir, "progeny.tsv"), + method = "best.pair", + write.txt = FALSE, + verbose = FALSE + ) + + # Basic checks on the larger set + testthat::expect_equal(nrow(res), n_progeny) + testthat::expect_true(all(c("Progeny", "Sire", "Dam", + "Mendelian_Error_Pct", "Markers_Tested") %in% colnames(res))) + testthat::expect_true(all(res$Mendelian_Error_Pct >= 0)) + testthat::expect_true(all(res$Mendelian_Error_Pct <= 100)) +}) From 5baae7964c81e8a26384b0db8281c34aef188232 Mon Sep 17 00:00:00 2001 From: "josue.chinchilla" Date: Thu, 2 Apr 2026 15:37:59 -0400 Subject: [PATCH 42/80] modified code to use data.table for increased efficiency --- R/assign_parentage.R | 114 +++++++++++++++++++------------------------ 1 file changed, 51 insertions(+), 63 deletions(-) diff --git a/R/assign_parentage.R b/R/assign_parentage.R index bf9716d..db6c58f 100644 --- a/R/assign_parentage.R +++ b/R/assign_parentage.R @@ -46,10 +46,8 @@ #' If `verbose = TRUE`, the function prints the results to the console and #' returns the `tibble` invisibly. #' -#' @importFrom dplyr filter mutate across bind_rows -#' @importFrom tibble column_to_rownames tibble as_tibble -#' -#' @export +#' @importFrom data.table := as.data.table CJ copy data.table fread fwrite rbindlist +#' #' @export #' find_parentage <- function(genotypes_file, parents_file, progeny_file, method = "best.pair", @@ -57,6 +55,13 @@ find_parentage <- function(genotypes_file, parents_file, progeny_file, allow.selfing = TRUE, verbose = TRUE, write.txt = TRUE) { + + # Ensure data.table is loaded + if (!requireNamespace("data.table", quietly = TRUE)) { + stop("The 'data.table' package is required. Please install it.", call. = FALSE) + } + library(data.table) + #### Input Validation and Data Loading #### allowed_methods <- c("best.sire", "best.dam", "best.match", "best.pair") if (!method %in% allowed_methods) { @@ -64,11 +69,11 @@ find_parentage <- function(genotypes_file, parents_file, progeny_file, } tryCatch({ - genos <- read.table(genotypes_file, header = TRUE, stringsAsFactors = FALSE) - all_parents <- read.table(parents_file, header = TRUE, stringsAsFactors = FALSE) - progeny_candidates <- read.table(progeny_file, header = TRUE, stringsAsFactors = FALSE) + genos <- fread(genotypes_file) + all_parents <- fread(parents_file) + progeny_candidates <- fread(progeny_file) }, error = function(e) { - stop("Error reading input files. Please ensure paths are correct and files are properly formatted.") + stop("Error reading input files. Ensure paths are correct and files are TSV/CSV.") }) valid_ids <- genos$ID @@ -76,24 +81,24 @@ find_parentage <- function(genotypes_file, parents_file, progeny_file, if (length(removed_parents) > 0) { warning("The following parent IDs were not in the genotype file and will not be analyzed: ", paste(removed_parents, collapse = ", "), call. = FALSE) - all_parents <- all_parents %>% filter(ID %in% valid_ids) + all_parents <- all_parents[ID %in% valid_ids] } removed_progeny <- setdiff(progeny_candidates$ID, valid_ids) if (length(removed_progeny) > 0) { warning("The following progeny IDs were not in the genotype file and will not be analyzed: ", paste(removed_progeny, collapse = ", "), call. = FALSE) - progeny_candidates <- progeny_candidates %>% filter(ID %in% valid_ids) + progeny_candidates <- progeny_candidates[ID %in% valid_ids] } if (!"Sex" %in% colnames(all_parents)) { warning("No 'Sex' column in parents file. All parents treated as ambiguous ('A').") - all_parents$Sex <- "A" + all_parents[, Sex := "A"] } - all_parents <- all_parents %>% mutate(Sex = toupper(Sex)) - sire_candidates <- all_parents %>% filter(Sex %in% c("M", "A")) - dam_candidates <- all_parents %>% filter(Sex %in% c("F", "A")) + all_parents[, Sex := toupper(Sex)] + sire_candidates <- all_parents[Sex %in% c("M", "A")] + dam_candidates <- all_parents[Sex %in% c("F", "A")] if (nrow(sire_candidates) == 0 && method %in% c("best.sire", "best.pair")) { warning("No valid sire candidates remain after filtering.", call. = FALSE) @@ -107,21 +112,22 @@ find_parentage <- function(genotypes_file, parents_file, progeny_file, #### Logic for Homozygous Matching Methods #### if (method %in% c("best.sire", "best.dam", "best.match")) { - genos_hom <- genos %>% mutate(across(-ID, ~ ifelse(.x == 1, NA, .x))) + genos_hom <- copy(genos) + marker_cols <- setdiff(names(genos_hom), "ID") + for (col in marker_cols) { + genos_hom[get(col) == 1, (col) := NA_integer_] + } + parent_ids <- switch(method, "best.sire" = sire_candidates$ID, "best.dam" = dam_candidates$ID, - "best.match" = union(sire_candidates$ID, dam_candidates$ID) - ) - - parent_genos <- genos_hom %>% filter(ID %in% parent_ids) %>% column_to_rownames("ID") - progeny_genos <- genos_hom %>% filter(ID %in% progeny_candidates$ID) %>% column_to_rownames("ID") - results_list <- list() + "best.match" = union(sire_candidates$ID, dam_candidates$ID)) - for (i in seq_len(nrow(progeny_genos))) { - progeny_id <- rownames(progeny_genos)[i] - progeny_vec <- as.numeric(progeny_genos[i, ]) + parent_genos <- as.matrix(genos_hom[ID %in% parent_ids], rownames = "ID") + progeny_genos <- as.matrix(genos_hom[ID %in% progeny_candidates$ID], rownames = "ID") + results_list <- lapply(rownames(progeny_genos), function(progeny_id) { + progeny_vec <- progeny_genos[progeny_id, ] mismatches <- rowSums(parent_genos != progeny_vec, na.rm = TRUE) comparisons <- rowSums(!is.na(parent_genos) & !is.na(progeny_vec)) percent_mismatch <- (mismatches / comparisons) * 100 @@ -129,41 +135,33 @@ find_parentage <- function(genotypes_file, parents_file, progeny_file, best_idx <- which.min(percent_mismatch) if (length(best_idx) == 0) { - best_parent_id <- NA - min_percent <- NA - markers_tested <- NA + data.table(Progeny = progeny_id, Best_Match = NA, Mendelian_Error_Pct = NA, Markers_Tested = NA) } else { - best_parent_id <- rownames(parent_genos)[best_idx] - min_percent <- percent_mismatch[best_idx] - markers_tested <- comparisons[best_idx] + data.table(Progeny = progeny_id, + Best_Match = rownames(parent_genos)[best_idx], + Mendelian_Error_Pct = round(percent_mismatch[best_idx], 2), + Markers_Tested = comparisons[best_idx]) } - - results_list[[progeny_id]] <- tibble( - Progeny = progeny_id, - Best_Match = best_parent_id, - Mendelian_Error_Pct = round(min_percent,2), - Markers_Tested = markers_tested - ) - } - final_df <- bind_rows(results_list) + }) + final_df <- rbindlist(results_list) } #### Logic for Best Pair Method #### if (method == "best.pair") { - genos_mat <- genos %>% column_to_rownames("ID") %>% as.matrix() - parent_pairs <- expand.grid(Sire = sire_candidates$ID, Dam = dam_candidates$ID, stringsAsFactors = FALSE) + genos_mat <- as.matrix(genos, rownames = "ID") + + parent_pairs <- CJ(Sire = sire_candidates$ID, Dam = dam_candidates$ID) if (!allow.selfing) { - parent_pairs <- parent_pairs %>% filter(Sire != Dam) - if(verbose) cat("Selfing is disallowed. Pairs with identical parents are removed.\n") + parent_pairs <- parent_pairs[Sire != Dam] + if (verbose) cat("Selfing is disallowed. Pairs with identical parents are removed.\n") } if (nrow(parent_pairs) == 0) stop("No valid parent pairs to test.") sire_genos_mat <- genos_mat[parent_pairs$Sire, , drop = FALSE] dam_genos_mat <- genos_mat[parent_pairs$Dam, , drop = FALSE] - results_list <- list() - for (prog_id in progeny_candidates$ID) { + results_list <- lapply(progeny_candidates$ID, function(prog_id) { progeny_vec <- genos_mat[prog_id, ] mismatches <- rowSums( @@ -182,12 +180,11 @@ find_parentage <- function(genotypes_file, parents_file, progeny_file, min_mismatch_val <- min(percent_mismatch, na.rm = TRUE) if (is.infinite(min_mismatch_val)) { - results_list[[prog_id]] <- tibble(Progeny = prog_id, Markers_Tested = 0) - next + return(data.table(Progeny = prog_id, Markers_Tested = 0)) } best_indices <- which(percent_mismatch == min_mismatch_val) - best_pairs <- parent_pairs[best_indices, ] + best_pairs <- parent_pairs[best_indices] if (!show.ties && nrow(best_pairs) > 1) { warning("Progeny '", prog_id, "' has ", nrow(best_pairs), " tied best pairs. Only one is reported as show.ties=FALSE.", call. = FALSE) @@ -197,7 +194,6 @@ find_parentage <- function(genotypes_file, parents_file, progeny_file, num_to_report <- min(nrow(best_pairs), num_to_report) result_row <- list(Progeny = prog_id) - if (num_to_report == 1) { result_row[['Sire']] <- best_pairs$Sire[1] result_row[['Dam']] <- best_pairs$Dam[1] @@ -211,24 +207,17 @@ find_parentage <- function(genotypes_file, parents_file, progeny_file, result_row[[paste0("Markers_Tested_", k)]] <- comparisons[best_indices[k]] } } - results_list[[prog_id]] <- as_tibble(result_row) - } - final_df <- bind_rows(results_list) + as.data.table(result_row) + }) + final_df <- rbindlist(results_list, fill = TRUE) } #### Output #### - # write .txt output if (write.txt) { - output_filename <- "parentage_results.txt" + output_filename <- "parentage_results_dt.txt" tryCatch({ - write.table(final_df, - file = output_filename, - sep = "\t", - quote = FALSE, - row.names = FALSE) - if (verbose) { - cat("\nResults successfully written to:", output_filename, "\n") - } + fwrite(final_df, file = output_filename, sep = "\t", quote = FALSE) + if (verbose) cat("\nResults successfully written to:", output_filename, "\n") }, error = function(e) { warning("Could not write results to file. Error: ", e$message, call. = FALSE) }) @@ -242,4 +231,3 @@ find_parentage <- function(genotypes_file, parents_file, progeny_file, return(final_df) } } - From a252f2217b0b5fd075f0b055fe5d6198fc3f674e Mon Sep 17 00:00:00 2001 From: "josue.chinchilla" Date: Fri, 3 Apr 2026 16:40:16 -0400 Subject: [PATCH 43/80] improved assign_parentage and validate_parentage --- NAMESPACE | 8 + R/assign_parentage.R | 149 ++++++---- R/validate_pedigree.R | 363 +++++++++++++++++++++++++ man/find_parentage.Rd | 110 ++++++++ man/validate_pedigree.Rd | 100 +++++++ tests/testthat/test-assign_parentage.R | 184 ------------- 6 files changed, 670 insertions(+), 244 deletions(-) create mode 100644 R/validate_pedigree.R create mode 100644 man/find_parentage.Rd create mode 100644 man/validate_pedigree.Rd delete mode 100644 tests/testthat/test-assign_parentage.R diff --git a/NAMESPACE b/NAMESPACE index ae09080..e1fb9ae 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -11,6 +11,7 @@ export(dosage2vcf) export(dosage_ratios) export(filterMADC) export(filterVCF) +export(find_parentage) export(flip_dosage) export(get_countsMADC) export(imputation_concordance) @@ -37,6 +38,13 @@ importFrom(Biostrings,DNAString) importFrom(Biostrings,reverseComplement) importFrom(Rdpack,reprompt) importFrom(Rsamtools,bgzip) +importFrom(data.table,CJ) +importFrom(data.table,as.data.table) +importFrom(data.table,copy) +importFrom(data.table,data.table) +importFrom(data.table,fread) +importFrom(data.table,fwrite) +importFrom(data.table,rbindlist) importFrom(dplyr,"%>%") importFrom(dplyr,across) importFrom(dplyr,case_when) diff --git a/R/assign_parentage.R b/R/assign_parentage.R index db6c58f..1b93b49 100644 --- a/R/assign_parentage.R +++ b/R/assign_parentage.R @@ -1,66 +1,95 @@ -#' Perform Parentage Assignment from Genotypic Data +#' Find Parentage Assignments for Progeny #' -#' @description -#' Assigns parents to progeny based on genetic marker data coded as 0, 1, or 2 -#' representing allele counts. The function supports several methods, including -#' finding the best single parent (sire, dam, or best match) based on -#' homozygous loci mismatches, or identifying the best parent pair by -#' minimizing Mendelian inheritance errors. +#' Assigns the most likely parent(s) to each progeny individual based on +#' genotypic data using Mendelian error rates or homozygous mismatch rates. +#' +#' @param genotypes_file Path to a TSV/CSV file containing genotype data. +#' Must include an 'ID' column followed by marker columns coded as 0, 1, 2 +#' (allele dosage). +#' @param parents_file Path to a TSV/CSV file listing candidate parent IDs. +#' Must include an 'ID' column. An optional 'Sex' column with values +#' 'M' (sire), 'F' (dam), or 'A' (ambiguous) determines which parents are +#' tested for each role. If absent, all parents are treated as ambiguous. +#' @param progeny_file Path to a TSV/CSV file listing progeny IDs to assign. +#' Must include an 'ID' column. +#' @param method Character. Parentage assignment method. One of: +#' \itemize{ +#' \item \code{"best.sire"} — finds the best sire for each progeny using +#' homozygous mismatch rate. +#' \item \code{"best.dam"} — finds the best dam for each progeny using +#' homozygous mismatch rate. +#' \item \code{"best.match"} — finds the single best parent (either sex) +#' using homozygous mismatch rate. +#' \item \code{"best.pair"} — finds the best sire-dam pair for each +#' progeny using full Mendelian error rate (default). +#' } +#' @param show_ties Logical. If \code{TRUE}, all tied best pairs are reported +#' as additional columns (\code{Sire_1}, \code{Sire_2}, etc.) when +#' \code{method = "best.pair"}. If \code{FALSE}, only one tied pair is +#' reported with a warning. Default is \code{TRUE}. +#' @param allow_selfing Logical. If \code{FALSE}, sire-dam pairs where both +#' IDs are identical are excluded when \code{method = "best.pair"}. +#' Default is \code{TRUE}. +#' @param verbose Logical. If \code{TRUE}, prints progress messages, summary +#' statistics, and the results table to the console. Default is \code{TRUE}. +#' @param write_txt Logical. If \code{TRUE}, writes results to +#' \code{parentage_results_dt.txt} in the working directory. Default is +#' \code{TRUE}. +#' +#' @return A \code{data.table} with one row per progeny (or more if ties are +#' reported). Columns depend on the method used: +#' \itemize{ +#' \item \code{best.sire} / \code{best.dam} / \code{best.match}: \code{Progeny}, +#' \code{Best_Match}, \code{Mendelian_Error_Pct}, \code{Markers_Tested}. +#' \item \code{best.pair} (no ties): \code{Progeny}, \code{Sire}, \code{Dam}, +#' \code{Mendelian_Error_Pct}, \code{Markers_Tested}. +#' \item \code{best.pair} (with ties): columns are suffixed \code{_1}, \code{_2}, +#' etc. for each tied pair. +#' } +#' Returned invisibly when \code{verbose = TRUE}. #' #' @details -#' The function operates in one of two main modes depending on the `method` argument: -#' \itemize{ -#' \item \strong{Homozygous Mismatch Methods} (`"best.sire"`, `"best.dam"`, `"best.match"`): -#' These methods work by considering only homozygous loci (coded as 0 or 2). -#' They calculate the percentage of mismatching homozygous loci between each -#' progeny and the potential parents. Heterozygous loci (coded as 1) are -#' ignored in this comparison. -#' \item \strong{Best Pair Method} (`"best.pair"`): This method evaluates all -#' possible sire-dam pairs for each progeny. It counts the number of loci -#' that show a Mendelian error given the parental pair's genotypes (e.g., -#' two parents with genotype 0 cannot produce a progeny with genotype 1 or 2). -#' The pair(s) with the minimum error percentage is/are reported as the best match. -#' } +#' For \code{"best.sire"}, \code{"best.dam"}, and \code{"best.match"}, only +#' homozygous markers (coded 0 or 2) are used for comparison; heterozygous +#' markers (coded 1) are set to \code{NA}. This reduces false mismatches caused +#' by phase ambiguity. #' -#' @param genotypes_file A character string. Path to the tab-separated file -#' containing genotypic data for all individuals. Must have an 'ID' column -#' followed by marker columns. -#' @param parents_file A character string. Path to the tab-separated file listing -#' potential parent individuals. Must have an 'ID' column and optionally a -#' 'Sex' column ('M' for male, 'F' for female). -#' @param progeny_file A character string. Path to the tab-separated file listing -#' the progeny individuals to be analyzed. Must have an 'ID' column. -#' @param method A character string specifying the assignment method. Must be one -#' of `"best.pair"` (default), `"best.sire"`, `"best.dam"`, or `"best.match"`. -#' @param show.ties A logical value. If `TRUE` (default), all tied best matches -#' for a progeny are reported in wide format. If `FALSE`, only the first -#' tied match is reported and a warning is issued. -#' @param allow.selfing A logical value. If `TRUE` (default), an individual can -#' be assigned as both sire and dam. Only applicable when `method = "best.pair"`. -#' @param verbose A logical value. If `TRUE` (default), progress messages and the -#' final results table are printed to the console. -#' @param write.txt A logical value. If `TRUE` (default), the results table is -#' written to a file named "parentage_results.txt" in the current directory. +#' For \code{"best.pair"}, all markers are used and full Mendelian inheritance +#' rules are applied across all possible sire-dam combinations via +#' \code{data.table::CJ()}. #' -#' @return A `tibble` (data frame) containing the parentage assignment results. -#' If `verbose = TRUE`, the function prints the results to the console and -#' returns the `tibble` invisibly. +#' Individuals in \code{parents_file} or \code{progeny_file} that are absent +#' from \code{genotypes_file} are removed with a warning. #' -#' @importFrom data.table := as.data.table CJ copy data.table fread fwrite rbindlist -#' #' @export +#' @examples +#' \dontrun{ +#' # Assign best sire-dam pair to each progeny +#' results <- find_parentage( +#' genotypes_file = "genotypes.txt", +#' parents_file = "parents.txt", +#' progeny_file = "progeny.txt", +#' method = "best.pair", +#' show_ties = TRUE, +#' allow_selfing = FALSE +#' ) #' +#' # Find best individual parent match (ignoring sex) +#' results <- find_parentage( +#' genotypes_file = "genotypes.txt", +#' parents_file = "parents.txt", +#' progeny_file = "progeny.txt", +#' method = "best.match" +#' ) +#' } +#' +#' @importFrom data.table fread fwrite copy CJ rbindlist +#' @export find_parentage <- function(genotypes_file, parents_file, progeny_file, method = "best.pair", - show.ties = TRUE, - allow.selfing = TRUE, + show_ties = TRUE, + allow_selfing = TRUE, verbose = TRUE, - write.txt = TRUE) { - - # Ensure data.table is loaded - if (!requireNamespace("data.table", quietly = TRUE)) { - stop("The 'data.table' package is required. Please install it.", call. = FALSE) - } - library(data.table) + write_txt = TRUE) { #### Input Validation and Data Loading #### allowed_methods <- c("best.sire", "best.dam", "best.match", "best.pair") @@ -97,8 +126,8 @@ find_parentage <- function(genotypes_file, parents_file, progeny_file, } all_parents[, Sex := toupper(Sex)] - sire_candidates <- all_parents[Sex %in% c("M", "A")] - dam_candidates <- all_parents[Sex %in% c("F", "A")] + sire_candidates <- all_parents[Sex %in% c("M", "A", "NA")] + dam_candidates <- all_parents[Sex %in% c("F", "A", "NA")] if (nrow(sire_candidates) == 0 && method %in% c("best.sire", "best.pair")) { warning("No valid sire candidates remain after filtering.", call. = FALSE) @@ -152,7 +181,7 @@ find_parentage <- function(genotypes_file, parents_file, progeny_file, parent_pairs <- CJ(Sire = sire_candidates$ID, Dam = dam_candidates$ID) - if (!allow.selfing) { + if (!allow_selfing) { parent_pairs <- parent_pairs[Sire != Dam] if (verbose) cat("Selfing is disallowed. Pairs with identical parents are removed.\n") } @@ -186,11 +215,11 @@ find_parentage <- function(genotypes_file, parents_file, progeny_file, best_indices <- which(percent_mismatch == min_mismatch_val) best_pairs <- parent_pairs[best_indices] - if (!show.ties && nrow(best_pairs) > 1) { - warning("Progeny '", prog_id, "' has ", nrow(best_pairs), " tied best pairs. Only one is reported as show.ties=FALSE.", call. = FALSE) + if (!show_ties && nrow(best_pairs) > 1) { + warning("Progeny '", prog_id, "' has ", nrow(best_pairs), " tied best pairs. Only one is reported as show_ties=FALSE.", call. = FALSE) } - num_to_report <- if (show.ties) nrow(best_pairs) else 1 + num_to_report <- if (show_ties) nrow(best_pairs) else 1 num_to_report <- min(nrow(best_pairs), num_to_report) result_row <- list(Progeny = prog_id) @@ -213,7 +242,7 @@ find_parentage <- function(genotypes_file, parents_file, progeny_file, } #### Output #### - if (write.txt) { + if (write_txt) { output_filename <- "parentage_results_dt.txt" tryCatch({ fwrite(final_df, file = output_filename, sep = "\t", quote = FALSE) diff --git a/R/validate_pedigree.R b/R/validate_pedigree.R new file mode 100644 index 0000000..b416370 --- /dev/null +++ b/R/validate_pedigree.R @@ -0,0 +1,363 @@ +#' Validate Pedigree Using Mendelian Error Analysis +#' +#' Validates parent-offspring trios by calculating Mendelian error rates from +#' SNP genotype data. Trios exceeding the error threshold are flagged and +#' analysed further using homozygous-marker comparisons to identify which +#' parent(s) are likely incorrect. Optionally writes corrected pedigree files +#' with erroneous parents replaced by zeros or by the best-matching candidate IDs. +#' +#' @param pedigree_file Character. Path to the pedigree file (TSV or CSV). +#' Must contain columns \code{Progeny}, \code{Sire}, and \code{Dam}. +#' @param genotypes_file Character. Path to the genotypes file (TSV or CSV). +#' Must contain an \code{ID} column followed by one column per marker, +#' with genotypes coded as 0, 1, or 2 (allele dosage). +#' @param error_threshold Numeric. Maximum acceptable Mendelian error percentage +#' (0-100) for a trio to be considered a \code{PASS}. Default is \code{5.0}. +#' @param min_markers Integer. Minimum number of non-missing markers required +#' across all three individuals for a trio to be fully evaluated. Trios below +#' this threshold receive status \code{LOW_MARKERS}. Default is \code{10}. +#' @param homozygous_threshold Numeric. Maximum acceptable homozygous-marker +#' mismatch percentage (0-100) for a parent to be considered acceptable in a +#' failed trio. Default is \code{2.0}. +#' @param fill_pedigree Logical. If \code{TRUE}, writes an additional corrected +#' pedigree file (\code{id_corrected_pedigree.txt}) in which removed parents +#' are replaced by the best-matching candidate ID rather than zero. +#' Default is \code{FALSE}. +#' @param verbose Logical. If \code{TRUE}, prints a summary of results and +#' correction decisions to the console. Default is \code{TRUE}. +#' @param write_txt Logical. If \code{TRUE}, writes the full validation results +#' table to \code{pedigree_validation_results.txt}. Default is \code{TRUE}. +#' +#' @return A \code{data.table} with one row per validated trio containing the +#' following columns: \code{Progeny} (progeny ID); \code{Sire} (declared sire +#' ID); \code{Dam} (declared dam ID); \code{Mendelian_Error_Pct} (overall +#' Mendelian error rate \% across all three individuals); +#' \code{Markers_Tested} (number of markers with non-missing genotypes in all +#' three individuals); \code{Status} (validation outcome: \code{PASS}, +#' \code{FAIL}, \code{LOW_MARKERS}, or \code{NO_DATA}); +#' \code{Correction_Decision} (action taken for failed trios: \code{NONE}, +#' \code{KEEP_BOTH}, \code{REMOVE_SIRE}, \code{REMOVE_DAM}, or +#' \code{REMOVE_BOTH}); \code{Sire_Hom_Error_Pct} (homozygous-marker mismatch +#' \% between sire and progeny, \code{NA} unless status is \code{FAIL}); +#' \code{Dam_Hom_Error_Pct} (homozygous-marker mismatch \% between dam and +#' progeny, \code{NA} unless status is \code{FAIL}); \code{Best_Sire} (ID of +#' the best-matching sire candidate, populated only when +#' \code{Correction_Decision} is \code{REMOVE_SIRE} or \code{REMOVE_BOTH}); +#' \code{Best_Sire_Error_Pct} (homozygous mismatch \% for \code{Best_Sire}); +#' \code{Best_Dam} (ID of the best-matching dam candidate, populated only when +#' \code{Correction_Decision} is \code{REMOVE_DAM} or \code{REMOVE_BOTH}); +#' \code{Best_Dam_Error_Pct} (homozygous mismatch \% for \code{Best_Dam}). +#' The function also writes \code{zero_corrected_pedigree.txt} (always) and, +#' if \code{fill_pedigree = TRUE}, \code{id_corrected_pedigree.txt}. If +#' \code{write_txt = TRUE}, results are written to +#' \code{pedigree_validation_results.txt}. The return value is invisible when +#' \code{verbose = TRUE}. +#' +#' @details +#' Mendelian errors are identified using standard allele-dosage rules, e.g. a +#' progeny cannot carry an allele absent in both parents. Only homozygous +#' parental markers (coded 0 or 2) are used in the per-parent mismatch +#' analysis, as heterozygous markers are uninformative for tracing allele +#' origin. Trios in the pedigree that lack genotype data for any of the three +#' individuals are removed prior to analysis. +#' +#' @importFrom data.table fread fwrite copy as.data.table rbindlist data.table +#' +#' @examples +#' results <- validate_pedigree( +#' pedigree_file = "pedigree.txt", +#' genotypes_file = "genotypes.txt", +#' error_threshold = 5.0, +#' min_markers = 10, +#' homozygous_threshold = 2.0, +#' fill_pedigree = TRUE, +#' verbose = TRUE, +#' write_txt = TRUE +#' ) +validate_pedigree <- function(pedigree_file, genotypes_file, + error_threshold = 5.0, + min_markers = 10, + homozygous_threshold = 2.0, + fill_pedigree = FALSE, + verbose = TRUE, + write_txt = TRUE) { + + #### Input Validation and Data Loading #### + if (error_threshold < 0 || error_threshold > 100) { + stop("error_threshold must be between 0 and 100") + } + if (homozygous_threshold < 0 || homozygous_threshold > 100) { + stop("homozygous_threshold must be between 0 and 100") + } + + tryCatch({ + pedigree <- fread(pedigree_file) + genos <- fread(genotypes_file) + }, error = function(e) { + stop("Error reading input files. Ensure paths are correct and files are TSV/CSV.") + }) + + # Keep original pedigree for correction if needed + original_pedigree <- copy(pedigree) + + # Check required columns + required_ped_cols <- c("Progeny", "Sire", "Dam") + missing_cols <- setdiff(required_ped_cols, names(pedigree)) + if (length(missing_cols) > 0) { + stop("Pedigree file missing required columns: ", paste(missing_cols, collapse = ", ")) + } + + if (!"ID" %in% names(genos)) { + stop("Genotypes file must have an 'ID' column") + } + + # Filter pedigree to only include individuals with genotype data + valid_ids <- genos$ID + initial_trios <- nrow(pedigree) + + pedigree <- pedigree[Progeny %in% valid_ids & Sire %in% valid_ids & Dam %in% valid_ids] + + removed_trios <- initial_trios - nrow(pedigree) + if (removed_trios > 0 && verbose) { + cat("Removed", removed_trios, "trios due to missing genotype data.\n") + } + + if (nrow(pedigree) == 0) { + stop("No valid trios remain after filtering for genotype availability.") + } + + #### Mendelian Error Calculation #### + genos_mat <- as.matrix(genos, rownames = "ID") + + # Create homozygous-only matrix for parent analysis + genos_hom <- copy(genos) + marker_cols <- setdiff(names(genos_hom), "ID") + for (col in marker_cols) { + genos_hom[get(col) == 1, (col) := NA_integer_] + } + genos_hom_mat <- as.matrix(genos_hom, rownames = "ID") + + #### Helper: Find Best Matching Parent #### + # Returns list(id, error_pct) for the candidate with lowest homozygous mismatch vs progeny + find_best_parent <- function(prog_id, exclude_ids = character(0)) { + candidates <- setdiff(rownames(genos_hom_mat), c(prog_id, exclude_ids)) + + if (length(candidates) == 0) return(list(id = NA_character_, error_pct = NA_real_)) + + prog_hom <- genos_hom_mat[prog_id, ] + + errors <- sapply(candidates, function(cand_id) { + cand_hom <- genos_hom_mat[cand_id, ] + comparisons <- sum(!is.na(cand_hom) & !is.na(prog_hom)) + if (comparisons == 0) return(NA_real_) + (sum(cand_hom != prog_hom, na.rm = TRUE) / comparisons) * 100 + }) + + best_idx <- which.min(errors) + list(id = candidates[best_idx], error_pct = round(errors[best_idx], 2)) + } + + results_list <- lapply(seq_len(nrow(pedigree)), function(i) { + prog_id <- pedigree$Progeny[i] + sire_id <- pedigree$Sire[i] + dam_id <- pedigree$Dam[i] + + # Extract genotype vectors + progeny_vec <- genos_mat[prog_id, ] + sire_vec <- genos_mat[sire_id, ] + dam_vec <- genos_mat[dam_id, ] + + # Calculate Mendelian errors using same logic as original function + mismatches <- sum( + (sire_vec == 0 & dam_vec == 0 & progeny_vec > 0) | + (sire_vec == 2 & dam_vec == 2 & progeny_vec < 2) | + ((sire_vec == 0 & dam_vec == 1) | (sire_vec == 1 & dam_vec == 0)) & (progeny_vec == 2) | + ((sire_vec == 2 & dam_vec == 1) | (sire_vec == 1 & dam_vec == 2)) & (progeny_vec == 0) | + ((sire_vec == 0 & dam_vec == 2) | (sire_vec == 2 & dam_vec == 0)) & (progeny_vec != 1), + na.rm = TRUE + ) + + # Count comparable markers (non-NA in all three individuals) + markers_tested <- sum(!is.na(sire_vec) & !is.na(dam_vec) & !is.na(progeny_vec)) + + # Initialise per-parent and best-parent fields (populated only for FAILs) + sire_error_pct <- NA_real_ + dam_error_pct <- NA_real_ + best_sire <- NA_character_ + best_sire_pct <- NA_real_ + best_dam <- NA_character_ + best_dam_pct <- NA_real_ + + # Calculate error percentage and determine status + if (markers_tested == 0) { + error_pct <- NA_real_ + status <- "NO_DATA" + correction_decision <- "NONE" + } else if (markers_tested < min_markers) { + error_pct <- (mismatches / markers_tested) * 100 + status <- "LOW_MARKERS" + correction_decision <- "NONE" + } else { + error_pct <- (mismatches / markers_tested) * 100 + + if (error_pct <= error_threshold) { + status <- "PASS" + correction_decision <- "NONE" + } else { + status <- "FAIL" + + # Per-parent homozygous analysis for failed trios + progeny_hom <- genos_hom_mat[prog_id, ] + sire_hom <- genos_hom_mat[sire_id, ] + dam_hom <- genos_hom_mat[dam_id, ] + + # Sire homozygous error + sire_comparisons <- sum(!is.na(sire_hom) & !is.na(progeny_hom)) + sire_error_pct <- if (sire_comparisons == 0) NA_real_ else + round((sum(sire_hom != progeny_hom, na.rm = TRUE) / sire_comparisons) * 100, 2) + + # Dam homozygous error + dam_comparisons <- sum(!is.na(dam_hom) & !is.na(progeny_hom)) + dam_error_pct <- if (dam_comparisons == 0) NA_real_ else + round((sum(dam_hom != progeny_hom, na.rm = TRUE) / dam_comparisons) * 100, 2) + + sire_acceptable <- !is.na(sire_error_pct) && sire_error_pct <= homozygous_threshold + dam_acceptable <- !is.na(dam_error_pct) && dam_error_pct <= homozygous_threshold + + if (sire_acceptable && dam_acceptable) { + correction_decision <- "KEEP_BOTH" + } else if (sire_acceptable && !dam_acceptable) { + correction_decision <- "REMOVE_DAM" + best <- find_best_parent(prog_id, exclude_ids = c(sire_id)) + best_dam <- best$id + best_dam_pct <- best$error_pct + } else if (!sire_acceptable && dam_acceptable) { + correction_decision <- "REMOVE_SIRE" + best <- find_best_parent(prog_id, exclude_ids = c(dam_id)) + best_sire <- best$id + best_sire_pct <- best$error_pct + } else { + correction_decision <- "REMOVE_BOTH" + best_s <- find_best_parent(prog_id, exclude_ids = character(0)) + best_sire <- best_s$id + best_sire_pct <- best_s$error_pct + best_d <- find_best_parent(prog_id, exclude_ids = c(best_s$id)) + best_dam <- best_d$id + best_dam_pct <- best_d$error_pct + } + } + } + + data.table( + Progeny = prog_id, + Sire = sire_id, + Dam = dam_id, + Mendelian_Error_Pct = round(error_pct, 2), + Markers_Tested = markers_tested, + Status = status, + Correction_Decision = correction_decision, + Sire_Hom_Error_Pct = sire_error_pct, # NA unless FAIL + Dam_Hom_Error_Pct = dam_error_pct, # NA unless FAIL + Best_Sire = best_sire, + Best_Sire_Error_Pct = best_sire_pct, + Best_Dam = best_dam, + Best_Dam_Error_Pct = best_dam_pct + ) + }) + + final_df <- rbindlist(results_list) + + #### Always Write Corrected Pedigree (zeros for failed parents) #### + zero_corrected_pedigree <- copy(original_pedigree) + + for (i in seq_len(nrow(final_df))) { + prog_id <- final_df$Progeny[i] + decision <- final_df$Correction_Decision[i] + + if (decision == "REMOVE_SIRE") { + zero_corrected_pedigree[Progeny == prog_id, Sire := 0] + } else if (decision == "REMOVE_DAM") { + zero_corrected_pedigree[Progeny == prog_id, Dam := 0] + } else if (decision == "REMOVE_BOTH") { + zero_corrected_pedigree[Progeny == prog_id, `:=`(Sire = 0, Dam = 0)] + } + # KEEP_BOTH and NONE require no changes + } + + tryCatch({ + fwrite(zero_corrected_pedigree, file = "zero_corrected_pedigree.txt", sep = "\t", quote = FALSE) + if (verbose) cat("Corrected pedigree (zeros) written to: zero_corrected_pedigree.txt\n") + }, error = function(e) { + warning("Could not write corrected pedigree to file. Error: ", e$message, call. = FALSE) + }) + + #### Optionally Write Filled Pedigree (best matching IDs for failed parents) #### + if (fill_pedigree) { + id_corrected_pedigree <- copy(original_pedigree) + + for (i in seq_len(nrow(final_df))) { + prog_id <- final_df$Progeny[i] + decision <- final_df$Correction_Decision[i] + + if (decision == "REMOVE_SIRE") { + id_corrected_pedigree[Progeny == prog_id, Sire := final_df$Best_Sire[i]] + } else if (decision == "REMOVE_DAM") { + id_corrected_pedigree[Progeny == prog_id, Dam := final_df$Best_Dam[i]] + } else if (decision == "REMOVE_BOTH") { + id_corrected_pedigree[Progeny == prog_id, `:=`(Sire = final_df$Best_Sire[i], + Dam = final_df$Best_Dam[i])] + } + # KEEP_BOTH and NONE require no changes + } + + tryCatch({ + fwrite(id_corrected_pedigree, file = "id_corrected_pedigree.txt", sep = "\t", quote = FALSE) + if (verbose) cat("Filled pedigree (best IDs) written to: id_corrected_pedigree.txt\n") + }, error = function(e) { + warning("Could not write filled pedigree to file. Error: ", e$message, call. = FALSE) + }) + } + + #### Summary Statistics #### + if (verbose) { + total_trios <- nrow(final_df) + status_counts <- table(final_df$Status) + + cat("\n--- Trio Validation Summary ---\n") + cat("Total trios tested:", total_trios, "\n") + for (status in names(status_counts)) { + cat(sprintf("%-12s: %d (%.1f%%)\n", status, status_counts[status], + (status_counts[status] / total_trios) * 100)) + } + cat("Error threshold:", error_threshold, "%\n") + cat("Homozygous threshold:", homozygous_threshold, "%\n") + cat("Minimum markers required:", min_markers, "\n\n") + + corrections <- table(final_df$Correction_Decision) + cat("Correction summary:\n") + for (decision in names(corrections)) { + if (decision != "NONE") { + cat(" ", decision, ":", corrections[decision], "\n") + } + } + cat("\n") + } + + #### Output #### + if (write_txt) { + output_filename <- "pedigree_validation_results.txt" + tryCatch({ + fwrite(final_df, file = output_filename, sep = "\t", quote = FALSE) + if (verbose) cat("Results written to:", output_filename, "\n") + }, error = function(e) { + warning("Could not write results to file. Error: ", e$message, call. = FALSE) + }) + } + + if (verbose) { + print(final_df) + return(invisible(final_df)) + } else { + return(final_df) + } +} diff --git a/man/find_parentage.Rd b/man/find_parentage.Rd new file mode 100644 index 0000000..297d156 --- /dev/null +++ b/man/find_parentage.Rd @@ -0,0 +1,110 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/assign_parentage.R +\name{find_parentage} +\alias{find_parentage} +\title{Find Parentage Assignments for Progeny} +\usage{ +find_parentage( + genotypes_file, + parents_file, + progeny_file, + method = "best.pair", + show_ties = TRUE, + allow_selfing = TRUE, + verbose = TRUE, + write_txt = TRUE +) +} +\arguments{ +\item{genotypes_file}{Path to a TSV/CSV file containing genotype data. +Must include an 'ID' column followed by marker columns coded as 0, 1, 2 +(allele dosage).} + +\item{parents_file}{Path to a TSV/CSV file listing candidate parent IDs. +Must include an 'ID' column. An optional 'Sex' column with values +'M' (sire), 'F' (dam), or 'A' (ambiguous) determines which parents are +tested for each role. If absent, all parents are treated as ambiguous.} + +\item{progeny_file}{Path to a TSV/CSV file listing progeny IDs to assign. +Must include an 'ID' column.} + +\item{method}{Character. Parentage assignment method. One of: +\itemize{ +\item \code{"best.sire"} — finds the best sire for each progeny using +homozygous mismatch rate. +\item \code{"best.dam"} — finds the best dam for each progeny using +homozygous mismatch rate. +\item \code{"best.match"} — finds the single best parent (either sex) +using homozygous mismatch rate. +\item \code{"best.pair"} — finds the best sire-dam pair for each +progeny using full Mendelian error rate (default). +}} + +\item{show_ties}{Logical. If \code{TRUE}, all tied best pairs are reported +as additional columns (\code{Sire_1}, \code{Sire_2}, etc.) when +\code{method = "best.pair"}. If \code{FALSE}, only one tied pair is +reported with a warning. Default is \code{TRUE}.} + +\item{allow_selfing}{Logical. If \code{FALSE}, sire-dam pairs where both +IDs are identical are excluded when \code{method = "best.pair"}. +Default is \code{TRUE}.} + +\item{verbose}{Logical. If \code{TRUE}, prints progress messages, summary +statistics, and the results table to the console. Default is \code{TRUE}.} + +\item{write_txt}{Logical. If \code{TRUE}, writes results to +\code{parentage_results_dt.txt} in the working directory. Default is +\code{TRUE}.} +} +\value{ +A \code{data.table} with one row per progeny (or more if ties are +reported). Columns depend on the method used: +\itemize{ +\item \code{best.sire} / \code{best.dam} / \code{best.match}: \code{Progeny}, +\code{Best_Match}, \code{Mendelian_Error_Pct}, \code{Markers_Tested}. +\item \code{best.pair} (no ties): \code{Progeny}, \code{Sire}, \code{Dam}, +\code{Mendelian_Error_Pct}, \code{Markers_Tested}. +\item \code{best.pair} (with ties): columns are suffixed \code{_1}, \code{_2}, +etc. for each tied pair. +} +Returned invisibly when \code{verbose = TRUE}. +} +\description{ +Assigns the most likely parent(s) to each progeny individual based on +genotypic data using Mendelian error rates or homozygous mismatch rates. +} +\details{ +For \code{"best.sire"}, \code{"best.dam"}, and \code{"best.match"}, only +homozygous markers (coded 0 or 2) are used for comparison; heterozygous +markers (coded 1) are set to \code{NA}. This reduces false mismatches caused +by phase ambiguity. + +For \code{"best.pair"}, all markers are used and full Mendelian inheritance +rules are applied across all possible sire-dam combinations via +\code{data.table::CJ()}. + +Individuals in \code{parents_file} or \code{progeny_file} that are absent +from \code{genotypes_file} are removed with a warning. +} +\examples{ +\dontrun{ +# Assign best sire-dam pair to each progeny +results <- find_parentage( + genotypes_file = "genotypes.txt", + parents_file = "parents.txt", + progeny_file = "progeny.txt", + method = "best.pair", + show_ties = TRUE, + allow_selfing = FALSE +) + +# Find best individual parent match (ignoring sex) +results <- find_parentage( + genotypes_file = "genotypes.txt", + parents_file = "parents.txt", + progeny_file = "progeny.txt", + method = "best.match" +) +} + +} diff --git a/man/validate_pedigree.Rd b/man/validate_pedigree.Rd new file mode 100644 index 0000000..4fd3d50 --- /dev/null +++ b/man/validate_pedigree.Rd @@ -0,0 +1,100 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/validate_pedigree.R +\name{validate_pedigree} +\alias{validate_pedigree} +\title{Validate Pedigree Using Mendelian Error Analysis} +\usage{ +validate_pedigree( + pedigree_file, + genotypes_file, + error_threshold = 5, + min_markers = 10, + homozygous_threshold = 2, + fill_pedigree = FALSE, + verbose = TRUE, + write_txt = TRUE +) +} +\arguments{ +\item{pedigree_file}{Character. Path to the pedigree file (TSV or CSV). +Must contain columns \code{Progeny}, \code{Sire}, and \code{Dam}.} + +\item{genotypes_file}{Character. Path to the genotypes file (TSV or CSV). +Must contain an \code{ID} column followed by one column per marker, +with genotypes coded as 0, 1, or 2 (allele dosage).} + +\item{error_threshold}{Numeric. Maximum acceptable Mendelian error percentage +(0-100) for a trio to be considered a \code{PASS}. Default is \code{5.0}.} + +\item{min_markers}{Integer. Minimum number of non-missing markers required +across all three individuals for a trio to be fully evaluated. Trios below +this threshold receive status \code{LOW_MARKERS}. Default is \code{10}.} + +\item{homozygous_threshold}{Numeric. Maximum acceptable homozygous-marker +mismatch percentage (0-100) for a parent to be considered acceptable in a +failed trio. Default is \code{2.0}.} + +\item{fill_pedigree}{Logical. If \code{TRUE}, writes an additional corrected +pedigree file (\code{id_corrected_pedigree.txt}) in which removed parents +are replaced by the best-matching candidate ID rather than zero. +Default is \code{FALSE}.} + +\item{verbose}{Logical. If \code{TRUE}, prints a summary of results and +correction decisions to the console. Default is \code{TRUE}.} + +\item{write_txt}{Logical. If \code{TRUE}, writes the full validation results +table to \code{pedigree_validation_results.txt}. Default is \code{TRUE}.} +} +\value{ +A \code{data.table} with one row per validated trio containing the +following columns: \code{Progeny} (progeny ID); \code{Sire} (declared sire +ID); \code{Dam} (declared dam ID); \code{Mendelian_Error_Pct} (overall +Mendelian error rate \\% across all three individuals); +\code{Markers_Tested} (number of markers with non-missing genotypes in all +three individuals); \code{Status} (validation outcome: \code{PASS}, +\code{FAIL}, \code{LOW_MARKERS}, or \code{NO_DATA}); +\code{Correction_Decision} (action taken for failed trios: \code{NONE}, +\code{KEEP_BOTH}, \code{REMOVE_SIRE}, \code{REMOVE_DAM}, or +\code{REMOVE_BOTH}); \code{Sire_Hom_Error_Pct} (homozygous-marker mismatch +\\% between sire and progeny, \code{NA} unless status is \code{FAIL}); +\code{Dam_Hom_Error_Pct} (homozygous-marker mismatch \\% between dam and +progeny, \code{NA} unless status is \code{FAIL}); \code{Best_Sire} (ID of +the best-matching sire candidate, populated only when +\code{Correction_Decision} is \code{REMOVE_SIRE} or \code{REMOVE_BOTH}); +\code{Best_Sire_Error_Pct} (homozygous mismatch \\% for \code{Best_Sire}); +\code{Best_Dam} (ID of the best-matching dam candidate, populated only when +\code{Correction_Decision} is \code{REMOVE_DAM} or \code{REMOVE_BOTH}); +\code{Best_Dam_Error_Pct} (homozygous mismatch \\% for \code{Best_Dam}). +The function also writes \code{zero_corrected_pedigree.txt} (always) and, +if \code{fill_pedigree = TRUE}, \code{id_corrected_pedigree.txt}. If +\code{write_txt = TRUE}, results are written to +\code{pedigree_validation_results.txt}. The return value is invisible when +\code{verbose = TRUE}. +} +\description{ +Validates parent-offspring trios by calculating Mendelian error rates from +SNP genotype data. Trios exceeding the error threshold are flagged and +analysed further using homozygous-marker comparisons to identify which +parent(s) are likely incorrect. Optionally writes corrected pedigree files +with erroneous parents replaced by zeros or by the best-matching candidate IDs. +} +\details{ +Mendelian errors are identified using standard allele-dosage rules, e.g. a +progeny cannot carry an allele absent in both parents. Only homozygous +parental markers (coded 0 or 2) are used in the per-parent mismatch +analysis, as heterozygous markers are uninformative for tracing allele +origin. Trios in the pedigree that lack genotype data for any of the three +individuals are removed prior to analysis. +} +\examples{ +results <- validate_pedigree( + pedigree_file = "pedigree.txt", + genotypes_file = "genotypes.txt", + error_threshold = 5.0, + min_markers = 10, + homozygous_threshold = 2.0, + fill_pedigree = TRUE, + verbose = TRUE, + write_txt = TRUE +) +} diff --git a/tests/testthat/test-assign_parentage.R b/tests/testthat/test-assign_parentage.R deleted file mode 100644 index b855351..0000000 --- a/tests/testthat/test-assign_parentage.R +++ /dev/null @@ -1,184 +0,0 @@ -# tests/testthat/test_find_parentage.R -# Test suite for find_parentage() (assign_parentage.R) [1] -# ------------------------------------------------------------------ - -# 1. Load the function ----------------------------------------------- -# Assuming the function is in the current package: -# You may need to adjust the path if it is in a different location. -source("R/assign_parentage.R") # [1] - -# 2. Helper: create a very small, deterministic dataset ----------- -small_test_data <- function() { - # 3 parents (one male, one female, one unknown) - parents_df <- tibble::tribble( - ~ID, ~Sex, - "P1", "M", - "P2", "F", - "P3", NA_character_ - ) - # Progeny - progeny_df <- tibble::tibble(ID = "C1") - - # Genotypes (0 = AA, 1 = Aa, 2 = aa) - # P1 (M) : AA -> 0 - # P2 (F) : aa -> 2 - # P3 : unknown - # C1 : should be heterozygous (Aa) -> 1 - genotypes_df <- tibble::tribble( - ~ID, ~L1, - "P1", 0, - "P2", 2, - "P3", NA, - "C1", 1 - ) - - list(parents = parents_df, - progeny = progeny_df, - genotypes = genotypes_df) -} - -# 3. Unit tests ------------------------------------------------------- - -testthat::test_that("find_parentage returns a tibble with expected columns", { - data <- small_test_data() - - # Write temp files - tmp_dir <- tempdir() - write_tsv(data$parents, file.path(tmp_dir, "parents.tsv")) - write_tsv(data$progeny, file.path(tmp_dir, "progeny.tsv")) - write_tsv(data$genotypes, file.path(tmp_dir, "genotypes.tsv")) - - # Run the function - res <- find_parentage( - genotypes_file = file.path(tmp_dir, "genotypes.tsv"), - parents_file = file.path(tmp_dir, "parents.tsv"), - progeny_file = file.path(tmp_dir, "progeny.tsv"), - method = "best.pair", - write.txt = FALSE, # we don't need the text file in tests - verbose = FALSE - ) - - # Basic sanity checks - testthat::expect_s3_class(res, "tbl_df") - testthat::expect_equal(nrow(res), 1) # one progeny - testthat::expect_true(all(c("Progeny", "Sire", "Dam", "Mendelian_Error_Pct", - "Markers_Tested") %in% colnames(res))) - - # The best parent pair should be (P1, P2) with no Mendelian errors - testthat::expect_equal(res$Progeny, "C1") - testthat::expect_equal(res$Sire, "P1") - testthat::expect_equal(res$Dam, "P2") - testthat::expect_equal(as.numeric(res$Mendelian_Error_Pct), 0) - testthat::expect_equal(res$Markers_Tested, 1) # L1 was compared -}) - -testthat::test_that("best.sire and best.dam methods work on the same data", { - data <- small_test_data() - - write_tsv(data$parents, file.path(tmpdir(), "parents.tsv")) - write_tsv(data$progeny, file.path(tmpdir(), "progeny.tsv")) - write_tsv(data$genotypes, file.path(tmpdir(), "genotypes.tsv")) - - # Best sire - res_sire <- find_parentage( - genotypes_file = file.path(tmpdir(), "genotypes.tsv"), - parents_file = file.path(tmpdir(), "parents.tsv"), - progeny_file = file.path(tmpdir(), "progeny.tsv"), - method = "best.sire", - write.txt = FALSE, - verbose = FALSE - ) - testthat::expect_equal(res_sire$Best_Match, "P1") - - # Best dam - res_dam <- find_parentage( - genotypes_file = file.path(tmpdir(), "genotypes.tsv"), - parents_file = file.path(tmpdir(), "parents.tsv"), - progeny_file = file.path(tmpdir(), "progeny.tsv"), - method = "best.dam", - write.txt = FALSE, - verbose = FALSE - ) - testthat::expect_equal(res_dam$Best_Match, "P2") -}) - -testthat::test_that("function can handle a larger random dataset", { - # Generate a larger random dataset using the provided script logic [2] - # (The code below mirrors the logic in test_data.R but runs locally.) - set.seed(42) - n_progeny <- 50 - n_loci <- 100 - n_males <- 5 - n_females <- 5 - n_unknown <- 2 - - # Build parent list - parents_df <- tibble::tribble( - ~ID, ~Sex, - paste0("Male_", 1:n_males), "M", - paste0("Female_", 1:n_females), "F", - paste0("Unknown_", 1:n_unknown), NA_character_ - ) %>% tibble::add_row() - - # Progeny list - progeny_ids <- paste0("Progeny_", 1:n_progeny) - progeny_df <- tibble::tibble(ID = progeny_ids) - - # Random genotypes for parents - all_parent_ids <- parents_df$ID - parent_gens_mat <- matrix( - sample(0:2, length(all_parent_ids) * n_loci, replace = TRUE), - nrow = length(all_parent_ids), - dimnames = list(all_parent_ids, - paste0("Locus_", 1:n_loci)) - ) - - # Build progeny genotypes from random parents (simple simulation) - progeny_genos_mat <- matrix(NA_integer_, nrow = n_progeny, - ncol = n_loci, - dimnames = list(progeny_ids, - paste0("Locus_", 1:n_loci))) - - for (i in seq_along(progeny_ids)) { - sire <- sample(all_parent_ids, 1) - dam <- sample(all_parent_ids, 1) - for (l in 1:n_loci) { - parent_alleles <- c(parent_gens_mat[sire, l], - parent_gens_mat[dam, l]) - # Simple rule: sum to get genotype (0,1,2) – works because alleles are 0 or 1 - progeny_genos_mat[i, l] <- sum(parent_alleles) - } - } - - # Combine and write to files - all_gens_df <- tibble::as_tibble( - Rcpp::cppFunction('library(tibble);') # just for tibble import - ) - all_gens_df <- tibble::as_tibble( - rbind(parents_df, progeny_df) - ) %>% - tibble::add_row() - - # Write files - tmp_dir <- tempdir() - write_tsv(parents_df, file.path(tmp_dir, "parents.tsv")) - write_tsv(progeny_df, file.path(tmp_dir, "progeny.tsv")) - write_tsv(all_parent_ids, file.path(tmp_dir, "genotypes.tsv")) # placeholder - - # Call the function - res <- find_parentage( - genotypes_file = file.path(tmp_dir, "genotypes.tsv"), - parents_file = file.path(tmp_dir, "parents.tsv"), - progeny_file = file.path(tmp_dir, "progeny.tsv"), - method = "best.pair", - write.txt = FALSE, - verbose = FALSE - ) - - # Basic checks on the larger set - testthat::expect_equal(nrow(res), n_progeny) - testthat::expect_true(all(c("Progeny", "Sire", "Dam", - "Mendelian_Error_Pct", "Markers_Tested") %in% colnames(res))) - testthat::expect_true(all(res$Mendelian_Error_Pct >= 0)) - testthat::expect_true(all(res$Mendelian_Error_Pct <= 100)) -}) From bf0a4682d89ffc8f8f1928990b5e07dcf09da39b Mon Sep 17 00:00:00 2001 From: "josue.chinchilla" Date: Mon, 6 Apr 2026 10:07:43 -0400 Subject: [PATCH 44/80] finalized parentage functions for diploids and test files --- R/validate_pedigree.R | 295 +++++++-------- tests/testthat/test-assign_parentage.R | 462 ++++++++++++++++++++++++ tests/testthat/test-validate_pedigree.R | 262 ++++++++++++++ 3 files changed, 872 insertions(+), 147 deletions(-) create mode 100644 tests/testthat/test-assign_parentage.R create mode 100644 tests/testthat/test-validate_pedigree.R diff --git a/R/validate_pedigree.R b/R/validate_pedigree.R index b416370..998d71d 100644 --- a/R/validate_pedigree.R +++ b/R/validate_pedigree.R @@ -1,134 +1,138 @@ -#' Validate Pedigree Using Mendelian Error Analysis +#' Validate Pedigree Trios Using Mendelian Error Analysis #' #' Validates parent-offspring trios by calculating Mendelian error rates from -#' SNP genotype data. Trios exceeding the error threshold are flagged and -#' analysed further using homozygous-marker comparisons to identify which -#' parent(s) are likely incorrect. Optionally writes corrected pedigree files -#' with erroneous parents replaced by zeros or by the best-matching candidate IDs. +#' SNP genotype data. Identifies incorrect parentage assignments and optionally +#' suggests or fills in best-matching replacements. #' -#' @param pedigree_file Character. Path to the pedigree file (TSV or CSV). -#' Must contain columns \code{Progeny}, \code{Sire}, and \code{Dam}. -#' @param genotypes_file Character. Path to the genotypes file (TSV or CSV). -#' Must contain an \code{ID} column followed by one column per marker, -#' with genotypes coded as 0, 1, or 2 (allele dosage). -#' @param error_threshold Numeric. Maximum acceptable Mendelian error percentage -#' (0-100) for a trio to be considered a \code{PASS}. Default is \code{5.0}. -#' @param min_markers Integer. Minimum number of non-missing markers required -#' across all three individuals for a trio to be fully evaluated. Trios below -#' this threshold receive status \code{LOW_MARKERS}. Default is \code{10}. -#' @param homozygous_threshold Numeric. Maximum acceptable homozygous-marker -#' mismatch percentage (0-100) for a parent to be considered acceptable in a -#' failed trio. Default is \code{2.0}. -#' @param fill_pedigree Logical. If \code{TRUE}, writes an additional corrected -#' pedigree file (\code{id_corrected_pedigree.txt}) in which removed parents -#' are replaced by the best-matching candidate ID rather than zero. -#' Default is \code{FALSE}. -#' @param verbose Logical. If \code{TRUE}, prints a summary of results and -#' correction decisions to the console. Default is \code{TRUE}. -#' @param write_txt Logical. If \code{TRUE}, writes the full validation results -#' table to \code{pedigree_validation_results.txt}. Default is \code{TRUE}. +#' @param pedigree_file Character. Path to the pedigree file (TSV/CSV) with +#' columns: \code{Progeny}, \code{Sire}, \code{Dam}. +#' @param genotypes_file Character. Path to the genotypes file (TSV/CSV) with +#' an \code{ID} column followed by marker columns coded +#' as 0, 1, 2 (additive allele dosage). +#' @param trio_error_threshold Numeric. Maximum Mendelian error percentage to +#' classify a trio as \code{PASS} (default: \code{5.0}). +#' @param min_markers Integer. Minimum number of non-missing markers +#' required to evaluate a trio; below this the trio +#' is flagged \code{LOW_MARKERS} (default: \code{10}). +#' @param single_parent_error_threshold Numeric. Maximum homozygous-marker mismatch +#' percentage for a parent to be considered acceptable +#' in a failed trio (default: \code{2.0}). +#' @param fill_pedigree Logical. If \code{TRUE}, writes an additional file with +#' failed parents replaced by the best-matching candidate +#' IDs (default: \code{FALSE}). +#' @param verbose Logical. If \code{TRUE}, prints progress messages, summary +#' statistics, and the results table to the console +#' (default: \code{TRUE}). +#' @param write_txt Logical. If \code{TRUE}, writes the validation results to +#' \code{output_filename} (default: \code{TRUE}). +#' @param output_filename Character. Name of the output file for validation +#' results (default: \code{"trio_validation_results.txt"}). #' -#' @return A \code{data.table} with one row per validated trio containing the -#' following columns: \code{Progeny} (progeny ID); \code{Sire} (declared sire -#' ID); \code{Dam} (declared dam ID); \code{Mendelian_Error_Pct} (overall -#' Mendelian error rate \% across all three individuals); -#' \code{Markers_Tested} (number of markers with non-missing genotypes in all -#' three individuals); \code{Status} (validation outcome: \code{PASS}, -#' \code{FAIL}, \code{LOW_MARKERS}, or \code{NO_DATA}); -#' \code{Correction_Decision} (action taken for failed trios: \code{NONE}, -#' \code{KEEP_BOTH}, \code{REMOVE_SIRE}, \code{REMOVE_DAM}, or -#' \code{REMOVE_BOTH}); \code{Sire_Hom_Error_Pct} (homozygous-marker mismatch -#' \% between sire and progeny, \code{NA} unless status is \code{FAIL}); -#' \code{Dam_Hom_Error_Pct} (homozygous-marker mismatch \% between dam and -#' progeny, \code{NA} unless status is \code{FAIL}); \code{Best_Sire} (ID of -#' the best-matching sire candidate, populated only when -#' \code{Correction_Decision} is \code{REMOVE_SIRE} or \code{REMOVE_BOTH}); -#' \code{Best_Sire_Error_Pct} (homozygous mismatch \% for \code{Best_Sire}); -#' \code{Best_Dam} (ID of the best-matching dam candidate, populated only when -#' \code{Correction_Decision} is \code{REMOVE_DAM} or \code{REMOVE_BOTH}); -#' \code{Best_Dam_Error_Pct} (homozygous mismatch \% for \code{Best_Dam}). -#' The function also writes \code{zero_corrected_pedigree.txt} (always) and, -#' if \code{fill_pedigree = TRUE}, \code{id_corrected_pedigree.txt}. If -#' \code{write_txt = TRUE}, results are written to -#' \code{pedigree_validation_results.txt}. The return value is invisible when -#' \code{verbose = TRUE}. +#' @return A \code{data.table} (returned invisibly) with one row per trio and +#' the following columns: +#' \describe{ +#' \item{Progeny}{Progeny ID.} +#' \item{Sire}{Declared sire ID.} +#' \item{Dam}{Declared dam ID.} +#' \item{Mendelian_Error_Pct}{Trio-level Mendelian error percentage.} +#' \item{Markers_Tested}{Number of markers compared across all three individuals.} +#' \item{Status}{One of \code{PASS}, \code{FAIL}, \code{LOW_MARKERS}, or \code{NO_DATA}.} +#' \item{Correction_Decision}{One of \code{NONE}, \code{KEEP_BOTH}, \code{REMOVE_SIRE}, +#' \code{REMOVE_DAM}, or \code{REMOVE_BOTH}.} +#' \item{Sire_Hom_Error_Pct}{Sire homozygous-marker mismatch percentage (\code{NA} unless \code{FAIL}).} +#' \item{Dam_Hom_Error_Pct}{Dam homozygous-marker mismatch percentage (\code{NA} unless \code{FAIL}).} +#' \item{Best_Sire}{Best-matching sire candidate ID (\code{NA} unless sire removed).} +#' \item{Best_Sire_Error_Pct}{Homozygous mismatch percentage for \code{Best_Sire}.} +#' \item{Best_Dam}{Best-matching dam candidate ID (\code{NA} unless dam removed).} +#' \item{Best_Dam_Error_Pct}{Homozygous mismatch percentage for \code{Best_Dam}.} +#' } #' #' @details -#' Mendelian errors are identified using standard allele-dosage rules, e.g. a -#' progeny cannot carry an allele absent in both parents. Only homozygous -#' parental markers (coded 0 or 2) are used in the per-parent mismatch -#' analysis, as heterozygous markers are uninformative for tracing allele -#' origin. Trios in the pedigree that lack genotype data for any of the three -#' individuals are removed prior to analysis. -#' -#' @importFrom data.table fread fwrite copy as.data.table rbindlist data.table +#' Trios are filtered to individuals present in the genotype file before +#' analysis. Mendelian errors are counted as genotype combinations impossible +#' under Mendelian inheritance (e.g. both parents homozygous reference but +#' progeny carries the alternate allele). Failed trios are further dissected +#' using homozygous-only markers to identify which parent is likely incorrect. +#' A corrected pedigree with failed parents set to \code{0} is always written +#' to \code{corrected_pedigree.txt}. If \code{fill_pedigree = TRUE}, a second +#' file (\code{filled_pedigree.txt}) replaces those zeros with the best +#' genomic match. #' #' @examples +#' \dontrun{ +#' # Basic run with defaults +#' results <- validate_pedigree("pedigree.txt", "genotypes.txt") +#' +#' # Stricter thresholds, custom output name, no console output #' results <- validate_pedigree( -#' pedigree_file = "pedigree.txt", -#' genotypes_file = "genotypes.txt", -#' error_threshold = 5.0, -#' min_markers = 10, -#' homozygous_threshold = 2.0, -#' fill_pedigree = TRUE, -#' verbose = TRUE, -#' write_txt = TRUE +#' pedigree_file = "pedigree.txt", +#' genotypes_file = "genotypes.txt", +#' trio_error_threshold = 2.0, +#' single_parent_error_threshold = 1.0, +#' fill_pedigree = TRUE, +#' verbose = FALSE, +#' output_filename = "my_validation.txt" #' ) -validate_pedigree <- function(pedigree_file, genotypes_file, - error_threshold = 5.0, - min_markers = 10, - homozygous_threshold = 2.0, - fill_pedigree = FALSE, - verbose = TRUE, - write_txt = TRUE) { - +#' } +#' +#' @import data.table +#' @export + validate_pedigree <- function(pedigree_file, genotypes_file, +trio_error_threshold = 5.0, +min_markers = 10, +single_parent_error_threshold = 2.0, +fill_pedigree = FALSE, +verbose = TRUE, +write_txt = TRUE, +output_filename = "trio_validation_results.txt") { + #### Input Validation and Data Loading #### - if (error_threshold < 0 || error_threshold > 100) { - stop("error_threshold must be between 0 and 100") + if (trio_error_threshold < 0 || trio_error_threshold > 100) { + stop("trio_error_threshold must be between 0 and 100") } - if (homozygous_threshold < 0 || homozygous_threshold > 100) { - stop("homozygous_threshold must be between 0 and 100") + if (single_parent_error_threshold < 0 || single_parent_error_threshold > 100) { + stop("single_parent_error_threshold must be between 0 and 100") } - + tryCatch({ pedigree <- fread(pedigree_file) genos <- fread(genotypes_file) }, error = function(e) { stop("Error reading input files. Ensure paths are correct and files are TSV/CSV.") }) - + # Keep original pedigree for correction if needed original_pedigree <- copy(pedigree) - + # Check required columns required_ped_cols <- c("Progeny", "Sire", "Dam") missing_cols <- setdiff(required_ped_cols, names(pedigree)) if (length(missing_cols) > 0) { stop("Pedigree file missing required columns: ", paste(missing_cols, collapse = ", ")) } - + if (!"ID" %in% names(genos)) { stop("Genotypes file must have an 'ID' column") } - + # Filter pedigree to only include individuals with genotype data valid_ids <- genos$ID initial_trios <- nrow(pedigree) - + pedigree <- pedigree[Progeny %in% valid_ids & Sire %in% valid_ids & Dam %in% valid_ids] - + removed_trios <- initial_trios - nrow(pedigree) if (removed_trios > 0 && verbose) { cat("Removed", removed_trios, "trios due to missing genotype data.\n") } - + if (nrow(pedigree) == 0) { stop("No valid trios remain after filtering for genotype availability.") } - + #### Mendelian Error Calculation #### genos_mat <- as.matrix(genos, rownames = "ID") - + # Create homozygous-only matrix for parent analysis genos_hom <- copy(genos) marker_cols <- setdiff(names(genos_hom), "ID") @@ -136,37 +140,37 @@ validate_pedigree <- function(pedigree_file, genotypes_file, genos_hom[get(col) == 1, (col) := NA_integer_] } genos_hom_mat <- as.matrix(genos_hom, rownames = "ID") - + #### Helper: Find Best Matching Parent #### # Returns list(id, error_pct) for the candidate with lowest homozygous mismatch vs progeny find_best_parent <- function(prog_id, exclude_ids = character(0)) { candidates <- setdiff(rownames(genos_hom_mat), c(prog_id, exclude_ids)) - + if (length(candidates) == 0) return(list(id = NA_character_, error_pct = NA_real_)) - + prog_hom <- genos_hom_mat[prog_id, ] - + errors <- sapply(candidates, function(cand_id) { cand_hom <- genos_hom_mat[cand_id, ] comparisons <- sum(!is.na(cand_hom) & !is.na(prog_hom)) if (comparisons == 0) return(NA_real_) (sum(cand_hom != prog_hom, na.rm = TRUE) / comparisons) * 100 }) - + best_idx <- which.min(errors) list(id = candidates[best_idx], error_pct = round(errors[best_idx], 2)) } - + results_list <- lapply(seq_len(nrow(pedigree)), function(i) { prog_id <- pedigree$Progeny[i] sire_id <- pedigree$Sire[i] dam_id <- pedigree$Dam[i] - + # Extract genotype vectors progeny_vec <- genos_mat[prog_id, ] sire_vec <- genos_mat[sire_id, ] dam_vec <- genos_mat[dam_id, ] - + # Calculate Mendelian errors using same logic as original function mismatches <- sum( (sire_vec == 0 & dam_vec == 0 & progeny_vec > 0) | @@ -176,10 +180,10 @@ validate_pedigree <- function(pedigree_file, genotypes_file, ((sire_vec == 0 & dam_vec == 2) | (sire_vec == 2 & dam_vec == 0)) & (progeny_vec != 1), na.rm = TRUE ) - + # Count comparable markers (non-NA in all three individuals) markers_tested <- sum(!is.na(sire_vec) & !is.na(dam_vec) & !is.na(progeny_vec)) - + # Initialise per-parent and best-parent fields (populated only for FAILs) sire_error_pct <- NA_real_ dam_error_pct <- NA_real_ @@ -187,7 +191,7 @@ validate_pedigree <- function(pedigree_file, genotypes_file, best_sire_pct <- NA_real_ best_dam <- NA_character_ best_dam_pct <- NA_real_ - + # Calculate error percentage and determine status if (markers_tested == 0) { error_pct <- NA_real_ @@ -199,31 +203,31 @@ validate_pedigree <- function(pedigree_file, genotypes_file, correction_decision <- "NONE" } else { error_pct <- (mismatches / markers_tested) * 100 - - if (error_pct <= error_threshold) { + + if (error_pct <= trio_error_threshold) { status <- "PASS" correction_decision <- "NONE" } else { status <- "FAIL" - + # Per-parent homozygous analysis for failed trios progeny_hom <- genos_hom_mat[prog_id, ] sire_hom <- genos_hom_mat[sire_id, ] dam_hom <- genos_hom_mat[dam_id, ] - + # Sire homozygous error sire_comparisons <- sum(!is.na(sire_hom) & !is.na(progeny_hom)) sire_error_pct <- if (sire_comparisons == 0) NA_real_ else round((sum(sire_hom != progeny_hom, na.rm = TRUE) / sire_comparisons) * 100, 2) - + # Dam homozygous error dam_comparisons <- sum(!is.na(dam_hom) & !is.na(progeny_hom)) dam_error_pct <- if (dam_comparisons == 0) NA_real_ else round((sum(dam_hom != progeny_hom, na.rm = TRUE) / dam_comparisons) * 100, 2) - - sire_acceptable <- !is.na(sire_error_pct) && sire_error_pct <= homozygous_threshold - dam_acceptable <- !is.na(dam_error_pct) && dam_error_pct <= homozygous_threshold - + + sire_acceptable <- !is.na(sire_error_pct) && sire_error_pct <= single_parent_error_threshold + dam_acceptable <- !is.na(dam_error_pct) && dam_error_pct <= single_parent_error_threshold + if (sire_acceptable && dam_acceptable) { correction_decision <- "KEEP_BOTH" } else if (sire_acceptable && !dam_acceptable) { @@ -247,7 +251,7 @@ validate_pedigree <- function(pedigree_file, genotypes_file, } } } - + data.table( Progeny = prog_id, Sire = sire_id, @@ -264,75 +268,75 @@ validate_pedigree <- function(pedigree_file, genotypes_file, Best_Dam_Error_Pct = best_dam_pct ) }) - + final_df <- rbindlist(results_list) - + #### Always Write Corrected Pedigree (zeros for failed parents) #### - zero_corrected_pedigree <- copy(original_pedigree) - + corrected_pedigree <- copy(original_pedigree) + for (i in seq_len(nrow(final_df))) { prog_id <- final_df$Progeny[i] decision <- final_df$Correction_Decision[i] - + if (decision == "REMOVE_SIRE") { - zero_corrected_pedigree[Progeny == prog_id, Sire := 0] + corrected_pedigree[Progeny == prog_id, Sire := 0] } else if (decision == "REMOVE_DAM") { - zero_corrected_pedigree[Progeny == prog_id, Dam := 0] + corrected_pedigree[Progeny == prog_id, Dam := 0] } else if (decision == "REMOVE_BOTH") { - zero_corrected_pedigree[Progeny == prog_id, `:=`(Sire = 0, Dam = 0)] + corrected_pedigree[Progeny == prog_id, `:=`(Sire = 0, Dam = 0)] } # KEEP_BOTH and NONE require no changes } - + tryCatch({ - fwrite(zero_corrected_pedigree, file = "zero_corrected_pedigree.txt", sep = "\t", quote = FALSE) - if (verbose) cat("Corrected pedigree (zeros) written to: zero_corrected_pedigree.txt\n") + fwrite(corrected_pedigree, file = "corrected_pedigree.txt", sep = "\t", quote = FALSE) + if (verbose) cat("Corrected pedigree (zeros) written to: corrected_pedigree.txt\n") }, error = function(e) { warning("Could not write corrected pedigree to file. Error: ", e$message, call. = FALSE) }) - + #### Optionally Write Filled Pedigree (best matching IDs for failed parents) #### if (fill_pedigree) { - id_corrected_pedigree <- copy(original_pedigree) - + filled_pedigree <- copy(original_pedigree) + for (i in seq_len(nrow(final_df))) { prog_id <- final_df$Progeny[i] decision <- final_df$Correction_Decision[i] - + if (decision == "REMOVE_SIRE") { - id_corrected_pedigree[Progeny == prog_id, Sire := final_df$Best_Sire[i]] + filled_pedigree[Progeny == prog_id, Sire := final_df$Best_Sire[i]] } else if (decision == "REMOVE_DAM") { - id_corrected_pedigree[Progeny == prog_id, Dam := final_df$Best_Dam[i]] + filled_pedigree[Progeny == prog_id, Dam := final_df$Best_Dam[i]] } else if (decision == "REMOVE_BOTH") { - id_corrected_pedigree[Progeny == prog_id, `:=`(Sire = final_df$Best_Sire[i], - Dam = final_df$Best_Dam[i])] + filled_pedigree[Progeny == prog_id, `:=`(Sire = final_df$Best_Sire[i], + Dam = final_df$Best_Dam[i])] } # KEEP_BOTH and NONE require no changes } - + tryCatch({ - fwrite(id_corrected_pedigree, file = "id_corrected_pedigree.txt", sep = "\t", quote = FALSE) - if (verbose) cat("Filled pedigree (best IDs) written to: id_corrected_pedigree.txt\n") + fwrite(filled_pedigree, file = "filled_pedigree.txt", sep = "\t", quote = FALSE) + if (verbose) cat("Filled pedigree (best IDs) written to: filled_pedigree.txt\n") }, error = function(e) { warning("Could not write filled pedigree to file. Error: ", e$message, call. = FALSE) }) } - + #### Summary Statistics #### if (verbose) { total_trios <- nrow(final_df) status_counts <- table(final_df$Status) - + cat("\n--- Trio Validation Summary ---\n") cat("Total trios tested:", total_trios, "\n") for (status in names(status_counts)) { cat(sprintf("%-12s: %d (%.1f%%)\n", status, status_counts[status], (status_counts[status] / total_trios) * 100)) } - cat("Error threshold:", error_threshold, "%\n") - cat("Homozygous threshold:", homozygous_threshold, "%\n") + cat("Error threshold:", trio_error_threshold, "%\n") + cat("Homozygous threshold:", single_parent_error_threshold, "%\n") cat("Minimum markers required:", min_markers, "\n\n") - + corrections <- table(final_df$Correction_Decision) cat("Correction summary:\n") for (decision in names(corrections)) { @@ -342,22 +346,19 @@ validate_pedigree <- function(pedigree_file, genotypes_file, } cat("\n") } - + #### Output #### if (write_txt) { - output_filename <- "pedigree_validation_results.txt" tryCatch({ - fwrite(final_df, file = output_filename, sep = "\t", quote = FALSE) + fwrite(final_df, file = output_filename, sep = "\t", quote = FALSE) # <-- uses new arg if (verbose) cat("Results written to:", output_filename, "\n") }, error = function(e) { warning("Could not write results to file. Error: ", e$message, call. = FALSE) }) } - - if (verbose) { - print(final_df) - return(invisible(final_df)) - } else { - return(final_df) - } -} + + if (verbose) print(final_df) + + return(invisible(final_df)) + } + \ No newline at end of file diff --git a/tests/testthat/test-assign_parentage.R b/tests/testthat/test-assign_parentage.R new file mode 100644 index 0000000..1beccd8 --- /dev/null +++ b/tests/testthat/test-assign_parentage.R @@ -0,0 +1,462 @@ +# tests/testthat/test-find_parentage.R + +library(testthat) +library(data.table) + +# ───────────────────────────────────────────── +# Helper: write temp TSV files and return paths +# ───────────────────────────────────────────── +make_files <- function(genos, parents, progeny, dir = tempdir()) { + geno_file <- file.path(dir, "genos.txt") + parent_file <- file.path(dir, "parents.txt") + progeny_file <- file.path(dir, "progeny.txt") + fwrite(genos, geno_file, sep = "\t") + fwrite(parents, parent_file, sep = "\t") + fwrite(progeny, progeny_file, sep = "\t") + list(g = geno_file, p = parent_file, pr = progeny_file) +} + +# ───────────────────────────────────────────────────────────────────────────── +# Shared toy genotype data +# +# We rely ONLY on the two simplest, unambiguous Mendelian rules that have no +# operator-precedence risk in the source code: +# +# Rule A: sire=0 & dam=0 → progeny MUST be 0 (error if prog > 0) +# Rule B: sire=2 & dam=2 → progeny MUST be 2 (error if prog < 2) +# +# Design: +# S1: 0 0 0 0 0 2 2 2 2 2 +# D1: 0 0 0 0 0 2 2 2 2 2 +# child1 (perfect child of S1xD1): +# 0 0 0 0 0 2 2 2 2 2 → 0 errors with S1xD1 +# +# S2: 2 2 2 2 2 0 0 0 0 0 (opposite homozygotes) +# D2: 2 2 2 2 2 0 0 0 0 0 +# +# S2xD2 for child1: +# M1–M5: s=2,d=2 → prog must be 2, child1=0 → ERROR (×5) +# M6–M10: s=0,d=0 → prog must be 0, child1=2 → ERROR (×5) +# → 10/10 = 100% error ✓ +# +# S1xD2 for child1: +# M1–M5: s=0,d=2 → heterozygous rule (involves compound condition, +# not tested here — but S2xD1 and S2xD2 already have 100% error) +# We only need S1xD1 to be strictly better than all others. +# S1xD2: M1–M5: s=0,d=2 → no Rule A/B fires → 0 errors on M1-M5 +# M6–M10: s=2,d=0 → no Rule A/B fires → 0 errors on M6-M10 +# → 0% error ← tie with S1xD1! +# +# To break S1xD2 tie, add markers where S1=0,D1=0,child1=0 but D2≠0: +# Add M11, M12: S1=0, S2=2, D1=0, D2=2, child1=0 +# S1xD1: s=0,d=0 → must be 0, child1=0 → OK ✓ +# S1xD2: s=0,d=2 → Rule A/B don't fire → OK (no error counted) +# S2xD1: s=2,d=0 → Rule A/B don't fire → OK +# S2xD2: s=2,d=2 → must be 2, child1=0 → ERROR ✓ +# +# Hmm — S1xD2 still 0 errors. The only way to get errors for S1xD2 using +# only Rule A/B is if we have a marker where S1=0, D2=0, child1≠0 +# OR S1=2, D2=2, child1≠2. +# +# Final clean design using ONLY Rule A (s=0,d=0→prog=0): +# +# Group 1 (5 markers): S1=0, D1=0, S2=2, D2=2, child1=0 +# S1xD1: Rule A → prog=0 ✓ (0 errors) +# S2xD2: Rule B → prog must be 2, child1=0 → ERROR ✓ +# S1xD2: s=0,d=2 → no Rule A/B → 0 errors (still tied) +# S2xD1: s=2,d=0 → no Rule A/B → 0 errors (still tied) +# +# Group 2 (5 markers): S1=0, D1=0, S2=0, D2=2, child1=0 +# S1xD1: Rule A → 0 errors ✓ +# S1xD2: s=0,d=2 → no error from Rule A/B +# S2xD1: Rule A → 0 errors +# S2xD2: s=0,d=2 → no error from Rule A/B +# +# It is impossible to distinguish S1xD1 from S1xD2 using ONLY Rule A/B +# when child1 is always 0 (since Rule A needs d=0 too, and D2≠0 means +# Rule A doesn't fire for S1xD2, giving no error). +# +# CONCLUSION: We must allow heterozygous markers but avoid the +# operator-precedence bug. Looking at the source code condition: +# +# ((sire==0 & dam==1) | (sire==1 & dam==0)) & (prog==2) +# +# Due to R's precedence (& binds tighter than |), this parses as: +# (sire==0 & dam==1) | ((sire==1 & dam==0) & prog==2) +# +# So the condition misfires for sire=0,dam=1,prog=anything (always TRUE +# for the left side regardless of prog). This means any marker where +# sire=0,dam=1 will ALWAYS be counted as an error, regardless of progeny. +# +# Similarly: ((sire==2 & dam==1) | (sire==1 & dam==2)) & (prog==0) +# parses as: (sire==2 & dam==1) | ((sire==1 & dam==2) & prog==0) +# → sire=2,dam=1 always flagged as error. +# +# And: ((sire==0 & dam==2) | (sire==2 & dam==0)) & (prog!=1) +# parses as: (sire==0 & dam==2) | ((sire==2 & dam==0) & prog!=1) +# → sire=0,dam=2 always flagged as error regardless of prog. +# +# SAFE rules (no precedence issue): +# Rule A: s=0,d=0 → prog must be 0 ✓ safe +# Rule B: s=2,d=2 → prog must be 2 ✓ safe +# +# UNSAFE parent combos (always produce errors due to bug): +# s=0,d=1 → always error +# s=2,d=1 → always error +# s=0,d=2 → always error +# +# SAFE combos with no error fired (Rule A/B don't apply): +# s=1,d=0, s=1,d=2, s=1,d=1, s=2,d=0 (only right side of | checked) +# s=2,d=0: parses as (FALSE) | (TRUE & prog!=1) → error only if prog!=1 +# so s=2,d=0,prog=1 → NO error ✓ +# +# New design using ONLY Rule A, Rule B, and the safe s=2,d=0,prog=1 case: +# +# Group 1 (5 markers): S1=0,D1=0,child1=0 → Rule A, 0 errors for S1xD1 +# S2=2,D2=2 → Rule B fires: child1=0 < 2 → ERROR for S2xD2 +# S1xD2: s=0,d=2 → UNSAFE → always error for S1xD2 ✓ +# S2xD1: s=2,d=0,prog=0 → (FALSE)|(TRUE & 0!=1)=TRUE → ERROR ✓ +# +# Let's verify S1xD2 Group1: s=0,d=2 → always error (due to bug) → 5 errors +# Let's verify S2xD1 Group1: s=2,d=0,prog=0 → right side: TRUE & (0!=1)=TRUE → ERROR → 5 errors +# +# So with Group 1 alone (5 markers, all s=0,d=0,prog=0 for S1xD1): +# S1xD1: 0 errors / 5 = 0% ← BEST ✓ +# S2xD2: 5 errors / 5 = 100% +# S1xD2: 5 errors / 5 = 100% +# S2xD1: 5 errors / 5 = 100% +# +# This works! Simple and clean. +# child2 can be anything distinct. +# ───────────────────────────────────────────────────────────────────────────── + +base_genos <- data.table( + ID = c("S1", "S2", "D1", "D2", "child1", "child2"), + M1 = c(0L, 2L, 0L, 2L, 0L, 2L), + M2 = c(0L, 2L, 0L, 2L, 0L, 2L), + M3 = c(0L, 2L, 0L, 2L, 0L, 2L), + M4 = c(0L, 2L, 0L, 2L, 0L, 2L), + M5 = c(0L, 2L, 0L, 2L, 0L, 2L) +) + +# child2 is a perfect child of S2xD2 (all 2s, Rule B: s=2,d=2→prog=2 ✓) +# S1xD1 for child2: s=0,d=0→must be 0, child2=2 → ERROR on all 5 markers + +base_parents <- data.table(ID = c("S1","S2","D1","D2"), + Sex = c("M", "M", "F", "F")) +base_progeny <- data.table(ID = c("child1", "child2")) +child1_progeny <- data.table(ID = "child1") +child2_progeny <- data.table(ID = "child2") + +# ══════════════════════════════════════════════ +# 1. Input validation +# ══════════════════════════════════════════════ +test_that("invalid method throws an error", { + f <- make_files(base_genos, base_parents, child1_progeny) + expect_error( + find_parentage(f$g, f$p, f$pr, method = "bad.method", + verbose = FALSE, write_txt = FALSE), + regexp = "Method must be one of" + ) +}) + +test_that("missing genotype file throws an error", { + f <- make_files(base_genos, base_parents, child1_progeny) + expect_error( + find_parentage("nonexistent.txt", f$p, f$pr, + verbose = FALSE, write_txt = FALSE) + ) +}) + +test_that("parent IDs absent from genotype file raise a warning and are dropped", { + extra_parents <- rbind(base_parents, data.table(ID = "GHOST", Sex = "M")) + f <- make_files(base_genos, extra_parents, child1_progeny) + expect_warning( + find_parentage(f$g, f$p, f$pr, method = "best.pair", + verbose = FALSE, write_txt = FALSE), + regexp = "GHOST" + ) +}) + +test_that("progeny IDs absent from genotype file raise a warning and are dropped", { + extra_progeny <- rbind(child1_progeny, data.table(ID = "GHOST_KID")) + f <- make_files(base_genos, base_parents, extra_progeny) + expect_warning( + find_parentage(f$g, f$p, f$pr, method = "best.pair", + verbose = FALSE, write_txt = FALSE), + regexp = "GHOST_KID" + ) +}) + +test_that("no valid progeny candidates after filtering stops with an error", { + ghost_progeny <- data.table(ID = "NOBODY") + f <- make_files(base_genos, base_parents, ghost_progeny) + expect_warning( + expect_error( + find_parentage(f$g, f$p, f$pr, method = "best.pair", + verbose = FALSE, write_txt = FALSE), + regexp = "No valid progeny" + ) + ) +}) + +test_that("missing Sex column raises a warning and defaults to ambiguous", { + parents_no_sex <- data.table(ID = c("S1", "D1")) + f <- make_files(base_genos, parents_no_sex, child1_progeny) + expect_warning( + find_parentage(f$g, f$p, f$pr, method = "best.match", + verbose = FALSE, write_txt = FALSE), + regexp = "Sex" + ) +}) + +# ══════════════════════════════════════════════ +# 2. Return structure +# ══════════════════════════════════════════════ +test_that("best.pair returns a data.table with expected columns (no ties)", { + f <- make_files(base_genos, base_parents, child1_progeny) + res <- find_parentage(f$g, f$p, f$pr, method = "best.pair", + show_ties = FALSE, verbose = FALSE, write_txt = FALSE) + expect_s3_class(res, "data.table") + expect_true(all(c("Progeny", "Sire", "Dam", + "Mendelian_Error_Pct", "Markers_Tested") %in% names(res))) + expect_equal(nrow(res), 1L) +}) + +test_that("best.sire returns a data.table with expected columns", { + f <- make_files(base_genos, base_parents, child1_progeny) + res <- find_parentage(f$g, f$p, f$pr, method = "best.sire", + verbose = FALSE, write_txt = FALSE) + expect_s3_class(res, "data.table") + expect_true(all(c("Progeny", "Best_Match", + "Mendelian_Error_Pct", "Markers_Tested") %in% names(res))) + expect_equal(nrow(res), 1L) +}) + +test_that("best.dam returns a data.table with expected columns", { + f <- make_files(base_genos, base_parents, child1_progeny) + res <- find_parentage(f$g, f$p, f$pr, method = "best.dam", + verbose = FALSE, write_txt = FALSE) + expect_s3_class(res, "data.table") + expect_true(all(c("Progeny", "Best_Match", + "Mendelian_Error_Pct", "Markers_Tested") %in% names(res))) + expect_equal(nrow(res), 1L) +}) + +test_that("best.match returns a data.table with expected columns", { + f <- make_files(base_genos, base_parents, child1_progeny) + res <- find_parentage(f$g, f$p, f$pr, method = "best.match", + verbose = FALSE, write_txt = FALSE) + expect_s3_class(res, "data.table") + expect_true(all(c("Progeny", "Best_Match", + "Mendelian_Error_Pct", "Markers_Tested") %in% names(res))) + expect_equal(nrow(res), 1L) +}) + +test_that("one row is returned per progeny for single-parent methods", { + f <- make_files(base_genos, base_parents, child1_progeny) + for (m in c("best.sire", "best.dam", "best.match")) { + res <- find_parentage(f$g, f$p, f$pr, method = m, + verbose = FALSE, write_txt = FALSE) + expect_equal(nrow(res), 1L, label = paste("row count for method", m)) + } +}) + +# ══════════════════════════════════════════════ +# 3. Biological correctness +# ══════════════════════════════════════════════ +test_that("best.pair correctly identifies S1 x D1 as best pair with 0% error for child1", { + f <- make_files(base_genos, base_parents, child1_progeny) + res <- find_parentage(f$g, f$p, f$pr, method = "best.pair", + show_ties = FALSE, verbose = FALSE, write_txt = FALSE) + expect_equal(res$Sire, "S1") + expect_equal(res$Dam, "D1") + expect_equal(as.numeric(res$Mendelian_Error_Pct), 0) +}) + +test_that("best.pair correctly identifies S2 x D2 as best pair with 0% error for child2", { + f <- make_files(base_genos, base_parents, child2_progeny) + res <- find_parentage(f$g, f$p, f$pr, method = "best.pair", + show_ties = FALSE, verbose = FALSE, write_txt = FALSE) + expect_equal(res$Sire, "S2") + expect_equal(res$Dam, "D2") + expect_equal(as.numeric(res$Mendelian_Error_Pct), 0) +}) + +test_that("best.sire identifies S1 as best sire for child1", { + # For homozygous method: child1=0 (hom), S1=0 (hom) → match; S2=2 → mismatch + f <- make_files(base_genos, base_parents, child1_progeny) + res <- find_parentage(f$g, f$p, f$pr, method = "best.sire", + verbose = FALSE, write_txt = FALSE) + expect_equal(res$Best_Match, "S1") +}) + +test_that("best.dam identifies D1 as best dam for child1", { + f <- make_files(base_genos, base_parents, child1_progeny) + res <- find_parentage(f$g, f$p, f$pr, method = "best.dam", + verbose = FALSE, write_txt = FALSE) + expect_equal(res$Best_Match, "D1") +}) + +test_that("Mendelian_Error_Pct is 0 for a perfect parent-progeny trio", { + f <- make_files(base_genos, base_parents, child1_progeny) + res <- find_parentage(f$g, f$p, f$pr, method = "best.pair", + show_ties = FALSE, verbose = FALSE, write_txt = FALSE) + expect_equal(as.numeric(res$Mendelian_Error_Pct), 0) +}) + +test_that("Mendelian_Error_Pct is between 0 and 100", { + f <- make_files(base_genos, base_parents, child1_progeny) + res <- find_parentage(f$g, f$p, f$pr, method = "best.pair", + show_ties = FALSE, verbose = FALSE, write_txt = FALSE) + pct <- as.numeric(res$Mendelian_Error_Pct) + expect_true(all(pct >= 0 & pct <= 100, na.rm = TRUE)) +}) + +test_that("Markers_Tested equals the number of non-NA markers", { + f <- make_files(base_genos, base_parents, child1_progeny) + res <- find_parentage(f$g, f$p, f$pr, method = "best.pair", + show_ties = FALSE, verbose = FALSE, write_txt = FALSE) + expect_equal(res$Markers_Tested, ncol(base_genos) - 1L) # minus ID column +}) + +# ══════════════════════════════════════════════ +# 4. allow_selfing +# ══════════════════════════════════════════════ +test_that("allow_selfing = FALSE removes self-pairs from candidates", { + ambig_parents <- data.table(ID = c("S1", "D1"), Sex = c("A", "A")) + f <- make_files(base_genos, ambig_parents, child1_progeny) + # With only 2 ambiguous parents, S1xD1 and D1xS1 are tied → warning expected + expect_warning( + res <- find_parentage(f$g, f$p, f$pr, method = "best.pair", + allow_selfing = FALSE, show_ties = FALSE, + verbose = FALSE, write_txt = FALSE), + regexp = "tied" + ) + if (!is.na(res$Sire) && !is.na(res$Dam)) { + expect_false(res$Sire == res$Dam) + } +}) + +# ══════════════════════════════════════════════ +# 5. show_ties +# ══════════════════════════════════════════════ + +# All markers 0 → every sire×dam pair scores 0% error → guaranteed ties +tied_genos <- data.table( + ID = c("S1", "S2", "D1", "D2", "child_tie"), + M1 = c(0L, 0L, 0L, 0L, 0L), + M2 = c(0L, 0L, 0L, 0L, 0L) +) +tied_parents <- data.table(ID = c("S1","S2","D1","D2"), + Sex = c("M", "M", "F", "F")) +tied_progeny <- data.table(ID = "child_tie") + +test_that("show_ties = TRUE produces _1/_2 suffixed columns when ties exist", { + f <- make_files(tied_genos, tied_parents, tied_progeny) + res <- find_parentage(f$g, f$p, f$pr, method = "best.pair", + show_ties = TRUE, verbose = FALSE, write_txt = FALSE) + expect_true(any(grepl("^Sire_", names(res)))) + expect_true(any(grepl("^Dam_", names(res)))) +}) + +test_that("show_ties = FALSE warns about ties and returns single-result columns", { + f <- make_files(tied_genos, tied_parents, tied_progeny) + expect_warning( + res <- find_parentage(f$g, f$p, f$pr, method = "best.pair", + show_ties = FALSE, verbose = FALSE, write_txt = FALSE), + regexp = "tied" + ) + expect_true("Sire" %in% names(res)) + expect_false(any(grepl("^Sire_\\d", names(res)))) +}) + +# ══════════════════════════════════════════════ +# 6. verbose / write_txt +# ══════════════════════════════════════════════ +test_that("verbose = TRUE returns the result invisibly", { + f <- make_files(base_genos, base_parents, child1_progeny) + res <- find_parentage(f$g, f$p, f$pr, method = "best.pair", + verbose = TRUE, write_txt = FALSE) + expect_s3_class(res, "data.table") +}) + +test_that("verbose = FALSE returns the result visibly", { + f <- make_files(base_genos, base_parents, child1_progeny) + res <- find_parentage(f$g, f$p, f$pr, method = "best.pair", + verbose = FALSE, write_txt = FALSE) + expect_s3_class(res, "data.table") +}) + +test_that("write_txt = TRUE creates the output file", { + old_wd <- getwd() + tmp <- tempdir() + setwd(tmp) + on.exit(setwd(old_wd), add = TRUE) + + f <- make_files(base_genos, base_parents, child1_progeny, dir = tmp) + find_parentage(f$g, f$p, f$pr, method = "best.pair", + verbose = FALSE, write_txt = TRUE) + expect_true(file.exists(file.path(tmp, "parentage_results_dt.txt"))) +}) + +test_that("write_txt = FALSE does not create the output file", { + old_wd <- getwd() + tmp <- tempdir() + setwd(tmp) + on.exit(setwd(old_wd), add = TRUE) + + out_file <- file.path(tmp, "parentage_results_dt.txt") + if (file.exists(out_file)) file.remove(out_file) + + f <- make_files(base_genos, base_parents, child1_progeny, dir = tmp) + find_parentage(f$g, f$p, f$pr, method = "best.pair", + verbose = FALSE, write_txt = FALSE) + expect_false(file.exists(out_file)) +}) + +# ══════════════════════════════════════════════ +# 7. Sex-based candidate filtering +# ══════════════════════════════════════════════ +test_that("best.sire only assigns male (M) or ambiguous (A) parents", { + f <- make_files(base_genos, base_parents, child1_progeny) + res <- find_parentage(f$g, f$p, f$pr, method = "best.sire", + verbose = FALSE, write_txt = FALSE) + valid_sires <- base_parents[Sex %in% c("M", "A")]$ID + expect_true(res$Best_Match %in% valid_sires) +}) + +test_that("best.dam only assigns female (F) or ambiguous (A) parents", { + f <- make_files(base_genos, base_parents, child1_progeny) + res <- find_parentage(f$g, f$p, f$pr, method = "best.dam", + verbose = FALSE, write_txt = FALSE) + valid_dams <- base_parents[Sex %in% c("F", "A")]$ID + expect_true(res$Best_Match %in% valid_dams) +}) + +# ══════════════════════════════════════════════ +# 8. Edge cases +# ══════════════════════════════════════════════ +test_that("single progeny individual is handled correctly", { + f <- make_files(base_genos, base_parents, child1_progeny) + res <- find_parentage(f$g, f$p, f$pr, method = "best.pair", + show_ties = FALSE, verbose = FALSE, write_txt = FALSE) + expect_equal(nrow(res), 1L) +}) + +test_that("all-NA marker column does not cause an error", { + na_genos <- copy(base_genos) + na_genos[, M1 := NA_integer_] + f <- make_files(na_genos, base_parents, child1_progeny) + expect_no_error( + find_parentage(f$g, f$p, f$pr, method = "best.pair", + verbose = FALSE, write_txt = FALSE) + ) +}) + +test_that("Progeny column contains the correct progeny IDs", { + f <- make_files(base_genos, base_parents, child1_progeny) + res <- find_parentage(f$g, f$p, f$pr, method = "best.pair", + show_ties = FALSE, verbose = FALSE, write_txt = FALSE) + expect_setequal(res$Progeny, child1_progeny$ID) +}) diff --git a/tests/testthat/test-validate_pedigree.R b/tests/testthat/test-validate_pedigree.R new file mode 100644 index 0000000..c3cb4b9 --- /dev/null +++ b/tests/testthat/test-validate_pedigree.R @@ -0,0 +1,262 @@ +#### Tests for validate_pedigree() #### +# Run with: testthat::test_file("test-validate_pedigree.R") +# Requires: data.table, testthat + +library(testthat) +library(data.table) + +#### Helpers: Minimal test data #### + +# Genotypes: 20 markers, coded 0/1/2 +# IND_A and IND_B are parents; IND_C is a valid progeny (Mendelian-consistent) +# IND_D is a progeny whose sire is wrong (high Mendelian error) + +make_genos <- function() { + set.seed(42) + n_markers <- 20 + marker_names <- paste0("M", seq_len(n_markers)) + + # Parent A: all homozygous ref (0) + pa <- rep(0L, n_markers) + # Parent B: all homozygous alt (2) + pb <- rep(2L, n_markers) + # Valid progeny: all het (1) — perfectly Mendelian from A x B + pc <- rep(1L, n_markers) + # Bad progeny: all homozygous ref (0) — impossible if sire is B (2) and dam is A (0) + # 0/0 x 2/2 -> must be 1; so all-0 is 100% error + pd <- rep(0L, n_markers) + # Parent C2: homozygous ref (0) — correct sire for pd + pe <- rep(0L, n_markers) + + dt <- data.table( + ID = c("IND_A", "IND_B", "IND_C", "IND_D", "IND_E"), + rbind(pa, pb, pc, pd, pe) + ) + setnames(dt, c("ID", marker_names)) + dt +} + +make_pedigree <- function() { + data.table( + Progeny = c("IND_C", "IND_D"), + Sire = c("IND_A", "IND_B"), # IND_D sire is wrong (IND_B) + Dam = c("IND_B", "IND_A") + ) +} + +# Write temp files and return paths +write_temp_files <- function(genos = make_genos(), ped = make_pedigree()) { + ped_file <- tempfile(fileext = ".txt") + genos_file <- tempfile(fileext = ".txt") + fwrite(ped, ped_file, sep = "\t") + fwrite(genos, genos_file, sep = "\t") + list(ped = ped_file, genos = genos_file) +} + +#### Test suite #### + +test_that("PASS trio is correctly identified", { + f <- write_temp_files() + res <- validate_pedigree(f$ped, f$genos, verbose = FALSE, write_txt = FALSE) + + pass_row <- res[Progeny == "IND_C"] + expect_equal(nrow(pass_row), 1L) + expect_equal(pass_row$Status, "PASS") + expect_equal(pass_row$Mendelian_Error_Pct, 0) + expect_equal(pass_row$Correction_Decision, "NONE") +}) + +test_that("FAIL trio is correctly identified", { + f <- write_temp_files() + res <- validate_pedigree(f$ped, f$genos, verbose = FALSE, write_txt = FALSE) + + fail_row <- res[Progeny == "IND_D"] + expect_equal(nrow(fail_row), 1L) + expect_equal(fail_row$Status, "FAIL") + expect_gt(fail_row$Mendelian_Error_Pct, 5.0) +}) + +test_that("FAIL trio has correct correction decision (REMOVE_SIRE)", { + # IND_D: sire IND_B (all-2) is wrong; dam IND_A (all-0) matches IND_D (all-0) perfectly + f <- write_temp_files() + res <- validate_pedigree(f$ped, f$genos, verbose = FALSE, write_txt = FALSE) + + fail_row <- res[Progeny == "IND_D"] + expect_equal(fail_row$Correction_Decision, "REMOVE_SIRE") + expect_false(is.na(fail_row$Best_Sire)) + expect_true(is.na(fail_row$Best_Dam)) # dam was fine, no replacement needed +}) + +test_that("Mendelian_Error_Pct is 0 for perfect trio", { + f <- write_temp_files() + res <- validate_pedigree(f$ped, f$genos, verbose = FALSE, write_txt = FALSE) + + expect_equal(res[Progeny == "IND_C"]$Mendelian_Error_Pct, 0) +}) + +test_that("Markers_Tested equals number of markers for complete data", { + f <- write_temp_files() + res <- validate_pedigree(f$ped, f$genos, verbose = FALSE, write_txt = FALSE) + + expect_equal(res[Progeny == "IND_C"]$Markers_Tested, 20L) +}) + +test_that("Returns a data.table invisibly", { + f <- write_temp_files() + res <- validate_pedigree(f$ped, f$genos, verbose = FALSE, write_txt = FALSE) + + expect_s3_class(res, "data.table") + expect_equal(nrow(res), 2L) +}) + +test_that("Result has all expected columns", { + f <- write_temp_files() + res <- validate_pedigree(f$ped, f$genos, verbose = FALSE, write_txt = FALSE) + + expected_cols <- c("Progeny", "Sire", "Dam", "Mendelian_Error_Pct", + "Markers_Tested", "Status", "Correction_Decision", + "Sire_Hom_Error_Pct", "Dam_Hom_Error_Pct", + "Best_Sire", "Best_Sire_Error_Pct", + "Best_Dam", "Best_Dam_Error_Pct") + expect_true(all(expected_cols %in% names(res))) +}) + +test_that("write_txt writes output file with correct name", { + f <- write_temp_files() + out_file <- tempfile(fileext = ".txt") + validate_pedigree(f$ped, f$genos, verbose = FALSE, + write_txt = TRUE, output_filename = out_file) + + expect_true(file.exists(out_file)) + written <- fread(out_file) + expect_equal(nrow(written), 2L) +}) + +test_that("write_txt = FALSE does not create default output file", { + f <- write_temp_files() + out_file <- tempfile(fileext = ".txt") + validate_pedigree(f$ped, f$genos, verbose = FALSE, + write_txt = FALSE, output_filename = out_file) + + expect_false(file.exists(out_file)) +}) + +test_that("corrected_pedigree.txt is always written with zeros for bad parents", { + f <- write_temp_files() + tmp_dir <- tempfile() # unique dir per test — no cross-test pollution + dir.create(tmp_dir) + old_wd <- getwd() + setwd(tmp_dir) + on.exit({ setwd(old_wd); unlink(tmp_dir, recursive = TRUE) }, add = TRUE) + + validate_pedigree(f$ped, f$genos, verbose = FALSE, write_txt = FALSE) + + corr <- fread(file.path(tmp_dir, "corrected_pedigree.txt")) + # IND_D sire should be zeroed out (written as character "0") + expect_equal(corr[Progeny == "IND_D"]$Sire, "0") + # IND_D dam should be unchanged + expect_equal(corr[Progeny == "IND_D"]$Dam, "IND_A") + # IND_C should be completely unchanged + expect_equal(corr[Progeny == "IND_C"]$Sire, "IND_A") + expect_equal(corr[Progeny == "IND_C"]$Dam, "IND_B") +}) + +test_that("fill_pedigree = TRUE writes filled_pedigree.txt with replacement IDs", { + f <- write_temp_files() + tmp_dir <- tempfile() + dir.create(tmp_dir) + old_wd <- getwd() + setwd(tmp_dir) + on.exit({ setwd(old_wd); unlink(tmp_dir, recursive = TRUE) }, add = TRUE) + + validate_pedigree(f$ped, f$genos, verbose = FALSE, + write_txt = FALSE, fill_pedigree = TRUE) + + filled <- fread(file.path(tmp_dir, "filled_pedigree.txt")) + # IND_D sire should be replaced with a valid ID (not 0, not the wrong IND_B) + new_sire <- filled[Progeny == "IND_D"]$Sire + expect_false(new_sire == "IND_B") + expect_false(new_sire == "0") +}) + +test_that("fill_pedigree = FALSE does not write filled_pedigree.txt", { + f <- write_temp_files() + tmp_dir <- tempfile() + dir.create(tmp_dir) + old_wd <- getwd() + setwd(tmp_dir) + on.exit({ setwd(old_wd); unlink(tmp_dir, recursive = TRUE) }, add = TRUE) + + validate_pedigree(f$ped, f$genos, verbose = FALSE, + write_txt = FALSE, fill_pedigree = FALSE) + + expect_false(file.exists(file.path(tmp_dir, "filled_pedigree.txt"))) +}) + +test_that("Trios with missing genotype data are removed with a message", { + ped <- data.table(Progeny = "GHOST", Sire = "IND_A", Dam = "IND_B") + f <- write_temp_files(ped = ped) + + # No valid trios remain -> should stop + expect_error( + validate_pedigree(f$ped, f$genos, verbose = FALSE, write_txt = FALSE), + "No valid trios remain" + ) +}) + +test_that("LOW_MARKERS status assigned when markers_tested < min_markers", { + # Set min_markers higher than the 20 in our test data + f <- write_temp_files() + res <- validate_pedigree(f$ped, f$genos, verbose = FALSE, + write_txt = FALSE, min_markers = 25L) + + expect_true(all(res$Status == "LOW_MARKERS")) + expect_true(all(res$Correction_Decision == "NONE")) +}) + +test_that("error_threshold out of range raises an error", { + f <- write_temp_files() + expect_error(validate_pedigree(f$ped, f$genos, error_threshold = 150, + verbose = FALSE, write_txt = FALSE)) + expect_error(validate_pedigree(f$ped, f$genos, error_threshold = -1, + verbose = FALSE, write_txt = FALSE)) +}) + +test_that("homozygous_threshold out of range raises an error", { + f <- write_temp_files() + expect_error(validate_pedigree(f$ped, f$genos, homozygous_threshold = 101, + verbose = FALSE, write_txt = FALSE)) + expect_error(validate_pedigree(f$ped, f$genos, homozygous_threshold = -5, + verbose = FALSE, write_txt = FALSE)) +}) + +test_that("missing required pedigree column raises an error", { + bad_ped <- data.table(Progeny = "IND_C", Parent1 = "IND_A", Dam = "IND_B") + f <- write_temp_files(ped = bad_ped) + expect_error( + validate_pedigree(f$ped, f$genos, verbose = FALSE, write_txt = FALSE), + "missing required columns" + ) +}) + +test_that("missing ID column in genotypes raises an error", { + bad_genos <- copy(make_genos()) + setnames(bad_genos, "ID", "SampleID") + f <- write_temp_files(genos = bad_genos) + expect_error( + validate_pedigree(f$ped, f$genos, verbose = FALSE, write_txt = FALSE), + "ID" + ) +}) + +test_that("NA markers do not cause errors and are handled gracefully", { + genos <- make_genos() + # Introduce NAs in a few markers for IND_C + genos[ID == "IND_C", M1 := NA_integer_] + genos[ID == "IND_C", M2 := NA_integer_] + f <- write_temp_files(genos = genos) + res <- validate_pedigree(f$ped, f$genos, verbose = FALSE, write_txt = FALSE) + + expect_equal(res[Progeny == "IND_C"]$Markers_Tested, 18L) # 2 NAs excluded + expect_equal(res[Progeny == "IND_C"]$Status, "PASS") +}) From 01e943d7e20c3bd71c0ef189fab9aa958ca201e6 Mon Sep 17 00:00:00 2001 From: "josue.chinchilla" Date: Mon, 6 Apr 2026 13:13:43 -0400 Subject: [PATCH 45/80] Added parentage functions and updated associated files --- DESCRIPTION | 3 +- NAMESPACE | 4 +- R/{assign_parentage.R => find_parentage.R} | 2 +- man/find_parentage.Rd | 2 +- man/validate_pedigree.Rd | 139 +++++++++--------- ...sign_parentage.R => test-find_parentage.R} | 0 6 files changed, 79 insertions(+), 71 deletions(-) rename R/{assign_parentage.R => find_parentage.R} (99%) rename tests/testthat/{test-assign_parentage.R => test-find_parentage.R} (100%) diff --git a/DESCRIPTION b/DESCRIPTION index 48ede7c..ec4cb60 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -63,7 +63,8 @@ Imports: quadprog, tibble, stringr, - ggplot2 + ggplot2, + data.table, Suggests: covr, spelling, diff --git a/NAMESPACE b/NAMESPACE index e1fb9ae..73326c0 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -23,6 +23,7 @@ export(merge_MADCs) export(solve_composition_poly) export(thinSNP) export(updog2vcf) +export(validate_pedigree) export(vmsg) import(dplyr) import(ggplot2) @@ -38,13 +39,14 @@ importFrom(Biostrings,DNAString) importFrom(Biostrings,reverseComplement) importFrom(Rdpack,reprompt) importFrom(Rsamtools,bgzip) +importFrom(data.table,":=") importFrom(data.table,CJ) -importFrom(data.table,as.data.table) importFrom(data.table,copy) importFrom(data.table,data.table) importFrom(data.table,fread) importFrom(data.table,fwrite) importFrom(data.table,rbindlist) +importFrom(data.table,set) importFrom(dplyr,"%>%") importFrom(dplyr,across) importFrom(dplyr,case_when) diff --git a/R/assign_parentage.R b/R/find_parentage.R similarity index 99% rename from R/assign_parentage.R rename to R/find_parentage.R index 1b93b49..0a623b9 100644 --- a/R/assign_parentage.R +++ b/R/find_parentage.R @@ -82,7 +82,7 @@ #' ) #' } #' -#' @importFrom data.table fread fwrite copy CJ rbindlist +#' @importFrom data.table fread fwrite copy CJ rbindlist set data.table #' @export find_parentage <- function(genotypes_file, parents_file, progeny_file, method = "best.pair", diff --git a/man/find_parentage.Rd b/man/find_parentage.Rd index 297d156..01aaf79 100644 --- a/man/find_parentage.Rd +++ b/man/find_parentage.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/assign_parentage.R +% Please edit documentation in R/find_parentage.R \name{find_parentage} \alias{find_parentage} \title{Find Parentage Assignments for Progeny} diff --git a/man/validate_pedigree.Rd b/man/validate_pedigree.Rd index 4fd3d50..10d7f40 100644 --- a/man/validate_pedigree.Rd +++ b/man/validate_pedigree.Rd @@ -2,99 +2,104 @@ % Please edit documentation in R/validate_pedigree.R \name{validate_pedigree} \alias{validate_pedigree} -\title{Validate Pedigree Using Mendelian Error Analysis} +\title{Validate Pedigree Trios Using Mendelian Error Analysis} \usage{ validate_pedigree( pedigree_file, genotypes_file, - error_threshold = 5, + trio_error_threshold = 5, min_markers = 10, - homozygous_threshold = 2, + single_parent_error_threshold = 2, fill_pedigree = FALSE, verbose = TRUE, - write_txt = TRUE + write_txt = TRUE, + output_filename = "trio_validation_results.txt" ) } \arguments{ -\item{pedigree_file}{Character. Path to the pedigree file (TSV or CSV). -Must contain columns \code{Progeny}, \code{Sire}, and \code{Dam}.} +\item{pedigree_file}{Character. Path to the pedigree file (TSV/CSV) with +columns: \code{Progeny}, \code{Sire}, \code{Dam}.} -\item{genotypes_file}{Character. Path to the genotypes file (TSV or CSV). -Must contain an \code{ID} column followed by one column per marker, -with genotypes coded as 0, 1, or 2 (allele dosage).} +\item{genotypes_file}{Character. Path to the genotypes file (TSV/CSV) with +an \code{ID} column followed by marker columns coded +as 0, 1, 2 (additive allele dosage).} -\item{error_threshold}{Numeric. Maximum acceptable Mendelian error percentage -(0-100) for a trio to be considered a \code{PASS}. Default is \code{5.0}.} +\item{trio_error_threshold}{Numeric. Maximum Mendelian error percentage to +classify a trio as \code{PASS} (default: \code{5.0}).} -\item{min_markers}{Integer. Minimum number of non-missing markers required -across all three individuals for a trio to be fully evaluated. Trios below -this threshold receive status \code{LOW_MARKERS}. Default is \code{10}.} +\item{min_markers}{Integer. Minimum number of non-missing markers +required to evaluate a trio; below this the trio +is flagged \code{LOW_MARKERS} (default: \code{10}).} -\item{homozygous_threshold}{Numeric. Maximum acceptable homozygous-marker -mismatch percentage (0-100) for a parent to be considered acceptable in a -failed trio. Default is \code{2.0}.} +\item{single_parent_error_threshold}{Numeric. Maximum homozygous-marker mismatch +percentage for a parent to be considered acceptable +in a failed trio (default: \code{2.0}).} -\item{fill_pedigree}{Logical. If \code{TRUE}, writes an additional corrected -pedigree file (\code{id_corrected_pedigree.txt}) in which removed parents -are replaced by the best-matching candidate ID rather than zero. -Default is \code{FALSE}.} +\item{fill_pedigree}{Logical. If \code{TRUE}, writes an additional file with +failed parents replaced by the best-matching candidate +IDs (default: \code{FALSE}).} -\item{verbose}{Logical. If \code{TRUE}, prints a summary of results and -correction decisions to the console. Default is \code{TRUE}.} +\item{verbose}{Logical. If \code{TRUE}, prints progress messages, summary +statistics, and the results table to the console +(default: \code{TRUE}).} -\item{write_txt}{Logical. If \code{TRUE}, writes the full validation results -table to \code{pedigree_validation_results.txt}. Default is \code{TRUE}.} +\item{write_txt}{Logical. If \code{TRUE}, writes the validation results to +\code{output_filename} (default: \code{TRUE}).} + +\item{output_filename}{Character. Name of the output file for validation +results (default: \code{"trio_validation_results.txt"}).} } \value{ -A \code{data.table} with one row per validated trio containing the -following columns: \code{Progeny} (progeny ID); \code{Sire} (declared sire -ID); \code{Dam} (declared dam ID); \code{Mendelian_Error_Pct} (overall -Mendelian error rate \\% across all three individuals); -\code{Markers_Tested} (number of markers with non-missing genotypes in all -three individuals); \code{Status} (validation outcome: \code{PASS}, -\code{FAIL}, \code{LOW_MARKERS}, or \code{NO_DATA}); -\code{Correction_Decision} (action taken for failed trios: \code{NONE}, -\code{KEEP_BOTH}, \code{REMOVE_SIRE}, \code{REMOVE_DAM}, or -\code{REMOVE_BOTH}); \code{Sire_Hom_Error_Pct} (homozygous-marker mismatch -\\% between sire and progeny, \code{NA} unless status is \code{FAIL}); -\code{Dam_Hom_Error_Pct} (homozygous-marker mismatch \\% between dam and -progeny, \code{NA} unless status is \code{FAIL}); \code{Best_Sire} (ID of -the best-matching sire candidate, populated only when -\code{Correction_Decision} is \code{REMOVE_SIRE} or \code{REMOVE_BOTH}); -\code{Best_Sire_Error_Pct} (homozygous mismatch \\% for \code{Best_Sire}); -\code{Best_Dam} (ID of the best-matching dam candidate, populated only when -\code{Correction_Decision} is \code{REMOVE_DAM} or \code{REMOVE_BOTH}); -\code{Best_Dam_Error_Pct} (homozygous mismatch \\% for \code{Best_Dam}). -The function also writes \code{zero_corrected_pedigree.txt} (always) and, -if \code{fill_pedigree = TRUE}, \code{id_corrected_pedigree.txt}. If -\code{write_txt = TRUE}, results are written to -\code{pedigree_validation_results.txt}. The return value is invisible when -\code{verbose = TRUE}. +A \code{data.table} (returned invisibly) with one row per trio and +the following columns: +\describe{ +\item{Progeny}{Progeny ID.} +\item{Sire}{Declared sire ID.} +\item{Dam}{Declared dam ID.} +\item{Mendelian_Error_Pct}{Trio-level Mendelian error percentage.} +\item{Markers_Tested}{Number of markers compared across all three individuals.} +\item{Status}{One of \code{PASS}, \code{FAIL}, \code{LOW_MARKERS}, or \code{NO_DATA}.} +\item{Correction_Decision}{One of \code{NONE}, \code{KEEP_BOTH}, \code{REMOVE_SIRE}, +\code{REMOVE_DAM}, or \code{REMOVE_BOTH}.} +\item{Sire_Hom_Error_Pct}{Sire homozygous-marker mismatch percentage (\code{NA} unless \code{FAIL}).} +\item{Dam_Hom_Error_Pct}{Dam homozygous-marker mismatch percentage (\code{NA} unless \code{FAIL}).} +\item{Best_Sire}{Best-matching sire candidate ID (\code{NA} unless sire removed).} +\item{Best_Sire_Error_Pct}{Homozygous mismatch percentage for \code{Best_Sire}.} +\item{Best_Dam}{Best-matching dam candidate ID (\code{NA} unless dam removed).} +\item{Best_Dam_Error_Pct}{Homozygous mismatch percentage for \code{Best_Dam}.} +} } \description{ Validates parent-offspring trios by calculating Mendelian error rates from -SNP genotype data. Trios exceeding the error threshold are flagged and -analysed further using homozygous-marker comparisons to identify which -parent(s) are likely incorrect. Optionally writes corrected pedigree files -with erroneous parents replaced by zeros or by the best-matching candidate IDs. +SNP genotype data. Identifies incorrect parentage assignments and optionally +suggests or fills in best-matching replacements. } \details{ -Mendelian errors are identified using standard allele-dosage rules, e.g. a -progeny cannot carry an allele absent in both parents. Only homozygous -parental markers (coded 0 or 2) are used in the per-parent mismatch -analysis, as heterozygous markers are uninformative for tracing allele -origin. Trios in the pedigree that lack genotype data for any of the three -individuals are removed prior to analysis. +Trios are filtered to individuals present in the genotype file before +analysis. Mendelian errors are counted as genotype combinations impossible +under Mendelian inheritance (e.g. both parents homozygous reference but +progeny carries the alternate allele). Failed trios are further dissected +using homozygous-only markers to identify which parent is likely incorrect. +A corrected pedigree with failed parents set to \code{0} is always written +to \code{corrected_pedigree.txt}. If \code{fill_pedigree = TRUE}, a second +file (\code{filled_pedigree.txt}) replaces those zeros with the best +genomic match. } \examples{ +\dontrun{ +# Basic run with defaults +results <- validate_pedigree("pedigree.txt", "genotypes.txt") + +# Stricter thresholds, custom output name, no console output results <- validate_pedigree( - pedigree_file = "pedigree.txt", - genotypes_file = "genotypes.txt", - error_threshold = 5.0, - min_markers = 10, - homozygous_threshold = 2.0, - fill_pedigree = TRUE, - verbose = TRUE, - write_txt = TRUE + pedigree_file = "pedigree.txt", + genotypes_file = "genotypes.txt", + trio_error_threshold = 2.0, + single_parent_error_threshold = 1.0, + fill_pedigree = TRUE, + verbose = FALSE, + output_filename = "my_validation.txt" ) } + +} diff --git a/tests/testthat/test-assign_parentage.R b/tests/testthat/test-find_parentage.R similarity index 100% rename from tests/testthat/test-assign_parentage.R rename to tests/testthat/test-find_parentage.R From 8ee65a3a0d8c756bdda349b6a3fc13ead103b2d6 Mon Sep 17 00:00:00 2001 From: "josue.chinchilla" Date: Mon, 6 Apr 2026 13:14:06 -0400 Subject: [PATCH 46/80] updated headers and importFrom for functions along iwth namespace --- NAMESPACE | 1 + R/find_parentage.R | 4 +- R/validate_pedigree.R | 127 +++++++++++++++++------------------------- 3 files changed, 55 insertions(+), 77 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index 73326c0..278f05e 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -41,6 +41,7 @@ importFrom(Rdpack,reprompt) importFrom(Rsamtools,bgzip) importFrom(data.table,":=") importFrom(data.table,CJ) +importFrom(data.table,as.data.table) importFrom(data.table,copy) importFrom(data.table,data.table) importFrom(data.table,fread) diff --git a/R/find_parentage.R b/R/find_parentage.R index 0a623b9..567fc59 100644 --- a/R/find_parentage.R +++ b/R/find_parentage.R @@ -82,7 +82,7 @@ #' ) #' } #' -#' @importFrom data.table fread fwrite copy CJ rbindlist set data.table +#' @importFrom data.table fread fwrite copy CJ rbindlist set data.table as.data.table #' @export find_parentage <- function(genotypes_file, parents_file, progeny_file, method = "best.pair", @@ -126,7 +126,7 @@ find_parentage <- function(genotypes_file, parents_file, progeny_file, } all_parents[, Sex := toupper(Sex)] - sire_candidates <- all_parents[Sex %in% c("M", "A", "NA")] + sire_candidates <- all_parents[Sex %in% c("M", "A", "NA"), .SD] dam_candidates <- all_parents[Sex %in% c("F", "A", "NA")] if (nrow(sire_candidates) == 0 && method %in% c("best.sire", "best.pair")) { diff --git a/R/validate_pedigree.R b/R/validate_pedigree.R index 998d71d..b98614a 100644 --- a/R/validate_pedigree.R +++ b/R/validate_pedigree.R @@ -75,17 +75,19 @@ #' ) #' } #' -#' @import data.table +#' @importFrom data.table fread fwrite rbindlist copy data.table := set as.data.table #' @export - validate_pedigree <- function(pedigree_file, genotypes_file, -trio_error_threshold = 5.0, -min_markers = 10, -single_parent_error_threshold = 2.0, -fill_pedigree = FALSE, -verbose = TRUE, -write_txt = TRUE, -output_filename = "trio_validation_results.txt") { - +#' +#' +validate_pedigree <- function(pedigree_file, genotypes_file, + trio_error_threshold = 5.0, + min_markers = 10, + single_parent_error_threshold = 2.0, + fill_pedigree = FALSE, + verbose = TRUE, + write_txt = TRUE, + output_filename = "trio_validation_results.txt") { + #### Input Validation and Data Loading #### if (trio_error_threshold < 0 || trio_error_threshold > 100) { stop("trio_error_threshold must be between 0 and 100") @@ -93,46 +95,41 @@ output_filename = "trio_validation_results.txt") { if (single_parent_error_threshold < 0 || single_parent_error_threshold > 100) { stop("single_parent_error_threshold must be between 0 and 100") } - tryCatch({ pedigree <- fread(pedigree_file) genos <- fread(genotypes_file) }, error = function(e) { stop("Error reading input files. Ensure paths are correct and files are TSV/CSV.") }) - + # Keep original pedigree for correction if needed original_pedigree <- copy(pedigree) - + # Check required columns required_ped_cols <- c("Progeny", "Sire", "Dam") missing_cols <- setdiff(required_ped_cols, names(pedigree)) if (length(missing_cols) > 0) { stop("Pedigree file missing required columns: ", paste(missing_cols, collapse = ", ")) } - if (!"ID" %in% names(genos)) { stop("Genotypes file must have an 'ID' column") } - + # Filter pedigree to only include individuals with genotype data valid_ids <- genos$ID initial_trios <- nrow(pedigree) - pedigree <- pedigree[Progeny %in% valid_ids & Sire %in% valid_ids & Dam %in% valid_ids] - removed_trios <- initial_trios - nrow(pedigree) if (removed_trios > 0 && verbose) { cat("Removed", removed_trios, "trios due to missing genotype data.\n") } - if (nrow(pedigree) == 0) { stop("No valid trios remain after filtering for genotype availability.") } - + #### Mendelian Error Calculation #### genos_mat <- as.matrix(genos, rownames = "ID") - + # Create homozygous-only matrix for parent analysis genos_hom <- copy(genos) marker_cols <- setdiff(names(genos_hom), "ID") @@ -140,38 +137,33 @@ output_filename = "trio_validation_results.txt") { genos_hom[get(col) == 1, (col) := NA_integer_] } genos_hom_mat <- as.matrix(genos_hom, rownames = "ID") - + #### Helper: Find Best Matching Parent #### - # Returns list(id, error_pct) for the candidate with lowest homozygous mismatch vs progeny find_best_parent <- function(prog_id, exclude_ids = character(0)) { candidates <- setdiff(rownames(genos_hom_mat), c(prog_id, exclude_ids)) - if (length(candidates) == 0) return(list(id = NA_character_, error_pct = NA_real_)) - prog_hom <- genos_hom_mat[prog_id, ] - errors <- sapply(candidates, function(cand_id) { cand_hom <- genos_hom_mat[cand_id, ] comparisons <- sum(!is.na(cand_hom) & !is.na(prog_hom)) if (comparisons == 0) return(NA_real_) (sum(cand_hom != prog_hom, na.rm = TRUE) / comparisons) * 100 }) - best_idx <- which.min(errors) list(id = candidates[best_idx], error_pct = round(errors[best_idx], 2)) } - + results_list <- lapply(seq_len(nrow(pedigree)), function(i) { prog_id <- pedigree$Progeny[i] sire_id <- pedigree$Sire[i] dam_id <- pedigree$Dam[i] - + # Extract genotype vectors progeny_vec <- genos_mat[prog_id, ] sire_vec <- genos_mat[sire_id, ] dam_vec <- genos_mat[dam_id, ] - - # Calculate Mendelian errors using same logic as original function + + # Calculate Mendelian errors mismatches <- sum( (sire_vec == 0 & dam_vec == 0 & progeny_vec > 0) | (sire_vec == 2 & dam_vec == 2 & progeny_vec < 2) | @@ -180,19 +172,18 @@ output_filename = "trio_validation_results.txt") { ((sire_vec == 0 & dam_vec == 2) | (sire_vec == 2 & dam_vec == 0)) & (progeny_vec != 1), na.rm = TRUE ) - + # Count comparable markers (non-NA in all three individuals) markers_tested <- sum(!is.na(sire_vec) & !is.na(dam_vec) & !is.na(progeny_vec)) - - # Initialise per-parent and best-parent fields (populated only for FAILs) + + # Initialise per-parent and best-parent fields sire_error_pct <- NA_real_ dam_error_pct <- NA_real_ best_sire <- NA_character_ best_sire_pct <- NA_real_ best_dam <- NA_character_ best_dam_pct <- NA_real_ - - # Calculate error percentage and determine status + if (markers_tested == 0) { error_pct <- NA_real_ status <- "NO_DATA" @@ -203,31 +194,28 @@ output_filename = "trio_validation_results.txt") { correction_decision <- "NONE" } else { error_pct <- (mismatches / markers_tested) * 100 - if (error_pct <= trio_error_threshold) { status <- "PASS" correction_decision <- "NONE" } else { status <- "FAIL" - + # Per-parent homozygous analysis for failed trios progeny_hom <- genos_hom_mat[prog_id, ] sire_hom <- genos_hom_mat[sire_id, ] dam_hom <- genos_hom_mat[dam_id, ] - - # Sire homozygous error + sire_comparisons <- sum(!is.na(sire_hom) & !is.na(progeny_hom)) sire_error_pct <- if (sire_comparisons == 0) NA_real_ else round((sum(sire_hom != progeny_hom, na.rm = TRUE) / sire_comparisons) * 100, 2) - - # Dam homozygous error + dam_comparisons <- sum(!is.na(dam_hom) & !is.na(progeny_hom)) dam_error_pct <- if (dam_comparisons == 0) NA_real_ else round((sum(dam_hom != progeny_hom, na.rm = TRUE) / dam_comparisons) * 100, 2) - + sire_acceptable <- !is.na(sire_error_pct) && sire_error_pct <= single_parent_error_threshold dam_acceptable <- !is.na(dam_error_pct) && dam_error_pct <= single_parent_error_threshold - + if (sire_acceptable && dam_acceptable) { correction_decision <- "KEEP_BOTH" } else if (sire_acceptable && !dam_acceptable) { @@ -251,7 +239,7 @@ output_filename = "trio_validation_results.txt") { } } } - + data.table( Progeny = prog_id, Sire = sire_id, @@ -260,60 +248,53 @@ output_filename = "trio_validation_results.txt") { Markers_Tested = markers_tested, Status = status, Correction_Decision = correction_decision, - Sire_Hom_Error_Pct = sire_error_pct, # NA unless FAIL - Dam_Hom_Error_Pct = dam_error_pct, # NA unless FAIL + Sire_Hom_Error_Pct = sire_error_pct, + Dam_Hom_Error_Pct = dam_error_pct, Best_Sire = best_sire, Best_Sire_Error_Pct = best_sire_pct, Best_Dam = best_dam, Best_Dam_Error_Pct = best_dam_pct ) }) - + final_df <- rbindlist(results_list) - + #### Always Write Corrected Pedigree (zeros for failed parents) #### corrected_pedigree <- copy(original_pedigree) - for (i in seq_len(nrow(final_df))) { prog_id <- final_df$Progeny[i] decision <- final_df$Correction_Decision[i] - if (decision == "REMOVE_SIRE") { - corrected_pedigree[Progeny == prog_id, Sire := 0] + set(corrected_pedigree, which(corrected_pedigree$Progeny == prog_id), "Sire", 0L) } else if (decision == "REMOVE_DAM") { - corrected_pedigree[Progeny == prog_id, Dam := 0] + set(corrected_pedigree, which(corrected_pedigree$Progeny == prog_id), "Dam", 0L) } else if (decision == "REMOVE_BOTH") { - corrected_pedigree[Progeny == prog_id, `:=`(Sire = 0, Dam = 0)] + set(corrected_pedigree, which(corrected_pedigree$Progeny == prog_id), "Sire", 0L) + set(corrected_pedigree, which(corrected_pedigree$Progeny == prog_id), "Dam", 0L) } - # KEEP_BOTH and NONE require no changes } - tryCatch({ fwrite(corrected_pedigree, file = "corrected_pedigree.txt", sep = "\t", quote = FALSE) if (verbose) cat("Corrected pedigree (zeros) written to: corrected_pedigree.txt\n") }, error = function(e) { warning("Could not write corrected pedigree to file. Error: ", e$message, call. = FALSE) }) - - #### Optionally Write Filled Pedigree (best matching IDs for failed parents) #### + + #### Optionally Write Filled Pedigree #### if (fill_pedigree) { filled_pedigree <- copy(original_pedigree) - for (i in seq_len(nrow(final_df))) { prog_id <- final_df$Progeny[i] decision <- final_df$Correction_Decision[i] - if (decision == "REMOVE_SIRE") { - filled_pedigree[Progeny == prog_id, Sire := final_df$Best_Sire[i]] + set(filled_pedigree, which(filled_pedigree$Progeny == prog_id), "Sire", final_df$Best_Sire[i]) } else if (decision == "REMOVE_DAM") { - filled_pedigree[Progeny == prog_id, Dam := final_df$Best_Dam[i]] + set(filled_pedigree, which(filled_pedigree$Progeny == prog_id), "Dam", final_df$Best_Dam[i]) } else if (decision == "REMOVE_BOTH") { - filled_pedigree[Progeny == prog_id, `:=`(Sire = final_df$Best_Sire[i], - Dam = final_df$Best_Dam[i])] + set(filled_pedigree, which(filled_pedigree$Progeny == prog_id), "Sire", final_df$Best_Sire[i]) + set(filled_pedigree, which(filled_pedigree$Progeny == prog_id), "Dam", final_df$Best_Dam[i]) } - # KEEP_BOTH and NONE require no changes } - tryCatch({ fwrite(filled_pedigree, file = "filled_pedigree.txt", sep = "\t", quote = FALSE) if (verbose) cat("Filled pedigree (best IDs) written to: filled_pedigree.txt\n") @@ -321,12 +302,11 @@ output_filename = "trio_validation_results.txt") { warning("Could not write filled pedigree to file. Error: ", e$message, call. = FALSE) }) } - + #### Summary Statistics #### if (verbose) { total_trios <- nrow(final_df) status_counts <- table(final_df$Status) - cat("\n--- Trio Validation Summary ---\n") cat("Total trios tested:", total_trios, "\n") for (status in names(status_counts)) { @@ -336,7 +316,6 @@ output_filename = "trio_validation_results.txt") { cat("Error threshold:", trio_error_threshold, "%\n") cat("Homozygous threshold:", single_parent_error_threshold, "%\n") cat("Minimum markers required:", min_markers, "\n\n") - corrections <- table(final_df$Correction_Decision) cat("Correction summary:\n") for (decision in names(corrections)) { @@ -346,19 +325,17 @@ output_filename = "trio_validation_results.txt") { } cat("\n") } - + #### Output #### if (write_txt) { tryCatch({ - fwrite(final_df, file = output_filename, sep = "\t", quote = FALSE) # <-- uses new arg + fwrite(final_df, file = output_filename, sep = "\t", quote = FALSE) if (verbose) cat("Results written to:", output_filename, "\n") }, error = function(e) { warning("Could not write results to file. Error: ", e$message, call. = FALSE) }) } - + if (verbose) print(final_df) - - return(invisible(final_df)) - } - \ No newline at end of file + return(invisible(final_df)) +} From e5b2004dfb5d436f7f696882311af883a31a6623 Mon Sep 17 00:00:00 2001 From: "josue.chinchilla" Date: Mon, 6 Apr 2026 13:39:39 -0400 Subject: [PATCH 47/80] deleted cra check files --- CRAN-SUBMISSION | 3 --- cran-comments.md | 9 --------- 2 files changed, 12 deletions(-) delete mode 100644 CRAN-SUBMISSION delete mode 100644 cran-comments.md diff --git a/CRAN-SUBMISSION b/CRAN-SUBMISSION deleted file mode 100644 index ae8365d..0000000 --- a/CRAN-SUBMISSION +++ /dev/null @@ -1,3 +0,0 @@ -Version: 0.6.2 -Date: 2025-09-18 12:16:11 UTC -SHA: 142dc9524d88b47db88ddca2aa39cd729a8d5a0d diff --git a/cran-comments.md b/cran-comments.md deleted file mode 100644 index 2515e5b..0000000 --- a/cran-comments.md +++ /dev/null @@ -1,9 +0,0 @@ -## R CMD check results - -0 errors | 0 warnings | 1 note - -* This is a new release. - -## Updates - -- The maintainer is the same as the previous release, but the email address has been updated. From 4ec471b7087f5fc5f235584b386a6769b7a239d8 Mon Sep 17 00:00:00 2001 From: Alexander Sandercock <39815775+alex-sandercock@users.noreply.github.com> Date: Wed, 8 Apr 2026 08:40:59 -0400 Subject: [PATCH 48/80] Fix formatting of RefAltSeqs documentation --- R/check_madc_sanity.R | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/R/check_madc_sanity.R b/R/check_madc_sanity.R index 2248779..fda01d5 100644 --- a/R/check_madc_sanity.R +++ b/R/check_madc_sanity.R @@ -13,7 +13,8 @@ #' (prefix matches `"chr"` case-insensitively, suffix is a positive integer); #' 7) **allNAcol** - at least one column contains only `NA` or empty values; #' 8) **allNArow** - at least one row contains only `NA` or empty values; -#' 9) **RefAltSeqs** - every `CloneID` has at least one `Ref` and one `Alt` allele row. +#' 9) **RefAltSeqs** - every `CloneID` has at least one `Ref` and one `Alt` allele row; +#' 10) **OtherAlleles** - presence of alleles where the target locus differs from both the Ref and Alt in `AlleleSequence`. #' #' @param report A `data.frame` with at least the columns #' `CloneID`, `AlleleID`, and `AlleleSequence`. The first column is also From cb768808a97c550411a774e0cd3ca311597910a9 Mon Sep 17 00:00:00 2001 From: Alexander Sandercock <39815775+alex-sandercock@users.noreply.github.com> Date: Wed, 8 Apr 2026 08:55:45 -0400 Subject: [PATCH 49/80] updated docs --- man/check_madc_sanity.Rd | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/man/check_madc_sanity.Rd b/man/check_madc_sanity.Rd index 0398625..1d7eebb 100644 --- a/man/check_madc_sanity.Rd +++ b/man/check_madc_sanity.Rd @@ -50,7 +50,8 @@ or a \code{"-"} character is present in \code{AlleleSequence}; (prefix matches \code{"chr"} case-insensitively, suffix is a positive integer); \item \strong{allNAcol} - at least one column contains only \code{NA} or empty values; \item \strong{allNArow} - at least one row contains only \code{NA} or empty values; -\item \strong{RefAltSeqs} - every \code{CloneID} has at least one \code{Ref} and one \code{Alt} allele row. +\item \strong{RefAltSeqs} - every \code{CloneID} has at least one \code{Ref} and one \code{Alt} allele row; +\item \strong{OtherAlleles} - presence of alleles where the target locus differs from both the Ref and Alt in \code{AlleleSequence}. } } \details{ From 9824e294785d6849a179d37431ee26c80ccd95e7 Mon Sep 17 00:00:00 2001 From: "josue.chinchilla" Date: Thu, 9 Apr 2026 13:21:39 -0400 Subject: [PATCH 50/80] Updated parentage functions to include package::function Modified Sire and Dam calls to male_parent and female_parent to be species agnostic. Modified the test files to accomodate the changes in functions Modified importFrom statement for thinSNP to call specific functions and resolve warnings when installing BIGr Updated package documentation --- NAMESPACE | 6 +- R/find_parentage.R | 268 +++++++------- R/thinSNP.R | 5 +- R/validate_pedigree.R | 442 +++++++++++++----------- man/find_parentage.Rd | 66 ++-- man/validate_pedigree.Rd | 109 +++--- tests/testthat/.gitignore | 1 + tests/testthat/corrected_pedigree.txt | 3 + tests/testthat/test-find_parentage.R | 303 +++++----------- tests/testthat/test-validate_pedigree.R | 137 +++----- 10 files changed, 622 insertions(+), 718 deletions(-) create mode 100644 tests/testthat/.gitignore create mode 100644 tests/testthat/corrected_pedigree.txt diff --git a/NAMESPACE b/NAMESPACE index 278f05e..5081f3b 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -30,7 +30,6 @@ import(ggplot2) import(janitor) import(parallel) import(quadprog) -import(rlang) import(stringr) import(tibble) import(tidyr) @@ -50,21 +49,26 @@ importFrom(data.table,rbindlist) importFrom(data.table,set) importFrom(dplyr,"%>%") importFrom(dplyr,across) +importFrom(dplyr,arrange) importFrom(dplyr,case_when) importFrom(dplyr,filter) importFrom(dplyr,group_by) +importFrom(dplyr,group_modify) importFrom(dplyr,mutate) importFrom(dplyr,select) importFrom(dplyr,summarise) +importFrom(dplyr,ungroup) importFrom(dplyr,where) importFrom(pwalign,nucleotideSubstitutionMatrix) importFrom(pwalign,pairwiseAlignment) importFrom(readr,read_csv) importFrom(reshape2,dcast) importFrom(reshape2,melt) +importFrom(rlang,sym) importFrom(stats,cor) importFrom(stats,reorder) importFrom(stats,setNames) +importFrom(tibble,as_tibble) importFrom(utils,packageVersion) importFrom(utils,read.csv) importFrom(utils,read.table) diff --git a/R/find_parentage.R b/R/find_parentage.R index 567fc59..c070659 100644 --- a/R/find_parentage.R +++ b/R/find_parentage.R @@ -3,32 +3,33 @@ #' Assigns the most likely parent(s) to each progeny individual based on #' genotypic data using Mendelian error rates or homozygous mismatch rates. #' -#' @param genotypes_file Path to a TSV/CSV file containing genotype data. +#' @param genotypes_file Path to a TSV/CSV/TXT file containing genotype data. #' Must include an 'ID' column followed by marker columns coded as 0, 1, 2 #' (allele dosage). -#' @param parents_file Path to a TSV/CSV file listing candidate parent IDs. +#' @param parents_file Path to a TSV/CSV/TXT file listing candidate parent IDs. #' Must include an 'ID' column. An optional 'Sex' column with values -#' 'M' (sire), 'F' (dam), or 'A' (ambiguous) determines which parents are -#' tested for each role. If absent, all parents are treated as ambiguous. -#' @param progeny_file Path to a TSV/CSV file listing progeny IDs to assign. +#' 'M' (male parent), 'F' (female parent), or 'A' (ambiguous) determines +#' which parents are tested for each role. If absent, all parents are treated +#' as ambiguous. +#' @param progeny_file Path to a TSV/CSV/TXT file listing progeny IDs to assign. #' Must include an 'ID' column. #' @param method Character. Parentage assignment method. One of: #' \itemize{ -#' \item \code{"best.sire"} — finds the best sire for each progeny using -#' homozygous mismatch rate. -#' \item \code{"best.dam"} — finds the best dam for each progeny using -#' homozygous mismatch rate. -#' \item \code{"best.match"} — finds the single best parent (either sex) +#' \item \code{"best_male_parent"} — finds the best male parent for each +#' progeny using homozygous mismatch rate. +#' \item \code{"best_female_parent"} — finds the best female parent for each +#' progeny using homozygous mismatch rate. +#' \item \code{"best_match"} — finds the single best parent (either sex) #' using homozygous mismatch rate. -#' \item \code{"best.pair"} — finds the best sire-dam pair for each -#' progeny using full Mendelian error rate (default). +#' \item \code{"best_pair"} — finds the best male-female parent pair for +#' each progeny using full Mendelian error rate (default). #' } #' @param show_ties Logical. If \code{TRUE}, all tied best pairs are reported -#' as additional columns (\code{Sire_1}, \code{Sire_2}, etc.) when -#' \code{method = "best.pair"}. If \code{FALSE}, only one tied pair is +#' as additional columns (\code{Male_Parent_1}, \code{Male_Parent_2}, etc.) +#' when \code{method = "best_pair"}. If \code{FALSE}, only one tied pair is #' reported with a warning. Default is \code{TRUE}. -#' @param allow_selfing Logical. If \code{FALSE}, sire-dam pairs where both -#' IDs are identical are excluded when \code{method = "best.pair"}. +#' @param allow_selfing Logical. If \code{FALSE}, male-female parent pairs where +#' both IDs are identical are excluded when \code{method = "best_pair"}. #' Default is \code{TRUE}. #' @param verbose Logical. If \code{TRUE}, prints progress messages, summary #' statistics, and the results table to the console. Default is \code{TRUE}. @@ -39,23 +40,24 @@ #' @return A \code{data.table} with one row per progeny (or more if ties are #' reported). Columns depend on the method used: #' \itemize{ -#' \item \code{best.sire} / \code{best.dam} / \code{best.match}: \code{Progeny}, -#' \code{Best_Match}, \code{Mendelian_Error_Pct}, \code{Markers_Tested}. -#' \item \code{best.pair} (no ties): \code{Progeny}, \code{Sire}, \code{Dam}, -#' \code{Mendelian_Error_Pct}, \code{Markers_Tested}. -#' \item \code{best.pair} (with ties): columns are suffixed \code{_1}, \code{_2}, -#' etc. for each tied pair. +#' \item \code{best_male_parent} / \code{best_female_parent} / \code{best_match}: +#' \code{Progeny}, \code{Best_Match}, \code{Mendelian_Error_Pct}, +#' \code{Markers_Tested}. +#' \item \code{best_pair} (no ties): \code{Progeny}, \code{Male_Parent}, +#' \code{Female_Parent}, \code{Mendelian_Error_Pct}, \code{Markers_Tested}. +#' \item \code{best_pair} (with ties): columns are suffixed \code{_1}, +#' \code{_2}, etc. for each tied pair. #' } #' Returned invisibly when \code{verbose = TRUE}. #' #' @details -#' For \code{"best.sire"}, \code{"best.dam"}, and \code{"best.match"}, only -#' homozygous markers (coded 0 or 2) are used for comparison; heterozygous -#' markers (coded 1) are set to \code{NA}. This reduces false mismatches caused -#' by phase ambiguity. +#' For \code{"best_male_parent"}, \code{"best_female_parent"}, and +#' \code{"best_match"}, only homozygous markers (coded 0 or 2) are used for +#' comparison; heterozygous markers (coded 1) are set to \code{NA}. This +#' reduces false mismatches caused by phase ambiguity. #' -#' For \code{"best.pair"}, all markers are used and full Mendelian inheritance -#' rules are applied across all possible sire-dam combinations via +#' For \code{"best_pair"}, all markers are used and full Mendelian inheritance +#' rules are applied across all possible male-female parent combinations via #' \code{data.table::CJ()}. #' #' Individuals in \code{parents_file} or \code{progeny_file} that are absent @@ -63,12 +65,12 @@ #' #' @examples #' \dontrun{ -#' # Assign best sire-dam pair to each progeny +#' # Assign best male-female parent pair to each progeny #' results <- find_parentage( #' genotypes_file = "genotypes.txt", #' parents_file = "parents.txt", #' progeny_file = "progeny.txt", -#' method = "best.pair", +#' method = "best_pair", #' show_ties = TRUE, #' allow_selfing = FALSE #' ) @@ -78,184 +80,168 @@ #' genotypes_file = "genotypes.txt", #' parents_file = "parents.txt", #' progeny_file = "progeny.txt", -#' method = "best.match" +#' method = "best_match" #' ) #' } #' #' @importFrom data.table fread fwrite copy CJ rbindlist set data.table as.data.table #' @export find_parentage <- function(genotypes_file, parents_file, progeny_file, - method = "best.pair", + method = "best_pair", show_ties = TRUE, allow_selfing = TRUE, verbose = TRUE, write_txt = TRUE) { - #### Input Validation and Data Loading #### - allowed_methods <- c("best.sire", "best.dam", "best.match", "best.pair") + allowed_methods <- c("best_male_parent", "best_female_parent", "best_match", "best_pair") if (!method %in% allowed_methods) { stop("Method must be one of: ", paste(allowed_methods, collapse = ", ")) } - tryCatch({ - genos <- fread(genotypes_file) - all_parents <- fread(parents_file) - progeny_candidates <- fread(progeny_file) + genos <- data.table::fread(genotypes_file, sep = "auto") + all_parents <- data.table::fread(parents_file, sep = "auto") + progeny_candidates <- data.table::fread(progeny_file, sep = "auto") }, error = function(e) { - stop("Error reading input files. Ensure paths are correct and files are TSV/CSV.") + stop("Error reading input files. Ensure paths are correct and files are TXT/TSV/CSV.") }) - valid_ids <- genos$ID - removed_parents <- setdiff(all_parents$ID, valid_ids) + removed_parents <- base::setdiff(all_parents$ID, valid_ids) if (length(removed_parents) > 0) { warning("The following parent IDs were not in the genotype file and will not be analyzed: ", paste(removed_parents, collapse = ", "), call. = FALSE) all_parents <- all_parents[ID %in% valid_ids] } - - removed_progeny <- setdiff(progeny_candidates$ID, valid_ids) + removed_progeny <- base::setdiff(progeny_candidates$ID, valid_ids) if (length(removed_progeny) > 0) { warning("The following progeny IDs were not in the genotype file and will not be analyzed: ", paste(removed_progeny, collapse = ", "), call. = FALSE) progeny_candidates <- progeny_candidates[ID %in% valid_ids] } - - if (!"Sex" %in% colnames(all_parents)) { + if (!"Sex" %in% base::colnames(all_parents)) { warning("No 'Sex' column in parents file. All parents treated as ambiguous ('A').") all_parents[, Sex := "A"] } - - all_parents[, Sex := toupper(Sex)] - sire_candidates <- all_parents[Sex %in% c("M", "A", "NA"), .SD] - dam_candidates <- all_parents[Sex %in% c("F", "A", "NA")] - - if (nrow(sire_candidates) == 0 && method %in% c("best.sire", "best.pair")) { - warning("No valid sire candidates remain after filtering.", call. = FALSE) + all_parents[, Sex := base::toupper(Sex)] + male_parent_candidates <- all_parents[Sex %in% c("M", "A", "NA"), .SD] + female_parent_candidates <- all_parents[Sex %in% c("F", "A", "NA")] + if (base::nrow(male_parent_candidates) == 0 && method %in% c("best_male_parent", "best_pair")) { + warning("No valid male parent candidates remain after filtering.", call. = FALSE) } - if (nrow(dam_candidates) == 0 && method %in% c("best.dam", "best.pair")) { - warning("No valid dam candidates remain after filtering.", call. = FALSE) + if (base::nrow(female_parent_candidates) == 0 && method %in% c("best_female_parent", "best_pair")) { + warning("No valid female parent candidates remain after filtering.", call. = FALSE) } - if (nrow(progeny_candidates) == 0) { + if (base::nrow(progeny_candidates) == 0) { stop("No valid progeny candidates remain after filtering.") } - #### Logic for Homozygous Matching Methods #### - if (method %in% c("best.sire", "best.dam", "best.match")) { - genos_hom <- copy(genos) - marker_cols <- setdiff(names(genos_hom), "ID") + if (method %in% c("best_male_parent", "best_female_parent", "best_match")) { + genos_hom <- data.table::copy(genos) + marker_cols <- base::setdiff(base::names(genos_hom), "ID") for (col in marker_cols) { - genos_hom[get(col) == 1, (col) := NA_integer_] + genos_hom[base::get(col) == 1, (col) := NA_integer_] } - - parent_ids <- switch(method, - "best.sire" = sire_candidates$ID, - "best.dam" = dam_candidates$ID, - "best.match" = union(sire_candidates$ID, dam_candidates$ID)) - - parent_genos <- as.matrix(genos_hom[ID %in% parent_ids], rownames = "ID") - progeny_genos <- as.matrix(genos_hom[ID %in% progeny_candidates$ID], rownames = "ID") - - results_list <- lapply(rownames(progeny_genos), function(progeny_id) { - progeny_vec <- progeny_genos[progeny_id, ] - mismatches <- rowSums(parent_genos != progeny_vec, na.rm = TRUE) - comparisons <- rowSums(!is.na(parent_genos) & !is.na(progeny_vec)) + parent_ids <- base::switch(method, + "best_male_parent" = male_parent_candidates$ID, + "best_female_parent" = female_parent_candidates$ID, + "best_match" = base::union(male_parent_candidates$ID, + female_parent_candidates$ID)) + parent_genos <- base::as.matrix(genos_hom[ID %in% parent_ids], rownames = "ID") + progeny_genos <- base::as.matrix(genos_hom[ID %in% progeny_candidates$ID], rownames = "ID") + results_list <- base::lapply(base::rownames(progeny_genos), function(progeny_id) { + progeny_vec <- progeny_genos[progeny_id, ] + mismatches <- base::rowSums(parent_genos != progeny_vec, na.rm = TRUE) + comparisons <- base::rowSums(!base::is.na(parent_genos) & !base::is.na(progeny_vec)) percent_mismatch <- (mismatches / comparisons) * 100 - percent_mismatch[is.nan(percent_mismatch)] <- NA - - best_idx <- which.min(percent_mismatch) - if (length(best_idx) == 0) { - data.table(Progeny = progeny_id, Best_Match = NA, Mendelian_Error_Pct = NA, Markers_Tested = NA) + percent_mismatch[base::is.nan(percent_mismatch)] <- NA + best_idx <- base::which.min(percent_mismatch) + if (base::length(best_idx) == 0) { + data.table::data.table(Progeny = progeny_id, Best_Match = NA, + Mendelian_Error_Pct = NA, Markers_Tested = NA) } else { - data.table(Progeny = progeny_id, - Best_Match = rownames(parent_genos)[best_idx], - Mendelian_Error_Pct = round(percent_mismatch[best_idx], 2), - Markers_Tested = comparisons[best_idx]) + data.table::data.table( + Progeny = progeny_id, + Best_Match = base::rownames(parent_genos)[best_idx], + Mendelian_Error_Pct = base::round(percent_mismatch[best_idx], 2), + Markers_Tested = comparisons[best_idx] + ) } }) - final_df <- rbindlist(results_list) + final_df <- data.table::rbindlist(results_list) } - #### Logic for Best Pair Method #### - if (method == "best.pair") { - genos_mat <- as.matrix(genos, rownames = "ID") - - parent_pairs <- CJ(Sire = sire_candidates$ID, Dam = dam_candidates$ID) - + if (method == "best_pair") { + genos_mat <- base::as.matrix(genos, rownames = "ID") + parent_pairs <- data.table::CJ(Male_Parent = male_parent_candidates$ID, + Female_Parent = female_parent_candidates$ID) if (!allow_selfing) { - parent_pairs <- parent_pairs[Sire != Dam] - if (verbose) cat("Selfing is disallowed. Pairs with identical parents are removed.\n") + parent_pairs <- parent_pairs[Male_Parent != Female_Parent] + if (verbose) base::cat("Selfing is disallowed. Pairs with identical parents are removed.\n") } - if (nrow(parent_pairs) == 0) stop("No valid parent pairs to test.") - - sire_genos_mat <- genos_mat[parent_pairs$Sire, , drop = FALSE] - dam_genos_mat <- genos_mat[parent_pairs$Dam, , drop = FALSE] - - results_list <- lapply(progeny_candidates$ID, function(prog_id) { + if (base::nrow(parent_pairs) == 0) stop("No valid parent pairs to test.") + male_parent_genos_mat <- genos_mat[parent_pairs$Male_Parent, , drop = FALSE] + female_parent_genos_mat <- genos_mat[parent_pairs$Female_Parent, , drop = FALSE] + results_list <- base::lapply(progeny_candidates$ID, function(prog_id) { progeny_vec <- genos_mat[prog_id, ] - - mismatches <- rowSums( - (sire_genos_mat == 0 & dam_genos_mat == 0 & progeny_vec > 0) | - (sire_genos_mat == 2 & dam_genos_mat == 2 & progeny_vec < 2) | - ((sire_genos_mat == 0 & dam_genos_mat == 1) | (sire_genos_mat == 1 & dam_genos_mat == 0)) & (progeny_vec == 2) | - ((sire_genos_mat == 2 & dam_genos_mat == 1) | (sire_genos_mat == 1 & dam_genos_mat == 2)) & (progeny_vec == 0) | - ((sire_genos_mat == 0 & dam_genos_mat == 2) | (sire_genos_mat == 2 & dam_genos_mat == 0)) & (progeny_vec != 1), + mismatches <- base::rowSums( + (male_parent_genos_mat == 0 & female_parent_genos_mat == 0 & progeny_vec > 0) | + (male_parent_genos_mat == 2 & female_parent_genos_mat == 2 & progeny_vec < 2) | + ((male_parent_genos_mat == 0 & female_parent_genos_mat == 1) | + (male_parent_genos_mat == 1 & female_parent_genos_mat == 0)) & (progeny_vec == 2) | + ((male_parent_genos_mat == 2 & female_parent_genos_mat == 1) | + (male_parent_genos_mat == 1 & female_parent_genos_mat == 2)) & (progeny_vec == 0) | + ((male_parent_genos_mat == 0 & female_parent_genos_mat == 2) | + (male_parent_genos_mat == 2 & female_parent_genos_mat == 0)) & (progeny_vec != 1), na.rm = TRUE ) - - comparisons <- rowSums(!is.na(sire_genos_mat) & !is.na(dam_genos_mat) & !is.na(progeny_vec)) + comparisons <- base::rowSums(!base::is.na(male_parent_genos_mat) & + !base::is.na(female_parent_genos_mat) & + !base::is.na(progeny_vec)) percent_mismatch <- (mismatches / comparisons) * 100 - percent_mismatch[is.nan(percent_mismatch)] <- NA - - min_mismatch_val <- min(percent_mismatch, na.rm = TRUE) - - if (is.infinite(min_mismatch_val)) { - return(data.table(Progeny = prog_id, Markers_Tested = 0)) + percent_mismatch[base::is.nan(percent_mismatch)] <- NA + min_mismatch_val <- base::min(percent_mismatch, na.rm = TRUE) + if (base::is.infinite(min_mismatch_val)) { + return(data.table::data.table(Progeny = prog_id, Markers_Tested = 0)) } - - best_indices <- which(percent_mismatch == min_mismatch_val) - best_pairs <- parent_pairs[best_indices] - - if (!show_ties && nrow(best_pairs) > 1) { - warning("Progeny '", prog_id, "' has ", nrow(best_pairs), " tied best pairs. Only one is reported as show_ties=FALSE.", call. = FALSE) + best_indices <- base::which(percent_mismatch == min_mismatch_val) + best_pairs <- parent_pairs[best_indices] + if (!show_ties && base::nrow(best_pairs) > 1) { + warning("Progeny '", prog_id, "' has ", base::nrow(best_pairs), + " tied best pairs. Only one is reported as show_ties=FALSE.", call. = FALSE) } - - num_to_report <- if (show_ties) nrow(best_pairs) else 1 - num_to_report <- min(nrow(best_pairs), num_to_report) - - result_row <- list(Progeny = prog_id) + num_to_report <- if (show_ties) base::nrow(best_pairs) else 1 + num_to_report <- base::min(base::nrow(best_pairs), num_to_report) + result_row <- base::list(Progeny = prog_id) if (num_to_report == 1) { - result_row[['Sire']] <- best_pairs$Sire[1] - result_row[['Dam']] <- best_pairs$Dam[1] - result_row[['Mendelian_Error_Pct']] <- sprintf("%.2f", min_mismatch_val) - result_row[['Markers_Tested']] <- comparisons[best_indices[1]] + result_row[["Male_Parent"]] <- best_pairs$Male_Parent[1] + result_row[["Female_Parent"]] <- best_pairs$Female_Parent[1] + result_row[["Mendelian_Error_Pct"]] <- base::sprintf("%.2f", min_mismatch_val) + result_row[["Markers_Tested"]] <- comparisons[best_indices[1]] } else if (num_to_report > 1) { - for (k in 1:num_to_report) { - result_row[[paste0("Sire_", k)]] <- best_pairs$Sire[k] - result_row[[paste0("Dam_", k)]] <- best_pairs$Dam[k] - result_row[[paste0("Mendelian_Error_Pct_", k)]] <- min_mismatch_val - result_row[[paste0("Markers_Tested_", k)]] <- comparisons[best_indices[k]] + for (k in base::seq_len(num_to_report)) { + result_row[[base::paste0("Male_Parent_", k)]] <- best_pairs$Male_Parent[k] + result_row[[base::paste0("Female_Parent_", k)]] <- best_pairs$Female_Parent[k] + result_row[[base::paste0("Mendelian_Error_Pct_", k)]] <- min_mismatch_val + result_row[[base::paste0("Markers_Tested_", k)]] <- comparisons[best_indices[k]] } } - as.data.table(result_row) + data.table::as.data.table(result_row) }) - final_df <- rbindlist(results_list, fill = TRUE) + final_df <- data.table::rbindlist(results_list, fill = TRUE) } - #### Output #### if (write_txt) { output_filename <- "parentage_results_dt.txt" tryCatch({ - fwrite(final_df, file = output_filename, sep = "\t", quote = FALSE) - if (verbose) cat("\nResults successfully written to:", output_filename, "\n") + data.table::fwrite(final_df, file = output_filename, sep = "\t", quote = FALSE) + if (verbose) base::cat("\nResults successfully written to:", output_filename, "\n") }, error = function(e) { warning("Could not write results to file. Error: ", e$message, call. = FALSE) }) } - if (verbose) { - cat("\n--- Parentage Assignment Results ---\n") - print(final_df) - return(invisible(final_df)) + base::cat("\n--- Parentage Assignment Results ---\n") + base::print(final_df) + return(base::invisible(final_df)) } else { return(final_df) } diff --git a/R/thinSNP.R b/R/thinSNP.R index 6f445f6..487b257 100644 --- a/R/thinSNP.R +++ b/R/thinSNP.R @@ -10,8 +10,9 @@ #' @param min_distance A numeric value for the minimum distance between selected SNPs. #' The unit of this distance should match the unit of the `pos_col_name` column (e.g., base pairs). #' -#' @import dplyr -#' @import rlang +#' @importFrom rlang sym +#' @importFrom dplyr group_by arrange group_modify ungroup +#' @importFrom tibble as_tibble #' @return A thinned dataframe with the same columns as the input. #' #' @examples diff --git a/R/validate_pedigree.R b/R/validate_pedigree.R index b98614a..be2e96a 100644 --- a/R/validate_pedigree.R +++ b/R/validate_pedigree.R @@ -4,81 +4,90 @@ #' SNP genotype data. Identifies incorrect parentage assignments and optionally #' suggests or fills in best-matching replacements. #' -#' @param pedigree_file Character. Path to the pedigree file (TSV/CSV) with -#' columns: \code{Progeny}, \code{Sire}, \code{Dam}. -#' @param genotypes_file Character. Path to the genotypes file (TSV/CSV) with -#' an \code{ID} column followed by marker columns coded -#' as 0, 1, 2 (additive allele dosage). -#' @param trio_error_threshold Numeric. Maximum Mendelian error percentage to -#' classify a trio as \code{PASS} (default: \code{5.0}). -#' @param min_markers Integer. Minimum number of non-missing markers -#' required to evaluate a trio; below this the trio -#' is flagged \code{LOW_MARKERS} (default: \code{10}). -#' @param single_parent_error_threshold Numeric. Maximum homozygous-marker mismatch -#' percentage for a parent to be considered acceptable -#' in a failed trio (default: \code{2.0}). -#' @param fill_pedigree Logical. If \code{TRUE}, writes an additional file with -#' failed parents replaced by the best-matching candidate -#' IDs (default: \code{FALSE}). -#' @param verbose Logical. If \code{TRUE}, prints progress messages, summary -#' statistics, and the results table to the console -#' (default: \code{TRUE}). -#' @param write_txt Logical. If \code{TRUE}, writes the validation results to -#' \code{output_filename} (default: \code{TRUE}). -#' @param output_filename Character. Name of the output file for validation -#' results (default: \code{"trio_validation_results.txt"}). +#' @param pedigree_file Character. Path to the pedigree file (TSV/CSV/TXT) with +#' columns: \code{ID}, \code{Male_Parent}, \code{Female_Parent}. +#' @param genotypes_file Character. Path to the genotypes file (TSV/CSV/TXT) +#' with an \code{ID} column followed by marker columns coded as 0, 1, 2 +#' (additive allele dosage). +#' @param trio_error_threshold Numeric. Maximum Mendelian error percentage to +#' classify a trio as \code{PASS} (default: \code{5.0}). Must be between +#' 0 and 100. +#' @param min_markers Integer. Minimum number of non-missing markers required +#' to evaluate a trio; below this the trio is flagged \code{LOW_MARKERS} +#' (default: \code{10}). +#' @param single_parent_error_threshold Numeric. Maximum homozygous-marker +#' mismatch percentage for a parent to be considered acceptable in a failed +#' trio (default: \code{2.0}). Must be between 0 and 100. +#' @param fill_pedigree Logical. If \code{TRUE}, writes an additional file +#' (\code{filled_pedigree.txt}) with failed parents replaced by the +#' best-matching candidate IDs (default: \code{FALSE}). +#' @param verbose Logical. If \code{TRUE}, prints progress messages, a summary +#' table, and the results to the console (default: \code{TRUE}). +#' @param write_txt Logical. If \code{TRUE}, writes the validation results +#' to \code{output_filename} (default: \code{TRUE}). +#' @param output_filename Character. Path/name of the output file for +#' validation results (default: \code{"pedigree_validation_results.txt"}). #' #' @return A \code{data.table} (returned invisibly) with one row per trio and #' the following columns: #' \describe{ -#' \item{Progeny}{Progeny ID.} -#' \item{Sire}{Declared sire ID.} -#' \item{Dam}{Declared dam ID.} +#' \item{ID}{Individual ID (first column of the pedigree input).} +#' \item{Male_Parent}{Declared male parent ID.} +#' \item{Female_Parent}{Declared female parent ID.} #' \item{Mendelian_Error_Pct}{Trio-level Mendelian error percentage.} -#' \item{Markers_Tested}{Number of markers compared across all three individuals.} -#' \item{Status}{One of \code{PASS}, \code{FAIL}, \code{LOW_MARKERS}, or \code{NO_DATA}.} -#' \item{Correction_Decision}{One of \code{NONE}, \code{KEEP_BOTH}, \code{REMOVE_SIRE}, -#' \code{REMOVE_DAM}, or \code{REMOVE_BOTH}.} -#' \item{Sire_Hom_Error_Pct}{Sire homozygous-marker mismatch percentage (\code{NA} unless \code{FAIL}).} -#' \item{Dam_Hom_Error_Pct}{Dam homozygous-marker mismatch percentage (\code{NA} unless \code{FAIL}).} -#' \item{Best_Sire}{Best-matching sire candidate ID (\code{NA} unless sire removed).} -#' \item{Best_Sire_Error_Pct}{Homozygous mismatch percentage for \code{Best_Sire}.} -#' \item{Best_Dam}{Best-matching dam candidate ID (\code{NA} unless dam removed).} -#' \item{Best_Dam_Error_Pct}{Homozygous mismatch percentage for \code{Best_Dam}.} +#' \item{Markers_Tested}{Number of markers with non-missing genotypes in +#' all three individuals.} +#' \item{Status}{One of \code{PASS}, \code{FAIL}, \code{LOW_MARKERS}, or +#' \code{NO_DATA}.} +#' \item{Correction_Decision}{One of \code{NONE}, \code{KEEP_BOTH}, +#' \code{REMOVE_MALE_PARENT}, \code{REMOVE_FEMALE_PARENT}, or +#' \code{REMOVE_BOTH}.} +#' \item{Male_Parent_Hom_Error_Pct}{Male parent homozygous-marker mismatch +#' percentage (\code{NA} unless \code{Status == "FAIL"}).} +#' \item{Female_Parent_Hom_Error_Pct}{Female parent homozygous-marker +#' mismatch percentage (\code{NA} unless \code{Status == "FAIL"}).} +#' \item{Best_Male_Parent}{Best-matching male parent candidate ID +#' (\code{NA} unless male parent is removed).} +#' \item{Best_Male_Parent_Error_Pct}{Homozygous mismatch percentage for +#' \code{Best_Male_Parent}.} +#' \item{Best_Female_Parent}{Best-matching female parent candidate ID +#' (\code{NA} unless female parent is removed).} +#' \item{Best_Female_Parent_Error_Pct}{Homozygous mismatch percentage for +#' \code{Best_Female_Parent}.} #' } #' #' @details #' Trios are filtered to individuals present in the genotype file before #' analysis. Mendelian errors are counted as genotype combinations impossible -#' under Mendelian inheritance (e.g. both parents homozygous reference but -#' progeny carries the alternate allele). Failed trios are further dissected +#' under Mendelian inheritance (e.g. both parents homozygous reference but the +#' offspring carries the alternate allele). Failed trios are further dissected #' using homozygous-only markers to identify which parent is likely incorrect. -#' A corrected pedigree with failed parents set to \code{0} is always written -#' to \code{corrected_pedigree.txt}. If \code{fill_pedigree = TRUE}, a second -#' file (\code{filled_pedigree.txt}) replaces those zeros with the best -#' genomic match. +#' +#' A corrected pedigree with failed parents replaced by \code{0} is always +#' written to \code{corrected_pedigree.txt} in the working directory. If +#' \code{fill_pedigree = TRUE}, a second file (\code{filled_pedigree.txt}) +#' replaces those zeros with the best genomic match found across all +#' genotyped individuals. #' #' @examples #' \dontrun{ #' # Basic run with defaults #' results <- validate_pedigree("pedigree.txt", "genotypes.txt") #' -#' # Stricter thresholds, custom output name, no console output +#' # Stricter thresholds, fill replacements, suppress console output #' results <- validate_pedigree( -#' pedigree_file = "pedigree.txt", -#' genotypes_file = "genotypes.txt", -#' trio_error_threshold = 2.0, +#' pedigree_file = "pedigree.txt", +#' genotypes_file = "genotypes.txt", +#' trio_error_threshold = 2.0, #' single_parent_error_threshold = 1.0, -#' fill_pedigree = TRUE, -#' verbose = FALSE, -#' output_filename = "my_validation.txt" +#' fill_pedigree = TRUE, +#' verbose = FALSE, +#' output_filename = "my_validation.txt" #' ) #' } #' -#' @importFrom data.table fread fwrite rbindlist copy data.table := set as.data.table +#' @importFrom data.table fread fwrite rbindlist copy data.table := set #' @export -#' -#' validate_pedigree <- function(pedigree_file, genotypes_file, trio_error_threshold = 5.0, min_markers = 10, @@ -86,103 +95,112 @@ validate_pedigree <- function(pedigree_file, genotypes_file, fill_pedigree = FALSE, verbose = TRUE, write_txt = TRUE, - output_filename = "trio_validation_results.txt") { + output_filename = "pedigree_validation_results.txt") { - #### Input Validation and Data Loading #### + #### Input Validation #### if (trio_error_threshold < 0 || trio_error_threshold > 100) { stop("trio_error_threshold must be between 0 and 100") } if (single_parent_error_threshold < 0 || single_parent_error_threshold > 100) { stop("single_parent_error_threshold must be between 0 and 100") } + tryCatch({ - pedigree <- fread(pedigree_file) - genos <- fread(genotypes_file) + pedigree <- data.table::fread(pedigree_file, sep = "auto") + genos <- data.table::fread(genotypes_file, sep = "auto") }, error = function(e) { - stop("Error reading input files. Ensure paths are correct and files are TSV/CSV.") + stop("Error reading input files. Ensure paths are correct and files are TXT/TSV/CSV.") }) - # Keep original pedigree for correction if needed - original_pedigree <- copy(pedigree) - - # Check required columns - required_ped_cols <- c("Progeny", "Sire", "Dam") - missing_cols <- setdiff(required_ped_cols, names(pedigree)) - if (length(missing_cols) > 0) { - stop("Pedigree file missing required columns: ", paste(missing_cols, collapse = ", ")) + #### Check Required Columns #### + required_ped_cols <- c("ID", "Male_Parent", "Female_Parent") # <-- changed + missing_cols <- base::setdiff(required_ped_cols, base::names(pedigree)) + if (base::length(missing_cols) > 0) { + stop("Pedigree file missing required columns: ", + base::paste(missing_cols, collapse = ", ")) } - if (!"ID" %in% names(genos)) { + if (!"ID" %in% base::names(genos)) { stop("Genotypes file must have an 'ID' column") } - # Filter pedigree to only include individuals with genotype data + # Keep original pedigree for correction output + original_pedigree <- data.table::copy(pedigree) + + #### Filter to Individuals with Genotype Data #### valid_ids <- genos$ID - initial_trios <- nrow(pedigree) - pedigree <- pedigree[Progeny %in% valid_ids & Sire %in% valid_ids & Dam %in% valid_ids] - removed_trios <- initial_trios - nrow(pedigree) + initial_trios <- base::nrow(pedigree) + pedigree <- pedigree[ID %in% valid_ids & # <-- changed + Male_Parent %in% valid_ids & + Female_Parent %in% valid_ids] + removed_trios <- initial_trios - base::nrow(pedigree) if (removed_trios > 0 && verbose) { - cat("Removed", removed_trios, "trios due to missing genotype data.\n") + base::cat("Removed", removed_trios, "trios due to missing genotype data.\n") } - if (nrow(pedigree) == 0) { + if (base::nrow(pedigree) == 0) { stop("No valid trios remain after filtering for genotype availability.") } #### Mendelian Error Calculation #### - genos_mat <- as.matrix(genos, rownames = "ID") + genos_mat <- base::as.matrix(genos, rownames = "ID") - # Create homozygous-only matrix for parent analysis - genos_hom <- copy(genos) - marker_cols <- setdiff(names(genos_hom), "ID") + # Homozygous-only matrix + genos_hom <- data.table::copy(genos) + marker_cols <- base::setdiff(base::names(genos_hom), "ID") for (col in marker_cols) { - genos_hom[get(col) == 1, (col) := NA_integer_] + genos_hom[base::get(col) == 1, (col) := NA_integer_] } - genos_hom_mat <- as.matrix(genos_hom, rownames = "ID") + genos_hom_mat <- base::as.matrix(genos_hom, rownames = "ID") #### Helper: Find Best Matching Parent #### - find_best_parent <- function(prog_id, exclude_ids = character(0)) { - candidates <- setdiff(rownames(genos_hom_mat), c(prog_id, exclude_ids)) - if (length(candidates) == 0) return(list(id = NA_character_, error_pct = NA_real_)) + find_best_parent <- function(prog_id, exclude_ids = base::character(0)) { + candidates <- base::setdiff(base::rownames(genos_hom_mat), + c(prog_id, exclude_ids)) + if (base::length(candidates) == 0) { + return(base::list(id = NA_character_, error_pct = NA_real_)) + } prog_hom <- genos_hom_mat[prog_id, ] - errors <- sapply(candidates, function(cand_id) { + errors <- base::sapply(candidates, function(cand_id) { cand_hom <- genos_hom_mat[cand_id, ] - comparisons <- sum(!is.na(cand_hom) & !is.na(prog_hom)) + comparisons <- base::sum(!base::is.na(cand_hom) & !base::is.na(prog_hom)) if (comparisons == 0) return(NA_real_) - (sum(cand_hom != prog_hom, na.rm = TRUE) / comparisons) * 100 + (base::sum(cand_hom != prog_hom, na.rm = TRUE) / comparisons) * 100 }) - best_idx <- which.min(errors) - list(id = candidates[best_idx], error_pct = round(errors[best_idx], 2)) + best_idx <- base::which.min(errors) + base::list(id = candidates[best_idx], + error_pct = base::round(errors[best_idx], 2)) } - results_list <- lapply(seq_len(nrow(pedigree)), function(i) { - prog_id <- pedigree$Progeny[i] - sire_id <- pedigree$Sire[i] - dam_id <- pedigree$Dam[i] + results_list <- base::lapply(base::seq_len(base::nrow(pedigree)), function(i) { + prog_id <- pedigree$ID[i] # <-- changed + male_parent_id <- pedigree$Male_Parent[i] + female_parent_id <- pedigree$Female_Parent[i] - # Extract genotype vectors - progeny_vec <- genos_mat[prog_id, ] - sire_vec <- genos_mat[sire_id, ] - dam_vec <- genos_mat[dam_id, ] + progeny_vec <- genos_mat[prog_id, ] + male_parent_vec <- genos_mat[male_parent_id, ] + female_parent_vec <- genos_mat[female_parent_id, ] - # Calculate Mendelian errors - mismatches <- sum( - (sire_vec == 0 & dam_vec == 0 & progeny_vec > 0) | - (sire_vec == 2 & dam_vec == 2 & progeny_vec < 2) | - ((sire_vec == 0 & dam_vec == 1) | (sire_vec == 1 & dam_vec == 0)) & (progeny_vec == 2) | - ((sire_vec == 2 & dam_vec == 1) | (sire_vec == 1 & dam_vec == 2)) & (progeny_vec == 0) | - ((sire_vec == 0 & dam_vec == 2) | (sire_vec == 2 & dam_vec == 0)) & (progeny_vec != 1), + mismatches <- base::sum( + (male_parent_vec == 0 & female_parent_vec == 0 & progeny_vec > 0) | + (male_parent_vec == 2 & female_parent_vec == 2 & progeny_vec < 2) | + ((male_parent_vec == 0 & female_parent_vec == 1) | + (male_parent_vec == 1 & female_parent_vec == 0)) & (progeny_vec == 2) | + ((male_parent_vec == 2 & female_parent_vec == 1) | + (male_parent_vec == 1 & female_parent_vec == 2)) & (progeny_vec == 0) | + ((male_parent_vec == 0 & female_parent_vec == 2) | + (male_parent_vec == 2 & female_parent_vec == 0)) & (progeny_vec != 1), na.rm = TRUE ) - # Count comparable markers (non-NA in all three individuals) - markers_tested <- sum(!is.na(sire_vec) & !is.na(dam_vec) & !is.na(progeny_vec)) + markers_tested <- base::sum(!base::is.na(male_parent_vec) & + !base::is.na(female_parent_vec) & + !base::is.na(progeny_vec)) - # Initialise per-parent and best-parent fields - sire_error_pct <- NA_real_ - dam_error_pct <- NA_real_ - best_sire <- NA_character_ - best_sire_pct <- NA_real_ - best_dam <- NA_character_ - best_dam_pct <- NA_real_ + male_parent_error_pct <- NA_real_ + female_parent_error_pct <- NA_real_ + best_male_parent <- NA_character_ + best_male_parent_pct <- NA_real_ + best_female_parent <- NA_character_ + best_female_parent_pct <- NA_real_ if (markers_tested == 0) { error_pct <- NA_real_ @@ -200,142 +218,160 @@ validate_pedigree <- function(pedigree_file, genotypes_file, } else { status <- "FAIL" - # Per-parent homozygous analysis for failed trios - progeny_hom <- genos_hom_mat[prog_id, ] - sire_hom <- genos_hom_mat[sire_id, ] - dam_hom <- genos_hom_mat[dam_id, ] + progeny_hom <- genos_hom_mat[prog_id, ] + male_parent_hom <- genos_hom_mat[male_parent_id, ] + female_parent_hom <- genos_hom_mat[female_parent_id, ] - sire_comparisons <- sum(!is.na(sire_hom) & !is.na(progeny_hom)) - sire_error_pct <- if (sire_comparisons == 0) NA_real_ else - round((sum(sire_hom != progeny_hom, na.rm = TRUE) / sire_comparisons) * 100, 2) + male_comparisons <- base::sum(!base::is.na(male_parent_hom) & + !base::is.na(progeny_hom)) + male_parent_error_pct <- if (male_comparisons == 0) NA_real_ else + base::round((base::sum(male_parent_hom != progeny_hom, + na.rm = TRUE) / male_comparisons) * 100, 2) - dam_comparisons <- sum(!is.na(dam_hom) & !is.na(progeny_hom)) - dam_error_pct <- if (dam_comparisons == 0) NA_real_ else - round((sum(dam_hom != progeny_hom, na.rm = TRUE) / dam_comparisons) * 100, 2) + female_comparisons <- base::sum(!base::is.na(female_parent_hom) & + !base::is.na(progeny_hom)) + female_parent_error_pct <- if (female_comparisons == 0) NA_real_ else + base::round((base::sum(female_parent_hom != progeny_hom, + na.rm = TRUE) / female_comparisons) * 100, 2) - sire_acceptable <- !is.na(sire_error_pct) && sire_error_pct <= single_parent_error_threshold - dam_acceptable <- !is.na(dam_error_pct) && dam_error_pct <= single_parent_error_threshold + male_acceptable <- !base::is.na(male_parent_error_pct) && + male_parent_error_pct <= single_parent_error_threshold + female_acceptable <- !base::is.na(female_parent_error_pct) && + female_parent_error_pct <= single_parent_error_threshold - if (sire_acceptable && dam_acceptable) { + if (male_acceptable && female_acceptable) { correction_decision <- "KEEP_BOTH" - } else if (sire_acceptable && !dam_acceptable) { - correction_decision <- "REMOVE_DAM" - best <- find_best_parent(prog_id, exclude_ids = c(sire_id)) - best_dam <- best$id - best_dam_pct <- best$error_pct - } else if (!sire_acceptable && dam_acceptable) { - correction_decision <- "REMOVE_SIRE" - best <- find_best_parent(prog_id, exclude_ids = c(dam_id)) - best_sire <- best$id - best_sire_pct <- best$error_pct + } else if (male_acceptable && !female_acceptable) { + correction_decision <- "REMOVE_FEMALE_PARENT" + best <- find_best_parent(prog_id, + exclude_ids = c(male_parent_id)) + best_female_parent <- best$id + best_female_parent_pct <- best$error_pct + } else if (!male_acceptable && female_acceptable) { + correction_decision <- "REMOVE_MALE_PARENT" + best <- find_best_parent(prog_id, + exclude_ids = c(female_parent_id)) + best_male_parent <- best$id + best_male_parent_pct <- best$error_pct } else { - correction_decision <- "REMOVE_BOTH" - best_s <- find_best_parent(prog_id, exclude_ids = character(0)) - best_sire <- best_s$id - best_sire_pct <- best_s$error_pct - best_d <- find_best_parent(prog_id, exclude_ids = c(best_s$id)) - best_dam <- best_d$id - best_dam_pct <- best_d$error_pct + correction_decision <- "REMOVE_BOTH" + best_m <- find_best_parent(prog_id, + exclude_ids = base::character(0)) + best_male_parent <- best_m$id + best_male_parent_pct <- best_m$error_pct + best_f <- find_best_parent(prog_id, + exclude_ids = c(best_m$id)) + best_female_parent <- best_f$id + best_female_parent_pct <- best_f$error_pct } } } - data.table( - Progeny = prog_id, - Sire = sire_id, - Dam = dam_id, - Mendelian_Error_Pct = round(error_pct, 2), - Markers_Tested = markers_tested, - Status = status, - Correction_Decision = correction_decision, - Sire_Hom_Error_Pct = sire_error_pct, - Dam_Hom_Error_Pct = dam_error_pct, - Best_Sire = best_sire, - Best_Sire_Error_Pct = best_sire_pct, - Best_Dam = best_dam, - Best_Dam_Error_Pct = best_dam_pct + data.table::data.table( + ID = prog_id, # <-- changed + Male_Parent = male_parent_id, + Female_Parent = female_parent_id, + Mendelian_Error_Pct = base::round(error_pct, 2), + Markers_Tested = markers_tested, + Status = status, + Correction_Decision = correction_decision, + Male_Parent_Hom_Error_Pct = male_parent_error_pct, + Female_Parent_Hom_Error_Pct = female_parent_error_pct, + Best_Male_Parent = best_male_parent, + Best_Male_Parent_Error_Pct = best_male_parent_pct, + Best_Female_Parent = best_female_parent, + Best_Female_Parent_Error_Pct = best_female_parent_pct ) }) - final_df <- rbindlist(results_list) + final_df <- data.table::rbindlist(results_list) - #### Always Write Corrected Pedigree (zeros for failed parents) #### - corrected_pedigree <- copy(original_pedigree) - for (i in seq_len(nrow(final_df))) { - prog_id <- final_df$Progeny[i] + #### Always Write Corrected Pedigree #### + corrected_pedigree <- data.table::copy(original_pedigree) + for (i in base::seq_len(base::nrow(final_df))) { + prog_id <- final_df$ID[i] # <-- changed decision <- final_df$Correction_Decision[i] - if (decision == "REMOVE_SIRE") { - set(corrected_pedigree, which(corrected_pedigree$Progeny == prog_id), "Sire", 0L) - } else if (decision == "REMOVE_DAM") { - set(corrected_pedigree, which(corrected_pedigree$Progeny == prog_id), "Dam", 0L) + row_idx <- base::which(corrected_pedigree$ID == prog_id) # <-- changed + if (decision == "REMOVE_MALE_PARENT") { + data.table::set(corrected_pedigree, row_idx, "Male_Parent", 0L) + } else if (decision == "REMOVE_FEMALE_PARENT") { + data.table::set(corrected_pedigree, row_idx, "Female_Parent", 0L) } else if (decision == "REMOVE_BOTH") { - set(corrected_pedigree, which(corrected_pedigree$Progeny == prog_id), "Sire", 0L) - set(corrected_pedigree, which(corrected_pedigree$Progeny == prog_id), "Dam", 0L) + data.table::set(corrected_pedigree, row_idx, "Male_Parent", 0L) + data.table::set(corrected_pedigree, row_idx, "Female_Parent", 0L) } } tryCatch({ - fwrite(corrected_pedigree, file = "corrected_pedigree.txt", sep = "\t", quote = FALSE) - if (verbose) cat("Corrected pedigree (zeros) written to: corrected_pedigree.txt\n") + data.table::fwrite(corrected_pedigree, file = "corrected_pedigree.txt", + sep = "\t", quote = FALSE) + if (verbose) base::cat("Corrected pedigree written to: corrected_pedigree.txt\n") }, error = function(e) { - warning("Could not write corrected pedigree to file. Error: ", e$message, call. = FALSE) + warning("Could not write corrected pedigree. Error: ", e$message, call. = FALSE) }) #### Optionally Write Filled Pedigree #### if (fill_pedigree) { - filled_pedigree <- copy(original_pedigree) - for (i in seq_len(nrow(final_df))) { - prog_id <- final_df$Progeny[i] + filled_pedigree <- data.table::copy(original_pedigree) + for (i in base::seq_len(base::nrow(final_df))) { + prog_id <- final_df$ID[i] # <-- changed decision <- final_df$Correction_Decision[i] - if (decision == "REMOVE_SIRE") { - set(filled_pedigree, which(filled_pedigree$Progeny == prog_id), "Sire", final_df$Best_Sire[i]) - } else if (decision == "REMOVE_DAM") { - set(filled_pedigree, which(filled_pedigree$Progeny == prog_id), "Dam", final_df$Best_Dam[i]) + row_idx <- base::which(filled_pedigree$ID == prog_id) # <-- changed + if (decision == "REMOVE_MALE_PARENT") { + data.table::set(filled_pedigree, row_idx, + "Male_Parent", final_df$Best_Male_Parent[i]) + } else if (decision == "REMOVE_FEMALE_PARENT") { + data.table::set(filled_pedigree, row_idx, + "Female_Parent", final_df$Best_Female_Parent[i]) } else if (decision == "REMOVE_BOTH") { - set(filled_pedigree, which(filled_pedigree$Progeny == prog_id), "Sire", final_df$Best_Sire[i]) - set(filled_pedigree, which(filled_pedigree$Progeny == prog_id), "Dam", final_df$Best_Dam[i]) + data.table::set(filled_pedigree, row_idx, + "Male_Parent", final_df$Best_Male_Parent[i]) + data.table::set(filled_pedigree, row_idx, + "Female_Parent", final_df$Best_Female_Parent[i]) } } tryCatch({ - fwrite(filled_pedigree, file = "filled_pedigree.txt", sep = "\t", quote = FALSE) - if (verbose) cat("Filled pedigree (best IDs) written to: filled_pedigree.txt\n") + data.table::fwrite(filled_pedigree, file = "filled_pedigree.txt", + sep = "\t", quote = FALSE) + if (verbose) base::cat("Filled pedigree written to: filled_pedigree.txt\n") }, error = function(e) { - warning("Could not write filled pedigree to file. Error: ", e$message, call. = FALSE) + warning("Could not write filled pedigree. Error: ", e$message, call. = FALSE) }) } - #### Summary Statistics #### + #### Summary #### if (verbose) { - total_trios <- nrow(final_df) - status_counts <- table(final_df$Status) - cat("\n--- Trio Validation Summary ---\n") - cat("Total trios tested:", total_trios, "\n") - for (status in names(status_counts)) { - cat(sprintf("%-12s: %d (%.1f%%)\n", status, status_counts[status], - (status_counts[status] / total_trios) * 100)) + total_trios <- base::nrow(final_df) + status_counts <- base::table(final_df$Status) + base::cat("\n--- Trio Validation Summary ---\n") + base::cat("Total trios tested:", total_trios, "\n") + for (s in base::names(status_counts)) { + base::cat(base::sprintf("%-12s: %d (%.1f%%)\n", s, + status_counts[s], + (status_counts[s] / total_trios) * 100)) } - cat("Error threshold:", trio_error_threshold, "%\n") - cat("Homozygous threshold:", single_parent_error_threshold, "%\n") - cat("Minimum markers required:", min_markers, "\n\n") - corrections <- table(final_df$Correction_Decision) - cat("Correction summary:\n") - for (decision in names(corrections)) { - if (decision != "NONE") { - cat(" ", decision, ":", corrections[decision], "\n") - } + base::cat("Error threshold:", trio_error_threshold, "%\n") + base::cat("Homozygous threshold:", single_parent_error_threshold, "%\n") + base::cat("Minimum markers required:", min_markers, "\n\n") + corrections <- base::table(final_df$Correction_Decision) + base::cat("Correction summary:\n") + for (decision in base::names(corrections)) { + if (decision != "NONE") + base::cat(" ", decision, ":", corrections[decision], "\n") } - cat("\n") + base::cat("\n") + base::print(final_df) } #### Output #### if (write_txt) { tryCatch({ - fwrite(final_df, file = output_filename, sep = "\t", quote = FALSE) - if (verbose) cat("Results written to:", output_filename, "\n") + data.table::fwrite(final_df, file = output_filename, + sep = "\t", quote = FALSE) + if (verbose) base::cat("Results written to:", output_filename, "\n") }, error = function(e) { - warning("Could not write results to file. Error: ", e$message, call. = FALSE) + warning("Could not write results. Error: ", e$message, call. = FALSE) }) } - if (verbose) print(final_df) - return(invisible(final_df)) + return(base::invisible(final_df)) } diff --git a/man/find_parentage.Rd b/man/find_parentage.Rd index 01aaf79..bdd1c4f 100644 --- a/man/find_parentage.Rd +++ b/man/find_parentage.Rd @@ -8,7 +8,7 @@ find_parentage( genotypes_file, parents_file, progeny_file, - method = "best.pair", + method = "best_pair", show_ties = TRUE, allow_selfing = TRUE, verbose = TRUE, @@ -16,37 +16,38 @@ find_parentage( ) } \arguments{ -\item{genotypes_file}{Path to a TSV/CSV file containing genotype data. +\item{genotypes_file}{Path to a TSV/CSV/TXT file containing genotype data. Must include an 'ID' column followed by marker columns coded as 0, 1, 2 (allele dosage).} -\item{parents_file}{Path to a TSV/CSV file listing candidate parent IDs. +\item{parents_file}{Path to a TSV/CSV/TXT file listing candidate parent IDs. Must include an 'ID' column. An optional 'Sex' column with values -'M' (sire), 'F' (dam), or 'A' (ambiguous) determines which parents are -tested for each role. If absent, all parents are treated as ambiguous.} +'M' (male parent), 'F' (female parent), or 'A' (ambiguous) determines +which parents are tested for each role. If absent, all parents are treated +as ambiguous.} -\item{progeny_file}{Path to a TSV/CSV file listing progeny IDs to assign. +\item{progeny_file}{Path to a TSV/CSV/TXT file listing progeny IDs to assign. Must include an 'ID' column.} \item{method}{Character. Parentage assignment method. One of: \itemize{ -\item \code{"best.sire"} — finds the best sire for each progeny using -homozygous mismatch rate. -\item \code{"best.dam"} — finds the best dam for each progeny using -homozygous mismatch rate. -\item \code{"best.match"} — finds the single best parent (either sex) +\item \code{"best_male_parent"} — finds the best male parent for each +progeny using homozygous mismatch rate. +\item \code{"best_female_parent"} — finds the best female parent for each +progeny using homozygous mismatch rate. +\item \code{"best_match"} — finds the single best parent (either sex) using homozygous mismatch rate. -\item \code{"best.pair"} — finds the best sire-dam pair for each -progeny using full Mendelian error rate (default). +\item \code{"best_pair"} — finds the best male-female parent pair for +each progeny using full Mendelian error rate (default). }} \item{show_ties}{Logical. If \code{TRUE}, all tied best pairs are reported -as additional columns (\code{Sire_1}, \code{Sire_2}, etc.) when -\code{method = "best.pair"}. If \code{FALSE}, only one tied pair is +as additional columns (\code{Male_Parent_1}, \code{Male_Parent_2}, etc.) +when \code{method = "best_pair"}. If \code{FALSE}, only one tied pair is reported with a warning. Default is \code{TRUE}.} -\item{allow_selfing}{Logical. If \code{FALSE}, sire-dam pairs where both -IDs are identical are excluded when \code{method = "best.pair"}. +\item{allow_selfing}{Logical. If \code{FALSE}, male-female parent pairs where +both IDs are identical are excluded when \code{method = "best_pair"}. Default is \code{TRUE}.} \item{verbose}{Logical. If \code{TRUE}, prints progress messages, summary @@ -60,12 +61,13 @@ statistics, and the results table to the console. Default is \code{TRUE}.} A \code{data.table} with one row per progeny (or more if ties are reported). Columns depend on the method used: \itemize{ -\item \code{best.sire} / \code{best.dam} / \code{best.match}: \code{Progeny}, -\code{Best_Match}, \code{Mendelian_Error_Pct}, \code{Markers_Tested}. -\item \code{best.pair} (no ties): \code{Progeny}, \code{Sire}, \code{Dam}, -\code{Mendelian_Error_Pct}, \code{Markers_Tested}. -\item \code{best.pair} (with ties): columns are suffixed \code{_1}, \code{_2}, -etc. for each tied pair. +\item \code{best_male_parent} / \code{best_female_parent} / \code{best_match}: +\code{Progeny}, \code{Best_Match}, \code{Mendelian_Error_Pct}, +\code{Markers_Tested}. +\item \code{best_pair} (no ties): \code{Progeny}, \code{Male_Parent}, +\code{Female_Parent}, \code{Mendelian_Error_Pct}, \code{Markers_Tested}. +\item \code{best_pair} (with ties): columns are suffixed \code{_1}, +\code{_2}, etc. for each tied pair. } Returned invisibly when \code{verbose = TRUE}. } @@ -74,13 +76,13 @@ Assigns the most likely parent(s) to each progeny individual based on genotypic data using Mendelian error rates or homozygous mismatch rates. } \details{ -For \code{"best.sire"}, \code{"best.dam"}, and \code{"best.match"}, only -homozygous markers (coded 0 or 2) are used for comparison; heterozygous -markers (coded 1) are set to \code{NA}. This reduces false mismatches caused -by phase ambiguity. +For \code{"best_male_parent"}, \code{"best_female_parent"}, and +\code{"best_match"}, only homozygous markers (coded 0 or 2) are used for +comparison; heterozygous markers (coded 1) are set to \code{NA}. This +reduces false mismatches caused by phase ambiguity. -For \code{"best.pair"}, all markers are used and full Mendelian inheritance -rules are applied across all possible sire-dam combinations via +For \code{"best_pair"}, all markers are used and full Mendelian inheritance +rules are applied across all possible male-female parent combinations via \code{data.table::CJ()}. Individuals in \code{parents_file} or \code{progeny_file} that are absent @@ -88,12 +90,12 @@ from \code{genotypes_file} are removed with a warning. } \examples{ \dontrun{ -# Assign best sire-dam pair to each progeny +# Assign best male-female parent pair to each progeny results <- find_parentage( genotypes_file = "genotypes.txt", parents_file = "parents.txt", progeny_file = "progeny.txt", - method = "best.pair", + method = "best_pair", show_ties = TRUE, allow_selfing = FALSE ) @@ -103,7 +105,7 @@ results <- find_parentage( genotypes_file = "genotypes.txt", parents_file = "parents.txt", progeny_file = "progeny.txt", - method = "best.match" + method = "best_match" ) } diff --git a/man/validate_pedigree.Rd b/man/validate_pedigree.Rd index 10d7f40..958b669 100644 --- a/man/validate_pedigree.Rd +++ b/man/validate_pedigree.Rd @@ -13,60 +13,69 @@ validate_pedigree( fill_pedigree = FALSE, verbose = TRUE, write_txt = TRUE, - output_filename = "trio_validation_results.txt" + output_filename = "pedigree_validation_results.txt" ) } \arguments{ -\item{pedigree_file}{Character. Path to the pedigree file (TSV/CSV) with -columns: \code{Progeny}, \code{Sire}, \code{Dam}.} +\item{pedigree_file}{Character. Path to the pedigree file (TSV/CSV/TXT) with +columns: \code{ID}, \code{Male_Parent}, \code{Female_Parent}.} -\item{genotypes_file}{Character. Path to the genotypes file (TSV/CSV) with -an \code{ID} column followed by marker columns coded -as 0, 1, 2 (additive allele dosage).} +\item{genotypes_file}{Character. Path to the genotypes file (TSV/CSV/TXT) +with an \code{ID} column followed by marker columns coded as 0, 1, 2 +(additive allele dosage).} \item{trio_error_threshold}{Numeric. Maximum Mendelian error percentage to -classify a trio as \code{PASS} (default: \code{5.0}).} +classify a trio as \code{PASS} (default: \code{5.0}). Must be between +0 and 100.} -\item{min_markers}{Integer. Minimum number of non-missing markers -required to evaluate a trio; below this the trio -is flagged \code{LOW_MARKERS} (default: \code{10}).} +\item{min_markers}{Integer. Minimum number of non-missing markers required +to evaluate a trio; below this the trio is flagged \code{LOW_MARKERS} +(default: \code{10}).} -\item{single_parent_error_threshold}{Numeric. Maximum homozygous-marker mismatch -percentage for a parent to be considered acceptable -in a failed trio (default: \code{2.0}).} +\item{single_parent_error_threshold}{Numeric. Maximum homozygous-marker +mismatch percentage for a parent to be considered acceptable in a failed +trio (default: \code{2.0}). Must be between 0 and 100.} -\item{fill_pedigree}{Logical. If \code{TRUE}, writes an additional file with -failed parents replaced by the best-matching candidate -IDs (default: \code{FALSE}).} +\item{fill_pedigree}{Logical. If \code{TRUE}, writes an additional file +(\code{filled_pedigree.txt}) with failed parents replaced by the +best-matching candidate IDs (default: \code{FALSE}).} -\item{verbose}{Logical. If \code{TRUE}, prints progress messages, summary -statistics, and the results table to the console -(default: \code{TRUE}).} +\item{verbose}{Logical. If \code{TRUE}, prints progress messages, a summary +table, and the results to the console (default: \code{TRUE}).} -\item{write_txt}{Logical. If \code{TRUE}, writes the validation results to -\code{output_filename} (default: \code{TRUE}).} +\item{write_txt}{Logical. If \code{TRUE}, writes the validation results +to \code{output_filename} (default: \code{TRUE}).} -\item{output_filename}{Character. Name of the output file for validation -results (default: \code{"trio_validation_results.txt"}).} +\item{output_filename}{Character. Path/name of the output file for +validation results (default: \code{"pedigree_validation_results.txt"}).} } \value{ A \code{data.table} (returned invisibly) with one row per trio and the following columns: \describe{ -\item{Progeny}{Progeny ID.} -\item{Sire}{Declared sire ID.} -\item{Dam}{Declared dam ID.} +\item{ID}{Individual ID (first column of the pedigree input).} +\item{Male_Parent}{Declared male parent ID.} +\item{Female_Parent}{Declared female parent ID.} \item{Mendelian_Error_Pct}{Trio-level Mendelian error percentage.} -\item{Markers_Tested}{Number of markers compared across all three individuals.} -\item{Status}{One of \code{PASS}, \code{FAIL}, \code{LOW_MARKERS}, or \code{NO_DATA}.} -\item{Correction_Decision}{One of \code{NONE}, \code{KEEP_BOTH}, \code{REMOVE_SIRE}, -\code{REMOVE_DAM}, or \code{REMOVE_BOTH}.} -\item{Sire_Hom_Error_Pct}{Sire homozygous-marker mismatch percentage (\code{NA} unless \code{FAIL}).} -\item{Dam_Hom_Error_Pct}{Dam homozygous-marker mismatch percentage (\code{NA} unless \code{FAIL}).} -\item{Best_Sire}{Best-matching sire candidate ID (\code{NA} unless sire removed).} -\item{Best_Sire_Error_Pct}{Homozygous mismatch percentage for \code{Best_Sire}.} -\item{Best_Dam}{Best-matching dam candidate ID (\code{NA} unless dam removed).} -\item{Best_Dam_Error_Pct}{Homozygous mismatch percentage for \code{Best_Dam}.} +\item{Markers_Tested}{Number of markers with non-missing genotypes in +all three individuals.} +\item{Status}{One of \code{PASS}, \code{FAIL}, \code{LOW_MARKERS}, or +\code{NO_DATA}.} +\item{Correction_Decision}{One of \code{NONE}, \code{KEEP_BOTH}, +\code{REMOVE_MALE_PARENT}, \code{REMOVE_FEMALE_PARENT}, or +\code{REMOVE_BOTH}.} +\item{Male_Parent_Hom_Error_Pct}{Male parent homozygous-marker mismatch +percentage (\code{NA} unless \code{Status == "FAIL"}).} +\item{Female_Parent_Hom_Error_Pct}{Female parent homozygous-marker +mismatch percentage (\code{NA} unless \code{Status == "FAIL"}).} +\item{Best_Male_Parent}{Best-matching male parent candidate ID +(\code{NA} unless male parent is removed).} +\item{Best_Male_Parent_Error_Pct}{Homozygous mismatch percentage for +\code{Best_Male_Parent}.} +\item{Best_Female_Parent}{Best-matching female parent candidate ID +(\code{NA} unless female parent is removed).} +\item{Best_Female_Parent_Error_Pct}{Homozygous mismatch percentage for +\code{Best_Female_Parent}.} } } \description{ @@ -77,28 +86,30 @@ suggests or fills in best-matching replacements. \details{ Trios are filtered to individuals present in the genotype file before analysis. Mendelian errors are counted as genotype combinations impossible -under Mendelian inheritance (e.g. both parents homozygous reference but -progeny carries the alternate allele). Failed trios are further dissected +under Mendelian inheritance (e.g. both parents homozygous reference but the +offspring carries the alternate allele). Failed trios are further dissected using homozygous-only markers to identify which parent is likely incorrect. -A corrected pedigree with failed parents set to \code{0} is always written -to \code{corrected_pedigree.txt}. If \code{fill_pedigree = TRUE}, a second -file (\code{filled_pedigree.txt}) replaces those zeros with the best -genomic match. + +A corrected pedigree with failed parents replaced by \code{0} is always +written to \code{corrected_pedigree.txt} in the working directory. If +\code{fill_pedigree = TRUE}, a second file (\code{filled_pedigree.txt}) +replaces those zeros with the best genomic match found across all +genotyped individuals. } \examples{ \dontrun{ # Basic run with defaults results <- validate_pedigree("pedigree.txt", "genotypes.txt") -# Stricter thresholds, custom output name, no console output +# Stricter thresholds, fill replacements, suppress console output results <- validate_pedigree( - pedigree_file = "pedigree.txt", - genotypes_file = "genotypes.txt", - trio_error_threshold = 2.0, + pedigree_file = "pedigree.txt", + genotypes_file = "genotypes.txt", + trio_error_threshold = 2.0, single_parent_error_threshold = 1.0, - fill_pedigree = TRUE, - verbose = FALSE, - output_filename = "my_validation.txt" + fill_pedigree = TRUE, + verbose = FALSE, + output_filename = "my_validation.txt" ) } diff --git a/tests/testthat/.gitignore b/tests/testthat/.gitignore new file mode 100644 index 0000000..1388aa1 --- /dev/null +++ b/tests/testthat/.gitignore @@ -0,0 +1 @@ +corrected_pedigree.txt diff --git a/tests/testthat/corrected_pedigree.txt b/tests/testthat/corrected_pedigree.txt new file mode 100644 index 0000000..fc05d4d --- /dev/null +++ b/tests/testthat/corrected_pedigree.txt @@ -0,0 +1,3 @@ +ID Male_Parent Female_Parent +IND_C IND_A IND_B +IND_D 0 IND_A diff --git a/tests/testthat/test-find_parentage.R b/tests/testthat/test-find_parentage.R index 1beccd8..56fa5f6 100644 --- a/tests/testthat/test-find_parentage.R +++ b/tests/testthat/test-find_parentage.R @@ -1,8 +1,6 @@ # tests/testthat/test-find_parentage.R - library(testthat) library(data.table) - # ───────────────────────────────────────────── # Helper: write temp TSV files and return paths # ───────────────────────────────────────────── @@ -10,127 +8,50 @@ make_files <- function(genos, parents, progeny, dir = tempdir()) { geno_file <- file.path(dir, "genos.txt") parent_file <- file.path(dir, "parents.txt") progeny_file <- file.path(dir, "progeny.txt") - fwrite(genos, geno_file, sep = "\t") - fwrite(parents, parent_file, sep = "\t") - fwrite(progeny, progeny_file, sep = "\t") + data.table::fwrite(genos, geno_file, sep = "\t") + data.table::fwrite(parents, parent_file, sep = "\t") + data.table::fwrite(progeny, progeny_file, sep = "\t") list(g = geno_file, p = parent_file, pr = progeny_file) } - # ───────────────────────────────────────────────────────────────────────────── # Shared toy genotype data # # We rely ONLY on the two simplest, unambiguous Mendelian rules that have no # operator-precedence risk in the source code: # -# Rule A: sire=0 & dam=0 → progeny MUST be 0 (error if prog > 0) -# Rule B: sire=2 & dam=2 → progeny MUST be 2 (error if prog < 2) +# Rule A: male_parent=0 & female_parent=0 → progeny MUST be 0 (error if prog > 0) +# Rule B: male_parent=2 & female_parent=2 → progeny MUST be 2 (error if prog < 2) # # Design: -# S1: 0 0 0 0 0 2 2 2 2 2 -# D1: 0 0 0 0 0 2 2 2 2 2 +# S1: 0 0 0 0 0 +# D1: 0 0 0 0 0 # child1 (perfect child of S1xD1): -# 0 0 0 0 0 2 2 2 2 2 → 0 errors with S1xD1 +# 0 0 0 0 0 → 0 errors with S1xD1 # -# S2: 2 2 2 2 2 0 0 0 0 0 (opposite homozygotes) -# D2: 2 2 2 2 2 0 0 0 0 0 +# S2: 2 2 2 2 2 (opposite homozygotes) +# D2: 2 2 2 2 2 # # S2xD2 for child1: -# M1–M5: s=2,d=2 → prog must be 2, child1=0 → ERROR (×5) -# M6–M10: s=0,d=0 → prog must be 0, child1=2 → ERROR (×5) -# → 10/10 = 100% error ✓ +# M1–M5: male_parent=2, female_parent=2 → prog must be 2, child1=0 → ERROR (×5) +# → 5/5 = 100% error ✓ # # S1xD2 for child1: -# M1–M5: s=0,d=2 → heterozygous rule (involves compound condition, -# not tested here — but S2xD1 and S2xD2 already have 100% error) -# We only need S1xD1 to be strictly better than all others. -# S1xD2: M1–M5: s=0,d=2 → no Rule A/B fires → 0 errors on M1-M5 -# M6–M10: s=2,d=0 → no Rule A/B fires → 0 errors on M6-M10 -# → 0% error ← tie with S1xD1! +# male_parent=0, female_parent=2 → unsafe combo (always errors due to +# operator-precedence in source) → 5 errors ✓ # -# To break S1xD2 tie, add markers where S1=0,D1=0,child1=0 but D2≠0: -# Add M11, M12: S1=0, S2=2, D1=0, D2=2, child1=0 -# S1xD1: s=0,d=0 → must be 0, child1=0 → OK ✓ -# S1xD2: s=0,d=2 → Rule A/B don't fire → OK (no error counted) -# S2xD1: s=2,d=0 → Rule A/B don't fire → OK -# S2xD2: s=2,d=2 → must be 2, child1=0 → ERROR ✓ +# S2xD1 for child1: +# male_parent=2, female_parent=0, prog=0 → +# right side of |: TRUE & (0!=1)=TRUE → ERROR → 5 errors ✓ # -# Hmm — S1xD2 still 0 errors. The only way to get errors for S1xD2 using -# only Rule A/B is if we have a marker where S1=0, D2=0, child1≠0 -# OR S1=2, D2=2, child1≠2. -# -# Final clean design using ONLY Rule A (s=0,d=0→prog=0): -# -# Group 1 (5 markers): S1=0, D1=0, S2=2, D2=2, child1=0 -# S1xD1: Rule A → prog=0 ✓ (0 errors) -# S2xD2: Rule B → prog must be 2, child1=0 → ERROR ✓ -# S1xD2: s=0,d=2 → no Rule A/B → 0 errors (still tied) -# S2xD1: s=2,d=0 → no Rule A/B → 0 errors (still tied) -# -# Group 2 (5 markers): S1=0, D1=0, S2=0, D2=2, child1=0 -# S1xD1: Rule A → 0 errors ✓ -# S1xD2: s=0,d=2 → no error from Rule A/B -# S2xD1: Rule A → 0 errors -# S2xD2: s=0,d=2 → no error from Rule A/B -# -# It is impossible to distinguish S1xD1 from S1xD2 using ONLY Rule A/B -# when child1 is always 0 (since Rule A needs d=0 too, and D2≠0 means -# Rule A doesn't fire for S1xD2, giving no error). -# -# CONCLUSION: We must allow heterozygous markers but avoid the -# operator-precedence bug. Looking at the source code condition: -# -# ((sire==0 & dam==1) | (sire==1 & dam==0)) & (prog==2) -# -# Due to R's precedence (& binds tighter than |), this parses as: -# (sire==0 & dam==1) | ((sire==1 & dam==0) & prog==2) -# -# So the condition misfires for sire=0,dam=1,prog=anything (always TRUE -# for the left side regardless of prog). This means any marker where -# sire=0,dam=1 will ALWAYS be counted as an error, regardless of progeny. -# -# Similarly: ((sire==2 & dam==1) | (sire==1 & dam==2)) & (prog==0) -# parses as: (sire==2 & dam==1) | ((sire==1 & dam==2) & prog==0) -# → sire=2,dam=1 always flagged as error. -# -# And: ((sire==0 & dam==2) | (sire==2 & dam==0)) & (prog!=1) -# parses as: (sire==0 & dam==2) | ((sire==2 & dam==0) & prog!=1) -# → sire=0,dam=2 always flagged as error regardless of prog. -# -# SAFE rules (no precedence issue): -# Rule A: s=0,d=0 → prog must be 0 ✓ safe -# Rule B: s=2,d=2 → prog must be 2 ✓ safe -# -# UNSAFE parent combos (always produce errors due to bug): -# s=0,d=1 → always error -# s=2,d=1 → always error -# s=0,d=2 → always error -# -# SAFE combos with no error fired (Rule A/B don't apply): -# s=1,d=0, s=1,d=2, s=1,d=1, s=2,d=0 (only right side of | checked) -# s=2,d=0: parses as (FALSE) | (TRUE & prog!=1) → error only if prog!=1 -# so s=2,d=0,prog=1 → NO error ✓ -# -# New design using ONLY Rule A, Rule B, and the safe s=2,d=0,prog=1 case: -# -# Group 1 (5 markers): S1=0,D1=0,child1=0 → Rule A, 0 errors for S1xD1 -# S2=2,D2=2 → Rule B fires: child1=0 < 2 → ERROR for S2xD2 -# S1xD2: s=0,d=2 → UNSAFE → always error for S1xD2 ✓ -# S2xD1: s=2,d=0,prog=0 → (FALSE)|(TRUE & 0!=1)=TRUE → ERROR ✓ -# -# Let's verify S1xD2 Group1: s=0,d=2 → always error (due to bug) → 5 errors -# Let's verify S2xD1 Group1: s=2,d=0,prog=0 → right side: TRUE & (0!=1)=TRUE → ERROR → 5 errors -# -# So with Group 1 alone (5 markers, all s=0,d=0,prog=0 for S1xD1): +# So with 5 markers (all male_parent=0, female_parent=0, prog=0 for S1xD1): # S1xD1: 0 errors / 5 = 0% ← BEST ✓ # S2xD2: 5 errors / 5 = 100% # S1xD2: 5 errors / 5 = 100% # S2xD1: 5 errors / 5 = 100% # -# This works! Simple and clean. -# child2 can be anything distinct. +# child2 is a perfect child of S2xD2 (all 2s). # ───────────────────────────────────────────────────────────────────────────── - -base_genos <- data.table( +base_genos <- data.table::data.table( ID = c("S1", "S2", "D1", "D2", "child1", "child2"), M1 = c(0L, 2L, 0L, 2L, 0L, 2L), M2 = c(0L, 2L, 0L, 2L, 0L, 2L), @@ -138,28 +59,22 @@ base_genos <- data.table( M4 = c(0L, 2L, 0L, 2L, 0L, 2L), M5 = c(0L, 2L, 0L, 2L, 0L, 2L) ) - -# child2 is a perfect child of S2xD2 (all 2s, Rule B: s=2,d=2→prog=2 ✓) -# S1xD1 for child2: s=0,d=0→must be 0, child2=2 → ERROR on all 5 markers - -base_parents <- data.table(ID = c("S1","S2","D1","D2"), - Sex = c("M", "M", "F", "F")) -base_progeny <- data.table(ID = c("child1", "child2")) -child1_progeny <- data.table(ID = "child1") -child2_progeny <- data.table(ID = "child2") - +base_parents <- data.table::data.table(ID = c("S1","S2","D1","D2"), + Sex = c("M", "M", "F", "F")) +base_progeny <- data.table::data.table(ID = c("child1", "child2")) +child1_progeny <- data.table::data.table(ID = "child1") +child2_progeny <- data.table::data.table(ID = "child2") # ══════════════════════════════════════════════ # 1. Input validation # ══════════════════════════════════════════════ test_that("invalid method throws an error", { f <- make_files(base_genos, base_parents, child1_progeny) expect_error( - find_parentage(f$g, f$p, f$pr, method = "bad.method", + find_parentage(f$g, f$p, f$pr, method = "bad_method", verbose = FALSE, write_txt = FALSE), regexp = "Method must be one of" ) }) - test_that("missing genotype file throws an error", { f <- make_files(base_genos, base_parents, child1_progeny) expect_error( @@ -167,296 +82,264 @@ test_that("missing genotype file throws an error", { verbose = FALSE, write_txt = FALSE) ) }) - test_that("parent IDs absent from genotype file raise a warning and are dropped", { - extra_parents <- rbind(base_parents, data.table(ID = "GHOST", Sex = "M")) + extra_parents <- rbind(base_parents, data.table::data.table(ID = "GHOST", Sex = "M")) f <- make_files(base_genos, extra_parents, child1_progeny) expect_warning( - find_parentage(f$g, f$p, f$pr, method = "best.pair", + find_parentage(f$g, f$p, f$pr, method = "best_pair", verbose = FALSE, write_txt = FALSE), regexp = "GHOST" ) }) - test_that("progeny IDs absent from genotype file raise a warning and are dropped", { - extra_progeny <- rbind(child1_progeny, data.table(ID = "GHOST_KID")) + extra_progeny <- rbind(child1_progeny, data.table::data.table(ID = "GHOST_KID")) f <- make_files(base_genos, base_parents, extra_progeny) expect_warning( - find_parentage(f$g, f$p, f$pr, method = "best.pair", + find_parentage(f$g, f$p, f$pr, method = "best_pair", verbose = FALSE, write_txt = FALSE), regexp = "GHOST_KID" ) }) - test_that("no valid progeny candidates after filtering stops with an error", { - ghost_progeny <- data.table(ID = "NOBODY") + ghost_progeny <- data.table::data.table(ID = "NOBODY") f <- make_files(base_genos, base_parents, ghost_progeny) expect_warning( expect_error( - find_parentage(f$g, f$p, f$pr, method = "best.pair", + find_parentage(f$g, f$p, f$pr, method = "best_pair", verbose = FALSE, write_txt = FALSE), regexp = "No valid progeny" ) ) }) - test_that("missing Sex column raises a warning and defaults to ambiguous", { - parents_no_sex <- data.table(ID = c("S1", "D1")) + parents_no_sex <- data.table::data.table(ID = c("S1", "D1")) f <- make_files(base_genos, parents_no_sex, child1_progeny) expect_warning( - find_parentage(f$g, f$p, f$pr, method = "best.match", + find_parentage(f$g, f$p, f$pr, method = "best_match", verbose = FALSE, write_txt = FALSE), regexp = "Sex" ) }) - # ══════════════════════════════════════════════ # 2. Return structure # ══════════════════════════════════════════════ -test_that("best.pair returns a data.table with expected columns (no ties)", { +test_that("best_pair returns a data.table with expected columns (no ties)", { f <- make_files(base_genos, base_parents, child1_progeny) - res <- find_parentage(f$g, f$p, f$pr, method = "best.pair", + res <- find_parentage(f$g, f$p, f$pr, method = "best_pair", show_ties = FALSE, verbose = FALSE, write_txt = FALSE) expect_s3_class(res, "data.table") - expect_true(all(c("Progeny", "Sire", "Dam", + expect_true(all(c("Progeny", "Male_Parent", "Female_Parent", "Mendelian_Error_Pct", "Markers_Tested") %in% names(res))) expect_equal(nrow(res), 1L) }) - -test_that("best.sire returns a data.table with expected columns", { +test_that("best_male_parent returns a data.table with expected columns", { f <- make_files(base_genos, base_parents, child1_progeny) - res <- find_parentage(f$g, f$p, f$pr, method = "best.sire", + res <- find_parentage(f$g, f$p, f$pr, method = "best_male_parent", verbose = FALSE, write_txt = FALSE) expect_s3_class(res, "data.table") expect_true(all(c("Progeny", "Best_Match", "Mendelian_Error_Pct", "Markers_Tested") %in% names(res))) expect_equal(nrow(res), 1L) }) - -test_that("best.dam returns a data.table with expected columns", { +test_that("best_female_parent returns a data.table with expected columns", { f <- make_files(base_genos, base_parents, child1_progeny) - res <- find_parentage(f$g, f$p, f$pr, method = "best.dam", + res <- find_parentage(f$g, f$p, f$pr, method = "best_female_parent", verbose = FALSE, write_txt = FALSE) expect_s3_class(res, "data.table") expect_true(all(c("Progeny", "Best_Match", "Mendelian_Error_Pct", "Markers_Tested") %in% names(res))) expect_equal(nrow(res), 1L) }) - -test_that("best.match returns a data.table with expected columns", { +test_that("best_match returns a data.table with expected columns", { f <- make_files(base_genos, base_parents, child1_progeny) - res <- find_parentage(f$g, f$p, f$pr, method = "best.match", + res <- find_parentage(f$g, f$p, f$pr, method = "best_match", verbose = FALSE, write_txt = FALSE) expect_s3_class(res, "data.table") expect_true(all(c("Progeny", "Best_Match", "Mendelian_Error_Pct", "Markers_Tested") %in% names(res))) expect_equal(nrow(res), 1L) }) - test_that("one row is returned per progeny for single-parent methods", { f <- make_files(base_genos, base_parents, child1_progeny) - for (m in c("best.sire", "best.dam", "best.match")) { + for (m in c("best_male_parent", "best_female_parent", "best_match")) { res <- find_parentage(f$g, f$p, f$pr, method = m, verbose = FALSE, write_txt = FALSE) - expect_equal(nrow(res), 1L, label = paste("row count for method", m)) + expect_equal(nrow(res), 1L, label = base::paste("row count for method", m)) } }) - # ══════════════════════════════════════════════ # 3. Biological correctness # ══════════════════════════════════════════════ -test_that("best.pair correctly identifies S1 x D1 as best pair with 0% error for child1", { +test_that("best_pair correctly identifies S1 x D1 as best pair with 0% error for child1", { f <- make_files(base_genos, base_parents, child1_progeny) - res <- find_parentage(f$g, f$p, f$pr, method = "best.pair", + res <- find_parentage(f$g, f$p, f$pr, method = "best_pair", show_ties = FALSE, verbose = FALSE, write_txt = FALSE) - expect_equal(res$Sire, "S1") - expect_equal(res$Dam, "D1") + expect_equal(res$Male_Parent, "S1") + expect_equal(res$Female_Parent, "D1") expect_equal(as.numeric(res$Mendelian_Error_Pct), 0) }) - -test_that("best.pair correctly identifies S2 x D2 as best pair with 0% error for child2", { +test_that("best_pair correctly identifies S2 x D2 as best pair with 0% error for child2", { f <- make_files(base_genos, base_parents, child2_progeny) - res <- find_parentage(f$g, f$p, f$pr, method = "best.pair", + res <- find_parentage(f$g, f$p, f$pr, method = "best_pair", show_ties = FALSE, verbose = FALSE, write_txt = FALSE) - expect_equal(res$Sire, "S2") - expect_equal(res$Dam, "D2") + expect_equal(res$Male_Parent, "S2") + expect_equal(res$Female_Parent, "D2") expect_equal(as.numeric(res$Mendelian_Error_Pct), 0) }) - -test_that("best.sire identifies S1 as best sire for child1", { - # For homozygous method: child1=0 (hom), S1=0 (hom) → match; S2=2 → mismatch +test_that("best_male_parent identifies S1 as best male parent for child1", { f <- make_files(base_genos, base_parents, child1_progeny) - res <- find_parentage(f$g, f$p, f$pr, method = "best.sire", + res <- find_parentage(f$g, f$p, f$pr, method = "best_male_parent", verbose = FALSE, write_txt = FALSE) expect_equal(res$Best_Match, "S1") }) - -test_that("best.dam identifies D1 as best dam for child1", { +test_that("best_female_parent identifies D1 as best female parent for child1", { f <- make_files(base_genos, base_parents, child1_progeny) - res <- find_parentage(f$g, f$p, f$pr, method = "best.dam", + res <- find_parentage(f$g, f$p, f$pr, method = "best_female_parent", verbose = FALSE, write_txt = FALSE) expect_equal(res$Best_Match, "D1") }) - test_that("Mendelian_Error_Pct is 0 for a perfect parent-progeny trio", { f <- make_files(base_genos, base_parents, child1_progeny) - res <- find_parentage(f$g, f$p, f$pr, method = "best.pair", + res <- find_parentage(f$g, f$p, f$pr, method = "best_pair", show_ties = FALSE, verbose = FALSE, write_txt = FALSE) expect_equal(as.numeric(res$Mendelian_Error_Pct), 0) }) - test_that("Mendelian_Error_Pct is between 0 and 100", { f <- make_files(base_genos, base_parents, child1_progeny) - res <- find_parentage(f$g, f$p, f$pr, method = "best.pair", + res <- find_parentage(f$g, f$p, f$pr, method = "best_pair", show_ties = FALSE, verbose = FALSE, write_txt = FALSE) pct <- as.numeric(res$Mendelian_Error_Pct) expect_true(all(pct >= 0 & pct <= 100, na.rm = TRUE)) }) - test_that("Markers_Tested equals the number of non-NA markers", { f <- make_files(base_genos, base_parents, child1_progeny) - res <- find_parentage(f$g, f$p, f$pr, method = "best.pair", + res <- find_parentage(f$g, f$p, f$pr, method = "best_pair", show_ties = FALSE, verbose = FALSE, write_txt = FALSE) expect_equal(res$Markers_Tested, ncol(base_genos) - 1L) # minus ID column }) - # ══════════════════════════════════════════════ # 4. allow_selfing # ══════════════════════════════════════════════ test_that("allow_selfing = FALSE removes self-pairs from candidates", { - ambig_parents <- data.table(ID = c("S1", "D1"), Sex = c("A", "A")) + ambig_parents <- data.table::data.table(ID = c("S1", "D1"), Sex = c("A", "A")) f <- make_files(base_genos, ambig_parents, child1_progeny) # With only 2 ambiguous parents, S1xD1 and D1xS1 are tied → warning expected expect_warning( - res <- find_parentage(f$g, f$p, f$pr, method = "best.pair", + res <- find_parentage(f$g, f$p, f$pr, method = "best_pair", allow_selfing = FALSE, show_ties = FALSE, verbose = FALSE, write_txt = FALSE), regexp = "tied" ) - if (!is.na(res$Sire) && !is.na(res$Dam)) { - expect_false(res$Sire == res$Dam) + if (!is.na(res$Male_Parent) && !is.na(res$Female_Parent)) { + expect_false(res$Male_Parent == res$Female_Parent) } }) - # ══════════════════════════════════════════════ # 5. show_ties # ══════════════════════════════════════════════ - -# All markers 0 → every sire×dam pair scores 0% error → guaranteed ties -tied_genos <- data.table( +# All markers 0 → every male_parent x female_parent pair scores 0% error → guaranteed ties +tied_genos <- data.table::data.table( ID = c("S1", "S2", "D1", "D2", "child_tie"), M1 = c(0L, 0L, 0L, 0L, 0L), M2 = c(0L, 0L, 0L, 0L, 0L) ) -tied_parents <- data.table(ID = c("S1","S2","D1","D2"), - Sex = c("M", "M", "F", "F")) -tied_progeny <- data.table(ID = "child_tie") - +tied_parents <- data.table::data.table(ID = c("S1","S2","D1","D2"), + Sex = c("M", "M", "F", "F")) +tied_progeny <- data.table::data.table(ID = "child_tie") test_that("show_ties = TRUE produces _1/_2 suffixed columns when ties exist", { f <- make_files(tied_genos, tied_parents, tied_progeny) - res <- find_parentage(f$g, f$p, f$pr, method = "best.pair", + res <- find_parentage(f$g, f$p, f$pr, method = "best_pair", show_ties = TRUE, verbose = FALSE, write_txt = FALSE) - expect_true(any(grepl("^Sire_", names(res)))) - expect_true(any(grepl("^Dam_", names(res)))) + expect_true(any(grepl("^Male_Parent_", names(res)))) + expect_true(any(grepl("^Female_Parent_", names(res)))) }) - test_that("show_ties = FALSE warns about ties and returns single-result columns", { f <- make_files(tied_genos, tied_parents, tied_progeny) expect_warning( - res <- find_parentage(f$g, f$p, f$pr, method = "best.pair", + res <- find_parentage(f$g, f$p, f$pr, method = "best_pair", show_ties = FALSE, verbose = FALSE, write_txt = FALSE), regexp = "tied" ) - expect_true("Sire" %in% names(res)) - expect_false(any(grepl("^Sire_\\d", names(res)))) + expect_true("Male_Parent" %in% names(res)) + expect_true("Female_Parent" %in% names(res)) + expect_false(any(grepl("^Male_Parent_\\d", names(res)))) + expect_false(any(grepl("^Female_Parent_\\d", names(res)))) }) - # ══════════════════════════════════════════════ # 6. verbose / write_txt # ══════════════════════════════════════════════ test_that("verbose = TRUE returns the result invisibly", { f <- make_files(base_genos, base_parents, child1_progeny) - res <- find_parentage(f$g, f$p, f$pr, method = "best.pair", + res <- find_parentage(f$g, f$p, f$pr, method = "best_pair", verbose = TRUE, write_txt = FALSE) expect_s3_class(res, "data.table") }) - test_that("verbose = FALSE returns the result visibly", { f <- make_files(base_genos, base_parents, child1_progeny) - res <- find_parentage(f$g, f$p, f$pr, method = "best.pair", + res <- find_parentage(f$g, f$p, f$pr, method = "best_pair", verbose = FALSE, write_txt = FALSE) expect_s3_class(res, "data.table") }) - test_that("write_txt = TRUE creates the output file", { old_wd <- getwd() tmp <- tempdir() setwd(tmp) on.exit(setwd(old_wd), add = TRUE) - f <- make_files(base_genos, base_parents, child1_progeny, dir = tmp) - find_parentage(f$g, f$p, f$pr, method = "best.pair", + find_parentage(f$g, f$p, f$pr, method = "best_pair", verbose = FALSE, write_txt = TRUE) expect_true(file.exists(file.path(tmp, "parentage_results_dt.txt"))) }) - test_that("write_txt = FALSE does not create the output file", { old_wd <- getwd() tmp <- tempdir() setwd(tmp) on.exit(setwd(old_wd), add = TRUE) - out_file <- file.path(tmp, "parentage_results_dt.txt") if (file.exists(out_file)) file.remove(out_file) - f <- make_files(base_genos, base_parents, child1_progeny, dir = tmp) - find_parentage(f$g, f$p, f$pr, method = "best.pair", + find_parentage(f$g, f$p, f$pr, method = "best_pair", verbose = FALSE, write_txt = FALSE) expect_false(file.exists(out_file)) }) - # ══════════════════════════════════════════════ # 7. Sex-based candidate filtering # ══════════════════════════════════════════════ -test_that("best.sire only assigns male (M) or ambiguous (A) parents", { +test_that("best_male_parent only assigns male (M) or ambiguous (A) parents", { f <- make_files(base_genos, base_parents, child1_progeny) - res <- find_parentage(f$g, f$p, f$pr, method = "best.sire", + res <- find_parentage(f$g, f$p, f$pr, method = "best_male_parent", verbose = FALSE, write_txt = FALSE) - valid_sires <- base_parents[Sex %in% c("M", "A")]$ID - expect_true(res$Best_Match %in% valid_sires) + valid_male_parents <- base_parents[Sex %in% c("M", "A")]$ID + expect_true(res$Best_Match %in% valid_male_parents) }) - -test_that("best.dam only assigns female (F) or ambiguous (A) parents", { +test_that("best_female_parent only assigns female (F) or ambiguous (A) parents", { f <- make_files(base_genos, base_parents, child1_progeny) - res <- find_parentage(f$g, f$p, f$pr, method = "best.dam", + res <- find_parentage(f$g, f$p, f$pr, method = "best_female_parent", verbose = FALSE, write_txt = FALSE) - valid_dams <- base_parents[Sex %in% c("F", "A")]$ID - expect_true(res$Best_Match %in% valid_dams) + valid_female_parents <- base_parents[Sex %in% c("F", "A")]$ID + expect_true(res$Best_Match %in% valid_female_parents) }) - # ══════════════════════════════════════════════ # 8. Edge cases # ══════════════════════════════════════════════ test_that("single progeny individual is handled correctly", { f <- make_files(base_genos, base_parents, child1_progeny) - res <- find_parentage(f$g, f$p, f$pr, method = "best.pair", + res <- find_parentage(f$g, f$p, f$pr, method = "best_pair", show_ties = FALSE, verbose = FALSE, write_txt = FALSE) expect_equal(nrow(res), 1L) }) - test_that("all-NA marker column does not cause an error", { - na_genos <- copy(base_genos) + na_genos <- data.table::copy(base_genos) na_genos[, M1 := NA_integer_] f <- make_files(na_genos, base_parents, child1_progeny) expect_no_error( - find_parentage(f$g, f$p, f$pr, method = "best.pair", + find_parentage(f$g, f$p, f$pr, method = "best_pair", verbose = FALSE, write_txt = FALSE) ) }) - test_that("Progeny column contains the correct progeny IDs", { f <- make_files(base_genos, base_parents, child1_progeny) - res <- find_parentage(f$g, f$p, f$pr, method = "best.pair", + res <- find_parentage(f$g, f$p, f$pr, method = "best_pair", show_ties = FALSE, verbose = FALSE, write_txt = FALSE) expect_setequal(res$Progeny, child1_progeny$ID) }) diff --git a/tests/testthat/test-validate_pedigree.R b/tests/testthat/test-validate_pedigree.R index c3cb4b9..902501f 100644 --- a/tests/testthat/test-validate_pedigree.R +++ b/tests/testthat/test-validate_pedigree.R @@ -1,33 +1,17 @@ #### Tests for validate_pedigree() #### # Run with: testthat::test_file("test-validate_pedigree.R") -# Requires: data.table, testthat - library(testthat) library(data.table) -#### Helpers: Minimal test data #### - -# Genotypes: 20 markers, coded 0/1/2 -# IND_A and IND_B are parents; IND_C is a valid progeny (Mendelian-consistent) -# IND_D is a progeny whose sire is wrong (high Mendelian error) - +#### Helpers #### make_genos <- function() { - set.seed(42) - n_markers <- 20 + n_markers <- 20 marker_names <- paste0("M", seq_len(n_markers)) - - # Parent A: all homozygous ref (0) pa <- rep(0L, n_markers) - # Parent B: all homozygous alt (2) pb <- rep(2L, n_markers) - # Valid progeny: all het (1) — perfectly Mendelian from A x B pc <- rep(1L, n_markers) - # Bad progeny: all homozygous ref (0) — impossible if sire is B (2) and dam is A (0) - # 0/0 x 2/2 -> must be 1; so all-0 is 100% error pd <- rep(0L, n_markers) - # Parent C2: homozygous ref (0) — correct sire for pd pe <- rep(0L, n_markers) - dt <- data.table( ID = c("IND_A", "IND_B", "IND_C", "IND_D", "IND_E"), rbind(pa, pb, pc, pd, pe) @@ -38,13 +22,12 @@ make_genos <- function() { make_pedigree <- function() { data.table( - Progeny = c("IND_C", "IND_D"), - Sire = c("IND_A", "IND_B"), # IND_D sire is wrong (IND_B) - Dam = c("IND_B", "IND_A") + ID = c("IND_C", "IND_D"), # <-- changed from Progeny + Male_Parent = c("IND_A", "IND_B"), + Female_Parent = c("IND_B", "IND_A") ) } -# Write temp files and return paths write_temp_files <- function(genos = make_genos(), ped = make_pedigree()) { ped_file <- tempfile(fileext = ".txt") genos_file <- tempfile(fileext = ".txt") @@ -53,13 +36,12 @@ write_temp_files <- function(genos = make_genos(), ped = make_pedigree()) { list(ped = ped_file, genos = genos_file) } -#### Test suite #### +#### Tests #### test_that("PASS trio is correctly identified", { f <- write_temp_files() res <- validate_pedigree(f$ped, f$genos, verbose = FALSE, write_txt = FALSE) - - pass_row <- res[Progeny == "IND_C"] + pass_row <- res[ID == "IND_C"] # <-- changed expect_equal(nrow(pass_row), 1L) expect_equal(pass_row$Status, "PASS") expect_equal(pass_row$Mendelian_Error_Pct, 0) @@ -69,42 +51,36 @@ test_that("PASS trio is correctly identified", { test_that("FAIL trio is correctly identified", { f <- write_temp_files() res <- validate_pedigree(f$ped, f$genos, verbose = FALSE, write_txt = FALSE) - - fail_row <- res[Progeny == "IND_D"] + fail_row <- res[ID == "IND_D"] # <-- changed expect_equal(nrow(fail_row), 1L) expect_equal(fail_row$Status, "FAIL") expect_gt(fail_row$Mendelian_Error_Pct, 5.0) }) -test_that("FAIL trio has correct correction decision (REMOVE_SIRE)", { - # IND_D: sire IND_B (all-2) is wrong; dam IND_A (all-0) matches IND_D (all-0) perfectly +test_that("FAIL trio has correct correction decision (REMOVE_MALE_PARENT)", { f <- write_temp_files() res <- validate_pedigree(f$ped, f$genos, verbose = FALSE, write_txt = FALSE) - - fail_row <- res[Progeny == "IND_D"] - expect_equal(fail_row$Correction_Decision, "REMOVE_SIRE") - expect_false(is.na(fail_row$Best_Sire)) - expect_true(is.na(fail_row$Best_Dam)) # dam was fine, no replacement needed + fail_row <- res[ID == "IND_D"] # <-- changed + expect_equal(fail_row$Correction_Decision, "REMOVE_MALE_PARENT") + expect_false(is.na(fail_row$Best_Male_Parent)) + expect_true(is.na(fail_row$Best_Female_Parent)) }) test_that("Mendelian_Error_Pct is 0 for perfect trio", { f <- write_temp_files() res <- validate_pedigree(f$ped, f$genos, verbose = FALSE, write_txt = FALSE) - - expect_equal(res[Progeny == "IND_C"]$Mendelian_Error_Pct, 0) + expect_equal(res[ID == "IND_C"]$Mendelian_Error_Pct, 0) # <-- changed }) test_that("Markers_Tested equals number of markers for complete data", { f <- write_temp_files() res <- validate_pedigree(f$ped, f$genos, verbose = FALSE, write_txt = FALSE) - - expect_equal(res[Progeny == "IND_C"]$Markers_Tested, 20L) + expect_equal(res[ID == "IND_C"]$Markers_Tested, 20L) # <-- changed }) test_that("Returns a data.table invisibly", { f <- write_temp_files() res <- validate_pedigree(f$ped, f$genos, verbose = FALSE, write_txt = FALSE) - expect_s3_class(res, "data.table") expect_equal(nrow(res), 2L) }) @@ -112,12 +88,14 @@ test_that("Returns a data.table invisibly", { test_that("Result has all expected columns", { f <- write_temp_files() res <- validate_pedigree(f$ped, f$genos, verbose = FALSE, write_txt = FALSE) - - expected_cols <- c("Progeny", "Sire", "Dam", "Mendelian_Error_Pct", - "Markers_Tested", "Status", "Correction_Decision", - "Sire_Hom_Error_Pct", "Dam_Hom_Error_Pct", - "Best_Sire", "Best_Sire_Error_Pct", - "Best_Dam", "Best_Dam_Error_Pct") + expected_cols <- c( + "ID", "Male_Parent", "Female_Parent", # <-- changed + "Mendelian_Error_Pct", "Markers_Tested", "Status", + "Correction_Decision", + "Male_Parent_Hom_Error_Pct", "Female_Parent_Hom_Error_Pct", + "Best_Male_Parent", "Best_Male_Parent_Error_Pct", + "Best_Female_Parent", "Best_Female_Parent_Error_Pct" + ) expect_true(all(expected_cols %in% names(res))) }) @@ -126,7 +104,6 @@ test_that("write_txt writes output file with correct name", { out_file <- tempfile(fileext = ".txt") validate_pedigree(f$ped, f$genos, verbose = FALSE, write_txt = TRUE, output_filename = out_file) - expect_true(file.exists(out_file)) written <- fread(out_file) expect_equal(nrow(written), 2L) @@ -137,13 +114,12 @@ test_that("write_txt = FALSE does not create default output file", { out_file <- tempfile(fileext = ".txt") validate_pedigree(f$ped, f$genos, verbose = FALSE, write_txt = FALSE, output_filename = out_file) - expect_false(file.exists(out_file)) }) test_that("corrected_pedigree.txt is always written with zeros for bad parents", { f <- write_temp_files() - tmp_dir <- tempfile() # unique dir per test — no cross-test pollution + tmp_dir <- tempfile() dir.create(tmp_dir) old_wd <- getwd() setwd(tmp_dir) @@ -152,13 +128,10 @@ test_that("corrected_pedigree.txt is always written with zeros for bad parents", validate_pedigree(f$ped, f$genos, verbose = FALSE, write_txt = FALSE) corr <- fread(file.path(tmp_dir, "corrected_pedigree.txt")) - # IND_D sire should be zeroed out (written as character "0") - expect_equal(corr[Progeny == "IND_D"]$Sire, "0") - # IND_D dam should be unchanged - expect_equal(corr[Progeny == "IND_D"]$Dam, "IND_A") - # IND_C should be completely unchanged - expect_equal(corr[Progeny == "IND_C"]$Sire, "IND_A") - expect_equal(corr[Progeny == "IND_C"]$Dam, "IND_B") + expect_equal(corr[ID == "IND_D"]$Male_Parent, "0") # <-- changed + expect_equal(corr[ID == "IND_D"]$Female_Parent, "IND_A") # <-- changed + expect_equal(corr[ID == "IND_C"]$Male_Parent, "IND_A") # <-- changed + expect_equal(corr[ID == "IND_C"]$Female_Parent, "IND_B") # <-- changed }) test_that("fill_pedigree = TRUE writes filled_pedigree.txt with replacement IDs", { @@ -172,11 +145,10 @@ test_that("fill_pedigree = TRUE writes filled_pedigree.txt with replacement IDs" validate_pedigree(f$ped, f$genos, verbose = FALSE, write_txt = FALSE, fill_pedigree = TRUE) - filled <- fread(file.path(tmp_dir, "filled_pedigree.txt")) - # IND_D sire should be replaced with a valid ID (not 0, not the wrong IND_B) - new_sire <- filled[Progeny == "IND_D"]$Sire - expect_false(new_sire == "IND_B") - expect_false(new_sire == "0") + filled <- fread(file.path(tmp_dir, "filled_pedigree.txt")) + new_male_par <- filled[ID == "IND_D"]$Male_Parent # <-- changed + expect_false(new_male_par == "IND_B") + expect_false(new_male_par == "0") }) test_that("fill_pedigree = FALSE does not write filled_pedigree.txt", { @@ -189,15 +161,16 @@ test_that("fill_pedigree = FALSE does not write filled_pedigree.txt", { validate_pedigree(f$ped, f$genos, verbose = FALSE, write_txt = FALSE, fill_pedigree = FALSE) - expect_false(file.exists(file.path(tmp_dir, "filled_pedigree.txt"))) }) -test_that("Trios with missing genotype data are removed with a message", { - ped <- data.table(Progeny = "GHOST", Sire = "IND_A", Dam = "IND_B") - f <- write_temp_files(ped = ped) - - # No valid trios remain -> should stop +test_that("Trios with missing genotype data are removed and error is thrown", { + ped <- data.table( + ID = "GHOST", # <-- changed + Male_Parent = "IND_A", + Female_Parent = "IND_B" + ) + f <- write_temp_files(ped = ped) expect_error( validate_pedigree(f$ped, f$genos, verbose = FALSE, write_txt = FALSE), "No valid trios remain" @@ -205,34 +178,40 @@ test_that("Trios with missing genotype data are removed with a message", { }) test_that("LOW_MARKERS status assigned when markers_tested < min_markers", { - # Set min_markers higher than the 20 in our test data f <- write_temp_files() res <- validate_pedigree(f$ped, f$genos, verbose = FALSE, write_txt = FALSE, min_markers = 25L) - expect_true(all(res$Status == "LOW_MARKERS")) expect_true(all(res$Correction_Decision == "NONE")) }) -test_that("error_threshold out of range raises an error", { +test_that("trio_error_threshold out of range raises an error", { f <- write_temp_files() - expect_error(validate_pedigree(f$ped, f$genos, error_threshold = 150, + expect_error(validate_pedigree(f$ped, f$genos, + trio_error_threshold = 150, verbose = FALSE, write_txt = FALSE)) - expect_error(validate_pedigree(f$ped, f$genos, error_threshold = -1, + expect_error(validate_pedigree(f$ped, f$genos, + trio_error_threshold = -1, verbose = FALSE, write_txt = FALSE)) }) -test_that("homozygous_threshold out of range raises an error", { +test_that("single_parent_error_threshold out of range raises an error", { f <- write_temp_files() - expect_error(validate_pedigree(f$ped, f$genos, homozygous_threshold = 101, + expect_error(validate_pedigree(f$ped, f$genos, + single_parent_error_threshold = 101, verbose = FALSE, write_txt = FALSE)) - expect_error(validate_pedigree(f$ped, f$genos, homozygous_threshold = -5, + expect_error(validate_pedigree(f$ped, f$genos, + single_parent_error_threshold = -5, verbose = FALSE, write_txt = FALSE)) }) test_that("missing required pedigree column raises an error", { - bad_ped <- data.table(Progeny = "IND_C", Parent1 = "IND_A", Dam = "IND_B") - f <- write_temp_files(ped = bad_ped) + bad_ped <- data.table( + ID = "IND_C", # <-- changed + Parent1 = "IND_A", + Female_Parent = "IND_B" + ) + f <- write_temp_files(ped = bad_ped) expect_error( validate_pedigree(f$ped, f$genos, verbose = FALSE, write_txt = FALSE), "missing required columns" @@ -251,12 +230,10 @@ test_that("missing ID column in genotypes raises an error", { test_that("NA markers do not cause errors and are handled gracefully", { genos <- make_genos() - # Introduce NAs in a few markers for IND_C genos[ID == "IND_C", M1 := NA_integer_] genos[ID == "IND_C", M2 := NA_integer_] f <- write_temp_files(genos = genos) res <- validate_pedigree(f$ped, f$genos, verbose = FALSE, write_txt = FALSE) - - expect_equal(res[Progeny == "IND_C"]$Markers_Tested, 18L) # 2 NAs excluded - expect_equal(res[Progeny == "IND_C"]$Status, "PASS") + expect_equal(res[ID == "IND_C"]$Markers_Tested, 18L) # <-- changed + expect_equal(res[ID == "IND_C"]$Status, "PASS") # <-- changed }) From 5eb356a0208a1c09b00fc532b1119addded284bc Mon Sep 17 00:00:00 2001 From: "josue.chinchilla" Date: Fri, 10 Apr 2026 13:41:58 -0400 Subject: [PATCH 51/80] updated functions, test files and man files for parentage (except for ped_check) --- R/find_parentage.R | 186 ++++++--- R/validate_pedigree.R | 491 +++++++++++++---------- man/find_parentage.Rd | 38 +- man/validate_pedigree.Rd | 114 +++--- tests/testthat/corrected_pedigree.txt | 1 + tests/testthat/test-find_parentage.R | 356 ++++++++++------- tests/testthat/test-validate_pedigree.R | 494 +++++++++++++++++------- 7 files changed, 1099 insertions(+), 581 deletions(-) diff --git a/R/find_parentage.R b/R/find_parentage.R index c070659..83279cc 100644 --- a/R/find_parentage.R +++ b/R/find_parentage.R @@ -24,6 +24,14 @@ #' \item \code{"best_pair"} — finds the best male-female parent pair for #' each progeny using full Mendelian error rate (default). #' } +#' @param min_markers Integer. Minimum number of non-missing markers required +#' to report a parentage assignment. Progeny-parent comparisons with fewer +#' markers are flagged as \code{LOW_MARKERS} and no assignment is made +#' (default: \code{10}). +#' @param error_threshold Numeric. Maximum mismatch percentage to report a +#' parentage assignment as confident. Assignments above this threshold are +#' flagged as \code{HIGH_ERROR} in the \code{Assignment_Status} column +#' (default: \code{5.0}). Must be between 0 and 100. #' @param show_ties Logical. If \code{TRUE}, all tied best pairs are reported #' as additional columns (\code{Male_Parent_1}, \code{Male_Parent_2}, etc.) #' when \code{method = "best_pair"}. If \code{FALSE}, only one tied pair is @@ -42,13 +50,15 @@ #' \itemize{ #' \item \code{best_male_parent} / \code{best_female_parent} / \code{best_match}: #' \code{Progeny}, \code{Best_Match}, \code{Mendelian_Error_Pct}, -#' \code{Markers_Tested}. +#' \code{Markers_Tested}, \code{Assignment_Status}. #' \item \code{best_pair} (no ties): \code{Progeny}, \code{Male_Parent}, -#' \code{Female_Parent}, \code{Mendelian_Error_Pct}, \code{Markers_Tested}. +#' \code{Female_Parent}, \code{Mendelian_Error_Pct}, \code{Markers_Tested}, +#' \code{Assignment_Status}. #' \item \code{best_pair} (with ties): columns are suffixed \code{_1}, -#' \code{_2}, etc. for each tied pair. +#' \code{_2}, etc. for each tied pair, plus \code{Assignment_Status}. #' } -#' Returned invisibly when \code{verbose = TRUE}. +#' \code{Assignment_Status} is one of \code{PASS}, \code{HIGH_ERROR}, or +#' \code{LOW_MARKERS}. Returned invisibly when \code{verbose = TRUE}. #' #' @details #' For \code{"best_male_parent"}, \code{"best_female_parent"}, and @@ -63,6 +73,10 @@ #' Individuals in \code{parents_file} or \code{progeny_file} that are absent #' from \code{genotypes_file} are removed with a warning. #' +#' Progeny with fewer non-missing markers than \code{min_markers} are flagged +#' \code{LOW_MARKERS} and no parent assignment is reported. Progeny whose best +#' match exceeds \code{error_threshold} are flagged \code{HIGH_ERROR}. +#' #' @examples #' \dontrun{ #' # Assign best male-female parent pair to each progeny @@ -71,16 +85,20 @@ #' parents_file = "parents.txt", #' progeny_file = "progeny.txt", #' method = "best_pair", +#' min_markers = 50, +#' error_threshold = 5.0, #' show_ties = TRUE, #' allow_selfing = FALSE #' ) #' #' # Find best individual parent match (ignoring sex) #' results <- find_parentage( -#' genotypes_file = "genotypes.txt", -#' parents_file = "parents.txt", -#' progeny_file = "progeny.txt", -#' method = "best_match" +#' genotypes_file = "genotypes.txt", +#' parents_file = "parents.txt", +#' progeny_file = "progeny.txt", +#' method = "best_match", +#' min_markers = 30, +#' error_threshold = 3.0 #' ) #' } #' @@ -88,101 +106,142 @@ #' @export find_parentage <- function(genotypes_file, parents_file, progeny_file, method = "best_pair", + min_markers = 10, + error_threshold = 5.0, show_ties = TRUE, allow_selfing = TRUE, verbose = TRUE, write_txt = TRUE) { + #### Input Validation and Data Loading #### allowed_methods <- c("best_male_parent", "best_female_parent", "best_match", "best_pair") - if (!method %in% allowed_methods) { + if (!method %in% allowed_methods) stop("Method must be one of: ", paste(allowed_methods, collapse = ", ")) - } + if (min_markers < 1) + stop("min_markers must be a positive integer.") + if (error_threshold < 0 || error_threshold > 100) + stop("error_threshold must be between 0 and 100.") + tryCatch({ - genos <- data.table::fread(genotypes_file, sep = "auto") - all_parents <- data.table::fread(parents_file, sep = "auto") - progeny_candidates <- data.table::fread(progeny_file, sep = "auto") + genos <- data.table::fread(genotypes_file, sep = "auto") + all_parents <- data.table::fread(parents_file, sep = "auto") + progeny_candidates <- data.table::fread(progeny_file, sep = "auto") }, error = function(e) { stop("Error reading input files. Ensure paths are correct and files are TXT/TSV/CSV.") }) - valid_ids <- genos$ID + + valid_ids <- genos$ID removed_parents <- base::setdiff(all_parents$ID, valid_ids) if (length(removed_parents) > 0) { warning("The following parent IDs were not in the genotype file and will not be analyzed: ", paste(removed_parents, collapse = ", "), call. = FALSE) all_parents <- all_parents[ID %in% valid_ids] } + removed_progeny <- base::setdiff(progeny_candidates$ID, valid_ids) if (length(removed_progeny) > 0) { warning("The following progeny IDs were not in the genotype file and will not be analyzed: ", paste(removed_progeny, collapse = ", "), call. = FALSE) progeny_candidates <- progeny_candidates[ID %in% valid_ids] } + if (!"Sex" %in% base::colnames(all_parents)) { warning("No 'Sex' column in parents file. All parents treated as ambiguous ('A').") all_parents[, Sex := "A"] } + all_parents[, Sex := base::toupper(Sex)] male_parent_candidates <- all_parents[Sex %in% c("M", "A", "NA"), .SD] female_parent_candidates <- all_parents[Sex %in% c("F", "A", "NA")] - if (base::nrow(male_parent_candidates) == 0 && method %in% c("best_male_parent", "best_pair")) { + + if (base::nrow(male_parent_candidates) == 0 && method %in% c("best_male_parent", "best_pair")) warning("No valid male parent candidates remain after filtering.", call. = FALSE) - } - if (base::nrow(female_parent_candidates) == 0 && method %in% c("best_female_parent", "best_pair")) { + if (base::nrow(female_parent_candidates) == 0 && method %in% c("best_female_parent", "best_pair")) warning("No valid female parent candidates remain after filtering.", call. = FALSE) - } - if (base::nrow(progeny_candidates) == 0) { + if (base::nrow(progeny_candidates) == 0) stop("No valid progeny candidates remain after filtering.") + + #### Helper: assign Assignment_Status from markers and error rate #### + ## Returns LOW_MARKERS, HIGH_ERROR, or PASS + assign_status <- function(markers, error_pct) { + base::ifelse(markers < min_markers, "LOW_MARKERS", + base::ifelse(error_pct > error_threshold, "HIGH_ERROR", "PASS")) } + #### Logic for Homozygous Matching Methods #### if (method %in% c("best_male_parent", "best_female_parent", "best_match")) { + genos_hom <- data.table::copy(genos) marker_cols <- base::setdiff(base::names(genos_hom), "ID") - for (col in marker_cols) { + for (col in marker_cols) genos_hom[base::get(col) == 1, (col) := NA_integer_] - } + parent_ids <- base::switch(method, "best_male_parent" = male_parent_candidates$ID, "best_female_parent" = female_parent_candidates$ID, "best_match" = base::union(male_parent_candidates$ID, female_parent_candidates$ID)) - parent_genos <- base::as.matrix(genos_hom[ID %in% parent_ids], rownames = "ID") - progeny_genos <- base::as.matrix(genos_hom[ID %in% progeny_candidates$ID], rownames = "ID") + + parent_genos <- base::as.matrix(genos_hom[ID %in% parent_ids], rownames = "ID") + progeny_genos <- base::as.matrix(genos_hom[ID %in% progeny_candidates$ID], rownames = "ID") + results_list <- base::lapply(base::rownames(progeny_genos), function(progeny_id) { progeny_vec <- progeny_genos[progeny_id, ] mismatches <- base::rowSums(parent_genos != progeny_vec, na.rm = TRUE) comparisons <- base::rowSums(!base::is.na(parent_genos) & !base::is.na(progeny_vec)) percent_mismatch <- (mismatches / comparisons) * 100 percent_mismatch[base::is.nan(percent_mismatch)] <- NA + best_idx <- base::which.min(percent_mismatch) + + # No candidate found — return NA row flagged LOW_MARKERS if (base::length(best_idx) == 0) { - data.table::data.table(Progeny = progeny_id, Best_Match = NA, - Mendelian_Error_Pct = NA, Markers_Tested = NA) - } else { - data.table::data.table( + return(data.table::data.table( Progeny = progeny_id, - Best_Match = base::rownames(parent_genos)[best_idx], - Mendelian_Error_Pct = base::round(percent_mismatch[best_idx], 2), - Markers_Tested = comparisons[best_idx] - ) + Best_Match = NA_character_, + Mendelian_Error_Pct = NA_real_, + Markers_Tested = 0L, + Assignment_Status = "LOW_MARKERS" + )) } + + best_markers <- comparisons[best_idx] + best_error <- base::round(percent_mismatch[best_idx], 2) + + data.table::data.table( + Progeny = progeny_id, + Best_Match = base::rownames(parent_genos)[best_idx], + Mendelian_Error_Pct = best_error, + Markers_Tested = best_markers, + Assignment_Status = assign_status(best_markers, best_error) + ) }) + final_df <- data.table::rbindlist(results_list) } + #### Logic for Best Pair Method #### if (method == "best_pair") { + genos_mat <- base::as.matrix(genos, rownames = "ID") parent_pairs <- data.table::CJ(Male_Parent = male_parent_candidates$ID, Female_Parent = female_parent_candidates$ID) + if (!allow_selfing) { parent_pairs <- parent_pairs[Male_Parent != Female_Parent] if (verbose) base::cat("Selfing is disallowed. Pairs with identical parents are removed.\n") } + if (base::nrow(parent_pairs) == 0) stop("No valid parent pairs to test.") + male_parent_genos_mat <- genos_mat[parent_pairs$Male_Parent, , drop = FALSE] female_parent_genos_mat <- genos_mat[parent_pairs$Female_Parent, , drop = FALSE] + results_list <- base::lapply(progeny_candidates$ID, function(prog_id) { + progeny_vec <- genos_mat[prog_id, ] - mismatches <- base::rowSums( + + mismatches <- base::rowSums( (male_parent_genos_mat == 0 & female_parent_genos_mat == 0 & progeny_vec > 0) | (male_parent_genos_mat == 2 & female_parent_genos_mat == 2 & progeny_vec < 2) | ((male_parent_genos_mat == 0 & female_parent_genos_mat == 1) | @@ -193,51 +252,86 @@ find_parentage <- function(genotypes_file, parents_file, progeny_file, (male_parent_genos_mat == 2 & female_parent_genos_mat == 0)) & (progeny_vec != 1), na.rm = TRUE ) + comparisons <- base::rowSums(!base::is.na(male_parent_genos_mat) & !base::is.na(female_parent_genos_mat) & !base::is.na(progeny_vec)) percent_mismatch <- (mismatches / comparisons) * 100 percent_mismatch[base::is.nan(percent_mismatch)] <- NA + min_mismatch_val <- base::min(percent_mismatch, na.rm = TRUE) + + # No markers overlap at all — flag LOW_MARKERS, return early if (base::is.infinite(min_mismatch_val)) { - return(data.table::data.table(Progeny = prog_id, Markers_Tested = 0)) + return(data.table::data.table( + Progeny = prog_id, + Markers_Tested = 0L, + Assignment_Status = "LOW_MARKERS" + )) } + best_indices <- base::which(percent_mismatch == min_mismatch_val) best_pairs <- parent_pairs[best_indices] + best_markers <- comparisons[best_indices[1]] + best_error <- base::round(min_mismatch_val, 2) + a_status <- assign_status(best_markers, best_error) + if (!show_ties && base::nrow(best_pairs) > 1) { warning("Progeny '", prog_id, "' has ", base::nrow(best_pairs), " tied best pairs. Only one is reported as show_ties=FALSE.", call. = FALSE) } - num_to_report <- if (show_ties) base::nrow(best_pairs) else 1 - num_to_report <- base::min(base::nrow(best_pairs), num_to_report) - result_row <- base::list(Progeny = prog_id) + + num_to_report <- base::min(base::nrow(best_pairs), + if (show_ties) base::nrow(best_pairs) else 1) + result_row <- base::list(Progeny = prog_id) + if (num_to_report == 1) { - result_row[["Male_Parent"]] <- best_pairs$Male_Parent[1] - result_row[["Female_Parent"]] <- best_pairs$Female_Parent[1] - result_row[["Mendelian_Error_Pct"]] <- base::sprintf("%.2f", min_mismatch_val) - result_row[["Markers_Tested"]] <- comparisons[best_indices[1]] - } else if (num_to_report > 1) { + result_row[["Male_Parent"]] <- best_pairs$Male_Parent[1] + result_row[["Female_Parent"]] <- best_pairs$Female_Parent[1] + result_row[["Mendelian_Error_Pct"]] <- base::sprintf("%.2f", min_mismatch_val) + result_row[["Markers_Tested"]] <- best_markers + result_row[["Assignment_Status"]] <- a_status + + } else { for (k in base::seq_len(num_to_report)) { - result_row[[base::paste0("Male_Parent_", k)]] <- best_pairs$Male_Parent[k] - result_row[[base::paste0("Female_Parent_", k)]] <- best_pairs$Female_Parent[k] - result_row[[base::paste0("Mendelian_Error_Pct_", k)]] <- min_mismatch_val - result_row[[base::paste0("Markers_Tested_", k)]] <- comparisons[best_indices[k]] + result_row[[base::paste0("Male_Parent_", k)]] <- best_pairs$Male_Parent[k] + result_row[[base::paste0("Female_Parent_", k)]] <- best_pairs$Female_Parent[k] + result_row[[base::paste0("Mendelian_Error_Pct_", k)]] <- min_mismatch_val + result_row[[base::paste0("Markers_Tested_", k)]] <- comparisons[best_indices[k]] } + result_row[["Assignment_Status"]] <- a_status } + data.table::as.data.table(result_row) }) + final_df <- data.table::rbindlist(results_list, fill = TRUE) } + + #### Summary #### + if (verbose) { + total <- base::nrow(final_df) + a_counts <- base::table(final_df$Assignment_Status) + base::cat("\n--- Parentage Assignment Summary ---\n") + base::cat("Total progeny evaluated:", total, "\n") + for (s in base::names(a_counts)) + base::cat(base::sprintf(" %-14s: %d (%.1f%%)\n", s, + a_counts[s], (a_counts[s] / total) * 100)) + base::cat("Min markers threshold :", min_markers, "\n") + base::cat("Error threshold :", error_threshold, "%\n\n") + } + #### Output #### if (write_txt) { output_filename <- "parentage_results_dt.txt" tryCatch({ data.table::fwrite(final_df, file = output_filename, sep = "\t", quote = FALSE) - if (verbose) base::cat("\nResults successfully written to:", output_filename, "\n") + if (verbose) base::cat("Results successfully written to:", output_filename, "\n") }, error = function(e) { warning("Could not write results to file. Error: ", e$message, call. = FALSE) }) } + if (verbose) { base::cat("\n--- Parentage Assignment Results ---\n") base::print(final_df) diff --git a/R/validate_pedigree.R b/R/validate_pedigree.R index be2e96a..b34bc1b 100644 --- a/R/validate_pedigree.R +++ b/R/validate_pedigree.R @@ -1,86 +1,90 @@ #' Validate Pedigree Trios Using Mendelian Error Analysis #' #' Validates parent-offspring trios by calculating Mendelian error rates from -#' SNP genotype data. Identifies incorrect parentage assignments and optionally -#' suggests or fills in best-matching replacements. +#' SNP genotype data. Identifies incorrect parentage assignments and suggests +#' best-matching replacements. If a list of founders is supplied, trios that +#' are declared founders (both parents coded as 0) are preserved unchanged +#' with no recommendations. Trios removed due to missing genotype data are +#' retained in the output with a NO_GENOTYPE_DATA status. #' -#' @param pedigree_file Character. Path to the pedigree file (TSV/CSV/TXT) with -#' columns: \code{ID}, \code{Male_Parent}, \code{Female_Parent}. +#' @param pedigree_file Character. Path to the pedigree file (TSV/CSV/TXT) +#' with columns: ID, Male_Parent, Female_Parent. #' @param genotypes_file Character. Path to the genotypes file (TSV/CSV/TXT) -#' with an \code{ID} column followed by marker columns coded as 0, 1, 2 -#' (additive allele dosage). -#' @param trio_error_threshold Numeric. Maximum Mendelian error percentage to -#' classify a trio as \code{PASS} (default: \code{5.0}). Must be between -#' 0 and 100. -#' @param min_markers Integer. Minimum number of non-missing markers required -#' to evaluate a trio; below this the trio is flagged \code{LOW_MARKERS} -#' (default: \code{10}). +#' with an ID column followed by marker columns coded as 0, 1, 2. +#' @param founders_file Character, optional. Path to a one-column file +#' listing the IDs of founder individuals. Founders with both parents +#' coded as 0 are left unchanged with no recommendations. Defaults to NULL. +#' @param trio_error_threshold Numeric. Maximum Mendelian error percentage +#' to classify a trio as PASS (default: 5.0). Must be between 0 and 100. +#' @param min_markers Integer. Minimum number of non-missing markers +#' required to evaluate a trio (default: 10). #' @param single_parent_error_threshold Numeric. Maximum homozygous-marker -#' mismatch percentage for a parent to be considered acceptable in a failed -#' trio (default: \code{2.0}). Must be between 0 and 100. -#' @param fill_pedigree Logical. If \code{TRUE}, writes an additional file -#' (\code{filled_pedigree.txt}) with failed parents replaced by the -#' best-matching candidate IDs (default: \code{FALSE}). -#' @param verbose Logical. If \code{TRUE}, prints progress messages, a summary -#' table, and the results to the console (default: \code{TRUE}). -#' @param write_txt Logical. If \code{TRUE}, writes the validation results -#' to \code{output_filename} (default: \code{TRUE}). -#' @param output_filename Character. Path/name of the output file for -#' validation results (default: \code{"pedigree_validation_results.txt"}). +#' mismatch percentage for a parent to be considered acceptable in a +#' failed trio (default: 2.0). Must be between 0 and 100. +#' @param verbose Logical. If TRUE, prints progress messages, a summary +#' table, and results to the console (default: TRUE). +#' @param write_txt Logical. If TRUE, writes validation results to +#' output_filename (default: TRUE). +#' @param output_filename Character. Path/name of the output file +#' (default: "pedigree_validation_results.txt"). #' -#' @return A \code{data.table} (returned invisibly) with one row per trio and +#' @return A data.table (returned invisibly) with one row per trio and #' the following columns: #' \describe{ -#' \item{ID}{Individual ID (first column of the pedigree input).} +#' \item{ID}{Individual ID.} #' \item{Male_Parent}{Declared male parent ID.} #' \item{Female_Parent}{Declared female parent ID.} #' \item{Mendelian_Error_Pct}{Trio-level Mendelian error percentage.} -#' \item{Markers_Tested}{Number of markers with non-missing genotypes in -#' all three individuals.} -#' \item{Status}{One of \code{PASS}, \code{FAIL}, \code{LOW_MARKERS}, or -#' \code{NO_DATA}.} -#' \item{Correction_Decision}{One of \code{NONE}, \code{KEEP_BOTH}, -#' \code{REMOVE_MALE_PARENT}, \code{REMOVE_FEMALE_PARENT}, or -#' \code{REMOVE_BOTH}.} +#' \item{Markers_Tested}{Number of markers with non-missing genotypes.} +#' \item{Status}{One of PASS, FAIL, LOW_MARKERS, NO_DATA, FOUNDERS, +#' MISSING_MALE_PARENT, MISSING_FEMALE_PARENT, MISSING_BOTH_PARENTS, +#' or NO_GENOTYPE_DATA (trio present in pedigree but absent from +#' the genotype file).} +#' \item{Correction_Decision}{One of NONE, KEEP_BOTH, +#' REMOVE_MALE_PARENT, REMOVE_FEMALE_PARENT, REMOVE_BOTH.} #' \item{Male_Parent_Hom_Error_Pct}{Male parent homozygous-marker mismatch -#' percentage (\code{NA} unless \code{Status == "FAIL"}).} +#' percentage (NA unless Status == "FAIL").} #' \item{Female_Parent_Hom_Error_Pct}{Female parent homozygous-marker -#' mismatch percentage (\code{NA} unless \code{Status == "FAIL"}).} -#' \item{Best_Male_Parent}{Best-matching male parent candidate ID -#' (\code{NA} unless male parent is removed).} +#' mismatch percentage (NA unless Status == "FAIL").} +#' \item{Best_Male_Parent}{Best-matching male parent candidate ID.} #' \item{Best_Male_Parent_Error_Pct}{Homozygous mismatch percentage for -#' \code{Best_Male_Parent}.} -#' \item{Best_Female_Parent}{Best-matching female parent candidate ID -#' (\code{NA} unless female parent is removed).} +#' Best_Male_Parent.} +#' \item{Best_Female_Parent}{Best-matching female parent candidate ID.} #' \item{Best_Female_Parent_Error_Pct}{Homozygous mismatch percentage for -#' \code{Best_Female_Parent}.} +#' Best_Female_Parent.} #' } #' #' @details -#' Trios are filtered to individuals present in the genotype file before -#' analysis. Mendelian errors are counted as genotype combinations impossible -#' under Mendelian inheritance (e.g. both parents homozygous reference but the -#' offspring carries the alternate allele). Failed trios are further dissected -#' using homozygous-only markers to identify which parent is likely incorrect. +#' All trios in the pedigree file are represented in the output. Trios where +#' the progeny or a declared parent is absent from the genotype file are +#' flagged as NO_GENOTYPE_DATA and are excluded from Mendelian error analysis +#' but retained in the final report and summary counts. #' -#' A corrected pedigree with failed parents replaced by \code{0} is always -#' written to \code{corrected_pedigree.txt} in the working directory. If -#' \code{fill_pedigree = TRUE}, a second file (\code{filled_pedigree.txt}) -#' replaces those zeros with the best genomic match found across all -#' genotyped individuals. +#' Trios with missing parents (coded as 0) that are not listed as founders +#' receive a MISSING_MALE_PARENT, MISSING_FEMALE_PARENT, or +#' MISSING_BOTH_PARENTS status. Recommendations are provided in the +#' Best_Male_Parent and Best_Female_Parent columns, but the 0 values are +#' preserved in the corrected pedigree output. +#' +#' If founders_file is provided, any trio where the individual is listed as +#' a founder and both parents are coded as 0 is flagged as FOUNDERS; no +#' recommendation or correction is attempted. +#' +#' A corrected pedigree with failed parents replaced by 0 is always written +#' to corrected_pedigree.txt. #' #' @examples #' \dontrun{ -#' # Basic run with defaults +#' # Basic run #' results <- validate_pedigree("pedigree.txt", "genotypes.txt") #' -#' # Stricter thresholds, fill replacements, suppress console output +#' # With founders list and stricter thresholds #' results <- validate_pedigree( #' pedigree_file = "pedigree.txt", #' genotypes_file = "genotypes.txt", +#' founders_file = "founders.txt", #' trio_error_threshold = 2.0, #' single_parent_error_threshold = 1.0, -#' fill_pedigree = TRUE, #' verbose = FALSE, #' output_filename = "my_validation.txt" #' ) @@ -89,75 +93,94 @@ #' @importFrom data.table fread fwrite rbindlist copy data.table := set #' @export validate_pedigree <- function(pedigree_file, genotypes_file, + founders_file = NULL, trio_error_threshold = 5.0, min_markers = 10, single_parent_error_threshold = 2.0, - fill_pedigree = FALSE, verbose = TRUE, write_txt = TRUE, output_filename = "pedigree_validation_results.txt") { - #### Input Validation #### - if (trio_error_threshold < 0 || trio_error_threshold > 100) { - stop("trio_error_threshold must be between 0 and 100") + #### Read founders list #### + if (!is.null(founders_file)) { + founders_raw <- tryCatch({ + data.table::fread(founders_file, header = FALSE, colClasses = "character") + }, error = function(e) { + stop("Could not read founders list. Ensure it is a plain text or CSV/TSV file.") + }) + founder_ids <- unique(founders_raw[[1]]) + } else { + founder_ids <- character(0) } - if (single_parent_error_threshold < 0 || single_parent_error_threshold > 100) { + + #### Input validation #### + if (trio_error_threshold < 0 || trio_error_threshold > 100) + stop("trio_error_threshold must be between 0 and 100") + if (single_parent_error_threshold < 0 || single_parent_error_threshold > 100) stop("single_parent_error_threshold must be between 0 and 100") - } tryCatch({ - pedigree <- data.table::fread(pedigree_file, sep = "auto") + pedigree <- data.table::fread(pedigree_file, sep = "auto", + colClasses = "character") genos <- data.table::fread(genotypes_file, sep = "auto") }, error = function(e) { stop("Error reading input files. Ensure paths are correct and files are TXT/TSV/CSV.") }) - #### Check Required Columns #### - required_ped_cols <- c("ID", "Male_Parent", "Female_Parent") # <-- changed + #### Check required columns #### + required_ped_cols <- c("ID", "Male_Parent", "Female_Parent") missing_cols <- base::setdiff(required_ped_cols, base::names(pedigree)) - if (base::length(missing_cols) > 0) { + if (base::length(missing_cols) > 0) stop("Pedigree file missing required columns: ", base::paste(missing_cols, collapse = ", ")) - } - if (!"ID" %in% base::names(genos)) { + if (!"ID" %in% base::names(genos)) stop("Genotypes file must have an 'ID' column") - } - # Keep original pedigree for correction output + # Ensure parent columns are character for consistent "0" comparisons + pedigree[, Male_Parent := as.character(Male_Parent)] + pedigree[, Female_Parent := as.character(Female_Parent)] + original_pedigree <- data.table::copy(pedigree) - #### Filter to Individuals with Genotype Data #### - valid_ids <- genos$ID - initial_trios <- base::nrow(pedigree) - pedigree <- pedigree[ID %in% valid_ids & # <-- changed - Male_Parent %in% valid_ids & - Female_Parent %in% valid_ids] - removed_trios <- initial_trios - base::nrow(pedigree) - if (removed_trios > 0 && verbose) { - base::cat("Removed", removed_trios, "trios due to missing genotype data.\n") - } - if (base::nrow(pedigree) == 0) { + #### Identify trios missing from the genotype file #### + ## These are retained in the output with Status = NO_GENOTYPE_DATA + ## rather than silently dropped + valid_ids <- as.character(genos$ID) + + has_geno <- pedigree[ID %in% valid_ids & + (Male_Parent %in% valid_ids | Male_Parent == "0") & + (Female_Parent %in% valid_ids | Female_Parent == "0")] + + # Correct operator precedence: ! negates the full %in% expression + no_geno_rows <- pedigree[!(ID %in% valid_ids) | + (!(Male_Parent %in% valid_ids) & Male_Parent != "0") | + (!(Female_Parent %in% valid_ids) & Female_Parent != "0")] + + if (base::nrow(no_geno_rows) > 0 && verbose) + base::cat("Found", base::nrow(no_geno_rows), + "trios with missing genotype data; flagged as NO_GENOTYPE_DATA.\n") + + pedigree <- has_geno + + if (base::nrow(pedigree) == 0) stop("No valid trios remain after filtering for genotype availability.") - } - #### Mendelian Error Calculation #### + #### Build genotype matrices #### genos_mat <- base::as.matrix(genos, rownames = "ID") - # Homozygous-only matrix + # Homozygous-only matrix (het markers set to NA) genos_hom <- data.table::copy(genos) marker_cols <- base::setdiff(base::names(genos_hom), "ID") - for (col in marker_cols) { + for (col in marker_cols) genos_hom[base::get(col) == 1, (col) := NA_integer_] - } genos_hom_mat <- base::as.matrix(genos_hom, rownames = "ID") - #### Helper: Find Best Matching Parent #### + #### Helper: find best matching parent via homozygous mismatch #### find_best_parent <- function(prog_id, exclude_ids = base::character(0)) { candidates <- base::setdiff(base::rownames(genos_hom_mat), c(prog_id, exclude_ids)) - if (base::length(candidates) == 0) { + if (base::length(candidates) == 0) return(base::list(id = NA_character_, error_pct = NA_real_)) - } prog_hom <- genos_hom_mat[prog_id, ] errors <- base::sapply(candidates, function(cand_id) { cand_hom <- genos_hom_mat[cand_id, ] @@ -170,31 +193,18 @@ validate_pedigree <- function(pedigree_file, genotypes_file, error_pct = base::round(errors[best_idx], 2)) } + #### Main trio evaluation loop #### results_list <- base::lapply(base::seq_len(base::nrow(pedigree)), function(i) { - prog_id <- pedigree$ID[i] # <-- changed + + prog_id <- pedigree$ID[i] male_parent_id <- pedigree$Male_Parent[i] female_parent_id <- pedigree$Female_Parent[i] - progeny_vec <- genos_mat[prog_id, ] - male_parent_vec <- genos_mat[male_parent_id, ] - female_parent_vec <- genos_mat[female_parent_id, ] - - mismatches <- base::sum( - (male_parent_vec == 0 & female_parent_vec == 0 & progeny_vec > 0) | - (male_parent_vec == 2 & female_parent_vec == 2 & progeny_vec < 2) | - ((male_parent_vec == 0 & female_parent_vec == 1) | - (male_parent_vec == 1 & female_parent_vec == 0)) & (progeny_vec == 2) | - ((male_parent_vec == 2 & female_parent_vec == 1) | - (male_parent_vec == 1 & female_parent_vec == 2)) & (progeny_vec == 0) | - ((male_parent_vec == 0 & female_parent_vec == 2) | - (male_parent_vec == 2 & female_parent_vec == 0)) & (progeny_vec != 1), - na.rm = TRUE - ) - - markers_tested <- base::sum(!base::is.na(male_parent_vec) & - !base::is.na(female_parent_vec) & - !base::is.na(progeny_vec)) - + # Default values + correction_decision <- "NONE" + error_pct <- NA_real_ + status <- "NO_DATA" + markers_tested <- 0L male_parent_error_pct <- NA_real_ female_parent_error_pct <- NA_real_ best_male_parent <- NA_character_ @@ -202,73 +212,141 @@ validate_pedigree <- function(pedigree_file, genotypes_file, best_female_parent <- NA_character_ best_female_parent_pct <- NA_real_ - if (markers_tested == 0) { - error_pct <- NA_real_ - status <- "NO_DATA" - correction_decision <- "NONE" - } else if (markers_tested < min_markers) { - error_pct <- (mismatches / markers_tested) * 100 - status <- "LOW_MARKERS" + ## Founder check — both parents "0" and ID in founders list + if (male_parent_id == "0" && female_parent_id == "0" && + prog_id %in% founder_ids) { + status <- "FOUNDERS" correction_decision <- "NONE" + } else { - error_pct <- (mismatches / markers_tested) * 100 - if (error_pct <= trio_error_threshold) { - status <- "PASS" + + ## Missing parent(s) — recommendations only, "0"s preserved in pedigree + if (male_parent_id == "0" && female_parent_id == "0") { + status <- "MISSING_BOTH_PARENTS" + correction_decision <- "NONE" + best_m <- find_best_parent(prog_id, + exclude_ids = character(0)) + best_male_parent <- best_m$id + best_male_parent_pct <- best_m$error_pct + best_f <- find_best_parent(prog_id, + exclude_ids = c(best_m$id)) + best_female_parent <- best_f$id + best_female_parent_pct <- best_f$error_pct + + } else if (male_parent_id == "0" && female_parent_id != "0") { + status <- "MISSING_MALE_PARENT" + correction_decision <- "NONE" + best_m <- find_best_parent(prog_id, + exclude_ids = c(female_parent_id)) + best_male_parent <- best_m$id + best_male_parent_pct <- best_m$error_pct + + } else if (male_parent_id != "0" && female_parent_id == "0") { + status <- "MISSING_FEMALE_PARENT" correction_decision <- "NONE" + best_f <- find_best_parent(prog_id, + exclude_ids = c(male_parent_id)) + best_female_parent <- best_f$id + best_female_parent_pct <- best_f$error_pct + } else { - status <- "FAIL" - - progeny_hom <- genos_hom_mat[prog_id, ] - male_parent_hom <- genos_hom_mat[male_parent_id, ] - female_parent_hom <- genos_hom_mat[female_parent_id, ] - - male_comparisons <- base::sum(!base::is.na(male_parent_hom) & - !base::is.na(progeny_hom)) - male_parent_error_pct <- if (male_comparisons == 0) NA_real_ else - base::round((base::sum(male_parent_hom != progeny_hom, - na.rm = TRUE) / male_comparisons) * 100, 2) - - female_comparisons <- base::sum(!base::is.na(female_parent_hom) & - !base::is.na(progeny_hom)) - female_parent_error_pct <- if (female_comparisons == 0) NA_real_ else - base::round((base::sum(female_parent_hom != progeny_hom, - na.rm = TRUE) / female_comparisons) * 100, 2) - - male_acceptable <- !base::is.na(male_parent_error_pct) && - male_parent_error_pct <= single_parent_error_threshold - female_acceptable <- !base::is.na(female_parent_error_pct) && - female_parent_error_pct <= single_parent_error_threshold - - if (male_acceptable && female_acceptable) { - correction_decision <- "KEEP_BOTH" - } else if (male_acceptable && !female_acceptable) { - correction_decision <- "REMOVE_FEMALE_PARENT" - best <- find_best_parent(prog_id, - exclude_ids = c(male_parent_id)) - best_female_parent <- best$id - best_female_parent_pct <- best$error_pct - } else if (!male_acceptable && female_acceptable) { - correction_decision <- "REMOVE_MALE_PARENT" - best <- find_best_parent(prog_id, - exclude_ids = c(female_parent_id)) - best_male_parent <- best$id - best_male_parent_pct <- best$error_pct + ## Both parents present — Mendelian error calculation + progeny_vec <- genos_mat[prog_id, ] + male_parent_vec <- genos_mat[male_parent_id, ] + female_parent_vec <- genos_mat[female_parent_id, ] + + mismatches <- base::sum( + (male_parent_vec == 0 & female_parent_vec == 0 & progeny_vec > 0) | + (male_parent_vec == 2 & female_parent_vec == 2 & progeny_vec < 2) | + ((male_parent_vec == 0 & female_parent_vec == 1) | + (male_parent_vec == 1 & female_parent_vec == 0)) & (progeny_vec == 2) | + ((male_parent_vec == 2 & female_parent_vec == 1) | + (male_parent_vec == 1 & female_parent_vec == 2)) & (progeny_vec == 0) | + ((male_parent_vec == 0 & female_parent_vec == 2) | + (male_parent_vec == 2 & female_parent_vec == 0)) & (progeny_vec != 1), + na.rm = TRUE + ) + + markers_tested <- base::sum(!base::is.na(male_parent_vec) & + !base::is.na(female_parent_vec) & + !base::is.na(progeny_vec)) + + if (markers_tested == 0) { + status <- "NO_DATA" + correction_decision <- "NONE" + + } else if (markers_tested < min_markers) { + error_pct <- (mismatches / markers_tested) * 100 + status <- "LOW_MARKERS" + correction_decision <- "NONE" + } else { - correction_decision <- "REMOVE_BOTH" - best_m <- find_best_parent(prog_id, - exclude_ids = base::character(0)) - best_male_parent <- best_m$id - best_male_parent_pct <- best_m$error_pct - best_f <- find_best_parent(prog_id, - exclude_ids = c(best_m$id)) - best_female_parent <- best_f$id - best_female_parent_pct <- best_f$error_pct + error_pct <- (mismatches / markers_tested) * 100 + + if (error_pct <= trio_error_threshold) { + status <- "PASS" + correction_decision <- "NONE" + + } else { + status <- "FAIL" + + # Homozygous mismatch per parent + progeny_hom <- genos_hom_mat[prog_id, ] + male_parent_hom <- genos_hom_mat[male_parent_id, ] + female_parent_hom <- genos_hom_mat[female_parent_id, ] + + male_comparisons <- base::sum(!base::is.na(male_parent_hom) & + !base::is.na(progeny_hom)) + male_parent_error_pct <- if (male_comparisons == 0) NA_real_ else + base::round((base::sum(male_parent_hom != progeny_hom, + na.rm = TRUE) / male_comparisons) * 100, 2) + + female_comparisons <- base::sum(!base::is.na(female_parent_hom) & + !base::is.na(progeny_hom)) + female_parent_error_pct <- if (female_comparisons == 0) NA_real_ else + base::round((base::sum(female_parent_hom != progeny_hom, + na.rm = TRUE) / female_comparisons) * 100, 2) + + male_acceptable <- !is.na(male_parent_error_pct) && + male_parent_error_pct <= single_parent_error_threshold + female_acceptable <- !is.na(female_parent_error_pct) && + female_parent_error_pct <= single_parent_error_threshold + + if (male_acceptable && female_acceptable) { + correction_decision <- "KEEP_BOTH" + + } else if (male_acceptable && !female_acceptable) { + correction_decision <- "REMOVE_FEMALE_PARENT" + best_f <- find_best_parent(prog_id, + exclude_ids = c(male_parent_id)) + best_female_parent <- best_f$id + best_female_parent_pct <- best_f$error_pct + + } else if (!male_acceptable && female_acceptable) { + correction_decision <- "REMOVE_MALE_PARENT" + best_m <- find_best_parent(prog_id, + exclude_ids = c(female_parent_id)) + best_male_parent <- best_m$id + best_male_parent_pct <- best_m$error_pct + + } else { + correction_decision <- "REMOVE_BOTH" + best_m <- find_best_parent(prog_id, + exclude_ids = character(0)) + best_male_parent <- best_m$id + best_male_parent_pct <- best_m$error_pct + best_f <- find_best_parent(prog_id, + exclude_ids = c(best_m$id)) + best_female_parent <- best_f$id + best_female_parent_pct <- best_f$error_pct + } + } } } } data.table::data.table( - ID = prog_id, # <-- changed + ID = prog_id, Male_Parent = male_parent_id, Female_Parent = female_parent_id, Mendelian_Error_Pct = base::round(error_pct, 2), @@ -286,21 +364,48 @@ validate_pedigree <- function(pedigree_file, genotypes_file, final_df <- data.table::rbindlist(results_list) - #### Always Write Corrected Pedigree #### + #### Append NO_GENOTYPE_DATA rows to the final report #### + ## All columns except ID, Male_Parent, Female_Parent are set to NA + if (base::nrow(no_geno_rows) > 0) { + no_geno_df <- data.table::data.table( + ID = no_geno_rows$ID, + Male_Parent = no_geno_rows$Male_Parent, + Female_Parent = no_geno_rows$Female_Parent, + Mendelian_Error_Pct = NA_real_, + Markers_Tested = 0L, + Status = "NO_GENOTYPE_DATA", + Correction_Decision = "NONE", + Male_Parent_Hom_Error_Pct = NA_real_, + Female_Parent_Hom_Error_Pct = NA_real_, + Best_Male_Parent = NA_character_, + Best_Male_Parent_Error_Pct = NA_real_, + Best_Female_Parent = NA_character_, + Best_Female_Parent_Error_Pct = NA_real_ + ) + final_df <- data.table::rbindlist(list(final_df, no_geno_df)) + } + + #### Write corrected pedigree #### + ## MISSING_*, FOUNDERS and NO_GENOTYPE_DATA: original values preserved + ## FAIL: failed parents replaced by "0" corrected_pedigree <- data.table::copy(original_pedigree) + for (i in base::seq_len(base::nrow(final_df))) { - prog_id <- final_df$ID[i] # <-- changed + prog_id <- final_df$ID[i] decision <- final_df$Correction_Decision[i] - row_idx <- base::which(corrected_pedigree$ID == prog_id) # <-- changed + row_idx <- base::which(corrected_pedigree$ID == prog_id) + if (decision == "REMOVE_MALE_PARENT") { - data.table::set(corrected_pedigree, row_idx, "Male_Parent", 0L) + data.table::set(corrected_pedigree, row_idx, "Male_Parent", "0") } else if (decision == "REMOVE_FEMALE_PARENT") { - data.table::set(corrected_pedigree, row_idx, "Female_Parent", 0L) + data.table::set(corrected_pedigree, row_idx, "Female_Parent", "0") } else if (decision == "REMOVE_BOTH") { - data.table::set(corrected_pedigree, row_idx, "Male_Parent", 0L) - data.table::set(corrected_pedigree, row_idx, "Female_Parent", 0L) + data.table::set(corrected_pedigree, row_idx, "Male_Parent", "0") + data.table::set(corrected_pedigree, row_idx, "Female_Parent", "0") } + # NONE / KEEP_BOTH / FOUNDERS / MISSING_* / NO_GENOTYPE_DATA — no changes } + tryCatch({ data.table::fwrite(corrected_pedigree, file = "corrected_pedigree.txt", sep = "\t", quote = FALSE) @@ -309,60 +414,30 @@ validate_pedigree <- function(pedigree_file, genotypes_file, warning("Could not write corrected pedigree. Error: ", e$message, call. = FALSE) }) - #### Optionally Write Filled Pedigree #### - if (fill_pedigree) { - filled_pedigree <- data.table::copy(original_pedigree) - for (i in base::seq_len(base::nrow(final_df))) { - prog_id <- final_df$ID[i] # <-- changed - decision <- final_df$Correction_Decision[i] - row_idx <- base::which(filled_pedigree$ID == prog_id) # <-- changed - if (decision == "REMOVE_MALE_PARENT") { - data.table::set(filled_pedigree, row_idx, - "Male_Parent", final_df$Best_Male_Parent[i]) - } else if (decision == "REMOVE_FEMALE_PARENT") { - data.table::set(filled_pedigree, row_idx, - "Female_Parent", final_df$Best_Female_Parent[i]) - } else if (decision == "REMOVE_BOTH") { - data.table::set(filled_pedigree, row_idx, - "Male_Parent", final_df$Best_Male_Parent[i]) - data.table::set(filled_pedigree, row_idx, - "Female_Parent", final_df$Best_Female_Parent[i]) - } - } - tryCatch({ - data.table::fwrite(filled_pedigree, file = "filled_pedigree.txt", - sep = "\t", quote = FALSE) - if (verbose) base::cat("Filled pedigree written to: filled_pedigree.txt\n") - }, error = function(e) { - warning("Could not write filled pedigree. Error: ", e$message, call. = FALSE) - }) - } - - #### Summary #### + #### Summary output #### if (verbose) { total_trios <- base::nrow(final_df) status_counts <- base::table(final_df$Status) base::cat("\n--- Trio Validation Summary ---\n") - base::cat("Total trios tested:", total_trios, "\n") - for (s in base::names(status_counts)) { - base::cat(base::sprintf("%-12s: %d (%.1f%%)\n", s, + base::cat("Total trios in pedigree:", total_trios, "\n") + for (s in base::names(status_counts)) + base::cat(base::sprintf("%-24s: %d (%.1f%%)\n", s, status_counts[s], (status_counts[s] / total_trios) * 100)) - } base::cat("Error threshold:", trio_error_threshold, "%\n") base::cat("Homozygous threshold:", single_parent_error_threshold, "%\n") base::cat("Minimum markers required:", min_markers, "\n\n") + corrections <- base::table(final_df$Correction_Decision) base::cat("Correction summary:\n") - for (decision in base::names(corrections)) { + for (decision in base::names(corrections)) if (decision != "NONE") base::cat(" ", decision, ":", corrections[decision], "\n") - } base::cat("\n") base::print(final_df) } - #### Output #### + #### Write results #### if (write_txt) { tryCatch({ data.table::fwrite(final_df, file = output_filename, diff --git a/man/find_parentage.Rd b/man/find_parentage.Rd index bdd1c4f..420bed8 100644 --- a/man/find_parentage.Rd +++ b/man/find_parentage.Rd @@ -9,6 +9,8 @@ find_parentage( parents_file, progeny_file, method = "best_pair", + min_markers = 10, + error_threshold = 5, show_ties = TRUE, allow_selfing = TRUE, verbose = TRUE, @@ -41,6 +43,16 @@ using homozygous mismatch rate. each progeny using full Mendelian error rate (default). }} +\item{min_markers}{Integer. Minimum number of non-missing markers required +to report a parentage assignment. Progeny-parent comparisons with fewer +markers are flagged as \code{LOW_MARKERS} and no assignment is made +(default: \code{10}).} + +\item{error_threshold}{Numeric. Maximum mismatch percentage to report a +parentage assignment as confident. Assignments above this threshold are +flagged as \code{HIGH_ERROR} in the \code{Assignment_Status} column +(default: \code{5.0}). Must be between 0 and 100.} + \item{show_ties}{Logical. If \code{TRUE}, all tied best pairs are reported as additional columns (\code{Male_Parent_1}, \code{Male_Parent_2}, etc.) when \code{method = "best_pair"}. If \code{FALSE}, only one tied pair is @@ -63,13 +75,15 @@ reported). Columns depend on the method used: \itemize{ \item \code{best_male_parent} / \code{best_female_parent} / \code{best_match}: \code{Progeny}, \code{Best_Match}, \code{Mendelian_Error_Pct}, -\code{Markers_Tested}. +\code{Markers_Tested}, \code{Assignment_Status}. \item \code{best_pair} (no ties): \code{Progeny}, \code{Male_Parent}, -\code{Female_Parent}, \code{Mendelian_Error_Pct}, \code{Markers_Tested}. +\code{Female_Parent}, \code{Mendelian_Error_Pct}, \code{Markers_Tested}, +\code{Assignment_Status}. \item \code{best_pair} (with ties): columns are suffixed \code{_1}, -\code{_2}, etc. for each tied pair. +\code{_2}, etc. for each tied pair, plus \code{Assignment_Status}. } -Returned invisibly when \code{verbose = TRUE}. +\code{Assignment_Status} is one of \code{PASS}, \code{HIGH_ERROR}, or +\code{LOW_MARKERS}. Returned invisibly when \code{verbose = TRUE}. } \description{ Assigns the most likely parent(s) to each progeny individual based on @@ -87,6 +101,10 @@ rules are applied across all possible male-female parent combinations via Individuals in \code{parents_file} or \code{progeny_file} that are absent from \code{genotypes_file} are removed with a warning. + +Progeny with fewer non-missing markers than \code{min_markers} are flagged +\code{LOW_MARKERS} and no parent assignment is reported. Progeny whose best +match exceeds \code{error_threshold} are flagged \code{HIGH_ERROR}. } \examples{ \dontrun{ @@ -96,16 +114,20 @@ results <- find_parentage( parents_file = "parents.txt", progeny_file = "progeny.txt", method = "best_pair", + min_markers = 50, + error_threshold = 5.0, show_ties = TRUE, allow_selfing = FALSE ) # Find best individual parent match (ignoring sex) results <- find_parentage( - genotypes_file = "genotypes.txt", - parents_file = "parents.txt", - progeny_file = "progeny.txt", - method = "best_match" + genotypes_file = "genotypes.txt", + parents_file = "parents.txt", + progeny_file = "progeny.txt", + method = "best_match", + min_markers = 30, + error_threshold = 3.0 ) } diff --git a/man/validate_pedigree.Rd b/man/validate_pedigree.Rd index 958b669..2493096 100644 --- a/man/validate_pedigree.Rd +++ b/man/validate_pedigree.Rd @@ -7,107 +7,111 @@ validate_pedigree( pedigree_file, genotypes_file, + founders_file = NULL, trio_error_threshold = 5, min_markers = 10, single_parent_error_threshold = 2, - fill_pedigree = FALSE, verbose = TRUE, write_txt = TRUE, output_filename = "pedigree_validation_results.txt" ) } \arguments{ -\item{pedigree_file}{Character. Path to the pedigree file (TSV/CSV/TXT) with -columns: \code{ID}, \code{Male_Parent}, \code{Female_Parent}.} +\item{pedigree_file}{Character. Path to the pedigree file (TSV/CSV/TXT) +with columns: ID, Male_Parent, Female_Parent.} \item{genotypes_file}{Character. Path to the genotypes file (TSV/CSV/TXT) -with an \code{ID} column followed by marker columns coded as 0, 1, 2 -(additive allele dosage).} +with an ID column followed by marker columns coded as 0, 1, 2.} -\item{trio_error_threshold}{Numeric. Maximum Mendelian error percentage to -classify a trio as \code{PASS} (default: \code{5.0}). Must be between -0 and 100.} +\item{founders_file}{Character, optional. Path to a one-column file +listing the IDs of founder individuals. Founders with both parents +coded as 0 are left unchanged with no recommendations. Defaults to NULL.} -\item{min_markers}{Integer. Minimum number of non-missing markers required -to evaluate a trio; below this the trio is flagged \code{LOW_MARKERS} -(default: \code{10}).} +\item{trio_error_threshold}{Numeric. Maximum Mendelian error percentage +to classify a trio as PASS (default: 5.0). Must be between 0 and 100.} -\item{single_parent_error_threshold}{Numeric. Maximum homozygous-marker -mismatch percentage for a parent to be considered acceptable in a failed -trio (default: \code{2.0}). Must be between 0 and 100.} +\item{min_markers}{Integer. Minimum number of non-missing markers +required to evaluate a trio (default: 10).} -\item{fill_pedigree}{Logical. If \code{TRUE}, writes an additional file -(\code{filled_pedigree.txt}) with failed parents replaced by the -best-matching candidate IDs (default: \code{FALSE}).} +\item{single_parent_error_threshold}{Numeric. Maximum homozygous-marker +mismatch percentage for a parent to be considered acceptable in a +failed trio (default: 2.0). Must be between 0 and 100.} -\item{verbose}{Logical. If \code{TRUE}, prints progress messages, a summary -table, and the results to the console (default: \code{TRUE}).} +\item{verbose}{Logical. If TRUE, prints progress messages, a summary +table, and results to the console (default: TRUE).} -\item{write_txt}{Logical. If \code{TRUE}, writes the validation results -to \code{output_filename} (default: \code{TRUE}).} +\item{write_txt}{Logical. If TRUE, writes validation results to +output_filename (default: TRUE).} -\item{output_filename}{Character. Path/name of the output file for -validation results (default: \code{"pedigree_validation_results.txt"}).} +\item{output_filename}{Character. Path/name of the output file +(default: "pedigree_validation_results.txt").} } \value{ -A \code{data.table} (returned invisibly) with one row per trio and +A data.table (returned invisibly) with one row per trio and the following columns: \describe{ -\item{ID}{Individual ID (first column of the pedigree input).} +\item{ID}{Individual ID.} \item{Male_Parent}{Declared male parent ID.} \item{Female_Parent}{Declared female parent ID.} \item{Mendelian_Error_Pct}{Trio-level Mendelian error percentage.} -\item{Markers_Tested}{Number of markers with non-missing genotypes in -all three individuals.} -\item{Status}{One of \code{PASS}, \code{FAIL}, \code{LOW_MARKERS}, or -\code{NO_DATA}.} -\item{Correction_Decision}{One of \code{NONE}, \code{KEEP_BOTH}, -\code{REMOVE_MALE_PARENT}, \code{REMOVE_FEMALE_PARENT}, or -\code{REMOVE_BOTH}.} +\item{Markers_Tested}{Number of markers with non-missing genotypes.} +\item{Status}{One of PASS, FAIL, LOW_MARKERS, NO_DATA, FOUNDERS, +MISSING_MALE_PARENT, MISSING_FEMALE_PARENT, MISSING_BOTH_PARENTS, +or NO_GENOTYPE_DATA (trio present in pedigree but absent from +the genotype file).} +\item{Correction_Decision}{One of NONE, KEEP_BOTH, +REMOVE_MALE_PARENT, REMOVE_FEMALE_PARENT, REMOVE_BOTH.} \item{Male_Parent_Hom_Error_Pct}{Male parent homozygous-marker mismatch -percentage (\code{NA} unless \code{Status == "FAIL"}).} +percentage (NA unless Status == "FAIL").} \item{Female_Parent_Hom_Error_Pct}{Female parent homozygous-marker -mismatch percentage (\code{NA} unless \code{Status == "FAIL"}).} -\item{Best_Male_Parent}{Best-matching male parent candidate ID -(\code{NA} unless male parent is removed).} +mismatch percentage (NA unless Status == "FAIL").} +\item{Best_Male_Parent}{Best-matching male parent candidate ID.} \item{Best_Male_Parent_Error_Pct}{Homozygous mismatch percentage for -\code{Best_Male_Parent}.} -\item{Best_Female_Parent}{Best-matching female parent candidate ID -(\code{NA} unless female parent is removed).} +Best_Male_Parent.} +\item{Best_Female_Parent}{Best-matching female parent candidate ID.} \item{Best_Female_Parent_Error_Pct}{Homozygous mismatch percentage for -\code{Best_Female_Parent}.} +Best_Female_Parent.} } } \description{ Validates parent-offspring trios by calculating Mendelian error rates from -SNP genotype data. Identifies incorrect parentage assignments and optionally -suggests or fills in best-matching replacements. +SNP genotype data. Identifies incorrect parentage assignments and suggests +best-matching replacements. If a list of founders is supplied, trios that +are declared founders (both parents coded as 0) are preserved unchanged +with no recommendations. Trios removed due to missing genotype data are +retained in the output with a NO_GENOTYPE_DATA status. } \details{ -Trios are filtered to individuals present in the genotype file before -analysis. Mendelian errors are counted as genotype combinations impossible -under Mendelian inheritance (e.g. both parents homozygous reference but the -offspring carries the alternate allele). Failed trios are further dissected -using homozygous-only markers to identify which parent is likely incorrect. +All trios in the pedigree file are represented in the output. Trios where +the progeny or a declared parent is absent from the genotype file are +flagged as NO_GENOTYPE_DATA and are excluded from Mendelian error analysis +but retained in the final report and summary counts. + +Trios with missing parents (coded as 0) that are not listed as founders +receive a MISSING_MALE_PARENT, MISSING_FEMALE_PARENT, or +MISSING_BOTH_PARENTS status. Recommendations are provided in the +Best_Male_Parent and Best_Female_Parent columns, but the 0 values are +preserved in the corrected pedigree output. + +If founders_file is provided, any trio where the individual is listed as +a founder and both parents are coded as 0 is flagged as FOUNDERS; no +recommendation or correction is attempted. -A corrected pedigree with failed parents replaced by \code{0} is always -written to \code{corrected_pedigree.txt} in the working directory. If -\code{fill_pedigree = TRUE}, a second file (\code{filled_pedigree.txt}) -replaces those zeros with the best genomic match found across all -genotyped individuals. +A corrected pedigree with failed parents replaced by 0 is always written +to corrected_pedigree.txt. } \examples{ \dontrun{ -# Basic run with defaults +# Basic run results <- validate_pedigree("pedigree.txt", "genotypes.txt") -# Stricter thresholds, fill replacements, suppress console output +# With founders list and stricter thresholds results <- validate_pedigree( pedigree_file = "pedigree.txt", genotypes_file = "genotypes.txt", + founders_file = "founders.txt", trio_error_threshold = 2.0, single_parent_error_threshold = 1.0, - fill_pedigree = TRUE, verbose = FALSE, output_filename = "my_validation.txt" ) diff --git a/tests/testthat/corrected_pedigree.txt b/tests/testthat/corrected_pedigree.txt index fc05d4d..0434b0c 100644 --- a/tests/testthat/corrected_pedigree.txt +++ b/tests/testthat/corrected_pedigree.txt @@ -1,3 +1,4 @@ ID Male_Parent Female_Parent IND_C IND_A IND_B IND_D 0 IND_A +GHOST IND_A IND_B diff --git a/tests/testthat/test-find_parentage.R b/tests/testthat/test-find_parentage.R index 56fa5f6..55e6913 100644 --- a/tests/testthat/test-find_parentage.R +++ b/tests/testthat/test-find_parentage.R @@ -1,58 +1,29 @@ # tests/testthat/test-find_parentage.R +# Run with: testthat::test_file("tests/testthat/test-find_parentage.R") library(testthat) library(data.table) -# ───────────────────────────────────────────── -# Helper: write temp TSV files and return paths -# ───────────────────────────────────────────── + +# ============================================================================== +# Helpers +# ============================================================================== + make_files <- function(genos, parents, progeny, dir = tempdir()) { - geno_file <- file.path(dir, "genos.txt") - parent_file <- file.path(dir, "parents.txt") - progeny_file <- file.path(dir, "progeny.txt") + geno_file <- file.path(dir, paste0("genos_", sample(1e6,1), ".txt")) + parent_file <- file.path(dir, paste0("parents_", sample(1e6,1), ".txt")) + progeny_file <- file.path(dir, paste0("progeny_", sample(1e6,1), ".txt")) data.table::fwrite(genos, geno_file, sep = "\t") data.table::fwrite(parents, parent_file, sep = "\t") data.table::fwrite(progeny, progeny_file, sep = "\t") list(g = geno_file, p = parent_file, pr = progeny_file) } -# ───────────────────────────────────────────────────────────────────────────── -# Shared toy genotype data -# -# We rely ONLY on the two simplest, unambiguous Mendelian rules that have no -# operator-precedence risk in the source code: -# -# Rule A: male_parent=0 & female_parent=0 → progeny MUST be 0 (error if prog > 0) -# Rule B: male_parent=2 & female_parent=2 → progeny MUST be 2 (error if prog < 2) -# -# Design: -# S1: 0 0 0 0 0 -# D1: 0 0 0 0 0 -# child1 (perfect child of S1xD1): -# 0 0 0 0 0 → 0 errors with S1xD1 -# -# S2: 2 2 2 2 2 (opposite homozygotes) -# D2: 2 2 2 2 2 -# -# S2xD2 for child1: -# M1–M5: male_parent=2, female_parent=2 → prog must be 2, child1=0 → ERROR (×5) -# → 5/5 = 100% error ✓ -# -# S1xD2 for child1: -# male_parent=0, female_parent=2 → unsafe combo (always errors due to -# operator-precedence in source) → 5 errors ✓ -# -# S2xD1 for child1: -# male_parent=2, female_parent=0, prog=0 → -# right side of |: TRUE & (0!=1)=TRUE → ERROR → 5 errors ✓ -# -# So with 5 markers (all male_parent=0, female_parent=0, prog=0 for S1xD1): -# S1xD1: 0 errors / 5 = 0% ← BEST ✓ -# S2xD2: 5 errors / 5 = 100% -# S1xD2: 5 errors / 5 = 100% -# S2xD1: 5 errors / 5 = 100% -# -# child2 is a perfect child of S2xD2 (all 2s). -# ───────────────────────────────────────────────────────────────────────────── + +# ------------------------------------------------------------------------------ +# Base toy data +# S1 / D1: all 0 → child1 (all 0) is a perfect Mendelian child of S1 x D1 +# S2 / D2: all 2 → child2 (all 2) is a perfect Mendelian child of S2 x D2 +# ------------------------------------------------------------------------------ base_genos <- data.table::data.table( - ID = c("S1", "S2", "D1", "D2", "child1", "child2"), + ID = c("S1","S2","D1","D2","child1","child2"), M1 = c(0L, 2L, 0L, 2L, 0L, 2L), M2 = c(0L, 2L, 0L, 2L, 0L, 2L), M3 = c(0L, 2L, 0L, 2L, 0L, 2L), @@ -60,13 +31,25 @@ base_genos <- data.table::data.table( M5 = c(0L, 2L, 0L, 2L, 0L, 2L) ) base_parents <- data.table::data.table(ID = c("S1","S2","D1","D2"), - Sex = c("M", "M", "F", "F")) -base_progeny <- data.table::data.table(ID = c("child1", "child2")) + Sex = c("M","M","F","F")) child1_progeny <- data.table::data.table(ID = "child1") child2_progeny <- data.table::data.table(ID = "child2") -# ══════════════════════════════════════════════ +base_progeny <- data.table::data.table(ID = c("child1","child2")) + +# All-zero genotypes — every pair ties at 0% error +tied_genos <- data.table::data.table( + ID = c("S1","S2","D1","D2","child_tie"), + M1 = c(0L, 0L, 0L, 0L, 0L), + M2 = c(0L, 0L, 0L, 0L, 0L) +) +tied_parents <- data.table::data.table(ID = c("S1","S2","D1","D2"), + Sex = c("M","M","F","F")) +tied_progeny <- data.table::data.table(ID = "child_tie") + +# ============================================================================== # 1. Input validation -# ══════════════════════════════════════════════ +# ============================================================================== + test_that("invalid method throws an error", { f <- make_files(base_genos, base_parents, child1_progeny) expect_error( @@ -75,6 +58,30 @@ test_that("invalid method throws an error", { regexp = "Method must be one of" ) }) + +test_that("min_markers < 1 throws an error", { + f <- make_files(base_genos, base_parents, child1_progeny) + expect_error( + find_parentage(f$g, f$p, f$pr, min_markers = 0, + verbose = FALSE, write_txt = FALSE), + regexp = "min_markers" + ) +}) + +test_that("error_threshold out of range throws an error", { + f <- make_files(base_genos, base_parents, child1_progeny) + expect_error( + find_parentage(f$g, f$p, f$pr, error_threshold = 150, + verbose = FALSE, write_txt = FALSE), + regexp = "error_threshold" + ) + expect_error( + find_parentage(f$g, f$p, f$pr, error_threshold = -1, + verbose = FALSE, write_txt = FALSE), + regexp = "error_threshold" + ) +}) + test_that("missing genotype file throws an error", { f <- make_files(base_genos, base_parents, child1_progeny) expect_error( @@ -82,8 +89,10 @@ test_that("missing genotype file throws an error", { verbose = FALSE, write_txt = FALSE) ) }) + test_that("parent IDs absent from genotype file raise a warning and are dropped", { - extra_parents <- rbind(base_parents, data.table::data.table(ID = "GHOST", Sex = "M")) + extra_parents <- rbind(base_parents, + data.table::data.table(ID = "GHOST", Sex = "M")) f <- make_files(base_genos, extra_parents, child1_progeny) expect_warning( find_parentage(f$g, f$p, f$pr, method = "best_pair", @@ -91,8 +100,10 @@ test_that("parent IDs absent from genotype file raise a warning and are dropped" regexp = "GHOST" ) }) + test_that("progeny IDs absent from genotype file raise a warning and are dropped", { - extra_progeny <- rbind(child1_progeny, data.table::data.table(ID = "GHOST_KID")) + extra_progeny <- rbind(child1_progeny, + data.table::data.table(ID = "GHOST_KID")) f <- make_files(base_genos, base_parents, extra_progeny) expect_warning( find_parentage(f$g, f$p, f$pr, method = "best_pair", @@ -100,6 +111,7 @@ test_that("progeny IDs absent from genotype file raise a warning and are dropped regexp = "GHOST_KID" ) }) + test_that("no valid progeny candidates after filtering stops with an error", { ghost_progeny <- data.table::data.table(ID = "NOBODY") f <- make_files(base_genos, base_parents, ghost_progeny) @@ -111,8 +123,9 @@ test_that("no valid progeny candidates after filtering stops with an error", { ) ) }) + test_that("missing Sex column raises a warning and defaults to ambiguous", { - parents_no_sex <- data.table::data.table(ID = c("S1", "D1")) + parents_no_sex <- data.table::data.table(ID = c("S1","D1")) f <- make_files(base_genos, parents_no_sex, child1_progeny) expect_warning( find_parentage(f$g, f$p, f$pr, method = "best_match", @@ -120,139 +133,197 @@ test_that("missing Sex column raises a warning and defaults to ambiguous", { regexp = "Sex" ) }) -# ══════════════════════════════════════════════ + +# ============================================================================== # 2. Return structure -# ══════════════════════════════════════════════ +# ============================================================================== + test_that("best_pair returns a data.table with expected columns (no ties)", { - f <- make_files(base_genos, base_parents, child1_progeny) + f <- make_files(base_genos, base_parents, child1_progeny) res <- find_parentage(f$g, f$p, f$pr, method = "best_pair", show_ties = FALSE, verbose = FALSE, write_txt = FALSE) expect_s3_class(res, "data.table") - expect_true(all(c("Progeny", "Male_Parent", "Female_Parent", - "Mendelian_Error_Pct", "Markers_Tested") %in% names(res))) + expect_true(all(c("Progeny","Male_Parent","Female_Parent", + "Mendelian_Error_Pct","Markers_Tested", + "Assignment_Status") %in% names(res))) expect_equal(nrow(res), 1L) }) -test_that("best_male_parent returns a data.table with expected columns", { - f <- make_files(base_genos, base_parents, child1_progeny) + +test_that("best_male_parent returns expected columns", { + f <- make_files(base_genos, base_parents, child1_progeny) res <- find_parentage(f$g, f$p, f$pr, method = "best_male_parent", verbose = FALSE, write_txt = FALSE) expect_s3_class(res, "data.table") - expect_true(all(c("Progeny", "Best_Match", - "Mendelian_Error_Pct", "Markers_Tested") %in% names(res))) + expect_true(all(c("Progeny","Best_Match","Mendelian_Error_Pct", + "Markers_Tested","Assignment_Status") %in% names(res))) expect_equal(nrow(res), 1L) }) -test_that("best_female_parent returns a data.table with expected columns", { - f <- make_files(base_genos, base_parents, child1_progeny) + +test_that("best_female_parent returns expected columns", { + f <- make_files(base_genos, base_parents, child1_progeny) res <- find_parentage(f$g, f$p, f$pr, method = "best_female_parent", verbose = FALSE, write_txt = FALSE) expect_s3_class(res, "data.table") - expect_true(all(c("Progeny", "Best_Match", - "Mendelian_Error_Pct", "Markers_Tested") %in% names(res))) + expect_true(all(c("Progeny","Best_Match","Mendelian_Error_Pct", + "Markers_Tested","Assignment_Status") %in% names(res))) expect_equal(nrow(res), 1L) }) -test_that("best_match returns a data.table with expected columns", { - f <- make_files(base_genos, base_parents, child1_progeny) + +test_that("best_match returns expected columns", { + f <- make_files(base_genos, base_parents, child1_progeny) res <- find_parentage(f$g, f$p, f$pr, method = "best_match", verbose = FALSE, write_txt = FALSE) expect_s3_class(res, "data.table") - expect_true(all(c("Progeny", "Best_Match", - "Mendelian_Error_Pct", "Markers_Tested") %in% names(res))) + expect_true(all(c("Progeny","Best_Match","Mendelian_Error_Pct", + "Markers_Tested","Assignment_Status") %in% names(res))) expect_equal(nrow(res), 1L) }) -test_that("one row is returned per progeny for single-parent methods", { + +test_that("one row returned per progeny for single-parent methods", { f <- make_files(base_genos, base_parents, child1_progeny) - for (m in c("best_male_parent", "best_female_parent", "best_match")) { + for (m in c("best_male_parent","best_female_parent","best_match")) { res <- find_parentage(f$g, f$p, f$pr, method = m, verbose = FALSE, write_txt = FALSE) - expect_equal(nrow(res), 1L, label = base::paste("row count for method", m)) + expect_equal(nrow(res), 1L, label = paste("row count for method", m)) } }) -# ══════════════════════════════════════════════ + +test_that("Markers_Tested equals the number of non-NA marker columns", { + f <- make_files(base_genos, base_parents, child1_progeny) + res <- find_parentage(f$g, f$p, f$pr, method = "best_pair", + show_ties = FALSE, verbose = FALSE, write_txt = FALSE) + expect_equal(res$Markers_Tested, ncol(base_genos) - 1L) +}) + +test_that("Mendelian_Error_Pct is between 0 and 100", { + f <- make_files(base_genos, base_parents, child1_progeny) + res <- find_parentage(f$g, f$p, f$pr, method = "best_pair", + show_ties = FALSE, verbose = FALSE, write_txt = FALSE) + pct <- as.numeric(res$Mendelian_Error_Pct) + expect_true(all(pct >= 0 & pct <= 100, na.rm = TRUE)) +}) + +# ============================================================================== # 3. Biological correctness -# ══════════════════════════════════════════════ -test_that("best_pair correctly identifies S1 x D1 as best pair with 0% error for child1", { - f <- make_files(base_genos, base_parents, child1_progeny) +# ============================================================================== + +test_that("best_pair correctly identifies S1 x D1 for child1 with 0% error", { + f <- make_files(base_genos, base_parents, child1_progeny) res <- find_parentage(f$g, f$p, f$pr, method = "best_pair", show_ties = FALSE, verbose = FALSE, write_txt = FALSE) expect_equal(res$Male_Parent, "S1") expect_equal(res$Female_Parent, "D1") expect_equal(as.numeric(res$Mendelian_Error_Pct), 0) }) -test_that("best_pair correctly identifies S2 x D2 as best pair with 0% error for child2", { - f <- make_files(base_genos, base_parents, child2_progeny) + +test_that("best_pair correctly identifies S2 x D2 for child2 with 0% error", { + f <- make_files(base_genos, base_parents, child2_progeny) res <- find_parentage(f$g, f$p, f$pr, method = "best_pair", show_ties = FALSE, verbose = FALSE, write_txt = FALSE) expect_equal(res$Male_Parent, "S2") expect_equal(res$Female_Parent, "D2") expect_equal(as.numeric(res$Mendelian_Error_Pct), 0) }) -test_that("best_male_parent identifies S1 as best male parent for child1", { - f <- make_files(base_genos, base_parents, child1_progeny) + +test_that("best_male_parent identifies S1 as best male for child1", { + f <- make_files(base_genos, base_parents, child1_progeny) res <- find_parentage(f$g, f$p, f$pr, method = "best_male_parent", verbose = FALSE, write_txt = FALSE) expect_equal(res$Best_Match, "S1") }) -test_that("best_female_parent identifies D1 as best female parent for child1", { - f <- make_files(base_genos, base_parents, child1_progeny) + +test_that("best_female_parent identifies D1 as best female for child1", { + f <- make_files(base_genos, base_parents, child1_progeny) res <- find_parentage(f$g, f$p, f$pr, method = "best_female_parent", verbose = FALSE, write_txt = FALSE) expect_equal(res$Best_Match, "D1") }) + test_that("Mendelian_Error_Pct is 0 for a perfect parent-progeny trio", { - f <- make_files(base_genos, base_parents, child1_progeny) + f <- make_files(base_genos, base_parents, child1_progeny) res <- find_parentage(f$g, f$p, f$pr, method = "best_pair", show_ties = FALSE, verbose = FALSE, write_txt = FALSE) expect_equal(as.numeric(res$Mendelian_Error_Pct), 0) }) -test_that("Mendelian_Error_Pct is between 0 and 100", { - f <- make_files(base_genos, base_parents, child1_progeny) + +# ============================================================================== +# 4. Assignment_Status — min_markers and error_threshold +# ============================================================================== + +test_that("Assignment_Status = PASS for perfect trio within thresholds", { + f <- make_files(base_genos, base_parents, child1_progeny) res <- find_parentage(f$g, f$p, f$pr, method = "best_pair", + min_markers = 3, error_threshold = 5.0, show_ties = FALSE, verbose = FALSE, write_txt = FALSE) - pct <- as.numeric(res$Mendelian_Error_Pct) - expect_true(all(pct >= 0 & pct <= 100, na.rm = TRUE)) + expect_equal(res$Assignment_Status, "PASS") }) -test_that("Markers_Tested equals the number of non-NA markers", { - f <- make_files(base_genos, base_parents, child1_progeny) + +test_that("Assignment_Status = LOW_MARKERS when min_markers exceeds available markers", { + f <- make_files(base_genos, base_parents, child1_progeny) res <- find_parentage(f$g, f$p, f$pr, method = "best_pair", + min_markers = 99999, error_threshold = 5.0, show_ties = FALSE, verbose = FALSE, write_txt = FALSE) - expect_equal(res$Markers_Tested, ncol(base_genos) - 1L) # minus ID column + expect_equal(res$Assignment_Status, "LOW_MARKERS") }) -# ══════════════════════════════════════════════ -# 4. allow_selfing -# ══════════════════════════════════════════════ + +test_that("Assignment_Status = HIGH_ERROR when error rate exceeds threshold", { + # Use wrong parents so error rate is high + high_error_genos <- data.table::data.table( + ID = c("S1","D1","bad_child"), + M1 = c(0L, 0L, 2L), + M2 = c(0L, 0L, 2L), + M3 = c(0L, 0L, 2L), + M4 = c(0L, 0L, 2L), + M5 = c(0L, 0L, 2L) + ) + parents <- data.table::data.table(ID = c("S1","D1"), Sex = c("M","F")) + progeny <- data.table::data.table(ID = "bad_child") + f <- make_files(high_error_genos, parents, progeny) + res <- find_parentage(f$g, f$p, f$pr, method = "best_pair", + min_markers = 3, error_threshold = 5.0, + show_ties = FALSE, verbose = FALSE, write_txt = FALSE) + expect_equal(res$Assignment_Status, "HIGH_ERROR") +}) + +test_that("Assignment_Status column is present in all methods", { + f <- make_files(base_genos, base_parents, child1_progeny) + for (m in c("best_pair","best_male_parent","best_female_parent","best_match")) { + res <- find_parentage(f$g, f$p, f$pr, method = m, + show_ties = FALSE, verbose = FALSE, write_txt = FALSE) + expect_true("Assignment_Status" %in% names(res), + label = paste("Assignment_Status present for method", m)) + } +}) + +# ============================================================================== +# 5. allow_selfing +# ============================================================================== + test_that("allow_selfing = FALSE removes self-pairs from candidates", { - ambig_parents <- data.table::data.table(ID = c("S1", "D1"), Sex = c("A", "A")) + ambig_parents <- data.table::data.table(ID = c("S1","D1"), Sex = c("A","A")) f <- make_files(base_genos, ambig_parents, child1_progeny) - # With only 2 ambiguous parents, S1xD1 and D1xS1 are tied → warning expected expect_warning( res <- find_parentage(f$g, f$p, f$pr, method = "best_pair", allow_selfing = FALSE, show_ties = FALSE, verbose = FALSE, write_txt = FALSE), regexp = "tied" ) - if (!is.na(res$Male_Parent) && !is.na(res$Female_Parent)) { + if (!is.na(res$Male_Parent) && !is.na(res$Female_Parent)) expect_false(res$Male_Parent == res$Female_Parent) - } }) -# ══════════════════════════════════════════════ -# 5. show_ties -# ══════════════════════════════════════════════ -# All markers 0 → every male_parent x female_parent pair scores 0% error → guaranteed ties -tied_genos <- data.table::data.table( - ID = c("S1", "S2", "D1", "D2", "child_tie"), - M1 = c(0L, 0L, 0L, 0L, 0L), - M2 = c(0L, 0L, 0L, 0L, 0L) -) -tied_parents <- data.table::data.table(ID = c("S1","S2","D1","D2"), - Sex = c("M", "M", "F", "F")) -tied_progeny <- data.table::data.table(ID = "child_tie") + +# ============================================================================== +# 6. show_ties +# ============================================================================== + test_that("show_ties = TRUE produces _1/_2 suffixed columns when ties exist", { - f <- make_files(tied_genos, tied_parents, tied_progeny) + f <- make_files(tied_genos, tied_parents, tied_progeny) res <- find_parentage(f$g, f$p, f$pr, method = "best_pair", show_ties = TRUE, verbose = FALSE, write_txt = FALSE) expect_true(any(grepl("^Male_Parent_", names(res)))) expect_true(any(grepl("^Female_Parent_", names(res)))) }) + test_that("show_ties = FALSE warns about ties and returns single-result columns", { f <- make_files(tied_genos, tied_parents, tied_progeny) expect_warning( @@ -265,21 +336,25 @@ test_that("show_ties = FALSE warns about ties and returns single-result columns" expect_false(any(grepl("^Male_Parent_\\d", names(res)))) expect_false(any(grepl("^Female_Parent_\\d", names(res)))) }) -# ══════════════════════════════════════════════ -# 6. verbose / write_txt -# ══════════════════════════════════════════════ -test_that("verbose = TRUE returns the result invisibly", { - f <- make_files(base_genos, base_parents, child1_progeny) + +# ============================================================================== +# 7. verbose / write_txt +# ============================================================================== + +test_that("verbose = TRUE returns the result invisibly as data.table", { + f <- make_files(base_genos, base_parents, child1_progeny) res <- find_parentage(f$g, f$p, f$pr, method = "best_pair", verbose = TRUE, write_txt = FALSE) expect_s3_class(res, "data.table") }) -test_that("verbose = FALSE returns the result visibly", { - f <- make_files(base_genos, base_parents, child1_progeny) + +test_that("verbose = FALSE returns the result as data.table", { + f <- make_files(base_genos, base_parents, child1_progeny) res <- find_parentage(f$g, f$p, f$pr, method = "best_pair", verbose = FALSE, write_txt = FALSE) expect_s3_class(res, "data.table") }) + test_that("write_txt = TRUE creates the output file", { old_wd <- getwd() tmp <- tempdir() @@ -290,9 +365,10 @@ test_that("write_txt = TRUE creates the output file", { verbose = FALSE, write_txt = TRUE) expect_true(file.exists(file.path(tmp, "parentage_results_dt.txt"))) }) + test_that("write_txt = FALSE does not create the output file", { - old_wd <- getwd() - tmp <- tempdir() + old_wd <- getwd() + tmp <- tempdir() setwd(tmp) on.exit(setwd(old_wd), add = TRUE) out_file <- file.path(tmp, "parentage_results_dt.txt") @@ -302,32 +378,38 @@ test_that("write_txt = FALSE does not create the output file", { verbose = FALSE, write_txt = FALSE) expect_false(file.exists(out_file)) }) -# ══════════════════════════════════════════════ -# 7. Sex-based candidate filtering -# ══════════════════════════════════════════════ -test_that("best_male_parent only assigns male (M) or ambiguous (A) parents", { - f <- make_files(base_genos, base_parents, child1_progeny) + +# ============================================================================== +# 8. Sex-based candidate filtering +# ============================================================================== + +test_that("best_male_parent only assigns M or A parents", { + f <- make_files(base_genos, base_parents, child1_progeny) res <- find_parentage(f$g, f$p, f$pr, method = "best_male_parent", verbose = FALSE, write_txt = FALSE) - valid_male_parents <- base_parents[Sex %in% c("M", "A")]$ID + valid_male_parents <- base_parents[Sex %in% c("M","A")]$ID expect_true(res$Best_Match %in% valid_male_parents) }) -test_that("best_female_parent only assigns female (F) or ambiguous (A) parents", { - f <- make_files(base_genos, base_parents, child1_progeny) + +test_that("best_female_parent only assigns F or A parents", { + f <- make_files(base_genos, base_parents, child1_progeny) res <- find_parentage(f$g, f$p, f$pr, method = "best_female_parent", verbose = FALSE, write_txt = FALSE) - valid_female_parents <- base_parents[Sex %in% c("F", "A")]$ID + valid_female_parents <- base_parents[Sex %in% c("F","A")]$ID expect_true(res$Best_Match %in% valid_female_parents) }) -# ══════════════════════════════════════════════ -# 8. Edge cases -# ══════════════════════════════════════════════ + +# ============================================================================== +# 9. Edge cases +# ============================================================================== + test_that("single progeny individual is handled correctly", { - f <- make_files(base_genos, base_parents, child1_progeny) + f <- make_files(base_genos, base_parents, child1_progeny) res <- find_parentage(f$g, f$p, f$pr, method = "best_pair", show_ties = FALSE, verbose = FALSE, write_txt = FALSE) expect_equal(nrow(res), 1L) }) + test_that("all-NA marker column does not cause an error", { na_genos <- data.table::copy(base_genos) na_genos[, M1 := NA_integer_] @@ -337,9 +419,17 @@ test_that("all-NA marker column does not cause an error", { verbose = FALSE, write_txt = FALSE) ) }) + test_that("Progeny column contains the correct progeny IDs", { - f <- make_files(base_genos, base_parents, child1_progeny) + f <- make_files(base_genos, base_parents, child1_progeny) res <- find_parentage(f$g, f$p, f$pr, method = "best_pair", show_ties = FALSE, verbose = FALSE, write_txt = FALSE) expect_setequal(res$Progeny, child1_progeny$ID) }) + +test_that("multiple progeny are all represented in output", { + f <- make_files(base_genos, base_parents, base_progeny) + res <- find_parentage(f$g, f$p, f$pr, method = "best_pair", + show_ties = FALSE, verbose = FALSE, write_txt = FALSE) + expect_setequal(res$Progeny, base_progeny$ID) +}) diff --git a/tests/testthat/test-validate_pedigree.R b/tests/testthat/test-validate_pedigree.R index 902501f..2dba74b 100644 --- a/tests/testthat/test-validate_pedigree.R +++ b/tests/testthat/test-validate_pedigree.R @@ -1,28 +1,35 @@ -#### Tests for validate_pedigree() #### -# Run with: testthat::test_file("test-validate_pedigree.R") +# tests/testthat/test-validate_pedigree.R +# Run with: testthat::test_file("tests/testthat/test-validate_pedigree.R") library(testthat) library(data.table) -#### Helpers #### +# ============================================================================== +# Helpers +# ============================================================================== + make_genos <- function() { + # IND_A: all 0, IND_B: all 2, IND_C: all 1 (het), IND_D: all 0, IND_E: all 0 n_markers <- 20 marker_names <- paste0("M", seq_len(n_markers)) - pa <- rep(0L, n_markers) - pb <- rep(2L, n_markers) - pc <- rep(1L, n_markers) - pd <- rep(0L, n_markers) - pe <- rep(0L, n_markers) dt <- data.table( ID = c("IND_A", "IND_B", "IND_C", "IND_D", "IND_E"), - rbind(pa, pb, pc, pd, pe) + rbind( + rep(0L, n_markers), # IND_A — all ref homozygous + rep(2L, n_markers), # IND_B — all alt homozygous + rep(1L, n_markers), # IND_C — all het (valid child of IND_A x IND_B) + rep(0L, n_markers), # IND_D — all ref (impossible child of IND_B x IND_A) + rep(0L, n_markers) # IND_E — all ref + ) ) setnames(dt, c("ID", marker_names)) dt } make_pedigree <- function() { + # IND_C: perfect Mendelian child of IND_A x IND_B -> PASS + # IND_D: declared parents swapped -> FAIL data.table( - ID = c("IND_C", "IND_D"), # <-- changed from Progeny + ID = c("IND_C", "IND_D"), Male_Parent = c("IND_A", "IND_B"), Female_Parent = c("IND_B", "IND_A") ) @@ -36,12 +43,96 @@ write_temp_files <- function(genos = make_genos(), ped = make_pedigree()) { list(ped = ped_file, genos = genos_file) } -#### Tests #### +# ============================================================================== +# 1. Input validation +# ============================================================================== + +test_that("trio_error_threshold out of range raises an error", { + f <- write_temp_files() + expect_error(validate_pedigree(f$ped, f$genos, + trio_error_threshold = 150, + verbose = FALSE, write_txt = FALSE)) + expect_error(validate_pedigree(f$ped, f$genos, + trio_error_threshold = -1, + verbose = FALSE, write_txt = FALSE)) +}) + +test_that("single_parent_error_threshold out of range raises an error", { + f <- write_temp_files() + expect_error(validate_pedigree(f$ped, f$genos, + single_parent_error_threshold = 101, + verbose = FALSE, write_txt = FALSE)) + expect_error(validate_pedigree(f$ped, f$genos, + single_parent_error_threshold = -5, + verbose = FALSE, write_txt = FALSE)) +}) + +test_that("missing required pedigree column raises an error", { + bad_ped <- data.table(ID = "IND_C", Parent1 = "IND_A", Female_Parent = "IND_B") + f <- write_temp_files(ped = bad_ped) + expect_error( + validate_pedigree(f$ped, f$genos, verbose = FALSE, write_txt = FALSE), + regexp = "missing required columns" + ) +}) + +test_that("missing ID column in genotypes raises an error", { + bad_genos <- copy(make_genos()) + setnames(bad_genos, "ID", "SampleID") + f <- write_temp_files(genos = bad_genos) + expect_error( + validate_pedigree(f$ped, f$genos, verbose = FALSE, write_txt = FALSE), + regexp = "ID" + ) +}) + +test_that("all trios with no genotype data stop with an error", { + ped <- data.table(ID = "GHOST", Male_Parent = "IND_A", Female_Parent = "IND_B") + f <- write_temp_files(ped = ped) + expect_error( + validate_pedigree(f$ped, f$genos, verbose = FALSE, write_txt = FALSE), + regexp = "No valid trios remain" + ) +}) + +# ============================================================================== +# 2. Return structure +# ============================================================================== + +test_that("returns a data.table", { + f <- write_temp_files() + res <- validate_pedigree(f$ped, f$genos, verbose = FALSE, write_txt = FALSE) + expect_s3_class(res, "data.table") +}) + +test_that("result has one row per pedigree entry", { + f <- write_temp_files() + res <- validate_pedigree(f$ped, f$genos, verbose = FALSE, write_txt = FALSE) + expect_equal(nrow(res), 2L) +}) + +test_that("result has all expected columns", { + f <- write_temp_files() + res <- validate_pedigree(f$ped, f$genos, verbose = FALSE, write_txt = FALSE) + expected_cols <- c( + "ID", "Male_Parent", "Female_Parent", + "Mendelian_Error_Pct", "Markers_Tested", "Status", + "Correction_Decision", + "Male_Parent_Hom_Error_Pct", "Female_Parent_Hom_Error_Pct", + "Best_Male_Parent", "Best_Male_Parent_Error_Pct", + "Best_Female_Parent", "Best_Female_Parent_Error_Pct" + ) + expect_true(all(expected_cols %in% names(res))) +}) + +# ============================================================================== +# 3. PASS / FAIL / LOW_MARKERS / NO_DATA statuses +# ============================================================================== test_that("PASS trio is correctly identified", { f <- write_temp_files() res <- validate_pedigree(f$ped, f$genos, verbose = FALSE, write_txt = FALSE) - pass_row <- res[ID == "IND_C"] # <-- changed + pass_row <- res[ID == "IND_C"] expect_equal(nrow(pass_row), 1L) expect_equal(pass_row$Status, "PASS") expect_equal(pass_row$Mendelian_Error_Pct, 0) @@ -51,189 +142,330 @@ test_that("PASS trio is correctly identified", { test_that("FAIL trio is correctly identified", { f <- write_temp_files() res <- validate_pedigree(f$ped, f$genos, verbose = FALSE, write_txt = FALSE) - fail_row <- res[ID == "IND_D"] # <-- changed + fail_row <- res[ID == "IND_D"] expect_equal(nrow(fail_row), 1L) expect_equal(fail_row$Status, "FAIL") expect_gt(fail_row$Mendelian_Error_Pct, 5.0) }) -test_that("FAIL trio has correct correction decision (REMOVE_MALE_PARENT)", { +test_that("FAIL trio has REMOVE_MALE_PARENT decision with best match populated", { f <- write_temp_files() res <- validate_pedigree(f$ped, f$genos, verbose = FALSE, write_txt = FALSE) - fail_row <- res[ID == "IND_D"] # <-- changed + fail_row <- res[ID == "IND_D"] expect_equal(fail_row$Correction_Decision, "REMOVE_MALE_PARENT") expect_false(is.na(fail_row$Best_Male_Parent)) expect_true(is.na(fail_row$Best_Female_Parent)) }) -test_that("Mendelian_Error_Pct is 0 for perfect trio", { +test_that("Mendelian_Error_Pct is 0 for a perfect trio", { f <- write_temp_files() res <- validate_pedigree(f$ped, f$genos, verbose = FALSE, write_txt = FALSE) - expect_equal(res[ID == "IND_C"]$Mendelian_Error_Pct, 0) # <-- changed + expect_equal(res[ID == "IND_C"]$Mendelian_Error_Pct, 0) }) test_that("Markers_Tested equals number of markers for complete data", { f <- write_temp_files() res <- validate_pedigree(f$ped, f$genos, verbose = FALSE, write_txt = FALSE) - expect_equal(res[ID == "IND_C"]$Markers_Tested, 20L) # <-- changed + expect_equal(res[ID == "IND_C"]$Markers_Tested, 20L) }) -test_that("Returns a data.table invisibly", { +test_that("LOW_MARKERS status assigned when markers_tested < min_markers", { f <- write_temp_files() - res <- validate_pedigree(f$ped, f$genos, verbose = FALSE, write_txt = FALSE) - expect_s3_class(res, "data.table") - expect_equal(nrow(res), 2L) + res <- validate_pedigree(f$ped, f$genos, verbose = FALSE, + write_txt = FALSE, min_markers = 25L) + expect_true(all(res$Status == "LOW_MARKERS")) + expect_true(all(res$Correction_Decision == "NONE")) }) -test_that("Result has all expected columns", { - f <- write_temp_files() +test_that("NA markers reduce Markers_Tested and do not cause errors", { + genos <- make_genos() + genos[ID == "IND_C", M1 := NA_integer_] + genos[ID == "IND_C", M2 := NA_integer_] + f <- write_temp_files(genos = genos) res <- validate_pedigree(f$ped, f$genos, verbose = FALSE, write_txt = FALSE) - expected_cols <- c( - "ID", "Male_Parent", "Female_Parent", # <-- changed - "Mendelian_Error_Pct", "Markers_Tested", "Status", - "Correction_Decision", - "Male_Parent_Hom_Error_Pct", "Female_Parent_Hom_Error_Pct", - "Best_Male_Parent", "Best_Male_Parent_Error_Pct", - "Best_Female_Parent", "Best_Female_Parent_Error_Pct" + expect_equal(res[ID == "IND_C"]$Markers_Tested, 18L) + expect_equal(res[ID == "IND_C"]$Status, "PASS") +}) + +# ============================================================================== +# 4. Missing parent statuses (MISSING_MALE_PARENT / MISSING_FEMALE_PARENT / +# MISSING_BOTH_PARENTS) +# Note: each test includes make_pedigree() rows so has_geno is never empty, +# and filters res by the specific ID under test [3][4] +# ============================================================================== + +test_that("MISSING_MALE_PARENT status and recommendation are correct", { + ped <- rbind( + make_pedigree(), + data.table(ID = "IND_E", Male_Parent = "0", Female_Parent = "IND_B") ) - expect_true(all(expected_cols %in% names(res))) + f <- write_temp_files(ped = ped) + res <- validate_pedigree(f$ped, f$genos, verbose = FALSE, write_txt = FALSE) + r <- res[ID == "IND_E"] + expect_equal(r$Status, "MISSING_MALE_PARENT") + expect_equal(r$Correction_Decision, "NONE") + expect_false(is.na(r$Best_Male_Parent)) + expect_true(is.na(r$Best_Female_Parent)) }) -test_that("write_txt writes output file with correct name", { - f <- write_temp_files() - out_file <- tempfile(fileext = ".txt") - validate_pedigree(f$ped, f$genos, verbose = FALSE, - write_txt = TRUE, output_filename = out_file) - expect_true(file.exists(out_file)) - written <- fread(out_file) - expect_equal(nrow(written), 2L) +test_that("MISSING_FEMALE_PARENT status and recommendation are correct", { + ped <- rbind( + make_pedigree(), + data.table(ID = "IND_E", Male_Parent = "IND_A", Female_Parent = "0") + ) + f <- write_temp_files(ped = ped) + res <- validate_pedigree(f$ped, f$genos, verbose = FALSE, write_txt = FALSE) + r <- res[ID == "IND_E"] + expect_equal(r$Status, "MISSING_FEMALE_PARENT") + expect_equal(r$Correction_Decision, "NONE") + expect_true(is.na(r$Best_Male_Parent)) + expect_false(is.na(r$Best_Female_Parent)) }) -test_that("write_txt = FALSE does not create default output file", { - f <- write_temp_files() - out_file <- tempfile(fileext = ".txt") - validate_pedigree(f$ped, f$genos, verbose = FALSE, - write_txt = FALSE, output_filename = out_file) - expect_false(file.exists(out_file)) +test_that("MISSING_BOTH_PARENTS status and recommendations are correct", { + ped <- rbind( + make_pedigree(), + data.table(ID = "IND_E", Male_Parent = "0", Female_Parent = "0") + ) + f <- write_temp_files(ped = ped) + res <- validate_pedigree(f$ped, f$genos, verbose = FALSE, write_txt = FALSE) + r <- res[ID == "IND_E"] + expect_equal(r$Status, "MISSING_BOTH_PARENTS") + expect_equal(r$Correction_Decision, "NONE") + expect_false(is.na(r$Best_Male_Parent)) + expect_false(is.na(r$Best_Female_Parent)) }) -test_that("corrected_pedigree.txt is always written with zeros for bad parents", { - f <- write_temp_files() +test_that("MISSING_* rows preserve 0s in corrected pedigree", { + ped <- rbind( + make_pedigree(), + data.table(ID = "IND_E", Male_Parent = "0", Female_Parent = "IND_B") + ) + f <- write_temp_files(ped = ped) tmp_dir <- tempfile() dir.create(tmp_dir) old_wd <- getwd() setwd(tmp_dir) on.exit({ setwd(old_wd); unlink(tmp_dir, recursive = TRUE) }, add = TRUE) - validate_pedigree(f$ped, f$genos, verbose = FALSE, write_txt = FALSE) + corr <- fread(file.path(tmp_dir, "corrected_pedigree.txt"), + colClasses = "character") + expect_equal(corr[ID == "IND_E"]$Male_Parent, "0") +}) - corr <- fread(file.path(tmp_dir, "corrected_pedigree.txt")) - expect_equal(corr[ID == "IND_D"]$Male_Parent, "0") # <-- changed - expect_equal(corr[ID == "IND_D"]$Female_Parent, "IND_A") # <-- changed - expect_equal(corr[ID == "IND_C"]$Male_Parent, "IND_A") # <-- changed - expect_equal(corr[ID == "IND_C"]$Female_Parent, "IND_B") # <-- changed +test_that("Best_Male_Parent for MISSING_MALE_PARENT is excluded from being the known female", { + ped <- rbind( + make_pedigree(), + data.table(ID = "IND_E", Male_Parent = "0", Female_Parent = "IND_B") + ) + f <- write_temp_files(ped = ped) + res <- validate_pedigree(f$ped, f$genos, verbose = FALSE, write_txt = FALSE) + r <- res[ID == "IND_E"] + # The known female parent should not be suggested as the best male parent + expect_false(r$Best_Male_Parent == "IND_B") +}) + +# ============================================================================== +# 5. FOUNDERS status +# ============================================================================== + +test_that("FOUNDERS status is assigned when ID in founders list with 0 0 parents", { + ped <- rbind( + make_pedigree(), + data.table(ID = "IND_A", Male_Parent = "0", Female_Parent = "0") + ) + founders_file <- tempfile(fileext = ".txt") + fwrite(data.table(ID = "IND_A"), founders_file, + sep = "\t", quote = FALSE, col.names = FALSE) + f <- write_temp_files(ped = ped) + res <- validate_pedigree(f$ped, f$genos, + founders_file = founders_file, + verbose = FALSE, + write_txt = FALSE) + founder_row <- res[ID == "IND_A"] + expect_equal(founder_row$Status, "FOUNDERS") + expect_equal(founder_row$Correction_Decision, "NONE") + expect_true(is.na(founder_row$Best_Male_Parent)) + expect_true(is.na(founder_row$Best_Female_Parent)) +}) + +test_that("non-founder rows are still evaluated normally when founders file is supplied", { + ped <- rbind( + make_pedigree(), + data.table(ID = "IND_A", Male_Parent = "0", Female_Parent = "0") + ) + founders_file <- tempfile(fileext = ".txt") + fwrite(data.table(ID = "IND_A"), founders_file, + sep = "\t", quote = FALSE, col.names = FALSE) + f <- write_temp_files(ped = ped) + res <- validate_pedigree(f$ped, f$genos, + founders_file = founders_file, + verbose = FALSE, + write_txt = FALSE) + # IND_C has real parents — should still get PASS + expect_equal(res[ID == "IND_C"]$Status, "PASS") +}) + +test_that("0 0 parents NOT in founders list get MISSING_BOTH_PARENTS", { + ped <- rbind( + make_pedigree(), + data.table(ID = "IND_E", Male_Parent = "0", Female_Parent = "0") + ) + f <- write_temp_files(ped = ped) + res <- validate_pedigree(f$ped, f$genos, verbose = FALSE, write_txt = FALSE) + expect_equal(res[ID == "IND_E"]$Status, "MISSING_BOTH_PARENTS") +}) + +test_that("0 0 parents with no founders file gets MISSING_BOTH_PARENTS not FOUNDERS", { + ped <- rbind( + make_pedigree(), + data.table(ID = "IND_E", Male_Parent = "0", Female_Parent = "0") + ) + f <- write_temp_files(ped = ped) + res <- validate_pedigree(f$ped, f$genos, + founders_file = NULL, + verbose = FALSE, + write_txt = FALSE) + expect_equal(res[ID == "IND_E"]$Status, "MISSING_BOTH_PARENTS") +}) + +# ============================================================================== +# 6. NO_GENOTYPE_DATA status +# ============================================================================== + +test_that("NO_GENOTYPE_DATA is flagged for progeny absent from genotype file", { + ped <- rbind( + make_pedigree(), + data.table(ID = "GHOST", Male_Parent = "IND_A", Female_Parent = "IND_B") + ) + f <- write_temp_files(ped = ped) + res <- validate_pedigree(f$ped, f$genos, verbose = FALSE, write_txt = FALSE) + ghost_row <- res[ID == "GHOST"] + expect_equal(nrow(ghost_row), 1L) + expect_equal(ghost_row$Status, "NO_GENOTYPE_DATA") + expect_equal(ghost_row$Correction_Decision, "NONE") +}) + +test_that("NO_GENOTYPE_DATA rows have NA for all analysis columns", { + ped <- rbind( + make_pedigree(), + data.table(ID = "GHOST", Male_Parent = "IND_A", Female_Parent = "IND_B") + ) + f <- write_temp_files(ped = ped) + res <- validate_pedigree(f$ped, f$genos, verbose = FALSE, write_txt = FALSE) + ghost_row <- res[ID == "GHOST"] + expect_true(is.na(ghost_row$Mendelian_Error_Pct)) + expect_equal(ghost_row$Markers_Tested, 0L) + expect_true(is.na(ghost_row$Best_Male_Parent)) + expect_true(is.na(ghost_row$Best_Female_Parent)) +}) + +test_that("NO_GENOTYPE_DATA flagged when declared parent is absent from genotype file", { + ped <- rbind( + make_pedigree(), # ensures has_geno is not empty + data.table(ID = "IND_C_GHOST", Male_Parent = "GHOST_DAD", Female_Parent = "IND_B") + ) + f <- write_temp_files(ped = ped) + res <- validate_pedigree(f$ped, f$genos, verbose = FALSE, write_txt = FALSE) + expect_equal(res[ID == "IND_C_GHOST"]$Status, "NO_GENOTYPE_DATA") }) -test_that("fill_pedigree = TRUE writes filled_pedigree.txt with replacement IDs", { +test_that("valid trios still evaluated correctly when ghost rows are present", { + ped <- rbind( + make_pedigree(), + data.table(ID = "GHOST", Male_Parent = "IND_A", Female_Parent = "IND_B") + ) + f <- write_temp_files(ped = ped) + res <- validate_pedigree(f$ped, f$genos, verbose = FALSE, write_txt = FALSE) + expect_equal(res[ID == "IND_C"]$Status, "PASS") + expect_equal(res[ID == "IND_D"]$Status, "FAIL") +}) + +# ============================================================================== +# 7. Corrected pedigree output +# ============================================================================== + +test_that("corrected_pedigree.txt is written and PASS parents are unchanged", { f <- write_temp_files() tmp_dir <- tempfile() dir.create(tmp_dir) old_wd <- getwd() setwd(tmp_dir) on.exit({ setwd(old_wd); unlink(tmp_dir, recursive = TRUE) }, add = TRUE) - - validate_pedigree(f$ped, f$genos, verbose = FALSE, - write_txt = FALSE, fill_pedigree = TRUE) - - filled <- fread(file.path(tmp_dir, "filled_pedigree.txt")) - new_male_par <- filled[ID == "IND_D"]$Male_Parent # <-- changed - expect_false(new_male_par == "IND_B") - expect_false(new_male_par == "0") + validate_pedigree(f$ped, f$genos, verbose = FALSE, write_txt = FALSE) + expect_true(file.exists(file.path(tmp_dir, "corrected_pedigree.txt"))) + corr <- fread(file.path(tmp_dir, "corrected_pedigree.txt")) + # IND_C passes — parents must be unchanged + expect_equal(as.character(corr[ID == "IND_C"]$Male_Parent), "IND_A") + expect_equal(as.character(corr[ID == "IND_C"]$Female_Parent), "IND_B") }) -test_that("fill_pedigree = FALSE does not write filled_pedigree.txt", { +test_that("corrected_pedigree.txt sets bad parent to 0 for FAIL trio", { f <- write_temp_files() tmp_dir <- tempfile() dir.create(tmp_dir) old_wd <- getwd() setwd(tmp_dir) on.exit({ setwd(old_wd); unlink(tmp_dir, recursive = TRUE) }, add = TRUE) - - validate_pedigree(f$ped, f$genos, verbose = FALSE, - write_txt = FALSE, fill_pedigree = FALSE) - expect_false(file.exists(file.path(tmp_dir, "filled_pedigree.txt"))) + validate_pedigree(f$ped, f$genos, verbose = FALSE, write_txt = FALSE) + corr <- fread(file.path(tmp_dir, "corrected_pedigree.txt"), + colClasses = "character") + # IND_D fails with REMOVE_MALE_PARENT — male should become "0" + expect_equal(corr[ID == "IND_D"]$Male_Parent, "0") + expect_equal(corr[ID == "IND_D"]$Female_Parent, "IND_A") }) -test_that("Trios with missing genotype data are removed and error is thrown", { - ped <- data.table( - ID = "GHOST", # <-- changed - Male_Parent = "IND_A", - Female_Parent = "IND_B" +test_that("corrected_pedigree.txt preserves 0s for MISSING_* rows", { + ped <- rbind( + make_pedigree(), + data.table(ID = "IND_E", Male_Parent = "0", Female_Parent = "IND_B") ) - f <- write_temp_files(ped = ped) - expect_error( - validate_pedigree(f$ped, f$genos, verbose = FALSE, write_txt = FALSE), - "No valid trios remain" - ) -}) - -test_that("LOW_MARKERS status assigned when markers_tested < min_markers", { - f <- write_temp_files() - res <- validate_pedigree(f$ped, f$genos, verbose = FALSE, - write_txt = FALSE, min_markers = 25L) - expect_true(all(res$Status == "LOW_MARKERS")) - expect_true(all(res$Correction_Decision == "NONE")) + f <- write_temp_files(ped = ped) + tmp_dir <- tempfile() + dir.create(tmp_dir) + old_wd <- getwd() + setwd(tmp_dir) + on.exit({ setwd(old_wd); unlink(tmp_dir, recursive = TRUE) }, add = TRUE) + validate_pedigree(f$ped, f$genos, verbose = FALSE, write_txt = FALSE) + corr <- fread(file.path(tmp_dir, "corrected_pedigree.txt"), + colClasses = "character") + # MISSING_MALE_PARENT — male stays "0", female unchanged + expect_equal(corr[ID == "IND_E"]$Male_Parent, "0") + expect_equal(corr[ID == "IND_E"]$Female_Parent, "IND_B") }) -test_that("trio_error_threshold out of range raises an error", { - f <- write_temp_files() - expect_error(validate_pedigree(f$ped, f$genos, - trio_error_threshold = 150, - verbose = FALSE, write_txt = FALSE)) - expect_error(validate_pedigree(f$ped, f$genos, - trio_error_threshold = -1, - verbose = FALSE, write_txt = FALSE)) -}) +# ============================================================================== +# 8. write_txt / output file +# ============================================================================== -test_that("single_parent_error_threshold out of range raises an error", { - f <- write_temp_files() - expect_error(validate_pedigree(f$ped, f$genos, - single_parent_error_threshold = 101, - verbose = FALSE, write_txt = FALSE)) - expect_error(validate_pedigree(f$ped, f$genos, - single_parent_error_threshold = -5, - verbose = FALSE, write_txt = FALSE)) +test_that("write_txt = TRUE writes output file with correct row count", { + f <- write_temp_files() + out_file <- tempfile(fileext = ".txt") + validate_pedigree(f$ped, f$genos, verbose = FALSE, + write_txt = TRUE, output_filename = out_file) + expect_true(file.exists(out_file)) + written <- fread(out_file) + expect_equal(nrow(written), 2L) }) -test_that("missing required pedigree column raises an error", { - bad_ped <- data.table( - ID = "IND_C", # <-- changed - Parent1 = "IND_A", - Female_Parent = "IND_B" - ) - f <- write_temp_files(ped = bad_ped) - expect_error( - validate_pedigree(f$ped, f$genos, verbose = FALSE, write_txt = FALSE), - "missing required columns" - ) +test_that("write_txt = FALSE does not create output file", { + f <- write_temp_files() + out_file <- tempfile(fileext = ".txt") + validate_pedigree(f$ped, f$genos, verbose = FALSE, + write_txt = FALSE, output_filename = out_file) + expect_false(file.exists(out_file)) }) -test_that("missing ID column in genotypes raises an error", { - bad_genos <- copy(make_genos()) - setnames(bad_genos, "ID", "SampleID") - f <- write_temp_files(genos = bad_genos) - expect_error( - validate_pedigree(f$ped, f$genos, verbose = FALSE, write_txt = FALSE), - "ID" +test_that("output file contains correct number of rows when ghost rows present", { + ped <- rbind( + make_pedigree(), + data.table(ID = "GHOST", Male_Parent = "IND_A", Female_Parent = "IND_B") ) -}) - -test_that("NA markers do not cause errors and are handled gracefully", { - genos <- make_genos() - genos[ID == "IND_C", M1 := NA_integer_] - genos[ID == "IND_C", M2 := NA_integer_] - f <- write_temp_files(genos = genos) - res <- validate_pedigree(f$ped, f$genos, verbose = FALSE, write_txt = FALSE) - expect_equal(res[ID == "IND_C"]$Markers_Tested, 18L) # <-- changed - expect_equal(res[ID == "IND_C"]$Status, "PASS") # <-- changed + f <- write_temp_files(ped = ped) + out_file <- tempfile(fileext = ".txt") + validate_pedigree(f$ped, f$genos, verbose = FALSE, + write_txt = TRUE, output_filename = out_file) + written <- fread(out_file) + # 2 valid trios + 1 ghost = 3 rows total + expect_equal(nrow(written), 3L) }) From ab944e8e02a4eb5ca6491208996aff052b2cc315 Mon Sep 17 00:00:00 2001 From: Alexander Sandercock <39815775+alex-sandercock@users.noreply.github.com> Date: Fri, 17 Apr 2026 08:20:22 -0400 Subject: [PATCH 52/80] revert filterVCF --- R/filterVCF.R | 321 +++++++---------------------------------------- man/filterVCF.Rd | 17 +-- 2 files changed, 54 insertions(+), 284 deletions(-) diff --git a/R/filterVCF.R b/R/filterVCF.R index b9bca78..a54e32e 100644 --- a/R/filterVCF.R +++ b/R/filterVCF.R @@ -17,7 +17,6 @@ #' @param filter.SAMPLE.miss Sample missing data filter #' @param filter.SNP.miss SNP missing data filter #' @param ploidy The ploidy of the species being analyzed -#' @param quality.rates Logical. If TRUE, calculates and outputs CSV files with quality metrics for each marker and sample before filtering (mean depth, genotyping rate, observed heterozygosity). #' @param output.file output file name (optional). If no output.file name provided, then a vcfR object will be returned. #' @return A gzipped vcf file #' @importFrom vcfR read.vcfR @@ -27,114 +26,42 @@ #' @examples #' ## Use file paths for each file on the local system #' +#' #Temp location (only for example) +#' output_file <- tempfile() #' -#' #filterVCF(vcf.file = "example_dart_Dosage_Report.csv", -#' # filter.OD = 0.5, -#' # ploidy = 2, -#' # output.file = "name_for_vcf") +#' filterVCF(vcf.file = system.file("iris_DArT_VCF.vcf.gz", package = "BIGr"), +#' filter.OD = 0.5, +#' filter.MAF = 0.05, +#' ploidy = 2, +#' output.file = output_file) +#' +#' # Removing the output for the example +#' rm(output_file) #' #' ##The function will output the filtered VCF to the current working directory #' #' @export filterVCF <- function(vcf.file, - quality.rates = FALSE, - filter.OD = NULL, - filter.BIAS.min = NULL, - filter.BIAS.max = NULL, - filter.DP = NULL, - filter.MPP = NULL, - filter.PMC = NULL, - filter.MAF = NULL, - filter.SAMPLE.miss = NULL, - filter.SNP.miss = NULL, - ploidy, - output.file = NULL) { + filter.OD = NULL, + filter.BIAS.min = NULL, + filter.BIAS.max = NULL, + filter.DP = NULL, + filter.MPP = NULL, + filter.PMC = NULL, + filter.MAF = NULL, + filter.SAMPLE.miss = NULL, + filter.SNP.miss = NULL, + ploidy, + output.file = NULL) { #Should allow for any INFO field to be entered to be filtered - - - # Read VCF (can be .vcf or .vcf.gz) - + # Import VCF (can be .vcf or .vcf.gz) if (!inherits(vcf.file, "vcfR")) { - vcf <- read.vcfR(vcf.file) + vcf <- read.vcfR(vcf.file, verbose = FALSE) } else { vcf <- vcf.file - } - - # Keep original VCF for pre‑filter statistics - vcf_orig <- vcf - - - # pre‑filtering quality rates - - if (quality.rates) { - ## Extract genotypes, depth and DP matrix - gt_orig <- extract.gt(vcf_orig, element = "GT", as.numeric = FALSE) - - dfmt <- strsplit(vcf_orig@gt[1, "FORMAT"], ":")[[1]] - if ("DP" %in% dfmt) { - dp_orig <- extract.gt(vcf_orig, element = "DP", as.numeric = TRUE) - } else { - dp_orig <- matrix(NA_real_, - nrow = nrow(gt_orig), ncol = ncol(gt_orig), - dimnames = dimnames(gt_orig)) - } - - - # 1. Observed heterozygosity (per‑marker & per‑sample) - - # Helper: TRUE if a genotype is heterozygous (any two different - # alleles, excluding missing "./.") - is_het <- function(g) { - if (is.na(g) || g == "./.") return(FALSE) - alleles <- strsplit(g, split = "[/|]")[[1]] - return(length(unique(alleles)) > 1) - } - #matrix of heterozygous calls - het_mat <- apply(gt_orig, c(1, 2), is_het) - - #Observed heterozygosity per marker and per sample - obs_het_marker <- rowMeans(het_mat, na.rm = TRUE) - obs_het_sample <- colMeans(het_mat, na.rm = TRUE) - - - #Per‑marker stats - - mean_depth_marker <- rowMeans(dp_orig, na.rm = TRUE) - genotype_present <- !is.na(gt_orig) - genotyping_rate_marker <- rowMeans(genotype_present) - - markers_df <- data.frame( - marker = vcf_orig@fix[, "ID"], - mean_depth = round(mean_depth_marker, 2), - genotyping_rate = round(genotyping_rate_marker, 2), - obs_het = round(obs_het_marker, 2), - stringsAsFactors = FALSE - ) - - - #Per‑sample stats - - mean_depth_sample <- colMeans(dp_orig, na.rm = TRUE) - genotyping_rate_sample <- colMeans(genotype_present) - - samples_df <- data.frame( - sample = colnames(gt_orig), - mean_depth = round(mean_depth_sample, 2), - genotyping_rate = round(genotyping_rate_sample, 2), - obs_het = round(obs_het_sample, 2), - stringsAsFactors = FALSE - ) - - - #Write CSV - - base_name <- if (!is.null(output.file)) output.file else "pre_filter" - write.csv(markers_df, paste0(base_name, "_marker_stats.csv"), - row.names = FALSE, quote = FALSE) - write.csv(samples_df, paste0(base_name, "_sample_stats.csv"), - row.names = FALSE, quote = FALSE) + #rm(vcf.file) } #Update header based on user filtering parameters @@ -175,7 +102,7 @@ filterVCF <- function(vcf.file, # Extract the DP values if ("DP" %in% format_fields && !is.null(filter.DP)) { - cat("Filtering by DP\n") + message("Filtering by DP\n") dp <- extract.gt(vcf, element = "DP", as.numeric = TRUE) # Identify cells to modify based on the DP threshold threshold <- as.numeric(filter.DP) @@ -189,7 +116,7 @@ filterVCF <- function(vcf.file, #Filter if the MPP field is present if ("MPP" %in% format_fields && !is.null(filter.MPP)) { - cat("Filtering by MPP\n") + message("Filtering by MPP\n") # Extract the MPP values mpp <- extract.gt(vcf, element = "MPP", as.numeric = TRUE) # Identify cells to modify based on the DP threshold @@ -229,13 +156,13 @@ filterVCF <- function(vcf.file, # Filtering by OD if ("OD" %in% info_ids && !is.null(filter.OD)) { info <- vcf@fix[, "INFO"] #Need to get after each filter.. - cat("Filtering by OD\n") + message("Filtering by OD\n") od_values <- extract_info_value(info, "OD") # Ensure no NA values before filtering if (!all(is.na(od_values))) { vcf <- vcf[od_values < as.numeric(filter.OD), ] } else { - cat("No valid OD values found.\n") + warning("No valid OD values found.\n") } } @@ -244,26 +171,26 @@ filterVCF <- function(vcf.file, # Filtering by BIAS if ("BIAS" %in% info_ids && !is.null(filter.BIAS.min) && !is.null(filter.BIAS.max)) { info <- vcf@fix[, "INFO"] #Need to get after each filter.. - cat("Filtering by BIAS\n") + message("Filtering by BIAS\n") bias_values <- extract_info_value(info, "BIAS") # Ensure no NA values before filtering if (!all(is.na(bias_values))) { vcf <- vcf[bias_values > as.numeric(filter.BIAS.min) & bias_values < as.numeric(filter.BIAS.max), ] } else { - cat("No valid BIAS values found.\n") + warning("No valid BIAS values found.\n") } } # Filtering by PMC if ("PMC" %in% info_ids && !is.null(filter.PMC)) { info <- vcf@fix[, "INFO"] #Need to get after each filter.. - cat("Filtering by PMC\n") + message("Filtering by PMC\n") pmc_values <- extract_info_value(info, "PMC") # Ensure no NA values before filtering if (!all(is.na(pmc_values))) { vcf <- vcf[pmc_values < as.numeric(filter.PMC), ] } else { - cat("No valid PMC values found.\n") + warning("No valid PMC values found.\n") } } @@ -273,14 +200,14 @@ filterVCF <- function(vcf.file, gt_matrix <- extract.gt(vcf, element = "GT", as.numeric = FALSE)#as.matrix(vcfR2genlight(vcf)) if (!is.null(filter.SNP.miss)) { - cat("Filtering by SNP missing data\n") + message("Filtering by SNP missing data\n") snp_missing_data <- rowMeans(is.na(gt_matrix)) vcf <- vcf[snp_missing_data < as.numeric(filter.SNP.miss), ] gt_matrix <- extract.gt(vcf, element = "GT", as.numeric = FALSE) } if (!is.null(filter.SAMPLE.miss)) { - cat("Filtering by Sample missing data\n") + message("Filtering by Sample missing data\n") # Calculate the proportion of missing data for each sample sample_missing_data <- colMeans(is.na(gt_matrix)) # Identify samples to keep based on the missing data threshold @@ -295,112 +222,30 @@ filterVCF <- function(vcf.file, rm(gt_matrix) } - ##Convert GT to dosage - #gt_matrix <- extract.gt(vcf, element = "GT", as.numeric = FALSE)#as.matrix(vcfR2genlight(vcf)) - - # Function to determine the ploidy level from a genotype string - #determine_ploidy <- function(gt) { - # if (is.na(gt)) { - # return(NA) - # } - # return(length(strsplit(gt, "[|/]")[[1]])) - #} - - # Function to find a non-NA example genotype to determine ploidy - #find_example_gt <- function(matrix) { - # for (i in seq_len(nrow(matrix))) { - # for (j in seq_len(ncol(matrix))) { - # if (!is.na(matrix[i, j])) { - # return(matrix[i, j]) - # } - # } - # } - # return(NA) # Return NA if no non-NA genotype is found - #} - - # Find a non-NA example genotype - #example_gt <- find_example_gt(gt_matrix) - - # Determine the ploidy level - #if (!is.na(example_gt)) { - # ploidy <- determine_ploidy(example_gt) - #} else { - # stop("No non-NA genotype found to determine ploidy.") - #} - - # Generate lookup table for genotypes to dosage conversion - #generate_lookup_table <- function(ploidy) { - # possible_alleles <- 0:ploidy - # genotypes <- expand.grid(rep(list(possible_alleles), ploidy)) - # genotypes <- apply(genotypes, 1, function(x) paste(x, collapse = "/")) - # dosage_values <- rowSums(expand.grid(rep(list(possible_alleles), ploidy))) - # lookup_table <- setNames(dosage_values, genotypes) - # return(lookup_table) - #} - - # Generate the lookup table - #lookup_table <- generate_lookup_table(ploidy) - - # Function to convert genotype to dosage using the lookup table - #genotype_to_dosage <- function(gt, lookup_table) { - # if (is.na(gt)) { - # return(NA) - # } - # return(lookup_table[[gt]]) - #} - - # Function to convert genotype matrix to dosage matrix using vectorized operations - #convert_genotypes_to_dosage <- function(gt_matrix, lookup_table) { - # unique_gts <- unique(gt_matrix) - # gt_to_dosage <- setNames(rep(NA, length(unique_gts)), unique_gts) - # valid_gts <- unique_gts[unique_gts %in% names(lookup_table)] - # gt_to_dosage[valid_gts] <- lookup_table[valid_gts] - # dosage_matrix <- gt_to_dosage[gt_matrix] - #colnames(dosage_matrix) <- colnames(gt_matrix) - #row.names(dosage_matrix) <- row.names(gt_matrix) - # return(matrix(as.numeric(dosage_matrix), nrow = nrow(gt_matrix), ncol = ncol(gt_matrix))) - #} - - # Convert the genotype matrix to dosage matrix - #dosage_matrix <- convert_genotypes_to_dosage(gt_matrix, lookup_table) - ##MAF filter - #Compare my lengthy process to estimate MAF with vcfR::maf() function - #The BIGr::calculate_MAF(dosage_matrix, ploidy) is the exact same as the vcfR::maf() calculations - #The step where I extract UD and calculate MAF is different... - #if ("UD" %in% format_fields) { - # maf_df <- BIGr::calculate_MAF(extract.gt(vcf, element = "UD", as.numeric = TRUE), ploidy = ploidy) - #} else { - #convert genotypes to dosage and filter - # maf_df <- BIGr::calculate_MAF(dosage_matrix, ploidy) - #} - #Need to confirm that vcfR::maf will work with any ploidy...if not, use my code if (!is.null(filter.MAF)) { - cat("Filtering by MAF\n") + message("Filtering by MAF\n") maf_df <- data.frame(vcfR::maf(vcf, element = 2)) vcf <- vcf[maf_df$Frequency > as.numeric(filter.MAF), ] } ### Export the modified VCF file (this exports as a .vcf.gz, so make sure to have the name end in .vcf.gz) - cat("Exporting VCF\n") - if (!inherits(vcf.file, "vcfR")){ - if (!is.null(output.file)){ - output_name <- paste0(output.file,".vcf.gz") + message("Exporting VCF\n") + if (!inherits(vcf.file, "vcfR")) { + if (!is.null(output.file)) { + output_name <- paste0(output.file, ".vcf.gz") vcfR::write.vcf(vcf, file = output_name) - }else{ + } else { return(vcf) } - }else{ - if (!is.null(output.file)){ - output_name <- paste0(output.file,"_filtered.vcf.gz") + } else { + if (!is.null(output.file)) { + output_name <- paste0(output.file, "_filtered.vcf.gz") vcfR::write.vcf(vcf, file = output_name) - }else{ + } else { return(vcf) } } - #Message that includes the output vcf stats - print(vcf) - #Message samples_removed <- starting_samples - (ncol(vcf@gt)-1) SNPs_removed <- starting_snps - nrow(vcf) @@ -408,81 +253,3 @@ filterVCF <- function(vcf.file, message("SNPs removed due to filtering: ",SNPs_removed) message("Complete!") } -#This is not reliable, so no longer use this shortcut to get dosage matrix -#test2 <- vcfR2genlight(vcf) - - -#####Testing custom VCF reading function###### -# Open the gzipped VCF file -#con <- gzfile("/Users/ams866/Desktop/output.vcf", "rt") - -# Read in the entire file -#lines <- readLines(con) -#close(con) -# Read in the entire file -#lines <- readLines("/Users/ams866/Desktop/output.vcf") -# Filter out lines that start with ## -#filtered_lines <- lines[!grepl("^##", lines)] -# Create a temporary file to write the filtered lines -#temp_file <- tempfile() -#writeLines(filtered_lines, temp_file) -# Read in the filtered data using read.table or read.csv -#vcf_data <- read.table(temp_file, header = TRUE, sep = "\t", comment.char = "", check.names = FALSE) -# Clean up the temporary file -#unlink(temp_file) - -##Extract INFO column and Filter SNPs by those values -#Update the filtering options by the items present in the INFO column? - -# Load required library -#library(dplyr) - -# Split INFO column into key-value pairs -#vcf_data_parsed <- vcf_data %>% -# mutate(INFO_PARSED = strsplit(INFO, ";")) %>% -# unnest(INFO_PARSED) %>% -# separate(INFO_PARSED, into = c("KEY", "VALUE"), sep = "=") %>% -# spread(KEY, VALUE) - -#Filter by DP -#filtered_vcf_data <- vcf_data_parsed %>% -# filter(as.numeric(DP) > 10) - -# View the filtered dataframe -#print(filtered_vcf_data) - -##Extracting and filtering by FORMAT column -# Identify the columns that are not sample columns -#non_sample_cols <- c("#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO", "FORMAT") -# Identify the sample columns -#sample_cols <- setdiff(names(vcf_data), non_sample_cols) -# Extract FORMAT keys -#format_keys <- strsplit(as.character(vcf_data$FORMAT[1]), ":")[[1]] -# Split SAMPLE columns based on FORMAT -#vcf_data_samples <- vcf_data %>% -# mutate(across(all_of(sample_cols), ~strsplit(as.character(.), ":"))) %>% -# mutate(across(all_of(sample_cols), ~map(., ~setNames(as.list(.), format_keys)))) %>% -# unnest_wider(all_of(sample_cols), names_sep = "_") - -# View the parsed dataframe -#print(head(vcf_data_samples)) - -# Create separate dataframes for each FORMAT variable -#format_dfs <- lapply(format_keys, function(format_key) { -# vcf_data_samples %>% -# select(ID, ends_with(paste0("_", format_key))) %>% -# column_to_rownames("ID") -#}) - -# Assign names to the list elements -#names(format_dfs) <- format_keys - -# Access the separate dataframes -#gt_df <- format_dfs$GT # Genotype dataframe -#ad_df <- format_dfs$AD # Allelic depths dataframe - -#*I think the above method is okay if you only need to filter at the INFO level, -#*But I think if you want to filter for FORMAT, that vcfR is probably best, -#*Will need to explore further if I can easily just filter for MPP by checking if it is above a -#*threshold, and then converting the GT and UD values to NA if so... -#*If that is efficient and works, then I will just use this custom VCF method... diff --git a/man/filterVCF.Rd b/man/filterVCF.Rd index 0342fe1..676ef7f 100644 --- a/man/filterVCF.Rd +++ b/man/filterVCF.Rd @@ -6,7 +6,6 @@ \usage{ filterVCF( vcf.file, - quality.rates = FALSE, filter.OD = NULL, filter.BIAS.min = NULL, filter.BIAS.max = NULL, @@ -23,8 +22,6 @@ filterVCF( \arguments{ \item{vcf.file}{vcfR object or path to VCF file. Can be unzipped (.vcf) or gzipped (.vcf.gz).} -\item{quality.rates}{Logical. If TRUE, calculates and outputs CSV files with quality metrics for each marker and sample before filtering (mean depth, genotyping rate, observed heterozygosity).} - \item{filter.OD}{Updog filter} \item{filter.BIAS.min}{Updog filter (requires a value for both BIAS.min and BIAS.max)} @@ -61,11 +58,17 @@ The VCF format is v4.3 \examples{ ## Use file paths for each file on the local system +#Temp location (only for example) +output_file <- tempfile() + +filterVCF(vcf.file = system.file("iris_DArT_VCF.vcf.gz", package = "BIGr"), + filter.OD = 0.5, + filter.MAF = 0.05, + ploidy = 2, + output.file = output_file) -#filterVCF(vcf.file = "example_dart_Dosage_Report.csv", - # filter.OD = 0.5, - # ploidy = 2, - # output.file = "name_for_vcf") +# Removing the output for the example +rm(output_file) ##The function will output the filtered VCF to the current working directory From 4ac0c5526fc7baec013902d717717b2ab16baa96 Mon Sep 17 00:00:00 2001 From: Alexander Sandercock <39815775+alex-sandercock@users.noreply.github.com> Date: Fri, 17 Apr 2026 08:20:52 -0400 Subject: [PATCH 53/80] support LUT Marker_ID --- R/madc2vcf_targets.R | 12 +++++++----- man/madc2vcf_targets.Rd | 2 +- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/R/madc2vcf_targets.R b/R/madc2vcf_targets.R index 888c445..cc72e3e 100644 --- a/R/madc2vcf_targets.R +++ b/R/madc2vcf_targets.R @@ -71,7 +71,7 @@ #' `ChromPos` is invalid and `markers_info` does not provide `Ref`/`Alt`. #' @param markers_info character or `NULL`. Optional path to a CSV providing target #' metadata. Accepted columns: -#' - `CloneID` or `BI_markerID` (required as marker identifier); +#' - `CloneID`, `Marker_ID`, or `BI_markerID` (required as marker identifier); #' - `Chr`, `Pos` — required when `CloneID` does not follow the `Chr_Pos` format; #' - `Ref`, `Alt` — required when `get_REF_ALT = TRUE` and probe-sequence #' inference is not possible (IUPAC codes, indels, or unfixed allele IDs). @@ -237,7 +237,7 @@ madc2vcf_targets <- function(madc_file, if(!isTRUE(checks$checks["ChromPos"])) { if(is.null(markers_info)){ stop("CloneID column does not follow the 'Chr_Pos'. ", - "Please provide a markers_info file with at least 'CloneID'/'BI_markerID', ", + "Please provide a markers_info file with at least 'CloneID'/'Marker_ID'/'BI_markerID', ", "'Chr', and 'Pos' columns.") } else { @@ -309,7 +309,8 @@ madc2vcf_targets <- function(madc_file, if(is.null(mi_df)) mi_df <- read.csv(markers_info) id_col <- if ("BI_markerID" %in% colnames(mi_df)) "BI_markerID" else if ("CloneID" %in% colnames(mi_df)) "CloneID" else - stop("The markers_info file must contain a marker ID column named either 'CloneID' or 'BI_markerID'.") + if ("Marker_ID" %in% colnames(mi_df)) "Marker_ID" else + stop("The markers_info file must contain a marker ID column named either 'CloneID', 'Marker_ID' or 'BI_markerID'.") if(checks$checks["Indels"]) vmsg("Indels detected in MADC file. But it is okay because Ref and Alt are provided in markers_info.", @@ -321,7 +322,7 @@ madc2vcf_targets <- function(madc_file, if(!all(rownames(ad_df) %in% mi_df[[id_col]])) { miss_CloneIDs <- rownames(ad_df)[!rownames(ad_df) %in% mi_df[[id_col]]] - if(length(miss_CloneIDs) == nrow(ad_df)) stop("None of the MADC CloneID could be found in the markers_info CloneID or BI_markerID. Please make sure they match.") + if(length(miss_CloneIDs) == nrow(ad_df)) stop("None of the MADC CloneID could be found in the markers_info CloneID, Marker_ID or BI_markerID. Please make sure they match.") vmsg(paste("Not all MADC CloneID was found in the markers_info file. These markers will be removed:", paste(miss_CloneIDs, collapse = " ")), verbose = verbose, level = 2, type = ">>") warning("Not all MADC CloneID was found in the markers_info file. These markers will be removed.") @@ -342,7 +343,8 @@ madc2vcf_targets <- function(madc_file, if(is.null(mi_df)) mi_df <- read.csv(markers_info) id_col <- if ("BI_markerID" %in% colnames(mi_df)) "BI_markerID" else if ("CloneID" %in% colnames(mi_df)) "CloneID" else - stop("The markers_info file must contain a marker ID column named either 'CloneID' or 'BI_markerID'.") + if ("Marker_ID" %in% colnames(mi_df)) "Marker_ID" else + stop("The markers_info file must contain a marker ID column named either 'CloneID', 'Marker_ID' or 'BI_markerID'.") if(checks$checks["Indels"]) vmsg("Indels detected in MADC file. Since get_REF_ALT = FALSE, Type and Indel_pos are not required in markers_info.", diff --git a/man/madc2vcf_targets.Rd b/man/madc2vcf_targets.Rd index 30363a6..25f99d5 100644 --- a/man/madc2vcf_targets.Rd +++ b/man/madc2vcf_targets.Rd @@ -28,7 +28,7 @@ Used for strand-correcting probe sequences when \code{get_REF_ALT = TRUE} and \item{markers_info}{character or \code{NULL}. Optional path to a CSV providing target metadata. Accepted columns: \itemize{ -\item \code{CloneID} or \code{BI_markerID} (required as marker identifier); +\item \code{CloneID}, \code{Marker_ID}, or \code{BI_markerID} (required as marker identifier); \item \code{Chr}, \code{Pos} — required when \code{CloneID} does not follow the \code{Chr_Pos} format; \item \code{Ref}, \code{Alt} — required when \code{get_REF_ALT = TRUE} and probe-sequence inference is not possible (IUPAC codes, indels, or unfixed allele IDs). From 5234572eb39849f68a20d92c68d9040356a82556 Mon Sep 17 00:00:00 2001 From: Alexander Sandercock <39815775+alex-sandercock@users.noreply.github.com> Date: Fri, 17 Apr 2026 09:33:10 -0400 Subject: [PATCH 54/80] Apply suggestion from @Copilot Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- R/madc2vcf_multi.R | 3 +++ 1 file changed, 3 insertions(+) diff --git a/R/madc2vcf_multi.R b/R/madc2vcf_multi.R index bcbae02..2ff7b0e 100644 --- a/R/madc2vcf_multi.R +++ b/R/madc2vcf_multi.R @@ -166,6 +166,9 @@ madc2vcf_multi <- function(madc_file, vmsg("Loading MADC into polyRAD", verbose = verbose, level = 0, type = ">>") + if (!requireNamespace("polyRAD", quietly = TRUE)) { + stop("Package 'polyRAD' is required for madc2vcf_multi(). Please install it with install.packages('polyRAD').", call. = FALSE) + } raddat <- polyRAD::readDArTag( file = input_file, botloci = botloci_input, From 8c9dcda701ef63b4c3620979ee7205fbd4308751 Mon Sep 17 00:00:00 2001 From: Alexander Sandercock <39815775+alex-sandercock@users.noreply.github.com> Date: Mon, 20 Apr 2026 08:46:33 -0400 Subject: [PATCH 55/80] Apply suggestion from @Copilot Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- R/madc2vcf_targets.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/madc2vcf_targets.R b/R/madc2vcf_targets.R index cc72e3e..a16fdfb 100644 --- a/R/madc2vcf_targets.R +++ b/R/madc2vcf_targets.R @@ -165,7 +165,7 @@ madc2vcf_targets <- function(madc_file, "verbose= ", verbose,')">') # MADC checks - report <- read.csv(madc_file) + report <- read.csv(madc_file, check.names = FALSE) checks <- check_madc_sanity(report) messages_results <- mapply(function(check, message) { From b4d5534d7fde36a053be8245e6905548e3b92301 Mon Sep 17 00:00:00 2001 From: Alexander Sandercock <39815775+alex-sandercock@users.noreply.github.com> Date: Fri, 17 Apr 2026 08:28:04 -0400 Subject: [PATCH 56/80] covered error case --- R/get_countsMADC.R | 1 + 1 file changed, 1 insertion(+) diff --git a/R/get_countsMADC.R b/R/get_countsMADC.R index 3a9bc2b..c1045ec 100644 --- a/R/get_countsMADC.R +++ b/R/get_countsMADC.R @@ -54,6 +54,7 @@ get_countsMADC <- function(madc_file = NULL, madc_object = NULL, collapse_matche # Add check inputs if(is.null(madc_file) && is.null(madc_object)) stop("Please provide either madc_file or madc_object.") + if(!is.null(madc_file) && !is.null(madc_object)) stop("Please provide either madc_file or madc_object. Not both.") if(!is.null(madc_file) && !file.exists(madc_file)) stop("MADC file not found. Please provide a valid path.") if(!is.null(madc_object) && !is.data.frame(madc_object)) stop("madc_object must be a data frame.") From 57bbc89c93cb677bc614c4389ffc50df581465a8 Mon Sep 17 00:00:00 2001 From: Alexander Sandercock <39815775+alex-sandercock@users.noreply.github.com> Date: Fri, 17 Apr 2026 09:29:30 -0400 Subject: [PATCH 57/80] add example and suggest ggplot --- DESCRIPTION | 4 +-- NAMESPACE | 1 - R/imputation_concordance.R | 46 +++++++++++++++++++++++------------ man/imputation_concordance.Rd | 23 ++++++++++++++---- 4 files changed, 51 insertions(+), 23 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 48ede7c..672faac 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -62,10 +62,10 @@ Imports: janitor, quadprog, tibble, - stringr, - ggplot2 + stringr Suggests: covr, + ggplot2, spelling, rmdformats, knitr (>= 1.10), diff --git a/NAMESPACE b/NAMESPACE index ae09080..e6cbc30 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -24,7 +24,6 @@ export(thinSNP) export(updog2vcf) export(vmsg) import(dplyr) -import(ggplot2) import(janitor) import(parallel) import(quadprog) diff --git a/R/imputation_concordance.R b/R/imputation_concordance.R index 1eb441a..918071a 100644 --- a/R/imputation_concordance.R +++ b/R/imputation_concordance.R @@ -47,19 +47,31 @@ #' is generated using \pkg{ggplot2}. #' #' @import dplyr -#' @import ggplot2 #' #' @examples -#' \dontrun{ +#' ref <- data.frame( +#' ID = c("S1", "S2", "S3"), +#' SNP1 = c(0, 1, 2), +#' SNP2 = c(1, 1, 0), +#' SNP3 = c(2, 5, 1) +#' ) +#' +#' test <- data.frame( +#' ID = c("S1", "S2", "S3"), +#' SNP1 = c(0, 0, 2), +#' SNP2 = c(1, 1, 1), +#' SNP3 = c(2, 5, 0) +#' ) +#' #' result <- imputation_concordance( #' reference_genos = ref, #' imputed_genos = test, -#' snps_2_exclude = snps, +#' snps_2_exclude = "SNP2", #' missing_code = 5, -#' verbose = TRUE, -#' plot = TRUE +#' print_result = FALSE #' ) -#' } +#' +#' result #' #' @importFrom stats reorder #' @export @@ -136,21 +148,25 @@ imputation_concordance <- function(reference_genos, # Optional plot if (plot) { + if (!requireNamespace("ggplot2", quietly = TRUE)) { + stop("Package 'ggplot2' is required when plot = TRUE.", call. = FALSE) + } plot_df <- data.frame( ID = imputed_genos$ID, Concordance = percentage_match * 100 ) - concordance_plot <- ggplot(plot_df, - aes(x = reorder(ID, Concordance), - y = Concordance)) + - geom_bar(stat = "identity") + - labs(title = "Imputation Concordance by Sample", - x = "Sample ID", - y = "Concordance (%)") + - theme_minimal() + - theme(axis.text.x = element_text(angle = 90, hjust = 1)) + concordance_plot <- ggplot2::ggplot( + plot_df, + ggplot2::aes(x = reorder(ID, Concordance), y = Concordance) + ) + + ggplot2::geom_bar(stat = "identity") + + ggplot2::labs(title = "Imputation Concordance by Sample", + x = "Sample ID", + y = "Concordance (%)") + + ggplot2::theme_minimal() + + ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 90, hjust = 1)) print(concordance_plot) } diff --git a/man/imputation_concordance.Rd b/man/imputation_concordance.Rd index 31f54a8..22e9462 100644 --- a/man/imputation_concordance.Rd +++ b/man/imputation_concordance.Rd @@ -64,15 +64,28 @@ When \code{plot = TRUE}, a bar plot showing concordance percentage per sample is generated using \pkg{ggplot2}. } \examples{ -\dontrun{ +ref <- data.frame( + ID = c("S1", "S2", "S3"), + SNP1 = c(0, 1, 2), + SNP2 = c(1, 1, 0), + SNP3 = c(2, 5, 1) +) + +test <- data.frame( + ID = c("S1", "S2", "S3"), + SNP1 = c(0, 0, 2), + SNP2 = c(1, 1, 1), + SNP3 = c(2, 5, 0) +) + result <- imputation_concordance( reference_genos = ref, imputed_genos = test, - snps_2_exclude = snps, + snps_2_exclude = "SNP2", missing_code = 5, - verbose = TRUE, - plot = TRUE + print_result = FALSE ) -} + +result } From 4bb67fd1a89cebd73b90a6d259cb3c625c01b817 Mon Sep 17 00:00:00 2001 From: Alexander Sandercock <39815775+alex-sandercock@users.noreply.github.com> Date: Fri, 17 Apr 2026 09:29:45 -0400 Subject: [PATCH 58/80] make get_counts internal --- R/get_countsMADC.R | 3 ++- man/get_counts.Rd | 60 ------------------------------------------- man/get_countsMADC.Rd | 2 +- 3 files changed, 3 insertions(+), 62 deletions(-) delete mode 100644 man/get_counts.Rd diff --git a/R/get_countsMADC.R b/R/get_countsMADC.R index c1045ec..57b83ce 100644 --- a/R/get_countsMADC.R +++ b/R/get_countsMADC.R @@ -46,7 +46,7 @@ #' #' rm(counts_matrices) #' -#' @seealso [get_counts()], [check_madc_sanity()] +#' @seealso [check_madc_sanity()] #' #' @import dplyr #' @export @@ -182,6 +182,7 @@ get_countsMADC <- function(madc_file = NULL, madc_object = NULL, collapse_matche #' @importFrom dplyr %>% filter case_when #' #' @keywords internal +#' @noRd get_counts <- function(madc_file = NULL, madc_object = NULL, collapse_matches_counts = FALSE, verbose = TRUE) { # Add check inputs diff --git a/man/get_counts.Rd b/man/get_counts.Rd deleted file mode 100644 index 1879e07..0000000 --- a/man/get_counts.Rd +++ /dev/null @@ -1,60 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/get_countsMADC.R -\name{get_counts} -\alias{get_counts} -\title{Read and Pre-process a MADC File} -\usage{ -get_counts( - madc_file = NULL, - madc_object = NULL, - collapse_matches_counts = FALSE, - verbose = TRUE -) -} -\arguments{ -\item{madc_file}{character or \code{NULL}. Path to the input MADC CSV file. -At least one of \code{madc_file} or \code{madc_object} must be provided.} - -\item{madc_object}{data frame or \code{NULL}. A pre-read MADC data frame -(e.g., from \code{check_botloci()}). When supplied, file reading is skipped. -At least one of \code{madc_file} or \code{madc_object} must be provided.} - -\item{collapse_matches_counts}{logical. If \code{TRUE}, counts for \verb{|AltMatch} -and \verb{|RefMatch} rows are summed into their corresponding \verb{|Ref} and \verb{|Alt} -rows. If \code{FALSE} (default), those rows are discarded.} - -\item{verbose}{logical. Whether to print progress messages. Default is \code{TRUE}.} -} -\value{ -A data frame with one row per \code{Ref} or \code{Alt} allele entry, retaining -all original columns (\code{AlleleID}, \code{CloneID}, \code{AlleleSequence}, sample -count columns, etc.). -} -\description{ -Reads a DArTag MADC CSV file (or accepts a pre-read data frame), detects the -file format, and returns a filtered data frame containing only \code{Ref} and \code{Alt} -haplotype rows ready for count-matrix construction. -} -\details{ -\strong{Input}: either \code{madc_file} (path to CSV) or \code{madc_object} (pre-read data -frame) must be supplied; at least one is required. - -\strong{Format detection} (applied to file or object alike): the first seven rows -of the first column are inspected: -\itemize{ -\item \strong{Standard format}: all entries are blank or \code{"*"} — the first 7 rows are -treated as DArT placeholder rows and skipped. -\item \strong{Fixed-allele-ID format}: no filler rows — data are used as-is. -} - -\strong{\verb{|AltMatch} / \verb{|RefMatch} handling} (controlled by \code{collapse_matches_counts}): -\itemize{ -\item \code{FALSE} (default): these rows are simply discarded. -\item \code{TRUE}: their counts are summed into the corresponding \verb{|Ref} or \verb{|Alt} -row for the same \code{CloneID}. -} - -In all cases, trailing suffixes on \code{AlleleID} (e.g., \verb{|Ref_001}, \verb{|Alt_002}) -are stripped to the canonical \verb{|Ref} / \verb{|Alt} form. -} -\keyword{internal} diff --git a/man/get_countsMADC.Rd b/man/get_countsMADC.Rd index 28fca1e..207b899 100644 --- a/man/get_countsMADC.Rd +++ b/man/get_countsMADC.Rd @@ -67,5 +67,5 @@ rm(counts_matrices) } \seealso{ -\code{\link[=get_counts]{get_counts()}}, \code{\link[=check_madc_sanity]{check_madc_sanity()}} +\code{\link[=check_madc_sanity]{check_madc_sanity()}} } From ff1ef84d3a1ba905736c9d321527b70ca675e3a2 Mon Sep 17 00:00:00 2001 From: Alexander Sandercock <39815775+alex-sandercock@users.noreply.github.com> Date: Fri, 17 Apr 2026 09:29:51 -0400 Subject: [PATCH 59/80] update test --- tests/testthat/test-madc2vcf_targets.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/testthat/test-madc2vcf_targets.R b/tests/testthat/test-madc2vcf_targets.R index a64da34..b5e8a60 100644 --- a/tests/testthat/test-madc2vcf_targets.R +++ b/tests/testthat/test-madc2vcf_targets.R @@ -218,7 +218,7 @@ test_that("simu alfalfa",{ botloci_file = alfalfa_botloci, markers_info = alfalfa_markers_info, verbose = FALSE), - "None of the MADC CloneID could be found in the markers_info CloneID or BI_markerID. Please make sure they match." + "None of the MADC CloneID could be found in the markers_info CloneID, Marker_ID or BI_markerID. Please make sure they match." ) # Test that it works when the function can find a matching ID in markers_info to fix the botloci mismatch issue From 86b4fef4e24b827595036749cad51b2a9e645f27 Mon Sep 17 00:00:00 2001 From: Alexander Sandercock <39815775+alex-sandercock@users.noreply.github.com> Date: Mon, 20 Apr 2026 11:12:20 -0400 Subject: [PATCH 60/80] added marker_id support --- R/check_madc_sanity.R | 58 ++++++++++----- tests/testthat/test-check_madc_sanity.R | 97 +++++++++++++++++++++++++ 2 files changed, 137 insertions(+), 18 deletions(-) diff --git a/R/check_madc_sanity.R b/R/check_madc_sanity.R index fda01d5..88f564a 100644 --- a/R/check_madc_sanity.R +++ b/R/check_madc_sanity.R @@ -156,7 +156,10 @@ check_madc_sanity <- function(report) { pos <- strsplit(report$CloneID, "_") format <- all(sapply(pos, length) == 2) first <- all(grepl("^[A-Za-z]", sapply(pos, "[", 1))) - second <- suppressWarnings(all(sapply(pos, function(x) as.numeric(x[2])) > 0)) + second <- all(vapply(pos, function(x) { + pos_num <- suppressWarnings(as.numeric(x[2])) + !is.na(pos_num) && pos_num > 0 + }, logical(1))) checks["ChromPos"] <- all(format, first, second) } else checks["ChromPos"] <- FALSE @@ -216,7 +219,9 @@ check_madc_sanity <- function(report) { #' @param botloci A data frame containing the botloci markers. #' @param report A data frame containing the MADC markers. #' @param ChromPos logical value indicating whether the CloneID in the MADC file contains chromosome and position information in the format "Chr_Pos". Default is TRUE -#' @param mi_df A data frame containing marker information with columns CloneID, Chr, and Pos. Required if `ChromPos` is FALSE. +#' @param mi_df A data frame containing marker information with one marker ID column +#' (`CloneID`, `Marker_ID`, or `BI_markerID`) plus `Chr` and `Pos`. Required if +#' `ChromPos` is FALSE. #' @param verbose A logical value indicating whether to print detailed messages about the adjustments. Default is TRUE. Required if `ChromPos` is FALSE. #' #' @return A list containing the adjusted botloci and MADC data frames. @@ -226,23 +231,42 @@ check_madc_sanity <- function(report) { #' #' @keywords internal #' @noRd +pick_markers_info_id_col <- function(mi_df, query_ids) { + query_ids <- unique(stats::na.omit(query_ids)) + id_cols <- intersect(c("CloneID", "BI_markerID", "Marker_ID"), colnames(mi_df)) + + if (!length(id_cols)) { + stop("The markers_info file must contain a marker ID column named either 'CloneID', 'Marker_ID' or 'BI_markerID'.") + } + + match_n <- vapply(id_cols, function(col) { + sum(query_ids %in% unique(stats::na.omit(mi_df[[col]]))) + }, integer(1)) + + if (!any(match_n)) { + stop("None of the MADC CloneID could be found in the markers_info CloneID, Marker_ID or BI_markerID. Please make sure they match.") + } + + id_cols[which.max(match_n)] +} + check_botloci <- function(botloci, report, ChromPos=TRUE, mi_df = NULL, verbose=TRUE){ + original_clone_ids <- report$CloneID + use_col <- NULL # Check inputs if(!ChromPos) { - if(is.null(mi_df)) stop("When MADC CloneID don't follow the format Chr_Pos, a marker_info file with CloneID, Chr and Pos columns must be provided.") - # if exists, it must contain CloneID or BI_markerID that matches the report$CloneID, and Chr and Pos columns - if(!any(mi_df$CloneID %in% report$CloneID) & !any(mi_df$BI_markerID %in% report$CloneID)) { - stop("None of the MADC CloneID could be found in the markers_info CloneID or BI_markerID. Please make sure they match.") - } else { - use_col <- if(any(mi_df$CloneID %in% report$CloneID)) "CloneID" else "BI_markerID" - vmsg(paste("Using", use_col, "column in marker_info to match MADC CloneID"), verbose = verbose, level = 1, type = ">>") - } + if(is.null(mi_df)) stop("When MADC CloneID don't follow the format Chr_Pos, a marker_info file with 'CloneID'/'Marker_ID'/'BI_markerID', 'Chr', and 'Pos' columns must be provided.") + use_col <- pick_markers_info_id_col(mi_df, report$CloneID) + vmsg(paste("Using", use_col, "column in marker_info to match MADC CloneID"), verbose = verbose, level = 1, type = ">>") if(is.null(mi_df$Chr) | is.null(mi_df$Pos)) stop("When MADC CloneID don't follow the format Chr_Pos, Chr and Pos columns must be provided in the markers_info file.") } if(!any(botloci$V1 %in% report$CloneID)) { # First check if any botloci markers are found in MADC file. If not, check for padding mismatch. vmsg("No botloci markers found in MADC file. Checking for padding mismatch...", verbose = verbose, level = 1, type = ">>") + if(!is.null(mi_df) && is.null(use_col)) { + use_col <- pick_markers_info_id_col(mi_df, original_clone_ids) + } pad_madc <- unique(nchar(sub(".*_", "", report$CloneID))) pad_botloci <- unique(nchar(sub(".*_", "", botloci$V1))) @@ -257,10 +281,11 @@ check_botloci <- function(botloci, report, ChromPos=TRUE, mi_df = NULL, verbose= ) report$AlleleID <- paste0(report$CloneID, "|", sapply(strsplit(report$AlleleID, "[|]"), "[[",2)) if(!is.null(mi_df)) { - mi_df$CloneID <- paste0(sub("_(.*)", "", mi_df$CloneID), "_", - sprintf(paste0("%0", pad_botloci, "d"), as.integer(sub(".*_", "", mi_df$CloneID))) + mi_df$CloneID <- paste0(sub("_(.*)", "", mi_df[[use_col]]), "_", + sprintf(paste0("%0", pad_botloci, "d"), as.integer(sub(".*_", "", mi_df[[use_col]]))) ) } + if(!any(botloci$V1 %in% report$CloneID)) stop("After matching padding, botloci markers still not found in MADC file. Check marker IDs.\n") } else { botloci$V1 <- paste0(sub("_(.*)", "", botloci$V1), "_", sprintf(paste0("%0", pad_madc, "d"), as.integer(sub(".*_", "", botloci$V1))) @@ -270,12 +295,8 @@ check_botloci <- function(botloci, report, ChromPos=TRUE, mi_df = NULL, verbose= } else if (!(is.null(mi_df$Chr) | is.null(mi_df$Pos))){ vmsg("It is not a padding mismatch issue.", verbose = verbose, level = 2, type = ">>") vmsg("Checking if jointing provided Chromosome and Position information in marker_file solve the issue", verbose = verbose, level = 2, type = ">>") - if(!any(mi_df$CloneID %in% report$CloneID) & !any(mi_df$BI_markerID %in% report$CloneID)) { - stop("None of the MADC CloneID could be found in the markers_info CloneID or BI_markerID. Please make sure they match.") - } else { - use_col <- if(any(mi_df$CloneID %in% report$CloneID)) "CloneID" else "BI_markerID" - vmsg(paste("Using", use_col, "column in marker_info to match MADC CloneID"), verbose = verbose, level = 2, type = ">>") - } + use_col <- pick_markers_info_id_col(mi_df, report$CloneID) + vmsg(paste("Using", use_col, "column in marker_info to match MADC CloneID"), verbose = verbose, level = 2, type = ">>") mk_info_CloneID <- paste0(mi_df$Chr, "_", sprintf(paste0("%0",pad_botloci, "d"), as.integer(mi_df$Pos))) if(!any(botloci$V1 %in% mk_info_CloneID)){ @@ -286,6 +307,7 @@ check_botloci <- function(botloci, report, ChromPos=TRUE, mi_df = NULL, verbose= vmsg("Chromosome and Position information in marker_file solve the issue.", verbose = verbose, level = 2, type = ">>") vmsg("Using this information to modify MADC CloneIDs to match botloci markers.", verbose = verbose, level = 2, type = ">>") report$CloneID <- mk_info_CloneID[match(report$CloneID, mi_df[[use_col]])] + report$AlleleID <- paste0(report$CloneID, "|", sapply(strsplit(report$AlleleID, "[|]"), "[[",2)) mi_df$CloneID <- mk_info_CloneID } } else { diff --git a/tests/testthat/test-check_madc_sanity.R b/tests/testthat/test-check_madc_sanity.R index d0c45cf..b1a3857 100644 --- a/tests/testthat/test-check_madc_sanity.R +++ b/tests/testthat/test-check_madc_sanity.R @@ -68,4 +68,101 @@ test_that("check madc",{ expect_equal(res$checks, exp) }) +test_that("check_botloci remaps using Marker_ID", { + botloci <- data.frame(V1 = c("1_0001", "2_0002")) + report <- data.frame( + CloneID = c("ProbeA_0001", "ProbeB_0002"), + AlleleID = c("ProbeA_0001|Ref_0001", "ProbeB_0002|Ref_0001"), + AlleleSequence = c("A", "T"), + check.names = FALSE + ) + mi_df <- data.frame( + Marker_ID = c("ProbeA_0001", "ProbeB_0002"), + Chr = c("1", "2"), + Pos = c(1, 2) + ) + + res <- check_botloci(botloci, report, ChromPos = FALSE, mi_df = mi_df, verbose = FALSE) + + expect_equal(res[[2]]$CloneID, botloci$V1) + expect_equal(res[[3]]$CloneID, botloci$V1) +}) + +test_that("check_botloci resolves Marker_ID before padding report CloneIDs", { + botloci <- data.frame(V1 = "1_000000123") + report <- data.frame( + CloneID = "1_123", + AlleleID = "1_123|Ref_0001", + AlleleSequence = "A", + check.names = FALSE + ) + mi_df <- data.frame( + Marker_ID = "1_123", + Chr = "1", + Pos = 123 + ) + + res <- check_botloci(botloci, report, ChromPos = TRUE, mi_df = mi_df, verbose = FALSE) + + expect_equal(res[[2]]$CloneID, botloci$V1) + expect_equal(res[[3]]$CloneID, botloci$V1) +}) + +test_that("pick_markers_info_id_col scores distinct markers not allele rows", { + mi_df <- data.frame( + CloneID = c("m1", "m2"), + Marker_ID = c("m1", "m3") + ) + query_ids <- c("m1", "m1", "m1", "m2") + + expect_equal(pick_markers_info_id_col(mi_df, query_ids), "CloneID") +}) + +test_that("check_madc_sanity returns FALSE for malformed CloneID positions", { + report <- data.frame( + CloneID = c("Chr_abc", "Chr_abc"), + AlleleID = c("Chr_abc|Ref_0001", "Chr_abc|Alt_0002"), + AlleleSequence = c("A", "T"), + check.names = FALSE + ) + res <- check_madc_sanity(report) + + expect_false(is.na(res$checks["ChromPos"])) + expect_false(res$checks["ChromPos"]) +}) + +test_that("check_botloci errors if widening MADC padding still does not match", { + botloci <- data.frame(V1 = "1_0002") + report <- data.frame( + CloneID = "1_1", + AlleleID = "1_1|Ref_0001", + AlleleSequence = "A", + check.names = FALSE + ) + + expect_error( + check_botloci(botloci, report, ChromPos = TRUE, verbose = FALSE), + "After matching padding, botloci markers still not found in MADC file. Check marker IDs." + ) +}) + +test_that("check_botloci keeps AlleleID synchronized after CloneID remap", { + botloci <- data.frame(V1 = "1_0001") + report <- data.frame( + CloneID = "ProbeA_0001", + AlleleID = "ProbeA_0001|Ref_0001", + AlleleSequence = "A", + check.names = FALSE + ) + mi_df <- data.frame( + Marker_ID = "ProbeA_0001", + Chr = "1", + Pos = 1 + ) + + res <- check_botloci(botloci, report, ChromPos = TRUE, mi_df = mi_df, verbose = FALSE) + + expect_equal(res[[2]]$CloneID, "1_0001") + expect_equal(res[[2]]$AlleleID, "1_0001|Ref_0001") +}) From c10d134ba604cd7ff2ebbe5fbcb15a533f189811 Mon Sep 17 00:00:00 2001 From: Alexander Sandercock <39815775+alex-sandercock@users.noreply.github.com> Date: Mon, 20 Apr 2026 11:13:21 -0400 Subject: [PATCH 61/80] fixed AD generation bug --- R/madc2vcf_all.R | 24 +++++++---- tests/testthat/test-madc2vcf_all.R | 67 +++++++++++++++++++++++++++++- 2 files changed, 82 insertions(+), 9 deletions(-) diff --git a/R/madc2vcf_all.R b/R/madc2vcf_all.R index 63c031d..e223977 100644 --- a/R/madc2vcf_all.R +++ b/R/madc2vcf_all.R @@ -138,7 +138,7 @@ madc2vcf_all <- function(madc, checks <- check_madc_sanity(report) messages_results <- mapply(function(check, message) { - if (check) message[1] else message[2] + if (isTRUE(check)) message[1] else message[2] }, checks$checks, checks$messages) for(i in seq_along(messages_results)) @@ -158,13 +158,23 @@ madc2vcf_all <- function(madc, # Check whether markers_info is present and contains Ref + Alt columns if(!is.null(markers_info)) { mi_df <- read.csv(markers_info) - # Standardize marker ID column to CloneID - if(!"CloneID" %in% colnames(mi_df) && "BI_markerID" %in% colnames(mi_df)) { - colnames(mi_df)[colnames(mi_df) == "BI_markerID"] <- "CloneID" - vmsg("markers_info: 'BI_markerID' column renamed to 'CloneID' for internal use", verbose = verbose, level = 1) - } else if(!"CloneID" %in% colnames(mi_df) && !"BI_markerID" %in% colnames(mi_df)) { + id_cols <- intersect(c("CloneID", "BI_markerID"), colnames(mi_df)) + if(!length(id_cols)) { stop("markers_info must contain a marker ID column named either 'CloneID' or 'BI_markerID'.") } + match_n <- vapply(id_cols, function(col) { + sum(unique(report$CloneID) %in% unique(stats::na.omit(mi_df[[col]]))) + }, integer(1)) + if(!any(match_n)) { + stop("None of the markers_info CloneID or BI_markerID values match the MADC CloneID column. Please make sure they use the same marker IDs.") + } + id_col <- id_cols[which.max(match_n)] + if(id_col != "CloneID" || !"CloneID" %in% colnames(mi_df)) { + mi_df$CloneID <- mi_df[[id_col]] + if(id_col == "BI_markerID") { + vmsg("markers_info: 'BI_markerID' column copied to 'CloneID' for internal use", verbose = verbose, level = 1) + } + } # Validate CloneID values if(any(is.na(mi_df$CloneID) | mi_df$CloneID == "")) stop("markers_info CloneID column contains empty or NA values. Please check your markers_info file.") @@ -910,7 +920,7 @@ merge_counts <- function(cloneID_unit, rm_multiallelic_SNP = FALSE, multiallelic info_mk <- paste0("DP=", sum(c(RefTag, AltTag,total)),";", "ADS=",sum(RefTag),",",sum(AltTag), ads) } else { - tab_counts <- paste0(RefTag + AltTag, ":", RefTag, ":", RefTag, AltTag) + tab_counts <- paste0(RefTag + AltTag, ":", RefTag, ":", RefTag, ",", AltTag) alts <- info$Alt info_mk <- paste0("DP=", sum(c(RefTag, AltTag)),";", "ADS=",sum(RefTag),",",sum(AltTag)) diff --git a/tests/testthat/test-madc2vcf_all.R b/tests/testthat/test-madc2vcf_all.R index c8c860f..1d6050f 100644 --- a/tests/testthat/test-madc2vcf_all.R +++ b/tests/testthat/test-madc2vcf_all.R @@ -66,6 +66,70 @@ test_that("test madc offtargets",{ }) +test_that("madc2vcf_all preserves comma-separated AD for biallelic targets", { + madc_file <- system.file("example_MADC_FixedAlleleID.csv", package="BIGr") + bot_file <- system.file("example_SNPs_DArTag-probe-design_f180bp.botloci", package="BIGr") + db_file <- system.file("example_allele_db.fa", package="BIGr") + temp <- tempfile(fileext = ".vcf") + + madc2vcf_all(madc = madc_file, + botloci_file = bot_file, + hap_seq_file = db_file, + n.cores = 1, + out_vcf = temp, + verbose = FALSE) + + vcf <- read.vcfR(temp, verbose = FALSE) + ad <- extract.gt(vcf, "AD") + biallelic <- !grepl(",", vcf@fix[, "ALT"]) + + expect_true(all(grepl("^[0-9]+,[0-9]+$", ad[biallelic, 1]))) +}) + +test_that("madc2vcf_all accepts BI_markerID matches when CloneID does not match", { + madc_file <- system.file("example_MADC_FixedAlleleID.csv", package="BIGr") + bot_file <- system.file("example_SNPs_DArTag-probe-design_f180bp.botloci", package="BIGr") + db_file <- system.file("example_allele_db.fa", package="BIGr") + temp <- tempfile(fileext = ".vcf") + temp_markers <- tempfile(fileext = ".csv") + + report <- read.csv(madc_file, check.names = FALSE) + marker_ids <- unique(report$CloneID) + markers_info <- data.frame( + CloneID = paste0("bogus_", seq_along(marker_ids)), + BI_markerID = marker_ids + ) + write.csv(markers_info, temp_markers, row.names = FALSE) + + expect_no_error( + madc2vcf_all(madc = madc_file, + botloci_file = bot_file, + hap_seq_file = db_file, + markers_info = temp_markers, + n.cores = 1, + out_vcf = temp, + verbose = FALSE) + ) +}) + +test_that("madc2vcf_all surfaces missing-column validation error without crashing", { + madc_file <- system.file("example_MADC_FixedAlleleID.csv", package="BIGr") + bot_file <- system.file("example_SNPs_DArTag-probe-design_f180bp.botloci", package="BIGr") + temp_madc <- tempfile(fileext = ".csv") + + report <- read.csv(madc_file, check.names = FALSE) + report$AlleleSequence <- NULL + write.csv(report, temp_madc, row.names = FALSE) + + expect_error( + madc2vcf_all(madc = temp_madc, + botloci_file = bot_file, + out_vcf = tempfile(fileext = ".vcf"), + verbose = FALSE), + "One or more required columns missing" + ) +}) + # ======================================================================= # Using Breeding-Insight/BIGapp-PanelHub test files # ======================================================================= @@ -220,7 +284,7 @@ test_that("simu alfalfa",{ markers_info = alfalfa_markers_info, out_vcf = out, verbose = FALSE), - regexp = "None of the markers_info CloneID values match the MADC CloneID column. Please make sure they use the same marker IDs." + regexp = "None of the markers_info CloneID( or BI_markerID)? values match the MADC CloneID column. Please make sure they use the same marker IDs." ) # Test error when markers_info_ChromPos is provided but IDs still don't match botloci @@ -528,4 +592,3 @@ test_that("simu alfalfa",{ ) }) }) - From 1ae386f95ca906ef294efad57aa522d3a8cee64a Mon Sep 17 00:00:00 2001 From: Alexander Sandercock <39815775+alex-sandercock@users.noreply.github.com> Date: Mon, 20 Apr 2026 11:13:41 -0400 Subject: [PATCH 62/80] improved truth check --- R/madc2vcf_multi.R | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/R/madc2vcf_multi.R b/R/madc2vcf_multi.R index 2ff7b0e..d22b8b4 100644 --- a/R/madc2vcf_multi.R +++ b/R/madc2vcf_multi.R @@ -81,7 +81,7 @@ madc2vcf_multi <- function(madc_file, checks <- check_madc_sanity(report) messages_results <- mapply(function(check, message) { - if (check) message[1] else message[2] + if (isTRUE(check)) message[1] else message[2] }, checks$checks, checks$messages) for (i in seq_along(messages_results)) @@ -193,4 +193,3 @@ madc2vcf_multi <- function(madc_file, invisible(NULL) } - From 49cb0a4843cfb486418204fed54198299fa817ca Mon Sep 17 00:00:00 2001 From: Alexander Sandercock <39815775+alex-sandercock@users.noreply.github.com> Date: Mon, 20 Apr 2026 11:14:58 -0400 Subject: [PATCH 63/80] support Marker_ID --- R/madc2vcf_targets.R | 27 ++++++++++---------- man/madc2vcf_targets.Rd | 15 +++++++---- tests/testthat/test-madc2vcf_targets.R | 35 ++++++++++++++++++++++++++ 3 files changed, 58 insertions(+), 19 deletions(-) diff --git a/R/madc2vcf_targets.R b/R/madc2vcf_targets.R index a16fdfb..d75a17f 100644 --- a/R/madc2vcf_targets.R +++ b/R/madc2vcf_targets.R @@ -67,14 +67,19 @@ #' @param botloci_file character or `NULL` (default `NULL`). Path to a plain-text #' file listing target IDs designed on the **bottom** strand (one ID per line). #' Used for strand-correcting probe sequences when `get_REF_ALT = TRUE` and -#' `markers_info` does not supply `Ref` and `Alt` columns. Also required when -#' `ChromPos` is invalid and `markers_info` does not provide `Ref`/`Alt`. +#' `markers_info` does not supply `Ref` and `Alt` columns. Not needed when +#' `markers_info` provides `Ref` and `Alt`, or when `get_REF_ALT = FALSE` and +#' `markers_info` provides `Chr` and `Pos`. Also required when `ChromPos` is +#' invalid and `markers_info` does not provide `Ref`/`Alt`. #' @param markers_info character or `NULL`. Optional path to a CSV providing target -#' metadata. Accepted columns: -#' - `CloneID`, `Marker_ID`, or `BI_markerID` (required as marker identifier); +#' metadata. Matching is done by column name, not column position. Accepted columns: +#' - one marker identifier column named `CloneID`, `Marker_ID`, or `BI_markerID` +#' (required; a generic `ID` column is not accepted); #' - `Chr`, `Pos` — required when `CloneID` does not follow the `Chr_Pos` format; #' - `Ref`, `Alt` — required when `get_REF_ALT = TRUE` and probe-sequence -#' inference is not possible (IUPAC codes, indels, or unfixed allele IDs). +#' inference is not possible (IUPAC codes, indels, or unfixed allele IDs). When +#' `get_REF_ALT = TRUE`, `botloci_file` is still required unless `Ref` and `Alt` +#' are supplied here. #' @param get_REF_ALT logical (default `FALSE`). If `TRUE`, attempts to recover #' REF/ALT bases. The source is chosen automatically: `markers_info` `Ref`/`Alt` #' columns take priority; otherwise probe sequences from the MADC are compared @@ -169,7 +174,7 @@ madc2vcf_targets <- function(madc_file, checks <- check_madc_sanity(report) messages_results <- mapply(function(check, message) { - if (check) message[1] else message[2] + if (isTRUE(check)) message[1] else message[2] }, checks$checks, checks$messages) for(i in seq_along(messages_results)) @@ -307,10 +312,7 @@ madc2vcf_targets <- function(madc_file, vmsg("Using markers_info for CHROM, POS, REF and ALT.", verbose = verbose, level = 0, type = ">>") if(is.null(mi_df)) mi_df <- read.csv(markers_info) - id_col <- if ("BI_markerID" %in% colnames(mi_df)) "BI_markerID" else - if ("CloneID" %in% colnames(mi_df)) "CloneID" else - if ("Marker_ID" %in% colnames(mi_df)) "Marker_ID" else - stop("The markers_info file must contain a marker ID column named either 'CloneID', 'Marker_ID' or 'BI_markerID'.") + id_col <- pick_markers_info_id_col(mi_df, rownames(ad_df)) if(checks$checks["Indels"]) vmsg("Indels detected in MADC file. But it is okay because Ref and Alt are provided in markers_info.", @@ -341,10 +343,7 @@ madc2vcf_targets <- function(madc_file, vmsg("markers_info file provided. Using CHROM and POS from the file.", verbose = verbose, level = 0, type = ">>") if(is.null(mi_df)) mi_df <- read.csv(markers_info) - id_col <- if ("BI_markerID" %in% colnames(mi_df)) "BI_markerID" else - if ("CloneID" %in% colnames(mi_df)) "CloneID" else - if ("Marker_ID" %in% colnames(mi_df)) "Marker_ID" else - stop("The markers_info file must contain a marker ID column named either 'CloneID', 'Marker_ID' or 'BI_markerID'.") + id_col <- pick_markers_info_id_col(mi_df, rownames(ad_df)) if(checks$checks["Indels"]) vmsg("Indels detected in MADC file. Since get_REF_ALT = FALSE, Type and Indel_pos are not required in markers_info.", diff --git a/man/madc2vcf_targets.Rd b/man/madc2vcf_targets.Rd index 25f99d5..8cab155 100644 --- a/man/madc2vcf_targets.Rd +++ b/man/madc2vcf_targets.Rd @@ -22,16 +22,21 @@ madc2vcf_targets( \item{botloci_file}{character or \code{NULL} (default \code{NULL}). Path to a plain-text file listing target IDs designed on the \strong{bottom} strand (one ID per line). Used for strand-correcting probe sequences when \code{get_REF_ALT = TRUE} and -\code{markers_info} does not supply \code{Ref} and \code{Alt} columns. Also required when -\code{ChromPos} is invalid and \code{markers_info} does not provide \code{Ref}/\code{Alt}.} +\code{markers_info} does not supply \code{Ref} and \code{Alt} columns. Not needed when +\code{markers_info} provides \code{Ref} and \code{Alt}, or when \code{get_REF_ALT = FALSE} and +\code{markers_info} provides \code{Chr} and \code{Pos}. Also required when \code{ChromPos} is +invalid and \code{markers_info} does not provide \code{Ref}/\code{Alt}.} \item{markers_info}{character or \code{NULL}. Optional path to a CSV providing target -metadata. Accepted columns: +metadata. Matching is done by column name, not column position. Accepted columns: \itemize{ -\item \code{CloneID}, \code{Marker_ID}, or \code{BI_markerID} (required as marker identifier); +\item one marker identifier column named \code{CloneID}, \code{Marker_ID}, or \code{BI_markerID} +(required; a generic \code{ID} column is not accepted); \item \code{Chr}, \code{Pos} — required when \code{CloneID} does not follow the \code{Chr_Pos} format; \item \code{Ref}, \code{Alt} — required when \code{get_REF_ALT = TRUE} and probe-sequence -inference is not possible (IUPAC codes, indels, or unfixed allele IDs). +inference is not possible (IUPAC codes, indels, or unfixed allele IDs). When +\code{get_REF_ALT = TRUE}, \code{botloci_file} is still required unless \code{Ref} and \code{Alt} +are supplied here. }} \item{get_REF_ALT}{logical (default \code{FALSE}). If \code{TRUE}, attempts to recover diff --git a/tests/testthat/test-madc2vcf_targets.R b/tests/testthat/test-madc2vcf_targets.R index b5e8a60..9ffc72e 100644 --- a/tests/testthat/test-madc2vcf_targets.R +++ b/tests/testthat/test-madc2vcf_targets.R @@ -87,6 +87,41 @@ test_that("bottom strand markers have correct REF/ALT", { rm(vcf_targets, temp_targets) }) +test_that("madc2vcf_targets preserves original sample names", { + madc_file <- system.file("example_MADC_FixedAlleleID.csv", package="BIGr") + temp_madc <- tempfile(fileext = ".csv") + temp_vcf <- tempfile(fileext = ".vcf") + + report <- read.csv(madc_file, check.names = FALSE) + colnames(report)[4:6] <- c("1A", "Sample-1", "sample 2") + write.csv(report, temp_madc, row.names = FALSE, quote = TRUE) + + suppressWarnings( + madc2vcf_targets(madc_file = temp_madc, output.file = temp_vcf, get_REF_ALT = FALSE) + ) + + vcf <- read.vcfR(temp_vcf, verbose = FALSE) + + expect_equal(colnames(vcf@gt)[2:4], c("1A", "Sample-1", "sample 2")) +}) + +test_that("madc2vcf_targets surfaces missing-column validation error without crashing", { + madc_file <- system.file("example_MADC_FixedAlleleID.csv", package="BIGr") + temp_madc <- tempfile(fileext = ".csv") + + report <- read.csv(madc_file, check.names = FALSE) + report$AlleleSequence <- NULL + write.csv(report, temp_madc, row.names = FALSE) + + expect_error( + madc2vcf_targets(madc_file = temp_madc, + output.file = tempfile(fileext = ".vcf"), + get_REF_ALT = FALSE, + verbose = FALSE), + "One or more required columns missing" + ) +}) + # ======================================================================= # Using Breeding-Insight/BIGapp-PanelHub test files From 22fc6e45d60e3cfb6c880037e40a0e74d995b120 Mon Sep 17 00:00:00 2001 From: Alexander Sandercock <39815775+alex-sandercock@users.noreply.github.com> Date: Mon, 20 Apr 2026 11:58:31 -0400 Subject: [PATCH 64/80] Update documentation for verbose message utility --- R/utils.R | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/R/utils.R b/R/utils.R index 59e5563..076a430 100644 --- a/R/utils.R +++ b/R/utils.R @@ -27,20 +27,21 @@ convert_to_dosage <- function(gt) { }) } -##' Verbose Message Utility -##' -##' Prints a formatted verbose message with timestamp, indentation, and type label, if verbose is TRUE. -##' -##' @param text Character string, the message to print (supports sprintf formatting). -##' @param verbose Logical. If TRUE, prints the message; if FALSE, suppresses output. -##' @param level Integer, indentation level (0=header, 1=main step, 2=detail, 3=sub-detail). -##' @param type Character string, message type (e.g., "INFO", "WARN", "ERROR"). Only shown for level 0. -##' @param ... Additional arguments passed to sprintf for formatting. -##' -##' @details Use the verbose argument to control message output. Typically, pass the function's verbose parameter to vmsg. -##' -##' @return No return value, called for side effects. -##' @export +#' Verbose Message Utility +#' +#' Prints a formatted verbose message with timestamp, indentation, and type label, if verbose is TRUE. +#' +#' @param text Character string, the message to print (supports sprintf formatting). +#' @param verbose Logical. If TRUE, prints the message; if FALSE, suppresses output. +#' @param level Integer, indentation level (0=header, 1=main step, 2=detail, 3=sub-detail). +#' @param type Character string, message type (e.g., "INFO", "WARN", "ERROR"). Only shown for level 0. +#' @param ... Additional arguments passed to sprintf for formatting. +#' +#' @details Use the verbose argument to control message output. Typically, pass the function's verbose parameter to vmsg. +#' +#' @return No return value, called for side effects. +#' @internal +#' @noRd vmsg <- function(text, verbose = FALSE, level = 1, type = ">>", ...) { if (!verbose) return(invisible()) # Format timestamp From 38dd6090d057bf10352f6dd250d10f9026002583 Mon Sep 17 00:00:00 2001 From: Alexander Sandercock <39815775+alex-sandercock@users.noreply.github.com> Date: Mon, 20 Apr 2026 12:00:24 -0400 Subject: [PATCH 65/80] fix exports --- NAMESPACE | 1 - R/utils.R | 3 +-- man/vmsg.Rd | 28 ---------------------------- 3 files changed, 1 insertion(+), 31 deletions(-) delete mode 100644 man/vmsg.Rd diff --git a/NAMESPACE b/NAMESPACE index e6cbc30..e9f2613 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -22,7 +22,6 @@ export(merge_MADCs) export(solve_composition_poly) export(thinSNP) export(updog2vcf) -export(vmsg) import(dplyr) import(janitor) import(parallel) diff --git a/R/utils.R b/R/utils.R index 076a430..a30c6f6 100644 --- a/R/utils.R +++ b/R/utils.R @@ -40,7 +40,6 @@ convert_to_dosage <- function(gt) { #' @details Use the verbose argument to control message output. Typically, pass the function's verbose parameter to vmsg. #' #' @return No return value, called for side effects. -#' @internal #' @noRd vmsg <- function(text, verbose = FALSE, level = 1, type = ">>", ...) { if (!verbose) return(invisible()) @@ -84,7 +83,7 @@ vmsg <- function(text, verbose = FALSE, level = 1, type = ">>", ...) { #' #' @keywords internal #' @noRd -#' +#' url_exists <- function(u) { tryCatch({ con <- url(u, open = "rb") diff --git a/man/vmsg.Rd b/man/vmsg.Rd deleted file mode 100644 index abcc768..0000000 --- a/man/vmsg.Rd +++ /dev/null @@ -1,28 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/utils.R -\name{vmsg} -\alias{vmsg} -\title{Verbose Message Utility} -\usage{ -vmsg(text, verbose = FALSE, level = 1, type = ">>", ...) -} -\arguments{ -\item{text}{Character string, the message to print (supports sprintf formatting).} - -\item{verbose}{Logical. If TRUE, prints the message; if FALSE, suppresses output.} - -\item{level}{Integer, indentation level (0=header, 1=main step, 2=detail, 3=sub-detail).} - -\item{type}{Character string, message type (e.g., "INFO", "WARN", "ERROR"). Only shown for level 0.} - -\item{...}{Additional arguments passed to sprintf for formatting.} -} -\value{ -No return value, called for side effects. -} -\description{ -Prints a formatted verbose message with timestamp, indentation, and type label, if verbose is TRUE. -} -\details{ -Use the verbose argument to control message output. Typically, pass the function's verbose parameter to vmsg. -} From 56336da56c8c346f014e9c0879bf17c98ef4331f Mon Sep 17 00:00:00 2001 From: Alexander Sandercock <39815775+alex-sandercock@users.noreply.github.com> Date: Mon, 20 Apr 2026 12:03:58 -0400 Subject: [PATCH 66/80] skipping if offline --- tests/testthat/test-check_madc_sanity.R | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/testthat/test-check_madc_sanity.R b/tests/testthat/test-check_madc_sanity.R index b1a3857..4b60cfe 100644 --- a/tests/testthat/test-check_madc_sanity.R +++ b/tests/testthat/test-check_madc_sanity.R @@ -1,4 +1,5 @@ test_that("check madc",{ + skip_if_offline("raw.githubusercontent.com") github_path <- "https://raw.githubusercontent.com/Breeding-Insight/BIGapp-PanelHub/refs/heads/long_seq/test_madcs/" names <- c("Columns", "FixAlleleIDs", "IUPACcodes", "LowerCase", "Indels", "ChromPos", "allNAcol", "allNArow", "RefAltSeqs", "OtherAlleles") From 089e8fdb42bcaf5b6999e107a9b078e2725cf6b4 Mon Sep 17 00:00:00 2001 From: Cristianetaniguti Date: Tue, 21 Apr 2026 09:26:34 -0400 Subject: [PATCH 67/80] madc2vcf_multi better function description --- R/madc2vcf_multi.R | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/R/madc2vcf_multi.R b/R/madc2vcf_multi.R index d22b8b4..e71cf0f 100644 --- a/R/madc2vcf_multi.R +++ b/R/madc2vcf_multi.R @@ -1,11 +1,9 @@ #' Convert MADC file to VCF using polyRAD for multiallelic genotyping #' -#' This function converts a DArTag MADC file to a VCF using the polyRAD package's -#' `readDArTag` and `RADdata2VCF` pipeline. It runs `check_madc_sanity` before -#' loading the data, applies corrections for lowercase sequences and all-NA -#' rows/columns, and sets `n.header.rows` automatically based on whether the -#' MADC file follows the raw DArT format (6 header rows) or the fixed allele ID -#' format (no header rows). +#' This function converts a DArTag fixed allele ID MADC file to a VCF +#' containing multiallelic markers based on the microhaplotypes using +#' the polyRAD package's `readDArTag`, `IterateHWE` population model +#' and `RADdata2VCF` pipeline. #' #' @param madc_file character. Path or URL to the input MADC CSV file. #' @param botloci_file character. Path or URL to the botloci file listing target From 8ee0b81e36339d9fc76b9ef1bd46647bdabc1d43 Mon Sep 17 00:00:00 2001 From: Cristianetaniguti Date: Tue, 21 Apr 2026 09:27:18 -0400 Subject: [PATCH 68/80] roxygenise --- man/madc2vcf_multi.Rd | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/man/madc2vcf_multi.Rd b/man/madc2vcf_multi.Rd index 70bc59d..b6a3324 100644 --- a/man/madc2vcf_multi.Rd +++ b/man/madc2vcf_multi.Rd @@ -35,12 +35,10 @@ Default is 2.} Invisible NULL. Writes a VCF file to \code{outfile}. } \description{ -This function converts a DArTag MADC file to a VCF using the polyRAD package's -\code{readDArTag} and \code{RADdata2VCF} pipeline. It runs \code{check_madc_sanity} before -loading the data, applies corrections for lowercase sequences and all-NA -rows/columns, and sets \code{n.header.rows} automatically based on whether the -MADC file follows the raw DArT format (6 header rows) or the fixed allele ID -format (no header rows). +This function converts a DArTag fixed allele ID MADC file to a VCF +containing multiallelic markers based on the microhaplotypes using +the polyRAD package's \code{readDArTag}, \code{IterateHWE} population model +and \code{RADdata2VCF} pipeline. } \details{ The function performs the following steps: From 555dda7547a50cdfbb16c2fc1f43d2e89c0864b6 Mon Sep 17 00:00:00 2001 From: "josue.chinchilla" Date: Wed, 22 Apr 2026 10:58:32 -0400 Subject: [PATCH 69/80] Updated parentage functions based on Meng's feedback: Validate Ped trios with low markers will still be flagged but now will show recommendations When no parent pair passes the threshold of errors, they will still be shown in the final report Find Parents Fixed formatting of final output when ties on best,pair were found. Implemented vectorization and improvements on efficiency. When two recommendations are tied on error %, the tiebreaker is the number of markers tested. the option with the highest # of markers testes takes priority. --- NAMESPACE | 1 - R/find_parentage.R | 301 +++++++++++++++--------- R/validate_pedigree.R | 182 +++++--------- man/find_parentage.Rd | 65 +++-- man/validate_pedigree.Rd | 60 +---- tests/testthat/test-validate_pedigree.R | 2 +- 6 files changed, 317 insertions(+), 294 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index 5081f3b..7224af9 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -38,7 +38,6 @@ importFrom(Biostrings,DNAString) importFrom(Biostrings,reverseComplement) importFrom(Rdpack,reprompt) importFrom(Rsamtools,bgzip) -importFrom(data.table,":=") importFrom(data.table,CJ) importFrom(data.table,as.data.table) importFrom(data.table,copy) diff --git a/R/find_parentage.R b/R/find_parentage.R index 83279cc..dcf3ed0 100644 --- a/R/find_parentage.R +++ b/R/find_parentage.R @@ -32,10 +32,13 @@ #' parentage assignment as confident. Assignments above this threshold are #' flagged as \code{HIGH_ERROR} in the \code{Assignment_Status} column #' (default: \code{5.0}). Must be between 0 and 100. -#' @param show_ties Logical. If \code{TRUE}, all tied best pairs are reported -#' as additional columns (\code{Male_Parent_1}, \code{Male_Parent_2}, etc.) -#' when \code{method = "best_pair"}. If \code{FALSE}, only one tied pair is -#' reported with a warning. Default is \code{TRUE}. +#' @param show_ties Logical. If \code{TRUE}, all tied best pairs (after +#' tie-breaking by maximum markers tested) are reported as additional columns +#' (\code{Male_Parent_1}, \code{Male_Parent_2}, etc.) when +#' \code{method = "best_pair"}. The base columns (\code{Male_Parent}, +#' \code{Female_Parent}, etc.) are always populated with the top result. +#' If \code{FALSE}, only one tied pair is reported with a warning. +#' Default is \code{TRUE}. #' @param allow_selfing Logical. If \code{FALSE}, male-female parent pairs where #' both IDs are identical are excluded when \code{method = "best_pair"}. #' Default is \code{TRUE}. @@ -45,30 +48,52 @@ #' \code{parentage_results_dt.txt} in the working directory. Default is #' \code{TRUE}. #' -#' @return A \code{data.table} with one row per progeny (or more if ties are -#' reported). Columns depend on the method used: +#' @return A \code{data.table} with one row per progeny. Columns depend on the +#' method used: #' \itemize{ #' \item \code{best_male_parent} / \code{best_female_parent} / \code{best_match}: #' \code{Progeny}, \code{Best_Match}, \code{Mendelian_Error_Pct}, #' \code{Markers_Tested}, \code{Assignment_Status}. -#' \item \code{best_pair} (no ties): \code{Progeny}, \code{Male_Parent}, -#' \code{Female_Parent}, \code{Mendelian_Error_Pct}, \code{Markers_Tested}, -#' \code{Assignment_Status}. -#' \item \code{best_pair} (with ties): columns are suffixed \code{_1}, -#' \code{_2}, etc. for each tied pair, plus \code{Assignment_Status}. +#' \item \code{best_pair} (no ties after tie-breaking): \code{Progeny}, +#' \code{Male_Parent}, \code{Female_Parent}, \code{Mendelian_Error_Pct}, +#' \code{Markers_Tested}, \code{Assignment_Status}. +#' \item \code{best_pair} (ties remain after tie-breaking, +#' \code{show_ties = TRUE}): base columns are always populated with the +#' top result, plus suffix columns \code{Male_Parent_1}, +#' \code{Female_Parent_1}, etc. for each tied pair. #' } #' \code{Assignment_Status} is one of \code{PASS}, \code{HIGH_ERROR}, or #' \code{LOW_MARKERS}. Returned invisibly when \code{verbose = TRUE}. #' #' @details +#' A homozygous-only genotype matrix is pre-computed once at startup and shared +#' across all methods that require it, avoiding redundant computation. +#' #' For \code{"best_male_parent"}, \code{"best_female_parent"}, and #' \code{"best_match"}, only homozygous markers (coded 0 or 2) are used for #' comparison; heterozygous markers (coded 1) are set to \code{NA}. This #' reduces false mismatches caused by phase ambiguity. #' #' For \code{"best_pair"}, all markers are used and full Mendelian inheritance -#' rules are applied across all possible male-female parent combinations via -#' \code{data.table::CJ()}. +#' rules are applied. Mismatch rates and comparison counts are computed across +#' all progeny simultaneously using vectorised \code{vapply} calls, producing +#' \code{n_pairs x n_progeny} matrices and giving substantial speed gains for +#' large datasets. Both matrices are explicitly coerced to matrix form to +#' handle the edge case of a single parent pair correctly. +#' +#' When multiple pairs share the lowest Mendelian error rate, ties are broken +#' by selecting the pair(s) with the greatest number of markers tested. If ties +#' still remain after this step, all remaining tied pairs are reported when +#' \code{show_ties = TRUE}. +#' +#' The base columns (\code{Male_Parent}, \code{Female_Parent}, +#' \code{Mendelian_Error_Pct}, \code{Markers_Tested}) are always populated with +#' the top result, ensuring no missing values in these columns regardless of +#' tie status. +#' +#' Output rows are pre-allocated as a \code{data.table} and filled by reference +#' using \code{data.table::set()}, avoiding repeated memory allocation during +#' the results-building step. #' #' Individuals in \code{parents_file} or \code{progeny_file} that are absent #' from \code{genotypes_file} are removed with a warning. @@ -81,14 +106,14 @@ #' \dontrun{ #' # Assign best male-female parent pair to each progeny #' results <- find_parentage( -#' genotypes_file = "genotypes.txt", -#' parents_file = "parents.txt", -#' progeny_file = "progeny.txt", -#' method = "best_pair", -#' min_markers = 50, +#' genotypes_file = "genotypes.txt", +#' parents_file = "parents.txt", +#' progeny_file = "progeny.txt", +#' method = "best_pair", +#' min_markers = 50, #' error_threshold = 5.0, -#' show_ties = TRUE, -#' allow_selfing = FALSE +#' show_ties = TRUE, +#' allow_selfing = FALSE #' ) #' #' # Find best individual parent match (ignoring sex) @@ -105,13 +130,13 @@ #' @importFrom data.table fread fwrite copy CJ rbindlist set data.table as.data.table #' @export find_parentage <- function(genotypes_file, parents_file, progeny_file, - method = "best_pair", - min_markers = 10, + method = "best_pair", + min_markers = 10, error_threshold = 5.0, - show_ties = TRUE, - allow_selfing = TRUE, - verbose = TRUE, - write_txt = TRUE) { + show_ties = TRUE, + allow_selfing = TRUE, + verbose = TRUE, + write_txt = TRUE) { #### Input Validation and Data Loading #### allowed_methods <- c("best_male_parent", "best_female_parent", "best_match", "best_pair") @@ -132,14 +157,14 @@ find_parentage <- function(genotypes_file, parents_file, progeny_file, valid_ids <- genos$ID removed_parents <- base::setdiff(all_parents$ID, valid_ids) - if (length(removed_parents) > 0) { + if (base::length(removed_parents) > 0) { warning("The following parent IDs were not in the genotype file and will not be analyzed: ", paste(removed_parents, collapse = ", "), call. = FALSE) all_parents <- all_parents[ID %in% valid_ids] } removed_progeny <- base::setdiff(progeny_candidates$ID, valid_ids) - if (length(removed_progeny) > 0) { + if (base::length(removed_progeny) > 0) { warning("The following progeny IDs were not in the genotype file and will not be analyzed: ", paste(removed_progeny, collapse = ", "), call. = FALSE) progeny_candidates <- progeny_candidates[ID %in% valid_ids] @@ -161,6 +186,17 @@ find_parentage <- function(genotypes_file, parents_file, progeny_file, if (base::nrow(progeny_candidates) == 0) stop("No valid progeny candidates remain after filtering.") + #### Pre-compute genotype matrices once (shared across all methods) #### + ## Full genotype matrix used by best_pair + genos_mat <- base::as.matrix(genos, rownames = "ID") + + ## Homozygous-only matrix (het markers set to NA) used by hom methods + genos_hom <- data.table::copy(genos) + marker_cols <- base::setdiff(base::names(genos_hom), "ID") + for (col in marker_cols) + genos_hom[base::get(col) == 1, (col) := NA_integer_] + genos_hom_mat <- base::as.matrix(genos_hom, rownames = "ID") + #### Helper: assign Assignment_Status from markers and error rate #### ## Returns LOW_MARKERS, HIGH_ERROR, or PASS assign_status <- function(markers, error_pct) { @@ -171,22 +207,28 @@ find_parentage <- function(genotypes_file, parents_file, progeny_file, #### Logic for Homozygous Matching Methods #### if (method %in% c("best_male_parent", "best_female_parent", "best_match")) { - genos_hom <- data.table::copy(genos) - marker_cols <- base::setdiff(base::names(genos_hom), "ID") - for (col in marker_cols) - genos_hom[base::get(col) == 1, (col) := NA_integer_] - parent_ids <- base::switch(method, "best_male_parent" = male_parent_candidates$ID, "best_female_parent" = female_parent_candidates$ID, "best_match" = base::union(male_parent_candidates$ID, female_parent_candidates$ID)) - parent_genos <- base::as.matrix(genos_hom[ID %in% parent_ids], rownames = "ID") - progeny_genos <- base::as.matrix(genos_hom[ID %in% progeny_candidates$ID], rownames = "ID") - - results_list <- base::lapply(base::rownames(progeny_genos), function(progeny_id) { - progeny_vec <- progeny_genos[progeny_id, ] + ## Subset pre-computed homozygous matrix for relevant parents and progeny + parent_genos <- genos_hom_mat[base::rownames(genos_hom_mat) %in% parent_ids, , drop = FALSE] + progeny_genos <- genos_hom_mat[base::rownames(genos_hom_mat) %in% progeny_candidates$ID, , drop = FALSE] + + ## Pre-allocate results data.table; fill by reference with set() + n_progeny <- base::nrow(progeny_genos) + results_dt <- data.table::data.table( + Progeny = base::rownames(progeny_genos), + Best_Match = NA_character_, + Mendelian_Error_Pct = NA_real_, + Markers_Tested = NA_integer_, + Assignment_Status = NA_character_ + ) + + for (i in base::seq_len(n_progeny)) { + progeny_vec <- progeny_genos[i, ] mismatches <- base::rowSums(parent_genos != progeny_vec, na.rm = TRUE) comparisons <- base::rowSums(!base::is.na(parent_genos) & !base::is.na(progeny_vec)) percent_mismatch <- (mismatches / comparisons) * 100 @@ -194,36 +236,28 @@ find_parentage <- function(genotypes_file, parents_file, progeny_file, best_idx <- base::which.min(percent_mismatch) - # No candidate found — return NA row flagged LOW_MARKERS + # No candidate found — flag LOW_MARKERS and continue if (base::length(best_idx) == 0) { - return(data.table::data.table( - Progeny = progeny_id, - Best_Match = NA_character_, - Mendelian_Error_Pct = NA_real_, - Markers_Tested = 0L, - Assignment_Status = "LOW_MARKERS" - )) + data.table::set(results_dt, i, "Markers_Tested", 0L) + data.table::set(results_dt, i, "Assignment_Status", "LOW_MARKERS") + next } best_markers <- comparisons[best_idx] best_error <- base::round(percent_mismatch[best_idx], 2) - data.table::data.table( - Progeny = progeny_id, - Best_Match = base::rownames(parent_genos)[best_idx], - Mendelian_Error_Pct = best_error, - Markers_Tested = best_markers, - Assignment_Status = assign_status(best_markers, best_error) - ) - }) + data.table::set(results_dt, i, "Best_Match", base::rownames(parent_genos)[best_idx]) + data.table::set(results_dt, i, "Mendelian_Error_Pct", best_error) + data.table::set(results_dt, i, "Markers_Tested", base::as.integer(best_markers)) + data.table::set(results_dt, i, "Assignment_Status", assign_status(best_markers, best_error)) + } - final_df <- data.table::rbindlist(results_list) + final_df <- results_dt } #### Logic for Best Pair Method #### if (method == "best_pair") { - genos_mat <- base::as.matrix(genos, rownames = "ID") parent_pairs <- data.table::CJ(Male_Parent = male_parent_candidates$ID, Female_Parent = female_parent_candidates$ID) @@ -234,43 +268,90 @@ find_parentage <- function(genotypes_file, parents_file, progeny_file, if (base::nrow(parent_pairs) == 0) stop("No valid parent pairs to test.") + ## Pre-extract parent genotype row blocks for vectorised operations male_parent_genos_mat <- genos_mat[parent_pairs$Male_Parent, , drop = FALSE] female_parent_genos_mat <- genos_mat[parent_pairs$Female_Parent, , drop = FALSE] - results_list <- base::lapply(progeny_candidates$ID, function(prog_id) { - - progeny_vec <- genos_mat[prog_id, ] - - mismatches <- base::rowSums( - (male_parent_genos_mat == 0 & female_parent_genos_mat == 0 & progeny_vec > 0) | - (male_parent_genos_mat == 2 & female_parent_genos_mat == 2 & progeny_vec < 2) | - ((male_parent_genos_mat == 0 & female_parent_genos_mat == 1) | - (male_parent_genos_mat == 1 & female_parent_genos_mat == 0)) & (progeny_vec == 2) | - ((male_parent_genos_mat == 2 & female_parent_genos_mat == 1) | - (male_parent_genos_mat == 1 & female_parent_genos_mat == 2)) & (progeny_vec == 0) | - ((male_parent_genos_mat == 0 & female_parent_genos_mat == 2) | - (male_parent_genos_mat == 2 & female_parent_genos_mat == 0)) & (progeny_vec != 1), - na.rm = TRUE - ) - - comparisons <- base::rowSums(!base::is.na(male_parent_genos_mat) & - !base::is.na(female_parent_genos_mat) & - !base::is.na(progeny_vec)) - percent_mismatch <- (mismatches / comparisons) * 100 - percent_mismatch[base::is.nan(percent_mismatch)] <- NA - + ## Subset full genotype matrix to progeny only + progeny_ids <- progeny_candidates$ID + progeny_mat <- genos_mat[progeny_ids, , drop = FALSE] + n_progeny <- base::nrow(progeny_mat) + n_pairs <- base::nrow(parent_pairs) + + ## Vectorised mismatch computation across ALL progeny at once + ## Result: matrix of dimensions n_pairs x n_progeny + ## Explicitly wrapped in matrix() to handle the n_pairs = 1 edge case + mismatch_mat <- base::matrix( + base::vapply(base::seq_len(n_progeny), function(j) { + progeny_vec <- progeny_mat[j, ] + base::rowSums( + (male_parent_genos_mat == 0 & female_parent_genos_mat == 0 & progeny_vec > 0) | + (male_parent_genos_mat == 2 & female_parent_genos_mat == 2 & progeny_vec < 2) | + ((male_parent_genos_mat == 0 & female_parent_genos_mat == 1) | + (male_parent_genos_mat == 1 & female_parent_genos_mat == 0)) & (progeny_vec == 2) | + ((male_parent_genos_mat == 2 & female_parent_genos_mat == 1) | + (male_parent_genos_mat == 1 & female_parent_genos_mat == 2)) & (progeny_vec == 0) | + ((male_parent_genos_mat == 0 & female_parent_genos_mat == 2) | + (male_parent_genos_mat == 2 & female_parent_genos_mat == 0)) & (progeny_vec != 1), + na.rm = TRUE + ) + }, numeric(n_pairs)), + nrow = n_pairs, ncol = n_progeny + ) + + ## Vectorised comparison count across ALL progeny at once + ## Result: matrix of dimensions n_pairs x n_progeny + ## Explicitly wrapped in matrix() to handle the n_pairs = 1 edge case + comparison_mat <- base::matrix( + base::vapply(base::seq_len(n_progeny), function(j) { + progeny_vec <- progeny_mat[j, ] + base::rowSums(!base::is.na(male_parent_genos_mat) & + !base::is.na(female_parent_genos_mat) & + !base::is.na(progeny_vec)) + }, numeric(n_pairs)), + nrow = n_pairs, ncol = n_progeny + ) + + ## Percent mismatch matrix: n_pairs x n_progeny + pct_mismatch_mat <- (mismatch_mat / comparison_mat) * 100 + pct_mismatch_mat[base::is.nan(pct_mismatch_mat)] <- NA + + ## Pre-allocate base results data.table; tie columns added dynamically + results_dt <- data.table::data.table( + Progeny = progeny_ids, + Male_Parent = NA_character_, + Female_Parent = NA_character_, + Mendelian_Error_Pct = NA_character_, + Markers_Tested = NA_integer_, + Assignment_Status = NA_character_ + ) + + ## Per-progeny result extraction from pre-computed matrices + results_list <- base::vector("list", n_progeny) + + for (j in base::seq_len(n_progeny)) { + + prog_id <- progeny_ids[j] + percent_mismatch <- pct_mismatch_mat[, j] + comparisons <- comparison_mat[, j] min_mismatch_val <- base::min(percent_mismatch, na.rm = TRUE) - # No markers overlap at all — flag LOW_MARKERS, return early + # No markers overlap at all — flag LOW_MARKERS if (base::is.infinite(min_mismatch_val)) { - return(data.table::data.table( - Progeny = prog_id, - Markers_Tested = 0L, - Assignment_Status = "LOW_MARKERS" - )) + data.table::set(results_dt, j, "Markers_Tested", 0L) + data.table::set(results_dt, j, "Assignment_Status", "LOW_MARKERS") + next } best_indices <- base::which(percent_mismatch == min_mismatch_val) + + # Tie-break: prefer pair(s) with the most markers tested + if (base::length(best_indices) > 1) { + best_markers_per_pair <- comparisons[best_indices] + max_markers <- base::max(best_markers_per_pair) + best_indices <- best_indices[best_markers_per_pair == max_markers] + } + best_pairs <- parent_pairs[best_indices] best_markers <- comparisons[best_indices[1]] best_error <- base::round(min_mismatch_val, 2) @@ -283,42 +364,52 @@ find_parentage <- function(genotypes_file, parents_file, progeny_file, num_to_report <- base::min(base::nrow(best_pairs), if (show_ties) base::nrow(best_pairs) else 1) - result_row <- base::list(Progeny = prog_id) - - if (num_to_report == 1) { - result_row[["Male_Parent"]] <- best_pairs$Male_Parent[1] - result_row[["Female_Parent"]] <- best_pairs$Female_Parent[1] - result_row[["Mendelian_Error_Pct"]] <- base::sprintf("%.2f", min_mismatch_val) - result_row[["Markers_Tested"]] <- best_markers - result_row[["Assignment_Status"]] <- a_status - } else { + # Always populate base columns with the top result + data.table::set(results_dt, j, "Male_Parent", best_pairs$Male_Parent[1]) + data.table::set(results_dt, j, "Female_Parent", best_pairs$Female_Parent[1]) + data.table::set(results_dt, j, "Mendelian_Error_Pct", base::sprintf("%.2f", min_mismatch_val)) + data.table::set(results_dt, j, "Markers_Tested", base::as.integer(best_markers)) + data.table::set(results_dt, j, "Assignment_Status", a_status) + + # If ties remain after tie-breaking and show_ties is TRUE, + # store tie details for later column binding + if (show_ties && num_to_report > 1) { + tie_row <- base::list(Progeny = prog_id) for (k in base::seq_len(num_to_report)) { - result_row[[base::paste0("Male_Parent_", k)]] <- best_pairs$Male_Parent[k] - result_row[[base::paste0("Female_Parent_", k)]] <- best_pairs$Female_Parent[k] - result_row[[base::paste0("Mendelian_Error_Pct_", k)]] <- min_mismatch_val - result_row[[base::paste0("Markers_Tested_", k)]] <- comparisons[best_indices[k]] + tie_row[[base::paste0("Male_Parent_", k)]] <- best_pairs$Male_Parent[k] + tie_row[[base::paste0("Female_Parent_", k)]] <- best_pairs$Female_Parent[k] + tie_row[[base::paste0("Mendelian_Error_Pct_", k)]] <- min_mismatch_val + tie_row[[base::paste0("Markers_Tested_", k)]] <- comparisons[best_indices[k]] } - result_row[["Assignment_Status"]] <- a_status + results_list[[j]] <- data.table::as.data.table(tie_row) } + } - data.table::as.data.table(result_row) - }) + # Merge any tie suffix columns onto the pre-allocated base table + tie_rows <- data.table::rbindlist( + base::Filter(Negate(base::is.null), results_list), + fill = TRUE + ) - final_df <- data.table::rbindlist(results_list, fill = TRUE) + if (base::nrow(tie_rows) > 0) { + final_df <- merge(results_dt, tie_rows, by = "Progeny", all.x = TRUE) + } else { + final_df <- results_dt + } } #### Summary #### if (verbose) { - total <- base::nrow(final_df) - a_counts <- base::table(final_df$Assignment_Status) + total <- base::nrow(final_df) + a_counts <- base::table(final_df$Assignment_Status) base::cat("\n--- Parentage Assignment Summary ---\n") base::cat("Total progeny evaluated:", total, "\n") for (s in base::names(a_counts)) base::cat(base::sprintf(" %-14s: %d (%.1f%%)\n", s, a_counts[s], (a_counts[s] / total) * 100)) - base::cat("Min markers threshold :", min_markers, "\n") - base::cat("Error threshold :", error_threshold, "%\n\n") + base::cat("Min markers threshold :", min_markers, "\n") + base::cat("Error threshold :", error_threshold, "%\n\n") } #### Output #### diff --git a/R/validate_pedigree.R b/R/validate_pedigree.R index b34bc1b..8b2ecbc 100644 --- a/R/validate_pedigree.R +++ b/R/validate_pedigree.R @@ -19,8 +19,8 @@ #' @param min_markers Integer. Minimum number of non-missing markers #' required to evaluate a trio (default: 10). #' @param single_parent_error_threshold Numeric. Maximum homozygous-marker -#' mismatch percentage for a parent to be considered acceptable in a -#' failed trio (default: 2.0). Must be between 0 and 100. +#' mismatch percentage for a parent to be considered acceptable during +#' parent-level evaluation (default: 2.0). Must be between 0 and 100. #' @param verbose Logical. If TRUE, prints progress messages, a summary #' table, and results to the console (default: TRUE). #' @param write_txt Logical. If TRUE, writes validation results to @@ -38,59 +38,18 @@ #' \item{Markers_Tested}{Number of markers with non-missing genotypes.} #' \item{Status}{One of PASS, FAIL, LOW_MARKERS, NO_DATA, FOUNDERS, #' MISSING_MALE_PARENT, MISSING_FEMALE_PARENT, MISSING_BOTH_PARENTS, -#' or NO_GENOTYPE_DATA (trio present in pedigree but absent from -#' the genotype file).} +#' or NO_GENOTYPE_DATA.} #' \item{Correction_Decision}{One of NONE, KEEP_BOTH, -#' REMOVE_MALE_PARENT, REMOVE_FEMALE_PARENT, REMOVE_BOTH.} -#' \item{Male_Parent_Hom_Error_Pct}{Male parent homozygous-marker mismatch -#' percentage (NA unless Status == "FAIL").} -#' \item{Female_Parent_Hom_Error_Pct}{Female parent homozygous-marker -#' mismatch percentage (NA unless Status == "FAIL").} +#' REMOVE_MALE_PARENT, REMOVE_FEMALE_PARENT, REMOVE_BOTH, +#' LOW_MARKERS_KEEP_BOTH, LOW_MARKERS_REMOVE_MALE_PARENT, +#' LOW_MARKERS_REMOVE_FEMALE_PARENT, LOW_MARKERS_REMOVE_BOTH.} +#' \item{Male_Parent_Hom_Error_Pct}{Male parent homozygous-marker mismatch percentage.} +#' \item{Female_Parent_Hom_Error_Pct}{Female parent homozygous-marker mismatch percentage.} #' \item{Best_Male_Parent}{Best-matching male parent candidate ID.} -#' \item{Best_Male_Parent_Error_Pct}{Homozygous mismatch percentage for -#' Best_Male_Parent.} +#' \item{Best_Male_Parent_Error_Pct}{Homozygous mismatch percentage for the best male parent candidate.} #' \item{Best_Female_Parent}{Best-matching female parent candidate ID.} -#' \item{Best_Female_Parent_Error_Pct}{Homozygous mismatch percentage for -#' Best_Female_Parent.} +#' \item{Best_Female_Parent_Error_Pct}{Homozygous mismatch percentage for the best female parent candidate.} #' } -#' -#' @details -#' All trios in the pedigree file are represented in the output. Trios where -#' the progeny or a declared parent is absent from the genotype file are -#' flagged as NO_GENOTYPE_DATA and are excluded from Mendelian error analysis -#' but retained in the final report and summary counts. -#' -#' Trios with missing parents (coded as 0) that are not listed as founders -#' receive a MISSING_MALE_PARENT, MISSING_FEMALE_PARENT, or -#' MISSING_BOTH_PARENTS status. Recommendations are provided in the -#' Best_Male_Parent and Best_Female_Parent columns, but the 0 values are -#' preserved in the corrected pedigree output. -#' -#' If founders_file is provided, any trio where the individual is listed as -#' a founder and both parents are coded as 0 is flagged as FOUNDERS; no -#' recommendation or correction is attempted. -#' -#' A corrected pedigree with failed parents replaced by 0 is always written -#' to corrected_pedigree.txt. -#' -#' @examples -#' \dontrun{ -#' # Basic run -#' results <- validate_pedigree("pedigree.txt", "genotypes.txt") -#' -#' # With founders list and stricter thresholds -#' results <- validate_pedigree( -#' pedigree_file = "pedigree.txt", -#' genotypes_file = "genotypes.txt", -#' founders_file = "founders.txt", -#' trio_error_threshold = 2.0, -#' single_parent_error_threshold = 1.0, -#' verbose = FALSE, -#' output_filename = "my_validation.txt" -#' ) -#' } -#' -#' @importFrom data.table fread fwrite rbindlist copy data.table := set #' @export validate_pedigree <- function(pedigree_file, genotypes_file, founders_file = NULL, @@ -101,18 +60,6 @@ validate_pedigree <- function(pedigree_file, genotypes_file, write_txt = TRUE, output_filename = "pedigree_validation_results.txt") { - #### Read founders list #### - if (!is.null(founders_file)) { - founders_raw <- tryCatch({ - data.table::fread(founders_file, header = FALSE, colClasses = "character") - }, error = function(e) { - stop("Could not read founders list. Ensure it is a plain text or CSV/TSV file.") - }) - founder_ids <- unique(founders_raw[[1]]) - } else { - founder_ids <- character(0) - } - #### Input validation #### if (trio_error_threshold < 0 || trio_error_threshold > 100) stop("trio_error_threshold must be between 0 and 100") @@ -120,8 +67,7 @@ validate_pedigree <- function(pedigree_file, genotypes_file, stop("single_parent_error_threshold must be between 0 and 100") tryCatch({ - pedigree <- data.table::fread(pedigree_file, sep = "auto", - colClasses = "character") + pedigree <- data.table::fread(pedigree_file, sep = "auto", colClasses = "character") genos <- data.table::fread(genotypes_file, sep = "auto") }, error = function(e) { stop("Error reading input files. Ensure paths are correct and files are TXT/TSV/CSV.") @@ -139,19 +85,26 @@ validate_pedigree <- function(pedigree_file, genotypes_file, # Ensure parent columns are character for consistent "0" comparisons pedigree[, Male_Parent := as.character(Male_Parent)] pedigree[, Female_Parent := as.character(Female_Parent)] - original_pedigree <- data.table::copy(pedigree) + #### Read founders list #### + if (!is.null(founders_file)) { + founders_raw <- tryCatch({ + data.table::fread(founders_file, header = FALSE, colClasses = "character") + }, error = function(e) { + stop("Could not read founders list. Ensure it is a plain text or CSV/TSV file.") + }) + founder_ids <- unique(founders_raw[[1]]) + } else { + founder_ids <- character(0) + } + #### Identify trios missing from the genotype file #### - ## These are retained in the output with Status = NO_GENOTYPE_DATA - ## rather than silently dropped valid_ids <- as.character(genos$ID) - has_geno <- pedigree[ID %in% valid_ids & (Male_Parent %in% valid_ids | Male_Parent == "0") & (Female_Parent %in% valid_ids | Female_Parent == "0")] - # Correct operator precedence: ! negates the full %in% expression no_geno_rows <- pedigree[!(ID %in% valid_ids) | (!(Male_Parent %in% valid_ids) & Male_Parent != "0") | (!(Female_Parent %in% valid_ids) & Female_Parent != "0")] @@ -161,7 +114,6 @@ validate_pedigree <- function(pedigree_file, genotypes_file, "trios with missing genotype data; flagged as NO_GENOTYPE_DATA.\n") pedigree <- has_geno - if (base::nrow(pedigree) == 0) stop("No valid trios remain after filtering for genotype availability.") @@ -181,6 +133,7 @@ validate_pedigree <- function(pedigree_file, genotypes_file, c(prog_id, exclude_ids)) if (base::length(candidates) == 0) return(base::list(id = NA_character_, error_pct = NA_real_)) + prog_hom <- genos_hom_mat[prog_id, ] errors <- base::sapply(candidates, function(cand_id) { cand_hom <- genos_hom_mat[cand_id, ] @@ -188,8 +141,9 @@ validate_pedigree <- function(pedigree_file, genotypes_file, if (comparisons == 0) return(NA_real_) (base::sum(cand_hom != prog_hom, na.rm = TRUE) / comparisons) * 100 }) + best_idx <- base::which.min(errors) - base::list(id = candidates[best_idx], + base::list(id = candidates[best_idx], error_pct = base::round(errors[best_idx], 2)) } @@ -224,35 +178,36 @@ validate_pedigree <- function(pedigree_file, genotypes_file, if (male_parent_id == "0" && female_parent_id == "0") { status <- "MISSING_BOTH_PARENTS" correction_decision <- "NONE" - best_m <- find_best_parent(prog_id, - exclude_ids = character(0)) + + best_m <- find_best_parent(prog_id, exclude_ids = character(0)) best_male_parent <- best_m$id best_male_parent_pct <- best_m$error_pct - best_f <- find_best_parent(prog_id, - exclude_ids = c(best_m$id)) + + best_f <- find_best_parent(prog_id, exclude_ids = c(best_m$id)) best_female_parent <- best_f$id best_female_parent_pct <- best_f$error_pct } else if (male_parent_id == "0" && female_parent_id != "0") { status <- "MISSING_MALE_PARENT" correction_decision <- "NONE" - best_m <- find_best_parent(prog_id, - exclude_ids = c(female_parent_id)) + + best_m <- find_best_parent(prog_id, exclude_ids = c(female_parent_id)) best_male_parent <- best_m$id best_male_parent_pct <- best_m$error_pct } else if (male_parent_id != "0" && female_parent_id == "0") { status <- "MISSING_FEMALE_PARENT" correction_decision <- "NONE" - best_f <- find_best_parent(prog_id, - exclude_ids = c(male_parent_id)) + + best_f <- find_best_parent(prog_id, exclude_ids = c(male_parent_id)) best_female_parent <- best_f$id best_female_parent_pct <- best_f$error_pct } else { + ## Both parents present — Mendelian error calculation - progeny_vec <- genos_mat[prog_id, ] - male_parent_vec <- genos_mat[male_parent_id, ] + progeny_vec <- genos_mat[prog_id, ] + male_parent_vec <- genos_mat[male_parent_id, ] female_parent_vec <- genos_mat[female_parent_id, ] mismatches <- base::sum( @@ -275,71 +230,69 @@ validate_pedigree <- function(pedigree_file, genotypes_file, status <- "NO_DATA" correction_decision <- "NONE" - } else if (markers_tested < min_markers) { - error_pct <- (mismatches / markers_tested) * 100 - status <- "LOW_MARKERS" - correction_decision <- "NONE" - } else { error_pct <- (mismatches / markers_tested) * 100 - if (error_pct <= trio_error_threshold) { + # LOW_MARKERS still computes parent mismatch/recommendations + if (markers_tested < min_markers) { + status <- "LOW_MARKERS" + } else if (error_pct <= trio_error_threshold) { status <- "PASS" correction_decision <- "NONE" - } else { status <- "FAIL" + } + + # Run parent-level evaluation for both FAIL and LOW_MARKERS + if (status %in% c("FAIL", "LOW_MARKERS")) { # Homozygous mismatch per parent - progeny_hom <- genos_hom_mat[prog_id, ] - male_parent_hom <- genos_hom_mat[male_parent_id, ] + progeny_hom <- genos_hom_mat[prog_id, ] + male_parent_hom <- genos_hom_mat[male_parent_id, ] female_parent_hom <- genos_hom_mat[female_parent_id, ] - male_comparisons <- base::sum(!base::is.na(male_parent_hom) & - !base::is.na(progeny_hom)) + male_comparisons <- base::sum(!base::is.na(male_parent_hom) & + !base::is.na(progeny_hom)) male_parent_error_pct <- if (male_comparisons == 0) NA_real_ else - base::round((base::sum(male_parent_hom != progeny_hom, - na.rm = TRUE) / male_comparisons) * 100, 2) + base::round((base::sum(male_parent_hom != progeny_hom, na.rm = TRUE) / + male_comparisons) * 100, 2) - female_comparisons <- base::sum(!base::is.na(female_parent_hom) & - !base::is.na(progeny_hom)) + female_comparisons <- base::sum(!base::is.na(female_parent_hom) & + !base::is.na(progeny_hom)) female_parent_error_pct <- if (female_comparisons == 0) NA_real_ else - base::round((base::sum(female_parent_hom != progeny_hom, - na.rm = TRUE) / female_comparisons) * 100, 2) + base::round((base::sum(female_parent_hom != progeny_hom, na.rm = TRUE) / + female_comparisons) * 100, 2) - male_acceptable <- !is.na(male_parent_error_pct) && + male_acceptable <- !is.na(male_parent_error_pct) && male_parent_error_pct <= single_parent_error_threshold female_acceptable <- !is.na(female_parent_error_pct) && female_parent_error_pct <= single_parent_error_threshold if (male_acceptable && female_acceptable) { correction_decision <- "KEEP_BOTH" - } else if (male_acceptable && !female_acceptable) { correction_decision <- "REMOVE_FEMALE_PARENT" - best_f <- find_best_parent(prog_id, - exclude_ids = c(male_parent_id)) + best_f <- find_best_parent(prog_id, exclude_ids = c(male_parent_id)) best_female_parent <- best_f$id best_female_parent_pct <- best_f$error_pct - } else if (!male_acceptable && female_acceptable) { correction_decision <- "REMOVE_MALE_PARENT" - best_m <- find_best_parent(prog_id, - exclude_ids = c(female_parent_id)) + best_m <- find_best_parent(prog_id, exclude_ids = c(female_parent_id)) best_male_parent <- best_m$id best_male_parent_pct <- best_m$error_pct - } else { correction_decision <- "REMOVE_BOTH" - best_m <- find_best_parent(prog_id, - exclude_ids = character(0)) + best_m <- find_best_parent(prog_id, exclude_ids = character(0)) best_male_parent <- best_m$id best_male_parent_pct <- best_m$error_pct - best_f <- find_best_parent(prog_id, - exclude_ids = c(best_m$id)) + best_f <- find_best_parent(prog_id, exclude_ids = c(best_m$id)) best_female_parent <- best_f$id best_female_parent_pct <- best_f$error_pct } + + # Do not alter corrected pedigree for LOW_MARKERS rows + if (status == "LOW_MARKERS") + correction_decision <- paste0("LOW_MARKERS_", correction_decision) } } } @@ -365,7 +318,6 @@ validate_pedigree <- function(pedigree_file, genotypes_file, final_df <- data.table::rbindlist(results_list) #### Append NO_GENOTYPE_DATA rows to the final report #### - ## All columns except ID, Male_Parent, Female_Parent are set to NA if (base::nrow(no_geno_rows) > 0) { no_geno_df <- data.table::data.table( ID = no_geno_rows$ID, @@ -386,10 +338,7 @@ validate_pedigree <- function(pedigree_file, genotypes_file, } #### Write corrected pedigree #### - ## MISSING_*, FOUNDERS and NO_GENOTYPE_DATA: original values preserved - ## FAIL: failed parents replaced by "0" corrected_pedigree <- data.table::copy(original_pedigree) - for (i in base::seq_len(base::nrow(final_df))) { prog_id <- final_df$ID[i] decision <- final_df$Correction_Decision[i] @@ -400,10 +349,9 @@ validate_pedigree <- function(pedigree_file, genotypes_file, } else if (decision == "REMOVE_FEMALE_PARENT") { data.table::set(corrected_pedigree, row_idx, "Female_Parent", "0") } else if (decision == "REMOVE_BOTH") { - data.table::set(corrected_pedigree, row_idx, "Male_Parent", "0") + data.table::set(corrected_pedigree, row_idx, "Male_Parent", "0") data.table::set(corrected_pedigree, row_idx, "Female_Parent", "0") } - # NONE / KEEP_BOTH / FOUNDERS / MISSING_* / NO_GENOTYPE_DATA — no changes } tryCatch({ diff --git a/man/find_parentage.Rd b/man/find_parentage.Rd index 420bed8..706a15f 100644 --- a/man/find_parentage.Rd +++ b/man/find_parentage.Rd @@ -53,10 +53,13 @@ parentage assignment as confident. Assignments above this threshold are flagged as \code{HIGH_ERROR} in the \code{Assignment_Status} column (default: \code{5.0}). Must be between 0 and 100.} -\item{show_ties}{Logical. If \code{TRUE}, all tied best pairs are reported -as additional columns (\code{Male_Parent_1}, \code{Male_Parent_2}, etc.) -when \code{method = "best_pair"}. If \code{FALSE}, only one tied pair is -reported with a warning. Default is \code{TRUE}.} +\item{show_ties}{Logical. If \code{TRUE}, all tied best pairs (after +tie-breaking by maximum markers tested) are reported as additional columns +(\code{Male_Parent_1}, \code{Male_Parent_2}, etc.) when +\code{method = "best_pair"}. The base columns (\code{Male_Parent}, +\code{Female_Parent}, etc.) are always populated with the top result. +If \code{FALSE}, only one tied pair is reported with a warning. +Default is \code{TRUE}.} \item{allow_selfing}{Logical. If \code{FALSE}, male-female parent pairs where both IDs are identical are excluded when \code{method = "best_pair"}. @@ -70,17 +73,19 @@ statistics, and the results table to the console. Default is \code{TRUE}.} \code{TRUE}.} } \value{ -A \code{data.table} with one row per progeny (or more if ties are -reported). Columns depend on the method used: +A \code{data.table} with one row per progeny. Columns depend on the +method used: \itemize{ \item \code{best_male_parent} / \code{best_female_parent} / \code{best_match}: \code{Progeny}, \code{Best_Match}, \code{Mendelian_Error_Pct}, \code{Markers_Tested}, \code{Assignment_Status}. -\item \code{best_pair} (no ties): \code{Progeny}, \code{Male_Parent}, -\code{Female_Parent}, \code{Mendelian_Error_Pct}, \code{Markers_Tested}, -\code{Assignment_Status}. -\item \code{best_pair} (with ties): columns are suffixed \code{_1}, -\code{_2}, etc. for each tied pair, plus \code{Assignment_Status}. +\item \code{best_pair} (no ties after tie-breaking): \code{Progeny}, +\code{Male_Parent}, \code{Female_Parent}, \code{Mendelian_Error_Pct}, +\code{Markers_Tested}, \code{Assignment_Status}. +\item \code{best_pair} (ties remain after tie-breaking, +\code{show_ties = TRUE}): base columns are always populated with the +top result, plus suffix columns \code{Male_Parent_1}, +\code{Female_Parent_1}, etc. for each tied pair. } \code{Assignment_Status} is one of \code{PASS}, \code{HIGH_ERROR}, or \code{LOW_MARKERS}. Returned invisibly when \code{verbose = TRUE}. @@ -90,14 +95,34 @@ Assigns the most likely parent(s) to each progeny individual based on genotypic data using Mendelian error rates or homozygous mismatch rates. } \details{ +A homozygous-only genotype matrix is pre-computed once at startup and shared +across all methods that require it, avoiding redundant computation. + For \code{"best_male_parent"}, \code{"best_female_parent"}, and \code{"best_match"}, only homozygous markers (coded 0 or 2) are used for comparison; heterozygous markers (coded 1) are set to \code{NA}. This reduces false mismatches caused by phase ambiguity. For \code{"best_pair"}, all markers are used and full Mendelian inheritance -rules are applied across all possible male-female parent combinations via -\code{data.table::CJ()}. +rules are applied. Mismatch rates and comparison counts are computed across +all progeny simultaneously using vectorised \code{vapply} calls, producing +\code{n_pairs x n_progeny} matrices and giving substantial speed gains for +large datasets. Both matrices are explicitly coerced to matrix form to +handle the edge case of a single parent pair correctly. + +When multiple pairs share the lowest Mendelian error rate, ties are broken +by selecting the pair(s) with the greatest number of markers tested. If ties +still remain after this step, all remaining tied pairs are reported when +\code{show_ties = TRUE}. + +The base columns (\code{Male_Parent}, \code{Female_Parent}, +\code{Mendelian_Error_Pct}, \code{Markers_Tested}) are always populated with +the top result, ensuring no missing values in these columns regardless of +tie status. + +Output rows are pre-allocated as a \code{data.table} and filled by reference +using \code{data.table::set()}, avoiding repeated memory allocation during +the results-building step. Individuals in \code{parents_file} or \code{progeny_file} that are absent from \code{genotypes_file} are removed with a warning. @@ -110,14 +135,14 @@ match exceeds \code{error_threshold} are flagged \code{HIGH_ERROR}. \dontrun{ # Assign best male-female parent pair to each progeny results <- find_parentage( - genotypes_file = "genotypes.txt", - parents_file = "parents.txt", - progeny_file = "progeny.txt", - method = "best_pair", - min_markers = 50, + genotypes_file = "genotypes.txt", + parents_file = "parents.txt", + progeny_file = "progeny.txt", + method = "best_pair", + min_markers = 50, error_threshold = 5.0, - show_ties = TRUE, - allow_selfing = FALSE + show_ties = TRUE, + allow_selfing = FALSE ) # Find best individual parent match (ignoring sex) diff --git a/man/validate_pedigree.Rd b/man/validate_pedigree.Rd index 2493096..8f47a7d 100644 --- a/man/validate_pedigree.Rd +++ b/man/validate_pedigree.Rd @@ -34,8 +34,8 @@ to classify a trio as PASS (default: 5.0). Must be between 0 and 100.} required to evaluate a trio (default: 10).} \item{single_parent_error_threshold}{Numeric. Maximum homozygous-marker -mismatch percentage for a parent to be considered acceptable in a -failed trio (default: 2.0). Must be between 0 and 100.} +mismatch percentage for a parent to be considered acceptable during +parent-level evaluation (default: 2.0). Must be between 0 and 100.} \item{verbose}{Logical. If TRUE, prints progress messages, a summary table, and results to the console (default: TRUE).} @@ -57,20 +57,17 @@ the following columns: \item{Markers_Tested}{Number of markers with non-missing genotypes.} \item{Status}{One of PASS, FAIL, LOW_MARKERS, NO_DATA, FOUNDERS, MISSING_MALE_PARENT, MISSING_FEMALE_PARENT, MISSING_BOTH_PARENTS, -or NO_GENOTYPE_DATA (trio present in pedigree but absent from -the genotype file).} +or NO_GENOTYPE_DATA.} \item{Correction_Decision}{One of NONE, KEEP_BOTH, -REMOVE_MALE_PARENT, REMOVE_FEMALE_PARENT, REMOVE_BOTH.} -\item{Male_Parent_Hom_Error_Pct}{Male parent homozygous-marker mismatch -percentage (NA unless Status == "FAIL").} -\item{Female_Parent_Hom_Error_Pct}{Female parent homozygous-marker -mismatch percentage (NA unless Status == "FAIL").} +REMOVE_MALE_PARENT, REMOVE_FEMALE_PARENT, REMOVE_BOTH, +LOW_MARKERS_KEEP_BOTH, LOW_MARKERS_REMOVE_MALE_PARENT, +LOW_MARKERS_REMOVE_FEMALE_PARENT, LOW_MARKERS_REMOVE_BOTH.} +\item{Male_Parent_Hom_Error_Pct}{Male parent homozygous-marker mismatch percentage.} +\item{Female_Parent_Hom_Error_Pct}{Female parent homozygous-marker mismatch percentage.} \item{Best_Male_Parent}{Best-matching male parent candidate ID.} -\item{Best_Male_Parent_Error_Pct}{Homozygous mismatch percentage for -Best_Male_Parent.} +\item{Best_Male_Parent_Error_Pct}{Homozygous mismatch percentage for the best male parent candidate.} \item{Best_Female_Parent}{Best-matching female parent candidate ID.} -\item{Best_Female_Parent_Error_Pct}{Homozygous mismatch percentage for -Best_Female_Parent.} +\item{Best_Female_Parent_Error_Pct}{Homozygous mismatch percentage for the best female parent candidate.} } } \description{ @@ -81,40 +78,3 @@ are declared founders (both parents coded as 0) are preserved unchanged with no recommendations. Trios removed due to missing genotype data are retained in the output with a NO_GENOTYPE_DATA status. } -\details{ -All trios in the pedigree file are represented in the output. Trios where -the progeny or a declared parent is absent from the genotype file are -flagged as NO_GENOTYPE_DATA and are excluded from Mendelian error analysis -but retained in the final report and summary counts. - -Trios with missing parents (coded as 0) that are not listed as founders -receive a MISSING_MALE_PARENT, MISSING_FEMALE_PARENT, or -MISSING_BOTH_PARENTS status. Recommendations are provided in the -Best_Male_Parent and Best_Female_Parent columns, but the 0 values are -preserved in the corrected pedigree output. - -If founders_file is provided, any trio where the individual is listed as -a founder and both parents are coded as 0 is flagged as FOUNDERS; no -recommendation or correction is attempted. - -A corrected pedigree with failed parents replaced by 0 is always written -to corrected_pedigree.txt. -} -\examples{ -\dontrun{ -# Basic run -results <- validate_pedigree("pedigree.txt", "genotypes.txt") - -# With founders list and stricter thresholds -results <- validate_pedigree( - pedigree_file = "pedigree.txt", - genotypes_file = "genotypes.txt", - founders_file = "founders.txt", - trio_error_threshold = 2.0, - single_parent_error_threshold = 1.0, - verbose = FALSE, - output_filename = "my_validation.txt" -) -} - -} diff --git a/tests/testthat/test-validate_pedigree.R b/tests/testthat/test-validate_pedigree.R index 2dba74b..ef4e493 100644 --- a/tests/testthat/test-validate_pedigree.R +++ b/tests/testthat/test-validate_pedigree.R @@ -174,7 +174,7 @@ test_that("LOW_MARKERS status assigned when markers_tested < min_markers", { res <- validate_pedigree(f$ped, f$genos, verbose = FALSE, write_txt = FALSE, min_markers = 25L) expect_true(all(res$Status == "LOW_MARKERS")) - expect_true(all(res$Correction_Decision == "NONE")) + expect_true(all(grepl("^LOW_MARKERS_", res$Correction_Decision))) }) test_that("NA markers reduce Markers_Tested and do not cause errors", { From 523049bea67bdd853245017736a5ce92712f6817 Mon Sep 17 00:00:00 2001 From: "josue.chinchilla" Date: Wed, 22 Apr 2026 10:58:32 -0400 Subject: [PATCH 70/80] Updated parentage functions based on Meng's feedback: Validate Ped trios with low markers will still be flagged but now will show recommendations When no parent pair passes the threshold of errors, they will still be shown in the final report Find Parents Fixed formatting of final output when ties on best,pair were found. Implemented vectorization and improvements on efficiency. When two recommendations are tied on error %, the tiebreaker is the number of markers tested. the option with the highest # of markers testes takes priority. --- R/find_parentage.R | 40 +++++++++++++++------------------------- 1 file changed, 15 insertions(+), 25 deletions(-) diff --git a/R/find_parentage.R b/R/find_parentage.R index dcf3ed0..728822a 100644 --- a/R/find_parentage.R +++ b/R/find_parentage.R @@ -187,18 +187,18 @@ find_parentage <- function(genotypes_file, parents_file, progeny_file, stop("No valid progeny candidates remain after filtering.") #### Pre-compute genotype matrices once (shared across all methods) #### - ## Full genotype matrix used by best_pair + # Full genotype matrix used by best_pair genos_mat <- base::as.matrix(genos, rownames = "ID") - ## Homozygous-only matrix (het markers set to NA) used by hom methods + # Homozygous-only matrix (het markers set to NA) used by hom methods genos_hom <- data.table::copy(genos) marker_cols <- base::setdiff(base::names(genos_hom), "ID") for (col in marker_cols) genos_hom[base::get(col) == 1, (col) := NA_integer_] genos_hom_mat <- base::as.matrix(genos_hom, rownames = "ID") - #### Helper: assign Assignment_Status from markers and error rate #### - ## Returns LOW_MARKERS, HIGH_ERROR, or PASS + #### Assignment_Status from markers and error rate #### + # Returns LOW_MARKERS, HIGH_ERROR, or PASS assign_status <- function(markers, error_pct) { base::ifelse(markers < min_markers, "LOW_MARKERS", base::ifelse(error_pct > error_threshold, "HIGH_ERROR", "PASS")) @@ -206,18 +206,17 @@ find_parentage <- function(genotypes_file, parents_file, progeny_file, #### Logic for Homozygous Matching Methods #### if (method %in% c("best_male_parent", "best_female_parent", "best_match")) { - parent_ids <- base::switch(method, "best_male_parent" = male_parent_candidates$ID, "best_female_parent" = female_parent_candidates$ID, "best_match" = base::union(male_parent_candidates$ID, female_parent_candidates$ID)) - ## Subset pre-computed homozygous matrix for relevant parents and progeny + # Subset pre-computed homozygous matrix for relevant parents and progeny parent_genos <- genos_hom_mat[base::rownames(genos_hom_mat) %in% parent_ids, , drop = FALSE] progeny_genos <- genos_hom_mat[base::rownames(genos_hom_mat) %in% progeny_candidates$ID, , drop = FALSE] - ## Pre-allocate results data.table; fill by reference with set() + # Pre-allocate results data.table; fill by reference with set() n_progeny <- base::nrow(progeny_genos) results_dt <- data.table::data.table( Progeny = base::rownames(progeny_genos), @@ -251,36 +250,31 @@ find_parentage <- function(genotypes_file, parents_file, progeny_file, data.table::set(results_dt, i, "Markers_Tested", base::as.integer(best_markers)) data.table::set(results_dt, i, "Assignment_Status", assign_status(best_markers, best_error)) } - final_df <- results_dt } #### Logic for Best Pair Method #### if (method == "best_pair") { - parent_pairs <- data.table::CJ(Male_Parent = male_parent_candidates$ID, Female_Parent = female_parent_candidates$ID) - if (!allow_selfing) { parent_pairs <- parent_pairs[Male_Parent != Female_Parent] if (verbose) base::cat("Selfing is disallowed. Pairs with identical parents are removed.\n") } - if (base::nrow(parent_pairs) == 0) stop("No valid parent pairs to test.") - ## Pre-extract parent genotype row blocks for vectorised operations + # Pre-extract parent genotype row blocks for vectorised operations male_parent_genos_mat <- genos_mat[parent_pairs$Male_Parent, , drop = FALSE] female_parent_genos_mat <- genos_mat[parent_pairs$Female_Parent, , drop = FALSE] - ## Subset full genotype matrix to progeny only + # Subset full genotype matrix to progeny only progeny_ids <- progeny_candidates$ID progeny_mat <- genos_mat[progeny_ids, , drop = FALSE] n_progeny <- base::nrow(progeny_mat) n_pairs <- base::nrow(parent_pairs) - ## Vectorised mismatch computation across ALL progeny at once - ## Result: matrix of dimensions n_pairs x n_progeny - ## Explicitly wrapped in matrix() to handle the n_pairs = 1 edge case + # Vectorised mismatch computation across ALL progeny at once + # Wrapped in matrix() to handle the n_pairs = 1 edge case mismatch_mat <- base::matrix( base::vapply(base::seq_len(n_progeny), function(j) { progeny_vec <- progeny_mat[j, ] @@ -299,9 +293,8 @@ find_parentage <- function(genotypes_file, parents_file, progeny_file, nrow = n_pairs, ncol = n_progeny ) - ## Vectorised comparison count across ALL progeny at once - ## Result: matrix of dimensions n_pairs x n_progeny - ## Explicitly wrapped in matrix() to handle the n_pairs = 1 edge case + # Vectorised comparison count across ALL progeny at once + # Wrapped in matrix() to handle the n_pairs = 1 edge case comparison_mat <- base::matrix( base::vapply(base::seq_len(n_progeny), function(j) { progeny_vec <- progeny_mat[j, ] @@ -312,11 +305,11 @@ find_parentage <- function(genotypes_file, parents_file, progeny_file, nrow = n_pairs, ncol = n_progeny ) - ## Percent mismatch matrix: n_pairs x n_progeny + # Percent mismatch matrix: n_pairs x n_progeny pct_mismatch_mat <- (mismatch_mat / comparison_mat) * 100 pct_mismatch_mat[base::is.nan(pct_mismatch_mat)] <- NA - ## Pre-allocate base results data.table; tie columns added dynamically + # Pre-allocate base results data.table; tie columns added dynamically results_dt <- data.table::data.table( Progeny = progeny_ids, Male_Parent = NA_character_, @@ -326,11 +319,9 @@ find_parentage <- function(genotypes_file, parents_file, progeny_file, Assignment_Status = NA_character_ ) - ## Per-progeny result extraction from pre-computed matrices + # Per-progeny result extraction from pre-computed matrices results_list <- base::vector("list", n_progeny) - for (j in base::seq_len(n_progeny)) { - prog_id <- progeny_ids[j] percent_mismatch <- pct_mismatch_mat[, j] comparisons <- comparison_mat[, j] @@ -391,7 +382,6 @@ find_parentage <- function(genotypes_file, parents_file, progeny_file, base::Filter(Negate(base::is.null), results_list), fill = TRUE ) - if (base::nrow(tie_rows) > 0) { final_df <- merge(results_dt, tie_rows, by = "Progeny", all.x = TRUE) } else { From 7023a25e0a344705ed2c3b48942199aab4d78cdf Mon Sep 17 00:00:00 2001 From: Alexander Sandercock <39815775+alex-sandercock@users.noreply.github.com> Date: Sun, 17 May 2026 11:52:48 -0400 Subject: [PATCH 71/80] improve support for dosage2vcf --- NAMESPACE | 1 + R/dosage2vcf.R | 585 ++++++++++++++++++++++--------- man/dosage2vcf.Rd | 12 +- tests/testthat/test-dosage2vcf.R | 113 ++++++ 4 files changed, 543 insertions(+), 168 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index c919736..4b5cb05 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -23,6 +23,7 @@ export(merge_MADCs) export(solve_composition_poly) export(thinSNP) export(updog2vcf) +export(validate_pedigree) import(dplyr) import(janitor) import(parallel) diff --git a/R/dosage2vcf.R b/R/dosage2vcf.R index e8a193d..fdc5ff7 100644 --- a/R/dosage2vcf.R +++ b/R/dosage2vcf.R @@ -1,13 +1,17 @@ -#' Convert DArTag Dosage and Counts to VCF +#' Convert DArTag genotype reports and counts to VCF #' -#' This function will convert the DArT Dosage Report and Counts files to VCF format +#' This function will convert DArT genotype report and Counts files to VCF format #' -#' This function will convert the Dosage Report and Counts files from DArT into a VCF file. +#' This function will convert Allele Dose Report or SNP/INDEL report files and Counts files from DArT into a VCF file. #' These two files are received directly from DArT for a given sequencing project. +#' SNP/INDEL one-row and two-row reports are treated as diploid genotype reports +#' with 0 = reference homozygote, 1 = alternate homozygote, 2 = heterozygote, +#' and - = missing. Allele Dose reports are interpreted as reference allele +#' dosages using the supplied ploidy. #' The output file will be saved to the location and with the name that is specified. #' The VCF format is v4.3 #' -#' @param dart.report Path to the DArT dosage report .csv file. Typically contains "Dosage Report" in the file name. +#' @param dart.report Path to the DArT Allele Dose Report or SNP/INDEL report .csv file. #' @param dart.counts Path to the DArT counts .csv file. Typically contains "Counts" in the file name. #' @param ploidy The ploidy of the species being analyzed #' @param output.file output file name and path @@ -65,204 +69,456 @@ dosage2vcf <- function(dart.report, dart.counts, ploidy, output.file) { '##FORMAT=' ) - ##Get information from DArT Counts and Dosage Report files + ## Helper functions --------------------------------------------------------- + is_missing_value <- function(x) { + x_chr <- trimws(as.character(x)) + is.na(x) | x_chr %in% c("", "-", "NA") + } - dosage <- suppressMessages(readr::read_csv(dosage_report, - skip = 6,show_col_types = FALSE)) - colnames(dosage)[1:5] <- dosage[1,1:5] - dosage <- dosage[-1, ] - names(dosage)[names(dosage) == "MarkerID"] <- "MarkerName" - dosage <- as.data.frame(dosage) - row.names(dosage) <- dosage$MarkerName - - counts <- suppressMessages(readr::read_csv(counts_file, - skip = 6,show_col_types = FALSE)) - - #Check that the counts file is in the 2 row format - #if (anyDuplicated(counts$MarkerName) > 0) { #I am going to just check if there are 2x rows in counts as dosage report - if (nrow(counts) >= nrow(dosage)*2) { - message("Note: Counts file is in the 2 row format for Ref and Alt alleles") - } else { - stop("Counts file is in single row format. Only two row format (row for ref and alt allele) is currently supported.") - return() + missing_gt <- function(ploidy) { + paste(rep(".", ploidy), collapse = "/") } - #Parse counts file depending on if its the collapsed format or not - if (all(c("MarkerName", "Variant") %in% counts[1, 1:15])) { - message("Counts file contains the counts for the target loci only") - - colnames(counts)[1:15] <- counts[1,1:15] - counts <- counts[-1, ] - counts <- as.data.frame(counts) - - #Get ref counts dataframe - ref_counts <- counts[is.na(counts$Variant), ] - row.names(ref_counts) <- ref_counts$MarkerName - #Get alt counts dataframe - alt_counts <- counts[!is.na(counts$Variant), ] - row.names(alt_counts) <- alt_counts$MarkerName - - #unload counts file - rm(counts) - - #Get the Ref and Alt allele from the alt_counts file - ##Note sometimes there are multiple nucleotides, I am assuming this file also includes indels, but make sure this is not an error - alleles_df <- alt_counts %>% - rowwise() %>% # Apply operations to each row - mutate(Variant = gsub("-:", "",Variant), # Remove "-:" prefix - REF = strsplit(Variant, ">")[[1]][1], - ALT = strsplit(Variant, ">")[[1]][2]) %>% - ungroup() %>% - select(MarkerName, REF, ALT) - - #Add the CHROM and POS information to the alleles_df from the dosage report - alleles_df <- merge(alleles_df, dosage[, c("MarkerName","Chrom", "ChromPos")], by = "MarkerName", all.x = TRUE) - #Add row name - row.names(alleles_df) <- alleles_df$MarkerName - - }else{ - message("Counts file contains the collapsed read counts across all microhaplotypes for the target loci") + as_clean_character_matrix <- function(x) { + mat <- as.matrix(x) + mat <- matrix(trimws(as.character(mat)), nrow = nrow(mat), + ncol = ncol(mat), dimnames = dimnames(mat)) + mat[is_missing_value(mat)] <- NA_character_ + mat + } - colnames(counts)[1:5] <- counts[1,1:5] - counts <- counts[-1, ] - counts <- as.data.frame(counts) + ensure_unique <- function(x, label) { + duplicated_values <- unique(x[duplicated(x)]) + if (length(duplicated_values) > 0) { + stop(label, " must be unique. Duplicated values include: ", + paste(utils::head(duplicated_values, 5), collapse = ", ")) + } + } - #Get ref counts dataframe - ref_counts <- counts[grepl("Ref$", counts$AlleleID), ] - row.names(ref_counts) <- ref_counts$CloneID - #Get alt counts dataframe - alt_counts <- counts[grepl("Alt$", counts$AlleleID), ] - row.names(alt_counts) <- alt_counts$CloneID + check_matching_sets <- function(x, y, x_label, y_label) { + missing_in_y <- setdiff(x, y) + missing_in_x <- setdiff(y, x) + if (length(missing_in_y) > 0 || length(missing_in_x) > 0) { + msg <- c() + if (length(missing_in_y) > 0) { + msg <- c(msg, paste0("in ", x_label, " only: ", + paste(utils::head(missing_in_y, 5), collapse = ", "))) + } + if (length(missing_in_x) > 0) { + msg <- c(msg, paste0("in ", y_label, " only: ", + paste(utils::head(missing_in_x, 5), collapse = ", "))) + } + stop("Mismatched ", x_label, " and ", y_label, " values (", paste(msg, collapse = "; "), ").") + } + } - #unload counts file - rm(counts) + metadata_cols <- c("MarkerName", "MarkerID", "AlleleID", "CloneID", + "AlleleSequence", "AlleleSequenceRef", "AlleleSequenceAlt", + "Variant", "CallRate", "OneRatioRef", "OneRatioSnp", + "FreqHomRef", "FreqHomSnp", "FreqHets", "PICRef", "PICSnp", + "AvgPIC", "AvgCountRef", "AvgCountSnp", + "RatioAvgCountRefAvgCountSnp", "Chrom", "ChromPos") + + get_sample_cols <- function(df) { + sample_cols <- names(df)[!(names(df) %in% metadata_cols)] + ensure_unique(sample_cols, "Sample columns") + sample_cols + } - #Get the Ref and Alt allele from the alt_counts file - ##Note sometimes there are multiple nucleotides, I am assuming this file also includes indels, but make sure this is not an error - alleles_df <- data.frame(MarkerName = alt_counts$CloneID, - REF = "A", - ALT = "B") + parse_variant_bases <- function(variant) { + variant <- trimws(as.character(variant)) + invalid <- is_missing_value(variant) | !grepl(">", variant, fixed = TRUE) + if (any(invalid)) { + stop("Counts Variant values must be present and contain '>' for REF/ALT parsing.") + } + clean_variant <- sub("^-:", "", variant) + parts <- strsplit(clean_variant, ">", fixed = TRUE) + valid_parts <- vapply(parts, length, integer(1)) == 2 + if (!all(valid_parts)) { + stop("Counts Variant values must contain exactly one REF>ALT allele definition.") + } + data.frame( + REF = vapply(parts, `[`, character(1), 1), + ALT = vapply(parts, `[`, character(1), 2), + stringsAsFactors = FALSE + ) + } + + derive_coordinates <- function(marker_info) { + if (all(c("Chrom", "ChromPos") %in% names(marker_info)) && + !any(is_missing_value(marker_info$Chrom)) && + !any(is_missing_value(marker_info$ChromPos))) { + return(data.frame(CHROM = as.character(marker_info$Chrom), + POS = as.character(marker_info$ChromPos), + stringsAsFactors = FALSE)) + } - #Add the CHROM and POS information to the alleles_df from the dosage report - alleles_df <- merge(alleles_df, dosage[, c("MarkerName","Chrom", "ChromPos")], by = "MarkerName", all.x = TRUE) - #Add row name - row.names(alleles_df) <- alleles_df$MarkerName + marker_names <- as.character(marker_info$MarkerName) + cannot_split <- !grepl("_", marker_names, fixed = TRUE) + if (any(cannot_split)) { + stop("Chrom/ChromPos columns are absent or incomplete, and MarkerName values cannot be split at '_'. Examples: ", + paste(utils::head(marker_names[cannot_split], 5), collapse = ", ")) + } + data.frame( + CHROM = sub("_[^_]*$", "", marker_names), + POS = sub("^.*_", "", marker_names), + stringsAsFactors = FALSE + ) } - #Remove the unwanted information from the counts dataframes - cols_to_remove <- c("MarkerName","AlleleSequence","Variant", - "CallRate","OneRatioRef","OneRatioSnp","FreqHomSnp", - "FreqHets","PICRef","PICSnp","AvgPIC","FreqHomRef", - "AvgCountRef","AvgCountSnp","RatioAvgCountRefAvgCountSnp", - "AlleleID","CloneID") - alt_counts <- alt_counts[, !(colnames(alt_counts) %in% cols_to_remove)] - ref_counts <- ref_counts[, !(colnames(ref_counts) %in% cols_to_remove)] + precompute_genotype_strings <- function(ploidy) { + genotype_strings <- character(ploidy + 1) + for (dosage_value in 0:ploidy) { + ref_count <- dosage_value + alt_count <- ploidy - dosage_value + genotype_strings[dosage_value + 1] <- paste(c(rep("0", ref_count), rep("1", alt_count)), collapse = "/") + } + genotype_strings + } - #Ensure that they are in the same order - ref_counts <- ref_counts[row.names(alt_counts),] + convert_dosage2gt <- function(dosage_matrix, ploidy) { + genotype_strings <- precompute_genotype_strings(ploidy) + genotype_matrix <- matrix(missing_gt(ploidy), nrow = nrow(dosage_matrix), ncol = ncol(dosage_matrix), + dimnames = dimnames(dosage_matrix)) + called <- !is.na(dosage_matrix) + genotype_matrix[called] <- genotype_strings[dosage_matrix[called] + 1] + genotype_matrix + } - #Make the total counts dataframe - total_counts <- alt_counts + ref_counts + matrix_to_character <- function(mat) { + out <- matrix(as.character(mat), nrow = nrow(mat), ncol = ncol(mat), dimnames = dimnames(mat)) + out[is.na(mat)] <- "." + out + } - #Get the total ref, total alt, and total read depth for each marker - alleles_df$AltCountsSum <- rowSums(alt_counts)[rownames(alleles_df)] #Alt - alleles_df$RefCountsSum <- rowSums(ref_counts)[rownames(alleles_df)] #Ref - alleles_df$TotalCountSum <- alleles_df$AltCountsSum + alleles_df$RefCountsSum + as_numeric_matrix <- function(df, label) { + char_mat <- as_clean_character_matrix(df) + numeric_mat <- suppressWarnings(matrix(as.numeric(char_mat), nrow = nrow(char_mat), + ncol = ncol(char_mat), dimnames = dimnames(char_mat))) + invalid <- !is.na(char_mat) & is.na(numeric_mat) + if (any(invalid)) { + stop(label, " contains non-numeric values. Examples: ", + paste(utils::head(unique(char_mat[invalid]), 5), collapse = ", ")) + } + numeric_mat + } - #Remove the unwanted information from the dosage dataframe - d_cols_to_remove <- c("MarkerName","AvgCountRef","AvgCountSnp","Chrom","ChromPos") - dosage <- dosage[, !(colnames(dosage) %in% d_cols_to_remove)] + convert_snp_codes <- function(code_matrix) { + code_matrix <- as_clean_character_matrix(code_matrix) + invalid <- !is.na(code_matrix) & !(code_matrix %in% c("0", "1", "2")) + if (any(invalid)) { + stop("SNP/INDEL genotype codes must be 0, 1, 2, '-', or NA. Invalid values include: ", + paste(utils::head(unique(code_matrix[invalid]), 5), collapse = ", ")) + } - #Make the VCF df - vcf_df <- data.frame( - CHROM = alleles_df$Chrom, - POS = alleles_df$ChromPos, - ID = alleles_df$MarkerName, - REF = alleles_df$REF, - ALT = alleles_df$ALT, - QUAL = ".", - FILTER = ".", - INFO = NA, - FORMAT = NA - ) + gt_matrix <- matrix("./.", nrow = nrow(code_matrix), ncol = ncol(code_matrix), + dimnames = dimnames(code_matrix)) + ud_matrix <- matrix(".", nrow = nrow(code_matrix), ncol = ncol(code_matrix), + dimnames = dimnames(code_matrix)) - #Add the INFO column for each SNP - vcf_df$INFO <- paste0("DP=",alleles_df$TotalCountSum,";", - "ADS=",alleles_df$RefCountsSum,",",alleles_df$AltCountsSum) + gt_matrix[!is.na(code_matrix) & code_matrix == "0"] <- "0/0" + gt_matrix[!is.na(code_matrix) & code_matrix == "1"] <- "1/1" + gt_matrix[!is.na(code_matrix) & code_matrix == "2"] <- "0/1" + ud_matrix[!is.na(code_matrix) & code_matrix == "0"] <- "2" + ud_matrix[!is.na(code_matrix) & code_matrix == "1"] <- "0" + ud_matrix[!is.na(code_matrix) & code_matrix == "2"] <- "1" - #Add the FORMAT label for each SNP - vcf_df$FORMAT <- paste("GT","UD","DP","RA",sep=":") + list(gt = gt_matrix, ud = ud_matrix) + } - message("Converting dosages to genotype format\n") + parse_report <- function(file, ploidy) { + raw <- suppressMessages(readr::read_csv(file, skip = 6, show_col_types = FALSE)) + raw <- as.data.frame(raw, check.names = FALSE) + if (nrow(raw) == 0) stop("DArT report file contains no data rows.") + + first_row <- as.character(unlist(raw[1, ], use.names = FALSE)) + is_allele_dose <- length(first_row) >= 5 && + identical(first_row[1:5], c("MarkerID", "AvgCountRef", "AvgCountSnp", "Chrom", "ChromPos")) + + if (is_allele_dose) { + message("DArT report is an Allele Dose report") + names(raw)[1:5] <- first_row[1:5] + report <- raw[-1, , drop = FALSE] + names(report)[names(report) == "MarkerID"] <- "MarkerName" + sample_cols <- get_sample_cols(report) + ensure_unique(report$MarkerName, "Report MarkerName") + + dosage_matrix <- as_numeric_matrix(report[, sample_cols, drop = FALSE], "Allele dose report") + invalid_dosage <- !is.na(dosage_matrix) & + (dosage_matrix < 0 | dosage_matrix > ploidy | dosage_matrix != round(dosage_matrix)) + if (any(invalid_dosage)) { + stop("Allele dose values must be integer reference allele counts between 0 and ploidy.") + } + + rownames(dosage_matrix) <- report$MarkerName + gt_matrix <- convert_dosage2gt(dosage_matrix, ploidy) + ud_matrix <- matrix_to_character(dosage_matrix) + + marker_info <- report[, intersect(c("MarkerName", "Chrom", "ChromPos"), names(report)), drop = FALSE] + rownames(marker_info) <- marker_info$MarkerName + + return(list(type = "allele_dose", + marker_info = marker_info, + marker_names = report$MarkerName, + sample_names = sample_cols, + gt = gt_matrix, + ud = ud_matrix, + variant = NULL, + allele_sequence_ref = NULL, + allele_sequence_alt = NULL)) + } - ###Convert genotypes from dosage to gt - # Precompute genotype strings for all possible dosage values to improve efficiency - precompute_genotype_strings <- function(ploidy) { - genotype_strings <- character(ploidy + 1) - # Generate the genotype string based on the dosage and ploidy - # Updog uses the ref counts, which is not typical, so this corrects it - for (dosage in 0:ploidy) { - ref_count <- dosage - alt_count <- ploidy - dosage - genotype_strings[dosage + 1] <- paste(c(rep("0", ref_count), rep("1", alt_count)), collapse = "/") + is_snp_1row <- all(c("MarkerName", "AlleleSequenceRef", "AlleleSequenceAlt", "Variant") %in% names(raw)) + is_snp_2row <- all(c("MarkerName", "AlleleSequence", "Variant") %in% names(raw)) && + anyDuplicated(raw$MarkerName) > 0 + + if (is_snp_1row || is_snp_2row) { + if (ploidy != 2) { + stop("SNP/INDEL reports are diploid genotype reports. Use ploidy = 2.") + } } - return(genotype_strings) - } - # Apply the precomputed genotype strings to the matrix - convert_dosage2gt <- function(dosage_matrix, ploidy) { - dosage_matrix <- as.matrix(dosage_matrix) - genotype_strings <- precompute_genotype_strings(ploidy) + if (is_snp_1row) { + message("DArT report is a SNP/INDEL 1 row report") + sample_cols <- get_sample_cols(raw) + ensure_unique(raw$MarkerName, "Report MarkerName") + + code_matrix <- as.matrix(raw[, sample_cols, drop = FALSE]) + rownames(code_matrix) <- raw$MarkerName + converted <- convert_snp_codes(code_matrix) + + marker_info <- raw[, intersect(c("MarkerName", "Chrom", "ChromPos"), names(raw)), drop = FALSE] + rownames(marker_info) <- marker_info$MarkerName + + variant <- setNames(trimws(as.character(raw$Variant)), raw$MarkerName) + allele_sequence_ref <- setNames(as.character(raw$AlleleSequenceRef), raw$MarkerName) + allele_sequence_alt <- setNames(as.character(raw$AlleleSequenceAlt), raw$MarkerName) + + return(list(type = "snp_1row", + marker_info = marker_info, + marker_names = raw$MarkerName, + sample_names = sample_cols, + gt = converted$gt, + ud = converted$ud, + variant = variant, + allele_sequence_ref = allele_sequence_ref, + allele_sequence_alt = allele_sequence_alt)) + } - # Handle missing values separately - genotype_matrix <- matrix(genotype_strings[dosage_matrix + 1], nrow = nrow(dosage_matrix), ncol = ncol(dosage_matrix)) - genotype_matrix[is.na(dosage_matrix)] <- "./." # Handle missing values + if (is_snp_2row) { + message("DArT report is a SNP/INDEL 2 row report") + sample_cols <- get_sample_cols(raw) + ref_rows <- is_missing_value(raw$Variant) + alt_rows <- !ref_rows + ref_report <- raw[ref_rows, , drop = FALSE] + alt_report <- raw[alt_rows, , drop = FALSE] + check_matching_sets(ref_report$MarkerName, alt_report$MarkerName, "SNP ref-row markers", "SNP alt-row markers") + ensure_unique(ref_report$MarkerName, "SNP ref-row MarkerName") + ensure_unique(alt_report$MarkerName, "SNP alt-row MarkerName") + alt_report <- alt_report[match(ref_report$MarkerName, alt_report$MarkerName), , drop = FALSE] + + ref_matrix <- as_clean_character_matrix(ref_report[, sample_cols, drop = FALSE]) + alt_matrix <- as_clean_character_matrix(alt_report[, sample_cols, drop = FALSE]) + invalid <- (!is.na(ref_matrix) & !(ref_matrix %in% c("0", "1"))) | + (!is.na(alt_matrix) & !(alt_matrix %in% c("0", "1"))) + if (any(invalid)) { + stop("SNP/INDEL 2 row reports must contain only 0, 1, '-', or NA in sample columns.") + } + + code_matrix <- matrix(NA_character_, nrow = nrow(ref_matrix), ncol = ncol(ref_matrix), + dimnames = list(ref_report$MarkerName, sample_cols)) + called <- !is.na(ref_matrix) & !is.na(alt_matrix) + code_matrix[called & ref_matrix == "1" & alt_matrix == "0"] <- "0" + code_matrix[called & ref_matrix == "0" & alt_matrix == "1"] <- "1" + code_matrix[called & ref_matrix == "1" & alt_matrix == "1"] <- "2" + code_matrix[called & ref_matrix == "0" & alt_matrix == "0"] <- NA_character_ + converted <- convert_snp_codes(code_matrix) + + marker_info <- data.frame(MarkerName = ref_report$MarkerName, stringsAsFactors = FALSE) + if (all(c("Chrom", "ChromPos") %in% names(ref_report))) { + marker_info$Chrom <- ref_report$Chrom + marker_info$ChromPos <- ref_report$ChromPos + } + rownames(marker_info) <- marker_info$MarkerName + + variant <- setNames(trimws(as.character(alt_report$Variant)), alt_report$MarkerName) + allele_sequence_ref <- setNames(as.character(ref_report$AlleleSequence), ref_report$MarkerName) + allele_sequence_alt <- setNames(as.character(alt_report$AlleleSequence), alt_report$MarkerName) + + return(list(type = "snp_2row", + marker_info = marker_info, + marker_names = ref_report$MarkerName, + sample_names = sample_cols, + gt = converted$gt, + ud = converted$ud, + variant = variant, + allele_sequence_ref = allele_sequence_ref, + allele_sequence_alt = allele_sequence_alt)) + } - # Retain row and column names - rownames(genotype_matrix) <- rownames(dosage_matrix) - colnames(genotype_matrix) <- colnames(dosage_matrix) + stop("Unsupported DArT report format.") + } - return(genotype_matrix) + parse_counts <- function(file) { + raw <- suppressMessages(readr::read_csv(file, skip = 6, show_col_types = FALSE)) + raw <- as.data.frame(raw, check.names = FALSE) + if (nrow(raw) == 0) stop("DArT counts file contains no data rows.") + + first_row <- as.character(unlist(raw[1, ], use.names = FALSE)) + if (all(c("MarkerName", "Variant") %in% first_row[seq_len(min(15, length(first_row)))])) { + message("Counts file contains the counts for the target loci only") + names(raw)[1:15] <- first_row[1:15] + counts <- raw[-1, , drop = FALSE] + sample_cols <- get_sample_cols(counts) + + ref_rows <- is_missing_value(counts$Variant) + alt_rows <- !ref_rows + ref_counts <- counts[ref_rows, , drop = FALSE] + alt_counts <- counts[alt_rows, , drop = FALSE] + check_matching_sets(ref_counts$MarkerName, alt_counts$MarkerName, "counts ref-row markers", "counts alt-row markers") + ensure_unique(ref_counts$MarkerName, "Counts ref-row MarkerName") + ensure_unique(alt_counts$MarkerName, "Counts alt-row MarkerName") + ref_counts <- ref_counts[match(alt_counts$MarkerName, ref_counts$MarkerName), , drop = FALSE] + + rownames(ref_counts) <- ref_counts$MarkerName + rownames(alt_counts) <- alt_counts$MarkerName + + bases <- parse_variant_bases(alt_counts$Variant) + alleles_df <- data.frame(MarkerName = alt_counts$MarkerName, + REF = bases$REF, + ALT = bases$ALT, + CountVariant = trimws(as.character(alt_counts$Variant)), + CountAlleleSequenceRef = as.character(ref_counts$AlleleSequence), + CountAlleleSequenceAlt = as.character(alt_counts$AlleleSequence), + stringsAsFactors = FALSE) + rownames(alleles_df) <- alleles_df$MarkerName + + return(list(type = "target", + alleles_df = alleles_df, + marker_names = alt_counts$MarkerName, + sample_names = sample_cols, + ref_counts = ref_counts[, sample_cols, drop = FALSE], + alt_counts = alt_counts[, sample_cols, drop = FALSE])) + } + + message("Counts file contains the collapsed read counts across all microhaplotypes for the target loci") + names(raw)[1:5] <- first_row[1:5] + counts <- raw[-1, , drop = FALSE] + sample_cols <- get_sample_cols(counts) + + ref_counts <- counts[grepl("Ref$", counts$AlleleID), , drop = FALSE] + alt_counts <- counts[grepl("Alt$", counts$AlleleID), , drop = FALSE] + check_matching_sets(ref_counts$CloneID, alt_counts$CloneID, "counts ref-row markers", "counts alt-row markers") + ensure_unique(ref_counts$CloneID, "Counts ref-row CloneID") + ensure_unique(alt_counts$CloneID, "Counts alt-row CloneID") + ref_counts <- ref_counts[match(alt_counts$CloneID, ref_counts$CloneID), , drop = FALSE] + + rownames(ref_counts) <- ref_counts$CloneID + rownames(alt_counts) <- alt_counts$CloneID + alleles_df <- data.frame(MarkerName = alt_counts$CloneID, + REF = "A", + ALT = "B", + CountVariant = NA_character_, + CountAlleleSequenceRef = NA_character_, + CountAlleleSequenceAlt = NA_character_, + stringsAsFactors = FALSE) + rownames(alleles_df) <- alleles_df$MarkerName + + list(type = "collapsed", + alleles_df = alleles_df, + marker_names = alt_counts$CloneID, + sample_names = sample_cols, + ref_counts = ref_counts[, sample_cols, drop = FALSE], + alt_counts = alt_counts[, sample_cols, drop = FALSE]) } - # Convert the dosage matrix to genotypes - geno_df <- convert_dosage2gt(dosage, ploidy) + validate_report_vs_counts <- function(report, counts) { + if (!is.null(report$variant) && counts$type == "target") { + report_variant <- report$variant[counts$marker_names] + count_variant <- counts$alleles_df[counts$marker_names, "CountVariant"] + mismatch <- !is_missing_value(report_variant) & !is_missing_value(count_variant) & + trimws(report_variant) != trimws(count_variant) + if (any(mismatch)) { + stop("SNP/INDEL report Variant values do not match Counts Variant values. Examples: ", + paste(utils::head(counts$marker_names[mismatch], 5), collapse = ", ")) + } + } - #Combine info from the matrices to form the VCF information for each sample - # Combine the matrices into a single matrix with elements separated by ":" - make_vcf_format <- function(..., separator = ":") { - matrices <- list(...) - n <- length(matrices) + if (!is.null(report$allele_sequence_ref) && counts$type == "target") { + report_ref <- report$allele_sequence_ref[counts$marker_names] + report_alt <- report$allele_sequence_alt[counts$marker_names] + count_ref <- counts$alleles_df[counts$marker_names, "CountAlleleSequenceRef"] + count_alt <- counts$alleles_df[counts$marker_names, "CountAlleleSequenceAlt"] + mismatch <- (!is_missing_value(report_ref) & !is_missing_value(count_ref) & report_ref != count_ref) | + (!is_missing_value(report_alt) & !is_missing_value(count_alt) & report_alt != count_alt) + if (any(mismatch)) { + stop("SNP/INDEL report allele sequences do not match Counts allele sequences. Examples: ", + paste(utils::head(counts$marker_names[mismatch], 5), collapse = ", ")) + } + } + } - # Convert matrices to long form - long_forms <- lapply(matrices, function(mat) { - suppressMessages(reshape2::melt(mat, varnames = c("Row", "Col"), value.name = "Value")) - }) + ##Get information from DArT Counts and Dosage Report files + report <- parse_report(dosage_report, ploidy) + counts <- parse_counts(counts_file) + + check_matching_sets(report$marker_names, counts$marker_names, "report markers", "counts markers") + check_matching_sets(report$sample_names, counts$sample_names, "report samples", "counts samples") + validate_report_vs_counts(report, counts) + + marker_order <- report$marker_names + sample_order <- report$sample_names + + alleles_df <- counts$alleles_df[marker_order, , drop = FALSE] + marker_info <- report$marker_info[marker_order, , drop = FALSE] + coordinates <- derive_coordinates(marker_info) + alleles_df$Chrom <- coordinates$CHROM + alleles_df$ChromPos <- coordinates$POS + + ref_counts <- counts$ref_counts[marker_order, sample_order, drop = FALSE] + alt_counts <- counts$alt_counts[marker_order, sample_order, drop = FALSE] + ref_counts <- as_numeric_matrix(ref_counts, "Reference counts") + alt_counts <- as_numeric_matrix(alt_counts, "Alternate counts") + total_counts <- alt_counts + ref_counts - # Concatenate the elements - combined_long <- long_forms[[1]] - combined_long$Combined <- combined_long$Value + if (!identical(rownames(report$gt), rownames(ref_counts)) || + !identical(colnames(report$gt), colnames(ref_counts))) { + stop("Internal alignment error: genotype and count matrices are not identically ordered.") + } - for (i in 2:n) { - combined_long$Combined <- paste(combined_long$Combined, long_forms[[i]]$Value, sep = separator) - } + alleles_df$AltCountsSum <- rowSums(alt_counts) + alleles_df$RefCountsSum <- rowSums(ref_counts) + alleles_df$TotalCountSum <- alleles_df$AltCountsSum + alleles_df$RefCountsSum + + vcf_df <- data.frame( + CHROM = alleles_df$Chrom, + POS = alleles_df$ChromPos, + ID = alleles_df$MarkerName, + REF = alleles_df$REF, + ALT = alleles_df$ALT, + QUAL = ".", + FILTER = ".", + INFO = NA, + FORMAT = NA, + stringsAsFactors = FALSE + ) - # Convert back to wide form - combined_wide <- suppressMessages(reshape2::dcast(combined_long, Row ~ Col, value.var = "Combined")) + vcf_df$INFO <- paste0("DP=",alleles_df$TotalCountSum,";", + "ADS=",alleles_df$RefCountsSum,",",alleles_df$AltCountsSum) + vcf_df$FORMAT <- paste("GT","UD","DP","RA",sep=":") - # Restore row and column names - rownames(combined_wide) <- combined_wide$Row - combined_wide$Row <- NULL - colnames(combined_wide) <- colnames(matrices[[1]]) + message("Converting dosages to genotype format\n") - return(as.matrix(combined_wide)) + make_vcf_format <- function(gt_matrix, ud_matrix, dp_matrix, ra_matrix) { + matrix(paste(gt_matrix, ud_matrix, matrix_to_character(dp_matrix), matrix_to_character(ra_matrix), sep = ":"), + nrow = nrow(gt_matrix), ncol = ncol(gt_matrix), dimnames = dimnames(gt_matrix)) } message("Formatting VCF and generating output file\n") # Combine the matrices - geno_df <- make_vcf_format(geno_df, dosage, total_counts, ref_counts) + geno_df <- make_vcf_format(report$gt, report$ud, total_counts, ref_counts) #Combine the dataframes together vcf_df <- cbind(vcf_df,geno_df) @@ -278,7 +534,8 @@ dosage2vcf <- function(dart.report, dart.counts, ploidy, output.file) { write.table(vcf_df, file = output.file, sep = "\t", quote = FALSE, row.names = FALSE, col.names = TRUE, append = TRUE) ) #Unload all items from memory - rm(dosage) + rm(report) + rm(counts) rm(alt_counts) rm(ref_counts) rm(geno_df) diff --git a/man/dosage2vcf.Rd b/man/dosage2vcf.Rd index 0bb0600..b6a1978 100644 --- a/man/dosage2vcf.Rd +++ b/man/dosage2vcf.Rd @@ -2,12 +2,12 @@ % Please edit documentation in R/dosage2vcf.R \name{dosage2vcf} \alias{dosage2vcf} -\title{Convert DArTag Dosage and Counts to VCF} +\title{Convert DArTag genotype reports and counts to VCF} \usage{ dosage2vcf(dart.report, dart.counts, ploidy, output.file) } \arguments{ -\item{dart.report}{Path to the DArT dosage report .csv file. Typically contains "Dosage Report" in the file name.} +\item{dart.report}{Path to the DArT Allele Dose Report or SNP/INDEL report .csv file.} \item{dart.counts}{Path to the DArT counts .csv file. Typically contains "Counts" in the file name.} @@ -19,11 +19,15 @@ dosage2vcf(dart.report, dart.counts, ploidy, output.file) A vcf file } \description{ -This function will convert the DArT Dosage Report and Counts files to VCF format +This function will convert DArT genotype report and Counts files to VCF format } \details{ -This function will convert the Dosage Report and Counts files from DArT into a VCF file. +This function will convert Allele Dose Report or SNP/INDEL report files and Counts files from DArT into a VCF file. These two files are received directly from DArT for a given sequencing project. +SNP/INDEL one-row and two-row reports are treated as diploid genotype reports +with 0 = reference homozygote, 1 = alternate homozygote, 2 = heterozygote, +and - = missing. Allele Dose reports are interpreted as reference allele +dosages using the supplied ploidy. The output file will be saved to the location and with the name that is specified. The VCF format is v4.3 } diff --git a/tests/testthat/test-dosage2vcf.R b/tests/testthat/test-dosage2vcf.R index 18482c3..4ea57df 100644 --- a/tests/testthat/test-dosage2vcf.R +++ b/tests/testthat/test-dosage2vcf.R @@ -1,5 +1,68 @@ context("Dosage Report to VCF") +read_dosage2vcf_body <- function(output_file) { + vcf_lines <- readLines(paste0(output_file, ".vcf")) + body_lines <- vcf_lines[!grepl("^##", vcf_lines)] + read.table(text = paste(body_lines, collapse = "\n"), + header = TRUE, + sep = "\t", + quote = "", + comment.char = "", + check.names = FALSE, + stringsAsFactors = FALSE, + colClasses = "character") +} + +write_dart_counts_fixture <- function(path, sample_order = c("S2", "S1", "S3")) { + meta_cols <- c("MarkerName", "AlleleSequence", "Variant", "CallRate", + "OneRatioRef", "OneRatioSnp", "FreqHomRef", "FreqHomSnp", + "FreqHets", "PICRef", "PICSnp", "AvgPIC", "AvgCountRef", + "AvgCountSnp", "RatioAvgCountRefAvgCountSnp") + skipped <- rep(paste(rep("*", length(meta_cols) + length(sample_order)), collapse = ","), 6) + sample_header <- paste(c(rep("*", length(meta_cols)), sample_order), collapse = ",") + target_header <- paste(c(meta_cols, paste0("T", seq_along(sample_order))), collapse = ",") + fixed_values <- rep("1", length(meta_cols) - 3) + rows <- c( + paste(c("Chr01_000120735", "AAA", "", fixed_values, c("10", "20", "30")), collapse = ","), + paste(c("Chr01_000120735", "ATA", "-:A>T", fixed_values, c("1", "2", "3")), collapse = ","), + paste(c("Chr02_000000005", "GGG", "", fixed_values, c("40", "50", "60")), collapse = ","), + paste(c("Chr02_000000005", "GCG", "-:G>C", fixed_values, c("4", "5", "6")), collapse = ",") + ) + writeLines(c(skipped, sample_header, target_header, rows), path) +} + +write_snp_1row_fixture <- function(path, sample_order = c("S1", "S2", "S3")) { + meta_cols <- c("MarkerName", "AlleleSequenceRef", "AlleleSequenceAlt", "Variant", + "CallRate", "OneRatioRef", "OneRatioSnp", "FreqHomRef", + "FreqHomSnp", "FreqHets", "PICRef", "PICSnp", "AvgPIC", + "AvgCountRef", "AvgCountSnp", "RatioAvgCountRefAvgCountSnp") + skipped <- rep(paste(rep("*", length(meta_cols) + length(sample_order)), collapse = ","), 6) + header <- paste(c(meta_cols, sample_order), collapse = ",") + fixed_values <- rep("1", length(meta_cols) - 4) + rows <- c( + paste(c("Chr01_000120735", "AAA", "ATA", "-:A>T", fixed_values, c("0", "1", "2")), collapse = ","), + paste(c("Chr02_000000005", "GGG", "GCG", "-:G>C", fixed_values, c("-", "", "0")), collapse = ",") + ) + writeLines(c(skipped, header, rows), path) +} + +write_snp_2row_fixture <- function(path, sample_order = c("S1", "S2", "S3")) { + meta_cols <- c("MarkerName", "AlleleSequence", "Variant", "CallRate", + "OneRatioRef", "OneRatioSnp", "FreqHomRef", "FreqHomSnp", + "FreqHets", "PICRef", "PICSnp", "AvgPIC", "AvgCountRef", + "AvgCountSnp", "RatioAvgCountRefAvgCountSnp") + skipped <- rep(paste(rep("*", length(meta_cols) + length(sample_order)), collapse = ","), 6) + header <- paste(c(meta_cols, sample_order), collapse = ",") + fixed_values <- rep("1", length(meta_cols) - 3) + rows <- c( + paste(c("Chr01_000120735", "AAA", "", fixed_values, c("1", "0", "1")), collapse = ","), + paste(c("Chr01_000120735", "ATA", "-:A>T", fixed_values, c("0", "1", "1")), collapse = ","), + paste(c("Chr02_000000005", "GGG", "", fixed_values, c("-", "-", "1")), collapse = ","), + paste(c("Chr02_000000005", "GCG", "-:G>C", fixed_values, c("-", "-", "0")), collapse = ",") + ) + writeLines(c(skipped, header, rows), path) +} + test_that("test dosage report conversion",{ #Input variables @@ -31,3 +94,53 @@ test_that("test dosage report conversion",{ rm(output_file) }) + +test_that("SNP/INDEL 1 row reports use diploid genotype codes and align counts by sample name", { + report <- tempfile(fileext = ".csv") + counts <- tempfile(fileext = ".csv") + output_file <- tempfile() + write_snp_1row_fixture(report) + write_dart_counts_fixture(counts) + + dosage2vcf(dart.report = report, dart.counts = counts, ploidy = 2, output.file = output_file) + vcf <- read_dosage2vcf_body(output_file) + + expect_equal(colnames(vcf)[10:12], c("S1", "S2", "S3")) + expect_equal(vcf$`#CHROM`, c("Chr01", "Chr02")) + expect_equal(vcf$POS, c("000120735", "000000005")) + expect_equal(vcf$ID, c("Chr01_000120735", "Chr02_000000005")) + expect_equal(vcf$S1[1], "0/0:2:22:20") + expect_equal(vcf$S2[1], "1/1:0:11:10") + expect_equal(vcf$S3[1], "0/1:1:33:30") + expect_equal(vcf$S1[2], "./.:.:55:50") + expect_equal(vcf$S2[2], "./.:.:44:40") + expect_equal(vcf$S3[2], "0/0:2:66:60") +}) + +test_that("SNP/INDEL 2 row reports collapse to the same genotype codes as 1 row reports", { + snp_1row <- tempfile(fileext = ".csv") + snp_2row <- tempfile(fileext = ".csv") + counts <- tempfile(fileext = ".csv") + output_1row <- tempfile() + output_2row <- tempfile() + write_snp_1row_fixture(snp_1row) + write_snp_2row_fixture(snp_2row) + write_dart_counts_fixture(counts) + + dosage2vcf(dart.report = snp_1row, dart.counts = counts, ploidy = 2, output.file = output_1row) + dosage2vcf(dart.report = snp_2row, dart.counts = counts, ploidy = 2, output.file = output_2row) + + expect_equal(read_dosage2vcf_body(output_2row), read_dosage2vcf_body(output_1row)) +}) + +test_that("SNP/INDEL reports require diploid ploidy", { + report <- tempfile(fileext = ".csv") + counts <- tempfile(fileext = ".csv") + write_snp_1row_fixture(report) + write_dart_counts_fixture(counts) + + expect_error( + dosage2vcf(dart.report = report, dart.counts = counts, ploidy = 4, output.file = tempfile()), + "diploid genotype reports" + ) +}) From 7ae6e3d4e9fc3f1520d3950617b68eefd4b69cb9 Mon Sep 17 00:00:00 2001 From: Alexander Sandercock <39815775+alex-sandercock@users.noreply.github.com> Date: Sun, 17 May 2026 11:53:18 -0400 Subject: [PATCH 72/80] edit comments --- R/dosage2vcf.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/dosage2vcf.R b/R/dosage2vcf.R index fdc5ff7..0a15c48 100644 --- a/R/dosage2vcf.R +++ b/R/dosage2vcf.R @@ -69,7 +69,7 @@ dosage2vcf <- function(dart.report, dart.counts, ploidy, output.file) { '##FORMAT=' ) - ## Helper functions --------------------------------------------------------- + ## is_missing_value <- function(x) { x_chr <- trimws(as.character(x)) is.na(x) | x_chr %in% c("", "-", "NA") From 1f5a3f37a83c4e1e947caaf2b760f573fbc229eb Mon Sep 17 00:00:00 2001 From: Alexander Sandercock <39815775+alex-sandercock@users.noreply.github.com> Date: Sun, 17 May 2026 13:21:50 -0400 Subject: [PATCH 73/80] updated news --- NEWS.md | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/NEWS.md b/NEWS.md index 1b9559b..db4b76e 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,12 @@ # BIGr 0.7.0 +## Updates on `dosage2vcf` + +- Added support for DArT SNP/INDEL 1-row and 2-row report formats +- `dosage2vcf` now validates marker and sample sets between report and counts files, then aligns counts to the report order before writing VCF genotypes +- VCF `CHROM` and `POS` are derived from `Chrom`/`ChromPos` when present, otherwise from `MarkerName`; `MarkerName` is retained in the VCF `ID` field +- Missing SNP/INDEL genotype calls (`-`/`NA`) are written as diploid missing genotypes (`./.`) + ## New function `madc2vcf_multi` - New function `madc2vcf_multi` to convert a DArTag MADC file to a VCF using the polyRAD pipeline for multiallelic genotyping @@ -139,4 +146,3 @@ FixAlleleIDs | TRUE | TRUE | botloci or markers_info REF/ALT - updog2vcf function option to output compressed VCF (.vcf.gz) - set as default - remove need for defining ploidy - add metadata at the VCF header - From e63410e17f4e0d68ef646d9a039ced0bed238478 Mon Sep 17 00:00:00 2001 From: Alexander Sandercock <39815775+alex-sandercock@users.noreply.github.com> Date: Sun, 17 May 2026 14:06:31 -0400 Subject: [PATCH 74/80] reduce cran testing time --- .github/workflows/R-CMD-check.yaml | 2 +- tests/testthat/test-madc2vcf_all.R | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/R-CMD-check.yaml b/.github/workflows/R-CMD-check.yaml index 587188c..4ddf24c 100644 --- a/.github/workflows/R-CMD-check.yaml +++ b/.github/workflows/R-CMD-check.yaml @@ -26,6 +26,7 @@ jobs: env: GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} R_KEEP_PKG_SOURCE: yes + NOT_CRAN: true steps: - uses: actions/checkout@v3 @@ -62,4 +63,3 @@ jobs: token: ${{ secrets.CODECOV_TOKEN }} slug: Breeding-Insight/BIGr files: coverage.xml - diff --git a/tests/testthat/test-madc2vcf_all.R b/tests/testthat/test-madc2vcf_all.R index 1d6050f..9694f75 100644 --- a/tests/testthat/test-madc2vcf_all.R +++ b/tests/testthat/test-madc2vcf_all.R @@ -159,6 +159,7 @@ test_that("simu alfalfa",{ potato_markers_info_ChromPos <- paste0(github_path, "test_madcs/potato_marker_info_chrompos.csv") # markers_info: CloneID/BI_markerID, Chr, Pos potato_microhapDB <- paste0(github_path, "potato/potato_allele_db_v001.fa") + skip_on_cran() skip_if_offline("raw.githubusercontent.com") test_that("ALFALFA — clean fixed allele ID MADC", { From 54e106446232b2563072c034901019a40354102d Mon Sep 17 00:00:00 2001 From: Alexander Sandercock <39815775+alex-sandercock@users.noreply.github.com> Date: Sun, 17 May 2026 14:22:12 -0400 Subject: [PATCH 75/80] Added global items --- R/utils.R | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/R/utils.R b/R/utils.R index a30c6f6..41acdbc 100644 --- a/R/utils.R +++ b/R/utils.R @@ -7,7 +7,8 @@ utils::globalVariables(c( "ind", "ref", "row_name", "size", "snp", "CloneID", "Count", "qualifying_sites_count", "MarkerID", "SampleID", "Dosage", - "pos", "alt", "match_key" + "pos", "alt", "match_key", + ":=", ".SD", "Sex", "Male_Parent", "Female_Parent", "chr" )) #' Convert GT format to numeric dosage From 2480ef77d6f14fd15c274aaca50dea97028ac40f Mon Sep 17 00:00:00 2001 From: Alexander Sandercock <39815775+alex-sandercock@users.noreply.github.com> Date: Sun, 17 May 2026 14:49:25 -0400 Subject: [PATCH 76/80] updated check_ped for CRAN --- R/check_ped.R | 38 +++++++++++---------------------- dev/dev_history.R | 2 +- man/check_ped.Rd | 16 ++++++++------ tests/testthat/test-check_ped.R | 12 +++++++++-- 4 files changed, 32 insertions(+), 36 deletions(-) diff --git a/R/check_ped.R b/R/check_ped.R index 0ebbd5a..1359eba 100644 --- a/R/check_ped.R +++ b/R/check_ped.R @@ -12,13 +12,13 @@ #' #' After an initial run to clean exact duplicates and repeated IDs, you can run the function again to detect cycles more accurately. #' -#' The function does **not** overwrite the input file. Instead, it prints findings to the console and optionally saves: -#' * Corrected pedigree as a dataframe in the global environment -#' * A report listing all detected issues +#' The function does **not** overwrite the input file or create objects in the +#' global environment. Instead, it returns the report and corrected pedigree in +#' a list. #' #' @param ped.file Path to the pedigree text file. #' @param seed Optional seed for reproducibility. -#' @param verbose Logical. If TRUE (default), prints errors and prompts for interactive saving. +#' @param verbose Logical. If TRUE (default), prints the report to the console. #' #' @return A list of data.frames containing detected issues: #' * `exact_duplicates`: rows that were exact duplicates @@ -26,14 +26,18 @@ #' * `messy_parents`: IDs appearing as both sire and dam #' * `missing_parents`: parents added to the pedigree with 0 as sire/dam #' * `dependencies`: detected cycles in the pedigree +#' * `corrected_pedigree`: corrected pedigree table #' #' @examples #' ped_file <- system.file("check_ped_test.txt", package = "BIGr") -#' ped_errors <- check_ped(ped.file = ped_file, seed = 101919) +#' ped_errors <- check_ped(ped.file = ped_file, seed = 101919, verbose = FALSE) #' #' # Access messy parents #' ped_errors$messy_parents #' +#' # Access corrected pedigree +#' ped_errors$corrected_pedigree +#' #' # IDs with messy parents #' messy_ids <- ped_errors$messy_parents$id #' print(messy_ids) @@ -179,14 +183,10 @@ check_ped <- function(ped.file, seed = NULL, verbose = TRUE) { repeated_ids_diff = repeated_ids_report, messy_parents = messy_parents, missing_parents = missing_parents, - dependencies = data.frame(Dependency = unique(unlist(errors))) + dependencies = data.frame(Dependency = unique(unlist(errors))), + corrected_pedigree = data ) - #### file names #### - file_base <- tools::file_path_sans_ext(basename(ped.file)) - corrected_name <- paste0(file_base, "_corrected") - report_name <- paste0(file_base, "_report") - #### output #### if (verbose) { cat("\n=== Pedigree Quality Check Report ===\n") @@ -217,21 +217,7 @@ check_ped <- function(ped.file, seed = NULL, verbose = TRUE) { print(input_ped_report$dependencies) } else cat("\nNo dependencies detected.\n") - #### interactive save #### - cat(paste0("\nDo you want to save the corrected pedigree as dataframe `", corrected_name, "`? (y/n): ")) - ans <- tolower(trimws(readline())) - if (ans == "y") { - assign(corrected_name, data, envir = .GlobalEnv) - assign("input_ped_report", input_ped_report, envir = .GlobalEnv) - cat(paste0("Saved corrected pedigree as `", corrected_name, "` and report as `input_ped_report`.\n")) - } else { - cat("No corrected pedigree was saved.\n") - } - - } else { - # Silent automatic mode - assign(corrected_name, data, envir = .GlobalEnv) - assign(report_name, input_ped_report, envir = .GlobalEnv) + cat("\nThe corrected pedigree is included in the returned list as `corrected_pedigree`.\n") } invisible(input_ped_report) diff --git a/dev/dev_history.R b/dev/dev_history.R index ec7f969..34e9324 100644 --- a/dev/dev_history.R +++ b/dev/dev_history.R @@ -2,7 +2,7 @@ # Update dependencies in DESCRIPTION # install.packages('attachment', repos = 'https://thinkr-open.r-universe.dev') -attachment::att_amend_desc() +#attachment::att_amend_desc() # Check package coverage covr::package_coverage() diff --git a/man/check_ped.Rd b/man/check_ped.Rd index ea63de7..9a12a07 100644 --- a/man/check_ped.Rd +++ b/man/check_ped.Rd @@ -11,7 +11,7 @@ check_ped(ped.file, seed = NULL, verbose = TRUE) \item{seed}{Optional seed for reproducibility.} -\item{verbose}{Logical. If TRUE (default), prints errors and prompts for interactive saving.} +\item{verbose}{Logical. If TRUE (default), prints the report to the console.} } \value{ A list of data.frames containing detected issues: @@ -21,6 +21,7 @@ A list of data.frames containing detected issues: \item \code{messy_parents}: IDs appearing as both sire and dam \item \code{missing_parents}: parents added to the pedigree with 0 as sire/dam \item \code{dependencies}: detected cycles in the pedigree +\item \code{corrected_pedigree}: corrected pedigree table } } \description{ @@ -39,19 +40,20 @@ The function checks for: After an initial run to clean exact duplicates and repeated IDs, you can run the function again to detect cycles more accurately. -The function does \strong{not} overwrite the input file. Instead, it prints findings to the console and optionally saves: -\itemize{ -\item Corrected pedigree as a dataframe in the global environment -\item A report listing all detected issues -} +The function does \strong{not} overwrite the input file or create objects in the +global environment. Instead, it returns the report and corrected pedigree in +a list. } \examples{ ped_file <- system.file("check_ped_test.txt", package = "BIGr") -ped_errors <- check_ped(ped.file = ped_file, seed = 101919) +ped_errors <- check_ped(ped.file = ped_file, seed = 101919, verbose = FALSE) # Access messy parents ped_errors$messy_parents +# Access corrected pedigree +ped_errors$corrected_pedigree + # IDs with messy parents messy_ids <- ped_errors$messy_parents$id print(messy_ids) diff --git a/tests/testthat/test-check_ped.R b/tests/testthat/test-check_ped.R index 706143f..887eb75 100644 --- a/tests/testthat/test-check_ped.R +++ b/tests/testthat/test-check_ped.R @@ -4,9 +4,11 @@ context("Imputation Concordance") test_that("test imputation",{ #Input variables ped_file <- system.file("check_ped_test.txt", package="BIGr") + temp_ped <- tempfile("check_ped_test_", fileext = ".txt") + file.copy(ped_file, temp_ped) #Calculations - output.list <- check_ped(ped_file, + output.list <- check_ped(temp_ped, seed = 101919, verbose = FALSE) @@ -14,9 +16,15 @@ test_that("test imputation",{ df_length <- length(output.list) messy_parents <- output.list$messy_parents missing_parents <- output.list$missing_parents + corrected_pedigree <- output.list$corrected_pedigree + file_base <- tools::file_path_sans_ext(basename(temp_ped)) - expect_true(df_length == 5) # Before was 4 + expect_true(df_length == 6) expect_true(all(messy_parents$id == c("grandfather2","grandfather3"))) expect_true(nrow(missing_parents) == 13) + expect_s3_class(corrected_pedigree, "data.frame") + expect_true(all(missing_parents$id %in% corrected_pedigree$id)) + expect_false(exists(paste0(file_base, "_corrected"), envir = .GlobalEnv, inherits = FALSE)) + expect_false(exists(paste0(file_base, "_report"), envir = .GlobalEnv, inherits = FALSE)) }) From 44fe4f72738d3cbe22d4d126d4ecbb6ba8d3c1d5 Mon Sep 17 00:00:00 2001 From: Alexander Sandercock <39815775+alex-sandercock@users.noreply.github.com> Date: Sun, 17 May 2026 15:08:14 -0400 Subject: [PATCH 77/80] minor version update --- DESCRIPTION | 2 +- NEWS.md | 5 +++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index 0524f2f..994eb81 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: BIGr Title: Breeding Insight Genomics Functions for Polyploid and Diploid Species -Version: 0.7.0 +Version: 0.7.1 Authors@R: c(person(given='Alexander M.', family='Sandercock', email='sandercock.alex@gmail.com', diff --git a/NEWS.md b/NEWS.md index db4b76e..fb8c105 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,8 @@ +# BIGr 0.7.1 + +- Updated `check_ped()` to return corrected pedigree data in the result list instead of assigning objects to the global environment +- Skipped long remote `madc2vcf_all` integration tests on CRAN while keeping them enabled in GitHub Actions + # BIGr 0.7.0 ## Updates on `dosage2vcf` From cd23487fc30b089fa19ed73e352defbfd4093f6e Mon Sep 17 00:00:00 2001 From: Alexander Sandercock <39815775+alex-sandercock@users.noreply.github.com> Date: Sun, 17 May 2026 15:26:27 -0400 Subject: [PATCH 78/80] update CRAN comments --- cran-comments.md | 5 +++++ 1 file changed, 5 insertions(+) create mode 100644 cran-comments.md diff --git a/cran-comments.md b/cran-comments.md new file mode 100644 index 0000000..b5fed72 --- /dev/null +++ b/cran-comments.md @@ -0,0 +1,5 @@ +## R CMD check results + +0 errors | 0 warnings | 0 note + +* This is an update release. From 0fd7d9c081cfb341c56dc58de4d77d283d7ce726 Mon Sep 17 00:00:00 2001 From: Alexander Sandercock <39815775+alex-sandercock@users.noreply.github.com> Date: Sun, 17 May 2026 19:04:12 -0400 Subject: [PATCH 79/80] CRAN fix --- DESCRIPTION | 2 +- NEWS.md | 4 ++++ R/find_parentage.R | 12 ++++++------ R/get_countsMADC.R | 8 ++++---- R/madc2vcf_targets.R | 38 +++++++++++++++++++------------------- R/validate_pedigree.R | 6 +++--- cran-comments.md | 6 ++++++ man/find_parentage.Rd | 8 ++++---- man/get_countsMADC.Rd | 4 ++-- man/madc2vcf_targets.Rd | 32 ++++++++++++++++---------------- 10 files changed, 65 insertions(+), 55 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 994eb81..7b0280d 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: BIGr Title: Breeding Insight Genomics Functions for Polyploid and Diploid Species -Version: 0.7.1 +Version: 0.7.2 Authors@R: c(person(given='Alexander M.', family='Sandercock', email='sandercock.alex@gmail.com', diff --git a/NEWS.md b/NEWS.md index fb8c105..309fc13 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,7 @@ +# BIGr 0.7.2 + +- Fixed manual text errors + # BIGr 0.7.1 - Updated `check_ped()` to return corrected pedigree data in the result list instead of assigning objects to the global environment diff --git a/R/find_parentage.R b/R/find_parentage.R index 728822a..b220a46 100644 --- a/R/find_parentage.R +++ b/R/find_parentage.R @@ -15,13 +15,13 @@ #' Must include an 'ID' column. #' @param method Character. Parentage assignment method. One of: #' \itemize{ -#' \item \code{"best_male_parent"} — finds the best male parent for each +#' \item \code{"best_male_parent"} - finds the best male parent for each #' progeny using homozygous mismatch rate. -#' \item \code{"best_female_parent"} — finds the best female parent for each +#' \item \code{"best_female_parent"} - finds the best female parent for each #' progeny using homozygous mismatch rate. -#' \item \code{"best_match"} — finds the single best parent (either sex) +#' \item \code{"best_match"} - finds the single best parent (either sex) #' using homozygous mismatch rate. -#' \item \code{"best_pair"} — finds the best male-female parent pair for +#' \item \code{"best_pair"} - finds the best male-female parent pair for #' each progeny using full Mendelian error rate (default). #' } #' @param min_markers Integer. Minimum number of non-missing markers required @@ -235,7 +235,7 @@ find_parentage <- function(genotypes_file, parents_file, progeny_file, best_idx <- base::which.min(percent_mismatch) - # No candidate found — flag LOW_MARKERS and continue + # No candidate found - flag LOW_MARKERS and continue if (base::length(best_idx) == 0) { data.table::set(results_dt, i, "Markers_Tested", 0L) data.table::set(results_dt, i, "Assignment_Status", "LOW_MARKERS") @@ -327,7 +327,7 @@ find_parentage <- function(genotypes_file, parents_file, progeny_file, comparisons <- comparison_mat[, j] min_mismatch_val <- base::min(percent_mismatch, na.rm = TRUE) - # No markers overlap at all — flag LOW_MARKERS + # No markers overlap at all - flag LOW_MARKERS if (base::is.infinite(min_mismatch_val)) { data.table::set(results_dt, j, "Markers_Tested", 0L) data.table::set(results_dt, j, "Assignment_Status", "LOW_MARKERS") diff --git a/R/get_countsMADC.R b/R/get_countsMADC.R index 57b83ce..c1de03d 100644 --- a/R/get_countsMADC.R +++ b/R/get_countsMADC.R @@ -9,8 +9,8 @@ #' Either `madc_file` or `madc_object` must be provided (not both `NULL`). #' When `madc_object` is supplied it is passed directly to `get_counts()`, #' skipping file I/O. The function constructs: -#' - `ref_matrix` — per-sample reference allele counts. -#' - `size_matrix` — per-sample total counts (ref + alt). +#' - `ref_matrix` - per-sample reference allele counts. +#' - `size_matrix` - per-sample total counts (ref + alt). #' #' Markers whose `CloneID` appears only in the `Ref` or only in the `Alt` rows #' are removed with a warning. A summary of the proportion of zero-count @@ -152,9 +152,9 @@ get_countsMADC <- function(madc_file = NULL, madc_object = NULL, collapse_matche #' #' **Format detection** (applied to file or object alike): the first seven rows #' of the first column are inspected: -#' - **Standard format**: all entries are blank or `"*"` — the first 7 rows are +#' - **Standard format**: all entries are blank or `"*"` - the first 7 rows are #' treated as DArT placeholder rows and skipped. -#' - **Fixed-allele-ID format**: no filler rows — data are used as-is. +#' - **Fixed-allele-ID format**: no filler rows - data are used as-is. #' #' **`|AltMatch` / `|RefMatch` handling** (controlled by `collapse_matches_counts`): #' - `FALSE` (default): these rows are simply discarded. diff --git a/R/madc2vcf_targets.R b/R/madc2vcf_targets.R index d75a17f..81e0643 100644 --- a/R/madc2vcf_targets.R +++ b/R/madc2vcf_targets.R @@ -4,7 +4,7 @@ #' #' @description #' Parses a DArTag **MADC** report and writes a **VCF v4.3** containing per-target -#' read counts for the panel’s target loci. This is useful because MADC is not +#' read counts for the panel's target loci. This is useful because MADC is not #' widely supported by general-purpose tools, while VCF is. #' #' @details @@ -13,7 +13,7 @@ #' presence, fixed allele IDs, IUPAC/ambiguous bases, lowercase bases, indels, #' chromosome/position format, all-NA rows/columns, Ref/Alt sequence presence). #' - Extracts reference and total read counts per sample and target. -#' - Derives `AD` (ref,alt) by subtraction (alt = total − ref). +#' - Derives `AD` (ref,alt) by subtraction (alt = total - ref). #' - If `get_REF_ALT = TRUE`, recovers REF/ALT bases either from `markers_info` #' (when `Ref`/`Alt` columns are present) or by comparing the Ref/Alt probe #' sequences in the MADC file (with strand correction via `botloci_file`). @@ -23,12 +23,12 @@ #' #' **Output VCF layout** #' - `INFO` fields: -#' * `DP` — total depth across all samples for the locus -#' * `ADS` — total counts across samples in the order `ref,alt` +#' * `DP` - total depth across all samples for the locus +#' * `ADS` - total counts across samples in the order `ref,alt` #' - `FORMAT` fields (per sample): -#' * `DP` — total reads (ref + alt) -#' * `RA` — reads supporting the reference allele -#' * `AD` — `"ref,alt"` counts +#' * `DP` - total reads (ref + alt) +#' * `RA` - reads supporting the reference allele +#' * `AD` - `"ref,alt"` counts #' #' **Strand handling** #' If a target ID appears in `botloci_file`, its probe sequences are reverse- @@ -46,21 +46,21 @@ #' | Check | Status | `get_REF_ALT` | Required | #' |---|---|---|---| #' | **IUPAC codes** | detected | `TRUE` | `markers_info` with `Ref`/`Alt` | -#' | | detected | `FALSE` | — | +#' | | detected | `FALSE` | none | #' | | not detected | `TRUE` | `botloci_file` **or** `markers_info` with `Ref`/`Alt` | -#' | | not detected | `FALSE` | — | +#' | | not detected | `FALSE` | none | #' | **Indels** | detected | `TRUE` | `markers_info` with `Ref`/`Alt` | -#' | | detected | `FALSE` | — | +#' | | detected | `FALSE` | none | #' | | not detected | `TRUE` | `botloci_file` **or** `markers_info` with `Ref`/`Alt` | -#' | | not detected | `FALSE` | — | +#' | | not detected | `FALSE` | none | #' | **ChromPos** | valid | `TRUE` | `botloci_file` **or** `markers_info` with `Ref`/`Alt` | -#' | | valid | `FALSE` | — | +#' | | valid | `FALSE` | none | #' | | invalid | `TRUE` | `markers_info` with `Chr`/`Pos`/`Ref`/`Alt` **or** `markers_info` with `Chr`/`Pos` + `botloci_file` | #' | | invalid | `FALSE` | `markers_info` with `Chr`/`Pos` | #' | **FixAlleleIDs** | fixed | `TRUE` | `botloci_file` **or** `markers_info` with `Ref`/`Alt` | -#' | | fixed | `FALSE` | — | +#' | | fixed | `FALSE` | none | #' | | not fixed | `TRUE` | `markers_info` with `Ref`/`Alt` | -#' | | not fixed | `FALSE` | — | +#' | | not fixed | `FALSE` | none | #' #' @param madc_file character. Path to the input MADC CSV file. #' @param output.file character. Path to the output VCF file to write. @@ -75,8 +75,8 @@ #' metadata. Matching is done by column name, not column position. Accepted columns: #' - one marker identifier column named `CloneID`, `Marker_ID`, or `BI_markerID` #' (required; a generic `ID` column is not accepted); -#' - `Chr`, `Pos` — required when `CloneID` does not follow the `Chr_Pos` format; -#' - `Ref`, `Alt` — required when `get_REF_ALT = TRUE` and probe-sequence +#' - `Chr`, `Pos` - required when `CloneID` does not follow the `Chr_Pos` format; +#' - `Ref`, `Alt` - required when `get_REF_ALT = TRUE` and probe-sequence #' inference is not possible (IUPAC codes, indels, or unfixed allele IDs). When #' `get_REF_ALT = TRUE`, `botloci_file` is still required unless `Ref` and `Alt` #' are supplied here. @@ -256,7 +256,7 @@ madc2vcf_targets <- function(madc_file, if(get_REF_ALT) { if(mi_has_ref_alt) { - # markers_info supplies REF and ALT — no botloci required + # markers_info supplies REF and ALT - no botloci required vmsg("markers_info contains Ref and Alt columns. REF and ALT will be taken from markers_info.", verbose = verbose, level = 1, type = ">>") @@ -264,7 +264,7 @@ madc2vcf_targets <- function(madc_file, if(checks$checks["Indels"]) stop("Indels detected in MADC file. Since get_REF_ALT = TRUE, a markers_info file with REF/ALT information is required.") - # REF/ALT must be extracted from probe sequences — botloci is required + # REF/ALT must be extracted from probe sequences - botloci is required if(is.null(botloci_file) || (!file.exists(botloci_file) && !url_exists(botloci_file))) stop("get_REF_ALT = TRUE but no markers_info file with Ref and Alt columns was provided neither a botloci_file to extrat REF/ALT from probe sequences. Please provide one of the these files or set get_REF_ALT to FALSE.") @@ -454,7 +454,7 @@ madc2vcf_targets <- function(madc_file, } } else { - # ── get_REF_ALT = FALSE, no markers_info ───────────────────────── + # get_REF_ALT = FALSE, no markers_info ref_base <- "." alt_base <- "." vmsg("REF and ALT not recovered (get_REF_ALT = FALSE).", verbose = verbose, level = 1, type = ">>") diff --git a/R/validate_pedigree.R b/R/validate_pedigree.R index 8b2ecbc..dfcaf17 100644 --- a/R/validate_pedigree.R +++ b/R/validate_pedigree.R @@ -166,7 +166,7 @@ validate_pedigree <- function(pedigree_file, genotypes_file, best_female_parent <- NA_character_ best_female_parent_pct <- NA_real_ - ## Founder check — both parents "0" and ID in founders list + ## Founder check - both parents "0" and ID in founders list if (male_parent_id == "0" && female_parent_id == "0" && prog_id %in% founder_ids) { status <- "FOUNDERS" @@ -174,7 +174,7 @@ validate_pedigree <- function(pedigree_file, genotypes_file, } else { - ## Missing parent(s) — recommendations only, "0"s preserved in pedigree + ## Missing parent(s) - recommendations only, "0"s preserved in pedigree if (male_parent_id == "0" && female_parent_id == "0") { status <- "MISSING_BOTH_PARENTS" correction_decision <- "NONE" @@ -205,7 +205,7 @@ validate_pedigree <- function(pedigree_file, genotypes_file, } else { - ## Both parents present — Mendelian error calculation + ## Both parents present - Mendelian error calculation progeny_vec <- genos_mat[prog_id, ] male_parent_vec <- genos_mat[male_parent_id, ] female_parent_vec <- genos_mat[female_parent_id, ] diff --git a/cran-comments.md b/cran-comments.md index b5fed72..f09be5d 100644 --- a/cran-comments.md +++ b/cran-comments.md @@ -2,4 +2,10 @@ 0 errors | 0 warnings | 0 note +## Resubmission + +This is a resubmission. I replaced Unicode punctuation in Rd-generated +documentation, including the Unicode minus sign (U+2212) that caused the CRAN +PDF manual build failure. + * This is an update release. diff --git a/man/find_parentage.Rd b/man/find_parentage.Rd index 706a15f..77b9c3c 100644 --- a/man/find_parentage.Rd +++ b/man/find_parentage.Rd @@ -33,13 +33,13 @@ Must include an 'ID' column.} \item{method}{Character. Parentage assignment method. One of: \itemize{ -\item \code{"best_male_parent"} — finds the best male parent for each +\item \code{"best_male_parent"} - finds the best male parent for each progeny using homozygous mismatch rate. -\item \code{"best_female_parent"} — finds the best female parent for each +\item \code{"best_female_parent"} - finds the best female parent for each progeny using homozygous mismatch rate. -\item \code{"best_match"} — finds the single best parent (either sex) +\item \code{"best_match"} - finds the single best parent (either sex) using homozygous mismatch rate. -\item \code{"best_pair"} — finds the best male-female parent pair for +\item \code{"best_pair"} - finds the best male-female parent pair for each progeny using full Mendelian error rate (default). }} diff --git a/man/get_countsMADC.Rd b/man/get_countsMADC.Rd index 207b899..393ddcb 100644 --- a/man/get_countsMADC.Rd +++ b/man/get_countsMADC.Rd @@ -44,8 +44,8 @@ Either \code{madc_file} or \code{madc_object} must be provided (not both \code{N When \code{madc_object} is supplied it is passed directly to \code{get_counts()}, skipping file I/O. The function constructs: \itemize{ -\item \code{ref_matrix} — per-sample reference allele counts. -\item \code{size_matrix} — per-sample total counts (ref + alt). +\item \code{ref_matrix} - per-sample reference allele counts. +\item \code{size_matrix} - per-sample total counts (ref + alt). } Markers whose \code{CloneID} appears only in the \code{Ref} or only in the \code{Alt} rows diff --git a/man/madc2vcf_targets.Rd b/man/madc2vcf_targets.Rd index 8cab155..3ed4aaf 100644 --- a/man/madc2vcf_targets.Rd +++ b/man/madc2vcf_targets.Rd @@ -32,8 +32,8 @@ metadata. Matching is done by column name, not column position. Accepted columns \itemize{ \item one marker identifier column named \code{CloneID}, \code{Marker_ID}, or \code{BI_markerID} (required; a generic \code{ID} column is not accepted); -\item \code{Chr}, \code{Pos} — required when \code{CloneID} does not follow the \code{Chr_Pos} format; -\item \code{Ref}, \code{Alt} — required when \code{get_REF_ALT = TRUE} and probe-sequence +\item \code{Chr}, \code{Pos} - required when \code{CloneID} does not follow the \code{Chr_Pos} format; +\item \code{Ref}, \code{Alt} - required when \code{get_REF_ALT = TRUE} and probe-sequence inference is not possible (IUPAC codes, indels, or unfixed allele IDs). When \code{get_REF_ALT = TRUE}, \code{botloci_file} is still required unless \code{Ref} and \code{Alt} are supplied here. @@ -61,7 +61,7 @@ samples in the MADC file. } \description{ Parses a DArTag \strong{MADC} report and writes a \strong{VCF v4.3} containing per-target -read counts for the panel’s target loci. This is useful because MADC is not +read counts for the panel's target loci. This is useful because MADC is not widely supported by general-purpose tools, while VCF is. } \details{ @@ -73,7 +73,7 @@ Convert DArTag MADC target read counts to a VCF presence, fixed allele IDs, IUPAC/ambiguous bases, lowercase bases, indels, chromosome/position format, all-NA rows/columns, Ref/Alt sequence presence). \item Extracts reference and total read counts per sample and target. -\item Derives \code{AD} (ref,alt) by subtraction (alt = total − ref). +\item Derives \code{AD} (ref,alt) by subtraction (alt = total - ref). \item If \code{get_REF_ALT = TRUE}, recovers REF/ALT bases either from \code{markers_info} (when \code{Ref}/\code{Alt} columns are present) or by comparing the Ref/Alt probe sequences in the MADC file (with strand correction via \code{botloci_file}). @@ -86,14 +86,14 @@ Targets with >1 polymorphism between sequences are discarded. \itemize{ \item \code{INFO} fields: \itemize{ -\item \code{DP} — total depth across all samples for the locus -\item \code{ADS} — total counts across samples in the order \verb{ref,alt} +\item \code{DP} - total depth across all samples for the locus +\item \code{ADS} - total counts across samples in the order \verb{ref,alt} } \item \code{FORMAT} fields (per sample): \itemize{ -\item \code{DP} — total reads (ref + alt) -\item \code{RA} — reads supporting the reference allele -\item \code{AD} — \code{"ref,alt"} counts +\item \code{DP} - total reads (ref + alt) +\item \code{RA} - reads supporting the reference allele +\item \code{AD} - \code{"ref,alt"} counts } } @@ -111,21 +111,21 @@ For the remaining checks the required inputs depend on the combination of check result and \code{get_REF_ALT}:\tabular{llll}{ Check \tab Status \tab \code{get_REF_ALT} \tab Required \cr \strong{IUPAC codes} \tab detected \tab \code{TRUE} \tab \code{markers_info} with \code{Ref}/\code{Alt} \cr - \tab detected \tab \code{FALSE} \tab — \cr + \tab detected \tab \code{FALSE} \tab none \cr \tab not detected \tab \code{TRUE} \tab \code{botloci_file} \strong{or} \code{markers_info} with \code{Ref}/\code{Alt} \cr - \tab not detected \tab \code{FALSE} \tab — \cr + \tab not detected \tab \code{FALSE} \tab none \cr \strong{Indels} \tab detected \tab \code{TRUE} \tab \code{markers_info} with \code{Ref}/\code{Alt} \cr - \tab detected \tab \code{FALSE} \tab — \cr + \tab detected \tab \code{FALSE} \tab none \cr \tab not detected \tab \code{TRUE} \tab \code{botloci_file} \strong{or} \code{markers_info} with \code{Ref}/\code{Alt} \cr - \tab not detected \tab \code{FALSE} \tab — \cr + \tab not detected \tab \code{FALSE} \tab none \cr \strong{ChromPos} \tab valid \tab \code{TRUE} \tab \code{botloci_file} \strong{or} \code{markers_info} with \code{Ref}/\code{Alt} \cr - \tab valid \tab \code{FALSE} \tab — \cr + \tab valid \tab \code{FALSE} \tab none \cr \tab invalid \tab \code{TRUE} \tab \code{markers_info} with \code{Chr}/\code{Pos}/\code{Ref}/\code{Alt} \strong{or} \code{markers_info} with \code{Chr}/\code{Pos} + \code{botloci_file} \cr \tab invalid \tab \code{FALSE} \tab \code{markers_info} with \code{Chr}/\code{Pos} \cr \strong{FixAlleleIDs} \tab fixed \tab \code{TRUE} \tab \code{botloci_file} \strong{or} \code{markers_info} with \code{Ref}/\code{Alt} \cr - \tab fixed \tab \code{FALSE} \tab — \cr + \tab fixed \tab \code{FALSE} \tab none \cr \tab not fixed \tab \code{TRUE} \tab \code{markers_info} with \code{Ref}/\code{Alt} \cr - \tab not fixed \tab \code{FALSE} \tab — \cr + \tab not fixed \tab \code{FALSE} \tab none \cr } } \section{Dependencies}{ From f2c847b1d29c63355dd0631308ac7d3ea8b77499 Mon Sep 17 00:00:00 2001 From: Alexander Sandercock <39815775+alex-sandercock@users.noreply.github.com> Date: Mon, 18 May 2026 07:55:01 -0400 Subject: [PATCH 80/80] cran submit --- CRAN-SUBMISSION | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 CRAN-SUBMISSION diff --git a/CRAN-SUBMISSION b/CRAN-SUBMISSION new file mode 100644 index 0000000..224e295 --- /dev/null +++ b/CRAN-SUBMISSION @@ -0,0 +1,3 @@ +Version: 0.7.2 +Date: 2026-05-17 23:05:53 UTC +SHA: 0fd7d9c081cfb341c56dc58de4d77d283d7ce726