diff --git a/.github/workflows/R-CMD-check.yaml b/.github/workflows/R-CMD-check.yaml index 969e02d..4ddf24c 100644 --- a/.github/workflows/R-CMD-check.yaml +++ b/.github/workflows/R-CMD-check.yaml @@ -26,6 +26,7 @@ jobs: env: GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} R_KEEP_PKG_SOURCE: yes + NOT_CRAN: true steps: - uses: actions/checkout@v3 @@ -43,7 +44,12 @@ jobs: extra-packages: | any::rcmdcheck any::covr + any::polyRAD needs: check + + - name: Install VariantAnnotation (no Suggests) + run: pak::pkg_install("bioc::VariantAnnotation", dependencies = c("Depends", "Imports", "LinkingTo")) + shell: Rscript {0} - uses: r-lib/actions/check-r-package@v2 - name: Generate test coverage report @@ -57,4 +63,3 @@ jobs: token: ${{ secrets.CODECOV_TOKEN }} slug: Breeding-Insight/BIGr files: coverage.xml - diff --git a/.gitignore b/.gitignore index b64a99f..d3ffaad 100644 --- a/.gitignore +++ b/.gitignore @@ -3,3 +3,4 @@ .RData .Ruserdata revdep/ +.DS_Store diff --git a/BIGr.Rproj b/BIGr.Rproj index 69fafd4..5638e2e 100644 --- a/BIGr.Rproj +++ b/BIGr.Rproj @@ -1,4 +1,5 @@ Version: 1.0 +ProjectId: 0eeaab63-2615-4da7-b10a-927160fc78a3 RestoreWorkspace: No SaveWorkspace: No diff --git a/CRAN-SUBMISSION b/CRAN-SUBMISSION index ae8365d..224e295 100644 --- a/CRAN-SUBMISSION +++ b/CRAN-SUBMISSION @@ -1,3 +1,3 @@ -Version: 0.6.2 -Date: 2025-09-18 12:16:11 UTC -SHA: 142dc9524d88b47db88ddca2aa39cd729a8d5a0d +Version: 0.7.2 +Date: 2026-05-17 23:05:53 UTC +SHA: 0fd7d9c081cfb341c56dc58de4d77d283d7ce726 diff --git a/DESCRIPTION b/DESCRIPTION index 31ad1e1..7b0280d 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: BIGr Title: Breeding Insight Genomics Functions for Polyploid and Diploid Species -Version: 0.6.2 +Version: 0.7.2 Authors@R: c(person(given='Alexander M.', family='Sandercock', email='sandercock.alex@gmail.com', @@ -23,7 +23,7 @@ Authors@R: c(person(given='Alexander M.', person(given='Dongyan', family='Zhao', role='ctb'), - person('Cornell', 'University', + person('University', "of Florida", role=c('cph'), comment = "Breeding Insight")) Maintainer: Alexander M. Sandercock @@ -44,7 +44,7 @@ URL: https://github.com/Breeding-Insight/BIGr BugReports: https://github.com/Breeding-Insight/BIGr/issues Encoding: UTF-8 Roxygen: list(markdown = TRUE) -RoxygenNote: 7.3.2 +RoxygenNote: 7.3.3 Depends: R (>= 4.4.0) biocViews: Imports: @@ -62,12 +62,15 @@ Imports: janitor, quadprog, tibble, - stringr + stringr, + data.table Suggests: covr, + ggplot2, spelling, rmdformats, knitr (>= 1.10), rmarkdown, + polyRAD, testthat (>= 3.0.0) RdMacros: Rdpack diff --git a/NAMESPACE b/NAMESPACE index d47d95d..4b5cb05 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -4,27 +4,30 @@ export(allele_freq_poly) export(calculate_Het) export(calculate_MAF) export(check_homozygous_trios) +export(check_madc_sanity) export(check_ped) export(check_replicates) export(dosage2vcf) export(dosage_ratios) export(filterMADC) export(filterVCF) +export(find_parentage) export(flip_dosage) export(get_countsMADC) export(imputation_concordance) export(madc2gmat) export(madc2vcf_all) +export(madc2vcf_multi) export(madc2vcf_targets) export(merge_MADCs) export(solve_composition_poly) export(thinSNP) export(updog2vcf) +export(validate_pedigree) import(dplyr) import(janitor) import(parallel) import(quadprog) -import(rlang) import(stringr) import(tibble) import(tidyr) @@ -33,13 +36,36 @@ importFrom(Biostrings,DNAString) importFrom(Biostrings,reverseComplement) importFrom(Rdpack,reprompt) importFrom(Rsamtools,bgzip) +importFrom(data.table,CJ) +importFrom(data.table,as.data.table) +importFrom(data.table,copy) +importFrom(data.table,data.table) +importFrom(data.table,fread) +importFrom(data.table,fwrite) +importFrom(data.table,rbindlist) +importFrom(data.table,set) +importFrom(dplyr,"%>%") +importFrom(dplyr,across) +importFrom(dplyr,arrange) +importFrom(dplyr,case_when) +importFrom(dplyr,filter) +importFrom(dplyr,group_by) +importFrom(dplyr,group_modify) +importFrom(dplyr,mutate) +importFrom(dplyr,select) +importFrom(dplyr,summarise) +importFrom(dplyr,ungroup) +importFrom(dplyr,where) importFrom(pwalign,nucleotideSubstitutionMatrix) importFrom(pwalign,pairwiseAlignment) importFrom(readr,read_csv) importFrom(reshape2,dcast) importFrom(reshape2,melt) +importFrom(rlang,sym) importFrom(stats,cor) +importFrom(stats,reorder) importFrom(stats,setNames) +importFrom(tibble,as_tibble) importFrom(utils,packageVersion) importFrom(utils,read.csv) importFrom(utils,read.table) diff --git a/NEWS.md b/NEWS.md index b089e67..309fc13 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,107 @@ +# BIGr 0.7.2 + +- Fixed manual text errors + +# BIGr 0.7.1 + +- Updated `check_ped()` to return corrected pedigree data in the result list instead of assigning objects to the global environment +- Skipped long remote `madc2vcf_all` integration tests on CRAN while keeping them enabled in GitHub Actions + +# BIGr 0.7.0 + +## Updates on `dosage2vcf` + +- Added support for DArT SNP/INDEL 1-row and 2-row report formats +- `dosage2vcf` now validates marker and sample sets between report and counts files, then aligns counts to the report order before writing VCF genotypes +- VCF `CHROM` and `POS` are derived from `Chrom`/`ChromPos` when present, otherwise from `MarkerName`; `MarkerName` is retained in the VCF `ID` field +- Missing SNP/INDEL genotype calls (`-`/`NA`) are written as diploid missing genotypes (`./.`) + +## New function `madc2vcf_multi` + +- New function `madc2vcf_multi` to convert a DArTag MADC file to a VCF using the polyRAD pipeline for multiallelic genotyping +- Runs `check_madc_sanity` before loading the data and stops with informative errors if: + - Required columns are missing + - IUPAC (non-ATCG) codes are present in AlleleSequence + - Ref/Alt sequences are unpaired (`RefAltSeqs = FALSE`) + - Allele IDs have not been fixed by HapApp (`FixAlleleIDs = FALSE`) + - CloneIDs do not follow `Chr_Pos` format and no `markers_info` is provided +- New argument `markers_info`: optional path or URL to a CSV with `CloneID`/`BI_markerID`, `Chr`, and `Pos` columns; required when CloneIDs do not follow the `Chr_Pos` format +- Runs `check_botloci` to validate and reconcile CloneIDs between the MADC and botloci file, automatically fixing padding mismatches +- A corrected temp file is written and passed to `readDArTag` only when needed (all-NA rows/columns detected, CloneIDs remapped by `check_botloci`, or botloci IDs remapped) +- Accepts paths or URLs for `madc_file`, `botloci_file`, and `markers_info` +- Estimates overdispersion with `polyRAD::TestOverdispersion`, iterates priors with `polyRAD::IterateHWE`, and exports the result with `polyRAD::RADdata2VCF` +- `polyRAD` is a soft dependency (listed under `Suggests`); an informative error is raised if it is not installed + +# BIGr 0.6.6 + +## Updates on `madc2vcf_all` + +- New arguments for controlling processing of `Other` alleles: + - `add_others`: if `TRUE` (default), alleles labeled "Other" in the MADC are included in off-target SNP extraction + - `others_max_snps`: discards Other alleles with more than this many SNP differences relative to the Ref sequence (default: 5) + - `others_rm_with_indels`: discards Other alleles containing insertions or deletions relative to the Ref sequence (default: `TRUE`) +- Others alleles that carry a different base at the target SNP position are now reported as a 3rd allele in the VCF instead of being silently dropped +- Target position is now correctly removed from Others alignments, preventing duplicate VCF positions and marker IDs +- Fixed a bug where Others alleles with "Ref_" or "Alt_" in their AlleleID would corrupt the target SNP REF/ALT fields and read depth counts in `merge_counts` +- Improved verbose messages throughout: counts of Other alleles found, kept, and discarded (by indel filter and by max SNP filter) are now reported; multiallelic target SNPs with a 3rd allele from Others are counted and reported +- Debug-level message (level 3) listing each Other allele added and its genomic position + +# BIGr 0.6.5 + +## Updates on madc2vcf functions +Details: + +- both functions targets and all (targets + off-targets) markers now have `check_madc_sanity` function implemented. It tests: + - [Columns] If MADC has the expected columns + - [allNArow | allNAcol] Presence of columns and rows with all NA (happens often when people open the MADC in excel before loading in R) + - [IUPACcodes] Presence of IUPAC codes on AlleleSequence + - [LowerCase] Presence of lower case bases on AlleleSequence + - [Indels] Presence of Indels + - [ChromPos] If CloneID follows the format Chr_Pos + - [RefAltSeqs] If all Ref Allele has corresponding Alt and vice-versa + - [OtherAlleles] If "Other" exists in the MADC AlleleID + +- Better messages if `verbose = TRUE` in `madc2vcf_all` +- `madc2vcf_all` support for Indels - markers_info with Indels position is required; only the target indel is extracted, off-targets are ignored for the tag +- `madc2vcf_targets` doesn’t run if: + - MADC Column names are not correct + - Ignore Other alleles - but inform the user if they exist or not and direct them to `madc2vcf_all` in case they want to extract them as well +- See the table for madc2vcf_targets requirements accordingly to MADC content: + +  | check status | get_REF_ALT | Requires +-- | -- | -- | -- +IUPAC | TRUE | TRUE | markers_info REF/ALT +  | TRUE | FALSE | - +  | FALSE | TRUE | botloci or markers_info REF/ALT +  | FALSE | FALSE | - +Indels | TRUE | TRUE | markers_info REF/ALT +  | TRUE | FALSE | - +  | FALSE | TRUE | botloci or markers_info REF/ALT +  | FALSE | FALSE | - +ChromPos | TRUE | TRUE | botloci or markers_info REF/ALT +  | TRUE | FALSE | - +  | FALSE | TRUE | markers_info CHR/POS/REF/ALT or markers_info CHR/POS/ + botloci +  | FALSE | FALSE | markers_info CHR/POS +FixAlleleIDs | TRUE | TRUE | botloci or markers_info REF/ALT +  | TRUE | FALSE | - +  | FALSE | TRUE | markers_info REF/ALT +  | FALSE | FALSE | - + +# BIGr 0.6.4 + +- Add function `vmsg` to organize messages printed on the console +- Add metadata to VCF header from madc2vcf_targets +- Add argument `madc_object` to `get_countsMADC` to avoid reading the MADC file twice and to get directly the MADC fixed padding output from `check_botloci` +- Organize messages from `madc2vcf_targets` checks +- Add argument `collapse_matches_counts` and `verbose` to `madc2vcf_targets` function + +# BIGr 0.6.3 + +- New function to check MADC files: `check_madc_sanity`. Currently, it checks for the presence of required columns, whether fixed allele IDs were assigned, the presence of IUPAC codes, lowercase sequence bases, indels, and chromosome and position information. +- Added new argument `markers_info`, which allows users to provide a CSV file with marker information such as CHROM, POS, marker type, and position of indels. For BI species, this information is available from [PanelHub](https://github.com/Breeding-Insight/BIGapp-PanelHub). +- Checked inputs for `madc2vcf_all`. +- Updated affiliation in `DESCRIPTION`. + # BIGr 0.6.2 - Fixed the doi and name list in the CITATION file @@ -51,4 +155,3 @@ - updog2vcf function option to output compressed VCF (.vcf.gz) - set as default - remove need for defining ploidy - add metadata at the VCF header - diff --git a/R/check_madc_sanity.R b/R/check_madc_sanity.R new file mode 100644 index 0000000..88f564a --- /dev/null +++ b/R/check_madc_sanity.R @@ -0,0 +1,320 @@ +#' Run basic sanity checks on a MADC-style allele report +#' +#' @description +#' Performs nine quick validations on an allele report: +#' 1) **Columns** - required columns are present (`CloneID`, `AlleleID`, `AlleleSequence`); +#' 2) **FixAlleleIDs** - first column's first up-to-6 rows are not all blank or `"*"` +#' *and* both `_0001` and `_0002` appear in `AlleleID`; +#' 3) **IUPACcodes** - presence of non-ATCG characters in `AlleleSequence`; +#' 4) **LowerCase** - presence of lowercase a/t/c/g in `AlleleSequence`; +#' 5) **Indels** - reference/alternate allele lengths differ for the same `CloneID`, +#' or a `"-"` character is present in `AlleleSequence`; +#' 6) **ChromPos** - all `CloneID` values follow the `Chr_Position` format +#' (prefix matches `"chr"` case-insensitively, suffix is a positive integer); +#' 7) **allNAcol** - at least one column contains only `NA` or empty values; +#' 8) **allNArow** - at least one row contains only `NA` or empty values; +#' 9) **RefAltSeqs** - every `CloneID` has at least one `Ref` and one `Alt` allele row; +#' 10) **OtherAlleles** - presence of alleles where the target locus differs from both the Ref and Alt in `AlleleSequence`. +#' +#' @param report A `data.frame` with at least the columns +#' `CloneID`, `AlleleID`, and `AlleleSequence`. The first column is also +#' used in the `FixAlleleIDs` check to inspect its first up to six entries. +#' If `FixAlleleIDs` is `FALSE` (raw DArT format), the first 7 rows are +#' treated as header filler and skipped before further checks are run. +#' +#' @details +#' - **FixAlleleIDs:** When the first six rows of the first column are all blank +#' or `"*"` (raw DArT format), row 7 is promoted to column headers and the +#' first 7 rows are dropped before subsequent checks are run. The check is +#' `TRUE` when the file has already been processed by HapApp (fixed IDs with +#' `_0001`/`_0002` suffixes). +#' - **IUPAC check:** Flags any character outside `A`, `T`, `C`, `G` and `"-"` +#' (case-insensitive), which includes ambiguity codes (`N`, `R`, `Y`, etc.). +#' - **Indels:** Rows are split by `AlleleID` containing `"Ref_0001"` vs +#' `"Alt_0002"`, merged by `CloneID`, and flagged as indels if either (a) the +#' lengths of `AlleleSequence` differ, (b) the sequences have the same length +#' but more than one character differs between them (complex indel / local +#' rearrangement), or (c) a `"-"` character is present anywhere in +#' `AlleleSequence`. +#' - **ChromPos:** Each `CloneID` is split on `"_"` into exactly two parts; the +#' first part must match `"Chr"` (case-insensitive) and the second must be a +#' positive integer. Returns `FALSE` when any `CloneID` is `NA`. +#' - **allNAcol / allNArow:** Detected via `apply()` over columns/rows +#' respectively; a cell is treated as missing when it is `NA` or an empty +#' string (`""`). Useful for flagging empty or corrupt entries. +#' - **RefAltSeqs:** For each unique `CloneID`, checks whether at least one row +#' with a `Ref` (`|Ref_` when `FixAlleleIDs = TRUE`, `|Ref$` otherwise) and +#' one row with an `Alt` (`|Alt_` / `|Alt$`) allele exist. `CloneID`s that +#' lack a `Ref` row are stored in `missRef`; those lacking an `Alt` row in +#' `missAlt`. The check is `TRUE` when both sets are empty. +#' - If required columns are missing (`Columns = FALSE`), only `Columns` and +#' `FixAlleleIDs` are evaluated; all other checks remain `NA` and +#' `indel_clone_ids`, `missRef`, and `missAlt` are returned as `NULL`. +#' +#' @return A named list with five elements: +#' \describe{ +#' \item{checks}{Named logical vector with nine entries: +#' `Columns`, `FixAlleleIDs`, `IUPACcodes`, `LowerCase`, `Indels`, +#' `ChromPos`, `allNAcol`, `allNArow`, `RefAltSeqs`. +#' `TRUE` means the condition was detected (or passed for `Columns`, +#' `FixAlleleIDs`, `ChromPos`, and `RefAltSeqs`); `NA` means the check +#' was skipped.} +#' \item{messages}{Named list of length-2 character vectors, one per check. +#' Element `[[1]]` is the message when the check is `TRUE`, element `[[2]]` +#' when it is `FALSE`. Indexed by the same names as `checks`.} +#' \item{indel_clone_ids}{Character vector of `CloneID`s where ref/alt +#' lengths differ. Returns `character(0)` if none are found, or `NULL` +#' when required columns are missing.} +#' \item{missRef}{Character vector of `CloneID`s that have no `Ref` allele +#' row. Returns `character(0)` if all `CloneID`s have a `Ref` row, or +#' `NULL` when required columns are missing.} +#' \item{missAlt}{Character vector of `CloneID`s that have no `Alt` allele +#' row. Returns `character(0)` if all `CloneID`s have an `Alt` row, or +#' `NULL` when required columns are missing.} +#' } +#' +#' @export +check_madc_sanity <- function(report) { + + # Initialize + checks <- c(Columns = NA, FixAlleleIDs = NA, IUPACcodes = NA, LowerCase = NA, Indels = NA, + ChromPos = NA, allNAcol = NA, allNArow = NA, RefAltSeqs = NA, OtherAlleles = NA) + messages <- list(Columns = NA, FixAlleleIDs = NA, IUPACcodes = NA, LowerCase = NA, Indels = NA, + ChromPos = NA, allNAcol = NA, allNArow = NA, RefAltSeqs = NA, OtherAlleles = NA) + + # ---- FixAlleleIDs ---- + # Check if first up-to-6 entries in the *first column* are all "" or "*" + n <- nrow(report) + idx <- seq_len(min(6L, n)) + first_col_vals <- report[[1]][idx] + all_blank_or_star <- all(first_col_vals %in% c("", "*"), na.rm = TRUE) + # Also require that both _0001 and _0002 appear in AlleleID + if(all_blank_or_star) { + colnames(report) <- report[7,] + report <- report[-c(1:7),] + } + has_0001 <- any(grepl("_0001", report$AlleleID, fixed = TRUE), na.rm = TRUE) + has_0002 <- any(grepl("_0002", report$AlleleID, fixed = TRUE), na.rm = TRUE) + checks["FixAlleleIDs"] <- (!all_blank_or_star) & has_0001 & has_0002 + + # Validate required columns + required <- c("CloneID", "AlleleID", "AlleleSequence") + missing_cols <- setdiff(required, names(report)) + checks["Columns"] <- length(missing_cols) == 0 + + if(checks[["Columns"]]){ + # ---- IUPACcodes ---- + iu <- grepl("[^ATCG-]", report$AlleleSequence, ignore.case = TRUE) + checks["IUPACcodes"] <- any(iu, na.rm = TRUE) + + # ---- LowerCase ---- + lc <- grepl("[atcg]", report$AlleleSequence) + checks["LowerCase"] <- any(lc, na.rm = TRUE) + + # ---- Indels ---- + refs <- subset(report, grepl("Ref_0001", AlleleID, fixed = TRUE), + select = c(CloneID, AlleleID, AlleleSequence)) + alts <- subset(report, grepl("Alt_0002", AlleleID, fixed = TRUE), + select = c(CloneID, AlleleID, AlleleSequence)) + + merged <- merge(refs, alts, by = "CloneID", suffixes = c("_ref", "_alt"), all = FALSE) + + if (nrow(merged) > 0) { + ref_len <- nchar(merged$AlleleSequence_ref, keepNA = TRUE) + alt_len <- nchar(merged$AlleleSequence_alt, keepNA = TRUE) + cmp_ok <- !is.na(ref_len) & !is.na(alt_len) + + # Classic indel: different lengths + indel_mask <- cmp_ok & (ref_len != alt_len) + + # Complex indel: same length but >1 character difference between sequences + same_len <- cmp_ok & (ref_len == alt_len) + if (any(same_len)) { + n_diffs <- mapply(function(r, a) { + r_chars <- strsplit(r, "")[[1]] + a_chars <- strsplit(a, "")[[1]] + standard <- toupper(r_chars) %in% c("A","T","C","G") & toupper(a_chars) %in% c("A","T","C","G") + sum(toupper(r_chars[standard]) != toupper(a_chars[standard])) + }, merged$AlleleSequence_ref[same_len], merged$AlleleSequence_alt[same_len]) + indel_mask[same_len] <- n_diffs > 1 + } + + checks["Indels"] <- any(indel_mask) | any(grepl("-", report$AlleleSequence), na.rm = TRUE) + indels <- if (any(indel_mask)) merged$CloneID[indel_mask] else character(0) + + } else { + checks["Indels"] <- FALSE + indels <- character(0) + } + + # --- All NA ---- + checks["allNArow"] <- any(apply(report, 1, function(x) all(is.na(x) | x == ""))) + checks["allNAcol"] <- any(apply(report, 2, function(x) all(is.na(x) | x == ""))) + + # ---- Chrom Pos ---- + if(!any(is.na(report$CloneID))) { + pos <- strsplit(report$CloneID, "_") + format <- all(sapply(pos, length) == 2) + first <- all(grepl("^[A-Za-z]", sapply(pos, "[", 1))) + second <- all(vapply(pos, function(x) { + pos_num <- suppressWarnings(as.numeric(x[2])) + !is.na(pos_num) && pos_num > 0 + }, logical(1))) + checks["ChromPos"] <- all(format, first, second) + } else checks["ChromPos"] <- FALSE + + # ---- RefAltSeqs ---- + all_clones <- unique(report$CloneID) + if (isTRUE(checks["FixAlleleIDs"])) { + has_ref <- unique(report$CloneID[grepl("\\|Ref_", report$AlleleID)]) + has_alt <- unique(report$CloneID[grepl("\\|Alt_", report$AlleleID)]) + } else { + has_ref <- unique(report$CloneID[grepl("\\|Ref$", report$AlleleID)]) + has_alt <- unique(report$CloneID[grepl("\\|Alt$", report$AlleleID)]) + } + missRef <- setdiff(all_clones, has_ref) + missAlt <- setdiff(all_clones, has_alt) + checks["RefAltSeqs"] <- length(missRef) == 0 & length(missAlt) == 0 + + # ---- OtherAlleles ---- + checks["OtherAlleles"] <- any(grepl("[|]Other", report$AlleleID)) + + } else { + indels <- NULL + missRef <- NULL + missAlt <- NULL + } + + messages[["Columns"]] <- c("Required columns are present", + "One or more required columns missing. Verify if your file has columns: CloneID, AlleleID, AlleleSequence") + messages[["FixAlleleIDs"]] <- c("Fixed Allele IDs look good", + "MADC not processed by HapApp") + messages[["IUPACcodes"]] <- c("IUPAC (non-ATCG) codes found in AlleleSequence. This codes are not currently supported by BIGr/BIGapp. Run HapApp to replace them", + "No IUPAC (non-ATCG) codes found in AlleleSequence") + messages[["LowerCase"]] <- c("Lowercase bases found in AlleleSequence", + "No lowercase bases found in AlleleSequence") + messages[["Indels"]] <- c(paste("Indels found (ref/alt lengths differ or >1 mismatch between same-length sequences) for the CloneIDs:",paste(indels, collapse = " ")), + "No indels found (ref/alt lengths match and at most 1 mismatch) for all CloneIDs") + messages[["ChromPos"]] <- c("Chromosome and Position format in CloneID look good", + "CloneID does not have the expected Chromosome_Position format. Please check your CloneIDs or provide a file with this information") + messages[["allNArow"]] <- c("One or more rows contain all NA values", + "No rows with all NA values") + messages[["allNAcol"]] <- c("One or more columns contain all NA values", + "No columns with all NA values") + messages[["RefAltSeqs"]] <- c("All CloneIDs have both Ref and Alt alleles", + paste0("Some CloneIDs are missing Ref and/or Alt alleles ", + "Missing Ref: ", paste(missRef, collapse = " "), ". ", + "Missing Alt: ", paste(missAlt, collapse = " "), ".")) + messages[["OtherAlleles"]] <- c("Alleles other than Ref and Alt were found in AlleleID", + "No alleles other than Ref and Alt found in AlleleID") + + list(checks = checks, messages = messages, indel_clone_ids = indels, + missRef = missRef, missAlt = missAlt) +} + +#' Check and Adjust Botloci and MADC Marker Compatibility +#' +#' This internal function checks the compatibility between botloci and MADC markers. It ensures that the marker IDs in the botloci file match those in the MADC file. If discrepancies are found, such as mismatched padding, the function attempts to adjust the IDs to ensure compatibility. +#' +#' @param botloci A data frame containing the botloci markers. +#' @param report A data frame containing the MADC markers. +#' @param ChromPos logical value indicating whether the CloneID in the MADC file contains chromosome and position information in the format "Chr_Pos". Default is TRUE +#' @param mi_df A data frame containing marker information with one marker ID column +#' (`CloneID`, `Marker_ID`, or `BI_markerID`) plus `Chr` and `Pos`. Required if +#' `ChromPos` is FALSE. +#' @param verbose A logical value indicating whether to print detailed messages about the adjustments. Default is TRUE. Required if `ChromPos` is FALSE. +#' +#' @return A list containing the adjusted botloci and MADC data frames. +#' +#' @details +#' The function checks if the marker IDs in the botloci file are present in the MADC file. If no matches are found, it examines the padding (number of digits) in the marker IDs and adjusts them to match the longest padding. If the IDs still do not match after adjustment, an error is raised. This function is intended for internal use and helps ensure that the botloci and MADC files are compatible for downstream analysis. +#' +#' @keywords internal +#' @noRd +pick_markers_info_id_col <- function(mi_df, query_ids) { + query_ids <- unique(stats::na.omit(query_ids)) + id_cols <- intersect(c("CloneID", "BI_markerID", "Marker_ID"), colnames(mi_df)) + + if (!length(id_cols)) { + stop("The markers_info file must contain a marker ID column named either 'CloneID', 'Marker_ID' or 'BI_markerID'.") + } + + match_n <- vapply(id_cols, function(col) { + sum(query_ids %in% unique(stats::na.omit(mi_df[[col]]))) + }, integer(1)) + + if (!any(match_n)) { + stop("None of the MADC CloneID could be found in the markers_info CloneID, Marker_ID or BI_markerID. Please make sure they match.") + } + + id_cols[which.max(match_n)] +} + +check_botloci <- function(botloci, report, ChromPos=TRUE, mi_df = NULL, verbose=TRUE){ + original_clone_ids <- report$CloneID + use_col <- NULL + + # Check inputs + if(!ChromPos) { + if(is.null(mi_df)) stop("When MADC CloneID don't follow the format Chr_Pos, a marker_info file with 'CloneID'/'Marker_ID'/'BI_markerID', 'Chr', and 'Pos' columns must be provided.") + use_col <- pick_markers_info_id_col(mi_df, report$CloneID) + vmsg(paste("Using", use_col, "column in marker_info to match MADC CloneID"), verbose = verbose, level = 1, type = ">>") + if(is.null(mi_df$Chr) | is.null(mi_df$Pos)) stop("When MADC CloneID don't follow the format Chr_Pos, Chr and Pos columns must be provided in the markers_info file.") + } + + if(!any(botloci$V1 %in% report$CloneID)) { # First check if any botloci markers are found in MADC file. If not, check for padding mismatch. + vmsg("No botloci markers found in MADC file. Checking for padding mismatch...", verbose = verbose, level = 1, type = ">>") + if(!is.null(mi_df) && is.null(use_col)) { + use_col <- pick_markers_info_id_col(mi_df, original_clone_ids) + } + + pad_madc <- unique(nchar(sub(".*_", "", report$CloneID))) + pad_botloci <- unique(nchar(sub(".*_", "", botloci$V1))) + + if(length(pad_madc) > 1 | length(pad_botloci) > 1) stop("Check marker IDs in both MADC and botloci files. They should be the same.") + + if(pad_madc != pad_botloci) { + vmsg("Padding between MADC and botloci files do not match. Markers ID modified to match longest padding.", verbose = verbose, level = 2, type = ">>") + if (pad_madc < pad_botloci) { + report$CloneID <- paste0(sub("_(.*)", "", report$CloneID), "_", + sprintf(paste0("%0", pad_botloci, "d"), as.integer(sub(".*_", "", report$CloneID))) + ) + report$AlleleID <- paste0(report$CloneID, "|", sapply(strsplit(report$AlleleID, "[|]"), "[[",2)) + if(!is.null(mi_df)) { + mi_df$CloneID <- paste0(sub("_(.*)", "", mi_df[[use_col]]), "_", + sprintf(paste0("%0", pad_botloci, "d"), as.integer(sub(".*_", "", mi_df[[use_col]]))) + ) + } + if(!any(botloci$V1 %in% report$CloneID)) stop("After matching padding, botloci markers still not found in MADC file. Check marker IDs.\n") + } else { + botloci$V1 <- paste0(sub("_(.*)", "", botloci$V1), "_", + sprintf(paste0("%0", pad_madc, "d"), as.integer(sub(".*_", "", botloci$V1))) + ) + if(!any(botloci$V1 %in% report$CloneID)) stop("After matching padding, botloci markers still not found in MADC file. Check marker IDs.\n") + } + } else if (!(is.null(mi_df$Chr) | is.null(mi_df$Pos))){ + vmsg("It is not a padding mismatch issue.", verbose = verbose, level = 2, type = ">>") + vmsg("Checking if jointing provided Chromosome and Position information in marker_file solve the issue", verbose = verbose, level = 2, type = ">>") + use_col <- pick_markers_info_id_col(mi_df, report$CloneID) + vmsg(paste("Using", use_col, "column in marker_info to match MADC CloneID"), verbose = verbose, level = 2, type = ">>") + mk_info_CloneID <- paste0(mi_df$Chr, "_", sprintf(paste0("%0",pad_botloci, "d"), as.integer(mi_df$Pos))) + + if(!any(botloci$V1 %in% mk_info_CloneID)){ + vmsg("It is not a padding mismatch issue.", verbose = verbose, level = 2, type = ">>") + vmsg("Chromosome and Position information in marker_file don't solve the issue.", verbose = verbose, level = 2, type = ">>") + stop("Check marker IDs in both MADC and botloci files. They should be the same.") + } else { + vmsg("Chromosome and Position information in marker_file solve the issue.", verbose = verbose, level = 2, type = ">>") + vmsg("Using this information to modify MADC CloneIDs to match botloci markers.", verbose = verbose, level = 2, type = ">>") + report$CloneID <- mk_info_CloneID[match(report$CloneID, mi_df[[use_col]])] + report$AlleleID <- paste0(report$CloneID, "|", sapply(strsplit(report$AlleleID, "[|]"), "[[",2)) + mi_df$CloneID <- mk_info_CloneID + } + } else { + vmsg("It is not a padding mismatch issue.", verbose = verbose, level = 2, type = ">>") + vmsg("Chromosome and Position information in marker_file not provided.", verbose = verbose, level = 2, type = ">>") + stop("Check marker IDs in both MADC and botloci files. They should be the same.") + } + } + return(list(botloci, report, mi_df)) +} diff --git a/R/check_ped.R b/R/check_ped.R index 3f4831b..1359eba 100644 --- a/R/check_ped.R +++ b/R/check_ped.R @@ -1,99 +1,146 @@ -#' Evaluate Pedigree File for Accuracy +#' Check a pedigree file for accuracy and report/correct common errors #' -#' Check a pedigree file for accuracy and output suspected errors +#' `check_ped` reads a 3-column pedigree file (tab-separated, columns labeled `id`, `sire`, `dam` in any order) +#' and performs quality checks, optionally correcting or flagging errors. #' -#'check_ped takes a 3-column pedigree tab separated file with columns labeled as id sire dam in any order and checks for: -#'* Ids that appear more than once in the id column -#'* Ids that appear in both sire and dam columns -#'* Direct (e.g. parent is a offspring of his own daughter) and indirect (e.g. a great grandparent is son of its grandchild) dependencies within the pedigree. -#'* Individuals included in the pedigree as sire or dam but not on the id column and reports them back with unknown parents (0). +#' The function checks for: +#' * Exact duplicate rows and removes them (keeping one copy) +#' * IDs that appear more than once with conflicting sire/dam assignments (sets sire/dam to "0") +#' * IDs that appear in both sire and dam columns +#' * Missing parents (IDs referenced as sire/dam but not in `id` column), adds them with sire/dam = "0" +#' * Direct and indirect pedigree dependencies (cycles), such as a parent being its own descendant #' -#'When using check_ped, do a first run to check for repeated ids and parents that appear as sire and dam. -#'Once these errors are cleaned run the function again to check for dependencies as this will provide the most accurate report. +#' After an initial run to clean exact duplicates and repeated IDs, you can run the function again to detect cycles more accurately. #' -#'Note: This function does not change the input file but prints any errors found in the console. +#' The function does **not** overwrite the input file or create objects in the +#' global environment. Instead, it returns the report and corrected pedigree in +#' a list. +#' +#' @param ped.file Path to the pedigree text file. +#' @param seed Optional seed for reproducibility. +#' @param verbose Logical. If TRUE (default), prints the report to the console. +#' +#' @return A list of data.frames containing detected issues: +#' * `exact_duplicates`: rows that were exact duplicates +#' * `repeated_ids_diff`: IDs appearing more than once with conflicting sire/dam +#' * `messy_parents`: IDs appearing as both sire and dam +#' * `missing_parents`: parents added to the pedigree with 0 as sire/dam +#' * `dependencies`: detected cycles in the pedigree +#' * `corrected_pedigree`: corrected pedigree table #' -#' @param ped.file path to pedigree text file. The pedigree file is a -#' 3-column pedigree tab separated file with columns labeled as id sire dam in any order -#' @param seed Optional seed for reproducibility -#' @param verbose Logical. If TRUE, print the errors to the console. -#' @return A list of data.frames of error types, and the output printed to the console #' @examples -#' ##Get list with a dataframe for each error type -#' ped_file <- system.file("check_ped_test.txt", package="BIGr") -#' ped_errors <- check_ped(ped.file = ped_file, -#' seed = 101919) +#' ped_file <- system.file("check_ped_test.txt", package = "BIGr") +#' ped_errors <- check_ped(ped.file = ped_file, seed = 101919, verbose = FALSE) #' -#' ##Access the "messy parents" dataframe result +#' # Access messy parents #' ped_errors$messy_parents #' -#' ##Get list of sample IDs with messy parents error -#' messy_parent_ids <- ped_errors$messy_parents$id -#' print(messy_parent_ids) +#' # Access corrected pedigree +#' ped_errors$corrected_pedigree +#' +#' # IDs with messy parents +#' messy_ids <- ped_errors$messy_parents$id +#' print(messy_ids) +#' #' @import dplyr #' @import janitor #' @importFrom stats setNames #' @importFrom utils read.table #' @export check_ped <- function(ped.file, seed = NULL, verbose = TRUE) { - #### Function to check for hierarchical errors missing parents and repeated ids #### - if(!is.null(seed)){ - set.seed(seed) - } - #### read in data #### - data = utils::read.table(ped.file, header = T) - data <- data %>% + + #### setup #### + if (!is.null(seed)) set.seed(seed) + + # Read and clean data + data <- utils::read.table(ped.file, header = TRUE) %>% janitor::clean_names() %>% mutate( id = as.character(id), sire = as.character(sire), dam = as.character(dam) - ) - #Missing parents dataframe initialize - missing_parents <- data.frame(id = character(), sire = character(), dam = character(), stringsAsFactors = FALSE) + ) + + original_data <- data errors <- list() - # repeated id checks - n_occur <- data.frame(table(data$id)) - repeated_ids = n_occur[n_occur$Freq > 1,] %>% - rename(id = Var1) - # Check for ids that appear as both sire and dam ###This is possible for plants so maybe do not control for this or do not delete these rows just print them - messy_parents <- as.data.frame(intersect(data$sire, data$dam)) %>% - rename(id = 1) %>% - filter(id != 0) - # Missing parents check - for (i in 1:nrow(data)) { + missing_parents <- data.frame(id = character(), sire = character(), dam = character(), stringsAsFactors = FALSE) + + #### check 1: exact duplicates #### + exact_duplicates <- data[duplicated(data), ] + if (nrow(exact_duplicates) > 0) { + data <- distinct(data) # remove exact duplicates + } + + #### check 2: repeated IDs with conflicting sire/dam #### + repeated_ids <- data %>% + group_by(id) %>% + filter(n() > 1) %>% + ungroup() + + # Only IDs with actual conflicting sire/dam + conflicting_ids <- repeated_ids %>% + group_by(id) %>% + filter(n_distinct(sire) > 1 | n_distinct(dam) > 1) %>% + ungroup() + + if (nrow(conflicting_ids) > 0) { + # Keep one row per ID, set sire/dam to "0" + data <- data %>% + group_by(id) %>% + summarize( + sire = if(n_distinct(sire) > 1) "0" else first(sire), + dam = if(n_distinct(dam) > 1) "0" else first(dam), + .groups = "drop" + ) + } + + repeated_ids_report <- conflicting_ids + + #### check 3: missing parents #### + for (i in seq_len(nrow(data))) { id <- data$id[i] sire <- data$sire[i] dam <- data$dam[i] + if (sire != "0" && sire != id && !sire %in% data$id) { missing_parents <- rbind(missing_parents, data.frame(id = sire, sire = "0", dam = "0", stringsAsFactors = FALSE)) } if (dam != "0" && dam != id && !dam %in% data$id) { missing_parents <- rbind(missing_parents, data.frame(id = dam, sire = "0", dam = "0", stringsAsFactors = FALSE)) } + if (sire == id || dam == id) { errors <- append(errors, paste("Dependency: Individual", id, "cannot be its own parent")) } } - # Remove duplicates + missing_parents <- distinct(missing_parents) - # Combine original data with missing parents - corrected_data <- bind_rows(data, missing_parents) - # Function to detect cycles in the pedigree graph and identify the nodes involved + if (nrow(missing_parents) > 0) { + data <- bind_rows(data, missing_parents) + } + + #### check 4: messy parents #### + sire_ids <- unique(data$sire[data$sire != "0"]) + dam_ids <- unique(data$dam[data$dam != "0"]) + messy_ids <- intersect(sire_ids, dam_ids) + messy_parents <- data %>% filter(id %in% messy_ids) + + #### check 5: dependencies (cycles) #### detect_all_cycles <- function(data) { - # Create an adjacency list - adj_list <- list() - for (i in 1:nrow(data)) { - adj_list[[data$id[i]]] <- c(data$sire[i], data$dam[i]) - } - # Helper function to perform DFS and detect cycles + adj_list <- lapply(data$id, function(x) { + row <- data[data$id == x, ] + c(row$sire, row$dam) + }) + names(adj_list) <- data$id + dfs <- function(node, visited, rec_stack, path) { visited[node] <- TRUE rec_stack[node] <- TRUE path <- append(path, node) cycles <- list() + for (neighbor in adj_list[[node]]) { - if (neighbor != "0") { + if (neighbor %in% names(adj_list)) { if (!visited[neighbor]) { cycles <- append(cycles, dfs(neighbor, visited, rec_stack, path)) } else if (rec_stack[neighbor]) { @@ -102,14 +149,15 @@ check_ped <- function(ped.file, seed = NULL, verbose = TRUE) { } } } + rec_stack[node] <- FALSE return(cycles) } - # Initialize visited and recursion stack + visited <- stats::setNames(rep(FALSE, length(adj_list)), names(adj_list)) rec_stack <- stats::setNames(rep(FALSE, length(adj_list)), names(adj_list)) all_cycles <- list() - # Check for cycles in the graph and return the nodes involved + for (node in names(adj_list)) { if (!visited[node]) { node_cycles <- dfs(node, visited, rec_stack, character()) @@ -120,75 +168,57 @@ check_ped <- function(ped.file, seed = NULL, verbose = TRUE) { } return(all_cycles) } - # Check for cycles in the corrected pedigree data - cycles <- detect_all_cycles(corrected_data) + + cycles <- detect_all_cycles(data) if (length(cycles) > 0) { - cycle_number <- 1 for (cycle_group in cycles) { cycle_ids <- unique(unlist(cycle_group)) - errors <- append(errors, paste("Cycle detected involving nodes:", paste(cycle_ids, collapse = " -> "))) + errors <- append(errors, paste("Cycle detected involving IDs:", paste(cycle_ids, collapse = " -> "))) } } - results <- list(missing_parents = missing_parents, dependencies = data.frame(Dependency = unlist(errors)), repeated_ids = repeated_ids, messy_parents = messy_parents) - repeated_ids <- results$repeated_ids - missing_parents <- results$missing_parents - messy_parents <- results$messy_parents - errors <- results$dependencies - # Adding the dataframes as an output list - output.results <- list() - #### Print errors and cycles #### - # Print repeated ids if any - if (nrow(repeated_ids) > 0) { - if (verbose) { - cat("Repeated ids found:\n") - message(repeated_ids) - } - output.results$repeated_ids <- repeated_ids - } else { - if (verbose) { - cat("No repeated ids found.\n") - } - } - #Print parents that appear as male and female - if (nrow(messy_parents) > 0) { - if (verbose) { - cat("Ids found as male and female parent:\n") - message(messy_parents) - } - output.results$messy_parents <- messy_parents + #### compile findings #### + input_ped_report <- list( + exact_duplicates = exact_duplicates, + repeated_ids_diff = repeated_ids_report, + messy_parents = messy_parents, + missing_parents = missing_parents, + dependencies = data.frame(Dependency = unique(unlist(errors))), + corrected_pedigree = data + ) - } else { - if (verbose) { - cat("No ids found as male and female parent.\n") - } - } - # Print missing parents if any - if (nrow(missing_parents) > 0) { - if (verbose) { - cat("Missing parents found:\n") - message(missing_parents) - } - output.results$missing_parents <- missing_parents + #### output #### + if (verbose) { + cat("\n=== Pedigree Quality Check Report ===\n") - } else { - if (verbose) { - cat("No missing parents found.\n") - } - } - # Print errors if any - if (nrow(errors) > 0) { - if (verbose) { - cat("Dependencies found:\n") - message(unique(errors$Dependency)) - } - output.results$dependencies <- data.frame(Dependency = unlist(errors)) + if (nrow(exact_duplicates) > 0) { + cat("\n Exact duplicate trios detected (only one copy will be kept in corrected pedigree):\n") + print(exact_duplicates) + } else cat("\nNo exact duplicate trios found.\n") - } else { - if (verbose) { - cat("No dependencies found.\n") - } + if (nrow(repeated_ids_report) > 0) { + cat("\nConflicting trios detected (sire/dam set to 0 in corrected pedigree):\n") + print(repeated_ids_report) + } else cat("\nNo conflicting repeated trios found.\n") + + if (nrow(missing_parents) > 0) { + cat("\n Parents missing as IDs found in the pedigree (will be added as founders in corrected pedigree):\n") + print(missing_parents) + } else cat("\nNo missing parents found.\n") + + if (nrow(messy_parents) > 0) { + cat("\n IDs found as both sire and dam (is selfing or hermaphrodytism possible?):\n") + print(messy_parents) + } else cat("\nNo IDs found as both sire and dam.\n") + + + if (nrow(input_ped_report$dependencies) > 0) { + cat("\nDependencies detected:\n") + print(input_ped_report$dependencies) + } else cat("\nNo dependencies detected.\n") + + cat("\nThe corrected pedigree is included in the returned list as `corrected_pedigree`.\n") } - return(results) + invisible(input_ped_report) } diff --git a/R/dosage2vcf.R b/R/dosage2vcf.R index e8a193d..0a15c48 100644 --- a/R/dosage2vcf.R +++ b/R/dosage2vcf.R @@ -1,13 +1,17 @@ -#' Convert DArTag Dosage and Counts to VCF +#' Convert DArTag genotype reports and counts to VCF #' -#' This function will convert the DArT Dosage Report and Counts files to VCF format +#' This function will convert DArT genotype report and Counts files to VCF format #' -#' This function will convert the Dosage Report and Counts files from DArT into a VCF file. +#' This function will convert Allele Dose Report or SNP/INDEL report files and Counts files from DArT into a VCF file. #' These two files are received directly from DArT for a given sequencing project. +#' SNP/INDEL one-row and two-row reports are treated as diploid genotype reports +#' with 0 = reference homozygote, 1 = alternate homozygote, 2 = heterozygote, +#' and - = missing. Allele Dose reports are interpreted as reference allele +#' dosages using the supplied ploidy. #' The output file will be saved to the location and with the name that is specified. #' The VCF format is v4.3 #' -#' @param dart.report Path to the DArT dosage report .csv file. Typically contains "Dosage Report" in the file name. +#' @param dart.report Path to the DArT Allele Dose Report or SNP/INDEL report .csv file. #' @param dart.counts Path to the DArT counts .csv file. Typically contains "Counts" in the file name. #' @param ploidy The ploidy of the species being analyzed #' @param output.file output file name and path @@ -65,204 +69,456 @@ dosage2vcf <- function(dart.report, dart.counts, ploidy, output.file) { '##FORMAT=' ) - ##Get information from DArT Counts and Dosage Report files + ## + is_missing_value <- function(x) { + x_chr <- trimws(as.character(x)) + is.na(x) | x_chr %in% c("", "-", "NA") + } - dosage <- suppressMessages(readr::read_csv(dosage_report, - skip = 6,show_col_types = FALSE)) - colnames(dosage)[1:5] <- dosage[1,1:5] - dosage <- dosage[-1, ] - names(dosage)[names(dosage) == "MarkerID"] <- "MarkerName" - dosage <- as.data.frame(dosage) - row.names(dosage) <- dosage$MarkerName - - counts <- suppressMessages(readr::read_csv(counts_file, - skip = 6,show_col_types = FALSE)) - - #Check that the counts file is in the 2 row format - #if (anyDuplicated(counts$MarkerName) > 0) { #I am going to just check if there are 2x rows in counts as dosage report - if (nrow(counts) >= nrow(dosage)*2) { - message("Note: Counts file is in the 2 row format for Ref and Alt alleles") - } else { - stop("Counts file is in single row format. Only two row format (row for ref and alt allele) is currently supported.") - return() + missing_gt <- function(ploidy) { + paste(rep(".", ploidy), collapse = "/") } - #Parse counts file depending on if its the collapsed format or not - if (all(c("MarkerName", "Variant") %in% counts[1, 1:15])) { - message("Counts file contains the counts for the target loci only") - - colnames(counts)[1:15] <- counts[1,1:15] - counts <- counts[-1, ] - counts <- as.data.frame(counts) - - #Get ref counts dataframe - ref_counts <- counts[is.na(counts$Variant), ] - row.names(ref_counts) <- ref_counts$MarkerName - #Get alt counts dataframe - alt_counts <- counts[!is.na(counts$Variant), ] - row.names(alt_counts) <- alt_counts$MarkerName - - #unload counts file - rm(counts) - - #Get the Ref and Alt allele from the alt_counts file - ##Note sometimes there are multiple nucleotides, I am assuming this file also includes indels, but make sure this is not an error - alleles_df <- alt_counts %>% - rowwise() %>% # Apply operations to each row - mutate(Variant = gsub("-:", "",Variant), # Remove "-:" prefix - REF = strsplit(Variant, ">")[[1]][1], - ALT = strsplit(Variant, ">")[[1]][2]) %>% - ungroup() %>% - select(MarkerName, REF, ALT) - - #Add the CHROM and POS information to the alleles_df from the dosage report - alleles_df <- merge(alleles_df, dosage[, c("MarkerName","Chrom", "ChromPos")], by = "MarkerName", all.x = TRUE) - #Add row name - row.names(alleles_df) <- alleles_df$MarkerName - - }else{ - message("Counts file contains the collapsed read counts across all microhaplotypes for the target loci") + as_clean_character_matrix <- function(x) { + mat <- as.matrix(x) + mat <- matrix(trimws(as.character(mat)), nrow = nrow(mat), + ncol = ncol(mat), dimnames = dimnames(mat)) + mat[is_missing_value(mat)] <- NA_character_ + mat + } - colnames(counts)[1:5] <- counts[1,1:5] - counts <- counts[-1, ] - counts <- as.data.frame(counts) + ensure_unique <- function(x, label) { + duplicated_values <- unique(x[duplicated(x)]) + if (length(duplicated_values) > 0) { + stop(label, " must be unique. Duplicated values include: ", + paste(utils::head(duplicated_values, 5), collapse = ", ")) + } + } - #Get ref counts dataframe - ref_counts <- counts[grepl("Ref$", counts$AlleleID), ] - row.names(ref_counts) <- ref_counts$CloneID - #Get alt counts dataframe - alt_counts <- counts[grepl("Alt$", counts$AlleleID), ] - row.names(alt_counts) <- alt_counts$CloneID + check_matching_sets <- function(x, y, x_label, y_label) { + missing_in_y <- setdiff(x, y) + missing_in_x <- setdiff(y, x) + if (length(missing_in_y) > 0 || length(missing_in_x) > 0) { + msg <- c() + if (length(missing_in_y) > 0) { + msg <- c(msg, paste0("in ", x_label, " only: ", + paste(utils::head(missing_in_y, 5), collapse = ", "))) + } + if (length(missing_in_x) > 0) { + msg <- c(msg, paste0("in ", y_label, " only: ", + paste(utils::head(missing_in_x, 5), collapse = ", "))) + } + stop("Mismatched ", x_label, " and ", y_label, " values (", paste(msg, collapse = "; "), ").") + } + } - #unload counts file - rm(counts) + metadata_cols <- c("MarkerName", "MarkerID", "AlleleID", "CloneID", + "AlleleSequence", "AlleleSequenceRef", "AlleleSequenceAlt", + "Variant", "CallRate", "OneRatioRef", "OneRatioSnp", + "FreqHomRef", "FreqHomSnp", "FreqHets", "PICRef", "PICSnp", + "AvgPIC", "AvgCountRef", "AvgCountSnp", + "RatioAvgCountRefAvgCountSnp", "Chrom", "ChromPos") + + get_sample_cols <- function(df) { + sample_cols <- names(df)[!(names(df) %in% metadata_cols)] + ensure_unique(sample_cols, "Sample columns") + sample_cols + } - #Get the Ref and Alt allele from the alt_counts file - ##Note sometimes there are multiple nucleotides, I am assuming this file also includes indels, but make sure this is not an error - alleles_df <- data.frame(MarkerName = alt_counts$CloneID, - REF = "A", - ALT = "B") + parse_variant_bases <- function(variant) { + variant <- trimws(as.character(variant)) + invalid <- is_missing_value(variant) | !grepl(">", variant, fixed = TRUE) + if (any(invalid)) { + stop("Counts Variant values must be present and contain '>' for REF/ALT parsing.") + } + clean_variant <- sub("^-:", "", variant) + parts <- strsplit(clean_variant, ">", fixed = TRUE) + valid_parts <- vapply(parts, length, integer(1)) == 2 + if (!all(valid_parts)) { + stop("Counts Variant values must contain exactly one REF>ALT allele definition.") + } + data.frame( + REF = vapply(parts, `[`, character(1), 1), + ALT = vapply(parts, `[`, character(1), 2), + stringsAsFactors = FALSE + ) + } + + derive_coordinates <- function(marker_info) { + if (all(c("Chrom", "ChromPos") %in% names(marker_info)) && + !any(is_missing_value(marker_info$Chrom)) && + !any(is_missing_value(marker_info$ChromPos))) { + return(data.frame(CHROM = as.character(marker_info$Chrom), + POS = as.character(marker_info$ChromPos), + stringsAsFactors = FALSE)) + } - #Add the CHROM and POS information to the alleles_df from the dosage report - alleles_df <- merge(alleles_df, dosage[, c("MarkerName","Chrom", "ChromPos")], by = "MarkerName", all.x = TRUE) - #Add row name - row.names(alleles_df) <- alleles_df$MarkerName + marker_names <- as.character(marker_info$MarkerName) + cannot_split <- !grepl("_", marker_names, fixed = TRUE) + if (any(cannot_split)) { + stop("Chrom/ChromPos columns are absent or incomplete, and MarkerName values cannot be split at '_'. Examples: ", + paste(utils::head(marker_names[cannot_split], 5), collapse = ", ")) + } + data.frame( + CHROM = sub("_[^_]*$", "", marker_names), + POS = sub("^.*_", "", marker_names), + stringsAsFactors = FALSE + ) } - #Remove the unwanted information from the counts dataframes - cols_to_remove <- c("MarkerName","AlleleSequence","Variant", - "CallRate","OneRatioRef","OneRatioSnp","FreqHomSnp", - "FreqHets","PICRef","PICSnp","AvgPIC","FreqHomRef", - "AvgCountRef","AvgCountSnp","RatioAvgCountRefAvgCountSnp", - "AlleleID","CloneID") - alt_counts <- alt_counts[, !(colnames(alt_counts) %in% cols_to_remove)] - ref_counts <- ref_counts[, !(colnames(ref_counts) %in% cols_to_remove)] + precompute_genotype_strings <- function(ploidy) { + genotype_strings <- character(ploidy + 1) + for (dosage_value in 0:ploidy) { + ref_count <- dosage_value + alt_count <- ploidy - dosage_value + genotype_strings[dosage_value + 1] <- paste(c(rep("0", ref_count), rep("1", alt_count)), collapse = "/") + } + genotype_strings + } - #Ensure that they are in the same order - ref_counts <- ref_counts[row.names(alt_counts),] + convert_dosage2gt <- function(dosage_matrix, ploidy) { + genotype_strings <- precompute_genotype_strings(ploidy) + genotype_matrix <- matrix(missing_gt(ploidy), nrow = nrow(dosage_matrix), ncol = ncol(dosage_matrix), + dimnames = dimnames(dosage_matrix)) + called <- !is.na(dosage_matrix) + genotype_matrix[called] <- genotype_strings[dosage_matrix[called] + 1] + genotype_matrix + } - #Make the total counts dataframe - total_counts <- alt_counts + ref_counts + matrix_to_character <- function(mat) { + out <- matrix(as.character(mat), nrow = nrow(mat), ncol = ncol(mat), dimnames = dimnames(mat)) + out[is.na(mat)] <- "." + out + } - #Get the total ref, total alt, and total read depth for each marker - alleles_df$AltCountsSum <- rowSums(alt_counts)[rownames(alleles_df)] #Alt - alleles_df$RefCountsSum <- rowSums(ref_counts)[rownames(alleles_df)] #Ref - alleles_df$TotalCountSum <- alleles_df$AltCountsSum + alleles_df$RefCountsSum + as_numeric_matrix <- function(df, label) { + char_mat <- as_clean_character_matrix(df) + numeric_mat <- suppressWarnings(matrix(as.numeric(char_mat), nrow = nrow(char_mat), + ncol = ncol(char_mat), dimnames = dimnames(char_mat))) + invalid <- !is.na(char_mat) & is.na(numeric_mat) + if (any(invalid)) { + stop(label, " contains non-numeric values. Examples: ", + paste(utils::head(unique(char_mat[invalid]), 5), collapse = ", ")) + } + numeric_mat + } - #Remove the unwanted information from the dosage dataframe - d_cols_to_remove <- c("MarkerName","AvgCountRef","AvgCountSnp","Chrom","ChromPos") - dosage <- dosage[, !(colnames(dosage) %in% d_cols_to_remove)] + convert_snp_codes <- function(code_matrix) { + code_matrix <- as_clean_character_matrix(code_matrix) + invalid <- !is.na(code_matrix) & !(code_matrix %in% c("0", "1", "2")) + if (any(invalid)) { + stop("SNP/INDEL genotype codes must be 0, 1, 2, '-', or NA. Invalid values include: ", + paste(utils::head(unique(code_matrix[invalid]), 5), collapse = ", ")) + } - #Make the VCF df - vcf_df <- data.frame( - CHROM = alleles_df$Chrom, - POS = alleles_df$ChromPos, - ID = alleles_df$MarkerName, - REF = alleles_df$REF, - ALT = alleles_df$ALT, - QUAL = ".", - FILTER = ".", - INFO = NA, - FORMAT = NA - ) + gt_matrix <- matrix("./.", nrow = nrow(code_matrix), ncol = ncol(code_matrix), + dimnames = dimnames(code_matrix)) + ud_matrix <- matrix(".", nrow = nrow(code_matrix), ncol = ncol(code_matrix), + dimnames = dimnames(code_matrix)) - #Add the INFO column for each SNP - vcf_df$INFO <- paste0("DP=",alleles_df$TotalCountSum,";", - "ADS=",alleles_df$RefCountsSum,",",alleles_df$AltCountsSum) + gt_matrix[!is.na(code_matrix) & code_matrix == "0"] <- "0/0" + gt_matrix[!is.na(code_matrix) & code_matrix == "1"] <- "1/1" + gt_matrix[!is.na(code_matrix) & code_matrix == "2"] <- "0/1" + ud_matrix[!is.na(code_matrix) & code_matrix == "0"] <- "2" + ud_matrix[!is.na(code_matrix) & code_matrix == "1"] <- "0" + ud_matrix[!is.na(code_matrix) & code_matrix == "2"] <- "1" - #Add the FORMAT label for each SNP - vcf_df$FORMAT <- paste("GT","UD","DP","RA",sep=":") + list(gt = gt_matrix, ud = ud_matrix) + } - message("Converting dosages to genotype format\n") + parse_report <- function(file, ploidy) { + raw <- suppressMessages(readr::read_csv(file, skip = 6, show_col_types = FALSE)) + raw <- as.data.frame(raw, check.names = FALSE) + if (nrow(raw) == 0) stop("DArT report file contains no data rows.") + + first_row <- as.character(unlist(raw[1, ], use.names = FALSE)) + is_allele_dose <- length(first_row) >= 5 && + identical(first_row[1:5], c("MarkerID", "AvgCountRef", "AvgCountSnp", "Chrom", "ChromPos")) + + if (is_allele_dose) { + message("DArT report is an Allele Dose report") + names(raw)[1:5] <- first_row[1:5] + report <- raw[-1, , drop = FALSE] + names(report)[names(report) == "MarkerID"] <- "MarkerName" + sample_cols <- get_sample_cols(report) + ensure_unique(report$MarkerName, "Report MarkerName") + + dosage_matrix <- as_numeric_matrix(report[, sample_cols, drop = FALSE], "Allele dose report") + invalid_dosage <- !is.na(dosage_matrix) & + (dosage_matrix < 0 | dosage_matrix > ploidy | dosage_matrix != round(dosage_matrix)) + if (any(invalid_dosage)) { + stop("Allele dose values must be integer reference allele counts between 0 and ploidy.") + } + + rownames(dosage_matrix) <- report$MarkerName + gt_matrix <- convert_dosage2gt(dosage_matrix, ploidy) + ud_matrix <- matrix_to_character(dosage_matrix) + + marker_info <- report[, intersect(c("MarkerName", "Chrom", "ChromPos"), names(report)), drop = FALSE] + rownames(marker_info) <- marker_info$MarkerName + + return(list(type = "allele_dose", + marker_info = marker_info, + marker_names = report$MarkerName, + sample_names = sample_cols, + gt = gt_matrix, + ud = ud_matrix, + variant = NULL, + allele_sequence_ref = NULL, + allele_sequence_alt = NULL)) + } - ###Convert genotypes from dosage to gt - # Precompute genotype strings for all possible dosage values to improve efficiency - precompute_genotype_strings <- function(ploidy) { - genotype_strings <- character(ploidy + 1) - # Generate the genotype string based on the dosage and ploidy - # Updog uses the ref counts, which is not typical, so this corrects it - for (dosage in 0:ploidy) { - ref_count <- dosage - alt_count <- ploidy - dosage - genotype_strings[dosage + 1] <- paste(c(rep("0", ref_count), rep("1", alt_count)), collapse = "/") + is_snp_1row <- all(c("MarkerName", "AlleleSequenceRef", "AlleleSequenceAlt", "Variant") %in% names(raw)) + is_snp_2row <- all(c("MarkerName", "AlleleSequence", "Variant") %in% names(raw)) && + anyDuplicated(raw$MarkerName) > 0 + + if (is_snp_1row || is_snp_2row) { + if (ploidy != 2) { + stop("SNP/INDEL reports are diploid genotype reports. Use ploidy = 2.") + } } - return(genotype_strings) - } - # Apply the precomputed genotype strings to the matrix - convert_dosage2gt <- function(dosage_matrix, ploidy) { - dosage_matrix <- as.matrix(dosage_matrix) - genotype_strings <- precompute_genotype_strings(ploidy) + if (is_snp_1row) { + message("DArT report is a SNP/INDEL 1 row report") + sample_cols <- get_sample_cols(raw) + ensure_unique(raw$MarkerName, "Report MarkerName") + + code_matrix <- as.matrix(raw[, sample_cols, drop = FALSE]) + rownames(code_matrix) <- raw$MarkerName + converted <- convert_snp_codes(code_matrix) + + marker_info <- raw[, intersect(c("MarkerName", "Chrom", "ChromPos"), names(raw)), drop = FALSE] + rownames(marker_info) <- marker_info$MarkerName + + variant <- setNames(trimws(as.character(raw$Variant)), raw$MarkerName) + allele_sequence_ref <- setNames(as.character(raw$AlleleSequenceRef), raw$MarkerName) + allele_sequence_alt <- setNames(as.character(raw$AlleleSequenceAlt), raw$MarkerName) + + return(list(type = "snp_1row", + marker_info = marker_info, + marker_names = raw$MarkerName, + sample_names = sample_cols, + gt = converted$gt, + ud = converted$ud, + variant = variant, + allele_sequence_ref = allele_sequence_ref, + allele_sequence_alt = allele_sequence_alt)) + } - # Handle missing values separately - genotype_matrix <- matrix(genotype_strings[dosage_matrix + 1], nrow = nrow(dosage_matrix), ncol = ncol(dosage_matrix)) - genotype_matrix[is.na(dosage_matrix)] <- "./." # Handle missing values + if (is_snp_2row) { + message("DArT report is a SNP/INDEL 2 row report") + sample_cols <- get_sample_cols(raw) + ref_rows <- is_missing_value(raw$Variant) + alt_rows <- !ref_rows + ref_report <- raw[ref_rows, , drop = FALSE] + alt_report <- raw[alt_rows, , drop = FALSE] + check_matching_sets(ref_report$MarkerName, alt_report$MarkerName, "SNP ref-row markers", "SNP alt-row markers") + ensure_unique(ref_report$MarkerName, "SNP ref-row MarkerName") + ensure_unique(alt_report$MarkerName, "SNP alt-row MarkerName") + alt_report <- alt_report[match(ref_report$MarkerName, alt_report$MarkerName), , drop = FALSE] + + ref_matrix <- as_clean_character_matrix(ref_report[, sample_cols, drop = FALSE]) + alt_matrix <- as_clean_character_matrix(alt_report[, sample_cols, drop = FALSE]) + invalid <- (!is.na(ref_matrix) & !(ref_matrix %in% c("0", "1"))) | + (!is.na(alt_matrix) & !(alt_matrix %in% c("0", "1"))) + if (any(invalid)) { + stop("SNP/INDEL 2 row reports must contain only 0, 1, '-', or NA in sample columns.") + } + + code_matrix <- matrix(NA_character_, nrow = nrow(ref_matrix), ncol = ncol(ref_matrix), + dimnames = list(ref_report$MarkerName, sample_cols)) + called <- !is.na(ref_matrix) & !is.na(alt_matrix) + code_matrix[called & ref_matrix == "1" & alt_matrix == "0"] <- "0" + code_matrix[called & ref_matrix == "0" & alt_matrix == "1"] <- "1" + code_matrix[called & ref_matrix == "1" & alt_matrix == "1"] <- "2" + code_matrix[called & ref_matrix == "0" & alt_matrix == "0"] <- NA_character_ + converted <- convert_snp_codes(code_matrix) + + marker_info <- data.frame(MarkerName = ref_report$MarkerName, stringsAsFactors = FALSE) + if (all(c("Chrom", "ChromPos") %in% names(ref_report))) { + marker_info$Chrom <- ref_report$Chrom + marker_info$ChromPos <- ref_report$ChromPos + } + rownames(marker_info) <- marker_info$MarkerName + + variant <- setNames(trimws(as.character(alt_report$Variant)), alt_report$MarkerName) + allele_sequence_ref <- setNames(as.character(ref_report$AlleleSequence), ref_report$MarkerName) + allele_sequence_alt <- setNames(as.character(alt_report$AlleleSequence), alt_report$MarkerName) + + return(list(type = "snp_2row", + marker_info = marker_info, + marker_names = ref_report$MarkerName, + sample_names = sample_cols, + gt = converted$gt, + ud = converted$ud, + variant = variant, + allele_sequence_ref = allele_sequence_ref, + allele_sequence_alt = allele_sequence_alt)) + } - # Retain row and column names - rownames(genotype_matrix) <- rownames(dosage_matrix) - colnames(genotype_matrix) <- colnames(dosage_matrix) + stop("Unsupported DArT report format.") + } - return(genotype_matrix) + parse_counts <- function(file) { + raw <- suppressMessages(readr::read_csv(file, skip = 6, show_col_types = FALSE)) + raw <- as.data.frame(raw, check.names = FALSE) + if (nrow(raw) == 0) stop("DArT counts file contains no data rows.") + + first_row <- as.character(unlist(raw[1, ], use.names = FALSE)) + if (all(c("MarkerName", "Variant") %in% first_row[seq_len(min(15, length(first_row)))])) { + message("Counts file contains the counts for the target loci only") + names(raw)[1:15] <- first_row[1:15] + counts <- raw[-1, , drop = FALSE] + sample_cols <- get_sample_cols(counts) + + ref_rows <- is_missing_value(counts$Variant) + alt_rows <- !ref_rows + ref_counts <- counts[ref_rows, , drop = FALSE] + alt_counts <- counts[alt_rows, , drop = FALSE] + check_matching_sets(ref_counts$MarkerName, alt_counts$MarkerName, "counts ref-row markers", "counts alt-row markers") + ensure_unique(ref_counts$MarkerName, "Counts ref-row MarkerName") + ensure_unique(alt_counts$MarkerName, "Counts alt-row MarkerName") + ref_counts <- ref_counts[match(alt_counts$MarkerName, ref_counts$MarkerName), , drop = FALSE] + + rownames(ref_counts) <- ref_counts$MarkerName + rownames(alt_counts) <- alt_counts$MarkerName + + bases <- parse_variant_bases(alt_counts$Variant) + alleles_df <- data.frame(MarkerName = alt_counts$MarkerName, + REF = bases$REF, + ALT = bases$ALT, + CountVariant = trimws(as.character(alt_counts$Variant)), + CountAlleleSequenceRef = as.character(ref_counts$AlleleSequence), + CountAlleleSequenceAlt = as.character(alt_counts$AlleleSequence), + stringsAsFactors = FALSE) + rownames(alleles_df) <- alleles_df$MarkerName + + return(list(type = "target", + alleles_df = alleles_df, + marker_names = alt_counts$MarkerName, + sample_names = sample_cols, + ref_counts = ref_counts[, sample_cols, drop = FALSE], + alt_counts = alt_counts[, sample_cols, drop = FALSE])) + } + + message("Counts file contains the collapsed read counts across all microhaplotypes for the target loci") + names(raw)[1:5] <- first_row[1:5] + counts <- raw[-1, , drop = FALSE] + sample_cols <- get_sample_cols(counts) + + ref_counts <- counts[grepl("Ref$", counts$AlleleID), , drop = FALSE] + alt_counts <- counts[grepl("Alt$", counts$AlleleID), , drop = FALSE] + check_matching_sets(ref_counts$CloneID, alt_counts$CloneID, "counts ref-row markers", "counts alt-row markers") + ensure_unique(ref_counts$CloneID, "Counts ref-row CloneID") + ensure_unique(alt_counts$CloneID, "Counts alt-row CloneID") + ref_counts <- ref_counts[match(alt_counts$CloneID, ref_counts$CloneID), , drop = FALSE] + + rownames(ref_counts) <- ref_counts$CloneID + rownames(alt_counts) <- alt_counts$CloneID + alleles_df <- data.frame(MarkerName = alt_counts$CloneID, + REF = "A", + ALT = "B", + CountVariant = NA_character_, + CountAlleleSequenceRef = NA_character_, + CountAlleleSequenceAlt = NA_character_, + stringsAsFactors = FALSE) + rownames(alleles_df) <- alleles_df$MarkerName + + list(type = "collapsed", + alleles_df = alleles_df, + marker_names = alt_counts$CloneID, + sample_names = sample_cols, + ref_counts = ref_counts[, sample_cols, drop = FALSE], + alt_counts = alt_counts[, sample_cols, drop = FALSE]) } - # Convert the dosage matrix to genotypes - geno_df <- convert_dosage2gt(dosage, ploidy) + validate_report_vs_counts <- function(report, counts) { + if (!is.null(report$variant) && counts$type == "target") { + report_variant <- report$variant[counts$marker_names] + count_variant <- counts$alleles_df[counts$marker_names, "CountVariant"] + mismatch <- !is_missing_value(report_variant) & !is_missing_value(count_variant) & + trimws(report_variant) != trimws(count_variant) + if (any(mismatch)) { + stop("SNP/INDEL report Variant values do not match Counts Variant values. Examples: ", + paste(utils::head(counts$marker_names[mismatch], 5), collapse = ", ")) + } + } - #Combine info from the matrices to form the VCF information for each sample - # Combine the matrices into a single matrix with elements separated by ":" - make_vcf_format <- function(..., separator = ":") { - matrices <- list(...) - n <- length(matrices) + if (!is.null(report$allele_sequence_ref) && counts$type == "target") { + report_ref <- report$allele_sequence_ref[counts$marker_names] + report_alt <- report$allele_sequence_alt[counts$marker_names] + count_ref <- counts$alleles_df[counts$marker_names, "CountAlleleSequenceRef"] + count_alt <- counts$alleles_df[counts$marker_names, "CountAlleleSequenceAlt"] + mismatch <- (!is_missing_value(report_ref) & !is_missing_value(count_ref) & report_ref != count_ref) | + (!is_missing_value(report_alt) & !is_missing_value(count_alt) & report_alt != count_alt) + if (any(mismatch)) { + stop("SNP/INDEL report allele sequences do not match Counts allele sequences. Examples: ", + paste(utils::head(counts$marker_names[mismatch], 5), collapse = ", ")) + } + } + } - # Convert matrices to long form - long_forms <- lapply(matrices, function(mat) { - suppressMessages(reshape2::melt(mat, varnames = c("Row", "Col"), value.name = "Value")) - }) + ##Get information from DArT Counts and Dosage Report files + report <- parse_report(dosage_report, ploidy) + counts <- parse_counts(counts_file) + + check_matching_sets(report$marker_names, counts$marker_names, "report markers", "counts markers") + check_matching_sets(report$sample_names, counts$sample_names, "report samples", "counts samples") + validate_report_vs_counts(report, counts) + + marker_order <- report$marker_names + sample_order <- report$sample_names + + alleles_df <- counts$alleles_df[marker_order, , drop = FALSE] + marker_info <- report$marker_info[marker_order, , drop = FALSE] + coordinates <- derive_coordinates(marker_info) + alleles_df$Chrom <- coordinates$CHROM + alleles_df$ChromPos <- coordinates$POS + + ref_counts <- counts$ref_counts[marker_order, sample_order, drop = FALSE] + alt_counts <- counts$alt_counts[marker_order, sample_order, drop = FALSE] + ref_counts <- as_numeric_matrix(ref_counts, "Reference counts") + alt_counts <- as_numeric_matrix(alt_counts, "Alternate counts") + total_counts <- alt_counts + ref_counts - # Concatenate the elements - combined_long <- long_forms[[1]] - combined_long$Combined <- combined_long$Value + if (!identical(rownames(report$gt), rownames(ref_counts)) || + !identical(colnames(report$gt), colnames(ref_counts))) { + stop("Internal alignment error: genotype and count matrices are not identically ordered.") + } - for (i in 2:n) { - combined_long$Combined <- paste(combined_long$Combined, long_forms[[i]]$Value, sep = separator) - } + alleles_df$AltCountsSum <- rowSums(alt_counts) + alleles_df$RefCountsSum <- rowSums(ref_counts) + alleles_df$TotalCountSum <- alleles_df$AltCountsSum + alleles_df$RefCountsSum + + vcf_df <- data.frame( + CHROM = alleles_df$Chrom, + POS = alleles_df$ChromPos, + ID = alleles_df$MarkerName, + REF = alleles_df$REF, + ALT = alleles_df$ALT, + QUAL = ".", + FILTER = ".", + INFO = NA, + FORMAT = NA, + stringsAsFactors = FALSE + ) - # Convert back to wide form - combined_wide <- suppressMessages(reshape2::dcast(combined_long, Row ~ Col, value.var = "Combined")) + vcf_df$INFO <- paste0("DP=",alleles_df$TotalCountSum,";", + "ADS=",alleles_df$RefCountsSum,",",alleles_df$AltCountsSum) + vcf_df$FORMAT <- paste("GT","UD","DP","RA",sep=":") - # Restore row and column names - rownames(combined_wide) <- combined_wide$Row - combined_wide$Row <- NULL - colnames(combined_wide) <- colnames(matrices[[1]]) + message("Converting dosages to genotype format\n") - return(as.matrix(combined_wide)) + make_vcf_format <- function(gt_matrix, ud_matrix, dp_matrix, ra_matrix) { + matrix(paste(gt_matrix, ud_matrix, matrix_to_character(dp_matrix), matrix_to_character(ra_matrix), sep = ":"), + nrow = nrow(gt_matrix), ncol = ncol(gt_matrix), dimnames = dimnames(gt_matrix)) } message("Formatting VCF and generating output file\n") # Combine the matrices - geno_df <- make_vcf_format(geno_df, dosage, total_counts, ref_counts) + geno_df <- make_vcf_format(report$gt, report$ud, total_counts, ref_counts) #Combine the dataframes together vcf_df <- cbind(vcf_df,geno_df) @@ -278,7 +534,8 @@ dosage2vcf <- function(dart.report, dart.counts, ploidy, output.file) { write.table(vcf_df, file = output.file, sep = "\t", quote = FALSE, row.names = FALSE, col.names = TRUE, append = TRUE) ) #Unload all items from memory - rm(dosage) + rm(report) + rm(counts) rm(alt_counts) rm(ref_counts) rm(geno_df) diff --git a/R/find_parentage.R b/R/find_parentage.R new file mode 100644 index 0000000..b220a46 --- /dev/null +++ b/R/find_parentage.R @@ -0,0 +1,423 @@ +#' Find Parentage Assignments for Progeny +#' +#' Assigns the most likely parent(s) to each progeny individual based on +#' genotypic data using Mendelian error rates or homozygous mismatch rates. +#' +#' @param genotypes_file Path to a TSV/CSV/TXT file containing genotype data. +#' Must include an 'ID' column followed by marker columns coded as 0, 1, 2 +#' (allele dosage). +#' @param parents_file Path to a TSV/CSV/TXT file listing candidate parent IDs. +#' Must include an 'ID' column. An optional 'Sex' column with values +#' 'M' (male parent), 'F' (female parent), or 'A' (ambiguous) determines +#' which parents are tested for each role. If absent, all parents are treated +#' as ambiguous. +#' @param progeny_file Path to a TSV/CSV/TXT file listing progeny IDs to assign. +#' Must include an 'ID' column. +#' @param method Character. Parentage assignment method. One of: +#' \itemize{ +#' \item \code{"best_male_parent"} - finds the best male parent for each +#' progeny using homozygous mismatch rate. +#' \item \code{"best_female_parent"} - finds the best female parent for each +#' progeny using homozygous mismatch rate. +#' \item \code{"best_match"} - finds the single best parent (either sex) +#' using homozygous mismatch rate. +#' \item \code{"best_pair"} - finds the best male-female parent pair for +#' each progeny using full Mendelian error rate (default). +#' } +#' @param min_markers Integer. Minimum number of non-missing markers required +#' to report a parentage assignment. Progeny-parent comparisons with fewer +#' markers are flagged as \code{LOW_MARKERS} and no assignment is made +#' (default: \code{10}). +#' @param error_threshold Numeric. Maximum mismatch percentage to report a +#' parentage assignment as confident. Assignments above this threshold are +#' flagged as \code{HIGH_ERROR} in the \code{Assignment_Status} column +#' (default: \code{5.0}). Must be between 0 and 100. +#' @param show_ties Logical. If \code{TRUE}, all tied best pairs (after +#' tie-breaking by maximum markers tested) are reported as additional columns +#' (\code{Male_Parent_1}, \code{Male_Parent_2}, etc.) when +#' \code{method = "best_pair"}. The base columns (\code{Male_Parent}, +#' \code{Female_Parent}, etc.) are always populated with the top result. +#' If \code{FALSE}, only one tied pair is reported with a warning. +#' Default is \code{TRUE}. +#' @param allow_selfing Logical. If \code{FALSE}, male-female parent pairs where +#' both IDs are identical are excluded when \code{method = "best_pair"}. +#' Default is \code{TRUE}. +#' @param verbose Logical. If \code{TRUE}, prints progress messages, summary +#' statistics, and the results table to the console. Default is \code{TRUE}. +#' @param write_txt Logical. If \code{TRUE}, writes results to +#' \code{parentage_results_dt.txt} in the working directory. Default is +#' \code{TRUE}. +#' +#' @return A \code{data.table} with one row per progeny. Columns depend on the +#' method used: +#' \itemize{ +#' \item \code{best_male_parent} / \code{best_female_parent} / \code{best_match}: +#' \code{Progeny}, \code{Best_Match}, \code{Mendelian_Error_Pct}, +#' \code{Markers_Tested}, \code{Assignment_Status}. +#' \item \code{best_pair} (no ties after tie-breaking): \code{Progeny}, +#' \code{Male_Parent}, \code{Female_Parent}, \code{Mendelian_Error_Pct}, +#' \code{Markers_Tested}, \code{Assignment_Status}. +#' \item \code{best_pair} (ties remain after tie-breaking, +#' \code{show_ties = TRUE}): base columns are always populated with the +#' top result, plus suffix columns \code{Male_Parent_1}, +#' \code{Female_Parent_1}, etc. for each tied pair. +#' } +#' \code{Assignment_Status} is one of \code{PASS}, \code{HIGH_ERROR}, or +#' \code{LOW_MARKERS}. Returned invisibly when \code{verbose = TRUE}. +#' +#' @details +#' A homozygous-only genotype matrix is pre-computed once at startup and shared +#' across all methods that require it, avoiding redundant computation. +#' +#' For \code{"best_male_parent"}, \code{"best_female_parent"}, and +#' \code{"best_match"}, only homozygous markers (coded 0 or 2) are used for +#' comparison; heterozygous markers (coded 1) are set to \code{NA}. This +#' reduces false mismatches caused by phase ambiguity. +#' +#' For \code{"best_pair"}, all markers are used and full Mendelian inheritance +#' rules are applied. Mismatch rates and comparison counts are computed across +#' all progeny simultaneously using vectorised \code{vapply} calls, producing +#' \code{n_pairs x n_progeny} matrices and giving substantial speed gains for +#' large datasets. Both matrices are explicitly coerced to matrix form to +#' handle the edge case of a single parent pair correctly. +#' +#' When multiple pairs share the lowest Mendelian error rate, ties are broken +#' by selecting the pair(s) with the greatest number of markers tested. If ties +#' still remain after this step, all remaining tied pairs are reported when +#' \code{show_ties = TRUE}. +#' +#' The base columns (\code{Male_Parent}, \code{Female_Parent}, +#' \code{Mendelian_Error_Pct}, \code{Markers_Tested}) are always populated with +#' the top result, ensuring no missing values in these columns regardless of +#' tie status. +#' +#' Output rows are pre-allocated as a \code{data.table} and filled by reference +#' using \code{data.table::set()}, avoiding repeated memory allocation during +#' the results-building step. +#' +#' Individuals in \code{parents_file} or \code{progeny_file} that are absent +#' from \code{genotypes_file} are removed with a warning. +#' +#' Progeny with fewer non-missing markers than \code{min_markers} are flagged +#' \code{LOW_MARKERS} and no parent assignment is reported. Progeny whose best +#' match exceeds \code{error_threshold} are flagged \code{HIGH_ERROR}. +#' +#' @examples +#' \dontrun{ +#' # Assign best male-female parent pair to each progeny +#' results <- find_parentage( +#' genotypes_file = "genotypes.txt", +#' parents_file = "parents.txt", +#' progeny_file = "progeny.txt", +#' method = "best_pair", +#' min_markers = 50, +#' error_threshold = 5.0, +#' show_ties = TRUE, +#' allow_selfing = FALSE +#' ) +#' +#' # Find best individual parent match (ignoring sex) +#' results <- find_parentage( +#' genotypes_file = "genotypes.txt", +#' parents_file = "parents.txt", +#' progeny_file = "progeny.txt", +#' method = "best_match", +#' min_markers = 30, +#' error_threshold = 3.0 +#' ) +#' } +#' +#' @importFrom data.table fread fwrite copy CJ rbindlist set data.table as.data.table +#' @export +find_parentage <- function(genotypes_file, parents_file, progeny_file, + method = "best_pair", + min_markers = 10, + error_threshold = 5.0, + show_ties = TRUE, + allow_selfing = TRUE, + verbose = TRUE, + write_txt = TRUE) { + + #### Input Validation and Data Loading #### + allowed_methods <- c("best_male_parent", "best_female_parent", "best_match", "best_pair") + if (!method %in% allowed_methods) + stop("Method must be one of: ", paste(allowed_methods, collapse = ", ")) + if (min_markers < 1) + stop("min_markers must be a positive integer.") + if (error_threshold < 0 || error_threshold > 100) + stop("error_threshold must be between 0 and 100.") + + tryCatch({ + genos <- data.table::fread(genotypes_file, sep = "auto") + all_parents <- data.table::fread(parents_file, sep = "auto") + progeny_candidates <- data.table::fread(progeny_file, sep = "auto") + }, error = function(e) { + stop("Error reading input files. Ensure paths are correct and files are TXT/TSV/CSV.") + }) + + valid_ids <- genos$ID + removed_parents <- base::setdiff(all_parents$ID, valid_ids) + if (base::length(removed_parents) > 0) { + warning("The following parent IDs were not in the genotype file and will not be analyzed: ", + paste(removed_parents, collapse = ", "), call. = FALSE) + all_parents <- all_parents[ID %in% valid_ids] + } + + removed_progeny <- base::setdiff(progeny_candidates$ID, valid_ids) + if (base::length(removed_progeny) > 0) { + warning("The following progeny IDs were not in the genotype file and will not be analyzed: ", + paste(removed_progeny, collapse = ", "), call. = FALSE) + progeny_candidates <- progeny_candidates[ID %in% valid_ids] + } + + if (!"Sex" %in% base::colnames(all_parents)) { + warning("No 'Sex' column in parents file. All parents treated as ambiguous ('A').") + all_parents[, Sex := "A"] + } + + all_parents[, Sex := base::toupper(Sex)] + male_parent_candidates <- all_parents[Sex %in% c("M", "A", "NA"), .SD] + female_parent_candidates <- all_parents[Sex %in% c("F", "A", "NA")] + + if (base::nrow(male_parent_candidates) == 0 && method %in% c("best_male_parent", "best_pair")) + warning("No valid male parent candidates remain after filtering.", call. = FALSE) + if (base::nrow(female_parent_candidates) == 0 && method %in% c("best_female_parent", "best_pair")) + warning("No valid female parent candidates remain after filtering.", call. = FALSE) + if (base::nrow(progeny_candidates) == 0) + stop("No valid progeny candidates remain after filtering.") + + #### Pre-compute genotype matrices once (shared across all methods) #### + # Full genotype matrix used by best_pair + genos_mat <- base::as.matrix(genos, rownames = "ID") + + # Homozygous-only matrix (het markers set to NA) used by hom methods + genos_hom <- data.table::copy(genos) + marker_cols <- base::setdiff(base::names(genos_hom), "ID") + for (col in marker_cols) + genos_hom[base::get(col) == 1, (col) := NA_integer_] + genos_hom_mat <- base::as.matrix(genos_hom, rownames = "ID") + + #### Assignment_Status from markers and error rate #### + # Returns LOW_MARKERS, HIGH_ERROR, or PASS + assign_status <- function(markers, error_pct) { + base::ifelse(markers < min_markers, "LOW_MARKERS", + base::ifelse(error_pct > error_threshold, "HIGH_ERROR", "PASS")) + } + + #### Logic for Homozygous Matching Methods #### + if (method %in% c("best_male_parent", "best_female_parent", "best_match")) { + parent_ids <- base::switch(method, + "best_male_parent" = male_parent_candidates$ID, + "best_female_parent" = female_parent_candidates$ID, + "best_match" = base::union(male_parent_candidates$ID, + female_parent_candidates$ID)) + + # Subset pre-computed homozygous matrix for relevant parents and progeny + parent_genos <- genos_hom_mat[base::rownames(genos_hom_mat) %in% parent_ids, , drop = FALSE] + progeny_genos <- genos_hom_mat[base::rownames(genos_hom_mat) %in% progeny_candidates$ID, , drop = FALSE] + + # Pre-allocate results data.table; fill by reference with set() + n_progeny <- base::nrow(progeny_genos) + results_dt <- data.table::data.table( + Progeny = base::rownames(progeny_genos), + Best_Match = NA_character_, + Mendelian_Error_Pct = NA_real_, + Markers_Tested = NA_integer_, + Assignment_Status = NA_character_ + ) + + for (i in base::seq_len(n_progeny)) { + progeny_vec <- progeny_genos[i, ] + mismatches <- base::rowSums(parent_genos != progeny_vec, na.rm = TRUE) + comparisons <- base::rowSums(!base::is.na(parent_genos) & !base::is.na(progeny_vec)) + percent_mismatch <- (mismatches / comparisons) * 100 + percent_mismatch[base::is.nan(percent_mismatch)] <- NA + + best_idx <- base::which.min(percent_mismatch) + + # No candidate found - flag LOW_MARKERS and continue + if (base::length(best_idx) == 0) { + data.table::set(results_dt, i, "Markers_Tested", 0L) + data.table::set(results_dt, i, "Assignment_Status", "LOW_MARKERS") + next + } + + best_markers <- comparisons[best_idx] + best_error <- base::round(percent_mismatch[best_idx], 2) + + data.table::set(results_dt, i, "Best_Match", base::rownames(parent_genos)[best_idx]) + data.table::set(results_dt, i, "Mendelian_Error_Pct", best_error) + data.table::set(results_dt, i, "Markers_Tested", base::as.integer(best_markers)) + data.table::set(results_dt, i, "Assignment_Status", assign_status(best_markers, best_error)) + } + final_df <- results_dt + } + + #### Logic for Best Pair Method #### + if (method == "best_pair") { + parent_pairs <- data.table::CJ(Male_Parent = male_parent_candidates$ID, + Female_Parent = female_parent_candidates$ID) + if (!allow_selfing) { + parent_pairs <- parent_pairs[Male_Parent != Female_Parent] + if (verbose) base::cat("Selfing is disallowed. Pairs with identical parents are removed.\n") + } + if (base::nrow(parent_pairs) == 0) stop("No valid parent pairs to test.") + + # Pre-extract parent genotype row blocks for vectorised operations + male_parent_genos_mat <- genos_mat[parent_pairs$Male_Parent, , drop = FALSE] + female_parent_genos_mat <- genos_mat[parent_pairs$Female_Parent, , drop = FALSE] + + # Subset full genotype matrix to progeny only + progeny_ids <- progeny_candidates$ID + progeny_mat <- genos_mat[progeny_ids, , drop = FALSE] + n_progeny <- base::nrow(progeny_mat) + n_pairs <- base::nrow(parent_pairs) + + # Vectorised mismatch computation across ALL progeny at once + # Wrapped in matrix() to handle the n_pairs = 1 edge case + mismatch_mat <- base::matrix( + base::vapply(base::seq_len(n_progeny), function(j) { + progeny_vec <- progeny_mat[j, ] + base::rowSums( + (male_parent_genos_mat == 0 & female_parent_genos_mat == 0 & progeny_vec > 0) | + (male_parent_genos_mat == 2 & female_parent_genos_mat == 2 & progeny_vec < 2) | + ((male_parent_genos_mat == 0 & female_parent_genos_mat == 1) | + (male_parent_genos_mat == 1 & female_parent_genos_mat == 0)) & (progeny_vec == 2) | + ((male_parent_genos_mat == 2 & female_parent_genos_mat == 1) | + (male_parent_genos_mat == 1 & female_parent_genos_mat == 2)) & (progeny_vec == 0) | + ((male_parent_genos_mat == 0 & female_parent_genos_mat == 2) | + (male_parent_genos_mat == 2 & female_parent_genos_mat == 0)) & (progeny_vec != 1), + na.rm = TRUE + ) + }, numeric(n_pairs)), + nrow = n_pairs, ncol = n_progeny + ) + + # Vectorised comparison count across ALL progeny at once + # Wrapped in matrix() to handle the n_pairs = 1 edge case + comparison_mat <- base::matrix( + base::vapply(base::seq_len(n_progeny), function(j) { + progeny_vec <- progeny_mat[j, ] + base::rowSums(!base::is.na(male_parent_genos_mat) & + !base::is.na(female_parent_genos_mat) & + !base::is.na(progeny_vec)) + }, numeric(n_pairs)), + nrow = n_pairs, ncol = n_progeny + ) + + # Percent mismatch matrix: n_pairs x n_progeny + pct_mismatch_mat <- (mismatch_mat / comparison_mat) * 100 + pct_mismatch_mat[base::is.nan(pct_mismatch_mat)] <- NA + + # Pre-allocate base results data.table; tie columns added dynamically + results_dt <- data.table::data.table( + Progeny = progeny_ids, + Male_Parent = NA_character_, + Female_Parent = NA_character_, + Mendelian_Error_Pct = NA_character_, + Markers_Tested = NA_integer_, + Assignment_Status = NA_character_ + ) + + # Per-progeny result extraction from pre-computed matrices + results_list <- base::vector("list", n_progeny) + for (j in base::seq_len(n_progeny)) { + prog_id <- progeny_ids[j] + percent_mismatch <- pct_mismatch_mat[, j] + comparisons <- comparison_mat[, j] + min_mismatch_val <- base::min(percent_mismatch, na.rm = TRUE) + + # No markers overlap at all - flag LOW_MARKERS + if (base::is.infinite(min_mismatch_val)) { + data.table::set(results_dt, j, "Markers_Tested", 0L) + data.table::set(results_dt, j, "Assignment_Status", "LOW_MARKERS") + next + } + + best_indices <- base::which(percent_mismatch == min_mismatch_val) + + # Tie-break: prefer pair(s) with the most markers tested + if (base::length(best_indices) > 1) { + best_markers_per_pair <- comparisons[best_indices] + max_markers <- base::max(best_markers_per_pair) + best_indices <- best_indices[best_markers_per_pair == max_markers] + } + + best_pairs <- parent_pairs[best_indices] + best_markers <- comparisons[best_indices[1]] + best_error <- base::round(min_mismatch_val, 2) + a_status <- assign_status(best_markers, best_error) + + if (!show_ties && base::nrow(best_pairs) > 1) { + warning("Progeny '", prog_id, "' has ", base::nrow(best_pairs), + " tied best pairs. Only one is reported as show_ties=FALSE.", call. = FALSE) + } + + num_to_report <- base::min(base::nrow(best_pairs), + if (show_ties) base::nrow(best_pairs) else 1) + + # Always populate base columns with the top result + data.table::set(results_dt, j, "Male_Parent", best_pairs$Male_Parent[1]) + data.table::set(results_dt, j, "Female_Parent", best_pairs$Female_Parent[1]) + data.table::set(results_dt, j, "Mendelian_Error_Pct", base::sprintf("%.2f", min_mismatch_val)) + data.table::set(results_dt, j, "Markers_Tested", base::as.integer(best_markers)) + data.table::set(results_dt, j, "Assignment_Status", a_status) + + # If ties remain after tie-breaking and show_ties is TRUE, + # store tie details for later column binding + if (show_ties && num_to_report > 1) { + tie_row <- base::list(Progeny = prog_id) + for (k in base::seq_len(num_to_report)) { + tie_row[[base::paste0("Male_Parent_", k)]] <- best_pairs$Male_Parent[k] + tie_row[[base::paste0("Female_Parent_", k)]] <- best_pairs$Female_Parent[k] + tie_row[[base::paste0("Mendelian_Error_Pct_", k)]] <- min_mismatch_val + tie_row[[base::paste0("Markers_Tested_", k)]] <- comparisons[best_indices[k]] + } + results_list[[j]] <- data.table::as.data.table(tie_row) + } + } + + # Merge any tie suffix columns onto the pre-allocated base table + tie_rows <- data.table::rbindlist( + base::Filter(Negate(base::is.null), results_list), + fill = TRUE + ) + if (base::nrow(tie_rows) > 0) { + final_df <- merge(results_dt, tie_rows, by = "Progeny", all.x = TRUE) + } else { + final_df <- results_dt + } + } + + #### Summary #### + if (verbose) { + total <- base::nrow(final_df) + a_counts <- base::table(final_df$Assignment_Status) + base::cat("\n--- Parentage Assignment Summary ---\n") + base::cat("Total progeny evaluated:", total, "\n") + for (s in base::names(a_counts)) + base::cat(base::sprintf(" %-14s: %d (%.1f%%)\n", s, + a_counts[s], (a_counts[s] / total) * 100)) + base::cat("Min markers threshold :", min_markers, "\n") + base::cat("Error threshold :", error_threshold, "%\n\n") + } + + #### Output #### + if (write_txt) { + output_filename <- "parentage_results_dt.txt" + tryCatch({ + data.table::fwrite(final_df, file = output_filename, sep = "\t", quote = FALSE) + if (verbose) base::cat("Results successfully written to:", output_filename, "\n") + }, error = function(e) { + warning("Could not write results to file. Error: ", e$message, call. = FALSE) + }) + } + + if (verbose) { + base::cat("\n--- Parentage Assignment Results ---\n") + base::print(final_df) + return(base::invisible(final_df)) + } else { + return(final_df) + } +} diff --git a/R/get_countsMADC.R b/R/get_countsMADC.R index 8be1cda..c1de03d 100644 --- a/R/get_countsMADC.R +++ b/R/get_countsMADC.R @@ -1,13 +1,38 @@ #' Obtain Read Counts from MADC File #' -#' This function takes the MADC file as input and retrieves the ref and alt counts for each sample, -#' and converts them to ref, alt, and size(total count) matrices for dosage calling tools. At the moment, -#' only the read counts for the Ref and Alt target loci are obtained while the additional loci are ignored. +#' Reads a DArTag MADC report and returns reference and total read count matrices +#' per marker and sample. Only `Ref` and `Alt` target loci are retained; +#' `|AltMatch` / `|RefMatch` rows are either discarded or collapsed depending on +#' `collapse_matches_counts`. #' +#' @details +#' Either `madc_file` or `madc_object` must be provided (not both `NULL`). +#' When `madc_object` is supplied it is passed directly to `get_counts()`, +#' skipping file I/O. The function constructs: +#' - `ref_matrix` - per-sample reference allele counts. +#' - `size_matrix` - per-sample total counts (ref + alt). +#' +#' Markers whose `CloneID` appears only in the `Ref` or only in the `Alt` rows +#' are removed with a warning. A summary of the proportion of zero-count +#' data points (missing data) is reported via `vmsg()`. +#' +#' @param madc_file character or `NULL`. Path to the input MADC CSV file. +#' At least one of `madc_file` or `madc_object` must be provided. +#' @param madc_object data frame or `NULL`. A pre-read MADC data frame +#' (e.g., as returned by `check_botloci()`). When supplied, file reading is +#' skipped. At least one of `madc_file` or `madc_object` must be provided. +#' @param collapse_matches_counts logical. If `TRUE`, counts for `|AltMatch` +#' and `|RefMatch` rows are summed into their corresponding `|Ref` and `|Alt` +#' rows. If `FALSE` (default), `|AltMatch` and `|RefMatch` rows are discarded. +#' @param verbose logical. Whether to print progress messages. Default is `TRUE`. +#' +#' @return A named list with two numeric matrices, both with markers as rows +#' and samples as columns: +#' \describe{ +#' \item{`ref_matrix`}{Reference allele read counts.} +#' \item{`size_matrix`}{Total read counts (reference + alternative).} +#' } #' -#' @param madc_file Path to MADC file -#' @import dplyr -#' @return A list of read count matrices for reference, alternate, and total read count values #' @examples #' # Get the path to the MADC file #' madc_path <- system.file("iris_DArT_MADC.csv", package = "BIGr") @@ -15,17 +40,38 @@ #' # Extract the read count matrices #' counts_matrices <- get_countsMADC(madc_path) #' -#' # Access the reference, alternate, and size matrices -#' -#' # ref_matrix <- counts_matrices$ref_matrix -#' # alt_matrix <- counts_matrices$alt_matrix +#' # Access the reference and size matrices +#' # ref_matrix <- counts_matrices$ref_matrix #' # size_matrix <- counts_matrices$size_matrix #' #' rm(counts_matrices) +#' +#' @seealso [check_madc_sanity()] +#' +#' @import dplyr #' @export -get_countsMADC <- function(madc_file) { +get_countsMADC <- function(madc_file = NULL, madc_object = NULL, collapse_matches_counts = FALSE, verbose = TRUE) { + + # Add check inputs + if(is.null(madc_file) && is.null(madc_object)) stop("Please provide either madc_file or madc_object.") + if(!is.null(madc_file) && !is.null(madc_object)) stop("Please provide either madc_file or madc_object. Not both.") + if(!is.null(madc_file) && !file.exists(madc_file)) stop("MADC file not found. Please provide a valid path.") + if(!is.null(madc_object) && !is.data.frame(madc_object)) stop("madc_object must be a data frame.") + + vmsg(paste0("Extracting read counts from ", ifelse(!is.null(madc_file), paste0("MADC file: ", madc_file), "madc_object")), verbose = verbose, level = 0, type = ">>") + vmsg(ifelse(collapse_matches_counts, + "|AltMatch and |RefMatch counts will be collapsed into their respective |Ref and |Alt alleles.", + "|AltMatch and |RefMatch rows will be discarded (collapse_matches_counts = FALSE)."), + verbose = verbose, level = 1, type = ">>") + # This function takes the MADC file as input and generates a Ref and Alt counts dataframe as output - update_df <- get_counts(madc_file) + if (is.null(madc_object)) { + update_df <- get_counts(madc_file = madc_file, collapse_matches_counts = collapse_matches_counts, verbose = verbose) + } else { + update_df <- get_counts(madc_object = madc_object, collapse_matches_counts = collapse_matches_counts, verbose = verbose) + } + # Ensure plain data.frame so row.names<- does not trigger tibble deprecation warning + update_df <- as.data.frame(update_df) # Filter rows where 'AlleleID' ends with 'Ref' ref_df <- subset(update_df, grepl("Ref$", AlleleID)) @@ -43,9 +89,17 @@ get_countsMADC <- function(madc_file) { #Retain only the rows in common if they are not identical and provide warning if (same == FALSE) { - warning("Mismatch between Ref and Alt Markers. Markers without a Ref or Alt match removed.") # Find the common CloneIDs between the two dataframes + all_mks <- unique(c(rownames(ref_df), rownames(alt_df))) common_ids <- intersect(rownames(ref_df), rownames(alt_df)) + n_singles <- length(all_mks) - length(common_ids) + + vmsg(paste("There are", n_singles,"Ref tags without corresponding Alt tags, or vice versa"), verbose = verbose, level = 2, type = ">>") + vmsg("Only the markers with both Ref and Alt tags will be retained for the conversion", verbose = verbose, level = 1, type = ">>") + vmsg("Consider providing a haplotype database file to resolve unpaired Ref/Alt sequences", verbose = verbose, level = 1, type = ">>") + + warning(paste("There are", n_singles,"Ref tags without corresponding Alt tags, or vice versa. Only the markers with both Ref and Alt tags will be retained for the conversion. Consider providing a haplotype database file to resolve unpaired Ref/Alt sequences.")) + # Subset both dataframes to retain only the common rows ref_df <- ref_df[common_ids, ] alt_df <- alt_df[common_ids, ] @@ -77,7 +131,7 @@ get_countsMADC <- function(madc_file) { # Print the result ratio_missing_data <- count_zeros / length(size_matrix) - message("Ratio of missing data =", ratio_missing_data, "\n") + vmsg(paste0("Percentage of missing data (datapoints with 0 total count): ", round(ratio_missing_data * 100, 2), "%"), verbose = verbose, level = 2, type = ">>") # Return the ref and alt matrices as a list matrices_list <- list(ref_matrix = ref_matrix, size_matrix = size_matrix) @@ -86,41 +140,117 @@ get_countsMADC <- function(madc_file) { } -get_counts <- function(madc_file) { +#' Read and Pre-process a MADC File +#' +#' Reads a DArTag MADC CSV file (or accepts a pre-read data frame), detects the +#' file format, and returns a filtered data frame containing only `Ref` and `Alt` +#' haplotype rows ready for count-matrix construction. +#' +#' @details +#' **Input**: either `madc_file` (path to CSV) or `madc_object` (pre-read data +#' frame) must be supplied; at least one is required. +#' +#' **Format detection** (applied to file or object alike): the first seven rows +#' of the first column are inspected: +#' - **Standard format**: all entries are blank or `"*"` - the first 7 rows are +#' treated as DArT placeholder rows and skipped. +#' - **Fixed-allele-ID format**: no filler rows - data are used as-is. +#' +#' **`|AltMatch` / `|RefMatch` handling** (controlled by `collapse_matches_counts`): +#' - `FALSE` (default): these rows are simply discarded. +#' - `TRUE`: their counts are summed into the corresponding `|Ref` or `|Alt` +#' row for the same `CloneID`. +#' +#' In all cases, trailing suffixes on `AlleleID` (e.g., `|Ref_001`, `|Alt_002`) +#' are stripped to the canonical `|Ref` / `|Alt` form. +#' +#' @param madc_file character or `NULL`. Path to the input MADC CSV file. +#' At least one of `madc_file` or `madc_object` must be provided. +#' @param madc_object data frame or `NULL`. A pre-read MADC data frame +#' (e.g., from `check_botloci()`). When supplied, file reading is skipped. +#' At least one of `madc_file` or `madc_object` must be provided. +#' @param collapse_matches_counts logical. If `TRUE`, counts for `|AltMatch` +#' and `|RefMatch` rows are summed into their corresponding `|Ref` and `|Alt` +#' rows. If `FALSE` (default), those rows are discarded. +#' @param verbose logical. Whether to print progress messages. Default is `TRUE`. +#' +#' @return A data frame with one row per `Ref` or `Alt` allele entry, retaining +#' all original columns (`AlleleID`, `CloneID`, `AlleleSequence`, sample +#' count columns, etc.). +#' +#' @importFrom dplyr mutate group_by summarise across where select +#' @importFrom dplyr %>% filter case_when +#' +#' @keywords internal +#' @noRd +get_counts <- function(madc_file = NULL, madc_object = NULL, collapse_matches_counts = FALSE, verbose = TRUE) { + + # Add check inputs + if(is.null(madc_file) && is.null(madc_object)) stop("Please provide either madc_file or madc_object.") + if(!is.null(madc_file) && !file.exists(madc_file)) stop("MADC file not found. Please provide a valid path.") + if(!is.null(madc_object) && !is.data.frame(madc_object)) stop("madc_object must be a data frame.") + # Read the MADC file + + if(!is.null(madc_file)){ #Read only the first column for the first seven rows first_seven_rows <- read.csv(madc_file, header = FALSE, nrows = 7, colClasses = c(NA, "NULL")) #Check if all entries in the first column are either blank or "*" check_entries <- all(first_seven_rows[, 1] %in% c("", "*")) + } else { + check_entries <- all(madc_object[1:min(7L, nrow(madc_object)), 1] %in% c("", "*")) + } + #Check if the MADC file has the filler rows or is processed from updated fixed allele ID pipeline if (check_entries) { #Note: This assumes that the first 7 rows are placeholder info from DArT processing - #Read the madc file - madc_df <- read.csv(madc_file, sep = ',', skip = 7, check.names = FALSE) - - #Retain only the Ref and Alt haplotypes - filtered_df <- madc_df[!grepl("\\|AltMatch|\\|RefMatch", madc_df$AlleleID), ] - - #Remove extra text after Ref and Alt (_001 or _002) - filtered_df$AlleleID <- sub("\\|Ref.*", "|Ref", filtered_df$AlleleID) - filtered_df$AlleleID <- sub("\\|Alt.*", "|Alt", filtered_df$AlleleID) - + vmsg("Detected raw MADC format with 7-row header. Reading file while skipping the first 7 rows.", verbose = verbose, level = 1, type = ">>") + if(!is.null(madc_file)){ + madc_df <- read.csv(madc_file, sep = ',', skip = 7, check.names = FALSE) + } else { + madc_df <- madc_object[-(1:7), ] + } } else { #Read the madc file - madc_df <- read.csv(madc_file, sep = ',', check.names = FALSE) - - # Retain only the Ref and Alt haplotypes - filtered_df <- madc_df[!grepl("\\|AltMatch|\\|RefMatch", madc_df$AlleleID), ] + vmsg("Detected fixed allele IDs MADC format", verbose = verbose, level = 1, type = ">>") + if(!is.null(madc_file)){ + madc_df <- read.csv(madc_file, sep = ',', check.names = FALSE) + } else { + madc_df <- madc_object + } + } - #Remove extra text after Ref and Alt (_001 or _002) - filtered_df$AlleleID <- sub("\\|Ref.*", "|Ref", filtered_df$AlleleID) - filtered_df$AlleleID <- sub("\\|Alt.*", "|Alt", filtered_df$AlleleID) + if(collapse_matches_counts){ + filtered_df <- madc_df[order(madc_df$AlleleID),] %>% + # Keep only Ref/Alt alleles and their Match variants; drop other allele types + filter(grepl("\\|(Ref|Alt)(Match)?(_|$)", AlleleID)) %>% + mutate( + Type = case_when( + grepl("\\|Alt(Match)?(_|$)", AlleleID) ~ "Alt", + grepl("\\|Ref(Match)?(_|$)", AlleleID) ~ "Ref" + ) + ) %>% + group_by(CloneID, Type) %>% + summarise( + AlleleID = paste0(unique(CloneID), "|", unique(Type)), + AlleleSequence = first(AlleleSequence), + across(where(is.numeric), ~ sum(.x, na.rm = TRUE)), + .groups = "drop" + ) %>% + select(AlleleID, CloneID, AlleleSequence, everything(), -Type) + } else { + #Retain only the Ref and Alt haplotypes + filtered_df <- madc_df[!grepl("\\|AltMatch|\\|RefMatch", madc_df$AlleleID), ] } + #Remove extra text after Ref and Alt (_001 or _002) + filtered_df$AlleleID <- sub("\\|Ref.*", "|Ref", filtered_df$AlleleID) + filtered_df$AlleleID <- sub("\\|Alt.*", "|Alt", filtered_df$AlleleID) + return(filtered_df) } diff --git a/R/imputation_concordance.R b/R/imputation_concordance.R index 6ab2eba..918071a 100644 --- a/R/imputation_concordance.R +++ b/R/imputation_concordance.R @@ -1,89 +1,136 @@ #' Calculate Concordance between Imputed and Reference Genotypes #' -#' This function calculates the concordance between imputed and reference genotypes. It assumes that samples are rows and markers are columns. -#' It is recommended to use allele dosages (0, 1, 2) but will work with other formats. Missing data in reference or imputed genotypes -#' will not be considered for concordance if the `missing_code` argument is used. If a specific subset of markers should be excluded, -#' it can be provided using the `snps_2_exclude` argument. +#' This function calculates the concordance between imputed and reference +#' genotypes. It assumes that samples are rows and markers are columns. +#' Allele dosages (0, 1, 2) are recommended but other numeric formats are supported. +#' Missing data in either dataset can be excluded from the concordance calculation +#' using the `missing_code` argument. Specific markers can be excluded using +#' the `snps_2_exclude` argument. #' -#' @param reference_genos A data frame containing reference genotype data, with rows as samples and columns as markers. Dosage format (0, 1, 2) is recommended. -#' @param imputed_genos A data frame containing imputed genotype data, with rows as samples and columns as markers. Dosage format (0, 1, 2) is recommended. -#' @param missing_code An optional value to specify missing data. If provided, loci with this value in either dataset will be excluded from the concordance calculation. -#' @param snps_2_exclude An optional vector of marker IDs to exclude from the concordance calculation. -#' @param verbose A logical value indicating whether to print a summary of the concordance results. Default is FALSE. +#' @param reference_genos A data frame containing reference genotype data, +#' with rows as samples and columns as markers. Must include a column named `ID`. #' -#' @return A list with two elements: -#' \itemize{ -#' \item \code{result_df}: A data frame with sample IDs and their concordance percentages. -#' \item \code{summary_concordance}: A summary of concordance percentages, including minimum, maximum, mean, and quartiles. -#' } +#' @param imputed_genos A data frame containing imputed genotype data, +#' with rows as samples and columns as markers. Must include a column named `ID`. #' -#' @details The function identifies common samples and markers between the reference and imputed genotype datasets. It calculates the percentage of matching genotypes for each sample, excluding missing data and specified markers. The concordance is reported as a percentage for each sample, along with a summary of the overall concordance distribution. +#' @param missing_code Optional value specifying missing data. If provided, +#' loci with this value in either dataset will be excluded from the concordance calculation. #' -#' @import dplyr +#' @param snps_2_exclude Optional vector of marker IDs to exclude from the concordance calculation. #' -#' @examples +#' @param verbose Logical. If `TRUE`, prints summary statistics (minimum, quartiles, +#' median, mean, maximum) of concordance percentages. #' -#' # Example Input variables -#' ignore_file <- system.file("imputation_ignore.txt", package="BIGr") -#' ref_file <- system.file("imputation_reference.txt", package="BIGr") -#' test_file <- system.file("imputation_test.txt", package="BIGr") +#' @param plot Logical. If `TRUE`, produces a bar plot of concordance percentage +#' by sample. #' -#' # Import files -#' snps = read.table(ignore_file, header = TRUE) -#' ref = read.table(ref_file, header = TRUE) -#' test = read.table(test_file, header = TRUE) +#' @param print_result Logical. If `TRUE` (default), prints the concordance +#' results data frame to the console. If `FALSE`, results are returned invisibly. #' -#' #Calculations -#' result <- imputation_concordance(reference_genos = ref, -#' imputed_genos = test, -#' snps_2_exclude = snps, -#' missing_code = 5, -#' verbose = FALSE) +#' @return A data frame with: +#' \itemize{ +#' \item \code{ID}: Sample identifiers shared between the datasets. +#' \item \code{Concordance}: Percentage of matching genotypes per sample. +#' } +#' If \code{print_result = FALSE}, the data frame is returned invisibly. #' +#' @details +#' The function: +#' \enumerate{ +#' \item Identifies common samples and markers between the datasets. +#' \item Optionally excludes specified SNPs. +#' \item Removes loci with missing data (if \code{missing_code} is provided). +#' \item Computes per-sample concordance as the percentage of matching genotypes. +#' } #' +#' When \code{plot = TRUE}, a bar plot showing concordance percentage per sample +#' is generated using \pkg{ggplot2}. #' -#' @export +#' @import dplyr +#' +#' @examples +#' ref <- data.frame( +#' ID = c("S1", "S2", "S3"), +#' SNP1 = c(0, 1, 2), +#' SNP2 = c(1, 1, 0), +#' SNP3 = c(2, 5, 1) +#' ) #' +#' test <- data.frame( +#' ID = c("S1", "S2", "S3"), +#' SNP1 = c(0, 0, 2), +#' SNP2 = c(1, 1, 1), +#' SNP3 = c(2, 5, 0) +#' ) +#' +#' result <- imputation_concordance( +#' reference_genos = ref, +#' imputed_genos = test, +#' snps_2_exclude = "SNP2", +#' missing_code = 5, +#' print_result = FALSE +#' ) +#' +#' result +#' +#' @importFrom stats reorder +#' @export imputation_concordance <- function(reference_genos, imputed_genos, missing_code = NULL, snps_2_exclude = NULL, - verbose = FALSE) { + verbose = FALSE, + plot = FALSE, + print_result = TRUE) { # Find common IDs common_ids <- intersect(imputed_genos$ID, reference_genos$ID) - imputed_genos <- imputed_genos %>% filter(ID %in% common_ids) %>% arrange(ID) - reference_genos <- reference_genos %>% filter(ID %in% common_ids) %>% arrange(ID) + imputed_genos <- imputed_genos %>% + filter(ID %in% common_ids) %>% + arrange(ID) + + reference_genos <- reference_genos %>% + filter(ID %in% common_ids) %>% + arrange(ID) - # Find common SNPs, excluding those in snps_2_exclude if provided + # Find common SNPs common_snps <- setdiff( intersect(colnames(imputed_genos), colnames(reference_genos)), as.vector(unlist(snps_2_exclude)) ) - # Subset and convert to matrices for faster computation + # Remove ID column if present + common_snps <- setdiff(common_snps, "ID") + + # Convert to matrices imputed_matrix <- as.matrix(imputed_genos[, common_snps]) reference_matrix <- as.matrix(reference_genos[, common_snps]) - # Identify valid SNPs that are not missing in either dataset + # Identify valid SNPs if (!is.null(missing_code)) { - valid_snps <- (imputed_matrix != missing_code) & (reference_matrix != missing_code) + valid_snps <- (imputed_matrix != missing_code) & + (reference_matrix != missing_code) } else { - valid_snps <- matrix(TRUE, nrow = nrow(imputed_matrix), ncol = ncol(imputed_matrix)) + valid_snps <- matrix(TRUE, + nrow = nrow(imputed_matrix), + ncol = ncol(imputed_matrix)) } - # Compute concordance (row-wise percentage of matching SNPs) + # Compute concordance matches <- (imputed_matrix == reference_matrix) & valid_snps - percentage_match <- rowSums(matches, na.rm = TRUE) / rowSums(valid_snps, na.rm = TRUE) + percentage_match <- rowSums(matches, na.rm = TRUE) / + rowSums(valid_snps, na.rm = TRUE) - # Create output data frame + percentage_match[is.nan(percentage_match)] <- NA + + # Output data frame (original structure preserved) result_df <- data.frame( - ID = common_ids, + ID = imputed_genos$ID, Concordance = paste0(round(percentage_match * 100, 2), "%") ) - # Print mean concordance + # Summary statistics summary_concordance <- summary(percentage_match, na.rm = TRUE) * 100 names(summary_concordance) <- c("Min", "1st Qu.", "Median", "Mean", "3rd Qu.", "Max") @@ -94,6 +141,39 @@ imputation_concordance <- function(reference_genos, } } - return(result_df) -} + # Print results to console (NEW OPTION) + if (print_result) { + print(result_df) + } + + # Optional plot + if (plot) { + if (!requireNamespace("ggplot2", quietly = TRUE)) { + stop("Package 'ggplot2' is required when plot = TRUE.", call. = FALSE) + } + + plot_df <- data.frame( + ID = imputed_genos$ID, + Concordance = percentage_match * 100 + ) + concordance_plot <- ggplot2::ggplot( + plot_df, + ggplot2::aes(x = reorder(ID, Concordance), y = Concordance) + ) + + ggplot2::geom_bar(stat = "identity") + + ggplot2::labs(title = "Imputation Concordance by Sample", + x = "Sample ID", + y = "Concordance (%)") + + ggplot2::theme_minimal() + + ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 90, hjust = 1)) + + print(concordance_plot) + } + + if (print_result) { + return(result_df) + } else { + invisible(result_df) + } +} diff --git a/R/madc2vcf_all.R b/R/madc2vcf_all.R index 329ac5a..e223977 100644 --- a/R/madc2vcf_all.R +++ b/R/madc2vcf_all.R @@ -2,8 +2,8 @@ #' #' This function processes a MADC file to generate a VCF file containing both target and off-target SNPs. It includes options for filtering multiallelic SNPs and parallel processing to improve performance. #' -#' @param madc A string specifying the path to the MADC file. -#' @param botloci_file A string specifying the path to the file containing the target IDs designed in the bottom strand. +#' @param madc Required. A string specifying the path or URL to the MADC file. +#' @param botloci_file Required. A string specifying the path or URL to the file containing the target IDs designed in the bottom strand. #' @param hap_seq_file A string specifying the path to the haplotype database fasta file. #' @param rm_multiallelic_SNP A logical value. If TRUE, SNPs with more than one alternative base are removed. If FALSE, the thresholds specified by `multiallelic_SNP_dp_thr` and `multiallelic_SNP_sample_thr` are used to filter low-frequency SNP alleles. Default is FALSE. #' @param multiallelic_SNP_dp_thr A numeric value specifying the minimum depth by tag threshold for filtering low-frequency SNP alleles when `rm_multiallelic_SNP` is FALSE. Default is 0. @@ -11,6 +11,10 @@ #' @param alignment_score_thr A numeric value specifying the minimum alignment score threshold. Default is 40. #' @param n.cores An integer specifying the number of cores to use for parallel processing. Default is 1. #' @param out_vcf A string specifying the name of the output VCF file. If the file extension is not `.vcf`, it will be appended automatically. +#' @param markers_info A string specifying the path to a CSV file with marker information (CloneID/BI_markerID, Chr, Pos, Ref, Alt, Type, Indel_pos columns as needed). +#' @param add_others A logical value. If TRUE, alleles labeled "Other" in the MADC file are included in off-target SNP extraction. Default is TRUE. +#' @param others_max_snps An integer or NULL. If not NULL, Other alleles with more than this many SNP differences versus the Ref sequence (as detected by pairwise alignment) are discarded. Default is NULL (no limit). +#' @param others_rm_with_indels A logical value. If TRUE, Other alleles that contain insertions or deletions relative to the Ref sequence (as detected by pairwise alignment) are discarded. Default is TRUE. #' @param verbose A logical value indicating whether to print metrics and progress to the console. Default is TRUE. #' #' @return This function does not return an R object. It writes the processed VCF file v4.3 to the specified `out_vcf` path. @@ -18,6 +22,18 @@ #' @details #' The function processes a MADC file to generate a VCF file containing both target and off-target SNPs. It uses parallel processing to improve performance and provides options to filter multiallelic SNPs based on user-defined thresholds. The alignment score threshold can be adjusted using the `alignment_score_thr` parameter. The generated VCF file includes metadata about the processing parameters and the BIGr package version. If the `alignment_score_thr` is not met, the corresponding SNPs are discarded. #' +#' **Sanity check behaviour and requirements** +#' +#' | Check | Status | Required | +#' |---|---|---| +#' | **Indels** | detected | `markers_info` with `Ref`/`Alt`/`Indel_pos`/`Indel_length` + `botloci_file` | +#' | | not detected | `botloci_file` | +#' | **ChromPos** | valid | `botloci_file` | +#' | | invalid | `markers_info` with `Chr`/`Pos` + `botloci_file` | +#' | **RefAltSeqs** | all paired | `botloci_file` | +#' | | unpaired | `botloci_file` + `hap_seq_file` (microhaplotype DB) | +#' +#' #' @examples #' # Example usage: #' @@ -51,8 +67,8 @@ #' @import vcfR #' #' @export -madc2vcf_all <- function(madc = NULL, - botloci_file = NULL, +madc2vcf_all <- function(madc, + botloci_file, hap_seq_file = NULL, n.cores = 1, rm_multiallelic_SNP = FALSE, @@ -60,13 +76,56 @@ madc2vcf_all <- function(madc = NULL, multiallelic_SNP_sample_thr = 0, alignment_score_thr = 40, out_vcf = NULL, + markers_info = NULL, + add_others = TRUE, + others_max_snps = 5, + others_rm_with_indels = TRUE, verbose = TRUE){ + vmsg("Running BIGr madc2vcf_all", verbose = verbose, level = 0, type = ">>") + vmsg("madc : %s", verbose = verbose, level = 1, type = ">>", madc) + vmsg("botloci_file : %s", verbose = verbose, level = 1, type = ">>", if (is.null(botloci_file)) "NULL" else botloci_file) + vmsg("hap_seq_file : %s", verbose = verbose, level = 1, type = ">>", if (is.null(hap_seq_file)) "NULL" else hap_seq_file) + vmsg("markers_info : %s", verbose = verbose, level = 1, type = ">>", if (is.null(markers_info)) "NULL" else markers_info) + vmsg("n.cores : %s", verbose = verbose, level = 1, type = ">>", n.cores) + vmsg("alignment_score_thr : %s", verbose = verbose, level = 1, type = ">>", alignment_score_thr) + vmsg("rm_multiallelic_SNP : %s", verbose = verbose, level = 1, type = ">>", rm_multiallelic_SNP) + vmsg("multiallelic_SNP_dp_thr : %s", verbose = verbose, level = 1, type = ">>", multiallelic_SNP_dp_thr) + vmsg("multiallelic_SNP_sample_thr: %s", verbose = verbose, level = 1, type = ">>", multiallelic_SNP_sample_thr) + vmsg("add_others : %s", verbose = verbose, level = 1, type = ">>", add_others) + vmsg("others_max_snps : %s", verbose = verbose, level = 1, type = ">>", if (is.null(others_max_snps)) "NULL" else others_max_snps) + vmsg("others_rm_with_indels : %s", verbose = verbose, level = 1, type = ">>", others_rm_with_indels) + vmsg("out_vcf : %s", verbose = verbose, level = 1, type = ">>", if (is.null(out_vcf)) "NULL" else out_vcf) + vmsg("Checking inputs", verbose = verbose, level = 0, type = ">>") + + # Input checks + if(is.null(madc) || !(file.exists(madc) | url_exists(madc))) stop("MADC file not found. Please provide a valid path or URL.") + if(is.null(botloci_file) || !(file.exists(botloci_file) | url_exists(botloci_file))) stop("Botloci file not found. Please provide a valid path or URL.") + if(!is.null(hap_seq_file) && !(file.exists(hap_seq_file) | url_exists(hap_seq_file))) stop("Haplotype sequence file not found. Please provide a valid path or URL.") + + ## n.cores as integer + if(!is.numeric(n.cores) | n.cores < 1) stop("n.cores should be a positive integer.") + + ## alignment_score_thr, multiallelic_SNP_dp_thr, multiallelic_SNP_sample_thr as numeric + if(!is.numeric(alignment_score_thr)) stop("alignment_score_thr should be numeric.") + if(!is.numeric(multiallelic_SNP_dp_thr)) stop("multiallelic_SNP_dp_thr should be numeric.") + if(!is.numeric(multiallelic_SNP_sample_thr)) stop("multiallelic_SNP_sample_thr should be numeric.") + + ## out_vcf as string + if(!is.null(out_vcf) & !is.character(out_vcf)) stop("out_vcf should be a string specifying the output file name.") + + ## rm_multiallelic_SNP, add_others and verbose as logical + if(!is.logical(rm_multiallelic_SNP)) stop("rm_multiallelic_SNP should be logical.") + if(!is.logical(add_others)) stop("add_others should be logical.") + if(!is.null(others_max_snps) && (!is.numeric(others_max_snps) || others_max_snps < 1)) stop("others_max_snps should be a positive integer or NULL.") + if(!is.logical(others_rm_with_indels)) stop("others_rm_with_indels should be logical.") + if(!is.logical(verbose)) stop("verbose should be logical.") + bigr_meta <- paste0('##BIGrCommandLine.madc2vcf_all=') - if(!is.null(madc)) report <- read.csv(madc, check.names = FALSE) else stop("Please provide a MADC file") - if(!is.null(botloci_file)) botloci <- read.csv(botloci_file, header = F) else stop("Please provide a botloci file") + report <- read.csv(madc, check.names = FALSE) + checks <- check_madc_sanity(report) + + messages_results <- mapply(function(check, message) { + if (isTRUE(check)) message[1] else message[2] + }, checks$checks, checks$messages) + + for(i in seq_along(messages_results)) + vmsg(messages_results[i], verbose = verbose, level = 1, type = ">>") + + if(any(!(checks$checks[c("Columns", "FixAlleleIDs")]))){ + idx <- which(!(checks$checks[c("Columns", "FixAlleleIDs")])) + stop(paste("The MADC file does not pass the sanity checks:\n", + paste(messages_results[c("Columns", "FixAlleleIDs")[idx]], collapse = "\n"))) + } + + if(any(checks$checks[c("IUPACcodes")])){ + idx <- which((checks$checks[c("IUPACcodes")])) + stop(paste(messages_results[c("IUPACcodes")[idx]], collapse = "\n")) + } + + # Check whether markers_info is present and contains Ref + Alt columns + if(!is.null(markers_info)) { + mi_df <- read.csv(markers_info) + id_cols <- intersect(c("CloneID", "BI_markerID"), colnames(mi_df)) + if(!length(id_cols)) { + stop("markers_info must contain a marker ID column named either 'CloneID' or 'BI_markerID'.") + } + match_n <- vapply(id_cols, function(col) { + sum(unique(report$CloneID) %in% unique(stats::na.omit(mi_df[[col]]))) + }, integer(1)) + if(!any(match_n)) { + stop("None of the markers_info CloneID or BI_markerID values match the MADC CloneID column. Please make sure they use the same marker IDs.") + } + id_col <- id_cols[which.max(match_n)] + if(id_col != "CloneID" || !"CloneID" %in% colnames(mi_df)) { + mi_df$CloneID <- mi_df[[id_col]] + if(id_col == "BI_markerID") { + vmsg("markers_info: 'BI_markerID' column copied to 'CloneID' for internal use", verbose = verbose, level = 1) + } + } + # Validate CloneID values + if(any(is.na(mi_df$CloneID) | mi_df$CloneID == "")) + stop("markers_info CloneID column contains empty or NA values. Please check your markers_info file.") + if(!any(mi_df$CloneID %in% report$CloneID)) + stop("None of the markers_info CloneID values match the MADC CloneID column. Please make sure they use the same marker IDs.") + n_match <- sum(mi_df$CloneID %in% report$CloneID) + n_total <- length(unique(report$CloneID)) + if(n_match < n_total) + vmsg("%s of %s MADC CloneIDs found in markers_info. Unmatched markers will be removed", verbose = verbose, level = 1, n_match, n_total) + } else mi_df <- NULL + + if(any(!checks$checks[c("ChromPos")])){ + if(is.null(markers_info)) { + stop(paste(messages_results[c("ChromPos")], collapse = "\n")) + } else { + if(!all(c("Chr", "Pos") %in% colnames(mi_df))) + stop("ChromPos check failed: CloneID values do not follow the Chr_Position format. ", + "The markers_info file must contain 'Chr' and 'Pos' columns to supply CHROM and POS.") + } + } + + if(any(checks$checks[c("Indels")])){ + idx <- which((checks$checks[c("Indels")])) + if(is.null(markers_info)) { + vmsg("The MADC file contains indels and markers_info file is not provided",verbose = verbose, level = 1, type = ">>") + vmsg("Tags with indels as targets will be flagged with warnings and removed from the analysis ",verbose = verbose, level = 2, type = ">>") + vmsg("Provide markers_info with REF/ALT/Indel_pos if you want to include the targets indels",verbose = verbose, level = 2, type = ">>") + + } else { + if(checks$checks["Indels"] && + !all(c("Ref", "Alt", "Indel_pos") %in% colnames(mi_df))) + stop("Indels detected in MADC file. ", + "The markers_info file must contain 'Ref', 'Alt', and 'Indel_pos' columns.") + if(!"Type" %in% colnames(mi_df)) { + mi_df$Type <- ifelse(nchar(as.character(mi_df$Ref)) > 1 | nchar(as.character(mi_df$Alt)) > 1, + "Indel", "SNP") + vmsg("markers_info: 'Type' column not found. Derived from Ref/Alt lengths (%s SNPs, %s Indels)", + verbose = verbose, level = 1, + sum(mi_df$Type == "SNP"), sum(mi_df$Type == "Indel")) + } + vmsg("The MADC file contains indels and markers_info file was provided with all required columns",verbose = verbose, level = 1, type = ">>") + vmsg("Target indels will be exported, but no off-targets are extracted from these tags due to higher likelihood of pairwise alignment errors",verbose = verbose, level = 2, type = ">>") + } + } + vmsg("Inputs checks done!", verbose = verbose, level = 1, type = ">>") + + vmsg("Initial filters and inputs adjustments...", verbose = verbose, level = 0, type = ">>") + + if(checks$checks["LowerCase"]){ + vmsg("MADC Allele Sequences presented lower case characters. They were converted to upper case", verbose = verbose, level = 1) + report$AlleleSequence <- toupper(report$AlleleSequence) + } + + if(!checks$checks["RefAltSeqs"] && is.null(hap_seq_file)){ + vmsg("Not all Ref sequences have a corresponding Alt or vice-verse. Provide hap_seq_file for this function to recover the missing tags or tags with missing pairs will be discarded", verbose = verbose, level = 1) + } + + botloci <- read.csv(botloci_file, header = F) if(!is.null(hap_seq_file)) hap_seq <- read.table(hap_seq_file, header = F) else hap_seq <- NULL # Check marker names compatibility between MADC and botloci - checked_botloci <- check_botloci(botloci, report) + checked_botloci <- check_botloci(botloci, report, ChromPos = checks$checks["ChromPos"], mi_df = mi_df, verbose = verbose) botloci <- checked_botloci[[1]] report <- checked_botloci[[2]] + mi_df <- checked_botloci[[3]] + + # Derive position padding width from CloneIDs in the original report + pad_width <- unique(nchar(sub(".*_", "", unique(report$CloneID)))) + if(length(pad_width) != 1) warning("CloneIDs in the MADC report have inconsistent position padding widths. IDs in the VCF may be inconsistent.") + pad_width <- pad_width[1] my_results_csv <- loop_though_dartag_report(report, botloci, hap_seq, n.cores=n.cores, alignment_score_thr = alignment_score_thr, + checks = checks, + mi_df = mi_df, + pad_width = pad_width, + add_others = add_others, + others_max_snps = others_max_snps, + others_rm_with_indels = others_rm_with_indels, verbose = verbose) + vmsg("All information gathered!", verbose = verbose, level = 0, type = ">>") + vcf_body <- create_VCF_body(csv = my_results_csv, n.cores = n.cores, rm_multiallelic_SNP = rm_multiallelic_SNP, multiallelic_SNP_dp_thr = multiallelic_SNP_dp_thr, multiallelic_SNP_sample_thr = multiallelic_SNP_sample_thr, + pad_width = pad_width, verbose = verbose) #Make a header separate from the dataframe @@ -114,6 +285,8 @@ madc2vcf_all <- function(madc = NULL, vcf_term <- sapply(strsplit(out_vcf, "[.]"), function(x) x[length(x)]) if(length(vcf_term) != 0) if(vcf_term != "vcf") out_vcf <- paste0(out_vcf,".vcf") + vmsg("VCF ready! Output written to: %s", verbose = verbose, level = 0, type = ">>", out_vcf) + writeLines(vcf_header, con = out_vcf) suppressWarnings( write.table(vcf_body, file = out_vcf, sep = "\t", quote = FALSE, row.names = FALSE, col.names = TRUE, append = TRUE) @@ -132,10 +305,13 @@ madc2vcf_all <- function(madc = NULL, #' @import parallel #' #' @noRd -loop_though_dartag_report <- function(report, botloci, hap_seq, n.cores=1, alignment_score_thr=40, verbose = TRUE){ +loop_though_dartag_report <- function(report, botloci, hap_seq, n.cores=1, alignment_score_thr=40, + checks = NULL, mi_df = NULL, pad_width = NULL, + add_others = TRUE, others_max_snps = NULL, others_rm_with_indels = TRUE, + verbose = TRUE){ - if(!is.null(hap_seq)){ - hap_seq <- get_ref_alt_hap_seq(hap_seq) + if(!is.null(hap_seq) & (is.null(checks) | !isTRUE(checks$checks["RefAltSeqs"]))){ + hap_seq <- get_ref_alt_hap_seq(hap_seq, botloci) } nsamples <- ncol(report) - 3 @@ -148,7 +324,7 @@ loop_though_dartag_report <- function(report, botloci, hap_seq, n.cores=1, align clust <- makeCluster(n.cores) #clusterExport(clust, c("hap_seq","add_ref_alt", "nsamples")) - add_ref_alt_results <- parLapply(clust, by_cloneID, function(x) add_ref_alt(x, hap_seq, nsamples, verbose = verbose)) + add_ref_alt_results <- parLapply(clust, by_cloneID, function(x) add_ref_alt(x, hap_seq, nsamples, verbose = FALSE)) stopCluster(clust) ref_index <- sapply(add_ref_alt_results, "[[",2) @@ -160,17 +336,25 @@ loop_though_dartag_report <- function(report, botloci, hap_seq, n.cores=1, align updated_by_cloneID <- lapply(add_ref_alt_results, "[[",1) - if(verbose){ - cat("The Ref_0001 sequence had to be added for:", sum(ref_index==1),"tags\n") - cat("The Alt_0002 sequence had to be added for:", sum(alt_index==1),"tags\n") - cat("Tags discarded due to lack of Ref_0001 sequence:", sum(ref_index==-1),"tags\n") - cat("Tags discarded due to lack of Alt_0002 sequence:", sum(alt_index==-1),"tags\n") + if(!is.null(hap_seq)){ + vmsg("The haplotype database was provided and used to recover missing Ref_0001 and Alt_0002 sequences", verbose = verbose, level = 1) + vmsg("The Ref_0001 sequence were added for: %s tags", verbose = verbose, level = 2, type = ">>", sum(ref_index==1)) + vmsg("The Alt_0002 sequence were added for: %s tags", verbose = verbose, level = 2, type = ">>", sum(alt_index==1)) + } else { + vmsg("The haplotype database was not provided. Tags with missing Ref_0001 or Alt_0002 sequences were flagged with warnings and removed from the analysis", verbose = verbose, level = 1) } + vmsg("Tags discarded due to lack of Ref_0001 sequence: %s tags", verbose = verbose, level = 2, type = ">>", sum(ref_index==-1)) + vmsg("Tags discarded due to lack of Alt_0002 sequence: %s tags", verbose = verbose, level = 2, type = ">>", sum(alt_index==-1)) + if(sum(ref_index==-1) > 0) warning(sprintf("%s tags discarded due to lack of Ref_0001 sequence. Consider providing the haplotype database file to recover these tags", sum(ref_index==-1))) + if(sum(alt_index==-1) > 0) warning(sprintf("%s tags discarded due to lack of Alt_0002 sequence. Consider providing the haplotype database file to recover these tags", sum(alt_index==-1))) + vmsg("Pairwise alignments of sequences to recover SNP position, reference and alternative bases...", verbose = verbose, level = 0) clust <- makeCluster(n.cores) #clusterExport(clust, c("botloci", "compare", "nucleotideSubstitutionMatrix", "pairwiseAlignment", "DNAString", "reverseComplement")) - #clusterExport(clust, c("botloci", "alignment_score_thr")) - compare_results <- parLapply(clust, updated_by_cloneID, function(x) compare(x, botloci, alignment_score_thr)) + #clusterExport(clust, c("botloci", "alignment_score_thr", "mi_df", "add_others", "others_max_snps", "others_rm_with_indels")) + compare_results <- parLapply(clust, updated_by_cloneID, function(x) compare(x, botloci, alignment_score_thr, mi_df, + add_others = add_others, others_max_snps = others_max_snps, + others_rm_with_indels = others_rm_with_indels, verbose = FALSE)) stopCluster(clust) my_results_csv <- lapply(compare_results, "[[", 1) @@ -180,12 +364,41 @@ loop_though_dartag_report <- function(report, botloci, hap_seq, n.cores=1, align rm_score <- unlist(rm_score) rm_N <- sapply(compare_results, "[[", 3) rm_N <- unlist(rm_N) - - if(verbose){ - cat("Number of tags removed because of low alignment score:", length(rm_score),"tags\n") - cat("Number of tags removed because of N in the alternative sequence:", length(rm_N),"tags\n") + rm_indels <- sapply(compare_results, "[[", 4) + rm_indels <- unlist(rm_indels) + n_rm_others_indels <- sum(sapply(compare_results, "[[", 5)) + n_rm_others_maxsnps <- sum(sapply(compare_results, "[[", 6)) + + vmsg("Number of tags removed because of low alignment score (threshold = %s): %s tags", verbose = verbose, level = 2, type = ">>", alignment_score_thr, length(rm_score)) + vmsg("Number of tags removed because of N in the alternative sequence: %s tags", verbose = verbose, level = 2, type = ">>", length(rm_N)) + if(length(rm_indels) > 0) { + if(!is.null(mi_df) && all(c("Ref", "Alt", "Indel_pos") %in% colnames(mi_df))) { + vmsg("Number of tags with indels as targets: %s tags (markers_info provided with required columns; targets exported, off-targets skipped)", verbose = verbose, level = 2, type = ">>", length(rm_indels)) + } else { + vmsg("Number of tags removed because of indels as targets: %s tags (no markers_info with Ref/Alt/Indel_pos provided; tags discarded)", verbose = verbose, level = 2, type = ">>", length(rm_indels)) + } + } else { + vmsg("Number of tags removed because of indels as targets: 0 tags", verbose = verbose, level = 2, type = ">>") + } + n_others_total <- sum(sapply(compare_results, "[[", 7)) + n_others_kept <- n_others_total - n_rm_others_indels - n_rm_others_maxsnps + others_added_info <- unlist(lapply(compare_results, "[[", 8)) + if(add_others) { + vmsg("Number of Other alleles found: %s (%s kept after filters, %s discarded)", verbose = verbose, level = 2, type = ">>", n_others_total, n_others_kept, n_others_total - n_others_kept) + if(others_rm_with_indels) + vmsg("Number of Other alleles discarded due to indels vs Ref: %s", verbose = verbose, level = 2, type = ">>", n_rm_others_indels) + if(!is.null(others_max_snps)) + vmsg("Number of Other alleles discarded due to exceeding max SNPs (%s): %s", verbose = verbose, level = 2, type = ">>", others_max_snps, n_rm_others_maxsnps) + # if(length(others_added_info) > 0) { + # vmsg("Others tags added:", verbose = verbose, level = 3, type = ">>") + # for(msg in others_added_info) vmsg(" %s", verbose = verbose, level = 3, type = ">>", msg) + # } + } else { + vmsg("Number of Other alleles found: %s (not processed, add_others = FALSE)", verbose = verbose, level = 2, type = ">>", n_others_total) } + vmsg("Pairwise alignments concluded!", verbose = verbose, level = 1) + rownames(my_results_csv) <- NULL return(my_results_csv) } @@ -279,10 +492,48 @@ add_ref_alt <- function(one_tag, hap_seq, nsamples, verbose = TRUE) { #' @importFrom pwalign pairwiseAlignment nucleotideSubstitutionMatrix #' #' @noRd -compare <- function(one_tag, botloci, alignment_score_thr = 40){ +compare <- function(one_tag, botloci, alignment_score_thr = 40, mi_df = NULL, add_others = TRUE, others_max_snps = NULL, others_rm_with_indels = TRUE, verbose = FALSE){ + + #idx <- which(names(updated_by_cloneID) == "Ra01_020534029") + #one_tag <- updated_by_cloneID[[idx]] cloneID <- one_tag$CloneID[1] + isBotLoci <- cloneID %in% botloci[,1] - # If marker is present in the botloc list, get the reverse complement sequence + if(!is.null(mi_df)) { + one_mi_df <- mi_df[which(mi_df$CloneID %in% cloneID), ] + # Handle duplicate CloneIDs in markers_info + if(nrow(one_mi_df) > 1) { + key_cols <- intersect(c("CloneID", "Chr", "Pos", "Ref", "Alt", "Type"), colnames(one_mi_df)) + if(nrow(unique(one_mi_df[, key_cols])) == 1) { + warning("Duplicate CloneID '", cloneID, "' found in markers_info with identical key columns. Keeping the first entry.") + one_mi_df <- one_mi_df[1, ] + } else { + stop("Duplicate CloneID '", cloneID, "' found in markers_info with differing values in key columns (CloneID, Chr, Pos, Ref, Alt, Type). Please resolve the conflict in your markers_info file.") + } + } + isIndel <- !is.null(one_mi_df$Type) && !is.na(one_mi_df$Type) && tolower(one_mi_df$Type) == "indel" + } else { + isIndel <- FALSE + } + + if(isIndel){ + update_tag <- one_tag[grep("Ref_0001$",one_tag$AlleleID),] + update_tag[,2:5] <- c(one_mi_df$Chr, one_mi_df$Pos, one_mi_df$Ref, one_mi_df$Alt) + update_tag_temp <- one_tag[grep("Alt_0002$",one_tag$AlleleID),] + update_tag_temp[,2:5] <- c(one_mi_df$Chr, one_mi_df$Pos, one_mi_df$Ref, one_mi_df$Alt) + update_tag <- rbind(update_tag, update_tag_temp) + + return(list(update_tag = update_tag, + rm_score = NULL, + rm_N = NULL, + rm_indels = NULL, + n_rm_others_indels = 0L, + n_rm_others_maxsnps = 0L, + n_others_total = 0L, + others_added_info = character(0))) + } + + # If marker is present in the botloci list, get the reverse complement sequence if(isBotLoci) one_tag$AlleleSequence <- sapply(one_tag$AlleleSequence, function(x) as.character(reverseComplement(DNAString(x)))) chr <- sapply(strsplit(cloneID, "_"), function(x) x[-length(x)]) @@ -296,11 +547,25 @@ compare <- function(one_tag, botloci, alignment_score_thr = 40){ sigma <- nucleotideSubstitutionMatrix(match = 1, mismatch = -0.5, baseOnly = FALSE) # baseOnly = FALSE avoid breaking when N is present (they will be filtered after)) align <- pairwiseAlignment(ref_seq, alt_seq, - substitutionMatrix = sigma,gapOpening=-1.4, gapExtension=-0.1, type = "global") + substitutionMatrix = sigma, + gapOpening=-1.4, + gapExtension=-0.1, + type = "global") # The score is a bit different from the python script despite same weights if(align@score > alignment_score_thr){ # if score for the target sequence is smaller than the threshold, the tag will be discarted pos_target_idx <- align@pattern@mismatch@unlistData + if(length(align@pattern@mismatch@unlistData) == 0){ + #No polymorphisms found between ref and alt sequences - or just indels + return(list(update_tag = NULL, + rm_score = NULL, + rm_N = NULL, + rm_indels = cloneID, + n_rm_others_indels = 0L, + n_rm_others_maxsnps = 0L, + n_others_total = 0L, + others_added_info = character(0))) + } ref_base <- substring(ref_seq, align@pattern@mismatch@unlistData, align@pattern@mismatch@unlistData) alt_base <- substring(alt_seq, align@subject@mismatch@unlistData, align@subject@mismatch@unlistData) @@ -325,49 +590,127 @@ compare <- function(one_tag, botloci, alignment_score_thr = 40){ if(length(rm_target) >0) pos_ref_idx <- pos_ref_idx[-rm_target] # Cases found where the AltMatch is another alternative for the target SNP - they are discarted if(length(pos_ref_idx) >0){ - ref_base <- substring(ref_seq, pos_ref_idx, pos_ref_idx) + ref_base_match <- substring(ref_seq, pos_ref_idx, pos_ref_idx) pos_alt_idx <- align@subject@mismatch@unlistData # If there are indels, the position in the alternative is not the same as the reference if(length(rm_target) >0) pos_alt_idx <- pos_alt_idx[-rm_target] # remove target position when is AltMatch - but the order in the sequence is the same - alt_base <- substring(Match_seq[j,]$AlleleSequence, pos_alt_idx, pos_alt_idx) + alt_base_match <- substring(Match_seq[j,]$AlleleSequence, pos_alt_idx, pos_alt_idx) # If Match sequences have N, do not consider as polymorphism - if(any(!alt_base %in% c("A", "T", "C", "G"))) { - ref_base <- ref_base[-which(!alt_base %in% c("A", "T", "C", "G"))] - pos_ref_idx <- pos_ref_idx[-which(!alt_base %in% c("A", "T", "C", "G"))] - alt_base <- alt_base[-which(!alt_base %in% c("A", "T", "C", "G"))] + if(any(!alt_base_match %in% c("A", "T", "C", "G"))) { + ref_base_match <- ref_base_match[-which(!alt_base_match %in% c("A", "T", "C", "G"))] + pos_ref_idx <- pos_ref_idx[-which(!alt_base_match %in% c("A", "T", "C", "G"))] + alt_base_match <- alt_base_match[-which(!alt_base_match %in% c("A", "T", "C", "G"))] } - if(length(alt_base) >0){ # If the N is the only polymorphis found, the Match tag will be discarted + if(length(alt_base_match) >0){ # If the N is the only polymorphis found, the Match tag will be discarted # The reported position is always on reference pos <- pos_target - (pos_target_idx - pos_ref_idx) # Sometimes there are more than one polymorphism in the sequence, we need to add rows to the table - update_tag_temp <- one_tag[grep("Match",one_tag$AlleleID)[j],][rep(1, length(alt_base)), ] + update_tag_temp <- one_tag[grep("Match",one_tag$AlleleID)[j],][rep(1, length(alt_base_match)), ] update_tag_temp$Chromosome <- chr update_tag_temp$SNP_position_in_Genome <- pos - update_tag_temp$Ref <- ref_base - update_tag_temp$Alt <- alt_base + update_tag_temp$Ref <- ref_base_match + update_tag_temp$Alt <- alt_base_match update_tag <- rbind(update_tag, update_tag_temp) } } } } + others_seq <- one_tag[grep("Other",one_tag$AlleleID),] + n_others_total <- nrow(others_seq) + n_rm_others_indels <- 0L + n_rm_others_maxsnps <- 0L + others_added_info <- character(0) + + if(add_others && nrow(others_seq) > 0){ + for(j in seq_len(nrow(others_seq))){ + align <- pairwiseAlignment(ref_seq, # Align with the reference + others_seq[j,]$AlleleSequence, + substitutionMatrix = sigma,gapOpening=-1.4, gapExtension=-0.1, type = "global") + # Filter: discard Others with indels relative to Ref + if(others_rm_with_indels && + (length(align@pattern@indel@unlistData) > 0 | length(align@subject@indel@unlistData) > 0)) { + n_rm_others_indels <- n_rm_others_indels + 1L + next + } + pos_ref_idx <- align@pattern@mismatch@unlistData + pos_alt_idx <- align@subject@mismatch@unlistData + + # Filter: discard Others with too many SNPs vs Ref (count before removing target position) + if(!is.null(others_max_snps) && length(pos_ref_idx) > others_max_snps) { + n_rm_others_maxsnps <- n_rm_others_maxsnps + 1L + next + } + rm_target_other <- which(pos_ref_idx == pos_target_idx) # remove target position if base is the same as Ref or Alt + if(length(rm_target_other) > 0) { + other_tag_base <- substring(others_seq[j,]$AlleleSequence, pos_target_idx, pos_target_idx) + if(other_tag_base == ref_base | other_tag_base == alt_base){ # If Other has same base as REF and ALT, it won't be considered in their counts + pos_ref_idx <- pos_ref_idx[-rm_target_other] + pos_alt_idx <- pos_alt_idx[-rm_target_other] + } + } + other_ref_base <- substring(ref_seq, pos_ref_idx, pos_ref_idx) + other_alt_base <- substring(others_seq[j,]$AlleleSequence, pos_alt_idx, pos_alt_idx) + # Cases found where the AltMatch is another alternative for the target SNP - they are discarted + if(length(pos_ref_idx) >0){ + # If Match sequences have N, do not consider as polymorphism + if(any(!other_alt_base %in% c("A", "T", "C", "G"))) { + other_ref_base <- other_ref_base[-which(!other_alt_base %in% c("A", "T", "C", "G"))] + pos_ref_idx <- pos_ref_idx[-which(!other_alt_base %in% c("A", "T", "C", "G"))] + other_alt_base <- other_alt_base[-which(!other_alt_base %in% c("A", "T", "C", "G"))] + } + + if(length(other_alt_base) >0){ # If the N is the only polymorphis found, the Match tag will be discarted + # The reported position is always on reference + pos <- pos_target - (pos_target_idx - pos_ref_idx) + + # Sometimes there are more than one polymorphism in the sequence, we need to add rows to the table + update_tag_temp <- one_tag[grep("Other",one_tag$AlleleID)[j],][rep(1, length(other_alt_base)), ] + + update_tag_temp$Chromosome <- chr + update_tag_temp$SNP_position_in_Genome <- pos + update_tag_temp$Ref <- other_ref_base + update_tag_temp$Alt <- other_alt_base + + update_tag <- rbind(update_tag, update_tag_temp) + others_added_info <- c(others_added_info, + paste0(others_seq[j,]$AlleleID, " -> position(s): ", paste(pos, collapse = ", "))) + } + } + } + } + return(list(update_tag = update_tag, # updated data.frame, NULL if discarted rm_score = NULL, # cloneID if removed because of low alignment score, NULL if kept - rm_N = NULL)) # cloneID if removed because of N in the target alternative, NULL if kept + rm_N = NULL, + rm_indels = NULL, + n_rm_others_indels = n_rm_others_indels, + n_rm_others_maxsnps = n_rm_others_maxsnps, + n_others_total = n_others_total, + others_added_info = others_added_info)) } else { return(list(update_tag = NULL, rm_score = NULL, - rm_N = cloneID)) + rm_N = cloneID, + rm_indels = NULL, + n_rm_others_indels = 0L, + n_rm_others_maxsnps = 0L, + n_others_total = 0L, + others_added_info = character(0))) } } else{ return(list(update_tag = NULL, rm_score = cloneID, - rm_N = NULL)) + rm_N = NULL, + rm_indels = NULL, + n_rm_others_indels = 0L, + n_rm_others_maxsnps = 0L, + n_others_total = 0L, + others_added_info = character(0))) } - } #' Converts the fasta to a data.frame with first column the AlleleID and second the AlleleSequence @@ -376,7 +719,8 @@ compare <- function(one_tag, botloci, alignment_score_thr = 40){ #' @param hap_seq haplotype db #' #' @noRd -get_ref_alt_hap_seq <- function(hap_seq){ +get_ref_alt_hap_seq <- function(hap_seq, botloci){ + headers <- hap_seq$V1[grep(">",hap_seq$V1)] headers <- gsub(">", "", headers) @@ -394,6 +738,17 @@ get_ref_alt_hap_seq <- function(hap_seq){ seqs <- sapply(seqs, function(x) paste0(x, collapse = "")) hap_seq <- data.frame(AlleleID = headers, AlleleSequence = seqs) + + # Check padding + hap_cloneID <- sapply(strsplit(hap_seq$AlleleID, "[|]"), function(x) x[1]) + botloci_cloneID <- botloci$V1 + + pad_hap <- unique(nchar(sub(".*_", "", hap_cloneID))) + pad_botloci <- unique(nchar(sub(".*_", "", botloci_cloneID))) + + if(length(pad_hap) > 1) stop("Check marker IDs in haplotype DB file. They should have the same padding.") + if(pad_hap != pad_botloci) stop("Check marker IDs padding in haplotype DB file. They should match the botloci file.") + return(hap_seq) } @@ -420,6 +775,7 @@ create_VCF_body <- function(csv, multiallelic_SNP_dp_thr = 2, multiallelic_SNP_sample_thr = 10, n.cores = 1, + pad_width = NULL, verbose = TRUE){ # Make sure counts are numeric @@ -430,14 +786,32 @@ create_VCF_body <- function(csv, clust <- makeCluster(n.cores) #clusterExport(clust, c("merge_counts","rm_multiallelic_SNP", "multiallelic_SNP_dp_thr", "multiallelic_SNP_sample_thr")) - vcf_tag_list <- parLapply(clust, cloneID, function(x) merge_counts(x, rm_multiallelic_SNP, multiallelic_SNP_dp_thr, multiallelic_SNP_sample_thr)) + vcf_tag_list <- parLapply(clust, cloneID, function(x) merge_counts(x, rm_multiallelic_SNP, multiallelic_SNP_dp_thr, multiallelic_SNP_sample_thr, pad_width)) stopCluster(clust) vcf_tag_list1 <- lapply(vcf_tag_list, "[[", 1) - rm_mks <- sapply(vcf_tag_list, "[[" ,2) - - if(verbose){ - print(paste("SNP removed because presented more than one allele:", sum(rm_mks))) + rm_mks <- sapply(vcf_tag_list, "[[", 2) # total removed + total_mks <- sapply(vcf_tag_list, "[[", 3) # total multiallelic found + rm_setting <- sapply(vcf_tag_list, "[[", 4) # removed by rm_multiallelic_SNP=TRUE + rm_filter <- sapply(vcf_tag_list, "[[", 5) # removed because empty after filtering + kept_multi <- sapply(vcf_tag_list, "[[", 6) # kept as multiallelic + simplified <- sapply(vcf_tag_list, "[[", 7) # simplified to biallelic + multi_others_target <- sapply(vcf_tag_list, "[[", 8) # multiallelic target from Others + + vmsg("Performing final filterings", verbose = verbose, level = 0, type = ">>") + + vmsg("Multiallelic off-target SNPs found: %s", verbose = verbose, level = 2, type = ">>", sum(total_mks)) + vmsg("Multiallelic target SNPs with a 3rd allele from Others: %s", verbose = verbose, level = 2, type = ">>", sum(multi_others_target)) + if(rm_multiallelic_SNP) { + vmsg("Removed (rm_multiallelic_SNP = TRUE): %s", verbose = verbose, level = 3, type = ">>", sum(rm_setting)) + } else if(multiallelic_SNP_dp_thr > 0 & multiallelic_SNP_sample_thr > 0) { + vmsg("Removed (empty after filtering; depth thr = %s, sample thr = %s): %s", + verbose = verbose, level = 3, type = ">>", + multiallelic_SNP_dp_thr, multiallelic_SNP_sample_thr, sum(rm_filter)) + vmsg("Kept as multiallelic after filtering: %s", verbose = verbose, level = 3, type = ">>", sum(kept_multi)) + vmsg("Simplified to biallelic after filtering: %s", verbose = verbose, level = 3, type = ">>", sum(simplified)) + } else { + vmsg("All kept (rm_multiallelic_SNP = FALSE, no thresholds set): %s", verbose = verbose, level = 3, type = ">>", sum(kept_multi)) } for(i in seq_along(vcf_tag_list1)) { @@ -451,20 +825,12 @@ create_VCF_body <- function(csv, vcf_body$V3 <- as.numeric(vcf_body$V3) rownames(vcf_body) <- NULL - # Remove padding - sp <- strsplit(vcf_body$target, "_") - pos <- sapply(sp, function(x) x[length(x)]) - chr <- sapply(sp, function(x) paste0(x[-length(x)], collapse = "_")) - vcf_body$target <- paste0(chr, "_",as.numeric(pos)) - # Dealing with repeated positions # discard the ones that are not the target and keep only the first if all are off-targets if(length(which(duplicated(vcf_body[,3]))) > 0){ repeated <- vcf_body[which(duplicated(vcf_body[,3])), 4] - if(verbose){ - print(paste("Different primers pair capture same SNP positions in", length(repeated), "locations. The repeated were discarded.")) - } + vmsg("Different primers pair capture same SNP positions in %s locations. The repeated were discarded", verbose = verbose, level = 2, type = ">>", length(repeated)) repeated_tab <- vcf_body[which(vcf_body[,4] %in% repeated),] vcf_body_new <- vcf_body[-which(vcf_body[,4] %in% repeated),] @@ -482,6 +848,10 @@ create_VCF_body <- function(csv, vcf_body_new <- rbind(vcf_body_new, repeated_tab_stay) } else vcf_body_new <- vcf_body + vmsg("Filters finished", verbose = verbose, level = 1, type = ">>") + + vmsg("Preparing VCF...", verbose = verbose, level = 0, type = ">>") + vcf_body_new <- vcf_body_new[,-1] colnames(vcf_body_new) <- c("#CHROM", "POS", "ID", "REF", "ALT","QUAL", "FILTER", "INFO","FORMAT", colnames(csv)[-c(1:7)]) @@ -503,25 +873,80 @@ create_VCF_body <- function(csv, #' aspect of the marker, the marker is discarded. This is likely to happen to paralogous sites. #' #' @noRd -merge_counts <- function(cloneID_unit, rm_multiallelic_SNP = FALSE, multiallelic_SNP_dp_thr = 0, multiallelic_SNP_sample_thr = 0){ - +merge_counts <- function(cloneID_unit, rm_multiallelic_SNP = FALSE, multiallelic_SNP_dp_thr = 0, + multiallelic_SNP_sample_thr = 0, pad_width = NULL){ + #cloneID_unit <- cloneID[[250]] #Get counts for target SNP - rm <- 0 - RefTag <- apply(cloneID_unit[which(grepl("Ref", cloneID_unit$AlleleID) & !duplicated(cloneID_unit$AlleleID)),-c(1:7)], 2, sum) - AltTag <- apply(cloneID_unit[which(grepl("Alt", cloneID_unit$AlleleID) & !duplicated(cloneID_unit$AlleleID)),-c(1:7)], 2, sum) - tab_counts <- paste0(RefTag + AltTag, ":", RefTag, ":", RefTag, ",", AltTag) + rm_by_setting <- 0 # removed because rm_multiallelic_SNP = TRUE + rm_by_filter <- 0 # removed because empty after threshold filtering + kept_multiallelic <- 0 # kept as-is (still multiallelic after filtering or no filter) + simplified <- 0 # simplified from multiallelic to biallelic by filtering + total_multiallelic <- 0 + multiallelic_others_target <- 0 # target SNPs with a 3rd allele from Others + + # Target marker + RefTag <- apply(cloneID_unit[which((grepl("Ref_0001$", cloneID_unit$AlleleID) | grepl("RefMatch", cloneID_unit$AlleleID)) & !duplicated(cloneID_unit$AlleleID)), -c(1:7)], 2, sum) + AltTag <- apply(cloneID_unit[which((grepl("Alt_0002$", cloneID_unit$AlleleID) | grepl("AltMatch", cloneID_unit$AlleleID)) & !duplicated(cloneID_unit$AlleleID)), -c(1:7)], 2, sum) + + cloneID <- cloneID_unit$CloneID[1] + if(is.null(pad_width)) pad_width <- nchar(sub(".*_", "", cloneID)) + info <- cloneID_unit[grep("Ref_0001$", cloneID_unit$AlleleID),] + + # In case there are Others that add multiallelics to targets + others_target <- cloneID_unit[,3] %in% cloneID_unit[grep("Ref_0001$", cloneID_unit$AlleleID),3] + if(sum(others_target) > 2 & !rm_multiallelic_SNP){ # If target is multiallelic + multiallelic_others_target <- 1 + idx_other <- which(others_target & !grepl("Ref_0001$", cloneID_unit$AlleleID) & !grepl("Alt_0002$", cloneID_unit$AlleleID)) + other_alts <- unique(cloneID_unit[idx_other,5]) + other_alts_info <- cloneID_unit[idx_other,] + OtherTag_list <- list() + total <- rep(0, length(RefTag)) + ads <- vector() + tab_counts <- paste0(RefTag + AltTag + total, ":", RefTag, ":", RefTag, ",", AltTag) + for(j in 1:length(other_alts)){ + temp_other <- which(other_alts_info[,5] == other_alts[j]) + OtherTag_list[[j]] <- apply(other_alts_info[temp_other, -c(1:7)], 2, sum) + total_temp <- OtherTag_list[[j]] + + if(multiallelic_SNP_dp_thr > 0 & multiallelic_SNP_sample_thr > 0){ # If not removed, user can set threshold to remove low frequency alleles + if(sum(total_temp > multiallelic_SNP_dp_thr) < multiallelic_SNP_sample_thr) next() + } + total <- total + total_temp + tab_counts <- paste0(tab_counts, ",",OtherTag_list[[j]]) + ads_temp <- sum(OtherTag_list[[j]]) + ads <- paste0(ads, ",", ads_temp) + } + alts <- paste0(info$Alt, ",", paste0(other_alts, collapse = ",")) + info_mk <- paste0("DP=", sum(c(RefTag, AltTag,total)),";", + "ADS=",sum(RefTag),",",sum(AltTag), ads) + } else { + tab_counts <- paste0(RefTag + AltTag, ":", RefTag, ":", RefTag, ",", AltTag) + alts <- info$Alt + info_mk <- paste0("DP=", sum(c(RefTag, AltTag)),";", + "ADS=",sum(RefTag),",",sum(AltTag)) + } - info <- cloneID_unit[grep("Ref_", cloneID_unit$AlleleID),] info <- c(info$Chromosome, info$SNP_position_in_Genome, - paste0(info$Chromosome, "_", info$SNP_position_in_Genome), - info$Ref, info$Alt, ".", ".", paste0("DP=", sum(c(RefTag, AltTag)),";", - "ADS=",sum(RefTag),",",sum(AltTag)), "DP:RA:AD") + cloneID, + info$Ref, + alts, + ".", + ".", + info_mk, + "DP:RA:AD") vcf_tag <- c(info, tab_counts) # Check if there are more than one alternative allele by loci - off_tag <- cloneID_unit[-which(grepl("Ref_", cloneID_unit$AlleleID) | grepl("Alt_", cloneID_unit$AlleleID)),] + rm_tags <- which(grepl("Ref_0001$", cloneID_unit$AlleleID) | grepl("Alt_0002$", cloneID_unit$AlleleID)) + if(sum(others_target) > 2){ + idx_other <- which(others_target & !grepl("Ref_0001$", cloneID_unit$AlleleID) & !grepl("Alt_0002$", cloneID_unit$AlleleID)) + off_tag <- cloneID_unit[-c(rm_tags,idx_other),] + } else { + off_tag <- cloneID_unit[-rm_tags,] + } + if(nrow(off_tag)){ # If there are off target SNP by_pos <- split.data.frame(off_tag, off_tag$SNP_position_in_Genome) @@ -531,19 +956,21 @@ merge_counts <- function(cloneID_unit, rm_multiallelic_SNP = FALSE, multiallelic alleles <- unique(by_pos[[i]]$AlleleID) if(length(unique(by_pos[[i]]$Alt)) > 1){ # If SNP is multiallelic + total_multiallelic <- total_multiallelic + 1 if(rm_multiallelic_SNP){ # option to remove multiallelics - rm <- rm + 1 + rm_by_setting <- rm_by_setting + 1 next() } else if(multiallelic_SNP_dp_thr > 0 & multiallelic_SNP_sample_thr > 0){ # If not removed, user can set threshold to remove low frequency alleles rm.idx <- which(apply(by_pos[[i]][,-c(1:7)], 1, function(x) sum(x > multiallelic_SNP_dp_thr) < multiallelic_SNP_sample_thr)) if(length(rm.idx)) up_by_pos <- by_pos[[i]][-rm.idx,] else up_by_pos <- by_pos[[i]] if(length(unique(up_by_pos$Alt)) == 0) { # If after applied filter all tags are gone - rm <- rm + 1 + rm_by_filter <- rm_by_filter + 1 next() } else if (length(unique(up_by_pos$Alt)) > 1 ){ # If after applied filter the SNP remains multiallelic + kept_multiallelic <- kept_multiallelic + 1 by_alt <- split.data.frame(up_by_pos, up_by_pos$Alt) by_alt_counts <- lapply(by_alt, function(x) apply(x[,-c(1:7)], 2, sum)) total_counts <- sapply(by_alt_counts, sum) @@ -558,12 +985,14 @@ merge_counts <- function(cloneID_unit, rm_multiallelic_SNP = FALSE, multiallelic info <- unique.data.frame(info) } else { # If after applied filter, only one alternative remains + simplified <- simplified + 1 alt <- apply(up_by_pos[,-c(1:7)], 2, sum) total_alt <- alt info <- unique.data.frame(up_by_pos[,2:5]) } } else { # If rm_multiallelic_SNP set to FALSE and threshold is 0, keep all multiallelics + kept_multiallelic <- kept_multiallelic + 1 by_alt <- split.data.frame(by_pos[[i]], by_pos[[i]]$Alt) by_alt_counts <- lapply(by_alt, function(x) apply(x[,-c(1:7)], 2, sum)) total_counts <- sapply(by_alt_counts, sum) @@ -590,7 +1019,7 @@ merge_counts <- function(cloneID_unit, rm_multiallelic_SNP = FALSE, multiallelic info <- c(info$Chromosome, info$SNP_position_in_Genome, - paste0(info$Chromosome, "_", info$SNP_position_in_Genome), + paste0(info$Chromosome, "_", formatC(as.integer(as.numeric(info$SNP_position_in_Genome)), width = pad_width, flag = "0", format = "d")), info$Ref, info$Alt, ".", ".", paste0("DP=", sum(c(ref, total_alt)),";", "ADS=",sum(ref),",",sum(total_alt)), "DP:RA:AD") @@ -600,5 +1029,5 @@ merge_counts <- function(cloneID_unit, rm_multiallelic_SNP = FALSE, multiallelic } } - return(list(vcf_tag, rm)) + return(list(vcf_tag, rm_by_setting + rm_by_filter, total_multiallelic, rm_by_setting, rm_by_filter, kept_multiallelic, simplified, multiallelic_others_target)) } diff --git a/R/madc2vcf_multi.R b/R/madc2vcf_multi.R new file mode 100644 index 0000000..e71cf0f --- /dev/null +++ b/R/madc2vcf_multi.R @@ -0,0 +1,193 @@ +#' Convert MADC file to VCF using polyRAD for multiallelic genotyping +#' +#' This function converts a DArTag fixed allele ID MADC file to a VCF +#' containing multiallelic markers based on the microhaplotypes using +#' the polyRAD package's `readDArTag`, `IterateHWE` population model +#' and `RADdata2VCF` pipeline. +#' +#' @param madc_file character. Path or URL to the input MADC CSV file. +#' @param botloci_file character. Path or URL to the botloci file listing target +#' IDs designed on the bottom strand. +#' @param outfile character. Path for the output VCF file. +#' @param markers_info character or NULL. Optional path or URL to a CSV file +#' with marker metadata. Required when CloneIDs do not follow the +#' \code{Chr_Pos} format; must contain \code{CloneID} (or +#' \code{BI_markerID}), \code{Chr}, and \code{Pos} columns. +#' @param ploidy integer. Ploidy level of the samples passed to \code{taxaPloidy}. +#' Default is 2. +#' @param verbose logical. Whether to print progress messages. Default is TRUE. +#' +#' @return Invisible NULL. Writes a VCF file to \code{outfile}. +#' +#' @details +#' The function performs the following steps: +#' \enumerate{ +#' \item Reads the MADC file and runs \code{check_madc_sanity}. +#' \item Validates the botloci file against MADC CloneIDs using +#' \code{check_botloci}, fixing any padding mismatches automatically. +#' \item Converts lowercase bases in AlleleSequence to uppercase if detected. +#' \item Removes all-NA rows and columns if detected. +#' \item Writes the corrected data to a temporary file and passes it to +#' \code{polyRAD::readDArTag}. +#' \item Estimates overdispersion with \code{polyRAD::TestOverdispersion} and +#' calls \code{polyRAD::IterateHWE}, then exports the result with +#' \code{polyRAD::RADdata2VCF}. +#' } +#' +#' **Sanity check behaviour and requirements** +#' +#' The function always stops if IUPAC codes, unpaired Ref/Alt sequences, or +#' unfixed AlleleIDs are detected (see \code{check_madc_sanity}). For the +#' remaining checks the required inputs are: +#' +#' | Check | Status | Required | +#' |---|---|---| +#' | **Indels** | detected | `botloci_file` | +#' | | not detected | `botloci_file` | +#' | **ChromPos** | valid | `botloci_file` | +#' | | invalid | `markers_info` with `Chr`/`Pos` + `botloci_file` | +#' +#' @importFrom utils read.csv write.csv read.table +#' +#' @export +madc2vcf_multi <- function(madc_file, + botloci_file, + outfile, + markers_info = NULL, + ploidy = 2L, + verbose = TRUE) { + + vmsg("Running BIGr madc2vcf_multi", verbose = verbose, level = 0, type = ">>") + vmsg("madc_file : %s", verbose = verbose, level = 1, type = ">>", madc_file) + vmsg("botloci_file : %s", verbose = verbose, level = 1, type = ">>", botloci_file) + vmsg("markers_info : %s", verbose = verbose, level = 1, type = ">>", if (is.null(markers_info)) "NULL" else markers_info) + vmsg("outfile : %s", verbose = verbose, level = 1, type = ">>", outfile) + vmsg("ploidy : %s", verbose = verbose, level = 1, type = ">>", ploidy) + + vmsg("Checking inputs", verbose = verbose, level = 0, type = ">>") + + if (!(file.exists(madc_file) | url_exists(madc_file))) stop("MADC file not found. Please provide a valid path or URL.") + if (!(file.exists(botloci_file) | url_exists(botloci_file))) stop("Botloci file not found. Please provide a valid path or URL.") + if (!is.null(markers_info) && !(file.exists(markers_info) | url_exists(markers_info))) stop("markers_info file not found. Please provide a valid path or URL.") + if (!is.numeric(ploidy) || ploidy < 1) stop("ploidy must be a positive integer.") + + # ---- Load markers_info if provided ---- + mi_df <- if (!is.null(markers_info)) read.csv(markers_info) else NULL + + # ---- Read and sanity-check ---- + report <- read.csv(madc_file, check.names = FALSE) + checks <- check_madc_sanity(report) + + messages_results <- mapply(function(check, message) { + if (isTRUE(check)) message[1] else message[2] + }, checks$checks, checks$messages) + + for (i in seq_along(messages_results)) + vmsg(messages_results[i], verbose = verbose, level = 1, type = ">>") + + if (!checks$checks["Columns"]) + stop("The MADC file is missing required columns (CloneID, AlleleID, AlleleSequence)") + + if (checks$checks["IUPACcodes"]) + stop("MADC Allele Sequences contain IUPAC (non-ATCG) codes. Please run HapApp to clean MADC file before using this function.") + + if (!isTRUE(checks$checks["RefAltSeqs"])) + stop("Not all Ref sequences have a corresponding Alt or vice versa. Please provide a complete MADC file before using this function.") + + if (!isTRUE(checks$checks["FixAlleleIDs"])) + stop("The MADC file does not have fixed AlleleIDs. Please process the MADC file through HapApp before using this function.") + + if (!isTRUE(checks$checks["ChromPos"])) { + if (is.null(markers_info)) + stop("CloneID column does not follow the 'Chr_Pos' format. ", + "Please provide a markers_info file with at least 'CloneID'/'BI_markerID', ", + "'Chr', and 'Pos' columns.") + if (!all(c("Chr", "Pos") %in% colnames(mi_df))) + stop("CloneID column does not follow the 'Chr_Pos' format. ", + "markers_info must contain at least 'Chr' and 'Pos' columns to remap marker IDs.") + } + + # ---- Check botloci vs MADC CloneIDs ---- + vmsg("Checking botloci file", verbose = verbose, level = 0, type = ">>") + cloneids_before <- report$CloneID + botloci_df <- read.table(botloci_file, header = FALSE) + botloci_before <- botloci_df$V1 + checked_botloci <- check_botloci(botloci_df, report, ChromPos = checks$checks["ChromPos"], mi_df = mi_df, verbose = verbose) + botloci_df <- checked_botloci[[1]] + report <- checked_botloci[[2]] + mi_df <- checked_botloci[[3]] + cloneid_changed <- !identical(report$CloneID, cloneids_before) + botloci_changed <- !identical(botloci_df$V1, botloci_before) + + # ---- Botloci temp file (if IDs were remapped) ---- + if (botloci_changed) { + tmp_botloci <- tempfile() + on.exit(unlink(tmp_botloci), add = TRUE) + write.table(botloci_df, tmp_botloci, row.names = FALSE, col.names = FALSE, quote = FALSE) + botloci_input <- tmp_botloci + } else { + botloci_input <- botloci_file + } + + # ---- Corrections: only create a temp file if needed ---- + need_temp <- isTRUE(checks$checks["allNArow"]) || isTRUE(checks$checks["allNAcol"]) || cloneid_changed + + if (need_temp) { + if (checks$checks["LowerCase"]) { + vmsg("MADC Allele Sequences contain lowercase characters. Converting to uppercase", + verbose = verbose, level = 1, type = ">>") + report$AlleleSequence <- toupper(report$AlleleSequence) + } + + if (checks$checks["allNArow"]) { + idx <- apply(report, 1, function(x) all(is.na(x) | x == "")) + vmsg("Removing %s all-NA row(s)", verbose = verbose, level = 1, type = ">>", sum(idx)) + report <- report[!idx, ] + } + + if (checks$checks["allNAcol"]) { + idx <- apply(report, 2, function(x) all(is.na(x) | x == "")) + vmsg("Removing %s all-NA column(s)", verbose = verbose, level = 1, type = ">>", sum(idx)) + report <- report[, !idx] + } + + tmp_madc <- tempfile(fileext = ".csv") + on.exit(unlink(tmp_madc), add = TRUE) + write.csv(report, tmp_madc, row.names = FALSE, quote = TRUE) + input_file <- tmp_madc + } else { + if (checks$checks["LowerCase"]) + vmsg("MADC Allele Sequences contain lowercase characters. polyRAD will handle them", + verbose = verbose, level = 1, type = ">>") + input_file <- madc_file + } + + vmsg("Loading MADC into polyRAD", verbose = verbose, level = 0, type = ">>") + + if (!requireNamespace("polyRAD", quietly = TRUE)) { + stop("Package 'polyRAD' is required for madc2vcf_multi(). Please install it with install.packages('polyRAD').", call. = FALSE) + } + raddat <- polyRAD::readDArTag( + file = input_file, + botloci = botloci_input, + n.header.rows = 0L, + sample.name.row = 1, + trim.sample.names = "", + taxaPloidy = as.integer(ploidy) + ) + + overdispersionP <- polyRAD::TestOverdispersion(raddat) + my_ovdisp <- overdispersionP$optimal + + vmsg("Running HWE iteration (overdispersion = %s)", verbose = verbose, level = 0, type = ">>", my_ovdisp) + + raddat_hwe <- polyRAD::IterateHWE(raddat, overdispersion = my_ovdisp) + + vmsg("Writing VCF to %s", verbose = verbose, level = 0, type = ">>", outfile) + + polyRAD::RADdata2VCF(raddat_hwe, file = outfile, asSNPs = FALSE, hindhe = FALSE) + + vmsg("Done!", verbose = verbose, level = 0, type = ">>") + + invisible(NULL) +} diff --git a/R/madc2vcf_targets.R b/R/madc2vcf_targets.R index 1b02c31..81e0643 100644 --- a/R/madc2vcf_targets.R +++ b/R/madc2vcf_targets.R @@ -1,47 +1,298 @@ #' Format MADC Target Loci Read Counts Into VCF #' -#' This function will extract the read count information from a MADC file target markers and convert to VCF file format. +#' Convert DArTag MADC target read counts to a VCF #' -#' The DArTag MADC file format is not commonly supported through existing tools. This function -#' will extract the read count information from a MADC file for the target markers and convert it to a VCF file format for the -#' genotyping panel target markers only +#' @description +#' Parses a DArTag **MADC** report and writes a **VCF v4.3** containing per-target +#' read counts for the panel's target loci. This is useful because MADC is not +#' widely supported by general-purpose tools, while VCF is. #' -#' @param madc_file Path to MADC file -#' @param output.file output file name and path -#' @param botloci_file A string specifying the path to the file containing the target IDs designed in the bottom strand. -#' @param get_REF_ALT if TRUE recovers the reference and alternative bases by comparing the sequences. If more than one polymorphism are found for a tag, it is discarded. +#' @details +#' **What this function does** +#' - Runs basic sanity checks on the MADC file via `check_madc_sanity()` (column +#' presence, fixed allele IDs, IUPAC/ambiguous bases, lowercase bases, indels, +#' chromosome/position format, all-NA rows/columns, Ref/Alt sequence presence). +#' - Extracts reference and total read counts per sample and target. +#' - Derives `AD` (ref,alt) by subtraction (alt = total - ref). +#' - If `get_REF_ALT = TRUE`, recovers REF/ALT bases either from `markers_info` +#' (when `Ref`/`Alt` columns are present) or by comparing the Ref/Alt probe +#' sequences in the MADC file (with strand correction via `botloci_file`). +#' Targets with >1 polymorphism between sequences are discarded. +#' - Optionally accepts a `markers_info` CSV to supply `CHROM`, `POS`, `REF`, +#' `ALT`, bypassing sequence-based inference. +#' +#' **Output VCF layout** +#' - `INFO` fields: +#' * `DP` - total depth across all samples for the locus +#' * `ADS` - total counts across samples in the order `ref,alt` +#' - `FORMAT` fields (per sample): +#' * `DP` - total reads (ref + alt) +#' * `RA` - reads supporting the reference allele +#' * `AD` - `"ref,alt"` counts +#' +#' **Strand handling** +#' If a target ID appears in `botloci_file`, its probe sequences are reverse- +#' complemented prior to base comparison so that REF/ALT are reported in the +#' top-strand genomic orientation. +#' +#' **Sanity check behaviour and requirements** +#' +#' The function always stops if required columns (`CloneID`, `AlleleID`, +#' `AlleleSequence`) are missing. +#' +#' For the remaining checks the required inputs depend on the combination of +#' check result and `get_REF_ALT`: +#' +#' | Check | Status | `get_REF_ALT` | Required | +#' |---|---|---|---| +#' | **IUPAC codes** | detected | `TRUE` | `markers_info` with `Ref`/`Alt` | +#' | | detected | `FALSE` | none | +#' | | not detected | `TRUE` | `botloci_file` **or** `markers_info` with `Ref`/`Alt` | +#' | | not detected | `FALSE` | none | +#' | **Indels** | detected | `TRUE` | `markers_info` with `Ref`/`Alt` | +#' | | detected | `FALSE` | none | +#' | | not detected | `TRUE` | `botloci_file` **or** `markers_info` with `Ref`/`Alt` | +#' | | not detected | `FALSE` | none | +#' | **ChromPos** | valid | `TRUE` | `botloci_file` **or** `markers_info` with `Ref`/`Alt` | +#' | | valid | `FALSE` | none | +#' | | invalid | `TRUE` | `markers_info` with `Chr`/`Pos`/`Ref`/`Alt` **or** `markers_info` with `Chr`/`Pos` + `botloci_file` | +#' | | invalid | `FALSE` | `markers_info` with `Chr`/`Pos` | +#' | **FixAlleleIDs** | fixed | `TRUE` | `botloci_file` **or** `markers_info` with `Ref`/`Alt` | +#' | | fixed | `FALSE` | none | +#' | | not fixed | `TRUE` | `markers_info` with `Ref`/`Alt` | +#' | | not fixed | `FALSE` | none | +#' +#' @param madc_file character. Path to the input MADC CSV file. +#' @param output.file character. Path to the output VCF file to write. +#' @param botloci_file character or `NULL` (default `NULL`). Path to a plain-text +#' file listing target IDs designed on the **bottom** strand (one ID per line). +#' Used for strand-correcting probe sequences when `get_REF_ALT = TRUE` and +#' `markers_info` does not supply `Ref` and `Alt` columns. Not needed when +#' `markers_info` provides `Ref` and `Alt`, or when `get_REF_ALT = FALSE` and +#' `markers_info` provides `Chr` and `Pos`. Also required when `ChromPos` is +#' invalid and `markers_info` does not provide `Ref`/`Alt`. +#' @param markers_info character or `NULL`. Optional path to a CSV providing target +#' metadata. Matching is done by column name, not column position. Accepted columns: +#' - one marker identifier column named `CloneID`, `Marker_ID`, or `BI_markerID` +#' (required; a generic `ID` column is not accepted); +#' - `Chr`, `Pos` - required when `CloneID` does not follow the `Chr_Pos` format; +#' - `Ref`, `Alt` - required when `get_REF_ALT = TRUE` and probe-sequence +#' inference is not possible (IUPAC codes, indels, or unfixed allele IDs). When +#' `get_REF_ALT = TRUE`, `botloci_file` is still required unless `Ref` and `Alt` +#' are supplied here. +#' @param get_REF_ALT logical (default `FALSE`). If `TRUE`, attempts to recover +#' REF/ALT bases. The source is chosen automatically: `markers_info` `Ref`/`Alt` +#' columns take priority; otherwise probe sequences from the MADC are compared +#' (with `botloci_file` for strand correction). Targets with more than one +#' difference between Ref/Alt sequences are removed. When `FALSE`, REF and ALT +#' are set to `"."` in the output VCF. +#' @param collapse_matches_counts logical (default `FALSE`). If `TRUE`, counts for +#' `|AltMatch` and `|RefMatch` rows are summed into their corresponding `|Ref` +#' and `|Alt` rows before building the matrices. Useful when the MADC contains +#' multiple allele rows per target that should be aggregated. +#' @param verbose logical (default `TRUE`). If `TRUE`, prints detailed progress +#' messages about each processing step. +#' +#' @return (Invisibly) returns the path to `output.file`. The side effect is a +#' **VCF v4.3** written to disk containing one row per target and columns for all +#' samples in the MADC file. +#' +#' @section Dependencies: +#' Uses **dplyr**, **tidyr**, **tibble**, **reshape2**, **Biostrings** and base +#' **utils**. Helper functions expected in this package: `check_madc_sanity()`, +#' `get_countsMADC()`, `get_counts()`, and `check_botloci()`. +#' +#' @examples +#' # Example files shipped with the package +#' madc_file <- system.file("example_MADC_FixedAlleleID.csv", package = "BIGr") +#' bot_file <- system.file("example_SNPs_DArTag-probe-design_f180bp.botloci", +#' package = "BIGr") +#' out_vcf <- tempfile(fileext = ".vcf") +#' +#' # Convert MADC to VCF (attempting to recover REF/ALT from probe sequences) +#' \dontrun{ +#' madc2vcf_targets( +#' madc_file = madc_file, +#' output.file = out_vcf, +#' botloci_file = bot_file, +#' get_REF_ALT = TRUE +#' ) +#' } +#' +#' # Clean up (example) +#' unlink(out_vcf) +#' +#' @seealso +#' `check_madc_sanity()`, `get_countsMADC()`, `check_botloci()` #' -#' @return A VCF file v4.3 with the target marker read count information #' @import dplyr #' @import tidyr #' @import tibble -#' @importFrom Rdpack reprompt #' @importFrom reshape2 melt dcast #' @importFrom utils write.table #' @importFrom Biostrings DNAString reverseComplement -#' @return A VCF file v4.3 with the target marker read count information -#' -#' @examples -#' # Load example files -#' madc_file <- system.file("example_MADC_FixedAlleleID.csv", package="BIGr") -#' bot_file <- system.file("example_SNPs_DArTag-probe-design_f180bp.botloci", package="BIGr") -#' -#' #Temp location (only for example) -#' output_file <- tempfile() -#' -#' # Convert MADC to VCF -#' madc2vcf_targets(madc_file = madc_file, -#' output.file = output_file, -#' get_REF_ALT = TRUE, -#' botloci_file = bot_file) -#' -#' rm(output_file) #' #' @export -madc2vcf_targets <- function(madc_file, output.file, botloci_file, get_REF_ALT = FALSE) { - #Making the VCF (This is highly dependent on snps being in a format where the SNP IDs are the CHR_POS) +madc2vcf_targets <- function(madc_file, + output.file, + botloci_file = NULL, + markers_info = NULL, + get_REF_ALT = FALSE, + collapse_matches_counts = FALSE, + verbose = TRUE) { + + vmsg("Running BIGr madc2vcf_targets", verbose = verbose, level = 0, type = ">>") + vmsg("madc_file : %s", verbose = verbose, level = 1, type = ">>", madc_file) + vmsg("output.file : %s", verbose = verbose, level = 1, type = ">>", output.file) + vmsg("botloci_file : %s", verbose = verbose, level = 1, type = ">>", if (is.null(botloci_file)) "NULL" else botloci_file) + vmsg("markers_info : %s", verbose = verbose, level = 1, type = ">>", if (is.null(markers_info)) "NULL" else markers_info) + vmsg("get_REF_ALT : %s", verbose = verbose, level = 1, type = ">>", get_REF_ALT) + vmsg("collapse_matches_counts : %s", verbose = verbose, level = 1, type = ">>", collapse_matches_counts) + + vmsg("Checking inputs", verbose = verbose, level = 0, type = ">>") + + # Input checks + if(!(file.exists(madc_file) | url_exists(madc_file))) stop("The MADC file does not exist.") + if(!is.character(output.file)) stop("output.file must be a character string.") + if(!is.null(markers_info) && !is.character(markers_info)) stop("markers_info must be a character string or NULL.") + if(!is.null(markers_info) && !file.exists(markers_info) && !url_exists(markers_info)) stop("The markers_info file does not exist.") + if(!is.logical(get_REF_ALT)) stop("get_REF_ALT must be a logical value (TRUE or FALSE).") + if(!is.logical(verbose)) stop("verbose must be a logical value (TRUE or FALSE).") + + # Create a VCF header line with metadata about the command and its parameters + bigr_meta <- paste0('##BIGrCommandLine.madc2vcf_targets=') + + # MADC checks + report <- read.csv(madc_file, check.names = FALSE) + checks <- check_madc_sanity(report) + + messages_results <- mapply(function(check, message) { + if (isTRUE(check)) message[1] else message[2] + }, checks$checks, checks$messages) + + for(i in seq_along(messages_results)) + vmsg(messages_results[i], verbose = verbose, level = 1, type = ">>") + + if(any(!(checks$checks[c("Columns")]))){ + idx <- which(!(checks$checks[c("Columns")])) + if(length(idx) > 0) + stop(paste("The MADC file does not pass the sanity checks:\n", + paste(messages_results[c("Columns")[idx]], collapse = "\n"))) + } + + if(any(checks$checks[c("IUPACcodes", "Indels")]) && get_REF_ALT){ + idx <- which((checks$checks[c("IUPACcodes", "Indels")])) + if(is.null(markers_info)) stop(paste("Please provide a markers_info file to proceed. The MADC file does not pass the sanity checks:", + paste(messages_results[c("IUPACcodes", "Indels")[idx]], collapse = "\n"))) + else vmsg("MADC file has IUPAC codes and/or indels, but a markers_info file is provided, so proceeding with VCF generation.", verbose = verbose, level = 1, type = ">>") + } + + if(checks$checks["LowerCase"]){ + vmsg("MADC Allele Sequences presented lower case characters. They were converted to upper case.", verbose = verbose, level = 1) + report$AlleleSequence <- toupper(report$AlleleSequence) + } + + # ---- Validate botloci and pre-process CloneIDs based on get_REF_ALT logic ---- + mi_df <- NULL # markers_info data frame (loaded once, reused below) + mi_has_ref_alt <- FALSE # TRUE when markers_info provides Ref and Alt columns + botloci <- NULL # botloci data frame (set when needed) + + # Check whether markers_info is present and contains Ref + Alt columns + if(!is.null(markers_info)) { + mi_df <- read.csv(markers_info) + mi_has_ref_alt <- all(c("Ref", "Alt") %in% colnames(mi_df)) + } + + if(!checks$checks["FixAlleleIDs"]){ + vmsg("MADC file has not been processed by HapApp.", verbose = verbose, level = 1) + if(get_REF_ALT){ + if(!mi_has_ref_alt) stop("MADC file has not been processed by HapApp. BIGr only provide results if get_REF_ALT is set to FALSE or if is TRUE but a marker_info with REF and ALT information is provided.") + } + # The check points to FALSE if the 6 initial rows exist or if there are no fixed allele ID (aka _0001, _0002, etc) + n <- nrow(report) + idx <- seq_len(min(6L, n)) + first_col_vals <- report[[1]][idx] + all_blank_or_star <- all(first_col_vals %in% c("", "*"), na.rm = TRUE) + # Also require that both _0001 and _0002 appear in AlleleID + if(all_blank_or_star) { + colnames(report) <- report[7,] + report <- report[-c(1:7),] + } + } + + if(checks$checks["allNArow"]){ + idx <- apply(report, 1, function(x) all(is.na(x) | x == "")) + report <- report[!idx, ] + vmsg("MADC contains rows with all NA values. Rows %s will be removed.", verbose = verbose, level = 1, type = ">>", paste(which(idx), collapse = ", ")) + } + + if(checks$checks["allNAcol"]){ + idx <- apply(report, 2, function(x) all(is.na(x) | x == "")) + report <- report[, !idx] + vmsg("MADC contains columns with all NA values. Columns %s will be removed.", verbose = verbose, level = 1, type = ">>", paste0(which(idx), collapse = ",")) + } + + if(!isTRUE(checks$checks["ChromPos"])) { + if(is.null(markers_info)){ + stop("CloneID column does not follow the 'Chr_Pos'. ", + "Please provide a markers_info file with at least 'CloneID'/'Marker_ID'/'BI_markerID', ", + "'Chr', and 'Pos' columns.") + } else { + + if(!all(c("Chr", "Pos") %in% colnames(mi_df))) + stop("CloneID column does not follow the 'Chr_Pos' format. ", + "markers_info must contain at least 'Chr' and 'Pos' columns to remap marker IDs.") + + } + } + + if(get_REF_ALT) { + + if(mi_has_ref_alt) { + # markers_info supplies REF and ALT - no botloci required + vmsg("markers_info contains Ref and Alt columns. REF and ALT will be taken from markers_info.", + verbose = verbose, level = 1, type = ">>") + + } else { + if(checks$checks["Indels"]) + stop("Indels detected in MADC file. Since get_REF_ALT = TRUE, a markers_info file with REF/ALT information is required.") + + # REF/ALT must be extracted from probe sequences - botloci is required + if(is.null(botloci_file) || (!file.exists(botloci_file) && !url_exists(botloci_file))) + stop("get_REF_ALT = TRUE but no markers_info file with Ref and Alt columns was provided neither a botloci_file to extrat REF/ALT from probe sequences. Please provide one of the these files or set get_REF_ALT to FALSE.") + + # Validate that CloneIDs match botloci marker names (after any remapping) + botloci <- read.table(botloci_file, header = FALSE) + checked_botloci <- check_botloci(botloci, report, ChromPos = checks$checks["ChromPos"], mi_df = mi_df, verbose = verbose) + botloci <- checked_botloci[[1]] + report <- checked_botloci[[2]] + mi_df <- checked_botloci[[3]] + + } + } + + # Throw message if OtherAlleles are present + if(checks$checks["OtherAlleles"]) { + vmsg("AlleleID contains alleles other than Ref and Alt. These will be ignored in the VCF output. Use function madc2vcf_all to include them.", verbose = verbose, level = 1, type = ">>") + } + + # Make sure counts are numeric + count_cols <- setdiff(colnames(report), c("CloneID", "AlleleID", "AlleleSequence")) + report[count_cols] <- lapply(report[count_cols], function(x) as.numeric(as.character(x))) + + vmsg("Input checks done!", verbose = verbose, level = 1, type = ">>") + + vmsg("Extracting depth information", verbose = verbose, level = 0, type = ">>") + + matrices <- get_countsMADC(madc_object = report, collapse_matches_counts = collapse_matches_counts, verbose = verbose) - matrices <- get_countsMADC(madc_file) ref_df <- data.frame(matrices[[1]], check.names = FALSE) alt_df <- data.frame(matrices[[2]]-matrices[[1]], check.names = FALSE) size_df <- data.frame(matrices[[2]], check.names = FALSE) @@ -55,20 +306,164 @@ madc2vcf_targets <- function(madc_file, output.file, botloci_file, get_REF_ALT = ) row.names(ad_df) <- row.names(ref_df) - #Obtaining Chr and Pos information from the row_names - new_df <- size_df %>% - rownames_to_column(var = "row_name") %>% - separate(row_name, into = c("CHROM", "POS"), sep = "_") %>% - select(CHROM, POS) + vmsg("Depth information extracted", verbose = verbose, level = 1, type = ">>") + + if(get_REF_ALT && mi_has_ref_alt) { + vmsg("Using markers_info for CHROM, POS, REF and ALT.", verbose = verbose, level = 0, type = ">>") + + if(is.null(mi_df)) mi_df <- read.csv(markers_info) + id_col <- pick_markers_info_id_col(mi_df, rownames(ad_df)) + + if(checks$checks["Indels"]) + vmsg("Indels detected in MADC file. But it is okay because Ref and Alt are provided in markers_info.", + verbose = verbose, level = 1, type = ">>") + + if(!all(c(id_col, "Chr", "Pos", "Ref", "Alt") %in% colnames(mi_df))) + stop(paste0("The markers_info dataframe must contain the following columns: ", + id_col, ", Chr, Pos, Ref, Alt")) + + if(!all(rownames(ad_df) %in% mi_df[[id_col]])) { + miss_CloneIDs <- rownames(ad_df)[!rownames(ad_df) %in% mi_df[[id_col]]] + if(length(miss_CloneIDs) == nrow(ad_df)) stop("None of the MADC CloneID could be found in the markers_info CloneID, Marker_ID or BI_markerID. Please make sure they match.") + vmsg(paste("Not all MADC CloneID was found in the markers_info file. These markers will be removed:", + paste(miss_CloneIDs, collapse = " ")), verbose = verbose, level = 2, type = ">>") + warning("Not all MADC CloneID was found in the markers_info file. These markers will be removed.") + } + matched <- mi_df[match(rownames(ad_df), mi_df[[id_col]]), ] + + new_df <- data.frame(CHROM = matched$Chr, POS = matched$Pos) + new_df$TotalRef <- rowSums(ref_df) + new_df$TotalAlt <- rowSums(alt_df) + new_df$TotalSize <- rowSums(size_df) + + ref_base <- matched$Ref + alt_base <- matched$Alt + + } else if(!is.null(markers_info) && !get_REF_ALT) { + vmsg("markers_info file provided. Using CHROM and POS from the file.", verbose = verbose, level = 0, type = ">>") + + if(is.null(mi_df)) mi_df <- read.csv(markers_info) + id_col <- pick_markers_info_id_col(mi_df, rownames(ad_df)) + + if(checks$checks["Indels"]) + vmsg("Indels detected in MADC file. Since get_REF_ALT = FALSE, Type and Indel_pos are not required in markers_info.", + verbose = verbose, level = 1, type = ">>") + + if(!all(c(id_col, "Chr", "Pos") %in% colnames(mi_df))) + stop(paste0("The markers_info dataframe must contain the following columns: ", id_col, ", Chr, Pos")) + + if(!all(rownames(ad_df) %in% mi_df[[id_col]])) { + miss_CloneIDs <- rownames(ad_df)[!rownames(ad_df) %in% mi_df[[id_col]]] + vmsg(paste("Not all MADC CloneID was found in the markers_info file. These markers will be removed:", + paste(miss_CloneIDs, collapse = " ")), verbose = verbose, level = 2, type = ">>") + warning("Not all MADC CloneID was found in the markers_info file. These markers will be removed.") + } + matched <- mi_df[match(rownames(ad_df), mi_df[[id_col]]), ] + + new_df <- data.frame(CHROM = matched$Chr, POS = matched$Pos) + new_df$TotalRef <- rowSums(ref_df) + new_df$TotalAlt <- rowSums(alt_df) + new_df$TotalSize <- rowSums(size_df) + + ref_base <- "." + alt_base <- "." + vmsg("REF and ALT not recovered (get_REF_ALT = FALSE).", verbose = verbose, level = 1, type = ">>") + + } else { + vmsg(ifelse(get_REF_ALT, + "Recovering CHROM and POS from CloneID for probe-sequence REF/ALT extraction...", + "No markers_info file provided. Recovering CHROM and POS from CloneID..."), + verbose = verbose, level = 0, type = ">>") + + # Split on the last underscore to handle chromosome names containing underscores + # (e.g. Chr_01_000123456). When ChromPos was FALSE, check_botloci already + # remapped CloneIDs to Chr_PaddedPos, so this split is always valid. + new_df <- size_df %>% + rownames_to_column(var = "row_name") %>% + separate(row_name, into = c("CHROM", "POS"), sep = "_(?=[^_]*$)") %>% + select(CHROM, POS) + new_df$POS <- sub("^0+", "", new_df$POS) + vmsg("CHROM and POS recovered from CloneID.", verbose = verbose, level = 1, type = ">>") + + new_df$TotalRef <- rowSums(ref_df) + new_df$TotalAlt <- rowSums(alt_df) + new_df$TotalSize <- rowSums(size_df) + + if(get_REF_ALT) { + vmsg("get_REF_ALT = TRUE. Attempting to recover REF and ALT bases from probe sequences...", + verbose = verbose, level = 0, type = ">>") + + csv <- get_counts(madc_object = report, collapse_matches_counts = collapse_matches_counts, verbose = FALSE) + csv <- csv[which(csv$CloneID %in% rownames(ad_df)), ] + + ref_ord <- csv$CloneID[grep("\\|Ref.*", csv$AlleleID)] + alt_ord <- csv$CloneID[grep("\\|Alt.*", csv$AlleleID)] + orig_ref_seq <- csv$AlleleSequence[grep("\\|Ref.*", csv$AlleleID)] + orig_alt_seq <- csv$AlleleSequence[grep("\\|Alt.*", csv$AlleleID)] + + if(all(sort(ref_ord) == sort(alt_ord))) { + # Key sequences by CloneID, then reorder to MADC row order so that + # loop index i always corresponds to rownames(size_df)[i]. + ref_seq_by_id <- setNames(orig_ref_seq, ref_ord) + alt_seq_by_id <- setNames(orig_alt_seq, alt_ord) + madc_ids <- rownames(size_df) + orig_ref_seq <- ref_seq_by_id[madc_ids] + orig_alt_seq <- alt_seq_by_id[madc_ids] + + more_poly <- no_diff <- 0 + ref_base <- alt_base <- rep(NA_character_, length(madc_ids)) + names(ref_base) <- names(alt_base) <- madc_ids + for(i in seq_along(madc_ids)) { + if(is.na(orig_ref_seq[i]) || is.na(orig_alt_seq[i])) next + temp_list <- strsplit(c(orig_ref_seq[i], orig_alt_seq[i]), "") + if(length(temp_list[[1]]) != length(temp_list[[2]])) + stop(paste0("Marker '", madc_ids[i], "' has Ref and Alt probe sequences of different lengths ", + "(", length(temp_list[[1]]), " vs ", length(temp_list[[2]]), "). ", + "This should not happen for SNP markers. ", + "If this is an indel, please provide a markers_info file with Ref and Alt columns.")) + idx_diff <- which(temp_list[[1]] != temp_list[[2]]) - # Remove leading zeros from the POS column - new_df$POS <- sub("^0+", "", new_df$POS) + if(length(idx_diff) > 1) { + ref_base[i] <- NA + alt_base[i] <- NA + more_poly <- more_poly + 1 + } else if(length(idx_diff) == 1) { + orig_ref_base <- temp_list[[1]][idx_diff] + orig_alt_base <- temp_list[[2]][idx_diff] + if(madc_ids[i] %in% botloci[, 1]) { + ref_base[i] <- as.character(reverseComplement(DNAString(orig_ref_base))) + alt_base[i] <- as.character(reverseComplement(DNAString(orig_alt_base))) + } else { + ref_base[i] <- orig_ref_base + alt_base[i] <- orig_alt_base + } + } else { + ref_base[i] <- NA + alt_base[i] <- NA + no_diff <- no_diff + 1 + } + } + if(more_poly > 0) vmsg(paste(more_poly, "markers removed: more than one polymorphism between Ref and Alt sequences."), verbose = verbose, level = 2, type = ">>") + if(no_diff > 0) vmsg(paste(no_diff, "markers removed: no differences found between Ref and Alt sequences."), verbose = verbose, level = 2, type = ">>") - #Get read count sums - new_df$TotalRef <- rowSums(ref_df) - new_df$TotalAlt <- rowSums(alt_df) - new_df$TotalSize <- rowSums(size_df) + } else { + ref_base <- "." + alt_base <- "." + vmsg("REF and ALT bases could not be recovered: missing reference or alternative sequences.", + verbose = verbose, level = 1, type = ">>") + } + } else { + # get_REF_ALT = FALSE, no markers_info + ref_base <- "." + alt_base <- "." + vmsg("REF and ALT not recovered (get_REF_ALT = FALSE).", verbose = verbose, level = 1, type = ">>") + } + } + + vmsg("CHROM, POS, REF and ALT columns prepared", verbose = verbose, level = 1, type = ">>") + + vmsg("Preparing VCF dataframe", verbose = verbose, level = 0, type = ">>") #Make a header separate from the dataframe vcf_header <- c( "##fileformat=VCFv4.3", @@ -79,86 +474,10 @@ madc2vcf_targets <- function(madc_file, output.file, botloci_file, get_REF_ALT = '##INFO=', '##FORMAT=', '##FORMAT=', - '##FORMAT=' + '##FORMAT=', + bigr_meta ) - # Get REF and ALT - if(get_REF_ALT){ - if(is.null(botloci_file)) stop("Please provide the botloci file to recover the reference and alternative bases.") - csv <- get_counts(madc_file) - # Keep only the ones that have alt and ref - csv <- csv[which(csv$CloneID %in% rownames(ad_df)),] - - # Get reverse complement the tag is present in botloci - botloci <- read.table(botloci_file, header = FALSE) - - # Check if the botloci file marker IDs match with the MADC file - checked_botloci <- check_botloci(botloci, csv) - botloci <- checked_botloci[[1]] - csv <- checked_botloci[[2]] - - # FIXED: Store original sequences before any transformation - csv$OriginalAlleleSequence <- csv$AlleleSequence - - # Apply reverse complement to sequences for bottom strand markers - idx <- which(csv$CloneID %in% botloci[,1]) - csv$AlleleSequence[idx] <- sapply(csv$AlleleSequence[idx], function(sequence) as.character(reverseComplement(DNAString(sequence)))) - - ref_seq <- csv$AlleleSequence[grep("\\|Ref.*", csv$AlleleID)] - ref_ord <- csv$CloneID[grep("\\|Ref.*", csv$AlleleID)] - alt_seq <- csv$AlleleSequence[grep("\\|Alt.*", csv$AlleleID)] - alt_ord <- csv$CloneID[grep("\\|Alt.*", csv$AlleleID)] - - # FIXED: Get original sequences for SNP calling - orig_ref_seq <- csv$OriginalAlleleSequence[grep("\\|Ref.*", csv$AlleleID)] - orig_alt_seq <- csv$OriginalAlleleSequence[grep("\\|Alt.*", csv$AlleleID)] - - if(all(sort(ref_ord) == sort(alt_ord))){ - # Order sequences consistently - ref_seq <- ref_seq[order(ref_ord)] - alt_seq <- alt_seq[order(alt_ord)] - orig_ref_seq <- orig_ref_seq[order(ref_ord)] - orig_alt_seq <- orig_alt_seq[order(alt_ord)] - ordered_clone_ids <- sort(ref_ord) - - ref_base <- alt_base <- vector() - for(i in 1:length(orig_ref_seq)){ - # FIXED: Use original sequences for SNP calling - temp_list <- strsplit(c(orig_ref_seq[i], orig_alt_seq[i]), "") - idx_diff <- which(temp_list[[1]] != temp_list[[2]]) - - if(length(idx_diff) > 1) { # If finds more than one polymorphism between Ref and Alt sequences - ref_base[i] <- NA - alt_base[i] <- NA - } else if(length(idx_diff) == 1) { - orig_ref_base <- temp_list[[1]][idx_diff] - orig_alt_base <- temp_list[[2]][idx_diff] - - # FIXED: Apply reverse complement to bases only if marker is in botloci - if(ordered_clone_ids[i] %in% botloci[,1]) { - ref_base[i] <- as.character(reverseComplement(DNAString(orig_ref_base))) - alt_base[i] <- as.character(reverseComplement(DNAString(orig_alt_base))) - } else { - ref_base[i] <- orig_ref_base - alt_base[i] <- orig_alt_base - } - } else { - # No differences found - ref_base[i] <- NA - alt_base[i] <- NA - } - } - } else { - warning("There are missing reference or alternative sequence, the SNP bases could not be recovery.") - ref_base <- "." - alt_base <- "." - } - - } else { - ref_base <- "." - alt_base <- "." - } - #Make the header#Make the VCF df vcf_df <- data.frame( CHROM = new_df$CHROM, @@ -180,40 +499,24 @@ madc2vcf_targets <- function(madc_file, output.file, botloci_file, get_REF_ALT = vcf_df$FORMAT <- paste("DP","RA","AD",sep=":") #Combine info from the matrices to form the VCF information for each sample - # Combine the matrices into a single matrix with elements separated by ":" - make_vcf_format <- function(..., separator = ":") { - matrices <- list(...) - n <- length(matrices) - - # Convert matrices to long form - long_forms <- lapply(matrices, function(mat) { - suppressMessages(reshape2::melt(mat, varnames = c("Row", "Col"), value.name = "Value")) - }) - - # Concatenate the elements - combined_long <- long_forms[[1]] - combined_long$Combined <- combined_long$Value - - for (i in 2:n) { - combined_long$Combined <- paste(combined_long$Combined, long_forms[[i]]$Value, sep = separator) - } + m_size <- melt(as.matrix(size_df), varnames = c("Row", "Col"), value.name = "Value") + m_ref <- melt(as.matrix(ref_df), varnames = c("Row", "Col"), value.name = "Value") + m_ad <- melt(as.matrix(ad_df), varnames = c("Row", "Col"), value.name = "Value") - # Convert back to wide form - combined_wide <- suppressMessages(reshape2::dcast(combined_long, Row ~ Col, value.var = "Combined")) + combined_long <- m_size + combined_long$Combined <- paste(m_size$Value, m_ref$Value, m_ad$Value, sep = ":") - # Restore row and column names - rownames(combined_wide) <- combined_wide$Row - combined_wide$Row <- NULL - colnames(combined_wide) <- colnames(matrices[[1]]) + combined_wide <- suppressMessages(dcast(combined_long, Row ~ Col, value.var = "Combined")) + rownames(combined_wide) <- combined_wide$Row + combined_wide$Row <- NULL + colnames(combined_wide) <- colnames(size_df) - return(as.matrix(combined_wide)) - } - - # Combine the matrices - geno_df <- make_vcf_format(as.matrix(size_df), as.matrix(ref_df), as.matrix(ad_df)) + geno_df <- as.matrix(combined_wide) + vmsg("Sample columns formatted for VCF", verbose = verbose, level = 1, type = ">>") #Combine the dataframes together vcf_df <- cbind(vcf_df,geno_df) + vmsg("VCF dataframe prepared", verbose = verbose, level = 1, type = ">>") # Add # to the CHROM column name colnames(vcf_df)[1] <- "#CHROM" @@ -221,8 +524,30 @@ madc2vcf_targets <- function(madc_file, output.file, botloci_file, get_REF_ALT = # Sort vcf_df <- vcf_df[order(vcf_df[,1],as.numeric(as.character(vcf_df[,2]))),] - if(sum(is.na(vcf_df$REF)) >1) { - warning(paste("Markers removed because of presence of more than one polymorphism between ref and alt sequences:",sum(is.na(vcf_df$REF)))) + # Remove markers with NA CHROM/POS (unmatched in markers_info, Case 3) + na_coord <- is.na(vcf_df[, 1]) | is.na(vcf_df$POS) + if(any(na_coord)) { + vmsg(paste(sum(na_coord), "markers removed: no matching entry found in markers_info."), verbose = verbose, level = 1, type = ">>") + warning(paste(sum(na_coord), "markers removed: no matching entry found in markers_info.")) + vcf_df <- vcf_df[!na_coord, ] + } + + if(sum(is.na(vcf_df$REF)) > 0) { + vmsg( + paste( + sum(is.na(vcf_df$REF)), + "markers removed because REF could not be reliably determined (e.g., multiple polymorphisms or no difference between probe sequences)." + ), + verbose = verbose, + level = 1, + type = ">>" + ) + warning( + paste( + "Markers removed because REF could not be reliably determined (e.g., multiple polymorphisms or no difference between probe sequences):", + sum(is.na(vcf_df$REF)) + ) + ) vcf_df <- vcf_df[-which(is.na(vcf_df$REF)),] } @@ -233,12 +558,5 @@ madc2vcf_targets <- function(madc_file, output.file, botloci_file, get_REF_ALT = suppressWarnings( write.table(vcf_df, file = output.file, sep = "\t", quote = FALSE, row.names = FALSE, col.names = TRUE, append = TRUE) ) - #Unload all items from memory - rm(matrices) - rm(ref_df) - rm(alt_df) - rm(size_df) - rm(ad_df) - rm(vcf_df) - rm(geno_df) + vmsg(paste("VCF file written to", output.file), verbose = verbose, level = 0, type = ">>") } diff --git a/R/thinSNP.R b/R/thinSNP.R index 6f445f6..487b257 100644 --- a/R/thinSNP.R +++ b/R/thinSNP.R @@ -10,8 +10,9 @@ #' @param min_distance A numeric value for the minimum distance between selected SNPs. #' The unit of this distance should match the unit of the `pos_col_name` column (e.g., base pairs). #' -#' @import dplyr -#' @import rlang +#' @importFrom rlang sym +#' @importFrom dplyr group_by arrange group_modify ungroup +#' @importFrom tibble as_tibble #' @return A thinned dataframe with the same columns as the input. #' #' @examples diff --git a/R/utils.R b/R/utils.R index c280ad2..41acdbc 100644 --- a/R/utils.R +++ b/R/utils.R @@ -1,12 +1,14 @@ #Internal Functions utils::globalVariables(c( - "ALT", "AlleleID", "CHROM", "Data", "ID", "MarkerName", "POS", - "QPseparate", "QPsolve_par", "REF", "Var1", "Variant", "geno", + "ALT", "AlleleID", "AlleleSequence", "CHROM", "Concordance", "Data", "ID", + "MarkerName", "POS", + "QPseparate", "QPsolve_par", "REF", "Type", "Var1", "Variant", "geno", "ind", "ref", "row_name", "size", "snp", "CloneID", "Count", "qualifying_sites_count", "MarkerID", "SampleID", "Dosage", - "pos", "alt", "match_key" + "pos", "alt", "match_key", + ":=", ".SD", "Sex", "Male_Parent", "Female_Parent", "chr" )) #' Convert GT format to numeric dosage @@ -26,45 +28,67 @@ convert_to_dosage <- function(gt) { }) } -#' Check and Adjust Botloci and MADC Marker Compatibility +#' Verbose Message Utility #' -#' This internal function checks the compatibility between botloci and MADC markers. It ensures that the marker IDs in the botloci file match those in the MADC file. If discrepancies are found, such as mismatched padding, the function attempts to adjust the IDs to ensure compatibility. +#' Prints a formatted verbose message with timestamp, indentation, and type label, if verbose is TRUE. #' -#' @param botloci A data frame containing the botloci markers. -#' @param report A data frame containing the MADC markers. -#' @param verbose A logical value indicating whether to print detailed messages about the adjustments. Default is TRUE. +#' @param text Character string, the message to print (supports sprintf formatting). +#' @param verbose Logical. If TRUE, prints the message; if FALSE, suppresses output. +#' @param level Integer, indentation level (0=header, 1=main step, 2=detail, 3=sub-detail). +#' @param type Character string, message type (e.g., "INFO", "WARN", "ERROR"). Only shown for level 0. +#' @param ... Additional arguments passed to sprintf for formatting. #' -#' @return A list containing the adjusted botloci and MADC data frames. +#' @details Use the verbose argument to control message output. Typically, pass the function's verbose parameter to vmsg. #' -#' @details -#' The function checks if the marker IDs in the botloci file are present in the MADC file. If no matches are found, it examines the padding (number of digits) in the marker IDs and adjusts them to match the longest padding. If the IDs still do not match after adjustment, an error is raised. This function is intended for internal use and helps ensure that the botloci and MADC files are compatible for downstream analysis. -#' -#' @keywords internal +#' @return No return value, called for side effects. #' @noRd -check_botloci <- function(botloci, report, verbose=TRUE){ - if(!any(botloci$V1 %in% report$CloneID)) { - if(verbose) message("None of the botloci markers could be found in the MADC file. Checking padding match...\n") +vmsg <- function(text, verbose = FALSE, level = 1, type = ">>", ...) { + if (!verbose) return(invisible()) + # Format timestamp + timestamp <- format(Sys.time(), "[%H:%M:%S]") - pad_madc <- unique(nchar(sub(".*_", "", report$CloneID))) - pad_botloci <- unique(nchar(sub(".*_", "", botloci$V1))) + # Create indentation based on level + indent <- switch(as.character(level), + "0" = "", # Section headers + "1" = " \u2219 ", # Main steps (medium bullet) + "2" = " - ", # Details + "3" = " > ", # Sub-details + paste0(paste(rep(" ", level), collapse = ""), "\u2022 ") # Fallback for level > 3 + ) - if(length(pad_madc) > 1 | length(pad_botloci) > 1) stop("Check marker IDs in both MADC and botloci files. They should be the same.") + # Format type label (only show for level 0) + type_label <- if (level == 0) sprintf("%-1s ", type) else "" - if(pad_madc != pad_botloci) { - if(verbose) message("Padding between MADC and botloci files do not match. Markers ID modified to match longest padding.\n") - if (pad_madc < pad_botloci) { - report$CloneID <- paste0(sub("_(.*)", "", report$CloneID), "_", - sprintf(paste0("%0", pad_botloci, "d"), as.integer(sub(".*_", "", report$CloneID))) - ) - } else { - botloci$V1 <- paste0(sub("_(.*)", "", botloci$V1), "_", - sprintf(paste0("%0", pad_madc, "d"), as.integer(sub(".*_", "", botloci$V1))) - ) - if(!any(botloci$V1 %in% report$CloneID)) stop("After matching padding, botloci markers still not found in MADC file. Check marker IDs.\n") - } - } else { - stop("Check marker IDs in both MADC and botloci files. They should be the same.") - } + # Format message text + dots <- list(...) + if(length(dots) == 0) { + msg_text <- text + } else { + msg_text <- sprintf(text, ...) } - return(list(botloci, report)) + # Combine everything + formatted_msg <- sprintf("%s %s%s%s", timestamp, type_label, indent, msg_text) + message(formatted_msg) +} + + +#' Check Whether a URL Is Accessible +#' +#' Attempts to open a connection to the given URL and returns `TRUE` if +#' successful, `FALSE` otherwise. Errors and warnings are both treated as +#' inaccessible. +#' +#' @param u character. The URL to test. +#' +#' @return A single logical: `TRUE` if the URL can be opened, `FALSE` if not. +#' +#' @keywords internal +#' @noRd +#' +url_exists <- function(u) { + tryCatch({ + con <- url(u, open = "rb") + close(con) + TRUE + }, error = function(e) FALSE, warning = function(w) FALSE) } diff --git a/R/validate_pedigree.R b/R/validate_pedigree.R new file mode 100644 index 0000000..dfcaf17 --- /dev/null +++ b/R/validate_pedigree.R @@ -0,0 +1,400 @@ +#' Validate Pedigree Trios Using Mendelian Error Analysis +#' +#' Validates parent-offspring trios by calculating Mendelian error rates from +#' SNP genotype data. Identifies incorrect parentage assignments and suggests +#' best-matching replacements. If a list of founders is supplied, trios that +#' are declared founders (both parents coded as 0) are preserved unchanged +#' with no recommendations. Trios removed due to missing genotype data are +#' retained in the output with a NO_GENOTYPE_DATA status. +#' +#' @param pedigree_file Character. Path to the pedigree file (TSV/CSV/TXT) +#' with columns: ID, Male_Parent, Female_Parent. +#' @param genotypes_file Character. Path to the genotypes file (TSV/CSV/TXT) +#' with an ID column followed by marker columns coded as 0, 1, 2. +#' @param founders_file Character, optional. Path to a one-column file +#' listing the IDs of founder individuals. Founders with both parents +#' coded as 0 are left unchanged with no recommendations. Defaults to NULL. +#' @param trio_error_threshold Numeric. Maximum Mendelian error percentage +#' to classify a trio as PASS (default: 5.0). Must be between 0 and 100. +#' @param min_markers Integer. Minimum number of non-missing markers +#' required to evaluate a trio (default: 10). +#' @param single_parent_error_threshold Numeric. Maximum homozygous-marker +#' mismatch percentage for a parent to be considered acceptable during +#' parent-level evaluation (default: 2.0). Must be between 0 and 100. +#' @param verbose Logical. If TRUE, prints progress messages, a summary +#' table, and results to the console (default: TRUE). +#' @param write_txt Logical. If TRUE, writes validation results to +#' output_filename (default: TRUE). +#' @param output_filename Character. Path/name of the output file +#' (default: "pedigree_validation_results.txt"). +#' +#' @return A data.table (returned invisibly) with one row per trio and +#' the following columns: +#' \describe{ +#' \item{ID}{Individual ID.} +#' \item{Male_Parent}{Declared male parent ID.} +#' \item{Female_Parent}{Declared female parent ID.} +#' \item{Mendelian_Error_Pct}{Trio-level Mendelian error percentage.} +#' \item{Markers_Tested}{Number of markers with non-missing genotypes.} +#' \item{Status}{One of PASS, FAIL, LOW_MARKERS, NO_DATA, FOUNDERS, +#' MISSING_MALE_PARENT, MISSING_FEMALE_PARENT, MISSING_BOTH_PARENTS, +#' or NO_GENOTYPE_DATA.} +#' \item{Correction_Decision}{One of NONE, KEEP_BOTH, +#' REMOVE_MALE_PARENT, REMOVE_FEMALE_PARENT, REMOVE_BOTH, +#' LOW_MARKERS_KEEP_BOTH, LOW_MARKERS_REMOVE_MALE_PARENT, +#' LOW_MARKERS_REMOVE_FEMALE_PARENT, LOW_MARKERS_REMOVE_BOTH.} +#' \item{Male_Parent_Hom_Error_Pct}{Male parent homozygous-marker mismatch percentage.} +#' \item{Female_Parent_Hom_Error_Pct}{Female parent homozygous-marker mismatch percentage.} +#' \item{Best_Male_Parent}{Best-matching male parent candidate ID.} +#' \item{Best_Male_Parent_Error_Pct}{Homozygous mismatch percentage for the best male parent candidate.} +#' \item{Best_Female_Parent}{Best-matching female parent candidate ID.} +#' \item{Best_Female_Parent_Error_Pct}{Homozygous mismatch percentage for the best female parent candidate.} +#' } +#' @export +validate_pedigree <- function(pedigree_file, genotypes_file, + founders_file = NULL, + trio_error_threshold = 5.0, + min_markers = 10, + single_parent_error_threshold = 2.0, + verbose = TRUE, + write_txt = TRUE, + output_filename = "pedigree_validation_results.txt") { + + #### Input validation #### + if (trio_error_threshold < 0 || trio_error_threshold > 100) + stop("trio_error_threshold must be between 0 and 100") + if (single_parent_error_threshold < 0 || single_parent_error_threshold > 100) + stop("single_parent_error_threshold must be between 0 and 100") + + tryCatch({ + pedigree <- data.table::fread(pedigree_file, sep = "auto", colClasses = "character") + genos <- data.table::fread(genotypes_file, sep = "auto") + }, error = function(e) { + stop("Error reading input files. Ensure paths are correct and files are TXT/TSV/CSV.") + }) + + #### Check required columns #### + required_ped_cols <- c("ID", "Male_Parent", "Female_Parent") + missing_cols <- base::setdiff(required_ped_cols, base::names(pedigree)) + if (base::length(missing_cols) > 0) + stop("Pedigree file missing required columns: ", + base::paste(missing_cols, collapse = ", ")) + if (!"ID" %in% base::names(genos)) + stop("Genotypes file must have an 'ID' column") + + # Ensure parent columns are character for consistent "0" comparisons + pedigree[, Male_Parent := as.character(Male_Parent)] + pedigree[, Female_Parent := as.character(Female_Parent)] + original_pedigree <- data.table::copy(pedigree) + + #### Read founders list #### + if (!is.null(founders_file)) { + founders_raw <- tryCatch({ + data.table::fread(founders_file, header = FALSE, colClasses = "character") + }, error = function(e) { + stop("Could not read founders list. Ensure it is a plain text or CSV/TSV file.") + }) + founder_ids <- unique(founders_raw[[1]]) + } else { + founder_ids <- character(0) + } + + #### Identify trios missing from the genotype file #### + valid_ids <- as.character(genos$ID) + has_geno <- pedigree[ID %in% valid_ids & + (Male_Parent %in% valid_ids | Male_Parent == "0") & + (Female_Parent %in% valid_ids | Female_Parent == "0")] + + no_geno_rows <- pedigree[!(ID %in% valid_ids) | + (!(Male_Parent %in% valid_ids) & Male_Parent != "0") | + (!(Female_Parent %in% valid_ids) & Female_Parent != "0")] + + if (base::nrow(no_geno_rows) > 0 && verbose) + base::cat("Found", base::nrow(no_geno_rows), + "trios with missing genotype data; flagged as NO_GENOTYPE_DATA.\n") + + pedigree <- has_geno + if (base::nrow(pedigree) == 0) + stop("No valid trios remain after filtering for genotype availability.") + + #### Build genotype matrices #### + genos_mat <- base::as.matrix(genos, rownames = "ID") + + # Homozygous-only matrix (het markers set to NA) + genos_hom <- data.table::copy(genos) + marker_cols <- base::setdiff(base::names(genos_hom), "ID") + for (col in marker_cols) + genos_hom[base::get(col) == 1, (col) := NA_integer_] + genos_hom_mat <- base::as.matrix(genos_hom, rownames = "ID") + + #### Helper: find best matching parent via homozygous mismatch #### + find_best_parent <- function(prog_id, exclude_ids = base::character(0)) { + candidates <- base::setdiff(base::rownames(genos_hom_mat), + c(prog_id, exclude_ids)) + if (base::length(candidates) == 0) + return(base::list(id = NA_character_, error_pct = NA_real_)) + + prog_hom <- genos_hom_mat[prog_id, ] + errors <- base::sapply(candidates, function(cand_id) { + cand_hom <- genos_hom_mat[cand_id, ] + comparisons <- base::sum(!base::is.na(cand_hom) & !base::is.na(prog_hom)) + if (comparisons == 0) return(NA_real_) + (base::sum(cand_hom != prog_hom, na.rm = TRUE) / comparisons) * 100 + }) + + best_idx <- base::which.min(errors) + base::list(id = candidates[best_idx], + error_pct = base::round(errors[best_idx], 2)) + } + + #### Main trio evaluation loop #### + results_list <- base::lapply(base::seq_len(base::nrow(pedigree)), function(i) { + + prog_id <- pedigree$ID[i] + male_parent_id <- pedigree$Male_Parent[i] + female_parent_id <- pedigree$Female_Parent[i] + + # Default values + correction_decision <- "NONE" + error_pct <- NA_real_ + status <- "NO_DATA" + markers_tested <- 0L + male_parent_error_pct <- NA_real_ + female_parent_error_pct <- NA_real_ + best_male_parent <- NA_character_ + best_male_parent_pct <- NA_real_ + best_female_parent <- NA_character_ + best_female_parent_pct <- NA_real_ + + ## Founder check - both parents "0" and ID in founders list + if (male_parent_id == "0" && female_parent_id == "0" && + prog_id %in% founder_ids) { + status <- "FOUNDERS" + correction_decision <- "NONE" + + } else { + + ## Missing parent(s) - recommendations only, "0"s preserved in pedigree + if (male_parent_id == "0" && female_parent_id == "0") { + status <- "MISSING_BOTH_PARENTS" + correction_decision <- "NONE" + + best_m <- find_best_parent(prog_id, exclude_ids = character(0)) + best_male_parent <- best_m$id + best_male_parent_pct <- best_m$error_pct + + best_f <- find_best_parent(prog_id, exclude_ids = c(best_m$id)) + best_female_parent <- best_f$id + best_female_parent_pct <- best_f$error_pct + + } else if (male_parent_id == "0" && female_parent_id != "0") { + status <- "MISSING_MALE_PARENT" + correction_decision <- "NONE" + + best_m <- find_best_parent(prog_id, exclude_ids = c(female_parent_id)) + best_male_parent <- best_m$id + best_male_parent_pct <- best_m$error_pct + + } else if (male_parent_id != "0" && female_parent_id == "0") { + status <- "MISSING_FEMALE_PARENT" + correction_decision <- "NONE" + + best_f <- find_best_parent(prog_id, exclude_ids = c(male_parent_id)) + best_female_parent <- best_f$id + best_female_parent_pct <- best_f$error_pct + + } else { + + ## Both parents present - Mendelian error calculation + progeny_vec <- genos_mat[prog_id, ] + male_parent_vec <- genos_mat[male_parent_id, ] + female_parent_vec <- genos_mat[female_parent_id, ] + + mismatches <- base::sum( + (male_parent_vec == 0 & female_parent_vec == 0 & progeny_vec > 0) | + (male_parent_vec == 2 & female_parent_vec == 2 & progeny_vec < 2) | + ((male_parent_vec == 0 & female_parent_vec == 1) | + (male_parent_vec == 1 & female_parent_vec == 0)) & (progeny_vec == 2) | + ((male_parent_vec == 2 & female_parent_vec == 1) | + (male_parent_vec == 1 & female_parent_vec == 2)) & (progeny_vec == 0) | + ((male_parent_vec == 0 & female_parent_vec == 2) | + (male_parent_vec == 2 & female_parent_vec == 0)) & (progeny_vec != 1), + na.rm = TRUE + ) + + markers_tested <- base::sum(!base::is.na(male_parent_vec) & + !base::is.na(female_parent_vec) & + !base::is.na(progeny_vec)) + + if (markers_tested == 0) { + status <- "NO_DATA" + correction_decision <- "NONE" + + } else { + error_pct <- (mismatches / markers_tested) * 100 + + # LOW_MARKERS still computes parent mismatch/recommendations + if (markers_tested < min_markers) { + status <- "LOW_MARKERS" + } else if (error_pct <= trio_error_threshold) { + status <- "PASS" + correction_decision <- "NONE" + } else { + status <- "FAIL" + } + + # Run parent-level evaluation for both FAIL and LOW_MARKERS + if (status %in% c("FAIL", "LOW_MARKERS")) { + + # Homozygous mismatch per parent + progeny_hom <- genos_hom_mat[prog_id, ] + male_parent_hom <- genos_hom_mat[male_parent_id, ] + female_parent_hom <- genos_hom_mat[female_parent_id, ] + + male_comparisons <- base::sum(!base::is.na(male_parent_hom) & + !base::is.na(progeny_hom)) + male_parent_error_pct <- if (male_comparisons == 0) NA_real_ else + base::round((base::sum(male_parent_hom != progeny_hom, na.rm = TRUE) / + male_comparisons) * 100, 2) + + female_comparisons <- base::sum(!base::is.na(female_parent_hom) & + !base::is.na(progeny_hom)) + female_parent_error_pct <- if (female_comparisons == 0) NA_real_ else + base::round((base::sum(female_parent_hom != progeny_hom, na.rm = TRUE) / + female_comparisons) * 100, 2) + + male_acceptable <- !is.na(male_parent_error_pct) && + male_parent_error_pct <= single_parent_error_threshold + female_acceptable <- !is.na(female_parent_error_pct) && + female_parent_error_pct <= single_parent_error_threshold + + if (male_acceptable && female_acceptable) { + correction_decision <- "KEEP_BOTH" + } else if (male_acceptable && !female_acceptable) { + correction_decision <- "REMOVE_FEMALE_PARENT" + best_f <- find_best_parent(prog_id, exclude_ids = c(male_parent_id)) + best_female_parent <- best_f$id + best_female_parent_pct <- best_f$error_pct + } else if (!male_acceptable && female_acceptable) { + correction_decision <- "REMOVE_MALE_PARENT" + best_m <- find_best_parent(prog_id, exclude_ids = c(female_parent_id)) + best_male_parent <- best_m$id + best_male_parent_pct <- best_m$error_pct + } else { + correction_decision <- "REMOVE_BOTH" + best_m <- find_best_parent(prog_id, exclude_ids = character(0)) + best_male_parent <- best_m$id + best_male_parent_pct <- best_m$error_pct + best_f <- find_best_parent(prog_id, exclude_ids = c(best_m$id)) + best_female_parent <- best_f$id + best_female_parent_pct <- best_f$error_pct + } + + # Do not alter corrected pedigree for LOW_MARKERS rows + if (status == "LOW_MARKERS") + correction_decision <- paste0("LOW_MARKERS_", correction_decision) + } + } + } + } + + data.table::data.table( + ID = prog_id, + Male_Parent = male_parent_id, + Female_Parent = female_parent_id, + Mendelian_Error_Pct = base::round(error_pct, 2), + Markers_Tested = markers_tested, + Status = status, + Correction_Decision = correction_decision, + Male_Parent_Hom_Error_Pct = male_parent_error_pct, + Female_Parent_Hom_Error_Pct = female_parent_error_pct, + Best_Male_Parent = best_male_parent, + Best_Male_Parent_Error_Pct = best_male_parent_pct, + Best_Female_Parent = best_female_parent, + Best_Female_Parent_Error_Pct = best_female_parent_pct + ) + }) + + final_df <- data.table::rbindlist(results_list) + + #### Append NO_GENOTYPE_DATA rows to the final report #### + if (base::nrow(no_geno_rows) > 0) { + no_geno_df <- data.table::data.table( + ID = no_geno_rows$ID, + Male_Parent = no_geno_rows$Male_Parent, + Female_Parent = no_geno_rows$Female_Parent, + Mendelian_Error_Pct = NA_real_, + Markers_Tested = 0L, + Status = "NO_GENOTYPE_DATA", + Correction_Decision = "NONE", + Male_Parent_Hom_Error_Pct = NA_real_, + Female_Parent_Hom_Error_Pct = NA_real_, + Best_Male_Parent = NA_character_, + Best_Male_Parent_Error_Pct = NA_real_, + Best_Female_Parent = NA_character_, + Best_Female_Parent_Error_Pct = NA_real_ + ) + final_df <- data.table::rbindlist(list(final_df, no_geno_df)) + } + + #### Write corrected pedigree #### + corrected_pedigree <- data.table::copy(original_pedigree) + for (i in base::seq_len(base::nrow(final_df))) { + prog_id <- final_df$ID[i] + decision <- final_df$Correction_Decision[i] + row_idx <- base::which(corrected_pedigree$ID == prog_id) + + if (decision == "REMOVE_MALE_PARENT") { + data.table::set(corrected_pedigree, row_idx, "Male_Parent", "0") + } else if (decision == "REMOVE_FEMALE_PARENT") { + data.table::set(corrected_pedigree, row_idx, "Female_Parent", "0") + } else if (decision == "REMOVE_BOTH") { + data.table::set(corrected_pedigree, row_idx, "Male_Parent", "0") + data.table::set(corrected_pedigree, row_idx, "Female_Parent", "0") + } + } + + tryCatch({ + data.table::fwrite(corrected_pedigree, file = "corrected_pedigree.txt", + sep = "\t", quote = FALSE) + if (verbose) base::cat("Corrected pedigree written to: corrected_pedigree.txt\n") + }, error = function(e) { + warning("Could not write corrected pedigree. Error: ", e$message, call. = FALSE) + }) + + #### Summary output #### + if (verbose) { + total_trios <- base::nrow(final_df) + status_counts <- base::table(final_df$Status) + base::cat("\n--- Trio Validation Summary ---\n") + base::cat("Total trios in pedigree:", total_trios, "\n") + for (s in base::names(status_counts)) + base::cat(base::sprintf("%-24s: %d (%.1f%%)\n", s, + status_counts[s], + (status_counts[s] / total_trios) * 100)) + base::cat("Error threshold:", trio_error_threshold, "%\n") + base::cat("Homozygous threshold:", single_parent_error_threshold, "%\n") + base::cat("Minimum markers required:", min_markers, "\n\n") + + corrections <- base::table(final_df$Correction_Decision) + base::cat("Correction summary:\n") + for (decision in base::names(corrections)) + if (decision != "NONE") + base::cat(" ", decision, ":", corrections[decision], "\n") + base::cat("\n") + base::print(final_df) + } + + #### Write results #### + if (write_txt) { + tryCatch({ + data.table::fwrite(final_df, file = output_filename, + sep = "\t", quote = FALSE) + if (verbose) base::cat("Results written to:", output_filename, "\n") + }, error = function(e) { + warning("Could not write results. Error: ", e$message, call. = FALSE) + }) + } + + return(base::invisible(final_df)) +} diff --git a/cran-comments.md b/cran-comments.md index 2515e5b..f09be5d 100644 --- a/cran-comments.md +++ b/cran-comments.md @@ -1,9 +1,11 @@ ## R CMD check results -0 errors | 0 warnings | 1 note +0 errors | 0 warnings | 0 note -* This is a new release. +## Resubmission -## Updates +This is a resubmission. I replaced Unicode punctuation in Rd-generated +documentation, including the Unicode minus sign (U+2212) that caused the CRAN +PDF manual build failure. -- The maintainer is the same as the previous release, but the email address has been updated. +* This is an update release. diff --git a/dev/dev_history.R b/dev/dev_history.R index ec7f969..34e9324 100644 --- a/dev/dev_history.R +++ b/dev/dev_history.R @@ -2,7 +2,7 @@ # Update dependencies in DESCRIPTION # install.packages('attachment', repos = 'https://thinkr-open.r-universe.dev') -attachment::att_amend_desc() +#attachment::att_amend_desc() # Check package coverage covr::package_coverage() diff --git a/man/check_madc_sanity.Rd b/man/check_madc_sanity.Rd new file mode 100644 index 0000000..1d7eebb --- /dev/null +++ b/man/check_madc_sanity.Rd @@ -0,0 +1,87 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/check_madc_sanity.R +\name{check_madc_sanity} +\alias{check_madc_sanity} +\title{Run basic sanity checks on a MADC-style allele report} +\usage{ +check_madc_sanity(report) +} +\arguments{ +\item{report}{A \code{data.frame} with at least the columns +\code{CloneID}, \code{AlleleID}, and \code{AlleleSequence}. The first column is also +used in the \code{FixAlleleIDs} check to inspect its first up to six entries. +If \code{FixAlleleIDs} is \code{FALSE} (raw DArT format), the first 7 rows are +treated as header filler and skipped before further checks are run.} +} +\value{ +A named list with five elements: +\describe{ +\item{checks}{Named logical vector with nine entries: +\code{Columns}, \code{FixAlleleIDs}, \code{IUPACcodes}, \code{LowerCase}, \code{Indels}, +\code{ChromPos}, \code{allNAcol}, \code{allNArow}, \code{RefAltSeqs}. +\code{TRUE} means the condition was detected (or passed for \code{Columns}, +\code{FixAlleleIDs}, \code{ChromPos}, and \code{RefAltSeqs}); \code{NA} means the check +was skipped.} +\item{messages}{Named list of length-2 character vectors, one per check. +Element \verb{[[1]]} is the message when the check is \code{TRUE}, element \verb{[[2]]} +when it is \code{FALSE}. Indexed by the same names as \code{checks}.} +\item{indel_clone_ids}{Character vector of \code{CloneID}s where ref/alt +lengths differ. Returns \code{character(0)} if none are found, or \code{NULL} +when required columns are missing.} +\item{missRef}{Character vector of \code{CloneID}s that have no \code{Ref} allele +row. Returns \code{character(0)} if all \code{CloneID}s have a \code{Ref} row, or +\code{NULL} when required columns are missing.} +\item{missAlt}{Character vector of \code{CloneID}s that have no \code{Alt} allele +row. Returns \code{character(0)} if all \code{CloneID}s have an \code{Alt} row, or +\code{NULL} when required columns are missing.} +} +} +\description{ +Performs nine quick validations on an allele report: +\enumerate{ +\item \strong{Columns} - required columns are present (\code{CloneID}, \code{AlleleID}, \code{AlleleSequence}); +\item \strong{FixAlleleIDs} - first column's first up-to-6 rows are not all blank or \code{"*"} +\emph{and} both \verb{_0001} and \verb{_0002} appear in \code{AlleleID}; +\item \strong{IUPACcodes} - presence of non-ATCG characters in \code{AlleleSequence}; +\item \strong{LowerCase} - presence of lowercase a/t/c/g in \code{AlleleSequence}; +\item \strong{Indels} - reference/alternate allele lengths differ for the same \code{CloneID}, +or a \code{"-"} character is present in \code{AlleleSequence}; +\item \strong{ChromPos} - all \code{CloneID} values follow the \code{Chr_Position} format +(prefix matches \code{"chr"} case-insensitively, suffix is a positive integer); +\item \strong{allNAcol} - at least one column contains only \code{NA} or empty values; +\item \strong{allNArow} - at least one row contains only \code{NA} or empty values; +\item \strong{RefAltSeqs} - every \code{CloneID} has at least one \code{Ref} and one \code{Alt} allele row; +\item \strong{OtherAlleles} - presence of alleles where the target locus differs from both the Ref and Alt in \code{AlleleSequence}. +} +} +\details{ +\itemize{ +\item \strong{FixAlleleIDs:} When the first six rows of the first column are all blank +or \code{"*"} (raw DArT format), row 7 is promoted to column headers and the +first 7 rows are dropped before subsequent checks are run. The check is +\code{TRUE} when the file has already been processed by HapApp (fixed IDs with +\verb{_0001}/\verb{_0002} suffixes). +\item \strong{IUPAC check:} Flags any character outside \code{A}, \code{T}, \code{C}, \code{G} and \code{"-"} +(case-insensitive), which includes ambiguity codes (\code{N}, \code{R}, \code{Y}, etc.). +\item \strong{Indels:} Rows are split by \code{AlleleID} containing \code{"Ref_0001"} vs +\code{"Alt_0002"}, merged by \code{CloneID}, and flagged as indels if either (a) the +lengths of \code{AlleleSequence} differ, (b) the sequences have the same length +but more than one character differs between them (complex indel / local +rearrangement), or (c) a \code{"-"} character is present anywhere in +\code{AlleleSequence}. +\item \strong{ChromPos:} Each \code{CloneID} is split on \code{"_"} into exactly two parts; the +first part must match \code{"Chr"} (case-insensitive) and the second must be a +positive integer. Returns \code{FALSE} when any \code{CloneID} is \code{NA}. +\item \strong{allNAcol / allNArow:} Detected via \code{apply()} over columns/rows +respectively; a cell is treated as missing when it is \code{NA} or an empty +string (\code{""}). Useful for flagging empty or corrupt entries. +\item \strong{RefAltSeqs:} For each unique \code{CloneID}, checks whether at least one row +with a \code{Ref} (\verb{|Ref_} when \code{FixAlleleIDs = TRUE}, \verb{|Ref$} otherwise) and +one row with an \code{Alt} (\verb{|Alt_} / \verb{|Alt$}) allele exist. \code{CloneID}s that +lack a \code{Ref} row are stored in \code{missRef}; those lacking an \code{Alt} row in +\code{missAlt}. The check is \code{TRUE} when both sets are empty. +\item If required columns are missing (\code{Columns = FALSE}), only \code{Columns} and +\code{FixAlleleIDs} are evaluated; all other checks remain \code{NA} and +\code{indel_clone_ids}, \code{missRef}, and \code{missAlt} are returned as \code{NULL}. +} +} diff --git a/man/check_ped.Rd b/man/check_ped.Rd index 693bfe0..9a12a07 100644 --- a/man/check_ped.Rd +++ b/man/check_ped.Rd @@ -2,48 +2,60 @@ % Please edit documentation in R/check_ped.R \name{check_ped} \alias{check_ped} -\title{Evaluate Pedigree File for Accuracy} +\title{Check a pedigree file for accuracy and report/correct common errors} \usage{ check_ped(ped.file, seed = NULL, verbose = TRUE) } \arguments{ -\item{ped.file}{path to pedigree text file. The pedigree file is a -3-column pedigree tab separated file with columns labeled as id sire dam in any order} +\item{ped.file}{Path to the pedigree text file.} -\item{seed}{Optional seed for reproducibility} +\item{seed}{Optional seed for reproducibility.} -\item{verbose}{Logical. If TRUE, print the errors to the console.} +\item{verbose}{Logical. If TRUE (default), prints the report to the console.} } \value{ -A list of data.frames of error types, and the output printed to the console +A list of data.frames containing detected issues: +\itemize{ +\item \code{exact_duplicates}: rows that were exact duplicates +\item \code{repeated_ids_diff}: IDs appearing more than once with conflicting sire/dam +\item \code{messy_parents}: IDs appearing as both sire and dam +\item \code{missing_parents}: parents added to the pedigree with 0 as sire/dam +\item \code{dependencies}: detected cycles in the pedigree +\item \code{corrected_pedigree}: corrected pedigree table +} } \description{ -Check a pedigree file for accuracy and output suspected errors +\code{check_ped} reads a 3-column pedigree file (tab-separated, columns labeled \code{id}, \code{sire}, \code{dam} in any order) +and performs quality checks, optionally correcting or flagging errors. } \details{ -check_ped takes a 3-column pedigree tab separated file with columns labeled as id sire dam in any order and checks for: +The function checks for: \itemize{ -\item Ids that appear more than once in the id column -\item Ids that appear in both sire and dam columns -\item Direct (e.g. parent is a offspring of his own daughter) and indirect (e.g. a great grandparent is son of its grandchild) dependencies within the pedigree. -\item Individuals included in the pedigree as sire or dam but not on the id column and reports them back with unknown parents (0). +\item Exact duplicate rows and removes them (keeping one copy) +\item IDs that appear more than once with conflicting sire/dam assignments (sets sire/dam to "0") +\item IDs that appear in both sire and dam columns +\item Missing parents (IDs referenced as sire/dam but not in \code{id} column), adds them with sire/dam = "0" +\item Direct and indirect pedigree dependencies (cycles), such as a parent being its own descendant } -When using check_ped, do a first run to check for repeated ids and parents that appear as sire and dam. -Once these errors are cleaned run the function again to check for dependencies as this will provide the most accurate report. +After an initial run to clean exact duplicates and repeated IDs, you can run the function again to detect cycles more accurately. -Note: This function does not change the input file but prints any errors found in the console. +The function does \strong{not} overwrite the input file or create objects in the +global environment. Instead, it returns the report and corrected pedigree in +a list. } \examples{ -##Get list with a dataframe for each error type -ped_file <- system.file("check_ped_test.txt", package="BIGr") -ped_errors <- check_ped(ped.file = ped_file, - seed = 101919) +ped_file <- system.file("check_ped_test.txt", package = "BIGr") +ped_errors <- check_ped(ped.file = ped_file, seed = 101919, verbose = FALSE) -##Access the "messy parents" dataframe result +# Access messy parents ped_errors$messy_parents -##Get list of sample IDs with messy parents error -messy_parent_ids <- ped_errors$messy_parents$id -print(messy_parent_ids) +# Access corrected pedigree +ped_errors$corrected_pedigree + +# IDs with messy parents +messy_ids <- ped_errors$messy_parents$id +print(messy_ids) + } diff --git a/man/dosage2vcf.Rd b/man/dosage2vcf.Rd index 0bb0600..b6a1978 100644 --- a/man/dosage2vcf.Rd +++ b/man/dosage2vcf.Rd @@ -2,12 +2,12 @@ % Please edit documentation in R/dosage2vcf.R \name{dosage2vcf} \alias{dosage2vcf} -\title{Convert DArTag Dosage and Counts to VCF} +\title{Convert DArTag genotype reports and counts to VCF} \usage{ dosage2vcf(dart.report, dart.counts, ploidy, output.file) } \arguments{ -\item{dart.report}{Path to the DArT dosage report .csv file. Typically contains "Dosage Report" in the file name.} +\item{dart.report}{Path to the DArT Allele Dose Report or SNP/INDEL report .csv file.} \item{dart.counts}{Path to the DArT counts .csv file. Typically contains "Counts" in the file name.} @@ -19,11 +19,15 @@ dosage2vcf(dart.report, dart.counts, ploidy, output.file) A vcf file } \description{ -This function will convert the DArT Dosage Report and Counts files to VCF format +This function will convert DArT genotype report and Counts files to VCF format } \details{ -This function will convert the Dosage Report and Counts files from DArT into a VCF file. +This function will convert Allele Dose Report or SNP/INDEL report files and Counts files from DArT into a VCF file. These two files are received directly from DArT for a given sequencing project. +SNP/INDEL one-row and two-row reports are treated as diploid genotype reports +with 0 = reference homozygote, 1 = alternate homozygote, 2 = heterozygote, +and - = missing. Allele Dose reports are interpreted as reference allele +dosages using the supplied ploidy. The output file will be saved to the location and with the name that is specified. The VCF format is v4.3 } diff --git a/man/find_parentage.Rd b/man/find_parentage.Rd new file mode 100644 index 0000000..77b9c3c --- /dev/null +++ b/man/find_parentage.Rd @@ -0,0 +1,159 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/find_parentage.R +\name{find_parentage} +\alias{find_parentage} +\title{Find Parentage Assignments for Progeny} +\usage{ +find_parentage( + genotypes_file, + parents_file, + progeny_file, + method = "best_pair", + min_markers = 10, + error_threshold = 5, + show_ties = TRUE, + allow_selfing = TRUE, + verbose = TRUE, + write_txt = TRUE +) +} +\arguments{ +\item{genotypes_file}{Path to a TSV/CSV/TXT file containing genotype data. +Must include an 'ID' column followed by marker columns coded as 0, 1, 2 +(allele dosage).} + +\item{parents_file}{Path to a TSV/CSV/TXT file listing candidate parent IDs. +Must include an 'ID' column. An optional 'Sex' column with values +'M' (male parent), 'F' (female parent), or 'A' (ambiguous) determines +which parents are tested for each role. If absent, all parents are treated +as ambiguous.} + +\item{progeny_file}{Path to a TSV/CSV/TXT file listing progeny IDs to assign. +Must include an 'ID' column.} + +\item{method}{Character. Parentage assignment method. One of: +\itemize{ +\item \code{"best_male_parent"} - finds the best male parent for each +progeny using homozygous mismatch rate. +\item \code{"best_female_parent"} - finds the best female parent for each +progeny using homozygous mismatch rate. +\item \code{"best_match"} - finds the single best parent (either sex) +using homozygous mismatch rate. +\item \code{"best_pair"} - finds the best male-female parent pair for +each progeny using full Mendelian error rate (default). +}} + +\item{min_markers}{Integer. Minimum number of non-missing markers required +to report a parentage assignment. Progeny-parent comparisons with fewer +markers are flagged as \code{LOW_MARKERS} and no assignment is made +(default: \code{10}).} + +\item{error_threshold}{Numeric. Maximum mismatch percentage to report a +parentage assignment as confident. Assignments above this threshold are +flagged as \code{HIGH_ERROR} in the \code{Assignment_Status} column +(default: \code{5.0}). Must be between 0 and 100.} + +\item{show_ties}{Logical. If \code{TRUE}, all tied best pairs (after +tie-breaking by maximum markers tested) are reported as additional columns +(\code{Male_Parent_1}, \code{Male_Parent_2}, etc.) when +\code{method = "best_pair"}. The base columns (\code{Male_Parent}, +\code{Female_Parent}, etc.) are always populated with the top result. +If \code{FALSE}, only one tied pair is reported with a warning. +Default is \code{TRUE}.} + +\item{allow_selfing}{Logical. If \code{FALSE}, male-female parent pairs where +both IDs are identical are excluded when \code{method = "best_pair"}. +Default is \code{TRUE}.} + +\item{verbose}{Logical. If \code{TRUE}, prints progress messages, summary +statistics, and the results table to the console. Default is \code{TRUE}.} + +\item{write_txt}{Logical. If \code{TRUE}, writes results to +\code{parentage_results_dt.txt} in the working directory. Default is +\code{TRUE}.} +} +\value{ +A \code{data.table} with one row per progeny. Columns depend on the +method used: +\itemize{ +\item \code{best_male_parent} / \code{best_female_parent} / \code{best_match}: +\code{Progeny}, \code{Best_Match}, \code{Mendelian_Error_Pct}, +\code{Markers_Tested}, \code{Assignment_Status}. +\item \code{best_pair} (no ties after tie-breaking): \code{Progeny}, +\code{Male_Parent}, \code{Female_Parent}, \code{Mendelian_Error_Pct}, +\code{Markers_Tested}, \code{Assignment_Status}. +\item \code{best_pair} (ties remain after tie-breaking, +\code{show_ties = TRUE}): base columns are always populated with the +top result, plus suffix columns \code{Male_Parent_1}, +\code{Female_Parent_1}, etc. for each tied pair. +} +\code{Assignment_Status} is one of \code{PASS}, \code{HIGH_ERROR}, or +\code{LOW_MARKERS}. Returned invisibly when \code{verbose = TRUE}. +} +\description{ +Assigns the most likely parent(s) to each progeny individual based on +genotypic data using Mendelian error rates or homozygous mismatch rates. +} +\details{ +A homozygous-only genotype matrix is pre-computed once at startup and shared +across all methods that require it, avoiding redundant computation. + +For \code{"best_male_parent"}, \code{"best_female_parent"}, and +\code{"best_match"}, only homozygous markers (coded 0 or 2) are used for +comparison; heterozygous markers (coded 1) are set to \code{NA}. This +reduces false mismatches caused by phase ambiguity. + +For \code{"best_pair"}, all markers are used and full Mendelian inheritance +rules are applied. Mismatch rates and comparison counts are computed across +all progeny simultaneously using vectorised \code{vapply} calls, producing +\code{n_pairs x n_progeny} matrices and giving substantial speed gains for +large datasets. Both matrices are explicitly coerced to matrix form to +handle the edge case of a single parent pair correctly. + +When multiple pairs share the lowest Mendelian error rate, ties are broken +by selecting the pair(s) with the greatest number of markers tested. If ties +still remain after this step, all remaining tied pairs are reported when +\code{show_ties = TRUE}. + +The base columns (\code{Male_Parent}, \code{Female_Parent}, +\code{Mendelian_Error_Pct}, \code{Markers_Tested}) are always populated with +the top result, ensuring no missing values in these columns regardless of +tie status. + +Output rows are pre-allocated as a \code{data.table} and filled by reference +using \code{data.table::set()}, avoiding repeated memory allocation during +the results-building step. + +Individuals in \code{parents_file} or \code{progeny_file} that are absent +from \code{genotypes_file} are removed with a warning. + +Progeny with fewer non-missing markers than \code{min_markers} are flagged +\code{LOW_MARKERS} and no parent assignment is reported. Progeny whose best +match exceeds \code{error_threshold} are flagged \code{HIGH_ERROR}. +} +\examples{ +\dontrun{ +# Assign best male-female parent pair to each progeny +results <- find_parentage( + genotypes_file = "genotypes.txt", + parents_file = "parents.txt", + progeny_file = "progeny.txt", + method = "best_pair", + min_markers = 50, + error_threshold = 5.0, + show_ties = TRUE, + allow_selfing = FALSE +) + +# Find best individual parent match (ignoring sex) +results <- find_parentage( + genotypes_file = "genotypes.txt", + parents_file = "parents.txt", + progeny_file = "progeny.txt", + method = "best_match", + min_markers = 30, + error_threshold = 3.0 +) +} + +} diff --git a/man/get_countsMADC.Rd b/man/get_countsMADC.Rd index 66c5708..393ddcb 100644 --- a/man/get_countsMADC.Rd +++ b/man/get_countsMADC.Rd @@ -4,18 +4,53 @@ \alias{get_countsMADC} \title{Obtain Read Counts from MADC File} \usage{ -get_countsMADC(madc_file) +get_countsMADC( + madc_file = NULL, + madc_object = NULL, + collapse_matches_counts = FALSE, + verbose = TRUE +) } \arguments{ -\item{madc_file}{Path to MADC file} +\item{madc_file}{character or \code{NULL}. Path to the input MADC CSV file. +At least one of \code{madc_file} or \code{madc_object} must be provided.} + +\item{madc_object}{data frame or \code{NULL}. A pre-read MADC data frame +(e.g., as returned by \code{check_botloci()}). When supplied, file reading is +skipped. At least one of \code{madc_file} or \code{madc_object} must be provided.} + +\item{collapse_matches_counts}{logical. If \code{TRUE}, counts for \verb{|AltMatch} +and \verb{|RefMatch} rows are summed into their corresponding \verb{|Ref} and \verb{|Alt} +rows. If \code{FALSE} (default), \verb{|AltMatch} and \verb{|RefMatch} rows are discarded.} + +\item{verbose}{logical. Whether to print progress messages. Default is \code{TRUE}.} } \value{ -A list of read count matrices for reference, alternate, and total read count values +A named list with two numeric matrices, both with markers as rows +and samples as columns: +\describe{ +\item{\code{ref_matrix}}{Reference allele read counts.} +\item{\code{size_matrix}}{Total read counts (reference + alternative).} +} } \description{ -This function takes the MADC file as input and retrieves the ref and alt counts for each sample, -and converts them to ref, alt, and size(total count) matrices for dosage calling tools. At the moment, -only the read counts for the Ref and Alt target loci are obtained while the additional loci are ignored. +Reads a DArTag MADC report and returns reference and total read count matrices +per marker and sample. Only \code{Ref} and \code{Alt} target loci are retained; +\verb{|AltMatch} / \verb{|RefMatch} rows are either discarded or collapsed depending on +\code{collapse_matches_counts}. +} +\details{ +Either \code{madc_file} or \code{madc_object} must be provided (not both \code{NULL}). +When \code{madc_object} is supplied it is passed directly to \code{get_counts()}, +skipping file I/O. The function constructs: +\itemize{ +\item \code{ref_matrix} - per-sample reference allele counts. +\item \code{size_matrix} - per-sample total counts (ref + alt). +} + +Markers whose \code{CloneID} appears only in the \code{Ref} or only in the \code{Alt} rows +are removed with a warning. A summary of the proportion of zero-count +data points (missing data) is reported via \code{vmsg()}. } \examples{ # Get the path to the MADC file @@ -24,11 +59,13 @@ madc_path <- system.file("iris_DArT_MADC.csv", package = "BIGr") # Extract the read count matrices counts_matrices <- get_countsMADC(madc_path) -# Access the reference, alternate, and size matrices - -# ref_matrix <- counts_matrices$ref_matrix -# alt_matrix <- counts_matrices$alt_matrix +# Access the reference and size matrices +# ref_matrix <- counts_matrices$ref_matrix # size_matrix <- counts_matrices$size_matrix rm(counts_matrices) + +} +\seealso{ +\code{\link[=check_madc_sanity]{check_madc_sanity()}} } diff --git a/man/imputation_concordance.Rd b/man/imputation_concordance.Rd index 0c06134..22e9462 100644 --- a/man/imputation_concordance.Rd +++ b/man/imputation_concordance.Rd @@ -9,55 +9,83 @@ imputation_concordance( imputed_genos, missing_code = NULL, snps_2_exclude = NULL, - verbose = FALSE + verbose = FALSE, + plot = FALSE, + print_result = TRUE ) } \arguments{ -\item{reference_genos}{A data frame containing reference genotype data, with rows as samples and columns as markers. Dosage format (0, 1, 2) is recommended.} +\item{reference_genos}{A data frame containing reference genotype data, +with rows as samples and columns as markers. Must include a column named \code{ID}.} -\item{imputed_genos}{A data frame containing imputed genotype data, with rows as samples and columns as markers. Dosage format (0, 1, 2) is recommended.} +\item{imputed_genos}{A data frame containing imputed genotype data, +with rows as samples and columns as markers. Must include a column named \code{ID}.} -\item{missing_code}{An optional value to specify missing data. If provided, loci with this value in either dataset will be excluded from the concordance calculation.} +\item{missing_code}{Optional value specifying missing data. If provided, +loci with this value in either dataset will be excluded from the concordance calculation.} -\item{snps_2_exclude}{An optional vector of marker IDs to exclude from the concordance calculation.} +\item{snps_2_exclude}{Optional vector of marker IDs to exclude from the concordance calculation.} -\item{verbose}{A logical value indicating whether to print a summary of the concordance results. Default is FALSE.} +\item{verbose}{Logical. If \code{TRUE}, prints summary statistics (minimum, quartiles, +median, mean, maximum) of concordance percentages.} + +\item{plot}{Logical. If \code{TRUE}, produces a bar plot of concordance percentage +by sample.} + +\item{print_result}{Logical. If \code{TRUE} (default), prints the concordance +results data frame to the console. If \code{FALSE}, results are returned invisibly.} } \value{ -A list with two elements: +A data frame with: \itemize{ -\item \code{result_df}: A data frame with sample IDs and their concordance percentages. -\item \code{summary_concordance}: A summary of concordance percentages, including minimum, maximum, mean, and quartiles. +\item \code{ID}: Sample identifiers shared between the datasets. +\item \code{Concordance}: Percentage of matching genotypes per sample. } +If \code{print_result = FALSE}, the data frame is returned invisibly. } \description{ -This function calculates the concordance between imputed and reference genotypes. It assumes that samples are rows and markers are columns. -It is recommended to use allele dosages (0, 1, 2) but will work with other formats. Missing data in reference or imputed genotypes -will not be considered for concordance if the \code{missing_code} argument is used. If a specific subset of markers should be excluded, -it can be provided using the \code{snps_2_exclude} argument. +This function calculates the concordance between imputed and reference +genotypes. It assumes that samples are rows and markers are columns. +Allele dosages (0, 1, 2) are recommended but other numeric formats are supported. +Missing data in either dataset can be excluded from the concordance calculation +using the \code{missing_code} argument. Specific markers can be excluded using +the \code{snps_2_exclude} argument. } \details{ -The function identifies common samples and markers between the reference and imputed genotype datasets. It calculates the percentage of matching genotypes for each sample, excluding missing data and specified markers. The concordance is reported as a percentage for each sample, along with a summary of the overall concordance distribution. +The function: +\enumerate{ +\item Identifies common samples and markers between the datasets. +\item Optionally excludes specified SNPs. +\item Removes loci with missing data (if \code{missing_code} is provided). +\item Computes per-sample concordance as the percentage of matching genotypes. } -\examples{ -# Example Input variables -ignore_file <- system.file("imputation_ignore.txt", package="BIGr") -ref_file <- system.file("imputation_reference.txt", package="BIGr") -test_file <- system.file("imputation_test.txt", package="BIGr") - -# Import files -snps = read.table(ignore_file, header = TRUE) -ref = read.table(ref_file, header = TRUE) -test = read.table(test_file, header = TRUE) +When \code{plot = TRUE}, a bar plot showing concordance percentage per sample +is generated using \pkg{ggplot2}. +} +\examples{ +ref <- data.frame( + ID = c("S1", "S2", "S3"), + SNP1 = c(0, 1, 2), + SNP2 = c(1, 1, 0), + SNP3 = c(2, 5, 1) +) -#Calculations -result <- imputation_concordance(reference_genos = ref, - imputed_genos = test, - snps_2_exclude = snps, - missing_code = 5, - verbose = FALSE) +test <- data.frame( + ID = c("S1", "S2", "S3"), + SNP1 = c(0, 0, 2), + SNP2 = c(1, 1, 1), + SNP3 = c(2, 5, 0) +) +result <- imputation_concordance( + reference_genos = ref, + imputed_genos = test, + snps_2_exclude = "SNP2", + missing_code = 5, + print_result = FALSE +) +result } diff --git a/man/madc2vcf_all.Rd b/man/madc2vcf_all.Rd index 6fe7f11..c15e69b 100644 --- a/man/madc2vcf_all.Rd +++ b/man/madc2vcf_all.Rd @@ -5,8 +5,8 @@ \title{Converts MADC file to VCF recovering target and off-target SNPs} \usage{ madc2vcf_all( - madc = NULL, - botloci_file = NULL, + madc, + botloci_file, hap_seq_file = NULL, n.cores = 1, rm_multiallelic_SNP = FALSE, @@ -14,13 +14,17 @@ madc2vcf_all( multiallelic_SNP_sample_thr = 0, alignment_score_thr = 40, out_vcf = NULL, + markers_info = NULL, + add_others = TRUE, + others_max_snps = 5, + others_rm_with_indels = TRUE, verbose = TRUE ) } \arguments{ -\item{madc}{A string specifying the path to the MADC file.} +\item{madc}{Required. A string specifying the path or URL to the MADC file.} -\item{botloci_file}{A string specifying the path to the file containing the target IDs designed in the bottom strand.} +\item{botloci_file}{Required. A string specifying the path or URL to the file containing the target IDs designed in the bottom strand.} \item{hap_seq_file}{A string specifying the path to the haplotype database fasta file.} @@ -36,6 +40,14 @@ madc2vcf_all( \item{out_vcf}{A string specifying the name of the output VCF file. If the file extension is not \code{.vcf}, it will be appended automatically.} +\item{markers_info}{A string specifying the path to a CSV file with marker information (CloneID/BI_markerID, Chr, Pos, Ref, Alt, Type, Indel_pos columns as needed).} + +\item{add_others}{A logical value. If TRUE, alleles labeled "Other" in the MADC file are included in off-target SNP extraction. Default is TRUE.} + +\item{others_max_snps}{An integer or NULL. If not NULL, Other alleles with more than this many SNP differences versus the Ref sequence (as detected by pairwise alignment) are discarded. Default is NULL (no limit).} + +\item{others_rm_with_indels}{A logical value. If TRUE, Other alleles that contain insertions or deletions relative to the Ref sequence (as detected by pairwise alignment) are discarded. Default is TRUE.} + \item{verbose}{A logical value indicating whether to print metrics and progress to the console. Default is TRUE.} } \value{ @@ -46,6 +58,16 @@ This function processes a MADC file to generate a VCF file containing both targe } \details{ The function processes a MADC file to generate a VCF file containing both target and off-target SNPs. It uses parallel processing to improve performance and provides options to filter multiallelic SNPs based on user-defined thresholds. The alignment score threshold can be adjusted using the \code{alignment_score_thr} parameter. The generated VCF file includes metadata about the processing parameters and the BIGr package version. If the \code{alignment_score_thr} is not met, the corresponding SNPs are discarded. + +\strong{Sanity check behaviour and requirements}\tabular{lll}{ + Check \tab Status \tab Required \cr + \strong{Indels} \tab detected \tab \code{markers_info} with \code{Ref}/\code{Alt}/\code{Indel_pos}/\code{Indel_length} + \code{botloci_file} \cr + \tab not detected \tab \code{botloci_file} \cr + \strong{ChromPos} \tab valid \tab \code{botloci_file} \cr + \tab invalid \tab \code{markers_info} with \code{Chr}/\code{Pos} + \code{botloci_file} \cr + \strong{RefAltSeqs} \tab all paired \tab \code{botloci_file} \cr + \tab unpaired \tab \code{botloci_file} + \code{hap_seq_file} (microhaplotype DB) \cr +} } \examples{ # Example usage: diff --git a/man/madc2vcf_multi.Rd b/man/madc2vcf_multi.Rd new file mode 100644 index 0000000..b6a3324 --- /dev/null +++ b/man/madc2vcf_multi.Rd @@ -0,0 +1,69 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/madc2vcf_multi.R +\name{madc2vcf_multi} +\alias{madc2vcf_multi} +\title{Convert MADC file to VCF using polyRAD for multiallelic genotyping} +\usage{ +madc2vcf_multi( + madc_file, + botloci_file, + outfile, + markers_info = NULL, + ploidy = 2L, + verbose = TRUE +) +} +\arguments{ +\item{madc_file}{character. Path or URL to the input MADC CSV file.} + +\item{botloci_file}{character. Path or URL to the botloci file listing target +IDs designed on the bottom strand.} + +\item{outfile}{character. Path for the output VCF file.} + +\item{markers_info}{character or NULL. Optional path or URL to a CSV file +with marker metadata. Required when CloneIDs do not follow the +\code{Chr_Pos} format; must contain \code{CloneID} (or +\code{BI_markerID}), \code{Chr}, and \code{Pos} columns.} + +\item{ploidy}{integer. Ploidy level of the samples passed to \code{taxaPloidy}. +Default is 2.} + +\item{verbose}{logical. Whether to print progress messages. Default is TRUE.} +} +\value{ +Invisible NULL. Writes a VCF file to \code{outfile}. +} +\description{ +This function converts a DArTag fixed allele ID MADC file to a VCF +containing multiallelic markers based on the microhaplotypes using +the polyRAD package's \code{readDArTag}, \code{IterateHWE} population model +and \code{RADdata2VCF} pipeline. +} +\details{ +The function performs the following steps: +\enumerate{ +\item Reads the MADC file and runs \code{check_madc_sanity}. +\item Validates the botloci file against MADC CloneIDs using +\code{check_botloci}, fixing any padding mismatches automatically. +\item Converts lowercase bases in AlleleSequence to uppercase if detected. +\item Removes all-NA rows and columns if detected. +\item Writes the corrected data to a temporary file and passes it to +\code{polyRAD::readDArTag}. +\item Estimates overdispersion with \code{polyRAD::TestOverdispersion} and +calls \code{polyRAD::IterateHWE}, then exports the result with +\code{polyRAD::RADdata2VCF}. +} + +\strong{Sanity check behaviour and requirements} + +The function always stops if IUPAC codes, unpaired Ref/Alt sequences, or +unfixed AlleleIDs are detected (see \code{check_madc_sanity}). For the +remaining checks the required inputs are:\tabular{lll}{ + Check \tab Status \tab Required \cr + \strong{Indels} \tab detected \tab \code{botloci_file} \cr + \tab not detected \tab \code{botloci_file} \cr + \strong{ChromPos} \tab valid \tab \code{botloci_file} \cr + \tab invalid \tab \code{markers_info} with \code{Chr}/\code{Pos} + \code{botloci_file} \cr +} +} diff --git a/man/madc2vcf_targets.Rd b/man/madc2vcf_targets.Rd index a790460..3ed4aaf 100644 --- a/man/madc2vcf_targets.Rd +++ b/man/madc2vcf_targets.Rd @@ -4,44 +4,158 @@ \alias{madc2vcf_targets} \title{Format MADC Target Loci Read Counts Into VCF} \usage{ -madc2vcf_targets(madc_file, output.file, botloci_file, get_REF_ALT = FALSE) +madc2vcf_targets( + madc_file, + output.file, + botloci_file = NULL, + markers_info = NULL, + get_REF_ALT = FALSE, + collapse_matches_counts = FALSE, + verbose = TRUE +) } \arguments{ -\item{madc_file}{Path to MADC file} +\item{madc_file}{character. Path to the input MADC CSV file.} -\item{output.file}{output file name and path} +\item{output.file}{character. Path to the output VCF file to write.} -\item{botloci_file}{A string specifying the path to the file containing the target IDs designed in the bottom strand.} +\item{botloci_file}{character or \code{NULL} (default \code{NULL}). Path to a plain-text +file listing target IDs designed on the \strong{bottom} strand (one ID per line). +Used for strand-correcting probe sequences when \code{get_REF_ALT = TRUE} and +\code{markers_info} does not supply \code{Ref} and \code{Alt} columns. Not needed when +\code{markers_info} provides \code{Ref} and \code{Alt}, or when \code{get_REF_ALT = FALSE} and +\code{markers_info} provides \code{Chr} and \code{Pos}. Also required when \code{ChromPos} is +invalid and \code{markers_info} does not provide \code{Ref}/\code{Alt}.} -\item{get_REF_ALT}{if TRUE recovers the reference and alternative bases by comparing the sequences. If more than one polymorphism are found for a tag, it is discarded.} +\item{markers_info}{character or \code{NULL}. Optional path to a CSV providing target +metadata. Matching is done by column name, not column position. Accepted columns: +\itemize{ +\item one marker identifier column named \code{CloneID}, \code{Marker_ID}, or \code{BI_markerID} +(required; a generic \code{ID} column is not accepted); +\item \code{Chr}, \code{Pos} - required when \code{CloneID} does not follow the \code{Chr_Pos} format; +\item \code{Ref}, \code{Alt} - required when \code{get_REF_ALT = TRUE} and probe-sequence +inference is not possible (IUPAC codes, indels, or unfixed allele IDs). When +\code{get_REF_ALT = TRUE}, \code{botloci_file} is still required unless \code{Ref} and \code{Alt} +are supplied here. +}} + +\item{get_REF_ALT}{logical (default \code{FALSE}). If \code{TRUE}, attempts to recover +REF/ALT bases. The source is chosen automatically: \code{markers_info} \code{Ref}/\code{Alt} +columns take priority; otherwise probe sequences from the MADC are compared +(with \code{botloci_file} for strand correction). Targets with more than one +difference between Ref/Alt sequences are removed. When \code{FALSE}, REF and ALT +are set to \code{"."} in the output VCF.} + +\item{collapse_matches_counts}{logical (default \code{FALSE}). If \code{TRUE}, counts for +\verb{|AltMatch} and \verb{|RefMatch} rows are summed into their corresponding \verb{|Ref} +and \verb{|Alt} rows before building the matrices. Useful when the MADC contains +multiple allele rows per target that should be aggregated.} + +\item{verbose}{logical (default \code{TRUE}). If \code{TRUE}, prints detailed progress +messages about each processing step.} } \value{ -A VCF file v4.3 with the target marker read count information - -A VCF file v4.3 with the target marker read count information +(Invisibly) returns the path to \code{output.file}. The side effect is a +\strong{VCF v4.3} written to disk containing one row per target and columns for all +samples in the MADC file. } \description{ -This function will extract the read count information from a MADC file target markers and convert to VCF file format. +Parses a DArTag \strong{MADC} report and writes a \strong{VCF v4.3} containing per-target +read counts for the panel's target loci. This is useful because MADC is not +widely supported by general-purpose tools, while VCF is. } \details{ -The DArTag MADC file format is not commonly supported through existing tools. This function -will extract the read count information from a MADC file for the target markers and convert it to a VCF file format for the -genotyping panel target markers only +Convert DArTag MADC target read counts to a VCF + +\strong{What this function does} +\itemize{ +\item Runs basic sanity checks on the MADC file via \code{check_madc_sanity()} (column +presence, fixed allele IDs, IUPAC/ambiguous bases, lowercase bases, indels, +chromosome/position format, all-NA rows/columns, Ref/Alt sequence presence). +\item Extracts reference and total read counts per sample and target. +\item Derives \code{AD} (ref,alt) by subtraction (alt = total - ref). +\item If \code{get_REF_ALT = TRUE}, recovers REF/ALT bases either from \code{markers_info} +(when \code{Ref}/\code{Alt} columns are present) or by comparing the Ref/Alt probe +sequences in the MADC file (with strand correction via \code{botloci_file}). +Targets with >1 polymorphism between sequences are discarded. +\item Optionally accepts a \code{markers_info} CSV to supply \code{CHROM}, \code{POS}, \code{REF}, +\code{ALT}, bypassing sequence-based inference. } -\examples{ -# Load example files -madc_file <- system.file("example_MADC_FixedAlleleID.csv", package="BIGr") -bot_file <- system.file("example_SNPs_DArTag-probe-design_f180bp.botloci", package="BIGr") -#Temp location (only for example) -output_file <- tempfile() +\strong{Output VCF layout} +\itemize{ +\item \code{INFO} fields: +\itemize{ +\item \code{DP} - total depth across all samples for the locus +\item \code{ADS} - total counts across samples in the order \verb{ref,alt} +} +\item \code{FORMAT} fields (per sample): +\itemize{ +\item \code{DP} - total reads (ref + alt) +\item \code{RA} - reads supporting the reference allele +\item \code{AD} - \code{"ref,alt"} counts +} +} -# Convert MADC to VCF -madc2vcf_targets(madc_file = madc_file, - output.file = output_file, - get_REF_ALT = TRUE, - botloci_file = bot_file) +\strong{Strand handling} +If a target ID appears in \code{botloci_file}, its probe sequences are reverse- +complemented prior to base comparison so that REF/ALT are reported in the +top-strand genomic orientation. -rm(output_file) +\strong{Sanity check behaviour and requirements} +The function always stops if required columns (\code{CloneID}, \code{AlleleID}, +\code{AlleleSequence}) are missing. + +For the remaining checks the required inputs depend on the combination of +check result and \code{get_REF_ALT}:\tabular{llll}{ + Check \tab Status \tab \code{get_REF_ALT} \tab Required \cr + \strong{IUPAC codes} \tab detected \tab \code{TRUE} \tab \code{markers_info} with \code{Ref}/\code{Alt} \cr + \tab detected \tab \code{FALSE} \tab none \cr + \tab not detected \tab \code{TRUE} \tab \code{botloci_file} \strong{or} \code{markers_info} with \code{Ref}/\code{Alt} \cr + \tab not detected \tab \code{FALSE} \tab none \cr + \strong{Indels} \tab detected \tab \code{TRUE} \tab \code{markers_info} with \code{Ref}/\code{Alt} \cr + \tab detected \tab \code{FALSE} \tab none \cr + \tab not detected \tab \code{TRUE} \tab \code{botloci_file} \strong{or} \code{markers_info} with \code{Ref}/\code{Alt} \cr + \tab not detected \tab \code{FALSE} \tab none \cr + \strong{ChromPos} \tab valid \tab \code{TRUE} \tab \code{botloci_file} \strong{or} \code{markers_info} with \code{Ref}/\code{Alt} \cr + \tab valid \tab \code{FALSE} \tab none \cr + \tab invalid \tab \code{TRUE} \tab \code{markers_info} with \code{Chr}/\code{Pos}/\code{Ref}/\code{Alt} \strong{or} \code{markers_info} with \code{Chr}/\code{Pos} + \code{botloci_file} \cr + \tab invalid \tab \code{FALSE} \tab \code{markers_info} with \code{Chr}/\code{Pos} \cr + \strong{FixAlleleIDs} \tab fixed \tab \code{TRUE} \tab \code{botloci_file} \strong{or} \code{markers_info} with \code{Ref}/\code{Alt} \cr + \tab fixed \tab \code{FALSE} \tab none \cr + \tab not fixed \tab \code{TRUE} \tab \code{markers_info} with \code{Ref}/\code{Alt} \cr + \tab not fixed \tab \code{FALSE} \tab none \cr +} +} +\section{Dependencies}{ + +Uses \strong{dplyr}, \strong{tidyr}, \strong{tibble}, \strong{reshape2}, \strong{Biostrings} and base +\strong{utils}. Helper functions expected in this package: \code{check_madc_sanity()}, +\code{get_countsMADC()}, \code{get_counts()}, and \code{check_botloci()}. +} + +\examples{ +# Example files shipped with the package +madc_file <- system.file("example_MADC_FixedAlleleID.csv", package = "BIGr") +bot_file <- system.file("example_SNPs_DArTag-probe-design_f180bp.botloci", + package = "BIGr") +out_vcf <- tempfile(fileext = ".vcf") + +# Convert MADC to VCF (attempting to recover REF/ALT from probe sequences) +\dontrun{ +madc2vcf_targets( + madc_file = madc_file, + output.file = out_vcf, + botloci_file = bot_file, + get_REF_ALT = TRUE +) +} + +# Clean up (example) +unlink(out_vcf) + +} +\seealso{ +\code{check_madc_sanity()}, \code{get_countsMADC()}, \code{check_botloci()} } diff --git a/man/validate_pedigree.Rd b/man/validate_pedigree.Rd new file mode 100644 index 0000000..8f47a7d --- /dev/null +++ b/man/validate_pedigree.Rd @@ -0,0 +1,80 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/validate_pedigree.R +\name{validate_pedigree} +\alias{validate_pedigree} +\title{Validate Pedigree Trios Using Mendelian Error Analysis} +\usage{ +validate_pedigree( + pedigree_file, + genotypes_file, + founders_file = NULL, + trio_error_threshold = 5, + min_markers = 10, + single_parent_error_threshold = 2, + verbose = TRUE, + write_txt = TRUE, + output_filename = "pedigree_validation_results.txt" +) +} +\arguments{ +\item{pedigree_file}{Character. Path to the pedigree file (TSV/CSV/TXT) +with columns: ID, Male_Parent, Female_Parent.} + +\item{genotypes_file}{Character. Path to the genotypes file (TSV/CSV/TXT) +with an ID column followed by marker columns coded as 0, 1, 2.} + +\item{founders_file}{Character, optional. Path to a one-column file +listing the IDs of founder individuals. Founders with both parents +coded as 0 are left unchanged with no recommendations. Defaults to NULL.} + +\item{trio_error_threshold}{Numeric. Maximum Mendelian error percentage +to classify a trio as PASS (default: 5.0). Must be between 0 and 100.} + +\item{min_markers}{Integer. Minimum number of non-missing markers +required to evaluate a trio (default: 10).} + +\item{single_parent_error_threshold}{Numeric. Maximum homozygous-marker +mismatch percentage for a parent to be considered acceptable during +parent-level evaluation (default: 2.0). Must be between 0 and 100.} + +\item{verbose}{Logical. If TRUE, prints progress messages, a summary +table, and results to the console (default: TRUE).} + +\item{write_txt}{Logical. If TRUE, writes validation results to +output_filename (default: TRUE).} + +\item{output_filename}{Character. Path/name of the output file +(default: "pedigree_validation_results.txt").} +} +\value{ +A data.table (returned invisibly) with one row per trio and +the following columns: +\describe{ +\item{ID}{Individual ID.} +\item{Male_Parent}{Declared male parent ID.} +\item{Female_Parent}{Declared female parent ID.} +\item{Mendelian_Error_Pct}{Trio-level Mendelian error percentage.} +\item{Markers_Tested}{Number of markers with non-missing genotypes.} +\item{Status}{One of PASS, FAIL, LOW_MARKERS, NO_DATA, FOUNDERS, +MISSING_MALE_PARENT, MISSING_FEMALE_PARENT, MISSING_BOTH_PARENTS, +or NO_GENOTYPE_DATA.} +\item{Correction_Decision}{One of NONE, KEEP_BOTH, +REMOVE_MALE_PARENT, REMOVE_FEMALE_PARENT, REMOVE_BOTH, +LOW_MARKERS_KEEP_BOTH, LOW_MARKERS_REMOVE_MALE_PARENT, +LOW_MARKERS_REMOVE_FEMALE_PARENT, LOW_MARKERS_REMOVE_BOTH.} +\item{Male_Parent_Hom_Error_Pct}{Male parent homozygous-marker mismatch percentage.} +\item{Female_Parent_Hom_Error_Pct}{Female parent homozygous-marker mismatch percentage.} +\item{Best_Male_Parent}{Best-matching male parent candidate ID.} +\item{Best_Male_Parent_Error_Pct}{Homozygous mismatch percentage for the best male parent candidate.} +\item{Best_Female_Parent}{Best-matching female parent candidate ID.} +\item{Best_Female_Parent_Error_Pct}{Homozygous mismatch percentage for the best female parent candidate.} +} +} +\description{ +Validates parent-offspring trios by calculating Mendelian error rates from +SNP genotype data. Identifies incorrect parentage assignments and suggests +best-matching replacements. If a list of founders is supplied, trios that +are declared founders (both parents coded as 0) are preserved unchanged +with no recommendations. Trios removed due to missing genotype data are +retained in the output with a NO_GENOTYPE_DATA status. +} diff --git a/tests/testthat/.gitignore b/tests/testthat/.gitignore new file mode 100644 index 0000000..1388aa1 --- /dev/null +++ b/tests/testthat/.gitignore @@ -0,0 +1 @@ +corrected_pedigree.txt diff --git a/tests/testthat/corrected_pedigree.txt b/tests/testthat/corrected_pedigree.txt new file mode 100644 index 0000000..0434b0c --- /dev/null +++ b/tests/testthat/corrected_pedigree.txt @@ -0,0 +1,4 @@ +ID Male_Parent Female_Parent +IND_C IND_A IND_B +IND_D 0 IND_A +GHOST IND_A IND_B diff --git a/tests/testthat/test-check_madc_sanity.R b/tests/testthat/test-check_madc_sanity.R new file mode 100644 index 0000000..4b60cfe --- /dev/null +++ b/tests/testthat/test-check_madc_sanity.R @@ -0,0 +1,169 @@ +test_that("check madc",{ + skip_if_offline("raw.githubusercontent.com") + + github_path <- "https://raw.githubusercontent.com/Breeding-Insight/BIGapp-PanelHub/refs/heads/long_seq/test_madcs/" + names <- c("Columns", "FixAlleleIDs", "IUPACcodes", "LowerCase", "Indels", "ChromPos", "allNAcol", "allNArow", "RefAltSeqs", "OtherAlleles") + + # raw madc + report <- read.csv(paste0(github_path,"/alfalfa_madc_raw.csv")) + + res <- check_madc_sanity(report) + exp <- c(TRUE, FALSE, FALSE, FALSE, FALSE, TRUE, TRUE, FALSE, TRUE, FALSE) + names(exp) <- names + expect_equal(res$checks, exp) + + # test lower case + report <- read.csv(paste0(github_path,"/alfalfa_lowercase.csv")) + + res <- check_madc_sanity(report) + exp <- c(TRUE, TRUE, FALSE, TRUE, FALSE, TRUE, FALSE, FALSE, FALSE, FALSE) + names(exp) <- names + expect_equal(res$checks, exp) + + # test IUPAC + report <- read.csv(paste0(github_path,"/alfalfa_IUPAC.csv")) + + res <- check_madc_sanity(report) + exp <- c(TRUE, TRUE, TRUE, FALSE, FALSE, TRUE, FALSE, FALSE,TRUE, FALSE) + names(exp) <- names + expect_equal(res$checks, exp) + + # clean alfalfa madc (fixed allele IDs, Chr_Pos CloneIDs, no issues) + report <- read.csv(paste0(github_path,"/alfalfa_madc.csv")) + + res <- check_madc_sanity(report) + exp <- c(TRUE, TRUE, FALSE, FALSE, FALSE, TRUE, FALSE, FALSE,TRUE, FALSE) + names(exp) <- names + expect_equal(res$checks, exp) + + # potato indel madc (ChromPos FALSE because IDs are not Chr_Pos) + report <- read.csv(paste0(github_path,"/potato_indel_madc.csv")) + + res <- check_madc_sanity(report) + exp <- c(TRUE, TRUE, FALSE, FALSE, TRUE, FALSE, FALSE, FALSE,TRUE, FALSE) + names(exp) <- names + expect_equal(res$checks, exp) + + # potato indel IUPAC (IUPAC codes + lowercase + indels) + report <- read.csv(paste0(github_path,"/potato_indel_IUPAC.csv")) + + res <- check_madc_sanity(report) + exp <- c(TRUE, TRUE, TRUE, FALSE, TRUE, FALSE, FALSE, FALSE,TRUE, FALSE) + names(exp) <- names + expect_equal(res$checks, exp) + + # potato indel lowercase (lowercase + indels) + report <- read.csv(paste0(github_path,"/potato_indel_lowercase.csv")) + + res <- check_madc_sanity(report) + exp <- c(TRUE, TRUE, FALSE, TRUE, TRUE, FALSE, FALSE, FALSE,TRUE, FALSE) + names(exp) <- names + expect_equal(res$checks, exp) + + # potato more indels madc ChromPosFALSE + report <- read.csv(paste0(github_path,"/potato_more_indels_madc_ChromPosFALSE.csv")) + + res <- check_madc_sanity(report) + exp <- c(TRUE, TRUE, FALSE, FALSE, TRUE, FALSE, FALSE, FALSE,TRUE, FALSE) + names(exp) <- names + expect_equal(res$checks, exp) +}) + +test_that("check_botloci remaps using Marker_ID", { + botloci <- data.frame(V1 = c("1_0001", "2_0002")) + report <- data.frame( + CloneID = c("ProbeA_0001", "ProbeB_0002"), + AlleleID = c("ProbeA_0001|Ref_0001", "ProbeB_0002|Ref_0001"), + AlleleSequence = c("A", "T"), + check.names = FALSE + ) + mi_df <- data.frame( + Marker_ID = c("ProbeA_0001", "ProbeB_0002"), + Chr = c("1", "2"), + Pos = c(1, 2) + ) + + res <- check_botloci(botloci, report, ChromPos = FALSE, mi_df = mi_df, verbose = FALSE) + + expect_equal(res[[2]]$CloneID, botloci$V1) + expect_equal(res[[3]]$CloneID, botloci$V1) +}) + +test_that("check_botloci resolves Marker_ID before padding report CloneIDs", { + botloci <- data.frame(V1 = "1_000000123") + report <- data.frame( + CloneID = "1_123", + AlleleID = "1_123|Ref_0001", + AlleleSequence = "A", + check.names = FALSE + ) + mi_df <- data.frame( + Marker_ID = "1_123", + Chr = "1", + Pos = 123 + ) + + res <- check_botloci(botloci, report, ChromPos = TRUE, mi_df = mi_df, verbose = FALSE) + + expect_equal(res[[2]]$CloneID, botloci$V1) + expect_equal(res[[3]]$CloneID, botloci$V1) +}) + +test_that("pick_markers_info_id_col scores distinct markers not allele rows", { + mi_df <- data.frame( + CloneID = c("m1", "m2"), + Marker_ID = c("m1", "m3") + ) + query_ids <- c("m1", "m1", "m1", "m2") + + expect_equal(pick_markers_info_id_col(mi_df, query_ids), "CloneID") +}) + +test_that("check_madc_sanity returns FALSE for malformed CloneID positions", { + report <- data.frame( + CloneID = c("Chr_abc", "Chr_abc"), + AlleleID = c("Chr_abc|Ref_0001", "Chr_abc|Alt_0002"), + AlleleSequence = c("A", "T"), + check.names = FALSE + ) + + res <- check_madc_sanity(report) + + expect_false(is.na(res$checks["ChromPos"])) + expect_false(res$checks["ChromPos"]) +}) + +test_that("check_botloci errors if widening MADC padding still does not match", { + botloci <- data.frame(V1 = "1_0002") + report <- data.frame( + CloneID = "1_1", + AlleleID = "1_1|Ref_0001", + AlleleSequence = "A", + check.names = FALSE + ) + + expect_error( + check_botloci(botloci, report, ChromPos = TRUE, verbose = FALSE), + "After matching padding, botloci markers still not found in MADC file. Check marker IDs." + ) +}) + +test_that("check_botloci keeps AlleleID synchronized after CloneID remap", { + botloci <- data.frame(V1 = "1_0001") + report <- data.frame( + CloneID = "ProbeA_0001", + AlleleID = "ProbeA_0001|Ref_0001", + AlleleSequence = "A", + check.names = FALSE + ) + mi_df <- data.frame( + Marker_ID = "ProbeA_0001", + Chr = "1", + Pos = 1 + ) + + res <- check_botloci(botloci, report, ChromPos = TRUE, mi_df = mi_df, verbose = FALSE) + + expect_equal(res[[2]]$CloneID, "1_0001") + expect_equal(res[[2]]$AlleleID, "1_0001|Ref_0001") +}) diff --git a/tests/testthat/test-check_ped.R b/tests/testthat/test-check_ped.R index fdf69e0..887eb75 100644 --- a/tests/testthat/test-check_ped.R +++ b/tests/testthat/test-check_ped.R @@ -4,9 +4,11 @@ context("Imputation Concordance") test_that("test imputation",{ #Input variables ped_file <- system.file("check_ped_test.txt", package="BIGr") + temp_ped <- tempfile("check_ped_test_", fileext = ".txt") + file.copy(ped_file, temp_ped) #Calculations - output.list <- check_ped(ped_file, + output.list <- check_ped(temp_ped, seed = 101919, verbose = FALSE) @@ -14,9 +16,15 @@ test_that("test imputation",{ df_length <- length(output.list) messy_parents <- output.list$messy_parents missing_parents <- output.list$missing_parents + corrected_pedigree <- output.list$corrected_pedigree + file_base <- tools::file_path_sans_ext(basename(temp_ped)) - expect_true(df_length == 4) + expect_true(df_length == 6) expect_true(all(messy_parents$id == c("grandfather2","grandfather3"))) expect_true(nrow(missing_parents) == 13) + expect_s3_class(corrected_pedigree, "data.frame") + expect_true(all(missing_parents$id %in% corrected_pedigree$id)) + expect_false(exists(paste0(file_base, "_corrected"), envir = .GlobalEnv, inherits = FALSE)) + expect_false(exists(paste0(file_base, "_report"), envir = .GlobalEnv, inherits = FALSE)) }) diff --git a/tests/testthat/test-dosage2vcf.R b/tests/testthat/test-dosage2vcf.R index 18482c3..4ea57df 100644 --- a/tests/testthat/test-dosage2vcf.R +++ b/tests/testthat/test-dosage2vcf.R @@ -1,5 +1,68 @@ context("Dosage Report to VCF") +read_dosage2vcf_body <- function(output_file) { + vcf_lines <- readLines(paste0(output_file, ".vcf")) + body_lines <- vcf_lines[!grepl("^##", vcf_lines)] + read.table(text = paste(body_lines, collapse = "\n"), + header = TRUE, + sep = "\t", + quote = "", + comment.char = "", + check.names = FALSE, + stringsAsFactors = FALSE, + colClasses = "character") +} + +write_dart_counts_fixture <- function(path, sample_order = c("S2", "S1", "S3")) { + meta_cols <- c("MarkerName", "AlleleSequence", "Variant", "CallRate", + "OneRatioRef", "OneRatioSnp", "FreqHomRef", "FreqHomSnp", + "FreqHets", "PICRef", "PICSnp", "AvgPIC", "AvgCountRef", + "AvgCountSnp", "RatioAvgCountRefAvgCountSnp") + skipped <- rep(paste(rep("*", length(meta_cols) + length(sample_order)), collapse = ","), 6) + sample_header <- paste(c(rep("*", length(meta_cols)), sample_order), collapse = ",") + target_header <- paste(c(meta_cols, paste0("T", seq_along(sample_order))), collapse = ",") + fixed_values <- rep("1", length(meta_cols) - 3) + rows <- c( + paste(c("Chr01_000120735", "AAA", "", fixed_values, c("10", "20", "30")), collapse = ","), + paste(c("Chr01_000120735", "ATA", "-:A>T", fixed_values, c("1", "2", "3")), collapse = ","), + paste(c("Chr02_000000005", "GGG", "", fixed_values, c("40", "50", "60")), collapse = ","), + paste(c("Chr02_000000005", "GCG", "-:G>C", fixed_values, c("4", "5", "6")), collapse = ",") + ) + writeLines(c(skipped, sample_header, target_header, rows), path) +} + +write_snp_1row_fixture <- function(path, sample_order = c("S1", "S2", "S3")) { + meta_cols <- c("MarkerName", "AlleleSequenceRef", "AlleleSequenceAlt", "Variant", + "CallRate", "OneRatioRef", "OneRatioSnp", "FreqHomRef", + "FreqHomSnp", "FreqHets", "PICRef", "PICSnp", "AvgPIC", + "AvgCountRef", "AvgCountSnp", "RatioAvgCountRefAvgCountSnp") + skipped <- rep(paste(rep("*", length(meta_cols) + length(sample_order)), collapse = ","), 6) + header <- paste(c(meta_cols, sample_order), collapse = ",") + fixed_values <- rep("1", length(meta_cols) - 4) + rows <- c( + paste(c("Chr01_000120735", "AAA", "ATA", "-:A>T", fixed_values, c("0", "1", "2")), collapse = ","), + paste(c("Chr02_000000005", "GGG", "GCG", "-:G>C", fixed_values, c("-", "", "0")), collapse = ",") + ) + writeLines(c(skipped, header, rows), path) +} + +write_snp_2row_fixture <- function(path, sample_order = c("S1", "S2", "S3")) { + meta_cols <- c("MarkerName", "AlleleSequence", "Variant", "CallRate", + "OneRatioRef", "OneRatioSnp", "FreqHomRef", "FreqHomSnp", + "FreqHets", "PICRef", "PICSnp", "AvgPIC", "AvgCountRef", + "AvgCountSnp", "RatioAvgCountRefAvgCountSnp") + skipped <- rep(paste(rep("*", length(meta_cols) + length(sample_order)), collapse = ","), 6) + header <- paste(c(meta_cols, sample_order), collapse = ",") + fixed_values <- rep("1", length(meta_cols) - 3) + rows <- c( + paste(c("Chr01_000120735", "AAA", "", fixed_values, c("1", "0", "1")), collapse = ","), + paste(c("Chr01_000120735", "ATA", "-:A>T", fixed_values, c("0", "1", "1")), collapse = ","), + paste(c("Chr02_000000005", "GGG", "", fixed_values, c("-", "-", "1")), collapse = ","), + paste(c("Chr02_000000005", "GCG", "-:G>C", fixed_values, c("-", "-", "0")), collapse = ",") + ) + writeLines(c(skipped, header, rows), path) +} + test_that("test dosage report conversion",{ #Input variables @@ -31,3 +94,53 @@ test_that("test dosage report conversion",{ rm(output_file) }) + +test_that("SNP/INDEL 1 row reports use diploid genotype codes and align counts by sample name", { + report <- tempfile(fileext = ".csv") + counts <- tempfile(fileext = ".csv") + output_file <- tempfile() + write_snp_1row_fixture(report) + write_dart_counts_fixture(counts) + + dosage2vcf(dart.report = report, dart.counts = counts, ploidy = 2, output.file = output_file) + vcf <- read_dosage2vcf_body(output_file) + + expect_equal(colnames(vcf)[10:12], c("S1", "S2", "S3")) + expect_equal(vcf$`#CHROM`, c("Chr01", "Chr02")) + expect_equal(vcf$POS, c("000120735", "000000005")) + expect_equal(vcf$ID, c("Chr01_000120735", "Chr02_000000005")) + expect_equal(vcf$S1[1], "0/0:2:22:20") + expect_equal(vcf$S2[1], "1/1:0:11:10") + expect_equal(vcf$S3[1], "0/1:1:33:30") + expect_equal(vcf$S1[2], "./.:.:55:50") + expect_equal(vcf$S2[2], "./.:.:44:40") + expect_equal(vcf$S3[2], "0/0:2:66:60") +}) + +test_that("SNP/INDEL 2 row reports collapse to the same genotype codes as 1 row reports", { + snp_1row <- tempfile(fileext = ".csv") + snp_2row <- tempfile(fileext = ".csv") + counts <- tempfile(fileext = ".csv") + output_1row <- tempfile() + output_2row <- tempfile() + write_snp_1row_fixture(snp_1row) + write_snp_2row_fixture(snp_2row) + write_dart_counts_fixture(counts) + + dosage2vcf(dart.report = snp_1row, dart.counts = counts, ploidy = 2, output.file = output_1row) + dosage2vcf(dart.report = snp_2row, dart.counts = counts, ploidy = 2, output.file = output_2row) + + expect_equal(read_dosage2vcf_body(output_2row), read_dosage2vcf_body(output_1row)) +}) + +test_that("SNP/INDEL reports require diploid ploidy", { + report <- tempfile(fileext = ".csv") + counts <- tempfile(fileext = ".csv") + write_snp_1row_fixture(report) + write_dart_counts_fixture(counts) + + expect_error( + dosage2vcf(dart.report = report, dart.counts = counts, ploidy = 4, output.file = tempfile()), + "diploid genotype reports" + ) +}) diff --git a/tests/testthat/test-find_parentage.R b/tests/testthat/test-find_parentage.R new file mode 100644 index 0000000..55e6913 --- /dev/null +++ b/tests/testthat/test-find_parentage.R @@ -0,0 +1,435 @@ +# tests/testthat/test-find_parentage.R +# Run with: testthat::test_file("tests/testthat/test-find_parentage.R") +library(testthat) +library(data.table) + +# ============================================================================== +# Helpers +# ============================================================================== + +make_files <- function(genos, parents, progeny, dir = tempdir()) { + geno_file <- file.path(dir, paste0("genos_", sample(1e6,1), ".txt")) + parent_file <- file.path(dir, paste0("parents_", sample(1e6,1), ".txt")) + progeny_file <- file.path(dir, paste0("progeny_", sample(1e6,1), ".txt")) + data.table::fwrite(genos, geno_file, sep = "\t") + data.table::fwrite(parents, parent_file, sep = "\t") + data.table::fwrite(progeny, progeny_file, sep = "\t") + list(g = geno_file, p = parent_file, pr = progeny_file) +} + +# ------------------------------------------------------------------------------ +# Base toy data +# S1 / D1: all 0 → child1 (all 0) is a perfect Mendelian child of S1 x D1 +# S2 / D2: all 2 → child2 (all 2) is a perfect Mendelian child of S2 x D2 +# ------------------------------------------------------------------------------ +base_genos <- data.table::data.table( + ID = c("S1","S2","D1","D2","child1","child2"), + M1 = c(0L, 2L, 0L, 2L, 0L, 2L), + M2 = c(0L, 2L, 0L, 2L, 0L, 2L), + M3 = c(0L, 2L, 0L, 2L, 0L, 2L), + M4 = c(0L, 2L, 0L, 2L, 0L, 2L), + M5 = c(0L, 2L, 0L, 2L, 0L, 2L) +) +base_parents <- data.table::data.table(ID = c("S1","S2","D1","D2"), + Sex = c("M","M","F","F")) +child1_progeny <- data.table::data.table(ID = "child1") +child2_progeny <- data.table::data.table(ID = "child2") +base_progeny <- data.table::data.table(ID = c("child1","child2")) + +# All-zero genotypes — every pair ties at 0% error +tied_genos <- data.table::data.table( + ID = c("S1","S2","D1","D2","child_tie"), + M1 = c(0L, 0L, 0L, 0L, 0L), + M2 = c(0L, 0L, 0L, 0L, 0L) +) +tied_parents <- data.table::data.table(ID = c("S1","S2","D1","D2"), + Sex = c("M","M","F","F")) +tied_progeny <- data.table::data.table(ID = "child_tie") + +# ============================================================================== +# 1. Input validation +# ============================================================================== + +test_that("invalid method throws an error", { + f <- make_files(base_genos, base_parents, child1_progeny) + expect_error( + find_parentage(f$g, f$p, f$pr, method = "bad_method", + verbose = FALSE, write_txt = FALSE), + regexp = "Method must be one of" + ) +}) + +test_that("min_markers < 1 throws an error", { + f <- make_files(base_genos, base_parents, child1_progeny) + expect_error( + find_parentage(f$g, f$p, f$pr, min_markers = 0, + verbose = FALSE, write_txt = FALSE), + regexp = "min_markers" + ) +}) + +test_that("error_threshold out of range throws an error", { + f <- make_files(base_genos, base_parents, child1_progeny) + expect_error( + find_parentage(f$g, f$p, f$pr, error_threshold = 150, + verbose = FALSE, write_txt = FALSE), + regexp = "error_threshold" + ) + expect_error( + find_parentage(f$g, f$p, f$pr, error_threshold = -1, + verbose = FALSE, write_txt = FALSE), + regexp = "error_threshold" + ) +}) + +test_that("missing genotype file throws an error", { + f <- make_files(base_genos, base_parents, child1_progeny) + expect_error( + find_parentage("nonexistent.txt", f$p, f$pr, + verbose = FALSE, write_txt = FALSE) + ) +}) + +test_that("parent IDs absent from genotype file raise a warning and are dropped", { + extra_parents <- rbind(base_parents, + data.table::data.table(ID = "GHOST", Sex = "M")) + f <- make_files(base_genos, extra_parents, child1_progeny) + expect_warning( + find_parentage(f$g, f$p, f$pr, method = "best_pair", + verbose = FALSE, write_txt = FALSE), + regexp = "GHOST" + ) +}) + +test_that("progeny IDs absent from genotype file raise a warning and are dropped", { + extra_progeny <- rbind(child1_progeny, + data.table::data.table(ID = "GHOST_KID")) + f <- make_files(base_genos, base_parents, extra_progeny) + expect_warning( + find_parentage(f$g, f$p, f$pr, method = "best_pair", + verbose = FALSE, write_txt = FALSE), + regexp = "GHOST_KID" + ) +}) + +test_that("no valid progeny candidates after filtering stops with an error", { + ghost_progeny <- data.table::data.table(ID = "NOBODY") + f <- make_files(base_genos, base_parents, ghost_progeny) + expect_warning( + expect_error( + find_parentage(f$g, f$p, f$pr, method = "best_pair", + verbose = FALSE, write_txt = FALSE), + regexp = "No valid progeny" + ) + ) +}) + +test_that("missing Sex column raises a warning and defaults to ambiguous", { + parents_no_sex <- data.table::data.table(ID = c("S1","D1")) + f <- make_files(base_genos, parents_no_sex, child1_progeny) + expect_warning( + find_parentage(f$g, f$p, f$pr, method = "best_match", + verbose = FALSE, write_txt = FALSE), + regexp = "Sex" + ) +}) + +# ============================================================================== +# 2. Return structure +# ============================================================================== + +test_that("best_pair returns a data.table with expected columns (no ties)", { + f <- make_files(base_genos, base_parents, child1_progeny) + res <- find_parentage(f$g, f$p, f$pr, method = "best_pair", + show_ties = FALSE, verbose = FALSE, write_txt = FALSE) + expect_s3_class(res, "data.table") + expect_true(all(c("Progeny","Male_Parent","Female_Parent", + "Mendelian_Error_Pct","Markers_Tested", + "Assignment_Status") %in% names(res))) + expect_equal(nrow(res), 1L) +}) + +test_that("best_male_parent returns expected columns", { + f <- make_files(base_genos, base_parents, child1_progeny) + res <- find_parentage(f$g, f$p, f$pr, method = "best_male_parent", + verbose = FALSE, write_txt = FALSE) + expect_s3_class(res, "data.table") + expect_true(all(c("Progeny","Best_Match","Mendelian_Error_Pct", + "Markers_Tested","Assignment_Status") %in% names(res))) + expect_equal(nrow(res), 1L) +}) + +test_that("best_female_parent returns expected columns", { + f <- make_files(base_genos, base_parents, child1_progeny) + res <- find_parentage(f$g, f$p, f$pr, method = "best_female_parent", + verbose = FALSE, write_txt = FALSE) + expect_s3_class(res, "data.table") + expect_true(all(c("Progeny","Best_Match","Mendelian_Error_Pct", + "Markers_Tested","Assignment_Status") %in% names(res))) + expect_equal(nrow(res), 1L) +}) + +test_that("best_match returns expected columns", { + f <- make_files(base_genos, base_parents, child1_progeny) + res <- find_parentage(f$g, f$p, f$pr, method = "best_match", + verbose = FALSE, write_txt = FALSE) + expect_s3_class(res, "data.table") + expect_true(all(c("Progeny","Best_Match","Mendelian_Error_Pct", + "Markers_Tested","Assignment_Status") %in% names(res))) + expect_equal(nrow(res), 1L) +}) + +test_that("one row returned per progeny for single-parent methods", { + f <- make_files(base_genos, base_parents, child1_progeny) + for (m in c("best_male_parent","best_female_parent","best_match")) { + res <- find_parentage(f$g, f$p, f$pr, method = m, + verbose = FALSE, write_txt = FALSE) + expect_equal(nrow(res), 1L, label = paste("row count for method", m)) + } +}) + +test_that("Markers_Tested equals the number of non-NA marker columns", { + f <- make_files(base_genos, base_parents, child1_progeny) + res <- find_parentage(f$g, f$p, f$pr, method = "best_pair", + show_ties = FALSE, verbose = FALSE, write_txt = FALSE) + expect_equal(res$Markers_Tested, ncol(base_genos) - 1L) +}) + +test_that("Mendelian_Error_Pct is between 0 and 100", { + f <- make_files(base_genos, base_parents, child1_progeny) + res <- find_parentage(f$g, f$p, f$pr, method = "best_pair", + show_ties = FALSE, verbose = FALSE, write_txt = FALSE) + pct <- as.numeric(res$Mendelian_Error_Pct) + expect_true(all(pct >= 0 & pct <= 100, na.rm = TRUE)) +}) + +# ============================================================================== +# 3. Biological correctness +# ============================================================================== + +test_that("best_pair correctly identifies S1 x D1 for child1 with 0% error", { + f <- make_files(base_genos, base_parents, child1_progeny) + res <- find_parentage(f$g, f$p, f$pr, method = "best_pair", + show_ties = FALSE, verbose = FALSE, write_txt = FALSE) + expect_equal(res$Male_Parent, "S1") + expect_equal(res$Female_Parent, "D1") + expect_equal(as.numeric(res$Mendelian_Error_Pct), 0) +}) + +test_that("best_pair correctly identifies S2 x D2 for child2 with 0% error", { + f <- make_files(base_genos, base_parents, child2_progeny) + res <- find_parentage(f$g, f$p, f$pr, method = "best_pair", + show_ties = FALSE, verbose = FALSE, write_txt = FALSE) + expect_equal(res$Male_Parent, "S2") + expect_equal(res$Female_Parent, "D2") + expect_equal(as.numeric(res$Mendelian_Error_Pct), 0) +}) + +test_that("best_male_parent identifies S1 as best male for child1", { + f <- make_files(base_genos, base_parents, child1_progeny) + res <- find_parentage(f$g, f$p, f$pr, method = "best_male_parent", + verbose = FALSE, write_txt = FALSE) + expect_equal(res$Best_Match, "S1") +}) + +test_that("best_female_parent identifies D1 as best female for child1", { + f <- make_files(base_genos, base_parents, child1_progeny) + res <- find_parentage(f$g, f$p, f$pr, method = "best_female_parent", + verbose = FALSE, write_txt = FALSE) + expect_equal(res$Best_Match, "D1") +}) + +test_that("Mendelian_Error_Pct is 0 for a perfect parent-progeny trio", { + f <- make_files(base_genos, base_parents, child1_progeny) + res <- find_parentage(f$g, f$p, f$pr, method = "best_pair", + show_ties = FALSE, verbose = FALSE, write_txt = FALSE) + expect_equal(as.numeric(res$Mendelian_Error_Pct), 0) +}) + +# ============================================================================== +# 4. Assignment_Status — min_markers and error_threshold +# ============================================================================== + +test_that("Assignment_Status = PASS for perfect trio within thresholds", { + f <- make_files(base_genos, base_parents, child1_progeny) + res <- find_parentage(f$g, f$p, f$pr, method = "best_pair", + min_markers = 3, error_threshold = 5.0, + show_ties = FALSE, verbose = FALSE, write_txt = FALSE) + expect_equal(res$Assignment_Status, "PASS") +}) + +test_that("Assignment_Status = LOW_MARKERS when min_markers exceeds available markers", { + f <- make_files(base_genos, base_parents, child1_progeny) + res <- find_parentage(f$g, f$p, f$pr, method = "best_pair", + min_markers = 99999, error_threshold = 5.0, + show_ties = FALSE, verbose = FALSE, write_txt = FALSE) + expect_equal(res$Assignment_Status, "LOW_MARKERS") +}) + +test_that("Assignment_Status = HIGH_ERROR when error rate exceeds threshold", { + # Use wrong parents so error rate is high + high_error_genos <- data.table::data.table( + ID = c("S1","D1","bad_child"), + M1 = c(0L, 0L, 2L), + M2 = c(0L, 0L, 2L), + M3 = c(0L, 0L, 2L), + M4 = c(0L, 0L, 2L), + M5 = c(0L, 0L, 2L) + ) + parents <- data.table::data.table(ID = c("S1","D1"), Sex = c("M","F")) + progeny <- data.table::data.table(ID = "bad_child") + f <- make_files(high_error_genos, parents, progeny) + res <- find_parentage(f$g, f$p, f$pr, method = "best_pair", + min_markers = 3, error_threshold = 5.0, + show_ties = FALSE, verbose = FALSE, write_txt = FALSE) + expect_equal(res$Assignment_Status, "HIGH_ERROR") +}) + +test_that("Assignment_Status column is present in all methods", { + f <- make_files(base_genos, base_parents, child1_progeny) + for (m in c("best_pair","best_male_parent","best_female_parent","best_match")) { + res <- find_parentage(f$g, f$p, f$pr, method = m, + show_ties = FALSE, verbose = FALSE, write_txt = FALSE) + expect_true("Assignment_Status" %in% names(res), + label = paste("Assignment_Status present for method", m)) + } +}) + +# ============================================================================== +# 5. allow_selfing +# ============================================================================== + +test_that("allow_selfing = FALSE removes self-pairs from candidates", { + ambig_parents <- data.table::data.table(ID = c("S1","D1"), Sex = c("A","A")) + f <- make_files(base_genos, ambig_parents, child1_progeny) + expect_warning( + res <- find_parentage(f$g, f$p, f$pr, method = "best_pair", + allow_selfing = FALSE, show_ties = FALSE, + verbose = FALSE, write_txt = FALSE), + regexp = "tied" + ) + if (!is.na(res$Male_Parent) && !is.na(res$Female_Parent)) + expect_false(res$Male_Parent == res$Female_Parent) +}) + +# ============================================================================== +# 6. show_ties +# ============================================================================== + +test_that("show_ties = TRUE produces _1/_2 suffixed columns when ties exist", { + f <- make_files(tied_genos, tied_parents, tied_progeny) + res <- find_parentage(f$g, f$p, f$pr, method = "best_pair", + show_ties = TRUE, verbose = FALSE, write_txt = FALSE) + expect_true(any(grepl("^Male_Parent_", names(res)))) + expect_true(any(grepl("^Female_Parent_", names(res)))) +}) + +test_that("show_ties = FALSE warns about ties and returns single-result columns", { + f <- make_files(tied_genos, tied_parents, tied_progeny) + expect_warning( + res <- find_parentage(f$g, f$p, f$pr, method = "best_pair", + show_ties = FALSE, verbose = FALSE, write_txt = FALSE), + regexp = "tied" + ) + expect_true("Male_Parent" %in% names(res)) + expect_true("Female_Parent" %in% names(res)) + expect_false(any(grepl("^Male_Parent_\\d", names(res)))) + expect_false(any(grepl("^Female_Parent_\\d", names(res)))) +}) + +# ============================================================================== +# 7. verbose / write_txt +# ============================================================================== + +test_that("verbose = TRUE returns the result invisibly as data.table", { + f <- make_files(base_genos, base_parents, child1_progeny) + res <- find_parentage(f$g, f$p, f$pr, method = "best_pair", + verbose = TRUE, write_txt = FALSE) + expect_s3_class(res, "data.table") +}) + +test_that("verbose = FALSE returns the result as data.table", { + f <- make_files(base_genos, base_parents, child1_progeny) + res <- find_parentage(f$g, f$p, f$pr, method = "best_pair", + verbose = FALSE, write_txt = FALSE) + expect_s3_class(res, "data.table") +}) + +test_that("write_txt = TRUE creates the output file", { + old_wd <- getwd() + tmp <- tempdir() + setwd(tmp) + on.exit(setwd(old_wd), add = TRUE) + f <- make_files(base_genos, base_parents, child1_progeny, dir = tmp) + find_parentage(f$g, f$p, f$pr, method = "best_pair", + verbose = FALSE, write_txt = TRUE) + expect_true(file.exists(file.path(tmp, "parentage_results_dt.txt"))) +}) + +test_that("write_txt = FALSE does not create the output file", { + old_wd <- getwd() + tmp <- tempdir() + setwd(tmp) + on.exit(setwd(old_wd), add = TRUE) + out_file <- file.path(tmp, "parentage_results_dt.txt") + if (file.exists(out_file)) file.remove(out_file) + f <- make_files(base_genos, base_parents, child1_progeny, dir = tmp) + find_parentage(f$g, f$p, f$pr, method = "best_pair", + verbose = FALSE, write_txt = FALSE) + expect_false(file.exists(out_file)) +}) + +# ============================================================================== +# 8. Sex-based candidate filtering +# ============================================================================== + +test_that("best_male_parent only assigns M or A parents", { + f <- make_files(base_genos, base_parents, child1_progeny) + res <- find_parentage(f$g, f$p, f$pr, method = "best_male_parent", + verbose = FALSE, write_txt = FALSE) + valid_male_parents <- base_parents[Sex %in% c("M","A")]$ID + expect_true(res$Best_Match %in% valid_male_parents) +}) + +test_that("best_female_parent only assigns F or A parents", { + f <- make_files(base_genos, base_parents, child1_progeny) + res <- find_parentage(f$g, f$p, f$pr, method = "best_female_parent", + verbose = FALSE, write_txt = FALSE) + valid_female_parents <- base_parents[Sex %in% c("F","A")]$ID + expect_true(res$Best_Match %in% valid_female_parents) +}) + +# ============================================================================== +# 9. Edge cases +# ============================================================================== + +test_that("single progeny individual is handled correctly", { + f <- make_files(base_genos, base_parents, child1_progeny) + res <- find_parentage(f$g, f$p, f$pr, method = "best_pair", + show_ties = FALSE, verbose = FALSE, write_txt = FALSE) + expect_equal(nrow(res), 1L) +}) + +test_that("all-NA marker column does not cause an error", { + na_genos <- data.table::copy(base_genos) + na_genos[, M1 := NA_integer_] + f <- make_files(na_genos, base_parents, child1_progeny) + expect_no_error( + find_parentage(f$g, f$p, f$pr, method = "best_pair", + verbose = FALSE, write_txt = FALSE) + ) +}) + +test_that("Progeny column contains the correct progeny IDs", { + f <- make_files(base_genos, base_parents, child1_progeny) + res <- find_parentage(f$g, f$p, f$pr, method = "best_pair", + show_ties = FALSE, verbose = FALSE, write_txt = FALSE) + expect_setequal(res$Progeny, child1_progeny$ID) +}) + +test_that("multiple progeny are all represented in output", { + f <- make_files(base_genos, base_parents, base_progeny) + res <- find_parentage(f$g, f$p, f$pr, method = "best_pair", + show_ties = FALSE, verbose = FALSE, write_txt = FALSE) + expect_setequal(res$Progeny, base_progeny$ID) +}) diff --git a/tests/testthat/test-imputation_concordance.R b/tests/testthat/test-imputation_concordance.R index f1fb421..459998c 100644 --- a/tests/testthat/test-imputation_concordance.R +++ b/tests/testthat/test-imputation_concordance.R @@ -1,6 +1,5 @@ context("Imputation Concordance") - test_that("test imputation",{ #Input variables ignore_file <- system.file("imputation_ignore.txt", package="BIGr") diff --git a/tests/testthat/test-madc2vcf_all.R b/tests/testthat/test-madc2vcf_all.R index f2095c5..9694f75 100644 --- a/tests/testthat/test-madc2vcf_all.R +++ b/tests/testthat/test-madc2vcf_all.R @@ -22,7 +22,7 @@ test_that("test madc offtargets",{ multiallelic_SNP_sample_thr = 0, alignment_score_thr = 40, out_vcf = temp, - verbose = TRUE) + verbose = FALSE) set.seed(456) madc2vcf_all(madc = madc_file, @@ -34,7 +34,7 @@ test_that("test madc offtargets",{ multiallelic_SNP_sample_thr = 0, alignment_score_thr = 40, out_vcf = temp_multi, - verbose = TRUE) + verbose = FALSE) vcf <- read.vcfR(temp) vcf_multi <- read.vcfR(temp_multi) @@ -56,7 +56,7 @@ test_that("test madc offtargets",{ multiallelic_SNP_dp_thr = 0, multiallelic_SNP_sample_thr = 0, out_vcf = temp, - verbose = TRUE) + verbose = FALSE) vcf <- read.vcfR(temp) @@ -65,3 +65,531 @@ test_that("test madc offtargets",{ rm(vcf) }) + +test_that("madc2vcf_all preserves comma-separated AD for biallelic targets", { + madc_file <- system.file("example_MADC_FixedAlleleID.csv", package="BIGr") + bot_file <- system.file("example_SNPs_DArTag-probe-design_f180bp.botloci", package="BIGr") + db_file <- system.file("example_allele_db.fa", package="BIGr") + temp <- tempfile(fileext = ".vcf") + + madc2vcf_all(madc = madc_file, + botloci_file = bot_file, + hap_seq_file = db_file, + n.cores = 1, + out_vcf = temp, + verbose = FALSE) + + vcf <- read.vcfR(temp, verbose = FALSE) + ad <- extract.gt(vcf, "AD") + biallelic <- !grepl(",", vcf@fix[, "ALT"]) + + expect_true(all(grepl("^[0-9]+,[0-9]+$", ad[biallelic, 1]))) +}) + +test_that("madc2vcf_all accepts BI_markerID matches when CloneID does not match", { + madc_file <- system.file("example_MADC_FixedAlleleID.csv", package="BIGr") + bot_file <- system.file("example_SNPs_DArTag-probe-design_f180bp.botloci", package="BIGr") + db_file <- system.file("example_allele_db.fa", package="BIGr") + temp <- tempfile(fileext = ".vcf") + temp_markers <- tempfile(fileext = ".csv") + + report <- read.csv(madc_file, check.names = FALSE) + marker_ids <- unique(report$CloneID) + markers_info <- data.frame( + CloneID = paste0("bogus_", seq_along(marker_ids)), + BI_markerID = marker_ids + ) + write.csv(markers_info, temp_markers, row.names = FALSE) + + expect_no_error( + madc2vcf_all(madc = madc_file, + botloci_file = bot_file, + hap_seq_file = db_file, + markers_info = temp_markers, + n.cores = 1, + out_vcf = temp, + verbose = FALSE) + ) +}) + +test_that("madc2vcf_all surfaces missing-column validation error without crashing", { + madc_file <- system.file("example_MADC_FixedAlleleID.csv", package="BIGr") + bot_file <- system.file("example_SNPs_DArTag-probe-design_f180bp.botloci", package="BIGr") + temp_madc <- tempfile(fileext = ".csv") + + report <- read.csv(madc_file, check.names = FALSE) + report$AlleleSequence <- NULL + write.csv(report, temp_madc, row.names = FALSE) + + expect_error( + madc2vcf_all(madc = temp_madc, + botloci_file = bot_file, + out_vcf = tempfile(fileext = ".vcf"), + verbose = FALSE), + "One or more required columns missing" + ) +}) + +# ======================================================================= +# Using Breeding-Insight/BIGapp-PanelHub test files +# ======================================================================= + +test_that("simu alfalfa",{ + + github_path <- "https://raw.githubusercontent.com/Breeding-Insight/BIGapp-PanelHub/refs/heads/long_seq/" + + # External alfalfa test files + alfalfa_madc <- paste0(github_path, "test_madcs/alfalfa_madc.csv") + alfalfa_madc_wrongID <- paste0(github_path, "test_madcs/alfalfa_madc_wrongID.csv") + alfalfa_madc_raw <- paste0(github_path, "test_madcs/alfalfa_madc_raw.csv") # raw DArT format (7-row header) + alfalfa_iupac <- paste0(github_path, "test_madcs/alfalfa_IUPAC.csv") + alfalfa_lowercase <- paste0(github_path, "test_madcs/alfalfa_lowercase.csv") + alfalfa_botloci <- paste0(github_path, "alfalfa/20201030-BI-Alfalfa_SNPs_DArTag-probe-design_f180bp.botloci") # botloci for alfalfa + alfalfa_markers_info <- paste0(github_path, "alfalfa/20201030-BI-Alfalfa_SNPs_DArTag-probe-design_snpID_lut.csv") # markers_info: CloneID/BI_markerID, Chr, Pos, Ref, Alt + alfalfa_markers_info_ChromPos <- paste0(github_path, "test_madcs/alfalfa_marker_info_ChromPos.csv") # markers_info: CloneID/BI_markerID, Chr, Pos + alfalfa_microhapDB <- paste0(github_path, "alfalfa/alfalfa_allele_db_v001.fa") + + # External potato test files + potato_indel_madc <- paste0(github_path, "test_madcs/potato_indel_madc.csv") + potato_indel_iupac <- paste0(github_path, "test_madcs/potato_indel_IUPAC.csv") + potato_indel_lowercase <- paste0(github_path, "test_madcs/potato_indel_lowercase.csv") + potato_more_indels_chrompos_false <- paste0(github_path, "test_madcs/potato_more_indels_madc_ChromPosFALSE.csv") + potato_botloci <- paste0(github_path, "potato/potato_dartag_v2_3915markers_rm7dupTags_6traitMarkers_f150bp_ref_alt.botloci") + potato_markers_info <- paste0(github_path, "potato/potato_dartag_v2_3915markers_rm7dupTags_6traitMarkers_rm1dup_snpID_lut.csv") # CloneID/BI_markerID, Chr, Pos, Ref, Alt + potato_markers_info_ChromPos <- paste0(github_path, "test_madcs/potato_marker_info_chrompos.csv") # markers_info: CloneID/BI_markerID, Chr, Pos + potato_microhapDB <- paste0(github_path, "potato/potato_allele_db_v001.fa") + + skip_on_cran() + skip_if_offline("raw.githubusercontent.com") + + test_that("ALFALFA — clean fixed allele ID MADC", { + out <- tempfile(fileext = ".vcf") + #out <- "test.vcf" + # Default parameters + expect_no_error( + madc2vcf_all(madc = alfalfa_madc, + botloci_file = alfalfa_botloci, + hap_seq_file = alfalfa_microhapDB, + n.cores = 2, + rm_multiallelic_SNP = FALSE, + multiallelic_SNP_sample_thr = 0, + multiallelic_SNP_dp_thr = 0, + alignment_score_thr = 40, + out_vcf = out, + verbose = FALSE) + ) + vcf <- read.vcfR(out, verbose = FALSE) + expect_s4_class(vcf, "vcfR") + expect_true(all(!is.na(vcf@fix[, "REF"]))) + expect_true(all(!is.na(vcf@fix[, "ALT"]))) + DP <- extract.gt(vcf, "DP", as.numeric = TRUE) + expect_equal(sum(DP[1,]), 4534) + expect_equal(sum(DP[,5]), 235217) + multi <- grepl(",", vcf@fix[,5]) + expect_true(any(multi)) # It has multiallelics + expect_equal(sum(multi), 9) + unlink(out) + + expect_no_error( + madc2vcf_all(madc = alfalfa_madc, + botloci_file = alfalfa_botloci, + hap_seq_file = NULL, + n.cores = 1, + out_vcf = out, + verbose = FALSE) + ) + vcf <- read.vcfR(out, verbose = FALSE) + expect_s4_class(vcf, "vcfR") + expect_true(all(!is.na(vcf@fix[, "REF"]))) + expect_true(all(!is.na(vcf@fix[, "ALT"]))) + DP <- extract.gt(vcf, "DP", as.numeric = TRUE) + expect_equal(sum(DP[1,]), 4534) + expect_equal(sum(DP[,5]), 235217) + multi <- grepl(",", vcf@fix[,5]) + expect_true(any(multi)) # It has multiallelics + expect_equal(sum(multi), 9) + + # Test error when botloci_file is NULL + expect_error( + madc2vcf_all(madc = alfalfa_madc, + botloci_file = NULL, + hap_seq_file = alfalfa_microhapDB, + n.cores = 1, + out_vcf = out, + verbose = FALSE) + ) + + # Test that it also works when markers_info is provided together with botloci + madc2vcf_all(madc = alfalfa_madc, + botloci_file = alfalfa_botloci, + hap_seq_file = alfalfa_microhapDB, + multiallelic_SNP_dp_thr = 80, + multiallelic_SNP_sample_thr = 2, + n.cores = 1, + markers_info = alfalfa_markers_info, + out_vcf = out, + verbose = FALSE) + + vcf <- read.vcfR(out, verbose = FALSE) + expect_s4_class(vcf, "vcfR") + expect_true(all(!is.na(vcf@fix[, "REF"]))) + expect_true(all(!is.na(vcf@fix[, "ALT"]))) + DP <- extract.gt(vcf, "DP", as.numeric = TRUE) + expect_equal(sum(DP[1,]), 4534) + expect_equal(sum(DP[,5]), 234777) + multi <- grepl(",", vcf@fix[,5]) + expect_true(any(multi)) # It has multiallelics + expect_equal(sum(multi), 3) + + # Remove multiallelics + madc2vcf_all(madc = alfalfa_madc, + botloci_file = alfalfa_botloci, + hap_seq_file = alfalfa_microhapDB, + rm_multiallelic_SNP = TRUE, + n.cores = 1, + markers_info = alfalfa_markers_info, + out_vcf = out, + verbose = FALSE) + + vcf <- read.vcfR(out, verbose = FALSE) + expect_s4_class(vcf, "vcfR") + expect_true(all(!is.na(vcf@fix[, "REF"]))) + expect_true(all(!is.na(vcf@fix[, "ALT"]))) + DP <- extract.gt(vcf, "DP", as.numeric = TRUE) + expect_equal(sum(DP[1,]), 4534) + expect_equal(sum(DP[,5]), 233482) + multi <- grepl(",", vcf@fix[,5]) + expect_false(any(multi)) # It has multiallelics + expect_equal(sum(multi), 0) + }) + + test_that("ALFALFA — clean fixed allele ID MADC wrong CloneID", { + out <- tempfile(fileext = ".vcf") + # Test error when botloci provided but no matching CloneID between botloci and MADC + expect_error( + madc2vcf_all(madc = alfalfa_madc_wrongID, + botloci_file = alfalfa_botloci, + hap_seq_file = alfalfa_microhapDB, + n.cores = 1, + out_vcf = out, + verbose = FALSE), + regexp = "Check marker IDs in both MADC and botloci files. They should be the same." + ) + + # Test error when markers_info does not match MADC CloneIDs + expect_error( + madc2vcf_all(madc = alfalfa_madc_wrongID, + botloci_file = alfalfa_botloci, + hap_seq_file = alfalfa_microhapDB, + n.cores = 1, + markers_info = alfalfa_markers_info, + out_vcf = out, + verbose = FALSE), + regexp = "None of the markers_info CloneID( or BI_markerID)? values match the MADC CloneID column. Please make sure they use the same marker IDs." + ) + + # Test error when markers_info_ChromPos is provided but IDs still don't match botloci + madc2vcf_all(madc = alfalfa_madc_wrongID, + botloci_file = alfalfa_botloci, + hap_seq_file = alfalfa_microhapDB, + n.cores = 1, + markers_info = alfalfa_markers_info_ChromPos, + out_vcf = out, + verbose = FALSE) + + vcf <- read.vcfR(out, verbose = FALSE) + expect_s4_class(vcf, "vcfR") + lut <- read.csv(alfalfa_markers_info) + vcf_infos <- vcf@fix[,c(1:5)] + lut_infos <- lut[match(vcf@fix[,3],lut$BI_markerID),c(2:6)] + check <- cbind(vcf_infos,lut_infos) + check <- check[-which(is.na(check$Pos)),] + expect_equal(check$REF, check$Ref) + expect_equal(check$ALT, check$Alt) + expect_equal(as.numeric(check$POS), check$Pos) + DP <- extract.gt(vcf, "DP", as.numeric = TRUE) + expect_equal(sum(DP[1,]), 4534) + expect_equal(sum(DP[,5]), 235217) + multi <- grepl(",", vcf@fix[,5]) + + }) + + test_that("alfalfa lower case missing 3 ref and 1 alt fixed MADC", { + out <- tempfile(fileext = ".vcf") + madc2vcf_all(madc = alfalfa_lowercase, + botloci_file = alfalfa_botloci, + hap_seq_file = alfalfa_microhapDB, + n.cores = 1, + out_vcf = out, + verbose = FALSE) + + vcf <- read.vcfR(out, verbose = FALSE) + lut <- read.csv(alfalfa_markers_info) + vcf_infos <- vcf@fix[,c(1:5)] + lut_infos <- lut[match(vcf@fix[,3],lut$BI_markerID),c(2:6)] + check <- cbind(vcf_infos,lut_infos) + check <- check[-which(is.na(check$Pos)),] + expect_equal(check$REF, check$Ref) + expect_equal(check$ALT, check$Alt) + expect_equal(as.numeric(check$POS), check$Pos) + dp <- extract.gt(vcf, "DP", as.numeric = TRUE) + expect_equal(sum(dp[1,]), 4534) + expect_equal(sum(dp[,5]), 233719) + + madc2vcf_all(madc = alfalfa_lowercase, + botloci_file = alfalfa_botloci, + hap_seq_file = NULL, + n.cores = 1, + out_vcf = out, + verbose = FALSE) + + vcf <- read.vcfR(out, verbose = FALSE) + lut <- read.csv(alfalfa_markers_info) + vcf_infos <- vcf@fix[,c(1:5)] + lut_infos <- lut[match(vcf@fix[,3],lut$BI_markerID),c(2:6)] + check <- cbind(vcf_infos,lut_infos) + check <- check[-which(is.na(check$Pos)),] + expect_equal(check$REF, check$Ref) + expect_equal(check$ALT, check$Alt) + expect_equal(as.numeric(check$POS), check$Pos) + dp <- extract.gt(vcf, "DP", as.numeric = TRUE) + expect_equal(sum(dp[1,]), 4534) + expect_equal(sum(dp[,5]), 230415) + + madc2vcf_all(madc = alfalfa_lowercase, + botloci_file = alfalfa_botloci, + hap_seq_file = NULL, + n.cores = 1, + markers_info = alfalfa_markers_info, + out_vcf = out, + verbose = FALSE) + + vcf <- read.vcfR(out, verbose = FALSE) + lut <- read.csv(alfalfa_markers_info) + vcf_infos <- vcf@fix[,c(1:5)] + lut_infos <- lut[match(vcf@fix[,3],lut$BI_markerID),c(2:6)] + check <- cbind(vcf_infos,lut_infos) + check <- check[-which(is.na(check$Pos)),] + expect_equal(check$REF, check$Ref) + expect_equal(check$ALT, check$Alt) + expect_equal(as.numeric(check$POS), check$Pos) + dp <- extract.gt(vcf, "DP", as.numeric = TRUE) + expect_equal(sum(dp[1,]), 4534) + expect_equal(sum(dp[,5]), 230415) + + }) + + test_that("alfalfa IUPAC code", { + out <- tempfile(fileext = ".vcf") + # IUPAC codes cause a stop in madc2vcf_all + expect_error( + madc2vcf_all(madc = alfalfa_iupac, + botloci_file = alfalfa_botloci, + hap_seq_file = alfalfa_microhapDB, + n.cores = 1, + out_vcf = out, + verbose = FALSE), + regexp = "IUPAC \\(non-ATCG\\) codes found in AlleleSequence\\. This codes are not currently supported by BIGr/BIGapp\\. Run HapApp to replace them" + ) + }) + + test_that("potato indel madc chrompos=FALSE", { + out <- tempfile(fileext = ".vcf") + # Indels detected, no markers_info with Ref/Alt/Indel_pos -> error + expect_error( + madc2vcf_all(madc = potato_indel_madc, + botloci_file = potato_botloci, + hap_seq_file = potato_microhapDB, + n.cores = 1, + out_vcf = out, + verbose = FALSE), + regexp = "CloneID does not have the expected Chromosome_Position format. Please check your CloneIDs or provide a file with this information" + + ) + + madc2vcf_all(madc = potato_indel_madc, + botloci_file = potato_botloci, + hap_seq_file = potato_microhapDB, + n.cores = 1, + markers_info = potato_markers_info, + out_vcf = out, + verbose = FALSE) + + vcf <- read.vcfR(out, verbose = FALSE) + lut <- read.csv(potato_markers_info) + vcf_infos <- vcf@fix[,c(1:5)] + lut_infos <- lut[match(vcf@fix[,3],lut$BI_markerID),c(2:6)] + check <- cbind(vcf_infos,lut_infos) + check <- check[-which(is.na(check$Ref)),] + expect_equal(as.numeric(check$POS), check$Pos) + dp <- extract.gt(vcf, "DP", as.numeric = TRUE) + expect_equal(sum(dp[,10]), 226838) + expect_equal(sum(dp[3,]), 3996) + expect_equal(check$REF, check$Ref) + expect_equal(check$ALT, check$Alt) + + madc2vcf_all(madc = potato_indel_madc, + botloci_file = potato_botloci, + hap_seq_file = NULL, + n.cores = 1, + markers_info = potato_markers_info, + out_vcf = out, + verbose = FALSE) + + vcf <- read.vcfR(out, verbose = FALSE) + lut <- read.csv(potato_markers_info) + vcf_infos <- vcf@fix[,c(1:5)] + lut_infos <- lut[match(vcf@fix[,3],lut$BI_markerID),c(2:6)] + check <- cbind(vcf_infos,lut_infos) + check <- check[-which(is.na(check$Ref)),] + expect_equal(as.numeric(check$POS), check$Pos) + dp <- extract.gt(vcf, "DP", as.numeric = TRUE) + expect_equal(sum(dp[1,]), 3937) + expect_equal(sum(dp[,5]), 248571) + expect_equal(check$REF, check$Ref) + expect_equal(check$ALT, check$Alt) + + # ChromPos=FALSE and no markers_info -> error + expect_error( + madc2vcf_all(madc = potato_indel_madc, + botloci_file = potato_botloci, + hap_seq_file = NULL, + n.cores = 1, + out_vcf = out, + verbose = FALSE), + regexp = "CloneID does not have the expected Chromosome_Position format. Please check your CloneIDs or provide a file with this information" + ) + + }) + + test_that("potato indel chromposFALSE", { + out <- tempfile(fileext = ".vcf") + # Indels detected, no markers_info with Ref/Alt/Indel_pos -> error + expect_error( + madc2vcf_all(madc = potato_more_indels_chrompos_false, + botloci_file = potato_botloci, + hap_seq_file = potato_microhapDB, + n.cores = 1, + out_vcf = out, + verbose = FALSE), + regexp = "CloneID does not have the expected Chromosome_Position format. Please check your CloneIDs or provide a file with this information" + ) + + madc2vcf_all(madc = potato_more_indels_chrompos_false, + botloci_file = potato_botloci, + hap_seq_file = potato_microhapDB, + n.cores = 1, + markers_info = potato_markers_info, + out_vcf = out, + verbose = FALSE) + + vcf <- read.vcfR(out, verbose = FALSE) + lut <- read.csv(potato_markers_info) + vcf_infos <- vcf@fix[,c(1:5)] + lut_infos <- lut[match(vcf@fix[,3],lut$BI_markerID),c(2:6)] + check <- cbind(vcf_infos,lut_infos) + check <- check[-which(is.na(check$Ref)),] + expect_equal(as.numeric(check$POS), check$Pos) + dp <- extract.gt(vcf, "DP", as.numeric = TRUE) + expect_equal(sum(dp[1,]), 5397) + expect_equal(sum(dp[,5]), 215070) + expect_equal(check$REF, check$Ref) + expect_equal(check$ALT, check$Alt) + + madc2vcf_all(madc = potato_more_indels_chrompos_false, + botloci_file = potato_botloci, + hap_seq_file = NULL, + n.cores = 1, + markers_info = potato_markers_info, + out_vcf = out, + verbose = FALSE) + + vcf <- read.vcfR(out, verbose = FALSE) + lut <- read.csv(potato_markers_info) + vcf_infos <- vcf@fix[,c(1:5)] + lut_infos <- lut[match(vcf@fix[,3],lut$BI_markerID),c(2:6)] + check <- cbind(vcf_infos,lut_infos) + check <- check[-which(is.na(check$Ref)),] + expect_equal(as.numeric(check$POS), check$Pos) + dp <- extract.gt(vcf, "DP", as.numeric = TRUE) + expect_equal(sum(dp[1,]), 5397) + expect_equal(sum(dp[,5]), 215070) + expect_equal(check$REF, check$Ref) + expect_equal(check$ALT, check$Alt) + + # ChromPos=FALSE and no markers_info -> error + expect_error( + madc2vcf_all(madc = potato_more_indels_chrompos_false, + botloci_file = potato_botloci, + hap_seq_file = NULL, + n.cores = 1, + out_vcf = out, + verbose = FALSE), + regexp = "CloneID does not have the expected Chromosome_Position format. Please check your CloneIDs or provide a file with this information" + ) + }) + + test_that("potato lowercase", { + out <- tempfile(fileext = ".vcf") + madc2vcf_all(madc = potato_indel_lowercase, + botloci_file = potato_botloci, + hap_seq_file = potato_microhapDB, + n.cores = 1, + markers_info = potato_markers_info, + out_vcf = out, + verbose = FALSE) + + vcf <- read.vcfR(out, verbose = FALSE) + lut <- read.csv(potato_markers_info) + vcf_infos <- vcf@fix[,c(1:5)] + lut_infos <- lut[match(vcf@fix[,3],lut$BI_markerID),c(2:6)] + check <- cbind(vcf_infos,lut_infos) + check <- check[-which(is.na(check$Ref)),] + + expect_equal(as.numeric(check$POS), check$Pos) + dp <- extract.gt(vcf, "DP", as.numeric = TRUE) + expect_equal(sum(dp[,10]), 219742) + expect_equal(check$REF, check$Ref) + expect_equal(check$ALT, check$Alt) + + # markers_info without Ref/Alt/Indel_pos while indels present -> error + expect_error( + madc2vcf_all(madc = potato_indel_lowercase, + botloci_file = potato_botloci, + hap_seq_file = potato_microhapDB, + n.cores = 1, + markers_info = potato_markers_info_ChromPos, + out_vcf = out, + verbose = FALSE), + regexp = "Indels detected in MADC file. The markers_info file must contain 'Ref', 'Alt', and 'Indel_pos' columns." + ) + }) + + test_that("potato IUPAC", { + out <- tempfile(fileext = ".vcf") + + expect_error( + madc2vcf_all(madc = potato_indel_iupac, + botloci_file = potato_botloci, + hap_seq_file = potato_microhapDB, + n.cores = 1, + markers_info = potato_markers_info, + out_vcf = out, + verbose = FALSE), + regexp = "IUPAC \\(non-ATCG\\) codes found in AlleleSequence. This codes are not currently supported by BIGr/BIGapp. Run HapApp to replace them" + ) + }) + + test_that("alfalfa raw MADC format (7-row header)", { + out <- tempfile(fileext = ".vcf") + # Raw format fails FixAlleleIDs check -> madc2vcf_all stops + expect_error( + madc2vcf_all(madc = alfalfa_madc_raw, + botloci_file = alfalfa_botloci, + hap_seq_file = alfalfa_microhapDB, + n.cores = 1, + out_vcf = out, + verbose = FALSE), regexp = "MADC not processed by HapApp" + ) + }) +}) diff --git a/tests/testthat/test-madc2vcf_multi.R b/tests/testthat/test-madc2vcf_multi.R new file mode 100644 index 0000000..adca090 --- /dev/null +++ b/tests/testthat/test-madc2vcf_multi.R @@ -0,0 +1,150 @@ +context("MADC to VCF via polyRAD") + +# ======================================================================= +# Using Breeding-Insight/BIGapp-PanelHub test files +# ======================================================================= + +test_that("madc2vcf_multi — alfalfa (BIGapp-PanelHub)", { + + skip_if_not_installed("polyRAD") + skip_if_not_installed("VariantAnnotation") + + github_path <- "https://raw.githubusercontent.com/Breeding-Insight/BIGapp-PanelHub/refs/heads/long_seq/" + + # External alfalfa test files + alfalfa_madc <- paste0(github_path, "test_madcs/alfalfa_madc.csv") + alfalfa_madc_wrongID <- paste0(github_path, "test_madcs/alfalfa_madc_wrongID.csv") + alfalfa_madc_raw <- paste0(github_path, "test_madcs/alfalfa_madc_raw.csv") # raw DArT format (7-row header) + alfalfa_iupac <- paste0(github_path, "test_madcs/alfalfa_IUPAC.csv") + alfalfa_lowercase <- paste0(github_path, "test_madcs/alfalfa_lowercase.csv") + alfalfa_botloci <- paste0(github_path, "alfalfa/20201030-BI-Alfalfa_SNPs_DArTag-probe-design_f180bp.botloci") # botloci for alfalfa + alfalfa_markers_info <- paste0(github_path, "alfalfa/20201030-BI-Alfalfa_SNPs_DArTag-probe-design_snpID_lut.csv") # markers_info: CloneID/BI_markerID, Chr, Pos, Ref, Alt + alfalfa_markers_info_ChromPos <- paste0(github_path, "test_madcs/alfalfa_marker_info_ChromPos.csv") # markers_info: CloneID/BI_markerID, Chr, Pos + alfalfa_microhapDB <- paste0(github_path, "alfalfa/alfalfa_allele_db_v001.fa") + + # External potato test files + potato_indel_madc <- paste0(github_path, "test_madcs/potato_indel_madc.csv") + potato_indel_iupac <- paste0(github_path, "test_madcs/potato_indel_IUPAC.csv") + potato_indel_lowercase <- paste0(github_path, "test_madcs/potato_indel_lowercase.csv") + potato_more_indels_chrompos_false <- paste0(github_path, "test_madcs/potato_more_indels_madc_ChromPosFALSE.csv") + potato_botloci <- paste0(github_path, "potato/potato_dartag_v2_3915markers_rm7dupTags_6traitMarkers_f150bp_ref_alt.botloci") + potato_markers_info <- paste0(github_path, "potato/potato_dartag_v2_3915markers_rm7dupTags_6traitMarkers_rm1dup_snpID_lut.csv") # CloneID/BI_markerID, Chr, Pos, Ref, Alt + potato_markers_info_ChromPos <- paste0(github_path, "test_madcs/potato_marker_info_chrompos.csv") # markers_info: CloneID/BI_markerID, Chr, Pos + potato_microhapDB <- paste0(github_path, "potato/potato_allele_db_v001.fa") + + skip_if_offline("raw.githubusercontent.com") + + out <- tempfile(fileext = ".vcf") + # Fixed allele ID format + expect_no_error( + madc2vcf_multi( + madc_file = alfalfa_madc, + botloci_file = alfalfa_botloci, + outfile = out, + ploidy = 4L, + verbose = TRUE + ) + ) + + vcf <- read.vcfR(out, verbose = FALSE) + expect_s4_class(vcf, "vcfR") + expect_equal(sum(grepl(",", vcf@fix[,5])), 281) + GT <- extract.gt(vcf) + expect_equal(GT[3,5],"0/0/0/3") + + # Don't allow raw MADC + out <- tempfile(fileext = ".vcf") + expect_error( + madc2vcf_multi( + madc_file = alfalfa_madc_raw, + botloci_file = alfalfa_botloci, + outfile = out, + ploidy = 4L, + verbose = FALSE + ), regexp = "The MADC file does not have fixed AlleleIDs. Please process the MADC file through HapApp before using this function." + ) + + out <- tempfile(fileext = ".vcf") + expect_no_error( + madc2vcf_multi( + madc_file = alfalfa_madc, + botloci_file = alfalfa_botloci, + outfile = out, + ploidy = 4L, + verbose = TRUE + ) + ) + + # Wrong IDs + out <- tempfile(fileext = ".vcf") + expect_error( + madc2vcf_multi( + madc_file = alfalfa_madc_wrongID, + botloci_file = alfalfa_botloci, + outfile = out, + ploidy = 4L, + verbose = TRUE + ), regexp = "Check marker IDs in both MADC and botloci files. They should be the same." + ) + + out <- tempfile(fileext = ".vcf") + expect_no_error( + madc2vcf_multi( + madc_file = alfalfa_madc_wrongID, + botloci_file = alfalfa_botloci, + outfile = out, + markers_info = alfalfa_markers_info_ChromPos, + ploidy = 4L, + verbose = TRUE + ) + ) + + vcf <- read.vcfR(out, verbose = FALSE) + expect_s4_class(vcf, "vcfR") + expect_equal(sum(grepl(",", vcf@fix[,5])), 281) + GT <- extract.gt(vcf) + expect_equal(GT[3,5],"0/0/0/3") + + ### Avoid IUPAC codes + out <- tempfile(fileext = ".vcf") + expect_error( + madc2vcf_multi( + madc_file = alfalfa_iupac, + botloci_file = alfalfa_botloci, + outfile = out, + ploidy = 4L, + verbose = TRUE + ), regexp = "MADC Allele Sequences contain IUPAC \\(non-ATCG\\) codes. Please run HapApp to clean MADC file before using this function." + ) + + out <- tempfile(fileext = ".vcf") + expect_error( + madc2vcf_multi( + madc_file = alfalfa_lowercase, + botloci_file = alfalfa_botloci, + outfile = out, + ploidy = 4L, + verbose = TRUE + ), regexp = "Not all Ref sequences have a corresponding Alt or vice versa. Please provide a complete MADC file before using this function." + ) + + out <- tempfile(fileext = ".vcf") + expect_no_error( + madc2vcf_multi( + madc_file = potato_indel_madc, + botloci_file = potato_botloci, + outfile = out, + markers_info = potato_markers_info_ChromPos, + ploidy = 4L, + verbose = TRUE + ) + ) + + vcf <- read.vcfR(out, verbose = FALSE) + expect_s4_class(vcf, "vcfR") + expect_equal(sum(grepl(",", vcf@fix[,5])), 277) + GT <- extract.gt(vcf) + expect_equal(GT[3,5],"0/1/1/6") + +}) + diff --git a/tests/testthat/test-madc2vcf_targets.R b/tests/testthat/test-madc2vcf_targets.R index 524b687..9ffc72e 100644 --- a/tests/testthat/test-madc2vcf_targets.R +++ b/tests/testthat/test-madc2vcf_targets.R @@ -69,33 +69,10 @@ test_that("bottom strand markers have correct REF/ALT", { # Get results from both functions suppressWarnings( madc2vcf_targets(madc_file = madc_file, output.file = temp_targets, - get_REF_ALT = TRUE, botloci_file = bot_file) - ) - - suppressWarnings( - madc2vcf_all(madc = madc_file, botloci_file = bot_file, - hap_seq_file = NULL, out_vcf = temp_all, verbose = FALSE) + get_REF_ALT = TRUE, botloci_file = bot_file) ) vcf_targets <- read.vcfR(temp_targets, verbose = FALSE) - vcf_all <- read.vcfR(temp_all, verbose = FALSE) - - # Find common markers between both outputs - common_markers <- intersect(vcf_targets@fix[,"ID"], vcf_all@fix[,"ID"]) - - if(length(common_markers) > 0) { - # Compare REF/ALT for common markers - targets_subset <- vcf_targets@fix[vcf_targets@fix[,"ID"] %in% common_markers,] - all_subset <- vcf_all@fix[vcf_all@fix[,"ID"] %in% common_markers,] - - # Sort both by ID for comparison - targets_subset <- targets_subset[order(targets_subset[,"ID"]),] - all_subset <- all_subset[order(all_subset[,"ID"]),] - - # Check that REF/ALT match between the two functions - expect_equal(targets_subset[,"REF"], all_subset[,"REF"]) - expect_equal(targets_subset[,"ALT"], all_subset[,"ALT"]) - } # Validate that all REF/ALT are valid nucleotides expect_true(all(vcf_targets@fix[,"REF"] %in% c("A", "T", "G", "C", "."))) @@ -107,5 +84,679 @@ test_that("bottom strand markers have correct REF/ALT", { expect_true(all(vcf_targets@fix[valid_snps,"REF"] != vcf_targets@fix[valid_snps,"ALT"])) } - rm(vcf_targets, vcf_all, temp_targets, temp_all) + rm(vcf_targets, temp_targets) +}) + +test_that("madc2vcf_targets preserves original sample names", { + madc_file <- system.file("example_MADC_FixedAlleleID.csv", package="BIGr") + temp_madc <- tempfile(fileext = ".csv") + temp_vcf <- tempfile(fileext = ".vcf") + + report <- read.csv(madc_file, check.names = FALSE) + colnames(report)[4:6] <- c("1A", "Sample-1", "sample 2") + write.csv(report, temp_madc, row.names = FALSE, quote = TRUE) + + suppressWarnings( + madc2vcf_targets(madc_file = temp_madc, output.file = temp_vcf, get_REF_ALT = FALSE) + ) + + vcf <- read.vcfR(temp_vcf, verbose = FALSE) + + expect_equal(colnames(vcf@gt)[2:4], c("1A", "Sample-1", "sample 2")) +}) + +test_that("madc2vcf_targets surfaces missing-column validation error without crashing", { + madc_file <- system.file("example_MADC_FixedAlleleID.csv", package="BIGr") + temp_madc <- tempfile(fileext = ".csv") + + report <- read.csv(madc_file, check.names = FALSE) + report$AlleleSequence <- NULL + write.csv(report, temp_madc, row.names = FALSE) + + expect_error( + madc2vcf_targets(madc_file = temp_madc, + output.file = tempfile(fileext = ".vcf"), + get_REF_ALT = FALSE, + verbose = FALSE), + "One or more required columns missing" + ) +}) + + +# ======================================================================= +# Using Breeding-Insight/BIGapp-PanelHub test files +# ======================================================================= + +test_that("simu alfalfa",{ + + github_path <- "https://raw.githubusercontent.com/Breeding-Insight/BIGapp-PanelHub/refs/heads/long_seq/" + + # External alfalfa test files + alfalfa_madc <- paste0(github_path, "test_madcs/alfalfa_madc.csv") + alfalfa_madc_wrongID <- paste0(github_path, "test_madcs/alfalfa_madc_wrongID.csv") + alfalfa_madc_raw <- paste0(github_path, "test_madcs/alfalfa_madc_raw.csv") # raw DArT format (7-row header) + alfalfa_iupac <- paste0(github_path, "test_madcs/alfalfa_IUPAC.csv") + alfalfa_lowercase <- paste0(github_path, "test_madcs/alfalfa_lowercase.csv") + alfalfa_botloci <- paste0(github_path, "alfalfa/20201030-BI-Alfalfa_SNPs_DArTag-probe-design_f180bp.botloci") # botloci for alfalfa + alfalfa_markers_info <- paste0(github_path, "alfalfa/20201030-BI-Alfalfa_SNPs_DArTag-probe-design_snpID_lut.csv") # markers_info: CloneID/BI_markerID, Chr, Pos, Ref, Alt + alfalfa_markers_info_ChromPos <- paste0(github_path, "test_madcs/alfalfa_marker_info_ChromPos.csv") # markers_info: CloneID/BI_markerID, Chr, Pos + + + # External potato test files + potato_indel_madc <- paste0(github_path, "test_madcs/potato_indel_madc.csv") + potato_indel_iupac <- paste0(github_path, "test_madcs/potato_indel_IUPAC.csv") + potato_indel_lowercase <- paste0(github_path, "test_madcs/potato_indel_lowercase.csv") + potato_more_indels_chrompos_false <- paste0(github_path, "test_madcs/potato_more_indels_madc_ChromPosFALSE.csv") + potato_botloci <- paste0(github_path, "potato/potato_dartag_v2_3915markers_rm7dupTags_6traitMarkers_f150bp_ref_alt.botloci") + potato_markers_info <- paste0(github_path, "potato/potato_dartag_v2_3915markers_rm7dupTags_6traitMarkers_rm1dup_snpID_lut.csv") # CloneID/BI_markerID, Chr, Pos, Ref, Alt + potato_markers_info_ChromPos <- paste0(github_path, "test_madcs/potato_marker_info_chrompos.csv") # markers_info: CloneID/BI_markerID, Chr, Pos + + skip_if_offline("raw.githubusercontent.com") + + test_that("ALFALFA — clean fixed allele ID MADC", { + out <- tempfile(fileext = ".vcf") + expect_no_error( + madc2vcf_targets(madc_file = alfalfa_madc, + output.file = out, + get_REF_ALT = FALSE, + verbose = FALSE) + ) + vcf <- read.vcfR(out, verbose = FALSE) + expect_s4_class(vcf, "vcfR") + expect_true(all(is.na(vcf@fix[, "REF"]))) + expect_true(all(is.na(vcf@fix[, "ALT"]))) + DP <- extract.gt(vcf, "DP", as.numeric = TRUE) + expect_equal(sum(DP[1,]), 4139) + expect_equal(sum(DP[,5]), 42869) + unlink(out) + + expect_no_error( + madc2vcf_targets(madc_file = alfalfa_madc, + output.file = out, + get_REF_ALT = FALSE, + collapse_matches_counts = TRUE, + verbose = TRUE) + ) + vcf <- read.vcfR(out, verbose = FALSE) + expect_s4_class(vcf, "vcfR") + expect_true(all(is.na(vcf@fix[, "REF"]))) + expect_true(all(is.na(vcf@fix[, "ALT"]))) + DP <- extract.gt(vcf, "DP", as.numeric = TRUE) + expect_equal(sum(DP[1,]), 4534) + expect_equal(sum(DP[,5]), 56547) + + # Test error when get_REF_ALT = TRUE but no markers_info or botloci provided to extract REF/ALT + expect_error( + madc2vcf_targets(madc_file = alfalfa_madc, + output.file = out, + get_REF_ALT = TRUE, + verbose = FALSE), + regexp = "get_REF_ALT = TRUE but no markers_info file with Ref and Alt columns was provided neither a botloci_file to extrat REF/ALT from probe sequences. Please provide one of the these files or set get_REF_ALT to FALSE." + ) + + # Test that it works when get_REF_ALT = TRUE and botloci is provided (REF/ALT recovered from probe sequences) + madc2vcf_targets(madc_file = alfalfa_madc, + output.file = out, + get_REF_ALT = TRUE, + botloci_file = alfalfa_botloci, + verbose = FALSE) + + vcf <- read.vcfR(out, verbose = FALSE) + lut <- read.csv(alfalfa_markers_info) + vcf_infos <- vcf@fix[,c(1:5)] + lut_infos <- lut[match(vcf@fix[,3],lut$BI_markerID),c(2:6)] + check <- cbind(vcf_infos,lut_infos) + expect_equal(check$REF, check$Ref) + expect_equal(check$ALT, check$Alt) + expect_equal(as.numeric(check$POS), check$Pos) + dp <- extract.gt(vcf, "DP", as.numeric = TRUE) + expect_equal(sum(dp[,10]), 43691) + + # Test that it also works when markers_info is provided together with botloci (should give same result as above but just to confirm no interference between the two) + madc2vcf_targets(madc_file = alfalfa_madc, + output.file = out, + get_REF_ALT = TRUE, + botloci_file = alfalfa_botloci, + markers_info = alfalfa_markers_info, + verbose = FALSE) + + vcf <- read.vcfR(out, verbose = FALSE) + lut <- read.csv(alfalfa_markers_info) + vcf_infos <- vcf@fix[,c(1:5)] + lut_infos <- lut[match(vcf@fix[,3],lut$BI_markerID),c(2:6)] + check <- cbind(vcf_infos,lut_infos) + expect_equal(check$REF, check$Ref) + expect_equal(check$ALT, check$Alt) + expect_equal(as.numeric(check$POS), check$Pos) + dp <- extract.gt(vcf, "DP", as.numeric = TRUE) + expect_equal(sum(dp[,10]), 43691) + + }) + + test_that("ALFALFA — clean fixed allele ID MADC wrong CloneID", { + out <- tempfile(fileext = ".vcf") + # Test error when botloci provided but no matching CloneID between botloci and MADC (even after trying to fix potential padding mismatch with ChromPos info in markers_info) + expect_error( + madc2vcf_targets(madc_file = alfalfa_madc_wrongID, + output.file = out, + get_REF_ALT = TRUE, + botloci_file = alfalfa_botloci, + verbose = FALSE), + regexp = "Check marker IDs in both MADC and botloci files. They should be the same." + ) + + # Test error when no matching CloneID between markers_info and MADC to fix the botloci mismatch issue (even if botloci file is not used, the function should still check that the provided markers_info can match with MADC CloneIDs to be able to use the ChromPos info to fix potential padding mismatch) + expect_error( + madc2vcf_targets(madc_file = alfalfa_madc_wrongID, + output.file = out, + get_REF_ALT = TRUE, + botloci_file = alfalfa_botloci, + markers_info = alfalfa_markers_info, + verbose = FALSE), + "None of the MADC CloneID could be found in the markers_info CloneID, Marker_ID or BI_markerID. Please make sure they match." + ) + + # Test that it works when the function can find a matching ID in markers_info to fix the botloci mismatch issue + # (even if botloci file is not used, the function should still be able to use the ChromPos info in markers_info to + # fix potential padding mismatch and find matching IDs between MADC and botloci) + madc2vcf_targets(madc_file = alfalfa_madc_wrongID, + output.file = out, + get_REF_ALT = TRUE, + botloci_file = alfalfa_botloci, + markers_info = alfalfa_markers_info_ChromPos, + verbose = FALSE) + + vcf <- read.vcfR(out, verbose = FALSE) + lut <- read.csv(alfalfa_markers_info) + vcf_infos <- vcf@fix[,c(1:5)] + lut_infos <- lut[match(vcf@fix[,3],lut$BI_markerID),c(2:6)] + check <- cbind(vcf_infos,lut_infos) + expect_equal(check$REF, check$Ref) + expect_equal(check$ALT, check$Alt) + expect_equal(as.numeric(check$POS), check$Pos) + dp <- extract.gt(vcf, "DP", as.numeric = TRUE) + expect_equal(sum(dp[,10]), 43691) + }) + + test_that("alfalfa lower case fixed MADC", { + out <- tempfile(fileext = ".vcf") + expect_warning( + madc2vcf_targets(madc_file = alfalfa_lowercase, + output.file = out, + get_REF_ALT = TRUE, + botloci_file = alfalfa_botloci, + verbose = FALSE) + ) + + vcf <- read.vcfR(out, verbose = FALSE) + lut <- read.csv(alfalfa_markers_info) + vcf_infos <- vcf@fix[,c(1:5)] + lut_infos <- lut[match(vcf@fix[,3],lut$BI_markerID),c(2:6)] + check <- cbind(vcf_infos,lut_infos) + expect_equal(check$REF, check$Ref) + expect_equal(check$ALT, check$Alt) + expect_equal(as.numeric(check$POS), check$Pos) + dp <- extract.gt(vcf, "DP", as.numeric = TRUE) + expect_equal(sum(dp[,10]), 43017) + + expect_warning( + madc2vcf_targets(madc_file = alfalfa_lowercase, + output.file = out, + get_REF_ALT = TRUE, + botloci_file = alfalfa_botloci, + markers_info = alfalfa_markers_info, + verbose = FALSE) + ) + + vcf <- read.vcfR(out, verbose = FALSE) + lut <- read.csv(alfalfa_markers_info) + vcf_infos <- vcf@fix[,c(1:5)] + lut_infos <- lut[match(vcf@fix[,3],lut$BI_markerID),c(2:6)] + check <- cbind(vcf_infos,lut_infos) + expect_equal(check$REF, check$Ref) + expect_equal(check$ALT, check$Alt) + expect_equal(as.numeric(check$POS), check$Pos) + dp <- extract.gt(vcf, "DP", as.numeric = TRUE) + expect_equal(sum(dp[,10]), 43017) + + expect_warning( + madc2vcf_targets(madc_file = alfalfa_lowercase, + output.file = out, + get_REF_ALT = FALSE, + botloci_file = alfalfa_botloci, + markers_info = alfalfa_markers_info, + verbose = FALSE) + ) + vcf <- read.vcfR(out, verbose = FALSE) + lut <- read.csv(alfalfa_markers_info) + vcf_infos <- vcf@fix[,c(1:5)] + lut_infos <- lut[match(vcf@fix[,3],lut$BI_markerID),c(2:6)] + check <- cbind(vcf_infos,lut_infos) + expect_equal(as.numeric(check$POS), check$Pos) + dp <- extract.gt(vcf, "DP", as.numeric = TRUE) + expect_equal(sum(dp[,10]), 43017) + }) + + test_that("alfalfa IUPAC code", { + out <- tempfile(fileext = ".vcf") + expect_error( + madc2vcf_targets(madc_file = alfalfa_iupac, + output.file = out, + get_REF_ALT = TRUE, + botloci_file = alfalfa_botloci, + verbose = FALSE) + ) + + madc2vcf_targets(madc_file = alfalfa_iupac, + output.file = out, + get_REF_ALT = TRUE, + markers_info = alfalfa_markers_info, + verbose = FALSE) + + vcf <- read.vcfR(out, verbose = FALSE) + lut <- read.csv(alfalfa_markers_info) + vcf_infos <- vcf@fix[,c(1:5)] + lut_infos <- lut[match(vcf@fix[,3],lut$BI_markerID),c(2:6)] + check <- cbind(vcf_infos,lut_infos) + expect_equal(as.numeric(check$POS), check$Pos) + dp <- extract.gt(vcf, "DP", as.numeric = TRUE) + expect_equal(sum(dp[,10]), 43691) + expect_equal(check$REF, check$Ref) + expect_equal(check$ALT, check$Alt) + + madc2vcf_targets(madc_file = alfalfa_iupac, + output.file = out, + get_REF_ALT = TRUE, + markers_info = alfalfa_markers_info, + collapse_matches_counts = TRUE, + verbose = FALSE) + + vcf <- read.vcfR(out, verbose = FALSE) + lut <- read.csv(alfalfa_markers_info) + vcf_infos <- vcf@fix[,c(1:5)] + lut_infos <- lut[match(vcf@fix[,3],lut$BI_markerID),c(2:6)] + check <- cbind(vcf_infos,lut_infos) + expect_equal(as.numeric(check$POS), check$Pos) + dp <- extract.gt(vcf, "DP", as.numeric = TRUE) + expect_equal(sum(dp[1,]), 4534) + expect_equal(sum(dp[,5]), 56547) + expect_equal(check$REF, check$Ref) + expect_equal(check$ALT, check$Alt) + + madc2vcf_targets(madc_file = alfalfa_iupac, + output.file = out, + get_REF_ALT = FALSE, + botloci_file = alfalfa_botloci, + verbose = FALSE) + + vcf <- read.vcfR(out, verbose = FALSE) + lut <- read.csv(alfalfa_markers_info) + vcf_infos <- vcf@fix[,c(1:5)] + lut_infos <- lut[match(vcf@fix[,3],lut$BI_markerID),c(2:6)] + check <- cbind(vcf_infos,lut_infos) + expect_equal(as.numeric(check$POS), check$Pos) + dp <- extract.gt(vcf, "DP", as.numeric = TRUE) + expect_equal(sum(dp[,10]), 43691) + + }) + + test_that("potato indel madc chrompos=FALSE", { + out <- tempfile(fileext = ".vcf") + expect_error( + madc2vcf_targets(madc_file = potato_indel_madc, + output.file = out, + get_REF_ALT = TRUE, + botloci_file = potato_botloci, + verbose = FALSE) + ) + + madc2vcf_targets(madc_file = potato_indel_madc, + output.file = out, + get_REF_ALT = TRUE, + markers_info = potato_markers_info, + verbose = FALSE) + + vcf <- read.vcfR(out, verbose = FALSE) + lut <- read.csv(potato_markers_info) + vcf_infos <- vcf@fix[,c(1:5)] + lut_infos <- lut[match(vcf@fix[,3],lut$BI_markerID),c(2:6)] + check <- cbind(vcf_infos,lut_infos) + expect_equal(as.numeric(check$POS), check$Pos) + dp <- extract.gt(vcf, "DP", as.numeric = TRUE) + expect_equal(sum(dp[,10]), 41656) + expect_equal(check$REF, check$Ref) + expect_equal(check$ALT, check$Alt) + + madc2vcf_targets(madc_file = potato_indel_madc, + output.file = out, + get_REF_ALT = TRUE, + markers_info = potato_markers_info, + collapse_matches_counts = TRUE, + verbose = FALSE) + + vcf <- read.vcfR(out, verbose = FALSE) + lut <- read.csv(potato_markers_info) + vcf_infos <- vcf@fix[,c(1:5)] + lut_infos <- lut[match(vcf@fix[,3],lut$BI_markerID),c(2:6)] + check <- cbind(vcf_infos,lut_infos) + expect_equal(as.numeric(check$POS), check$Pos) + dp <- extract.gt(vcf, "DP", as.numeric = TRUE) + expect_equal(sum(dp[1,]), 5163) + expect_equal(sum(dp[,5]), 58927) + expect_equal(check$REF, check$Ref) + expect_equal(check$ALT, check$Alt) + + expect_error( + madc2vcf_targets(madc_file = potato_indel_madc, + output.file = out, + get_REF_ALT = FALSE, + botloci_file = potato_botloci, + verbose = FALSE) + ) + + madc2vcf_targets(madc_file = potato_indel_madc, + output.file = out, + get_REF_ALT = FALSE, + markers_info = potato_markers_info, + verbose = FALSE) + + vcf <- read.vcfR(out, verbose = FALSE) + lut <- read.csv(potato_markers_info) + vcf_infos <- vcf@fix[,c(1:5)] + lut_infos <- lut[match(vcf@fix[,3],lut$BI_markerID),c(2:6)] + check <- cbind(vcf_infos,lut_infos) + expect_equal(as.numeric(check$POS), check$Pos) + dp <- extract.gt(vcf, "DP", as.numeric = TRUE) + expect_equal(sum(dp[,10]), 41656) + }) + + test_that("potato indel chromposFALSE", { + out <- tempfile(fileext = ".vcf") + expect_error( + madc2vcf_targets(madc_file = potato_more_indels_chrompos_false, + output.file = out, + get_REF_ALT = TRUE, + botloci_file = potato_botloci, + verbose = FALSE) + ) + + madc2vcf_targets(madc_file = potato_more_indels_chrompos_false, + output.file = out, + get_REF_ALT = TRUE, + markers_info = potato_markers_info, + verbose = FALSE) + + vcf <- read.vcfR(out, verbose = FALSE) + lut <- read.csv(potato_markers_info) + vcf_infos <- vcf@fix[,c(1:5)] + lut_infos <- lut[match(vcf@fix[,3],lut$BI_markerID),c(2:6)] + check <- cbind(vcf_infos,lut_infos) + expect_equal(as.numeric(check$POS), check$Pos) + dp <- extract.gt(vcf, "DP", as.numeric = TRUE) + expect_equal(sum(dp[,10]), 41755) + expect_equal(check$REF, check$Ref) + expect_equal(check$ALT, check$Alt) + + madc2vcf_targets(madc_file = potato_more_indels_chrompos_false, + output.file = out, + get_REF_ALT = TRUE, + markers_info = potato_markers_info, + collapse_matches_counts = TRUE, + verbose = FALSE) + + vcf <- read.vcfR(out, verbose = FALSE) + lut <- read.csv(potato_markers_info) + vcf_infos <- vcf@fix[,c(1:5)] + lut_infos <- lut[match(vcf@fix[,3],lut$BI_markerID),c(2:6)] + check <- cbind(vcf_infos,lut_infos) + expect_equal(as.numeric(check$POS), check$Pos) + dp <- extract.gt(vcf, "DP", as.numeric = TRUE) + expect_equal(sum(dp[1,]), 6301) + expect_equal(sum(dp[,5]), 53613) + expect_equal(check$REF, check$Ref) + expect_equal(check$ALT, check$Alt) + + expect_error( + madc2vcf_targets(madc_file = potato_more_indels_chrompos_false, + output.file = out, + get_REF_ALT = FALSE, + botloci_file = potato_botloci, + verbose = FALSE) + ) + + madc2vcf_targets(madc_file = potato_more_indels_chrompos_false, + output.file = out, + get_REF_ALT = FALSE, + markers_info = potato_markers_info, + verbose = FALSE) + + vcf <- read.vcfR(out, verbose = FALSE) + lut <- read.csv(potato_markers_info) + vcf_infos <- vcf@fix[,c(1:5)] + lut_infos <- lut[match(vcf@fix[,3],lut$BI_markerID),c(2:6)] + check <- cbind(vcf_infos,lut_infos) + expect_equal(as.numeric(check$POS), check$Pos) + dp <- extract.gt(vcf, "DP", as.numeric = TRUE) + expect_equal(sum(dp[,10]), 41755) + }) + + test_that("potato lowercase", { + out <- tempfile(fileext = ".vcf") + madc2vcf_targets(madc_file = potato_indel_lowercase, + output.file = out, + get_REF_ALT = TRUE, + markers_info = potato_markers_info, + verbose = FALSE) + + vcf <- read.vcfR(out, verbose = FALSE) + lut <- read.csv(potato_markers_info) + vcf_infos <- vcf@fix[,c(1:5)] + lut_infos <- lut[match(vcf@fix[,3],lut$BI_markerID),c(2:6)] + check <- cbind(vcf_infos,lut_infos) + expect_equal(as.numeric(check$POS), check$Pos) + dp <- extract.gt(vcf, "DP", as.numeric = TRUE) + expect_equal(sum(dp[,10]), 41755) + expect_equal(check$REF, check$Ref) + expect_equal(check$ALT, check$Alt) + + expect_error( + madc2vcf_targets(madc_file = potato_indel_lowercase, + output.file = out, + get_REF_ALT = TRUE, + markers_info = potato_markers_info_ChromPos, + botloci_file = potato_botloci, + verbose = FALSE) + ) + + madc2vcf_targets(madc_file = potato_indel_lowercase, + output.file = out, + get_REF_ALT = TRUE, + markers_info = potato_markers_info, + botloci_file = potato_botloci, + verbose = FALSE) + + vcf <- read.vcfR(out, verbose = FALSE) + lut <- read.csv(potato_markers_info) + vcf_infos <- vcf@fix[,c(1:5)] + lut_infos <- lut[match(vcf@fix[,3],lut$BI_markerID),c(2:6)] + check <- cbind(vcf_infos,lut_infos) + expect_equal(as.numeric(check$POS), check$Pos) + dp <- extract.gt(vcf, "DP", as.numeric = TRUE) + expect_equal(sum(dp[,10]), 41755) + expect_equal(check$REF, check$Ref) + expect_equal(check$ALT, check$Alt) + + madc2vcf_targets(madc_file = potato_indel_lowercase, + output.file = out, + get_REF_ALT = TRUE, + markers_info = potato_markers_info, + collapse_matches_counts = TRUE, + verbose = FALSE) + + vcf <- read.vcfR(out, verbose = FALSE) + lut <- read.csv(potato_markers_info) + vcf_infos <- vcf@fix[,c(1:5)] + lut_infos <- lut[match(vcf@fix[,3],lut$BI_markerID),c(2:6)] + check <- cbind(vcf_infos,lut_infos) + expect_equal(as.numeric(check$POS), check$Pos) + dp <- extract.gt(vcf, "DP", as.numeric = TRUE) + expect_equal(sum(dp[1,]), 6301) + expect_equal(sum(dp[,5]), 53613) + expect_equal(check$REF, check$Ref) + expect_equal(check$ALT, check$Alt) + + madc2vcf_targets(madc_file = potato_indel_lowercase, + output.file = out, + get_REF_ALT = FALSE, + markers_info = potato_markers_info, + verbose = FALSE) + + vcf <- read.vcfR(out, verbose = FALSE) + lut <- read.csv(potato_markers_info) + vcf_infos <- vcf@fix[,c(1:5)] + lut_infos <- lut[match(vcf@fix[,3],lut$BI_markerID),c(2:6)] + check <- cbind(vcf_infos,lut_infos) + expect_equal(as.numeric(check$POS), check$Pos) + dp <- extract.gt(vcf, "DP", as.numeric = TRUE) + expect_equal(sum(dp[,10]), 41755) + }) + + + test_that("potato IUPAC", { + out <- tempfile(fileext = ".vcf") + madc2vcf_targets(madc_file = potato_indel_iupac, + output.file = out, + get_REF_ALT = TRUE, + markers_info = potato_markers_info, + verbose = FALSE) + + vcf <- read.vcfR(out, verbose = FALSE) + lut <- read.csv(potato_markers_info) + vcf_infos <- vcf@fix[,c(1:5)] + lut_infos <- lut[match(vcf@fix[,3],lut$BI_markerID),c(2:6)] + check <- cbind(vcf_infos,lut_infos) + expect_equal(as.numeric(check$POS), check$Pos) + dp <- extract.gt(vcf, "DP", as.numeric = TRUE) + expect_equal(sum(dp[,10]), 41755) + expect_equal(check$REF, check$Ref) + expect_equal(check$ALT, check$Alt) + + madc2vcf_targets(madc_file = potato_indel_iupac, + output.file = out, + get_REF_ALT = TRUE, + markers_info = potato_markers_info, + collapse_matches_counts = TRUE, + verbose = FALSE) + + vcf <- read.vcfR(out, verbose = FALSE) + lut <- read.csv(potato_markers_info) + vcf_infos <- vcf@fix[,c(1:5)] + lut_infos <- lut[match(vcf@fix[,3],lut$BI_markerID),c(2:6)] + check <- cbind(vcf_infos,lut_infos) + expect_equal(as.numeric(check$POS), check$Pos) + dp <- extract.gt(vcf, "DP", as.numeric = TRUE) + expect_equal(sum(dp[1,]), 6301) + expect_equal(sum(dp[,5]), 53613) + expect_equal(check$REF, check$Ref) + expect_equal(check$ALT, check$Alt) + + madc2vcf_targets(madc_file = potato_indel_iupac, + output.file = out, + get_REF_ALT = FALSE, + markers_info = potato_markers_info, + verbose = FALSE) + + vcf <- read.vcfR(out, verbose = FALSE) + lut <- read.csv(potato_markers_info) + vcf_infos <- vcf@fix[,c(1:5)] + lut_infos <- lut[match(vcf@fix[,3],lut$BI_markerID),c(2:6)] + check <- cbind(vcf_infos,lut_infos) + expect_equal(as.numeric(check$POS), check$Pos) + dp <- extract.gt(vcf, "DP", as.numeric = TRUE) + expect_equal(sum(dp[,10]), 41755) + }) + + test_that("alfalfa raw MADC format (7-row header)", { + out <- tempfile(fileext = ".vcf") + # get_REF_ALT = FALSE: same counts as alfalfa_madc + expect_error( + madc2vcf_targets(madc_file = alfalfa_madc_raw, + output.file = out, + get_REF_ALT = TRUE, + verbose = FALSE) + ) + + expect_error( + madc2vcf_targets(madc_file = alfalfa_madc_raw, + output.file = out, + get_REF_ALT = TRUE, + botloci_file = alfalfa_botloci, + verbose = FALSE) + ) + + madc2vcf_targets(madc_file = alfalfa_madc_raw, + output.file = out, + get_REF_ALT = FALSE, + verbose = FALSE) + + vcf <- read.vcfR(out, verbose = FALSE) + lut <- read.csv(alfalfa_markers_info) + vcf_infos <- vcf@fix[,c(1:5)] + lut_infos <- lut[match(vcf@fix[,3],lut$BI_markerID),c(2:6)] + check <- cbind(vcf_infos,lut_infos) + expect_equal(as.numeric(check$POS), check$Pos) + dp <- extract.gt(vcf, "DP", as.numeric = TRUE) + expect_equal(sum(dp[,10]), 43691) + + madc2vcf_targets(madc_file = alfalfa_madc_raw, + output.file = out, + get_REF_ALT = FALSE, + collapse_matches_counts = TRUE, + verbose = TRUE) + + vcf <- read.vcfR(out, verbose = FALSE) + expect_s4_class(vcf, "vcfR") + expect_true(all(is.na(vcf@fix[, "REF"]))) + expect_true(all(is.na(vcf@fix[, "ALT"]))) + DP <- extract.gt(vcf, "DP", as.numeric = TRUE) + expect_equal(sum(DP[1,]), 4534) + expect_equal(sum(DP[,5]), 56547) + + madc2vcf_targets(madc_file = alfalfa_madc_raw, + output.file = out, + get_REF_ALT = TRUE, + markers_info = alfalfa_markers_info, + collapse_matches_counts = FALSE, + verbose = FALSE) + + vcf <- read.vcfR(out, verbose = FALSE) + lut <- read.csv(alfalfa_markers_info) + vcf_infos <- vcf@fix[,c(1:5)] + lut_infos <- lut[match(vcf@fix[,3],lut$BI_markerID),c(2:6)] + check <- cbind(vcf_infos,lut_infos) + expect_equal(check$REF, check$Ref) + expect_equal(check$ALT, check$Alt) + expect_equal(as.numeric(check$POS), check$Pos) + dp <- extract.gt(vcf, "DP", as.numeric = TRUE) + expect_equal(sum(dp[,10]), 43691) + + madc2vcf_targets(madc_file = alfalfa_madc_raw, + output.file = out, + get_REF_ALT = TRUE, + markers_info = alfalfa_markers_info, + collapse_matches_counts = TRUE, + verbose = FALSE) + + vcf <- read.vcfR(out, verbose = FALSE) + lut <- read.csv(alfalfa_markers_info) + vcf_infos <- vcf@fix[,c(1:5)] + lut_infos <- lut[match(vcf@fix[,3],lut$BI_markerID),c(2:6)] + check <- cbind(vcf_infos,lut_infos) + expect_equal(check$REF, check$Ref) + expect_equal(check$ALT, check$Alt) + expect_equal(as.numeric(check$POS), check$Pos) + dp <- extract.gt(vcf, "DP", as.numeric = TRUE) + expect_equal(sum(DP[1,]), 4534) + expect_equal(sum(DP[,5]), 56547) + }) }) diff --git a/tests/testthat/test-validate_pedigree.R b/tests/testthat/test-validate_pedigree.R new file mode 100644 index 0000000..ef4e493 --- /dev/null +++ b/tests/testthat/test-validate_pedigree.R @@ -0,0 +1,471 @@ +# tests/testthat/test-validate_pedigree.R +# Run with: testthat::test_file("tests/testthat/test-validate_pedigree.R") +library(testthat) +library(data.table) + +# ============================================================================== +# Helpers +# ============================================================================== + +make_genos <- function() { + # IND_A: all 0, IND_B: all 2, IND_C: all 1 (het), IND_D: all 0, IND_E: all 0 + n_markers <- 20 + marker_names <- paste0("M", seq_len(n_markers)) + dt <- data.table( + ID = c("IND_A", "IND_B", "IND_C", "IND_D", "IND_E"), + rbind( + rep(0L, n_markers), # IND_A — all ref homozygous + rep(2L, n_markers), # IND_B — all alt homozygous + rep(1L, n_markers), # IND_C — all het (valid child of IND_A x IND_B) + rep(0L, n_markers), # IND_D — all ref (impossible child of IND_B x IND_A) + rep(0L, n_markers) # IND_E — all ref + ) + ) + setnames(dt, c("ID", marker_names)) + dt +} + +make_pedigree <- function() { + # IND_C: perfect Mendelian child of IND_A x IND_B -> PASS + # IND_D: declared parents swapped -> FAIL + data.table( + ID = c("IND_C", "IND_D"), + Male_Parent = c("IND_A", "IND_B"), + Female_Parent = c("IND_B", "IND_A") + ) +} + +write_temp_files <- function(genos = make_genos(), ped = make_pedigree()) { + ped_file <- tempfile(fileext = ".txt") + genos_file <- tempfile(fileext = ".txt") + fwrite(ped, ped_file, sep = "\t") + fwrite(genos, genos_file, sep = "\t") + list(ped = ped_file, genos = genos_file) +} + +# ============================================================================== +# 1. Input validation +# ============================================================================== + +test_that("trio_error_threshold out of range raises an error", { + f <- write_temp_files() + expect_error(validate_pedigree(f$ped, f$genos, + trio_error_threshold = 150, + verbose = FALSE, write_txt = FALSE)) + expect_error(validate_pedigree(f$ped, f$genos, + trio_error_threshold = -1, + verbose = FALSE, write_txt = FALSE)) +}) + +test_that("single_parent_error_threshold out of range raises an error", { + f <- write_temp_files() + expect_error(validate_pedigree(f$ped, f$genos, + single_parent_error_threshold = 101, + verbose = FALSE, write_txt = FALSE)) + expect_error(validate_pedigree(f$ped, f$genos, + single_parent_error_threshold = -5, + verbose = FALSE, write_txt = FALSE)) +}) + +test_that("missing required pedigree column raises an error", { + bad_ped <- data.table(ID = "IND_C", Parent1 = "IND_A", Female_Parent = "IND_B") + f <- write_temp_files(ped = bad_ped) + expect_error( + validate_pedigree(f$ped, f$genos, verbose = FALSE, write_txt = FALSE), + regexp = "missing required columns" + ) +}) + +test_that("missing ID column in genotypes raises an error", { + bad_genos <- copy(make_genos()) + setnames(bad_genos, "ID", "SampleID") + f <- write_temp_files(genos = bad_genos) + expect_error( + validate_pedigree(f$ped, f$genos, verbose = FALSE, write_txt = FALSE), + regexp = "ID" + ) +}) + +test_that("all trios with no genotype data stop with an error", { + ped <- data.table(ID = "GHOST", Male_Parent = "IND_A", Female_Parent = "IND_B") + f <- write_temp_files(ped = ped) + expect_error( + validate_pedigree(f$ped, f$genos, verbose = FALSE, write_txt = FALSE), + regexp = "No valid trios remain" + ) +}) + +# ============================================================================== +# 2. Return structure +# ============================================================================== + +test_that("returns a data.table", { + f <- write_temp_files() + res <- validate_pedigree(f$ped, f$genos, verbose = FALSE, write_txt = FALSE) + expect_s3_class(res, "data.table") +}) + +test_that("result has one row per pedigree entry", { + f <- write_temp_files() + res <- validate_pedigree(f$ped, f$genos, verbose = FALSE, write_txt = FALSE) + expect_equal(nrow(res), 2L) +}) + +test_that("result has all expected columns", { + f <- write_temp_files() + res <- validate_pedigree(f$ped, f$genos, verbose = FALSE, write_txt = FALSE) + expected_cols <- c( + "ID", "Male_Parent", "Female_Parent", + "Mendelian_Error_Pct", "Markers_Tested", "Status", + "Correction_Decision", + "Male_Parent_Hom_Error_Pct", "Female_Parent_Hom_Error_Pct", + "Best_Male_Parent", "Best_Male_Parent_Error_Pct", + "Best_Female_Parent", "Best_Female_Parent_Error_Pct" + ) + expect_true(all(expected_cols %in% names(res))) +}) + +# ============================================================================== +# 3. PASS / FAIL / LOW_MARKERS / NO_DATA statuses +# ============================================================================== + +test_that("PASS trio is correctly identified", { + f <- write_temp_files() + res <- validate_pedigree(f$ped, f$genos, verbose = FALSE, write_txt = FALSE) + pass_row <- res[ID == "IND_C"] + expect_equal(nrow(pass_row), 1L) + expect_equal(pass_row$Status, "PASS") + expect_equal(pass_row$Mendelian_Error_Pct, 0) + expect_equal(pass_row$Correction_Decision, "NONE") +}) + +test_that("FAIL trio is correctly identified", { + f <- write_temp_files() + res <- validate_pedigree(f$ped, f$genos, verbose = FALSE, write_txt = FALSE) + fail_row <- res[ID == "IND_D"] + expect_equal(nrow(fail_row), 1L) + expect_equal(fail_row$Status, "FAIL") + expect_gt(fail_row$Mendelian_Error_Pct, 5.0) +}) + +test_that("FAIL trio has REMOVE_MALE_PARENT decision with best match populated", { + f <- write_temp_files() + res <- validate_pedigree(f$ped, f$genos, verbose = FALSE, write_txt = FALSE) + fail_row <- res[ID == "IND_D"] + expect_equal(fail_row$Correction_Decision, "REMOVE_MALE_PARENT") + expect_false(is.na(fail_row$Best_Male_Parent)) + expect_true(is.na(fail_row$Best_Female_Parent)) +}) + +test_that("Mendelian_Error_Pct is 0 for a perfect trio", { + f <- write_temp_files() + res <- validate_pedigree(f$ped, f$genos, verbose = FALSE, write_txt = FALSE) + expect_equal(res[ID == "IND_C"]$Mendelian_Error_Pct, 0) +}) + +test_that("Markers_Tested equals number of markers for complete data", { + f <- write_temp_files() + res <- validate_pedigree(f$ped, f$genos, verbose = FALSE, write_txt = FALSE) + expect_equal(res[ID == "IND_C"]$Markers_Tested, 20L) +}) + +test_that("LOW_MARKERS status assigned when markers_tested < min_markers", { + f <- write_temp_files() + res <- validate_pedigree(f$ped, f$genos, verbose = FALSE, + write_txt = FALSE, min_markers = 25L) + expect_true(all(res$Status == "LOW_MARKERS")) + expect_true(all(grepl("^LOW_MARKERS_", res$Correction_Decision))) +}) + +test_that("NA markers reduce Markers_Tested and do not cause errors", { + genos <- make_genos() + genos[ID == "IND_C", M1 := NA_integer_] + genos[ID == "IND_C", M2 := NA_integer_] + f <- write_temp_files(genos = genos) + res <- validate_pedigree(f$ped, f$genos, verbose = FALSE, write_txt = FALSE) + expect_equal(res[ID == "IND_C"]$Markers_Tested, 18L) + expect_equal(res[ID == "IND_C"]$Status, "PASS") +}) + +# ============================================================================== +# 4. Missing parent statuses (MISSING_MALE_PARENT / MISSING_FEMALE_PARENT / +# MISSING_BOTH_PARENTS) +# Note: each test includes make_pedigree() rows so has_geno is never empty, +# and filters res by the specific ID under test [3][4] +# ============================================================================== + +test_that("MISSING_MALE_PARENT status and recommendation are correct", { + ped <- rbind( + make_pedigree(), + data.table(ID = "IND_E", Male_Parent = "0", Female_Parent = "IND_B") + ) + f <- write_temp_files(ped = ped) + res <- validate_pedigree(f$ped, f$genos, verbose = FALSE, write_txt = FALSE) + r <- res[ID == "IND_E"] + expect_equal(r$Status, "MISSING_MALE_PARENT") + expect_equal(r$Correction_Decision, "NONE") + expect_false(is.na(r$Best_Male_Parent)) + expect_true(is.na(r$Best_Female_Parent)) +}) + +test_that("MISSING_FEMALE_PARENT status and recommendation are correct", { + ped <- rbind( + make_pedigree(), + data.table(ID = "IND_E", Male_Parent = "IND_A", Female_Parent = "0") + ) + f <- write_temp_files(ped = ped) + res <- validate_pedigree(f$ped, f$genos, verbose = FALSE, write_txt = FALSE) + r <- res[ID == "IND_E"] + expect_equal(r$Status, "MISSING_FEMALE_PARENT") + expect_equal(r$Correction_Decision, "NONE") + expect_true(is.na(r$Best_Male_Parent)) + expect_false(is.na(r$Best_Female_Parent)) +}) + +test_that("MISSING_BOTH_PARENTS status and recommendations are correct", { + ped <- rbind( + make_pedigree(), + data.table(ID = "IND_E", Male_Parent = "0", Female_Parent = "0") + ) + f <- write_temp_files(ped = ped) + res <- validate_pedigree(f$ped, f$genos, verbose = FALSE, write_txt = FALSE) + r <- res[ID == "IND_E"] + expect_equal(r$Status, "MISSING_BOTH_PARENTS") + expect_equal(r$Correction_Decision, "NONE") + expect_false(is.na(r$Best_Male_Parent)) + expect_false(is.na(r$Best_Female_Parent)) +}) + +test_that("MISSING_* rows preserve 0s in corrected pedigree", { + ped <- rbind( + make_pedigree(), + data.table(ID = "IND_E", Male_Parent = "0", Female_Parent = "IND_B") + ) + f <- write_temp_files(ped = ped) + tmp_dir <- tempfile() + dir.create(tmp_dir) + old_wd <- getwd() + setwd(tmp_dir) + on.exit({ setwd(old_wd); unlink(tmp_dir, recursive = TRUE) }, add = TRUE) + validate_pedigree(f$ped, f$genos, verbose = FALSE, write_txt = FALSE) + corr <- fread(file.path(tmp_dir, "corrected_pedigree.txt"), + colClasses = "character") + expect_equal(corr[ID == "IND_E"]$Male_Parent, "0") +}) + +test_that("Best_Male_Parent for MISSING_MALE_PARENT is excluded from being the known female", { + ped <- rbind( + make_pedigree(), + data.table(ID = "IND_E", Male_Parent = "0", Female_Parent = "IND_B") + ) + f <- write_temp_files(ped = ped) + res <- validate_pedigree(f$ped, f$genos, verbose = FALSE, write_txt = FALSE) + r <- res[ID == "IND_E"] + # The known female parent should not be suggested as the best male parent + expect_false(r$Best_Male_Parent == "IND_B") +}) + +# ============================================================================== +# 5. FOUNDERS status +# ============================================================================== + +test_that("FOUNDERS status is assigned when ID in founders list with 0 0 parents", { + ped <- rbind( + make_pedigree(), + data.table(ID = "IND_A", Male_Parent = "0", Female_Parent = "0") + ) + founders_file <- tempfile(fileext = ".txt") + fwrite(data.table(ID = "IND_A"), founders_file, + sep = "\t", quote = FALSE, col.names = FALSE) + f <- write_temp_files(ped = ped) + res <- validate_pedigree(f$ped, f$genos, + founders_file = founders_file, + verbose = FALSE, + write_txt = FALSE) + founder_row <- res[ID == "IND_A"] + expect_equal(founder_row$Status, "FOUNDERS") + expect_equal(founder_row$Correction_Decision, "NONE") + expect_true(is.na(founder_row$Best_Male_Parent)) + expect_true(is.na(founder_row$Best_Female_Parent)) +}) + +test_that("non-founder rows are still evaluated normally when founders file is supplied", { + ped <- rbind( + make_pedigree(), + data.table(ID = "IND_A", Male_Parent = "0", Female_Parent = "0") + ) + founders_file <- tempfile(fileext = ".txt") + fwrite(data.table(ID = "IND_A"), founders_file, + sep = "\t", quote = FALSE, col.names = FALSE) + f <- write_temp_files(ped = ped) + res <- validate_pedigree(f$ped, f$genos, + founders_file = founders_file, + verbose = FALSE, + write_txt = FALSE) + # IND_C has real parents — should still get PASS + expect_equal(res[ID == "IND_C"]$Status, "PASS") +}) + +test_that("0 0 parents NOT in founders list get MISSING_BOTH_PARENTS", { + ped <- rbind( + make_pedigree(), + data.table(ID = "IND_E", Male_Parent = "0", Female_Parent = "0") + ) + f <- write_temp_files(ped = ped) + res <- validate_pedigree(f$ped, f$genos, verbose = FALSE, write_txt = FALSE) + expect_equal(res[ID == "IND_E"]$Status, "MISSING_BOTH_PARENTS") +}) + +test_that("0 0 parents with no founders file gets MISSING_BOTH_PARENTS not FOUNDERS", { + ped <- rbind( + make_pedigree(), + data.table(ID = "IND_E", Male_Parent = "0", Female_Parent = "0") + ) + f <- write_temp_files(ped = ped) + res <- validate_pedigree(f$ped, f$genos, + founders_file = NULL, + verbose = FALSE, + write_txt = FALSE) + expect_equal(res[ID == "IND_E"]$Status, "MISSING_BOTH_PARENTS") +}) + +# ============================================================================== +# 6. NO_GENOTYPE_DATA status +# ============================================================================== + +test_that("NO_GENOTYPE_DATA is flagged for progeny absent from genotype file", { + ped <- rbind( + make_pedigree(), + data.table(ID = "GHOST", Male_Parent = "IND_A", Female_Parent = "IND_B") + ) + f <- write_temp_files(ped = ped) + res <- validate_pedigree(f$ped, f$genos, verbose = FALSE, write_txt = FALSE) + ghost_row <- res[ID == "GHOST"] + expect_equal(nrow(ghost_row), 1L) + expect_equal(ghost_row$Status, "NO_GENOTYPE_DATA") + expect_equal(ghost_row$Correction_Decision, "NONE") +}) + +test_that("NO_GENOTYPE_DATA rows have NA for all analysis columns", { + ped <- rbind( + make_pedigree(), + data.table(ID = "GHOST", Male_Parent = "IND_A", Female_Parent = "IND_B") + ) + f <- write_temp_files(ped = ped) + res <- validate_pedigree(f$ped, f$genos, verbose = FALSE, write_txt = FALSE) + ghost_row <- res[ID == "GHOST"] + expect_true(is.na(ghost_row$Mendelian_Error_Pct)) + expect_equal(ghost_row$Markers_Tested, 0L) + expect_true(is.na(ghost_row$Best_Male_Parent)) + expect_true(is.na(ghost_row$Best_Female_Parent)) +}) + +test_that("NO_GENOTYPE_DATA flagged when declared parent is absent from genotype file", { + ped <- rbind( + make_pedigree(), # ensures has_geno is not empty + data.table(ID = "IND_C_GHOST", Male_Parent = "GHOST_DAD", Female_Parent = "IND_B") + ) + f <- write_temp_files(ped = ped) + res <- validate_pedigree(f$ped, f$genos, verbose = FALSE, write_txt = FALSE) + expect_equal(res[ID == "IND_C_GHOST"]$Status, "NO_GENOTYPE_DATA") +}) + +test_that("valid trios still evaluated correctly when ghost rows are present", { + ped <- rbind( + make_pedigree(), + data.table(ID = "GHOST", Male_Parent = "IND_A", Female_Parent = "IND_B") + ) + f <- write_temp_files(ped = ped) + res <- validate_pedigree(f$ped, f$genos, verbose = FALSE, write_txt = FALSE) + expect_equal(res[ID == "IND_C"]$Status, "PASS") + expect_equal(res[ID == "IND_D"]$Status, "FAIL") +}) + +# ============================================================================== +# 7. Corrected pedigree output +# ============================================================================== + +test_that("corrected_pedigree.txt is written and PASS parents are unchanged", { + f <- write_temp_files() + tmp_dir <- tempfile() + dir.create(tmp_dir) + old_wd <- getwd() + setwd(tmp_dir) + on.exit({ setwd(old_wd); unlink(tmp_dir, recursive = TRUE) }, add = TRUE) + validate_pedigree(f$ped, f$genos, verbose = FALSE, write_txt = FALSE) + expect_true(file.exists(file.path(tmp_dir, "corrected_pedigree.txt"))) + corr <- fread(file.path(tmp_dir, "corrected_pedigree.txt")) + # IND_C passes — parents must be unchanged + expect_equal(as.character(corr[ID == "IND_C"]$Male_Parent), "IND_A") + expect_equal(as.character(corr[ID == "IND_C"]$Female_Parent), "IND_B") +}) + +test_that("corrected_pedigree.txt sets bad parent to 0 for FAIL trio", { + f <- write_temp_files() + tmp_dir <- tempfile() + dir.create(tmp_dir) + old_wd <- getwd() + setwd(tmp_dir) + on.exit({ setwd(old_wd); unlink(tmp_dir, recursive = TRUE) }, add = TRUE) + validate_pedigree(f$ped, f$genos, verbose = FALSE, write_txt = FALSE) + corr <- fread(file.path(tmp_dir, "corrected_pedigree.txt"), + colClasses = "character") + # IND_D fails with REMOVE_MALE_PARENT — male should become "0" + expect_equal(corr[ID == "IND_D"]$Male_Parent, "0") + expect_equal(corr[ID == "IND_D"]$Female_Parent, "IND_A") +}) + +test_that("corrected_pedigree.txt preserves 0s for MISSING_* rows", { + ped <- rbind( + make_pedigree(), + data.table(ID = "IND_E", Male_Parent = "0", Female_Parent = "IND_B") + ) + f <- write_temp_files(ped = ped) + tmp_dir <- tempfile() + dir.create(tmp_dir) + old_wd <- getwd() + setwd(tmp_dir) + on.exit({ setwd(old_wd); unlink(tmp_dir, recursive = TRUE) }, add = TRUE) + validate_pedigree(f$ped, f$genos, verbose = FALSE, write_txt = FALSE) + corr <- fread(file.path(tmp_dir, "corrected_pedigree.txt"), + colClasses = "character") + # MISSING_MALE_PARENT — male stays "0", female unchanged + expect_equal(corr[ID == "IND_E"]$Male_Parent, "0") + expect_equal(corr[ID == "IND_E"]$Female_Parent, "IND_B") +}) + +# ============================================================================== +# 8. write_txt / output file +# ============================================================================== + +test_that("write_txt = TRUE writes output file with correct row count", { + f <- write_temp_files() + out_file <- tempfile(fileext = ".txt") + validate_pedigree(f$ped, f$genos, verbose = FALSE, + write_txt = TRUE, output_filename = out_file) + expect_true(file.exists(out_file)) + written <- fread(out_file) + expect_equal(nrow(written), 2L) +}) + +test_that("write_txt = FALSE does not create output file", { + f <- write_temp_files() + out_file <- tempfile(fileext = ".txt") + validate_pedigree(f$ped, f$genos, verbose = FALSE, + write_txt = FALSE, output_filename = out_file) + expect_false(file.exists(out_file)) +}) + +test_that("output file contains correct number of rows when ghost rows present", { + ped <- rbind( + make_pedigree(), + data.table(ID = "GHOST", Male_Parent = "IND_A", Female_Parent = "IND_B") + ) + f <- write_temp_files(ped = ped) + out_file <- tempfile(fileext = ".txt") + validate_pedigree(f$ped, f$genos, verbose = FALSE, + write_txt = TRUE, output_filename = out_file) + written <- fread(out_file) + # 2 valid trios + 1 ghost = 3 rows total + expect_equal(nrow(written), 3L) +})