From 4ec471b7087f5fc5f235584b386a6769b7a239d8 Mon Sep 17 00:00:00 2001 From: Alexander Sandercock <39815775+alex-sandercock@users.noreply.github.com> Date: Wed, 8 Apr 2026 08:40:59 -0400 Subject: [PATCH 01/19] Fix formatting of RefAltSeqs documentation --- R/check_madc_sanity.R | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/R/check_madc_sanity.R b/R/check_madc_sanity.R index 2248779..fda01d5 100644 --- a/R/check_madc_sanity.R +++ b/R/check_madc_sanity.R @@ -13,7 +13,8 @@ #' (prefix matches `"chr"` case-insensitively, suffix is a positive integer); #' 7) **allNAcol** - at least one column contains only `NA` or empty values; #' 8) **allNArow** - at least one row contains only `NA` or empty values; -#' 9) **RefAltSeqs** - every `CloneID` has at least one `Ref` and one `Alt` allele row. +#' 9) **RefAltSeqs** - every `CloneID` has at least one `Ref` and one `Alt` allele row; +#' 10) **OtherAlleles** - presence of alleles where the target locus differs from both the Ref and Alt in `AlleleSequence`. #' #' @param report A `data.frame` with at least the columns #' `CloneID`, `AlleleID`, and `AlleleSequence`. The first column is also From cb768808a97c550411a774e0cd3ca311597910a9 Mon Sep 17 00:00:00 2001 From: Alexander Sandercock <39815775+alex-sandercock@users.noreply.github.com> Date: Wed, 8 Apr 2026 08:55:45 -0400 Subject: [PATCH 02/19] updated docs --- man/check_madc_sanity.Rd | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/man/check_madc_sanity.Rd b/man/check_madc_sanity.Rd index 0398625..1d7eebb 100644 --- a/man/check_madc_sanity.Rd +++ b/man/check_madc_sanity.Rd @@ -50,7 +50,8 @@ or a \code{"-"} character is present in \code{AlleleSequence}; (prefix matches \code{"chr"} case-insensitively, suffix is a positive integer); \item \strong{allNAcol} - at least one column contains only \code{NA} or empty values; \item \strong{allNArow} - at least one row contains only \code{NA} or empty values; -\item \strong{RefAltSeqs} - every \code{CloneID} has at least one \code{Ref} and one \code{Alt} allele row. +\item \strong{RefAltSeqs} - every \code{CloneID} has at least one \code{Ref} and one \code{Alt} allele row; +\item \strong{OtherAlleles} - presence of alleles where the target locus differs from both the Ref and Alt in \code{AlleleSequence}. } } \details{ From ab944e8e02a4eb5ca6491208996aff052b2cc315 Mon Sep 17 00:00:00 2001 From: Alexander Sandercock <39815775+alex-sandercock@users.noreply.github.com> Date: Fri, 17 Apr 2026 08:20:22 -0400 Subject: [PATCH 03/19] revert filterVCF --- R/filterVCF.R | 321 +++++++---------------------------------------- man/filterVCF.Rd | 17 +-- 2 files changed, 54 insertions(+), 284 deletions(-) diff --git a/R/filterVCF.R b/R/filterVCF.R index b9bca78..a54e32e 100644 --- a/R/filterVCF.R +++ b/R/filterVCF.R @@ -17,7 +17,6 @@ #' @param filter.SAMPLE.miss Sample missing data filter #' @param filter.SNP.miss SNP missing data filter #' @param ploidy The ploidy of the species being analyzed -#' @param quality.rates Logical. If TRUE, calculates and outputs CSV files with quality metrics for each marker and sample before filtering (mean depth, genotyping rate, observed heterozygosity). #' @param output.file output file name (optional). If no output.file name provided, then a vcfR object will be returned. #' @return A gzipped vcf file #' @importFrom vcfR read.vcfR @@ -27,114 +26,42 @@ #' @examples #' ## Use file paths for each file on the local system #' +#' #Temp location (only for example) +#' output_file <- tempfile() #' -#' #filterVCF(vcf.file = "example_dart_Dosage_Report.csv", -#' # filter.OD = 0.5, -#' # ploidy = 2, -#' # output.file = "name_for_vcf") +#' filterVCF(vcf.file = system.file("iris_DArT_VCF.vcf.gz", package = "BIGr"), +#' filter.OD = 0.5, +#' filter.MAF = 0.05, +#' ploidy = 2, +#' output.file = output_file) +#' +#' # Removing the output for the example +#' rm(output_file) #' #' ##The function will output the filtered VCF to the current working directory #' #' @export filterVCF <- function(vcf.file, - quality.rates = FALSE, - filter.OD = NULL, - filter.BIAS.min = NULL, - filter.BIAS.max = NULL, - filter.DP = NULL, - filter.MPP = NULL, - filter.PMC = NULL, - filter.MAF = NULL, - filter.SAMPLE.miss = NULL, - filter.SNP.miss = NULL, - ploidy, - output.file = NULL) { + filter.OD = NULL, + filter.BIAS.min = NULL, + filter.BIAS.max = NULL, + filter.DP = NULL, + filter.MPP = NULL, + filter.PMC = NULL, + filter.MAF = NULL, + filter.SAMPLE.miss = NULL, + filter.SNP.miss = NULL, + ploidy, + output.file = NULL) { #Should allow for any INFO field to be entered to be filtered - - - # Read VCF (can be .vcf or .vcf.gz) - + # Import VCF (can be .vcf or .vcf.gz) if (!inherits(vcf.file, "vcfR")) { - vcf <- read.vcfR(vcf.file) + vcf <- read.vcfR(vcf.file, verbose = FALSE) } else { vcf <- vcf.file - } - - # Keep original VCF for pre‑filter statistics - vcf_orig <- vcf - - - # pre‑filtering quality rates - - if (quality.rates) { - ## Extract genotypes, depth and DP matrix - gt_orig <- extract.gt(vcf_orig, element = "GT", as.numeric = FALSE) - - dfmt <- strsplit(vcf_orig@gt[1, "FORMAT"], ":")[[1]] - if ("DP" %in% dfmt) { - dp_orig <- extract.gt(vcf_orig, element = "DP", as.numeric = TRUE) - } else { - dp_orig <- matrix(NA_real_, - nrow = nrow(gt_orig), ncol = ncol(gt_orig), - dimnames = dimnames(gt_orig)) - } - - - # 1. Observed heterozygosity (per‑marker & per‑sample) - - # Helper: TRUE if a genotype is heterozygous (any two different - # alleles, excluding missing "./.") - is_het <- function(g) { - if (is.na(g) || g == "./.") return(FALSE) - alleles <- strsplit(g, split = "[/|]")[[1]] - return(length(unique(alleles)) > 1) - } - #matrix of heterozygous calls - het_mat <- apply(gt_orig, c(1, 2), is_het) - - #Observed heterozygosity per marker and per sample - obs_het_marker <- rowMeans(het_mat, na.rm = TRUE) - obs_het_sample <- colMeans(het_mat, na.rm = TRUE) - - - #Per‑marker stats - - mean_depth_marker <- rowMeans(dp_orig, na.rm = TRUE) - genotype_present <- !is.na(gt_orig) - genotyping_rate_marker <- rowMeans(genotype_present) - - markers_df <- data.frame( - marker = vcf_orig@fix[, "ID"], - mean_depth = round(mean_depth_marker, 2), - genotyping_rate = round(genotyping_rate_marker, 2), - obs_het = round(obs_het_marker, 2), - stringsAsFactors = FALSE - ) - - - #Per‑sample stats - - mean_depth_sample <- colMeans(dp_orig, na.rm = TRUE) - genotyping_rate_sample <- colMeans(genotype_present) - - samples_df <- data.frame( - sample = colnames(gt_orig), - mean_depth = round(mean_depth_sample, 2), - genotyping_rate = round(genotyping_rate_sample, 2), - obs_het = round(obs_het_sample, 2), - stringsAsFactors = FALSE - ) - - - #Write CSV - - base_name <- if (!is.null(output.file)) output.file else "pre_filter" - write.csv(markers_df, paste0(base_name, "_marker_stats.csv"), - row.names = FALSE, quote = FALSE) - write.csv(samples_df, paste0(base_name, "_sample_stats.csv"), - row.names = FALSE, quote = FALSE) + #rm(vcf.file) } #Update header based on user filtering parameters @@ -175,7 +102,7 @@ filterVCF <- function(vcf.file, # Extract the DP values if ("DP" %in% format_fields && !is.null(filter.DP)) { - cat("Filtering by DP\n") + message("Filtering by DP\n") dp <- extract.gt(vcf, element = "DP", as.numeric = TRUE) # Identify cells to modify based on the DP threshold threshold <- as.numeric(filter.DP) @@ -189,7 +116,7 @@ filterVCF <- function(vcf.file, #Filter if the MPP field is present if ("MPP" %in% format_fields && !is.null(filter.MPP)) { - cat("Filtering by MPP\n") + message("Filtering by MPP\n") # Extract the MPP values mpp <- extract.gt(vcf, element = "MPP", as.numeric = TRUE) # Identify cells to modify based on the DP threshold @@ -229,13 +156,13 @@ filterVCF <- function(vcf.file, # Filtering by OD if ("OD" %in% info_ids && !is.null(filter.OD)) { info <- vcf@fix[, "INFO"] #Need to get after each filter.. - cat("Filtering by OD\n") + message("Filtering by OD\n") od_values <- extract_info_value(info, "OD") # Ensure no NA values before filtering if (!all(is.na(od_values))) { vcf <- vcf[od_values < as.numeric(filter.OD), ] } else { - cat("No valid OD values found.\n") + warning("No valid OD values found.\n") } } @@ -244,26 +171,26 @@ filterVCF <- function(vcf.file, # Filtering by BIAS if ("BIAS" %in% info_ids && !is.null(filter.BIAS.min) && !is.null(filter.BIAS.max)) { info <- vcf@fix[, "INFO"] #Need to get after each filter.. - cat("Filtering by BIAS\n") + message("Filtering by BIAS\n") bias_values <- extract_info_value(info, "BIAS") # Ensure no NA values before filtering if (!all(is.na(bias_values))) { vcf <- vcf[bias_values > as.numeric(filter.BIAS.min) & bias_values < as.numeric(filter.BIAS.max), ] } else { - cat("No valid BIAS values found.\n") + warning("No valid BIAS values found.\n") } } # Filtering by PMC if ("PMC" %in% info_ids && !is.null(filter.PMC)) { info <- vcf@fix[, "INFO"] #Need to get after each filter.. - cat("Filtering by PMC\n") + message("Filtering by PMC\n") pmc_values <- extract_info_value(info, "PMC") # Ensure no NA values before filtering if (!all(is.na(pmc_values))) { vcf <- vcf[pmc_values < as.numeric(filter.PMC), ] } else { - cat("No valid PMC values found.\n") + warning("No valid PMC values found.\n") } } @@ -273,14 +200,14 @@ filterVCF <- function(vcf.file, gt_matrix <- extract.gt(vcf, element = "GT", as.numeric = FALSE)#as.matrix(vcfR2genlight(vcf)) if (!is.null(filter.SNP.miss)) { - cat("Filtering by SNP missing data\n") + message("Filtering by SNP missing data\n") snp_missing_data <- rowMeans(is.na(gt_matrix)) vcf <- vcf[snp_missing_data < as.numeric(filter.SNP.miss), ] gt_matrix <- extract.gt(vcf, element = "GT", as.numeric = FALSE) } if (!is.null(filter.SAMPLE.miss)) { - cat("Filtering by Sample missing data\n") + message("Filtering by Sample missing data\n") # Calculate the proportion of missing data for each sample sample_missing_data <- colMeans(is.na(gt_matrix)) # Identify samples to keep based on the missing data threshold @@ -295,112 +222,30 @@ filterVCF <- function(vcf.file, rm(gt_matrix) } - ##Convert GT to dosage - #gt_matrix <- extract.gt(vcf, element = "GT", as.numeric = FALSE)#as.matrix(vcfR2genlight(vcf)) - - # Function to determine the ploidy level from a genotype string - #determine_ploidy <- function(gt) { - # if (is.na(gt)) { - # return(NA) - # } - # return(length(strsplit(gt, "[|/]")[[1]])) - #} - - # Function to find a non-NA example genotype to determine ploidy - #find_example_gt <- function(matrix) { - # for (i in seq_len(nrow(matrix))) { - # for (j in seq_len(ncol(matrix))) { - # if (!is.na(matrix[i, j])) { - # return(matrix[i, j]) - # } - # } - # } - # return(NA) # Return NA if no non-NA genotype is found - #} - - # Find a non-NA example genotype - #example_gt <- find_example_gt(gt_matrix) - - # Determine the ploidy level - #if (!is.na(example_gt)) { - # ploidy <- determine_ploidy(example_gt) - #} else { - # stop("No non-NA genotype found to determine ploidy.") - #} - - # Generate lookup table for genotypes to dosage conversion - #generate_lookup_table <- function(ploidy) { - # possible_alleles <- 0:ploidy - # genotypes <- expand.grid(rep(list(possible_alleles), ploidy)) - # genotypes <- apply(genotypes, 1, function(x) paste(x, collapse = "/")) - # dosage_values <- rowSums(expand.grid(rep(list(possible_alleles), ploidy))) - # lookup_table <- setNames(dosage_values, genotypes) - # return(lookup_table) - #} - - # Generate the lookup table - #lookup_table <- generate_lookup_table(ploidy) - - # Function to convert genotype to dosage using the lookup table - #genotype_to_dosage <- function(gt, lookup_table) { - # if (is.na(gt)) { - # return(NA) - # } - # return(lookup_table[[gt]]) - #} - - # Function to convert genotype matrix to dosage matrix using vectorized operations - #convert_genotypes_to_dosage <- function(gt_matrix, lookup_table) { - # unique_gts <- unique(gt_matrix) - # gt_to_dosage <- setNames(rep(NA, length(unique_gts)), unique_gts) - # valid_gts <- unique_gts[unique_gts %in% names(lookup_table)] - # gt_to_dosage[valid_gts] <- lookup_table[valid_gts] - # dosage_matrix <- gt_to_dosage[gt_matrix] - #colnames(dosage_matrix) <- colnames(gt_matrix) - #row.names(dosage_matrix) <- row.names(gt_matrix) - # return(matrix(as.numeric(dosage_matrix), nrow = nrow(gt_matrix), ncol = ncol(gt_matrix))) - #} - - # Convert the genotype matrix to dosage matrix - #dosage_matrix <- convert_genotypes_to_dosage(gt_matrix, lookup_table) - ##MAF filter - #Compare my lengthy process to estimate MAF with vcfR::maf() function - #The BIGr::calculate_MAF(dosage_matrix, ploidy) is the exact same as the vcfR::maf() calculations - #The step where I extract UD and calculate MAF is different... - #if ("UD" %in% format_fields) { - # maf_df <- BIGr::calculate_MAF(extract.gt(vcf, element = "UD", as.numeric = TRUE), ploidy = ploidy) - #} else { - #convert genotypes to dosage and filter - # maf_df <- BIGr::calculate_MAF(dosage_matrix, ploidy) - #} - #Need to confirm that vcfR::maf will work with any ploidy...if not, use my code if (!is.null(filter.MAF)) { - cat("Filtering by MAF\n") + message("Filtering by MAF\n") maf_df <- data.frame(vcfR::maf(vcf, element = 2)) vcf <- vcf[maf_df$Frequency > as.numeric(filter.MAF), ] } ### Export the modified VCF file (this exports as a .vcf.gz, so make sure to have the name end in .vcf.gz) - cat("Exporting VCF\n") - if (!inherits(vcf.file, "vcfR")){ - if (!is.null(output.file)){ - output_name <- paste0(output.file,".vcf.gz") + message("Exporting VCF\n") + if (!inherits(vcf.file, "vcfR")) { + if (!is.null(output.file)) { + output_name <- paste0(output.file, ".vcf.gz") vcfR::write.vcf(vcf, file = output_name) - }else{ + } else { return(vcf) } - }else{ - if (!is.null(output.file)){ - output_name <- paste0(output.file,"_filtered.vcf.gz") + } else { + if (!is.null(output.file)) { + output_name <- paste0(output.file, "_filtered.vcf.gz") vcfR::write.vcf(vcf, file = output_name) - }else{ + } else { return(vcf) } } - #Message that includes the output vcf stats - print(vcf) - #Message samples_removed <- starting_samples - (ncol(vcf@gt)-1) SNPs_removed <- starting_snps - nrow(vcf) @@ -408,81 +253,3 @@ filterVCF <- function(vcf.file, message("SNPs removed due to filtering: ",SNPs_removed) message("Complete!") } -#This is not reliable, so no longer use this shortcut to get dosage matrix -#test2 <- vcfR2genlight(vcf) - - -#####Testing custom VCF reading function###### -# Open the gzipped VCF file -#con <- gzfile("/Users/ams866/Desktop/output.vcf", "rt") - -# Read in the entire file -#lines <- readLines(con) -#close(con) -# Read in the entire file -#lines <- readLines("/Users/ams866/Desktop/output.vcf") -# Filter out lines that start with ## -#filtered_lines <- lines[!grepl("^##", lines)] -# Create a temporary file to write the filtered lines -#temp_file <- tempfile() -#writeLines(filtered_lines, temp_file) -# Read in the filtered data using read.table or read.csv -#vcf_data <- read.table(temp_file, header = TRUE, sep = "\t", comment.char = "", check.names = FALSE) -# Clean up the temporary file -#unlink(temp_file) - -##Extract INFO column and Filter SNPs by those values -#Update the filtering options by the items present in the INFO column? - -# Load required library -#library(dplyr) - -# Split INFO column into key-value pairs -#vcf_data_parsed <- vcf_data %>% -# mutate(INFO_PARSED = strsplit(INFO, ";")) %>% -# unnest(INFO_PARSED) %>% -# separate(INFO_PARSED, into = c("KEY", "VALUE"), sep = "=") %>% -# spread(KEY, VALUE) - -#Filter by DP -#filtered_vcf_data <- vcf_data_parsed %>% -# filter(as.numeric(DP) > 10) - -# View the filtered dataframe -#print(filtered_vcf_data) - -##Extracting and filtering by FORMAT column -# Identify the columns that are not sample columns -#non_sample_cols <- c("#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO", "FORMAT") -# Identify the sample columns -#sample_cols <- setdiff(names(vcf_data), non_sample_cols) -# Extract FORMAT keys -#format_keys <- strsplit(as.character(vcf_data$FORMAT[1]), ":")[[1]] -# Split SAMPLE columns based on FORMAT -#vcf_data_samples <- vcf_data %>% -# mutate(across(all_of(sample_cols), ~strsplit(as.character(.), ":"))) %>% -# mutate(across(all_of(sample_cols), ~map(., ~setNames(as.list(.), format_keys)))) %>% -# unnest_wider(all_of(sample_cols), names_sep = "_") - -# View the parsed dataframe -#print(head(vcf_data_samples)) - -# Create separate dataframes for each FORMAT variable -#format_dfs <- lapply(format_keys, function(format_key) { -# vcf_data_samples %>% -# select(ID, ends_with(paste0("_", format_key))) %>% -# column_to_rownames("ID") -#}) - -# Assign names to the list elements -#names(format_dfs) <- format_keys - -# Access the separate dataframes -#gt_df <- format_dfs$GT # Genotype dataframe -#ad_df <- format_dfs$AD # Allelic depths dataframe - -#*I think the above method is okay if you only need to filter at the INFO level, -#*But I think if you want to filter for FORMAT, that vcfR is probably best, -#*Will need to explore further if I can easily just filter for MPP by checking if it is above a -#*threshold, and then converting the GT and UD values to NA if so... -#*If that is efficient and works, then I will just use this custom VCF method... diff --git a/man/filterVCF.Rd b/man/filterVCF.Rd index 0342fe1..676ef7f 100644 --- a/man/filterVCF.Rd +++ b/man/filterVCF.Rd @@ -6,7 +6,6 @@ \usage{ filterVCF( vcf.file, - quality.rates = FALSE, filter.OD = NULL, filter.BIAS.min = NULL, filter.BIAS.max = NULL, @@ -23,8 +22,6 @@ filterVCF( \arguments{ \item{vcf.file}{vcfR object or path to VCF file. Can be unzipped (.vcf) or gzipped (.vcf.gz).} -\item{quality.rates}{Logical. If TRUE, calculates and outputs CSV files with quality metrics for each marker and sample before filtering (mean depth, genotyping rate, observed heterozygosity).} - \item{filter.OD}{Updog filter} \item{filter.BIAS.min}{Updog filter (requires a value for both BIAS.min and BIAS.max)} @@ -61,11 +58,17 @@ The VCF format is v4.3 \examples{ ## Use file paths for each file on the local system +#Temp location (only for example) +output_file <- tempfile() + +filterVCF(vcf.file = system.file("iris_DArT_VCF.vcf.gz", package = "BIGr"), + filter.OD = 0.5, + filter.MAF = 0.05, + ploidy = 2, + output.file = output_file) -#filterVCF(vcf.file = "example_dart_Dosage_Report.csv", - # filter.OD = 0.5, - # ploidy = 2, - # output.file = "name_for_vcf") +# Removing the output for the example +rm(output_file) ##The function will output the filtered VCF to the current working directory From 4ac0c5526fc7baec013902d717717b2ab16baa96 Mon Sep 17 00:00:00 2001 From: Alexander Sandercock <39815775+alex-sandercock@users.noreply.github.com> Date: Fri, 17 Apr 2026 08:20:52 -0400 Subject: [PATCH 04/19] support LUT Marker_ID --- R/madc2vcf_targets.R | 12 +++++++----- man/madc2vcf_targets.Rd | 2 +- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/R/madc2vcf_targets.R b/R/madc2vcf_targets.R index 888c445..cc72e3e 100644 --- a/R/madc2vcf_targets.R +++ b/R/madc2vcf_targets.R @@ -71,7 +71,7 @@ #' `ChromPos` is invalid and `markers_info` does not provide `Ref`/`Alt`. #' @param markers_info character or `NULL`. Optional path to a CSV providing target #' metadata. Accepted columns: -#' - `CloneID` or `BI_markerID` (required as marker identifier); +#' - `CloneID`, `Marker_ID`, or `BI_markerID` (required as marker identifier); #' - `Chr`, `Pos` — required when `CloneID` does not follow the `Chr_Pos` format; #' - `Ref`, `Alt` — required when `get_REF_ALT = TRUE` and probe-sequence #' inference is not possible (IUPAC codes, indels, or unfixed allele IDs). @@ -237,7 +237,7 @@ madc2vcf_targets <- function(madc_file, if(!isTRUE(checks$checks["ChromPos"])) { if(is.null(markers_info)){ stop("CloneID column does not follow the 'Chr_Pos'. ", - "Please provide a markers_info file with at least 'CloneID'/'BI_markerID', ", + "Please provide a markers_info file with at least 'CloneID'/'Marker_ID'/'BI_markerID', ", "'Chr', and 'Pos' columns.") } else { @@ -309,7 +309,8 @@ madc2vcf_targets <- function(madc_file, if(is.null(mi_df)) mi_df <- read.csv(markers_info) id_col <- if ("BI_markerID" %in% colnames(mi_df)) "BI_markerID" else if ("CloneID" %in% colnames(mi_df)) "CloneID" else - stop("The markers_info file must contain a marker ID column named either 'CloneID' or 'BI_markerID'.") + if ("Marker_ID" %in% colnames(mi_df)) "Marker_ID" else + stop("The markers_info file must contain a marker ID column named either 'CloneID', 'Marker_ID' or 'BI_markerID'.") if(checks$checks["Indels"]) vmsg("Indels detected in MADC file. But it is okay because Ref and Alt are provided in markers_info.", @@ -321,7 +322,7 @@ madc2vcf_targets <- function(madc_file, if(!all(rownames(ad_df) %in% mi_df[[id_col]])) { miss_CloneIDs <- rownames(ad_df)[!rownames(ad_df) %in% mi_df[[id_col]]] - if(length(miss_CloneIDs) == nrow(ad_df)) stop("None of the MADC CloneID could be found in the markers_info CloneID or BI_markerID. Please make sure they match.") + if(length(miss_CloneIDs) == nrow(ad_df)) stop("None of the MADC CloneID could be found in the markers_info CloneID, Marker_ID or BI_markerID. Please make sure they match.") vmsg(paste("Not all MADC CloneID was found in the markers_info file. These markers will be removed:", paste(miss_CloneIDs, collapse = " ")), verbose = verbose, level = 2, type = ">>") warning("Not all MADC CloneID was found in the markers_info file. These markers will be removed.") @@ -342,7 +343,8 @@ madc2vcf_targets <- function(madc_file, if(is.null(mi_df)) mi_df <- read.csv(markers_info) id_col <- if ("BI_markerID" %in% colnames(mi_df)) "BI_markerID" else if ("CloneID" %in% colnames(mi_df)) "CloneID" else - stop("The markers_info file must contain a marker ID column named either 'CloneID' or 'BI_markerID'.") + if ("Marker_ID" %in% colnames(mi_df)) "Marker_ID" else + stop("The markers_info file must contain a marker ID column named either 'CloneID', 'Marker_ID' or 'BI_markerID'.") if(checks$checks["Indels"]) vmsg("Indels detected in MADC file. Since get_REF_ALT = FALSE, Type and Indel_pos are not required in markers_info.", diff --git a/man/madc2vcf_targets.Rd b/man/madc2vcf_targets.Rd index 30363a6..25f99d5 100644 --- a/man/madc2vcf_targets.Rd +++ b/man/madc2vcf_targets.Rd @@ -28,7 +28,7 @@ Used for strand-correcting probe sequences when \code{get_REF_ALT = TRUE} and \item{markers_info}{character or \code{NULL}. Optional path to a CSV providing target metadata. Accepted columns: \itemize{ -\item \code{CloneID} or \code{BI_markerID} (required as marker identifier); +\item \code{CloneID}, \code{Marker_ID}, or \code{BI_markerID} (required as marker identifier); \item \code{Chr}, \code{Pos} — required when \code{CloneID} does not follow the \code{Chr_Pos} format; \item \code{Ref}, \code{Alt} — required when \code{get_REF_ALT = TRUE} and probe-sequence inference is not possible (IUPAC codes, indels, or unfixed allele IDs). From 5234572eb39849f68a20d92c68d9040356a82556 Mon Sep 17 00:00:00 2001 From: Alexander Sandercock <39815775+alex-sandercock@users.noreply.github.com> Date: Fri, 17 Apr 2026 09:33:10 -0400 Subject: [PATCH 05/19] Apply suggestion from @Copilot Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- R/madc2vcf_multi.R | 3 +++ 1 file changed, 3 insertions(+) diff --git a/R/madc2vcf_multi.R b/R/madc2vcf_multi.R index bcbae02..2ff7b0e 100644 --- a/R/madc2vcf_multi.R +++ b/R/madc2vcf_multi.R @@ -166,6 +166,9 @@ madc2vcf_multi <- function(madc_file, vmsg("Loading MADC into polyRAD", verbose = verbose, level = 0, type = ">>") + if (!requireNamespace("polyRAD", quietly = TRUE)) { + stop("Package 'polyRAD' is required for madc2vcf_multi(). Please install it with install.packages('polyRAD').", call. = FALSE) + } raddat <- polyRAD::readDArTag( file = input_file, botloci = botloci_input, From 8c9dcda701ef63b4c3620979ee7205fbd4308751 Mon Sep 17 00:00:00 2001 From: Alexander Sandercock <39815775+alex-sandercock@users.noreply.github.com> Date: Mon, 20 Apr 2026 08:46:33 -0400 Subject: [PATCH 06/19] Apply suggestion from @Copilot Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- R/madc2vcf_targets.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/madc2vcf_targets.R b/R/madc2vcf_targets.R index cc72e3e..a16fdfb 100644 --- a/R/madc2vcf_targets.R +++ b/R/madc2vcf_targets.R @@ -165,7 +165,7 @@ madc2vcf_targets <- function(madc_file, "verbose= ", verbose,')">') # MADC checks - report <- read.csv(madc_file) + report <- read.csv(madc_file, check.names = FALSE) checks <- check_madc_sanity(report) messages_results <- mapply(function(check, message) { From b4d5534d7fde36a053be8245e6905548e3b92301 Mon Sep 17 00:00:00 2001 From: Alexander Sandercock <39815775+alex-sandercock@users.noreply.github.com> Date: Fri, 17 Apr 2026 08:28:04 -0400 Subject: [PATCH 07/19] covered error case --- R/get_countsMADC.R | 1 + 1 file changed, 1 insertion(+) diff --git a/R/get_countsMADC.R b/R/get_countsMADC.R index 3a9bc2b..c1045ec 100644 --- a/R/get_countsMADC.R +++ b/R/get_countsMADC.R @@ -54,6 +54,7 @@ get_countsMADC <- function(madc_file = NULL, madc_object = NULL, collapse_matche # Add check inputs if(is.null(madc_file) && is.null(madc_object)) stop("Please provide either madc_file or madc_object.") + if(!is.null(madc_file) && !is.null(madc_object)) stop("Please provide either madc_file or madc_object. Not both.") if(!is.null(madc_file) && !file.exists(madc_file)) stop("MADC file not found. Please provide a valid path.") if(!is.null(madc_object) && !is.data.frame(madc_object)) stop("madc_object must be a data frame.") From 57bbc89c93cb677bc614c4389ffc50df581465a8 Mon Sep 17 00:00:00 2001 From: Alexander Sandercock <39815775+alex-sandercock@users.noreply.github.com> Date: Fri, 17 Apr 2026 09:29:30 -0400 Subject: [PATCH 08/19] add example and suggest ggplot --- DESCRIPTION | 4 +-- NAMESPACE | 1 - R/imputation_concordance.R | 46 +++++++++++++++++++++++------------ man/imputation_concordance.Rd | 23 ++++++++++++++---- 4 files changed, 51 insertions(+), 23 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 48ede7c..672faac 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -62,10 +62,10 @@ Imports: janitor, quadprog, tibble, - stringr, - ggplot2 + stringr Suggests: covr, + ggplot2, spelling, rmdformats, knitr (>= 1.10), diff --git a/NAMESPACE b/NAMESPACE index ae09080..e6cbc30 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -24,7 +24,6 @@ export(thinSNP) export(updog2vcf) export(vmsg) import(dplyr) -import(ggplot2) import(janitor) import(parallel) import(quadprog) diff --git a/R/imputation_concordance.R b/R/imputation_concordance.R index 1eb441a..918071a 100644 --- a/R/imputation_concordance.R +++ b/R/imputation_concordance.R @@ -47,19 +47,31 @@ #' is generated using \pkg{ggplot2}. #' #' @import dplyr -#' @import ggplot2 #' #' @examples -#' \dontrun{ +#' ref <- data.frame( +#' ID = c("S1", "S2", "S3"), +#' SNP1 = c(0, 1, 2), +#' SNP2 = c(1, 1, 0), +#' SNP3 = c(2, 5, 1) +#' ) +#' +#' test <- data.frame( +#' ID = c("S1", "S2", "S3"), +#' SNP1 = c(0, 0, 2), +#' SNP2 = c(1, 1, 1), +#' SNP3 = c(2, 5, 0) +#' ) +#' #' result <- imputation_concordance( #' reference_genos = ref, #' imputed_genos = test, -#' snps_2_exclude = snps, +#' snps_2_exclude = "SNP2", #' missing_code = 5, -#' verbose = TRUE, -#' plot = TRUE +#' print_result = FALSE #' ) -#' } +#' +#' result #' #' @importFrom stats reorder #' @export @@ -136,21 +148,25 @@ imputation_concordance <- function(reference_genos, # Optional plot if (plot) { + if (!requireNamespace("ggplot2", quietly = TRUE)) { + stop("Package 'ggplot2' is required when plot = TRUE.", call. = FALSE) + } plot_df <- data.frame( ID = imputed_genos$ID, Concordance = percentage_match * 100 ) - concordance_plot <- ggplot(plot_df, - aes(x = reorder(ID, Concordance), - y = Concordance)) + - geom_bar(stat = "identity") + - labs(title = "Imputation Concordance by Sample", - x = "Sample ID", - y = "Concordance (%)") + - theme_minimal() + - theme(axis.text.x = element_text(angle = 90, hjust = 1)) + concordance_plot <- ggplot2::ggplot( + plot_df, + ggplot2::aes(x = reorder(ID, Concordance), y = Concordance) + ) + + ggplot2::geom_bar(stat = "identity") + + ggplot2::labs(title = "Imputation Concordance by Sample", + x = "Sample ID", + y = "Concordance (%)") + + ggplot2::theme_minimal() + + ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 90, hjust = 1)) print(concordance_plot) } diff --git a/man/imputation_concordance.Rd b/man/imputation_concordance.Rd index 31f54a8..22e9462 100644 --- a/man/imputation_concordance.Rd +++ b/man/imputation_concordance.Rd @@ -64,15 +64,28 @@ When \code{plot = TRUE}, a bar plot showing concordance percentage per sample is generated using \pkg{ggplot2}. } \examples{ -\dontrun{ +ref <- data.frame( + ID = c("S1", "S2", "S3"), + SNP1 = c(0, 1, 2), + SNP2 = c(1, 1, 0), + SNP3 = c(2, 5, 1) +) + +test <- data.frame( + ID = c("S1", "S2", "S3"), + SNP1 = c(0, 0, 2), + SNP2 = c(1, 1, 1), + SNP3 = c(2, 5, 0) +) + result <- imputation_concordance( reference_genos = ref, imputed_genos = test, - snps_2_exclude = snps, + snps_2_exclude = "SNP2", missing_code = 5, - verbose = TRUE, - plot = TRUE + print_result = FALSE ) -} + +result } From 4bb67fd1a89cebd73b90a6d259cb3c625c01b817 Mon Sep 17 00:00:00 2001 From: Alexander Sandercock <39815775+alex-sandercock@users.noreply.github.com> Date: Fri, 17 Apr 2026 09:29:45 -0400 Subject: [PATCH 09/19] make get_counts internal --- R/get_countsMADC.R | 3 ++- man/get_counts.Rd | 60 ------------------------------------------- man/get_countsMADC.Rd | 2 +- 3 files changed, 3 insertions(+), 62 deletions(-) delete mode 100644 man/get_counts.Rd diff --git a/R/get_countsMADC.R b/R/get_countsMADC.R index c1045ec..57b83ce 100644 --- a/R/get_countsMADC.R +++ b/R/get_countsMADC.R @@ -46,7 +46,7 @@ #' #' rm(counts_matrices) #' -#' @seealso [get_counts()], [check_madc_sanity()] +#' @seealso [check_madc_sanity()] #' #' @import dplyr #' @export @@ -182,6 +182,7 @@ get_countsMADC <- function(madc_file = NULL, madc_object = NULL, collapse_matche #' @importFrom dplyr %>% filter case_when #' #' @keywords internal +#' @noRd get_counts <- function(madc_file = NULL, madc_object = NULL, collapse_matches_counts = FALSE, verbose = TRUE) { # Add check inputs diff --git a/man/get_counts.Rd b/man/get_counts.Rd deleted file mode 100644 index 1879e07..0000000 --- a/man/get_counts.Rd +++ /dev/null @@ -1,60 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/get_countsMADC.R -\name{get_counts} -\alias{get_counts} -\title{Read and Pre-process a MADC File} -\usage{ -get_counts( - madc_file = NULL, - madc_object = NULL, - collapse_matches_counts = FALSE, - verbose = TRUE -) -} -\arguments{ -\item{madc_file}{character or \code{NULL}. Path to the input MADC CSV file. -At least one of \code{madc_file} or \code{madc_object} must be provided.} - -\item{madc_object}{data frame or \code{NULL}. A pre-read MADC data frame -(e.g., from \code{check_botloci()}). When supplied, file reading is skipped. -At least one of \code{madc_file} or \code{madc_object} must be provided.} - -\item{collapse_matches_counts}{logical. If \code{TRUE}, counts for \verb{|AltMatch} -and \verb{|RefMatch} rows are summed into their corresponding \verb{|Ref} and \verb{|Alt} -rows. If \code{FALSE} (default), those rows are discarded.} - -\item{verbose}{logical. Whether to print progress messages. Default is \code{TRUE}.} -} -\value{ -A data frame with one row per \code{Ref} or \code{Alt} allele entry, retaining -all original columns (\code{AlleleID}, \code{CloneID}, \code{AlleleSequence}, sample -count columns, etc.). -} -\description{ -Reads a DArTag MADC CSV file (or accepts a pre-read data frame), detects the -file format, and returns a filtered data frame containing only \code{Ref} and \code{Alt} -haplotype rows ready for count-matrix construction. -} -\details{ -\strong{Input}: either \code{madc_file} (path to CSV) or \code{madc_object} (pre-read data -frame) must be supplied; at least one is required. - -\strong{Format detection} (applied to file or object alike): the first seven rows -of the first column are inspected: -\itemize{ -\item \strong{Standard format}: all entries are blank or \code{"*"} — the first 7 rows are -treated as DArT placeholder rows and skipped. -\item \strong{Fixed-allele-ID format}: no filler rows — data are used as-is. -} - -\strong{\verb{|AltMatch} / \verb{|RefMatch} handling} (controlled by \code{collapse_matches_counts}): -\itemize{ -\item \code{FALSE} (default): these rows are simply discarded. -\item \code{TRUE}: their counts are summed into the corresponding \verb{|Ref} or \verb{|Alt} -row for the same \code{CloneID}. -} - -In all cases, trailing suffixes on \code{AlleleID} (e.g., \verb{|Ref_001}, \verb{|Alt_002}) -are stripped to the canonical \verb{|Ref} / \verb{|Alt} form. -} -\keyword{internal} diff --git a/man/get_countsMADC.Rd b/man/get_countsMADC.Rd index 28fca1e..207b899 100644 --- a/man/get_countsMADC.Rd +++ b/man/get_countsMADC.Rd @@ -67,5 +67,5 @@ rm(counts_matrices) } \seealso{ -\code{\link[=get_counts]{get_counts()}}, \code{\link[=check_madc_sanity]{check_madc_sanity()}} +\code{\link[=check_madc_sanity]{check_madc_sanity()}} } From ff1ef84d3a1ba905736c9d321527b70ca675e3a2 Mon Sep 17 00:00:00 2001 From: Alexander Sandercock <39815775+alex-sandercock@users.noreply.github.com> Date: Fri, 17 Apr 2026 09:29:51 -0400 Subject: [PATCH 10/19] update test --- tests/testthat/test-madc2vcf_targets.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/testthat/test-madc2vcf_targets.R b/tests/testthat/test-madc2vcf_targets.R index a64da34..b5e8a60 100644 --- a/tests/testthat/test-madc2vcf_targets.R +++ b/tests/testthat/test-madc2vcf_targets.R @@ -218,7 +218,7 @@ test_that("simu alfalfa",{ botloci_file = alfalfa_botloci, markers_info = alfalfa_markers_info, verbose = FALSE), - "None of the MADC CloneID could be found in the markers_info CloneID or BI_markerID. Please make sure they match." + "None of the MADC CloneID could be found in the markers_info CloneID, Marker_ID or BI_markerID. Please make sure they match." ) # Test that it works when the function can find a matching ID in markers_info to fix the botloci mismatch issue From 86b4fef4e24b827595036749cad51b2a9e645f27 Mon Sep 17 00:00:00 2001 From: Alexander Sandercock <39815775+alex-sandercock@users.noreply.github.com> Date: Mon, 20 Apr 2026 11:12:20 -0400 Subject: [PATCH 11/19] added marker_id support --- R/check_madc_sanity.R | 58 ++++++++++----- tests/testthat/test-check_madc_sanity.R | 97 +++++++++++++++++++++++++ 2 files changed, 137 insertions(+), 18 deletions(-) diff --git a/R/check_madc_sanity.R b/R/check_madc_sanity.R index fda01d5..88f564a 100644 --- a/R/check_madc_sanity.R +++ b/R/check_madc_sanity.R @@ -156,7 +156,10 @@ check_madc_sanity <- function(report) { pos <- strsplit(report$CloneID, "_") format <- all(sapply(pos, length) == 2) first <- all(grepl("^[A-Za-z]", sapply(pos, "[", 1))) - second <- suppressWarnings(all(sapply(pos, function(x) as.numeric(x[2])) > 0)) + second <- all(vapply(pos, function(x) { + pos_num <- suppressWarnings(as.numeric(x[2])) + !is.na(pos_num) && pos_num > 0 + }, logical(1))) checks["ChromPos"] <- all(format, first, second) } else checks["ChromPos"] <- FALSE @@ -216,7 +219,9 @@ check_madc_sanity <- function(report) { #' @param botloci A data frame containing the botloci markers. #' @param report A data frame containing the MADC markers. #' @param ChromPos logical value indicating whether the CloneID in the MADC file contains chromosome and position information in the format "Chr_Pos". Default is TRUE -#' @param mi_df A data frame containing marker information with columns CloneID, Chr, and Pos. Required if `ChromPos` is FALSE. +#' @param mi_df A data frame containing marker information with one marker ID column +#' (`CloneID`, `Marker_ID`, or `BI_markerID`) plus `Chr` and `Pos`. Required if +#' `ChromPos` is FALSE. #' @param verbose A logical value indicating whether to print detailed messages about the adjustments. Default is TRUE. Required if `ChromPos` is FALSE. #' #' @return A list containing the adjusted botloci and MADC data frames. @@ -226,23 +231,42 @@ check_madc_sanity <- function(report) { #' #' @keywords internal #' @noRd +pick_markers_info_id_col <- function(mi_df, query_ids) { + query_ids <- unique(stats::na.omit(query_ids)) + id_cols <- intersect(c("CloneID", "BI_markerID", "Marker_ID"), colnames(mi_df)) + + if (!length(id_cols)) { + stop("The markers_info file must contain a marker ID column named either 'CloneID', 'Marker_ID' or 'BI_markerID'.") + } + + match_n <- vapply(id_cols, function(col) { + sum(query_ids %in% unique(stats::na.omit(mi_df[[col]]))) + }, integer(1)) + + if (!any(match_n)) { + stop("None of the MADC CloneID could be found in the markers_info CloneID, Marker_ID or BI_markerID. Please make sure they match.") + } + + id_cols[which.max(match_n)] +} + check_botloci <- function(botloci, report, ChromPos=TRUE, mi_df = NULL, verbose=TRUE){ + original_clone_ids <- report$CloneID + use_col <- NULL # Check inputs if(!ChromPos) { - if(is.null(mi_df)) stop("When MADC CloneID don't follow the format Chr_Pos, a marker_info file with CloneID, Chr and Pos columns must be provided.") - # if exists, it must contain CloneID or BI_markerID that matches the report$CloneID, and Chr and Pos columns - if(!any(mi_df$CloneID %in% report$CloneID) & !any(mi_df$BI_markerID %in% report$CloneID)) { - stop("None of the MADC CloneID could be found in the markers_info CloneID or BI_markerID. Please make sure they match.") - } else { - use_col <- if(any(mi_df$CloneID %in% report$CloneID)) "CloneID" else "BI_markerID" - vmsg(paste("Using", use_col, "column in marker_info to match MADC CloneID"), verbose = verbose, level = 1, type = ">>") - } + if(is.null(mi_df)) stop("When MADC CloneID don't follow the format Chr_Pos, a marker_info file with 'CloneID'/'Marker_ID'/'BI_markerID', 'Chr', and 'Pos' columns must be provided.") + use_col <- pick_markers_info_id_col(mi_df, report$CloneID) + vmsg(paste("Using", use_col, "column in marker_info to match MADC CloneID"), verbose = verbose, level = 1, type = ">>") if(is.null(mi_df$Chr) | is.null(mi_df$Pos)) stop("When MADC CloneID don't follow the format Chr_Pos, Chr and Pos columns must be provided in the markers_info file.") } if(!any(botloci$V1 %in% report$CloneID)) { # First check if any botloci markers are found in MADC file. If not, check for padding mismatch. vmsg("No botloci markers found in MADC file. Checking for padding mismatch...", verbose = verbose, level = 1, type = ">>") + if(!is.null(mi_df) && is.null(use_col)) { + use_col <- pick_markers_info_id_col(mi_df, original_clone_ids) + } pad_madc <- unique(nchar(sub(".*_", "", report$CloneID))) pad_botloci <- unique(nchar(sub(".*_", "", botloci$V1))) @@ -257,10 +281,11 @@ check_botloci <- function(botloci, report, ChromPos=TRUE, mi_df = NULL, verbose= ) report$AlleleID <- paste0(report$CloneID, "|", sapply(strsplit(report$AlleleID, "[|]"), "[[",2)) if(!is.null(mi_df)) { - mi_df$CloneID <- paste0(sub("_(.*)", "", mi_df$CloneID), "_", - sprintf(paste0("%0", pad_botloci, "d"), as.integer(sub(".*_", "", mi_df$CloneID))) + mi_df$CloneID <- paste0(sub("_(.*)", "", mi_df[[use_col]]), "_", + sprintf(paste0("%0", pad_botloci, "d"), as.integer(sub(".*_", "", mi_df[[use_col]]))) ) } + if(!any(botloci$V1 %in% report$CloneID)) stop("After matching padding, botloci markers still not found in MADC file. Check marker IDs.\n") } else { botloci$V1 <- paste0(sub("_(.*)", "", botloci$V1), "_", sprintf(paste0("%0", pad_madc, "d"), as.integer(sub(".*_", "", botloci$V1))) @@ -270,12 +295,8 @@ check_botloci <- function(botloci, report, ChromPos=TRUE, mi_df = NULL, verbose= } else if (!(is.null(mi_df$Chr) | is.null(mi_df$Pos))){ vmsg("It is not a padding mismatch issue.", verbose = verbose, level = 2, type = ">>") vmsg("Checking if jointing provided Chromosome and Position information in marker_file solve the issue", verbose = verbose, level = 2, type = ">>") - if(!any(mi_df$CloneID %in% report$CloneID) & !any(mi_df$BI_markerID %in% report$CloneID)) { - stop("None of the MADC CloneID could be found in the markers_info CloneID or BI_markerID. Please make sure they match.") - } else { - use_col <- if(any(mi_df$CloneID %in% report$CloneID)) "CloneID" else "BI_markerID" - vmsg(paste("Using", use_col, "column in marker_info to match MADC CloneID"), verbose = verbose, level = 2, type = ">>") - } + use_col <- pick_markers_info_id_col(mi_df, report$CloneID) + vmsg(paste("Using", use_col, "column in marker_info to match MADC CloneID"), verbose = verbose, level = 2, type = ">>") mk_info_CloneID <- paste0(mi_df$Chr, "_", sprintf(paste0("%0",pad_botloci, "d"), as.integer(mi_df$Pos))) if(!any(botloci$V1 %in% mk_info_CloneID)){ @@ -286,6 +307,7 @@ check_botloci <- function(botloci, report, ChromPos=TRUE, mi_df = NULL, verbose= vmsg("Chromosome and Position information in marker_file solve the issue.", verbose = verbose, level = 2, type = ">>") vmsg("Using this information to modify MADC CloneIDs to match botloci markers.", verbose = verbose, level = 2, type = ">>") report$CloneID <- mk_info_CloneID[match(report$CloneID, mi_df[[use_col]])] + report$AlleleID <- paste0(report$CloneID, "|", sapply(strsplit(report$AlleleID, "[|]"), "[[",2)) mi_df$CloneID <- mk_info_CloneID } } else { diff --git a/tests/testthat/test-check_madc_sanity.R b/tests/testthat/test-check_madc_sanity.R index d0c45cf..b1a3857 100644 --- a/tests/testthat/test-check_madc_sanity.R +++ b/tests/testthat/test-check_madc_sanity.R @@ -68,4 +68,101 @@ test_that("check madc",{ expect_equal(res$checks, exp) }) +test_that("check_botloci remaps using Marker_ID", { + botloci <- data.frame(V1 = c("1_0001", "2_0002")) + report <- data.frame( + CloneID = c("ProbeA_0001", "ProbeB_0002"), + AlleleID = c("ProbeA_0001|Ref_0001", "ProbeB_0002|Ref_0001"), + AlleleSequence = c("A", "T"), + check.names = FALSE + ) + mi_df <- data.frame( + Marker_ID = c("ProbeA_0001", "ProbeB_0002"), + Chr = c("1", "2"), + Pos = c(1, 2) + ) + + res <- check_botloci(botloci, report, ChromPos = FALSE, mi_df = mi_df, verbose = FALSE) + + expect_equal(res[[2]]$CloneID, botloci$V1) + expect_equal(res[[3]]$CloneID, botloci$V1) +}) + +test_that("check_botloci resolves Marker_ID before padding report CloneIDs", { + botloci <- data.frame(V1 = "1_000000123") + report <- data.frame( + CloneID = "1_123", + AlleleID = "1_123|Ref_0001", + AlleleSequence = "A", + check.names = FALSE + ) + mi_df <- data.frame( + Marker_ID = "1_123", + Chr = "1", + Pos = 123 + ) + + res <- check_botloci(botloci, report, ChromPos = TRUE, mi_df = mi_df, verbose = FALSE) + + expect_equal(res[[2]]$CloneID, botloci$V1) + expect_equal(res[[3]]$CloneID, botloci$V1) +}) + +test_that("pick_markers_info_id_col scores distinct markers not allele rows", { + mi_df <- data.frame( + CloneID = c("m1", "m2"), + Marker_ID = c("m1", "m3") + ) + query_ids <- c("m1", "m1", "m1", "m2") + + expect_equal(pick_markers_info_id_col(mi_df, query_ids), "CloneID") +}) + +test_that("check_madc_sanity returns FALSE for malformed CloneID positions", { + report <- data.frame( + CloneID = c("Chr_abc", "Chr_abc"), + AlleleID = c("Chr_abc|Ref_0001", "Chr_abc|Alt_0002"), + AlleleSequence = c("A", "T"), + check.names = FALSE + ) + res <- check_madc_sanity(report) + + expect_false(is.na(res$checks["ChromPos"])) + expect_false(res$checks["ChromPos"]) +}) + +test_that("check_botloci errors if widening MADC padding still does not match", { + botloci <- data.frame(V1 = "1_0002") + report <- data.frame( + CloneID = "1_1", + AlleleID = "1_1|Ref_0001", + AlleleSequence = "A", + check.names = FALSE + ) + + expect_error( + check_botloci(botloci, report, ChromPos = TRUE, verbose = FALSE), + "After matching padding, botloci markers still not found in MADC file. Check marker IDs." + ) +}) + +test_that("check_botloci keeps AlleleID synchronized after CloneID remap", { + botloci <- data.frame(V1 = "1_0001") + report <- data.frame( + CloneID = "ProbeA_0001", + AlleleID = "ProbeA_0001|Ref_0001", + AlleleSequence = "A", + check.names = FALSE + ) + mi_df <- data.frame( + Marker_ID = "ProbeA_0001", + Chr = "1", + Pos = 1 + ) + + res <- check_botloci(botloci, report, ChromPos = TRUE, mi_df = mi_df, verbose = FALSE) + + expect_equal(res[[2]]$CloneID, "1_0001") + expect_equal(res[[2]]$AlleleID, "1_0001|Ref_0001") +}) From c10d134ba604cd7ff2ebbe5fbcb15a533f189811 Mon Sep 17 00:00:00 2001 From: Alexander Sandercock <39815775+alex-sandercock@users.noreply.github.com> Date: Mon, 20 Apr 2026 11:13:21 -0400 Subject: [PATCH 12/19] fixed AD generation bug --- R/madc2vcf_all.R | 24 +++++++---- tests/testthat/test-madc2vcf_all.R | 67 +++++++++++++++++++++++++++++- 2 files changed, 82 insertions(+), 9 deletions(-) diff --git a/R/madc2vcf_all.R b/R/madc2vcf_all.R index 63c031d..e223977 100644 --- a/R/madc2vcf_all.R +++ b/R/madc2vcf_all.R @@ -138,7 +138,7 @@ madc2vcf_all <- function(madc, checks <- check_madc_sanity(report) messages_results <- mapply(function(check, message) { - if (check) message[1] else message[2] + if (isTRUE(check)) message[1] else message[2] }, checks$checks, checks$messages) for(i in seq_along(messages_results)) @@ -158,13 +158,23 @@ madc2vcf_all <- function(madc, # Check whether markers_info is present and contains Ref + Alt columns if(!is.null(markers_info)) { mi_df <- read.csv(markers_info) - # Standardize marker ID column to CloneID - if(!"CloneID" %in% colnames(mi_df) && "BI_markerID" %in% colnames(mi_df)) { - colnames(mi_df)[colnames(mi_df) == "BI_markerID"] <- "CloneID" - vmsg("markers_info: 'BI_markerID' column renamed to 'CloneID' for internal use", verbose = verbose, level = 1) - } else if(!"CloneID" %in% colnames(mi_df) && !"BI_markerID" %in% colnames(mi_df)) { + id_cols <- intersect(c("CloneID", "BI_markerID"), colnames(mi_df)) + if(!length(id_cols)) { stop("markers_info must contain a marker ID column named either 'CloneID' or 'BI_markerID'.") } + match_n <- vapply(id_cols, function(col) { + sum(unique(report$CloneID) %in% unique(stats::na.omit(mi_df[[col]]))) + }, integer(1)) + if(!any(match_n)) { + stop("None of the markers_info CloneID or BI_markerID values match the MADC CloneID column. Please make sure they use the same marker IDs.") + } + id_col <- id_cols[which.max(match_n)] + if(id_col != "CloneID" || !"CloneID" %in% colnames(mi_df)) { + mi_df$CloneID <- mi_df[[id_col]] + if(id_col == "BI_markerID") { + vmsg("markers_info: 'BI_markerID' column copied to 'CloneID' for internal use", verbose = verbose, level = 1) + } + } # Validate CloneID values if(any(is.na(mi_df$CloneID) | mi_df$CloneID == "")) stop("markers_info CloneID column contains empty or NA values. Please check your markers_info file.") @@ -910,7 +920,7 @@ merge_counts <- function(cloneID_unit, rm_multiallelic_SNP = FALSE, multiallelic info_mk <- paste0("DP=", sum(c(RefTag, AltTag,total)),";", "ADS=",sum(RefTag),",",sum(AltTag), ads) } else { - tab_counts <- paste0(RefTag + AltTag, ":", RefTag, ":", RefTag, AltTag) + tab_counts <- paste0(RefTag + AltTag, ":", RefTag, ":", RefTag, ",", AltTag) alts <- info$Alt info_mk <- paste0("DP=", sum(c(RefTag, AltTag)),";", "ADS=",sum(RefTag),",",sum(AltTag)) diff --git a/tests/testthat/test-madc2vcf_all.R b/tests/testthat/test-madc2vcf_all.R index c8c860f..1d6050f 100644 --- a/tests/testthat/test-madc2vcf_all.R +++ b/tests/testthat/test-madc2vcf_all.R @@ -66,6 +66,70 @@ test_that("test madc offtargets",{ }) +test_that("madc2vcf_all preserves comma-separated AD for biallelic targets", { + madc_file <- system.file("example_MADC_FixedAlleleID.csv", package="BIGr") + bot_file <- system.file("example_SNPs_DArTag-probe-design_f180bp.botloci", package="BIGr") + db_file <- system.file("example_allele_db.fa", package="BIGr") + temp <- tempfile(fileext = ".vcf") + + madc2vcf_all(madc = madc_file, + botloci_file = bot_file, + hap_seq_file = db_file, + n.cores = 1, + out_vcf = temp, + verbose = FALSE) + + vcf <- read.vcfR(temp, verbose = FALSE) + ad <- extract.gt(vcf, "AD") + biallelic <- !grepl(",", vcf@fix[, "ALT"]) + + expect_true(all(grepl("^[0-9]+,[0-9]+$", ad[biallelic, 1]))) +}) + +test_that("madc2vcf_all accepts BI_markerID matches when CloneID does not match", { + madc_file <- system.file("example_MADC_FixedAlleleID.csv", package="BIGr") + bot_file <- system.file("example_SNPs_DArTag-probe-design_f180bp.botloci", package="BIGr") + db_file <- system.file("example_allele_db.fa", package="BIGr") + temp <- tempfile(fileext = ".vcf") + temp_markers <- tempfile(fileext = ".csv") + + report <- read.csv(madc_file, check.names = FALSE) + marker_ids <- unique(report$CloneID) + markers_info <- data.frame( + CloneID = paste0("bogus_", seq_along(marker_ids)), + BI_markerID = marker_ids + ) + write.csv(markers_info, temp_markers, row.names = FALSE) + + expect_no_error( + madc2vcf_all(madc = madc_file, + botloci_file = bot_file, + hap_seq_file = db_file, + markers_info = temp_markers, + n.cores = 1, + out_vcf = temp, + verbose = FALSE) + ) +}) + +test_that("madc2vcf_all surfaces missing-column validation error without crashing", { + madc_file <- system.file("example_MADC_FixedAlleleID.csv", package="BIGr") + bot_file <- system.file("example_SNPs_DArTag-probe-design_f180bp.botloci", package="BIGr") + temp_madc <- tempfile(fileext = ".csv") + + report <- read.csv(madc_file, check.names = FALSE) + report$AlleleSequence <- NULL + write.csv(report, temp_madc, row.names = FALSE) + + expect_error( + madc2vcf_all(madc = temp_madc, + botloci_file = bot_file, + out_vcf = tempfile(fileext = ".vcf"), + verbose = FALSE), + "One or more required columns missing" + ) +}) + # ======================================================================= # Using Breeding-Insight/BIGapp-PanelHub test files # ======================================================================= @@ -220,7 +284,7 @@ test_that("simu alfalfa",{ markers_info = alfalfa_markers_info, out_vcf = out, verbose = FALSE), - regexp = "None of the markers_info CloneID values match the MADC CloneID column. Please make sure they use the same marker IDs." + regexp = "None of the markers_info CloneID( or BI_markerID)? values match the MADC CloneID column. Please make sure they use the same marker IDs." ) # Test error when markers_info_ChromPos is provided but IDs still don't match botloci @@ -528,4 +592,3 @@ test_that("simu alfalfa",{ ) }) }) - From 1ae386f95ca906ef294efad57aa522d3a8cee64a Mon Sep 17 00:00:00 2001 From: Alexander Sandercock <39815775+alex-sandercock@users.noreply.github.com> Date: Mon, 20 Apr 2026 11:13:41 -0400 Subject: [PATCH 13/19] improved truth check --- R/madc2vcf_multi.R | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/R/madc2vcf_multi.R b/R/madc2vcf_multi.R index 2ff7b0e..d22b8b4 100644 --- a/R/madc2vcf_multi.R +++ b/R/madc2vcf_multi.R @@ -81,7 +81,7 @@ madc2vcf_multi <- function(madc_file, checks <- check_madc_sanity(report) messages_results <- mapply(function(check, message) { - if (check) message[1] else message[2] + if (isTRUE(check)) message[1] else message[2] }, checks$checks, checks$messages) for (i in seq_along(messages_results)) @@ -193,4 +193,3 @@ madc2vcf_multi <- function(madc_file, invisible(NULL) } - From 49cb0a4843cfb486418204fed54198299fa817ca Mon Sep 17 00:00:00 2001 From: Alexander Sandercock <39815775+alex-sandercock@users.noreply.github.com> Date: Mon, 20 Apr 2026 11:14:58 -0400 Subject: [PATCH 14/19] support Marker_ID --- R/madc2vcf_targets.R | 27 ++++++++++---------- man/madc2vcf_targets.Rd | 15 +++++++---- tests/testthat/test-madc2vcf_targets.R | 35 ++++++++++++++++++++++++++ 3 files changed, 58 insertions(+), 19 deletions(-) diff --git a/R/madc2vcf_targets.R b/R/madc2vcf_targets.R index a16fdfb..d75a17f 100644 --- a/R/madc2vcf_targets.R +++ b/R/madc2vcf_targets.R @@ -67,14 +67,19 @@ #' @param botloci_file character or `NULL` (default `NULL`). Path to a plain-text #' file listing target IDs designed on the **bottom** strand (one ID per line). #' Used for strand-correcting probe sequences when `get_REF_ALT = TRUE` and -#' `markers_info` does not supply `Ref` and `Alt` columns. Also required when -#' `ChromPos` is invalid and `markers_info` does not provide `Ref`/`Alt`. +#' `markers_info` does not supply `Ref` and `Alt` columns. Not needed when +#' `markers_info` provides `Ref` and `Alt`, or when `get_REF_ALT = FALSE` and +#' `markers_info` provides `Chr` and `Pos`. Also required when `ChromPos` is +#' invalid and `markers_info` does not provide `Ref`/`Alt`. #' @param markers_info character or `NULL`. Optional path to a CSV providing target -#' metadata. Accepted columns: -#' - `CloneID`, `Marker_ID`, or `BI_markerID` (required as marker identifier); +#' metadata. Matching is done by column name, not column position. Accepted columns: +#' - one marker identifier column named `CloneID`, `Marker_ID`, or `BI_markerID` +#' (required; a generic `ID` column is not accepted); #' - `Chr`, `Pos` — required when `CloneID` does not follow the `Chr_Pos` format; #' - `Ref`, `Alt` — required when `get_REF_ALT = TRUE` and probe-sequence -#' inference is not possible (IUPAC codes, indels, or unfixed allele IDs). +#' inference is not possible (IUPAC codes, indels, or unfixed allele IDs). When +#' `get_REF_ALT = TRUE`, `botloci_file` is still required unless `Ref` and `Alt` +#' are supplied here. #' @param get_REF_ALT logical (default `FALSE`). If `TRUE`, attempts to recover #' REF/ALT bases. The source is chosen automatically: `markers_info` `Ref`/`Alt` #' columns take priority; otherwise probe sequences from the MADC are compared @@ -169,7 +174,7 @@ madc2vcf_targets <- function(madc_file, checks <- check_madc_sanity(report) messages_results <- mapply(function(check, message) { - if (check) message[1] else message[2] + if (isTRUE(check)) message[1] else message[2] }, checks$checks, checks$messages) for(i in seq_along(messages_results)) @@ -307,10 +312,7 @@ madc2vcf_targets <- function(madc_file, vmsg("Using markers_info for CHROM, POS, REF and ALT.", verbose = verbose, level = 0, type = ">>") if(is.null(mi_df)) mi_df <- read.csv(markers_info) - id_col <- if ("BI_markerID" %in% colnames(mi_df)) "BI_markerID" else - if ("CloneID" %in% colnames(mi_df)) "CloneID" else - if ("Marker_ID" %in% colnames(mi_df)) "Marker_ID" else - stop("The markers_info file must contain a marker ID column named either 'CloneID', 'Marker_ID' or 'BI_markerID'.") + id_col <- pick_markers_info_id_col(mi_df, rownames(ad_df)) if(checks$checks["Indels"]) vmsg("Indels detected in MADC file. But it is okay because Ref and Alt are provided in markers_info.", @@ -341,10 +343,7 @@ madc2vcf_targets <- function(madc_file, vmsg("markers_info file provided. Using CHROM and POS from the file.", verbose = verbose, level = 0, type = ">>") if(is.null(mi_df)) mi_df <- read.csv(markers_info) - id_col <- if ("BI_markerID" %in% colnames(mi_df)) "BI_markerID" else - if ("CloneID" %in% colnames(mi_df)) "CloneID" else - if ("Marker_ID" %in% colnames(mi_df)) "Marker_ID" else - stop("The markers_info file must contain a marker ID column named either 'CloneID', 'Marker_ID' or 'BI_markerID'.") + id_col <- pick_markers_info_id_col(mi_df, rownames(ad_df)) if(checks$checks["Indels"]) vmsg("Indels detected in MADC file. Since get_REF_ALT = FALSE, Type and Indel_pos are not required in markers_info.", diff --git a/man/madc2vcf_targets.Rd b/man/madc2vcf_targets.Rd index 25f99d5..8cab155 100644 --- a/man/madc2vcf_targets.Rd +++ b/man/madc2vcf_targets.Rd @@ -22,16 +22,21 @@ madc2vcf_targets( \item{botloci_file}{character or \code{NULL} (default \code{NULL}). Path to a plain-text file listing target IDs designed on the \strong{bottom} strand (one ID per line). Used for strand-correcting probe sequences when \code{get_REF_ALT = TRUE} and -\code{markers_info} does not supply \code{Ref} and \code{Alt} columns. Also required when -\code{ChromPos} is invalid and \code{markers_info} does not provide \code{Ref}/\code{Alt}.} +\code{markers_info} does not supply \code{Ref} and \code{Alt} columns. Not needed when +\code{markers_info} provides \code{Ref} and \code{Alt}, or when \code{get_REF_ALT = FALSE} and +\code{markers_info} provides \code{Chr} and \code{Pos}. Also required when \code{ChromPos} is +invalid and \code{markers_info} does not provide \code{Ref}/\code{Alt}.} \item{markers_info}{character or \code{NULL}. Optional path to a CSV providing target -metadata. Accepted columns: +metadata. Matching is done by column name, not column position. Accepted columns: \itemize{ -\item \code{CloneID}, \code{Marker_ID}, or \code{BI_markerID} (required as marker identifier); +\item one marker identifier column named \code{CloneID}, \code{Marker_ID}, or \code{BI_markerID} +(required; a generic \code{ID} column is not accepted); \item \code{Chr}, \code{Pos} — required when \code{CloneID} does not follow the \code{Chr_Pos} format; \item \code{Ref}, \code{Alt} — required when \code{get_REF_ALT = TRUE} and probe-sequence -inference is not possible (IUPAC codes, indels, or unfixed allele IDs). +inference is not possible (IUPAC codes, indels, or unfixed allele IDs). When +\code{get_REF_ALT = TRUE}, \code{botloci_file} is still required unless \code{Ref} and \code{Alt} +are supplied here. }} \item{get_REF_ALT}{logical (default \code{FALSE}). If \code{TRUE}, attempts to recover diff --git a/tests/testthat/test-madc2vcf_targets.R b/tests/testthat/test-madc2vcf_targets.R index b5e8a60..9ffc72e 100644 --- a/tests/testthat/test-madc2vcf_targets.R +++ b/tests/testthat/test-madc2vcf_targets.R @@ -87,6 +87,41 @@ test_that("bottom strand markers have correct REF/ALT", { rm(vcf_targets, temp_targets) }) +test_that("madc2vcf_targets preserves original sample names", { + madc_file <- system.file("example_MADC_FixedAlleleID.csv", package="BIGr") + temp_madc <- tempfile(fileext = ".csv") + temp_vcf <- tempfile(fileext = ".vcf") + + report <- read.csv(madc_file, check.names = FALSE) + colnames(report)[4:6] <- c("1A", "Sample-1", "sample 2") + write.csv(report, temp_madc, row.names = FALSE, quote = TRUE) + + suppressWarnings( + madc2vcf_targets(madc_file = temp_madc, output.file = temp_vcf, get_REF_ALT = FALSE) + ) + + vcf <- read.vcfR(temp_vcf, verbose = FALSE) + + expect_equal(colnames(vcf@gt)[2:4], c("1A", "Sample-1", "sample 2")) +}) + +test_that("madc2vcf_targets surfaces missing-column validation error without crashing", { + madc_file <- system.file("example_MADC_FixedAlleleID.csv", package="BIGr") + temp_madc <- tempfile(fileext = ".csv") + + report <- read.csv(madc_file, check.names = FALSE) + report$AlleleSequence <- NULL + write.csv(report, temp_madc, row.names = FALSE) + + expect_error( + madc2vcf_targets(madc_file = temp_madc, + output.file = tempfile(fileext = ".vcf"), + get_REF_ALT = FALSE, + verbose = FALSE), + "One or more required columns missing" + ) +}) + # ======================================================================= # Using Breeding-Insight/BIGapp-PanelHub test files From 22fc6e45d60e3cfb6c880037e40a0e74d995b120 Mon Sep 17 00:00:00 2001 From: Alexander Sandercock <39815775+alex-sandercock@users.noreply.github.com> Date: Mon, 20 Apr 2026 11:58:31 -0400 Subject: [PATCH 15/19] Update documentation for verbose message utility --- R/utils.R | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/R/utils.R b/R/utils.R index 59e5563..076a430 100644 --- a/R/utils.R +++ b/R/utils.R @@ -27,20 +27,21 @@ convert_to_dosage <- function(gt) { }) } -##' Verbose Message Utility -##' -##' Prints a formatted verbose message with timestamp, indentation, and type label, if verbose is TRUE. -##' -##' @param text Character string, the message to print (supports sprintf formatting). -##' @param verbose Logical. If TRUE, prints the message; if FALSE, suppresses output. -##' @param level Integer, indentation level (0=header, 1=main step, 2=detail, 3=sub-detail). -##' @param type Character string, message type (e.g., "INFO", "WARN", "ERROR"). Only shown for level 0. -##' @param ... Additional arguments passed to sprintf for formatting. -##' -##' @details Use the verbose argument to control message output. Typically, pass the function's verbose parameter to vmsg. -##' -##' @return No return value, called for side effects. -##' @export +#' Verbose Message Utility +#' +#' Prints a formatted verbose message with timestamp, indentation, and type label, if verbose is TRUE. +#' +#' @param text Character string, the message to print (supports sprintf formatting). +#' @param verbose Logical. If TRUE, prints the message; if FALSE, suppresses output. +#' @param level Integer, indentation level (0=header, 1=main step, 2=detail, 3=sub-detail). +#' @param type Character string, message type (e.g., "INFO", "WARN", "ERROR"). Only shown for level 0. +#' @param ... Additional arguments passed to sprintf for formatting. +#' +#' @details Use the verbose argument to control message output. Typically, pass the function's verbose parameter to vmsg. +#' +#' @return No return value, called for side effects. +#' @internal +#' @noRd vmsg <- function(text, verbose = FALSE, level = 1, type = ">>", ...) { if (!verbose) return(invisible()) # Format timestamp From 38dd6090d057bf10352f6dd250d10f9026002583 Mon Sep 17 00:00:00 2001 From: Alexander Sandercock <39815775+alex-sandercock@users.noreply.github.com> Date: Mon, 20 Apr 2026 12:00:24 -0400 Subject: [PATCH 16/19] fix exports --- NAMESPACE | 1 - R/utils.R | 3 +-- man/vmsg.Rd | 28 ---------------------------- 3 files changed, 1 insertion(+), 31 deletions(-) delete mode 100644 man/vmsg.Rd diff --git a/NAMESPACE b/NAMESPACE index e6cbc30..e9f2613 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -22,7 +22,6 @@ export(merge_MADCs) export(solve_composition_poly) export(thinSNP) export(updog2vcf) -export(vmsg) import(dplyr) import(janitor) import(parallel) diff --git a/R/utils.R b/R/utils.R index 076a430..a30c6f6 100644 --- a/R/utils.R +++ b/R/utils.R @@ -40,7 +40,6 @@ convert_to_dosage <- function(gt) { #' @details Use the verbose argument to control message output. Typically, pass the function's verbose parameter to vmsg. #' #' @return No return value, called for side effects. -#' @internal #' @noRd vmsg <- function(text, verbose = FALSE, level = 1, type = ">>", ...) { if (!verbose) return(invisible()) @@ -84,7 +83,7 @@ vmsg <- function(text, verbose = FALSE, level = 1, type = ">>", ...) { #' #' @keywords internal #' @noRd -#' +#' url_exists <- function(u) { tryCatch({ con <- url(u, open = "rb") diff --git a/man/vmsg.Rd b/man/vmsg.Rd deleted file mode 100644 index abcc768..0000000 --- a/man/vmsg.Rd +++ /dev/null @@ -1,28 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/utils.R -\name{vmsg} -\alias{vmsg} -\title{Verbose Message Utility} -\usage{ -vmsg(text, verbose = FALSE, level = 1, type = ">>", ...) -} -\arguments{ -\item{text}{Character string, the message to print (supports sprintf formatting).} - -\item{verbose}{Logical. If TRUE, prints the message; if FALSE, suppresses output.} - -\item{level}{Integer, indentation level (0=header, 1=main step, 2=detail, 3=sub-detail).} - -\item{type}{Character string, message type (e.g., "INFO", "WARN", "ERROR"). Only shown for level 0.} - -\item{...}{Additional arguments passed to sprintf for formatting.} -} -\value{ -No return value, called for side effects. -} -\description{ -Prints a formatted verbose message with timestamp, indentation, and type label, if verbose is TRUE. -} -\details{ -Use the verbose argument to control message output. Typically, pass the function's verbose parameter to vmsg. -} From 56336da56c8c346f014e9c0879bf17c98ef4331f Mon Sep 17 00:00:00 2001 From: Alexander Sandercock <39815775+alex-sandercock@users.noreply.github.com> Date: Mon, 20 Apr 2026 12:03:58 -0400 Subject: [PATCH 17/19] skipping if offline --- tests/testthat/test-check_madc_sanity.R | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/testthat/test-check_madc_sanity.R b/tests/testthat/test-check_madc_sanity.R index b1a3857..4b60cfe 100644 --- a/tests/testthat/test-check_madc_sanity.R +++ b/tests/testthat/test-check_madc_sanity.R @@ -1,4 +1,5 @@ test_that("check madc",{ + skip_if_offline("raw.githubusercontent.com") github_path <- "https://raw.githubusercontent.com/Breeding-Insight/BIGapp-PanelHub/refs/heads/long_seq/test_madcs/" names <- c("Columns", "FixAlleleIDs", "IUPACcodes", "LowerCase", "Indels", "ChromPos", "allNAcol", "allNArow", "RefAltSeqs", "OtherAlleles") From 089e8fdb42bcaf5b6999e107a9b078e2725cf6b4 Mon Sep 17 00:00:00 2001 From: Cristianetaniguti Date: Tue, 21 Apr 2026 09:26:34 -0400 Subject: [PATCH 18/19] madc2vcf_multi better function description --- R/madc2vcf_multi.R | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/R/madc2vcf_multi.R b/R/madc2vcf_multi.R index d22b8b4..e71cf0f 100644 --- a/R/madc2vcf_multi.R +++ b/R/madc2vcf_multi.R @@ -1,11 +1,9 @@ #' Convert MADC file to VCF using polyRAD for multiallelic genotyping #' -#' This function converts a DArTag MADC file to a VCF using the polyRAD package's -#' `readDArTag` and `RADdata2VCF` pipeline. It runs `check_madc_sanity` before -#' loading the data, applies corrections for lowercase sequences and all-NA -#' rows/columns, and sets `n.header.rows` automatically based on whether the -#' MADC file follows the raw DArT format (6 header rows) or the fixed allele ID -#' format (no header rows). +#' This function converts a DArTag fixed allele ID MADC file to a VCF +#' containing multiallelic markers based on the microhaplotypes using +#' the polyRAD package's `readDArTag`, `IterateHWE` population model +#' and `RADdata2VCF` pipeline. #' #' @param madc_file character. Path or URL to the input MADC CSV file. #' @param botloci_file character. Path or URL to the botloci file listing target From 8ee0b81e36339d9fc76b9ef1bd46647bdabc1d43 Mon Sep 17 00:00:00 2001 From: Cristianetaniguti Date: Tue, 21 Apr 2026 09:27:18 -0400 Subject: [PATCH 19/19] roxygenise --- man/madc2vcf_multi.Rd | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/man/madc2vcf_multi.Rd b/man/madc2vcf_multi.Rd index 70bc59d..b6a3324 100644 --- a/man/madc2vcf_multi.Rd +++ b/man/madc2vcf_multi.Rd @@ -35,12 +35,10 @@ Default is 2.} Invisible NULL. Writes a VCF file to \code{outfile}. } \description{ -This function converts a DArTag MADC file to a VCF using the polyRAD package's -\code{readDArTag} and \code{RADdata2VCF} pipeline. It runs \code{check_madc_sanity} before -loading the data, applies corrections for lowercase sequences and all-NA -rows/columns, and sets \code{n.header.rows} automatically based on whether the -MADC file follows the raw DArT format (6 header rows) or the fixed allele ID -format (no header rows). +This function converts a DArTag fixed allele ID MADC file to a VCF +containing multiallelic markers based on the microhaplotypes using +the polyRAD package's \code{readDArTag}, \code{IterateHWE} population model +and \code{RADdata2VCF} pipeline. } \details{ The function performs the following steps: