From 4ec471b7087f5fc5f235584b386a6769b7a239d8 Mon Sep 17 00:00:00 2001
From: Alexander Sandercock <39815775+alex-sandercock@users.noreply.github.com>
Date: Wed, 8 Apr 2026 08:40:59 -0400
Subject: [PATCH 01/19] Fix formatting of RefAltSeqs documentation

---
 R/check_madc_sanity.R | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/R/check_madc_sanity.R b/R/check_madc_sanity.R
index 2248779..fda01d5 100644
--- a/R/check_madc_sanity.R
+++ b/R/check_madc_sanity.R
@@ -13,7 +13,8 @@
 #'    (prefix matches `"chr"` case-insensitively, suffix is a positive integer);
 #' 7) **allNAcol** - at least one column contains only `NA` or empty values;
 #' 8) **allNArow** - at least one row contains only `NA` or empty values;
-#' 9) **RefAltSeqs** - every `CloneID` has at least one `Ref` and one `Alt` allele row.
+#' 9) **RefAltSeqs** - every `CloneID` has at least one `Ref` and one `Alt` allele row;
+#' 10) **OtherAlleles** - presence of alleles where the target locus differs from both the Ref and Alt in `AlleleSequence`.
 #'
 #' @param report A `data.frame` with at least the columns
 #'   `CloneID`, `AlleleID`, and `AlleleSequence`. The first column is also

From cb768808a97c550411a774e0cd3ca311597910a9 Mon Sep 17 00:00:00 2001
From: Alexander Sandercock <39815775+alex-sandercock@users.noreply.github.com>
Date: Wed, 8 Apr 2026 08:55:45 -0400
Subject: [PATCH 02/19] updated docs

---
 man/check_madc_sanity.Rd | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/man/check_madc_sanity.Rd b/man/check_madc_sanity.Rd
index 0398625..1d7eebb 100644
--- a/man/check_madc_sanity.Rd
+++ b/man/check_madc_sanity.Rd
@@ -50,7 +50,8 @@ or a \code{"-"} character is present in \code{AlleleSequence};
 (prefix matches \code{"chr"} case-insensitively, suffix is a positive integer);
 \item \strong{allNAcol} - at least one column contains only \code{NA} or empty values;
 \item \strong{allNArow} - at least one row contains only \code{NA} or empty values;
-\item \strong{RefAltSeqs} - every \code{CloneID} has at least one \code{Ref} and one \code{Alt} allele row.
+\item \strong{RefAltSeqs} - every \code{CloneID} has at least one \code{Ref} and one \code{Alt} allele row;
+\item \strong{OtherAlleles} - presence of alleles where the target locus differs from both the Ref and Alt in \code{AlleleSequence}.
 }
 }
 \details{

From ab944e8e02a4eb5ca6491208996aff052b2cc315 Mon Sep 17 00:00:00 2001
From: Alexander Sandercock <39815775+alex-sandercock@users.noreply.github.com>
Date: Fri, 17 Apr 2026 08:20:22 -0400
Subject: [PATCH 03/19] revert filterVCF

---
 R/filterVCF.R    | 321 +++++++----------------------------------------
 man/filterVCF.Rd |  17 +--
 2 files changed, 54 insertions(+), 284 deletions(-)

diff --git a/R/filterVCF.R b/R/filterVCF.R
index b9bca78..a54e32e 100644
--- a/R/filterVCF.R
+++ b/R/filterVCF.R
@@ -17,7 +17,6 @@
 #' @param filter.SAMPLE.miss Sample missing data filter
 #' @param filter.SNP.miss SNP missing data filter
 #' @param ploidy The ploidy of the species being analyzed
-#' @param quality.rates Logical. If TRUE, calculates and outputs CSV files with quality metrics for each marker and sample before filtering (mean depth, genotyping rate, observed heterozygosity).
 #' @param output.file output file name (optional). If no output.file name provided, then a vcfR object will be returned.
 #' @return A gzipped vcf file
 #' @importFrom vcfR read.vcfR
@@ -27,114 +26,42 @@
 #' @examples
 #' ## Use file paths for each file on the local system
 #'
+#' #Temp location (only for example)
+#' output_file <- tempfile()
 #'
-#' #filterVCF(vcf.file = "example_dart_Dosage_Report.csv",
-#'  #          filter.OD = 0.5,
-#'  #          ploidy = 2,
-#'  #          output.file = "name_for_vcf")
+#' filterVCF(vcf.file = system.file("iris_DArT_VCF.vcf.gz", package = "BIGr"),
+#'            filter.OD = 0.5,
+#'            filter.MAF = 0.05,
+#'            ploidy = 2,
+#'            output.file = output_file)
+#'
+#' # Remove the example's path variable (this does not delete the written file)
+#' rm(output_file)
 #'
 #' ##The function will output the filtered VCF to the current working directory
 #'
 #' @export
 filterVCF <- function(vcf.file,
-                      quality.rates = FALSE,
-                      filter.OD = NULL,
-                      filter.BIAS.min = NULL,
-                      filter.BIAS.max = NULL,
-                      filter.DP = NULL,
-                      filter.MPP = NULL,
-                      filter.PMC = NULL,
-                      filter.MAF = NULL,
-                      filter.SAMPLE.miss = NULL,
-                      filter.SNP.miss = NULL,
-                      ploidy,
-                      output.file = NULL) {
+                       filter.OD = NULL,
+                       filter.BIAS.min = NULL,
+                       filter.BIAS.max = NULL,
+                       filter.DP = NULL,
+                       filter.MPP = NULL,
+                       filter.PMC = NULL,
+                       filter.MAF = NULL,
+                       filter.SAMPLE.miss = NULL,
+                       filter.SNP.miss = NULL,
+                       ploidy,
+                       output.file = NULL) {
 
   #Should allow for any INFO field to be entered to be filtered
 
-
-
-  # Read VCF (can be .vcf or .vcf.gz)
-
+  # Import VCF (can be .vcf or .vcf.gz)
   if (!inherits(vcf.file, "vcfR")) {
-    vcf <- read.vcfR(vcf.file)
+    vcf <- read.vcfR(vcf.file, verbose = FALSE)
   } else {
     vcf <- vcf.file
-  }
-
-  # Keep original VCF for pre‑filter statistics
-  vcf_orig <- vcf
-
-
-  # pre‑filtering quality rates
-
-  if (quality.rates) {
-    ## Extract genotypes, depth and DP matrix
-    gt_orig <- extract.gt(vcf_orig, element = "GT", as.numeric = FALSE)
-
-    dfmt  <- strsplit(vcf_orig@gt[1, "FORMAT"], ":")[[1]]
-    if ("DP" %in% dfmt) {
-      dp_orig <- extract.gt(vcf_orig, element = "DP", as.numeric = TRUE)
-    } else {
-      dp_orig <- matrix(NA_real_,
-                        nrow = nrow(gt_orig), ncol = ncol(gt_orig),
-                        dimnames = dimnames(gt_orig))
-    }
-
-
-    # 1.  Observed heterozygosity (per‑marker & per‑sample)
-
-    # Helper: TRUE if a genotype is heterozygous (any two different
-    # alleles, excluding missing "./.")
-    is_het <- function(g) {
-      if (is.na(g) || g == "./.") return(FALSE)
-      alleles <- strsplit(g, split = "[/|]")[[1]]
-      return(length(unique(alleles)) > 1)
-    }
-    #matrix of heterozygous calls
-    het_mat  <- apply(gt_orig, c(1, 2), is_het)
-
-    #Observed heterozygosity per marker and per sample
-    obs_het_marker <- rowMeans(het_mat, na.rm = TRUE)
-    obs_het_sample <- colMeans(het_mat, na.rm = TRUE)
-
-
-    #Per‑marker stats
-
-    mean_depth_marker   <- rowMeans(dp_orig, na.rm = TRUE)
-    genotype_present    <- !is.na(gt_orig)
-    genotyping_rate_marker <- rowMeans(genotype_present)
-
-    markers_df <- data.frame(
-      marker = vcf_orig@fix[, "ID"],
-      mean_depth = round(mean_depth_marker, 2),
-      genotyping_rate = round(genotyping_rate_marker, 2),
-      obs_het = round(obs_het_marker, 2),
-      stringsAsFactors = FALSE
-    )
-
-
-    #Per‑sample stats
-
-    mean_depth_sample   <- colMeans(dp_orig, na.rm = TRUE)
-    genotyping_rate_sample <- colMeans(genotype_present)
-
-    samples_df <- data.frame(
-      sample = colnames(gt_orig),
-      mean_depth = round(mean_depth_sample, 2),
-      genotyping_rate = round(genotyping_rate_sample, 2),
-      obs_het = round(obs_het_sample, 2),
-      stringsAsFactors = FALSE
-    )
-
-
-    #Write CSV
-
-    base_name <- if (!is.null(output.file)) output.file else "pre_filter"
-    write.csv(markers_df, paste0(base_name, "_marker_stats.csv"),
-              row.names = FALSE, quote = FALSE)
-    write.csv(samples_df, paste0(base_name, "_sample_stats.csv"),
-              row.names = FALSE, quote = FALSE)
+    #rm(vcf.file)
   }
 
   #Update header based on user filtering parameters
@@ -175,7 +102,7 @@ filterVCF <- function(vcf.file,
 
   # Extract the DP values
   if ("DP" %in% format_fields && !is.null(filter.DP)) {
-    cat("Filtering by DP\n")
+    message("Filtering by DP\n")
     dp <- extract.gt(vcf, element = "DP", as.numeric = TRUE)
     # Identify cells to modify based on the DP threshold
     threshold <- as.numeric(filter.DP)
@@ -189,7 +116,7 @@ filterVCF <- function(vcf.file,
 
   #Filter if the MPP field is present
   if ("MPP" %in% format_fields && !is.null(filter.MPP)) {
-    cat("Filtering by MPP\n")
+    message("Filtering by MPP\n")
     # Extract the MPP values
     mpp <- extract.gt(vcf, element = "MPP", as.numeric = TRUE)
     # Identify cells to modify based on the DP threshold
@@ -229,13 +156,13 @@ filterVCF <- function(vcf.file,
   # Filtering by OD
   if ("OD" %in% info_ids && !is.null(filter.OD)) {
     info <- vcf@fix[, "INFO"] #Need to get after each filter..
-    cat("Filtering by OD\n")
+    message("Filtering by OD\n")
     od_values <- extract_info_value(info, "OD")
     # Ensure no NA values before filtering
     if (!all(is.na(od_values))) {
       vcf <- vcf[od_values < as.numeric(filter.OD), ]
     } else {
-      cat("No valid OD values found.\n")
+      warning("No valid OD values found.\n")
     }
   }
 
@@ -244,26 +171,26 @@ filterVCF <- function(vcf.file,
   # Filtering by BIAS
   if ("BIAS" %in% info_ids && !is.null(filter.BIAS.min) && !is.null(filter.BIAS.max)) {
     info <- vcf@fix[, "INFO"] #Need to get after each filter..
-    cat("Filtering by BIAS\n")
+    message("Filtering by BIAS\n")
     bias_values <- extract_info_value(info, "BIAS")
     # Ensure no NA values before filtering
     if (!all(is.na(bias_values))) {
       vcf <- vcf[bias_values > as.numeric(filter.BIAS.min) & bias_values < as.numeric(filter.BIAS.max), ]
     } else {
-      cat("No valid BIAS values found.\n")
+      warning("No valid BIAS values found.\n")
     }
   }
 
   # Filtering by PMC
   if ("PMC" %in% info_ids && !is.null(filter.PMC)) {
     info <- vcf@fix[, "INFO"] #Need to get after each filter..
-    cat("Filtering by PMC\n")
+    message("Filtering by PMC\n")
     pmc_values <- extract_info_value(info, "PMC")
     # Ensure no NA values before filtering
     if (!all(is.na(pmc_values))) {
       vcf <- vcf[pmc_values < as.numeric(filter.PMC), ]
     } else {
-      cat("No valid PMC values found.\n")
+      warning("No valid PMC values found.\n")
     }
   }
 
@@ -273,14 +200,14 @@ filterVCF <- function(vcf.file,
     gt_matrix <- extract.gt(vcf, element = "GT", as.numeric = FALSE)#as.matrix(vcfR2genlight(vcf))
 
     if (!is.null(filter.SNP.miss)) {
-      cat("Filtering by SNP missing data\n")
+      message("Filtering by SNP missing data\n")
       snp_missing_data <- rowMeans(is.na(gt_matrix))
       vcf <- vcf[snp_missing_data < as.numeric(filter.SNP.miss), ]
       gt_matrix <- extract.gt(vcf, element = "GT", as.numeric = FALSE)
     }
 
     if (!is.null(filter.SAMPLE.miss)) {
-      cat("Filtering by Sample missing data\n")
+      message("Filtering by Sample missing data\n")
       # Calculate the proportion of missing data for each sample
       sample_missing_data <- colMeans(is.na(gt_matrix))
       # Identify samples to keep based on the missing data threshold
@@ -295,112 +222,30 @@ filterVCF <- function(vcf.file,
     rm(gt_matrix)
   }
 
-  ##Convert GT to dosage
-  #gt_matrix <- extract.gt(vcf, element = "GT", as.numeric = FALSE)#as.matrix(vcfR2genlight(vcf))
-
-  # Function to determine the ploidy level from a genotype string
-  #determine_ploidy <- function(gt) {
-  #  if (is.na(gt)) {
-  #    return(NA)
-  #  }
-  #  return(length(strsplit(gt, "[|/]")[[1]]))
-  #}
-
-  # Function to find a non-NA example genotype to determine ploidy
-  #find_example_gt <- function(matrix) {
-  #  for (i in seq_len(nrow(matrix))) {
-  #    for (j in seq_len(ncol(matrix))) {
-  #      if (!is.na(matrix[i, j])) {
-  #        return(matrix[i, j])
-  #      }
-  #    }
-  #  }
-  #  return(NA)  # Return NA if no non-NA genotype is found
-  #}
-
-  # Find a non-NA example genotype
-  #example_gt <- find_example_gt(gt_matrix)
-
-  # Determine the ploidy level
-  #if (!is.na(example_gt)) {
-  #  ploidy <- determine_ploidy(example_gt)
-  #} else {
-  #  stop("No non-NA genotype found to determine ploidy.")
-  #}
-
-  # Generate lookup table for genotypes to dosage conversion
-  #generate_lookup_table <- function(ploidy) {
-  #  possible_alleles <- 0:ploidy
-  #  genotypes <- expand.grid(rep(list(possible_alleles), ploidy))
-  #  genotypes <- apply(genotypes, 1, function(x) paste(x, collapse = "/"))
-  #  dosage_values <- rowSums(expand.grid(rep(list(possible_alleles), ploidy)))
-  #  lookup_table <- setNames(dosage_values, genotypes)
-  #  return(lookup_table)
-  #}
-
-  # Generate the lookup table
-  #lookup_table <- generate_lookup_table(ploidy)
-
-  # Function to convert genotype to dosage using the lookup table
-  #genotype_to_dosage <- function(gt, lookup_table) {
-  #  if (is.na(gt)) {
-  #    return(NA)
-  #  }
-  #  return(lookup_table[[gt]])
-  #}
-
-  # Function to convert genotype matrix to dosage matrix using vectorized operations
-  #convert_genotypes_to_dosage <- function(gt_matrix, lookup_table) {
-  #  unique_gts <- unique(gt_matrix)
-  #  gt_to_dosage <- setNames(rep(NA, length(unique_gts)), unique_gts)
-  #  valid_gts <- unique_gts[unique_gts %in% names(lookup_table)]
-  #  gt_to_dosage[valid_gts] <- lookup_table[valid_gts]
-  #  dosage_matrix <- gt_to_dosage[gt_matrix]
-  #colnames(dosage_matrix) <- colnames(gt_matrix)
-  #row.names(dosage_matrix) <- row.names(gt_matrix)
-  #  return(matrix(as.numeric(dosage_matrix), nrow = nrow(gt_matrix), ncol = ncol(gt_matrix)))
-  #}
-
-  # Convert the genotype matrix to dosage matrix
-  #dosage_matrix <- convert_genotypes_to_dosage(gt_matrix, lookup_table)
-
   ##MAF filter
-  #Compare my lengthy process to estimate MAF with vcfR::maf() function
-  #The BIGr::calculate_MAF(dosage_matrix, ploidy) is the exact same as the vcfR::maf() calculations
-  #The step where I extract UD and calculate MAF is different...
-  #if ("UD" %in% format_fields) {
-  #  maf_df <- BIGr::calculate_MAF(extract.gt(vcf, element = "UD", as.numeric = TRUE), ploidy = ploidy)
-  #} else {
-  #convert genotypes to dosage and filter
-  #  maf_df <- BIGr::calculate_MAF(dosage_matrix, ploidy)
-  #}
-  #Need to confirm that vcfR::maf will work with any ploidy...if not, use my code
   if (!is.null(filter.MAF)) {
-    cat("Filtering by MAF\n")
+    message("Filtering by MAF\n")
     maf_df <- data.frame(vcfR::maf(vcf, element = 2))
     vcf <- vcf[maf_df$Frequency > as.numeric(filter.MAF), ]
   }
   ### Export the modified VCF file (this exports as a .vcf.gz, so make sure to have the name end in .vcf.gz)
-  cat("Exporting VCF\n")
-  if (!inherits(vcf.file, "vcfR")){
-    if (!is.null(output.file)){
-      output_name <- paste0(output.file,".vcf.gz")
+  message("Exporting VCF\n")
+  if (!inherits(vcf.file, "vcfR")) {
+    if (!is.null(output.file)) {
+      output_name <- paste0(output.file, ".vcf.gz")
       vcfR::write.vcf(vcf, file = output_name)
-    }else{
+    } else {
       return(vcf)
     }
-  }else{
-    if (!is.null(output.file)){
-      output_name <- paste0(output.file,"_filtered.vcf.gz")
+  } else {
+    if (!is.null(output.file)) {
+      output_name <- paste0(output.file, "_filtered.vcf.gz")
       vcfR::write.vcf(vcf, file = output_name)
-    }else{
+    } else {
       return(vcf)
     }
   }
 
-  #Message that includes the output vcf stats
-  print(vcf)
-
   #Message
   samples_removed <- starting_samples - (ncol(vcf@gt)-1)
   SNPs_removed <- starting_snps - nrow(vcf)
@@ -408,81 +253,3 @@ filterVCF <- function(vcf.file,
   message("SNPs removed due to filtering: ",SNPs_removed)
   message("Complete!")
 }
-#This is not reliable, so no longer use this shortcut to get dosage matrix
-#test2 <- vcfR2genlight(vcf)
-
-
-#####Testing custom VCF reading function######
-# Open the gzipped VCF file
-#con <- gzfile("/Users/ams866/Desktop/output.vcf", "rt")
-
-# Read in the entire file
-#lines <- readLines(con)
-#close(con)
-# Read in the entire file
-#lines <- readLines("/Users/ams866/Desktop/output.vcf")
-# Filter out lines that start with ##
-#filtered_lines <- lines[!grepl("^##", lines)]
-# Create a temporary file to write the filtered lines
-#temp_file <- tempfile()
-#writeLines(filtered_lines, temp_file)
-# Read in the filtered data using read.table or read.csv
-#vcf_data <- read.table(temp_file, header = TRUE, sep = "\t", comment.char = "", check.names = FALSE)
-# Clean up the temporary file
-#unlink(temp_file)
-
-##Extract INFO column and Filter SNPs by those values
-#Update the filtering options by the items present in the INFO column?
-
-# Load required library
-#library(dplyr)
-
-# Split INFO column into key-value pairs
-#vcf_data_parsed <- vcf_data %>%
-#  mutate(INFO_PARSED = strsplit(INFO, ";")) %>%
-#  unnest(INFO_PARSED) %>%
-#  separate(INFO_PARSED, into = c("KEY", "VALUE"), sep = "=") %>%
-#  spread(KEY, VALUE)
-
-#Filter by DP
-#filtered_vcf_data <- vcf_data_parsed %>%
-#  filter(as.numeric(DP) > 10)
-
-# View the filtered dataframe
-#print(filtered_vcf_data)
-
-##Extracting and filtering by FORMAT column
-# Identify the columns that are not sample columns
-#non_sample_cols <- c("#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO", "FORMAT")
-# Identify the sample columns
-#sample_cols <- setdiff(names(vcf_data), non_sample_cols)
-# Extract FORMAT keys
-#format_keys <- strsplit(as.character(vcf_data$FORMAT[1]), ":")[[1]]
-# Split SAMPLE columns based on FORMAT
-#vcf_data_samples <- vcf_data %>%
-#  mutate(across(all_of(sample_cols), ~strsplit(as.character(.), ":"))) %>%
-#  mutate(across(all_of(sample_cols), ~map(., ~setNames(as.list(.), format_keys)))) %>%
-#  unnest_wider(all_of(sample_cols), names_sep = "_")
-
-# View the parsed dataframe
-#print(head(vcf_data_samples))
-
-# Create separate dataframes for each FORMAT variable
-#format_dfs <- lapply(format_keys, function(format_key) {
-#  vcf_data_samples %>%
-#    select(ID, ends_with(paste0("_", format_key))) %>%
-#    column_to_rownames("ID")
-#})
-
-# Assign names to the list elements
-#names(format_dfs) <- format_keys
-
-# Access the separate dataframes
-#gt_df <- format_dfs$GT  # Genotype dataframe
-#ad_df <- format_dfs$AD  # Allelic depths dataframe
-
-#*I think the above method is okay if you only need to filter at the INFO level,
-#*But I think if you want to filter for FORMAT, that vcfR is probably best,
-#*Will need to explore further if I can easily just filter for MPP by checking if it is above a
-#*threshold, and then converting the GT and UD values to NA if so...
-#*If that is efficient and works, then I will just use this custom VCF method...
diff --git a/man/filterVCF.Rd b/man/filterVCF.Rd
index 0342fe1..676ef7f 100644
--- a/man/filterVCF.Rd
+++ b/man/filterVCF.Rd
@@ -6,7 +6,6 @@
 \usage{
 filterVCF(
   vcf.file,
-  quality.rates = FALSE,
   filter.OD = NULL,
   filter.BIAS.min = NULL,
   filter.BIAS.max = NULL,
@@ -23,8 +22,6 @@ filterVCF(
 \arguments{
 \item{vcf.file}{vcfR object or path to VCF file. Can be unzipped (.vcf) or gzipped (.vcf.gz).}
 
-\item{quality.rates}{Logical. If TRUE, calculates and outputs CSV files with quality metrics for each marker and sample before filtering (mean depth, genotyping rate, observed heterozygosity).}
-
 \item{filter.OD}{Updog filter}
 
 \item{filter.BIAS.min}{Updog filter (requires a value for both BIAS.min and BIAS.max)}
@@ -61,11 +58,17 @@ The VCF format is v4.3
 \examples{
 ## Use file paths for each file on the local system
 
+#Temp location (only for example)
+output_file <- tempfile()
+
+filterVCF(vcf.file = system.file("iris_DArT_VCF.vcf.gz", package = "BIGr"),
+           filter.OD = 0.5,
+           filter.MAF = 0.05,
+           ploidy = 2,
+           output.file = output_file)
 
-#filterVCF(vcf.file = "example_dart_Dosage_Report.csv",
- #          filter.OD = 0.5,
- #          ploidy = 2,
- #          output.file = "name_for_vcf")
+# Remove the example's path variable (this does not delete the written file)
+rm(output_file)
 
 ##The function will output the filtered VCF to the current working directory
 

From 4ac0c5526fc7baec013902d717717b2ab16baa96 Mon Sep 17 00:00:00 2001
From: Alexander Sandercock <39815775+alex-sandercock@users.noreply.github.com>
Date: Fri, 17 Apr 2026 08:20:52 -0400
Subject: [PATCH 04/19] support LUT Marker_ID

---
 R/madc2vcf_targets.R    | 12 +++++++-----
 man/madc2vcf_targets.Rd |  2 +-
 2 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/R/madc2vcf_targets.R b/R/madc2vcf_targets.R
index 888c445..cc72e3e 100644
--- a/R/madc2vcf_targets.R
+++ b/R/madc2vcf_targets.R
@@ -71,7 +71,7 @@
 #'   `ChromPos` is invalid and `markers_info` does not provide `Ref`/`Alt`.
 #' @param markers_info character or `NULL`. Optional path to a CSV providing target
 #'   metadata. Accepted columns:
-#'   - `CloneID` or `BI_markerID` (required as marker identifier);
+#'   - `CloneID`, `Marker_ID`, or `BI_markerID` (required as marker identifier);
 #'   - `Chr`, `Pos` — required when `CloneID` does not follow the `Chr_Pos` format;
 #'   - `Ref`, `Alt` — required when `get_REF_ALT = TRUE` and probe-sequence
 #'     inference is not possible (IUPAC codes, indels, or unfixed allele IDs).
@@ -237,7 +237,7 @@ madc2vcf_targets <- function(madc_file,
   if(!isTRUE(checks$checks["ChromPos"])) {
     if(is.null(markers_info)){
       stop("CloneID column does not follow the 'Chr_Pos'. ",
-           "Please provide a markers_info file with at least 'CloneID'/'BI_markerID', ",
+           "Please provide a markers_info file with at least 'CloneID'/'Marker_ID'/'BI_markerID', ",
            "'Chr', and 'Pos' columns.")
     } else {
 
@@ -309,7 +309,8 @@ madc2vcf_targets <- function(madc_file,
     if(is.null(mi_df)) mi_df <- read.csv(markers_info)
     id_col <- if ("BI_markerID" %in% colnames(mi_df)) "BI_markerID" else
               if ("CloneID"     %in% colnames(mi_df)) "CloneID"     else
-      stop("The markers_info file must contain a marker ID column named either 'CloneID' or 'BI_markerID'.")
+              if ("Marker_ID"     %in% colnames(mi_df)) "Marker_ID"     else
+      stop("The markers_info file must contain a marker ID column named either 'CloneID', 'Marker_ID' or 'BI_markerID'.")
 
     if(checks$checks["Indels"])
       vmsg("Indels detected in MADC file. But it is okay because Ref and Alt are provided in markers_info.",
@@ -321,7 +322,7 @@ madc2vcf_targets <- function(madc_file,
 
     if(!all(rownames(ad_df) %in% mi_df[[id_col]])) {
       miss_CloneIDs <- rownames(ad_df)[!rownames(ad_df) %in% mi_df[[id_col]]]
-      if(length(miss_CloneIDs) == nrow(ad_df)) stop("None of the MADC CloneID could be found in the markers_info CloneID or BI_markerID. Please make sure they match.")
+      if(length(miss_CloneIDs) == nrow(ad_df)) stop("None of the MADC CloneID could be found in the markers_info CloneID, Marker_ID or BI_markerID. Please make sure they match.")
       vmsg(paste("Not all MADC CloneID was found in the markers_info file. These markers will be removed:",
                  paste(miss_CloneIDs, collapse = " ")), verbose = verbose, level = 2, type = ">>")
       warning("Not all MADC CloneID was found in the markers_info file. These markers will be removed.")
@@ -342,7 +343,8 @@ madc2vcf_targets <- function(madc_file,
     if(is.null(mi_df)) mi_df <- read.csv(markers_info)
     id_col <- if ("BI_markerID" %in% colnames(mi_df)) "BI_markerID" else
               if ("CloneID"     %in% colnames(mi_df)) "CloneID"     else
-      stop("The markers_info file must contain a marker ID column named either 'CloneID' or 'BI_markerID'.")
+              if ("Marker_ID"     %in% colnames(mi_df)) "Marker_ID"     else
+      stop("The markers_info file must contain a marker ID column named either 'CloneID', 'Marker_ID' or 'BI_markerID'.")
 
     if(checks$checks["Indels"])
       vmsg("Indels detected in MADC file. Since get_REF_ALT = FALSE, Type and Indel_pos are not required in markers_info.",
diff --git a/man/madc2vcf_targets.Rd b/man/madc2vcf_targets.Rd
index 30363a6..25f99d5 100644
--- a/man/madc2vcf_targets.Rd
+++ b/man/madc2vcf_targets.Rd
@@ -28,7 +28,7 @@ Used for strand-correcting probe sequences when \code{get_REF_ALT = TRUE} and
 \item{markers_info}{character or \code{NULL}. Optional path to a CSV providing target
 metadata. Accepted columns:
 \itemize{
-\item \code{CloneID} or \code{BI_markerID} (required as marker identifier);
+\item \code{CloneID}, \code{Marker_ID}, or \code{BI_markerID} (required as marker identifier);
 \item \code{Chr}, \code{Pos} — required when \code{CloneID} does not follow the \code{Chr_Pos} format;
 \item \code{Ref}, \code{Alt} — required when \code{get_REF_ALT = TRUE} and probe-sequence
 inference is not possible (IUPAC codes, indels, or unfixed allele IDs).

From 5234572eb39849f68a20d92c68d9040356a82556 Mon Sep 17 00:00:00 2001
From: Alexander Sandercock <39815775+alex-sandercock@users.noreply.github.com>
Date: Fri, 17 Apr 2026 09:33:10 -0400
Subject: [PATCH 05/19] Apply suggestion from @Copilot

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
---
 R/madc2vcf_multi.R | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/R/madc2vcf_multi.R b/R/madc2vcf_multi.R
index bcbae02..2ff7b0e 100644
--- a/R/madc2vcf_multi.R
+++ b/R/madc2vcf_multi.R
@@ -166,6 +166,9 @@ madc2vcf_multi <- function(madc_file,
 
   vmsg("Loading MADC into polyRAD", verbose = verbose, level = 0, type = ">>")
 
+  if (!requireNamespace("polyRAD", quietly = TRUE)) {
+    stop("Package 'polyRAD' is required for madc2vcf_multi(). Please install it with install.packages('polyRAD').", call. = FALSE)
+  }
   raddat <- polyRAD::readDArTag(
     file              = input_file,
     botloci           = botloci_input,

From 8c9dcda701ef63b4c3620979ee7205fbd4308751 Mon Sep 17 00:00:00 2001
From: Alexander Sandercock <39815775+alex-sandercock@users.noreply.github.com>
Date: Mon, 20 Apr 2026 08:46:33 -0400
Subject: [PATCH 06/19] Apply suggestion from @Copilot

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
---
 R/madc2vcf_targets.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/R/madc2vcf_targets.R b/R/madc2vcf_targets.R
index cc72e3e..a16fdfb 100644
--- a/R/madc2vcf_targets.R
+++ b/R/madc2vcf_targets.R
@@ -165,7 +165,7 @@ madc2vcf_targets <- function(madc_file,
                       "verbose= ", verbose,')">')
 
   # MADC checks
-  report <- read.csv(madc_file)
+  report <- read.csv(madc_file, check.names = FALSE)
   checks <- check_madc_sanity(report)
 
   messages_results <- mapply(function(check, message) {

From b4d5534d7fde36a053be8245e6905548e3b92301 Mon Sep 17 00:00:00 2001
From: Alexander Sandercock <39815775+alex-sandercock@users.noreply.github.com>
Date: Fri, 17 Apr 2026 08:28:04 -0400
Subject: [PATCH 07/19] covered error case

---
 R/get_countsMADC.R | 1 +
 1 file changed, 1 insertion(+)

diff --git a/R/get_countsMADC.R b/R/get_countsMADC.R
index 3a9bc2b..c1045ec 100644
--- a/R/get_countsMADC.R
+++ b/R/get_countsMADC.R
@@ -54,6 +54,7 @@ get_countsMADC <- function(madc_file = NULL, madc_object = NULL, collapse_matche
 
   # Add check inputs
   if(is.null(madc_file) && is.null(madc_object)) stop("Please provide either madc_file or madc_object.")
+  if(!is.null(madc_file) && !is.null(madc_object)) stop("Please provide either madc_file or madc_object. Not both.")
   if(!is.null(madc_file) && !file.exists(madc_file)) stop("MADC file not found. Please provide a valid path.")
   if(!is.null(madc_object) && !is.data.frame(madc_object)) stop("madc_object must be a data frame.")
 

From 57bbc89c93cb677bc614c4389ffc50df581465a8 Mon Sep 17 00:00:00 2001
From: Alexander Sandercock <39815775+alex-sandercock@users.noreply.github.com>
Date: Fri, 17 Apr 2026 09:29:30 -0400
Subject: [PATCH 08/19] add example and suggest ggplot

---
 DESCRIPTION                   |  4 +--
 NAMESPACE                     |  1 -
 R/imputation_concordance.R    | 46 +++++++++++++++++++++++------------
 man/imputation_concordance.Rd | 23 ++++++++++++++----
 4 files changed, 51 insertions(+), 23 deletions(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index 48ede7c..672faac 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -62,10 +62,10 @@ Imports:
     janitor,
     quadprog,
     tibble,
-    stringr,
-    ggplot2
+    stringr
 Suggests: 
     covr,
+    ggplot2,
     spelling,
     rmdformats,
     knitr (>= 1.10),
diff --git a/NAMESPACE b/NAMESPACE
index ae09080..e6cbc30 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -24,7 +24,6 @@ export(thinSNP)
 export(updog2vcf)
 export(vmsg)
 import(dplyr)
-import(ggplot2)
 import(janitor)
 import(parallel)
 import(quadprog)
diff --git a/R/imputation_concordance.R b/R/imputation_concordance.R
index 1eb441a..918071a 100644
--- a/R/imputation_concordance.R
+++ b/R/imputation_concordance.R
@@ -47,19 +47,31 @@
 #' is generated using \pkg{ggplot2}.
 #'
 #' @import dplyr
-#' @import ggplot2
 #'
 #' @examples
-#' \dontrun{
+#' ref <- data.frame(
+#'   ID = c("S1", "S2", "S3"),
+#'   SNP1 = c(0, 1, 2),
+#'   SNP2 = c(1, 1, 0),
+#'   SNP3 = c(2, 5, 1)
+#' )
+#'
+#' test <- data.frame(
+#'   ID = c("S1", "S2", "S3"),
+#'   SNP1 = c(0, 0, 2),
+#'   SNP2 = c(1, 1, 1),
+#'   SNP3 = c(2, 5, 0)
+#' )
+#'
 #' result <- imputation_concordance(
 #'   reference_genos = ref,
 #'   imputed_genos = test,
-#'   snps_2_exclude = snps,
+#'   snps_2_exclude = "SNP2",
 #'   missing_code = 5,
-#'   verbose = TRUE,
-#'   plot = TRUE
+#'   print_result = FALSE
 #' )
-#' }
+#'
+#' result
 #'
 #' @importFrom stats reorder
 #' @export
@@ -136,21 +148,25 @@ imputation_concordance <- function(reference_genos,
 
   # Optional plot
   if (plot) {
+    if (!requireNamespace("ggplot2", quietly = TRUE)) {
+      stop("Package 'ggplot2' is required when plot = TRUE.", call. = FALSE)
+    }
 
     plot_df <- data.frame(
       ID = imputed_genos$ID,
       Concordance = percentage_match * 100
     )
 
-    concordance_plot <- ggplot(plot_df,
-                               aes(x = reorder(ID, Concordance),
-                                   y = Concordance)) +
-      geom_bar(stat = "identity") +
-      labs(title = "Imputation Concordance by Sample",
-           x = "Sample ID",
-           y = "Concordance (%)") +
-      theme_minimal() +
-      theme(axis.text.x = element_text(angle = 90, hjust = 1))
+    concordance_plot <- ggplot2::ggplot(
+      plot_df,
+      ggplot2::aes(x = reorder(ID, Concordance), y = Concordance)
+    ) +
+      ggplot2::geom_bar(stat = "identity") +
+      ggplot2::labs(title = "Imputation Concordance by Sample",
+                    x = "Sample ID",
+                    y = "Concordance (%)") +
+      ggplot2::theme_minimal() +
+      ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 90, hjust = 1))
 
     print(concordance_plot)
   }
diff --git a/man/imputation_concordance.Rd b/man/imputation_concordance.Rd
index 31f54a8..22e9462 100644
--- a/man/imputation_concordance.Rd
+++ b/man/imputation_concordance.Rd
@@ -64,15 +64,28 @@ When \code{plot = TRUE}, a bar plot showing concordance percentage per sample
 is generated using \pkg{ggplot2}.
 }
 \examples{
-\dontrun{
+ref <- data.frame(
+  ID = c("S1", "S2", "S3"),
+  SNP1 = c(0, 1, 2),
+  SNP2 = c(1, 1, 0),
+  SNP3 = c(2, 5, 1)
+)
+
+test <- data.frame(
+  ID = c("S1", "S2", "S3"),
+  SNP1 = c(0, 0, 2),
+  SNP2 = c(1, 1, 1),
+  SNP3 = c(2, 5, 0)
+)
+
 result <- imputation_concordance(
   reference_genos = ref,
   imputed_genos = test,
-  snps_2_exclude = snps,
+  snps_2_exclude = "SNP2",
   missing_code = 5,
-  verbose = TRUE,
-  plot = TRUE
+  print_result = FALSE
 )
-}
+
+result
 
 }

From 4bb67fd1a89cebd73b90a6d259cb3c625c01b817 Mon Sep 17 00:00:00 2001
From: Alexander Sandercock <39815775+alex-sandercock@users.noreply.github.com>
Date: Fri, 17 Apr 2026 09:29:45 -0400
Subject: [PATCH 09/19] make get_counts internal

---
 R/get_countsMADC.R    |  3 ++-
 man/get_counts.Rd     | 60 -------------------------------------------
 man/get_countsMADC.Rd |  2 +-
 3 files changed, 3 insertions(+), 62 deletions(-)
 delete mode 100644 man/get_counts.Rd

diff --git a/R/get_countsMADC.R b/R/get_countsMADC.R
index c1045ec..57b83ce 100644
--- a/R/get_countsMADC.R
+++ b/R/get_countsMADC.R
@@ -46,7 +46,7 @@
 #'
 #' rm(counts_matrices)
 #'
-#' @seealso [get_counts()], [check_madc_sanity()]
+#' @seealso [check_madc_sanity()]
 #'
 #' @import dplyr
 #' @export
@@ -182,6 +182,7 @@ get_countsMADC <- function(madc_file = NULL, madc_object = NULL, collapse_matche
 #' @importFrom dplyr %>% filter case_when
 #'
 #' @keywords internal
+#' @noRd
 get_counts <- function(madc_file = NULL, madc_object = NULL, collapse_matches_counts = FALSE, verbose = TRUE) {
 
   # Add check inputs
diff --git a/man/get_counts.Rd b/man/get_counts.Rd
deleted file mode 100644
index 1879e07..0000000
--- a/man/get_counts.Rd
+++ /dev/null
@@ -1,60 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/get_countsMADC.R
-\name{get_counts}
-\alias{get_counts}
-\title{Read and Pre-process a MADC File}
-\usage{
-get_counts(
-  madc_file = NULL,
-  madc_object = NULL,
-  collapse_matches_counts = FALSE,
-  verbose = TRUE
-)
-}
-\arguments{
-\item{madc_file}{character or \code{NULL}. Path to the input MADC CSV file.
-At least one of \code{madc_file} or \code{madc_object} must be provided.}
-
-\item{madc_object}{data frame or \code{NULL}. A pre-read MADC data frame
-(e.g., from \code{check_botloci()}). When supplied, file reading is skipped.
-At least one of \code{madc_file} or \code{madc_object} must be provided.}
-
-\item{collapse_matches_counts}{logical. If \code{TRUE}, counts for \verb{|AltMatch}
-and \verb{|RefMatch} rows are summed into their corresponding \verb{|Ref} and \verb{|Alt}
-rows. If \code{FALSE} (default), those rows are discarded.}
-
-\item{verbose}{logical. Whether to print progress messages. Default is \code{TRUE}.}
-}
-\value{
-A data frame with one row per \code{Ref} or \code{Alt} allele entry, retaining
-all original columns (\code{AlleleID}, \code{CloneID}, \code{AlleleSequence}, sample
-count columns, etc.).
-}
-\description{
-Reads a DArTag MADC CSV file (or accepts a pre-read data frame), detects the
-file format, and returns a filtered data frame containing only \code{Ref} and \code{Alt}
-haplotype rows ready for count-matrix construction.
-}
-\details{
-\strong{Input}: either \code{madc_file} (path to CSV) or \code{madc_object} (pre-read data
-frame) must be supplied; at least one is required.
-
-\strong{Format detection} (applied to file or object alike): the first seven rows
-of the first column are inspected:
-\itemize{
-\item \strong{Standard format}: all entries are blank or \code{"*"} — the first 7 rows are
-treated as DArT placeholder rows and skipped.
-\item \strong{Fixed-allele-ID format}: no filler rows — data are used as-is.
-}
-
-\strong{\verb{|AltMatch} / \verb{|RefMatch} handling} (controlled by \code{collapse_matches_counts}):
-\itemize{
-\item \code{FALSE} (default): these rows are simply discarded.
-\item \code{TRUE}: their counts are summed into the corresponding \verb{|Ref} or \verb{|Alt}
-row for the same \code{CloneID}.
-}
-
-In all cases, trailing suffixes on \code{AlleleID} (e.g., \verb{|Ref_001}, \verb{|Alt_002})
-are stripped to the canonical \verb{|Ref} / \verb{|Alt} form.
-}
-\keyword{internal}
diff --git a/man/get_countsMADC.Rd b/man/get_countsMADC.Rd
index 28fca1e..207b899 100644
--- a/man/get_countsMADC.Rd
+++ b/man/get_countsMADC.Rd
@@ -67,5 +67,5 @@ rm(counts_matrices)
 
 }
 \seealso{
-\code{\link[=get_counts]{get_counts()}}, \code{\link[=check_madc_sanity]{check_madc_sanity()}}
+\code{\link[=check_madc_sanity]{check_madc_sanity()}}
 }

From ff1ef84d3a1ba905736c9d321527b70ca675e3a2 Mon Sep 17 00:00:00 2001
From: Alexander Sandercock <39815775+alex-sandercock@users.noreply.github.com>
Date: Fri, 17 Apr 2026 09:29:51 -0400
Subject: [PATCH 10/19] update test

---
 tests/testthat/test-madc2vcf_targets.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/testthat/test-madc2vcf_targets.R b/tests/testthat/test-madc2vcf_targets.R
index a64da34..b5e8a60 100644
--- a/tests/testthat/test-madc2vcf_targets.R
+++ b/tests/testthat/test-madc2vcf_targets.R
@@ -218,7 +218,7 @@ test_that("simu alfalfa",{
                        botloci_file = alfalfa_botloci,
                        markers_info = alfalfa_markers_info,
                        verbose = FALSE),
-      "None of the MADC CloneID could be found in the markers_info CloneID or BI_markerID. Please make sure they match."
+      "None of the MADC CloneID could be found in the markers_info CloneID, Marker_ID or BI_markerID. Please make sure they match."
     )
 
     # Test that it works when the function can find a matching ID in markers_info to fix the botloci mismatch issue

From 86b4fef4e24b827595036749cad51b2a9e645f27 Mon Sep 17 00:00:00 2001
From: Alexander Sandercock <39815775+alex-sandercock@users.noreply.github.com>
Date: Mon, 20 Apr 2026 11:12:20 -0400
Subject: [PATCH 11/19] added marker_id support

---
 R/check_madc_sanity.R                   | 58 ++++++++++-----
 tests/testthat/test-check_madc_sanity.R | 97 +++++++++++++++++++++++++
 2 files changed, 137 insertions(+), 18 deletions(-)

diff --git a/R/check_madc_sanity.R b/R/check_madc_sanity.R
index fda01d5..88f564a 100644
--- a/R/check_madc_sanity.R
+++ b/R/check_madc_sanity.R
@@ -156,7 +156,10 @@ check_madc_sanity <- function(report) {
       pos <- strsplit(report$CloneID, "_")
       format <- all(sapply(pos, length) == 2)
       first <- all(grepl("^[A-Za-z]", sapply(pos, "[", 1)))
-      second <- suppressWarnings(all(sapply(pos, function(x) as.numeric(x[2])) > 0))
+      second <- all(vapply(pos, function(x) {
+        pos_num <- suppressWarnings(as.numeric(x[2]))
+        !is.na(pos_num) && pos_num > 0
+      }, logical(1)))
       checks["ChromPos"] <- all(format, first, second)
     } else checks["ChromPos"] <- FALSE
 
@@ -216,7 +219,9 @@ check_madc_sanity <- function(report) {
 #' @param botloci A data frame containing the botloci markers.
 #' @param report A data frame containing the MADC markers.
 #' @param ChromPos logical value indicating whether the CloneID in the MADC file contains chromosome and position information in the format "Chr_Pos". Default is TRUE
-#' @param mi_df A data frame containing marker information with columns CloneID, Chr, and Pos. Required if `ChromPos` is FALSE.
+#' @param mi_df A data frame containing marker information with one marker ID column
+#'   (`CloneID`, `Marker_ID`, or `BI_markerID`) plus `Chr` and `Pos`. Required if
+#'   `ChromPos` is FALSE.
 #' @param verbose A logical value indicating whether to print detailed messages about the adjustments. Default is TRUE. Required if `ChromPos` is FALSE.
 #'
 #' @return A list containing the adjusted botloci and MADC data frames.
@@ -226,23 +231,42 @@ check_madc_sanity <- function(report) {
 #'
 #' @keywords internal
 #' @noRd
+pick_markers_info_id_col <- function(mi_df, query_ids) {
+  query_ids <- unique(stats::na.omit(query_ids))
+  id_cols <- intersect(c("CloneID", "BI_markerID", "Marker_ID"), colnames(mi_df))
+
+  if (!length(id_cols)) {
+    stop("The markers_info file must contain a marker ID column named either 'CloneID', 'Marker_ID' or 'BI_markerID'.")
+  }
+
+  match_n <- vapply(id_cols, function(col) {
+    sum(query_ids %in% unique(stats::na.omit(mi_df[[col]])))
+  }, integer(1))
+
+  if (!any(match_n)) {
+    stop("None of the MADC CloneID could be found in the markers_info CloneID, Marker_ID or BI_markerID. Please make sure they match.")
+  }
+
+  id_cols[which.max(match_n)]
+}
+
 check_botloci <- function(botloci, report, ChromPos=TRUE, mi_df = NULL, verbose=TRUE){
+  original_clone_ids <- report$CloneID
+  use_col <- NULL
 
   # Check inputs
   if(!ChromPos) {
-    if(is.null(mi_df)) stop("When MADC CloneID don't follow the format Chr_Pos, a marker_info file with CloneID, Chr and Pos columns must be provided.")
-    # if exists, it must contain CloneID or BI_markerID that matches the report$CloneID, and Chr and Pos columns
-    if(!any(mi_df$CloneID %in% report$CloneID) & !any(mi_df$BI_markerID %in% report$CloneID)) {
-      stop("None of the MADC CloneID could be found in the markers_info CloneID or BI_markerID. Please make sure they match.")
-    } else {
-      use_col <- if(any(mi_df$CloneID %in% report$CloneID)) "CloneID" else "BI_markerID"
-      vmsg(paste("Using", use_col, "column in marker_info to match MADC CloneID"), verbose = verbose, level = 1, type = ">>")
-    }
+    if(is.null(mi_df)) stop("When MADC CloneID don't follow the format Chr_Pos, a marker_info file with 'CloneID'/'Marker_ID'/'BI_markerID', 'Chr', and 'Pos' columns must be provided.")
+    use_col <- pick_markers_info_id_col(mi_df, report$CloneID)
+    vmsg(paste("Using", use_col, "column in marker_info to match MADC CloneID"), verbose = verbose, level = 1, type = ">>")
     if(is.null(mi_df$Chr) | is.null(mi_df$Pos)) stop("When MADC CloneID don't follow the format Chr_Pos, Chr and Pos columns must be provided in the markers_info file.")
   }
 
   if(!any(botloci$V1 %in% report$CloneID)) { # First check if any botloci markers are found in MADC file. If not, check for padding mismatch.
     vmsg("No botloci markers found in MADC file. Checking for padding mismatch...", verbose = verbose, level = 1, type = ">>")
+    if(!is.null(mi_df) && is.null(use_col)) {
+      use_col <- pick_markers_info_id_col(mi_df, original_clone_ids)
+    }
 
     pad_madc <- unique(nchar(sub(".*_", "", report$CloneID)))
     pad_botloci <- unique(nchar(sub(".*_", "", botloci$V1)))
@@ -257,10 +281,11 @@ check_botloci <- function(botloci, report, ChromPos=TRUE, mi_df = NULL, verbose=
         )
         report$AlleleID <- paste0(report$CloneID, "|", sapply(strsplit(report$AlleleID, "[|]"), "[[",2))
         if(!is.null(mi_df)) {
-          mi_df$CloneID <- paste0(sub("_(.*)", "", mi_df$CloneID), "_",
-                                  sprintf(paste0("%0", pad_botloci, "d"), as.integer(sub(".*_", "", mi_df$CloneID)))
+          mi_df$CloneID <- paste0(sub("_(.*)", "", mi_df[[use_col]]), "_",
+                                  sprintf(paste0("%0", pad_botloci, "d"), as.integer(sub(".*_", "", mi_df[[use_col]])))
           )
         }
+        if(!any(botloci$V1 %in% report$CloneID)) stop("After matching padding, botloci markers still not found in MADC file. Check marker IDs.\n")
       } else {
         botloci$V1 <- paste0(sub("_(.*)", "", botloci$V1), "_",
                              sprintf(paste0("%0", pad_madc, "d"), as.integer(sub(".*_", "", botloci$V1)))
@@ -270,12 +295,8 @@ check_botloci <- function(botloci, report, ChromPos=TRUE, mi_df = NULL, verbose=
     } else if (!(is.null(mi_df$Chr) | is.null(mi_df$Pos))){
       vmsg("It is not a padding mismatch issue.", verbose = verbose, level = 2, type = ">>")
       vmsg("Checking if jointing provided Chromosome and Position information in marker_file solve the issue", verbose = verbose, level = 2, type = ">>")
-      if(!any(mi_df$CloneID %in% report$CloneID) & !any(mi_df$BI_markerID %in% report$CloneID)) {
-        stop("None of the MADC CloneID could be found in the markers_info CloneID or BI_markerID. Please make sure they match.")
-      } else {
-        use_col <- if(any(mi_df$CloneID %in% report$CloneID)) "CloneID" else "BI_markerID"
-        vmsg(paste("Using", use_col, "column in marker_info to match MADC CloneID"), verbose = verbose, level = 2, type = ">>")
-      }
+      use_col <- pick_markers_info_id_col(mi_df, report$CloneID)
+      vmsg(paste("Using", use_col, "column in marker_info to match MADC CloneID"), verbose = verbose, level = 2, type = ">>")
       mk_info_CloneID <- paste0(mi_df$Chr, "_", sprintf(paste0("%0",pad_botloci, "d"), as.integer(mi_df$Pos)))
 
       if(!any(botloci$V1 %in% mk_info_CloneID)){
@@ -286,6 +307,7 @@ check_botloci <- function(botloci, report, ChromPos=TRUE, mi_df = NULL, verbose=
         vmsg("Chromosome and Position information in marker_file solve the issue.", verbose = verbose, level = 2, type = ">>")
         vmsg("Using this information to modify MADC CloneIDs to match botloci markers.", verbose = verbose, level = 2, type = ">>")
         report$CloneID <- mk_info_CloneID[match(report$CloneID, mi_df[[use_col]])]
+        report$AlleleID <- paste0(report$CloneID, "|", sapply(strsplit(report$AlleleID, "[|]"), "[[",2))
         mi_df$CloneID <- mk_info_CloneID
       }
     } else {
diff --git a/tests/testthat/test-check_madc_sanity.R b/tests/testthat/test-check_madc_sanity.R
index d0c45cf..b1a3857 100644
--- a/tests/testthat/test-check_madc_sanity.R
+++ b/tests/testthat/test-check_madc_sanity.R
@@ -68,4 +68,101 @@ test_that("check madc",{
   expect_equal(res$checks, exp)
 })
 
+test_that("check_botloci remaps using Marker_ID", {
+  botloci <- data.frame(V1 = c("1_0001", "2_0002"))
+  report <- data.frame(
+    CloneID = c("ProbeA_0001", "ProbeB_0002"),
+    AlleleID = c("ProbeA_0001|Ref_0001", "ProbeB_0002|Ref_0001"),
+    AlleleSequence = c("A", "T"),
+    check.names = FALSE
+  )
+  mi_df <- data.frame(
+    Marker_ID = c("ProbeA_0001", "ProbeB_0002"),
+    Chr = c("1", "2"),
+    Pos = c(1, 2)
+  )
+
+  res <- check_botloci(botloci, report, ChromPos = FALSE, mi_df = mi_df, verbose = FALSE)
+
+  expect_equal(res[[2]]$CloneID, botloci$V1)
+  expect_equal(res[[3]]$CloneID, botloci$V1)
+})
+
+test_that("check_botloci resolves Marker_ID before padding report CloneIDs", {
+  botloci <- data.frame(V1 = "1_000000123")
+  report <- data.frame(
+    CloneID = "1_123",
+    AlleleID = "1_123|Ref_0001",
+    AlleleSequence = "A",
+    check.names = FALSE
+  )
+  mi_df <- data.frame(
+    Marker_ID = "1_123",
+    Chr = "1",
+    Pos = 123
+  )
+
+  res <- check_botloci(botloci, report, ChromPos = TRUE, mi_df = mi_df, verbose = FALSE)
+
+  expect_equal(res[[2]]$CloneID, botloci$V1)
+  expect_equal(res[[3]]$CloneID, botloci$V1)
+})
+
+test_that("pick_markers_info_id_col scores distinct markers not allele rows", {
+  mi_df <- data.frame(
+    CloneID = c("m1", "m2"),
+    Marker_ID = c("m1", "m3")
+  )
+  query_ids <- c("m1", "m1", "m1", "m2")
+
+  expect_equal(pick_markers_info_id_col(mi_df, query_ids), "CloneID")
+})
+
+test_that("check_madc_sanity returns FALSE for malformed CloneID positions", {
+  report <- data.frame(
+    CloneID = c("Chr_abc", "Chr_abc"),
+    AlleleID = c("Chr_abc|Ref_0001", "Chr_abc|Alt_0002"),
+    AlleleSequence = c("A", "T"),
+    check.names = FALSE
+  )
 
+  res <- check_madc_sanity(report)
+
+  expect_false(is.na(res$checks["ChromPos"]))
+  expect_false(res$checks["ChromPos"])
+})
+
+test_that("check_botloci errors if widening MADC padding still does not match", {
+  botloci <- data.frame(V1 = "1_0002")
+  report <- data.frame(
+    CloneID = "1_1",
+    AlleleID = "1_1|Ref_0001",
+    AlleleSequence = "A",
+    check.names = FALSE
+  )
+
+  expect_error(
+    check_botloci(botloci, report, ChromPos = TRUE, verbose = FALSE),
+    "After matching padding, botloci markers still not found in MADC file. Check marker IDs."
+  )
+})
+
+test_that("check_botloci keeps AlleleID synchronized after CloneID remap", {
+  botloci <- data.frame(V1 = "1_0001")
+  report <- data.frame(
+    CloneID = "ProbeA_0001",
+    AlleleID = "ProbeA_0001|Ref_0001",
+    AlleleSequence = "A",
+    check.names = FALSE
+  )
+  mi_df <- data.frame(
+    Marker_ID = "ProbeA_0001",
+    Chr = "1",
+    Pos = 1
+  )
+
+  res <- check_botloci(botloci, report, ChromPos = TRUE, mi_df = mi_df, verbose = FALSE)
+
+  expect_equal(res[[2]]$CloneID, "1_0001")
+  expect_equal(res[[2]]$AlleleID, "1_0001|Ref_0001")
+})

From c10d134ba604cd7ff2ebbe5fbcb15a533f189811 Mon Sep 17 00:00:00 2001
From: Alexander Sandercock <39815775+alex-sandercock@users.noreply.github.com>
Date: Mon, 20 Apr 2026 11:13:21 -0400
Subject: [PATCH 12/19] fixed AD generation bug

---
 R/madc2vcf_all.R                   | 24 +++++++----
 tests/testthat/test-madc2vcf_all.R | 67 +++++++++++++++++++++++++++++-
 2 files changed, 82 insertions(+), 9 deletions(-)

diff --git a/R/madc2vcf_all.R b/R/madc2vcf_all.R
index 63c031d..e223977 100644
--- a/R/madc2vcf_all.R
+++ b/R/madc2vcf_all.R
@@ -138,7 +138,7 @@ madc2vcf_all <- function(madc,
   checks <- check_madc_sanity(report)
 
   messages_results <- mapply(function(check, message) {
-    if (check)  message[1] else message[2]
+    if (isTRUE(check))  message[1] else message[2]
   }, checks$checks, checks$messages)
 
   for(i in seq_along(messages_results))
@@ -158,13 +158,23 @@ madc2vcf_all <- function(madc,
   # Check whether markers_info is present and contains Ref + Alt columns
   if(!is.null(markers_info)) {
     mi_df <- read.csv(markers_info)
-    # Standardize marker ID column to CloneID
-    if(!"CloneID" %in% colnames(mi_df) && "BI_markerID" %in% colnames(mi_df)) {
-      colnames(mi_df)[colnames(mi_df) == "BI_markerID"] <- "CloneID"
-      vmsg("markers_info: 'BI_markerID' column renamed to 'CloneID' for internal use", verbose = verbose, level = 1)
-    } else if(!"CloneID" %in% colnames(mi_df) && !"BI_markerID" %in% colnames(mi_df)) {
+    id_cols <- intersect(c("CloneID", "BI_markerID"), colnames(mi_df))
+    if(!length(id_cols)) {
       stop("markers_info must contain a marker ID column named either 'CloneID' or 'BI_markerID'.")
     }
+    match_n <- vapply(id_cols, function(col) {
+      sum(unique(report$CloneID) %in% unique(stats::na.omit(mi_df[[col]])))
+    }, integer(1))
+    if(!any(match_n)) {
+      stop("None of the markers_info CloneID or BI_markerID values match the MADC CloneID column. Please make sure they use the same marker IDs.")
+    }
+    id_col <- id_cols[which.max(match_n)]
+    if(id_col != "CloneID" || !"CloneID" %in% colnames(mi_df)) {
+      mi_df$CloneID <- mi_df[[id_col]]
+      if(id_col == "BI_markerID") {
+        vmsg("markers_info: 'BI_markerID' column copied to 'CloneID' for internal use", verbose = verbose, level = 1)
+      }
+    }
     # Validate CloneID values
     if(any(is.na(mi_df$CloneID) | mi_df$CloneID == ""))
       stop("markers_info CloneID column contains empty or NA values. Please check your markers_info file.")
@@ -910,7 +920,7 @@ merge_counts <- function(cloneID_unit, rm_multiallelic_SNP = FALSE, multiallelic
     info_mk <- paste0("DP=", sum(c(RefTag, AltTag,total)),";",
                         "ADS=",sum(RefTag),",",sum(AltTag), ads)
   } else {
-    tab_counts <- paste0(RefTag + AltTag, ":", RefTag, ":", RefTag, AltTag)
+    tab_counts <- paste0(RefTag + AltTag, ":", RefTag, ":", RefTag, ",", AltTag)
     alts <- info$Alt
     info_mk <- paste0("DP=", sum(c(RefTag, AltTag)),";",
                       "ADS=",sum(RefTag),",",sum(AltTag))
diff --git a/tests/testthat/test-madc2vcf_all.R b/tests/testthat/test-madc2vcf_all.R
index c8c860f..1d6050f 100644
--- a/tests/testthat/test-madc2vcf_all.R
+++ b/tests/testthat/test-madc2vcf_all.R
@@ -66,6 +66,70 @@ test_that("test madc offtargets",{
 
 })
 
+test_that("madc2vcf_all preserves comma-separated AD for biallelic targets", {
+  madc_file <- system.file("example_MADC_FixedAlleleID.csv", package="BIGr")
+  bot_file <- system.file("example_SNPs_DArTag-probe-design_f180bp.botloci", package="BIGr")
+  db_file <- system.file("example_allele_db.fa", package="BIGr")
+  temp <- tempfile(fileext = ".vcf")
+
+  madc2vcf_all(madc = madc_file,
+               botloci_file = bot_file,
+               hap_seq_file = db_file,
+               n.cores = 1,
+               out_vcf = temp,
+               verbose = FALSE)
+
+  vcf <- read.vcfR(temp, verbose = FALSE)
+  ad <- extract.gt(vcf, "AD")
+  biallelic <- !grepl(",", vcf@fix[, "ALT"])
+
+  expect_true(all(grepl("^[0-9]+,[0-9]+$", ad[biallelic, 1])))
+})
+
+test_that("madc2vcf_all accepts BI_markerID matches when CloneID does not match", {
+  madc_file <- system.file("example_MADC_FixedAlleleID.csv", package="BIGr")
+  bot_file <- system.file("example_SNPs_DArTag-probe-design_f180bp.botloci", package="BIGr")
+  db_file <- system.file("example_allele_db.fa", package="BIGr")
+  temp <- tempfile(fileext = ".vcf")
+  temp_markers <- tempfile(fileext = ".csv")
+
+  report <- read.csv(madc_file, check.names = FALSE)
+  marker_ids <- unique(report$CloneID)
+  markers_info <- data.frame(
+    CloneID = paste0("bogus_", seq_along(marker_ids)),
+    BI_markerID = marker_ids
+  )
+  write.csv(markers_info, temp_markers, row.names = FALSE)
+
+  expect_no_error(
+    madc2vcf_all(madc = madc_file,
+                 botloci_file = bot_file,
+                 hap_seq_file = db_file,
+                 markers_info = temp_markers,
+                 n.cores = 1,
+                 out_vcf = temp,
+                 verbose = FALSE)
+  )
+})
+
+test_that("madc2vcf_all surfaces missing-column validation error without crashing", {
+  madc_file <- system.file("example_MADC_FixedAlleleID.csv", package="BIGr")
+  bot_file <- system.file("example_SNPs_DArTag-probe-design_f180bp.botloci", package="BIGr")
+  temp_madc <- tempfile(fileext = ".csv")
+
+  report <- read.csv(madc_file, check.names = FALSE)
+  report$AlleleSequence <- NULL
+  write.csv(report, temp_madc, row.names = FALSE)
+
+  expect_error(
+    madc2vcf_all(madc = temp_madc,
+                 botloci_file = bot_file,
+                 out_vcf = tempfile(fileext = ".vcf"),
+                 verbose = FALSE),
+    "One or more required columns missing"
+  )
+})
+
 # =======================================================================
 # Using Breeding-Insight/BIGapp-PanelHub test files
 # =======================================================================
@@ -220,7 +284,7 @@ test_that("simu alfalfa",{
                    markers_info = alfalfa_markers_info,
                    out_vcf = out,
                    verbose = FALSE),
-      regexp = "None of the markers_info CloneID values match the MADC CloneID column. Please make sure they use the same marker IDs."
+      regexp = "None of the markers_info CloneID( or BI_markerID)? values match the MADC CloneID column. Please make sure they use the same marker IDs."
     )
 
     # Test error when markers_info_ChromPos is provided but IDs still don't match botloci
@@ -528,4 +592,3 @@ test_that("simu alfalfa",{
     )
   })
 })
-

From 1ae386f95ca906ef294efad57aa522d3a8cee64a Mon Sep 17 00:00:00 2001
From: Alexander Sandercock <39815775+alex-sandercock@users.noreply.github.com>
Date: Mon, 20 Apr 2026 11:13:41 -0400
Subject: [PATCH 13/19] improved truth check

---
 R/madc2vcf_multi.R | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/R/madc2vcf_multi.R b/R/madc2vcf_multi.R
index 2ff7b0e..d22b8b4 100644
--- a/R/madc2vcf_multi.R
+++ b/R/madc2vcf_multi.R
@@ -81,7 +81,7 @@ madc2vcf_multi <- function(madc_file,
   checks <- check_madc_sanity(report)
 
   messages_results <- mapply(function(check, message) {
-    if (check) message[1] else message[2]
+    if (isTRUE(check)) message[1] else message[2]
   }, checks$checks, checks$messages)
 
   for (i in seq_along(messages_results))
@@ -193,4 +193,3 @@ madc2vcf_multi <- function(madc_file,
 
   invisible(NULL)
 }
-

From 49cb0a4843cfb486418204fed54198299fa817ca Mon Sep 17 00:00:00 2001
From: Alexander Sandercock <39815775+alex-sandercock@users.noreply.github.com>
Date: Mon, 20 Apr 2026 11:14:58 -0400
Subject: [PATCH 14/19] support Marker_ID

---
 R/madc2vcf_targets.R                   | 27 ++++++++++----------
 man/madc2vcf_targets.Rd                | 15 +++++++----
 tests/testthat/test-madc2vcf_targets.R | 35 ++++++++++++++++++++++++++
 3 files changed, 58 insertions(+), 19 deletions(-)

diff --git a/R/madc2vcf_targets.R b/R/madc2vcf_targets.R
index a16fdfb..d75a17f 100644
--- a/R/madc2vcf_targets.R
+++ b/R/madc2vcf_targets.R
@@ -67,14 +67,19 @@
 #' @param botloci_file character or `NULL` (default `NULL`). Path to a plain-text
 #'   file listing target IDs designed on the **bottom** strand (one ID per line).
 #'   Used for strand-correcting probe sequences when `get_REF_ALT = TRUE` and
-#'   `markers_info` does not supply `Ref` and `Alt` columns. Also required when
-#'   `ChromPos` is invalid and `markers_info` does not provide `Ref`/`Alt`.
+#'   `markers_info` does not supply `Ref` and `Alt` columns. Not needed when
+#'   `markers_info` provides `Ref` and `Alt`, or when `get_REF_ALT = FALSE` and
+#'   `markers_info` provides `Chr` and `Pos`. Also required when `ChromPos` is
+#'   invalid and `markers_info` does not provide `Ref`/`Alt`.
 #' @param markers_info character or `NULL`. Optional path to a CSV providing target
-#'   metadata. Accepted columns:
-#'   - `CloneID`, `Marker_ID`, or `BI_markerID` (required as marker identifier);
+#'   metadata. Matching is done by column name, not column position. Accepted columns:
+#'   - one marker identifier column named `CloneID`, `Marker_ID`, or `BI_markerID`
+#'     (required; a generic `ID` column is not accepted);
 #'   - `Chr`, `Pos` — required when `CloneID` does not follow the `Chr_Pos` format;
 #'   - `Ref`, `Alt` — required when `get_REF_ALT = TRUE` and probe-sequence
-#'     inference is not possible (IUPAC codes, indels, or unfixed allele IDs).
+#'     inference is not possible (IUPAC codes, indels, or unfixed allele IDs). When
+#'     `get_REF_ALT = TRUE`, `botloci_file` is still required unless `Ref` and `Alt`
+#'     are supplied here.
 #' @param get_REF_ALT logical (default `FALSE`). If `TRUE`, attempts to recover
 #'   REF/ALT bases. The source is chosen automatically: `markers_info` `Ref`/`Alt`
 #'   columns take priority; otherwise probe sequences from the MADC are compared
@@ -169,7 +174,7 @@ madc2vcf_targets <- function(madc_file,
   checks <- check_madc_sanity(report)
 
   messages_results <- mapply(function(check, message) {
-    if (check)  message[1] else message[2]
+    if (isTRUE(check))  message[1] else message[2]
   }, checks$checks, checks$messages)
 
   for(i in seq_along(messages_results))
@@ -307,10 +312,7 @@ madc2vcf_targets <- function(madc_file,
     vmsg("Using markers_info for CHROM, POS, REF and ALT.", verbose = verbose, level = 0, type = ">>")
 
     if(is.null(mi_df)) mi_df <- read.csv(markers_info)
-    id_col <- if ("BI_markerID" %in% colnames(mi_df)) "BI_markerID" else
-              if ("CloneID"     %in% colnames(mi_df)) "CloneID"     else
-              if ("Marker_ID"     %in% colnames(mi_df)) "Marker_ID"     else
-      stop("The markers_info file must contain a marker ID column named either 'CloneID', 'Marker_ID' or 'BI_markerID'.")
+    id_col <- pick_markers_info_id_col(mi_df, rownames(ad_df))
 
     if(checks$checks["Indels"])
       vmsg("Indels detected in MADC file. But it is okay because Ref and Alt are provided in markers_info.",
@@ -341,10 +343,7 @@ madc2vcf_targets <- function(madc_file,
     vmsg("markers_info file provided. Using CHROM and POS from the file.", verbose = verbose, level = 0, type = ">>")
 
     if(is.null(mi_df)) mi_df <- read.csv(markers_info)
-    id_col <- if ("BI_markerID" %in% colnames(mi_df)) "BI_markerID" else
-              if ("CloneID"     %in% colnames(mi_df)) "CloneID"     else
-              if ("Marker_ID"     %in% colnames(mi_df)) "Marker_ID"     else
-      stop("The markers_info file must contain a marker ID column named either 'CloneID', 'Marker_ID' or 'BI_markerID'.")
+    id_col <- pick_markers_info_id_col(mi_df, rownames(ad_df))
 
     if(checks$checks["Indels"])
       vmsg("Indels detected in MADC file. Since get_REF_ALT = FALSE, Type and Indel_pos are not required in markers_info.",
diff --git a/man/madc2vcf_targets.Rd b/man/madc2vcf_targets.Rd
index 25f99d5..8cab155 100644
--- a/man/madc2vcf_targets.Rd
+++ b/man/madc2vcf_targets.Rd
@@ -22,16 +22,21 @@ madc2vcf_targets(
 \item{botloci_file}{character or \code{NULL} (default \code{NULL}). Path to a plain-text
 file listing target IDs designed on the \strong{bottom} strand (one ID per line).
 Used for strand-correcting probe sequences when \code{get_REF_ALT = TRUE} and
-\code{markers_info} does not supply \code{Ref} and \code{Alt} columns. Also required when
-\code{ChromPos} is invalid and \code{markers_info} does not provide \code{Ref}/\code{Alt}.}
+\code{markers_info} does not supply \code{Ref} and \code{Alt} columns. Not needed when
+\code{markers_info} provides \code{Ref} and \code{Alt}, or when \code{get_REF_ALT = FALSE} and
+\code{markers_info} provides \code{Chr} and \code{Pos}. Also required when \code{ChromPos} is
+invalid and \code{markers_info} does not provide \code{Ref}/\code{Alt}.}
 
 \item{markers_info}{character or \code{NULL}. Optional path to a CSV providing target
-metadata. Accepted columns:
+metadata. Matching is done by column name, not column position. Accepted columns:
 \itemize{
-\item \code{CloneID}, \code{Marker_ID}, or \code{BI_markerID} (required as marker identifier);
+\item one marker identifier column named \code{CloneID}, \code{Marker_ID}, or \code{BI_markerID}
+(required; a generic \code{ID} column is not accepted);
 \item \code{Chr}, \code{Pos} — required when \code{CloneID} does not follow the \code{Chr_Pos} format;
 \item \code{Ref}, \code{Alt} — required when \code{get_REF_ALT = TRUE} and probe-sequence
-inference is not possible (IUPAC codes, indels, or unfixed allele IDs).
+inference is not possible (IUPAC codes, indels, or unfixed allele IDs). When
+\code{get_REF_ALT = TRUE}, \code{botloci_file} is still required unless \code{Ref} and \code{Alt}
+are supplied here.
 }}
 
 \item{get_REF_ALT}{logical (default \code{FALSE}). If \code{TRUE}, attempts to recover
diff --git a/tests/testthat/test-madc2vcf_targets.R b/tests/testthat/test-madc2vcf_targets.R
index b5e8a60..9ffc72e 100644
--- a/tests/testthat/test-madc2vcf_targets.R
+++ b/tests/testthat/test-madc2vcf_targets.R
@@ -87,6 +87,41 @@ test_that("bottom strand markers have correct REF/ALT", {
   rm(vcf_targets, temp_targets)
 })
 
+test_that("madc2vcf_targets preserves original sample names", {
+  madc_file <- system.file("example_MADC_FixedAlleleID.csv", package="BIGr")
+  temp_madc <- tempfile(fileext = ".csv")
+  temp_vcf <- tempfile(fileext = ".vcf")
+
+  report <- read.csv(madc_file, check.names = FALSE)
+  colnames(report)[4:6] <- c("1A", "Sample-1", "sample 2")
+  write.csv(report, temp_madc, row.names = FALSE, quote = TRUE)
+
+  suppressWarnings(
+    madc2vcf_targets(madc_file = temp_madc, output.file = temp_vcf, get_REF_ALT = FALSE)
+  )
+
+  vcf <- read.vcfR(temp_vcf, verbose = FALSE)
+
+  expect_equal(colnames(vcf@gt)[2:4], c("1A", "Sample-1", "sample 2"))
+})
+
+test_that("madc2vcf_targets surfaces missing-column validation error without crashing", {
+  madc_file <- system.file("example_MADC_FixedAlleleID.csv", package="BIGr")
+  temp_madc <- tempfile(fileext = ".csv")
+
+  report <- read.csv(madc_file, check.names = FALSE)
+  report$AlleleSequence <- NULL
+  write.csv(report, temp_madc, row.names = FALSE)
+
+  expect_error(
+    madc2vcf_targets(madc_file = temp_madc,
+                     output.file = tempfile(fileext = ".vcf"),
+                     get_REF_ALT = FALSE,
+                     verbose = FALSE),
+    "One or more required columns missing"
+  )
+})
+
 
 # =======================================================================
 # Using Breeding-Insight/BIGapp-PanelHub test files

From 22fc6e45d60e3cfb6c880037e40a0e74d995b120 Mon Sep 17 00:00:00 2001
From: Alexander Sandercock <39815775+alex-sandercock@users.noreply.github.com>
Date: Mon, 20 Apr 2026 11:58:31 -0400
Subject: [PATCH 15/19] Update documentation for verbose message utility

---
 R/utils.R | 29 +++++++++++++++--------------
 1 file changed, 15 insertions(+), 14 deletions(-)

diff --git a/R/utils.R b/R/utils.R
index 59e5563..076a430 100644
--- a/R/utils.R
+++ b/R/utils.R
@@ -27,20 +27,21 @@ convert_to_dosage <- function(gt) {
   })
 }
 
-##' Verbose Message Utility
-##'
-##' Prints a formatted verbose message with timestamp, indentation, and type label, if verbose is TRUE.
-##'
-##' @param text Character string, the message to print (supports sprintf formatting).
-##' @param verbose Logical. If TRUE, prints the message; if FALSE, suppresses output.
-##' @param level Integer, indentation level (0=header, 1=main step, 2=detail, 3=sub-detail).
-##' @param type Character string, message type (e.g., "INFO", "WARN", "ERROR"). Only shown for level 0.
-##' @param ... Additional arguments passed to sprintf for formatting.
-##'
-##' @details Use the verbose argument to control message output. Typically, pass the function's verbose parameter to vmsg.
-##'
-##' @return No return value, called for side effects.
-##' @export
+#' Verbose Message Utility
+#'
+#' Prints a formatted verbose message with timestamp, indentation, and type label, if verbose is TRUE.
+#'
+#' @param text Character string, the message to print (supports sprintf formatting).
+#' @param verbose Logical. If TRUE, prints the message; if FALSE, suppresses output.
+#' @param level Integer, indentation level (0=header, 1=main step, 2=detail, 3=sub-detail).
+#' @param type Character string, message type (e.g., "INFO", "WARN", "ERROR"). Only shown for level 0.
+#' @param ... Additional arguments passed to sprintf for formatting.
+#'
+#' @details Use the verbose argument to control message output. Typically, pass the function's verbose parameter to vmsg.
+#'
+#' @return No return value, called for side effects.
+#' @internal
+#' @noRd
 vmsg <- function(text, verbose = FALSE, level = 1, type = ">>", ...) {
   if (!verbose) return(invisible())
   # Format timestamp

From 38dd6090d057bf10352f6dd250d10f9026002583 Mon Sep 17 00:00:00 2001
From: Alexander Sandercock <39815775+alex-sandercock@users.noreply.github.com>
Date: Mon, 20 Apr 2026 12:00:24 -0400
Subject: [PATCH 16/19] fix exports

---
 NAMESPACE   |  1 -
 R/utils.R   |  3 +--
 man/vmsg.Rd | 28 ----------------------------
 3 files changed, 1 insertion(+), 31 deletions(-)
 delete mode 100644 man/vmsg.Rd

diff --git a/NAMESPACE b/NAMESPACE
index e6cbc30..e9f2613 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -22,7 +22,6 @@ export(merge_MADCs)
 export(solve_composition_poly)
 export(thinSNP)
 export(updog2vcf)
-export(vmsg)
 import(dplyr)
 import(janitor)
 import(parallel)
diff --git a/R/utils.R b/R/utils.R
index 076a430..a30c6f6 100644
--- a/R/utils.R
+++ b/R/utils.R
@@ -40,7 +40,6 @@ convert_to_dosage <- function(gt) {
 #' @details Use the verbose argument to control message output. Typically, pass the function's verbose parameter to vmsg.
 #'
 #' @return No return value, called for side effects.
-#' @internal
 #' @noRd
 vmsg <- function(text, verbose = FALSE, level = 1, type = ">>", ...) {
   if (!verbose) return(invisible())
@@ -84,7 +83,7 @@ vmsg <- function(text, verbose = FALSE, level = 1, type = ">>", ...) {
 #'
 #' @keywords internal
 #' @noRd
-#' 
+#'
 url_exists <- function(u) {
   tryCatch({
     con <- url(u, open = "rb")
diff --git a/man/vmsg.Rd b/man/vmsg.Rd
deleted file mode 100644
index abcc768..0000000
--- a/man/vmsg.Rd
+++ /dev/null
@@ -1,28 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/utils.R
-\name{vmsg}
-\alias{vmsg}
-\title{Verbose Message Utility}
-\usage{
-vmsg(text, verbose = FALSE, level = 1, type = ">>", ...)
-}
-\arguments{
-\item{text}{Character string, the message to print (supports sprintf formatting).}
-
-\item{verbose}{Logical. If TRUE, prints the message; if FALSE, suppresses output.}
-
-\item{level}{Integer, indentation level (0=header, 1=main step, 2=detail, 3=sub-detail).}
-
-\item{type}{Character string, message type (e.g., "INFO", "WARN", "ERROR"). Only shown for level 0.}
-
-\item{...}{Additional arguments passed to sprintf for formatting.}
-}
-\value{
-No return value, called for side effects.
-}
-\description{
-Prints a formatted verbose message with timestamp, indentation, and type label, if verbose is TRUE.
-}
-\details{
-Use the verbose argument to control message output. Typically, pass the function's verbose parameter to vmsg.
-}

From 56336da56c8c346f014e9c0879bf17c98ef4331f Mon Sep 17 00:00:00 2001
From: Alexander Sandercock <39815775+alex-sandercock@users.noreply.github.com>
Date: Mon, 20 Apr 2026 12:03:58 -0400
Subject: [PATCH 17/19] skipping if offline

---
 tests/testthat/test-check_madc_sanity.R | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/testthat/test-check_madc_sanity.R b/tests/testthat/test-check_madc_sanity.R
index b1a3857..4b60cfe 100644
--- a/tests/testthat/test-check_madc_sanity.R
+++ b/tests/testthat/test-check_madc_sanity.R
@@ -1,4 +1,5 @@
 test_that("check madc",{
+  skip_if_offline("raw.githubusercontent.com")
 
   github_path <- "https://raw.githubusercontent.com/Breeding-Insight/BIGapp-PanelHub/refs/heads/long_seq/test_madcs/"
   names <- c("Columns", "FixAlleleIDs", "IUPACcodes", "LowerCase", "Indels", "ChromPos", "allNAcol", "allNArow", "RefAltSeqs", "OtherAlleles")

From 089e8fdb42bcaf5b6999e107a9b078e2725cf6b4 Mon Sep 17 00:00:00 2001
From: Cristianetaniguti <chaytaniguti@gmail.com>
Date: Tue, 21 Apr 2026 09:26:34 -0400
Subject: [PATCH 18/19] madc2vcf_multi better function description

---
 R/madc2vcf_multi.R | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/R/madc2vcf_multi.R b/R/madc2vcf_multi.R
index d22b8b4..e71cf0f 100644
--- a/R/madc2vcf_multi.R
+++ b/R/madc2vcf_multi.R
@@ -1,11 +1,9 @@
 #' Convert MADC file to VCF using polyRAD for multiallelic genotyping
 #'
-#' This function converts a DArTag MADC file to a VCF using the polyRAD package's
-#' `readDArTag` and `RADdata2VCF` pipeline. It runs `check_madc_sanity` before
-#' loading the data, applies corrections for lowercase sequences and all-NA
-#' rows/columns, and sets `n.header.rows` automatically based on whether the
-#' MADC file follows the raw DArT format (6 header rows) or the fixed allele ID
-#' format (no header rows).
+#' This function converts a DArTag fixed allele ID MADC file to a VCF
+#' containing multiallelic markers based on the microhaplotypes using
+#' the polyRAD package's `readDArTag`, `IterateHWE` population model
+#' and `RADdata2VCF` pipeline.
 #'
 #' @param madc_file character. Path or URL to the input MADC CSV file.
 #' @param botloci_file character. Path or URL to the botloci file listing target

From 8ee0b81e36339d9fc76b9ef1bd46647bdabc1d43 Mon Sep 17 00:00:00 2001
From: Cristianetaniguti <chaytaniguti@gmail.com>
Date: Tue, 21 Apr 2026 09:27:18 -0400
Subject: [PATCH 19/19] roxygenise

---
 man/madc2vcf_multi.Rd | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/man/madc2vcf_multi.Rd b/man/madc2vcf_multi.Rd
index 70bc59d..b6a3324 100644
--- a/man/madc2vcf_multi.Rd
+++ b/man/madc2vcf_multi.Rd
@@ -35,12 +35,10 @@ Default is 2.}
 Invisible NULL. Writes a VCF file to \code{outfile}.
 }
 \description{
-This function converts a DArTag MADC file to a VCF using the polyRAD package's
-\code{readDArTag} and \code{RADdata2VCF} pipeline. It runs \code{check_madc_sanity} before
-loading the data, applies corrections for lowercase sequences and all-NA
-rows/columns, and sets \code{n.header.rows} automatically based on whether the
-MADC file follows the raw DArT format (6 header rows) or the fixed allele ID
-format (no header rows).
+This function converts a DArTag fixed allele ID MADC file to a VCF
+containing multiallelic markers based on the microhaplotypes using
+the polyRAD package's \code{readDArTag}, \code{IterateHWE} population model
+and \code{RADdata2VCF} pipeline.
 }
 \details{
 The function performs the following steps: