From 71cf33364a36aa423c5fbb7bbe07f4cefbb40869 Mon Sep 17 00:00:00 2001
From: Benjmain Fair <bfair.kun@gmail.com>
Date: Thu, 25 Sep 2025 12:21:09 -0500
Subject: [PATCH] Added option to supply 100x2 matrix for custom bias

---
 R/check_extras.R           |  8 +-------
 R/generate_fragments.R     | 22 +++++++++++++++++-----
 R/simulate_experiment.R    |  7 +++++--
 man/polyester.Rd           |  2 ++
 man/simulate_experiment.Rd |  7 +++++--
 5 files changed, 30 insertions(+), 16 deletions(-)

diff --git a/R/check_extras.R b/R/check_extras.R
index 6e57da2..43adcb2 100644
--- a/R/check_extras.R
+++ b/R/check_extras.R
@@ -41,7 +41,7 @@
 
     if(!('bias' %in% names(extras))){
         extras$bias = 'none'
-    }else{
+    } else if (is.character(extras$bias)) {
         extras$bias = match.arg(extras$bias, c('none', 'rnaf', 'cdnaf'))
     }
 
@@ -57,12 +57,6 @@
         extras$path = paste0(extras$model_path, '/', extras$model_prefix)
     }#this should work beause we already checked stuff.
 
-    if(!('bias' %in% names(extras))){
-        extras$bias = 'none'
-    }else{
-        extras$bias = match.arg(extras$bias, c('none', 'rnaf', 'cdnaf'))
-    }
-
     if(!('lib_sizes' %in% names(extras))){
         extras$lib_sizes = rep(1, total.n)
     }else{
diff --git a/R/generate_fragments.R b/R/generate_fragments.R
index 77a8695..1317220 100644
--- a/R/generate_fragments.R
+++ b/R/generate_fragments.R
@@ -74,7 +74,9 @@ generate_fragments = function(tObj, fraglen=250, fragsd=25,
   readlen=100, distr='normal', custdens=NULL, bias='none',
   frag_GC_bias='none') {
 
-    bias = match.arg(bias, c('none', 'rnaf', 'cdnaf'))
+    if (is.character(bias)) {
+        bias = match.arg(bias, c('none', 'rnaf', 'cdnaf'))
+    }
     distr = match.arg(distr, c('normal', 'empirical', 'custom'))
     L = width(tObj)
     if(distr == 'empirical'){
@@ -92,21 +94,31 @@ generate_fragments = function(tObj, fraglen=250, fragsd=25,
     }
     s = which(fraglens < L)
     n = length(s)
-    if(bias == 'none'){
+    if (is.character(bias) && bias == 'none') {
         start_pos = floor(runif(n, min=rep(1,n), max=L[s]-fraglens[s]+2))
-    }else if(bias == 'rnaf'){
+    } else if (is.character(bias) && bias == 'rnaf') {
         data(rnaf)
         starts_pct = sample(rnaf$pospct, size=n, prob=rnaf$prob, replace=TRUE)
         starts_pct[starts_pct==1] = 0.999
         start_pos = floor(starts_pct * (L[s]-fraglens[s]+2))
         start_pos[start_pos==0] = 1
-    }else{
-        # bias == 'cdnaf'
+    } else if (is.character(bias) && bias == 'cdnaf') {
         data(cdnaf)
         starts_pct = sample(cdnaf$pospct, size=n, prob=cdnaf$prob, replace=TRUE)
         starts_pct[starts_pct==1] = 0.999
         start_pos = floor(starts_pct * (L[s]-fraglens[s]+2))
         start_pos[start_pos==0] = 1
+    } else if (is.matrix(bias) || is.data.frame(bias)) {
+        pos = bias[,1]
+        prob = bias[,2]
+        prob = prob / sum(prob)
+        if (max(pos) > 1.01) pos = pos / 100
+        starts_pct = sample(pos, size=n, prob=prob, replace=TRUE)
+        starts_pct[starts_pct==1] = 0.999
+        start_pos = floor(starts_pct * (L[s]-fraglens[s]+2))
+        start_pos[start_pos==0] = 1
+    } else {
+        stop("Unknown bias type. Must be 'none', 'rnaf', 'cdnaf', or a 100x2 matrix/data.frame.")
     }
     tObj[s] = subseq(tObj[s], start=start_pos, width=fraglens[s])
     names(tObj)[s] = paste0(names(tObj[s]), ';mate1:', start_pos, '-',
diff --git a/R/simulate_experiment.R b/R/simulate_experiment.R
index 7a7efba..e3c9079 100644
--- a/R/simulate_experiment.R
+++ b/R/simulate_experiment.R
@@ -203,7 +203,7 @@
 #'     provided to \code{build_error_model.py} and is whatever comes before the
 #'     _mate1/_mate2 or _single files in \code{model_path}.
 #'     }
-#'   \item \code{bias} One of 'none', 'rnaf', or 'cdnaf'. 'none'
+#'   \item \code{bias} One of 'none', 'rnaf', or 'cdnaf', or a 100x2 matrix/data.frame. 'none'
 #'   represents uniform fragment selection (every possible fragment in a
 #'   transcript has equal probability of being in the experiment); 'rnaf'
 #'   represents positional bias that arises in protocols using RNA
@@ -212,7 +212,10 @@
 #'   coverage is higher in the middle of the transcript and lower at both ends,
 #'   and in the 'cdnaf' model, coverage increases toward the 3' end of the
 #'   transcript. The probability models used come from Supplementary Figure S3
-#'   of Li and Jiang (2012). Defaults to 'none' if you don't provide this.
+#'   of Li and Jiang (2012). 
+#' If a matrix/data.frame, column 1 is position (1-100 or 0-1), column 2 is probability (must sum to 1).
+#'   Allows user-specified positional bias, e.g., from a beta distribution.
+#' Defaults to 'none' if you don't provide this.
 #'   \item \code{gcbias} list indicating which samples to add GC bias to, and
 #'   from which models. Should be the same length as \code{sum(num_reps)};
 #'   entries can be either numeric or of class \code{loess}. A numeric entry of
diff --git a/man/polyester.Rd b/man/polyester.Rd
index 51858cc..bfe7e50 100644
--- a/man/polyester.Rd
+++ b/man/polyester.Rd
@@ -2,6 +2,8 @@
 % Please edit documentation in R/polyester-package.R
 \docType{package}
 \name{polyester}
+\alias{polyesterlocal}
+\alias{polyesterlocal-package}
 \alias{polyester}
 \title{Polyester: simulating RNA-seq reads including differential expression}
 \description{
diff --git a/man/simulate_experiment.Rd b/man/simulate_experiment.Rd
index c29e7a6..a37fe70 100644
--- a/man/simulate_experiment.Rd
+++ b/man/simulate_experiment.Rd
@@ -140,7 +140,7 @@ Reads can either be simulated from a FASTA file of transcripts
     provided to \code{build_error_model.py} and is whatever comes before the
     _mate1/_mate2 or _single files in \code{model_path}.
     }
-  \item \code{bias} One of 'none', 'rnaf', or 'cdnaf'. 'none'
+  \item \code{bias} One of 'none', 'rnaf', or 'cdnaf', or a 100x2 matrix/data.frame. 'none'
   represents uniform fragment selection (every possible fragment in a
   transcript has equal probability of being in the experiment); 'rnaf'
   represents positional bias that arises in protocols using RNA
@@ -149,7 +149,10 @@ Reads can either be simulated from a FASTA file of transcripts
   coverage is higher in the middle of the transcript and lower at both ends,
   and in the 'cdnaf' model, coverage increases toward the 3' end of the
   transcript. The probability models used come from Supplementary Figure S3
-  of Li and Jiang (2012). Defaults to 'none' if you don't provide this.
+  of Li and Jiang (2012). 
+If a matrix/data.frame, column 1 is position (1-100 or 0-1), column 2 is probability (must sum to 1).
+  Allows user-specified positional bias, e.g., from a beta distribution.
+Defaults to 'none' if you don't provide this.
   \item \code{gcbias} list indicating which samples to add GC bias to, and
   from which models. Should be the same length as \code{sum(num_reps)};
   entries can be either numeric or of class \code{loess}. A numeric entry of