From 71cf33364a36aa423c5fbb7bbe07f4cefbb40869 Mon Sep 17 00:00:00 2001 From: Benjmain Fair Date: Thu, 25 Sep 2025 12:21:09 -0500 Subject: [PATCH] Added option to supply 100x2 matrix for custom bias --- R/check_extras.R | 8 +------- R/generate_fragments.R | 22 +++++++++++++++++----- R/simulate_experiment.R | 7 +++++-- man/polyester.Rd | 2 ++ man/simulate_experiment.Rd | 7 +++++-- 5 files changed, 30 insertions(+), 16 deletions(-) diff --git a/R/check_extras.R b/R/check_extras.R index 6e57da2..43adcb2 100644 --- a/R/check_extras.R +++ b/R/check_extras.R @@ -41,7 +41,7 @@ if(!('bias' %in% names(extras))){ extras$bias = 'none' - }else{ + } else if (is.character(extras$bias)) { extras$bias = match.arg(extras$bias, c('none', 'rnaf', 'cdnaf')) } @@ -57,12 +57,6 @@ extras$path = paste0(extras$model_path, '/', extras$model_prefix) }#this should work beause we already checked stuff. - if(!('bias' %in% names(extras))){ - extras$bias = 'none' - }else{ - extras$bias = match.arg(extras$bias, c('none', 'rnaf', 'cdnaf')) - } - if(!('lib_sizes' %in% names(extras))){ extras$lib_sizes = rep(1, total.n) }else{ diff --git a/R/generate_fragments.R b/R/generate_fragments.R index 77a8695..1317220 100644 --- a/R/generate_fragments.R +++ b/R/generate_fragments.R @@ -74,7 +74,9 @@ generate_fragments = function(tObj, fraglen=250, fragsd=25, readlen=100, distr='normal', custdens=NULL, bias='none', frag_GC_bias='none') { - bias = match.arg(bias, c('none', 'rnaf', 'cdnaf')) + if (is.character(bias)) { + bias = match.arg(bias, c('none', 'rnaf', 'cdnaf')) + } distr = match.arg(distr, c('normal', 'empirical', 'custom')) L = width(tObj) if(distr == 'empirical'){ @@ -92,21 +94,31 @@ generate_fragments = function(tObj, fraglen=250, fragsd=25, } s = which(fraglens < L) n = length(s) - if(bias == 'none'){ + if (is.character(bias) && bias == 'none') { start_pos = floor(runif(n, min=rep(1,n), max=L[s]-fraglens[s]+2)) - }else if(bias == 'rnaf'){ + } else if (is.character(bias) && bias == 'rnaf') { data(rnaf) starts_pct = sample(rnaf$pospct, size=n, prob=rnaf$prob, replace=TRUE) starts_pct[starts_pct==1] = 0.999 start_pos = floor(starts_pct * (L[s]-fraglens[s]+2)) start_pos[start_pos==0] = 1 - }else{ - # bias == 'cdnaf' + } else if (is.character(bias) && bias == 'cdnaf') { data(cdnaf) starts_pct = sample(cdnaf$pospct, size=n, prob=cdnaf$prob, replace=TRUE) starts_pct[starts_pct==1] = 0.999 start_pos = floor(starts_pct * (L[s]-fraglens[s]+2)) start_pos[start_pos==0] = 1 + } else if (is.matrix(bias) || is.data.frame(bias)) { + pos = bias[,1] + prob = bias[,2] + prob = prob / sum(prob) + if (max(pos) > 1.01) pos = pos / 100 + starts_pct = sample(pos, size=n, prob=prob, replace=TRUE) + starts_pct[starts_pct==1] = 0.999 + start_pos = floor(starts_pct * (L[s]-fraglens[s]+2)) + start_pos[start_pos==0] = 1 + } else { + stop("Unknown bias type. Must be 'none', 'rnaf', 'cdnaf', or a 100x2 matrix/data.frame.") } tObj[s] = subseq(tObj[s], start=start_pos, width=fraglens[s]) names(tObj)[s] = paste0(names(tObj[s]), ';mate1:', start_pos, '-', diff --git a/R/simulate_experiment.R b/R/simulate_experiment.R index 7a7efba..e3c9079 100644 --- a/R/simulate_experiment.R +++ b/R/simulate_experiment.R @@ -203,7 +203,7 @@ #' provided to \code{build_error_model.py} and is whatever comes before the #' _mate1/_mate2 or _single files in \code{model_path}. #' } -#' \item \code{bias} One of 'none', 'rnaf', or 'cdnaf'. 'none' +#' \item \code{bias} One of 'none', 'rnaf', or 'cdnaf', or a 100x2 matrix/data.frame. 'none' #' represents uniform fragment selection (every possible fragment in a #' transcript has equal probability of being in the experiment); 'rnaf' #' represents positional bias that arises in protocols using RNA @@ -212,7 +212,10 @@ #' coverage is higher in the middle of the transcript and lower at both ends, #' and in the 'cdnaf' model, coverage increases toward the 3' end of the #' transcript. The probability models used come from Supplementary Figure S3 -#' of Li and Jiang (2012). Defaults to 'none' if you don't provide this. +#' of Li and Jiang (2012). +#' If a matrix/data.frame, column 1 is position (1-100 or 0-1), column 2 is probability (must sum to 1). +#' Allows user-specified positional bias, e.g., from a beta distribution. +#' Defaults to 'none' if you don't provide this. #' \item \code{gcbias} list indicating which samples to add GC bias to, and #' from which models. Should be the same length as \code{sum(num_reps)}; #' entries can be either numeric or of class \code{loess}. A numeric entry of diff --git a/man/polyester.Rd b/man/polyester.Rd index 51858cc..bfe7e50 100644 --- a/man/polyester.Rd +++ b/man/polyester.Rd @@ -2,6 +2,8 @@ % Please edit documentation in R/polyester-package.R \docType{package} \name{polyester} +\alias{polyesterlocal} +\alias{polyesterlocal-package} \alias{polyester} \title{Polyester: simulating RNA-seq reads including differential expression} \description{ diff --git a/man/simulate_experiment.Rd b/man/simulate_experiment.Rd index c29e7a6..a37fe70 100644 --- a/man/simulate_experiment.Rd +++ b/man/simulate_experiment.Rd @@ -140,7 +140,7 @@ Reads can either be simulated from a FASTA file of transcripts provided to \code{build_error_model.py} and is whatever comes before the _mate1/_mate2 or _single files in \code{model_path}. } - \item \code{bias} One of 'none', 'rnaf', or 'cdnaf'. 'none' + \item \code{bias} One of 'none', 'rnaf', or 'cdnaf', or a 100x2 matrix/data.frame. 'none' represents uniform fragment selection (every possible fragment in a transcript has equal probability of being in the experiment); 'rnaf' represents positional bias that arises in protocols using RNA @@ -149,7 +149,10 @@ Reads can either be simulated from a FASTA file of transcripts coverage is higher in the middle of the transcript and lower at both ends, and in the 'cdnaf' model, coverage increases toward the 3' end of the transcript. The probability models used come from Supplementary Figure S3 - of Li and Jiang (2012). Defaults to 'none' if you don't provide this. + of Li and Jiang (2012). +If a matrix/data.frame, column 1 is position (1-100 or 0-1), column 2 is probability (must sum to 1). + Allows user-specified positional bias, e.g., from a beta distribution. +Defaults to 'none' if you don't provide this. \item \code{gcbias} list indicating which samples to add GC bias to, and from which models. Should be the same length as \code{sum(num_reps)}; entries can be either numeric or of class \code{loess}. A numeric entry of