diff --git a/src/AmyrisBio/primercore2.fs b/src/AmyrisBio/primercore2.fs index a361954..d3fcaa2 100644 --- a/src/AmyrisBio/primercore2.fs +++ b/src/AmyrisBio/primercore2.fs @@ -237,6 +237,22 @@ module primercore = g3Final > -9.0 + // find longest tail to tail pair that is self complementary + // 5' tgtCTTTAAAG 3' + // 3' GAAATTTCtgtg 5' + let longestTailTailOverlap (s1:char array) (s2:char array) = + let suffix (d:char array) l = d.[d.Length-l..] + + let maxOverlap = min s1.Length s2.Length + seq { + for i in 0..maxOverlap do + let suffix1 = suffix s1 i + let suffix2 = suffix s2 i + if suffix1 = Amyris.Bio.biolib.revComp suffix2 then + yield i + } |> Seq.fold (max) 0 + + /// Test if an oligo is self complementary let selfComp (oligo:char array) length = let rec checkOne l r = @@ -363,58 +379,144 @@ module primercore = let _,_,x = _temp p oligo N x + /// check longest self complementary tail for + /// oligo defined by f'->t' inclusive + let localLongestTailTailOverlap (s:char []) primerFrom primerTo = + let l = primerTo-primerFrom+1 + seq { + for i in l-1..-1..1 do + if seq { 0..i-1 } |> Seq.forall (fun j -> s.[primerTo-i+j] = biolib.rcBase (s.[primerTo-j])) then + yield (i+1) + } + |> Seq.tryFind(fun x -> x<> 0) + |> fun x -> defaultArg x 0 + + /// disrupt self primer dimers + /// Tweak the stopping point of a primer if it would help avoid a + /// situation where last N bases of the primer are self complementary and + /// can lead to formation of a primer dimer + /// 5'-------CTTTAAAG-3' + /// 3'GAAATTTC--------5' + /// primerLeft and primerRight delineate the range of bases in the underlying template s that are currently in the primer + let disruptPrimerDimers (debug:bool) (existingTemp : float) (p:PrimerParams) (s:char[]) primerLeft primerRight offset = + if debug then + printfn "disruptPrimerDimers: checking for problems" + + let initialTail = localLongestTailTailOverlap s primerLeft primerRight + if debug then printfn "disruptPrimerDimers: initialTail of length %d for %s" initialTail (arr2seq s.[primerLeft..primerRight]) + // no-op + if initialTail < 3 then + { tag = ""; oligo = s.[primerLeft..primerRight] ; temp = existingTemp ; offset = primerLeft+offset } + else + if debug then printfn "disruptPrimerDimers: found problematic initialTail of length %d initial f=%d t=%d" initialTail primerLeft primerRight + seq { for i in [1 ; -1 ; 2 ; -2 ; 3 ; -3 ; 4 ; -4; 5; -5 ; 6 ; -6] do + let primerRightNew = primerRight+i + let primerLenNew = primerRightNew-primerLeft+1 + if debug then printfn "disruptPrimerDimers: considering f=%d t=%d len=%d min=%d max=%d templateLen=%d" primerLeft primerRightNew primerLenNew p.minLength p.maxLength s.Length + if primerLenNew >= p.minLength && primerLenNew <= p.maxLength && primerRightNew < s.Length && primerRightNew > primerLeft then + let tail' = localLongestTailTailOverlap s primerLeft primerRightNew + if debug then printfn "disruptPrimerDimers: considering tail of length %d for %d" tail' primerRightNew + yield tail',primerRightNew + } + |> Seq.fold (min) (initialTail,primerRight) + |> fun (finalTail,finalT) -> + if debug then printfn "disruptPrimerDimers: finalTail of length %d deltaT=%d f=%d finalT=%d" finalTail (finalT-primerRight) primerLeft finalT + let finalTemp = temp p (s.[primerLeft..finalT]) (finalT-primerLeft+1) + { tag = ""; oligo = s.[primerLeft..finalT] ; temp = finalTemp; offset = primerLeft+offset } + + /// Cut out the region from fr -> to and extend - // Given oligo array from to gcInit offset , return oligo, temp, offset - let cutToGC (debug:bool) (existingTemp : float) (p:PrimerParams) (s:char[]) f t _ (*startingGC*) offset = - //let startingN = t-f + 1 - let rec findGCFwd t' (*gc' *) = - if t' < 0 || t' >= s.Length then - failwithf "ERROR: primercord findGC array bounds exception t'=%d\n" t' - match s.[t'] with - |'G' | 'g' | 'C' | 'c' when threePrimeStable false (s.[..t'])-> - (t', temp p (s.[f..t']) (t'-f+1)) // end on G/C + /// Given oligo array from to gcInit offset , return oligo, temp, offset + let cutToGC (debug:bool) (existingTemp : float) (p:PrimerParams) (s:char[]) primerFrom primerTo _startingGC offset = + if debug then + printfn "cutToGC: starting design f=%d t=%d oligo=%s" + primerFrom + primerTo + (arr2seq s.[primerFrom..primerTo]) + let rec findGCFwd newRight = + if newRight < 0 || newRight >= s.Length then + failwithf "ERROR: primercord findGC array bounds exception t'=%d\n" newRight + match s.[newRight] with + |'G' | 'g' | 'C' | 'c' when threePrimeStable false (s.[..newRight])-> + Some (newRight, temp p (s.[primerFrom..newRight]) (newRight-primerFrom+1)) // end on G/C - | _ when t' < (s.Length-1) && t'-f+1 < p.maxLength -> findGCFwd (t'+1) - | _ -> (t, (temp p s.[f..t] (t-f+1))) // fall back on original oligo if we run out of S without finding G - - let rec findGCRev t' = - if t' < 0 || t' >= s.Length then - failwithf "ERROR: primercord findGC array bounds exception t'=%d\n" t' - match s.[t'] with - |'G' | 'g' | 'C' | 'c' when threePrimeStable false (s.[..t']) -> - (t', temp p (s.[f..t']) (t'-f+1)) // end on G/C + | _ when newRight < (s.Length-1) && newRight-primerFrom+1 < p.maxLength -> findGCFwd (newRight+1) + | _ -> None + + let rec findGCRev newRight = + if newRight < 0 || newRight >= s.Length then + failwithf "ERROR: primercord findGC array bounds exception newRight=%d\n" newRight + match s.[newRight] with + |'G' | 'g' | 'C' | 'c' when threePrimeStable false (s.[..newRight]) -> + Some (newRight, temp p (s.[primerFrom..newRight]) (newRight-primerFrom+1)) - | _ when t' > 1 && t'-f+1 > p.minLength -> findGCRev (t'-1) - | _ -> (t, (temp p s.[f..t] (t-f+1))) // fall back on original oligo if we run out of S without finding G + | _ when newRight > 1 && newRight-primerFrom+1 > p.minLength -> findGCRev (newRight-1) + | _ -> None - if p.ATPenalty < 0.0001 then + // did we already end on a G or a C + let endsWithGC = s.[primerTo] |> base2GC = 1 + if endsWithGC || p.ATPenalty < 0.0001 then // No GC optimization - if debug then printfn "cutToGC: no atPenalty, done" - { tag = ""; oligo = s.[f..t] ; temp = temp p (s.[f..t]) (t-f+1); offset = f+offset } + if debug then printfn "cutToGC: no atPenalty or endsWithGC already, done len=%d" (primerTo-primerFrom+1) + { tag = ""; oligo = s.[primerFrom..primerTo] ; temp = temp p (s.[primerFrom..primerTo]) (primerTo-primerFrom+1); offset = primerFrom+offset } else // Is there an alternative starting point that would end on a G or C that's not too far away? - let altTFwd,altTempFwd = findGCFwd t - let altTRev,altTempRev = findGCRev t - - assert(altTempFwd > 10.0) - assert(altTempRev > 10.0) - if debug then - printfn "cutToGC: altTempFwd=%A altTFwd=%d altTempRev=%A altRFwd=%d" - altTempFwd altTFwd altTempRev altTRev - - // Choose the direction to a GC that was least perturbative - let altT,altTemp = - if abs (altTempFwd-existingTemp) < abs(altTempRev-existingTemp) then - altTFwd,altTempFwd - else altTRev,altTempRev - - if abs (altTemp-existingTemp) <= p.ATPenalty then - if debug then printfn "cutToGC: using altGC option temp=%A f=%d t=%d" altTemp f altT - - { tag = ""; oligo = s.[f..altT] ; temp = altTemp ; offset = f+offset} - else - if debug then printfn "cutToGC: ignore altGC option temp=%A f=%d t=%d" temp f t - { tag = ""; oligo = s.[f..t] ; temp = temp p (s.[f..t]) (t-f+1); offset = f+offset } + let altBest = + match findGCFwd primerTo,findGCRev primerTo with + | None,None -> None // no better option + | Some (altPos,altTemp),None -> + if debug then + printfn "cutToGC: altTempFwd=%A altTFwd=%d altRev=None" + altTemp altPos + Some (altPos,altTemp) // only one worked + | None, Some (altPos,altTemp) -> + if debug then + printfn "cutToGC: altTempRev=%A altTRev=%d altFwd=None" + altTemp + altPos + Some (altPos,altTemp) // only one worked + | Some (altPosFwd,altTempFwd),Some(altPosRev,altTempRev) -> + if debug then + printfn "cutToGC: altTempFwd=%A alTFwd=%d altTempRev=%A altTRev=%d" + altTempFwd altPosFwd altTempRev altPosRev + if abs (altTempFwd-existingTemp) < abs(altTempRev-existingTemp) then + altPosFwd,altTempFwd + else altPosRev,altTempRev + |> Some + + match altBest with + | Some (_,temperature) -> + if temperature < 10.0 then + failwithf "wikitemp1000: fail design temperature of %A < 10C" temperature // ensure selected temp isn't crazy + | None -> () + + match altBest with + | None -> + if debug then + printfn "cutToGC: no better altGC version, going with original f=%d t=%d oligo=%s" + primerFrom + primerTo + (arr2seq s.[primerFrom..primerTo]) + { tag = ""; oligo = s.[primerFrom..primerTo] ; temp = temp p (s.[primerFrom..primerTo]) (primerTo-primerFrom+1); offset = primerFrom+offset } + + | Some(_,altTemp) when (altTemp-existingTemp) > p.ATPenalty -> + // stick with original design, compromise too big + if debug then + printfn "cutToGC: ignore altGC option (compromise not worth it) temp=%A f=%d t=%d oligo=%s" + temp + primerFrom + primerTo + (arr2seq s.[primerFrom..primerTo]) + { tag = ""; oligo = s.[primerFrom..primerTo] ; temp = temp p (s.[primerFrom..primerTo]) (primerTo-primerFrom+1); offset = primerFrom+offset } + | Some(altPos,altTemp) -> + if debug then + printfn "cutToGC: using altGC option temp=%A f=%d t=%d oligo=%s" + altTemp + primerFrom + altPos + (arr2seq s.[primerFrom..altPos]) + + { tag = ""; oligo = s.[primerFrom..altPos] ; temp = altTemp ; offset = primerFrom+offset} let upper (s:char array) = [| for x in s -> @@ -461,7 +563,11 @@ module primercore = if debug then printfn "designLeft, stopping at nextBase-1=%d thisTemp=%f" (nextBase-1) (thisTemp/1.0) Some(cutToGC debug lastTemp p s 0 (nextBase-1) gcCount offset) - designLeftInternal sInitUpper nextBase gcCount (0.0) + designLeftInternal sInitUpper nextBase gcCount (0.0) + |> Option.map (fun prePDCheck -> + // possible final tweaks if we have made a primer-dimer with the tail + disruptPrimerDimers debug prePDCheck.temp p sInitUpper 0 (prePDCheck.oligo.Length-1) offset + ) /// Check for the presence of mono- or dinucleotide repeats longer than N let hasPolyrun n (s:char[]) = @@ -538,12 +644,16 @@ module primercore = let threeP = if r-l+1 > 5 then threePrimeStable false (s.[l..r]) else false // get average of all nucleotide specific penalties let seqP = Array.average seqPen.[l..r] + // check ability to self dimerize + let primerDimerPotential = longestTailTailOverlap (s.[l..r]) (s.[l..r]) let p = posCloseness + tempCloseness + lenCloseness + (if threeP then 0.0 else pen.threePrimeUnstablePenalty) + (if hasPoly then pen.polyPenalty else 0.0) + - seqP + seqP + + (if primerDimerPotential > 2 then (float primerDimerPotential)*3.0 else 0.0) + if (abs(t - tTemp) <= pen.tmMaxDifference) && (not polyC) then yield {l = l ; @@ -655,16 +765,13 @@ module primercore = printf "No oligo in required length, so make max allowable max=%d templateLen=%d\n" pen.maxLength s'.Length let oligo = s'.[..(min s'.Length pen.maxLength)-1] // longest allowed - Some( { tag=o.tag; oligo = oligo ; temp = temp pen oligo pen.maxLength; offset = o.offset } ) // calc temperature and offset is 0 plus their supplied offset + let oligoTemp = temp pen oligo oligo.Length + let oligo' = disruptPrimerDimers debug oligoTemp pen s' 0 (oligo.Length-1) o.offset + Some( { oligo' with tag=o.tag ; temp = temp pen oligo pen.maxLength; offset = o.offset } ) // calc temperature and offset is 0 plus their supplied offset | x -> x // | RIGHT -> failwith "should not get right here" | CENTERLEFT -> designCenter debug pen s' seqPen offFn ( (float o.targetTemp)*1.0) - - (*with - | Some(a,b,c,d) -> Some( { tag=o.tag; oligo = a ; temp = b ; offset = c} ) - | None -> None *) - | CENTERRIGHT -> failwith "should not get centerright here" (* diff --git a/tests/AmyrisBio.Tests/testPrimer.fs b/tests/AmyrisBio.Tests/testPrimer.fs index 67c20ad..aaff2ab 100644 --- a/tests/AmyrisBio.Tests/testPrimer.fs +++ b/tests/AmyrisBio.Tests/testPrimer.fs @@ -3,9 +3,89 @@ open Amyris.Bio.primercore open NUnit.Framework - [] type TestPrimer() = class + let common = [ + "AAAAAATCTTAATAGATTAATTTAAACAGTATATGTACAGTTTTATATATATATATATATATATATATATATATATATATATATATATATATATATATACATATATAAAG" ; + "AAAAAATCTTAATAGATTAATTTAAACAGTATATGTACAGTTTTATATATATATATATATATATATATATATATATATATATATATATATATATACATATATAAAG" ; + "AAAAAATCTTAATAGATTAATTTAAACAGTATATGTACAGTTTTATATATATATATATATATATATATATATATATATATATATATATATATACATATATAAAG" ; + "AAAAAATCTTAATAGATTAATTTAAACAGTATATGTACAGTTTTATATATATATATATATATATATATATATATATATATATATATATATACATATATAAAG" ; + "AAAAAATCTTAATAGATTAATTTAAACAGTATATGTACAGTTTTATATATATATATATATATATATATATATATATATATATATATATACATATATAAAG" ; + "AAAAAATCTTAATAGATTAATTTAAACAGTATATGTACAGTTTTATATATATATATATATATATATATATATATATATATATATATACATATATAAAG" ; + "AAAAAATCTTAATAGATTAATTTAAACAGTATATGTACAGTTTTATATATATATATATATATATATATATATATATATATATATACATATATAAAG" ; + "AAAAATCTTAATAGATTAATTTAAACAGTATATGTACAGTTTTATATATATATATATATATATATATATATATATATATATACATATATAAAG" ; + "AAAATCTTAATAGATTAATTTAAACAGTATATGTACAGTTTTATATATATATATATATATATATATATATATATATATACATATATAAAG" ; + "AAATCTTAATAGATTAATTTAAACAGTATATGTACAGTTTTATATATATATATATATATATATATATATATATATACATATATAAAG" ; + "AATCTTAATAGATTAATTTAAACAGTATATGTACAGTTTTATATATATATATATATATATATATATATATATATACATATATAAAG" ; + "ATCTTAATAGATTAATTTAAACAGTATATGTACAGTTTTATATATATATATATATATATATATATATATATATACATATATAAAG" ; + "TCTTAATAGATTAATTTAAACAGTATATGTACAGTTTTATATATATATATATATATATATATATATATATATACATATATAAAG" ; + "CTTAATAGATTAATTTAAACAGTATATGTACAGTTTTATATATATATATATATATATATATATATATATATACATATATAAAG" ; + "TTAATAGATTAATTTAAACAGTATATGTACAGTTTTATATATATATATATATATATATATATATATATATACATATATAAAG" ; + "TAATAGATTAATTTAAACAGTATATGTACAGTTTTATATATATATATATATATATATATATATATATATACATATATAAAG" ; + "AATAGATTAATTTAAACAGTATATGTACAGTTTTATATATATATATATATATATATATATATATATATACATATATAAAG" ; + "ATAGATTAATTTAAACAGTATATGTACAGTTTTATATATATATATATATATATATATATATATATATACATATATAAAG" ; + "TAGATTAATTTAAACAGTATATGTACAGTTTTATATATATATATATATATATATATATATATATATACATATATAAAG" ; + "AGATTAATTTAAACAGTATATGTACAGTTTTATATATATATATATATATATATATATATATATATACATATATAAAG" ; + "GATTAATTTAAACAGTATATGTACAGTTTTATATATATATATATATATATATACATATATAAAG" ; + "ATTAATTTAAACAGTATATGTACAGTTTTATATATATATATATATATATATACATATATAAAG" ; + "TTAATTTAAACAGTATATGTACAGTTTTATATATATATATATATATATATACATATATAAAG" ; + "TAATTTAAACAGTATATGTACAGTTTTATATATATATATATATATATATACATATATAAAG" ; + "AATTTAAACAGTATATGTACAGTTTTATATATATATATATATATATATACATATATAAAG" ; + "ATTTAAACAGTATATGTACAGTTTTATATATATATATATATATATATACATATATAAAG" ; + "TTTAAACAGTATATGTACAGTTTTATATATATATATATATATATATACATATATAAAG" ; + "TTAAACAGTATATGTACAGTTTTATATATATATATATATATATATACATATATAAAG" ; + "TAAACAGTATATGTACAGTTTTATATATATATATATATATATATACATATATAAAG" ; + "AAACAGTATATGTACAGTTTTATATATATATATATATATATATACATATATAAAG" ; + "AAAAAATCTTAATAGATTAATTTAAACAGTATATGTACAGTTTTATATATATATATATATATATATACATATATAAAG" ; + ] + + let templates1 = + [ "GAAATCTGTACCAACCGTATAGGTGAAAGAGACCCTTTATGG" ; + // "ATGCCATTCGTTGTTCCTAGAAGAAACCGTTCTTTGT" ; // interesting case, but dimer check overrides GC clamp so will fail + "ATGGTCATTGCTGAAGTTCCTAAATTAGCCTCTGCC" ; + "AAAAAAAAAATCTTAATAGATTAATTTAAACAGTATATGTACAGTTTTATATATATATAATATATATATATTATATATATATA" ; + "AAAAAAAAAATCTTAATAGATTAATTTAAACAGTATATGTACAGTTTTATATATATATAATATATATATATTATATATATATACATATATAAAG" ; + ]@common + + let templates2 = + [ + "AAAAAAAAAATCTTAATAGATTAATTTAAACAGTATATGTACAGTTTTATATATATATAATATATATATATTATATATATATA" ; + "AAAAAAAAAATCTTAATAGATTAATTTAAACAGTATATGTACAGTTTTATATATATATAATATATATATATTATATATATATACATATATAAAG" ; + "AAAAAATCTTAATAGATTAATTTAAACAGTATATGTACAGTTTTATATATATATATATATATATATATATATTATATATATATATATATATAATATATATATATATATATATATATATACATATATAAAG" ; + "AAAAAATCTTAATAGATTAATTTAAACAGTATATGTACAGTTTTATATATATATATATATATATATATATTATATATATATATATAATATATATATATATATATATATATATATACATATATAAAG" ; + "AAAAAATCTTAATAGATTAATTTAAACAGTATATGTACAGTTTTATATATATATATATATATATATATATATATATATATATATATATATATATATATATATATATATATACATATATAAAG" ; + "AAAAAATCTTAATAGATTAATTTAAACAGTATATGTACAGTTTTATATATATATATATATATATATATATATATATATATATATATATATATATATATATATATATACATATATAAAG" ; + "AAAAAATCTTAATAGATTAATTTAAACAGTATATGTACAGTTTTATATATATATATATATATATATATATATATATATATATATATATATATATATATACATATATAAAG" ; + "CGGTTGGGCTTAACTTTAAAGAAAAAAGTTGAGATTAGATTTATTGTGTT" + ]@common + + let makeTask (template:string) = + {tag = "PR"; + temp = (template.ToCharArray()) + align = LEFT; + strand = TOP; + offset = 0; + targetTemp = 60.0; + sequencePenalties = None} + + let testPenalty = { + tmPenalty = 1.0; + tmMaxDifference = 5.0 ; + positionPenalty = 5.0 ; + lengthPenalty = 3.0 ; + polyLengthThreshold = 4; + polyPenalty = 10. ; + threePrimeUnstablePenalty = 5.0 ; + ATPenalty=3.0 ; + targetLength=20 ; + maxLength = 80 ; + minLength = 20 ; + monovalentConc = mM2M 50.0; + primerConc = uM2M 0.25 ; + divalentConc = mM2M 1.5 ; + templateConc = uM2M 0.01 ; + dNTPConc = uM2M 0.0 ;} + do () [] @@ -39,32 +119,44 @@ type TestPrimer() = class [] /// Test GC rich template can make minimum length primer member __.TestGCRich() = - let template="CTGCCGGCGACGTGGAGCGTCCGATTGTGACGCGCCTGAGCAACCCGGGCACGGTGCTGCGCGAGTCGTGCGACGCCTCACTGCTGGTGCAGGCCATCATCGACGCCATCGTCGACCTGGCCGTGCCCCTGACGGCCGCGTACAACGACGT".ToCharArray() - let pen={lengthPenalty = 3.0; - tmPenalty = 1.0; - tmMaxDifference = 5.0; - positionPenalty = 5.0; - polyLengthThreshold = 4; - polyPenalty = 10.0; - threePrimeUnstablePenalty = 5.0; - ATPenalty = 3.0; - maxLength = 60; - minLength = 20; - targetLength = 20; - monovalentConc = 0.05; - divalentConc = 0.0015; - primerConc = 2.5e-07; - templateConc = 1e-08; - dNTPConc = 0.0} + let template="CTGCCGGCGACGTGGAGCGTCCGATTGTGACGCGCCTGAGCAACCCGGGCACGGTGCTGCGCGAGTCGTGCGACGCCTCACTGCTGGTGCAGGCCATCATCGACGCCATCGTCGACCTGGCCGTGCCCCTGACGGCCGCGTACAACGACGT" + let p = oligoDesignWithCompromise false testPenalty (makeTask template) + Assert.IsTrue(p.IsSome) - let task = { tag = "PR"; - temp = template - align = CENTERLEFT; - strand = TOP; - offset = 0; - targetTemp = 60.0; - sequencePenalties = None} + [] + /// Test that GC clamp finding is behaving sensibly by tempting designer with long AT rich regions + member __.TestGCClamp() = + for template in templates1 do + // set false->true for diagnostics debugging test cases + match oligoDesign false testPenalty (makeTask template) with + | None -> + Assert.Fail (sprintf "TestGClamp failed to make a primer for %s" template) + | Some design -> + match design.oligo.[design.oligo.Length-1] with + | 'G' | 'C' -> () // good + | x -> Assert.Fail( // bad + sprintf "TestGCClamp yielded oligo ending in '%c' not G or C for template %s oligo=%s length=%d temp=%A" + x + template + (Amyris.Bio.utils.arr2seq design.oligo) + design.oligo.Length + design.temp + ) + [] + /// Test that we don't form a primer dimer under tempting circumstances + member __.TestPrimerDimer() = + let verbose = false + for template in templates2 do + if verbose then printfn "Testing %s for primer dimer avoidance" template + if verbose then printfn "designing oligo for %s " template + match oligoDesign false testPenalty (makeTask template) with + | None -> + if verbose then printfn "failed design for %s " template + Assert.Fail "TestPrimerDimer failed to make a primer" + | Some design -> + if verbose then printfn "succeeded design %A %s for %s " design.temp (Amyris.Bio.utils.arr2seq design.oligo) template + let tail = Amyris.Bio.primercore.longestTailTailOverlap design.oligo design.oligo + if verbose then printfn "tail is %d for %s" tail template + Assert.IsTrue(tail<4) - let p = oligoDesignWithCompromise false pen task - Assert.IsTrue(p.IsSome) end