From fa5685865b4061ae8b4f0f0e6ab5191a3a1bf2a1 Mon Sep 17 00:00:00 2001 From: Hernan Morales Date: Mon, 16 Mar 2026 14:50:36 -0300 Subject: [PATCH] Add full frequency table for every possible k-mer (named integer vector or matrix with all 4^k entries). Similar to Biostrings (oligonucleotideFrequency family). Add #gtoolkit baseline spec. --- .../BaselineOfBioSmalltalk.class.st | 78 ++++---- .../BioTools-Tests/BioAlphabetTest.class.st | 8 +- .../BioGenomicRangesTest.class.st | 38 +++- .../BioTools-Tests/BioSequenceTest.class.st | 185 +++++++++++++++++- repository/BioTools/BioSequence.class.st | 70 ++++++- 5 files changed, 317 insertions(+), 62 deletions(-) diff --git a/repository/BaselineOfBioSmalltalk/BaselineOfBioSmalltalk.class.st b/repository/BaselineOfBioSmalltalk/BaselineOfBioSmalltalk.class.st index 58caffdb..8d34a0c2 100644 --- a/repository/BaselineOfBioSmalltalk/BaselineOfBioSmalltalk.class.st +++ b/repository/BaselineOfBioSmalltalk/BaselineOfBioSmalltalk.class.st @@ -20,33 +20,31 @@ Class { { #category : 'baselines' } BaselineOfBioSmalltalk >> baseline: spec [ - - spec for: #'common' do: [ - spec blessing: #'baseline'. - - spec preLoadDoIt: #preLoad. - spec postLoadDoIt: #'postLoad'. - - self - projectSpecForCommonUtils: spec; - projectSpecForDataDrameInspector: spec; - "projectSpecForProjectFramework: spec;" - projectSpecForStringExtensions: spec; - projectSpecForFileDialog: spec; - projectSpecForPolyMath: spec; - projectSpecForDataFrame: spec; - projectSpecForINIParser: spec; - projectSpecForPetitParser: spec; - projectSpecForNeedlemanWunsch: spec. + + spec for: #common do: [ + spec blessing: #baseline. + + spec preLoadDoIt: #preLoad. + spec postLoadDoIt: #postLoad. 
+ + self + projectSpecForCommonUtils: spec; + projectSpecForDataDrameInspector: spec; + "projectSpecForProjectFramework: spec;" + projectSpecForStringExtensions: spec; + projectSpecForFileDialog: spec; + projectSpecForPolyMath: spec; + projectSpecForDataFrame: spec; + projectSpecForINIParser: spec; + projectSpecForPetitParser: spec; + projectSpecForNeedlemanWunsch: spec. "projectSpecForRoassal2: spec." - self baselineOSDeps: spec. - self baselineCommonPackages: spec. - self baselineGroups: spec ]. - - self baselinePharo: spec. + self baselineCommonPackages: spec. + self baselineGroups: spec ]. + self baselinePharo: spec ] { #category : 'groups' } @@ -170,26 +168,34 @@ BaselineOfBioSmalltalk >> baselineOSDeps: spec [ { #category : 'baselines' } BaselineOfBioSmalltalk >> baselinePharo: spec [ + spec for: #( #gtoolkit ) do: [ + self projectSpecForXMLPharo12: spec. + self commonPackagesForPharo7onWards: spec ]. + spec for: #( #'pharo4.x' #'pharo5.x' #'pharo6.x' ) do: [ - self projectSpecForXMLPharo7: spec. - spec - package: 'BioTools' with: [ spec includes: #( 'BioPharo4' ) ]; - package: 'BioPharo4' with: [ spec requires: #( 'BioTools' ) ]; - group: 'Basic' with: #( 'BioPharo4' ) ]. + self baselineOSDeps: spec. + self projectSpecForXMLPharo7: spec. + spec + package: 'BioTools' with: [ spec includes: #( 'BioPharo4' ) ]; + package: 'BioPharo4' with: [ spec requires: #( 'BioTools' ) ]; + group: 'Basic' with: #( 'BioPharo4' ) ]. spec for: #'pharo7.x' do: [ - self projectSpecForXMLPharo7: spec. - self commonPackagesForPharo7onWards: spec ]. + self baselineOSDeps: spec. + self projectSpecForXMLPharo7: spec. + self commonPackagesForPharo7onWards: spec ]. spec for: #( #'pharo8.x' #'pharo9.x' #'pharo10.x' #'pharo11.x' ) do: [ - self projectSpecForXMLPharo8: spec. - self commonPackagesForPharo7onWards: spec ]. - - spec for: #( #'pharo12.x' #'pharo13.x' #'pharo14.x') do: [ - self projectSpecForXMLPharo12: spec. 
- self commonPackagesForPharo7onWards: spec ] + self baselineOSDeps: spec. + self projectSpecForXMLPharo8: spec. + self commonPackagesForPharo7onWards: spec ]. + + spec for: #( #'pharo12.x' #'pharo13.x' #'pharo14.x' ) do: [ + self baselineOSDeps: spec. + self projectSpecForXMLPharo12: spec. + self commonPackagesForPharo7onWards: spec ] ] { #category : 'baselines' } diff --git a/repository/BioTools-Tests/BioAlphabetTest.class.st b/repository/BioTools-Tests/BioAlphabetTest.class.st index 7af38be1..8d6469a5 100644 --- a/repository/BioTools-Tests/BioAlphabetTest.class.st +++ b/repository/BioTools-Tests/BioAlphabetTest.class.st @@ -1,12 +1,12 @@ Class { #name : 'BioAlphabetTest', #superclass : 'BioAbstractTest', - #category : 'BioTools-Tests-Alphabets', - #package : 'BioTools-Tests', - #tag : 'Alphabets', #instVars : [ 'alphabet' - ] + ], + #category : 'BioTools-Tests-Alphabets', + #package : 'BioTools-Tests', + #tag : 'Alphabets' } { #category : 'testing' } diff --git a/repository/BioTools-Tests/BioGenomicRangesTest.class.st b/repository/BioTools-Tests/BioGenomicRangesTest.class.st index 75701151..cc6febdc 100644 --- a/repository/BioTools-Tests/BioGenomicRangesTest.class.st +++ b/repository/BioTools-Tests/BioGenomicRangesTest.class.st @@ -223,13 +223,15 @@ BioGenomicRangesTest >> testFromGFF [ { #category : 'tests' } BioGenomicRangesTest >> testIRangesRejectNegativeWidth [ - self should: [ BioIRanges starts: #(5) ends: #(4) ] raise: Error + + self should: [ BioIRanges starts: #( 5 ) ends: #( 4 ) ] raise: Error ] { #category : 'tests' } BioGenomicRangesTest >> testIRangesWidth [ + | ir | - ir := BioIRanges starts: #(2 10) ends: #(4 12). + ir := BioIRanges starts: #( 2 10 ) ends: #( 4 12 ). self assert: (ir width at: 1) equals: 3. 
self assert: (ir width at: 2) equals: 3 ] @@ -251,9 +253,20 @@ BioGenomicRangesTest >> testMetadataCanContainObjects [ { #category : 'tests' } BioGenomicRangesTest >> testOverlapWith [ + | a b c | - a := BioGenomicRanges seqnames: #('chr1' 'chr1') starts: #(1 50) ends: #(10 60) strands: #('+' '+') metadata: Dictionary new. - b := BioGenomicRanges seqnames: #('chr1') starts: #(5) ends: #(15) strands: #('+') metadata: Dictionary new. + a := BioGenomicRanges + seqnames: #( 'chr1' 'chr1' ) + starts: #( 1 50 ) + ends: #( 10 60 ) + strands: #( '+' '+' ) + metadata: Dictionary new. + b := BioGenomicRanges + seqnames: #( 'chr1' ) + starts: #( 5 ) + ends: #( 15 ) + strands: #( '+' ) + metadata: Dictionary new. c := a overlapWith: b. self assert: c size equals: 1. self assert: (c start at: 1) equals: 1 @@ -261,17 +274,28 @@ BioGenomicRangesTest >> testOverlapWith [ { #category : 'tests' } BioGenomicRangesTest >> testReduceKeepsStrandSeparate [ + | gr red | - gr := BioGenomicRanges seqnames: #('chr1' 'chr1') starts: #(1 2) ends: #(5 6) strands: #('+' '-') metadata: Dictionary new. + gr := BioGenomicRanges + seqnames: #( 'chr1' 'chr1' ) + starts: #( 1 2 ) + ends: #( 5 6 ) + strands: #( '+' '-' ) + metadata: Dictionary new. red := gr reduce. self assert: red size equals: 2 - ] { #category : 'tests' } BioGenomicRangesTest >> testReduceMergesAdjacent [ + | gr red | - gr := BioGenomicRanges seqnames: #('chr1' 'chr1') starts: #(1 6) ends: #(5 10) strands: #('+' '+') metadata: Dictionary new. + gr := BioGenomicRanges + seqnames: #( 'chr1' 'chr1' ) + starts: #( 1 6 ) + ends: #( 5 10 ) + strands: #( '+' '+' ) + metadata: Dictionary new. red := gr reduce. self assert: red size equals: 1. self assert: (red start at: 1) equals: 1. 
diff --git a/repository/BioTools-Tests/BioSequenceTest.class.st b/repository/BioTools-Tests/BioSequenceTest.class.st index c94f35ea..2974d6d8 100644 --- a/repository/BioTools-Tests/BioSequenceTest.class.st +++ b/repository/BioTools-Tests/BioSequenceTest.class.st @@ -12,14 +12,14 @@ BioSequenceTest >> alphabetClass [ ^ BioAlphabet ] -{ #category : 'test' } +{ #category : 'testing' } BioSequenceTest >> testAccessionName [ self sequence accessionName: 'ABC234.1'. self assert: self sequence accessionName equals: 'ABC234.1'. ] -{ #category : 'test' } +{ #category : 'testing' } BioSequenceTest >> testAddSeqFeature [ | seqFeature | @@ -103,13 +103,13 @@ BioSequenceTest >> testAmbiguityCodes [ self assert: (sequence ambiguityCodes includes: $N). ] -{ #category : 'test' } +{ #category : 'testing' } BioSequenceTest >> testAminoacidNames [ self assert: (self sequence aminoacidNames bioHasEqualElements: #('Cysteine' 'Alanine' 'Glycine' 'Threonine' 'Cysteine' 'Alanine' 'Threonine' 'Glycine' 'Cysteine' 'Threonine' 'Alanine' 'Glycine')). ] -{ #category : 'test' } +{ #category : 'testing' } BioSequenceTest >> testAsFastaRecord [ | fastaRecord | @@ -301,7 +301,7 @@ BioSequenceTest >> testCopyFromTo [ ] -{ #category : 'test' } +{ #category : 'testing' } BioSequenceTest >> testDefaultAlphabetClass [ self assert: (self sequence defaultAlphabetClass isKindOf: BioAlphabet class) @@ -319,6 +319,63 @@ BioSequenceTest >> testDegenerateBasesAsMultibases [ equals: '' ] +{ #category : 'testing' } +BioSequenceTest >> testDinucleotideFrequencyCountsSlidingWindows [ + + | seq frequencies | + seq := BioSequence newDNA: 'ATATA'. + + frequencies := seq dinucleotideFrequency. + + self assert: (frequencies at: 'AT') equals: 2. + self assert: (frequencies at: 'TA') equals: 2 +] + +{ #category : 'testing' } +BioSequenceTest >> testDinucleotideFrequencyDoesNotIncludeAbsentDinucleotides [ + + | seq frequencies | + seq := BioSequence newDNA: 'AAAA'. + + frequencies := seq dinucleotideFrequency. 
+ + self assert: frequencies size equals: 1. + self assert: (frequencies at: 'AA') equals: 3. + self deny: (frequencies includesKey: 'AC') +] + +{ #category : 'testing' } +BioSequenceTest >> testDinucleotideFrequencyEmptyWhenSequenceTooShort [ + + | seq | + seq := BioSequence newDNA: 'A'. + + self assertEmpty: seq dinucleotideFrequency +] + +{ #category : 'testing' } +BioSequenceTest >> testDinucleotideFrequencySingleWindow [ + + | seq frequencies | + seq := BioSequence newDNA: 'AC'. + + frequencies := seq dinucleotideFrequency. + + self assert: frequencies size equals: 1. + self assert: (frequencies at: 'AC') equals: 1 +] + +{ #category : 'testing' } +BioSequenceTest >> testDinucleotideFrequencySumsToNumberOfWindows [ + + | seq frequencies | + seq := BioSequence newDNA: 'ACGT'. + + frequencies := seq dinucleotideFrequency. + + self assert: frequencies values sum equals: seq size - 1 +] + { #category : 'testing' } BioSequenceTest >> testDisambiguate [ @@ -419,7 +476,7 @@ BioSequenceTest >> testHasAmbiguousBases [ self assert: seq2 hasAmbiguousBases. ] -{ #category : 'test' } +{ #category : 'testing' } BioSequenceTest >> testHotspotRegionsLeftRight [ | seq1 hotspotRegions | @@ -451,7 +508,7 @@ BioSequenceTest >> testIsDNASequence [ ] -{ #category : 'test' } +{ #category : 'testing' } BioSequenceTest >> testIsEmpty [ self deny: (BioSequence newAmbiguousDNA: self sampleDnaString03) isEmpty. @@ -675,6 +732,63 @@ BioSequenceTest >> testOccurrencesOfLetters [ ] +{ #category : 'testing' } +BioSequenceTest >> testOligonucleotideFrequencyCountsNucleotides [ + + | seq frequencies | + seq := BioSequence newDNA: 'AAGT'. + + frequencies := seq oligonucleotideFrequency. + + self assert: (frequencies at: 'A') equals: 2. + self assert: (frequencies at: 'G') equals: 1. 
+ self assert: (frequencies at: 'T') equals: 1 +] + +{ #category : 'testing' } +BioSequenceTest >> testOligonucleotideFrequencyDoesNotIncludeAbsentSymbols [ + + | seq frequencies | + seq := BioSequence newDNA: 'ACAC'. + + frequencies := seq oligonucleotideFrequency. + + self deny: (frequencies includesKey: 'G'). + self deny: (frequencies includesKey: 'T') +] + +{ #category : 'testing' } +BioSequenceTest >> testOligonucleotideFrequencyOnEmptySequence [ + + | seq | + seq := BioSequence newDNA: ''. + + self assertEmpty: seq oligonucleotideFrequency +] + +{ #category : 'testing' } +BioSequenceTest >> testOligonucleotideFrequencySumsToSequenceSize [ + + | seq frequencies | + seq := BioSequence newDNA: 'ACGTAC'. + + frequencies := seq oligonucleotideFrequency. + + self assert: frequencies values sum equals: seq size +] + +{ #category : 'testing' } +BioSequenceTest >> testOligonucleotideFrequencyWithRepeatedSymbol [ + + | seq frequencies | + seq := BioSequence newDNA: 'AAAA'. + + frequencies := seq oligonucleotideFrequency. + + self assert: frequencies size equals: 1. + self assert: (frequencies at: 'A') equals: 4 +] + { #category : 'testing' } BioSequenceTest >> testPositionsOf [ @@ -759,7 +873,7 @@ BioSequenceTest >> testReversed [ self assert: seqTest reversed asString equals: String empty. ] -{ #category : 'test' } +{ #category : 'testing' } BioSequenceTest >> testSequenceFeatures [ self assert: (self sequence sequenceFeatures isKindOf: Collection). @@ -924,3 +1038,58 @@ BioSequenceTest >> testTranslated [ self assert: seq3 translate = seq4. ] + +{ #category : 'testing' } +BioSequenceTest >> testTrinucleotideFrequencyCountsSlidingWindows [ + + | seq frequencies | + seq := BioSequence newDNA: 'ATATAT'. + + frequencies := seq trinucleotideFrequency. + + self assert: (frequencies at: 'ATA') equals: 2. 
+ self assert: (frequencies at: 'TAT') equals: 2 +] + +{ #category : 'testing' } +BioSequenceTest >> testTrinucleotideFrequencyEmptyWhenSequenceTooShort [ + + | seq | + seq := BioSequence newDNA: 'AT'. + self assertEmpty: seq trinucleotideFrequency +] + +{ #category : 'testing' } +BioSequenceTest >> testTrinucleotideFrequencySingleWindow [ + + | seq frequencies | + seq := BioSequence newDNA: 'ATG'. + + frequencies := seq trinucleotideFrequency. + + self assert: frequencies size equals: 1. + self assert: (frequencies at: 'ATG') equals: 1 +] + +{ #category : 'testing' } +BioSequenceTest >> testTrinucleotideFrequencySumsToNumberOfWindows [ + + | seq frequencies | + seq := BioSequence newDNA: 'ATGCA'. + + frequencies := seq trinucleotideFrequency. + + self assert: frequencies values sum equals: seq size - 2 +] + +{ #category : 'testing' } +BioSequenceTest >> testTrinucleotideFrequencyWithRepeatedTrinucleotide [ + + | seq frequencies | + seq := BioSequence newDNA: 'AAAAA'. + + frequencies := seq trinucleotideFrequency. + + self assert: frequencies size equals: 1. + self assert: (frequencies at: 'AAA') equals: 3 +] diff --git a/repository/BioTools/BioSequence.class.st b/repository/BioTools/BioSequence.class.st index 05fabccc..97640b6e 100644 --- a/repository/BioTools/BioSequence.class.st +++ b/repository/BioTools/BioSequence.class.st @@ -674,6 +674,14 @@ BioSequence >> dfsExpandSeq: currentDFSNode cumSequences: cumList [ ] +{ #category : 'accessing - frequencies' } +BioSequence >> dinucleotideFrequency [ + "Answer a Dictionary mapping each dinucleotide (2-mer) to its frequency in the receiver. + The counting uses a sliding window of step 1." 
+ + ^ self kmerFrequencies: 2 +] + { #category : 'accessing - disambiguation' } BioSequence >> disambiguate [ " Answer a with receiver's sequence disambiguated using IUPAC ambiguity codes " @@ -991,7 +999,39 @@ BioSequence >> isSatelliteSequence [ ^ false ] -{ #category : 'accessing' } +{ #category : 'accessing - frequencies' } +BioSequence >> kmerFrequencies: k [ + "Answer a Dictionary mapping each k-mer observed in the receiver to its count (sliding window, step 1). + Only observed k-mers are included; use #kmerFrequencies:fullTable: to also get zero counts for absent k-mers. + + Comparable to Biostrings' oligonucleotideFrequency(seq, width = k, step = 1, as.prob = FALSE), except that absent k-mers are omitted + (and to the dinucleotide/trinucleotide convenience aliases)." + + | upperBound | + (k isInteger and: [ k > 0 ]) ifFalse: [ + self error: 'k must be a positive integer' ]. + + upperBound := self size - k + 1. + upperBound < 1 ifTrue: [ ^ Dictionary new ]. + + ^ (Bag withAll: ((1 to: upperBound) collect: [ :i | + (self copyFrom: i to: i + k - 1) asString ])) asDictionary +] + +{ #category : 'accessing - frequencies' } +BioSequence >> kmerFrequencies: k fullTable: aBoolean [ + "If aBoolean is true, return complete table with zeros for missing kmers" + + | freqs allKmers | + freqs := self kmerFrequencies: k. + aBoolean ifFalse: [ ^ freqs ]. + + allKmers := 'ACGT' enumerationsOfSize: k. + allKmers do: [ :kmer | freqs at: kmer ifAbsentPut: [ 0 ] ]. + ^ freqs +] + +{ #category : 'accessing - frequencies' } BioSequence >> kmersCount: patString [ " Answer the number of times that k-mers patString appears as substring of the receiver " @@ -999,7 +1039,7 @@ BioSequence >> kmersCount: patString [ (self copyFrom: i to: i + patString size - 1) sequence = patString ]. 
] -{ #category : 'accessing' } +{ #category : 'accessing - frequencies' } BioSequence >> kmersCount: patString mismatches: d [ " Answer the number of times that k-mers patString appears as substring of the receiver allowing d mismatches " @@ -1255,7 +1295,7 @@ BioSequence >> molecularWeightNonDegen [ ((dict at: $C) * 289.18) + 17.01. ] -{ #category : 'accessing' } +{ #category : 'accessing - frequencies' } BioSequence >> mostFrequentKmer: k [ " Answer a of receiver with the most frequent k-mers of size k " @@ -1275,7 +1315,7 @@ BioSequence >> mostFrequentKmer: k [ ^ maxKmers. ] -{ #category : 'accessing' } +{ #category : 'accessing - frequencies' } BioSequence >> mostFrequentKmer: k mismatches: d [ " Answer a of receiver with the most frequent k-mers of size k allowing mismatches of size d " " Frequent Words with Mismatches Problem @@ -1293,7 +1333,7 @@ BioSequence >> mostFrequentKmer: k mismatches: d [ ^ counter maxElements ] -{ #category : 'accessing' } +{ #category : 'accessing - frequencies' } BioSequence >> mostFrequentKmerRevComp: k mismatches: d [ " Answer a of receiver with the most frequent k-mers of size k allowing mismatches of size d " " Frequent Words with Mismatches Problem @@ -1340,14 +1380,14 @@ BioSequence >> notEmpty [ ^ seq notEmpty ] -{ #category : 'accessing' } +{ #category : 'accessing - frequencies' } BioSequence >> occurrencesOf: aCharacter [ " Answer how many of the receiver's elements are equal to aLetter " ^ seq asUppercase occurrencesOf: aCharacter asAminoacidLetter asUppercase ] -{ #category : 'accessing' } +{ #category : 'accessing - frequencies' } BioSequence >> occurrencesOfLetters [ " Answer a of occurrence mappings for the receiver " @@ -1361,6 +1401,14 @@ BioSequence >> occurrencesOfLetters [ ^ occurrences ] +{ #category : 'accessing - frequencies' } +BioSequence >> oligonucleotideFrequency [ + "Answer a Dictionary mapping each nucleotide (1-mer) to its frequency in the receiver. + The counting uses a sliding window of step 1." 
+ + ^ self kmerFrequencies: 1 +] + { #category : 'accessing' } BioSequence >> positionsOf: aCharacterOrString [ " Answer a Collection with the positions of aminoacidLetter in the receiver's sequence " @@ -1688,6 +1736,14 @@ BioSequence >> trimAmbiguityCodes [ ] +{ #category : 'accessing - frequencies' } +BioSequence >> trinucleotideFrequency [ + "Answer a Dictionary mapping each trinucleotide (3-mer) to its frequency in the receiver. + The counting uses a sliding window of step 1." + + ^ self kmerFrequencies: 3 +] + { #category : 'accessing' } BioSequence >> truncateFrom: startInteger [ " Modify the receiver slicing the sequence from startInteger position up to the last position.