Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 16 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 4 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,10 @@ path = "src/lib.rs"
rand = "0.8"
image = "0.25"
image-compare = "0.5.0"
fnv = "1.0.7"
jetscii = { version = "0.5.3", features = [] }
memchr = "2.7.6"
rayon = "1.11.0"

[dev-dependencies]
divan = { version = "4.0.2", package = "codspeed-divan-compat" }
Expand Down
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -34,3 +34,5 @@ cargo codspeed run -m walltime
```

Note: You can also set the `CODSPEED_RUNNER_MODE` environment variable to `walltime` to avoid passing `-m walltime` every time.

Vladislav.Sukhmel
4 changes: 2 additions & 2 deletions benches/dna_matcher.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,10 @@ fn main() {

#[divan::bench(sample_count = 2, sample_size = 3)]
fn dna_matcher() {
let genome = std::fs::read_to_string("genome.fasta").expect(
let genome = std::fs::read("genome.fasta").expect(
"Failed to read genome.fasta\n\n Make sure to run 'cargo run --release --bin generate_fasta'",
);
let pattern = "AGTCCGTA";
let pattern = b"AGTCCGTA";

let matches = divan::black_box(naive_dna_matcher(
divan::black_box(&genome),
Expand Down
19 changes: 10 additions & 9 deletions src/bfs.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
use std::collections::HashSet;
use std::collections::{VecDeque};

use fnv::FnvHashSet;

/// A simple graph represented as an adjacency list
#[derive(Debug, Clone)]
Expand All @@ -23,25 +25,24 @@ impl Graph {
}
}

/// Naive BFS implementation using Vec as a queue (intentionally slow)
/// Naive BFS implementation using VecDeque as a queue
/// Returns the order in which nodes were visited
pub fn bfs_naive(graph: &Graph, start: usize) -> Vec<usize> {
let mut visited = HashSet::new();
let mut queue = Vec::new(); // Using Vec instead of VecDeque - intentionally inefficient!
let mut result = Vec::new();
let mut visited = FnvHashSet::with_capacity_and_hasher(graph.num_nodes(), Default::default());
let mut queue = VecDeque::new();
let mut result = Vec::with_capacity(graph.num_nodes());

queue.push(start);
queue.push_back(start);
visited.insert(start);

while !queue.is_empty() {
// remove(0) is O(n) - this makes BFS slow!
let node = queue.remove(0);
let node = queue.pop_front().unwrap();
result.push(node);

if let Some(neighbors) = graph.adjacency.get(node) {
for &neighbor in neighbors {
if visited.insert(neighbor) {
queue.push(neighbor);
queue.push_back(neighbor);
}
}
}
Expand Down
75 changes: 63 additions & 12 deletions src/dna_matcher.rs
Original file line number Diff line number Diff line change
@@ -1,10 +1,59 @@
use memchr::Memchr;
use rayon::iter::{IntoParallelIterator, ParallelIterator};

/// Iterator state for splitting a byte slice on a single separator byte.
///
/// Created by `ByteSplit::byte_split`. Each call to `next` yields one
/// sub-slice of `slice` lying between two separator offsets (or between a
/// separator and either end of the input).
struct ByteSplitImpl<'a> {
/// Source of separator offsets inside `slice` (a `memchr` iterator).
iter: Memchr<'a>,
/// The complete input that is being split.
slice: &'a [u8],
/// Offset in `slice` at which the next yielded chunk begins.
position: usize,
/// `true` while the trailing chunk (from `position` to the end of `slice`)
/// still has to be yielded; cleared once that chunk has been handed out.
add_next: bool,
}

/// Adds a `byte_split` method for separator-based splitting.
///
/// Implemented in this file for `&'a [u8]`.
trait ByteSplit<'a> {
/// Returns a `ByteSplitImpl` configured to split on the byte `separator`.
fn byte_split(self, separator: u8) -> ByteSplitImpl<'a>;
}

impl<'a> ByteSplit<'a> for &'a [u8] {
    /// Builds the splitting state for this slice.
    ///
    /// The separator offsets come from `memchr::memchr_iter`; the cursor
    /// starts at offset 0 and the trailing chunk is marked as still owed.
    fn byte_split(self, separator: u8) -> ByteSplitImpl<'a> {
        let iter = memchr::memchr_iter(separator, self);
        ByteSplitImpl {
            iter,
            slice: self,
            position: 0,
            add_next: true,
        }
    }
}

impl<'a> Iterator for ByteSplitImpl<'a> {
    type Item = &'a [u8];

    /// Yields the next chunk of the input.
    ///
    /// While the separator iterator still reports offsets, the chunk runs
    /// from the current cursor up to (not including) that offset. Once the
    /// offsets are exhausted, the remainder of the slice is yielded exactly
    /// once, after which the iterator returns `None`.
    fn next(&mut self) -> Option<Self::Item> {
        match self.iter.next() {
            Some(separator_at) => {
                let chunk = self.slice.get(self.position..separator_at);
                // Resume just past the separator on the next call.
                self.position = separator_at + 1;
                self.add_next = true;
                chunk
            }
            // No separators left: hand out everything after the last
            // separator (possibly an empty slice) as the final chunk.
            None if self.add_next => {
                let tail = self.slice.get(self.position..);
                self.position = self.slice.len();
                self.add_next = false;
                tail
            }
            None => None,
        }
    }
}

/// Naive approach: Read the entire file as a string and filter lines
pub fn naive_dna_matcher(genome: &str, pattern: &str) -> Vec<String> {
genome
.lines()
.filter(|line| !line.starts_with('>')) // Skip headers
.filter(|line| line.contains(pattern))
.map(|s| s.to_string())
/// Splits `genome` on `\n` and returns every line that is not a header
/// (a line starting with `>`) and that contains `pattern`.
///
/// The returned slices borrow from `genome`; nothing is copied. Lines are
/// collected first and then filtered in parallel with rayon.
pub fn naive_dna_matcher<'a>(genome: &'a [u8], pattern: &[u8]) -> Vec<&'a [u8]> {
    let matcher = jetscii::ByteSubstring::new(pattern);
    let lines = genome.byte_split(b'\n').collect::<Vec<_>>();
    lines
        .into_par_iter()
        .filter(|line| {
            // `first()` is `None` for an empty line, so empty lines are
            // skipped without indexing. One-byte lines are kept as
            // candidates: a length check of `> 1` would wrongly drop a
            // single-base line that a one-byte pattern should match.
            matches!(line.first(), Some(&byte) if byte != b'>') && matcher.find(line).is_some()
        })
        .collect()
}

Expand All @@ -14,21 +63,23 @@ mod tests {

#[test]
fn test_naive_matcher() {
let test_genome = ">seq1\nACGTACGT\n>seq2\nAGTCCGTAAA\n>seq3\nGGGGGG";
let pattern = "AGTCCGTA";
let test_genome = b">seq1\nACGTACGT\n>seq2\nAGTCCGTAAA\n>seq3\nGGGGGG";
let pattern = b"AGTCCGTA";
let matches = naive_dna_matcher(test_genome, pattern);
println!("{:?}", matches);
assert_eq!(matches.len(), 1);
assert_eq!(matches[0], "AGTCCGTAAA");
assert_eq!(matches[0], b"AGTCCGTAAA");
}

#[test]
fn test_naive_matcher_on_genome_file() {
// Read the actual genome.fasta file
let genome = std::fs::read_to_string("genome.fasta")
let genome = std::fs::read("genome.fasta")
.expect("Failed to read genome.fasta\n\n Make sure to run 'cargo run --release --bin generate_fasta'");
let pattern = "AGTCCGTA";
let pattern = b"AGTCCGTA";

let matches = naive_dna_matcher(&genome, pattern);
// println!("{:?}", matches);

// With fixed seed (42), we should always get exactly 4927 matches
assert_eq!(
Expand All @@ -39,7 +90,7 @@ mod tests {
);

println!(
"✓ Found {} sequences containing pattern '{}'",
"✓ Found {} sequences containing pattern '{:?}'",
matches.len(),
pattern
);
Expand Down