CodSpeedHQ · dmitryvk · Oct 8, 2025 · Oct 8, 2025 · Oct 8, 2025 · Oct 8, 2025
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/Cargo.toml b/Cargo.toml
@@ -11,10 +11,16 @@ path = "src/lib.rs"
 rand = "0.8"
 image = "0.25"
 image-compare = "0.5.0"
+jetscii = { version = "0.5.3" }
+itertools = "0.14.0"
+rayon = "1.11.0"
 
 [dev-dependencies]
 divan = { version = "4.0.2", package = "codspeed-divan-compat" }
 
+[profile.bench]
+debug = true
+
 [[bin]]
 name = "generate_fasta"
 path = "bin/generate_fasta.rs"

diff --git a/README.md b/README.md
@@ -34,3 +34,5 @@ cargo codspeed run -m walltime
 ```
 
 Note: You can also set the `CODSPEED_RUNNER_MODE` environment variable to `walltime` to avoid passing `-m walltime` every time.
+
+
diff --git a/src/bfs.rs b/src/bfs.rs
@@ -1,4 +1,4 @@
-use std::collections::HashSet;
+use std::collections::VecDeque;
 
 /// A simple graph represented as an adjacency list
 #[derive(Debug, Clone)]
@@ -26,22 +26,23 @@ impl Graph {
 /// Naive BFS implementation using Vec as a queue (intentionally slow)
 /// Returns the order in which nodes were visited
 pub fn bfs_naive(graph: &Graph, start: usize) -> Vec<usize> {
-    let mut visited = HashSet::new();
-    let mut queue = Vec::new(); // Using Vec instead of VecDeque - intentionally inefficient!
-    let mut result = Vec::new();
+    let mut visited = vec![false; graph.num_nodes()];
+    let mut queue = VecDeque::with_capacity(graph.num_nodes());
+    let mut result = Vec::with_capacity(graph.num_nodes());
 
-    queue.push(start);
-    visited.insert(start);
+    queue.push_back(start);
+    assert!(start < graph.num_nodes());
+    visited[start] = true;
 
-    while !queue.is_empty() {
-        // remove(0) is O(n) - this makes BFS slow!
-        let node = queue.remove(0);
+    while let Some(node) = queue.pop_front() {
         result.push(node);
 
         if let Some(neighbors) = graph.adjacency.get(node) {
             for &neighbor in neighbors {
-                if visited.insert(neighbor) {
-                    queue.push(neighbor);
+                assert!(neighbor < graph.num_nodes());
+                if !visited[neighbor] {
+                    visited[neighbor] = true;
+                    queue.push_back(neighbor);
                 }
             }
         }

diff --git a/src/dna_matcher.rs b/src/dna_matcher.rs
@@ -1,19 +1,53 @@
+use itertools::Itertools;
+use jetscii::{ByteSubstring, bytes};
+use rayon::iter::{IntoParallelRefIterator, ParallelIterator};
+
 /// Naive approach: Read the entire file as a string and filter lines
-pub fn naive_dna_matcher(genome: &str, pattern: &str) -> Vec<String> {
-    genome
-        .lines()
-        .filter(|line| !line.starts_with('>')) // Skip headers
-        .filter(|line| line.contains(pattern))
-        .map(|s| s.to_string())
+pub fn naive_dna_matcher<'a>(genome: &'a str, pattern: &'a str) -> Vec<&'a str> {
+    let genome = genome.as_bytes();
+    let pattern = pattern.as_bytes();
+    let searcher = ByteSubstring::new(pattern);
+    split_lines(genome)
+        .into_iter()
+        .collect_vec()
+        .par_iter()
+        .filter(|line| line.first() != Some(&b'>'))
+        .filter(|line| searcher.find(line).is_some())
+        .map(|s| unsafe { str::from_utf8_unchecked(s) })
         .collect()
 }
 
+fn split_lines(text: &[u8]) -> Vec<&[u8]> {
+    let newlines = bytes!('\n');
+    let mut offset = 0;
+    let mut result = Vec::with_capacity(128);
+    while offset < text.len() {
+        if let Some(next_offset_delta) = newlines.find(&text[offset..]) {
+            let next_offset = offset + next_offset_delta;
+            // println!("offset={offset} next_offset={next_offset}");
+            result.push(&text[offset..next_offset]);
+            offset = next_offset + 1;
+        } else {
+            // println!("offset={offset} next_offset is none");
+            result.push(&text[offset..]);
+            break;
+        }
+    }
+    result
+}
+
 #[cfg(test)]
 mod tests {
     use super::*;
 
     #[test]
-    fn test_naive_matcher() {
+    fn test_split_lines() {
+        let lines = split_lines(b"foo\nbar\nbaz");
+        assert_eq!(lines, vec![b"foo", b"bar", b"baz"]);
+    }
+
+    #[test]
+    fn test_naive_matcher_tiny() {
         let test_genome = ">seq1\nACGTACGT\n>seq2\nAGTCCGTAAA\n>seq3\nGGGGGG";
         let pattern = "AGTCCGTA";
         let matches = naive_dna_matcher(test_genome, pattern);

diff --git a/src/lut_filters.rs b/src/lut_filters.rs
@@ -31,37 +31,27 @@ pub fn apply_brightness_contrast_gamma(
     contrast: f32,
     gamma: f32,
 ) -> RgbImage {
-    let temp_img = apply_brightness_contrast(img, brightness, contrast);
-    naive::apply_gamma(&temp_img, gamma)
+    naive::apply_brightness_contrast_gamma(&img, brightness, contrast, gamma)
 }
 
 mod naive {
     use super::*;
 
     /// Apply brightness and contrast with floating-point math per pixel
     pub fn apply_brightness_contrast(img: &RgbImage, brightness: i16, contrast: f32) -> RgbImage {
+        let mut lut = [0u8; 256];
+        for i in 0..=255 {
+            lut[i] = (((i as f32 - 128.0) * (1.0 + contrast)) + 128.0 + brightness as f32) as u8;
+        }
         let (width, height) = img.dimensions();
         let mut output = ImageBuffer::new(width, height);
 
         for (x, y, pixel) in img.enumerate_pixels() {
-            let r = pixel[0] as f32;
-            let g = pixel[1] as f32;
-            let b = pixel[2] as f32;
-
-            // Apply contrast and brightness (5 FP ops per channel!)
-            let r = ((r - 128.0) * (1.0 + contrast)) + 128.0 + brightness as f32;
-            let g = ((g - 128.0) * (1.0 + contrast)) + 128.0 + brightness as f32;
-            let b = ((b - 128.0) * (1.0 + contrast)) + 128.0 + brightness as f32;
-
-            output.put_pixel(
-                x,
-                y,
-                Rgb([
-                    r.clamp(0.0, 255.0) as u8,
-                    g.clamp(0.0, 255.0) as u8,
-                    b.clamp(0.0, 255.0) as u8,
-                ]),
-            );
+            let r = lut[pixel[0] as usize];
+            let g = lut[pixel[1] as usize];
+            let b = lut[pixel[2] as usize];
+
+            output.put_pixel(x, y, Rgb([r, g, b]));
         }
 
         output
@@ -70,16 +60,55 @@ mod naive {
     /// Naive implementation: Apply gamma correction
     /// This is VERY slow because powf() is expensive!
     pub fn apply_gamma(img: &RgbImage, gamma: f32) -> RgbImage {
+        let mut lut = [0u8; 256];
+        for i in 0..=255 {
+            lut[i] = ((i as f32 / 255.0).powf(1.0 / gamma) * 255.0) as u8;
+        }
+        let (width, height) = img.dimensions();
+        let mut output = ImageBuffer::new(width, height);
+
+        for (x, y, pixel) in img.enumerate_pixels() {
+            // Table lookups replace the expensive per-pixel powf() calls.
+            let r = lut[pixel[0] as usize];
+            let g = lut[pixel[1] as usize];
+            let b = lut[pixel[2] as usize];
+
+            output.put_pixel(x, y, Rgb([r, g, b]));
+        }
+
+        output
+    }
+
+    /// Apply brightness, contrast and gamma correction in a single pass
+    /// by composing both 256-entry lookup tables into one before touching pixels.
+    pub fn apply_brightness_contrast_gamma(
+        img: &RgbImage,
+        brightness: i16,
+        contrast: f32,
+        gamma: f32,
+    ) -> RgbImage {
+        let mut lut1 = [0u8; 256];
+        for i in 0..=255 {
+            lut1[i] = (((i as f32 - 128.0) * (1.0 + contrast)) + 128.0 + brightness as f32) as u8;
+        }
+        let mut lut2: [u8; 256] = [0u8; 256];
+        for i in 0..=255 {
+            lut2[i] = ((i as f32 / 255.0).powf(1.0 / gamma) * 255.0) as u8;
+        }
+        let mut lut_combined: [u8; 256] = [0u8; 256];
+        for i in 0..=255 {
+            lut_combined[i] = lut2[lut1[i] as usize];
+        }
         let (width, height) = img.dimensions();
         let mut output = ImageBuffer::new(width, height);
 
         for (x, y, pixel) in img.enumerate_pixels() {
             // powf() is VERY expensive - this is why we need a LUT!
-            let r = (pixel[0] as f32 / 255.0).powf(1.0 / gamma) * 255.0;
-            let g = (pixel[1] as f32 / 255.0).powf(1.0 / gamma) * 255.0;
-            let b = (pixel[2] as f32 / 255.0).powf(1.0 / gamma) * 255.0;
+            let r = lut_combined[pixel[0] as usize];
+            let g = lut_combined[pixel[1] as usize];
+            let b = lut_combined[pixel[2] as usize];
 
-            output.put_pixel(x, y, Rgb([r as u8, g as u8, b as u8]));
+            output.put_pixel(x, y, Rgb([r, g, b]));
         }
 
         output
Original file line number	Diff line number	Diff line change
Expand Up		@@ -34,3 +34,5 @@ cargo codspeed run -m walltime
		```

		Note: You can also set the `CODSPEED_RUNNER_MODE` environment variable to `walltime` to avoid passing `-m walltime` every time.