From 32ece2e65a50322e8622bcedf6f0f777d22cee65 Mon Sep 17 00:00:00 2001
From: b4prog <b4prog@gmail.com>
Date: Thu, 25 Jun 2026 12:37:03 +0200
Subject: [PATCH 01/22] [feat] add deterministic duplicate code report CLI

---
 .github/workflows/ci.yml                |  32 +++
 .github/workflows/coderabbit-review.yml |  75 ++++++
 Cargo.lock                              |  16 ++
 Cargo.toml                              |  11 +
 README.md                               | 117 ++++++++-
 src/cli.rs                              | 154 ++++++++++++
 src/discovery.rs                        | 196 +++++++++++++++
 src/duplicate.rs                        | 316 ++++++++++++++++++++++++
 src/error.rs                            |  33 +++
 src/language.rs                         | 192 ++++++++++++++
 src/lib.rs                              | 167 +++++++++++++
 src/line.rs                             |  88 +++++++
 src/main.rs                             |  24 ++
 src/model.rs                            |  56 +++++
 src/paths.rs                            |  22 ++
 src/report.rs                           | 103 ++++++++
 16 files changed, 1601 insertions(+), 1 deletion(-)
 create mode 100644 .github/workflows/ci.yml
 create mode 100644 .github/workflows/coderabbit-review.yml
 create mode 100644 Cargo.lock
 create mode 100644 Cargo.toml
 create mode 100644 src/cli.rs
 create mode 100644 src/discovery.rs
 create mode 100644 src/duplicate.rs
 create mode 100644 src/error.rs
 create mode 100644 src/language.rs
 create mode 100644 src/lib.rs
 create mode 100644 src/line.rs
 create mode 100644 src/main.rs
 create mode 100644 src/model.rs
 create mode 100644 src/paths.rs
 create mode 100644 src/report.rs

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
new file mode 100644
index 0000000..f8eae35
--- /dev/null
+++ b/.github/workflows/ci.yml
@@ -0,0 +1,32 @@
+name: Rust CI
+
+on:
+  push:
+    branches:
+      - main
+  pull_request:
+
+permissions:
+  contents: read
+
+jobs:
+  rust:
+    name: Build, test, and format
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+
+      - name: Install Rust toolchain
+        run: rustup toolchain install stable --profile minimal --component rustfmt
+
+      - name: Check formatting
+        run: cargo fmt --all -- --check
+
+      - name: Build
+        run: cargo build --locked --all-targets
+
+      - name: Test
+        run: cargo test --locked --all-targets
+
diff --git a/.github/workflows/coderabbit-review.yml b/.github/workflows/coderabbit-review.yml
new file mode 100644
index 0000000..4a61832
--- /dev/null
+++ b/.github/workflows/coderabbit-review.yml
@@ -0,0 +1,75 @@
+name: CodeRabbit Review Gate
+
+on:
+  pull_request_review:
+    types:
+      - submitted
+      - edited
+      - dismissed
+
+permissions:
+  contents: read
+  pull-requests: read
+
+jobs:
+  coderabbit-review:
+    name: Validate CodeRabbit review
+    if: github.event.pull_request.draft == false && github.event.review.user.login == 'coderabbitai[bot]'
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Check CodeRabbit review state
+        env:
+          GITHUB_TOKEN: ${{ github.token }}
+          GITHUB_REPOSITORY: ${{ github.repository }}
+          PR_NUMBER: ${{ github.event.pull_request.number }}
+          PR_HEAD_SHA: ${{ github.event.pull_request.head.sha }}
+        run: |
+          # The script uses top-level await, so feed it to Node explicitly as an ES module.
+          node --input-type=module <<'NODE'
+          const token = process.env.GITHUB_TOKEN;
+          const [owner, repo] = process.env.GITHUB_REPOSITORY.split("/");
+          const prNumber = process.env.PR_NUMBER;
+          const headSha = process.env.PR_HEAD_SHA;
+
+          async function fetchReviews(page = 1, reviews = []) {
+            const url = `https://api.github.com/repos/${owner}/${repo}/pulls/${prNumber}/reviews?per_page=100&page=${page}`;
+            const response = await fetch(url, {
+              headers: {
+                Authorization: `Bearer ${token}`,
+                Accept: "application/vnd.github+json",
+                "X-GitHub-Api-Version": "2022-11-28",
+              },
+            });
+
+            if (!response.ok) {
+              const body = await response.text();
+              throw new Error(`GitHub review lookup failed: ${response.status} ${body}`);
+            }
+
+            const pageReviews = await response.json();
+            if (pageReviews.length === 0) {
+              return reviews;
+            }
+            return fetchReviews(page + 1, reviews.concat(pageReviews));
+          }
+
+          const reviews = await fetchReviews();
+          const codeRabbitReviews = reviews
+            .filter((review) => review.user?.login === "coderabbitai[bot]" && review.commit_id === headSha)
+            .sort((left, right) => new Date(left.submitted_at) - new Date(right.submitted_at));
+
+          const latestReview = codeRabbitReviews.at(-1);
+          if (!latestReview) {
+            console.error(`CodeRabbit has not submitted a review for ${headSha}.`);
+            process.exit(1);
+          }
+
+          if (latestReview.state === "CHANGES_REQUESTED") {
+            console.error("CodeRabbit requested changes on this pull request.");
+            process.exit(1);
+          }
+
+          console.log(`CodeRabbit review state for ${headSha}: ${latestReview.state}`);
+          NODE
+
diff --git a/Cargo.lock b/Cargo.lock
new file mode 100644
index 0000000..2465298
--- /dev/null
+++ b/Cargo.lock
@@ -0,0 +1,16 @@
+# This file is automatically @generated by Cargo.
+# It is not intended for manual editing.
+version = 4
+
+[[package]]
+name = "codem8"
+version = "0.1.0"
+dependencies = [
+ "xxhash-rust",
+]
+
+[[package]]
+name = "xxhash-rust"
+version = "0.8.15"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "fdd20c5420375476fbd4394763288da7eb0cc0b8c11deed431a91562af7335d3"
diff --git a/Cargo.toml b/Cargo.toml
new file mode 100644
index 0000000..1d7336b
--- /dev/null
+++ b/Cargo.toml
@@ -0,0 +1,11 @@
+[package]
+name = "codem8"
+version = "0.1.0"
+edition = "2021"
+license = "MIT"
+description = "A deterministic source code analysis CLI for duplicate code reports."
+repository = "https://github.com/b4prog/CodeM8"
+
+[dependencies]
+xxhash-rust = { version = "0.8", features = ["xxh3"] }
+
diff --git a/README.md b/README.md
index 93bf7bd..3642e93 100644
--- a/README.md
+++ b/README.md
@@ -1 +1,116 @@
-# CodeM8
\ No newline at end of file
+# CodeM8
+
+CodeM8 is a Rust command-line application for deterministic source code reports.
+The initial report detects duplicated line-based code blocks in a repository:
+
+```bash
+codem8 --report-duplicate
+```
+
+The duplicate report is designed for both human developers and coding agents. It
+trims source lines, ignores empty lines, hashes normalized lines with XXH3
+128-bit, classifies syntax-only lines as block-only, groups repeated blocks, and
+prints a stable plain-text report sorted by duplicate weight.
+
+## Installation
+
+Build from source with Cargo:
+
+```bash
+cargo build --release
+```
+
+Run the local binary:
+
+```bash
+cargo run -- --report-duplicate
+```
+
+## Usage
+
+Analyze TypeScript files from the current directory:
+
+```bash
+codem8 --report-duplicate
+```
+
+Analyze multiple extensions:
+
+```bash
+codem8 --report-duplicate -file-extension=ts,tsx,js,jsx
+```
+
+Analyze an explicit list of files instead of recursively discovering files:
+
+```bash
+codem8 --report-duplicate -file-extension=ts,js -files=src/a.ts,src/b.js
+```
+
+## Duplicate Report
+
+By default, CodeM8 analyzes `.ts` files. Recursive discovery skips common
+irrelevant directories such as `.git`, `node_modules`, `target`, `dist`,
+`build`, `coverage`, `.next`, `.nuxt`, `.svelte-kit`, `.idea`, and `.vscode`.
+Symbolic links are not followed.
+
+Every non-empty line is normalized with Rust string trimming, so leading and
+trailing Unicode whitespace are removed before hashing and comparison. Empty
+trimmed lines are ignored. CodeM8 currently expects UTF-8 source files; invalid
+UTF-8 produces a clear error rather than lossy output.
+
+Duplicate block weight is calculated as:
+
+```text
+(occurrences - 1) * duplicated_line_count * cumulative_normalized_character_count
+```
+
+Reports are sorted deterministically by descending weight, then by line count,
+character count, first location, and normalized block text.
+
+## Language Heuristics
+
+CodeM8 includes a hard-coded registry of block-only line patterns for common
+languages and markup formats:
+
+- TypeScript / JavaScript
+- Rust
+- C / C++ / Objective-C
+- C#
+- Java / Kotlin / Scala
+- Go
+- Python
+- Ruby
+- PHP
+- Swift
+- Shell
+- PowerShell
+- HTML / XML
+- CSS / SCSS / Sass / Less
+- SQL
+- YAML / JSON / TOML
+
+Block-only lines, such as braces or closing tags, cannot start a duplicate by
+themselves. They can still be included inside a larger duplicated block when
+surrounding comparison lines match.
+
+## Development
+
+Run the full local verification set:
+
+```bash
+cargo fmt --all -- --check
+cargo build --all-targets
+cargo test --all-targets
+```
+
+The repository includes GitHub Actions workflows for Rust CI and a CodeRabbit
+review gate. CI checks formatting, build, and tests on pushes to `main` and on
+pull requests. The gate runs when a CodeRabbit review is submitted, edited, or
+dismissed, and fails if the current PR head is unreviewed or has changes requested.
+
+## Dependency Policy
+
+CodeM8 avoids external packages for functionality that is simple to implement
+and maintain directly. The first implementation uses one runtime dependency,
+`xxhash-rust`, for the required XXH3 128-bit hash implementation. The crate is
+widely used and permissively licensed under MIT or Apache-2.0.
diff --git a/src/cli.rs b/src/cli.rs
new file mode 100644
index 0000000..934c274
--- /dev/null
+++ b/src/cli.rs
@@ -0,0 +1,154 @@
+use std::path::PathBuf;
+
+use crate::error::{CodeM8Error, Result};
+
+const DEFAULT_FILE_EXTENSIONS: &[&str] = &["ts"];
+
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub struct CliConfig {
+    pub report_duplicate: bool,
+    pub file_extensions: Vec<String>,
+    pub files: Option<Vec<PathBuf>>,
+}
+
+pub fn parse_args<I, S>(args: I) -> Result<CliConfig>
+where
+    I: IntoIterator<Item = S>,
+    S: Into<String>,
+{
+    let mut report_duplicate = false;
+    let mut file_extensions = None;
+    let mut files = None;
+    for arg in args {
+        let arg = arg.into();
+        if arg == "--report-duplicate" {
+            report_duplicate = true;
+        } else if let Some(value) = arg
+            .strip_prefix("-file-extension=")
+            .or_else(|| arg.strip_prefix("--file-extension="))
+        {
+            if file_extensions.is_some() {
+                return Err(CodeM8Error::new(
+                    "file extensions were provided more than once",
+                ));
+            }
+            file_extensions = Some(parse_file_extensions(value)?);
+        } else if let Some(value) = arg
+            .strip_prefix("-files=")
+            .or_else(|| arg.strip_prefix("--files="))
+        {
+            if files.is_some() {
+                return Err(CodeM8Error::new(
+                    "explicit files were provided more than once",
+                ));
+            }
+            files = Some(parse_file_list(value)?);
+        } else {
+            return Err(CodeM8Error::new(format!("unknown argument: {arg}")));
+        }
+    }
+    if !report_duplicate {
+        return Err(CodeM8Error::new(
+            "no report switch provided; pass --report-duplicate",
+        ));
+    }
+    Ok(CliConfig {
+        report_duplicate,
+        file_extensions: file_extensions.unwrap_or_else(|| {
+            DEFAULT_FILE_EXTENSIONS
+                .iter()
+                .map(|extension| extension.to_string())
+                .collect()
+        }),
+        files,
+    })
+}
+
+/// Parses a comma-separated list into trimmed, lowercase, de-duplicated file extensions.
+/// Fails on an entry that is empty, starts with a dot, or contains a path separator.
+pub fn parse_file_extensions(value: &str) -> Result<Vec<String>> {
+    let mut extensions = Vec::new();
+    // `split` yields at least one item, so an empty `value` is rejected inside the loop.
+    for raw_extension in value.split(',') {
+        let extension = raw_extension.trim();
+        if extension.is_empty() {
+            return Err(CodeM8Error::new("file extension values must not be empty"));
+        }
+        if extension.starts_with('.') {
+            return Err(CodeM8Error::new(format!(
+                "file extensions must not start with a dot: {extension}"
+            )));
+        }
+        if extension.contains('/') || extension.contains('\\') {
+            return Err(CodeM8Error::new(format!(
+                "file extensions must not contain path separators: {extension}"
+            )));
+        }
+        let extension = extension.to_ascii_lowercase();
+        if !extensions.contains(&extension) {
+            extensions.push(extension);
+        }
+    }
+    Ok(extensions)
+}
+
+/// Parses a comma-separated list of file paths, trimming whitespace around each entry.
+///
+/// Fails on an empty entry; `split` always yields an item, so an empty `value` fails too.
+pub fn parse_file_list(value: &str) -> Result<Vec<PathBuf>> {
+    let mut files = Vec::new();
+    for raw_file in value.split(',') {
+        let file = raw_file.trim();
+        if file.is_empty() {
+            return Err(CodeM8Error::new("file path values must not be empty"));
+        }
+        files.push(PathBuf::from(file));
+    }
+    Ok(files)
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn parses_default_duplicate_report_config() {
+        let config = parse_args(["--report-duplicate"]).expect("config parses");
+        assert!(config.report_duplicate);
+        assert_eq!(config.file_extensions, ["ts"]);
+        assert_eq!(config.files, None);
+    }
+
+    #[test]
+    fn parses_extensions_case_insensitively_and_trims_whitespace() {
+        let extensions = parse_file_extensions(" ts, JS ,tsx,ts ").expect("extensions parse");
+        assert_eq!(extensions, ["ts", "js", "tsx"]);
+    }
+
+    #[test]
+    fn rejects_empty_extensions() {
+        let error = parse_file_extensions("ts,,js").expect_err("empty extension fails");
+        assert!(error.to_string().contains("must not be empty"));
+    }
+
+    #[test]
+    fn rejects_extensions_with_leading_dot() {
+        let error = parse_file_extensions(".ts").expect_err("dot-prefixed extension fails");
+        assert!(error.to_string().contains("must not start with a dot"));
+    }
+
+    #[test]
+    fn rejects_missing_report_switch() {
+        let error = parse_args(["-file-extension=rs"]).expect_err("missing report fails");
+        assert!(error.to_string().contains("no report switch provided"));
+    }
+
+    #[test]
+    fn parses_explicit_file_list() {
+        let files = parse_file_list("src/a.ts, ./src/b.ts").expect("files parse");
+        assert_eq!(
+            files,
+            [PathBuf::from("src/a.ts"), PathBuf::from("./src/b.ts")]
+        );
+    }
+}
diff --git a/src/discovery.rs b/src/discovery.rs
new file mode 100644
index 0000000..2ff774e
--- /dev/null
+++ b/src/discovery.rs
@@ -0,0 +1,196 @@
+use std::fs;
+use std::path::{Path, PathBuf};
+
+use crate::error::{CodeM8Error, Result};
+use crate::model::SourceFile;
+use crate::paths::{format_path, normalize_display_path};
+
+const IGNORED_DIRECTORIES: &[&str] = &[
+    ".git",
+    "node_modules",
+    "target",
+    "dist",
+    "build",
+    "coverage",
+    ".next",
+    ".nuxt",
+    ".svelte-kit",
+    ".idea",
+    ".vscode",
+];
+
+/// Finds the files to analyze (explicit list or recursive walk), sorted by display path.
+pub fn discover_source_files(
+    current_dir: &Path,
+    extensions: &[String],
+    explicit_files: Option<&[PathBuf]>,
+) -> Result<Vec<SourceFile>> {
+    let mut source_files = match explicit_files {
+        Some(files) => discover_explicit_files(current_dir, extensions, files)?,
+        None => {
+            let mut source_files = Vec::new();
+            walk_directory(current_dir, current_dir, extensions, &mut source_files)?;
+            source_files
+        }
+    };
+    // Stable sort with each key formatted once, not twice per comparison.
+    source_files.sort_by_cached_key(|file| format_path(&file.display_path));
+    Ok(source_files)
+}
+
+/// Validates each explicitly listed file and keeps those with a selected extension.
+///
+/// Relative paths are resolved against `current_dir`; the display path is derived from the
+/// path exactly as it was listed.
+///
+/// # Errors
+///
+/// Fails when a listed path is missing, cannot be inspected, is a symbolic link, is a
+/// directory, or is not a file.
+fn discover_explicit_files(
+    current_dir: &Path,
+    extensions: &[String],
+    files: &[PathBuf],
+) -> Result<Vec<SourceFile>> {
+    let mut source_files = Vec::new();
+    for file in files {
+        let path = if file.is_absolute() {
+            file.clone()
+        } else {
+            current_dir.join(file)
+        };
+        let reject = |reason: &str| CodeM8Error::new(format!("{reason}: {}", format_path(file)));
+        let metadata = fs::symlink_metadata(&path).map_err(|error| match error.kind() {
+            std::io::ErrorKind::NotFound => reject("explicit file does not exist"),
+            // Permission and other I/O failures must not be misreported as a missing file.
+            _ => CodeM8Error::io(&path, "inspect path", error),
+        })?;
+        if metadata.file_type().is_symlink() {
+            return Err(reject("explicit file is a symbolic link and will not be followed"));
+        }
+        if metadata.is_dir() {
+            return Err(reject("explicit file is a directory"));
+        }
+        if !metadata.is_file() {
+            return Err(reject("explicit path is not a file"));
+        }
+        // A file with an unselected extension is skipped rather than reported as an error.
+        let Some(extension) = selected_extension(&path, extensions) else {
+            continue;
+        };
+        source_files.push(SourceFile {
+            path,
+            display_path: normalize_display_path(file),
+            extension,
+        });
+    }
+    Ok(source_files)
+}
+
+fn walk_directory(
+    root: &Path,
+    directory: &Path,
+    extensions: &[String],
+    source_files: &mut Vec<SourceFile>,
+) -> Result<()> {
+    let mut entries = fs::read_dir(directory)
+        .map_err(|error| CodeM8Error::io(directory, "read directory", error))?
+        .collect::<std::result::Result<Vec<_>, _>>()
+        .map_err(|error| CodeM8Error::io(directory, "read directory entry", error))?;
+    entries.sort_by(|left, right| {
+        left.file_name()
+            .to_string_lossy()
+            .cmp(&right.file_name().to_string_lossy())
+    });
+    for entry in entries {
+        let path = entry.path();
+        let file_type = entry
+            .file_type()
+            .map_err(|error| CodeM8Error::io(&path, "inspect path", error))?;
+        if file_type.is_symlink() {
+            continue;
+        }
+        if file_type.is_dir() {
+            let directory_name = entry.file_name().to_string_lossy().to_ascii_lowercase();
+            if IGNORED_DIRECTORIES.contains(&directory_name.as_str()) {
+                continue;
+            }
+            walk_directory(root, &path, extensions, source_files)?;
+        } else if file_type.is_file() {
+            let Some(extension) = selected_extension(&path, extensions) else {
+                continue;
+            };
+            let display_path = path
+                .strip_prefix(root)
+                .map(normalize_display_path)
+                .unwrap_or_else(|_| normalize_display_path(&path));
+            source_files.push(SourceFile {
+                path,
+                display_path,
+                extension,
+            });
+        }
+    }
+    Ok(())
+}
+
+/// Returns the lowercased extension of `path` if `extensions` lists it, ignoring ASCII case.
+fn selected_extension(path: &Path, extensions: &[String]) -> Option<String> {
+    // A path with no extension, or a non-UTF-8 one, is never selected.
+    let found = path.extension()?.to_str()?.to_ascii_lowercase();
+    let selected = extensions.iter().any(|s| s.eq_ignore_ascii_case(&found));
+    selected.then_some(found)
+}
+
+#[cfg(test)]
+mod tests {
+    use std::fs;
+    use std::sync::atomic::{AtomicUsize, Ordering};
+
+    use super::*;
+
+    static TEMP_COUNTER: AtomicUsize = AtomicUsize::new(0);
+
+    fn temp_dir(name: &str) -> PathBuf {
+        let id = TEMP_COUNTER.fetch_add(1, Ordering::Relaxed);
+        let path = std::env::temp_dir().join(format!(
+            "codem8-discovery-{name}-{}-{id}",
+            std::process::id()
+        ));
+        if path.exists() {
+            fs::remove_dir_all(&path).expect("remove stale test directory");
+        }
+        fs::create_dir_all(&path).expect("create test directory");
+        path
+    }
+
+    #[test]
+    fn recursively_discovers_matching_extensions_and_ignores_common_directories() {
+        let root = temp_dir("recursive");
+        fs::create_dir_all(root.join("src")).expect("create src");
+        fs::create_dir_all(root.join("target")).expect("create target");
+        fs::write(root.join("src").join("a.TS"), "").expect("write ts");
+        fs::write(root.join("src").join("b.js"), "").expect("write js");
+        fs::write(root.join("target").join("ignored.ts"), "").expect("write ignored");
+        let files = discover_source_files(&root, &["ts".to_string()], None).expect("discover");
+        assert_eq!(files.len(), 1);
+        assert_eq!(format_path(&files[0].display_path), "src/a.TS");
+        fs::remove_dir_all(root).expect("cleanup");
+    }
+
+    #[test]
+    fn explicit_files_skip_unselected_extensions() {
+        let root = temp_dir("explicit-skip");
+        fs::write(root.join("a.ts"), "").expect("write ts");
+        fs::write(root.join("b.js"), "").expect("write js");
+        let files = discover_source_files(
+            &root,
+            &["ts".to_string()],
+            Some(&[PathBuf::from("a.ts"), PathBuf::from("b.js")]),
+        )
+        .expect("discover");
+        assert_eq!(files.len(), 1);
+        assert_eq!(format_path(&files[0].display_path), "a.ts");
+        fs::remove_dir_all(root).expect("cleanup");
+    }
+}
diff --git a/src/duplicate.rs b/src/duplicate.rs
new file mode 100644
index 0000000..ef11e35
--- /dev/null
+++ b/src/duplicate.rs
@@ -0,0 +1,316 @@
+use std::cmp::Ordering;
+use std::collections::{BTreeSet, HashMap};
+use std::path::PathBuf;
+
+use crate::model::{DuplicateBlock, DuplicateOccurrence, LineEntry, LineStatus, ProcessedFile};
+use crate::paths::format_path;
+
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
+struct LineRef {
+    file_index: usize,
+    line_index: usize,
+}
+
+#[derive(Debug, Clone, PartialEq, Eq)]
+struct OccurrenceKey {
+    file_path: PathBuf,
+    file_path_key: String,
+    start_line: usize,
+    end_line: usize,
+}
+
+impl Ord for OccurrenceKey {
+    fn cmp(&self, other: &Self) -> Ordering {
+        self.file_path_key
+            .cmp(&other.file_path_key)
+            .then_with(|| self.start_line.cmp(&other.start_line))
+            .then_with(|| self.end_line.cmp(&other.end_line))
+    }
+}
+
+impl PartialOrd for OccurrenceKey {
+    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
+        Some(self.cmp(other))
+    }
+}
+
+pub fn detect_duplicate_blocks(files: &[ProcessedFile]) -> Vec<DuplicateBlock> {
+    let mut line_index: HashMap<u128, Vec<LineRef>> = HashMap::new();
+    for (file_index, file) in files.iter().enumerate() {
+        for (line_index_in_file, line) in file.lines.iter().enumerate() {
+            line_index.entry(line.hash).or_default().push(LineRef {
+                file_index,
+                line_index: line_index_in_file,
+            });
+        }
+    }
+    let mut blocks_by_lines: HashMap<Vec<String>, BTreeSet<OccurrenceKey>> = HashMap::new();
+    for refs in line_index.values() {
+        if refs.len() < 2 {
+            continue;
+        }
+        let mut comparison_refs_by_text: HashMap<String, Vec<LineRef>> = HashMap::new();
+        for line_ref in refs {
+            let line = line_at(files, *line_ref);
+            if line.status != LineStatus::Comparison {
+                continue;
+            }
+            comparison_refs_by_text
+                .entry(line.normalized_text.clone())
+                .or_default()
+                .push(*line_ref);
+        }
+        for comparison_refs in comparison_refs_by_text.values() {
+            if comparison_refs.len() < 2 {
+                continue;
+            }
+            for left_index in 0..comparison_refs.len() {
+                for right_index in (left_index + 1)..comparison_refs.len() {
+                    let left = comparison_refs[left_index];
+                    let right = comparison_refs[right_index];
+                    let Some(candidate) = expand_pair(files, left, right) else {
+                        continue;
+                    };
+                    let occurrences = blocks_by_lines
+                        .entry(candidate.normalized_lines)
+                        .or_default();
+                    occurrences.insert(candidate.left_occurrence);
+                    occurrences.insert(candidate.right_occurrence);
+                }
+            }
+        }
+    }
+    let mut duplicate_blocks = blocks_by_lines
+        .into_iter()
+        .filter_map(|(normalized_lines, occurrences)| {
+            if normalized_lines.is_empty() || occurrences.len() < 2 {
+                return None;
+            }
+            let occurrences = occurrences
+                .into_iter()
+                .map(|occurrence| DuplicateOccurrence {
+                    file_path: occurrence.file_path,
+                    start_line: occurrence.start_line,
+                    end_line: occurrence.end_line,
+                })
+                .collect::<Vec<_>>();
+            let character_count = normalized_lines
+                .iter()
+                .map(|line| line.chars().count() as u64)
+                .sum::<u64>();
+            let weight =
+                (occurrences.len() as u64 - 1) * normalized_lines.len() as u64 * character_count;
+            Some(DuplicateBlock {
+                normalized_lines,
+                occurrences,
+                weight,
+            })
+        })
+        .collect::<Vec<_>>();
+    duplicate_blocks.sort_by(compare_duplicate_blocks);
+    duplicate_blocks
+}
+
+#[derive(Debug)]
+struct CandidateBlock {
+    normalized_lines: Vec<String>,
+    left_occurrence: OccurrenceKey,
+    right_occurrence: OccurrenceKey,
+}
+
+fn expand_pair(files: &[ProcessedFile], left: LineRef, right: LineRef) -> Option<CandidateBlock> {
+    if left == right {
+        return None;
+    }
+    let mut left_start = left.line_index;
+    let mut right_start = right.line_index;
+    while left_start > 0
+        && right_start > 0
+        && line_text(files, left.file_index, left_start - 1)
+            == line_text(files, right.file_index, right_start - 1)
+    {
+        left_start -= 1;
+        right_start -= 1;
+    }
+    let mut left_end = left.line_index;
+    let mut right_end = right.line_index;
+    while left_end + 1 < files[left.file_index].lines.len()
+        && right_end + 1 < files[right.file_index].lines.len()
+        && line_text(files, left.file_index, left_end + 1)
+            == line_text(files, right.file_index, right_end + 1)
+    {
+        left_end += 1;
+        right_end += 1;
+    }
+    let normalized_lines = files[left.file_index].lines[left_start..=left_end]
+        .iter()
+        .map(|line| line.normalized_text.clone())
+        .collect::<Vec<_>>();
+    Some(CandidateBlock {
+        normalized_lines,
+        left_occurrence: occurrence_for(files, left.file_index, left_start, left_end),
+        right_occurrence: occurrence_for(files, right.file_index, right_start, right_end),
+    })
+}
+
+fn occurrence_for(
+    files: &[ProcessedFile],
+    file_index: usize,
+    start_index: usize,
+    end_index: usize,
+) -> OccurrenceKey {
+    let lines = &files[file_index].lines;
+    let file_path = files[file_index].source.display_path.clone();
+    OccurrenceKey {
+        file_path_key: format_path(&file_path),
+        file_path,
+        start_line: lines[start_index].line_number,
+        end_line: lines[end_index].line_number,
+    }
+}
+
+fn line_at(files: &[ProcessedFile], line_ref: LineRef) -> &LineEntry {
+    &files[line_ref.file_index].lines[line_ref.line_index]
+}
+
+fn line_text(files: &[ProcessedFile], file_index: usize, line_index: usize) -> &str {
+    &files[file_index].lines[line_index].normalized_text
+}
+
+fn compare_duplicate_blocks(left: &DuplicateBlock, right: &DuplicateBlock) -> Ordering {
+    right
+        .weight
+        .cmp(&left.weight)
+        .then_with(|| right.line_count().cmp(&left.line_count()))
+        .then_with(|| right.character_count().cmp(&left.character_count()))
+        .then_with(|| first_occurrence_key(left).cmp(&first_occurrence_key(right)))
+        .then_with(|| first_occurrence_start_line(left).cmp(&first_occurrence_start_line(right)))
+        .then_with(|| normalized_block_text(left).cmp(&normalized_block_text(right)))
+}
+
+fn first_occurrence_key(block: &DuplicateBlock) -> String {
+    block
+        .occurrences
+        .first()
+        .map(|occurrence| format_path(&occurrence.file_path))
+        .unwrap_or_default()
+}
+
+fn first_occurrence_start_line(block: &DuplicateBlock) -> usize {
+    block
+        .occurrences
+        .first()
+        .map(|occurrence| occurrence.start_line)
+        .unwrap_or_default()
+}
+
+fn normalized_block_text(block: &DuplicateBlock) -> String {
+    block.normalized_lines.join("\n")
+}
+
+#[cfg(test)]
+mod tests {
+    use crate::language::hash_normalized_line;
+    use crate::model::{LineEntry, ProcessedFile, SourceFile};
+
+    use super::*;
+
+    fn processed_file(path: &str, extension: &str, lines: &[(&str, LineStatus)]) -> ProcessedFile {
+        let line_entries = lines
+            .iter()
+            .enumerate()
+            .map(|(index, (text, status))| LineEntry {
+                file_path: PathBuf::from(path),
+                line_number: index + 1,
+                normalized_text: (*text).to_string(),
+                hash: hash_normalized_line(text),
+                status: *status,
+            })
+            .collect();
+        ProcessedFile {
+            source: SourceFile {
+                path: PathBuf::from(path),
+                display_path: PathBuf::from(path),
+                extension: extension.to_string(),
+            },
+            lines: line_entries,
+        }
+    }
+
+    #[test]
+    fn groups_three_occurrences_of_the_same_block() {
+        let files = vec![
+            processed_file(
+                "a.ts",
+                "ts",
+                &[
+                    ("const value = one;", LineStatus::Comparison),
+                    ("return value;", LineStatus::Comparison),
+                ],
+            ),
+            processed_file(
+                "b.ts",
+                "ts",
+                &[
+                    ("const value = one;", LineStatus::Comparison),
+                    ("return value;", LineStatus::Comparison),
+                ],
+            ),
+            processed_file(
+                "c.ts",
+                "ts",
+                &[
+                    ("const value = one;", LineStatus::Comparison),
+                    ("return value;", LineStatus::Comparison),
+                ],
+            ),
+        ];
+        let blocks = detect_duplicate_blocks(&files);
+        assert_eq!(blocks.len(), 1);
+        assert_eq!(blocks[0].occurrences.len(), 3);
+        assert_eq!(
+            blocks[0].normalized_lines,
+            ["const value = one;", "return value;"]
+        );
+    }
+
+    #[test]
+    fn ignores_single_line_duplicates_that_are_only_block_only_lines() {
+        let files = vec![
+            processed_file("a.ts", "ts", &[("}", LineStatus::BlockOnly)]),
+            processed_file("b.ts", "ts", &[("}", LineStatus::BlockOnly)]),
+        ];
+        let blocks = detect_duplicate_blocks(&files);
+        assert!(blocks.is_empty());
+    }
+
+    #[test]
+    fn includes_block_only_lines_inside_larger_duplicate_blocks() {
+        let files = vec![
+            processed_file(
+                "a.ts",
+                "ts",
+                &[
+                    ("if (ready) {", LineStatus::Comparison),
+                    ("}", LineStatus::BlockOnly),
+                    ("return value;", LineStatus::Comparison),
+                ],
+            ),
+            processed_file(
+                "b.ts",
+                "ts",
+                &[
+                    ("if (ready) {", LineStatus::Comparison),
+                    ("}", LineStatus::BlockOnly),
+                    ("return value;", LineStatus::Comparison),
+                ],
+            ),
+        ];
+        let blocks = detect_duplicate_blocks(&files);
+        assert_eq!(blocks.len(), 1);
+        assert_eq!(
+            blocks[0].normalized_lines,
+            ["if (ready) {", "}", "return value;"]
+        );
+    }
+}
diff --git a/src/error.rs b/src/error.rs
new file mode 100644
index 0000000..fcb7545
--- /dev/null
+++ b/src/error.rs
@@ -0,0 +1,33 @@
+use std::error::Error;
+use std::fmt;
+use std::io;
+use std::path::Path;
+
+use crate::paths::format_path;
+
+pub type Result<T> = std::result::Result<T, CodeM8Error>;
+
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub struct CodeM8Error {
+    message: String,
+}
+
+impl CodeM8Error {
+    pub fn new(message: impl Into<String>) -> Self {
+        Self {
+            message: message.into(),
+        }
+    }
+
+    pub fn io(path: &Path, action: &str, error: io::Error) -> Self {
+        Self::new(format!("could not {action} {}: {error}", format_path(path)))
+    }
+}
+
+impl fmt::Display for CodeM8Error {
+    fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result {
+        formatter.write_str(&self.message)
+    }
+}
+
+impl Error for CodeM8Error {}
diff --git a/src/language.rs b/src/language.rs
new file mode 100644
index 0000000..2dc3dc3
--- /dev/null
+++ b/src/language.rs
@@ -0,0 +1,192 @@
+use std::collections::HashMap;
+use std::sync::OnceLock;
+
+use crate::model::LineStatus;
+
+#[derive(Debug, Clone, Copy)]
+pub struct LanguageLinePattern {
+    pub language_name: &'static str,
+    pub extensions: &'static [&'static str],
+    pub block_only_lines: &'static [&'static str],
+}
+
+pub const LANGUAGE_PATTERNS: &[LanguageLinePattern] = &[
+    LanguageLinePattern {
+        language_name: "TypeScript / JavaScript",
+        extensions: &["ts", "tsx", "js", "jsx", "mjs", "cjs"],
+        block_only_lines: &[
+            "(", ")", "{", "}", "[", "]", ");", "];", "};", ")};", "}),", "});",
+        ],
+    },
+    LanguageLinePattern {
+        language_name: "Rust",
+        extensions: &["rs"],
+        block_only_lines: &["{", "}", "(", ")", "[", "]", ");", "];"],
+    },
+    LanguageLinePattern {
+        language_name: "C / C++ / Objective-C",
+        extensions: &["c", "h", "cpp", "hpp", "cc", "hh", "cxx", "hxx", "m", "mm"],
+        block_only_lines: &[
+            "{", "}", "(", ")", "[", "]", ");", "];", "};", "#endif", "#else",
+        ],
+    },
+    LanguageLinePattern {
+        language_name: "C#",
+        extensions: &["cs"],
+        block_only_lines: &[
+            "{",
+            "}",
+            "(",
+            ")",
+            "[",
+            "]",
+            ");",
+            "];",
+            "};",
+            "#endregion",
+            "#else",
+            "#endif",
+        ],
+    },
+    LanguageLinePattern {
+        language_name: "Java / Kotlin / Scala",
+        extensions: &["java", "kt", "kts", "scala", "sc"],
+        block_only_lines: &["{", "}", "(", ")", "[", "]", ");", "];", "};"],
+    },
+    LanguageLinePattern {
+        language_name: "Go",
+        extensions: &["go"],
+        block_only_lines: &["{", "}", "(", ")", "[", "]"],
+    },
+    LanguageLinePattern {
+        language_name: "Python",
+        extensions: &["py", "pyw"],
+        block_only_lines: &["(", ")", "[", "]", "{", "}"],
+    },
+    LanguageLinePattern {
+        language_name: "Ruby",
+        extensions: &["rb"],
+        block_only_lines: &["(", ")", "[", "]", "{", "}", "end"],
+    },
+    LanguageLinePattern {
+        language_name: "PHP",
+        extensions: &["php", "phtml"],
+        block_only_lines: &["{", "}", "(", ")", "[", "]", ");", "];", "};", "?>"],
+    },
+    LanguageLinePattern {
+        language_name: "Swift",
+        extensions: &["swift"],
+        block_only_lines: &["{", "}", "(", ")", "[", "]", ");", "];"],
+    },
+    LanguageLinePattern {
+        language_name: "Shell",
+        extensions: &["sh", "bash", "zsh", "fish"],
+        block_only_lines: &["then", "do", "done", "fi", "else", "{", "}"],
+    },
+    LanguageLinePattern {
+        language_name: "PowerShell",
+        extensions: &["ps1", "psm1", "psd1"],
+        block_only_lines: &["{", "}", "(", ")", "[", "]", ");"],
+    },
+    LanguageLinePattern {
+        language_name: "HTML / XML",
+        extensions: &["html", "htm", "xml", "xhtml", "svg"],
+        block_only_lines: &[
+            ">",
+            "/>",
+            "</div>",
+            "</span>",
+            "</section>",
+            "</article>",
+            "</body>",
+            "</html>",
+        ],
+    },
+    LanguageLinePattern {
+        language_name: "CSS / SCSS / Sass / Less",
+        extensions: &["css", "scss", "sass", "less"],
+        block_only_lines: &["{", "}", ");"],
+    },
+    LanguageLinePattern {
+        language_name: "SQL",
+        extensions: &["sql"],
+        block_only_lines: &["(", ")", ");", ";", "BEGIN", "END"],
+    },
+    LanguageLinePattern {
+        language_name: "YAML / JSON / TOML",
+        extensions: &["yaml", "yml", "json", "toml"],
+        block_only_lines: &["{", "}", "[", "]", "},", "],"],
+    },
+];
+
+#[derive(Debug)]
+struct BlockOnlyRegistry {
+    by_extension: HashMap<&'static str, HashMap<u128, Vec<&'static str>>>,
+}
+
+static BLOCK_ONLY_REGISTRY: OnceLock<BlockOnlyRegistry> = OnceLock::new();
+
+pub fn hash_normalized_line(line: &str) -> u128 {
+    xxhash_rust::xxh3::xxh3_128(line.as_bytes())
+}
+
+pub fn classify_line(extension: &str, normalized_line: &str, hash: u128) -> LineStatus {
+    let extension = extension.to_ascii_lowercase();
+    let Some(patterns_by_hash) = registry().by_extension.get(extension.as_str()) else {
+        return LineStatus::Comparison;
+    };
+    let Some(patterns) = patterns_by_hash.get(&hash) else {
+        return LineStatus::Comparison;
+    };
+    if patterns.contains(&normalized_line) {
+        LineStatus::BlockOnly
+    } else {
+        LineStatus::Comparison
+    }
+}
+
+fn registry() -> &'static BlockOnlyRegistry {
+    BLOCK_ONLY_REGISTRY.get_or_init(|| {
+        let mut by_extension: HashMap<&'static str, HashMap<u128, Vec<&'static str>>> =
+            HashMap::new();
+        for language in LANGUAGE_PATTERNS {
+            for extension in language.extensions {
+                let patterns_by_hash = by_extension.entry(extension).or_default();
+                for line in language.block_only_lines {
+                    patterns_by_hash
+                        .entry(hash_normalized_line(line))
+                        .or_default()
+                        .push(line);
+                }
+            }
+        }
+        BlockOnlyRegistry { by_extension }
+    })
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn assigns_block_only_status_from_extension_specific_registry() {
+        let hash = hash_normalized_line("}");
+        assert_eq!(classify_line("ts", "}", hash), LineStatus::BlockOnly);
+    }
+
+    #[test]
+    fn assigns_comparison_status_for_meaningful_lines() {
+        let line = "const value = computeValue(input);";
+        let hash = hash_normalized_line(line);
+        assert_eq!(classify_line("ts", line, hash), LineStatus::Comparison);
+    }
+
+    #[test]
+    fn verifies_text_after_hash_lookup() {
+        let hash = hash_normalized_line("}");
+        assert_eq!(
+            classify_line("ts", "not-a-brace", hash),
+            LineStatus::Comparison
+        );
+    }
+}
diff --git a/src/lib.rs b/src/lib.rs
new file mode 100644
index 0000000..805bc69
--- /dev/null
+++ b/src/lib.rs
@@ -0,0 +1,167 @@
+pub mod cli;
+pub mod discovery;
+pub mod duplicate;
+pub mod error;
+pub mod language;
+pub mod line;
+pub mod model;
+pub mod paths;
+pub mod report;
+
+use std::io::Write;
+use std::path::Path;
+
+use crate::error::{CodeM8Error, Result};
+
+pub fn run<I, S, W>(args: I, current_dir: &Path, writer: &mut W) -> Result<()>
+where
+    I: IntoIterator<Item = S>,
+    S: Into<String>,
+    W: Write,
+{
+    let config = cli::parse_args(args)?;
+    if config.report_duplicate {
+        let source_files = discovery::discover_source_files(
+            current_dir,
+            &config.file_extensions,
+            config.files.as_deref(),
+        )?;
+        let processed_files = line::process_source_files(&source_files)?;
+        let duplicate_blocks = duplicate::detect_duplicate_blocks(&processed_files);
+        let report = report::DuplicateReport {
+            analyzed_files: source_files.len(),
+            analyzed_extensions: config.file_extensions,
+            duplicate_blocks,
+        };
+        writer
+            .write_all(report::render_duplicate_report(&report).as_bytes())
+            .map_err(|error| CodeM8Error::new(format!("could not write report output: {error}")))?;
+    }
+    Ok(())
+}
+
+#[cfg(test)]
+mod tests {
+    use std::fs;
+    use std::path::{Path, PathBuf};
+    use std::sync::atomic::{AtomicUsize, Ordering};
+
+    use super::*;
+
+    static TEMP_COUNTER: AtomicUsize = AtomicUsize::new(0);
+
+    struct TempProject {
+        path: PathBuf,
+    }
+
+    impl TempProject {
+        fn new(name: &str) -> Self {
+            let id = TEMP_COUNTER.fetch_add(1, Ordering::Relaxed);
+            let path =
+                std::env::temp_dir().join(format!("codem8-{name}-{}-{id}", std::process::id()));
+            if path.exists() {
+                fs::remove_dir_all(&path).expect("remove stale test directory");
+            }
+            fs::create_dir_all(&path).expect("create test directory");
+            Self { path }
+        }
+
+        fn write(&self, relative_path: &str, contents: &str) {
+            let path = self.path.join(relative_path);
+            if let Some(parent) = path.parent() {
+                fs::create_dir_all(parent).expect("create test parent directory");
+            }
+            fs::write(path, contents).expect("write test file");
+        }
+
+        fn path(&self) -> &Path {
+            &self.path
+        }
+    }
+
+    impl Drop for TempProject {
+        fn drop(&mut self) {
+            let _ = fs::remove_dir_all(&self.path);
+        }
+    }
+
+    fn run_in(project: &TempProject, args: &[&str]) -> std::result::Result<String, CodeM8Error> {
+        let mut output = Vec::new();
+        run(args.iter().copied(), project.path(), &mut output)?;
+        Ok(String::from_utf8(output).expect("report is UTF-8"))
+    }
+
+    #[test]
+    fn duplicate_report_snapshot_is_stable() {
+        let project = TempProject::new("snapshot");
+        project.write(
+            "src/a.ts",
+            "const value = computeValue(input);\nif (value === undefined) {\nreturn defaultValue;\n}\n",
+        );
+        project.write(
+            "src/b.ts",
+            "const value = computeValue(input);\nif (value === undefined) {\nreturn defaultValue;\n}\n",
+        );
+        let output = run_in(&project, &["--report-duplicate"]).expect("report succeeds");
+        assert_eq!(
+            output,
+            concat!(
+                "Duplicate Code Report\n",
+                "=====================\n",
+                "\n",
+                "Analyzed files: 2\n",
+                "Analyzed extensions: ts\n",
+                "Duplicate blocks found: 1\n",
+                "\n",
+                "#1 Weight: 324\n",
+                "Lines: 4\n",
+                "Characters: 81\n",
+                "Occurrences: 2\n",
+                "\n",
+                "Locations:\n",
+                "- src/a.ts:1-4\n",
+                "- src/b.ts:1-4\n",
+                "\n",
+                "Code:\n",
+                "  const value = computeValue(input);\n",
+                "  if (value === undefined) {\n",
+                "  return defaultValue;\n",
+                "  }\n",
+            )
+        );
+    }
+
+    #[test]
+    fn explicit_files_disable_recursive_discovery() {
+        let project = TempProject::new("explicit-files");
+        project.write("src/a.ts", "const value = one;\n");
+        project.write("src/b.ts", "const value = one;\n");
+        let output =
+            run_in(&project, &["--report-duplicate", "-files=src/a.ts"]).expect("report succeeds");
+        assert!(output.contains("Analyzed files: 1"));
+        assert!(output.contains("Duplicate blocks found: 0"));
+    }
+
+    #[test]
+    fn custom_extensions_change_analyzed_files() {
+        let project = TempProject::new("custom-extensions");
+        project.write("src/a.js", "const value = one;\n");
+        project.write("src/b.js", "const value = one;\n");
+        let default_output = run_in(&project, &["--report-duplicate"]).expect("report succeeds");
+        assert!(default_output.contains("Analyzed files: 0"));
+        let js_output = run_in(&project, &["--report-duplicate", "-file-extension=js"])
+            .expect("report succeeds");
+        assert!(js_output.contains("Analyzed files: 2"));
+        assert!(js_output.contains("Duplicate blocks found: 1"));
+    }
+
+    #[test]
+    fn invalid_explicit_file_returns_a_clear_error() {
+        let project = TempProject::new("invalid-file");
+        let error = run_in(&project, &["--report-duplicate", "-files=missing.ts"])
+            .expect_err("missing explicit file fails");
+        assert!(error
+            .to_string()
+            .contains("explicit file does not exist: missing.ts"));
+    }
+}
diff --git a/src/line.rs b/src/line.rs
new file mode 100644
index 0000000..894faaf
--- /dev/null
+++ b/src/line.rs
@@ -0,0 +1,88 @@
+use std::fs::File;
+use std::io::{BufRead, BufReader};
+
+use crate::error::{CodeM8Error, Result};
+use crate::language::{classify_line, hash_normalized_line};
+use crate::model::{LineEntry, ProcessedFile, SourceFile};
+
+pub fn process_source_files(source_files: &[SourceFile]) -> Result<Vec<ProcessedFile>> {
+    source_files.iter().map(process_source_file).collect()
+}
+
+pub fn process_source_file(source_file: &SourceFile) -> Result<ProcessedFile> {
+    let file = File::open(&source_file.path)
+        .map_err(|error| CodeM8Error::io(&source_file.display_path, "open file", error))?;
+    let reader = BufReader::new(file);
+    let mut lines = Vec::new();
+    for (index, line) in reader.lines().enumerate() {
+        let line = line.map_err(|error| {
+            CodeM8Error::new(format!(
+                "could not read {} as UTF-8 text: {error}",
+                crate::paths::format_path(&source_file.display_path)
+            ))
+        })?;
+        let Some(normalized_text) = normalize_line(&line) else {
+            continue;
+        };
+        let hash = hash_normalized_line(&normalized_text);
+        let status = classify_line(&source_file.extension, &normalized_text, hash);
+        lines.push(LineEntry {
+            file_path: source_file.display_path.clone(),
+            line_number: index + 1,
+            normalized_text,
+            hash,
+            status,
+        });
+    }
+    Ok(ProcessedFile {
+        source: source_file.clone(),
+        lines,
+    })
+}
+
+pub fn normalize_line(line: &str) -> Option<String> {
+    let normalized = line.trim();
+    if normalized.is_empty() {
+        None
+    } else {
+        Some(normalized.to_string())
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use std::fs;
+
+    use crate::model::LineStatus;
+
+    use super::*;
+
+    #[test]
+    fn trims_unicode_whitespace_and_skips_empty_lines() {
+        assert_eq!(
+            normalize_line("\t value \u{2003}"),
+            Some("value".to_string())
+        );
+        assert_eq!(normalize_line(" \t "), None);
+    }
+
+    #[test]
+    fn processes_non_empty_lines_with_original_line_numbers() {
+        let path = std::env::temp_dir().join(format!("codem8-line-test-{}.ts", std::process::id()));
+        fs::write(&path, "  const value = 1;  \n\n   }\n").expect("write source file");
+        let source = SourceFile {
+            path: path.clone(),
+            display_path: "sample.ts".into(),
+            extension: "ts".to_string(),
+        };
+        let processed = process_source_file(&source).expect("process source file");
+        assert_eq!(processed.lines.len(), 2);
+        assert_eq!(processed.lines[0].line_number, 1);
+        assert_eq!(processed.lines[0].normalized_text, "const value = 1;");
+        assert_eq!(processed.lines[0].status, LineStatus::Comparison);
+        assert_eq!(processed.lines[1].line_number, 3);
+        assert_eq!(processed.lines[1].normalized_text, "}");
+        assert_eq!(processed.lines[1].status, LineStatus::BlockOnly);
+        fs::remove_file(path).expect("cleanup");
+    }
+}
diff --git a/src/main.rs b/src/main.rs
new file mode 100644
index 0000000..77dbbac
--- /dev/null
+++ b/src/main.rs
@@ -0,0 +1,24 @@
+use std::io::Write;
+use std::process::ExitCode;
+
+fn main() -> ExitCode {
+    let current_dir = match std::env::current_dir() {
+        Ok(current_dir) => current_dir,
+        Err(error) => {
+            eprintln!("error: could not determine current directory: {error}");
+            return ExitCode::FAILURE;
+        }
+    };
+    let stdout = std::io::stdout();
+    let mut stdout = stdout.lock();
+    match codem8::run(std::env::args().skip(1), &current_dir, &mut stdout) {
+        Ok(()) => {
+            let _ = stdout.flush();
+            ExitCode::SUCCESS
+        }
+        Err(error) => {
+            eprintln!("error: {error}");
+            ExitCode::FAILURE
+        }
+    }
+}
diff --git a/src/model.rs b/src/model.rs
new file mode 100644
index 0000000..e7a1248
--- /dev/null
+++ b/src/model.rs
@@ -0,0 +1,56 @@
+use std::path::PathBuf;
+
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum LineStatus {
+    Comparison,
+    BlockOnly,
+}
+
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub struct SourceFile {
+    pub path: PathBuf,
+    pub display_path: PathBuf,
+    pub extension: String,
+}
+
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub struct LineEntry {
+    pub file_path: PathBuf,
+    pub line_number: usize,
+    pub normalized_text: String,
+    pub hash: u128,
+    pub status: LineStatus,
+}
+
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub struct ProcessedFile {
+    pub source: SourceFile,
+    pub lines: Vec<LineEntry>,
+}
+
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub struct DuplicateOccurrence {
+    pub file_path: PathBuf,
+    pub start_line: usize,
+    pub end_line: usize,
+}
+
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub struct DuplicateBlock {
+    pub normalized_lines: Vec<String>,
+    pub occurrences: Vec<DuplicateOccurrence>,
+    pub weight: u64,
+}
+
+impl DuplicateBlock {
+    pub fn line_count(&self) -> usize {
+        self.normalized_lines.len()
+    }
+
+    pub fn character_count(&self) -> u64 {
+        self.normalized_lines
+            .iter()
+            .map(|line| line.chars().count() as u64)
+            .sum()
+    }
+}
diff --git a/src/paths.rs b/src/paths.rs
new file mode 100644
index 0000000..56e1812
--- /dev/null
+++ b/src/paths.rs
@@ -0,0 +1,22 @@
+use std::path::{Component, Path, PathBuf};
+
+pub fn format_path(path: &Path) -> String {
+    path.to_string_lossy().replace('\\', "/")
+}
+
+pub fn normalize_display_path(path: &Path) -> PathBuf {
+    let mut normalized = PathBuf::new();
+    for component in path.components() {
+        match component {
+            Component::CurDir => {}
+            Component::Normal(part) => normalized.push(part),
+            Component::ParentDir => normalized.push(".."),
+            Component::RootDir | Component::Prefix(_) => normalized.push(component.as_os_str()),
+        }
+    }
+    if normalized.as_os_str().is_empty() {
+        PathBuf::from(".")
+    } else {
+        normalized
+    }
+}
diff --git a/src/report.rs b/src/report.rs
new file mode 100644
index 0000000..f6207c3
--- /dev/null
+++ b/src/report.rs
@@ -0,0 +1,103 @@
+use crate::model::DuplicateBlock;
+use crate::paths::format_path;
+
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub struct DuplicateReport {
+    pub analyzed_files: usize,
+    pub analyzed_extensions: Vec<String>,
+    pub duplicate_blocks: Vec<DuplicateBlock>,
+}
+
+pub fn render_duplicate_report(report: &DuplicateReport) -> String {
+    let mut output = String::new();
+    output.push_str("Duplicate Code Report\n");
+    output.push_str("=====================\n\n");
+    output.push_str(&format!("Analyzed files: {}\n", report.analyzed_files));
+    output.push_str(&format!(
+        "Analyzed extensions: {}\n",
+        report.analyzed_extensions.join(", ")
+    ));
+    output.push_str(&format!(
+        "Duplicate blocks found: {}\n",
+        report.duplicate_blocks.len()
+    ));
+    for (index, block) in report.duplicate_blocks.iter().enumerate() {
+        output.push('\n');
+        output.push_str(&format!("#{} Weight: {}\n", index + 1, block.weight));
+        output.push_str(&format!("Lines: {}\n", block.line_count()));
+        output.push_str(&format!("Characters: {}\n", block.character_count()));
+        output.push_str(&format!("Occurrences: {}\n\n", block.occurrences.len()));
+        output.push_str("Locations:\n");
+        for occurrence in &block.occurrences {
+            output.push_str(&format!(
+                "- {}:{}-{}\n",
+                format_path(&occurrence.file_path),
+                occurrence.start_line,
+                occurrence.end_line
+            ));
+        }
+        output.push_str("\nCode:\n");
+        for line in &block.normalized_lines {
+            output.push_str("  ");
+            output.push_str(line);
+            output.push('\n');
+        }
+    }
+    output
+}
+
+#[cfg(test)]
+mod tests {
+    use std::path::PathBuf;
+
+    use crate::model::{DuplicateBlock, DuplicateOccurrence};
+
+    use super::*;
+
+    #[test]
+    fn renders_empty_report() {
+        let report = DuplicateReport {
+            analyzed_files: 0,
+            analyzed_extensions: vec!["ts".to_string()],
+            duplicate_blocks: Vec::new(),
+        };
+        assert_eq!(
+            render_duplicate_report(&report),
+            "Duplicate Code Report\n\
+             =====================\n\
+             \n\
+             Analyzed files: 0\n\
+             Analyzed extensions: ts\n\
+             Duplicate blocks found: 0\n"
+        );
+    }
+
+    #[test]
+    fn renders_duplicate_block_details() {
+        let report = DuplicateReport {
+            analyzed_files: 2,
+            analyzed_extensions: vec!["ts".to_string(), "js".to_string()],
+            duplicate_blocks: vec![DuplicateBlock {
+                normalized_lines: vec!["return value;".to_string()],
+                occurrences: vec![
+                    DuplicateOccurrence {
+                        file_path: PathBuf::from("src/a.ts"),
+                        start_line: 1,
+                        end_line: 1,
+                    },
+                    DuplicateOccurrence {
+                        file_path: PathBuf::from("src/b.js"),
+                        start_line: 5,
+                        end_line: 5,
+                    },
+                ],
+                weight: 13,
+            }],
+        };
+        let output = render_duplicate_report(&report);
+        assert!(output.contains("#1 Weight: 13"));
+        assert!(output.contains("Lines: 1"));
+        assert!(output.contains("- src/a.ts:1-1"));
+        assert!(output.contains("  return value;"));
+    }
+}

From a4e1b147cd749a3f408428879a5e2f22a4850b2d Mon Sep 17 00:00:00 2001
From: b4prog <b4prog@gmail.com>
Date: Thu, 25 Jun 2026 12:54:07 +0200
Subject: [PATCH 02/22] [ci] wrap CodeRabbit workflow script in async IIFE

---
 .github/workflows/coderabbit-review.yml | 36 ++++++++++++++-----------
 1 file changed, 20 insertions(+), 16 deletions(-)

diff --git a/.github/workflows/coderabbit-review.yml b/.github/workflows/coderabbit-review.yml
index 4a61832..5cfc373 100644
--- a/.github/workflows/coderabbit-review.yml
+++ b/.github/workflows/coderabbit-review.yml
@@ -53,23 +53,27 @@ jobs:
             return fetchReviews(page + 1, reviews.concat(pageReviews));
           }
 
-          const reviews = await fetchReviews();
-          const codeRabbitReviews = reviews
-            .filter((review) => review.user?.login === "coderabbitai[bot]")
-            .filter((review) => review.commit_id === headSha)
-            .sort((left, right) => new Date(left.submitted_at) - new Date(right.submitted_at));
+          (async () => {
+            const reviews = await fetchReviews();
+            const codeRabbitReviews = reviews
+              .filter((review) => review.user?.login === "coderabbitai[bot]")
+              .filter((review) => review.commit_id === headSha)
+              .sort((left, right) => new Date(left.submitted_at) - new Date(right.submitted_at));
 
-          const latestReview = codeRabbitReviews.at(-1);
-          if (!latestReview) {
-            console.error(`CodeRabbit has not submitted a review for ${headSha}.`);
-            process.exit(1);
-          }
+            const latestReview = codeRabbitReviews.at(-1);
+            if (!latestReview) {
+              console.error(`CodeRabbit has not submitted a review for ${headSha}.`);
+              process.exit(1);
+            }
 
-          if (latestReview.state === "CHANGES_REQUESTED") {
-            console.error("CodeRabbit requested changes on this pull request.");
-            process.exit(1);
-          }
+            if (latestReview.state === "CHANGES_REQUESTED") {
+              console.error("CodeRabbit requested changes on this pull request.");
+              process.exit(1);
+            }
 
-          console.log(`CodeRabbit review state for ${headSha}: ${latestReview.state}`);
+            console.log(`CodeRabbit review state for ${headSha}: ${latestReview.state}`);
+          })().catch((error) => {
+            console.error(error);
+            process.exit(1);
+          });
           NODE
-

From cb9c7e8497cbb21e604371d5c31949b017f2f8a9 Mon Sep 17 00:00:00 2001
From: b4prog <b4prog@gmail.com>
Date: Thu, 25 Jun 2026 12:59:38 +0200
Subject: [PATCH 03/22] [fix] deduplicate resolved explicit source files

---
 src/discovery.rs | 30 +++++++++++++++++++++++++++++-
 1 file changed, 29 insertions(+), 1 deletion(-)

diff --git a/src/discovery.rs b/src/discovery.rs
index 2ff774e..1244180 100644
--- a/src/discovery.rs
+++ b/src/discovery.rs
@@ -1,3 +1,4 @@
+use std::collections::HashSet;
 use std::fs;
 use std::path::{Path, PathBuf};
 
@@ -44,6 +45,7 @@ fn discover_explicit_files(
     files: &[PathBuf],
 ) -> Result<Vec<SourceFile>> {
     let mut source_files = Vec::new();
+    let mut seen_paths = HashSet::new();
     for file in files {
         let absolute_input = file.is_absolute();
         let path = if absolute_input {
@@ -78,8 +80,13 @@ fn discover_explicit_files(
         let Some(extension) = selected_extension(&path, extensions) else {
             continue;
         };
+        let canonical_path = fs::canonicalize(&path)
+            .map_err(|error| CodeM8Error::io(&path, "canonicalize explicit file", error))?;
+        if !seen_paths.insert(canonical_path.clone()) {
+            continue;
+        }
         source_files.push(SourceFile {
-            path,
+            path: canonical_path,
             display_path: normalize_display_path(file),
             extension,
         });
@@ -193,4 +200,25 @@ mod tests {
         assert_eq!(format_path(&files[0].display_path), "a.ts");
         fs::remove_dir_all(root).expect("cleanup");
     }
+
+    #[test]
+    fn explicit_files_deduplicate_resolved_paths() {
+        let root = temp_dir("explicit-dedup");
+        fs::write(root.join("a.ts"), "").expect("write ts");
+        let absolute = fs::canonicalize(root.join("a.ts")).expect("canonicalize ts");
+        let files = discover_source_files(
+            &root,
+            &["ts".to_string()],
+            Some(&[
+                PathBuf::from("a.ts"),
+                PathBuf::from(".").join("a.ts"),
+                absolute.clone(),
+            ]),
+        )
+        .expect("discover");
+        assert_eq!(files.len(), 1);
+        assert_eq!(files[0].path, absolute);
+        assert_eq!(format_path(&files[0].display_path), "a.ts");
+        fs::remove_dir_all(root).expect("cleanup");
+    }
 }

From d9982116553426cadb8a5635b55e6a1cda240ad3 Mon Sep 17 00:00:00 2001
From: b4prog <b4prog@gmail.com>
Date: Thu, 25 Jun 2026 13:16:12 +0200
Subject: [PATCH 04/22] [fix] reject overlapping duplicate ranges in the same
 file

---
 src/duplicate.rs | 28 ++++++++++++++++++++++++++++
 1 file changed, 28 insertions(+)

diff --git a/src/duplicate.rs b/src/duplicate.rs
index ef11e35..aadeaaa 100644
--- a/src/duplicate.rs
+++ b/src/duplicate.rs
@@ -142,6 +142,9 @@ fn expand_pair(files: &[ProcessedFile], left: LineRef, right: LineRef) -> Option
         left_end += 1;
         right_end += 1;
     }
+    if left.file_index == right.file_index && left_start <= right_end && right_start <= left_end {
+        return None;
+    }
     let normalized_lines = files[left.file_index].lines[left_start..=left_end]
         .iter()
         .map(|line| line.normalized_text.clone())
@@ -313,4 +316,29 @@ mod tests {
             ["if (ready) {", "}", "return value;"]
         );
     }
+
+    #[test]
+    fn rejects_overlapping_duplicate_ranges_in_the_same_file() {
+        let files = vec![processed_file(
+            "a.ts",
+            "ts",
+            &[
+                ("const value = one;", LineStatus::Comparison),
+                ("const value = one;", LineStatus::Comparison),
+                ("const value = one;", LineStatus::Comparison),
+            ],
+        )];
+        let blocks = detect_duplicate_blocks(&files);
+        assert!(!blocks.iter().any(|block| {
+            block.normalized_lines == ["const value = one;", "const value = one;"]
+                && block
+                    .occurrences
+                    .iter()
+                    .any(|occurrence| occurrence.start_line == 1)
+                && block
+                    .occurrences
+                    .iter()
+                    .any(|occurrence| occurrence.start_line == 2)
+        }));
+    }
 }

From d886d25fe52b5356a7685a24a6baa4e53cddbc40 Mon Sep 17 00:00:00 2001
From: b4prog <b4prog@gmail.com>
Date: Thu, 25 Jun 2026 13:53:04 +0200
Subject: [PATCH 05/22] [docs] document cargo installation from GitHub and
 local source

---
 README.md | 16 ++++++++++++++--
 1 file changed, 14 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 3642e93..0dd54e7 100644
--- a/README.md
+++ b/README.md
@@ -14,13 +14,25 @@ prints a stable plain-text report sorted by duplicate weight.
 
 ## Installation
 
-Build from source with Cargo:
+Install `codem8` from the GitHub source with Cargo:
+
+```bash
+cargo install --git https://github.com/b4prog/CodeM8 codem8
+```
+
+Build from a local checkout with Cargo:
 
 ```bash
 cargo build --release
 ```
 
-Run the local binary:
+Install from a local checkout:
+
+```bash
+cargo install --path .
+```
+
+Run from the local checkout without installing:
 
 ```bash
 cargo run -- --report-duplicate

From a61844bdeb643513d4a31aa50bc19c3fceb78872 Mon Sep 17 00:00:00 2001
From: b4prog <b4prog@gmail.com>
Date: Thu, 25 Jun 2026 14:13:57 +0200
Subject: [PATCH 06/22] [chore] add clippy lint threshold configuration

---
 clippy.toml | 5 +++++
 1 file changed, 5 insertions(+)
 create mode 100644 clippy.toml

diff --git a/clippy.toml b/clippy.toml
new file mode 100644
index 0000000..770b51a
--- /dev/null
+++ b/clippy.toml
@@ -0,0 +1,5 @@
+too-many-lines-threshold = 80
+too-many-arguments-threshold = 5
+type-complexity-threshold = 200
+excessive-nesting-threshold = 4
+cognitive-complexity-threshold = 20

From cbe018a3102f5ddfdba4351a04010fbee9899471 Mon Sep 17 00:00:00 2001
From: b4prog <b4prog@gmail.com>
Date: Thu, 25 Jun 2026 14:19:03 +0200
Subject: [PATCH 07/22] [docs] document agent verification requirements and
 local checks

---
 AGENTS.md | 19 +++++++++++++++++++
 README.md |  3 ++-
 2 files changed, 21 insertions(+), 1 deletion(-)
 create mode 100644 AGENTS.md

diff --git a/AGENTS.md b/AGENTS.md
new file mode 100644
index 0000000..c8ae01f
--- /dev/null
+++ b/AGENTS.md
@@ -0,0 +1,19 @@
+# Agent Instructions
+
+These instructions apply to code agents working in this repository, including Codex.
+
+## Before finishing a change
+
+Run the repository verification commands from the workspace root and fix any issues before handing work back:
+
+```bash
+cargo fmt --all -- --check
+cargo clippy --workspace --all-targets --all-features -- -D warnings -W clippy::too_many_lines -W clippy::too_many_arguments -W clippy::type_complexity -W clippy::excessive_nesting -W clippy::cognitive_complexity
+rtk cargo build --locked --all-targets
+```
+
+## Notes
+
+- Treat Clippy warnings as errors for generated or edited code.
+- Prefer changes that satisfy the repository `clippy.toml` configuration without adding `#[allow(...)]` attributes unless a maintainer explicitly asks for them.
+- If a command cannot be run in the current environment, call that out clearly in the handoff.
diff --git a/README.md b/README.md
index 0dd54e7..ce3d08a 100644
--- a/README.md
+++ b/README.md
@@ -111,7 +111,8 @@ Run the full local verification set:
 
 ```bash
 cargo fmt --all -- --check
-cargo build --all-targets
+cargo clippy --workspace --all-targets --all-features -- -D warnings -W clippy::too_many_lines -W clippy::too_many_arguments -W clippy::type_complexity -W clippy::excessive_nesting -W clippy::cognitive_complexity
+rtk cargo build --locked --all-targets
 cargo test --all-targets
 ```
 

From 945d07c481fdf250b8d0f67ef0edb3f1f84e8897 Mon Sep 17 00:00:00 2001
From: b4prog <b4prog@gmail.com>
Date: Thu, 25 Jun 2026 14:25:04 +0200
Subject: [PATCH 08/22] [refactor] reduce nesting in duplicate and language
 helpers

---
 src/duplicate.rs | 42 ++++++++++++++++++++++++++++--------------
 src/language.rs  | 23 ++++++++++++++++-------
 2 files changed, 44 insertions(+), 21 deletions(-)

diff --git a/src/duplicate.rs b/src/duplicate.rs
index aadeaaa..7b80a04 100644
--- a/src/duplicate.rs
+++ b/src/duplicate.rs
@@ -64,20 +64,7 @@ pub fn detect_duplicate_blocks(files: &[ProcessedFile]) -> Vec<DuplicateBlock> {
             if comparison_refs.len() < 2 {
                 continue;
             }
-            for left_index in 0..comparison_refs.len() {
-                for right_index in (left_index + 1)..comparison_refs.len() {
-                    let left = comparison_refs[left_index];
-                    let right = comparison_refs[right_index];
-                    let Some(candidate) = expand_pair(files, left, right) else {
-                        continue;
-                    };
-                    let occurrences = blocks_by_lines
-                        .entry(candidate.normalized_lines)
-                        .or_default();
-                    occurrences.insert(candidate.left_occurrence);
-                    occurrences.insert(candidate.right_occurrence);
-                }
-            }
+            collect_candidates(files, comparison_refs, &mut blocks_by_lines);
         }
     }
     let mut duplicate_blocks = blocks_by_lines
@@ -118,6 +105,33 @@ struct CandidateBlock {
     right_occurrence: OccurrenceKey,
 }
 
+fn collect_candidates(
+    files: &[ProcessedFile],
+    comparison_refs: &[LineRef],
+    blocks_by_lines: &mut HashMap<Vec<String>, BTreeSet<OccurrenceKey>>,
+) {
+    for left_index in 0..comparison_refs.len() {
+        let left = comparison_refs[left_index];
+        for &right in &comparison_refs[(left_index + 1)..] {
+            let Some(candidate) = expand_pair(files, left, right) else {
+                continue;
+            };
+            store_candidate(candidate, blocks_by_lines);
+        }
+    }
+}
+
+fn store_candidate(
+    candidate: CandidateBlock,
+    blocks_by_lines: &mut HashMap<Vec<String>, BTreeSet<OccurrenceKey>>,
+) {
+    let occurrences = blocks_by_lines
+        .entry(candidate.normalized_lines)
+        .or_default();
+    occurrences.insert(candidate.left_occurrence);
+    occurrences.insert(candidate.right_occurrence);
+}
+
 fn expand_pair(files: &[ProcessedFile], left: LineRef, right: LineRef) -> Option<CandidateBlock> {
     if left == right {
         return None;
diff --git a/src/language.rs b/src/language.rs
index 2dc3dc3..d4c36fa 100644
--- a/src/language.rs
+++ b/src/language.rs
@@ -151,19 +151,28 @@ fn registry() -> &'static BlockOnlyRegistry {
             HashMap::new();
         for language in LANGUAGE_PATTERNS {
             for extension in language.extensions {
-                let patterns_by_hash = by_extension.entry(extension).or_default();
-                for line in language.block_only_lines {
-                    patterns_by_hash
-                        .entry(hash_normalized_line(line))
-                        .or_default()
-                        .push(line);
-                }
+                register_block_only_lines(
+                    by_extension.entry(extension).or_default(),
+                    language.block_only_lines,
+                );
             }
         }
         BlockOnlyRegistry { by_extension }
     })
 }
 
+fn register_block_only_lines(
+    patterns_by_hash: &mut HashMap<u128, Vec<&'static str>>,
+    lines: &'static [&'static str],
+) {
+    for &line in lines {
+        patterns_by_hash
+            .entry(hash_normalized_line(line))
+            .or_default()
+            .push(line);
+    }
+}
+
 #[cfg(test)]
 mod tests {
     use super::*;

From 9417bae460688c08511401eb216873795d0adfcb Mon Sep 17 00:00:00 2001
From: b4prog <b4prog@gmail.com>
Date: Thu, 25 Jun 2026 14:39:09 +0200
Subject: [PATCH 09/22] [chore] satisfy stricter clippy lint requirements

---
 AGENTS.md        |  2 +-
 Cargo.toml       |  3 ++-
 src/cli.rs       | 19 ++++++++++++++++++-
 src/discovery.rs | 30 +++++++++++++++++-------------
 src/error.rs     |  4 +++-
 src/language.rs  |  2 ++
 src/lib.rs       |  6 ++++++
 src/line.rs      | 13 ++++++++++++-
 src/model.rs     |  4 +++-
 src/paths.rs     |  2 ++
 src/report.rs    | 34 ++++++++++++++++++++--------------
 11 files changed, 86 insertions(+), 33 deletions(-)

diff --git a/AGENTS.md b/AGENTS.md
index c8ae01f..9b718e5 100644
--- a/AGENTS.md
+++ b/AGENTS.md
@@ -8,7 +8,7 @@ Run the repository verification commands from the workspace root and fix any iss
 
 ```bash
 cargo fmt --all -- --check
-cargo clippy --workspace --all-targets --all-features -- -D warnings -W clippy::too_many_lines -W clippy::too_many_arguments -W clippy::type_complexity -W clippy::excessive_nesting -W clippy::cognitive_complexity
+cargo clippy --workspace --all-targets --all-features -- -D warnings -W clippy::too_many_lines -W clippy::too_many_arguments -W clippy::type_complexity -W clippy::excessive_nesting -W clippy::cognitive_complexity -W clippy::pedantic -W clippy::nursery -W clippy::cargo
 rtk cargo build --locked --all-targets
 ```
 
diff --git a/Cargo.toml b/Cargo.toml
index 1d7336b..a1ad11d 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -5,7 +5,8 @@ edition = "2021"
 license = "MIT"
 description = "A deterministic source code analysis CLI for duplicate code reports."
 repository = "https://github.com/b4prog/CodeM8"
+keywords = ["cli", "duplicate-detection", "source-code", "analysis"]
+categories = ["command-line-utilities", "development-tools"]
 
 [dependencies]
 xxhash-rust = { version = "0.8", features = ["xxh3"] }
-
diff --git a/src/cli.rs b/src/cli.rs
index 934c274..28dd729 100644
--- a/src/cli.rs
+++ b/src/cli.rs
@@ -11,6 +11,12 @@ pub struct CliConfig {
     pub files: Option<Vec<PathBuf>>,
 }
 
+/// Parses command-line arguments into a validated CLI configuration.
+///
+/// # Errors
+///
+/// Returns an error when the arguments are invalid, repeated, or missing the
+/// required report switch.
 pub fn parse_args<I, S>(args: I) -> Result<CliConfig>
 where
     I: IntoIterator<Item = S>,
@@ -57,13 +63,19 @@ where
         file_extensions: file_extensions.unwrap_or_else(|| {
             DEFAULT_FILE_EXTENSIONS
                 .iter()
-                .map(|extension| extension.to_string())
+                .map(std::string::ToString::to_string)
                 .collect()
         }),
         files,
     })
 }
 
+/// Parses a comma-separated list of file extensions.
+///
+/// # Errors
+///
+/// Returns an error when an extension is empty, starts with `.`, or contains a
+/// path separator.
 pub fn parse_file_extensions(value: &str) -> Result<Vec<String>> {
     let mut extensions = Vec::new();
     for raw_extension in value.split(',') {
@@ -92,6 +104,11 @@ pub fn parse_file_extensions(value: &str) -> Result<Vec<String>> {
     Ok(extensions)
 }
 
+/// Parses a comma-separated list of explicit file paths.
+///
+/// # Errors
+///
+/// Returns an error when any provided file path is empty.
 pub fn parse_file_list(value: &str) -> Result<Vec<PathBuf>> {
     let mut files = Vec::new();
     for raw_file in value.split(',') {
diff --git a/src/discovery.rs b/src/discovery.rs
index 1244180..a976c56 100644
--- a/src/discovery.rs
+++ b/src/discovery.rs
@@ -20,18 +20,23 @@ const IGNORED_DIRECTORIES: &[&str] = &[
     ".vscode",
 ];
 
+/// Discovers source files that match the selected extensions.
+///
+/// # Errors
+///
+/// Returns an error when explicit files are invalid or when walking the file
+/// tree fails.
 pub fn discover_source_files(
     current_dir: &Path,
     extensions: &[String],
     explicit_files: Option<&[PathBuf]>,
 ) -> Result<Vec<SourceFile>> {
-    let mut source_files = match explicit_files {
-        Some(files) => discover_explicit_files(current_dir, extensions, files)?,
-        None => {
-            let mut source_files = Vec::new();
-            walk_directory(current_dir, current_dir, extensions, &mut source_files)?;
-            source_files
-        }
+    let mut source_files = if let Some(files) = explicit_files {
+        discover_explicit_files(current_dir, extensions, files)?
+    } else {
+        let mut source_files = Vec::new();
+        walk_directory(current_dir, current_dir, extensions, &mut source_files)?;
+        source_files
     };
     source_files.sort_by(|left, right| {
         format_path(&left.display_path).cmp(&format_path(&right.display_path))
@@ -81,7 +86,7 @@ fn discover_explicit_files(
             continue;
         };
         let canonical_path = fs::canonicalize(&path)
-            .map_err(|error| CodeM8Error::io(&path, "canonicalize explicit file", error))?;
+            .map_err(|error| CodeM8Error::io(&path, "canonicalize explicit file", &error))?;
         if !seen_paths.insert(canonical_path.clone()) {
             continue;
         }
@@ -101,9 +106,9 @@ fn walk_directory(
     source_files: &mut Vec<SourceFile>,
 ) -> Result<()> {
     let mut entries = fs::read_dir(directory)
-        .map_err(|error| CodeM8Error::io(directory, "read directory", error))?
+        .map_err(|error| CodeM8Error::io(directory, "read directory", &error))?
         .collect::<std::result::Result<Vec<_>, _>>()
-        .map_err(|error| CodeM8Error::io(directory, "read directory entry", error))?;
+        .map_err(|error| CodeM8Error::io(directory, "read directory entry", &error))?;
     entries.sort_by(|left, right| {
         left.file_name()
             .to_string_lossy()
@@ -113,7 +118,7 @@ fn walk_directory(
         let path = entry.path();
         let file_type = entry
             .file_type()
-            .map_err(|error| CodeM8Error::io(&path, "inspect path", error))?;
+            .map_err(|error| CodeM8Error::io(&path, "inspect path", &error))?;
         if file_type.is_symlink() {
             continue;
         }
@@ -129,8 +134,7 @@ fn walk_directory(
             };
             let display_path = path
                 .strip_prefix(root)
-                .map(normalize_display_path)
-                .unwrap_or_else(|_| normalize_display_path(&path));
+                .map_or_else(|_| normalize_display_path(&path), normalize_display_path);
             source_files.push(SourceFile {
                 path,
                 display_path,
diff --git a/src/error.rs b/src/error.rs
index fcb7545..dc57d67 100644
--- a/src/error.rs
+++ b/src/error.rs
@@ -13,13 +13,15 @@ pub struct CodeM8Error {
 }
 
 impl CodeM8Error {
+    #[must_use]
     pub fn new(message: impl Into<String>) -> Self {
         Self {
             message: message.into(),
         }
     }
 
-    pub fn io(path: &Path, action: &str, error: io::Error) -> Self {
+    #[must_use]
+    pub fn io(path: &Path, action: &str, error: &io::Error) -> Self {
         Self::new(format!("could not {action} {}: {error}", format_path(path)))
     }
 }
diff --git a/src/language.rs b/src/language.rs
index d4c36fa..b74ba57 100644
--- a/src/language.rs
+++ b/src/language.rs
@@ -126,10 +126,12 @@ struct BlockOnlyRegistry {
 
 static BLOCK_ONLY_REGISTRY: OnceLock<BlockOnlyRegistry> = OnceLock::new();
 
+#[must_use]
 pub fn hash_normalized_line(line: &str) -> u128 {
     xxhash_rust::xxh3::xxh3_128(line.as_bytes())
 }
 
+#[must_use]
 pub fn classify_line(extension: &str, normalized_line: &str, hash: u128) -> LineStatus {
     let extension = extension.to_ascii_lowercase();
     let Some(patterns_by_hash) = registry().by_extension.get(extension.as_str()) else {
diff --git a/src/lib.rs b/src/lib.rs
index 805bc69..e13d185 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -13,6 +13,12 @@ use std::path::Path;
 
 use crate::error::{CodeM8Error, Result};
 
+/// Runs the CLI workflow and writes the selected report to the provided writer.
+///
+/// # Errors
+///
+/// Returns an error when argument parsing, file discovery, file processing, or
+/// report writing fails.
 pub fn run<I, S, W>(args: I, current_dir: &Path, writer: &mut W) -> Result<()>
 where
     I: IntoIterator<Item = S>,
diff --git a/src/line.rs b/src/line.rs
index 894faaf..b33a4a6 100644
--- a/src/line.rs
+++ b/src/line.rs
@@ -5,13 +5,23 @@ use crate::error::{CodeM8Error, Result};
 use crate::language::{classify_line, hash_normalized_line};
 use crate::model::{LineEntry, ProcessedFile, SourceFile};
 
+/// Processes a set of source files into normalized line entries.
+///
+/// # Errors
+///
+/// Returns an error when any input file cannot be opened or read as UTF-8 text.
 pub fn process_source_files(source_files: &[SourceFile]) -> Result<Vec<ProcessedFile>> {
     source_files.iter().map(process_source_file).collect()
 }
 
+/// Processes one source file into its normalized, classified lines.
+///
+/// # Errors
+///
+/// Returns an error when the file cannot be opened or read as UTF-8 text.
 pub fn process_source_file(source_file: &SourceFile) -> Result<ProcessedFile> {
     let file = File::open(&source_file.path)
-        .map_err(|error| CodeM8Error::io(&source_file.display_path, "open file", error))?;
+        .map_err(|error| CodeM8Error::io(&source_file.display_path, "open file", &error))?;
     let reader = BufReader::new(file);
     let mut lines = Vec::new();
     for (index, line) in reader.lines().enumerate() {
@@ -40,6 +50,7 @@ pub fn process_source_file(source_file: &SourceFile) -> Result<ProcessedFile> {
     })
 }
 
+#[must_use]
 pub fn normalize_line(line: &str) -> Option<String> {
     let normalized = line.trim();
     if normalized.is_empty() {
diff --git a/src/model.rs b/src/model.rs
index e7a1248..2a1b195 100644
--- a/src/model.rs
+++ b/src/model.rs
@@ -43,10 +43,12 @@ pub struct DuplicateBlock {
 }
 
 impl DuplicateBlock {
-    pub fn line_count(&self) -> usize {
+    #[must_use]
+    pub const fn line_count(&self) -> usize {
         self.normalized_lines.len()
     }
 
+    #[must_use]
     pub fn character_count(&self) -> u64 {
         self.normalized_lines
             .iter()
diff --git a/src/paths.rs b/src/paths.rs
index 56e1812..777473f 100644
--- a/src/paths.rs
+++ b/src/paths.rs
@@ -1,9 +1,11 @@
 use std::path::{Component, Path, PathBuf};
 
+#[must_use]
 pub fn format_path(path: &Path) -> String {
     path.to_string_lossy().replace('\\', "/")
 }
 
+#[must_use]
 pub fn normalize_display_path(path: &Path) -> PathBuf {
     let mut normalized = PathBuf::new();
     for component in path.components() {
diff --git a/src/report.rs b/src/report.rs
index f6207c3..126a1cc 100644
--- a/src/report.rs
+++ b/src/report.rs
@@ -1,3 +1,5 @@
+use std::fmt::Write as _;
+
 use crate::model::DuplicateBlock;
 use crate::paths::format_path;
 
@@ -8,33 +10,37 @@ pub struct DuplicateReport {
     pub duplicate_blocks: Vec<DuplicateBlock>,
 }
 
+#[must_use]
 pub fn render_duplicate_report(report: &DuplicateReport) -> String {
     let mut output = String::new();
     output.push_str("Duplicate Code Report\n");
     output.push_str("=====================\n\n");
-    output.push_str(&format!("Analyzed files: {}\n", report.analyzed_files));
-    output.push_str(&format!(
-        "Analyzed extensions: {}\n",
+    let _ = writeln!(output, "Analyzed files: {}", report.analyzed_files);
+    let _ = writeln!(
+        output,
+        "Analyzed extensions: {}",
         report.analyzed_extensions.join(", ")
-    ));
-    output.push_str(&format!(
-        "Duplicate blocks found: {}\n",
+    );
+    let _ = writeln!(
+        output,
+        "Duplicate blocks found: {}",
         report.duplicate_blocks.len()
-    ));
+    );
     for (index, block) in report.duplicate_blocks.iter().enumerate() {
         output.push('\n');
-        output.push_str(&format!("#{} Weight: {}\n", index + 1, block.weight));
-        output.push_str(&format!("Lines: {}\n", block.line_count()));
-        output.push_str(&format!("Characters: {}\n", block.character_count()));
-        output.push_str(&format!("Occurrences: {}\n\n", block.occurrences.len()));
+        let _ = writeln!(output, "#{} Weight: {}", index + 1, block.weight);
+        let _ = writeln!(output, "Lines: {}", block.line_count());
+        let _ = writeln!(output, "Characters: {}", block.character_count());
+        let _ = writeln!(output, "Occurrences: {}\n", block.occurrences.len());
         output.push_str("Locations:\n");
         for occurrence in &block.occurrences {
-            output.push_str(&format!(
-                "- {}:{}-{}\n",
+            let _ = writeln!(
+                output,
+                "- {}:{}-{}",
                 format_path(&occurrence.file_path),
                 occurrence.start_line,
                 occurrence.end_line
-            ));
+            );
         }
         output.push_str("\nCode:\n");
         for line in &block.normalized_lines {

From 5e775700ea98e390e802500783a5cc61bc733de1 Mon Sep 17 00:00:00 2001
From: b4prog <b4prog@gmail.com>
Date: Thu, 25 Jun 2026 14:45:55 +0200
Subject: [PATCH 10/22] [ci] add Clippy validation to the Rust CI workflow

---
 .github/workflows/ci.yml | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index f8eae35..9dac4b1 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -19,14 +19,16 @@ jobs:
         uses: actions/checkout@v4
 
       - name: Install Rust toolchain
-        run: rustup toolchain install stable --profile minimal --component rustfmt
+        run: rustup toolchain install stable --profile minimal --component rustfmt --component clippy
 
       - name: Check formatting
         run: cargo fmt --all -- --check
 
+      - name: Run Clippy
+        run: cargo clippy --workspace --all-targets --all-features -- -D warnings -W clippy::too_many_lines -W clippy::too_many_arguments -W clippy::type_complexity -W clippy::excessive_nesting -W clippy::cognitive_complexity -W clippy::pedantic -W clippy::nursery -W clippy::cargo
+
       - name: Build
         run: cargo build --locked --all-targets
 
       - name: Test
         run: cargo test --locked --all-targets
-

From 436936f7a8783a3bc780b45e54699da51e3dd02e Mon Sep 17 00:00:00 2001
From: b4prog <b4prog@gmail.com>
Date: Thu, 25 Jun 2026 15:57:52 +0200
Subject: [PATCH 11/22] [ci] enable CodeRabbit request changes approval
 workflow

---
 .coderabbit.yaml | 2 ++
 1 file changed, 2 insertions(+)
 create mode 100644 .coderabbit.yaml

diff --git a/.coderabbit.yaml b/.coderabbit.yaml
new file mode 100644
index 0000000..11bd52f
--- /dev/null
+++ b/.coderabbit.yaml
@@ -0,0 +1,2 @@
+reviews:
+  request_changes_workflow: true

From f9054a0735e7297c1ad2769166a49e56a76565a3 Mon Sep 17 00:00:00 2001
From: b4prog <b4prog@gmail.com>
Date: Thu, 25 Jun 2026 16:10:03 +0200
Subject: [PATCH 12/22] [ci] enable detailed CodeRabbit reviews and disable
 poems

---
 .coderabbit.yaml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/.coderabbit.yaml b/.coderabbit.yaml
index 11bd52f..f7b3a5d 100644
--- a/.coderabbit.yaml
+++ b/.coderabbit.yaml
@@ -1,2 +1,4 @@
 reviews:
   request_changes_workflow: true
+  review_details: true
+  poem: false

From 1ea12699f24b31fa14ebb2a3f5848d551fb008bc Mon Sep 17 00:00:00 2001
From: b4prog <b4prog@gmail.com>
Date: Thu, 25 Jun 2026 16:51:27 +0200
Subject: [PATCH 13/22] [test] add coverage for parser, discovery, duplicate,
 and path edge cases

---
 src/cli.rs       | 44 ++++++++++++++++++++++++++++++++++++++++
 src/discovery.rs | 13 ++++++++++++
 src/duplicate.rs | 52 ++++++++++++++++++++++++++++++++++++++++++++++++
 src/line.rs      | 19 ++++++++++++++++++
 src/paths.rs     | 26 ++++++++++++++++++++++++
 5 files changed, 154 insertions(+)

diff --git a/src/cli.rs b/src/cli.rs
index 28dd729..3fa99a1 100644
--- a/src/cli.rs
+++ b/src/cli.rs
@@ -154,12 +154,48 @@ mod tests {
         assert!(error.to_string().contains("must not start with a dot"));
     }
 
+    #[test]
+    fn rejects_extensions_with_path_separators() {
+        let error = parse_file_extensions("src/ts").expect_err("path-like extension fails");
+        assert!(error
+            .to_string()
+            .contains("must not contain path separators"));
+    }
+
     #[test]
     fn rejects_missing_report_switch() {
         let error = parse_args(["-file-extension=rs"]).expect_err("missing report fails");
         assert!(error.to_string().contains("no report switch provided"));
     }
 
+    #[test]
+    fn rejects_unknown_arguments() {
+        let error = parse_args(["--report-duplicate", "--verbose"]).expect_err("unknown arg fails");
+        assert!(error.to_string().contains("unknown argument: --verbose"));
+    }
+
+    #[test]
+    fn rejects_repeated_file_extension_arguments() {
+        let error = parse_args([
+            "--report-duplicate",
+            "-file-extension=ts",
+            "--file-extension=js",
+        ])
+        .expect_err("repeated extensions fail");
+        assert!(error
+            .to_string()
+            .contains("file extensions were provided more than once"));
+    }
+
+    #[test]
+    fn rejects_repeated_explicit_file_arguments() {
+        let error = parse_args(["--report-duplicate", "-files=a.ts", "--files=b.ts"])
+            .expect_err("repeated explicit files fail");
+        assert!(error
+            .to_string()
+            .contains("explicit files were provided more than once"));
+    }
+
     #[test]
     fn parses_explicit_file_list() {
         let files = parse_file_list("src/a.ts, ./src/b.ts").expect("files parse");
@@ -168,4 +204,12 @@ mod tests {
             [PathBuf::from("src/a.ts"), PathBuf::from("./src/b.ts")]
         );
     }
+
+    #[test]
+    fn rejects_empty_explicit_file_paths() {
+        let error = parse_file_list("src/a.ts, ").expect_err("empty explicit file fails");
+        assert!(error
+            .to_string()
+            .contains("file path values must not be empty"));
+    }
 }
diff --git a/src/discovery.rs b/src/discovery.rs
index a976c56..0424979 100644
--- a/src/discovery.rs
+++ b/src/discovery.rs
@@ -225,4 +225,17 @@ mod tests {
         assert_eq!(format_path(&files[0].display_path), "a.ts");
         fs::remove_dir_all(root).expect("cleanup");
     }
+
+    #[test]
+    fn explicit_files_reject_directories() {
+        let root = temp_dir("explicit-directory");
+        fs::create_dir_all(root.join("src")).expect("create explicit directory");
+        let error =
+            discover_source_files(&root, &["ts".to_string()], Some(&[PathBuf::from("src")]))
+                .expect_err("directory explicit file fails");
+        assert!(error
+            .to_string()
+            .contains("explicit file is a directory: src"));
+        fs::remove_dir_all(root).expect("cleanup");
+    }
 }
diff --git a/src/duplicate.rs b/src/duplicate.rs
index 7b80a04..f01bca7 100644
--- a/src/duplicate.rs
+++ b/src/duplicate.rs
@@ -291,6 +291,58 @@ mod tests {
         );
     }
 
+    #[test]
+    fn ignores_matching_hashes_with_different_text() {
+        let mut files = vec![
+            processed_file(
+                "a.ts",
+                "ts",
+                &[("const value = one;", LineStatus::Comparison)],
+            ),
+            processed_file(
+                "b.ts",
+                "ts",
+                &[("const value = two;", LineStatus::Comparison)],
+            ),
+        ];
+        files[1].lines[0].hash = files[0].lines[0].hash;
+        let blocks = detect_duplicate_blocks(&files);
+        assert!(blocks.is_empty());
+    }
+
+    #[test]
+    fn sorts_duplicate_blocks_by_weight() {
+        let files = vec![
+            processed_file(
+                "a.ts",
+                "ts",
+                &[
+                    ("const longerValue = computeOne();", LineStatus::Comparison),
+                    ("return longerValue;", LineStatus::Comparison),
+                    ("const uniqueA = true;", LineStatus::Comparison),
+                    ("const x = 1;", LineStatus::Comparison),
+                ],
+            ),
+            processed_file(
+                "b.ts",
+                "ts",
+                &[
+                    ("const longerValue = computeOne();", LineStatus::Comparison),
+                    ("return longerValue;", LineStatus::Comparison),
+                    ("const uniqueB = true;", LineStatus::Comparison),
+                    ("const x = 1;", LineStatus::Comparison),
+                ],
+            ),
+        ];
+        let blocks = detect_duplicate_blocks(&files);
+        assert!(blocks.len() >= 2);
+        assert_eq!(
+            blocks[0].normalized_lines,
+            ["const longerValue = computeOne();", "return longerValue;"]
+        );
+        assert!(blocks[0].weight >= blocks[1].weight);
+    }
+
     #[test]
     fn ignores_single_line_duplicates_that_are_only_block_only_lines() {
         let files = vec![
diff --git a/src/line.rs b/src/line.rs
index b33a4a6..92dc0f5 100644
--- a/src/line.rs
+++ b/src/line.rs
@@ -96,4 +96,23 @@ mod tests {
         assert_eq!(processed.lines[1].status, LineStatus::BlockOnly);
         fs::remove_file(path).expect("cleanup");
     }
+
+    #[test]
+    fn returns_clear_error_for_invalid_utf8() {
+        let path = std::env::temp_dir().join(format!(
+            "codem8-line-invalid-utf8-{}.ts",
+            std::process::id()
+        ));
+        fs::write(&path, [0xff, b'\n']).expect("write invalid source file");
+        let source = SourceFile {
+            path: path.clone(),
+            display_path: "invalid.ts".into(),
+            extension: "ts".to_string(),
+        };
+        let error = process_source_file(&source).expect_err("invalid UTF-8 fails");
+        assert!(error
+            .to_string()
+            .contains("could not read invalid.ts as UTF-8 text"));
+        fs::remove_file(path).expect("cleanup");
+    }
 }
diff --git a/src/paths.rs b/src/paths.rs
index 777473f..f55926f 100644
--- a/src/paths.rs
+++ b/src/paths.rs
@@ -22,3 +22,29 @@ pub fn normalize_display_path(path: &Path) -> PathBuf {
         normalized
     }
 }
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn formats_paths_with_forward_slashes() {
+        assert_eq!(
+            format_path(Path::new("src\\nested\\a.ts")),
+            "src/nested/a.ts"
+        );
+    }
+
+    #[test]
+    fn normalizes_display_paths_without_losing_parent_segments() {
+        assert_eq!(
+            normalize_display_path(Path::new("./src/../a.ts")),
+            PathBuf::from("src").join("..").join("a.ts")
+        );
+    }
+
+    #[test]
+    fn normalizes_empty_display_path_to_current_directory() {
+        assert_eq!(normalize_display_path(Path::new(".")), PathBuf::from("."));
+    }
+}

From b9ecef27bf5ae10c76efa67ead311cb4ff7f024a Mon Sep 17 00:00:00 2001
From: b4prog <b4prog@gmail.com>
Date: Thu, 25 Jun 2026 17:54:01 +0200
Subject: [PATCH 14/22] [test] make duplicate sort fixture fail without sorting

---
 src/duplicate.rs | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/duplicate.rs b/src/duplicate.rs
index f01bca7..6414768 100644
--- a/src/duplicate.rs
+++ b/src/duplicate.rs
@@ -317,20 +317,20 @@ mod tests {
                 "a.ts",
                 "ts",
                 &[
+                    ("const x = 1;", LineStatus::Comparison),
+                    ("const uniqueA = true;", LineStatus::Comparison),
                     ("const longerValue = computeOne();", LineStatus::Comparison),
                     ("return longerValue;", LineStatus::Comparison),
-                    ("const uniqueA = true;", LineStatus::Comparison),
-                    ("const x = 1;", LineStatus::Comparison),
                 ],
             ),
             processed_file(
                 "b.ts",
                 "ts",
                 &[
+                    ("const x = 1;", LineStatus::Comparison),
+                    ("const uniqueB = true;", LineStatus::Comparison),
                     ("const longerValue = computeOne();", LineStatus::Comparison),
                     ("return longerValue;", LineStatus::Comparison),
-                    ("const uniqueB = true;", LineStatus::Comparison),
-                    ("const x = 1;", LineStatus::Comparison),
                 ],
             ),
         ];

From 38662c82dbdb292feba6745b6e4b2a36b3e89a3e Mon Sep 17 00:00:00 2001
From: b4prog <b4prog@gmail.com>
Date: Thu, 25 Jun 2026 18:23:54 +0200
Subject: [PATCH 15/22] [feat] add CLI help output for duplicate reports

---
 src/cli.rs      | 92 ++++++++++++++++++++++++++++++++++++++++++++++++-
 src/error.rs    | 15 ++++++++
 src/language.rs | 17 ++++++++-
 src/lib.rs      | 48 +++++++++++++++++---------
 src/main.rs     |  4 +++
 5 files changed, 157 insertions(+), 19 deletions(-)

diff --git a/src/cli.rs b/src/cli.rs
index 3fa99a1..1cdb894 100644
--- a/src/cli.rs
+++ b/src/cli.rs
@@ -3,6 +3,51 @@ use std::path::PathBuf;
 use crate::error::{CodeM8Error, Result};
 
 const DEFAULT_FILE_EXTENSIONS: &[&str] = &["ts"];
+const HELP_TEXT: &str = "\
+CodeM8 - deterministic source code analysis reports.
+
+USAGE:
+  codem8 help
+  codem8 --report-duplicate [OPTIONS]
+
+COMMANDS:
+  help
+      Display this detailed documentation.
+
+REQUIRED REPORT SWITCHES:
+  --report-duplicate
+      Analyze source files and print a duplicate code report.
+
+OPTIONS:
+  -file-extension=<extensions>
+  --file-extension=<extensions>
+      Comma-separated source file extensions to analyze.
+      Defaults to: ts
+      Examples: -file-extension=ts,tsx,js,jsx
+
+  -files=<paths>
+  --files=<paths>
+      Comma-separated explicit files to analyze instead of recursively
+      discovering files from the current directory.
+      Example: -files=src/a.ts,src/b.js
+
+DUPLICATE REPORT PURPOSE:
+  The duplicate report helps you find repeated code that may be worth
+  refactoring, reviewing, or consolidating. It lists each duplicated block with
+  the files and line ranges where it appears, making it easier to compare the
+  repeated code and decide whether it should stay duplicated.
+
+EXAMPLES:
+  codem8 --report-duplicate
+  codem8 --report-duplicate -file-extension=ts,tsx,js,jsx
+  codem8 --report-duplicate -file-extension=ts,js -files=src/a.ts,src/b.js
+";
+
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub enum CliCommand {
+    Help,
+    ReportDuplicate(CliConfig),
+}
 
 #[derive(Debug, Clone, PartialEq, Eq)]
 pub struct CliConfig {
@@ -11,6 +56,29 @@ pub struct CliConfig {
     pub files: Option<Vec<PathBuf>>,
 }
 
+#[must_use]
+pub const fn help_text() -> &'static str {
+    HELP_TEXT
+}
+
+/// Resolves raw command-line arguments to the [`CliCommand`] to execute.
+///
+/// # Errors
+///
+/// A lone `help`, `--help`, or `-h` never fails; any other input is rejected
+/// when it is invalid, repeated, or lacks the required report switch.
+pub fn parse_command<I, S>(args: I) -> Result<CliCommand>
+where
+    I: IntoIterator<Item = S>,
+    S: Into<String>,
+{
+    let arguments: Vec<String> = args.into_iter().map(Into::into).collect();
+    match arguments.as_slice() {
+        [only] if is_help_argument(only) => Ok(CliCommand::Help),
+        _ => parse_args(arguments).map(CliCommand::ReportDuplicate),
+    }
+}
+
 /// Parses command-line arguments into a validated CLI configuration.
 ///
 /// # Errors
@@ -54,7 +122,7 @@ where
         }
     }
     if !report_duplicate {
-        return Err(CodeM8Error::new(
+        return Err(CodeM8Error::with_help(
             "no report switch provided; pass --report-duplicate",
         ));
     }
@@ -124,10 +192,30 @@ pub fn parse_file_list(value: &str) -> Result<Vec<PathBuf>> {
     Ok(files)
 }
 
+fn is_help_argument(arg: &str) -> bool {
+    ["help", "--help", "-h"].contains(&arg)
+}
+
 #[cfg(test)]
 mod tests {
     use super::*;
 
+    #[test]
+    fn parses_help_command() {
+        let command = parse_command(["help"]).expect("help parses");
+        assert_eq!(command, CliCommand::Help);
+    }
+
+    #[test]
+    fn exposes_detailed_help_text() {
+        assert!(help_text().contains("USAGE:"));
+        assert!(help_text().contains("--report-duplicate"));
+        assert!(help_text().contains("-file-extension=<extensions>"));
+        assert!(help_text().contains("-files=<paths>"));
+        assert!(help_text().contains("helps you find repeated code"));
+        assert!(!help_text().contains("Duplicate weight"));
+    }
+
     #[test]
     fn parses_default_duplicate_report_config() {
         let config = parse_args(["--report-duplicate"]).expect("config parses");
@@ -166,12 +254,14 @@ mod tests {
     fn rejects_missing_report_switch() {
         let error = parse_args(["-file-extension=rs"]).expect_err("missing report fails");
         assert!(error.to_string().contains("no report switch provided"));
+        assert!(error.should_show_help());
     }
 
     #[test]
     fn rejects_unknown_arguments() {
         let error = parse_args(["--report-duplicate", "--verbose"]).expect_err("unknown arg fails");
         assert!(error.to_string().contains("unknown argument: --verbose"));
+        assert!(!error.should_show_help());
     }
 
     #[test]
diff --git a/src/error.rs b/src/error.rs
index dc57d67..bcfe20b 100644
--- a/src/error.rs
+++ b/src/error.rs
@@ -10,6 +10,7 @@ pub type Result<T> = std::result::Result<T, CodeM8Error>;
 #[derive(Debug, Clone, PartialEq, Eq)]
 pub struct CodeM8Error {
     message: String,
+    show_help: bool,
 }
 
 impl CodeM8Error {
@@ -17,6 +18,15 @@ impl CodeM8Error {
     pub fn new(message: impl Into<String>) -> Self {
         Self {
             message: message.into(),
+            show_help: false,
+        }
+    }
+
+    #[must_use]
+    pub fn with_help(message: impl Into<String>) -> Self {
+        Self {
+            message: message.into(),
+            show_help: true,
         }
     }
 
@@ -24,6 +34,11 @@ impl CodeM8Error {
     pub fn io(path: &Path, action: &str, error: &io::Error) -> Self {
         Self::new(format!("could not {action} {}: {error}", format_path(path)))
     }
+
+    #[must_use]
+    pub const fn should_show_help(&self) -> bool {
+        self.show_help
+    }
 }
 
 impl fmt::Display for CodeM8Error {
diff --git a/src/language.rs b/src/language.rs
index b74ba57..60d7985 100644
--- a/src/language.rs
+++ b/src/language.rs
@@ -21,7 +21,22 @@ pub const LANGUAGE_PATTERNS: &[LanguageLinePattern] = &[
     LanguageLinePattern {
         language_name: "Rust",
         extensions: &["rs"],
-        block_only_lines: &["{", "}", "(", ")", "[", "]", ");", "];"],
+        block_only_lines: &[
+            "{",
+            "}",
+            "(",
+            ")",
+            "))",
+            "[",
+            "]",
+            ");",
+            "];",
+            "));",
+            "})?;",
+            "})",
+            "})?;",
+            ".into_iter()",
+        ],
     },
     LanguageLinePattern {
         language_name: "C / C++ / Objective-C",
diff --git a/src/lib.rs b/src/lib.rs
index e13d185..659a0a0 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -25,23 +25,29 @@ where
     S: Into<String>,
     W: Write,
 {
-    let config = cli::parse_args(args)?;
-    if config.report_duplicate {
-        let source_files = discovery::discover_source_files(
-            current_dir,
-            &config.file_extensions,
-            config.files.as_deref(),
-        )?;
-        let processed_files = line::process_source_files(&source_files)?;
-        let duplicate_blocks = duplicate::detect_duplicate_blocks(&processed_files);
-        let report = report::DuplicateReport {
-            analyzed_files: source_files.len(),
-            analyzed_extensions: config.file_extensions,
-            duplicate_blocks,
-        };
-        writer
-            .write_all(report::render_duplicate_report(&report).as_bytes())
-            .map_err(|error| CodeM8Error::new(format!("could not write report output: {error}")))?;
+    match cli::parse_command(args)? {
+        cli::CliCommand::Help => writer
+            .write_all(cli::help_text().as_bytes())
+            .map_err(|error| CodeM8Error::new(format!("could not write help output: {error}")))?,
+        cli::CliCommand::ReportDuplicate(config) => {
+            let source_files = discovery::discover_source_files(
+                current_dir,
+                &config.file_extensions,
+                config.files.as_deref(),
+            )?;
+            let processed_files = line::process_source_files(&source_files)?;
+            let duplicate_blocks = duplicate::detect_duplicate_blocks(&processed_files);
+            let report = report::DuplicateReport {
+                analyzed_files: source_files.len(),
+                analyzed_extensions: config.file_extensions,
+                duplicate_blocks,
+            };
+            writer
+                .write_all(report::render_duplicate_report(&report).as_bytes())
+                .map_err(|error| {
+                    CodeM8Error::new(format!("could not write report output: {error}"))
+                })?;
+        }
     }
     Ok(())
 }
@@ -170,4 +176,12 @@ mod tests {
             .to_string()
             .contains("explicit file does not exist: missing.ts"));
     }
+
+    #[test]
+    fn help_command_prints_documentation() {
+        let project = TempProject::new("help");
+        let output = run_in(&project, &["help"]).expect("help succeeds");
+        assert!(output.contains("USAGE:"));
+        assert!(output.contains("--report-duplicate"));
+    }
 }
diff --git a/src/main.rs b/src/main.rs
index 77dbbac..a6e1e1d 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -18,6 +18,10 @@ fn main() -> ExitCode {
         }
         Err(error) => {
             eprintln!("error: {error}");
+            if error.should_show_help() {
+                eprintln!();
+                eprint!("{}", codem8::cli::help_text());
+            }
             ExitCode::FAILURE
         }
     }

From 17e24d4c95643999ad3a76bc0856eb63abd85798 Mon Sep 17 00:00:00 2001
From: b4prog <b4prog@gmail.com>
Date: Thu, 25 Jun 2026 18:34:14 +0200
Subject: [PATCH 16/22] [refactor] rename duplicate mitigation line patterns

---
 src/language.rs | 51 +++++++++++++++++++++++++------------------------
 1 file changed, 26 insertions(+), 25 deletions(-)

diff --git a/src/language.rs b/src/language.rs
index 60d7985..b69b6fb 100644
--- a/src/language.rs
+++ b/src/language.rs
@@ -7,21 +7,21 @@ use crate::model::LineStatus;
 pub struct LanguageLinePattern {
     pub language_name: &'static str,
     pub extensions: &'static [&'static str],
-    pub block_only_lines: &'static [&'static str],
+    pub duplicate_mitigation_lines: &'static [&'static str],
 }
 
 pub const LANGUAGE_PATTERNS: &[LanguageLinePattern] = &[
     LanguageLinePattern {
         language_name: "TypeScript / JavaScript",
         extensions: &["ts", "tsx", "js", "jsx", "mjs", "cjs"],
-        block_only_lines: &[
+        duplicate_mitigation_lines: &[
             "(", ")", "{", "}", "[", "]", ");", "];", "};", ")};", "}),", "});",
         ],
     },
     LanguageLinePattern {
         language_name: "Rust",
         extensions: &["rs"],
-        block_only_lines: &[
+        duplicate_mitigation_lines: &[
             "{",
             "}",
             "(",
@@ -41,14 +41,14 @@ pub const LANGUAGE_PATTERNS: &[LanguageLinePattern] = &[
     LanguageLinePattern {
         language_name: "C / C++ / Objective-C",
         extensions: &["c", "h", "cpp", "hpp", "cc", "hh", "cxx", "hxx", "m", "mm"],
-        block_only_lines: &[
+        duplicate_mitigation_lines: &[
             "{", "}", "(", ")", "[", "]", ");", "];", "};", "#endif", "#else",
         ],
     },
     LanguageLinePattern {
         language_name: "C#",
         extensions: &["cs"],
-        block_only_lines: &[
+        duplicate_mitigation_lines: &[
             "{",
             "}",
             "(",
@@ -66,47 +66,47 @@ pub const LANGUAGE_PATTERNS: &[LanguageLinePattern] = &[
     LanguageLinePattern {
         language_name: "Java / Kotlin / Scala",
         extensions: &["java", "kt", "kts", "scala", "sc"],
-        block_only_lines: &["{", "}", "(", ")", "[", "]", ");", "];", "};"],
+        duplicate_mitigation_lines: &["{", "}", "(", ")", "[", "]", ");", "];", "};"],
     },
     LanguageLinePattern {
         language_name: "Go",
         extensions: &["go"],
-        block_only_lines: &["{", "}", "(", ")", "[", "]"],
+        duplicate_mitigation_lines: &["{", "}", "(", ")", "[", "]"],
     },
     LanguageLinePattern {
         language_name: "Python",
         extensions: &["py", "pyw"],
-        block_only_lines: &["(", ")", "[", "]", "{", "}"],
+        duplicate_mitigation_lines: &["(", ")", "[", "]", "{", "}"],
     },
     LanguageLinePattern {
         language_name: "Ruby",
         extensions: &["rb"],
-        block_only_lines: &["(", ")", "[", "]", "{", "}", "end"],
+        duplicate_mitigation_lines: &["(", ")", "[", "]", "{", "}", "end"],
     },
     LanguageLinePattern {
         language_name: "PHP",
         extensions: &["php", "phtml"],
-        block_only_lines: &["{", "}", "(", ")", "[", "]", ");", "];", "};", "?>"],
+        duplicate_mitigation_lines: &["{", "}", "(", ")", "[", "]", ");", "];", "};", "?>"],
     },
     LanguageLinePattern {
         language_name: "Swift",
         extensions: &["swift"],
-        block_only_lines: &["{", "}", "(", ")", "[", "]", ");", "];"],
+        duplicate_mitigation_lines: &["{", "}", "(", ")", "[", "]", ");", "];"],
     },
     LanguageLinePattern {
         language_name: "Shell",
         extensions: &["sh", "bash", "zsh", "fish"],
-        block_only_lines: &["then", "do", "done", "fi", "else", "{", "}"],
+        duplicate_mitigation_lines: &["then", "do", "done", "fi", "else", "{", "}"],
     },
     LanguageLinePattern {
         language_name: "PowerShell",
         extensions: &["ps1", "psm1", "psd1"],
-        block_only_lines: &["{", "}", "(", ")", "[", "]", ");"],
+        duplicate_mitigation_lines: &["{", "}", "(", ")", "[", "]", ");"],
     },
     LanguageLinePattern {
         language_name: "HTML / XML",
         extensions: &["html", "htm", "xml", "xhtml", "svg"],
-        block_only_lines: &[
+        duplicate_mitigation_lines: &[
             ">",
             "/>",
             "</div>",
@@ -120,26 +120,27 @@ pub const LANGUAGE_PATTERNS: &[LanguageLinePattern] = &[
     LanguageLinePattern {
         language_name: "CSS / SCSS / Sass / Less",
         extensions: &["css", "scss", "sass", "less"],
-        block_only_lines: &["{", "}", ");"],
+        duplicate_mitigation_lines: &["{", "}", ");"],
     },
     LanguageLinePattern {
         language_name: "SQL",
         extensions: &["sql"],
-        block_only_lines: &["(", ")", ");", ";", "BEGIN", "END"],
+        duplicate_mitigation_lines: &["(", ")", ");", ";", "BEGIN", "END"],
     },
     LanguageLinePattern {
         language_name: "YAML / JSON / TOML",
         extensions: &["yaml", "yml", "json", "toml"],
-        block_only_lines: &["{", "}", "[", "]", "},", "],"],
+        duplicate_mitigation_lines: &["{", "}", "[", "]", "},", "],"],
     },
 ];
 
 #[derive(Debug)]
-struct BlockOnlyRegistry {
+struct DuplicateMitigationLineRegistry {
     by_extension: HashMap<&'static str, HashMap<u128, Vec<&'static str>>>,
 }
 
-static BLOCK_ONLY_REGISTRY: OnceLock<BlockOnlyRegistry> = OnceLock::new();
+static DUPLICATE_MITIGATION_LINE_REGISTRY: OnceLock<DuplicateMitigationLineRegistry> =
+    OnceLock::new();
 
 #[must_use]
 pub fn hash_normalized_line(line: &str) -> u128 {
@@ -162,23 +163,23 @@ pub fn classify_line(extension: &str, normalized_line: &str, hash: u128) -> Line
     }
 }
 
-fn registry() -> &'static BlockOnlyRegistry {
-    BLOCK_ONLY_REGISTRY.get_or_init(|| {
+fn registry() -> &'static DuplicateMitigationLineRegistry {
+    DUPLICATE_MITIGATION_LINE_REGISTRY.get_or_init(|| {
         let mut by_extension: HashMap<&'static str, HashMap<u128, Vec<&'static str>>> =
             HashMap::new();
         for language in LANGUAGE_PATTERNS {
             for extension in language.extensions {
-                register_block_only_lines(
+                register_duplicate_mitigation_lines(
                     by_extension.entry(extension).or_default(),
-                    language.block_only_lines,
+                    language.duplicate_mitigation_lines,
                 );
             }
         }
-        BlockOnlyRegistry { by_extension }
+        DuplicateMitigationLineRegistry { by_extension }
     })
 }
 
-fn register_block_only_lines(
+fn register_duplicate_mitigation_lines(
     patterns_by_hash: &mut HashMap<u128, Vec<&'static str>>,
     lines: &'static [&'static str],
 ) {

From d6178ea80c00e869ff967f06761e4dc1f6cd0f59 Mon Sep 17 00:00:00 2001
From: b4prog <b4prog@gmail.com>
Date: Thu, 25 Jun 2026 18:51:53 +0200
Subject: [PATCH 17/22] [feat] add punctuation duplicate mitigation patterns

---
 src/language.rs | 165 ++++++++++++++++++++++++++++++------------------
 1 file changed, 104 insertions(+), 61 deletions(-)

diff --git a/src/language.rs b/src/language.rs
index b69b6fb..efaaf86 100644
--- a/src/language.rs
+++ b/src/language.rs
@@ -7,6 +7,7 @@ use crate::model::LineStatus;
 pub struct LanguageLinePattern {
     pub language_name: &'static str,
     pub extensions: &'static [&'static str],
+    pub duplicate_mitigation_pattern: &'static [char],
     pub duplicate_mitigation_lines: &'static [&'static str],
 }
 
@@ -14,101 +15,82 @@ pub const LANGUAGE_PATTERNS: &[LanguageLinePattern] = &[
     LanguageLinePattern {
         language_name: "TypeScript / JavaScript",
         extensions: &["ts", "tsx", "js", "jsx", "mjs", "cjs"],
-        duplicate_mitigation_lines: &[
-            "(", ")", "{", "}", "[", "]", ");", "];", "};", ")};", "}),", "});",
-        ],
+        duplicate_mitigation_pattern: &['(', ')', '{', '}', '[', ']', ';', ',', '?', ':', '<', '>'],
+        duplicate_mitigation_lines: &[],
     },
     LanguageLinePattern {
         language_name: "Rust",
         extensions: &["rs"],
-        duplicate_mitigation_lines: &[
-            "{",
-            "}",
-            "(",
-            ")",
-            "))",
-            "[",
-            "]",
-            ");",
-            "];",
-            "));",
-            "})?;",
-            "})",
-            "})?;",
-            ".into_iter()",
-        ],
+        duplicate_mitigation_pattern: &['(', ')', '{', '}', '[', ']', ';', ',', '?', ':', '<', '>'],
+        duplicate_mitigation_lines: &[".into_iter()"],
     },
     LanguageLinePattern {
         language_name: "C / C++ / Objective-C",
         extensions: &["c", "h", "cpp", "hpp", "cc", "hh", "cxx", "hxx", "m", "mm"],
-        duplicate_mitigation_lines: &[
-            "{", "}", "(", ")", "[", "]", ");", "];", "};", "#endif", "#else",
-        ],
+        duplicate_mitigation_pattern: &['(', ')', '{', '}', '[', ']', ';', ',', '?', ':', '<', '>'],
+        duplicate_mitigation_lines: &["#endif", "#else"],
     },
     LanguageLinePattern {
         language_name: "C#",
         extensions: &["cs"],
-        duplicate_mitigation_lines: &[
-            "{",
-            "}",
-            "(",
-            ")",
-            "[",
-            "]",
-            ");",
-            "];",
-            "};",
-            "#endregion",
-            "#else",
-            "#endif",
-        ],
+        duplicate_mitigation_pattern: &['(', ')', '{', '}', '[', ']', ';', ',', '?', ':', '<', '>'],
+        duplicate_mitigation_lines: &["#endregion", "#else", "#endif"],
     },
     LanguageLinePattern {
         language_name: "Java / Kotlin / Scala",
         extensions: &["java", "kt", "kts", "scala", "sc"],
-        duplicate_mitigation_lines: &["{", "}", "(", ")", "[", "]", ");", "];", "};"],
+        duplicate_mitigation_pattern: &['(', ')', '{', '}', '[', ']', ';', ',', '?', ':', '<', '>'],
+        duplicate_mitigation_lines: &[],
     },
     LanguageLinePattern {
         language_name: "Go",
         extensions: &["go"],
-        duplicate_mitigation_lines: &["{", "}", "(", ")", "[", "]"],
+        duplicate_mitigation_pattern: &['(', ')', '{', '}', '[', ']', ';', ',', ':'],
+        duplicate_mitigation_lines: &[],
     },
     LanguageLinePattern {
         language_name: "Python",
         extensions: &["py", "pyw"],
-        duplicate_mitigation_lines: &["(", ")", "[", "]", "{", "}"],
+        duplicate_mitigation_pattern: &['(', ')', '{', '}', '[', ']', ';', ',', ':'],
+        duplicate_mitigation_lines: &[],
     },
     LanguageLinePattern {
         language_name: "Ruby",
         extensions: &["rb"],
-        duplicate_mitigation_lines: &["(", ")", "[", "]", "{", "}", "end"],
+        duplicate_mitigation_pattern: &['(', ')', '{', '}', '[', ']', ';', ',', '?', ':'],
+        duplicate_mitigation_lines: &["end"],
     },
     LanguageLinePattern {
         language_name: "PHP",
         extensions: &["php", "phtml"],
-        duplicate_mitigation_lines: &["{", "}", "(", ")", "[", "]", ");", "];", "};", "?>"],
+        duplicate_mitigation_pattern: &[
+            '(', ')', '{', '}', '[', ']', ';', ',', '?', ':', '<', '>', '/',
+        ],
+        duplicate_mitigation_lines: &[],
     },
     LanguageLinePattern {
         language_name: "Swift",
         extensions: &["swift"],
-        duplicate_mitigation_lines: &["{", "}", "(", ")", "[", "]", ");", "];"],
+        duplicate_mitigation_pattern: &['(', ')', '{', '}', '[', ']', ';', ',', '?', ':', '<', '>'],
+        duplicate_mitigation_lines: &[],
     },
     LanguageLinePattern {
         language_name: "Shell",
         extensions: &["sh", "bash", "zsh", "fish"],
-        duplicate_mitigation_lines: &["then", "do", "done", "fi", "else", "{", "}"],
+        duplicate_mitigation_pattern: &['(', ')', '{', '}', '[', ']', ';', '&', '|'],
+        duplicate_mitigation_lines: &["then", "do", "done", "fi", "else"],
     },
     LanguageLinePattern {
         language_name: "PowerShell",
         extensions: &["ps1", "psm1", "psd1"],
-        duplicate_mitigation_lines: &["{", "}", "(", ")", "[", "]", ");"],
+        duplicate_mitigation_pattern: &['(', ')', '{', '}', '[', ']', ';', ',', '?', ':', '|'],
+        duplicate_mitigation_lines: &[],
     },
     LanguageLinePattern {
         language_name: "HTML / XML",
         extensions: &["html", "htm", "xml", "xhtml", "svg"],
+        duplicate_mitigation_pattern: &['<', '>', '/'],
         duplicate_mitigation_lines: &[
-            ">",
-            "/>",
             "</div>",
             "</span>",
             "</section>",
@@ -120,23 +102,32 @@ pub const LANGUAGE_PATTERNS: &[LanguageLinePattern] = &[
     LanguageLinePattern {
         language_name: "CSS / SCSS / Sass / Less",
         extensions: &["css", "scss", "sass", "less"],
-        duplicate_mitigation_lines: &["{", "}", ");"],
+        duplicate_mitigation_pattern: &['(', ')', '{', '}', '[', ']', ';', ',', ':'],
+        duplicate_mitigation_lines: &[],
     },
     LanguageLinePattern {
         language_name: "SQL",
         extensions: &["sql"],
-        duplicate_mitigation_lines: &["(", ")", ");", ";", "BEGIN", "END"],
+        duplicate_mitigation_pattern: &['(', ')', ';', ',', ':'],
+        duplicate_mitigation_lines: &["BEGIN", "END"],
     },
     LanguageLinePattern {
         language_name: "YAML / JSON / TOML",
         extensions: &["yaml", "yml", "json", "toml"],
-        duplicate_mitigation_lines: &["{", "}", "[", "]", "},", "],"],
+        duplicate_mitigation_pattern: &['{', '}', '[', ']', ','],
+        duplicate_mitigation_lines: &[],
     },
 ];
 
 #[derive(Debug)]
 struct DuplicateMitigationLineRegistry {
-    by_extension: HashMap<&'static str, HashMap<u128, Vec<&'static str>>>,
+    by_extension: HashMap<&'static str, DuplicateMitigationPatterns>,
+}
+
+#[derive(Debug, Default)]
+struct DuplicateMitigationPatterns {
+    lines_by_hash: HashMap<u128, Vec<&'static str>>,
+    character_pattern: Vec<char>,
 }
 
 static DUPLICATE_MITIGATION_LINE_REGISTRY: OnceLock<DuplicateMitigationLineRegistry> =
@@ -150,13 +141,10 @@ pub fn hash_normalized_line(line: &str) -> u128 {
 #[must_use]
 pub fn classify_line(extension: &str, normalized_line: &str, hash: u128) -> LineStatus {
     let extension = extension.to_ascii_lowercase();
-    let Some(patterns_by_hash) = registry().by_extension.get(extension.as_str()) else {
-        return LineStatus::Comparison;
-    };
-    let Some(patterns) = patterns_by_hash.get(&hash) else {
+    let Some(patterns) = registry().by_extension.get(extension.as_str()) else {
         return LineStatus::Comparison;
     };
-    if patterns.contains(&normalized_line) {
+    if patterns.matches_line(normalized_line, hash) {
         LineStatus::BlockOnly
     } else {
         LineStatus::Comparison
@@ -165,20 +153,37 @@ pub fn classify_line(extension: &str, normalized_line: &str, hash: u128) -> Line
 
 fn registry() -> &'static DuplicateMitigationLineRegistry {
     DUPLICATE_MITIGATION_LINE_REGISTRY.get_or_init(|| {
-        let mut by_extension: HashMap<&'static str, HashMap<u128, Vec<&'static str>>> =
-            HashMap::new();
+        let mut by_extension: HashMap<&'static str, DuplicateMitigationPatterns> = HashMap::new();
         for language in LANGUAGE_PATTERNS {
             for extension in language.extensions {
+                let patterns = by_extension.entry(extension).or_default();
                 register_duplicate_mitigation_lines(
-                    by_extension.entry(extension).or_default(),
+                    &mut patterns.lines_by_hash,
                     language.duplicate_mitigation_lines,
                 );
+                register_duplicate_mitigation_pattern(
+                    &mut patterns.character_pattern,
+                    language.duplicate_mitigation_pattern,
+                );
             }
         }
         DuplicateMitigationLineRegistry { by_extension }
     })
 }
 
+impl DuplicateMitigationPatterns {
+    fn matches_line(&self, normalized_line: &str, hash: u128) -> bool {
+        let pattern = self.character_pattern.as_slice();
+        self.matches_registered_line(normalized_line, hash)
+            || matches_duplicate_mitigation_pattern(normalized_line, pattern)
+    }
+
+    fn matches_registered_line(&self, normalized_line: &str, hash: u128) -> bool {
+        let candidates = self.lines_by_hash.get(&hash);
+        candidates.is_some_and(|lines| lines.contains(&normalized_line))
+    }
+}
+
 fn register_duplicate_mitigation_lines(
     patterns_by_hash: &mut HashMap<u128, Vec<&'static str>>,
     lines: &'static [&'static str],
@@ -191,14 +196,33 @@ fn register_duplicate_mitigation_lines(
     }
 }
 
+fn register_duplicate_mitigation_pattern(
+    character_pattern: &mut Vec<char>,
+    characters: &'static [char],
+) {
+    for &character in characters {
+        if !character_pattern.contains(&character) {
+            character_pattern.push(character);
+        }
+    }
+}
+
+/// Returns `true` when every character of `line` is whitespace or listed in
+/// `character_pattern`; an empty pattern never matches.
+fn matches_duplicate_mitigation_pattern(line: &str, character_pattern: &[char]) -> bool {
+    let allowed = |symbol: char| symbol.is_whitespace() || character_pattern.contains(&symbol);
+    !character_pattern.is_empty() && line.chars().all(allowed)
+}
+
 #[cfg(test)]
 mod tests {
     use super::*;
 
     #[test]
-    fn assigns_block_only_status_from_extension_specific_registry() {
-        let hash = hash_normalized_line("}");
-        assert_eq!(classify_line("ts", "}", hash), LineStatus::BlockOnly);
+    fn assigns_block_only_status_from_extension_specific_line_registry() {
+        let line = ".into_iter()";
+        let hash = hash_normalized_line(line);
+        assert_eq!(classify_line("rs", line, hash), LineStatus::BlockOnly);
     }
 
     #[test]
@@ -216,4 +240,23 @@ mod tests {
             LineStatus::Comparison
         );
     }
+
+    #[test]
+    fn assigns_block_only_status_from_character_pattern() {
+        let line = "} \t);";
+        let hash = hash_normalized_line(line);
+        assert_eq!(classify_line("ts", line, hash), LineStatus::BlockOnly);
+    }
+
+    #[test]
+    fn ignores_character_pattern_for_unknown_extensions() {
+        let line = "});";
+        let hash = hash_normalized_line(line);
+        assert_eq!(classify_line("unknown", line, hash), LineStatus::Comparison);
+    }
+
+    #[test]
+    fn empty_character_pattern_does_not_match() {
+        assert!(!matches_duplicate_mitigation_pattern("}", &[]));
+    }
 }

From 03660654034780457c693b605f3dc18a1b91defe Mon Sep 17 00:00:00 2001
From: b4prog <b4prog@gmail.com>
Date: Thu, 25 Jun 2026 19:03:43 +0200
Subject: [PATCH 18/22] [fix] use language registry for default duplicate
 report extensions

---
 src/cli.rs      | 13 ++++---------
 src/language.rs | 23 +++++++++++++++++++++++
 src/lib.rs      | 13 +++++++++----
 3 files changed, 36 insertions(+), 13 deletions(-)

diff --git a/src/cli.rs b/src/cli.rs
index 1cdb894..57a4868 100644
--- a/src/cli.rs
+++ b/src/cli.rs
@@ -1,8 +1,8 @@
 use std::path::PathBuf;
 
 use crate::error::{CodeM8Error, Result};
+use crate::language::supported_file_extensions;
 
-const DEFAULT_FILE_EXTENSIONS: &[&str] = &["ts"];
 const HELP_TEXT: &str = "\
 CodeM8 - deterministic source code analysis reports.
 
@@ -22,7 +22,7 @@ OPTIONS:
   -file-extension=<extensions>
   --file-extension=<extensions>
       Comma-separated source file extensions to analyze.
-      Defaults to: ts
+      Defaults to every extension in the built-in language registry.
       Examples: -file-extension=ts,tsx,js,jsx
 
   -files=<paths>
@@ -128,12 +128,7 @@ where
     }
     Ok(CliConfig {
         report_duplicate,
-        file_extensions: file_extensions.unwrap_or_else(|| {
-            DEFAULT_FILE_EXTENSIONS
-                .iter()
-                .map(std::string::ToString::to_string)
-                .collect()
-        }),
+        file_extensions: file_extensions.unwrap_or_else(supported_file_extensions),
         files,
     })
 }
@@ -220,7 +215,7 @@ mod tests {
     fn parses_default_duplicate_report_config() {
         let config = parse_args(["--report-duplicate"]).expect("config parses");
         assert!(config.report_duplicate);
-        assert_eq!(config.file_extensions, ["ts"]);
+        assert_eq!(config.file_extensions, supported_file_extensions());
         assert_eq!(config.files, None);
     }
 
diff --git a/src/language.rs b/src/language.rs
index efaaf86..a3be521 100644
--- a/src/language.rs
+++ b/src/language.rs
@@ -119,6 +119,19 @@ pub const LANGUAGE_PATTERNS: &[LanguageLinePattern] = &[
     },
 ];
 
+/// Lists every extension in `LANGUAGE_PATTERNS` once, in first-seen order.
+#[must_use]
+pub fn supported_file_extensions() -> Vec<String> {
+    let mut collected: Vec<String> = Vec::new();
+    let registered = LANGUAGE_PATTERNS.iter().flat_map(|entry| entry.extensions);
+    for candidate in registered.copied() {
+        if collected.iter().all(|known| known != candidate) {
+            collected.push(candidate.to_owned());
+        }
+    }
+    collected
+}
+
 #[derive(Debug)]
 struct DuplicateMitigationLineRegistry {
     by_extension: HashMap<&'static str, DuplicateMitigationPatterns>,
@@ -259,4 +272,14 @@ mod tests {
     fn empty_character_pattern_does_not_match() {
         assert!(!matches_duplicate_mitigation_pattern("}", &[]));
     }
+
+    #[test]
+    fn collects_supported_file_extensions_from_language_patterns() {
+        let extensions = supported_file_extensions();
+        for language in LANGUAGE_PATTERNS {
+            for extension in language.extensions {
+                assert!(extensions.iter().any(|selected| selected == extension));
+            }
+        }
+    }
 }
diff --git a/src/lib.rs b/src/lib.rs
index 659a0a0..72704ea 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -115,14 +115,17 @@ mod tests {
             "const value = computeValue(input);\nif (value === undefined) {\nreturn defaultValue;\n}\n",
         );
         let output = run_in(&project, &["--report-duplicate"]).expect("report succeeds");
+        let expected_extensions = language::supported_file_extensions().join(", ");
         assert_eq!(
             output,
-            concat!(
+            [
                 "Duplicate Code Report\n",
                 "=====================\n",
                 "\n",
                 "Analyzed files: 2\n",
-                "Analyzed extensions: ts\n",
+                "Analyzed extensions: ",
+                &expected_extensions,
+                "\n",
                 "Duplicate blocks found: 1\n",
                 "\n",
                 "#1 Weight: 324\n",
@@ -139,7 +142,8 @@ mod tests {
                 "  if (value === undefined) {\n",
                 "  return defaultValue;\n",
                 "  }\n",
-            )
+            ]
+            .concat()
         );
     }
 
@@ -160,7 +164,8 @@ mod tests {
         project.write("src/a.js", "const value = one;\n");
         project.write("src/b.js", "const value = one;\n");
         let default_output = run_in(&project, &["--report-duplicate"]).expect("report succeeds");
-        assert!(default_output.contains("Analyzed files: 0"));
+        assert!(default_output.contains("Analyzed files: 2"));
+        assert!(default_output.contains("Duplicate blocks found: 1"));
         let js_output = run_in(&project, &["--report-duplicate", "-file-extension=js"])
             .expect("report succeeds");
         assert!(js_output.contains("Analyzed files: 2"));

From 606268d0d9e1b12935feef12abac28a50089643e Mon Sep 17 00:00:00 2001
From: b4prog <b4prog@gmail.com>
Date: Thu, 25 Jun 2026 19:25:10 +0200
Subject: [PATCH 19/22] [feat] add verbose mode

---
 README.md       | 11 ++++++++
 src/cli.rs      | 21 +++++++++++++--
 src/language.rs |  6 ++---
 src/lib.rs      | 39 ++++++++++++++++++++-------
 src/report.rs   | 71 ++++++++++++++++++++++++++++++++++++++-----------
 5 files changed, 117 insertions(+), 31 deletions(-)

diff --git a/README.md b/README.md
index ce3d08a..1ee7d5b 100644
--- a/README.md
+++ b/README.md
@@ -58,6 +58,12 @@ Analyze an explicit list of files instead of recursively discovering files:
 codem8 --report-duplicate -file-extension=ts,js -files=src/a.ts,src/b.js
 ```
 
+Include duplicate block metrics:
+
+```bash
+codem8 --report-duplicate --verbose
+```
+
 ## Duplicate Report
 
 By default, CodeM8 analyzes `.ts` files. Recursive discovery skips common
@@ -79,6 +85,11 @@ Duplicate block weight is calculated as:
 Reports are sorted deterministically by descending weight, then by line count,
 character count, first location, and normalized block text.
 
+By default, each duplicate block prints the duplicated code before its
+locations. Use `--verbose` to also show weight, line count, and occurrence
+count. Character counts are used internally for scoring and sorting, but are
+not printed.
+
 ## Language Heuristics
 
 CodeM8 includes a hard-coded registry of block-only line patterns for common
diff --git a/src/cli.rs b/src/cli.rs
index 57a4868..381520b 100644
--- a/src/cli.rs
+++ b/src/cli.rs
@@ -31,6 +31,9 @@ OPTIONS:
       discovering files from the current directory.
       Example: -files=src/a.ts,src/b.js
 
+  --verbose
+      Include duplicate block metrics in report output.
+
 DUPLICATE REPORT PURPOSE:
   The duplicate report helps you find repeated code that may be worth
   refactoring, reviewing, or consolidating. It lists each duplicated block with
@@ -52,6 +55,7 @@ pub enum CliCommand {
 #[derive(Debug, Clone, PartialEq, Eq)]
 pub struct CliConfig {
     pub report_duplicate: bool,
+    pub verbose: bool,
     pub file_extensions: Vec<String>,
     pub files: Option<Vec<PathBuf>>,
 }
@@ -91,12 +95,15 @@ where
     S: Into<String>,
 {
     let mut report_duplicate = false;
+    let mut verbose = false;
     let mut file_extensions = None;
     let mut files = None;
     for arg in args {
         let arg = arg.into();
         if arg == "--report-duplicate" {
             report_duplicate = true;
+        } else if arg == "--verbose" {
+            verbose = true;
         } else if let Some(value) = arg
             .strip_prefix("-file-extension=")
             .or_else(|| arg.strip_prefix("--file-extension="))
@@ -128,6 +135,7 @@ where
     }
     Ok(CliConfig {
         report_duplicate,
+        verbose,
         file_extensions: file_extensions.unwrap_or_else(supported_file_extensions),
         files,
     })
@@ -205,6 +213,7 @@ mod tests {
     fn exposes_detailed_help_text() {
         assert!(help_text().contains("USAGE:"));
         assert!(help_text().contains("--report-duplicate"));
+        assert!(help_text().contains("--verbose"));
         assert!(help_text().contains("-file-extension=<extensions>"));
         assert!(help_text().contains("-files=<paths>"));
         assert!(help_text().contains("helps you find repeated code"));
@@ -215,10 +224,18 @@ mod tests {
     fn parses_default_duplicate_report_config() {
         let config = parse_args(["--report-duplicate"]).expect("config parses");
         assert!(config.report_duplicate);
+        assert!(!config.verbose);
         assert_eq!(config.file_extensions, supported_file_extensions());
         assert_eq!(config.files, None);
     }
 
+    #[test]
+    fn parses_verbose_duplicate_report_config() {
+        let config = parse_args(["--report-duplicate", "--verbose"]).expect("config parses");
+        assert!(config.report_duplicate);
+        assert!(config.verbose);
+    }
+
     #[test]
     fn parses_extensions_case_insensitively_and_trims_whitespace() {
         let extensions = parse_file_extensions(" ts, JS ,tsx,ts ").expect("extensions parse");
@@ -254,8 +271,8 @@ mod tests {
 
     #[test]
     fn rejects_unknown_arguments() {
-        let error = parse_args(["--report-duplicate", "--verbose"]).expect_err("unknown arg fails");
-        assert!(error.to_string().contains("unknown argument: --verbose"));
+        let error = parse_args(["--report-duplicate", "--unknown"]).expect_err("unknown arg fails");
+        assert!(error.to_string().contains("unknown argument: --unknown"));
         assert!(!error.should_show_help());
     }
 
diff --git a/src/language.rs b/src/language.rs
index a3be521..2e75db0 100644
--- a/src/language.rs
+++ b/src/language.rs
@@ -22,7 +22,7 @@ pub const LANGUAGE_PATTERNS: &[LanguageLinePattern] = &[
         language_name: "Rust",
         extensions: &["rs"],
         duplicate_mitigation_pattern: &['(', ')', '{', '}', '[', ']', ';', ',', '?', ':', '<', '>'],
-        duplicate_mitigation_lines: &[".into_iter()"],
+        duplicate_mitigation_lines: &[".into_iter()", "///"],
     },
     LanguageLinePattern {
         language_name: "C / C++ / Objective-C",
@@ -114,8 +114,8 @@ pub const LANGUAGE_PATTERNS: &[LanguageLinePattern] = &[
     LanguageLinePattern {
         language_name: "YAML / JSON / TOML",
         extensions: &["yaml", "yml", "json", "toml"],
-        duplicate_mitigation_pattern: &['{', '}', '[', ']', ','],
-        duplicate_mitigation_lines: &[],
+        duplicate_mitigation_pattern: &['(', ')', '{', '}', '[', ']', ';', ',', '?', ':', '<', '>'],
+        duplicate_mitigation_lines: &["jobs:", "on:"],
     },
 ];
 
diff --git a/src/lib.rs b/src/lib.rs
index 72704ea..c1c2ce9 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -43,7 +43,7 @@ where
                 duplicate_blocks,
             };
             writer
-                .write_all(report::render_duplicate_report(&report).as_bytes())
+                .write_all(report::render_duplicate_report(&report, config.verbose).as_bytes())
                 .map_err(|error| {
                     CodeM8Error::new(format!("could not write report output: {error}"))
                 })?;
@@ -128,25 +128,44 @@ mod tests {
                 "\n",
                 "Duplicate blocks found: 1\n",
                 "\n",
-                "#1 Weight: 324\n",
-                "Lines: 4\n",
-                "Characters: 81\n",
-                "Occurrences: 2\n",
-                "\n",
-                "Locations:\n",
-                "- src/a.ts:1-4\n",
-                "- src/b.ts:1-4\n",
-                "\n",
+                "#1\n",
                 "Code:\n",
                 "  const value = computeValue(input);\n",
                 "  if (value === undefined) {\n",
                 "  return defaultValue;\n",
                 "  }\n",
+                "\n",
+                "Locations:\n",
+                "- src/a.ts:1-4\n",
+                "- src/b.ts:1-4\n",
             ]
             .concat()
         );
     }
 
+    #[test]
+    fn verbose_duplicate_report_includes_metrics_without_characters() {
+        let project = TempProject::new("verbose");
+        project.write(
+            "src/a.ts",
+            "const value = computeValue(input);\nreturn value;\n",
+        );
+        project.write(
+            "src/b.ts",
+            "const value = computeValue(input);\nreturn value;\n",
+        );
+        let output =
+            run_in(&project, &["--report-duplicate", "--verbose"]).expect("report succeeds");
+        assert!(output.contains("Weight:"));
+        assert!(output.contains("Lines: 2"));
+        assert!(output.contains("Occurrences: 2"));
+        assert!(!output.contains("Characters:"));
+        assert!(
+            output.find("Code:").expect("code section exists")
+                < output.find("Locations:").expect("locations section exists")
+        );
+    }
+
     #[test]
     fn explicit_files_disable_recursive_discovery() {
         let project = TempProject::new("explicit-files");
diff --git a/src/report.rs b/src/report.rs
index 126a1cc..1d039e5 100644
--- a/src/report.rs
+++ b/src/report.rs
@@ -11,7 +11,7 @@ pub struct DuplicateReport {
 }
 
 #[must_use]
-pub fn render_duplicate_report(report: &DuplicateReport) -> String {
+pub fn render_duplicate_report(report: &DuplicateReport, verbose: bool) -> String {
     let mut output = String::new();
     output.push_str("Duplicate Code Report\n");
     output.push_str("=====================\n\n");
@@ -28,11 +28,20 @@ pub fn render_duplicate_report(report: &DuplicateReport) -> String {
     );
     for (index, block) in report.duplicate_blocks.iter().enumerate() {
         output.push('\n');
-        let _ = writeln!(output, "#{} Weight: {}", index + 1, block.weight);
-        let _ = writeln!(output, "Lines: {}", block.line_count());
-        let _ = writeln!(output, "Characters: {}", block.character_count());
-        let _ = writeln!(output, "Occurrences: {}\n", block.occurrences.len());
-        output.push_str("Locations:\n");
+        let _ = writeln!(output, "#{}", index + 1);
+        if verbose {
+            let _ = writeln!(output, "Weight: {}", block.weight);
+            let _ = writeln!(output, "Lines: {}", block.line_count());
+            let _ = writeln!(output, "Occurrences: {}", block.occurrences.len());
+            output.push('\n');
+        }
+        output.push_str("Code:\n");
+        for line in &block.normalized_lines {
+            output.push_str("  ");
+            output.push_str(line);
+            output.push('\n');
+        }
+        output.push_str("\nLocations:\n");
         for occurrence in &block.occurrences {
             let _ = writeln!(
                 output,
@@ -42,12 +51,6 @@ pub fn render_duplicate_report(report: &DuplicateReport) -> String {
                 occurrence.end_line
             );
         }
-        output.push_str("\nCode:\n");
-        for line in &block.normalized_lines {
-            output.push_str("  ");
-            output.push_str(line);
-            output.push('\n');
-        }
     }
     output
 }
@@ -68,7 +71,7 @@ mod tests {
             duplicate_blocks: Vec::new(),
         };
         assert_eq!(
-            render_duplicate_report(&report),
+            render_duplicate_report(&report, false),
             "Duplicate Code Report\n\
              =====================\n\
              \n\
@@ -100,10 +103,46 @@ mod tests {
                 weight: 13,
             }],
         };
-        let output = render_duplicate_report(&report);
-        assert!(output.contains("#1 Weight: 13"));
-        assert!(output.contains("Lines: 1"));
+        let output = render_duplicate_report(&report, false);
+        assert!(output.contains("#1\n"));
+        assert!(!output.contains("Weight: 13"));
+        assert!(!output.contains("Lines: 1"));
+        assert!(!output.contains("Occurrences: 2"));
+        assert!(!output.contains("Characters:"));
         assert!(output.contains("- src/a.ts:1-1"));
         assert!(output.contains("  return value;"));
+        assert!(
+            output.find("Code:").expect("code section exists")
+                < output.find("Locations:").expect("locations section exists")
+        );
+    }
+
+    #[test]
+    fn renders_duplicate_block_metrics_in_verbose_mode() {
+        let report = DuplicateReport {
+            analyzed_files: 2,
+            analyzed_extensions: vec!["ts".to_string()],
+            duplicate_blocks: vec![DuplicateBlock {
+                normalized_lines: vec!["return value;".to_string()],
+                occurrences: vec![
+                    DuplicateOccurrence {
+                        file_path: PathBuf::from("src/a.ts"),
+                        start_line: 1,
+                        end_line: 1,
+                    },
+                    DuplicateOccurrence {
+                        file_path: PathBuf::from("src/b.ts"),
+                        start_line: 2,
+                        end_line: 2,
+                    },
+                ],
+                weight: 13,
+            }],
+        };
+        let output = render_duplicate_report(&report, true);
+        assert!(output.contains("Weight: 13"));
+        assert!(output.contains("Lines: 1"));
+        assert!(output.contains("Occurrences: 2"));
+        assert!(!output.contains("Characters:"));
     }
 }

From 49ff176853a9ca3353873f08c297f033963a3869 Mon Sep 17 00:00:00 2001
From: b4prog <b4prog@gmail.com>
Date: Thu, 25 Jun 2026 20:01:47 +0200
Subject: [PATCH 20/22] [feat] add regex duplicate mitigation patterns

---
 Cargo.lock      |  45 ++++++++++++++++++++
 Cargo.toml      |   1 +
 src/language.rs | 110 ++++++++++++++++++++++++++++++++++++++----------
 3 files changed, 133 insertions(+), 23 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 2465298..7b39a5f 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -2,13 +2,58 @@
 # It is not intended for manual editing.
 version = 4
 
+[[package]]
+name = "aho-corasick"
+version = "1.1.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ddd31a130427c27518df266943a5308ed92d4b226cc639f5a8f1002816174301"
+dependencies = [
+ "memchr",
+]
+
 [[package]]
 name = "codem8"
 version = "0.1.0"
 dependencies = [
+ "regex",
  "xxhash-rust",
 ]
 
+[[package]]
+name = "memchr"
+version = "2.8.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "88904434abc2901f197fe8cc55f0445e7ded921dba5911dad2e2b39b48e663c4"
+
+[[package]]
+name = "regex"
+version = "1.12.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f1292b7759ae1cb9ec195452d1390a074f0cd8541ab7a5a8c31cd6db45d4a6ba"
+dependencies = [
+ "aho-corasick",
+ "memchr",
+ "regex-automata",
+ "regex-syntax",
+]
+
+[[package]]
+name = "regex-automata"
+version = "0.4.14"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6e1dd4122fc1595e8162618945476892eefca7b88c52820e74af6262213cae8f"
+dependencies = [
+ "aho-corasick",
+ "memchr",
+ "regex-syntax",
+]
+
+[[package]]
+name = "regex-syntax"
+version = "0.8.11"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d6f6ff9a378485b298a5286656da665ba74413d36db0979633275d2e708145d4"
+
 [[package]]
 name = "xxhash-rust"
 version = "0.8.15"
diff --git a/Cargo.toml b/Cargo.toml
index a1ad11d..8cfb01c 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -9,4 +9,5 @@ keywords = ["cli", "duplicate-detection", "source-code", "analysis"]
 categories = ["command-line-utilities", "development-tools"]
 
 [dependencies]
+regex = "1"
 xxhash-rust = { version = "0.8", features = ["xxh3"] }
diff --git a/src/language.rs b/src/language.rs
index 2e75db0..efabeb9 100644
--- a/src/language.rs
+++ b/src/language.rs
@@ -2,6 +2,7 @@ use std::collections::HashMap;
 use std::sync::OnceLock;
 
 use crate::model::LineStatus;
+use regex::Regex;
 
 #[derive(Debug, Clone, Copy)]
 pub struct LanguageLinePattern {
@@ -9,113 +10,137 @@ pub struct LanguageLinePattern {
     pub extensions: &'static [&'static str],
     pub duplicate_mitigation_pattern: &'static [char],
     pub duplicate_mitigation_lines: &'static [&'static str],
+    pub duplicate_mitigation_regexps: &'static [&'static str],
 }
 
 pub const LANGUAGE_PATTERNS: &[LanguageLinePattern] = &[
     LanguageLinePattern {
         language_name: "TypeScript / JavaScript",
         extensions: &["ts", "tsx", "js", "jsx", "mjs", "cjs"],
-        duplicate_mitigation_pattern: &['(', ')', '{', '}', '[', ']', ';', ',', '?', ':', '<', '>'],
+        duplicate_mitigation_pattern: &['(', ')', ',', ':', ';', '<', '>', '?', '[', ']', '{', '}'],
         duplicate_mitigation_lines: &[],
+        duplicate_mitigation_regexps: &[],
     },
     LanguageLinePattern {
         language_name: "Rust",
         extensions: &["rs"],
-        duplicate_mitigation_pattern: &['(', ')', '{', '}', '[', ']', ';', ',', '?', ':', '<', '>'],
-        duplicate_mitigation_lines: &[".into_iter()", "///"],
+        duplicate_mitigation_pattern: &['(', ')', ',', ':', ';', '<', '>', '?', '[', ']', '{', '}'],
+        duplicate_mitigation_lines: &["///"],
+        duplicate_mitigation_regexps: &[
+            r"^[A-Za-z0-9_]*::?\s*[A-Za-z0-9_]*[,]?$",
+            r"^[A-Za-z0-9_]+\s*[.,]?$",
+            r"^\.?\s*[A-Za-z0-9_]+(?:\(\s*\)?)?$",
+            r"^let\s+(?:mut\s+)?[A-Za-z0-9_]+\s*=$",
+            r"^pub\s+[A-Za-z0-9_]*\s*:\s*[A-Za-z0-9_]*[,]?$",
+            r"^use\s+[A-Za-z_][A-Za-z0-9_]*(?:::[A-Za-z_][A-Za-z0-9_]*)*;$",
+        ],
     },
     LanguageLinePattern {
         language_name: "C / C++ / Objective-C",
         extensions: &["c", "h", "cpp", "hpp", "cc", "hh", "cxx", "hxx", "m", "mm"],
-        duplicate_mitigation_pattern: &['(', ')', '{', '}', '[', ']', ';', ',', '?', ':', '<', '>'],
-        duplicate_mitigation_lines: &["#endif", "#else"],
+        duplicate_mitigation_pattern: &['(', ')', ',', ':', ';', '<', '>', '?', '[', ']', '{', '}'],
+        duplicate_mitigation_lines: &["#else", "#endif"],
+        duplicate_mitigation_regexps: &[],
     },
     LanguageLinePattern {
         language_name: "C#",
         extensions: &["cs"],
-        duplicate_mitigation_pattern: &['(', ')', '{', '}', '[', ']', ';', ',', '?', ':', '<', '>'],
-        duplicate_mitigation_lines: &["#endregion", "#else", "#endif"],
+        duplicate_mitigation_pattern: &['(', ')', ',', ':', ';', '<', '>', '?', '[', ']', '{', '}'],
+        duplicate_mitigation_lines: &["#else", "#endif", "#endregion"],
+        duplicate_mitigation_regexps: &[],
     },
     LanguageLinePattern {
         language_name: "Java / Kotlin / Scala",
         extensions: &["java", "kt", "kts", "scala", "sc"],
-        duplicate_mitigation_pattern: &['(', ')', '{', '}', '[', ']', ';', ',', '?', ':', '<', '>'],
+        duplicate_mitigation_pattern: &['(', ')', ',', ':', ';', '<', '>', '?', '[', ']', '{', '}'],
         duplicate_mitigation_lines: &[],
+        duplicate_mitigation_regexps: &[],
     },
     LanguageLinePattern {
         language_name: "Go",
         extensions: &["go"],
-        duplicate_mitigation_pattern: &['(', ')', '{', '}', '[', ']', ';', ',', ':'],
+        duplicate_mitigation_pattern: &['(', ')', ',', ':', ';', '[', ']', '{', '}'],
         duplicate_mitigation_lines: &[],
+        duplicate_mitigation_regexps: &[],
     },
     LanguageLinePattern {
         language_name: "Python",
         extensions: &["py", "pyw"],
-        duplicate_mitigation_pattern: &['(', ')', '{', '}', '[', ']', ';', ',', ':'],
+        duplicate_mitigation_pattern: &['(', ')', ',', ':', ';', '[', ']', '{', '}'],
         duplicate_mitigation_lines: &[],
+        duplicate_mitigation_regexps: &[],
     },
     LanguageLinePattern {
         language_name: "Ruby",
         extensions: &["rb"],
-        duplicate_mitigation_pattern: &['(', ')', '{', '}', '[', ']', ';', ',', '?', ':'],
+        duplicate_mitigation_pattern: &['(', ')', ',', ':', ';', '?', '[', ']', '{', '}'],
         duplicate_mitigation_lines: &["end"],
+        duplicate_mitigation_regexps: &[],
     },
     LanguageLinePattern {
         language_name: "PHP",
         extensions: &["php", "phtml"],
         duplicate_mitigation_pattern: &[
-            '(', ')', '{', '}', '[', ']', ';', ',', '?', ':', '<', '>', '/',
+            '(', ')', ',', '/', ':', ';', '<', '>', '?', '[', ']', '{', '}',
         ],
         duplicate_mitigation_lines: &[],
+        duplicate_mitigation_regexps: &[],
     },
     LanguageLinePattern {
         language_name: "Swift",
         extensions: &["swift"],
-        duplicate_mitigation_pattern: &['(', ')', '{', '}', '[', ']', ';', ',', '?', ':', '<', '>'],
+        duplicate_mitigation_pattern: &['(', ')', ',', ':', ';', '<', '>', '?', '[', ']', '{', '}'],
         duplicate_mitigation_lines: &[],
+        duplicate_mitigation_regexps: &[],
     },
     LanguageLinePattern {
         language_name: "Shell",
         extensions: &["sh", "bash", "zsh", "fish"],
-        duplicate_mitigation_pattern: &['(', ')', '{', '}', '[', ']', ';', '&', '|'],
-        duplicate_mitigation_lines: &["then", "do", "done", "fi", "else"],
+        duplicate_mitigation_pattern: &['&', '(', ')', ';', '[', ']', '{', '|', '}'],
+        duplicate_mitigation_lines: &["do", "done", "else", "fi", "then"],
+        duplicate_mitigation_regexps: &[],
     },
     LanguageLinePattern {
         language_name: "PowerShell",
         extensions: &["ps1", "psm1", "psd1"],
-        duplicate_mitigation_pattern: &['(', ')', '{', '}', '[', ']', ';', ',', '?', ':', '|'],
+        duplicate_mitigation_pattern: &['(', ')', ',', ':', ';', '?', '[', ']', '{', '|', '}'],
         duplicate_mitigation_lines: &[],
+        duplicate_mitigation_regexps: &[],
     },
     LanguageLinePattern {
         language_name: "HTML / XML",
         extensions: &["html", "htm", "xml", "xhtml", "svg"],
-        duplicate_mitigation_pattern: &['<', '>', '/'],
+        duplicate_mitigation_pattern: &['/', '<', '>'],
         duplicate_mitigation_lines: &[
-            "</div>",
-            "</span>",
-            "</section>",
             "</article>",
             "</body>",
+            "</div>",
             "</html>",
+            "</section>",
+            "</span>",
         ],
+        duplicate_mitigation_regexps: &[],
     },
     LanguageLinePattern {
         language_name: "CSS / SCSS / Sass / Less",
         extensions: &["css", "scss", "sass", "less"],
-        duplicate_mitigation_pattern: &['(', ')', '{', '}', '[', ']', ';', ',', ':'],
+        duplicate_mitigation_pattern: &['(', ')', ',', ':', ';', '[', ']', '{', '}'],
         duplicate_mitigation_lines: &[],
+        duplicate_mitigation_regexps: &[],
     },
     LanguageLinePattern {
         language_name: "SQL",
         extensions: &["sql"],
-        duplicate_mitigation_pattern: &['(', ')', ';', ',', ':'],
+        duplicate_mitigation_pattern: &['(', ')', ',', ':', ';'],
         duplicate_mitigation_lines: &["BEGIN", "END"],
+        duplicate_mitigation_regexps: &[],
     },
     LanguageLinePattern {
         language_name: "YAML / JSON / TOML",
         extensions: &["yaml", "yml", "json", "toml"],
-        duplicate_mitigation_pattern: &['(', ')', '{', '}', '[', ']', ';', ',', '?', ':', '<', '>'],
+        duplicate_mitigation_pattern: &['(', ')', ',', ':', ';', '<', '>', '?', '[', ']', '{', '}'],
         duplicate_mitigation_lines: &["jobs:", "on:"],
+        duplicate_mitigation_regexps: &[],
     },
 ];
 
@@ -141,6 +166,7 @@ struct DuplicateMitigationLineRegistry {
 struct DuplicateMitigationPatterns {
     lines_by_hash: HashMap<u128, Vec<&'static str>>,
     character_pattern: Vec<char>,
+    regexps: Vec<Regex>,
 }
 
 static DUPLICATE_MITIGATION_LINE_REGISTRY: OnceLock<DuplicateMitigationLineRegistry> =
@@ -178,6 +204,10 @@ fn registry() -> &'static DuplicateMitigationLineRegistry {
                     &mut patterns.character_pattern,
                     language.duplicate_mitigation_pattern,
                 );
+                register_duplicate_mitigation_regexps(
+                    &mut patterns.regexps,
+                    language.duplicate_mitigation_regexps,
+                );
             }
         }
         DuplicateMitigationLineRegistry { by_extension }
@@ -188,6 +218,7 @@ impl DuplicateMitigationPatterns {
     fn matches_line(&self, normalized_line: &str, hash: u128) -> bool {
         self.matches_registered_line(normalized_line, hash)
             || matches_duplicate_mitigation_pattern(normalized_line, &self.character_pattern)
+            || matches_duplicate_mitigation_regexps(normalized_line, &self.regexps)
     }
 
     fn matches_registered_line(&self, normalized_line: &str, hash: u128) -> bool {
@@ -220,6 +251,17 @@ fn register_duplicate_mitigation_pattern(
     }
 }
 
+fn register_duplicate_mitigation_regexps(
+    regexps: &mut Vec<Regex>,
+    patterns: &'static [&'static str],
+) {
+    for &pattern in patterns {
+        if !regexps.iter().any(|regexp| regexp.as_str() == pattern) {
+            regexps.push(Regex::new(pattern).expect("duplicate mitigation regexp must compile"));
+        }
+    }
+}
+
 fn matches_duplicate_mitigation_pattern(line: &str, character_pattern: &[char]) -> bool {
     !character_pattern.is_empty()
         && line
@@ -227,6 +269,14 @@ fn matches_duplicate_mitigation_pattern(line: &str, character_pattern: &[char])
             .all(|character| character.is_whitespace() || character_pattern.contains(&character))
 }
 
+fn matches_duplicate_mitigation_regexps(line: &str, regexps: &[Regex]) -> bool {
+    regexps.iter().any(|regexp| {
+        regexp
+            .find(line)
+            .is_some_and(|matched| matched.start() == 0 && matched.end() == line.len())
+    })
+}
+
 #[cfg(test)]
 mod tests {
     use super::*;
@@ -261,6 +311,20 @@ mod tests {
         assert_eq!(classify_line("ts", line, hash), LineStatus::BlockOnly);
     }
 
+    #[test]
+    fn assigns_block_only_status_from_regexps() {
+        let line = ".update()";
+        let hash = hash_normalized_line(line);
+        assert_eq!(classify_line("rs", line, hash), LineStatus::BlockOnly);
+    }
+
+    #[test]
+    fn regexps_must_match_the_full_line() {
+        let line = ".update()?.await";
+        let hash = hash_normalized_line(line);
+        assert_eq!(classify_line("rs", line, hash), LineStatus::Comparison);
+    }
+
     #[test]
     fn ignores_character_pattern_for_unknown_extensions() {
         let line = "});";

From e4461c61198bc2cd62623548225d7315d7d0d186 Mon Sep 17 00:00:00 2001
From: b4prog <b4prog@gmail.com>
Date: Thu, 25 Jun 2026 20:10:15 +0200
Subject: [PATCH 21/22] [fix] reject double-dash CLI options

---
 README.md  |  4 ++--
 src/cli.rs | 51 ++++++++++++++++++++++++++++++++++-----------------
 src/lib.rs |  2 +-
 3 files changed, 37 insertions(+), 20 deletions(-)

diff --git a/README.md b/README.md
index 1ee7d5b..880105a 100644
--- a/README.md
+++ b/README.md
@@ -61,7 +61,7 @@ codem8 --report-duplicate -file-extension=ts,js -files=src/a.ts,src/b.js
 Include duplicate block metrics:
 
 ```bash
-codem8 --report-duplicate --verbose
+codem8 --report-duplicate -verbose
 ```
 
 ## Duplicate Report
@@ -86,7 +86,7 @@ Reports are sorted deterministically by descending weight, then by line count,
 character count, first location, and normalized block text.
 
 By default, each duplicate block prints the duplicated code before its
-locations. Use `--verbose` to also show weight, line count, and occurrence
+locations. Use `-verbose` to also show weight, line count, and occurrence
 count. Character counts are used internally for scoring and sorting, but are
 not printed.
 
diff --git a/src/cli.rs b/src/cli.rs
index 381520b..597a98b 100644
--- a/src/cli.rs
+++ b/src/cli.rs
@@ -20,18 +20,16 @@ REQUIRED REPORT SWITCHES:
 
 OPTIONS:
   -file-extension=<extensions>
-  --file-extension=<extensions>
       Comma-separated source file extensions to analyze.
       Defaults to all extensions registered in LANGUAGE_PATTERNS.
       Examples: -file-extension=ts,tsx,js,jsx
 
   -files=<paths>
-  --files=<paths>
       Comma-separated explicit files to analyze instead of recursively
       discovering files from the current directory.
       Example: -files=src/a.ts,src/b.js
 
-  --verbose
+  -verbose
       Include duplicate block metrics in report output.
 
 DUPLICATE REPORT PURPOSE:
@@ -102,22 +100,16 @@ where
         let arg = arg.into();
         if arg == "--report-duplicate" {
             report_duplicate = true;
-        } else if arg == "--verbose" {
+        } else if arg == "-verbose" {
             verbose = true;
-        } else if let Some(value) = arg
-            .strip_prefix("-file-extension=")
-            .or_else(|| arg.strip_prefix("--file-extension="))
-        {
+        } else if let Some(value) = arg.strip_prefix("-file-extension=") {
             if file_extensions.is_some() {
                 return Err(CodeM8Error::new(
                     "file extensions were provided more than once",
                 ));
             }
             file_extensions = Some(parse_file_extensions(value)?);
-        } else if let Some(value) = arg
-            .strip_prefix("-files=")
-            .or_else(|| arg.strip_prefix("--files="))
-        {
+        } else if let Some(value) = arg.strip_prefix("-files=") {
             if files.is_some() {
                 return Err(CodeM8Error::new(
                     "explicit files were provided more than once",
@@ -196,7 +188,7 @@ pub fn parse_file_list(value: &str) -> Result<Vec<PathBuf>> {
 }
 
 fn is_help_argument(arg: &str) -> bool {
-    matches!(arg, "help" | "--help" | "-h")
+    matches!(arg, "help" | "-h")
 }
 
 #[cfg(test)]
@@ -209,13 +201,22 @@ mod tests {
         assert_eq!(command, CliCommand::Help);
     }
 
+    #[test]
+    fn parses_short_help_option() {
+        let command = parse_command(["-h"]).expect("short help parses");
+        assert_eq!(command, CliCommand::Help);
+    }
+
     #[test]
     fn exposes_detailed_help_text() {
         assert!(help_text().contains("USAGE:"));
         assert!(help_text().contains("--report-duplicate"));
-        assert!(help_text().contains("--verbose"));
+        assert!(help_text().contains("-verbose"));
         assert!(help_text().contains("-file-extension=<extensions>"));
         assert!(help_text().contains("-files=<paths>"));
+        assert!(!help_text().contains("--verbose"));
+        assert!(!help_text().contains("--file-extension=<extensions>"));
+        assert!(!help_text().contains("--files=<paths>"));
         assert!(help_text().contains("helps you find repeated code"));
         assert!(!help_text().contains("Duplicate weight"));
     }
@@ -231,7 +232,7 @@ mod tests {
 
     #[test]
     fn parses_verbose_duplicate_report_config() {
-        let config = parse_args(["--report-duplicate", "--verbose"]).expect("config parses");
+        let config = parse_args(["--report-duplicate", "-verbose"]).expect("config parses");
         assert!(config.report_duplicate);
         assert!(config.verbose);
     }
@@ -276,12 +277,28 @@ mod tests {
         assert!(!error.should_show_help());
     }
 
+    #[test]
+    fn rejects_double_dash_option_arguments() {
+        for option in [
+            "--help",
+            "--verbose",
+            "--file-extension=js",
+            "--files=src/a.ts",
+        ] {
+            let error =
+                parse_args(["--report-duplicate", option]).expect_err("double-dash option fails");
+            assert!(error
+                .to_string()
+                .contains(&format!("unknown argument: {option}")));
+        }
+    }
+
     #[test]
     fn rejects_repeated_file_extension_arguments() {
         let error = parse_args([
             "--report-duplicate",
             "-file-extension=ts",
-            "--file-extension=js",
+            "-file-extension=js",
         ])
         .expect_err("repeated extensions fail");
         assert!(error
@@ -291,7 +308,7 @@ mod tests {
 
     #[test]
     fn rejects_repeated_explicit_file_arguments() {
-        let error = parse_args(["--report-duplicate", "-files=a.ts", "--files=b.ts"])
+        let error = parse_args(["--report-duplicate", "-files=a.ts", "-files=b.ts"])
             .expect_err("repeated explicit files fail");
         assert!(error
             .to_string()
diff --git a/src/lib.rs b/src/lib.rs
index c1c2ce9..b0a3005 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -155,7 +155,7 @@ mod tests {
             "const value = computeValue(input);\nreturn value;\n",
         );
         let output =
-            run_in(&project, &["--report-duplicate", "--verbose"]).expect("report succeeds");
+            run_in(&project, &["--report-duplicate", "-verbose"]).expect("report succeeds");
         assert!(output.contains("Weight:"));
         assert!(output.contains("Lines: 2"));
         assert!(output.contains("Occurrences: 2"));

From 71d01dfee02201ab02c7999d79b6bed837ddd9f9 Mon Sep 17 00:00:00 2001
From: b4prog <b4prog@gmail.com>
Date: Thu, 25 Jun 2026 20:51:20 +0200
Subject: [PATCH 22/22] [feat] add git branch duplicate report scanning

---
 README.md        |  13 +++
 src/cli.rs       | 142 +++++++++++++++++++++---
 src/discovery.rs |   9 +-
 src/git.rs       | 284 +++++++++++++++++++++++++++++++++++++++++++++++
 src/language.rs  |   6 +
 src/lib.rs       |  44 +++++++-
 src/report.rs    |  43 ++++++-
 7 files changed, 518 insertions(+), 23 deletions(-)
 create mode 100644 src/git.rs

diff --git a/README.md b/README.md
index 880105a..74ed36e 100644
--- a/README.md
+++ b/README.md
@@ -58,6 +58,13 @@ Analyze an explicit list of files instead of recursively discovering files:
 codem8 --report-duplicate -file-extension=ts,js -files=src/a.ts,src/b.js
 ```
 
+Analyze files changed on the current local Git branch compared to the origin
+base branch:
+
+```bash
+codem8 --report-duplicate -git-branch
+```
+
 Include duplicate block metrics:
 
 ```bash
@@ -76,6 +83,12 @@ trailing Unicode whitespace are removed before hashing and comparison. Empty
 trimmed lines are ignored. CodeM8 currently expects UTF-8 source files; invalid
 UTF-8 produces a clear error rather than lossy output.
 
+Use `-git-branch` to analyze only files changed on the current local branch
+compared to the origin base branch. CodeM8 resolves that base from `origin/HEAD`
+with `origin/main` and `origin/master` fallbacks. The scan covers committed,
+staged, unstaged, and untracked files that still exist in the worktree. The
+option requires a Git repository and cannot be combined with `-files`.
+
 Duplicate block weight is calculated as:
 
 ```text
diff --git a/src/cli.rs b/src/cli.rs
index 597a98b..9b3dc17 100644
--- a/src/cli.rs
+++ b/src/cli.rs
@@ -1,11 +1,11 @@
+use std::fmt::Write as _;
 use std::path::PathBuf;
 
 use crate::error::{CodeM8Error, Result};
 use crate::language::supported_file_extensions;
 
-const HELP_TEXT: &str = "\
-CodeM8 - deterministic source code analysis reports.
-
+const CARGO_LOCK: &str = include_str!("../Cargo.lock");
+const HELP_TEXT_BODY: &str = "\
 USAGE:
   codem8 help
   codem8 --report-duplicate [OPTIONS]
@@ -29,6 +29,11 @@ OPTIONS:
       discovering files from the current directory.
       Example: -files=src/a.ts,src/b.js
 
+  -git-branch
+      Analyze files changed on the current local Git branch compared to the
+      origin base branch, including committed, staged, unstaged, and untracked
+      files. Cannot be combined with -files.
+
   -verbose
       Include duplicate block metrics in report output.
 
@@ -42,8 +47,15 @@ EXAMPLES:
   codem8 --report-duplicate
   codem8 --report-duplicate -file-extension=ts,tsx,js,jsx
   codem8 --report-duplicate -file-extension=ts,js -files=src/a.ts,src/b.js
+  codem8 --report-duplicate -git-branch
 ";
 
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+struct CargoLockPackage<'a> {
+    name: &'a str, // quoted value of the section's `name = "..."` line
+    version: &'a str, // quoted value of the section's `version = "..."` line
+}
+
 #[derive(Debug, Clone, PartialEq, Eq)]
 pub enum CliCommand {
     Help,
@@ -56,11 +68,20 @@ pub struct CliConfig {
     pub verbose: bool,
     pub file_extensions: Vec<String>,
     pub files: Option<Vec<PathBuf>>,
+    pub git_branch: bool,
 }
 
 #[must_use]
-pub const fn help_text() -> &'static str {
-    HELP_TEXT
+pub fn help_text() -> String {
+    let version = codem8_version_from_cargo_lock().unwrap_or("unknown");
+    let mut output = String::new();
+    let _ = writeln!(
+        output,
+        "CodeM8 {version} - deterministic source code analysis reports."
+    );
+    output.push('\n');
+    output.push_str(HELP_TEXT_BODY);
+    output
 }
 
 /// Parses command-line arguments into a CLI command.
@@ -96,12 +117,20 @@ where
     let mut verbose = false;
     let mut file_extensions = None;
     let mut files = None;
+    let mut git_branch = false;
     for arg in args {
         let arg = arg.into();
         if arg == "--report-duplicate" {
             report_duplicate = true;
         } else if arg == "-verbose" {
             verbose = true;
+        } else if arg == "-git-branch" {
+            if git_branch {
+                return Err(CodeM8Error::new(
+                    "git branch mode was provided more than once",
+                ));
+            }
+            git_branch = true;
         } else if let Some(value) = arg.strip_prefix("-file-extension=") {
             if file_extensions.is_some() {
                 return Err(CodeM8Error::new(
@@ -125,11 +154,17 @@ where
             "no report switch provided; pass --report-duplicate",
         ));
     }
+    if git_branch && files.is_some() {
+        return Err(CodeM8Error::new(
+            "git branch mode cannot be combined with explicit files",
+        ));
+    }
     Ok(CliConfig {
         report_duplicate,
         verbose,
         file_extensions: file_extensions.unwrap_or_else(supported_file_extensions),
         files,
+        git_branch,
     })
 }
 
@@ -191,6 +226,30 @@ fn is_help_argument(arg: &str) -> bool {
     matches!(arg, "help" | "-h")
 }
 
+/// Looks up the `codem8` package version in the lockfile embedded at compile time.
+fn codem8_version_from_cargo_lock() -> Option<&'static str> {
+    let mut packages = cargo_lock_packages(CARGO_LOCK);
+    packages.find_map(|entry| (entry.name == "codem8").then_some(entry.version))
+}
+
+fn cargo_lock_packages(lockfile: &str) -> impl Iterator<Item = CargoLockPackage<'_>> {
+    lockfile.split("[[package]]").filter_map(cargo_lock_package) // incomplete sections are skipped
+}
+
+fn cargo_lock_package(section: &str) -> Option<CargoLockPackage<'_>> {
+    let lookup = |key| cargo_lock_value(section, key);
+    let (name, version) = lookup("name").zip(lookup("version"))?;
+    Some(CargoLockPackage { name, version })
+}
+
+fn cargo_lock_value<'a>(section: &'a str, key: &str) -> Option<&'a str> {
+    let opening = format!("{key} = \"");
+    section.lines().find_map(|raw_line| {
+        let entry = raw_line.trim().strip_prefix(opening.as_str())?;
+        entry.strip_suffix('"')
+    })
+}
+
 #[cfg(test)]
 mod tests {
     use super::*;
@@ -209,16 +268,42 @@ mod tests {
 
     #[test]
     fn exposes_detailed_help_text() {
-        assert!(help_text().contains("USAGE:"));
-        assert!(help_text().contains("--report-duplicate"));
-        assert!(help_text().contains("-verbose"));
-        assert!(help_text().contains("-file-extension=<extensions>"));
-        assert!(help_text().contains("-files=<paths>"));
-        assert!(!help_text().contains("--verbose"));
-        assert!(!help_text().contains("--file-extension=<extensions>"));
-        assert!(!help_text().contains("--files=<paths>"));
-        assert!(help_text().contains("helps you find repeated code"));
-        assert!(!help_text().contains("Duplicate weight"));
+        let help = help_text();
+        assert!(help.contains("USAGE:"));
+        assert!(help.contains("--report-duplicate"));
+        assert!(help.contains("-verbose"));
+        assert!(help.contains("-file-extension=<extensions>"));
+        assert!(help.contains("-files=<paths>"));
+        assert!(help.contains("-git-branch"));
+        assert!(!help.contains("--verbose"));
+        assert!(!help.contains("--file-extension=<extensions>"));
+        assert!(!help.contains("--files=<paths>"));
+        assert!(!help.contains("--git-branch"));
+        assert!(help.contains("helps you find repeated code"));
+        assert!(!help.contains("Duplicate weight"));
+    }
+
+    #[test]
+    fn help_text_includes_version_from_cargo_lock() {
+        let version = codem8_version_from_cargo_lock().expect("codem8 version exists");
+        assert!(help_text().starts_with(&format!("CodeM8 {version} - ")));
+    }
+
+    #[test]
+    fn extracts_package_versions_from_cargo_lock_sections() {
+        let lockfile = r#"
+[[package]]
+name = "dependency"
+version = "1.2.3"
+
+[[package]]
+name = "codem8"
+version = "0.4.2"
+"#;
+        let package = cargo_lock_packages(lockfile)
+            .find(|package| package.name == "codem8")
+            .expect("package exists");
+        assert_eq!(package.version, "0.4.2");
     }
 
     #[test]
@@ -228,6 +313,7 @@ mod tests {
         assert!(!config.verbose);
         assert_eq!(config.file_extensions, supported_file_extensions());
         assert_eq!(config.files, None);
+        assert!(!config.git_branch);
     }
 
     #[test]
@@ -237,6 +323,13 @@ mod tests {
         assert!(config.verbose);
     }
 
+    #[test]
+    fn parses_git_branch_duplicate_report_config() {
+        let config = parse_args(["--report-duplicate", "-git-branch"]).expect("config parses");
+        assert!(config.git_branch);
+        assert_eq!(config.files, None);
+    }
+
     #[test]
     fn parses_extensions_case_insensitively_and_trims_whitespace() {
         let extensions = parse_file_extensions(" ts, JS ,tsx,ts ").expect("extensions parse");
@@ -284,6 +377,7 @@ mod tests {
             "--verbose",
             "--file-extension=js",
             "--files=src/a.ts",
+            "--git-branch",
         ] {
             let error =
                 parse_args(["--report-duplicate", option]).expect_err("double-dash option fails");
@@ -315,6 +409,24 @@ mod tests {
             .contains("explicit files were provided more than once"));
     }
 
+    #[test]
+    fn rejects_repeated_git_branch_arguments() {
+        let error = parse_args(["--report-duplicate", "-git-branch", "-git-branch"])
+            .expect_err("repeated git branch mode fails");
+        assert!(error
+            .to_string()
+            .contains("git branch mode was provided more than once"));
+    }
+
+    #[test]
+    fn rejects_git_branch_with_explicit_files() {
+        let error = parse_args(["--report-duplicate", "-git-branch", "-files=a.ts"])
+            .expect_err("exclusive file modes fail");
+        assert!(error
+            .to_string()
+            .contains("git branch mode cannot be combined with explicit files"));
+    }
+
     #[test]
     fn parses_explicit_file_list() {
         let files = parse_file_list("src/a.ts, ./src/b.ts").expect("files parse");
diff --git a/src/discovery.rs b/src/discovery.rs
index 0424979..c48293a 100644
--- a/src/discovery.rs
+++ b/src/discovery.rs
@@ -90,9 +90,16 @@ fn discover_explicit_files(
         if !seen_paths.insert(canonical_path.clone()) {
             continue;
         }
+        let display_path = if absolute_input {
+            canonical_path
+                .strip_prefix(current_dir)
+                .map_or_else(|_| normalize_display_path(file), normalize_display_path)
+        } else {
+            normalize_display_path(file)
+        };
         source_files.push(SourceFile {
             path: canonical_path,
-            display_path: normalize_display_path(file),
+            display_path,
             extension,
         });
     }
diff --git a/src/git.rs b/src/git.rs
new file mode 100644
index 0000000..965f0c9
--- /dev/null
+++ b/src/git.rs
@@ -0,0 +1,284 @@
+use std::collections::BTreeSet;
+use std::fs;
+use std::path::{Path, PathBuf};
+use std::process::{Command, Output};
+
+use crate::error::{CodeM8Error, Result};
+
+/// Lists files changed on the current branch compared to the origin base branch.
+///
+/// # Errors
+///
+/// Returns an error when `current_dir` is not inside a Git repository, `HEAD` is
+/// detached, no origin base branch resolves, or a git command fails.
+pub fn changed_files_against_origin(current_dir: &Path) -> Result<Vec<PathBuf>> {
+    let repo_root = repo_root(current_dir)?;
+    ensure_named_branch(&repo_root)?;
+    let origin_ref = origin_base_ref(&repo_root)?;
+    let merge_base = run_git_text(
+        &repo_root,
+        &["merge-base", &origin_ref, "HEAD"],
+        "find merge base with origin base branch",
+    )?;
+    let mut paths = BTreeSet::new(); // ordered set: merges and sorts all four listings
+    collect_nul_paths(
+        &repo_root,
+        &[
+            "diff",
+            "--name-only",
+            "-z",
+            "--diff-filter=ACMRTUXB", // every status letter except D (deleted)
+            merge_base.trim(),
+            "HEAD", // committed changes between the merge base and HEAD
+        ],
+        &mut paths,
+    )?;
+    collect_nul_paths(
+        &repo_root,
+        &[
+            "diff",
+            "--name-only",
+            "-z",
+            "--cached", // staged changes
+            "--diff-filter=ACMRTUXB",
+        ],
+        &mut paths,
+    )?;
+    collect_nul_paths(
+        &repo_root,
+        &["diff", "--name-only", "-z", "--diff-filter=ACMRTUXB"], // unstaged edits
+        &mut paths,
+    )?;
+    collect_nul_paths(
+        &repo_root,
+        &["ls-files", "--others", "--exclude-standard", "-z"], // untracked, not ignored
+        &mut paths,
+    )?;
+    Ok(paths
+        .into_iter()
+        .filter_map(|path| existing_file_path(&repo_root, current_dir, &path))
+        .collect())
+}
+
+/// Resolves the top-level directory of the repository that contains `current_dir`.
+fn repo_root(current_dir: &Path) -> Result<PathBuf> {
+    let args = ["rev-parse", "--show-toplevel"];
+    let output = run_git_output(current_dir, &args, "find git repository")?;
+    if !output.status.success() {
+        return Err(CodeM8Error::new(
+            "git branch mode requires the current directory to be inside a git repository",
+        ));
+    }
+    let root = output_text(output.stdout, "parse git repository root")?;
+    // Strip only the line terminator: a directory name may legally begin or end
+    // with whitespace, which a plain `trim` would silently remove.
+    Ok(PathBuf::from(root.trim_end_matches(['\r', '\n'])))
+}
+
+/// Ensures that `HEAD` points at a named branch.
+///
+/// Fails when `HEAD` is detached or when git cannot report the current branch.
+fn ensure_named_branch(repo_root: &Path) -> Result<()> {
+    let args = ["rev-parse", "--abbrev-ref", "HEAD"];
+    let current = run_git_text(repo_root, &args, "determine current git branch")?;
+    // The code treats the literal name `HEAD` as the detached-HEAD marker.
+    match current.trim() {
+        "HEAD" => Err(CodeM8Error::new(
+            "git branch mode requires a named local branch, but HEAD is detached",
+        )),
+        _ => Ok(()),
+    }
+}
+
+/// Returns the first of `origin/HEAD`, `origin/main`, and `origin/master` that
+/// resolves to a commit, or an error when none of them does.
+fn origin_base_ref(repo_root: &Path) -> Result<String> {
+    let unresolved = "git branch mode could not resolve origin base branch";
+    ["origin/HEAD", "origin/main", "origin/master"]
+        .into_iter()
+        .find(|candidate| verify_origin_ref(repo_root, candidate))
+        .map(str::to_string)
+        .ok_or_else(|| CodeM8Error::new(unresolved))
+}
+
+/// Reports whether `origin_ref` resolves to a commit; any git failure counts as "no".
+fn verify_origin_ref(repo_root: &Path, origin_ref: &str) -> bool {
+    let peeled = format!("{origin_ref}^{{commit}}");
+    let args = ["rev-parse", "--verify", peeled.as_str()];
+    match run_git_output(repo_root, &args, "resolve origin base branch") {
+        Ok(output) => output.status.success(),
+        Err(_) => false,
+    }
+}
+
+/// Runs one git listing command and adds every path it prints to `paths`.
+fn collect_nul_paths(repo_root: &Path, args: &[&str], paths: &mut BTreeSet<PathBuf>) -> Result<()> {
+    const ACTION: &str = "list changed git files";
+    let output = run_git_output(repo_root, args, ACTION)?;
+    let listing = ensure_git_success(output, ACTION)?;
+    paths.extend(nul_paths(&listing));
+    Ok(())
+}
+
+/// Keeps `path` only when it names a regular, non-symlink file under `repo_root`.
+/// The result is relative to `current_dir` when possible and absolute otherwise.
+fn existing_file_path(repo_root: &Path, current_dir: &Path, path: &Path) -> Option<PathBuf> {
+    let absolute = repo_root.join(path);
+    let file_type = fs::symlink_metadata(&absolute).ok()?.file_type();
+    let relative = absolute.strip_prefix(current_dir).map(Path::to_path_buf);
+    // `symlink_metadata` does not follow links, so `is_file` rejects symlinks too.
+    file_type.is_file().then(|| relative.unwrap_or(absolute))
+}
+
+fn run_git_text(current_dir: &Path, args: &[&str], action: &str) -> Result<String> {
+    run_git_output(current_dir, args, action)
+        .and_then(|output| ensure_git_success(output, action))
+        .and_then(|stdout| output_text(stdout, action))
+}
+
+/// Runs `git -C <current_dir> <args>` and captures its exit status and output.
+fn run_git_output(current_dir: &Path, args: &[&str], action: &str) -> Result<Output> {
+    let mut git = Command::new("git");
+    git.arg("-C").arg(current_dir).args(args);
+    // A spawn failure (for example git missing from PATH) is reported as an error.
+    let spawned = git.output();
+    spawned.map_err(|cause| CodeM8Error::new(format!("could not {action}: {cause}")))
+}
+
+/// Returns git's stdout on success, or an error carrying its stderr on failure.
+fn ensure_git_success(output: Output, action: &str) -> Result<Vec<u8>> {
+    if output.status.success() {
+        return Ok(output.stdout);
+    }
+    // Decode lossily: undecodable stderr bytes must not mask git's own diagnostic.
+    let stderr = String::from_utf8_lossy(&output.stderr);
+    let message = format!("could not {action}: {}", stderr.trim());
+    Err(CodeM8Error::new(message))
+}
+
+fn output_text(bytes: Vec<u8>, action: &str) -> Result<String> {
+    let decoded = String::from_utf8(bytes);
+    decoded.map_err(|cause| CodeM8Error::new(format!("could not {action}: {cause}")))
+}
+
+/// Splits NUL-separated git output into paths, decoding invalid UTF-8 lossily.
+fn nul_paths(bytes: &[u8]) -> Vec<PathBuf> {
+    let text = String::from_utf8_lossy(bytes);
+    // Empty pieces (for example the one after a trailing NUL) are not paths.
+    let entries = text.split('\0').filter(|entry| !entry.is_empty());
+    entries.map(PathBuf::from).collect()
+}
+
+#[cfg(test)]
+mod tests {
+    use std::process::Command;
+    use std::sync::atomic::{AtomicUsize, Ordering};
+
+    use super::*;
+
+    static TEMP_COUNTER: AtomicUsize = AtomicUsize::new(0);
+
+    struct TempGitRepo {
+        path: PathBuf, // scratch directory, deleted again on drop
+    }
+
+    impl TempGitRepo {
+        fn new(name: &str) -> Self {
+            let id = TEMP_COUNTER.fetch_add(1, Ordering::Relaxed); // keeps names distinct
+            let path =
+                std::env::temp_dir().join(format!("codem8-git-{name}-{}-{id}", std::process::id()));
+            if path.exists() {
+                fs::remove_dir_all(&path).expect("remove stale test directory");
+            }
+            fs::create_dir_all(&path).expect("create test directory");
+            Self { path }
+        }
+
+        fn path(&self) -> &Path {
+            &self.path
+        }
+
+        fn write(&self, relative_path: &str, contents: &str) {
+            let path = self.path.join(relative_path);
+            if let Some(parent) = path.parent() {
+                fs::create_dir_all(parent).expect("create parent directory");
+            }
+            fs::write(path, contents).expect("write test file");
+        }
+
+        fn git(&self, args: &[&str]) {
+            let status = Command::new("git")
+                .arg("-C")
+                .arg(&self.path)
+                .args(args)
+                .status()
+                .expect("run git");
+            assert!(status.success(), "git command failed: {args:?}");
+        }
+
+        fn commit(&self, message: &str) {
+            self.git(&["add", "."]); // stage every change in the worktree
+            self.git(&[
+                "-c",
+                "user.name=CodeM8 Test", // identity supplied inline for this commit
+                "-c",
+                "user.email=codem8@example.invalid",
+                "commit",
+                "-m",
+                message,
+            ]);
+        }
+    }
+
+    impl Drop for TempGitRepo {
+        fn drop(&mut self) {
+            let _ = fs::remove_dir_all(&self.path); // cleanup errors are ignored
+        }
+    }
+
+    /// Reports whether `git --version` can be run and exits successfully.
+    fn git_is_available() -> bool {
+        // A failure to spawn the process counts as unavailable.
+        let probe = Command::new("git").arg("--version").status();
+        probe.is_ok_and(|status| status.success())
+    }
+
+    #[test]
+    fn rejects_non_git_directory() {
+        // Skip without git, like the sibling test: a missing binary yields a
+        // spawn error rather than the "not inside a repository" message.
+        if !git_is_available() {
+            return;
+        }
+        let repo = TempGitRepo::new("non-repo");
+        let error = changed_files_against_origin(repo.path()).expect_err("non-repo fails");
+        assert!(error.to_string().contains("requires the current directory"));
+    }
+
+    #[test]
+    fn lists_committed_staged_unstaged_and_untracked_files() {
+        if !git_is_available() {
+            return;
+        }
+        let repo = TempGitRepo::new("changes");
+        repo.git(&["init"]);
+        repo.write("src/base.ts", "const value = one;\n");
+        repo.write("src/deleted.ts", "const value = deleted;\n");
+        repo.commit("initial");
+        repo.git(&["update-ref", "refs/remotes/origin/main", "HEAD"]);
+        repo.git(&["branch", "-M", "feature"]);
+        repo.write("src/committed.ts", "const value = committed;\n");
+        repo.commit("branch change");
+        repo.git(&["update-ref", "refs/remotes/origin/feature", "HEAD"]);
+        repo.write("src/staged.ts", "const value = staged;\n");
+        repo.git(&["add", "src/staged.ts"]);
+        repo.write("src/base.ts", "const value = modified;\n");
+        repo.write("src/untracked.ts", "const value = untracked;\n");
+        fs::remove_file(repo.path().join("src/deleted.ts")).expect("delete tracked file");
+        let files = changed_files_against_origin(repo.path()).expect("list branch files");
+        // The deleted file is excluded; the rest come back sorted by path.
+        let expected = ["base", "committed", "staged", "untracked"]
+            .map(|stem| PathBuf::from(format!("src/{stem}.ts")));
+        assert_eq!(files, expected);
+    }
+}
diff --git a/src/language.rs b/src/language.rs
index efabeb9..636c8b1 100644
--- a/src/language.rs
+++ b/src/language.rs
@@ -27,11 +27,17 @@ pub const LANGUAGE_PATTERNS: &[LanguageLinePattern] = &[
         duplicate_mitigation_pattern: &['(', ')', ',', ':', ';', '<', '>', '?', '[', ']', '{', '}'],
         duplicate_mitigation_lines: &["///"],
         duplicate_mitigation_regexps: &[
+            // Excludes short path or enum variant fragments. Example: Self::Ready,
             r"^[A-Za-z0-9_]*::?\s*[A-Za-z0-9_]*[,]?$",
+            // Excludes bare identifiers with optional punctuation. Example: value,
             r"^[A-Za-z0-9_]+\s*[.,]?$",
+            // Excludes simple method or field access lines. Example: .clone()
             r"^\.?\s*[A-Za-z0-9_]+(?:\(\s*\)?)?$",
+            // Excludes incomplete let bindings split across lines. Example: let value =
             r"^let\s+(?:mut\s+)?[A-Za-z0-9_]+\s*=$",
+            // Excludes simple public struct field declarations. Example: pub name: String,
             r"^pub\s+[A-Za-z0-9_]*\s*:\s*[A-Za-z0-9_]*[,]?$",
+            // Excludes single-path use imports. Example: use crate::module;
             r"^use\s+[A-Za-z_][A-Za-z0-9_]*(?:::[A-Za-z_][A-Za-z0-9_]*)*;$",
         ],
     },
diff --git a/src/lib.rs b/src/lib.rs
index b0a3005..6656221 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -2,6 +2,7 @@ pub mod cli;
 pub mod discovery;
 pub mod duplicate;
 pub mod error;
+pub mod git;
 pub mod language;
 pub mod line;
 pub mod model;
@@ -30,16 +31,28 @@ where
             .write_all(cli::help_text().as_bytes())
             .map_err(|error| CodeM8Error::new(format!("could not write help output: {error}")))?,
         cli::CliCommand::ReportDuplicate(config) => {
+            let should_report_scanned_files = config.git_branch || config.files.is_some();
+            let git_branch_files = if config.git_branch {
+                Some(git::changed_files_against_origin(current_dir)?)
+            } else {
+                None
+            };
             let source_files = discovery::discover_source_files(
                 current_dir,
                 &config.file_extensions,
-                config.files.as_deref(),
+                git_branch_files.as_deref().or(config.files.as_deref()),
             )?;
             let processed_files = line::process_source_files(&source_files)?;
             let duplicate_blocks = duplicate::detect_duplicate_blocks(&processed_files);
             let report = report::DuplicateReport {
                 analyzed_files: source_files.len(),
                 analyzed_extensions: config.file_extensions,
+                scanned_files: should_report_scanned_files.then(|| {
+                    source_files
+                        .iter()
+                        .map(|source_file| source_file.display_path.clone())
+                        .collect()
+                }),
                 duplicate_blocks,
             };
             writer
@@ -122,7 +135,7 @@ mod tests {
                 "Duplicate Code Report\n",
                 "=====================\n",
                 "\n",
-                "Analyzed files: 2\n",
+                "Number of files scanned: 2\n",
                 "Analyzed extensions: ",
                 &expected_extensions,
                 "\n",
@@ -173,21 +186,42 @@ mod tests {
         project.write("src/b.ts", "const value = one;\n");
         let output =
             run_in(&project, &["--report-duplicate", "-files=src/a.ts"]).expect("report succeeds");
-        assert!(output.contains("Analyzed files: 1"));
+        assert!(output.contains("Number of files scanned: 1"));
         assert!(output.contains("Duplicate blocks found: 0"));
     }
 
+    #[test]
+    fn verbose_explicit_files_report_lists_scanned_files() {
+        let project = TempProject::new("verbose-explicit-files");
+        project.write("src/a.ts", "const value = one;\n");
+        project.write("src/b.ts", "const value = one;\n");
+        let quiet_output =
+            run_in(&project, &["--report-duplicate", "-files=src/a.ts"]).expect("report succeeds");
+        assert!(!quiet_output.contains("Files scanned:"));
+        let verbose_output = run_in(
+            &project,
+            &["--report-duplicate", "-verbose", "-files=src/a.ts"],
+        )
+        .expect("report succeeds");
+        assert!(verbose_output.contains(
+            "Number of files scanned: 1\n\
+             Files scanned:\n\
+             - src/a.ts\n\
+             Analyzed extensions:"
+        ));
+    }
+
     #[test]
     fn custom_extensions_change_analyzed_files() {
         let project = TempProject::new("custom-extensions");
         project.write("src/a.js", "const value = one;\n");
         project.write("src/b.js", "const value = one;\n");
         let default_output = run_in(&project, &["--report-duplicate"]).expect("report succeeds");
-        assert!(default_output.contains("Analyzed files: 2"));
+        assert!(default_output.contains("Number of files scanned: 2"));
         assert!(default_output.contains("Duplicate blocks found: 1"));
         let js_output = run_in(&project, &["--report-duplicate", "-file-extension=js"])
             .expect("report succeeds");
-        assert!(js_output.contains("Analyzed files: 2"));
+        assert!(js_output.contains("Number of files scanned: 2"));
         assert!(js_output.contains("Duplicate blocks found: 1"));
     }
 
diff --git a/src/report.rs b/src/report.rs
index 1d039e5..eb7c42d 100644
--- a/src/report.rs
+++ b/src/report.rs
@@ -1,4 +1,5 @@
 use std::fmt::Write as _;
+use std::path::PathBuf;
 
 use crate::model::DuplicateBlock;
 use crate::paths::format_path;
@@ -7,6 +8,7 @@ use crate::paths::format_path;
 pub struct DuplicateReport {
     pub analyzed_files: usize,
     pub analyzed_extensions: Vec<String>,
+    pub scanned_files: Option<Vec<PathBuf>>,
     pub duplicate_blocks: Vec<DuplicateBlock>,
 }
 
@@ -15,7 +17,18 @@ pub fn render_duplicate_report(report: &DuplicateReport, verbose: bool) -> Strin
     let mut output = String::new();
     output.push_str("Duplicate Code Report\n");
     output.push_str("=====================\n\n");
-    let _ = writeln!(output, "Analyzed files: {}", report.analyzed_files);
+    let _ = writeln!(output, "Number of files scanned: {}", report.analyzed_files);
+    let scanned_files = if verbose {
+        report.scanned_files.as_ref()
+    } else {
+        None
+    };
+    if let Some(scanned_files) = scanned_files {
+        output.push_str("Files scanned:\n");
+        for file in scanned_files {
+            let _ = writeln!(output, "- {}", format_path(file));
+        }
+    }
     let _ = writeln!(
         output,
         "Analyzed extensions: {}",
@@ -68,6 +81,7 @@ mod tests {
         let report = DuplicateReport {
             analyzed_files: 0,
             analyzed_extensions: vec!["ts".to_string()],
+            scanned_files: None,
             duplicate_blocks: Vec::new(),
         };
         assert_eq!(
@@ -75,7 +89,7 @@ mod tests {
             "Duplicate Code Report\n\
              =====================\n\
              \n\
-             Analyzed files: 0\n\
+             Number of files scanned: 0\n\
              Analyzed extensions: ts\n\
              Duplicate blocks found: 0\n"
         );
@@ -86,6 +100,7 @@ mod tests {
         let report = DuplicateReport {
             analyzed_files: 2,
             analyzed_extensions: vec!["ts".to_string(), "js".to_string()],
+            scanned_files: None,
             duplicate_blocks: vec![DuplicateBlock {
                 normalized_lines: vec!["return value;".to_string()],
                 occurrences: vec![
@@ -122,6 +137,7 @@ mod tests {
         let report = DuplicateReport {
             analyzed_files: 2,
             analyzed_extensions: vec!["ts".to_string()],
+            scanned_files: None,
             duplicate_blocks: vec![DuplicateBlock {
                 normalized_lines: vec!["return value;".to_string()],
                 occurrences: vec![
@@ -145,4 +161,27 @@ mod tests {
         assert!(output.contains("Occurrences: 2"));
         assert!(!output.contains("Characters:"));
     }
+
+    #[test]
+    fn renders_scanned_file_list_only_in_verbose_mode() {
+        let report = DuplicateReport {
+            analyzed_files: 2,
+            analyzed_extensions: vec!["ts".to_string()],
+            scanned_files: Some(vec![
+                PathBuf::from("src/a.ts"),
+                PathBuf::from("src/nested/b.ts"),
+            ]),
+            duplicate_blocks: Vec::new(),
+        };
+        let quiet_output = render_duplicate_report(&report, false);
+        assert!(!quiet_output.contains("Files scanned:"));
+        let verbose_output = render_duplicate_report(&report, true);
+        assert!(verbose_output.contains(
+            "Number of files scanned: 2\n\
+             Files scanned:\n\
+             - src/a.ts\n\
+             - src/nested/b.ts\n\
+             Analyzed extensions: ts"
+        ));
+    }
 }