From 32ece2e65a50322e8622bcedf6f0f777d22cee65 Mon Sep 17 00:00:00 2001 From: b4prog Date: Thu, 25 Jun 2026 12:37:03 +0200 Subject: [PATCH 01/22] [feat] add deterministic duplicate code report CLI --- .github/workflows/ci.yml | 32 +++ .github/workflows/coderabbit-review.yml | 75 ++++++ Cargo.lock | 16 ++ Cargo.toml | 11 + README.md | 117 ++++++++- src/cli.rs | 154 ++++++++++++ src/discovery.rs | 196 +++++++++++++++ src/duplicate.rs | 316 ++++++++++++++++++++++++ src/error.rs | 33 +++ src/language.rs | 192 ++++++++++++++ src/lib.rs | 167 +++++++++++++ src/line.rs | 88 +++++++ src/main.rs | 24 ++ src/model.rs | 56 +++++ src/paths.rs | 22 ++ src/report.rs | 103 ++++++++ 16 files changed, 1601 insertions(+), 1 deletion(-) create mode 100644 .github/workflows/ci.yml create mode 100644 .github/workflows/coderabbit-review.yml create mode 100644 Cargo.lock create mode 100644 Cargo.toml create mode 100644 src/cli.rs create mode 100644 src/discovery.rs create mode 100644 src/duplicate.rs create mode 100644 src/error.rs create mode 100644 src/language.rs create mode 100644 src/lib.rs create mode 100644 src/line.rs create mode 100644 src/main.rs create mode 100644 src/model.rs create mode 100644 src/paths.rs create mode 100644 src/report.rs diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000..f8eae35 --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,32 @@ +name: Rust CI + +on: + push: + branches: + - main + pull_request: + +permissions: + contents: read + +jobs: + rust: + name: Build, test, and format + runs-on: ubuntu-latest + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Install Rust toolchain + run: rustup toolchain install stable --profile minimal --component rustfmt + + - name: Check formatting + run: cargo fmt --all -- --check + + - name: Build + run: cargo build --locked --all-targets + + - name: Test + run: cargo test --locked --all-targets + diff --git 
a/.github/workflows/coderabbit-review.yml b/.github/workflows/coderabbit-review.yml new file mode 100644 index 0000000..4a61832 --- /dev/null +++ b/.github/workflows/coderabbit-review.yml @@ -0,0 +1,75 @@ +name: CodeRabbit Review Gate + +on: + pull_request_review: + types: + - submitted + - edited + - dismissed + +permissions: + contents: read + pull-requests: read + +jobs: + coderabbit-review: + name: Validate CodeRabbit review + if: github.event.pull_request.draft == false && github.event.review.user.login == 'coderabbitai[bot]' + runs-on: ubuntu-latest + + steps: + - name: Check CodeRabbit review state + env: + GITHUB_TOKEN: ${{ github.token }} + GITHUB_REPOSITORY: ${{ github.repository }} + PR_NUMBER: ${{ github.event.pull_request.number }} + PR_HEAD_SHA: ${{ github.event.pull_request.head.sha }} + run: | + node <<'NODE' + const token = process.env.GITHUB_TOKEN; + const [owner, repo] = process.env.GITHUB_REPOSITORY.split("/"); + const prNumber = process.env.PR_NUMBER; + const headSha = process.env.PR_HEAD_SHA; + + async function fetchReviews(page = 1, reviews = []) { + const url = `https://api.github.com/repos/${owner}/${repo}/pulls/${prNumber}/reviews?per_page=100&page=${page}`; + const response = await fetch(url, { + headers: { + Authorization: `Bearer ${token}`, + Accept: "application/vnd.github+json", + "X-GitHub-Api-Version": "2022-11-28", + }, + }); + + if (!response.ok) { + const body = await response.text(); + throw new Error(`GitHub review lookup failed: ${response.status} ${body}`); + } + + const pageReviews = await response.json(); + if (pageReviews.length === 0) { + return reviews; + } + return fetchReviews(page + 1, reviews.concat(pageReviews)); + } + + const reviews = await fetchReviews(); + const codeRabbitReviews = reviews + .filter((review) => review.user?.login === "coderabbitai[bot]") + .filter((review) => review.commit_id === headSha) + .sort((left, right) => new Date(left.submitted_at) - new Date(right.submitted_at)); + + const latestReview 
= codeRabbitReviews.at(-1); + if (!latestReview) { + console.error(`CodeRabbit has not submitted a review for ${headSha}.`); + process.exit(1); + } + + if (latestReview.state === "CHANGES_REQUESTED") { + console.error("CodeRabbit requested changes on this pull request."); + process.exit(1); + } + + console.log(`CodeRabbit review state for ${headSha}: ${latestReview.state}`); + NODE + diff --git a/Cargo.lock b/Cargo.lock new file mode 100644 index 0000000..2465298 --- /dev/null +++ b/Cargo.lock @@ -0,0 +1,16 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. +version = 4 + +[[package]] +name = "codem8" +version = "0.1.0" +dependencies = [ + "xxhash-rust", +] + +[[package]] +name = "xxhash-rust" +version = "0.8.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fdd20c5420375476fbd4394763288da7eb0cc0b8c11deed431a91562af7335d3" diff --git a/Cargo.toml b/Cargo.toml new file mode 100644 index 0000000..1d7336b --- /dev/null +++ b/Cargo.toml @@ -0,0 +1,11 @@ +[package] +name = "codem8" +version = "0.1.0" +edition = "2021" +license = "MIT" +description = "A deterministic source code analysis CLI for duplicate code reports." +repository = "https://github.com/b4prog/CodeM8" + +[dependencies] +xxhash-rust = { version = "0.8", features = ["xxh3"] } + diff --git a/README.md b/README.md index 93bf7bd..3642e93 100644 --- a/README.md +++ b/README.md @@ -1 +1,116 @@ -# CodeM8 \ No newline at end of file +# CodeM8 + +CodeM8 is a Rust command-line application for deterministic source code reports. +The initial report detects duplicated line-based code blocks in a repository: + +```bash +codem8 --report-duplicate +``` + +The duplicate report is designed for both human developers and coding agents. 
It +trims source lines, ignores empty lines, hashes normalized lines with XXH3 +128-bit, classifies syntax-only lines as block-only, groups repeated blocks, and +prints a stable plain-text report sorted by duplicate weight. + +## Installation + +Build from source with Cargo: + +```bash +cargo build --release +``` + +Run the local binary: + +```bash +cargo run -- --report-duplicate +``` + +## Usage + +Analyze TypeScript files from the current directory: + +```bash +codem8 --report-duplicate +``` + +Analyze multiple extensions (the `--file-extension=` spelling is accepted as well): + +```bash +codem8 --report-duplicate -file-extension=ts,tsx,js,jsx +``` + +Analyze an explicit list of files instead of recursively discovering files (`--files=` is accepted as well; listed files whose extension is not selected are skipped): + +```bash +codem8 --report-duplicate -file-extension=ts,js -files=src/a.ts,src/b.js +``` + +## Duplicate Report + +By default, CodeM8 analyzes `.ts` files. Recursive discovery skips common +irrelevant directories such as `.git`, `node_modules`, `target`, `dist`, +`build`, `coverage`, `.next`, `.nuxt`, `.svelte-kit`, `.idea`, and `.vscode`. +Symbolic links are not followed. + +Every non-empty line is normalized with Rust string trimming, so leading and +trailing Unicode whitespace is removed before hashing and comparison. Empty +trimmed lines are ignored. CodeM8 currently expects UTF-8 source files; invalid +UTF-8 produces a clear error rather than lossy output. + +Duplicate block weight is calculated as: + +```text +(occurrences - 1) * duplicated_line_count * cumulative_normalized_character_count +``` + +Reports are sorted deterministically by descending weight, then by descending line count +and character count, then by first location (path, then start line) and normalized block text. 
+ +## Language Heuristics + +CodeM8 includes a hard-coded registry of block-only line patterns for common +languages and markup formats: + +- TypeScript / JavaScript +- Rust +- C / C++ / Objective-C +- C# +- Java / Kotlin / Scala +- Go +- Python +- Ruby +- PHP +- Swift +- Shell +- PowerShell +- HTML / XML +- CSS / SCSS / Sass / Less +- SQL +- YAML / JSON / TOML + +Block-only lines, such as braces or closing tags, cannot start a duplicate by +themselves. They can still be included inside a larger duplicated block when +surrounding comparison lines match. + +## Development + +Run the full local verification set: + +```bash +cargo fmt --all -- --check +cargo build --all-targets +cargo test --all-targets +``` + +The repository includes GitHub Actions workflows for Rust CI and a CodeRabbit +review gate. CI verifies formatting, build success, and tests on pushes to `main` and on pull +requests. The CodeRabbit gate runs when a CodeRabbit review is submitted, edited, or dismissed on a non-draft pull +request, and fails if CodeRabbit has not reviewed the current PR head or if its latest review of that head requests changes. + +## Dependency Policy + +CodeM8 avoids external packages for functionality that is simple to implement +and maintain directly. The first implementation uses one runtime dependency, +`xxhash-rust`, for the required XXH3 128-bit hash implementation. The crate is +widely used and permissively licensed under MIT or Apache-2.0. 
diff --git a/src/cli.rs b/src/cli.rs new file mode 100644 index 0000000..934c274 --- /dev/null +++ b/src/cli.rs @@ -0,0 +1,154 @@ +use std::path::PathBuf; + +use crate::error::{CodeM8Error, Result}; + +const DEFAULT_FILE_EXTENSIONS: &[&str] = &["ts"]; + +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct CliConfig { + pub report_duplicate: bool, + pub file_extensions: Vec, + pub files: Option>, +} + +pub fn parse_args(args: I) -> Result +where + I: IntoIterator, + S: Into, +{ + let mut report_duplicate = false; + let mut file_extensions = None; + let mut files = None; + for arg in args { + let arg = arg.into(); + if arg == "--report-duplicate" { + report_duplicate = true; + } else if let Some(value) = arg + .strip_prefix("-file-extension=") + .or_else(|| arg.strip_prefix("--file-extension=")) + { + if file_extensions.is_some() { + return Err(CodeM8Error::new( + "file extensions were provided more than once", + )); + } + file_extensions = Some(parse_file_extensions(value)?); + } else if let Some(value) = arg + .strip_prefix("-files=") + .or_else(|| arg.strip_prefix("--files=")) + { + if files.is_some() { + return Err(CodeM8Error::new( + "explicit files were provided more than once", + )); + } + files = Some(parse_file_list(value)?); + } else { + return Err(CodeM8Error::new(format!("unknown argument: {arg}"))); + } + } + if !report_duplicate { + return Err(CodeM8Error::new( + "no report switch provided; pass --report-duplicate", + )); + } + Ok(CliConfig { + report_duplicate, + file_extensions: file_extensions.unwrap_or_else(|| { + DEFAULT_FILE_EXTENSIONS + .iter() + .map(|extension| extension.to_string()) + .collect() + }), + files, + }) +} + +pub fn parse_file_extensions(value: &str) -> Result> { + let mut extensions = Vec::new(); + for raw_extension in value.split(',') { + let extension = raw_extension.trim(); + if extension.is_empty() { + return Err(CodeM8Error::new("file extension values must not be empty")); + } + if extension.starts_with('.') { + return 
Err(CodeM8Error::new(format!( + "file extensions must not start with a dot: {extension}" + ))); + } + if extension.contains('/') || extension.contains('\\') { + return Err(CodeM8Error::new(format!( + "file extensions must not contain path separators: {extension}" + ))); + } + let extension = extension.to_ascii_lowercase(); + if !extensions.contains(&extension) { + extensions.push(extension); + } + } + if extensions.is_empty() { + return Err(CodeM8Error::new("at least one file extension is required")); + } + Ok(extensions) +} + +pub fn parse_file_list(value: &str) -> Result> { + let mut files = Vec::new(); + for raw_file in value.split(',') { + let file = raw_file.trim(); + if file.is_empty() { + return Err(CodeM8Error::new("file path values must not be empty")); + } + files.push(PathBuf::from(file)); + } + if files.is_empty() { + return Err(CodeM8Error::new("at least one explicit file is required")); + } + Ok(files) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn parses_default_duplicate_report_config() { + let config = parse_args(["--report-duplicate"]).expect("config parses"); + assert!(config.report_duplicate); + assert_eq!(config.file_extensions, ["ts"]); + assert_eq!(config.files, None); + } + + #[test] + fn parses_extensions_case_insensitively_and_trims_whitespace() { + let extensions = parse_file_extensions(" ts, JS ,tsx,ts ").expect("extensions parse"); + assert_eq!(extensions, ["ts", "js", "tsx"]); + } + + #[test] + fn rejects_empty_extensions() { + let error = parse_file_extensions("ts,,js").expect_err("empty extension fails"); + assert!(error.to_string().contains("must not be empty")); + } + + #[test] + fn rejects_extensions_with_leading_dot() { + let error = parse_file_extensions(".ts").expect_err("dot-prefixed extension fails"); + assert!(error.to_string().contains("must not start with a dot")); + } + + #[test] + fn rejects_missing_report_switch() { + let error = parse_args(["-file-extension=rs"]).expect_err("missing report fails"); + 
assert!(error.to_string().contains("no report switch provided")); + } + + #[test] + fn parses_explicit_file_list() { + let files = parse_file_list("src/a.ts, ./src/b.ts").expect("files parse"); + assert_eq!( + files, + [PathBuf::from("src/a.ts"), PathBuf::from("./src/b.ts")] + ); + } +} diff --git a/src/discovery.rs b/src/discovery.rs new file mode 100644 index 0000000..2ff774e --- /dev/null +++ b/src/discovery.rs @@ -0,0 +1,196 @@ +use std::fs; +use std::path::{Path, PathBuf}; + +use crate::error::{CodeM8Error, Result}; +use crate::model::SourceFile; +use crate::paths::{format_path, normalize_display_path}; + +const IGNORED_DIRECTORIES: &[&str] = &[ + ".git", + "node_modules", + "target", + "dist", + "build", + "coverage", + ".next", + ".nuxt", + ".svelte-kit", + ".idea", + ".vscode", +]; + +pub fn discover_source_files( + current_dir: &Path, + extensions: &[String], + explicit_files: Option<&[PathBuf]>, +) -> Result> { + let mut source_files = match explicit_files { + Some(files) => discover_explicit_files(current_dir, extensions, files)?, + None => { + let mut source_files = Vec::new(); + walk_directory(current_dir, current_dir, extensions, &mut source_files)?; + source_files + } + }; + source_files.sort_by(|left, right| { + format_path(&left.display_path).cmp(&format_path(&right.display_path)) + }); + Ok(source_files) +} + +fn discover_explicit_files( + current_dir: &Path, + extensions: &[String], + files: &[PathBuf], +) -> Result> { + let mut source_files = Vec::new(); + for file in files { + let absolute_input = file.is_absolute(); + let path = if absolute_input { + file.clone() + } else { + current_dir.join(file) + }; + let metadata = fs::symlink_metadata(&path).map_err(|_| { + CodeM8Error::new(format!( + "explicit file does not exist: {}", + format_path(file) + )) + })?; + if metadata.file_type().is_symlink() { + return Err(CodeM8Error::new(format!( + "explicit file is a symbolic link and will not be followed: {}", + format_path(file) + ))); + } + if 
metadata.is_dir() { + return Err(CodeM8Error::new(format!( + "explicit file is a directory: {}", + format_path(file) + ))); + } + if !metadata.is_file() { + return Err(CodeM8Error::new(format!( + "explicit path is not a file: {}", + format_path(file) + ))); + } + let Some(extension) = selected_extension(&path, extensions) else { + continue; + }; + source_files.push(SourceFile { + path, + display_path: normalize_display_path(file), + extension, + }); + } + Ok(source_files) +} + +fn walk_directory( + root: &Path, + directory: &Path, + extensions: &[String], + source_files: &mut Vec, +) -> Result<()> { + let mut entries = fs::read_dir(directory) + .map_err(|error| CodeM8Error::io(directory, "read directory", error))? + .collect::, _>>() + .map_err(|error| CodeM8Error::io(directory, "read directory entry", error))?; + entries.sort_by(|left, right| { + left.file_name() + .to_string_lossy() + .cmp(&right.file_name().to_string_lossy()) + }); + for entry in entries { + let path = entry.path(); + let file_type = entry + .file_type() + .map_err(|error| CodeM8Error::io(&path, "inspect path", error))?; + if file_type.is_symlink() { + continue; + } + if file_type.is_dir() { + let directory_name = entry.file_name().to_string_lossy().to_ascii_lowercase(); + if IGNORED_DIRECTORIES.contains(&directory_name.as_str()) { + continue; + } + walk_directory(root, &path, extensions, source_files)?; + } else if file_type.is_file() { + let Some(extension) = selected_extension(&path, extensions) else { + continue; + }; + let display_path = path + .strip_prefix(root) + .map(normalize_display_path) + .unwrap_or_else(|_| normalize_display_path(&path)); + source_files.push(SourceFile { + path, + display_path, + extension, + }); + } + } + Ok(()) +} + +fn selected_extension(path: &Path, extensions: &[String]) -> Option { + let extension = path.extension()?.to_str()?.to_ascii_lowercase(); + extensions + .iter() + .any(|selected| selected.eq_ignore_ascii_case(&extension)) + .then_some(extension) +} + 
+#[cfg(test)] +mod tests { + use std::fs; + use std::sync::atomic::{AtomicUsize, Ordering}; + + use super::*; + + static TEMP_COUNTER: AtomicUsize = AtomicUsize::new(0); + + fn temp_dir(name: &str) -> PathBuf { + let id = TEMP_COUNTER.fetch_add(1, Ordering::Relaxed); + let path = std::env::temp_dir().join(format!( + "codem8-discovery-{name}-{}-{id}", + std::process::id() + )); + if path.exists() { + fs::remove_dir_all(&path).expect("remove stale test directory"); + } + fs::create_dir_all(&path).expect("create test directory"); + path + } + + #[test] + fn recursively_discovers_matching_extensions_and_ignores_common_directories() { + let root = temp_dir("recursive"); + fs::create_dir_all(root.join("src")).expect("create src"); + fs::create_dir_all(root.join("target")).expect("create target"); + fs::write(root.join("src").join("a.TS"), "").expect("write ts"); + fs::write(root.join("src").join("b.js"), "").expect("write js"); + fs::write(root.join("target").join("ignored.ts"), "").expect("write ignored"); + let files = discover_source_files(&root, &["ts".to_string()], None).expect("discover"); + assert_eq!(files.len(), 1); + assert_eq!(format_path(&files[0].display_path), "src/a.TS"); + fs::remove_dir_all(root).expect("cleanup"); + } + + #[test] + fn explicit_files_skip_unselected_extensions() { + let root = temp_dir("explicit-skip"); + fs::write(root.join("a.ts"), "").expect("write ts"); + fs::write(root.join("b.js"), "").expect("write js"); + let files = discover_source_files( + &root, + &["ts".to_string()], + Some(&[PathBuf::from("a.ts"), PathBuf::from("b.js")]), + ) + .expect("discover"); + assert_eq!(files.len(), 1); + assert_eq!(format_path(&files[0].display_path), "a.ts"); + fs::remove_dir_all(root).expect("cleanup"); + } +} diff --git a/src/duplicate.rs b/src/duplicate.rs new file mode 100644 index 0000000..ef11e35 --- /dev/null +++ b/src/duplicate.rs @@ -0,0 +1,316 @@ +use std::cmp::Ordering; +use std::collections::{BTreeSet, HashMap}; +use std::path::PathBuf; 
+ +use crate::model::{DuplicateBlock, DuplicateOccurrence, LineEntry, LineStatus, ProcessedFile}; +use crate::paths::format_path; + +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] +struct LineRef { + file_index: usize, + line_index: usize, +} + +#[derive(Debug, Clone, PartialEq, Eq)] +struct OccurrenceKey { + file_path: PathBuf, + file_path_key: String, + start_line: usize, + end_line: usize, +} + +impl Ord for OccurrenceKey { + fn cmp(&self, other: &Self) -> Ordering { + self.file_path_key + .cmp(&other.file_path_key) + .then_with(|| self.start_line.cmp(&other.start_line)) + .then_with(|| self.end_line.cmp(&other.end_line)) + } +} + +impl PartialOrd for OccurrenceKey { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } +} + +pub fn detect_duplicate_blocks(files: &[ProcessedFile]) -> Vec { + let mut line_index: HashMap> = HashMap::new(); + for (file_index, file) in files.iter().enumerate() { + for (line_index_in_file, line) in file.lines.iter().enumerate() { + line_index.entry(line.hash).or_default().push(LineRef { + file_index, + line_index: line_index_in_file, + }); + } + } + let mut blocks_by_lines: HashMap, BTreeSet> = HashMap::new(); + for refs in line_index.values() { + if refs.len() < 2 { + continue; + } + let mut comparison_refs_by_text: HashMap> = HashMap::new(); + for line_ref in refs { + let line = line_at(files, *line_ref); + if line.status != LineStatus::Comparison { + continue; + } + comparison_refs_by_text + .entry(line.normalized_text.clone()) + .or_default() + .push(*line_ref); + } + for comparison_refs in comparison_refs_by_text.values() { + if comparison_refs.len() < 2 { + continue; + } + for left_index in 0..comparison_refs.len() { + for right_index in (left_index + 1)..comparison_refs.len() { + let left = comparison_refs[left_index]; + let right = comparison_refs[right_index]; + let Some(candidate) = expand_pair(files, left, right) else { + continue; + }; + let occurrences = blocks_by_lines + 
.entry(candidate.normalized_lines) + .or_default(); + occurrences.insert(candidate.left_occurrence); + occurrences.insert(candidate.right_occurrence); + } + } + } + } + let mut duplicate_blocks = blocks_by_lines + .into_iter() + .filter_map(|(normalized_lines, occurrences)| { + if normalized_lines.is_empty() || occurrences.len() < 2 { + return None; + } + let occurrences = occurrences + .into_iter() + .map(|occurrence| DuplicateOccurrence { + file_path: occurrence.file_path, + start_line: occurrence.start_line, + end_line: occurrence.end_line, + }) + .collect::>(); + let character_count = normalized_lines + .iter() + .map(|line| line.chars().count() as u64) + .sum::(); + let weight = + (occurrences.len() as u64 - 1) * normalized_lines.len() as u64 * character_count; + Some(DuplicateBlock { + normalized_lines, + occurrences, + weight, + }) + }) + .collect::>(); + duplicate_blocks.sort_by(compare_duplicate_blocks); + duplicate_blocks +} + +#[derive(Debug)] +struct CandidateBlock { + normalized_lines: Vec, + left_occurrence: OccurrenceKey, + right_occurrence: OccurrenceKey, +} + +fn expand_pair(files: &[ProcessedFile], left: LineRef, right: LineRef) -> Option { + if left == right { + return None; + } + let mut left_start = left.line_index; + let mut right_start = right.line_index; + while left_start > 0 + && right_start > 0 + && line_text(files, left.file_index, left_start - 1) + == line_text(files, right.file_index, right_start - 1) + { + left_start -= 1; + right_start -= 1; + } + let mut left_end = left.line_index; + let mut right_end = right.line_index; + while left_end + 1 < files[left.file_index].lines.len() + && right_end + 1 < files[right.file_index].lines.len() + && line_text(files, left.file_index, left_end + 1) + == line_text(files, right.file_index, right_end + 1) + { + left_end += 1; + right_end += 1; + } + let normalized_lines = files[left.file_index].lines[left_start..=left_end] + .iter() + .map(|line| line.normalized_text.clone()) + .collect::>(); + 
Some(CandidateBlock { + normalized_lines, + left_occurrence: occurrence_for(files, left.file_index, left_start, left_end), + right_occurrence: occurrence_for(files, right.file_index, right_start, right_end), + }) +} + +fn occurrence_for( + files: &[ProcessedFile], + file_index: usize, + start_index: usize, + end_index: usize, +) -> OccurrenceKey { + let lines = &files[file_index].lines; + let file_path = files[file_index].source.display_path.clone(); + OccurrenceKey { + file_path_key: format_path(&file_path), + file_path, + start_line: lines[start_index].line_number, + end_line: lines[end_index].line_number, + } +} + +fn line_at(files: &[ProcessedFile], line_ref: LineRef) -> &LineEntry { + &files[line_ref.file_index].lines[line_ref.line_index] +} + +fn line_text(files: &[ProcessedFile], file_index: usize, line_index: usize) -> &str { + &files[file_index].lines[line_index].normalized_text +} + +fn compare_duplicate_blocks(left: &DuplicateBlock, right: &DuplicateBlock) -> Ordering { + right + .weight + .cmp(&left.weight) + .then_with(|| right.line_count().cmp(&left.line_count())) + .then_with(|| right.character_count().cmp(&left.character_count())) + .then_with(|| first_occurrence_key(left).cmp(&first_occurrence_key(right))) + .then_with(|| first_occurrence_start_line(left).cmp(&first_occurrence_start_line(right))) + .then_with(|| normalized_block_text(left).cmp(&normalized_block_text(right))) +} + +fn first_occurrence_key(block: &DuplicateBlock) -> String { + block + .occurrences + .first() + .map(|occurrence| format_path(&occurrence.file_path)) + .unwrap_or_default() +} + +fn first_occurrence_start_line(block: &DuplicateBlock) -> usize { + block + .occurrences + .first() + .map(|occurrence| occurrence.start_line) + .unwrap_or_default() +} + +fn normalized_block_text(block: &DuplicateBlock) -> String { + block.normalized_lines.join("\n") +} + +#[cfg(test)] +mod tests { + use crate::language::hash_normalized_line; + use crate::model::{LineEntry, ProcessedFile, 
SourceFile}; + + use super::*; + + fn processed_file(path: &str, extension: &str, lines: &[(&str, LineStatus)]) -> ProcessedFile { + let line_entries = lines + .iter() + .enumerate() + .map(|(index, (text, status))| LineEntry { + file_path: PathBuf::from(path), + line_number: index + 1, + normalized_text: (*text).to_string(), + hash: hash_normalized_line(text), + status: *status, + }) + .collect(); + ProcessedFile { + source: SourceFile { + path: PathBuf::from(path), + display_path: PathBuf::from(path), + extension: extension.to_string(), + }, + lines: line_entries, + } + } + + #[test] + fn groups_three_occurrences_of_the_same_block() { + let files = vec![ + processed_file( + "a.ts", + "ts", + &[ + ("const value = one;", LineStatus::Comparison), + ("return value;", LineStatus::Comparison), + ], + ), + processed_file( + "b.ts", + "ts", + &[ + ("const value = one;", LineStatus::Comparison), + ("return value;", LineStatus::Comparison), + ], + ), + processed_file( + "c.ts", + "ts", + &[ + ("const value = one;", LineStatus::Comparison), + ("return value;", LineStatus::Comparison), + ], + ), + ]; + let blocks = detect_duplicate_blocks(&files); + assert_eq!(blocks.len(), 1); + assert_eq!(blocks[0].occurrences.len(), 3); + assert_eq!( + blocks[0].normalized_lines, + ["const value = one;", "return value;"] + ); + } + + #[test] + fn ignores_single_line_duplicates_that_are_only_block_only_lines() { + let files = vec![ + processed_file("a.ts", "ts", &[("}", LineStatus::BlockOnly)]), + processed_file("b.ts", "ts", &[("}", LineStatus::BlockOnly)]), + ]; + let blocks = detect_duplicate_blocks(&files); + assert!(blocks.is_empty()); + } + + #[test] + fn includes_block_only_lines_inside_larger_duplicate_blocks() { + let files = vec![ + processed_file( + "a.ts", + "ts", + &[ + ("if (ready) {", LineStatus::Comparison), + ("}", LineStatus::BlockOnly), + ("return value;", LineStatus::Comparison), + ], + ), + processed_file( + "b.ts", + "ts", + &[ + ("if (ready) {", 
LineStatus::Comparison), + ("}", LineStatus::BlockOnly), + ("return value;", LineStatus::Comparison), + ], + ), + ]; + let blocks = detect_duplicate_blocks(&files); + assert_eq!(blocks.len(), 1); + assert_eq!( + blocks[0].normalized_lines, + ["if (ready) {", "}", "return value;"] + ); + } +} diff --git a/src/error.rs b/src/error.rs new file mode 100644 index 0000000..fcb7545 --- /dev/null +++ b/src/error.rs @@ -0,0 +1,33 @@ +use std::error::Error; +use std::fmt; +use std::io; +use std::path::Path; + +use crate::paths::format_path; + +pub type Result = std::result::Result; + +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct CodeM8Error { + message: String, +} + +impl CodeM8Error { + pub fn new(message: impl Into) -> Self { + Self { + message: message.into(), + } + } + + pub fn io(path: &Path, action: &str, error: io::Error) -> Self { + Self::new(format!("could not {action} {}: {error}", format_path(path))) + } +} + +impl fmt::Display for CodeM8Error { + fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result { + formatter.write_str(&self.message) + } +} + +impl Error for CodeM8Error {} diff --git a/src/language.rs b/src/language.rs new file mode 100644 index 0000000..2dc3dc3 --- /dev/null +++ b/src/language.rs @@ -0,0 +1,192 @@ +use std::collections::HashMap; +use std::sync::OnceLock; + +use crate::model::LineStatus; + +#[derive(Debug, Clone, Copy)] +pub struct LanguageLinePattern { + pub language_name: &'static str, + pub extensions: &'static [&'static str], + pub block_only_lines: &'static [&'static str], +} + +pub const LANGUAGE_PATTERNS: &[LanguageLinePattern] = &[ + LanguageLinePattern { + language_name: "TypeScript / JavaScript", + extensions: &["ts", "tsx", "js", "jsx", "mjs", "cjs"], + block_only_lines: &[ + "(", ")", "{", "}", "[", "]", ");", "];", "};", ")};", "}),", "});", + ], + }, + LanguageLinePattern { + language_name: "Rust", + extensions: &["rs"], + block_only_lines: &["{", "}", "(", ")", "[", "]", ");", "];"], + }, + LanguageLinePattern 
{ + language_name: "C / C++ / Objective-C", + extensions: &["c", "h", "cpp", "hpp", "cc", "hh", "cxx", "hxx", "m", "mm"], + block_only_lines: &[ + "{", "}", "(", ")", "[", "]", ");", "];", "};", "#endif", "#else", + ], + }, + LanguageLinePattern { + language_name: "C#", + extensions: &["cs"], + block_only_lines: &[ + "{", + "}", + "(", + ")", + "[", + "]", + ");", + "];", + "};", + "#endregion", + "#else", + "#endif", + ], + }, + LanguageLinePattern { + language_name: "Java / Kotlin / Scala", + extensions: &["java", "kt", "kts", "scala", "sc"], + block_only_lines: &["{", "}", "(", ")", "[", "]", ");", "];", "};"], + }, + LanguageLinePattern { + language_name: "Go", + extensions: &["go"], + block_only_lines: &["{", "}", "(", ")", "[", "]"], + }, + LanguageLinePattern { + language_name: "Python", + extensions: &["py", "pyw"], + block_only_lines: &["(", ")", "[", "]", "{", "}"], + }, + LanguageLinePattern { + language_name: "Ruby", + extensions: &["rb"], + block_only_lines: &["(", ")", "[", "]", "{", "}", "end"], + }, + LanguageLinePattern { + language_name: "PHP", + extensions: &["php", "phtml"], + block_only_lines: &["{", "}", "(", ")", "[", "]", ");", "];", "};", "?>"], + }, + LanguageLinePattern { + language_name: "Swift", + extensions: &["swift"], + block_only_lines: &["{", "}", "(", ")", "[", "]", ");", "];"], + }, + LanguageLinePattern { + language_name: "Shell", + extensions: &["sh", "bash", "zsh", "fish"], + block_only_lines: &["then", "do", "done", "fi", "else", "{", "}"], + }, + LanguageLinePattern { + language_name: "PowerShell", + extensions: &["ps1", "psm1", "psd1"], + block_only_lines: &["{", "}", "(", ")", "[", "]", ");"], + }, + LanguageLinePattern { + language_name: "HTML / XML", + extensions: &["html", "htm", "xml", "xhtml", "svg"], + block_only_lines: &[ + ">", + "/>", + "", + "", + "", + "", + "", + "", + ], + }, + LanguageLinePattern { + language_name: "CSS / SCSS / Sass / Less", + extensions: &["css", "scss", "sass", "less"], + block_only_lines: 
&["{", "}", ");"], + }, + LanguageLinePattern { + language_name: "SQL", + extensions: &["sql"], + block_only_lines: &["(", ")", ");", ";", "BEGIN", "END"], + }, + LanguageLinePattern { + language_name: "YAML / JSON / TOML", + extensions: &["yaml", "yml", "json", "toml"], + block_only_lines: &["{", "}", "[", "]", "},", "],"], + }, +]; + +#[derive(Debug)] +struct BlockOnlyRegistry { + by_extension: HashMap<&'static str, HashMap>>, +} + +static BLOCK_ONLY_REGISTRY: OnceLock = OnceLock::new(); + +pub fn hash_normalized_line(line: &str) -> u128 { + xxhash_rust::xxh3::xxh3_128(line.as_bytes()) +} + +pub fn classify_line(extension: &str, normalized_line: &str, hash: u128) -> LineStatus { + let extension = extension.to_ascii_lowercase(); + let Some(patterns_by_hash) = registry().by_extension.get(extension.as_str()) else { + return LineStatus::Comparison; + }; + let Some(patterns) = patterns_by_hash.get(&hash) else { + return LineStatus::Comparison; + }; + if patterns.contains(&normalized_line) { + LineStatus::BlockOnly + } else { + LineStatus::Comparison + } +} + +fn registry() -> &'static BlockOnlyRegistry { + BLOCK_ONLY_REGISTRY.get_or_init(|| { + let mut by_extension: HashMap<&'static str, HashMap>> = + HashMap::new(); + for language in LANGUAGE_PATTERNS { + for extension in language.extensions { + let patterns_by_hash = by_extension.entry(extension).or_default(); + for line in language.block_only_lines { + patterns_by_hash + .entry(hash_normalized_line(line)) + .or_default() + .push(line); + } + } + } + BlockOnlyRegistry { by_extension } + }) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn assigns_block_only_status_from_extension_specific_registry() { + let hash = hash_normalized_line("}"); + assert_eq!(classify_line("ts", "}", hash), LineStatus::BlockOnly); + } + + #[test] + fn assigns_comparison_status_for_meaningful_lines() { + let line = "const value = computeValue(input);"; + let hash = hash_normalized_line(line); + assert_eq!(classify_line("ts", 
line, hash), LineStatus::Comparison); + } + + #[test] + fn verifies_text_after_hash_lookup() { + let hash = hash_normalized_line("}"); + assert_eq!( + classify_line("ts", "not-a-brace", hash), + LineStatus::Comparison + ); + } +} diff --git a/src/lib.rs b/src/lib.rs new file mode 100644 index 0000000..805bc69 --- /dev/null +++ b/src/lib.rs @@ -0,0 +1,167 @@ +pub mod cli; +pub mod discovery; +pub mod duplicate; +pub mod error; +pub mod language; +pub mod line; +pub mod model; +pub mod paths; +pub mod report; + +use std::io::Write; +use std::path::Path; + +use crate::error::{CodeM8Error, Result}; + +pub fn run(args: I, current_dir: &Path, writer: &mut W) -> Result<()> +where + I: IntoIterator, + S: Into, + W: Write, +{ + let config = cli::parse_args(args)?; + if config.report_duplicate { + let source_files = discovery::discover_source_files( + current_dir, + &config.file_extensions, + config.files.as_deref(), + )?; + let processed_files = line::process_source_files(&source_files)?; + let duplicate_blocks = duplicate::detect_duplicate_blocks(&processed_files); + let report = report::DuplicateReport { + analyzed_files: source_files.len(), + analyzed_extensions: config.file_extensions, + duplicate_blocks, + }; + writer + .write_all(report::render_duplicate_report(&report).as_bytes()) + .map_err(|error| CodeM8Error::new(format!("could not write report output: {error}")))?; + } + Ok(()) +} + +#[cfg(test)] +mod tests { + use std::fs; + use std::path::{Path, PathBuf}; + use std::sync::atomic::{AtomicUsize, Ordering}; + + use super::*; + + static TEMP_COUNTER: AtomicUsize = AtomicUsize::new(0); + + struct TempProject { + path: PathBuf, + } + + impl TempProject { + fn new(name: &str) -> Self { + let id = TEMP_COUNTER.fetch_add(1, Ordering::Relaxed); + let path = + std::env::temp_dir().join(format!("codem8-{name}-{}-{id}", std::process::id())); + if path.exists() { + fs::remove_dir_all(&path).expect("remove stale test directory"); + } + fs::create_dir_all(&path).expect("create 
test directory"); + Self { path } + } + + fn write(&self, relative_path: &str, contents: &str) { + let path = self.path.join(relative_path); + if let Some(parent) = path.parent() { + fs::create_dir_all(parent).expect("create test parent directory"); + } + fs::write(path, contents).expect("write test file"); + } + + fn path(&self) -> &Path { + &self.path + } + } + + impl Drop for TempProject { + fn drop(&mut self) { + let _ = fs::remove_dir_all(&self.path); + } + } + + fn run_in(project: &TempProject, args: &[&str]) -> std::result::Result { + let mut output = Vec::new(); + run(args.iter().copied(), project.path(), &mut output)?; + Ok(String::from_utf8(output).expect("report is UTF-8")) + } + + #[test] + fn duplicate_report_snapshot_is_stable() { + let project = TempProject::new("snapshot"); + project.write( + "src/a.ts", + "const value = computeValue(input);\nif (value === undefined) {\nreturn defaultValue;\n}\n", + ); + project.write( + "src/b.ts", + "const value = computeValue(input);\nif (value === undefined) {\nreturn defaultValue;\n}\n", + ); + let output = run_in(&project, &["--report-duplicate"]).expect("report succeeds"); + assert_eq!( + output, + concat!( + "Duplicate Code Report\n", + "=====================\n", + "\n", + "Analyzed files: 2\n", + "Analyzed extensions: ts\n", + "Duplicate blocks found: 1\n", + "\n", + "#1 Weight: 324\n", + "Lines: 4\n", + "Characters: 81\n", + "Occurrences: 2\n", + "\n", + "Locations:\n", + "- src/a.ts:1-4\n", + "- src/b.ts:1-4\n", + "\n", + "Code:\n", + " const value = computeValue(input);\n", + " if (value === undefined) {\n", + " return defaultValue;\n", + " }\n", + ) + ); + } + + #[test] + fn explicit_files_disable_recursive_discovery() { + let project = TempProject::new("explicit-files"); + project.write("src/a.ts", "const value = one;\n"); + project.write("src/b.ts", "const value = one;\n"); + let output = + run_in(&project, &["--report-duplicate", "-files=src/a.ts"]).expect("report succeeds"); + 
assert!(output.contains("Analyzed files: 1")); + assert!(output.contains("Duplicate blocks found: 0")); + } + + #[test] + fn custom_extensions_change_analyzed_files() { + let project = TempProject::new("custom-extensions"); + project.write("src/a.js", "const value = one;\n"); + project.write("src/b.js", "const value = one;\n"); + let default_output = run_in(&project, &["--report-duplicate"]).expect("report succeeds"); + assert!(default_output.contains("Analyzed files: 0")); + let js_output = run_in(&project, &["--report-duplicate", "-file-extension=js"]) + .expect("report succeeds"); + assert!(js_output.contains("Analyzed files: 2")); + assert!(js_output.contains("Duplicate blocks found: 1")); + } + + #[test] + fn invalid_explicit_file_returns_a_clear_error() { + let project = TempProject::new("invalid-file"); + let error = run_in(&project, &["--report-duplicate", "-files=missing.ts"]) + .expect_err("missing explicit file fails"); + assert!(error + .to_string() + .contains("explicit file does not exist: missing.ts")); + } +} diff --git a/src/line.rs b/src/line.rs new file mode 100644 index 0000000..894faaf --- /dev/null +++ b/src/line.rs @@ -0,0 +1,88 @@ +use std::fs::File; +use std::io::{BufRead, BufReader}; + +use crate::error::{CodeM8Error, Result}; +use crate::language::{classify_line, hash_normalized_line}; +use crate::model::{LineEntry, ProcessedFile, SourceFile}; + +pub fn process_source_files(source_files: &[SourceFile]) -> Result> { + source_files.iter().map(process_source_file).collect() +} + +pub fn process_source_file(source_file: &SourceFile) -> Result { + let file = File::open(&source_file.path) + .map_err(|error| CodeM8Error::io(&source_file.display_path, "open file", error))?; + let reader = BufReader::new(file); + let mut lines = Vec::new(); + for (index, line) in reader.lines().enumerate() { + let line = line.map_err(|error| { + CodeM8Error::new(format!( + "could not read {} as UTF-8 text: {error}", + 
crate::paths::format_path(&source_file.display_path) + )) + })?; + let Some(normalized_text) = normalize_line(&line) else { + continue; + }; + let hash = hash_normalized_line(&normalized_text); + let status = classify_line(&source_file.extension, &normalized_text, hash); + lines.push(LineEntry { + file_path: source_file.display_path.clone(), + line_number: index + 1, + normalized_text, + hash, + status, + }); + } + Ok(ProcessedFile { + source: source_file.clone(), + lines, + }) +} + +pub fn normalize_line(line: &str) -> Option { + let normalized = line.trim(); + if normalized.is_empty() { + None + } else { + Some(normalized.to_string()) + } +} + +#[cfg(test)] +mod tests { + use std::fs; + + use crate::model::LineStatus; + + use super::*; + + #[test] + fn trims_unicode_whitespace_and_skips_empty_lines() { + assert_eq!( + normalize_line("\t value \u{2003}"), + Some("value".to_string()) + ); + assert_eq!(normalize_line(" \t "), None); + } + + #[test] + fn processes_non_empty_lines_with_original_line_numbers() { + let path = std::env::temp_dir().join(format!("codem8-line-test-{}.ts", std::process::id())); + fs::write(&path, " const value = 1; \n\n }\n").expect("write source file"); + let source = SourceFile { + path: path.clone(), + display_path: "sample.ts".into(), + extension: "ts".to_string(), + }; + let processed = process_source_file(&source).expect("process source file"); + assert_eq!(processed.lines.len(), 2); + assert_eq!(processed.lines[0].line_number, 1); + assert_eq!(processed.lines[0].normalized_text, "const value = 1;"); + assert_eq!(processed.lines[0].status, LineStatus::Comparison); + assert_eq!(processed.lines[1].line_number, 3); + assert_eq!(processed.lines[1].normalized_text, "}"); + assert_eq!(processed.lines[1].status, LineStatus::BlockOnly); + fs::remove_file(path).expect("cleanup"); + } +} diff --git a/src/main.rs b/src/main.rs new file mode 100644 index 0000000..77dbbac --- /dev/null +++ b/src/main.rs @@ -0,0 +1,24 @@ +use std::io::Write; +use 
std::process::ExitCode; + +fn main() -> ExitCode { + let current_dir = match std::env::current_dir() { + Ok(current_dir) => current_dir, + Err(error) => { + eprintln!("error: could not determine current directory: {error}"); + return ExitCode::FAILURE; + } + }; + let stdout = std::io::stdout(); + let mut stdout = stdout.lock(); + match codem8::run(std::env::args().skip(1), ¤t_dir, &mut stdout) { + Ok(()) => { + let _ = stdout.flush(); + ExitCode::SUCCESS + } + Err(error) => { + eprintln!("error: {error}"); + ExitCode::FAILURE + } + } +} diff --git a/src/model.rs b/src/model.rs new file mode 100644 index 0000000..e7a1248 --- /dev/null +++ b/src/model.rs @@ -0,0 +1,56 @@ +use std::path::PathBuf; + +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum LineStatus { + Comparison, + BlockOnly, +} + +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct SourceFile { + pub path: PathBuf, + pub display_path: PathBuf, + pub extension: String, +} + +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct LineEntry { + pub file_path: PathBuf, + pub line_number: usize, + pub normalized_text: String, + pub hash: u128, + pub status: LineStatus, +} + +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct ProcessedFile { + pub source: SourceFile, + pub lines: Vec, +} + +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct DuplicateOccurrence { + pub file_path: PathBuf, + pub start_line: usize, + pub end_line: usize, +} + +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct DuplicateBlock { + pub normalized_lines: Vec, + pub occurrences: Vec, + pub weight: u64, +} + +impl DuplicateBlock { + pub fn line_count(&self) -> usize { + self.normalized_lines.len() + } + + pub fn character_count(&self) -> u64 { + self.normalized_lines + .iter() + .map(|line| line.chars().count() as u64) + .sum() + } +} diff --git a/src/paths.rs b/src/paths.rs new file mode 100644 index 0000000..56e1812 --- /dev/null +++ b/src/paths.rs @@ -0,0 +1,22 @@ +use std::path::{Component, Path, PathBuf}; + +pub fn 
format_path(path: &Path) -> String { + path.to_string_lossy().replace('\\', "/") +} + +pub fn normalize_display_path(path: &Path) -> PathBuf { + let mut normalized = PathBuf::new(); + for component in path.components() { + match component { + Component::CurDir => {} + Component::Normal(part) => normalized.push(part), + Component::ParentDir => normalized.push(".."), + Component::RootDir | Component::Prefix(_) => normalized.push(component.as_os_str()), + } + } + if normalized.as_os_str().is_empty() { + PathBuf::from(".") + } else { + normalized + } +} diff --git a/src/report.rs b/src/report.rs new file mode 100644 index 0000000..f6207c3 --- /dev/null +++ b/src/report.rs @@ -0,0 +1,103 @@ +use crate::model::DuplicateBlock; +use crate::paths::format_path; + +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct DuplicateReport { + pub analyzed_files: usize, + pub analyzed_extensions: Vec, + pub duplicate_blocks: Vec, +} + +pub fn render_duplicate_report(report: &DuplicateReport) -> String { + let mut output = String::new(); + output.push_str("Duplicate Code Report\n"); + output.push_str("=====================\n\n"); + output.push_str(&format!("Analyzed files: {}\n", report.analyzed_files)); + output.push_str(&format!( + "Analyzed extensions: {}\n", + report.analyzed_extensions.join(", ") + )); + output.push_str(&format!( + "Duplicate blocks found: {}\n", + report.duplicate_blocks.len() + )); + for (index, block) in report.duplicate_blocks.iter().enumerate() { + output.push('\n'); + output.push_str(&format!("#{} Weight: {}\n", index + 1, block.weight)); + output.push_str(&format!("Lines: {}\n", block.line_count())); + output.push_str(&format!("Characters: {}\n", block.character_count())); + output.push_str(&format!("Occurrences: {}\n\n", block.occurrences.len())); + output.push_str("Locations:\n"); + for occurrence in &block.occurrences { + output.push_str(&format!( + "- {}:{}-{}\n", + format_path(&occurrence.file_path), + occurrence.start_line, + occurrence.end_line + )); 
+ } + output.push_str("\nCode:\n"); + for line in &block.normalized_lines { + output.push_str(" "); + output.push_str(line); + output.push('\n'); + } + } + output +} + +#[cfg(test)] +mod tests { + use std::path::PathBuf; + + use crate::model::{DuplicateBlock, DuplicateOccurrence}; + + use super::*; + + #[test] + fn renders_empty_report() { + let report = DuplicateReport { + analyzed_files: 0, + analyzed_extensions: vec!["ts".to_string()], + duplicate_blocks: Vec::new(), + }; + assert_eq!( + render_duplicate_report(&report), + "Duplicate Code Report\n\ + =====================\n\ + \n\ + Analyzed files: 0\n\ + Analyzed extensions: ts\n\ + Duplicate blocks found: 0\n" + ); + } + + #[test] + fn renders_duplicate_block_details() { + let report = DuplicateReport { + analyzed_files: 2, + analyzed_extensions: vec!["ts".to_string(), "js".to_string()], + duplicate_blocks: vec![DuplicateBlock { + normalized_lines: vec!["return value;".to_string()], + occurrences: vec![ + DuplicateOccurrence { + file_path: PathBuf::from("src/a.ts"), + start_line: 1, + end_line: 1, + }, + DuplicateOccurrence { + file_path: PathBuf::from("src/b.js"), + start_line: 5, + end_line: 5, + }, + ], + weight: 13, + }], + }; + let output = render_duplicate_report(&report); + assert!(output.contains("#1 Weight: 13")); + assert!(output.contains("Lines: 1")); + assert!(output.contains("- src/a.ts:1-1")); + assert!(output.contains(" return value;")); + } +} From a4e1b147cd749a3f408428879a5e2f22a4850b2d Mon Sep 17 00:00:00 2001 From: b4prog Date: Thu, 25 Jun 2026 12:54:07 +0200 Subject: [PATCH 02/22] [ci] wrap CodeRabbit workflow script in async IIFE --- .github/workflows/coderabbit-review.yml | 36 ++++++++++++++----------- 1 file changed, 20 insertions(+), 16 deletions(-) diff --git a/.github/workflows/coderabbit-review.yml b/.github/workflows/coderabbit-review.yml index 4a61832..5cfc373 100644 --- a/.github/workflows/coderabbit-review.yml +++ b/.github/workflows/coderabbit-review.yml @@ -53,23 +53,27 @@ 
jobs: return fetchReviews(page + 1, reviews.concat(pageReviews)); } - const reviews = await fetchReviews(); - const codeRabbitReviews = reviews - .filter((review) => review.user?.login === "coderabbitai[bot]") - .filter((review) => review.commit_id === headSha) - .sort((left, right) => new Date(left.submitted_at) - new Date(right.submitted_at)); + (async () => { + const reviews = await fetchReviews(); + const codeRabbitReviews = reviews + .filter((review) => review.user?.login === "coderabbitai[bot]") + .filter((review) => review.commit_id === headSha) + .sort((left, right) => new Date(left.submitted_at) - new Date(right.submitted_at)); - const latestReview = codeRabbitReviews.at(-1); - if (!latestReview) { - console.error(`CodeRabbit has not submitted a review for ${headSha}.`); - process.exit(1); - } + const latestReview = codeRabbitReviews.at(-1); + if (!latestReview) { + console.error(`CodeRabbit has not submitted a review for ${headSha}.`); + process.exit(1); + } - if (latestReview.state === "CHANGES_REQUESTED") { - console.error("CodeRabbit requested changes on this pull request."); - process.exit(1); - } + if (latestReview.state === "CHANGES_REQUESTED") { + console.error("CodeRabbit requested changes on this pull request."); + process.exit(1); + } - console.log(`CodeRabbit review state for ${headSha}: ${latestReview.state}`); + console.log(`CodeRabbit review state for ${headSha}: ${latestReview.state}`); + })().catch((error) => { + console.error(error); + process.exit(1); + }); NODE - From cb9c7e8497cbb21e604371d5c31949b017f2f8a9 Mon Sep 17 00:00:00 2001 From: b4prog Date: Thu, 25 Jun 2026 12:59:38 +0200 Subject: [PATCH 03/22] [fix] deduplicate resolved explicit source files --- src/discovery.rs | 30 +++++++++++++++++++++++++++++- 1 file changed, 29 insertions(+), 1 deletion(-) diff --git a/src/discovery.rs b/src/discovery.rs index 2ff774e..1244180 100644 --- a/src/discovery.rs +++ b/src/discovery.rs @@ -1,3 +1,4 @@ +use std::collections::HashSet; use 
std::fs; use std::path::{Path, PathBuf}; @@ -44,6 +45,7 @@ fn discover_explicit_files( files: &[PathBuf], ) -> Result> { let mut source_files = Vec::new(); + let mut seen_paths = HashSet::new(); for file in files { let absolute_input = file.is_absolute(); let path = if absolute_input { @@ -78,8 +80,13 @@ fn discover_explicit_files( let Some(extension) = selected_extension(&path, extensions) else { continue; }; + let canonical_path = fs::canonicalize(&path) + .map_err(|error| CodeM8Error::io(&path, "canonicalize explicit file", error))?; + if !seen_paths.insert(canonical_path.clone()) { + continue; + } source_files.push(SourceFile { - path, + path: canonical_path, display_path: normalize_display_path(file), extension, }); @@ -193,4 +200,25 @@ mod tests { assert_eq!(format_path(&files[0].display_path), "a.ts"); fs::remove_dir_all(root).expect("cleanup"); } + + #[test] + fn explicit_files_deduplicate_resolved_paths() { + let root = temp_dir("explicit-dedup"); + fs::write(root.join("a.ts"), "").expect("write ts"); + let absolute = fs::canonicalize(root.join("a.ts")).expect("canonicalize ts"); + let files = discover_source_files( + &root, + &["ts".to_string()], + Some(&[ + PathBuf::from("a.ts"), + PathBuf::from(".").join("a.ts"), + absolute.clone(), + ]), + ) + .expect("discover"); + assert_eq!(files.len(), 1); + assert_eq!(files[0].path, absolute); + assert_eq!(format_path(&files[0].display_path), "a.ts"); + fs::remove_dir_all(root).expect("cleanup"); + } } From d9982116553426cadb8a5635b55e6a1cda240ad3 Mon Sep 17 00:00:00 2001 From: b4prog Date: Thu, 25 Jun 2026 13:16:12 +0200 Subject: [PATCH 04/22] [fix] reject overlapping duplicate ranges in the same file --- src/duplicate.rs | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/src/duplicate.rs b/src/duplicate.rs index ef11e35..aadeaaa 100644 --- a/src/duplicate.rs +++ b/src/duplicate.rs @@ -142,6 +142,9 @@ fn expand_pair(files: &[ProcessedFile], left: LineRef, right: LineRef) -> Option 
left_end += 1; right_end += 1; } + if left.file_index == right.file_index && left_start <= right_end && right_start <= left_end { + return None; + } let normalized_lines = files[left.file_index].lines[left_start..=left_end] .iter() .map(|line| line.normalized_text.clone()) @@ -313,4 +316,29 @@ mod tests { ["if (ready) {", "}", "return value;"] ); } + + #[test] + fn rejects_overlapping_duplicate_ranges_in_the_same_file() { + let files = vec![processed_file( + "a.ts", + "ts", + &[ + ("const value = one;", LineStatus::Comparison), + ("const value = one;", LineStatus::Comparison), + ("const value = one;", LineStatus::Comparison), + ], + )]; + let blocks = detect_duplicate_blocks(&files); + assert!(!blocks.iter().any(|block| { + block.normalized_lines == ["const value = one;", "const value = one;"] + && block + .occurrences + .iter() + .any(|occurrence| occurrence.start_line == 1) + && block + .occurrences + .iter() + .any(|occurrence| occurrence.start_line == 2) + })); + } } From d886d25fe52b5356a7685a24a6baa4e53cddbc40 Mon Sep 17 00:00:00 2001 From: b4prog Date: Thu, 25 Jun 2026 13:53:04 +0200 Subject: [PATCH 05/22] [docs] document cargo installation from GitHub and local source --- README.md | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 3642e93..0dd54e7 100644 --- a/README.md +++ b/README.md @@ -14,13 +14,25 @@ prints a stable plain-text report sorted by duplicate weight. ## Installation -Build from source with Cargo: +Install `codem8` from the GitHub source with Cargo: + +```bash +cargo install --git https://github.com/b4prog/CodeM8 codem8 +``` + +Build from a local checkout with Cargo: ```bash cargo build --release ``` -Run the local binary: +Install from a local checkout: + +```bash +cargo install --path . 
+``` + +Run from the local checkout without installing: ```bash cargo run -- --report-duplicate From a61844bdeb643513d4a31aa50bc19c3fceb78872 Mon Sep 17 00:00:00 2001 From: b4prog Date: Thu, 25 Jun 2026 14:13:57 +0200 Subject: [PATCH 06/22] [chore] add clippy lint threshold configuration --- clippy.toml | 5 +++++ 1 file changed, 5 insertions(+) create mode 100644 clippy.toml diff --git a/clippy.toml b/clippy.toml new file mode 100644 index 0000000..770b51a --- /dev/null +++ b/clippy.toml @@ -0,0 +1,5 @@ +too-many-lines-threshold = 80 +too-many-arguments-threshold = 5 +type-complexity-threshold = 200 +excessive-nesting-threshold = 4 +cognitive-complexity-threshold = 20 From cbe018a3102f5ddfdba4351a04010fbee9899471 Mon Sep 17 00:00:00 2001 From: b4prog Date: Thu, 25 Jun 2026 14:19:03 +0200 Subject: [PATCH 07/22] [docs] document agent verification requirements and local checks --- AGENTS.md | 19 +++++++++++++++++++ README.md | 3 ++- 2 files changed, 21 insertions(+), 1 deletion(-) create mode 100644 AGENTS.md diff --git a/AGENTS.md b/AGENTS.md new file mode 100644 index 0000000..c8ae01f --- /dev/null +++ b/AGENTS.md @@ -0,0 +1,19 @@ +# Agent Instructions + +These instructions apply to code agents working in this repository, including Codex. + +## Before finishing a change + +Run the repository verification commands from the workspace root and fix any issues before handing work back: + +```bash +cargo fmt --all -- --check +cargo clippy --workspace --all-targets --all-features -- -D warnings -W clippy::too_many_lines -W clippy::too_many_arguments -W clippy::type_complexity -W clippy::excessive_nesting -W clippy::cognitive_complexity +rtk cargo build --locked --all-targets +``` + +## Notes + +- Treat Clippy warnings as errors for generated or edited code. +- Prefer changes that satisfy the repository `clippy.toml` configuration without adding `#[allow(...)]` attributes unless a maintainer explicitly asks for them. 
+- If a command cannot be run in the current environment, call that out clearly in the handoff. diff --git a/README.md b/README.md index 0dd54e7..ce3d08a 100644 --- a/README.md +++ b/README.md @@ -111,7 +111,8 @@ Run the full local verification set: ```bash cargo fmt --all -- --check -cargo build --all-targets +cargo clippy --workspace --all-targets --all-features -- -D warnings -W clippy::too_many_lines -W clippy::too_many_arguments -W clippy::type_complexity -W clippy::excessive_nesting -W clippy::cognitive_complexity +rtk cargo build --locked --all-targets cargo test --all-targets ``` From 945d07c481fdf250b8d0f67ef0edb3f1f84e8897 Mon Sep 17 00:00:00 2001 From: b4prog Date: Thu, 25 Jun 2026 14:25:04 +0200 Subject: [PATCH 08/22] [refactor] reduce nesting in duplicate and language helpers --- src/duplicate.rs | 42 ++++++++++++++++++++++++++++-------------- src/language.rs | 23 ++++++++++++++++------- 2 files changed, 44 insertions(+), 21 deletions(-) diff --git a/src/duplicate.rs b/src/duplicate.rs index aadeaaa..7b80a04 100644 --- a/src/duplicate.rs +++ b/src/duplicate.rs @@ -64,20 +64,7 @@ pub fn detect_duplicate_blocks(files: &[ProcessedFile]) -> Vec { if comparison_refs.len() < 2 { continue; } - for left_index in 0..comparison_refs.len() { - for right_index in (left_index + 1)..comparison_refs.len() { - let left = comparison_refs[left_index]; - let right = comparison_refs[right_index]; - let Some(candidate) = expand_pair(files, left, right) else { - continue; - }; - let occurrences = blocks_by_lines - .entry(candidate.normalized_lines) - .or_default(); - occurrences.insert(candidate.left_occurrence); - occurrences.insert(candidate.right_occurrence); - } - } + collect_candidates(files, comparison_refs, &mut blocks_by_lines); } } let mut duplicate_blocks = blocks_by_lines @@ -118,6 +105,33 @@ struct CandidateBlock { right_occurrence: OccurrenceKey, } +fn collect_candidates( + files: &[ProcessedFile], + comparison_refs: &[LineRef], + blocks_by_lines: &mut HashMap, 
BTreeSet>, +) { + for left_index in 0..comparison_refs.len() { + let left = comparison_refs[left_index]; + for &right in &comparison_refs[(left_index + 1)..] { + let Some(candidate) = expand_pair(files, left, right) else { + continue; + }; + store_candidate(candidate, blocks_by_lines); + } + } +} + +fn store_candidate( + candidate: CandidateBlock, + blocks_by_lines: &mut HashMap, BTreeSet>, +) { + let occurrences = blocks_by_lines + .entry(candidate.normalized_lines) + .or_default(); + occurrences.insert(candidate.left_occurrence); + occurrences.insert(candidate.right_occurrence); +} + fn expand_pair(files: &[ProcessedFile], left: LineRef, right: LineRef) -> Option { if left == right { return None; diff --git a/src/language.rs b/src/language.rs index 2dc3dc3..d4c36fa 100644 --- a/src/language.rs +++ b/src/language.rs @@ -151,19 +151,28 @@ fn registry() -> &'static BlockOnlyRegistry { HashMap::new(); for language in LANGUAGE_PATTERNS { for extension in language.extensions { - let patterns_by_hash = by_extension.entry(extension).or_default(); - for line in language.block_only_lines { - patterns_by_hash - .entry(hash_normalized_line(line)) - .or_default() - .push(line); - } + register_block_only_lines( + by_extension.entry(extension).or_default(), + language.block_only_lines, + ); } } BlockOnlyRegistry { by_extension } }) } +fn register_block_only_lines( + patterns_by_hash: &mut HashMap>, + lines: &'static [&'static str], +) { + for &line in lines { + patterns_by_hash + .entry(hash_normalized_line(line)) + .or_default() + .push(line); + } +} + #[cfg(test)] mod tests { use super::*; From 9417bae460688c08511401eb216873795d0adfcb Mon Sep 17 00:00:00 2001 From: b4prog Date: Thu, 25 Jun 2026 14:39:09 +0200 Subject: [PATCH 09/22] [chore] satisfy stricter clippy lint requirements --- AGENTS.md | 2 +- Cargo.toml | 3 ++- src/cli.rs | 19 ++++++++++++++++++- src/discovery.rs | 30 +++++++++++++++++------------- src/error.rs | 4 +++- src/language.rs | 2 ++ src/lib.rs | 6 ++++++ 
src/line.rs | 13 ++++++++++++- src/model.rs | 4 +++- src/paths.rs | 2 ++ src/report.rs | 34 ++++++++++++++++++++-------------- 11 files changed, 86 insertions(+), 33 deletions(-) diff --git a/AGENTS.md b/AGENTS.md index c8ae01f..9b718e5 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -8,7 +8,7 @@ Run the repository verification commands from the workspace root and fix any iss ```bash cargo fmt --all -- --check -cargo clippy --workspace --all-targets --all-features -- -D warnings -W clippy::too_many_lines -W clippy::too_many_arguments -W clippy::type_complexity -W clippy::excessive_nesting -W clippy::cognitive_complexity +cargo clippy --workspace --all-targets --all-features -- -D warnings -W clippy::too_many_lines -W clippy::too_many_arguments -W clippy::type_complexity -W clippy::excessive_nesting -W clippy::cognitive_complexity -W clippy::pedantic -W clippy::nursery -W clippy::cargo rtk cargo build --locked --all-targets ``` diff --git a/Cargo.toml b/Cargo.toml index 1d7336b..a1ad11d 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -5,7 +5,8 @@ edition = "2021" license = "MIT" description = "A deterministic source code analysis CLI for duplicate code reports." repository = "https://github.com/b4prog/CodeM8" +keywords = ["cli", "duplicate-detection", "source-code", "analysis"] +categories = ["command-line-utilities", "development-tools"] [dependencies] xxhash-rust = { version = "0.8", features = ["xxh3"] } - diff --git a/src/cli.rs b/src/cli.rs index 934c274..28dd729 100644 --- a/src/cli.rs +++ b/src/cli.rs @@ -11,6 +11,12 @@ pub struct CliConfig { pub files: Option>, } +/// Parses command-line arguments into a validated CLI configuration. +/// +/// # Errors +/// +/// Returns an error when the arguments are invalid, repeated, or missing the +/// required report switch. 
pub fn parse_args(args: I) -> Result where I: IntoIterator, @@ -57,13 +63,19 @@ where file_extensions: file_extensions.unwrap_or_else(|| { DEFAULT_FILE_EXTENSIONS .iter() - .map(|extension| extension.to_string()) + .map(std::string::ToString::to_string) .collect() }), files, }) } +/// Parses a comma-separated list of file extensions. +/// +/// # Errors +/// +/// Returns an error when an extension is empty, starts with `.`, or contains a +/// path separator. pub fn parse_file_extensions(value: &str) -> Result> { let mut extensions = Vec::new(); for raw_extension in value.split(',') { @@ -92,6 +104,11 @@ pub fn parse_file_extensions(value: &str) -> Result> { Ok(extensions) } +/// Parses a comma-separated list of explicit file paths. +/// +/// # Errors +/// +/// Returns an error when any provided file path is empty. pub fn parse_file_list(value: &str) -> Result> { let mut files = Vec::new(); for raw_file in value.split(',') { diff --git a/src/discovery.rs b/src/discovery.rs index 1244180..a976c56 100644 --- a/src/discovery.rs +++ b/src/discovery.rs @@ -20,18 +20,23 @@ const IGNORED_DIRECTORIES: &[&str] = &[ ".vscode", ]; +/// Discovers source files that match the selected extensions. +/// +/// # Errors +/// +/// Returns an error when explicit files are invalid or when walking the file +/// tree fails. pub fn discover_source_files( current_dir: &Path, extensions: &[String], explicit_files: Option<&[PathBuf]>, ) -> Result> { - let mut source_files = match explicit_files { - Some(files) => discover_explicit_files(current_dir, extensions, files)?, - None => { - let mut source_files = Vec::new(); - walk_directory(current_dir, current_dir, extensions, &mut source_files)?; - source_files - } + let mut source_files = if let Some(files) = explicit_files { + discover_explicit_files(current_dir, extensions, files)? 
+ } else { + let mut source_files = Vec::new(); + walk_directory(current_dir, current_dir, extensions, &mut source_files)?; + source_files }; source_files.sort_by(|left, right| { format_path(&left.display_path).cmp(&format_path(&right.display_path)) @@ -81,7 +86,7 @@ fn discover_explicit_files( continue; }; let canonical_path = fs::canonicalize(&path) - .map_err(|error| CodeM8Error::io(&path, "canonicalize explicit file", error))?; + .map_err(|error| CodeM8Error::io(&path, "canonicalize explicit file", &error))?; if !seen_paths.insert(canonical_path.clone()) { continue; } @@ -101,9 +106,9 @@ fn walk_directory( source_files: &mut Vec, ) -> Result<()> { let mut entries = fs::read_dir(directory) - .map_err(|error| CodeM8Error::io(directory, "read directory", error))? + .map_err(|error| CodeM8Error::io(directory, "read directory", &error))? .collect::, _>>() - .map_err(|error| CodeM8Error::io(directory, "read directory entry", error))?; + .map_err(|error| CodeM8Error::io(directory, "read directory entry", &error))?; entries.sort_by(|left, right| { left.file_name() .to_string_lossy() @@ -113,7 +118,7 @@ fn walk_directory( let path = entry.path(); let file_type = entry .file_type() - .map_err(|error| CodeM8Error::io(&path, "inspect path", error))?; + .map_err(|error| CodeM8Error::io(&path, "inspect path", &error))?; if file_type.is_symlink() { continue; } @@ -129,8 +134,7 @@ fn walk_directory( }; let display_path = path .strip_prefix(root) - .map(normalize_display_path) - .unwrap_or_else(|_| normalize_display_path(&path)); + .map_or_else(|_| normalize_display_path(&path), normalize_display_path); source_files.push(SourceFile { path, display_path, diff --git a/src/error.rs b/src/error.rs index fcb7545..dc57d67 100644 --- a/src/error.rs +++ b/src/error.rs @@ -13,13 +13,15 @@ pub struct CodeM8Error { } impl CodeM8Error { + #[must_use] pub fn new(message: impl Into) -> Self { Self { message: message.into(), } } - pub fn io(path: &Path, action: &str, error: io::Error) -> Self 
{ + #[must_use] + pub fn io(path: &Path, action: &str, error: &io::Error) -> Self { Self::new(format!("could not {action} {}: {error}", format_path(path))) } } diff --git a/src/language.rs b/src/language.rs index d4c36fa..b74ba57 100644 --- a/src/language.rs +++ b/src/language.rs @@ -126,10 +126,12 @@ struct BlockOnlyRegistry { static BLOCK_ONLY_REGISTRY: OnceLock = OnceLock::new(); +#[must_use] pub fn hash_normalized_line(line: &str) -> u128 { xxhash_rust::xxh3::xxh3_128(line.as_bytes()) } +#[must_use] pub fn classify_line(extension: &str, normalized_line: &str, hash: u128) -> LineStatus { let extension = extension.to_ascii_lowercase(); let Some(patterns_by_hash) = registry().by_extension.get(extension.as_str()) else { diff --git a/src/lib.rs b/src/lib.rs index 805bc69..e13d185 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -13,6 +13,12 @@ use std::path::Path; use crate::error::{CodeM8Error, Result}; +/// Runs the CLI workflow and writes the selected report to the provided writer. +/// +/// # Errors +/// +/// Returns an error when argument parsing, file discovery, file processing, or +/// report writing fails. pub fn run(args: I, current_dir: &Path, writer: &mut W) -> Result<()> where I: IntoIterator, diff --git a/src/line.rs b/src/line.rs index 894faaf..b33a4a6 100644 --- a/src/line.rs +++ b/src/line.rs @@ -5,13 +5,23 @@ use crate::error::{CodeM8Error, Result}; use crate::language::{classify_line, hash_normalized_line}; use crate::model::{LineEntry, ProcessedFile, SourceFile}; +/// Processes a set of source files into normalized line entries. +/// +/// # Errors +/// +/// Returns an error when any input file cannot be opened or read as UTF-8 text. pub fn process_source_files(source_files: &[SourceFile]) -> Result> { source_files.iter().map(process_source_file).collect() } +/// Processes one source file into its normalized, classified lines. +/// +/// # Errors +/// +/// Returns an error when the file cannot be opened or read as UTF-8 text. 
pub fn process_source_file(source_file: &SourceFile) -> Result { let file = File::open(&source_file.path) - .map_err(|error| CodeM8Error::io(&source_file.display_path, "open file", error))?; + .map_err(|error| CodeM8Error::io(&source_file.display_path, "open file", &error))?; let reader = BufReader::new(file); let mut lines = Vec::new(); for (index, line) in reader.lines().enumerate() { @@ -40,6 +50,7 @@ pub fn process_source_file(source_file: &SourceFile) -> Result { }) } +#[must_use] pub fn normalize_line(line: &str) -> Option { let normalized = line.trim(); if normalized.is_empty() { diff --git a/src/model.rs b/src/model.rs index e7a1248..2a1b195 100644 --- a/src/model.rs +++ b/src/model.rs @@ -43,10 +43,12 @@ pub struct DuplicateBlock { } impl DuplicateBlock { - pub fn line_count(&self) -> usize { + #[must_use] + pub const fn line_count(&self) -> usize { self.normalized_lines.len() } + #[must_use] pub fn character_count(&self) -> u64 { self.normalized_lines .iter() diff --git a/src/paths.rs b/src/paths.rs index 56e1812..777473f 100644 --- a/src/paths.rs +++ b/src/paths.rs @@ -1,9 +1,11 @@ use std::path::{Component, Path, PathBuf}; +#[must_use] pub fn format_path(path: &Path) -> String { path.to_string_lossy().replace('\\', "/") } +#[must_use] pub fn normalize_display_path(path: &Path) -> PathBuf { let mut normalized = PathBuf::new(); for component in path.components() { diff --git a/src/report.rs b/src/report.rs index f6207c3..126a1cc 100644 --- a/src/report.rs +++ b/src/report.rs @@ -1,3 +1,5 @@ +use std::fmt::Write as _; + use crate::model::DuplicateBlock; use crate::paths::format_path; @@ -8,33 +10,37 @@ pub struct DuplicateReport { pub duplicate_blocks: Vec, } +#[must_use] pub fn render_duplicate_report(report: &DuplicateReport) -> String { let mut output = String::new(); output.push_str("Duplicate Code Report\n"); output.push_str("=====================\n\n"); - output.push_str(&format!("Analyzed files: {}\n", report.analyzed_files)); - 
output.push_str(&format!( - "Analyzed extensions: {}\n", + let _ = writeln!(output, "Analyzed files: {}", report.analyzed_files); + let _ = writeln!( + output, + "Analyzed extensions: {}", report.analyzed_extensions.join(", ") - )); - output.push_str(&format!( - "Duplicate blocks found: {}\n", + ); + let _ = writeln!( + output, + "Duplicate blocks found: {}", report.duplicate_blocks.len() - )); + ); for (index, block) in report.duplicate_blocks.iter().enumerate() { output.push('\n'); - output.push_str(&format!("#{} Weight: {}\n", index + 1, block.weight)); - output.push_str(&format!("Lines: {}\n", block.line_count())); - output.push_str(&format!("Characters: {}\n", block.character_count())); - output.push_str(&format!("Occurrences: {}\n\n", block.occurrences.len())); + let _ = writeln!(output, "#{} Weight: {}", index + 1, block.weight); + let _ = writeln!(output, "Lines: {}", block.line_count()); + let _ = writeln!(output, "Characters: {}", block.character_count()); + let _ = writeln!(output, "Occurrences: {}\n", block.occurrences.len()); output.push_str("Locations:\n"); for occurrence in &block.occurrences { - output.push_str(&format!( - "- {}:{}-{}\n", + let _ = writeln!( + output, + "- {}:{}-{}", format_path(&occurrence.file_path), occurrence.start_line, occurrence.end_line - )); + ); } output.push_str("\nCode:\n"); for line in &block.normalized_lines { From 5e775700ea98e390e802500783a5cc61bc733de1 Mon Sep 17 00:00:00 2001 From: b4prog Date: Thu, 25 Jun 2026 14:45:55 +0200 Subject: [PATCH 10/22] [ci] add Clippy validation to the Rust CI workflow --- .github/workflows/ci.yml | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index f8eae35..9dac4b1 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -19,14 +19,16 @@ jobs: uses: actions/checkout@v4 - name: Install Rust toolchain - run: rustup toolchain install stable --profile minimal --component rustfmt + run: rustup 
toolchain install stable --profile minimal --component rustfmt --component clippy - name: Check formatting run: cargo fmt --all -- --check + - name: Run Clippy + run: cargo clippy --workspace --all-targets --all-features -- -D warnings -W clippy::too_many_lines -W clippy::too_many_arguments -W clippy::type_complexity -W clippy::excessive_nesting -W clippy::cognitive_complexity -W clippy::pedantic -W clippy::nursery -W clippy::cargo + - name: Build run: cargo build --locked --all-targets - name: Test run: cargo test --locked --all-targets - From 436936f7a8783a3bc780b45e54699da51e3dd02e Mon Sep 17 00:00:00 2001 From: b4prog Date: Thu, 25 Jun 2026 15:57:52 +0200 Subject: [PATCH 11/22] [ci] enable CodeRabbit request changes approval workflow --- .coderabbit.yaml | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 .coderabbit.yaml diff --git a/.coderabbit.yaml b/.coderabbit.yaml new file mode 100644 index 0000000..11bd52f --- /dev/null +++ b/.coderabbit.yaml @@ -0,0 +1,2 @@ +reviews: + request_changes_workflow: true From f9054a0735e7297c1ad2769166a49e56a76565a3 Mon Sep 17 00:00:00 2001 From: b4prog Date: Thu, 25 Jun 2026 16:10:03 +0200 Subject: [PATCH 12/22] [ci] enable detailed CodeRabbit reviews and disable poems --- .coderabbit.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.coderabbit.yaml b/.coderabbit.yaml index 11bd52f..f7b3a5d 100644 --- a/.coderabbit.yaml +++ b/.coderabbit.yaml @@ -1,2 +1,4 @@ reviews: request_changes_workflow: true + review_details: true + poem: false From 1ea12699f24b31fa14ebb2a3f5848d551fb008bc Mon Sep 17 00:00:00 2001 From: b4prog Date: Thu, 25 Jun 2026 16:51:27 +0200 Subject: [PATCH 13/22] [test] add coverage for parser, discovery, duplicate, and path edge cases --- src/cli.rs | 44 ++++++++++++++++++++++++++++++++++++++++ src/discovery.rs | 13 ++++++++++++ src/duplicate.rs | 52 ++++++++++++++++++++++++++++++++++++++++++++++++ src/line.rs | 19 ++++++++++++++++++ src/paths.rs | 26 ++++++++++++++++++++++++ 5 files changed, 154 
insertions(+) diff --git a/src/cli.rs b/src/cli.rs index 28dd729..3fa99a1 100644 --- a/src/cli.rs +++ b/src/cli.rs @@ -154,12 +154,48 @@ mod tests { assert!(error.to_string().contains("must not start with a dot")); } + #[test] + fn rejects_extensions_with_path_separators() { + let error = parse_file_extensions("src/ts").expect_err("path-like extension fails"); + assert!(error + .to_string() + .contains("must not contain path separators")); + } + #[test] fn rejects_missing_report_switch() { let error = parse_args(["-file-extension=rs"]).expect_err("missing report fails"); assert!(error.to_string().contains("no report switch provided")); } + #[test] + fn rejects_unknown_arguments() { + let error = parse_args(["--report-duplicate", "--verbose"]).expect_err("unknown arg fails"); + assert!(error.to_string().contains("unknown argument: --verbose")); + } + + #[test] + fn rejects_repeated_file_extension_arguments() { + let error = parse_args([ + "--report-duplicate", + "-file-extension=ts", + "--file-extension=js", + ]) + .expect_err("repeated extensions fail"); + assert!(error + .to_string() + .contains("file extensions were provided more than once")); + } + + #[test] + fn rejects_repeated_explicit_file_arguments() { + let error = parse_args(["--report-duplicate", "-files=a.ts", "--files=b.ts"]) + .expect_err("repeated explicit files fail"); + assert!(error + .to_string() + .contains("explicit files were provided more than once")); + } + #[test] fn parses_explicit_file_list() { let files = parse_file_list("src/a.ts, ./src/b.ts").expect("files parse"); @@ -168,4 +204,12 @@ mod tests { [PathBuf::from("src/a.ts"), PathBuf::from("./src/b.ts")] ); } + + #[test] + fn rejects_empty_explicit_file_paths() { + let error = parse_file_list("src/a.ts, ").expect_err("empty explicit file fails"); + assert!(error + .to_string() + .contains("file path values must not be empty")); + } } diff --git a/src/discovery.rs b/src/discovery.rs index a976c56..0424979 100644 --- a/src/discovery.rs 
+++ b/src/discovery.rs @@ -225,4 +225,17 @@ mod tests { assert_eq!(format_path(&files[0].display_path), "a.ts"); fs::remove_dir_all(root).expect("cleanup"); } + + #[test] + fn explicit_files_reject_directories() { + let root = temp_dir("explicit-directory"); + fs::create_dir_all(root.join("src")).expect("create explicit directory"); + let error = + discover_source_files(&root, &["ts".to_string()], Some(&[PathBuf::from("src")])) + .expect_err("directory explicit file fails"); + assert!(error + .to_string() + .contains("explicit file is a directory: src")); + fs::remove_dir_all(root).expect("cleanup"); + } } diff --git a/src/duplicate.rs b/src/duplicate.rs index 7b80a04..f01bca7 100644 --- a/src/duplicate.rs +++ b/src/duplicate.rs @@ -291,6 +291,58 @@ mod tests { ); } + #[test] + fn ignores_matching_hashes_with_different_text() { + let mut files = vec![ + processed_file( + "a.ts", + "ts", + &[("const value = one;", LineStatus::Comparison)], + ), + processed_file( + "b.ts", + "ts", + &[("const value = two;", LineStatus::Comparison)], + ), + ]; + files[1].lines[0].hash = files[0].lines[0].hash; + let blocks = detect_duplicate_blocks(&files); + assert!(blocks.is_empty()); + } + + #[test] + fn sorts_duplicate_blocks_by_weight() { + let files = vec![ + processed_file( + "a.ts", + "ts", + &[ + ("const longerValue = computeOne();", LineStatus::Comparison), + ("return longerValue;", LineStatus::Comparison), + ("const uniqueA = true;", LineStatus::Comparison), + ("const x = 1;", LineStatus::Comparison), + ], + ), + processed_file( + "b.ts", + "ts", + &[ + ("const longerValue = computeOne();", LineStatus::Comparison), + ("return longerValue;", LineStatus::Comparison), + ("const uniqueB = true;", LineStatus::Comparison), + ("const x = 1;", LineStatus::Comparison), + ], + ), + ]; + let blocks = detect_duplicate_blocks(&files); + assert!(blocks.len() >= 2); + assert_eq!( + blocks[0].normalized_lines, + ["const longerValue = computeOne();", "return longerValue;"] + ); + 
assert!(blocks[0].weight >= blocks[1].weight); + } + #[test] fn ignores_single_line_duplicates_that_are_only_block_only_lines() { let files = vec![ diff --git a/src/line.rs b/src/line.rs index b33a4a6..92dc0f5 100644 --- a/src/line.rs +++ b/src/line.rs @@ -96,4 +96,23 @@ mod tests { assert_eq!(processed.lines[1].status, LineStatus::BlockOnly); fs::remove_file(path).expect("cleanup"); } + + #[test] + fn returns_clear_error_for_invalid_utf8() { + let path = std::env::temp_dir().join(format!( + "codem8-line-invalid-utf8-{}.ts", + std::process::id() + )); + fs::write(&path, [0xff, b'\n']).expect("write invalid source file"); + let source = SourceFile { + path: path.clone(), + display_path: "invalid.ts".into(), + extension: "ts".to_string(), + }; + let error = process_source_file(&source).expect_err("invalid UTF-8 fails"); + assert!(error + .to_string() + .contains("could not read invalid.ts as UTF-8 text")); + fs::remove_file(path).expect("cleanup"); + } } diff --git a/src/paths.rs b/src/paths.rs index 777473f..f55926f 100644 --- a/src/paths.rs +++ b/src/paths.rs @@ -22,3 +22,29 @@ pub fn normalize_display_path(path: &Path) -> PathBuf { normalized } } + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn formats_paths_with_forward_slashes() { + assert_eq!( + format_path(Path::new("src\\nested\\a.ts")), + "src/nested/a.ts" + ); + } + + #[test] + fn normalizes_display_paths_without_losing_parent_segments() { + assert_eq!( + normalize_display_path(Path::new("./src/../a.ts")), + PathBuf::from("src").join("..").join("a.ts") + ); + } + + #[test] + fn normalizes_empty_display_path_to_current_directory() { + assert_eq!(normalize_display_path(Path::new(".")), PathBuf::from(".")); + } +} From b9ecef27bf5ae10c76efa67ead311cb4ff7f024a Mon Sep 17 00:00:00 2001 From: b4prog Date: Thu, 25 Jun 2026 17:54:01 +0200 Subject: [PATCH 14/22] [test] make duplicate sort fixture fail without sorting --- src/duplicate.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff 
--git a/src/duplicate.rs b/src/duplicate.rs index f01bca7..6414768 100644 --- a/src/duplicate.rs +++ b/src/duplicate.rs @@ -317,20 +317,20 @@ mod tests { "a.ts", "ts", &[ + ("const x = 1;", LineStatus::Comparison), + ("const uniqueA = true;", LineStatus::Comparison), ("const longerValue = computeOne();", LineStatus::Comparison), ("return longerValue;", LineStatus::Comparison), - ("const uniqueA = true;", LineStatus::Comparison), - ("const x = 1;", LineStatus::Comparison), ], ), processed_file( "b.ts", "ts", &[ + ("const x = 1;", LineStatus::Comparison), + ("const uniqueB = true;", LineStatus::Comparison), ("const longerValue = computeOne();", LineStatus::Comparison), ("return longerValue;", LineStatus::Comparison), - ("const uniqueB = true;", LineStatus::Comparison), - ("const x = 1;", LineStatus::Comparison), ], ), ]; From 38662c82dbdb292feba6745b6e4b2a36b3e89a3e Mon Sep 17 00:00:00 2001 From: b4prog Date: Thu, 25 Jun 2026 18:23:54 +0200 Subject: [PATCH 15/22] [feat] add CLI help output for duplicate reports --- src/cli.rs | 92 ++++++++++++++++++++++++++++++++++++++++++++++++- src/error.rs | 15 ++++++++ src/language.rs | 17 ++++++++- src/lib.rs | 48 +++++++++++++++++--------- src/main.rs | 4 +++ 5 files changed, 157 insertions(+), 19 deletions(-) diff --git a/src/cli.rs b/src/cli.rs index 3fa99a1..1cdb894 100644 --- a/src/cli.rs +++ b/src/cli.rs @@ -3,6 +3,51 @@ use std::path::PathBuf; use crate::error::{CodeM8Error, Result}; const DEFAULT_FILE_EXTENSIONS: &[&str] = &["ts"]; +const HELP_TEXT: &str = "\ +CodeM8 - deterministic source code analysis reports. + +USAGE: + codem8 help + codem8 --report-duplicate [OPTIONS] + +COMMANDS: + help + Display this detailed documentation. + +REQUIRED REPORT SWITCHES: + --report-duplicate + Analyze source files and print a duplicate code report. + +OPTIONS: + -file-extension= + --file-extension= + Comma-separated source file extensions to analyze. 
+ Defaults to: ts + Examples: -file-extension=ts,tsx,js,jsx + + -files= + --files= + Comma-separated explicit files to analyze instead of recursively + discovering files from the current directory. + Example: -files=src/a.ts,src/b.js + +DUPLICATE REPORT PURPOSE: + The duplicate report helps you find repeated code that may be worth + refactoring, reviewing, or consolidating. It lists each duplicated block with + the files and line ranges where it appears, making it easier to compare the + repeated code and decide whether it should stay duplicated. + +EXAMPLES: + codem8 --report-duplicate + codem8 --report-duplicate -file-extension=ts,tsx,js,jsx + codem8 --report-duplicate -file-extension=ts,js -files=src/a.ts,src/b.js +"; + +#[derive(Debug, Clone, PartialEq, Eq)] +pub enum CliCommand { + Help, + ReportDuplicate(CliConfig), +} #[derive(Debug, Clone, PartialEq, Eq)] pub struct CliConfig { @@ -11,6 +56,29 @@ pub struct CliConfig { pub files: Option>, } +#[must_use] +pub const fn help_text() -> &'static str { + HELP_TEXT +} + +/// Parses command-line arguments into a CLI command. +/// +/// # Errors +/// +/// Returns an error when the arguments are invalid, repeated, or missing the +/// required report switch. +pub fn parse_command(args: I) -> Result +where + I: IntoIterator, + S: Into, +{ + let args = args.into_iter().map(Into::into).collect::>(); + if args.len() == 1 && is_help_argument(&args[0]) { + return Ok(CliCommand::Help); + } + parse_args(args).map(CliCommand::ReportDuplicate) +} + /// Parses command-line arguments into a validated CLI configuration. 
/// /// # Errors @@ -54,7 +122,7 @@ where } } if !report_duplicate { - return Err(CodeM8Error::new( + return Err(CodeM8Error::with_help( "no report switch provided; pass --report-duplicate", )); } @@ -124,10 +192,30 @@ pub fn parse_file_list(value: &str) -> Result> { Ok(files) } +fn is_help_argument(arg: &str) -> bool { + matches!(arg, "help" | "--help" | "-h") +} + #[cfg(test)] mod tests { use super::*; + #[test] + fn parses_help_command() { + let command = parse_command(["help"]).expect("help parses"); + assert_eq!(command, CliCommand::Help); + } + + #[test] + fn exposes_detailed_help_text() { + assert!(help_text().contains("USAGE:")); + assert!(help_text().contains("--report-duplicate")); + assert!(help_text().contains("-file-extension=")); + assert!(help_text().contains("-files=")); + assert!(help_text().contains("helps you find repeated code")); + assert!(!help_text().contains("Duplicate weight")); + } + #[test] fn parses_default_duplicate_report_config() { let config = parse_args(["--report-duplicate"]).expect("config parses"); @@ -166,12 +254,14 @@ mod tests { fn rejects_missing_report_switch() { let error = parse_args(["-file-extension=rs"]).expect_err("missing report fails"); assert!(error.to_string().contains("no report switch provided")); + assert!(error.should_show_help()); } #[test] fn rejects_unknown_arguments() { let error = parse_args(["--report-duplicate", "--verbose"]).expect_err("unknown arg fails"); assert!(error.to_string().contains("unknown argument: --verbose")); + assert!(!error.should_show_help()); } #[test] diff --git a/src/error.rs b/src/error.rs index dc57d67..bcfe20b 100644 --- a/src/error.rs +++ b/src/error.rs @@ -10,6 +10,7 @@ pub type Result = std::result::Result; #[derive(Debug, Clone, PartialEq, Eq)] pub struct CodeM8Error { message: String, + show_help: bool, } impl CodeM8Error { @@ -17,6 +18,15 @@ impl CodeM8Error { pub fn new(message: impl Into) -> Self { Self { message: message.into(), + show_help: false, + } + } + + 
#[must_use] + pub fn with_help(message: impl Into) -> Self { + Self { + message: message.into(), + show_help: true, } } @@ -24,6 +34,11 @@ impl CodeM8Error { pub fn io(path: &Path, action: &str, error: &io::Error) -> Self { Self::new(format!("could not {action} {}: {error}", format_path(path))) } + + #[must_use] + pub const fn should_show_help(&self) -> bool { + self.show_help + } } impl fmt::Display for CodeM8Error { diff --git a/src/language.rs b/src/language.rs index b74ba57..60d7985 100644 --- a/src/language.rs +++ b/src/language.rs @@ -21,7 +21,22 @@ pub const LANGUAGE_PATTERNS: &[LanguageLinePattern] = &[ LanguageLinePattern { language_name: "Rust", extensions: &["rs"], - block_only_lines: &["{", "}", "(", ")", "[", "]", ");", "];"], + block_only_lines: &[ + "{", + "}", + "(", + ")", + "))", + "[", + "]", + ");", + "];", + "));", + "})?;", + "})", + "})?;", + ".into_iter()", + ], }, LanguageLinePattern { language_name: "C / C++ / Objective-C", diff --git a/src/lib.rs b/src/lib.rs index e13d185..659a0a0 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -25,23 +25,29 @@ where S: Into, W: Write, { - let config = cli::parse_args(args)?; - if config.report_duplicate { - let source_files = discovery::discover_source_files( - current_dir, - &config.file_extensions, - config.files.as_deref(), - )?; - let processed_files = line::process_source_files(&source_files)?; - let duplicate_blocks = duplicate::detect_duplicate_blocks(&processed_files); - let report = report::DuplicateReport { - analyzed_files: source_files.len(), - analyzed_extensions: config.file_extensions, - duplicate_blocks, - }; - writer - .write_all(report::render_duplicate_report(&report).as_bytes()) - .map_err(|error| CodeM8Error::new(format!("could not write report output: {error}")))?; + match cli::parse_command(args)? 
{ + cli::CliCommand::Help => writer + .write_all(cli::help_text().as_bytes()) + .map_err(|error| CodeM8Error::new(format!("could not write help output: {error}")))?, + cli::CliCommand::ReportDuplicate(config) => { + let source_files = discovery::discover_source_files( + current_dir, + &config.file_extensions, + config.files.as_deref(), + )?; + let processed_files = line::process_source_files(&source_files)?; + let duplicate_blocks = duplicate::detect_duplicate_blocks(&processed_files); + let report = report::DuplicateReport { + analyzed_files: source_files.len(), + analyzed_extensions: config.file_extensions, + duplicate_blocks, + }; + writer + .write_all(report::render_duplicate_report(&report).as_bytes()) + .map_err(|error| { + CodeM8Error::new(format!("could not write report output: {error}")) + })?; + } } Ok(()) } @@ -170,4 +176,12 @@ mod tests { .to_string() .contains("explicit file does not exist: missing.ts")); } + + #[test] + fn help_command_prints_documentation() { + let project = TempProject::new("help"); + let output = run_in(&project, &["help"]).expect("help succeeds"); + assert!(output.contains("USAGE:")); + assert!(output.contains("--report-duplicate")); + } } diff --git a/src/main.rs b/src/main.rs index 77dbbac..a6e1e1d 100644 --- a/src/main.rs +++ b/src/main.rs @@ -18,6 +18,10 @@ fn main() -> ExitCode { } Err(error) => { eprintln!("error: {error}"); + if error.should_show_help() { + eprintln!(); + eprint!("{}", codem8::cli::help_text()); + } ExitCode::FAILURE } } From 17e24d4c95643999ad3a76bc0856eb63abd85798 Mon Sep 17 00:00:00 2001 From: b4prog Date: Thu, 25 Jun 2026 18:34:14 +0200 Subject: [PATCH 16/22] [refactor] rename duplicate mitigation line patterns --- src/language.rs | 51 +++++++++++++++++++++++++------------------------ 1 file changed, 26 insertions(+), 25 deletions(-) diff --git a/src/language.rs b/src/language.rs index 60d7985..b69b6fb 100644 --- a/src/language.rs +++ b/src/language.rs @@ -7,21 +7,21 @@ use crate::model::LineStatus; pub 
struct LanguageLinePattern { pub language_name: &'static str, pub extensions: &'static [&'static str], - pub block_only_lines: &'static [&'static str], + pub duplicate_mitigation_lines: &'static [&'static str], } pub const LANGUAGE_PATTERNS: &[LanguageLinePattern] = &[ LanguageLinePattern { language_name: "TypeScript / JavaScript", extensions: &["ts", "tsx", "js", "jsx", "mjs", "cjs"], - block_only_lines: &[ + duplicate_mitigation_lines: &[ "(", ")", "{", "}", "[", "]", ");", "];", "};", ")};", "}),", "});", ], }, LanguageLinePattern { language_name: "Rust", extensions: &["rs"], - block_only_lines: &[ + duplicate_mitigation_lines: &[ "{", "}", "(", @@ -41,14 +41,14 @@ pub const LANGUAGE_PATTERNS: &[LanguageLinePattern] = &[ LanguageLinePattern { language_name: "C / C++ / Objective-C", extensions: &["c", "h", "cpp", "hpp", "cc", "hh", "cxx", "hxx", "m", "mm"], - block_only_lines: &[ + duplicate_mitigation_lines: &[ "{", "}", "(", ")", "[", "]", ");", "];", "};", "#endif", "#else", ], }, LanguageLinePattern { language_name: "C#", extensions: &["cs"], - block_only_lines: &[ + duplicate_mitigation_lines: &[ "{", "}", "(", @@ -66,47 +66,47 @@ pub const LANGUAGE_PATTERNS: &[LanguageLinePattern] = &[ LanguageLinePattern { language_name: "Java / Kotlin / Scala", extensions: &["java", "kt", "kts", "scala", "sc"], - block_only_lines: &["{", "}", "(", ")", "[", "]", ");", "];", "};"], + duplicate_mitigation_lines: &["{", "}", "(", ")", "[", "]", ");", "];", "};"], }, LanguageLinePattern { language_name: "Go", extensions: &["go"], - block_only_lines: &["{", "}", "(", ")", "[", "]"], + duplicate_mitigation_lines: &["{", "}", "(", ")", "[", "]"], }, LanguageLinePattern { language_name: "Python", extensions: &["py", "pyw"], - block_only_lines: &["(", ")", "[", "]", "{", "}"], + duplicate_mitigation_lines: &["(", ")", "[", "]", "{", "}"], }, LanguageLinePattern { language_name: "Ruby", extensions: &["rb"], - block_only_lines: &["(", ")", "[", "]", "{", "}", "end"], + 
duplicate_mitigation_lines: &["(", ")", "[", "]", "{", "}", "end"], }, LanguageLinePattern { language_name: "PHP", extensions: &["php", "phtml"], - block_only_lines: &["{", "}", "(", ")", "[", "]", ");", "];", "};", "?>"], + duplicate_mitigation_lines: &["{", "}", "(", ")", "[", "]", ");", "];", "};", "?>"], }, LanguageLinePattern { language_name: "Swift", extensions: &["swift"], - block_only_lines: &["{", "}", "(", ")", "[", "]", ");", "];"], + duplicate_mitigation_lines: &["{", "}", "(", ")", "[", "]", ");", "];"], }, LanguageLinePattern { language_name: "Shell", extensions: &["sh", "bash", "zsh", "fish"], - block_only_lines: &["then", "do", "done", "fi", "else", "{", "}"], + duplicate_mitigation_lines: &["then", "do", "done", "fi", "else", "{", "}"], }, LanguageLinePattern { language_name: "PowerShell", extensions: &["ps1", "psm1", "psd1"], - block_only_lines: &["{", "}", "(", ")", "[", "]", ");"], + duplicate_mitigation_lines: &["{", "}", "(", ")", "[", "]", ");"], }, LanguageLinePattern { language_name: "HTML / XML", extensions: &["html", "htm", "xml", "xhtml", "svg"], - block_only_lines: &[ + duplicate_mitigation_lines: &[ ">", "/>", "", @@ -120,26 +120,27 @@ pub const LANGUAGE_PATTERNS: &[LanguageLinePattern] = &[ LanguageLinePattern { language_name: "CSS / SCSS / Sass / Less", extensions: &["css", "scss", "sass", "less"], - block_only_lines: &["{", "}", ");"], + duplicate_mitigation_lines: &["{", "}", ");"], }, LanguageLinePattern { language_name: "SQL", extensions: &["sql"], - block_only_lines: &["(", ")", ");", ";", "BEGIN", "END"], + duplicate_mitigation_lines: &["(", ")", ");", ";", "BEGIN", "END"], }, LanguageLinePattern { language_name: "YAML / JSON / TOML", extensions: &["yaml", "yml", "json", "toml"], - block_only_lines: &["{", "}", "[", "]", "},", "],"], + duplicate_mitigation_lines: &["{", "}", "[", "]", "},", "],"], }, ]; #[derive(Debug)] -struct BlockOnlyRegistry { +struct DuplicateMitigationLineRegistry { by_extension: HashMap<&'static str, 
HashMap>>, } -static BLOCK_ONLY_REGISTRY: OnceLock = OnceLock::new(); +static DUPLICATE_MITIGATION_LINE_REGISTRY: OnceLock = + OnceLock::new(); #[must_use] pub fn hash_normalized_line(line: &str) -> u128 { @@ -162,23 +163,23 @@ pub fn classify_line(extension: &str, normalized_line: &str, hash: u128) -> Line } } -fn registry() -> &'static BlockOnlyRegistry { - BLOCK_ONLY_REGISTRY.get_or_init(|| { +fn registry() -> &'static DuplicateMitigationLineRegistry { + DUPLICATE_MITIGATION_LINE_REGISTRY.get_or_init(|| { let mut by_extension: HashMap<&'static str, HashMap>> = HashMap::new(); for language in LANGUAGE_PATTERNS { for extension in language.extensions { - register_block_only_lines( + register_duplicate_mitigation_lines( by_extension.entry(extension).or_default(), - language.block_only_lines, + language.duplicate_mitigation_lines, ); } } - BlockOnlyRegistry { by_extension } + DuplicateMitigationLineRegistry { by_extension } }) } -fn register_block_only_lines( +fn register_duplicate_mitigation_lines( patterns_by_hash: &mut HashMap>, lines: &'static [&'static str], ) { From d6178ea80c00e869ff967f06761e4dc1f6cd0f59 Mon Sep 17 00:00:00 2001 From: b4prog Date: Thu, 25 Jun 2026 18:51:53 +0200 Subject: [PATCH 17/22] [feat] add punctuation duplicate mitigation patterns --- src/language.rs | 165 ++++++++++++++++++++++++++++++------------------ 1 file changed, 104 insertions(+), 61 deletions(-) diff --git a/src/language.rs b/src/language.rs index b69b6fb..efaaf86 100644 --- a/src/language.rs +++ b/src/language.rs @@ -7,6 +7,7 @@ use crate::model::LineStatus; pub struct LanguageLinePattern { pub language_name: &'static str, pub extensions: &'static [&'static str], + pub duplicate_mitigation_pattern: &'static [char], pub duplicate_mitigation_lines: &'static [&'static str], } @@ -14,101 +15,82 @@ pub const LANGUAGE_PATTERNS: &[LanguageLinePattern] = &[ LanguageLinePattern { language_name: "TypeScript / JavaScript", extensions: &["ts", "tsx", "js", "jsx", "mjs", "cjs"], - 
duplicate_mitigation_lines: &[ - "(", ")", "{", "}", "[", "]", ");", "];", "};", ")};", "}),", "});", - ], + duplicate_mitigation_pattern: &['(', ')', '{', '}', '[', ']', ';', ',', '?', ':', '<', '>'], + duplicate_mitigation_lines: &[], }, LanguageLinePattern { language_name: "Rust", extensions: &["rs"], - duplicate_mitigation_lines: &[ - "{", - "}", - "(", - ")", - "))", - "[", - "]", - ");", - "];", - "));", - "})?;", - "})", - "})?;", - ".into_iter()", - ], + duplicate_mitigation_pattern: &['(', ')', '{', '}', '[', ']', ';', ',', '?', ':', '<', '>'], + duplicate_mitigation_lines: &[".into_iter()"], }, LanguageLinePattern { language_name: "C / C++ / Objective-C", extensions: &["c", "h", "cpp", "hpp", "cc", "hh", "cxx", "hxx", "m", "mm"], - duplicate_mitigation_lines: &[ - "{", "}", "(", ")", "[", "]", ");", "];", "};", "#endif", "#else", - ], + duplicate_mitigation_pattern: &['(', ')', '{', '}', '[', ']', ';', ',', '?', ':', '<', '>'], + duplicate_mitigation_lines: &["#endif", "#else"], }, LanguageLinePattern { language_name: "C#", extensions: &["cs"], - duplicate_mitigation_lines: &[ - "{", - "}", - "(", - ")", - "[", - "]", - ");", - "];", - "};", - "#endregion", - "#else", - "#endif", - ], + duplicate_mitigation_pattern: &['(', ')', '{', '}', '[', ']', ';', ',', '?', ':', '<', '>'], + duplicate_mitigation_lines: &["#endregion", "#else", "#endif"], }, LanguageLinePattern { language_name: "Java / Kotlin / Scala", extensions: &["java", "kt", "kts", "scala", "sc"], - duplicate_mitigation_lines: &["{", "}", "(", ")", "[", "]", ");", "];", "};"], + duplicate_mitigation_pattern: &['(', ')', '{', '}', '[', ']', ';', ',', '?', ':', '<', '>'], + duplicate_mitigation_lines: &[], }, LanguageLinePattern { language_name: "Go", extensions: &["go"], - duplicate_mitigation_lines: &["{", "}", "(", ")", "[", "]"], + duplicate_mitigation_pattern: &['(', ')', '{', '}', '[', ']', ';', ',', ':'], + duplicate_mitigation_lines: &[], }, LanguageLinePattern { language_name: "Python", 
extensions: &["py", "pyw"], - duplicate_mitigation_lines: &["(", ")", "[", "]", "{", "}"], + duplicate_mitigation_pattern: &['(', ')', '{', '}', '[', ']', ';', ',', ':'], + duplicate_mitigation_lines: &[], }, LanguageLinePattern { language_name: "Ruby", extensions: &["rb"], - duplicate_mitigation_lines: &["(", ")", "[", "]", "{", "}", "end"], + duplicate_mitigation_pattern: &['(', ')', '{', '}', '[', ']', ';', ',', '?', ':'], + duplicate_mitigation_lines: &["end"], }, LanguageLinePattern { language_name: "PHP", extensions: &["php", "phtml"], - duplicate_mitigation_lines: &["{", "}", "(", ")", "[", "]", ");", "];", "};", "?>"], + duplicate_mitigation_pattern: &[ + '(', ')', '{', '}', '[', ']', ';', ',', '?', ':', '<', '>', '/', + ], + duplicate_mitigation_lines: &[], }, LanguageLinePattern { language_name: "Swift", extensions: &["swift"], - duplicate_mitigation_lines: &["{", "}", "(", ")", "[", "]", ");", "];"], + duplicate_mitigation_pattern: &['(', ')', '{', '}', '[', ']', ';', ',', '?', ':', '<', '>'], + duplicate_mitigation_lines: &[], }, LanguageLinePattern { language_name: "Shell", extensions: &["sh", "bash", "zsh", "fish"], - duplicate_mitigation_lines: &["then", "do", "done", "fi", "else", "{", "}"], + duplicate_mitigation_pattern: &['(', ')', '{', '}', '[', ']', ';', '&', '|'], + duplicate_mitigation_lines: &["then", "do", "done", "fi", "else"], }, LanguageLinePattern { language_name: "PowerShell", extensions: &["ps1", "psm1", "psd1"], - duplicate_mitigation_lines: &["{", "}", "(", ")", "[", "]", ");"], + duplicate_mitigation_pattern: &['(', ')', '{', '}', '[', ']', ';', ',', '?', ':', '|'], + duplicate_mitigation_lines: &[], }, LanguageLinePattern { language_name: "HTML / XML", extensions: &["html", "htm", "xml", "xhtml", "svg"], + duplicate_mitigation_pattern: &['<', '>', '/'], duplicate_mitigation_lines: &[ - ">", - "/>", "", "", "", @@ -120,23 +102,32 @@ pub const LANGUAGE_PATTERNS: &[LanguageLinePattern] = &[ LanguageLinePattern { language_name: "CSS / 
SCSS / Sass / Less", extensions: &["css", "scss", "sass", "less"], - duplicate_mitigation_lines: &["{", "}", ");"], + duplicate_mitigation_pattern: &['(', ')', '{', '}', '[', ']', ';', ',', ':'], + duplicate_mitigation_lines: &[], }, LanguageLinePattern { language_name: "SQL", extensions: &["sql"], - duplicate_mitigation_lines: &["(", ")", ");", ";", "BEGIN", "END"], + duplicate_mitigation_pattern: &['(', ')', ';', ',', ':'], + duplicate_mitigation_lines: &["BEGIN", "END"], }, LanguageLinePattern { language_name: "YAML / JSON / TOML", extensions: &["yaml", "yml", "json", "toml"], - duplicate_mitigation_lines: &["{", "}", "[", "]", "},", "],"], + duplicate_mitigation_pattern: &['{', '}', '[', ']', ','], + duplicate_mitigation_lines: &[], }, ]; #[derive(Debug)] struct DuplicateMitigationLineRegistry { - by_extension: HashMap<&'static str, HashMap>>, + by_extension: HashMap<&'static str, DuplicateMitigationPatterns>, +} + +#[derive(Debug, Default)] +struct DuplicateMitigationPatterns { + lines_by_hash: HashMap>, + character_pattern: Vec, } static DUPLICATE_MITIGATION_LINE_REGISTRY: OnceLock = @@ -150,13 +141,10 @@ pub fn hash_normalized_line(line: &str) -> u128 { #[must_use] pub fn classify_line(extension: &str, normalized_line: &str, hash: u128) -> LineStatus { let extension = extension.to_ascii_lowercase(); - let Some(patterns_by_hash) = registry().by_extension.get(extension.as_str()) else { - return LineStatus::Comparison; - }; - let Some(patterns) = patterns_by_hash.get(&hash) else { + let Some(patterns) = registry().by_extension.get(extension.as_str()) else { return LineStatus::Comparison; }; - if patterns.contains(&normalized_line) { + if patterns.matches_line(normalized_line, hash) { LineStatus::BlockOnly } else { LineStatus::Comparison @@ -165,20 +153,37 @@ pub fn classify_line(extension: &str, normalized_line: &str, hash: u128) -> Line fn registry() -> &'static DuplicateMitigationLineRegistry { DUPLICATE_MITIGATION_LINE_REGISTRY.get_or_init(|| { - let mut 
by_extension: HashMap<&'static str, HashMap>> = - HashMap::new(); + let mut by_extension: HashMap<&'static str, DuplicateMitigationPatterns> = HashMap::new(); for language in LANGUAGE_PATTERNS { for extension in language.extensions { + let patterns = by_extension.entry(extension).or_default(); register_duplicate_mitigation_lines( - by_extension.entry(extension).or_default(), + &mut patterns.lines_by_hash, language.duplicate_mitigation_lines, ); + register_duplicate_mitigation_pattern( + &mut patterns.character_pattern, + language.duplicate_mitigation_pattern, + ); } } DuplicateMitigationLineRegistry { by_extension } }) } +impl DuplicateMitigationPatterns { + fn matches_line(&self, normalized_line: &str, hash: u128) -> bool { + self.matches_registered_line(normalized_line, hash) + || matches_duplicate_mitigation_pattern(normalized_line, &self.character_pattern) + } + + fn matches_registered_line(&self, normalized_line: &str, hash: u128) -> bool { + self.lines_by_hash + .get(&hash) + .is_some_and(|patterns| patterns.contains(&normalized_line)) + } +} + fn register_duplicate_mitigation_lines( patterns_by_hash: &mut HashMap>, lines: &'static [&'static str], @@ -191,14 +196,33 @@ fn register_duplicate_mitigation_lines( } } +fn register_duplicate_mitigation_pattern( + character_pattern: &mut Vec, + characters: &'static [char], +) { + for &character in characters { + if !character_pattern.contains(&character) { + character_pattern.push(character); + } + } +} + +fn matches_duplicate_mitigation_pattern(line: &str, character_pattern: &[char]) -> bool { + !character_pattern.is_empty() + && line + .chars() + .all(|character| character.is_whitespace() || character_pattern.contains(&character)) +} + #[cfg(test)] mod tests { use super::*; #[test] - fn assigns_block_only_status_from_extension_specific_registry() { - let hash = hash_normalized_line("}"); - assert_eq!(classify_line("ts", "}", hash), LineStatus::BlockOnly); + fn 
assigns_block_only_status_from_extension_specific_line_registry() { + let line = ".into_iter()"; + let hash = hash_normalized_line(line); + assert_eq!(classify_line("rs", line, hash), LineStatus::BlockOnly); } #[test] @@ -216,4 +240,23 @@ mod tests { LineStatus::Comparison ); } + + #[test] + fn assigns_block_only_status_from_character_pattern() { + let line = "} \t);"; + let hash = hash_normalized_line(line); + assert_eq!(classify_line("ts", line, hash), LineStatus::BlockOnly); + } + + #[test] + fn ignores_character_pattern_for_unknown_extensions() { + let line = "});"; + let hash = hash_normalized_line(line); + assert_eq!(classify_line("unknown", line, hash), LineStatus::Comparison); + } + + #[test] + fn empty_character_pattern_does_not_match() { + assert!(!matches_duplicate_mitigation_pattern("}", &[])); + } } From 03660654034780457c693b605f3dc18a1b91defe Mon Sep 17 00:00:00 2001 From: b4prog Date: Thu, 25 Jun 2026 19:03:43 +0200 Subject: [PATCH 18/22] [fix] use language registry for default duplicate report extensions --- src/cli.rs | 13 ++++--------- src/language.rs | 23 +++++++++++++++++++++++ src/lib.rs | 13 +++++++++---- 3 files changed, 36 insertions(+), 13 deletions(-) diff --git a/src/cli.rs b/src/cli.rs index 1cdb894..57a4868 100644 --- a/src/cli.rs +++ b/src/cli.rs @@ -1,8 +1,8 @@ use std::path::PathBuf; use crate::error::{CodeM8Error, Result}; +use crate::language::supported_file_extensions; -const DEFAULT_FILE_EXTENSIONS: &[&str] = &["ts"]; const HELP_TEXT: &str = "\ CodeM8 - deterministic source code analysis reports. @@ -22,7 +22,7 @@ OPTIONS: -file-extension= --file-extension= Comma-separated source file extensions to analyze. - Defaults to: ts + Defaults to all extensions registered in LANGUAGE_PATTERNS. 
Examples: -file-extension=ts,tsx,js,jsx -files= @@ -128,12 +128,7 @@ where } Ok(CliConfig { report_duplicate, - file_extensions: file_extensions.unwrap_or_else(|| { - DEFAULT_FILE_EXTENSIONS - .iter() - .map(std::string::ToString::to_string) - .collect() - }), + file_extensions: file_extensions.unwrap_or_else(supported_file_extensions), files, }) } @@ -220,7 +215,7 @@ mod tests { fn parses_default_duplicate_report_config() { let config = parse_args(["--report-duplicate"]).expect("config parses"); assert!(config.report_duplicate); - assert_eq!(config.file_extensions, ["ts"]); + assert_eq!(config.file_extensions, supported_file_extensions()); assert_eq!(config.files, None); } diff --git a/src/language.rs b/src/language.rs index efaaf86..a3be521 100644 --- a/src/language.rs +++ b/src/language.rs @@ -119,6 +119,19 @@ pub const LANGUAGE_PATTERNS: &[LanguageLinePattern] = &[ }, ]; +#[must_use] +pub fn supported_file_extensions() -> Vec { + let mut extensions = Vec::new(); + for language in LANGUAGE_PATTERNS { + for &extension in language.extensions { + if !extensions.iter().any(|selected| selected == extension) { + extensions.push(extension.to_string()); + } + } + } + extensions +} + #[derive(Debug)] struct DuplicateMitigationLineRegistry { by_extension: HashMap<&'static str, DuplicateMitigationPatterns>, @@ -259,4 +272,14 @@ mod tests { fn empty_character_pattern_does_not_match() { assert!(!matches_duplicate_mitigation_pattern("}", &[])); } + + #[test] + fn collects_supported_file_extensions_from_language_patterns() { + let extensions = supported_file_extensions(); + for language in LANGUAGE_PATTERNS { + for extension in language.extensions { + assert!(extensions.iter().any(|selected| selected == extension)); + } + } + } } diff --git a/src/lib.rs b/src/lib.rs index 659a0a0..72704ea 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -115,14 +115,17 @@ mod tests { "const value = computeValue(input);\nif (value === undefined) {\nreturn defaultValue;\n}\n", ); let output = 
run_in(&project, &["--report-duplicate"]).expect("report succeeds"); + let expected_extensions = language::supported_file_extensions().join(", "); assert_eq!( output, - concat!( + [ "Duplicate Code Report\n", "=====================\n", "\n", "Analyzed files: 2\n", - "Analyzed extensions: ts\n", + "Analyzed extensions: ", + &expected_extensions, + "\n", "Duplicate blocks found: 1\n", "\n", "#1 Weight: 324\n", @@ -139,7 +142,8 @@ mod tests { " if (value === undefined) {\n", " return defaultValue;\n", " }\n", - ) + ] + .concat() ); } @@ -160,7 +164,8 @@ mod tests { project.write("src/a.js", "const value = one;\n"); project.write("src/b.js", "const value = one;\n"); let default_output = run_in(&project, &["--report-duplicate"]).expect("report succeeds"); - assert!(default_output.contains("Analyzed files: 0")); + assert!(default_output.contains("Analyzed files: 2")); + assert!(default_output.contains("Duplicate blocks found: 1")); let js_output = run_in(&project, &["--report-duplicate", "-file-extension=js"]) .expect("report succeeds"); assert!(js_output.contains("Analyzed files: 2")); From 606268d0d9e1b12935feef12abac28a50089643e Mon Sep 17 00:00:00 2001 From: b4prog Date: Thu, 25 Jun 2026 19:25:10 +0200 Subject: [PATCH 19/22] [feat] add verbose mode --- README.md | 11 ++++++++ src/cli.rs | 21 +++++++++++++-- src/language.rs | 6 ++--- src/lib.rs | 39 ++++++++++++++++++++------- src/report.rs | 71 ++++++++++++++++++++++++++++++++++++++----------- 5 files changed, 117 insertions(+), 31 deletions(-) diff --git a/README.md b/README.md index ce3d08a..1ee7d5b 100644 --- a/README.md +++ b/README.md @@ -58,6 +58,12 @@ Analyze an explicit list of files instead of recursively discovering files: codem8 --report-duplicate -file-extension=ts,js -files=src/a.ts,src/b.js ``` +Include duplicate block metrics: + +```bash +codem8 --report-duplicate --verbose +``` + ## Duplicate Report By default, CodeM8 analyzes `.ts` files. 
Recursive discovery skips common @@ -79,6 +85,11 @@ Duplicate block weight is calculated as: Reports are sorted deterministically by descending weight, then by line count, character count, first location, and normalized block text. +By default, each duplicate block prints the duplicated code before its +locations. Use `--verbose` to also show weight, line count, and occurrence +count. Character counts are used internally for scoring and sorting, but are +not printed. + ## Language Heuristics CodeM8 includes a hard-coded registry of block-only line patterns for common diff --git a/src/cli.rs b/src/cli.rs index 57a4868..381520b 100644 --- a/src/cli.rs +++ b/src/cli.rs @@ -31,6 +31,9 @@ OPTIONS: discovering files from the current directory. Example: -files=src/a.ts,src/b.js + --verbose + Include duplicate block metrics in report output. + DUPLICATE REPORT PURPOSE: The duplicate report helps you find repeated code that may be worth refactoring, reviewing, or consolidating. It lists each duplicated block with @@ -52,6 +55,7 @@ pub enum CliCommand { #[derive(Debug, Clone, PartialEq, Eq)] pub struct CliConfig { pub report_duplicate: bool, + pub verbose: bool, pub file_extensions: Vec, pub files: Option>, } @@ -91,12 +95,15 @@ where S: Into, { let mut report_duplicate = false; + let mut verbose = false; let mut file_extensions = None; let mut files = None; for arg in args { let arg = arg.into(); if arg == "--report-duplicate" { report_duplicate = true; + } else if arg == "--verbose" { + verbose = true; } else if let Some(value) = arg .strip_prefix("-file-extension=") .or_else(|| arg.strip_prefix("--file-extension=")) @@ -128,6 +135,7 @@ where } Ok(CliConfig { report_duplicate, + verbose, file_extensions: file_extensions.unwrap_or_else(supported_file_extensions), files, }) @@ -205,6 +213,7 @@ mod tests { fn exposes_detailed_help_text() { assert!(help_text().contains("USAGE:")); assert!(help_text().contains("--report-duplicate")); + 
assert!(help_text().contains("--verbose")); assert!(help_text().contains("-file-extension=")); assert!(help_text().contains("-files=")); assert!(help_text().contains("helps you find repeated code")); @@ -215,10 +224,18 @@ mod tests { fn parses_default_duplicate_report_config() { let config = parse_args(["--report-duplicate"]).expect("config parses"); assert!(config.report_duplicate); + assert!(!config.verbose); assert_eq!(config.file_extensions, supported_file_extensions()); assert_eq!(config.files, None); } + #[test] + fn parses_verbose_duplicate_report_config() { + let config = parse_args(["--report-duplicate", "--verbose"]).expect("config parses"); + assert!(config.report_duplicate); + assert!(config.verbose); + } + #[test] fn parses_extensions_case_insensitively_and_trims_whitespace() { let extensions = parse_file_extensions(" ts, JS ,tsx,ts ").expect("extensions parse"); @@ -254,8 +271,8 @@ mod tests { #[test] fn rejects_unknown_arguments() { - let error = parse_args(["--report-duplicate", "--verbose"]).expect_err("unknown arg fails"); - assert!(error.to_string().contains("unknown argument: --verbose")); + let error = parse_args(["--report-duplicate", "--unknown"]).expect_err("unknown arg fails"); + assert!(error.to_string().contains("unknown argument: --unknown")); assert!(!error.should_show_help()); } diff --git a/src/language.rs b/src/language.rs index a3be521..2e75db0 100644 --- a/src/language.rs +++ b/src/language.rs @@ -22,7 +22,7 @@ pub const LANGUAGE_PATTERNS: &[LanguageLinePattern] = &[ language_name: "Rust", extensions: &["rs"], duplicate_mitigation_pattern: &['(', ')', '{', '}', '[', ']', ';', ',', '?', ':', '<', '>'], - duplicate_mitigation_lines: &[".into_iter()"], + duplicate_mitigation_lines: &[".into_iter()", "///"], }, LanguageLinePattern { language_name: "C / C++ / Objective-C", @@ -114,8 +114,8 @@ pub const LANGUAGE_PATTERNS: &[LanguageLinePattern] = &[ LanguageLinePattern { language_name: "YAML / JSON / TOML", extensions: &["yaml", "yml", 
"json", "toml"], - duplicate_mitigation_pattern: &['{', '}', '[', ']', ','], - duplicate_mitigation_lines: &[], + duplicate_mitigation_pattern: &['(', ')', '{', '}', '[', ']', ';', ',', '?', ':', '<', '>'], + duplicate_mitigation_lines: &["jobs:", "on:"], }, ]; diff --git a/src/lib.rs b/src/lib.rs index 72704ea..c1c2ce9 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -43,7 +43,7 @@ where duplicate_blocks, }; writer - .write_all(report::render_duplicate_report(&report).as_bytes()) + .write_all(report::render_duplicate_report(&report, config.verbose).as_bytes()) .map_err(|error| { CodeM8Error::new(format!("could not write report output: {error}")) })?; @@ -128,25 +128,44 @@ mod tests { "\n", "Duplicate blocks found: 1\n", "\n", - "#1 Weight: 324\n", - "Lines: 4\n", - "Characters: 81\n", - "Occurrences: 2\n", - "\n", - "Locations:\n", - "- src/a.ts:1-4\n", - "- src/b.ts:1-4\n", - "\n", + "#1\n", "Code:\n", " const value = computeValue(input);\n", " if (value === undefined) {\n", " return defaultValue;\n", " }\n", + "\n", + "Locations:\n", + "- src/a.ts:1-4\n", + "- src/b.ts:1-4\n", ] .concat() ); } + #[test] + fn verbose_duplicate_report_includes_metrics_without_characters() { + let project = TempProject::new("verbose"); + project.write( + "src/a.ts", + "const value = computeValue(input);\nreturn value;\n", + ); + project.write( + "src/b.ts", + "const value = computeValue(input);\nreturn value;\n", + ); + let output = + run_in(&project, &["--report-duplicate", "--verbose"]).expect("report succeeds"); + assert!(output.contains("Weight:")); + assert!(output.contains("Lines: 2")); + assert!(output.contains("Occurrences: 2")); + assert!(!output.contains("Characters:")); + assert!( + output.find("Code:").expect("code section exists") + < output.find("Locations:").expect("locations section exists") + ); + } + #[test] fn explicit_files_disable_recursive_discovery() { let project = TempProject::new("explicit-files"); diff --git a/src/report.rs b/src/report.rs index 
126a1cc..1d039e5 100644 --- a/src/report.rs +++ b/src/report.rs @@ -11,7 +11,7 @@ pub struct DuplicateReport { } #[must_use] -pub fn render_duplicate_report(report: &DuplicateReport) -> String { +pub fn render_duplicate_report(report: &DuplicateReport, verbose: bool) -> String { let mut output = String::new(); output.push_str("Duplicate Code Report\n"); output.push_str("=====================\n\n"); @@ -28,11 +28,20 @@ pub fn render_duplicate_report(report: &DuplicateReport) -> String { ); for (index, block) in report.duplicate_blocks.iter().enumerate() { output.push('\n'); - let _ = writeln!(output, "#{} Weight: {}", index + 1, block.weight); - let _ = writeln!(output, "Lines: {}", block.line_count()); - let _ = writeln!(output, "Characters: {}", block.character_count()); - let _ = writeln!(output, "Occurrences: {}\n", block.occurrences.len()); - output.push_str("Locations:\n"); + let _ = writeln!(output, "#{}", index + 1); + if verbose { + let _ = writeln!(output, "Weight: {}", block.weight); + let _ = writeln!(output, "Lines: {}", block.line_count()); + let _ = writeln!(output, "Occurrences: {}", block.occurrences.len()); + output.push('\n'); + } + output.push_str("Code:\n"); + for line in &block.normalized_lines { + output.push_str(" "); + output.push_str(line); + output.push('\n'); + } + output.push_str("\nLocations:\n"); for occurrence in &block.occurrences { let _ = writeln!( output, @@ -42,12 +51,6 @@ pub fn render_duplicate_report(report: &DuplicateReport) -> String { occurrence.end_line ); } - output.push_str("\nCode:\n"); - for line in &block.normalized_lines { - output.push_str(" "); - output.push_str(line); - output.push('\n'); - } } output } @@ -68,7 +71,7 @@ mod tests { duplicate_blocks: Vec::new(), }; assert_eq!( - render_duplicate_report(&report), + render_duplicate_report(&report, false), "Duplicate Code Report\n\ =====================\n\ \n\ @@ -100,10 +103,46 @@ mod tests { weight: 13, }], }; - let output = render_duplicate_report(&report); - 
assert!(output.contains("#1 Weight: 13")); - assert!(output.contains("Lines: 1")); + let output = render_duplicate_report(&report, false); + assert!(output.contains("#1\n")); + assert!(!output.contains("Weight: 13")); + assert!(!output.contains("Lines: 1")); + assert!(!output.contains("Occurrences: 2")); + assert!(!output.contains("Characters:")); assert!(output.contains("- src/a.ts:1-1")); assert!(output.contains(" return value;")); + assert!( + output.find("Code:").expect("code section exists") + < output.find("Locations:").expect("locations section exists") + ); + } + + #[test] + fn renders_duplicate_block_metrics_in_verbose_mode() { + let report = DuplicateReport { + analyzed_files: 2, + analyzed_extensions: vec!["ts".to_string()], + duplicate_blocks: vec![DuplicateBlock { + normalized_lines: vec!["return value;".to_string()], + occurrences: vec![ + DuplicateOccurrence { + file_path: PathBuf::from("src/a.ts"), + start_line: 1, + end_line: 1, + }, + DuplicateOccurrence { + file_path: PathBuf::from("src/b.ts"), + start_line: 2, + end_line: 2, + }, + ], + weight: 13, + }], + }; + let output = render_duplicate_report(&report, true); + assert!(output.contains("Weight: 13")); + assert!(output.contains("Lines: 1")); + assert!(output.contains("Occurrences: 2")); + assert!(!output.contains("Characters:")); } } From 49ff176853a9ca3353873f08c297f033963a3869 Mon Sep 17 00:00:00 2001 From: b4prog Date: Thu, 25 Jun 2026 20:01:47 +0200 Subject: [PATCH 20/22] [feat] add regex duplicate mitigation patterns --- Cargo.lock | 45 ++++++++++++++++++++ Cargo.toml | 1 + src/language.rs | 110 ++++++++++++++++++++++++++++++++++++++---------- 3 files changed, 133 insertions(+), 23 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 2465298..7b39a5f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2,13 +2,58 @@ # It is not intended for manual editing. 
version = 4 +[[package]] +name = "aho-corasick" +version = "1.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ddd31a130427c27518df266943a5308ed92d4b226cc639f5a8f1002816174301" +dependencies = [ + "memchr", +] + [[package]] name = "codem8" version = "0.1.0" dependencies = [ + "regex", "xxhash-rust", ] +[[package]] +name = "memchr" +version = "2.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "88904434abc2901f197fe8cc55f0445e7ded921dba5911dad2e2b39b48e663c4" + +[[package]] +name = "regex" +version = "1.12.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f1292b7759ae1cb9ec195452d1390a074f0cd8541ab7a5a8c31cd6db45d4a6ba" +dependencies = [ + "aho-corasick", + "memchr", + "regex-automata", + "regex-syntax", +] + +[[package]] +name = "regex-automata" +version = "0.4.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6e1dd4122fc1595e8162618945476892eefca7b88c52820e74af6262213cae8f" +dependencies = [ + "aho-corasick", + "memchr", + "regex-syntax", +] + +[[package]] +name = "regex-syntax" +version = "0.8.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d6f6ff9a378485b298a5286656da665ba74413d36db0979633275d2e708145d4" + [[package]] name = "xxhash-rust" version = "0.8.15" diff --git a/Cargo.toml b/Cargo.toml index a1ad11d..8cfb01c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -9,4 +9,5 @@ keywords = ["cli", "duplicate-detection", "source-code", "analysis"] categories = ["command-line-utilities", "development-tools"] [dependencies] +regex = "1" xxhash-rust = { version = "0.8", features = ["xxh3"] } diff --git a/src/language.rs b/src/language.rs index 2e75db0..efabeb9 100644 --- a/src/language.rs +++ b/src/language.rs @@ -2,6 +2,7 @@ use std::collections::HashMap; use std::sync::OnceLock; use crate::model::LineStatus; +use regex::Regex; #[derive(Debug, Clone, Copy)] pub struct LanguageLinePattern { @@ -9,113 
+10,137 @@ pub struct LanguageLinePattern { pub extensions: &'static [&'static str], pub duplicate_mitigation_pattern: &'static [char], pub duplicate_mitigation_lines: &'static [&'static str], + pub duplicate_mitigation_regexps: &'static [&'static str], } pub const LANGUAGE_PATTERNS: &[LanguageLinePattern] = &[ LanguageLinePattern { language_name: "TypeScript / JavaScript", extensions: &["ts", "tsx", "js", "jsx", "mjs", "cjs"], - duplicate_mitigation_pattern: &['(', ')', '{', '}', '[', ']', ';', ',', '?', ':', '<', '>'], + duplicate_mitigation_pattern: &['(', ')', ',', ':', ';', '<', '>', '?', '[', ']', '{', '}'], duplicate_mitigation_lines: &[], + duplicate_mitigation_regexps: &[], }, LanguageLinePattern { language_name: "Rust", extensions: &["rs"], - duplicate_mitigation_pattern: &['(', ')', '{', '}', '[', ']', ';', ',', '?', ':', '<', '>'], - duplicate_mitigation_lines: &[".into_iter()", "///"], + duplicate_mitigation_pattern: &['(', ')', ',', ':', ';', '<', '>', '?', '[', ']', '{', '}'], + duplicate_mitigation_lines: &["///"], + duplicate_mitigation_regexps: &[ + r"^[A-Za-z0-9_]*::?\s*[A-Za-z0-9_]*[,]?$", + r"^[A-Za-z0-9_]+\s*[.,]?$", + r"^\.?\s*[A-Za-z0-9_]+(?:\(\s*\)?)?$", + r"^let\s+(?:mut\s+)?[A-Za-z0-9_]+\s*=$", + r"^pub\s+[A-Za-z0-9_]*\s*:\s*[A-Za-z0-9_]*[,]?$", + r"^use\s+[A-Za-z_][A-Za-z0-9_]*(?:::[A-Za-z_][A-Za-z0-9_]*)*;$", + ], }, LanguageLinePattern { language_name: "C / C++ / Objective-C", extensions: &["c", "h", "cpp", "hpp", "cc", "hh", "cxx", "hxx", "m", "mm"], - duplicate_mitigation_pattern: &['(', ')', '{', '}', '[', ']', ';', ',', '?', ':', '<', '>'], - duplicate_mitigation_lines: &["#endif", "#else"], + duplicate_mitigation_pattern: &['(', ')', ',', ':', ';', '<', '>', '?', '[', ']', '{', '}'], + duplicate_mitigation_lines: &["#else", "#endif"], + duplicate_mitigation_regexps: &[], }, LanguageLinePattern { language_name: "C#", extensions: &["cs"], - duplicate_mitigation_pattern: &['(', ')', '{', '}', '[', ']', ';', ',', '?', ':', '<', '>'], 
- duplicate_mitigation_lines: &["#endregion", "#else", "#endif"], + duplicate_mitigation_pattern: &['(', ')', ',', ':', ';', '<', '>', '?', '[', ']', '{', '}'], + duplicate_mitigation_lines: &["#else", "#endif", "#endregion"], + duplicate_mitigation_regexps: &[], }, LanguageLinePattern { language_name: "Java / Kotlin / Scala", extensions: &["java", "kt", "kts", "scala", "sc"], - duplicate_mitigation_pattern: &['(', ')', '{', '}', '[', ']', ';', ',', '?', ':', '<', '>'], + duplicate_mitigation_pattern: &['(', ')', ',', ':', ';', '<', '>', '?', '[', ']', '{', '}'], duplicate_mitigation_lines: &[], + duplicate_mitigation_regexps: &[], }, LanguageLinePattern { language_name: "Go", extensions: &["go"], - duplicate_mitigation_pattern: &['(', ')', '{', '}', '[', ']', ';', ',', ':'], + duplicate_mitigation_pattern: &['(', ')', ',', ':', ';', '[', ']', '{', '}'], duplicate_mitigation_lines: &[], + duplicate_mitigation_regexps: &[], }, LanguageLinePattern { language_name: "Python", extensions: &["py", "pyw"], - duplicate_mitigation_pattern: &['(', ')', '{', '}', '[', ']', ';', ',', ':'], + duplicate_mitigation_pattern: &['(', ')', ',', ':', ';', '[', ']', '{', '}'], duplicate_mitigation_lines: &[], + duplicate_mitigation_regexps: &[], }, LanguageLinePattern { language_name: "Ruby", extensions: &["rb"], - duplicate_mitigation_pattern: &['(', ')', '{', '}', '[', ']', ';', ',', '?', ':'], + duplicate_mitigation_pattern: &['(', ')', ',', ':', ';', '?', '[', ']', '{', '}'], duplicate_mitigation_lines: &["end"], + duplicate_mitigation_regexps: &[], }, LanguageLinePattern { language_name: "PHP", extensions: &["php", "phtml"], duplicate_mitigation_pattern: &[ - '(', ')', '{', '}', '[', ']', ';', ',', '?', ':', '<', '>', '/', + '(', ')', ',', '/', ':', ';', '<', '>', '?', '[', ']', '{', '}', ], duplicate_mitigation_lines: &[], + duplicate_mitigation_regexps: &[], }, LanguageLinePattern { language_name: "Swift", extensions: &["swift"], - duplicate_mitigation_pattern: &['(', ')', '{', 
'}', '[', ']', ';', ',', '?', ':', '<', '>'], + duplicate_mitigation_pattern: &['(', ')', ',', ':', ';', '<', '>', '?', '[', ']', '{', '}'], duplicate_mitigation_lines: &[], + duplicate_mitigation_regexps: &[], }, LanguageLinePattern { language_name: "Shell", extensions: &["sh", "bash", "zsh", "fish"], - duplicate_mitigation_pattern: &['(', ')', '{', '}', '[', ']', ';', '&', '|'], - duplicate_mitigation_lines: &["then", "do", "done", "fi", "else"], + duplicate_mitigation_pattern: &['&', '(', ')', ';', '[', ']', '{', '|', '}'], + duplicate_mitigation_lines: &["do", "done", "else", "fi", "then"], + duplicate_mitigation_regexps: &[], }, LanguageLinePattern { language_name: "PowerShell", extensions: &["ps1", "psm1", "psd1"], - duplicate_mitigation_pattern: &['(', ')', '{', '}', '[', ']', ';', ',', '?', ':', '|'], + duplicate_mitigation_pattern: &['(', ')', ',', ':', ';', '?', '[', ']', '{', '|', '}'], duplicate_mitigation_lines: &[], + duplicate_mitigation_regexps: &[], }, LanguageLinePattern { language_name: "HTML / XML", extensions: &["html", "htm", "xml", "xhtml", "svg"], - duplicate_mitigation_pattern: &['<', '>', '/'], + duplicate_mitigation_pattern: &['/', '<', '>'], duplicate_mitigation_lines: &[ - "", - "", - "", "", "", + "", "", + "", + "", ], + duplicate_mitigation_regexps: &[], }, LanguageLinePattern { language_name: "CSS / SCSS / Sass / Less", extensions: &["css", "scss", "sass", "less"], - duplicate_mitigation_pattern: &['(', ')', '{', '}', '[', ']', ';', ',', ':'], + duplicate_mitigation_pattern: &['(', ')', ',', ':', ';', '[', ']', '{', '}'], duplicate_mitigation_lines: &[], + duplicate_mitigation_regexps: &[], }, LanguageLinePattern { language_name: "SQL", extensions: &["sql"], - duplicate_mitigation_pattern: &['(', ')', ';', ',', ':'], + duplicate_mitigation_pattern: &['(', ')', ',', ':', ';'], duplicate_mitigation_lines: &["BEGIN", "END"], + duplicate_mitigation_regexps: &[], }, LanguageLinePattern { language_name: "YAML / JSON / TOML", extensions: 
&["yaml", "yml", "json", "toml"], - duplicate_mitigation_pattern: &['(', ')', '{', '}', '[', ']', ';', ',', '?', ':', '<', '>'], + duplicate_mitigation_pattern: &['(', ')', ',', ':', ';', '<', '>', '?', '[', ']', '{', '}'], duplicate_mitigation_lines: &["jobs:", "on:"], + duplicate_mitigation_regexps: &[], }, ]; @@ -141,6 +166,7 @@ struct DuplicateMitigationLineRegistry { struct DuplicateMitigationPatterns { lines_by_hash: HashMap>, character_pattern: Vec, + regexps: Vec, } static DUPLICATE_MITIGATION_LINE_REGISTRY: OnceLock = @@ -178,6 +204,10 @@ fn registry() -> &'static DuplicateMitigationLineRegistry { &mut patterns.character_pattern, language.duplicate_mitigation_pattern, ); + register_duplicate_mitigation_regexps( + &mut patterns.regexps, + language.duplicate_mitigation_regexps, + ); } } DuplicateMitigationLineRegistry { by_extension } @@ -188,6 +218,7 @@ impl DuplicateMitigationPatterns { fn matches_line(&self, normalized_line: &str, hash: u128) -> bool { self.matches_registered_line(normalized_line, hash) || matches_duplicate_mitigation_pattern(normalized_line, &self.character_pattern) + || matches_duplicate_mitigation_regexps(normalized_line, &self.regexps) } fn matches_registered_line(&self, normalized_line: &str, hash: u128) -> bool { @@ -220,6 +251,17 @@ fn register_duplicate_mitigation_pattern( } } +fn register_duplicate_mitigation_regexps( + regexps: &mut Vec, + patterns: &'static [&'static str], +) { + for &pattern in patterns { + if !regexps.iter().any(|regexp| regexp.as_str() == pattern) { + regexps.push(Regex::new(pattern).expect("duplicate mitigation regexp must compile")); + } + } +} + fn matches_duplicate_mitigation_pattern(line: &str, character_pattern: &[char]) -> bool { !character_pattern.is_empty() && line @@ -227,6 +269,14 @@ fn matches_duplicate_mitigation_pattern(line: &str, character_pattern: &[char]) .all(|character| character.is_whitespace() || character_pattern.contains(&character)) } +fn matches_duplicate_mitigation_regexps(line: 
&str, regexps: &[Regex]) -> bool { + regexps.iter().any(|regexp| { + regexp + .find(line) + .is_some_and(|matched| matched.start() == 0 && matched.end() == line.len()) + }) +} + #[cfg(test)] mod tests { use super::*; @@ -261,6 +311,20 @@ mod tests { assert_eq!(classify_line("ts", line, hash), LineStatus::BlockOnly); } + #[test] + fn assigns_block_only_status_from_regexps() { + let line = ".update()"; + let hash = hash_normalized_line(line); + assert_eq!(classify_line("rs", line, hash), LineStatus::BlockOnly); + } + + #[test] + fn regexps_must_match_the_full_line() { + let line = ".update()?.await"; + let hash = hash_normalized_line(line); + assert_eq!(classify_line("rs", line, hash), LineStatus::Comparison); + } + #[test] fn ignores_character_pattern_for_unknown_extensions() { let line = "});"; From e4461c61198bc2cd62623548225d7315d7d0d186 Mon Sep 17 00:00:00 2001 From: b4prog Date: Thu, 25 Jun 2026 20:10:15 +0200 Subject: [PATCH 21/22] [fix] reject double-dash CLI options --- README.md | 4 ++-- src/cli.rs | 51 ++++++++++++++++++++++++++++++++++----------------- src/lib.rs | 2 +- 3 files changed, 37 insertions(+), 20 deletions(-) diff --git a/README.md b/README.md index 1ee7d5b..880105a 100644 --- a/README.md +++ b/README.md @@ -61,7 +61,7 @@ codem8 --report-duplicate -file-extension=ts,js -files=src/a.ts,src/b.js Include duplicate block metrics: ```bash -codem8 --report-duplicate --verbose +codem8 --report-duplicate -verbose ``` ## Duplicate Report @@ -86,7 +86,7 @@ Reports are sorted deterministically by descending weight, then by line count, character count, first location, and normalized block text. By default, each duplicate block prints the duplicated code before its -locations. Use `--verbose` to also show weight, line count, and occurrence +locations. Use `-verbose` to also show weight, line count, and occurrence count. Character counts are used internally for scoring and sorting, but are not printed. 
diff --git a/src/cli.rs b/src/cli.rs index 381520b..597a98b 100644 --- a/src/cli.rs +++ b/src/cli.rs @@ -20,18 +20,16 @@ REQUIRED REPORT SWITCHES: OPTIONS: -file-extension= - --file-extension= Comma-separated source file extensions to analyze. Defaults to all extensions registered in LANGUAGE_PATTERNS. Examples: -file-extension=ts,tsx,js,jsx -files= - --files= Comma-separated explicit files to analyze instead of recursively discovering files from the current directory. Example: -files=src/a.ts,src/b.js - --verbose + -verbose Include duplicate block metrics in report output. DUPLICATE REPORT PURPOSE: @@ -102,22 +100,16 @@ where let arg = arg.into(); if arg == "--report-duplicate" { report_duplicate = true; - } else if arg == "--verbose" { + } else if arg == "-verbose" { verbose = true; - } else if let Some(value) = arg - .strip_prefix("-file-extension=") - .or_else(|| arg.strip_prefix("--file-extension=")) - { + } else if let Some(value) = arg.strip_prefix("-file-extension=") { if file_extensions.is_some() { return Err(CodeM8Error::new( "file extensions were provided more than once", )); } file_extensions = Some(parse_file_extensions(value)?); - } else if let Some(value) = arg - .strip_prefix("-files=") - .or_else(|| arg.strip_prefix("--files=")) - { + } else if let Some(value) = arg.strip_prefix("-files=") { if files.is_some() { return Err(CodeM8Error::new( "explicit files were provided more than once", @@ -196,7 +188,7 @@ pub fn parse_file_list(value: &str) -> Result> { } fn is_help_argument(arg: &str) -> bool { - matches!(arg, "help" | "--help" | "-h") + matches!(arg, "help" | "-h") } #[cfg(test)] @@ -209,13 +201,22 @@ mod tests { assert_eq!(command, CliCommand::Help); } + #[test] + fn parses_short_help_option() { + let command = parse_command(["-h"]).expect("short help parses"); + assert_eq!(command, CliCommand::Help); + } + #[test] fn exposes_detailed_help_text() { assert!(help_text().contains("USAGE:")); assert!(help_text().contains("--report-duplicate")); - 
assert!(help_text().contains("--verbose")); + assert!(help_text().contains("-verbose")); assert!(help_text().contains("-file-extension=")); assert!(help_text().contains("-files=")); + assert!(!help_text().contains("--verbose")); + assert!(!help_text().contains("--file-extension=")); + assert!(!help_text().contains("--files=")); assert!(help_text().contains("helps you find repeated code")); assert!(!help_text().contains("Duplicate weight")); } @@ -231,7 +232,7 @@ mod tests { #[test] fn parses_verbose_duplicate_report_config() { - let config = parse_args(["--report-duplicate", "--verbose"]).expect("config parses"); + let config = parse_args(["--report-duplicate", "-verbose"]).expect("config parses"); assert!(config.report_duplicate); assert!(config.verbose); } @@ -276,12 +277,28 @@ mod tests { assert!(!error.should_show_help()); } + #[test] + fn rejects_double_dash_option_arguments() { + for option in [ + "--help", + "--verbose", + "--file-extension=js", + "--files=src/a.ts", + ] { + let error = + parse_args(["--report-duplicate", option]).expect_err("double-dash option fails"); + assert!(error + .to_string() + .contains(&format!("unknown argument: {option}"))); + } + } + #[test] fn rejects_repeated_file_extension_arguments() { let error = parse_args([ "--report-duplicate", "-file-extension=ts", - "--file-extension=js", + "-file-extension=js", ]) .expect_err("repeated extensions fail"); assert!(error @@ -291,7 +308,7 @@ mod tests { #[test] fn rejects_repeated_explicit_file_arguments() { - let error = parse_args(["--report-duplicate", "-files=a.ts", "--files=b.ts"]) + let error = parse_args(["--report-duplicate", "-files=a.ts", "-files=b.ts"]) .expect_err("repeated explicit files fail"); assert!(error .to_string() diff --git a/src/lib.rs b/src/lib.rs index c1c2ce9..b0a3005 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -155,7 +155,7 @@ mod tests { "const value = computeValue(input);\nreturn value;\n", ); let output = - run_in(&project, &["--report-duplicate", 
"--verbose"]).expect("report succeeds"); + run_in(&project, &["--report-duplicate", "-verbose"]).expect("report succeeds"); assert!(output.contains("Weight:")); assert!(output.contains("Lines: 2")); assert!(output.contains("Occurrences: 2")); From 71d01dfee02201ab02c7999d79b6bed837ddd9f9 Mon Sep 17 00:00:00 2001 From: b4prog Date: Thu, 25 Jun 2026 20:51:20 +0200 Subject: [PATCH 22/22] [feat] add git branch duplicate report scanning --- README.md | 13 +++ src/cli.rs | 142 +++++++++++++++++++++--- src/discovery.rs | 9 +- src/git.rs | 284 +++++++++++++++++++++++++++++++++++++++++++++++ src/language.rs | 6 + src/lib.rs | 44 +++++++- src/report.rs | 43 ++++++- 7 files changed, 518 insertions(+), 23 deletions(-) create mode 100644 src/git.rs diff --git a/README.md b/README.md index 880105a..74ed36e 100644 --- a/README.md +++ b/README.md @@ -58,6 +58,13 @@ Analyze an explicit list of files instead of recursively discovering files: codem8 --report-duplicate -file-extension=ts,js -files=src/a.ts,src/b.js ``` +Analyze files changed on the current local Git branch compared to the origin +base branch: + +```bash +codem8 --report-duplicate -git-branch +``` + Include duplicate block metrics: ```bash @@ -76,6 +83,12 @@ trailing Unicode whitespace are removed before hashing and comparison. Empty trimmed lines are ignored. CodeM8 currently expects UTF-8 source files; invalid UTF-8 produces a clear error rather than lossy output. +Use `-git-branch` to analyze only files changed on the current local branch +compared to the origin base branch. CodeM8 resolves that base from `origin/HEAD` +with `origin/main` and `origin/master` fallbacks. This includes committed, +staged, unstaged, and untracked files that still exist in the worktree. The +option requires a Git repository and cannot be combined with `-files`. 
+ Duplicate block weight is calculated as: ```text diff --git a/src/cli.rs b/src/cli.rs index 597a98b..9b3dc17 100644 --- a/src/cli.rs +++ b/src/cli.rs @@ -1,11 +1,11 @@ +use std::fmt::Write as _; use std::path::PathBuf; use crate::error::{CodeM8Error, Result}; use crate::language::supported_file_extensions; -const HELP_TEXT: &str = "\ -CodeM8 - deterministic source code analysis reports. - +const CARGO_LOCK: &str = include_str!("../Cargo.lock"); +const HELP_TEXT_BODY: &str = "\ USAGE: codem8 help codem8 --report-duplicate [OPTIONS] @@ -29,6 +29,11 @@ OPTIONS: discovering files from the current directory. Example: -files=src/a.ts,src/b.js + -git-branch + Analyze files changed on the current local Git branch compared to the + origin base branch, including committed, staged, unstaged, and untracked + files. Cannot be combined with -files. + -verbose Include duplicate block metrics in report output. @@ -42,8 +47,15 @@ EXAMPLES: codem8 --report-duplicate codem8 --report-duplicate -file-extension=ts,tsx,js,jsx codem8 --report-duplicate -file-extension=ts,js -files=src/a.ts,src/b.js + codem8 --report-duplicate -git-branch "; +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +struct CargoLockPackage<'a> { + name: &'a str, + version: &'a str, +} + #[derive(Debug, Clone, PartialEq, Eq)] pub enum CliCommand { Help, @@ -56,11 +68,20 @@ pub struct CliConfig { pub verbose: bool, pub file_extensions: Vec, pub files: Option>, + pub git_branch: bool, } #[must_use] -pub const fn help_text() -> &'static str { - HELP_TEXT +pub fn help_text() -> String { + let version = codem8_version_from_cargo_lock().unwrap_or("unknown"); + let mut output = String::new(); + let _ = writeln!( + output, + "CodeM8 {version} - deterministic source code analysis reports." + ); + output.push('\n'); + output.push_str(HELP_TEXT_BODY); + output } /// Parses command-line arguments into a CLI command. 
@@ -96,12 +117,20 @@ where let mut verbose = false; let mut file_extensions = None; let mut files = None; + let mut git_branch = false; for arg in args { let arg = arg.into(); if arg == "--report-duplicate" { report_duplicate = true; } else if arg == "-verbose" { verbose = true; + } else if arg == "-git-branch" { + if git_branch { + return Err(CodeM8Error::new( + "git branch mode was provided more than once", + )); + } + git_branch = true; } else if let Some(value) = arg.strip_prefix("-file-extension=") { if file_extensions.is_some() { return Err(CodeM8Error::new( @@ -125,11 +154,17 @@ where "no report switch provided; pass --report-duplicate", )); } + if git_branch && files.is_some() { + return Err(CodeM8Error::new( + "git branch mode cannot be combined with explicit files", + )); + } Ok(CliConfig { report_duplicate, verbose, file_extensions: file_extensions.unwrap_or_else(supported_file_extensions), files, + git_branch, }) } @@ -191,6 +226,30 @@ fn is_help_argument(arg: &str) -> bool { matches!(arg, "help" | "-h") } +fn codem8_version_from_cargo_lock() -> Option<&'static str> { + cargo_lock_packages(CARGO_LOCK) + .find(|package| package.name == "codem8") + .map(|package| package.version) +} + +fn cargo_lock_packages(lockfile: &str) -> impl Iterator> { + lockfile.split("[[package]]").filter_map(cargo_lock_package) +} + +fn cargo_lock_package(section: &str) -> Option> { + let name = cargo_lock_value(section, "name")?; + let version = cargo_lock_value(section, "version")?; + Some(CargoLockPackage { name, version }) +} + +fn cargo_lock_value<'a>(section: &'a str, key: &str) -> Option<&'a str> { + let prefix = format!("{key} = \""); + section + .lines() + .map(str::trim) + .find_map(|line| line.strip_prefix(&prefix)?.strip_suffix('"')) +} + #[cfg(test)] mod tests { use super::*; @@ -209,16 +268,42 @@ mod tests { #[test] fn exposes_detailed_help_text() { - assert!(help_text().contains("USAGE:")); - assert!(help_text().contains("--report-duplicate")); - 
assert!(help_text().contains("-verbose")); - assert!(help_text().contains("-file-extension=")); - assert!(help_text().contains("-files=")); - assert!(!help_text().contains("--verbose")); - assert!(!help_text().contains("--file-extension=")); - assert!(!help_text().contains("--files=")); - assert!(help_text().contains("helps you find repeated code")); - assert!(!help_text().contains("Duplicate weight")); + let help = help_text(); + assert!(help.contains("USAGE:")); + assert!(help.contains("--report-duplicate")); + assert!(help.contains("-verbose")); + assert!(help.contains("-file-extension=")); + assert!(help.contains("-files=")); + assert!(help.contains("-git-branch")); + assert!(!help.contains("--verbose")); + assert!(!help.contains("--file-extension=")); + assert!(!help.contains("--files=")); + assert!(!help.contains("--git-branch")); + assert!(help.contains("helps you find repeated code")); + assert!(!help.contains("Duplicate weight")); + } + + #[test] + fn help_text_includes_version_from_cargo_lock() { + let version = codem8_version_from_cargo_lock().expect("codem8 version exists"); + assert!(help_text().starts_with(&format!("CodeM8 {version} - "))); + } + + #[test] + fn extracts_package_versions_from_cargo_lock_sections() { + let lockfile = r#" +[[package]] +name = "dependency" +version = "1.2.3" + +[[package]] +name = "codem8" +version = "0.4.2" +"#; + let package = cargo_lock_packages(lockfile) + .find(|package| package.name == "codem8") + .expect("package exists"); + assert_eq!(package.version, "0.4.2"); } #[test] @@ -228,6 +313,7 @@ mod tests { assert!(!config.verbose); assert_eq!(config.file_extensions, supported_file_extensions()); assert_eq!(config.files, None); + assert!(!config.git_branch); } #[test] @@ -237,6 +323,13 @@ mod tests { assert!(config.verbose); } + #[test] + fn parses_git_branch_duplicate_report_config() { + let config = parse_args(["--report-duplicate", "-git-branch"]).expect("config parses"); + assert!(config.git_branch); + 
assert_eq!(config.files, None); + } + #[test] fn parses_extensions_case_insensitively_and_trims_whitespace() { let extensions = parse_file_extensions(" ts, JS ,tsx,ts ").expect("extensions parse"); @@ -284,6 +377,7 @@ mod tests { "--verbose", "--file-extension=js", "--files=src/a.ts", + "--git-branch", ] { let error = parse_args(["--report-duplicate", option]).expect_err("double-dash option fails"); @@ -315,6 +409,24 @@ mod tests { .contains("explicit files were provided more than once")); } + #[test] + fn rejects_repeated_git_branch_arguments() { + let error = parse_args(["--report-duplicate", "-git-branch", "-git-branch"]) + .expect_err("repeated git branch mode fails"); + assert!(error + .to_string() + .contains("git branch mode was provided more than once")); + } + + #[test] + fn rejects_git_branch_with_explicit_files() { + let error = parse_args(["--report-duplicate", "-git-branch", "-files=a.ts"]) + .expect_err("exclusive file modes fail"); + assert!(error + .to_string() + .contains("git branch mode cannot be combined with explicit files")); + } + #[test] fn parses_explicit_file_list() { let files = parse_file_list("src/a.ts, ./src/b.ts").expect("files parse"); diff --git a/src/discovery.rs b/src/discovery.rs index 0424979..c48293a 100644 --- a/src/discovery.rs +++ b/src/discovery.rs @@ -90,9 +90,16 @@ fn discover_explicit_files( if !seen_paths.insert(canonical_path.clone()) { continue; } + let display_path = if absolute_input { + canonical_path + .strip_prefix(current_dir) + .map_or_else(|_| normalize_display_path(file), normalize_display_path) + } else { + normalize_display_path(file) + }; source_files.push(SourceFile { path: canonical_path, - display_path: normalize_display_path(file), + display_path, extension, }); } diff --git a/src/git.rs b/src/git.rs new file mode 100644 index 0000000..965f0c9 --- /dev/null +++ b/src/git.rs @@ -0,0 +1,284 @@ +use std::collections::BTreeSet; +use std::fs; +use std::path::{Path, PathBuf}; +use std::process::{Command, 
Output}; + +use crate::error::{CodeM8Error, Result}; + +/// Lists files changed on the current branch compared to the origin base branch. +/// +/// # Errors +/// +/// Returns an error when `current_dir` is not inside a Git repository, the +/// current branch cannot be resolved, or the origin base branch is missing. +pub fn changed_files_against_origin(current_dir: &Path) -> Result> { + let repo_root = repo_root(current_dir)?; + ensure_named_branch(&repo_root)?; + let origin_ref = origin_base_ref(&repo_root)?; + let merge_base = run_git_text( + &repo_root, + &["merge-base", &origin_ref, "HEAD"], + "find merge base with origin base branch", + )?; + let mut paths = BTreeSet::new(); + collect_nul_paths( + &repo_root, + &[ + "diff", + "--name-only", + "-z", + "--diff-filter=ACMRTUXB", + merge_base.trim(), + "HEAD", + ], + &mut paths, + )?; + collect_nul_paths( + &repo_root, + &[ + "diff", + "--name-only", + "-z", + "--cached", + "--diff-filter=ACMRTUXB", + ], + &mut paths, + )?; + collect_nul_paths( + &repo_root, + &["diff", "--name-only", "-z", "--diff-filter=ACMRTUXB"], + &mut paths, + )?; + collect_nul_paths( + &repo_root, + &["ls-files", "--others", "--exclude-standard", "-z"], + &mut paths, + )?; + Ok(paths + .into_iter() + .filter_map(|path| existing_file_path(&repo_root, current_dir, &path)) + .collect()) +} + +fn repo_root(current_dir: &Path) -> Result { + let output = run_git_output( + current_dir, + &["rev-parse", "--show-toplevel"], + "find git repository", + )?; + if !output.status.success() { + return Err(CodeM8Error::new( + "git branch mode requires the current directory to be inside a git repository", + )); + } + let root = output_text(output.stdout, "parse git repository root")?; + Ok(PathBuf::from(root.trim())) +} + +fn ensure_named_branch(repo_root: &Path) -> Result<()> { + let branch = run_git_text( + repo_root, + &["rev-parse", "--abbrev-ref", "HEAD"], + "determine current git branch", + )?; + let branch = branch.trim(); + if branch == "HEAD" { + 
return Err(CodeM8Error::new( + "git branch mode requires a named local branch, but HEAD is detached", + )); + } + Ok(()) +} + +fn origin_base_ref(repo_root: &Path) -> Result { + for candidate in ["origin/HEAD", "origin/main", "origin/master"] { + if verify_origin_ref(repo_root, candidate) { + return Ok(candidate.to_string()); + } + } + Err(CodeM8Error::new( + "git branch mode could not resolve origin base branch", + )) +} + +fn verify_origin_ref(repo_root: &Path, origin_ref: &str) -> bool { + let commit_ref = format!("{origin_ref}^{{commit}}"); + run_git_output( + repo_root, + &["rev-parse", "--verify", &commit_ref], + "resolve origin base branch", + ) + .is_ok_and(|output| output.status.success()) +} + +fn collect_nul_paths(repo_root: &Path, args: &[&str], paths: &mut BTreeSet) -> Result<()> { + let output = run_git_output(repo_root, args, "list changed git files")?; + let stdout = ensure_git_success(output, "list changed git files")?; + for path in nul_paths(&stdout) { + paths.insert(path); + } + Ok(()) +} + +fn existing_file_path(repo_root: &Path, current_dir: &Path, path: &Path) -> Option { + let absolute = repo_root.join(path); + let metadata = fs::symlink_metadata(&absolute).ok()?; + if !metadata.is_file() || metadata.file_type().is_symlink() { + return None; + } + let relative = absolute.strip_prefix(current_dir).map(Path::to_path_buf); + Some(relative.unwrap_or(absolute)) +} + +fn run_git_text(current_dir: &Path, args: &[&str], action: &str) -> Result { + let output = run_git_output(current_dir, args, action)?; + let stdout = ensure_git_success(output, action)?; + output_text(stdout, action) +} + +fn run_git_output(current_dir: &Path, args: &[&str], action: &str) -> Result { + Command::new("git") + .arg("-C") + .arg(current_dir) + .args(args) + .output() + .map_err(|error| CodeM8Error::new(format!("could not {action}: {error}"))) +} + +fn ensure_git_success(output: Output, action: &str) -> Result> { + if output.status.success() { + return Ok(output.stdout); 
+ } + let stderr = output_text(output.stderr, action)?; + Err(CodeM8Error::new(format!( + "could not {action}: {}", + stderr.trim() + ))) +} + +fn output_text(bytes: Vec, action: &str) -> Result { + String::from_utf8(bytes) + .map_err(|error| CodeM8Error::new(format!("could not {action}: {error}"))) +} + +fn nul_paths(bytes: &[u8]) -> Vec { + String::from_utf8_lossy(bytes) + .split('\0') + .filter(|path| !path.is_empty()) + .map(PathBuf::from) + .collect() +} + +#[cfg(test)] +mod tests { + use std::process::Command; + use std::sync::atomic::{AtomicUsize, Ordering}; + + use super::*; + + static TEMP_COUNTER: AtomicUsize = AtomicUsize::new(0); + + struct TempGitRepo { + path: PathBuf, + } + + impl TempGitRepo { + fn new(name: &str) -> Self { + let id = TEMP_COUNTER.fetch_add(1, Ordering::Relaxed); + let path = + std::env::temp_dir().join(format!("codem8-git-{name}-{}-{id}", std::process::id())); + if path.exists() { + fs::remove_dir_all(&path).expect("remove stale test directory"); + } + fs::create_dir_all(&path).expect("create test directory"); + Self { path } + } + + fn path(&self) -> &Path { + &self.path + } + + fn write(&self, relative_path: &str, contents: &str) { + let path = self.path.join(relative_path); + if let Some(parent) = path.parent() { + fs::create_dir_all(parent).expect("create parent directory"); + } + fs::write(path, contents).expect("write test file"); + } + + fn git(&self, args: &[&str]) { + let status = Command::new("git") + .arg("-C") + .arg(&self.path) + .args(args) + .status() + .expect("run git"); + assert!(status.success(), "git command failed: {args:?}"); + } + + fn commit(&self, message: &str) { + self.git(&["add", "."]); + self.git(&[ + "-c", + "user.name=CodeM8 Test", + "-c", + "user.email=codem8@example.invalid", + "commit", + "-m", + message, + ]); + } + } + + impl Drop for TempGitRepo { + fn drop(&mut self) { + let _ = fs::remove_dir_all(&self.path); + } + } + + fn git_is_available() -> bool { + Command::new("git") + 
.arg("--version") + .status() + .is_ok_and(|status| status.success()) + } + + #[test] + fn rejects_non_git_directory() { + let repo = TempGitRepo::new("non-repo"); + let error = changed_files_against_origin(repo.path()).expect_err("non-repo fails"); + assert!(error.to_string().contains("requires the current directory")); + } + + #[test] + fn lists_committed_staged_unstaged_and_untracked_files() { + if !git_is_available() { + return; + } + let repo = TempGitRepo::new("changes"); + repo.git(&["init"]); + repo.write("src/base.ts", "const value = one;\n"); + repo.write("src/deleted.ts", "const value = deleted;\n"); + repo.commit("initial"); + repo.git(&["update-ref", "refs/remotes/origin/main", "HEAD"]); + repo.git(&["branch", "-M", "feature"]); + repo.write("src/committed.ts", "const value = committed;\n"); + repo.commit("branch change"); + repo.git(&["update-ref", "refs/remotes/origin/feature", "HEAD"]); + repo.write("src/staged.ts", "const value = staged;\n"); + repo.git(&["add", "src/staged.ts"]); + repo.write("src/base.ts", "const value = modified;\n"); + repo.write("src/untracked.ts", "const value = untracked;\n"); + fs::remove_file(repo.path().join("src/deleted.ts")).expect("delete tracked file"); + let files = changed_files_against_origin(repo.path()).expect("list branch files"); + assert_eq!( + files, + [ + PathBuf::from("src/base.ts"), + PathBuf::from("src/committed.ts"), + PathBuf::from("src/staged.ts"), + PathBuf::from("src/untracked.ts"), + ] + ); + } +} diff --git a/src/language.rs b/src/language.rs index efabeb9..636c8b1 100644 --- a/src/language.rs +++ b/src/language.rs @@ -27,11 +27,17 @@ pub const LANGUAGE_PATTERNS: &[LanguageLinePattern] = &[ duplicate_mitigation_pattern: &['(', ')', ',', ':', ';', '<', '>', '?', '[', ']', '{', '}'], duplicate_mitigation_lines: &["///"], duplicate_mitigation_regexps: &[ + // Excludes short path or enum variant fragments. 
Example: Self::Ready, r"^[A-Za-z0-9_]*::?\s*[A-Za-z0-9_]*[,]?$", + // Excludes bare identifiers with optional punctuation. Example: value, r"^[A-Za-z0-9_]+\s*[.,]?$", + // Excludes simple method or field access lines. Example: .clone() r"^\.?\s*[A-Za-z0-9_]+(?:\(\s*\)?)?$", + // Excludes incomplete let bindings split across lines. Example: let value = r"^let\s+(?:mut\s+)?[A-Za-z0-9_]+\s*=$", + // Excludes simple public struct field declarations. Example: pub name: String, r"^pub\s+[A-Za-z0-9_]*\s*:\s*[A-Za-z0-9_]*[,]?$", + // Excludes single-path use imports. Example: use crate::module; r"^use\s+[A-Za-z_][A-Za-z0-9_]*(?:::[A-Za-z_][A-Za-z0-9_]*)*;$", ], }, diff --git a/src/lib.rs b/src/lib.rs index b0a3005..6656221 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -2,6 +2,7 @@ pub mod cli; pub mod discovery; pub mod duplicate; pub mod error; +pub mod git; pub mod language; pub mod line; pub mod model; @@ -30,16 +31,28 @@ where .write_all(cli::help_text().as_bytes()) .map_err(|error| CodeM8Error::new(format!("could not write help output: {error}")))?, cli::CliCommand::ReportDuplicate(config) => { + let should_report_scanned_files = config.git_branch || config.files.is_some(); + let git_branch_files = if config.git_branch { + Some(git::changed_files_against_origin(current_dir)?) 
+ } else { + None + }; let source_files = discovery::discover_source_files( current_dir, &config.file_extensions, - config.files.as_deref(), + git_branch_files.as_deref().or(config.files.as_deref()), )?; let processed_files = line::process_source_files(&source_files)?; let duplicate_blocks = duplicate::detect_duplicate_blocks(&processed_files); let report = report::DuplicateReport { analyzed_files: source_files.len(), analyzed_extensions: config.file_extensions, + scanned_files: should_report_scanned_files.then(|| { + source_files + .iter() + .map(|source_file| source_file.display_path.clone()) + .collect() + }), duplicate_blocks, }; writer @@ -122,7 +135,7 @@ mod tests { "Duplicate Code Report\n", "=====================\n", "\n", - "Analyzed files: 2\n", + "Number of files scanned: 2\n", "Analyzed extensions: ", &expected_extensions, "\n", @@ -173,21 +186,42 @@ mod tests { project.write("src/b.ts", "const value = one;\n"); let output = run_in(&project, &["--report-duplicate", "-files=src/a.ts"]).expect("report succeeds"); - assert!(output.contains("Analyzed files: 1")); + assert!(output.contains("Number of files scanned: 1")); assert!(output.contains("Duplicate blocks found: 0")); } + #[test] + fn verbose_explicit_files_report_lists_scanned_files() { + let project = TempProject::new("verbose-explicit-files"); + project.write("src/a.ts", "const value = one;\n"); + project.write("src/b.ts", "const value = one;\n"); + let quiet_output = + run_in(&project, &["--report-duplicate", "-files=src/a.ts"]).expect("report succeeds"); + assert!(!quiet_output.contains("Files scanned:")); + let verbose_output = run_in( + &project, + &["--report-duplicate", "-verbose", "-files=src/a.ts"], + ) + .expect("report succeeds"); + assert!(verbose_output.contains( + "Number of files scanned: 1\n\ + Files scanned:\n\ + - src/a.ts\n\ + Analyzed extensions:" + )); + } + #[test] fn custom_extensions_change_analyzed_files() { let project = TempProject::new("custom-extensions"); 
project.write("src/a.js", "const value = one;\n"); project.write("src/b.js", "const value = one;\n"); let default_output = run_in(&project, &["--report-duplicate"]).expect("report succeeds"); - assert!(default_output.contains("Analyzed files: 2")); + assert!(default_output.contains("Number of files scanned: 2")); assert!(default_output.contains("Duplicate blocks found: 1")); let js_output = run_in(&project, &["--report-duplicate", "-file-extension=js"]) .expect("report succeeds"); - assert!(js_output.contains("Analyzed files: 2")); + assert!(js_output.contains("Number of files scanned: 2")); assert!(js_output.contains("Duplicate blocks found: 1")); } diff --git a/src/report.rs b/src/report.rs index 1d039e5..eb7c42d 100644 --- a/src/report.rs +++ b/src/report.rs @@ -1,4 +1,5 @@ use std::fmt::Write as _; +use std::path::PathBuf; use crate::model::DuplicateBlock; use crate::paths::format_path; @@ -7,6 +8,7 @@ use crate::paths::format_path; pub struct DuplicateReport { pub analyzed_files: usize, pub analyzed_extensions: Vec, + pub scanned_files: Option>, pub duplicate_blocks: Vec, } @@ -15,7 +17,18 @@ pub fn render_duplicate_report(report: &DuplicateReport, verbose: bool) -> Strin let mut output = String::new(); output.push_str("Duplicate Code Report\n"); output.push_str("=====================\n\n"); - let _ = writeln!(output, "Analyzed files: {}", report.analyzed_files); + let _ = writeln!(output, "Number of files scanned: {}", report.analyzed_files); + let scanned_files = if verbose { + report.scanned_files.as_ref() + } else { + None + }; + if let Some(scanned_files) = scanned_files { + output.push_str("Files scanned:\n"); + for file in scanned_files { + let _ = writeln!(output, "- {}", format_path(file)); + } + } let _ = writeln!( output, "Analyzed extensions: {}", @@ -68,6 +81,7 @@ mod tests { let report = DuplicateReport { analyzed_files: 0, analyzed_extensions: vec!["ts".to_string()], + scanned_files: None, duplicate_blocks: Vec::new(), }; assert_eq!( @@ -75,7 
+89,7 @@ mod tests { "Duplicate Code Report\n\ =====================\n\ \n\ - Analyzed files: 0\n\ + Number of files scanned: 0\n\ Analyzed extensions: ts\n\ Duplicate blocks found: 0\n" ); @@ -86,6 +100,7 @@ mod tests { let report = DuplicateReport { analyzed_files: 2, analyzed_extensions: vec!["ts".to_string(), "js".to_string()], + scanned_files: None, duplicate_blocks: vec![DuplicateBlock { normalized_lines: vec!["return value;".to_string()], occurrences: vec![ @@ -122,6 +137,7 @@ mod tests { let report = DuplicateReport { analyzed_files: 2, analyzed_extensions: vec!["ts".to_string()], + scanned_files: None, duplicate_blocks: vec![DuplicateBlock { normalized_lines: vec!["return value;".to_string()], occurrences: vec![ @@ -145,4 +161,27 @@ mod tests { assert!(output.contains("Occurrences: 2")); assert!(!output.contains("Characters:")); } + + #[test] + fn renders_scanned_file_list_only_in_verbose_mode() { + let report = DuplicateReport { + analyzed_files: 2, + analyzed_extensions: vec!["ts".to_string()], + scanned_files: Some(vec![ + PathBuf::from("src/a.ts"), + PathBuf::from("src/nested/b.ts"), + ]), + duplicate_blocks: Vec::new(), + }; + let quiet_output = render_duplicate_report(&report, false); + assert!(!quiet_output.contains("Files scanned:")); + let verbose_output = render_duplicate_report(&report, true); + assert!(verbose_output.contains( + "Number of files scanned: 2\n\ + Files scanned:\n\ + - src/a.ts\n\ + - src/nested/b.ts\n\ + Analyzed extensions: ts" + )); + } }