diff --git a/Cargo.lock b/Cargo.lock index 7b39a5f..9998c0a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -11,20 +11,136 @@ dependencies = [ "memchr", ] +[[package]] +name = "bstr" +version = "1.12.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5cee35f73844aa3014bb606320a6c1f010249dbdf43342fe54b5a4f6a8ed4b79" +dependencies = [ + "memchr", + "serde_core", +] + [[package]] name = "codem8" -version = "0.1.0" +version = "0.2.0" dependencies = [ + "ignore", + "rayon", "regex", "xxhash-rust", ] +[[package]] +name = "crossbeam-deque" +version = "0.8.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9dd111b7b7f7d55b72c0a6ae361660ee5853c9af73f70c3c2ef6858b950e2e51" +dependencies = [ + "crossbeam-epoch", + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-epoch" +version = "0.9.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e" +dependencies = [ + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-utils" +version = "0.8.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28" + +[[package]] +name = "either" +version = "1.16.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "91622ff5e7162018101f2fea40d6ebf4a78bbe5a49736a2020649edf9693679e" + +[[package]] +name = "globset" +version = "0.4.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "52dfc19153a48bde0cbd630453615c8151bce3a5adfac7a0aebfbf0a1e1f57e3" +dependencies = [ + "aho-corasick", + "bstr", + "log", + "regex-automata", + "regex-syntax", +] + +[[package]] +name = "ignore" +version = "0.4.26" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b915661dd01db3f05050265b2477bcc6527b3792388e2749b41623cc592be67d" +dependencies = [ + "crossbeam-deque", + "globset", + "log", + "memchr", + "regex-automata", + "same-file", + "walkdir", + "winapi-util", +] + +[[package]] +name = "log" +version = "0.4.33" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0ceec5bc11778974d1bcb055b18002eba7f4b3518b6a0081b3af5f21666da9ad" + [[package]] name = "memchr" version = "2.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "88904434abc2901f197fe8cc55f0445e7ded921dba5911dad2e2b39b48e663c4" +[[package]] +name = "proc-macro2" +version = "1.0.106" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8fd00f0bb2e90d81d1044c2b32617f68fcb9fa3bb7640c23e9c748e53fb30934" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "quote" +version = "1.0.46" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dfbc457d0c7a0759a614551b11a6409e5951f6c7537be1f1b7682b9ae9230368" +dependencies = [ + "proc-macro2", +] + +[[package]] +name = "rayon" +version = "1.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fb39b166781f92d482534ef4b4b1b2568f42613b53e5b6c160e24cfbfa30926d" +dependencies = [ + "either", + "rayon-core", +] + +[[package]] +name = "rayon-core" +version = "1.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "22e18b0f0062d30d4230b2e85ff77fdfe4326feb054b9783a3460d8435c8ab91" +dependencies = [ + "crossbeam-deque", + "crossbeam-utils", +] + [[package]] name = "regex" version = "1.12.4" @@ -54,6 +170,86 @@ version = "0.8.11" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d6f6ff9a378485b298a5286656da665ba74413d36db0979633275d2e708145d4" +[[package]] +name = "same-file" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "93fc1dc3aaa9bfed95e02e6eadabb4baf7e3078b0bd1b4d7b6b0b68378900502" +dependencies = [ + "winapi-util", +] + +[[package]] +name = "serde_core" +version = "1.0.228" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "41d385c7d4ca58e59fc732af25c3983b67ac852c1a25000afe1175de458b67ad" +dependencies = [ + "serde_derive", +] + +[[package]] +name = "serde_derive" +version = "1.0.228" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d540f220d3187173da220f885ab66608367b6574e925011a9353e4badda91d79" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "syn" +version = "2.0.118" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1b9ae57f904213ebb649ce6895b8a66c66f0203b9319718f69a5612a065b1422" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "unicode-ident" +version = "1.0.24" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e6e4313cd5fcd3dad5cafa179702e2b244f760991f45397d14d4ebf38247da75" + +[[package]] +name = "walkdir" +version = "2.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "29790946404f91d9c5d06f9874efddea1dc06c5efe94541a7d6863108e3a5e4b" +dependencies = [ + "same-file", + "winapi-util", +] + +[[package]] +name = "winapi-util" +version = "0.1.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c2a7b1c03c876122aa43f3020e6c3c3ee5c05081c9a00739faf7503aeba10d22" +dependencies = [ + "windows-sys", +] + +[[package]] +name = "windows-link" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f0805222e57f7521d6a62e36fa9163bc891acd422f971defe97d64e70d0a4fe5" + +[[package]] +name = "windows-sys" +version = "0.61.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ae137229bcbd6cdf0f7b80a31df61766145077ddf49416a728b02cb3921ff3fc" +dependencies = [ + "windows-link", +] + [[package]] name = "xxhash-rust" version = "0.8.15" diff --git a/Cargo.toml b/Cargo.toml index 8cfb01c..e1567f5 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "codem8" -version = "0.1.0" +version = "0.2.0" edition = "2021" license = "MIT" description = "A deterministic source code analysis CLI for duplicate code reports." @@ -9,5 +9,7 @@ keywords = ["cli", "duplicate-detection", "source-code", "analysis"] categories = ["command-line-utilities", "development-tools"] [dependencies] +ignore = "0.4" +rayon = "1" regex = "1" xxhash-rust = { version = "0.8", features = ["xxh3"] } diff --git a/README.md b/README.md index 74ed36e..2312f11 100644 --- a/README.md +++ b/README.md @@ -40,13 +40,13 @@ cargo run -- --report-duplicate ## Usage -Analyze TypeScript files from the current directory: +Analyze supported source files from the current directory: ```bash codem8 --report-duplicate ``` -Analyze multiple extensions: +Restrict analysis to specific extensions: ```bash codem8 --report-duplicate -file-extension=ts,tsx,js,jsx @@ -65,7 +65,7 @@ base branch: codem8 --report-duplicate -git-branch ``` -Include duplicate block metrics: +Include duplicate block metrics and timing information: ```bash codem8 --report-duplicate -verbose @@ -73,8 +73,9 @@ codem8 --report-duplicate -verbose ## Duplicate Report -By default, CodeM8 analyzes `.ts` files. Recursive discovery skips common -irrelevant directories such as `.git`, `node_modules`, `target`, `dist`, +By default, CodeM8 analyzes all registered source file extensions. Recursive +discovery respects Git ignore rules, works outside Git repositories, and skips +common irrelevant directories such as `.git`, `node_modules`, `target`, `dist`, `build`, `coverage`, `.next`, `.nuxt`, `.svelte-kit`, `.idea`, and `.vscode`. Symbolic links are not followed. @@ -99,55 +100,22 @@ Reports are sorted deterministically by descending weight, then by line count, character count, first location, and normalized block text. By default, each duplicate block prints the duplicated code before its -locations. Use `-verbose` to also show weight, line count, and occurrence -count. Character counts are used internally for scoring and sorting, but are -not printed. - -## Language Heuristics - -CodeM8 includes a hard-coded registry of block-only line patterns for common -languages and markup formats: - -- TypeScript / JavaScript -- Rust -- C / C++ / Objective-C -- C# -- Java / Kotlin / Scala -- Go -- Python -- Ruby -- PHP -- Swift -- Shell -- PowerShell -- HTML / XML -- CSS / SCSS / Sass / Less -- SQL -- YAML / JSON / TOML - -Block-only lines, such as braces or closing tags, cannot start a duplicate by -themselves. They can still be included inside a larger duplicated block when -surrounding comparison lines match. +locations. Use `-verbose` to also show weight, line count, occurrence count, and +timings for discovery, file processing, and duplicate detection. Character +counts are used internally for scoring and sorting, but are not printed. ## Development Run the full local verification set: ```bash +cargo test cargo fmt --all -- --check -cargo clippy --workspace --all-targets --all-features -- -D warnings -W clippy::too_many_lines -W clippy::too_many_arguments -W clippy::type_complexity -W clippy::excessive_nesting -W clippy::cognitive_complexity -rtk cargo build --locked --all-targets -cargo test --all-targets +cargo clippy --workspace --all-targets --all-features -- -D warnings -W clippy::too_many_lines -W clippy::too_many_arguments -W clippy::type_complexity -W clippy::excessive_nesting -W clippy::cognitive_complexity -W clippy::pedantic -W clippy::nursery -W clippy::cargo +cargo build --locked --all-targets ``` The repository includes GitHub Actions workflows for Rust CI and a CodeRabbit review gate. CI verifies formatting, build success, and tests on pushes and pull requests. The CodeRabbit gate runs when CodeRabbit submits or edits a pull request review and fails if CodeRabbit requests changes on the current PR head. - -## Dependency Policy - -CodeM8 avoids external packages for functionality that is simple to implement -and maintain directly. The first implementation uses one runtime dependency, -`xxhash-rust`, for the required XXH3 128-bit hash implementation. The crate is -widely used and permissively licensed under MIT or Apache-2.0. diff --git a/src/discovery.rs b/src/discovery.rs index c48293a..b985495 100644 --- a/src/discovery.rs +++ b/src/discovery.rs @@ -1,6 +1,9 @@ use std::collections::HashSet; use std::fs; use std::path::{Path, PathBuf}; +use std::sync::mpsc; + +use ignore::{DirEntry, WalkBuilder, WalkState}; use crate::error::{CodeM8Error, Result}; use crate::model::SourceFile; @@ -34,9 +37,7 @@ pub fn discover_source_files( let mut source_files = if let Some(files) = explicit_files { discover_explicit_files(current_dir, extensions, files)? } else { - let mut source_files = Vec::new(); - walk_directory(current_dir, current_dir, extensions, &mut source_files)?; - source_files + discover_recursive_files(current_dir, extensions)? }; source_files.sort_by(|left, right| { format_path(&left.display_path).cmp(&format_path(&right.display_path)) @@ -44,6 +45,89 @@ pub fn discover_source_files( Ok(source_files) } +fn discover_recursive_files(root: &Path, extensions: &[String]) -> Result> { + let root = root.to_path_buf(); + let extensions = extensions.to_vec(); + let (source_tx, source_rx) = mpsc::channel(); + let (error_tx, error_rx) = mpsc::channel(); + let walker = WalkBuilder::new(&root) + .hidden(false) + .ignore(true) + .git_ignore(true) + .git_global(true) + .git_exclude(true) + .require_git(false) + .parents(true) + .filter_entry(should_walk_entry) + .build_parallel(); + walker.run(|| { + let root = root.clone(); + let extensions = extensions.clone(); + let source_tx = source_tx.clone(); + let error_tx = error_tx.clone(); + Box::new(move |entry| match entry { + Ok(entry) => { + let Some(source_file) = source_file_from_entry(&root, &extensions, &entry) else { + return WalkState::Continue; + }; + if source_tx.send(source_file).is_err() { + return WalkState::Quit; + } + WalkState::Continue + } + Err(error) => { + let _ = error_tx.send(walk_error(&root, &error)); + WalkState::Quit + } + }) + }); + drop(source_tx); + drop(error_tx); + if let Some(error) = error_rx.into_iter().next() { + return Err(error); + } + Ok(source_rx.into_iter().collect()) +} + +fn source_file_from_entry( + root: &Path, + extensions: &[String], + entry: &DirEntry, +) -> Option { + let file_type = entry.file_type()?; + if !file_type.is_file() { + return None; + } + let path = entry.path(); + let extension = selected_extension(path, extensions)?; + let display_path = path + .strip_prefix(root) + .map_or_else(|_| normalize_display_path(path), normalize_display_path); + Some(SourceFile { + path: path.to_path_buf(), + display_path, + extension, + }) +} + +fn walk_error(root: &Path, error: &ignore::Error) -> CodeM8Error { + CodeM8Error::new(format!( + "could not walk directory {}: {error}", + format_path(root) + )) +} + +fn should_walk_entry(entry: &DirEntry) -> bool { + let Some(file_type) = entry.file_type() else { + return true; + }; + if !file_type.is_dir() || entry.depth() == 0 { + return true; + } + let directory_name = entry.file_name().to_string_lossy().to_ascii_lowercase(); + !IGNORED_DIRECTORIES.contains(&directory_name.as_str()) +} + fn discover_explicit_files( current_dir: &Path, extensions: &[String], @@ -106,52 +190,6 @@ fn discover_explicit_files( Ok(source_files) } -fn walk_directory( - root: &Path, - directory: &Path, - extensions: &[String], - source_files: &mut Vec, -) -> Result<()> { - let mut entries = fs::read_dir(directory) - .map_err(|error| CodeM8Error::io(directory, "read directory", &error))? - .collect::, _>>() - .map_err(|error| CodeM8Error::io(directory, "read directory entry", &error))?; - entries.sort_by(|left, right| { - left.file_name() - .to_string_lossy() - .cmp(&right.file_name().to_string_lossy()) - }); - for entry in entries { - let path = entry.path(); - let file_type = entry - .file_type() - .map_err(|error| CodeM8Error::io(&path, "inspect path", &error))?; - if file_type.is_symlink() { - continue; - } - if file_type.is_dir() { - let directory_name = entry.file_name().to_string_lossy().to_ascii_lowercase(); - if IGNORED_DIRECTORIES.contains(&directory_name.as_str()) { - continue; - } - walk_directory(root, &path, extensions, source_files)?; - } else if file_type.is_file() { - let Some(extension) = selected_extension(&path, extensions) else { - continue; - }; - let display_path = path - .strip_prefix(root) - .map_or_else(|_| normalize_display_path(&path), normalize_display_path); - source_files.push(SourceFile { - path, - display_path, - extension, - }); - } - } - Ok(()) -} - fn selected_extension(path: &Path, extensions: &[String]) -> Option { let extension = path.extension()?.to_str()?.to_ascii_lowercase(); extensions @@ -196,6 +234,20 @@ mod tests { fs::remove_dir_all(root).expect("cleanup"); } + #[test] + fn recursive_discovery_respects_gitignore_without_requiring_git_repository() { + let root = temp_dir("gitignore"); + fs::create_dir_all(root.join("src")).expect("create src"); + fs::create_dir_all(root.join("generated")).expect("create generated"); + fs::write(root.join(".gitignore"), "generated/\n").expect("write gitignore"); + fs::write(root.join("src").join("a.ts"), "").expect("write source ts"); + fs::write(root.join("generated").join("ignored.ts"), "").expect("write ignored ts"); + let files = discover_source_files(&root, &["ts".to_string()], None).expect("discover"); + assert_eq!(files.len(), 1); + assert_eq!(format_path(&files[0].display_path), "src/a.ts"); + fs::remove_dir_all(root).expect("cleanup"); + } + #[test] fn explicit_files_skip_unselected_extensions() { let root = temp_dir("explicit-skip"); diff --git a/src/language.rs b/src/language.rs index 636c8b1..202827a 100644 --- a/src/language.rs +++ b/src/language.rs @@ -17,9 +17,18 @@ pub const LANGUAGE_PATTERNS: &[LanguageLinePattern] = &[ LanguageLinePattern { language_name: "TypeScript / JavaScript", extensions: &["ts", "tsx", "js", "jsx", "mjs", "cjs"], - duplicate_mitigation_pattern: &['(', ')', ',', ':', ';', '<', '>', '?', '[', ']', '{', '}'], - duplicate_mitigation_lines: &[], - duplicate_mitigation_regexps: &[], + duplicate_mitigation_pattern: &[ + '&', '(', ')', ',', ':', ';', '<', '>', '?', '[', ']', '{', '|', '}', + ], + duplicate_mitigation_lines: &["// @ts-nocheck"], + duplicate_mitigation_regexps: &[ + // Excludes single-line block comments used by generated files and tooling. Example: /* eslint-disable */ + r"^/\*.*\*/$", + // Excludes generated interface field declarations. Example: errors: InvalidInputError[] + r"^[A-Za-z_$][A-Za-z0-9_$]*\??:\s*(?:Scalars\['[A-Za-z]+'\]|[A-Z][A-Za-z0-9_$]*(?:\[\])?|[a-z]+(?:\[\])?|\([^)]*\))(?:\[\])?(?:\s*\|\s*(?:null|number|boolean|string))*[,]?$", + // Excludes generated GraphQL typename marker fields. Example: __typename: 'User' + r"^__typename:\s*'[A-Za-z_$][A-Za-z0-9_$]*'[,]?$", + ], }, LanguageLinePattern { language_name: "Rust", @@ -142,8 +151,8 @@ pub const LANGUAGE_PATTERNS: &[LanguageLinePattern] = &[ duplicate_mitigation_regexps: &[], }, LanguageLinePattern { - language_name: "YAML / JSON / TOML", - extensions: &["yaml", "yml", "json", "toml"], + language_name: "YAML", + extensions: &["yaml", "yml"], duplicate_mitigation_pattern: &['(', ')', ',', ':', ';', '<', '>', '?', '[', ']', '{', '}'], duplicate_mitigation_lines: &["jobs:", "on:"], duplicate_mitigation_regexps: &[], @@ -331,6 +340,34 @@ mod tests { assert_eq!(classify_line("rs", line, hash), LineStatus::Comparison); } + #[test] + fn assigns_block_only_status_for_typescript_codegen_lines() { + let lines = [ + "// @ts-nocheck", + "/* eslint-disable */", + "errors: DeleteViewsError[]", + "__typename: 'DeleteViewsResponse'", + ]; + for line in lines { + let hash = hash_normalized_line(line); + assert_eq!(classify_line("ts", line, hash), LineStatus::BlockOnly); + } + } + + #[test] + fn assigns_block_only_status_for_yaml_lines() { + let line = "jobs:"; + let hash = hash_normalized_line(line); + assert_eq!(classify_line("yaml", line, hash), LineStatus::BlockOnly); + } + + #[test] + fn assigns_comparison_status_for_json_lines() { + let line = "}"; + let hash = hash_normalized_line(line); + assert_eq!(classify_line("json", line, hash), LineStatus::Comparison); + } + #[test] fn ignores_character_pattern_for_unknown_extensions() { let line = "});"; diff --git a/src/lib.rs b/src/lib.rs index 6656221..d7454b2 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -11,6 +11,7 @@ pub mod report; use std::io::Write; use std::path::Path; +use std::time::{Duration, Instant}; use crate::error::{CodeM8Error, Result}; @@ -32,18 +33,24 @@ where .map_err(|error| CodeM8Error::new(format!("could not write help output: {error}")))?, cli::CliCommand::ReportDuplicate(config) => { let should_report_scanned_files = config.git_branch || config.files.is_some(); - let git_branch_files = if config.git_branch { - Some(git::changed_files_against_origin(current_dir)?) - } else { - None - }; - let source_files = discovery::discover_source_files( - current_dir, - &config.file_extensions, - git_branch_files.as_deref().or(config.files.as_deref()), - )?; - let processed_files = line::process_source_files(&source_files)?; - let duplicate_blocks = duplicate::detect_duplicate_blocks(&processed_files); + let (source_files, discovery_duration) = time_result(config.verbose, || { + let git_branch_files = if config.git_branch { + Some(git::changed_files_against_origin(current_dir)?) + } else { + None + }; + discovery::discover_source_files( + current_dir, + &config.file_extensions, + git_branch_files.as_deref().or(config.files.as_deref()), + ) + })?; + let (processed_files, file_processing_duration) = + time_result(config.verbose, || line::process_source_files(&source_files))?; + let (duplicate_blocks, duplicate_detection_duration) = + time_value(config.verbose, || { + duplicate::detect_duplicate_blocks(&processed_files) + }); let report = report::DuplicateReport { analyzed_files: source_files.len(), analyzed_extensions: config.file_extensions, @@ -53,6 +60,20 @@ where .map(|source_file| source_file.display_path.clone()) .collect() }), + timings: match ( + discovery_duration, + file_processing_duration, + duplicate_detection_duration, + ) { + (Some(discovery), Some(file_processing), Some(duplicate_detection)) => { + Some(report::DuplicateReportTimings { + discovery, + file_processing, + duplicate_detection, + }) + } + _ => None, + }, duplicate_blocks, }; writer @@ -65,6 +86,21 @@ where Ok(()) } +fn time_result( + enabled: bool, + operation: impl FnOnce() -> Result, +) -> Result<(T, Option)> { + let started_at = enabled.then(Instant::now); + let value = operation()?; + Ok((value, started_at.map(|instant| instant.elapsed()))) +} + +fn time_value(enabled: bool, operation: impl FnOnce() -> T) -> (T, Option) { + let started_at = enabled.then(Instant::now); + let value = operation(); + (value, started_at.map(|instant| instant.elapsed())) +} + #[cfg(test)] mod tests { use std::fs; @@ -128,7 +164,9 @@ mod tests { "const value = computeValue(input);\nif (value === undefined) {\nreturn defaultValue;\n}\n", ); let output = run_in(&project, &["--report-duplicate"]).expect("report succeeds"); - let expected_extensions = language::supported_file_extensions().join(", "); + let mut expected_extensions = language::supported_file_extensions(); + expected_extensions.sort(); + let expected_extensions = expected_extensions.join(", "); assert_eq!( output, [ @@ -172,6 +210,10 @@ mod tests { assert!(output.contains("Weight:")); assert!(output.contains("Lines: 2")); assert!(output.contains("Occurrences: 2")); + assert!(output.contains("Timings:")); + assert!(output.contains("- Discovery:")); + assert!(output.contains("- File processing:")); + assert!(output.contains("- Duplicate detection:")); assert!(!output.contains("Characters:")); assert!( output.find("Code:").expect("code section exists") diff --git a/src/line.rs b/src/line.rs index 92dc0f5..5c591c8 100644 --- a/src/line.rs +++ b/src/line.rs @@ -1,6 +1,8 @@ use std::fs::File; use std::io::{BufRead, BufReader}; +use rayon::prelude::*; + use crate::error::{CodeM8Error, Result}; use crate::language::{classify_line, hash_normalized_line}; use crate::model::{LineEntry, ProcessedFile, SourceFile}; @@ -11,7 +13,7 @@ use crate::model::{LineEntry, ProcessedFile, SourceFile}; /// /// Returns an error when any input file cannot be opened or read as UTF-8 text. pub fn process_source_files(source_files: &[SourceFile]) -> Result> { - source_files.iter().map(process_source_file).collect() + source_files.par_iter().map(process_source_file).collect() } /// Processes one source file into its normalized, classified lines. @@ -63,6 +65,7 @@ pub fn normalize_line(line: &str) -> Option { #[cfg(test)] mod tests { use std::fs; + use std::path::PathBuf; use crate::model::LineStatus; @@ -97,6 +100,32 @@ mod tests { fs::remove_file(path).expect("cleanup"); } + #[test] + fn processes_files_in_input_order() { + let id = std::process::id(); + let first_path = std::env::temp_dir().join(format!("codem8-line-order-first-{id}.ts")); + let second_path = std::env::temp_dir().join(format!("codem8-line-order-second-{id}.ts")); + fs::write(&first_path, "const first = 1;\n").expect("write first source file"); + fs::write(&second_path, "const second = 2;\n").expect("write second source file"); + let sources = vec![ + SourceFile { + path: first_path.clone(), + display_path: "first.ts".into(), + extension: "ts".to_string(), + }, + SourceFile { + path: second_path.clone(), + display_path: "second.ts".into(), + extension: "ts".to_string(), + }, + ]; + let processed = process_source_files(&sources).expect("process source files"); + assert_eq!(processed[0].source.display_path, PathBuf::from("first.ts")); + assert_eq!(processed[1].source.display_path, PathBuf::from("second.ts")); + fs::remove_file(first_path).expect("cleanup first"); + fs::remove_file(second_path).expect("cleanup second"); + } + #[test] fn returns_clear_error_for_invalid_utf8() { let path = std::env::temp_dir().join(format!( diff --git a/src/report.rs b/src/report.rs index eb7c42d..9667ef3 100644 --- a/src/report.rs +++ b/src/report.rs @@ -1,5 +1,6 @@ use std::fmt::Write as _; use std::path::PathBuf; +use std::time::Duration; use crate::model::DuplicateBlock; use crate::paths::format_path; @@ -9,9 +10,17 @@ pub struct DuplicateReport { pub analyzed_files: usize, pub analyzed_extensions: Vec, pub scanned_files: Option>, + pub timings: Option, pub duplicate_blocks: Vec, } +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub struct DuplicateReportTimings { + pub discovery: Duration, + pub file_processing: Duration, + pub duplicate_detection: Duration, +} + #[must_use] pub fn render_duplicate_report(report: &DuplicateReport, verbose: bool) -> String { let mut output = String::new(); @@ -32,13 +41,33 @@ pub fn render_duplicate_report(report: &DuplicateReport, verbose: bool) -> Strin let _ = writeln!( output, "Analyzed extensions: {}", - report.analyzed_extensions.join(", ") + sorted_extensions(&report.analyzed_extensions).join(", ") ); let _ = writeln!( output, "Duplicate blocks found: {}", report.duplicate_blocks.len() ); + if verbose { + if let Some(timings) = report.timings { + output.push_str("Timings:\n"); + let _ = writeln!( + output, + "- Discovery: {}", + format_duration(timings.discovery) + ); + let _ = writeln!( + output, + "- File processing: {}", + format_duration(timings.file_processing) + ); + let _ = writeln!( + output, + "- Duplicate detection: {}", + format_duration(timings.duplicate_detection) + ); + } + } for (index, block) in report.duplicate_blocks.iter().enumerate() { output.push('\n'); let _ = writeln!(output, "#{}", index + 1); @@ -68,9 +97,23 @@ pub fn render_duplicate_report(report: &DuplicateReport, verbose: bool) -> Strin output } +fn format_duration(duration: Duration) -> String { + let microseconds = duration.as_micros(); + let milliseconds = microseconds / 1_000; + let fractional_microseconds = microseconds % 1_000; + format!("{milliseconds}.{fractional_microseconds:03} ms") +} + +fn sorted_extensions(extensions: &[String]) -> Vec { + let mut extensions = extensions.to_vec(); + extensions.sort(); + extensions +} + #[cfg(test)] mod tests { use std::path::PathBuf; + use std::time::Duration; use crate::model::{DuplicateBlock, DuplicateOccurrence}; @@ -82,6 +125,7 @@ mod tests { analyzed_files: 0, analyzed_extensions: vec!["ts".to_string()], scanned_files: None, + timings: None, duplicate_blocks: Vec::new(), }; assert_eq!( @@ -101,6 +145,7 @@ mod tests { analyzed_files: 2, analyzed_extensions: vec!["ts".to_string(), "js".to_string()], scanned_files: None, + timings: None, duplicate_blocks: vec![DuplicateBlock { normalized_lines: vec!["return value;".to_string()], occurrences: vec![ @@ -132,12 +177,26 @@ mod tests { ); } + #[test] + fn renders_analyzed_extensions_alphabetically() { + let report = DuplicateReport { + analyzed_files: 0, + analyzed_extensions: vec!["ts".to_string(), "js".to_string(), "rs".to_string()], + scanned_files: None, + timings: None, + duplicate_blocks: Vec::new(), + }; + let output = render_duplicate_report(&report, false); + assert!(output.contains("Analyzed extensions: js, rs, ts\n")); + } + #[test] fn renders_duplicate_block_metrics_in_verbose_mode() { let report = DuplicateReport { analyzed_files: 2, analyzed_extensions: vec!["ts".to_string()], scanned_files: None, + timings: None, duplicate_blocks: vec![DuplicateBlock { normalized_lines: vec!["return value;".to_string()], occurrences: vec![ @@ -171,6 +230,7 @@ mod tests { PathBuf::from("src/a.ts"), PathBuf::from("src/nested/b.ts"), ]), + timings: None, duplicate_blocks: Vec::new(), }; let quiet_output = render_duplicate_report(&report, false); @@ -184,4 +244,28 @@ mod tests { Analyzed extensions: ts" )); } + + #[test] + fn renders_timings_only_in_verbose_mode() { + let report = DuplicateReport { + analyzed_files: 1, + analyzed_extensions: vec!["ts".to_string()], + scanned_files: None, + timings: Some(DuplicateReportTimings { + discovery: Duration::from_micros(1_234), + file_processing: Duration::from_micros(12_345), + duplicate_detection: Duration::from_micros(123_456), + }), + duplicate_blocks: Vec::new(), + }; + let quiet_output = render_duplicate_report(&report, false); + assert!(!quiet_output.contains("Timings:")); + let verbose_output = render_duplicate_report(&report, true); + assert!(verbose_output.contains( + "Timings:\n\ + - Discovery: 1.234 ms\n\ + - File processing: 12.345 ms\n\ + - Duplicate detection: 123.456 ms\n" + )); + } }