diff --git a/.coderabbit.yaml b/.coderabbit.yaml new file mode 100644 index 0000000..f7b3a5d --- /dev/null +++ b/.coderabbit.yaml @@ -0,0 +1,4 @@ +reviews: + request_changes_workflow: true + review_details: true + poem: false diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000..9dac4b1 --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,34 @@ +name: Rust CI + +on: + push: + branches: + - main + pull_request: + +permissions: + contents: read + +jobs: + rust: + name: Build, test, and format + runs-on: ubuntu-latest + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Install Rust toolchain + run: rustup toolchain install stable --profile minimal --component rustfmt --component clippy + + - name: Check formatting + run: cargo fmt --all -- --check + + - name: Run Clippy + run: cargo clippy --workspace --all-targets --all-features -- -D warnings -W clippy::too_many_lines -W clippy::too_many_arguments -W clippy::type_complexity -W clippy::excessive_nesting -W clippy::cognitive_complexity -W clippy::pedantic -W clippy::nursery -W clippy::cargo + + - name: Build + run: cargo build --locked --all-targets + + - name: Test + run: cargo test --locked --all-targets diff --git a/.github/workflows/coderabbit-review.yml b/.github/workflows/coderabbit-review.yml new file mode 100644 index 0000000..5cfc373 --- /dev/null +++ b/.github/workflows/coderabbit-review.yml @@ -0,0 +1,79 @@ +name: CodeRabbit Review Gate + +on: + pull_request_review: + types: + - submitted + - edited + - dismissed + +permissions: + contents: read + pull-requests: read + +jobs: + coderabbit-review: + name: Validate CodeRabbit review + if: github.event.pull_request.draft == false && github.event.review.user.login == 'coderabbitai[bot]' + runs-on: ubuntu-latest + + steps: + - name: Check CodeRabbit review state + env: + GITHUB_TOKEN: ${{ github.token }} + GITHUB_REPOSITORY: ${{ github.repository }} + PR_NUMBER: ${{ 
github.event.pull_request.number }} + PR_HEAD_SHA: ${{ github.event.pull_request.head.sha }} + run: | + node <<'NODE' + const token = process.env.GITHUB_TOKEN; + const [owner, repo] = process.env.GITHUB_REPOSITORY.split("/"); + const prNumber = process.env.PR_NUMBER; + const headSha = process.env.PR_HEAD_SHA; + + async function fetchReviews(page = 1, reviews = []) { + const url = `https://api.github.com/repos/${owner}/${repo}/pulls/${prNumber}/reviews?per_page=100&page=${page}`; + const response = await fetch(url, { + headers: { + Authorization: `Bearer ${token}`, + Accept: "application/vnd.github+json", + "X-GitHub-Api-Version": "2022-11-28", + }, + }); + + if (!response.ok) { + const body = await response.text(); + throw new Error(`GitHub review lookup failed: ${response.status} ${body}`); + } + + const pageReviews = await response.json(); + if (pageReviews.length === 0) { + return reviews; + } + return fetchReviews(page + 1, reviews.concat(pageReviews)); + } + + (async () => { + const reviews = await fetchReviews(); + const codeRabbitReviews = reviews + .filter((review) => review.user?.login === "coderabbitai[bot]") + .filter((review) => review.commit_id === headSha) + .sort((left, right) => new Date(left.submitted_at) - new Date(right.submitted_at)); + + const latestReview = codeRabbitReviews.at(-1); + if (!latestReview) { + console.error(`CodeRabbit has not submitted a review for ${headSha}.`); + process.exit(1); + } + + if (latestReview.state === "CHANGES_REQUESTED") { + console.error("CodeRabbit requested changes on this pull request."); + process.exit(1); + } + + console.log(`CodeRabbit review state for ${headSha}: ${latestReview.state}`); + })().catch((error) => { + console.error(error); + process.exit(1); + }); + NODE diff --git a/AGENTS.md b/AGENTS.md new file mode 100644 index 0000000..9b718e5 --- /dev/null +++ b/AGENTS.md @@ -0,0 +1,19 @@ +# Agent Instructions + +These instructions apply to code agents working in this repository, including Codex. 
+ +## Before finishing a change + +Run the repository verification commands from the workspace root and fix any issues before handing work back: + +```bash +cargo fmt --all -- --check +cargo clippy --workspace --all-targets --all-features -- -D warnings -W clippy::too_many_lines -W clippy::too_many_arguments -W clippy::type_complexity -W clippy::excessive_nesting -W clippy::cognitive_complexity -W clippy::pedantic -W clippy::nursery -W clippy::cargo +rtk cargo build --locked --all-targets +``` + +## Notes + +- Treat Clippy warnings as errors for generated or edited code. +- Prefer changes that satisfy the repository `clippy.toml` configuration without adding `#[allow(...)]` attributes unless a maintainer explicitly asks for them. +- If a command cannot be run in the current environment, call that out clearly in the handoff. diff --git a/Cargo.lock b/Cargo.lock new file mode 100644 index 0000000..7b39a5f --- /dev/null +++ b/Cargo.lock @@ -0,0 +1,61 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. 
+version = 4 + +[[package]] +name = "aho-corasick" +version = "1.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ddd31a130427c27518df266943a5308ed92d4b226cc639f5a8f1002816174301" +dependencies = [ + "memchr", +] + +[[package]] +name = "codem8" +version = "0.1.0" +dependencies = [ + "regex", + "xxhash-rust", +] + +[[package]] +name = "memchr" +version = "2.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "88904434abc2901f197fe8cc55f0445e7ded921dba5911dad2e2b39b48e663c4" + +[[package]] +name = "regex" +version = "1.12.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f1292b7759ae1cb9ec195452d1390a074f0cd8541ab7a5a8c31cd6db45d4a6ba" +dependencies = [ + "aho-corasick", + "memchr", + "regex-automata", + "regex-syntax", +] + +[[package]] +name = "regex-automata" +version = "0.4.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6e1dd4122fc1595e8162618945476892eefca7b88c52820e74af6262213cae8f" +dependencies = [ + "aho-corasick", + "memchr", + "regex-syntax", +] + +[[package]] +name = "regex-syntax" +version = "0.8.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d6f6ff9a378485b298a5286656da665ba74413d36db0979633275d2e708145d4" + +[[package]] +name = "xxhash-rust" +version = "0.8.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fdd20c5420375476fbd4394763288da7eb0cc0b8c11deed431a91562af7335d3" diff --git a/Cargo.toml b/Cargo.toml new file mode 100644 index 0000000..8cfb01c --- /dev/null +++ b/Cargo.toml @@ -0,0 +1,13 @@ +[package] +name = "codem8" +version = "0.1.0" +edition = "2021" +license = "MIT" +description = "A deterministic source code analysis CLI for duplicate code reports." 
+repository = "https://github.com/b4prog/CodeM8" +keywords = ["cli", "duplicate-detection", "source-code", "analysis"] +categories = ["command-line-utilities", "development-tools"] + +[dependencies] +regex = "1" +xxhash-rust = { version = "0.8", features = ["xxh3"] } diff --git a/README.md b/README.md index 93bf7bd..74ed36e 100644 --- a/README.md +++ b/README.md @@ -1 +1,153 @@ -# CodeM8 \ No newline at end of file +# CodeM8 + +CodeM8 is a Rust command-line application for deterministic source code reports. +The initial report detects duplicated line-based code blocks in a repository: + +```bash +codem8 --report-duplicate +``` + +The duplicate report is designed for both human developers and coding agents. It +trims source lines, ignores empty lines, hashes normalized lines with XXH3 +128-bit, classifies syntax-only lines as block-only, groups repeated blocks, and +prints a stable plain-text report sorted by duplicate weight. + +## Installation + +Install `codem8` from the GitHub source with Cargo: + +```bash +cargo install --git https://github.com/b4prog/CodeM8 codem8 +``` + +Build from a local checkout with Cargo: + +```bash +cargo build --release +``` + +Install from a local checkout: + +```bash +cargo install --path . 
+``` + +Run from the local checkout without installing: + +```bash +cargo run -- --report-duplicate +``` + +## Usage + +Analyze all supported source files from the current directory: + +```bash +codem8 --report-duplicate +``` + +Analyze only selected extensions: + +```bash +codem8 --report-duplicate -file-extension=ts,tsx,js,jsx +``` + +Analyze an explicit list of files instead of recursively discovering files: + +```bash +codem8 --report-duplicate -file-extension=ts,js -files=src/a.ts,src/b.js +``` + +Analyze files changed on the current local Git branch compared to the origin +base branch: + +```bash +codem8 --report-duplicate -git-branch +``` + +Include duplicate block metrics: + +```bash +codem8 --report-duplicate -verbose +``` + +## Duplicate Report + +By default, CodeM8 analyzes every file extension in its built-in language registry. Recursive discovery skips common +irrelevant directories such as `.git`, `node_modules`, `target`, `dist`, +`build`, `coverage`, `.next`, `.nuxt`, `.svelte-kit`, `.idea`, and `.vscode`. +Symbolic links are not followed. + +Every non-empty line is normalized with Rust string trimming, so leading and +trailing Unicode whitespace is removed before hashing and comparison. Empty +trimmed lines are ignored. CodeM8 currently expects UTF-8 source files; invalid +UTF-8 produces a clear error rather than lossy output. + +Use `-git-branch` to analyze only files changed on the current local branch +compared to the origin base branch. CodeM8 resolves that base from `origin/HEAD` +with `origin/main` and `origin/master` fallbacks. This includes committed, +staged, unstaged, and untracked files that still exist in the worktree. The +option requires a Git repository and cannot be combined with `-files`. + +Duplicate block weight is calculated as: + +```text +(occurrences - 1) * duplicated_line_count * cumulative_normalized_character_count +``` + +Reports are sorted deterministically by descending weight, then by line count, +character count, first location, and normalized block text. 
+ +By default, each duplicate block prints the duplicated code before its +locations. Use `-verbose` to also show weight, line count, and occurrence +count. Character counts are used internally for scoring and sorting, but are +not printed. + +## Language Heuristics + +CodeM8 includes a hard-coded registry of block-only line patterns for common +languages and markup formats: + +- TypeScript / JavaScript +- Rust +- C / C++ / Objective-C +- C# +- Java / Kotlin / Scala +- Go +- Python +- Ruby +- PHP +- Swift +- Shell +- PowerShell +- HTML / XML +- CSS / SCSS / Sass / Less +- SQL +- YAML / JSON / TOML + +Block-only lines, such as braces or closing tags, cannot start a duplicate by +themselves. They can still be included inside a larger duplicated block when +surrounding comparison lines match. + +## Development + +Run the full local verification set: + +```bash +cargo fmt --all -- --check +cargo clippy --workspace --all-targets --all-features -- -D warnings -W clippy::too_many_lines -W clippy::too_many_arguments -W clippy::type_complexity -W clippy::excessive_nesting -W clippy::cognitive_complexity -W clippy::pedantic -W clippy::nursery -W clippy::cargo +cargo build --locked --all-targets +cargo test --locked --all-targets +``` + +The repository includes GitHub Actions workflows for Rust CI and a CodeRabbit +review gate. CI verifies formatting, Clippy lints, build success, and tests on pushes to `main` and on pull +requests. The CodeRabbit gate runs when a CodeRabbit pull request review is submitted, edited, or dismissed, and fails if CodeRabbit +has not reviewed the current PR head or if its latest review of that head requests changes. + +## Dependency Policy + +CodeM8 avoids external packages for functionality that is simple to implement +and maintain directly. The first implementation uses two runtime dependencies: +`regex`, and `xxhash-rust` for the required XXH3 128-bit hash implementation. Both +are widely used and permissively licensed (`regex`: MIT or Apache-2.0; `xxhash-rust`: BSL-1.0). 
diff --git a/clippy.toml b/clippy.toml new file mode 100644 index 0000000..770b51a --- /dev/null +++ b/clippy.toml @@ -0,0 +1,5 @@ +too-many-lines-threshold = 80 +too-many-arguments-threshold = 5 +type-complexity-threshold = 200 +excessive-nesting-threshold = 4 +cognitive-complexity-threshold = 20 diff --git a/src/cli.rs b/src/cli.rs new file mode 100644 index 0000000..9b3dc17 --- /dev/null +++ b/src/cli.rs @@ -0,0 +1,446 @@ +use std::fmt::Write as _; +use std::path::PathBuf; + +use crate::error::{CodeM8Error, Result}; +use crate::language::supported_file_extensions; + +const CARGO_LOCK: &str = include_str!("../Cargo.lock"); +const HELP_TEXT_BODY: &str = "\ +USAGE: + codem8 help + codem8 --report-duplicate [OPTIONS] + +COMMANDS: + help + Display this detailed documentation. + +REQUIRED REPORT SWITCHES: + --report-duplicate + Analyze source files and print a duplicate code report. + +OPTIONS: + -file-extension= + Comma-separated source file extensions to analyze. + Defaults to all extensions registered in LANGUAGE_PATTERNS. + Examples: -file-extension=ts,tsx,js,jsx + + -files= + Comma-separated explicit files to analyze instead of recursively + discovering files from the current directory. + Example: -files=src/a.ts,src/b.js + + -git-branch + Analyze files changed on the current local Git branch compared to the + origin base branch, including committed, staged, unstaged, and untracked + files. Cannot be combined with -files. + + -verbose + Include duplicate block metrics in report output. + +DUPLICATE REPORT PURPOSE: + The duplicate report helps you find repeated code that may be worth + refactoring, reviewing, or consolidating. It lists each duplicated block with + the files and line ranges where it appears, making it easier to compare the + repeated code and decide whether it should stay duplicated. 
+ +EXAMPLES: + codem8 --report-duplicate + codem8 --report-duplicate -file-extension=ts,tsx,js,jsx + codem8 --report-duplicate -file-extension=ts,js -files=src/a.ts,src/b.js + codem8 --report-duplicate -git-branch +"; + +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +struct CargoLockPackage<'a> { + name: &'a str, + version: &'a str, +} + +#[derive(Debug, Clone, PartialEq, Eq)] +pub enum CliCommand { + Help, + ReportDuplicate(CliConfig), +} + +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct CliConfig { + pub report_duplicate: bool, + pub verbose: bool, + pub file_extensions: Vec, + pub files: Option>, + pub git_branch: bool, +} + +#[must_use] +pub fn help_text() -> String { + let version = codem8_version_from_cargo_lock().unwrap_or("unknown"); + let mut output = String::new(); + let _ = writeln!( + output, + "CodeM8 {version} - deterministic source code analysis reports." + ); + output.push('\n'); + output.push_str(HELP_TEXT_BODY); + output +} + +/// Parses command-line arguments into a CLI command. +/// +/// # Errors +/// +/// Returns an error when the arguments are invalid, repeated, or missing the +/// required report switch. +pub fn parse_command(args: I) -> Result +where + I: IntoIterator, + S: Into, +{ + let args = args.into_iter().map(Into::into).collect::>(); + if args.len() == 1 && is_help_argument(&args[0]) { + return Ok(CliCommand::Help); + } + parse_args(args).map(CliCommand::ReportDuplicate) +} + +/// Parses command-line arguments into a validated CLI configuration. +/// +/// # Errors +/// +/// Returns an error when the arguments are invalid, repeated, or missing the +/// required report switch. 
+pub fn parse_args<I, S>(args: I) -> Result<CliConfig>
+where
+    I: IntoIterator<Item = S>,
+    S: Into<String>,
+{
+    let mut report_duplicate = false;
+    let mut verbose = false;
+    let mut file_extensions = None;
+    let mut files = None;
+    let mut git_branch = false;
+    for arg in args {
+        let arg = arg.into();
+        if arg == "--report-duplicate" {
+            report_duplicate = true; // no repeat check: passing the switch twice is accepted
+        } else if arg == "-verbose" {
+            verbose = true; // likewise accepted when repeated
+        } else if arg == "-git-branch" {
+            if git_branch {
+                return Err(CodeM8Error::new(
+                    "git branch mode was provided more than once",
+                ));
+            }
+            git_branch = true;
+        } else if let Some(value) = arg.strip_prefix("-file-extension=") {
+            if file_extensions.is_some() {
+                return Err(CodeM8Error::new(
+                    "file extensions were provided more than once",
+                ));
+            }
+            file_extensions = Some(parse_file_extensions(value)?);
+        } else if let Some(value) = arg.strip_prefix("-files=") {
+            if files.is_some() {
+                return Err(CodeM8Error::new(
+                    "explicit files were provided more than once",
+                ));
+            }
+            files = Some(parse_file_list(value)?);
+        } else {
+            return Err(CodeM8Error::new(format!("unknown argument: {arg}")));
+        }
+    }
+    if !report_duplicate {
+        return Err(CodeM8Error::with_help(
+            "no report switch provided; pass --report-duplicate",
+        ));
+    }
+    if git_branch && files.is_some() {
+        return Err(CodeM8Error::new(
+            "git branch mode cannot be combined with explicit files",
+        ));
+    }
+    Ok(CliConfig {
+        report_duplicate,
+        verbose,
+        file_extensions: file_extensions.unwrap_or_else(supported_file_extensions),
+        files,
+        git_branch,
+    })
+}
+
+/// Parses a comma-separated list of file extensions.
+///
+/// # Errors
+///
+/// Returns an error when an extension is empty, starts with `.`, or contains a
+/// path separator.
+pub fn parse_file_extensions(value: &str) -> Result<Vec<String>> {
+    let mut extensions = Vec::new();
+    for raw_extension in value.split(',') {
+        let extension = raw_extension.trim();
+        if extension.is_empty() {
+            return Err(CodeM8Error::new("file extension values must not be empty"));
+        }
+        if extension.starts_with('.') {
+            return Err(CodeM8Error::new(format!(
+                "file extensions must not start with a dot: {extension}"
+            )));
+        }
+        if extension.contains('/') || extension.contains('\\') {
+            return Err(CodeM8Error::new(format!(
+                "file extensions must not contain path separators: {extension}"
+            )));
+        }
+        let extension = extension.to_ascii_lowercase();
+        if !extensions.contains(&extension) { // drop repeats, keep first-seen order
+            extensions.push(extension);
+        }
+    }
+    if extensions.is_empty() { // defensive: `split` always yields at least one item
+        return Err(CodeM8Error::new("at least one file extension is required"));
+    }
+    Ok(extensions)
+}
+
+/// Parses a comma-separated list of explicit file paths.
+///
+/// # Errors
+///
+/// Returns an error when any provided file path is empty.
+pub fn parse_file_list(value: &str) -> Result<Vec<PathBuf>> {
+    let mut files = Vec::new();
+    for raw_file in value.split(',') {
+        let file = raw_file.trim();
+        if file.is_empty() {
+            return Err(CodeM8Error::new("file path values must not be empty"));
+        }
+        files.push(PathBuf::from(file)); // stored as written; no de-duplication here
+    }
+    if files.is_empty() { // defensive: `split` always yields at least one item
+        return Err(CodeM8Error::new("at least one explicit file is required"));
+    }
+    Ok(files)
+}
+
+fn is_help_argument(arg: &str) -> bool {
+    matches!(arg, "help" | "-h") // only these two spellings request help
+}
+
+fn codem8_version_from_cargo_lock() -> Option<&'static str> {
+    cargo_lock_packages(CARGO_LOCK)
+        .find(|package| package.name == "codem8")
+        .map(|package| package.version)
+}
+
+fn cargo_lock_packages(lockfile: &str) -> impl Iterator<Item = CargoLockPackage<'_>> {
+    lockfile.split("[[package]]").filter_map(cargo_lock_package) // sections lacking name/version are skipped
+}
+
+fn cargo_lock_package(section: &str) -> Option<CargoLockPackage<'_>> {
+    let name = cargo_lock_value(section, "name")?;
+    let version = cargo_lock_value(section, "version")?;
+    Some(CargoLockPackage { name, version })
+}
+
+fn
cargo_lock_value<'a>(section: &'a str, key: &str) -> Option<&'a str> { + let prefix = format!("{key} = \""); + section + .lines() + .map(str::trim) + .find_map(|line| line.strip_prefix(&prefix)?.strip_suffix('"')) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn parses_help_command() { + let command = parse_command(["help"]).expect("help parses"); + assert_eq!(command, CliCommand::Help); + } + + #[test] + fn parses_short_help_option() { + let command = parse_command(["-h"]).expect("short help parses"); + assert_eq!(command, CliCommand::Help); + } + + #[test] + fn exposes_detailed_help_text() { + let help = help_text(); + assert!(help.contains("USAGE:")); + assert!(help.contains("--report-duplicate")); + assert!(help.contains("-verbose")); + assert!(help.contains("-file-extension=")); + assert!(help.contains("-files=")); + assert!(help.contains("-git-branch")); + assert!(!help.contains("--verbose")); + assert!(!help.contains("--file-extension=")); + assert!(!help.contains("--files=")); + assert!(!help.contains("--git-branch")); + assert!(help.contains("helps you find repeated code")); + assert!(!help.contains("Duplicate weight")); + } + + #[test] + fn help_text_includes_version_from_cargo_lock() { + let version = codem8_version_from_cargo_lock().expect("codem8 version exists"); + assert!(help_text().starts_with(&format!("CodeM8 {version} - "))); + } + + #[test] + fn extracts_package_versions_from_cargo_lock_sections() { + let lockfile = r#" +[[package]] +name = "dependency" +version = "1.2.3" + +[[package]] +name = "codem8" +version = "0.4.2" +"#; + let package = cargo_lock_packages(lockfile) + .find(|package| package.name == "codem8") + .expect("package exists"); + assert_eq!(package.version, "0.4.2"); + } + + #[test] + fn parses_default_duplicate_report_config() { + let config = parse_args(["--report-duplicate"]).expect("config parses"); + assert!(config.report_duplicate); + assert!(!config.verbose); + assert_eq!(config.file_extensions, 
supported_file_extensions()); + assert_eq!(config.files, None); + assert!(!config.git_branch); + } + + #[test] + fn parses_verbose_duplicate_report_config() { + let config = parse_args(["--report-duplicate", "-verbose"]).expect("config parses"); + assert!(config.report_duplicate); + assert!(config.verbose); + } + + #[test] + fn parses_git_branch_duplicate_report_config() { + let config = parse_args(["--report-duplicate", "-git-branch"]).expect("config parses"); + assert!(config.git_branch); + assert_eq!(config.files, None); + } + + #[test] + fn parses_extensions_case_insensitively_and_trims_whitespace() { + let extensions = parse_file_extensions(" ts, JS ,tsx,ts ").expect("extensions parse"); + assert_eq!(extensions, ["ts", "js", "tsx"]); + } + + #[test] + fn rejects_empty_extensions() { + let error = parse_file_extensions("ts,,js").expect_err("empty extension fails"); + assert!(error.to_string().contains("must not be empty")); + } + + #[test] + fn rejects_extensions_with_leading_dot() { + let error = parse_file_extensions(".ts").expect_err("dot-prefixed extension fails"); + assert!(error.to_string().contains("must not start with a dot")); + } + + #[test] + fn rejects_extensions_with_path_separators() { + let error = parse_file_extensions("src/ts").expect_err("path-like extension fails"); + assert!(error + .to_string() + .contains("must not contain path separators")); + } + + #[test] + fn rejects_missing_report_switch() { + let error = parse_args(["-file-extension=rs"]).expect_err("missing report fails"); + assert!(error.to_string().contains("no report switch provided")); + assert!(error.should_show_help()); + } + + #[test] + fn rejects_unknown_arguments() { + let error = parse_args(["--report-duplicate", "--unknown"]).expect_err("unknown arg fails"); + assert!(error.to_string().contains("unknown argument: --unknown")); + assert!(!error.should_show_help()); + } + + #[test] + fn rejects_double_dash_option_arguments() { + for option in [ + "--help", + "--verbose", + 
"--file-extension=js", + "--files=src/a.ts", + "--git-branch", + ] { + let error = + parse_args(["--report-duplicate", option]).expect_err("double-dash option fails"); + assert!(error + .to_string() + .contains(&format!("unknown argument: {option}"))); + } + } + + #[test] + fn rejects_repeated_file_extension_arguments() { + let error = parse_args([ + "--report-duplicate", + "-file-extension=ts", + "-file-extension=js", + ]) + .expect_err("repeated extensions fail"); + assert!(error + .to_string() + .contains("file extensions were provided more than once")); + } + + #[test] + fn rejects_repeated_explicit_file_arguments() { + let error = parse_args(["--report-duplicate", "-files=a.ts", "-files=b.ts"]) + .expect_err("repeated explicit files fail"); + assert!(error + .to_string() + .contains("explicit files were provided more than once")); + } + + #[test] + fn rejects_repeated_git_branch_arguments() { + let error = parse_args(["--report-duplicate", "-git-branch", "-git-branch"]) + .expect_err("repeated git branch mode fails"); + assert!(error + .to_string() + .contains("git branch mode was provided more than once")); + } + + #[test] + fn rejects_git_branch_with_explicit_files() { + let error = parse_args(["--report-duplicate", "-git-branch", "-files=a.ts"]) + .expect_err("exclusive file modes fail"); + assert!(error + .to_string() + .contains("git branch mode cannot be combined with explicit files")); + } + + #[test] + fn parses_explicit_file_list() { + let files = parse_file_list("src/a.ts, ./src/b.ts").expect("files parse"); + assert_eq!( + files, + [PathBuf::from("src/a.ts"), PathBuf::from("./src/b.ts")] + ); + } + + #[test] + fn rejects_empty_explicit_file_paths() { + let error = parse_file_list("src/a.ts, ").expect_err("empty explicit file fails"); + assert!(error + .to_string() + .contains("file path values must not be empty")); + } +} diff --git a/src/discovery.rs b/src/discovery.rs new file mode 100644 index 0000000..c48293a --- /dev/null +++ b/src/discovery.rs 
@@ -0,0 +1,248 @@ +use std::collections::HashSet; +use std::fs; +use std::path::{Path, PathBuf}; + +use crate::error::{CodeM8Error, Result}; +use crate::model::SourceFile; +use crate::paths::{format_path, normalize_display_path}; + +const IGNORED_DIRECTORIES: &[&str] = &[ + ".git", + "node_modules", + "target", + "dist", + "build", + "coverage", + ".next", + ".nuxt", + ".svelte-kit", + ".idea", + ".vscode", +]; + +/// Discovers source files that match the selected extensions. +/// +/// # Errors +/// +/// Returns an error when explicit files are invalid or when walking the file +/// tree fails. +pub fn discover_source_files( + current_dir: &Path, + extensions: &[String], + explicit_files: Option<&[PathBuf]>, +) -> Result> { + let mut source_files = if let Some(files) = explicit_files { + discover_explicit_files(current_dir, extensions, files)? + } else { + let mut source_files = Vec::new(); + walk_directory(current_dir, current_dir, extensions, &mut source_files)?; + source_files + }; + source_files.sort_by(|left, right| { + format_path(&left.display_path).cmp(&format_path(&right.display_path)) + }); + Ok(source_files) +} + +fn discover_explicit_files( + current_dir: &Path, + extensions: &[String], + files: &[PathBuf], +) -> Result> { + let mut source_files = Vec::new(); + let mut seen_paths = HashSet::new(); + for file in files { + let absolute_input = file.is_absolute(); + let path = if absolute_input { + file.clone() + } else { + current_dir.join(file) + }; + let metadata = fs::symlink_metadata(&path).map_err(|_| { + CodeM8Error::new(format!( + "explicit file does not exist: {}", + format_path(file) + )) + })?; + if metadata.file_type().is_symlink() { + return Err(CodeM8Error::new(format!( + "explicit file is a symbolic link and will not be followed: {}", + format_path(file) + ))); + } + if metadata.is_dir() { + return Err(CodeM8Error::new(format!( + "explicit file is a directory: {}", + format_path(file) + ))); + } + if !metadata.is_file() { + return 
Err(CodeM8Error::new(format!( + "explicit path is not a file: {}", + format_path(file) + ))); + } + let Some(extension) = selected_extension(&path, extensions) else { + continue; + }; + let canonical_path = fs::canonicalize(&path) + .map_err(|error| CodeM8Error::io(&path, "canonicalize explicit file", &error))?; + if !seen_paths.insert(canonical_path.clone()) { + continue; + } + let display_path = if absolute_input { + canonical_path + .strip_prefix(current_dir) + .map_or_else(|_| normalize_display_path(file), normalize_display_path) + } else { + normalize_display_path(file) + }; + source_files.push(SourceFile { + path: canonical_path, + display_path, + extension, + }); + } + Ok(source_files) +} + +fn walk_directory( + root: &Path, + directory: &Path, + extensions: &[String], + source_files: &mut Vec, +) -> Result<()> { + let mut entries = fs::read_dir(directory) + .map_err(|error| CodeM8Error::io(directory, "read directory", &error))? + .collect::, _>>() + .map_err(|error| CodeM8Error::io(directory, "read directory entry", &error))?; + entries.sort_by(|left, right| { + left.file_name() + .to_string_lossy() + .cmp(&right.file_name().to_string_lossy()) + }); + for entry in entries { + let path = entry.path(); + let file_type = entry + .file_type() + .map_err(|error| CodeM8Error::io(&path, "inspect path", &error))?; + if file_type.is_symlink() { + continue; + } + if file_type.is_dir() { + let directory_name = entry.file_name().to_string_lossy().to_ascii_lowercase(); + if IGNORED_DIRECTORIES.contains(&directory_name.as_str()) { + continue; + } + walk_directory(root, &path, extensions, source_files)?; + } else if file_type.is_file() { + let Some(extension) = selected_extension(&path, extensions) else { + continue; + }; + let display_path = path + .strip_prefix(root) + .map_or_else(|_| normalize_display_path(&path), normalize_display_path); + source_files.push(SourceFile { + path, + display_path, + extension, + }); + } + } + Ok(()) +} + +fn selected_extension(path: 
&Path, extensions: &[String]) -> Option { + let extension = path.extension()?.to_str()?.to_ascii_lowercase(); + extensions + .iter() + .any(|selected| selected.eq_ignore_ascii_case(&extension)) + .then_some(extension) +} + +#[cfg(test)] +mod tests { + use std::fs; + use std::sync::atomic::{AtomicUsize, Ordering}; + + use super::*; + + static TEMP_COUNTER: AtomicUsize = AtomicUsize::new(0); + + fn temp_dir(name: &str) -> PathBuf { + let id = TEMP_COUNTER.fetch_add(1, Ordering::Relaxed); + let path = std::env::temp_dir().join(format!( + "codem8-discovery-{name}-{}-{id}", + std::process::id() + )); + if path.exists() { + fs::remove_dir_all(&path).expect("remove stale test directory"); + } + fs::create_dir_all(&path).expect("create test directory"); + path + } + + #[test] + fn recursively_discovers_matching_extensions_and_ignores_common_directories() { + let root = temp_dir("recursive"); + fs::create_dir_all(root.join("src")).expect("create src"); + fs::create_dir_all(root.join("target")).expect("create target"); + fs::write(root.join("src").join("a.TS"), "").expect("write ts"); + fs::write(root.join("src").join("b.js"), "").expect("write js"); + fs::write(root.join("target").join("ignored.ts"), "").expect("write ignored"); + let files = discover_source_files(&root, &["ts".to_string()], None).expect("discover"); + assert_eq!(files.len(), 1); + assert_eq!(format_path(&files[0].display_path), "src/a.TS"); + fs::remove_dir_all(root).expect("cleanup"); + } + + #[test] + fn explicit_files_skip_unselected_extensions() { + let root = temp_dir("explicit-skip"); + fs::write(root.join("a.ts"), "").expect("write ts"); + fs::write(root.join("b.js"), "").expect("write js"); + let files = discover_source_files( + &root, + &["ts".to_string()], + Some(&[PathBuf::from("a.ts"), PathBuf::from("b.js")]), + ) + .expect("discover"); + assert_eq!(files.len(), 1); + assert_eq!(format_path(&files[0].display_path), "a.ts"); + fs::remove_dir_all(root).expect("cleanup"); + } + + #[test] + fn 
explicit_files_deduplicate_resolved_paths() { + let root = temp_dir("explicit-dedup"); + fs::write(root.join("a.ts"), "").expect("write ts"); + let absolute = fs::canonicalize(root.join("a.ts")).expect("canonicalize ts"); + let files = discover_source_files( + &root, + &["ts".to_string()], + Some(&[ + PathBuf::from("a.ts"), + PathBuf::from(".").join("a.ts"), + absolute.clone(), + ]), + ) + .expect("discover"); + assert_eq!(files.len(), 1); + assert_eq!(files[0].path, absolute); + assert_eq!(format_path(&files[0].display_path), "a.ts"); + fs::remove_dir_all(root).expect("cleanup"); + } + + #[test] + fn explicit_files_reject_directories() { + let root = temp_dir("explicit-directory"); + fs::create_dir_all(root.join("src")).expect("create explicit directory"); + let error = + discover_source_files(&root, &["ts".to_string()], Some(&[PathBuf::from("src")])) + .expect_err("directory explicit file fails"); + assert!(error + .to_string() + .contains("explicit file is a directory: src")); + fs::remove_dir_all(root).expect("cleanup"); + } +} diff --git a/src/duplicate.rs b/src/duplicate.rs new file mode 100644 index 0000000..6414768 --- /dev/null +++ b/src/duplicate.rs @@ -0,0 +1,410 @@ +use std::cmp::Ordering; +use std::collections::{BTreeSet, HashMap}; +use std::path::PathBuf; + +use crate::model::{DuplicateBlock, DuplicateOccurrence, LineEntry, LineStatus, ProcessedFile}; +use crate::paths::format_path; + +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] +struct LineRef { + file_index: usize, + line_index: usize, +} + +#[derive(Debug, Clone, PartialEq, Eq)] +struct OccurrenceKey { + file_path: PathBuf, + file_path_key: String, + start_line: usize, + end_line: usize, +} + +impl Ord for OccurrenceKey { + fn cmp(&self, other: &Self) -> Ordering { + self.file_path_key + .cmp(&other.file_path_key) + .then_with(|| self.start_line.cmp(&other.start_line)) + .then_with(|| self.end_line.cmp(&other.end_line)) + } +} + +impl PartialOrd for OccurrenceKey { + fn partial_cmp(&self, 
other: &Self) -> Option { + Some(self.cmp(other)) + } +} + +pub fn detect_duplicate_blocks(files: &[ProcessedFile]) -> Vec { + let mut line_index: HashMap> = HashMap::new(); + for (file_index, file) in files.iter().enumerate() { + for (line_index_in_file, line) in file.lines.iter().enumerate() { + line_index.entry(line.hash).or_default().push(LineRef { + file_index, + line_index: line_index_in_file, + }); + } + } + let mut blocks_by_lines: HashMap, BTreeSet> = HashMap::new(); + for refs in line_index.values() { + if refs.len() < 2 { + continue; + } + let mut comparison_refs_by_text: HashMap> = HashMap::new(); + for line_ref in refs { + let line = line_at(files, *line_ref); + if line.status != LineStatus::Comparison { + continue; + } + comparison_refs_by_text + .entry(line.normalized_text.clone()) + .or_default() + .push(*line_ref); + } + for comparison_refs in comparison_refs_by_text.values() { + if comparison_refs.len() < 2 { + continue; + } + collect_candidates(files, comparison_refs, &mut blocks_by_lines); + } + } + let mut duplicate_blocks = blocks_by_lines + .into_iter() + .filter_map(|(normalized_lines, occurrences)| { + if normalized_lines.is_empty() || occurrences.len() < 2 { + return None; + } + let occurrences = occurrences + .into_iter() + .map(|occurrence| DuplicateOccurrence { + file_path: occurrence.file_path, + start_line: occurrence.start_line, + end_line: occurrence.end_line, + }) + .collect::>(); + let character_count = normalized_lines + .iter() + .map(|line| line.chars().count() as u64) + .sum::(); + let weight = + (occurrences.len() as u64 - 1) * normalized_lines.len() as u64 * character_count; + Some(DuplicateBlock { + normalized_lines, + occurrences, + weight, + }) + }) + .collect::>(); + duplicate_blocks.sort_by(compare_duplicate_blocks); + duplicate_blocks +} + +#[derive(Debug)] +struct CandidateBlock { + normalized_lines: Vec, + left_occurrence: OccurrenceKey, + right_occurrence: OccurrenceKey, +} + +fn collect_candidates( + files: 
&[ProcessedFile], + comparison_refs: &[LineRef], + blocks_by_lines: &mut HashMap, BTreeSet>, +) { + for left_index in 0..comparison_refs.len() { + let left = comparison_refs[left_index]; + for &right in &comparison_refs[(left_index + 1)..] { + let Some(candidate) = expand_pair(files, left, right) else { + continue; + }; + store_candidate(candidate, blocks_by_lines); + } + } +} + +fn store_candidate( + candidate: CandidateBlock, + blocks_by_lines: &mut HashMap, BTreeSet>, +) { + let occurrences = blocks_by_lines + .entry(candidate.normalized_lines) + .or_default(); + occurrences.insert(candidate.left_occurrence); + occurrences.insert(candidate.right_occurrence); +} + +fn expand_pair(files: &[ProcessedFile], left: LineRef, right: LineRef) -> Option { + if left == right { + return None; + } + let mut left_start = left.line_index; + let mut right_start = right.line_index; + while left_start > 0 + && right_start > 0 + && line_text(files, left.file_index, left_start - 1) + == line_text(files, right.file_index, right_start - 1) + { + left_start -= 1; + right_start -= 1; + } + let mut left_end = left.line_index; + let mut right_end = right.line_index; + while left_end + 1 < files[left.file_index].lines.len() + && right_end + 1 < files[right.file_index].lines.len() + && line_text(files, left.file_index, left_end + 1) + == line_text(files, right.file_index, right_end + 1) + { + left_end += 1; + right_end += 1; + } + if left.file_index == right.file_index && left_start <= right_end && right_start <= left_end { + return None; + } + let normalized_lines = files[left.file_index].lines[left_start..=left_end] + .iter() + .map(|line| line.normalized_text.clone()) + .collect::>(); + Some(CandidateBlock { + normalized_lines, + left_occurrence: occurrence_for(files, left.file_index, left_start, left_end), + right_occurrence: occurrence_for(files, right.file_index, right_start, right_end), + }) +} + +fn occurrence_for( + files: &[ProcessedFile], + file_index: usize, + start_index: 
usize, + end_index: usize, +) -> OccurrenceKey { + let lines = &files[file_index].lines; + let file_path = files[file_index].source.display_path.clone(); + OccurrenceKey { + file_path_key: format_path(&file_path), + file_path, + start_line: lines[start_index].line_number, + end_line: lines[end_index].line_number, + } +} + +fn line_at(files: &[ProcessedFile], line_ref: LineRef) -> &LineEntry { + &files[line_ref.file_index].lines[line_ref.line_index] +} + +fn line_text(files: &[ProcessedFile], file_index: usize, line_index: usize) -> &str { + &files[file_index].lines[line_index].normalized_text +} + +fn compare_duplicate_blocks(left: &DuplicateBlock, right: &DuplicateBlock) -> Ordering { + right + .weight + .cmp(&left.weight) + .then_with(|| right.line_count().cmp(&left.line_count())) + .then_with(|| right.character_count().cmp(&left.character_count())) + .then_with(|| first_occurrence_key(left).cmp(&first_occurrence_key(right))) + .then_with(|| first_occurrence_start_line(left).cmp(&first_occurrence_start_line(right))) + .then_with(|| normalized_block_text(left).cmp(&normalized_block_text(right))) +} + +fn first_occurrence_key(block: &DuplicateBlock) -> String { + block + .occurrences + .first() + .map(|occurrence| format_path(&occurrence.file_path)) + .unwrap_or_default() +} + +fn first_occurrence_start_line(block: &DuplicateBlock) -> usize { + block + .occurrences + .first() + .map(|occurrence| occurrence.start_line) + .unwrap_or_default() +} + +fn normalized_block_text(block: &DuplicateBlock) -> String { + block.normalized_lines.join("\n") +} + +#[cfg(test)] +mod tests { + use crate::language::hash_normalized_line; + use crate::model::{LineEntry, ProcessedFile, SourceFile}; + + use super::*; + + fn processed_file(path: &str, extension: &str, lines: &[(&str, LineStatus)]) -> ProcessedFile { + let line_entries = lines + .iter() + .enumerate() + .map(|(index, (text, status))| LineEntry { + file_path: PathBuf::from(path), + line_number: index + 1, + normalized_text: 
(*text).to_string(), + hash: hash_normalized_line(text), + status: *status, + }) + .collect(); + ProcessedFile { + source: SourceFile { + path: PathBuf::from(path), + display_path: PathBuf::from(path), + extension: extension.to_string(), + }, + lines: line_entries, + } + } + + #[test] + fn groups_three_occurrences_of_the_same_block() { + let files = vec![ + processed_file( + "a.ts", + "ts", + &[ + ("const value = one;", LineStatus::Comparison), + ("return value;", LineStatus::Comparison), + ], + ), + processed_file( + "b.ts", + "ts", + &[ + ("const value = one;", LineStatus::Comparison), + ("return value;", LineStatus::Comparison), + ], + ), + processed_file( + "c.ts", + "ts", + &[ + ("const value = one;", LineStatus::Comparison), + ("return value;", LineStatus::Comparison), + ], + ), + ]; + let blocks = detect_duplicate_blocks(&files); + assert_eq!(blocks.len(), 1); + assert_eq!(blocks[0].occurrences.len(), 3); + assert_eq!( + blocks[0].normalized_lines, + ["const value = one;", "return value;"] + ); + } + + #[test] + fn ignores_matching_hashes_with_different_text() { + let mut files = vec![ + processed_file( + "a.ts", + "ts", + &[("const value = one;", LineStatus::Comparison)], + ), + processed_file( + "b.ts", + "ts", + &[("const value = two;", LineStatus::Comparison)], + ), + ]; + files[1].lines[0].hash = files[0].lines[0].hash; + let blocks = detect_duplicate_blocks(&files); + assert!(blocks.is_empty()); + } + + #[test] + fn sorts_duplicate_blocks_by_weight() { + let files = vec![ + processed_file( + "a.ts", + "ts", + &[ + ("const x = 1;", LineStatus::Comparison), + ("const uniqueA = true;", LineStatus::Comparison), + ("const longerValue = computeOne();", LineStatus::Comparison), + ("return longerValue;", LineStatus::Comparison), + ], + ), + processed_file( + "b.ts", + "ts", + &[ + ("const x = 1;", LineStatus::Comparison), + ("const uniqueB = true;", LineStatus::Comparison), + ("const longerValue = computeOne();", LineStatus::Comparison), + ("return 
longerValue;", LineStatus::Comparison), + ], + ), + ]; + let blocks = detect_duplicate_blocks(&files); + assert!(blocks.len() >= 2); + assert_eq!( + blocks[0].normalized_lines, + ["const longerValue = computeOne();", "return longerValue;"] + ); + assert!(blocks[0].weight >= blocks[1].weight); + } + + #[test] + fn ignores_single_line_duplicates_that_are_only_block_only_lines() { + let files = vec![ + processed_file("a.ts", "ts", &[("}", LineStatus::BlockOnly)]), + processed_file("b.ts", "ts", &[("}", LineStatus::BlockOnly)]), + ]; + let blocks = detect_duplicate_blocks(&files); + assert!(blocks.is_empty()); + } + + #[test] + fn includes_block_only_lines_inside_larger_duplicate_blocks() { + let files = vec![ + processed_file( + "a.ts", + "ts", + &[ + ("if (ready) {", LineStatus::Comparison), + ("}", LineStatus::BlockOnly), + ("return value;", LineStatus::Comparison), + ], + ), + processed_file( + "b.ts", + "ts", + &[ + ("if (ready) {", LineStatus::Comparison), + ("}", LineStatus::BlockOnly), + ("return value;", LineStatus::Comparison), + ], + ), + ]; + let blocks = detect_duplicate_blocks(&files); + assert_eq!(blocks.len(), 1); + assert_eq!( + blocks[0].normalized_lines, + ["if (ready) {", "}", "return value;"] + ); + } + + #[test] + fn rejects_overlapping_duplicate_ranges_in_the_same_file() { + let files = vec![processed_file( + "a.ts", + "ts", + &[ + ("const value = one;", LineStatus::Comparison), + ("const value = one;", LineStatus::Comparison), + ("const value = one;", LineStatus::Comparison), + ], + )]; + let blocks = detect_duplicate_blocks(&files); + assert!(!blocks.iter().any(|block| { + block.normalized_lines == ["const value = one;", "const value = one;"] + && block + .occurrences + .iter() + .any(|occurrence| occurrence.start_line == 1) + && block + .occurrences + .iter() + .any(|occurrence| occurrence.start_line == 2) + })); + } +} diff --git a/src/error.rs b/src/error.rs new file mode 100644 index 0000000..bcfe20b --- /dev/null +++ b/src/error.rs @@ -0,0 
+1,50 @@ +use std::error::Error; +use std::fmt; +use std::io; +use std::path::Path; + +use crate::paths::format_path; + +pub type Result = std::result::Result; + +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct CodeM8Error { + message: String, + show_help: bool, +} + +impl CodeM8Error { + #[must_use] + pub fn new(message: impl Into) -> Self { + Self { + message: message.into(), + show_help: false, + } + } + + #[must_use] + pub fn with_help(message: impl Into) -> Self { + Self { + message: message.into(), + show_help: true, + } + } + + #[must_use] + pub fn io(path: &Path, action: &str, error: &io::Error) -> Self { + Self::new(format!("could not {action} {}: {error}", format_path(path))) + } + + #[must_use] + pub const fn should_show_help(&self) -> bool { + self.show_help + } +} + +impl fmt::Display for CodeM8Error { + fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result { + formatter.write_str(&self.message) + } +} + +impl Error for CodeM8Error {} diff --git a/src/git.rs b/src/git.rs new file mode 100644 index 0000000..965f0c9 --- /dev/null +++ b/src/git.rs @@ -0,0 +1,284 @@ +use std::collections::BTreeSet; +use std::fs; +use std::path::{Path, PathBuf}; +use std::process::{Command, Output}; + +use crate::error::{CodeM8Error, Result}; + +/// Lists files changed on the current branch compared to the origin base branch. +/// +/// # Errors +/// +/// Returns an error when `current_dir` is not inside a Git repository, the +/// current branch cannot be resolved, or the origin base branch is missing. 
+pub fn changed_files_against_origin(current_dir: &Path) -> Result> { + let repo_root = repo_root(current_dir)?; + ensure_named_branch(&repo_root)?; + let origin_ref = origin_base_ref(&repo_root)?; + let merge_base = run_git_text( + &repo_root, + &["merge-base", &origin_ref, "HEAD"], + "find merge base with origin base branch", + )?; + let mut paths = BTreeSet::new(); + collect_nul_paths( + &repo_root, + &[ + "diff", + "--name-only", + "-z", + "--diff-filter=ACMRTUXB", + merge_base.trim(), + "HEAD", + ], + &mut paths, + )?; + collect_nul_paths( + &repo_root, + &[ + "diff", + "--name-only", + "-z", + "--cached", + "--diff-filter=ACMRTUXB", + ], + &mut paths, + )?; + collect_nul_paths( + &repo_root, + &["diff", "--name-only", "-z", "--diff-filter=ACMRTUXB"], + &mut paths, + )?; + collect_nul_paths( + &repo_root, + &["ls-files", "--others", "--exclude-standard", "-z"], + &mut paths, + )?; + Ok(paths + .into_iter() + .filter_map(|path| existing_file_path(&repo_root, current_dir, &path)) + .collect()) +} + +fn repo_root(current_dir: &Path) -> Result { + let output = run_git_output( + current_dir, + &["rev-parse", "--show-toplevel"], + "find git repository", + )?; + if !output.status.success() { + return Err(CodeM8Error::new( + "git branch mode requires the current directory to be inside a git repository", + )); + } + let root = output_text(output.stdout, "parse git repository root")?; + Ok(PathBuf::from(root.trim())) +} + +fn ensure_named_branch(repo_root: &Path) -> Result<()> { + let branch = run_git_text( + repo_root, + &["rev-parse", "--abbrev-ref", "HEAD"], + "determine current git branch", + )?; + let branch = branch.trim(); + if branch == "HEAD" { + return Err(CodeM8Error::new( + "git branch mode requires a named local branch, but HEAD is detached", + )); + } + Ok(()) +} + +fn origin_base_ref(repo_root: &Path) -> Result { + for candidate in ["origin/HEAD", "origin/main", "origin/master"] { + if verify_origin_ref(repo_root, candidate) { + return 
Ok(candidate.to_string()); + } + } + Err(CodeM8Error::new( + "git branch mode could not resolve origin base branch", + )) +} + +fn verify_origin_ref(repo_root: &Path, origin_ref: &str) -> bool { + let commit_ref = format!("{origin_ref}^{{commit}}"); + run_git_output( + repo_root, + &["rev-parse", "--verify", &commit_ref], + "resolve origin base branch", + ) + .is_ok_and(|output| output.status.success()) +} + +fn collect_nul_paths(repo_root: &Path, args: &[&str], paths: &mut BTreeSet) -> Result<()> { + let output = run_git_output(repo_root, args, "list changed git files")?; + let stdout = ensure_git_success(output, "list changed git files")?; + for path in nul_paths(&stdout) { + paths.insert(path); + } + Ok(()) +} + +fn existing_file_path(repo_root: &Path, current_dir: &Path, path: &Path) -> Option { + let absolute = repo_root.join(path); + let metadata = fs::symlink_metadata(&absolute).ok()?; + if !metadata.is_file() || metadata.file_type().is_symlink() { + return None; + } + let relative = absolute.strip_prefix(current_dir).map(Path::to_path_buf); + Some(relative.unwrap_or(absolute)) +} + +fn run_git_text(current_dir: &Path, args: &[&str], action: &str) -> Result { + let output = run_git_output(current_dir, args, action)?; + let stdout = ensure_git_success(output, action)?; + output_text(stdout, action) +} + +fn run_git_output(current_dir: &Path, args: &[&str], action: &str) -> Result { + Command::new("git") + .arg("-C") + .arg(current_dir) + .args(args) + .output() + .map_err(|error| CodeM8Error::new(format!("could not {action}: {error}"))) +} + +fn ensure_git_success(output: Output, action: &str) -> Result> { + if output.status.success() { + return Ok(output.stdout); + } + let stderr = output_text(output.stderr, action)?; + Err(CodeM8Error::new(format!( + "could not {action}: {}", + stderr.trim() + ))) +} + +fn output_text(bytes: Vec, action: &str) -> Result { + String::from_utf8(bytes) + .map_err(|error| CodeM8Error::new(format!("could not {action}: {error}"))) 
/// Splits NUL-separated output (as produced by git's `-z` flag) into paths.
///
/// Empty segments, including the one after a trailing NUL, are skipped. The
/// split happens on raw bytes so that each path is converted on its own.
fn nul_paths(bytes: &[u8]) -> Vec<PathBuf> {
    bytes
        .split(|&byte| byte == 0)
        .filter(|segment| !segment.is_empty())
        .map(path_from_git_bytes)
        .collect()
}

/// Builds a path from the exact bytes git printed, so names that are not valid
/// UTF-8 still refer to the real file on disk.
#[cfg(unix)]
fn path_from_git_bytes(bytes: &[u8]) -> PathBuf {
    use std::os::unix::ffi::OsStrExt;

    PathBuf::from(std::ffi::OsStr::from_bytes(bytes))
}

/// Builds a path from git output on platforms without byte-based paths;
/// invalid UTF-8 is replaced with U+FFFD, as before.
#[cfg(not(unix))]
fn path_from_git_bytes(bytes: &[u8]) -> PathBuf {
    PathBuf::from(String::from_utf8_lossy(bytes).into_owned())
}
directory")); + } + + #[test] + fn lists_committed_staged_unstaged_and_untracked_files() { + if !git_is_available() { + return; + } + let repo = TempGitRepo::new("changes"); + repo.git(&["init"]); + repo.write("src/base.ts", "const value = one;\n"); + repo.write("src/deleted.ts", "const value = deleted;\n"); + repo.commit("initial"); + repo.git(&["update-ref", "refs/remotes/origin/main", "HEAD"]); + repo.git(&["branch", "-M", "feature"]); + repo.write("src/committed.ts", "const value = committed;\n"); + repo.commit("branch change"); + repo.git(&["update-ref", "refs/remotes/origin/feature", "HEAD"]); + repo.write("src/staged.ts", "const value = staged;\n"); + repo.git(&["add", "src/staged.ts"]); + repo.write("src/base.ts", "const value = modified;\n"); + repo.write("src/untracked.ts", "const value = untracked;\n"); + fs::remove_file(repo.path().join("src/deleted.ts")).expect("delete tracked file"); + let files = changed_files_against_origin(repo.path()).expect("list branch files"); + assert_eq!( + files, + [ + PathBuf::from("src/base.ts"), + PathBuf::from("src/committed.ts"), + PathBuf::from("src/staged.ts"), + PathBuf::from("src/untracked.ts"), + ] + ); + } +} diff --git a/src/language.rs b/src/language.rs new file mode 100644 index 0000000..636c8b1 --- /dev/null +++ b/src/language.rs @@ -0,0 +1,355 @@ +use std::collections::HashMap; +use std::sync::OnceLock; + +use crate::model::LineStatus; +use regex::Regex; + +#[derive(Debug, Clone, Copy)] +pub struct LanguageLinePattern { + pub language_name: &'static str, + pub extensions: &'static [&'static str], + pub duplicate_mitigation_pattern: &'static [char], + pub duplicate_mitigation_lines: &'static [&'static str], + pub duplicate_mitigation_regexps: &'static [&'static str], +} + +pub const LANGUAGE_PATTERNS: &[LanguageLinePattern] = &[ + LanguageLinePattern { + language_name: "TypeScript / JavaScript", + extensions: &["ts", "tsx", "js", "jsx", "mjs", "cjs"], + duplicate_mitigation_pattern: &['(', ')', ',', ':', ';', 
'<', '>', '?', '[', ']', '{', '}'], + duplicate_mitigation_lines: &[], + duplicate_mitigation_regexps: &[], + }, + LanguageLinePattern { + language_name: "Rust", + extensions: &["rs"], + duplicate_mitigation_pattern: &['(', ')', ',', ':', ';', '<', '>', '?', '[', ']', '{', '}'], + duplicate_mitigation_lines: &["///"], + duplicate_mitigation_regexps: &[ + // Excludes short path or enum variant fragments. Example: Self::Ready, + r"^[A-Za-z0-9_]*::?\s*[A-Za-z0-9_]*[,]?$", + // Excludes bare identifiers with optional punctuation. Example: value, + r"^[A-Za-z0-9_]+\s*[.,]?$", + // Excludes simple method or field access lines. Example: .clone() + r"^\.?\s*[A-Za-z0-9_]+(?:\(\s*\)?)?$", + // Excludes incomplete let bindings split across lines. Example: let value = + r"^let\s+(?:mut\s+)?[A-Za-z0-9_]+\s*=$", + // Excludes simple public struct field declarations. Example: pub name: String, + r"^pub\s+[A-Za-z0-9_]*\s*:\s*[A-Za-z0-9_]*[,]?$", + // Excludes single-path use imports. Example: use crate::module; + r"^use\s+[A-Za-z_][A-Za-z0-9_]*(?:::[A-Za-z_][A-Za-z0-9_]*)*;$", + ], + }, + LanguageLinePattern { + language_name: "C / C++ / Objective-C", + extensions: &["c", "h", "cpp", "hpp", "cc", "hh", "cxx", "hxx", "m", "mm"], + duplicate_mitigation_pattern: &['(', ')', ',', ':', ';', '<', '>', '?', '[', ']', '{', '}'], + duplicate_mitigation_lines: &["#else", "#endif"], + duplicate_mitigation_regexps: &[], + }, + LanguageLinePattern { + language_name: "C#", + extensions: &["cs"], + duplicate_mitigation_pattern: &['(', ')', ',', ':', ';', '<', '>', '?', '[', ']', '{', '}'], + duplicate_mitigation_lines: &["#else", "#endif", "#endregion"], + duplicate_mitigation_regexps: &[], + }, + LanguageLinePattern { + language_name: "Java / Kotlin / Scala", + extensions: &["java", "kt", "kts", "scala", "sc"], + duplicate_mitigation_pattern: &['(', ')', ',', ':', ';', '<', '>', '?', '[', ']', '{', '}'], + duplicate_mitigation_lines: &[], + duplicate_mitigation_regexps: &[], + }, + 
LanguageLinePattern { + language_name: "Go", + extensions: &["go"], + duplicate_mitigation_pattern: &['(', ')', ',', ':', ';', '[', ']', '{', '}'], + duplicate_mitigation_lines: &[], + duplicate_mitigation_regexps: &[], + }, + LanguageLinePattern { + language_name: "Python", + extensions: &["py", "pyw"], + duplicate_mitigation_pattern: &['(', ')', ',', ':', ';', '[', ']', '{', '}'], + duplicate_mitigation_lines: &[], + duplicate_mitigation_regexps: &[], + }, + LanguageLinePattern { + language_name: "Ruby", + extensions: &["rb"], + duplicate_mitigation_pattern: &['(', ')', ',', ':', ';', '?', '[', ']', '{', '}'], + duplicate_mitigation_lines: &["end"], + duplicate_mitigation_regexps: &[], + }, + LanguageLinePattern { + language_name: "PHP", + extensions: &["php", "phtml"], + duplicate_mitigation_pattern: &[ + '(', ')', ',', '/', ':', ';', '<', '>', '?', '[', ']', '{', '}', + ], + duplicate_mitigation_lines: &[], + duplicate_mitigation_regexps: &[], + }, + LanguageLinePattern { + language_name: "Swift", + extensions: &["swift"], + duplicate_mitigation_pattern: &['(', ')', ',', ':', ';', '<', '>', '?', '[', ']', '{', '}'], + duplicate_mitigation_lines: &[], + duplicate_mitigation_regexps: &[], + }, + LanguageLinePattern { + language_name: "Shell", + extensions: &["sh", "bash", "zsh", "fish"], + duplicate_mitigation_pattern: &['&', '(', ')', ';', '[', ']', '{', '|', '}'], + duplicate_mitigation_lines: &["do", "done", "else", "fi", "then"], + duplicate_mitigation_regexps: &[], + }, + LanguageLinePattern { + language_name: "PowerShell", + extensions: &["ps1", "psm1", "psd1"], + duplicate_mitigation_pattern: &['(', ')', ',', ':', ';', '?', '[', ']', '{', '|', '}'], + duplicate_mitigation_lines: &[], + duplicate_mitigation_regexps: &[], + }, + LanguageLinePattern { + language_name: "HTML / XML", + extensions: &["html", "htm", "xml", "xhtml", "svg"], + duplicate_mitigation_pattern: &['/', '<', '>'], + duplicate_mitigation_lines: &[ + "", + "", + "", + "", + "", + "", + ], + 
duplicate_mitigation_regexps: &[], + }, + LanguageLinePattern { + language_name: "CSS / SCSS / Sass / Less", + extensions: &["css", "scss", "sass", "less"], + duplicate_mitigation_pattern: &['(', ')', ',', ':', ';', '[', ']', '{', '}'], + duplicate_mitigation_lines: &[], + duplicate_mitigation_regexps: &[], + }, + LanguageLinePattern { + language_name: "SQL", + extensions: &["sql"], + duplicate_mitigation_pattern: &['(', ')', ',', ':', ';'], + duplicate_mitigation_lines: &["BEGIN", "END"], + duplicate_mitigation_regexps: &[], + }, + LanguageLinePattern { + language_name: "YAML / JSON / TOML", + extensions: &["yaml", "yml", "json", "toml"], + duplicate_mitigation_pattern: &['(', ')', ',', ':', ';', '<', '>', '?', '[', ']', '{', '}'], + duplicate_mitigation_lines: &["jobs:", "on:"], + duplicate_mitigation_regexps: &[], + }, +]; + +#[must_use] +pub fn supported_file_extensions() -> Vec { + let mut extensions = Vec::new(); + for language in LANGUAGE_PATTERNS { + for &extension in language.extensions { + if !extensions.iter().any(|selected| selected == extension) { + extensions.push(extension.to_string()); + } + } + } + extensions +} + +#[derive(Debug)] +struct DuplicateMitigationLineRegistry { + by_extension: HashMap<&'static str, DuplicateMitigationPatterns>, +} + +#[derive(Debug, Default)] +struct DuplicateMitigationPatterns { + lines_by_hash: HashMap>, + character_pattern: Vec, + regexps: Vec, +} + +static DUPLICATE_MITIGATION_LINE_REGISTRY: OnceLock = + OnceLock::new(); + +#[must_use] +pub fn hash_normalized_line(line: &str) -> u128 { + xxhash_rust::xxh3::xxh3_128(line.as_bytes()) +} + +#[must_use] +pub fn classify_line(extension: &str, normalized_line: &str, hash: u128) -> LineStatus { + let extension = extension.to_ascii_lowercase(); + let Some(patterns) = registry().by_extension.get(extension.as_str()) else { + return LineStatus::Comparison; + }; + if patterns.matches_line(normalized_line, hash) { + LineStatus::BlockOnly + } else { + LineStatus::Comparison + } 
/// Appends each of `characters` to `character_pattern` unless it is already
/// present, keeping the existing order.
fn register_duplicate_mitigation_pattern(
    character_pattern: &mut Vec<char>,
    characters: &'static [char],
) {
    for character in characters.iter().copied() {
        let already_registered = character_pattern.iter().any(|&known| known == character);
        if already_registered {
            continue;
        }
        character_pattern.push(character);
    }
}
/// Reports whether `line` consists solely of whitespace and characters from
/// `character_pattern`.
///
/// An empty pattern never matches. With a non-empty pattern, an empty or
/// whitespace-only line does match.
fn matches_duplicate_mitigation_pattern(line: &str, character_pattern: &[char]) -> bool {
    if character_pattern.is_empty() {
        return false;
    }
    // The line matches unless some character is neither whitespace nor listed.
    let has_foreign_character = line
        .chars()
        .any(|character| !character.is_whitespace() && !character_pattern.contains(&character));
    !has_foreign_character
}
assert_eq!(classify_line("unknown", line, hash), LineStatus::Comparison); + } + + #[test] + fn empty_character_pattern_does_not_match() { + assert!(!matches_duplicate_mitigation_pattern("}", &[])); + } + + #[test] + fn collects_supported_file_extensions_from_language_patterns() { + let extensions = supported_file_extensions(); + for language in LANGUAGE_PATTERNS { + for extension in language.extensions { + assert!(extensions.iter().any(|selected| selected == extension)); + } + } + } +} diff --git a/src/lib.rs b/src/lib.rs new file mode 100644 index 0000000..6656221 --- /dev/null +++ b/src/lib.rs @@ -0,0 +1,245 @@ +pub mod cli; +pub mod discovery; +pub mod duplicate; +pub mod error; +pub mod git; +pub mod language; +pub mod line; +pub mod model; +pub mod paths; +pub mod report; + +use std::io::Write; +use std::path::Path; + +use crate::error::{CodeM8Error, Result}; + +/// Runs the CLI workflow and writes the selected report to the provided writer. +/// +/// # Errors +/// +/// Returns an error when argument parsing, file discovery, file processing, or +/// report writing fails. +pub fn run(args: I, current_dir: &Path, writer: &mut W) -> Result<()> +where + I: IntoIterator, + S: Into, + W: Write, +{ + match cli::parse_command(args)? { + cli::CliCommand::Help => writer + .write_all(cli::help_text().as_bytes()) + .map_err(|error| CodeM8Error::new(format!("could not write help output: {error}")))?, + cli::CliCommand::ReportDuplicate(config) => { + let should_report_scanned_files = config.git_branch || config.files.is_some(); + let git_branch_files = if config.git_branch { + Some(git::changed_files_against_origin(current_dir)?) 
+ } else { + None + }; + let source_files = discovery::discover_source_files( + current_dir, + &config.file_extensions, + git_branch_files.as_deref().or(config.files.as_deref()), + )?; + let processed_files = line::process_source_files(&source_files)?; + let duplicate_blocks = duplicate::detect_duplicate_blocks(&processed_files); + let report = report::DuplicateReport { + analyzed_files: source_files.len(), + analyzed_extensions: config.file_extensions, + scanned_files: should_report_scanned_files.then(|| { + source_files + .iter() + .map(|source_file| source_file.display_path.clone()) + .collect() + }), + duplicate_blocks, + }; + writer + .write_all(report::render_duplicate_report(&report, config.verbose).as_bytes()) + .map_err(|error| { + CodeM8Error::new(format!("could not write report output: {error}")) + })?; + } + } + Ok(()) +} + +#[cfg(test)] +mod tests { + use std::fs; + use std::path::{Path, PathBuf}; + use std::sync::atomic::{AtomicUsize, Ordering}; + + use super::*; + + static TEMP_COUNTER: AtomicUsize = AtomicUsize::new(0); + + struct TempProject { + path: PathBuf, + } + + impl TempProject { + fn new(name: &str) -> Self { + let id = TEMP_COUNTER.fetch_add(1, Ordering::Relaxed); + let path = + std::env::temp_dir().join(format!("codem8-{name}-{}-{id}", std::process::id())); + if path.exists() { + fs::remove_dir_all(&path).expect("remove stale test directory"); + } + fs::create_dir_all(&path).expect("create test directory"); + Self { path } + } + + fn write(&self, relative_path: &str, contents: &str) { + let path = self.path.join(relative_path); + if let Some(parent) = path.parent() { + fs::create_dir_all(parent).expect("create test parent directory"); + } + fs::write(path, contents).expect("write test file"); + } + + fn path(&self) -> &Path { + &self.path + } + } + + impl Drop for TempProject { + fn drop(&mut self) { + let _ = fs::remove_dir_all(&self.path); + } + } + + fn run_in(project: &TempProject, args: &[&str]) -> std::result::Result { + let mut 
output = Vec::new(); + run(args.iter().copied(), project.path(), &mut output)?; + Ok(String::from_utf8(output).expect("report is UTF-8")) + } + + #[test] + fn duplicate_report_snapshot_is_stable() { + let project = TempProject::new("snapshot"); + project.write( + "src/a.ts", + "const value = computeValue(input);\nif (value === undefined) {\nreturn defaultValue;\n}\n", + ); + project.write( + "src/b.ts", + "const value = computeValue(input);\nif (value === undefined) {\nreturn defaultValue;\n}\n", + ); + let output = run_in(&project, &["--report-duplicate"]).expect("report succeeds"); + let expected_extensions = language::supported_file_extensions().join(", "); + assert_eq!( + output, + [ + "Duplicate Code Report\n", + "=====================\n", + "\n", + "Number of files scanned: 2\n", + "Analyzed extensions: ", + &expected_extensions, + "\n", + "Duplicate blocks found: 1\n", + "\n", + "#1\n", + "Code:\n", + " const value = computeValue(input);\n", + " if (value === undefined) {\n", + " return defaultValue;\n", + " }\n", + "\n", + "Locations:\n", + "- src/a.ts:1-4\n", + "- src/b.ts:1-4\n", + ] + .concat() + ); + } + + #[test] + fn verbose_duplicate_report_includes_metrics_without_characters() { + let project = TempProject::new("verbose"); + project.write( + "src/a.ts", + "const value = computeValue(input);\nreturn value;\n", + ); + project.write( + "src/b.ts", + "const value = computeValue(input);\nreturn value;\n", + ); + let output = + run_in(&project, &["--report-duplicate", "-verbose"]).expect("report succeeds"); + assert!(output.contains("Weight:")); + assert!(output.contains("Lines: 2")); + assert!(output.contains("Occurrences: 2")); + assert!(!output.contains("Characters:")); + assert!( + output.find("Code:").expect("code section exists") + < output.find("Locations:").expect("locations section exists") + ); + } + + #[test] + fn explicit_files_disable_recursive_discovery() { + let project = TempProject::new("explicit-files"); + project.write("src/a.ts", 
"const value = one;\n"); + project.write("src/b.ts", "const value = one;\n"); + let output = + run_in(&project, &["--report-duplicate", "-files=src/a.ts"]).expect("report succeeds"); + assert!(output.contains("Number of files scanned: 1")); + assert!(output.contains("Duplicate blocks found: 0")); + } + + #[test] + fn verbose_explicit_files_report_lists_scanned_files() { + let project = TempProject::new("verbose-explicit-files"); + project.write("src/a.ts", "const value = one;\n"); + project.write("src/b.ts", "const value = one;\n"); + let quiet_output = + run_in(&project, &["--report-duplicate", "-files=src/a.ts"]).expect("report succeeds"); + assert!(!quiet_output.contains("Files scanned:")); + let verbose_output = run_in( + &project, + &["--report-duplicate", "-verbose", "-files=src/a.ts"], + ) + .expect("report succeeds"); + assert!(verbose_output.contains( + "Number of files scanned: 1\n\ + Files scanned:\n\ + - src/a.ts\n\ + Analyzed extensions:" + )); + } + + #[test] + fn custom_extensions_change_analyzed_files() { + let project = TempProject::new("custom-extensions"); + project.write("src/a.js", "const value = one;\n"); + project.write("src/b.js", "const value = one;\n"); + let default_output = run_in(&project, &["--report-duplicate"]).expect("report succeeds"); + assert!(default_output.contains("Number of files scanned: 2")); + assert!(default_output.contains("Duplicate blocks found: 1")); + let js_output = run_in(&project, &["--report-duplicate", "-file-extension=js"]) + .expect("report succeeds"); + assert!(js_output.contains("Number of files scanned: 2")); + assert!(js_output.contains("Duplicate blocks found: 1")); + } + + #[test] + fn invalid_explicit_file_returns_a_clear_error() { + let project = TempProject::new("invalid-file"); + let error = run_in(&project, &["--report-duplicate", "-files=missing.ts"]) + .expect_err("missing explicit file fails"); + assert!(error + .to_string() + .contains("explicit file does not exist: missing.ts")); + } + + 
#[test] + fn help_command_prints_documentation() { + let project = TempProject::new("help"); + let output = run_in(&project, &["help"]).expect("help succeeds"); + assert!(output.contains("USAGE:")); + assert!(output.contains("--report-duplicate")); + } +} diff --git a/src/line.rs b/src/line.rs new file mode 100644 index 0000000..92dc0f5 --- /dev/null +++ b/src/line.rs @@ -0,0 +1,118 @@ +use std::fs::File; +use std::io::{BufRead, BufReader}; + +use crate::error::{CodeM8Error, Result}; +use crate::language::{classify_line, hash_normalized_line}; +use crate::model::{LineEntry, ProcessedFile, SourceFile}; + +/// Processes a set of source files into normalized line entries. +/// +/// # Errors +/// +/// Returns an error when any input file cannot be opened or read as UTF-8 text. +pub fn process_source_files(source_files: &[SourceFile]) -> Result<Vec<ProcessedFile>> { + source_files.iter().map(process_source_file).collect() +} + +/// Processes one source file into its normalized, classified lines. +/// +/// # Errors +/// +/// Returns an error when the file cannot be opened or read as UTF-8 text. 
+pub fn process_source_file(source_file: &SourceFile) -> Result<ProcessedFile> { + let file = File::open(&source_file.path) + .map_err(|error| CodeM8Error::io(&source_file.display_path, "open file", &error))?; + let reader = BufReader::new(file); + let mut lines = Vec::new(); + for (index, line) in reader.lines().enumerate() { + let line = line.map_err(|error| { + CodeM8Error::new(format!( + "could not read {} as UTF-8 text: {error}", + crate::paths::format_path(&source_file.display_path) + )) + })?; + let Some(normalized_text) = normalize_line(&line) else { + continue; + }; + let hash = hash_normalized_line(&normalized_text); + let status = classify_line(&source_file.extension, &normalized_text, hash); + lines.push(LineEntry { + file_path: source_file.display_path.clone(), + line_number: index + 1, + normalized_text, + hash, + status, + }); + } + Ok(ProcessedFile { + source: source_file.clone(), + lines, + }) +} + +#[must_use] +pub fn normalize_line(line: &str) -> Option<String> { + let normalized = line.trim(); + if normalized.is_empty() { + None + } else { + Some(normalized.to_string()) + } +} + +#[cfg(test)] +mod tests { + use std::fs; + + use crate::model::LineStatus; + + use super::*; + + #[test] + fn trims_unicode_whitespace_and_skips_empty_lines() { + assert_eq!( + normalize_line("\t value \u{2003}"), + Some("value".to_string()) + ); + assert_eq!(normalize_line(" \t "), None); + } + + #[test] + fn processes_non_empty_lines_with_original_line_numbers() { + let path = std::env::temp_dir().join(format!("codem8-line-test-{}.ts", std::process::id())); + fs::write(&path, " const value = 1; \n\n }\n").expect("write source file"); + let source = SourceFile { + path: path.clone(), + display_path: "sample.ts".into(), + extension: "ts".to_string(), + }; + let processed = process_source_file(&source).expect("process source file"); + assert_eq!(processed.lines.len(), 2); + assert_eq!(processed.lines[0].line_number, 1); + assert_eq!(processed.lines[0].normalized_text, "const value = 1;"); + 
assert_eq!(processed.lines[0].status, LineStatus::Comparison); + assert_eq!(processed.lines[1].line_number, 3); + assert_eq!(processed.lines[1].normalized_text, "}"); + assert_eq!(processed.lines[1].status, LineStatus::BlockOnly); + fs::remove_file(path).expect("cleanup"); + } + + #[test] + fn returns_clear_error_for_invalid_utf8() { + let path = std::env::temp_dir().join(format!( + "codem8-line-invalid-utf8-{}.ts", + std::process::id() + )); + fs::write(&path, [0xff, b'\n']).expect("write invalid source file"); + let source = SourceFile { + path: path.clone(), + display_path: "invalid.ts".into(), + extension: "ts".to_string(), + }; + let error = process_source_file(&source).expect_err("invalid UTF-8 fails"); + assert!(error + .to_string() + .contains("could not read invalid.ts as UTF-8 text")); + fs::remove_file(path).expect("cleanup"); + } +} diff --git a/src/main.rs b/src/main.rs new file mode 100644 index 0000000..a6e1e1d --- /dev/null +++ b/src/main.rs @@ -0,0 +1,28 @@ +use std::io::Write; +use std::process::ExitCode; + +fn main() -> ExitCode { + let current_dir = match std::env::current_dir() { + Ok(current_dir) => current_dir, + Err(error) => { + eprintln!("error: could not determine current directory: {error}"); + return ExitCode::FAILURE; + } + }; + let stdout = std::io::stdout(); + let mut stdout = stdout.lock(); + match codem8::run(std::env::args().skip(1), &current_dir, &mut stdout) { + Ok(()) => { + let _ = stdout.flush(); + ExitCode::SUCCESS + } + Err(error) => { + eprintln!("error: {error}"); + if error.should_show_help() { + eprintln!(); + eprint!("{}", codem8::cli::help_text()); + } + ExitCode::FAILURE + } + } +} diff --git a/src/model.rs b/src/model.rs new file mode 100644 index 0000000..2a1b195 --- /dev/null +++ b/src/model.rs @@ -0,0 +1,58 @@ +use std::path::PathBuf; + +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum LineStatus { + Comparison, + BlockOnly, +} + +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct SourceFile { + pub path: 
PathBuf, + pub display_path: PathBuf, + pub extension: String, +} + +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct LineEntry { + pub file_path: PathBuf, + pub line_number: usize, + pub normalized_text: String, + pub hash: u128, + pub status: LineStatus, +} + +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct ProcessedFile { + pub source: SourceFile, + pub lines: Vec<LineEntry>, +} + +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct DuplicateOccurrence { + pub file_path: PathBuf, + pub start_line: usize, + pub end_line: usize, +} + +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct DuplicateBlock { + pub normalized_lines: Vec<String>, + pub occurrences: Vec<DuplicateOccurrence>, + pub weight: u64, +} + +impl DuplicateBlock { + #[must_use] + pub const fn line_count(&self) -> usize { + self.normalized_lines.len() + } + + #[must_use] + pub fn character_count(&self) -> u64 { + self.normalized_lines + .iter() + .map(|line| line.chars().count() as u64) + .sum() + } +} diff --git a/src/paths.rs b/src/paths.rs new file mode 100644 index 0000000..f55926f --- /dev/null +++ b/src/paths.rs @@ -0,0 +1,50 @@ +use std::path::{Component, Path, PathBuf}; + +#[must_use] +pub fn format_path(path: &Path) -> String { + path.to_string_lossy().replace('\\', "/") +} + +#[must_use] +pub fn normalize_display_path(path: &Path) -> PathBuf { + let mut normalized = PathBuf::new(); + for component in path.components() { + match component { + Component::CurDir => {} + Component::Normal(part) => normalized.push(part), + Component::ParentDir => normalized.push(".."), + Component::RootDir | Component::Prefix(_) => normalized.push(component.as_os_str()), + } + } + if normalized.as_os_str().is_empty() { + PathBuf::from(".") + } else { + normalized + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn formats_paths_with_forward_slashes() { + assert_eq!( + format_path(Path::new("src\\nested\\a.ts")), + "src/nested/a.ts" + ); + } + + #[test] + fn normalizes_display_paths_without_losing_parent_segments() { + assert_eq!( + 
normalize_display_path(Path::new("./src/../a.ts")), + PathBuf::from("src").join("..").join("a.ts") + ); + } + + #[test] + fn normalizes_empty_display_path_to_current_directory() { + assert_eq!(normalize_display_path(Path::new(".")), PathBuf::from(".")); + } +} diff --git a/src/report.rs b/src/report.rs new file mode 100644 index 0000000..eb7c42d --- /dev/null +++ b/src/report.rs @@ -0,0 +1,187 @@ +use std::fmt::Write as _; +use std::path::PathBuf; + +use crate::model::DuplicateBlock; +use crate::paths::format_path; + +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct DuplicateReport { + pub analyzed_files: usize, + pub analyzed_extensions: Vec<String>, + pub scanned_files: Option<Vec<PathBuf>>, + pub duplicate_blocks: Vec<DuplicateBlock>, +} + +#[must_use] +pub fn render_duplicate_report(report: &DuplicateReport, verbose: bool) -> String { + let mut output = String::new(); + output.push_str("Duplicate Code Report\n"); + output.push_str("=====================\n\n"); + let _ = writeln!(output, "Number of files scanned: {}", report.analyzed_files); + let scanned_files = if verbose { + report.scanned_files.as_ref() + } else { + None + }; + if let Some(scanned_files) = scanned_files { + output.push_str("Files scanned:\n"); + for file in scanned_files { + let _ = writeln!(output, "- {}", format_path(file)); + } + } + let _ = writeln!( + output, + "Analyzed extensions: {}", + report.analyzed_extensions.join(", ") + ); + let _ = writeln!( + output, + "Duplicate blocks found: {}", + report.duplicate_blocks.len() + ); + for (index, block) in report.duplicate_blocks.iter().enumerate() { + output.push('\n'); + let _ = writeln!(output, "#{}", index + 1); + if verbose { + let _ = writeln!(output, "Weight: {}", block.weight); + let _ = writeln!(output, "Lines: {}", block.line_count()); + let _ = writeln!(output, "Occurrences: {}", block.occurrences.len()); + output.push('\n'); + } + output.push_str("Code:\n"); + for line in &block.normalized_lines { + output.push_str(" "); + output.push_str(line); + 
output.push('\n'); + } + output.push_str("\nLocations:\n"); + for occurrence in &block.occurrences { + let _ = writeln!( + output, + "- {}:{}-{}", + format_path(&occurrence.file_path), + occurrence.start_line, + occurrence.end_line + ); + } + } + output +} + +#[cfg(test)] +mod tests { + use std::path::PathBuf; + + use crate::model::{DuplicateBlock, DuplicateOccurrence}; + + use super::*; + + #[test] + fn renders_empty_report() { + let report = DuplicateReport { + analyzed_files: 0, + analyzed_extensions: vec!["ts".to_string()], + scanned_files: None, + duplicate_blocks: Vec::new(), + }; + assert_eq!( + render_duplicate_report(&report, false), + "Duplicate Code Report\n\ + =====================\n\ + \n\ + Number of files scanned: 0\n\ + Analyzed extensions: ts\n\ + Duplicate blocks found: 0\n" + ); + } + + #[test] + fn renders_duplicate_block_details() { + let report = DuplicateReport { + analyzed_files: 2, + analyzed_extensions: vec!["ts".to_string(), "js".to_string()], + scanned_files: None, + duplicate_blocks: vec![DuplicateBlock { + normalized_lines: vec!["return value;".to_string()], + occurrences: vec![ + DuplicateOccurrence { + file_path: PathBuf::from("src/a.ts"), + start_line: 1, + end_line: 1, + }, + DuplicateOccurrence { + file_path: PathBuf::from("src/b.js"), + start_line: 5, + end_line: 5, + }, + ], + weight: 13, + }], + }; + let output = render_duplicate_report(&report, false); + assert!(output.contains("#1\n")); + assert!(!output.contains("Weight: 13")); + assert!(!output.contains("Lines: 1")); + assert!(!output.contains("Occurrences: 2")); + assert!(!output.contains("Characters:")); + assert!(output.contains("- src/a.ts:1-1")); + assert!(output.contains(" return value;")); + assert!( + output.find("Code:").expect("code section exists") + < output.find("Locations:").expect("locations section exists") + ); + } + + #[test] + fn renders_duplicate_block_metrics_in_verbose_mode() { + let report = DuplicateReport { + analyzed_files: 2, + 
analyzed_extensions: vec!["ts".to_string()], + scanned_files: None, + duplicate_blocks: vec![DuplicateBlock { + normalized_lines: vec!["return value;".to_string()], + occurrences: vec![ + DuplicateOccurrence { + file_path: PathBuf::from("src/a.ts"), + start_line: 1, + end_line: 1, + }, + DuplicateOccurrence { + file_path: PathBuf::from("src/b.ts"), + start_line: 2, + end_line: 2, + }, + ], + weight: 13, + }], + }; + let output = render_duplicate_report(&report, true); + assert!(output.contains("Weight: 13")); + assert!(output.contains("Lines: 1")); + assert!(output.contains("Occurrences: 2")); + assert!(!output.contains("Characters:")); + } + + #[test] + fn renders_scanned_file_list_only_in_verbose_mode() { + let report = DuplicateReport { + analyzed_files: 2, + analyzed_extensions: vec!["ts".to_string()], + scanned_files: Some(vec![ + PathBuf::from("src/a.ts"), + PathBuf::from("src/nested/b.ts"), + ]), + duplicate_blocks: Vec::new(), + }; + let quiet_output = render_duplicate_report(&report, false); + assert!(!quiet_output.contains("Files scanned:")); + let verbose_output = render_duplicate_report(&report, true); + assert!(verbose_output.contains( + "Number of files scanned: 2\n\ + Files scanned:\n\ + - src/a.ts\n\ + - src/nested/b.ts\n\ + Analyzed extensions: ts" + )); + } +}