From b9c593f1ceae515533d61559c16ee6ac482e2c78 Mon Sep 17 00:00:00 2001 From: VooDisss Date: Wed, 25 Feb 2026 12:03:20 +0200 Subject: [PATCH 1/3] feat: implement rpg_analyze_health tool - Code Health Meter Implements a new MCP tool providing code health metrics based on the Code Health Meter (CHM) framework from the research paper. ## New Files - health.rs: Graph health metrics (instability, centrality, god objects) - duplication.rs: Token-based (Rabin-Karp) and semantic (Jaccard) clone detection ## Key Features - Instability index: I = Ce / (Ca + Ce) - Degree centrality (normalized) - God object detection (high degree + extreme instability) - Rabin-Karp rolling hash for Type-1/Type-2 clone detection - Jaccard similarity on lifted features for Type-3/Type-4 detection - LLM-friendly output via TOON formatter ## Testing - 17 unit tests in duplication module - 6 unit tests in health module ## References - Inspired by: Khalfallah, B. H. (2025). Code Health Meter. ACM Trans. Softw. Eng. Methodol. https://doi.org/10.1145/3737670 --- crates/rpg-mcp/src/params.rs | 16 + crates/rpg-mcp/src/tools.rs | 33 + crates/rpg-nav/Cargo.toml | 1 + crates/rpg-nav/src/duplication.rs | 1071 +++++++++++++++++++++++++++++ crates/rpg-nav/src/health.rs | 587 ++++++++++++++++ crates/rpg-nav/src/lib.rs | 5 +- crates/rpg-nav/src/search.rs | 5 +- crates/rpg-nav/src/toon.rs | 150 ++++ 8 files changed, 1864 insertions(+), 4 deletions(-) create mode 100644 crates/rpg-nav/src/duplication.rs create mode 100644 crates/rpg-nav/src/health.rs diff --git a/crates/rpg-mcp/src/params.rs b/crates/rpg-mcp/src/params.rs index 34af32a..37bd1cc 100644 --- a/crates/rpg-mcp/src/params.rs +++ b/crates/rpg-mcp/src/params.rs @@ -210,3 +210,19 @@ pub(crate) struct SliceBetweenParams { /// Include entity metadata (name, file, features) in output pub(crate) include_metadata: Option, } + +/// Parameters for the `analyze_health` tool. 
+#[derive(Debug, Deserialize, JsonSchema)] +pub(crate) struct AnalyzeHealthParams { + /// Instability threshold above which entities are flagged as highly unstable (default: 0.7). + pub(crate) instability_threshold: Option, + /// Minimum total degree for god object detection (default: 10). + pub(crate) god_object_threshold: Option, + /// Run Rabin-Karp token-based clone detection (reads source files from disk, slower). Default: false. + pub(crate) include_duplication: Option, + /// Run Jaccard feature-based semantic clone detection (in-memory, fast). + /// Requires entities to have been lifted. Default: false. + pub(crate) include_semantic_duplication: Option, + /// Jaccard similarity threshold for semantic clone detection (default: 0.6). + pub(crate) semantic_similarity_threshold: Option, +} diff --git a/crates/rpg-mcp/src/tools.rs b/crates/rpg-mcp/src/tools.rs index d03e344..74a8292 100644 --- a/crates/rpg-mcp/src/tools.rs +++ b/crates/rpg-mcp/src/tools.rs @@ -2883,6 +2883,39 @@ impl RpgServer { Ok(result) } + + #[tool( + description = "Analyze code health metrics including coupling, instability, centrality, and potential god objects. Returns entities with architectural issues and recommendations for refactoring. Set include_duplication=true to detect code clones via Rabin-Karp fingerprinting (reads source files, slower). Set include_semantic_duplication=true to detect conceptual duplicates via Jaccard similarity on lifted features (in-memory, fast; requires entities to be lifted)." 
+ )] + async fn analyze_health( + &self, + Parameters(params): Parameters, + ) -> Result { + self.ensure_graph().await?; + let notice = self.staleness_notice().await; + let guard = self.graph.read().await; + let graph = guard.as_ref().unwrap(); + + let config = rpg_nav::health::HealthConfig { + instability_threshold: params.instability_threshold.unwrap_or(0.7), + god_object_degree_threshold: params.god_object_threshold.unwrap_or(10), + include_duplication: params.include_duplication.unwrap_or(false), + include_semantic_duplication: params.include_semantic_duplication.unwrap_or(false), + semantic_duplication_config: rpg_nav::duplication::SemanticDuplicationConfig { + similarity_threshold: params.semantic_similarity_threshold.unwrap_or(0.6), + ..Default::default() + }, + ..Default::default() + }; + + let report = rpg_nav::health::compute_health_full(graph, &self.project_root, &config); + + Ok(format!( + "{}{}", + notice, + rpg_nav::toon::format_health_report(&report) + )) + } } impl RpgServer { diff --git a/crates/rpg-nav/Cargo.toml b/crates/rpg-nav/Cargo.toml index f672d25..fec44c0 100644 --- a/crates/rpg-nav/Cargo.toml +++ b/crates/rpg-nav/Cargo.toml @@ -19,6 +19,7 @@ strsim.workspace = true toon-format.workspace = true serde.workspace = true serde_json.workspace = true +rayon.workspace = true fastembed = { workspace = true, optional = true } [dev-dependencies] diff --git a/crates/rpg-nav/src/duplication.rs b/crates/rpg-nav/src/duplication.rs new file mode 100644 index 0000000..d7ab017 --- /dev/null +++ b/crates/rpg-nav/src/duplication.rs @@ -0,0 +1,1071 @@ +//! Duplication detection via Rabin-Karp rolling hash fingerprinting. +//! +//! Implements the CHM (Code Health Meter) duplication analysis from paper.md §3.4: +//! - Tokenization: strip whitespace/comments, normalize identifiers +//! - Rolling hash: Rabin-Karp fingerprinting with configurable window size +//! - Clone detection: HashMap collision-based fingerprint matching +//! +//! 
This approach is language-agnostic and detects Type-1 (exact) and Type-2 (renamed) clones. + +use crate::search::jaccard_similarity; +use rpg_core::graph::{EntityKind, RPGraph}; +use std::collections::{HashMap, HashSet}; +use std::path::Path; + +/// Base multiplier for rolling hash (per paper: typically 256) +const HASH_BASE: u64 = 256; + +/// Large prime modulus to prevent overflow (per paper: 10^9 + 7) +const HASH_MOD: u64 = 1_000_000_007; + +/// Default window size in tokens for entity-level fingerprinting. +/// Lowered from 50 (file-level) to 20 to catch function-sized duplicates. +const DEFAULT_WINDOW_SIZE: usize = 20; + +/// Minimum duplicate length in tokens to report (filters noise). +/// Lowered from 30 (file-level) to 15 for entity-level snippets. +const MIN_DUPLICATE_TOKENS: usize = 15; + +/// A detected clone group with high similarity. +#[derive(Debug, Clone, serde::Serialize)] +pub struct CloneGroup { + /// Entity IDs participating in this clone group + pub entities: Vec, + /// Similarity coefficient (0.0 - 1.0) + pub similarity: f64, + /// Estimated duplicated token count + pub duplicated_tokens: usize, + /// File paths involved + pub files: Vec, +} + +/// Configuration for duplication detection. +#[derive(Debug, Clone)] +pub struct DuplicationConfig { + /// Window size in tokens for fingerprinting + pub window_size: usize, + /// Minimum tokens to consider as a duplicate + pub min_tokens: usize, + /// Minimum similarity threshold to report (0.0 - 1.0) + pub similarity_threshold: f64, +} + +impl Default for DuplicationConfig { + fn default() -> Self { + Self { + window_size: DEFAULT_WINDOW_SIZE, + min_tokens: MIN_DUPLICATE_TOKENS, + similarity_threshold: 0.7, + } + } +} + +/// A detected group of conceptual duplicates identified via feature-set Jaccard similarity. 
+#[derive(Debug, Clone, serde::Serialize)] +pub struct SemanticCloneGroup { + /// Entity IDs in this group + pub entities: Vec, + /// Jaccard similarity of feature sets: |A ∩ B| / |A ∪ B| + pub similarity: f64, + /// Shared features that caused the match + pub shared_features: Vec, + /// File paths (parallel to `entities`) + pub files: Vec, +} + +/// Configuration for semantic (feature-based Jaccard) duplication detection. +#[derive(Debug, Clone)] +pub struct SemanticDuplicationConfig { + /// Jaccard threshold above which pairs are flagged as conceptual duplicates (default: 0.6). + pub similarity_threshold: f64, + /// Minimum number of features an entity must have to participate (default: 1). + pub min_features: usize, + /// Skip pairs from the same source file — cross-file duplicates are more actionable (default: true). + pub skip_same_file: bool, + /// Skip features appearing in more than this many entities; too generic to be discriminative (default: 20). + pub max_feature_frequency: usize, + /// Maximum number of groups to return (default: 50). + pub max_results: usize, +} + +impl Default for SemanticDuplicationConfig { + fn default() -> Self { + Self { + similarity_threshold: 0.6, + min_features: 1, + skip_same_file: true, + max_feature_frequency: 20, + max_results: 50, + } + } +} + +/// Detect conceptual duplicates by comparing entity semantic feature sets via Jaccard similarity. +/// +/// Unlike token-based clone detection, this operates entirely on in-memory `entity.semantic_features` +/// (verb-object phrases from LLM lifting) and requires no disk I/O. +/// +/// Uses an inverted index to avoid O(n²) pair generation: only entity pairs sharing at +/// least one feature are considered candidates, reducing work dramatically on large graphs. 
+pub fn detect_semantic_duplicates( + graph: &RPGraph, + config: &SemanticDuplicationConfig, +) -> Vec { + // Step 1: Collect eligible entities (exclude Modules, require min_features) + let eligible: Vec<(&String, &str, &[String])> = graph + .entities + .iter() + .filter(|(_, e)| { + e.kind != EntityKind::Module && e.semantic_features.len() >= config.min_features + }) + .map(|(id, e)| { + let file = e.file.to_str().unwrap_or(""); + (id, file, e.semantic_features.as_slice()) + }) + .collect(); + + if eligible.len() < 2 { + return Vec::new(); + } + + // Step 2: Build inverted index: feature → Vec + // Skip features that appear in too many entities (too generic to be useful) + let mut feature_freq: HashMap<&str, usize> = HashMap::new(); + for (_, _, features) in &eligible { + for f in *features { + *feature_freq.entry(f.as_str()).or_insert(0) += 1; + } + } + + let mut inverted: HashMap<&str, Vec> = HashMap::new(); + for (idx, (_, _, features)) in eligible.iter().enumerate() { + for f in *features { + if feature_freq.get(f.as_str()).copied().unwrap_or(0) <= config.max_feature_frequency { + inverted.entry(f.as_str()).or_default().push(idx); + } + } + } + + // Step 3: Collect candidate pairs that share at least one feature + let mut shared_counts: HashMap<(usize, usize), usize> = HashMap::new(); + for indices in inverted.values() { + if indices.len() < 2 { + continue; + } + for i in 0..indices.len() { + for j in (i + 1)..indices.len() { + let a = indices[i].min(indices[j]); + let b = indices[i].max(indices[j]); + *shared_counts.entry((a, b)).or_insert(0) += 1; + } + } + } + + // Step 4: Compute exact Jaccard for candidates and filter by threshold + let mut groups: Vec = Vec::new(); + for ((a_idx, b_idx), shared_count) in &shared_counts { + let (a_id, a_file, a_features) = eligible[*a_idx]; + let (b_id, b_file, b_features) = eligible[*b_idx]; + + // Early bail: shared / max(|A|, |B|) is an upper bound on Jaccard + let upper_bound = *shared_count as f64 / 
a_features.len().max(b_features.len()) as f64; + if upper_bound < config.similarity_threshold { + continue; + } + + if config.skip_same_file && a_file == b_file { + continue; + } + + let a_set: HashSet<&str> = a_features.iter().map(|s| s.as_str()).collect(); + let b_set: HashSet<&str> = b_features.iter().map(|s| s.as_str()).collect(); + let sim = jaccard_similarity(&a_set, &b_set); + + if sim < config.similarity_threshold { + continue; + } + + let mut shared: Vec = a_set + .intersection(&b_set) + .map(|s| (*s).to_string()) + .collect(); + shared.sort(); + + groups.push(SemanticCloneGroup { + entities: vec![a_id.clone(), b_id.clone()], + similarity: (sim * 1000.0).round() / 1000.0, + shared_features: shared, + files: vec![a_file.to_string(), b_file.to_string()], + }); + } + + // Step 5: Sort by similarity descending, cap results + groups.sort_by(|a, b| { + b.similarity + .partial_cmp(&a.similarity) + .unwrap_or(std::cmp::Ordering::Equal) + }); + groups.truncate(config.max_results); + groups +} + +/// Token type for normalized code representation. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] +enum TokenType { + /// Identifier (variable, function, class name) - normalized + Identifier, + /// Keyword (if, else, fn, let, etc.) + Keyword, + /// Operator (+, -, *, /, =, etc.) + Operator, + /// Literal (number, string - replaced with placeholder) + Literal, + /// Punctuation ({, }, (, ), ;, etc.) + Punctuation, +} + +/// A normalized token for fingerprinting. +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +struct Token { + kind: TokenType, + value: u64, +} + +/// Tokenize source code into normalized tokens. +/// +/// Per paper §3.4: strip whitespace/comments, normalize identifiers, +/// replace literals with placeholders for Type-2 clone detection. 
+fn tokenize(source: &str) -> Vec { + let mut tokens = Vec::new(); + let mut chars = source.chars().peekable(); + + while let Some(&ch) = chars.peek() { + match ch { + // Skip whitespace + ' ' | '\t' | '\n' | '\r' => { + chars.next(); + } + // Single-line comment + '/' if chars.clone().nth(1) == Some('/') => { + chars.next(); + chars.next(); + while let Some(&c) = chars.peek() { + if c == '\n' { + break; + } + chars.next(); + } + } + // Multi-line comment (Rust-style) + '/' if chars.clone().nth(1) == Some('*') => { + chars.next(); + chars.next(); + while let Some(&c) = chars.peek() { + if c == '*' && chars.clone().nth(1) == Some('/') { + chars.next(); + chars.next(); + break; + } + chars.next(); + } + } + // String literal + '"' | '\'' => { + let quote = ch; + chars.next(); + while let Some(&c) = chars.peek() { + chars.next(); + if c == quote { + break; + } + if c == '\\' { + chars.next(); + } + } + tokens.push(Token { + kind: TokenType::Literal, + value: hash_str("LIT"), + }); + } + // Number literal + '0'..='9' => { + while let Some(&c) = chars.peek() { + if c.is_ascii_digit() + || c == '.' + || c == 'x' + || c == 'X' + || c == 'e' + || c == 'E' + { + chars.next(); + } else { + break; + } + } + tokens.push(Token { + kind: TokenType::Literal, + value: hash_str("LIT"), + }); + } + // Identifier or keyword + 'a'..='z' | 'A'..='Z' | '_' => { + let mut ident = String::new(); + while let Some(&c) = chars.peek() { + if c.is_ascii_alphanumeric() || c == '_' { + ident.push(c); + chars.next(); + } else { + break; + } + } + let kind = if is_keyword(&ident) { + TokenType::Keyword + } else { + TokenType::Identifier + }; + // Normalize identifiers: hash by kind, not by name (Type-2 detection) + tokens.push(Token { + kind, + value: if kind == TokenType::Keyword { + hash_str(&ident) + } else { + hash_str("ID") + }, + }); + } + // Operators (multi-char first) + '<' | '>' | '=' | '!' 
| '&' | '|' | '+' | '-' | '*' | '/' | '%' | '^' => { + let mut op = String::new(); + op.push(chars.next().unwrap()); + // Check for two-char operators + if let Some(&c) = chars.peek() { + if matches!(c, '=' | '&' | '|' | '<' | '>' | '+') { + op.push(c); + chars.next(); + } + } + tokens.push(Token { + kind: TokenType::Operator, + value: hash_str(&op), + }); + } + // Punctuation + '{' | '}' | '(' | ')' | '[' | ']' | ';' | ':' | ',' | '.' | '#' | '@' | '~' | '?' => { + tokens.push(Token { + kind: TokenType::Punctuation, + value: hash_str(&ch.to_string()), + }); + chars.next(); + } + // Unknown - skip + _ => { + chars.next(); + } + } + } + + tokens +} + +/// Check if a string is a programming language keyword. +#[allow(clippy::match_same_arms)] +fn is_keyword(s: &str) -> bool { + matches!( + s, + // Rust + "fn" | "let" | "mut" | "const" | "static" | "pub" | "mod" | "use" | "crate" | "self" + | "Self" | "super" | "struct" | "enum" | "impl" | "trait" | "type" | "where" | "async" + | "await" | "move" | "ref" | "match" | "if" | "else" | "loop" | "while" | "for" | "in" + | "return" | "break" | "continue" | "unsafe" | "extern" | "dyn" | "as" + // TypeScript/JavaScript + | "function" | "var" | "class" | "interface" | "extends" | "implements" | "import" + | "export" | "from" | "default" | "new" | "this" | "typeof" | "instanceof" | "void" + | "null" | "undefined" | "true" | "false" | "try" | "catch" | "finally" | "throw" + | "switch" | "case" | "do" | "delete" | "yield" | "constructor" | "readonly" + // Python + | "def" | "lambda" | "pass" | "raise" | "except" | "with" | "assert" | "global" + | "nonlocal" | "print" | "elif" + // Go + | "package" | "go" | "chan" | "select" | "defer" | "fallthrough" | "goto" | "range" + | "map" | "make" | "append" | "copy" + // Java + | "public" | "private" | "protected" | "final" | "abstract" | "synchronized" + | "volatile" | "transient" | "native" | "strictfp" | "throws" + // C/C++ + | "int" | "char" | "float" | "double" | "long" | "short" | 
"unsigned" | "signed" + | "auto" | "register" | "inline" | "restrict" | "sizeof" | "typedef" + ) +} + +/// Hash a string to a u64 value. +fn hash_str(s: &str) -> u64 { + let mut hash: u64 = 0; + for byte in s.bytes() { + hash = (hash.wrapping_mul(HASH_BASE).wrapping_add(u64::from(byte))) % HASH_MOD; + } + hash +} + +/// Compute Rabin-Karp fingerprints for a token stream. +/// +/// Per paper Algorithm 4: slide a window of size w over tokens, +/// computing rolling hash for each window position. +fn compute_fingerprints(tokens: &[Token], window_size: usize) -> Vec { + if tokens.len() < window_size { + return Vec::new(); + } + + let mut fingerprints = Vec::with_capacity(tokens.len() - window_size + 1); + + // Pre-compute base^(window_size - 1) mod MOD for rolling hash. + // Use iterative modular exponentiation to avoid u64 overflow (256^49 >> u64::MAX). + let base_pow: u64 = (0..window_size - 1).fold(1u64, |acc, _| (acc * HASH_BASE) % HASH_MOD); + + // Compute initial window hash + let mut hash: u64 = 0; + for token in tokens.iter().take(window_size) { + hash = (hash.wrapping_mul(HASH_BASE).wrapping_add(token.value)) % HASH_MOD; + } + fingerprints.push(hash); + + // Roll the window + for i in window_size..tokens.len() { + // Remove leftmost token's contribution + let left_val = (tokens[i - window_size].value * base_pow) % HASH_MOD; + hash = (hash + HASH_MOD - left_val) % HASH_MOD; + // Add new token + hash = (hash.wrapping_mul(HASH_BASE).wrapping_add(tokens[i].value)) % HASH_MOD; + fingerprints.push(hash); + } + + fingerprints +} + +/// Entity with its source code and fingerprints. +#[derive(Debug)] +struct EntityFingerprints { + entity_id: String, + file: String, + fps: Vec, + token_count: usize, +} + +/// Detect duplication across entities in the graph. +/// +/// Per paper §3.4: compute fingerprints for each entity, store in HashMap, +/// find collisions indicating potential clones. 
+pub fn detect_duplication( + graph: &RPGraph, + project_root: &Path, + config: &DuplicationConfig, +) -> Vec { + use rayon::prelude::*; + + // Collect entities to analyze (skip Module entities) + let entities: Vec<_> = graph + .entities + .iter() + .filter(|(_, e)| e.kind != EntityKind::Module) + .collect(); + + // Phase 1: Cache file contents (read each file once, shared across entities) + let file_contents: HashMap = { + let unique_files: HashSet = entities + .iter() + .map(|(_, e)| project_root.join(&e.file)) + .collect(); + unique_files + .into_iter() + .filter_map(|p| std::fs::read_to_string(&p).ok().map(|s| (p, s))) + .collect() + }; + + // Phase 2: Per-entity tokenization using line ranges + let entity_fps: Vec = entities + .par_iter() + .filter_map(|(id, entity)| { + let file_path = project_root.join(&entity.file); + let source = file_contents.get(&file_path)?; + + // Extract only the entity's source lines (1-indexed → 0-indexed) + let lines: Vec<&str> = source.lines().collect(); + let start = entity.line_start.saturating_sub(1); + let end = entity.line_end.min(lines.len()); + if start >= end { + return None; + } + let entity_source = lines[start..end].join("\n"); + + let tokens = tokenize(&entity_source); + if tokens.len() < config.min_tokens { + return None; + } + + let fingerprints = compute_fingerprints(&tokens, config.window_size); + if fingerprints.is_empty() { + return None; + } + + Some(EntityFingerprints { + entity_id: (*id).to_string(), + file: entity.file.display().to_string(), + fps: fingerprints, + token_count: tokens.len(), + }) + }) + .collect(); + + // Build fingerprint -> entity mapping (find collisions) + let mut fingerprint_map: HashMap> = HashMap::new(); + for (idx, ef) in entity_fps.iter().enumerate() { + for &fp in &ef.fps { + fingerprint_map.entry(fp).or_default().push(idx); + } + } + + // Find entity pairs with high fingerprint overlap. 
+ // Deduplicate indices per fingerprint: the same entity can produce many + // matching windows for a single fingerprint value, so we must count each + // (entity_a, entity_b) pair at most once per fingerprint to keep + // similarity ≤ 1.0. + let mut pair_scores: HashMap<(usize, usize), usize> = HashMap::new(); + for indices in fingerprint_map.values() { + if indices.len() < 2 { + continue; + } + // Unique entity indices that share this fingerprint + let unique: Vec = { + let mut set: Vec = indices.clone(); + set.sort_unstable(); + set.dedup(); + set + }; + if unique.len() < 2 { + continue; + } + for i in 0..unique.len() { + for j in (i + 1)..unique.len() { + let a = unique[i]; // already sorted + let b = unique[j]; + *pair_scores.entry((a, b)).or_insert(0) += 1; + } + } + } + + // Convert to similarity and filter by threshold + let mut clone_groups: Vec = Vec::new(); + for ((a, b), shared) in pair_scores { + let ef_a = &entity_fps[a]; + let ef_b = &entity_fps[b]; + + // Duplication coefficient: shared fingerprints / min(fp_a, fp_b) + let min_fps = ef_a.fps.len().min(ef_b.fps.len()); + if min_fps == 0 { + continue; + } + + let similarity = shared as f64 / min_fps as f64; + if similarity < config.similarity_threshold { + continue; + } + + // Estimate duplicated tokens + let ratio = (shared as f64 / ef_a.fps.len().max(1) as f64).clamp(0.0, 1.0); + #[allow(clippy::cast_sign_loss)] // ratio is clamped to [0,1]; result is non-negative + let duplicated_tokens = (ratio * ef_a.token_count as f64).round() as usize; + + if duplicated_tokens < config.min_tokens { + continue; + } + + clone_groups.push(CloneGroup { + entities: vec![ef_a.entity_id.clone(), ef_b.entity_id.clone()], + similarity: (similarity * 1000.0).round() / 1000.0, + duplicated_tokens, + files: vec![ef_a.file.clone(), ef_b.file.clone()], + }); + } + + // Sort by similarity descending + clone_groups.sort_by(|a, b| { + b.similarity + .partial_cmp(&a.similarity) + .unwrap_or(std::cmp::Ordering::Equal) + }); + + 
// Limit to top 50 groups to avoid overwhelming output + clone_groups.truncate(50); + + clone_groups +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_tokenize_simple() { + let source = "fn foo() { let x = 1; }"; + let tokens = tokenize(source); + + // Should have tokens for: fn, foo, (, ), {, let, x, =, 1, ;, } + assert!(!tokens.is_empty()); + assert!(tokens.len() >= 8); + } + + #[test] + fn test_tokenize_normalizes_identifiers() { + let source1 = "let foo = 1;"; + let source2 = "let bar = 2;"; + + let tokens1 = tokenize(source1); + let tokens2 = tokenize(source2); + + // Both should have same token sequence (identifiers normalized to "ID") + let values1: Vec<_> = tokens1.iter().map(|t| t.value).collect(); + let values2: Vec<_> = tokens2.iter().map(|t| t.value).collect(); + + // Keywords and structure should match + assert_eq!(tokens1.len(), tokens2.len()); + assert_eq!(values1, values2); + } + + #[test] + fn test_tokenize_strips_comments() { + let source = "fn foo() { /* comment */ let x = 1; }\n// line comment\nlet y = 2;"; + let tokens = tokenize(source); + + // Comments should be stripped + let token_values: Vec<_> = tokens.iter().map(|t| t.value).collect(); + assert!(!token_values.iter().any(|&v| v == hash_str("comment"))); + } + + #[test] + fn test_tokenize_normalizes_literals() { + let source1 = "let x = 42;"; + let source2 = "let x = 999999;"; + + let tokens1 = tokenize(source1); + let tokens2 = tokenize(source2); + + // Literals should both be normalized to same value + let lit1 = tokens1.iter().find(|t| t.kind == TokenType::Literal); + let lit2 = tokens2.iter().find(|t| t.kind == TokenType::Literal); + + assert_eq!(lit1.map(|t| t.value), lit2.map(|t| t.value)); + } + + #[test] + fn test_fingerprints_deterministic() { + let source = "fn foo() { let x = 1; let y = 2; return x + y; }"; + let tokens = tokenize(source); + + let fp1 = compute_fingerprints(&tokens, 10); + let fp2 = compute_fingerprints(&tokens, 10); + + assert_eq!(fp1, fp2); 
+ } + + #[test] + fn test_fingerprints_empty_for_short_input() { + let source = "fn"; + let tokens = tokenize(source); + + let fp = compute_fingerprints(&tokens, 10); + + assert!(fp.is_empty()); + } + + #[test] + fn test_identical_code_high_similarity() { + let source = r" + fn calculate_total(items: &[Item]) -> f64 { + let mut total = 0.0; + for item in items { + total += item.price * item.quantity; + } + total + } + "; + + let tokens = tokenize(source); + let fps = compute_fingerprints(&tokens, DEFAULT_WINDOW_SIZE); + + // Same code should have matching fingerprints + let tokens2 = tokenize(source); + let fps2 = compute_fingerprints(&tokens2, DEFAULT_WINDOW_SIZE); + + assert_eq!(fps, fps2); + } + + #[test] + fn test_type2_clone_detection() { + // Type-2: same structure, renamed identifiers + let source1 = r" + fn process_data(input: &str) -> String { + let result = input.to_uppercase(); + result.trim().to_string() + } + "; + + let source2 = r" + fn handle_text(data: &str) -> String { + let output = data.to_uppercase(); + output.trim().to_string() + } + "; + + let tokens1 = tokenize(source1); + let tokens2 = tokenize(source2); + + // Structure should be identical after normalization + let values1: Vec<_> = tokens1.iter().map(|t| t.value).collect(); + let values2: Vec<_> = tokens2.iter().map(|t| t.value).collect(); + + assert_eq!( + values1, values2, + "Type-2 clones should normalize to same tokens" + ); + } + + #[test] + fn test_is_keyword() { + assert!(is_keyword("fn")); + assert!(is_keyword("let")); + assert!(is_keyword("function")); + assert!(is_keyword("class")); + assert!(is_keyword("def")); + assert!(!is_keyword("my_function")); + assert!(!is_keyword("MyClass")); + assert!(!is_keyword("variable_name")); + } + + // --- Semantic duplication tests --- + + use rpg_core::graph::{Entity, EntityDeps}; + use std::path::PathBuf; + + fn make_entity_with_features(id: &str, file: &str, features: Vec<&str>) -> Entity { + Entity { + id: id.to_string(), + kind: 
EntityKind::Function, + name: id.to_string(), + file: PathBuf::from(file), + line_start: 1, + line_end: 10, + parent_class: None, + semantic_features: features.into_iter().map(|s| s.to_string()).collect(), + feature_source: Some("llm".to_string()), + hierarchy_path: String::new(), + deps: EntityDeps::default(), + signature: None, + } + } + + #[test] + fn test_semantic_duplicates_identical_features() { + let mut graph = RPGraph::new("rust"); + graph.entities.insert( + "src/a.rs:process".to_string(), + make_entity_with_features( + "src/a.rs:process", + "src/a.rs", + vec!["validate input", "handle error"], + ), + ); + graph.entities.insert( + "src/b.rs:handle".to_string(), + make_entity_with_features( + "src/b.rs:handle", + "src/b.rs", + vec!["validate input", "handle error"], + ), + ); + + let config = SemanticDuplicationConfig { + similarity_threshold: 0.6, + skip_same_file: true, + ..Default::default() + }; + let groups = detect_semantic_duplicates(&graph, &config); + + assert_eq!(groups.len(), 1); + assert!((groups[0].similarity - 1.0).abs() < 0.001); + assert_eq!(groups[0].shared_features.len(), 2); + } + + #[test] + fn test_semantic_duplicates_skips_same_file() { + let mut graph = RPGraph::new("rust"); + graph.entities.insert( + "src/a.rs:foo".to_string(), + make_entity_with_features( + "src/a.rs:foo", + "src/a.rs", + vec!["validate input", "return result"], + ), + ); + graph.entities.insert( + "src/a.rs:bar".to_string(), + make_entity_with_features( + "src/a.rs:bar", + "src/a.rs", + vec!["validate input", "return result"], + ), + ); + + // skip_same_file=true should suppress the pair + let config = SemanticDuplicationConfig { + similarity_threshold: 0.5, + skip_same_file: true, + ..Default::default() + }; + assert!(detect_semantic_duplicates(&graph, &config).is_empty()); + + // skip_same_file=false should surface it + let config2 = SemanticDuplicationConfig { + similarity_threshold: 0.5, + skip_same_file: false, + ..Default::default() + }; + 
assert_eq!(detect_semantic_duplicates(&graph, &config2).len(), 1); + } + + #[test] + fn test_semantic_duplicates_skips_unlifted_entities() { + let mut graph = RPGraph::new("rust"); + // Unlifted entity (no features) — must not participate + graph.entities.insert( + "src/a.rs:empty".to_string(), + make_entity_with_features("src/a.rs:empty", "src/a.rs", vec![]), + ); + // Two lifted entities from different files with same feature + graph.entities.insert( + "src/b.rs:lifted_one".to_string(), + make_entity_with_features("src/b.rs:lifted_one", "src/b.rs", vec!["handle request"]), + ); + graph.entities.insert( + "src/c.rs:lifted_two".to_string(), + make_entity_with_features("src/c.rs:lifted_two", "src/c.rs", vec!["handle request"]), + ); + + let config = SemanticDuplicationConfig { + similarity_threshold: 0.9, + min_features: 1, + ..Default::default() + }; + let groups = detect_semantic_duplicates(&graph, &config); + + // Only the two lifted entities should match; the unlifted one must not appear + assert_eq!(groups.len(), 1); + assert!( + !groups[0].entities.contains(&"src/a.rs:empty".to_string()), + "unlifted entity must not appear in semantic clone groups" + ); + } + + // --- Per-entity token-based detection tests --- + + /// Helper: create an Entity with specific line range (for detect_duplication tests). + fn make_entity_at_lines( + id: &str, + file: &str, + line_start: usize, + line_end: usize, + ) -> Entity { + Entity { + id: id.to_string(), + kind: EntityKind::Function, + name: id.to_string(), + file: PathBuf::from(file), + line_start, + line_end, + parent_class: None, + semantic_features: Vec::new(), + feature_source: None, + hierarchy_path: String::new(), + deps: EntityDeps::default(), + signature: None, + } + } + + #[test] + fn test_detect_duplication_identical_functions() { + // Two files each containing the same function at known line ranges. + // detect_duplication should find them as a clone pair. 
+ let dir = tempfile::tempdir().unwrap(); + + let func_code = r#"fn looks_like_custom_hook(name: &str) -> bool { + if !name.starts_with("use") || name.len() <= 3 { + return false; + } + name.chars().nth(3).is_some_and(|c| c.is_ascii_uppercase()) +} +"#; + // File A: function at lines 1-6 + let file_a = dir.path().join("a.rs"); + std::fs::write(&file_a, func_code).unwrap(); + + // File B: preamble + same function at lines 3-8 + let file_b = dir.path().join("b.rs"); + std::fs::write( + &file_b, + format!("// preamble\nuse std::io;\n{}", func_code), + ) + .unwrap(); + + let mut graph = RPGraph::new("rust"); + graph.entities.insert( + "a.rs:looks_like_custom_hook".to_string(), + make_entity_at_lines("a.rs:looks_like_custom_hook", "a.rs", 1, 6), + ); + graph.entities.insert( + "b.rs:looks_like_custom_hook".to_string(), + make_entity_at_lines("b.rs:looks_like_custom_hook", "b.rs", 3, 8), + ); + + let config = DuplicationConfig { + window_size: 10, + min_tokens: 10, + similarity_threshold: 0.5, + }; + let groups = detect_duplication(&graph, dir.path(), &config); + + assert!( + !groups.is_empty(), + "identical functions across files must be detected as clones" + ); + assert!( + groups[0].similarity <= 1.0, + "similarity must not exceed 1.0, got {}", + groups[0].similarity + ); + assert!( + groups[0].similarity > 0.7, + "identical functions should have high similarity, got {}", + groups[0].similarity + ); + } + + #[test] + fn test_detect_duplication_similarity_bounded() { + // Ensure the dedup fix keeps similarity ≤ 1.0 even with many fingerprint collisions. 
+ let dir = tempfile::tempdir().unwrap(); + + // Two entities with the EXACT same source → shared fingerprints == min fingerprints + let source = "fn compute(x: i32, y: i32) -> i32 { let result = x + y; result * result }\n"; + let file_a = dir.path().join("x.rs"); + let file_b = dir.path().join("y.rs"); + std::fs::write(&file_a, source).unwrap(); + std::fs::write(&file_b, source).unwrap(); + + let mut graph = RPGraph::new("rust"); + graph.entities.insert( + "x.rs:compute".to_string(), + make_entity_at_lines("x.rs:compute", "x.rs", 1, 1), + ); + graph.entities.insert( + "y.rs:compute".to_string(), + make_entity_at_lines("y.rs:compute", "y.rs", 1, 1), + ); + + let config = DuplicationConfig { + window_size: 5, + min_tokens: 5, + similarity_threshold: 0.1, + }; + let groups = detect_duplication(&graph, dir.path(), &config); + + for group in &groups { + assert!( + group.similarity <= 1.0, + "similarity must be ≤ 1.0 after dedup fix, got {}", + group.similarity + ); + } + } + + #[test] + fn test_detect_duplication_no_clones_for_different_code() { + // Two completely different functions should NOT be reported as clones. 
+ let dir = tempfile::tempdir().unwrap(); + + let file_a = dir.path().join("add.rs"); + std::fs::write(&file_a, "fn add(a: i32, b: i32) -> i32 { a + b }\n").unwrap(); + + let file_b = dir.path().join("greet.rs"); + std::fs::write( + &file_b, + "fn greet(name: &str) -> String { format!(\"Hello, {}!\", name) }\n", + ) + .unwrap(); + + let mut graph = RPGraph::new("rust"); + graph.entities.insert( + "add.rs:add".to_string(), + make_entity_at_lines("add.rs:add", "add.rs", 1, 1), + ); + graph.entities.insert( + "greet.rs:greet".to_string(), + make_entity_at_lines("greet.rs:greet", "greet.rs", 1, 1), + ); + + let config = DuplicationConfig { + window_size: 5, + min_tokens: 5, + similarity_threshold: 0.7, + }; + let groups = detect_duplication(&graph, dir.path(), &config); + + assert!( + groups.is_empty(), + "completely different functions should not be reported as clones" + ); + } + + #[test] + fn test_detect_duplication_invalid_line_range() { + // Entity with line_start > line_end or beyond file length → no panic, just skipped. + let dir = tempfile::tempdir().unwrap(); + + let file_a = dir.path().join("short.rs"); + std::fs::write(&file_a, "fn tiny() {}\n").unwrap(); // 1 line + + let mut graph = RPGraph::new("rust"); + // line_start beyond file length + graph.entities.insert( + "short.rs:far".to_string(), + make_entity_at_lines("short.rs:far", "short.rs", 100, 200), + ); + // line_start = 0 (edge: saturating_sub converts to 0-indexed start of 0) + graph.entities.insert( + "short.rs:zero".to_string(), + make_entity_at_lines("short.rs:zero", "short.rs", 0, 1), + ); + + let config = DuplicationConfig::default(); + // Must not panic + let groups = detect_duplication(&graph, dir.path(), &config); + // No meaningful pairs expected from degenerate ranges + assert!(groups.is_empty() || groups.iter().all(|g| g.similarity <= 1.0)); + } + + #[test] + fn test_detect_duplication_missing_file() { + // Entity referencing a non-existent file → gracefully skipped, no panic. 
+ let dir = tempfile::tempdir().unwrap(); + + let mut graph = RPGraph::new("rust"); + graph.entities.insert( + "gone.rs:phantom".to_string(), + make_entity_at_lines("gone.rs:phantom", "gone.rs", 1, 10), + ); + + let config = DuplicationConfig::default(); + let groups = detect_duplication(&graph, dir.path(), &config); + assert!( + groups.is_empty(), + "missing files should be skipped, not cause errors" + ); + } +} diff --git a/crates/rpg-nav/src/health.rs b/crates/rpg-nav/src/health.rs new file mode 100644 index 0000000..c48f081 --- /dev/null +++ b/crates/rpg-nav/src/health.rs @@ -0,0 +1,587 @@ +//! Health analysis: coupling, instability, centrality, and god object detection. +//! +//! Implements the CHM (Code Health Meter) metrics from the paper: +//! - In-degree (afferent coupling, Ca) +//! - Out-degree (efferent coupling, Ce) +//! - Instability index I = Ce / (Ca + Ce) +//! - Degree centrality (normalized) +//! - God Object heuristic (high degree + extreme instability) + +use crate::duplication::{ + CloneGroup, DuplicationConfig, SemanticCloneGroup, SemanticDuplicationConfig, + detect_duplication, detect_semantic_duplicates, +}; +use rpg_core::graph::{EdgeKind, EntityKind, RPGraph}; +use serde::Serialize; +use std::collections::HashMap; +use std::path::Path; + +/// Edge kinds that represent dependency relationships (not structural containment). +const DEPENDENCY_EDGE_KINDS: &[EdgeKind] = &[ + EdgeKind::Imports, + EdgeKind::Invokes, + EdgeKind::Inherits, + EdgeKind::Composes, + EdgeKind::Renders, + EdgeKind::ReadsState, + EdgeKind::WritesState, + EdgeKind::Dispatches, + EdgeKind::DataFlow, +]; + +/// A health issue detected for an entity. +#[derive(Debug, Clone, Serialize, PartialEq)] +#[serde(rename_all = "snake_case")] +pub enum HealthIssue { + /// Entity has high total degree and extreme instability (god object). 
+ PotentialGodObject { + total_degree: usize, + instability: f64, + }, + /// Entity has high instability (> threshold), indicating it's dependent on many. + HighlyUnstable { instability: f64, out_degree: usize }, + /// Entity has very low instability (< 0.3), indicating it's depended on by many. + HighlyStable { instability: f64, in_degree: usize }, + /// Entity has high total degree (hub). + HubEntity { total_degree: usize }, +} + +/// Health metrics for a single entity. +#[derive(Debug, Clone, Serialize)] +pub struct EntityHealth { + pub entity_id: String, + pub name: String, + pub file: String, + pub kind: String, + /// Afferent coupling (Ca): number of incoming dependency edges. + pub in_degree: usize, + /// Efferent coupling (Ce): number of outgoing dependency edges. + pub out_degree: usize, + /// Instability index: Ce / (Ca + Ce). Range [0, 1]. + /// I ≈ 1: unstable (depends on many) + /// I ≈ 0: stable (depended on by many) + pub instability: f64, + /// Degree centrality: total_degree / (n - 1), where n = total entities. + pub centrality: f64, + /// Detected health issues for this entity. + #[serde(skip_serializing_if = "Vec::is_empty")] + pub issues: Vec, +} + +/// Aggregate health statistics for the codebase. +#[derive(Debug, Clone, Serialize)] +pub struct HealthSummary { + pub total_entities: usize, + pub analyzed_entities: usize, + pub total_dependency_edges: usize, + pub avg_in_degree: f64, + pub avg_out_degree: f64, + pub avg_instability: f64, + pub avg_centrality: f64, + pub god_object_count: usize, + pub highly_unstable_count: usize, + pub highly_stable_count: usize, + pub hub_count: usize, +} + +/// Complete health analysis report. 
+#[derive(Debug, Clone, Serialize)] +pub struct HealthReport { + pub summary: HealthSummary, + pub entities: Vec, + #[serde(skip_serializing_if = "Option::is_none")] + pub duplicates: Option>, + #[serde(skip_serializing_if = "Option::is_none")] + pub semantic_duplicates: Option>, + #[serde(skip_serializing_if = "Vec::is_empty")] + pub top_unstable: Vec, + #[serde(skip_serializing_if = "Vec::is_empty")] + pub top_god_objects: Vec, +} + +/// Configuration for health analysis. +#[derive(Debug, Clone)] +pub struct HealthConfig { + /// Instability threshold for flagging highly unstable entities. + pub instability_threshold: f64, + /// Minimum total degree to consider as a hub. + pub hub_threshold: usize, + /// Minimum total degree for god object detection. + pub god_object_degree_threshold: usize, + /// Instability extreme threshold for god object (must be > this or < 1-this). + pub god_object_instability_threshold: f64, + /// Maximum entities to include in top lists. + pub top_n: usize, + /// Include token-based duplication detection (reads source files from disk, slower). + pub include_duplication: bool, + /// Duplication detection config. + pub duplication_config: DuplicationConfig, + /// Include semantic duplication detection via Jaccard similarity on lifted features (in-memory, fast). + pub include_semantic_duplication: bool, + /// Semantic duplication detection config. + pub semantic_duplication_config: SemanticDuplicationConfig, +} + +impl Default for HealthConfig { + fn default() -> Self { + Self { + instability_threshold: 0.7, + hub_threshold: 8, + god_object_degree_threshold: 10, + god_object_instability_threshold: 0.7, + top_n: 10, + include_duplication: false, + duplication_config: DuplicationConfig::default(), + include_semantic_duplication: false, + semantic_duplication_config: SemanticDuplicationConfig::default(), + } + } +} + +/// Compute health metrics for all entities in the graph. 
+pub fn compute_health(graph: &RPGraph, config: &HealthConfig) -> HealthReport { + let total_entities = graph.entities.len(); + let n = total_entities; + let normalizer = if n > 1 { (n - 1) as f64 } else { 1.0 }; + + // Count dependency edges (exclude Contains) + let total_dependency_edges = graph + .edges + .iter() + .filter(|e| e.kind != EdgeKind::Contains) + .count(); + + // Compute in-degree and out-degree for each entity + let mut in_degrees: HashMap<&str, usize> = HashMap::with_capacity(total_entities); + let mut out_degrees: HashMap<&str, usize> = HashMap::with_capacity(total_entities); + + for edge in &graph.edges { + if !DEPENDENCY_EDGE_KINDS.contains(&edge.kind) { + continue; + } + *out_degrees.entry(edge.source.as_str()).or_insert(0) += 1; + *in_degrees.entry(edge.target.as_str()).or_insert(0) += 1; + } + + // Build entity health records + let mut entities: Vec = Vec::with_capacity(total_entities); + let mut god_object_count = 0usize; + let mut highly_unstable_count = 0usize; + let mut highly_stable_count = 0usize; + let mut hub_count = 0usize; + + for (id, entity) in &graph.entities { + // Skip Module entities (file-level) for analysis + if entity.kind == EntityKind::Module { + continue; + } + + let in_degree = *in_degrees.get(id.as_str()).unwrap_or(&0); + let out_degree = *out_degrees.get(id.as_str()).unwrap_or(&0); + let total_degree = in_degree + out_degree; + + // Instability: Ce / (Ca + Ce) + // Handle edge case where both are 0 + let instability = if total_degree == 0 { + 0.0 + } else { + out_degree as f64 / total_degree as f64 + }; + + // Degree centrality (normalized) + let centrality = total_degree as f64 / normalizer; + + // Detect issues + let mut issues = Vec::new(); + + // God Object: high degree + extreme instability + if total_degree >= config.god_object_degree_threshold + && (instability > config.god_object_instability_threshold + || instability < (1.0 - config.god_object_instability_threshold)) + { + 
issues.push(HealthIssue::PotentialGodObject { + total_degree, + instability, + }); + god_object_count += 1; + } + + // High instability + if instability > config.instability_threshold && out_degree > 0 { + issues.push(HealthIssue::HighlyUnstable { + instability, + out_degree, + }); + highly_unstable_count += 1; + } + + // High stability (depended on by many) + if instability < (1.0 - config.instability_threshold) && in_degree > 0 { + issues.push(HealthIssue::HighlyStable { + instability, + in_degree, + }); + highly_stable_count += 1; + } + + // Hub entity + if total_degree >= config.hub_threshold { + issues.push(HealthIssue::HubEntity { total_degree }); + hub_count += 1; + } + + entities.push(EntityHealth { + entity_id: id.clone(), + name: entity.name.clone(), + file: entity.file.display().to_string(), + kind: format!("{:?}", entity.kind).to_lowercase(), + in_degree, + out_degree, + instability: clean_float(instability), + centrality: clean_float(centrality), + issues, + }); + } + + // Compute summary statistics + let analyzed = entities.len(); + let total_in: usize = entities.iter().map(|e| e.in_degree).sum(); + let total_out: usize = entities.iter().map(|e| e.out_degree).sum(); + let total_instability: f64 = entities.iter().map(|e| e.instability).sum(); + let total_centrality: f64 = entities.iter().map(|e| e.centrality).sum(); + + let avg_in_degree = if analyzed > 0 { + total_in as f64 / analyzed as f64 + } else { + 0.0 + }; + let avg_out_degree = if analyzed > 0 { + total_out as f64 / analyzed as f64 + } else { + 0.0 + }; + let avg_instability = if analyzed > 0 { + total_instability / analyzed as f64 + } else { + 0.0 + }; + let avg_centrality = if analyzed > 0 { + total_centrality / analyzed as f64 + } else { + 0.0 + }; + + // Sort by instability for top unstable + let mut sorted_by_instability = entities.clone(); + sorted_by_instability.sort_by(|a, b| { + b.instability + .partial_cmp(&a.instability) + .unwrap_or(std::cmp::Ordering::Equal) + }); + let 
top_unstable: Vec = sorted_by_instability + .into_iter() + .filter(|e| e.instability > config.instability_threshold) + .take(config.top_n) + .collect(); + + // Sort by god object score for top god objects + let mut sorted_by_god = entities.clone(); + sorted_by_god.sort_by(|a, b| { + let a_score = a + .issues + .iter() + .filter(|i| matches!(i, HealthIssue::PotentialGodObject { .. })) + .count(); + let b_score = b + .issues + .iter() + .filter(|i| matches!(i, HealthIssue::PotentialGodObject { .. })) + .count(); + b_score.cmp(&a_score).then_with(|| { + let a_degree: usize = a.in_degree + a.out_degree; + let b_degree: usize = b.in_degree + b.out_degree; + b_degree.cmp(&a_degree) + }) + }); + let top_god_objects: Vec = sorted_by_god + .into_iter() + .filter(|e| { + e.issues + .iter() + .any(|i| matches!(i, HealthIssue::PotentialGodObject { .. })) + }) + .take(config.top_n) + .collect(); + + let summary = HealthSummary { + total_entities, + analyzed_entities: analyzed, + total_dependency_edges, + avg_in_degree: clean_float(avg_in_degree), + avg_out_degree: clean_float(avg_out_degree), + avg_instability: clean_float(avg_instability), + avg_centrality: clean_float(avg_centrality), + god_object_count, + highly_unstable_count, + highly_stable_count, + hub_count, + }; + + // Sort entities by entity_id for deterministic output + entities.sort_by(|a, b| a.entity_id.cmp(&b.entity_id)); + + HealthReport { + summary, + entities, + duplicates: None, + semantic_duplicates: None, + top_unstable, + top_god_objects, + } +} + +/// Compute health metrics with optional duplication detection. +/// This is the main entry point for MCP tool. 
+pub fn compute_health_full( + graph: &RPGraph, + project_root: &Path, + config: &HealthConfig, +) -> HealthReport { + let mut report = compute_health(graph, config); + + if config.include_duplication { + report.duplicates = Some(detect_duplication( + graph, + project_root, + &config.duplication_config, + )); + } + + if config.include_semantic_duplication { + report.semantic_duplicates = Some(detect_semantic_duplicates( + graph, + &config.semantic_duplication_config, + )); + } + + report +} + +/// Clean a float: NaN/Infinity → 0, round to 6 decimals. +fn clean_float(v: f64) -> f64 { + if v.is_nan() || v.is_infinite() { + return 0.0; + } + (v * 1_000_000.0).round() / 1_000_000.0 +} + +#[cfg(test)] +mod tests { + use super::*; + use rpg_core::graph::{DependencyEdge, Entity, EntityDeps}; + use std::path::PathBuf; + + fn make_entity(id: &str, name: &str, kind: EntityKind) -> Entity { + Entity { + id: id.to_string(), + kind, + name: name.to_string(), + file: PathBuf::from("src/lib.rs"), + line_start: 1, + line_end: 5, + parent_class: None, + semantic_features: vec![], + feature_source: None, + hierarchy_path: String::new(), + deps: EntityDeps::default(), + signature: None, + } + } + + fn make_test_graph() -> RPGraph { + // A -> B -> C (linear chain via Invokes) + // A -> C (direct edge) + // Total edges: A has out_degree=2, in_degree=0 + // B has out_degree=1, in_degree=1 + // C has out_degree=0, in_degree=2 + let mut graph = RPGraph::new("rust"); + graph.entities.insert( + "a".to_string(), + make_entity("a", "fn_a", EntityKind::Function), + ); + graph.entities.insert( + "b".to_string(), + make_entity("b", "fn_b", EntityKind::Function), + ); + graph.entities.insert( + "c".to_string(), + make_entity("c", "fn_c", EntityKind::Function), + ); + graph.edges = vec![ + DependencyEdge { + source: "a".to_string(), + target: "b".to_string(), + kind: EdgeKind::Invokes, + }, + DependencyEdge { + source: "a".to_string(), + target: "c".to_string(), + kind: EdgeKind::Invokes, + }, + 
DependencyEdge { + source: "b".to_string(), + target: "c".to_string(), + kind: EdgeKind::Invokes, + }, + ]; + graph.refresh_metadata(); + graph + } + + #[test] + fn test_compute_health_linear_chain() { + let graph = make_test_graph(); + let config = HealthConfig::default(); + let report = compute_health(&graph, &config); + + assert_eq!(report.summary.analyzed_entities, 3); + assert_eq!(report.summary.total_dependency_edges, 3); + + // Find entity A + let a = report.entities.iter().find(|e| e.entity_id == "a").unwrap(); + assert_eq!(a.in_degree, 0); + assert_eq!(a.out_degree, 2); + assert!((a.instability - 1.0).abs() < 0.001); // Fully unstable + + // Find entity C + let c = report.entities.iter().find(|e| e.entity_id == "c").unwrap(); + assert_eq!(c.in_degree, 2); + assert_eq!(c.out_degree, 0); + assert!((c.instability - 0.0).abs() < 0.001); // Fully stable + + // Find entity B + let b = report.entities.iter().find(|e| e.entity_id == "b").unwrap(); + assert_eq!(b.in_degree, 1); + assert_eq!(b.out_degree, 1); + assert!((b.instability - 0.5).abs() < 0.001); // Balanced + } + + #[test] + fn test_god_object_detection() { + let mut graph = RPGraph::new("rust"); + + // Create a god object with 12 edges + graph.entities.insert( + "god".to_string(), + make_entity("god", "GodClass", EntityKind::Class), + ); + + // Add many dependencies (8 outgoing + 4 incoming = 12 total) + for i in 0..8 { + let dep_id = format!("dep_{}", i); + graph.entities.insert( + dep_id.clone(), + make_entity(&dep_id, &dep_id, EntityKind::Function), + ); + graph.edges.push(DependencyEdge { + source: "god".to_string(), + target: dep_id, + kind: EdgeKind::Invokes, + }); + } + for i in 0..4 { + let caller_id = format!("caller_{}", i); + graph.entities.insert( + caller_id.clone(), + make_entity(&caller_id, &caller_id, EntityKind::Function), + ); + graph.edges.push(DependencyEdge { + source: caller_id, + target: "god".to_string(), + kind: EdgeKind::Invokes, + }); + } + + graph.refresh_metadata(); + + let 
config = HealthConfig { + god_object_degree_threshold: 10, + god_object_instability_threshold: 0.7, + ..Default::default() + }; + let report = compute_health(&graph, &config); + + // The god entity should be flagged + assert!(report.summary.god_object_count > 0 || report.summary.hub_count > 0); + + let god = report + .entities + .iter() + .find(|e| e.entity_id == "god") + .unwrap(); + assert_eq!(god.in_degree, 4); + assert_eq!(god.out_degree, 8); + assert_eq!(god.in_degree + god.out_degree, 12); + } + + #[test] + fn test_centrality_normalization() { + let graph = make_test_graph(); + let config = HealthConfig::default(); + let report = compute_health(&graph, &config); + + // Centrality should be <= 1.0 for all entities + for entity in &report.entities { + assert!(entity.centrality <= 1.0); + assert!(entity.centrality >= 0.0); + } + } + + #[test] + fn test_empty_graph() { + let graph = RPGraph::new("rust"); + let config = HealthConfig::default(); + let report = compute_health(&graph, &config); + + assert_eq!(report.summary.total_entities, 0); + assert_eq!(report.summary.analyzed_entities, 0); + assert!(report.entities.is_empty()); + } + + #[test] + fn test_skip_module_entities() { + let mut graph = RPGraph::new("rust"); + graph.entities.insert( + "mod1".to_string(), + make_entity("mod1", "module", EntityKind::Module), + ); + graph.entities.insert( + "fn1".to_string(), + make_entity("fn1", "function", EntityKind::Function), + ); + graph.refresh_metadata(); + + let config = HealthConfig::default(); + let report = compute_health(&graph, &config); + + // Only the function should be analyzed + assert_eq!(report.summary.analyzed_entities, 1); + assert_eq!(report.entities.len(), 1); + assert_eq!(report.entities[0].entity_id, "fn1"); + } + + #[test] + fn test_deterministic_output() { + let graph = make_test_graph(); + let config = HealthConfig::default(); + + let report1 = compute_health(&graph, &config); + let report2 = compute_health(&graph, &config); + + // Entities 
should be sorted by ID for deterministic output + assert_eq!(report1.entities.len(), report2.entities.len()); + for (e1, e2) in report1.entities.iter().zip(report2.entities.iter()) { + assert_eq!(e1.entity_id, e2.entity_id); + } + } +} diff --git a/crates/rpg-nav/src/lib.rs b/crates/rpg-nav/src/lib.rs index 6835de7..a883fa1 100644 --- a/crates/rpg-nav/src/lib.rs +++ b/crates/rpg-nav/src/lib.rs @@ -1,16 +1,19 @@ //! Navigation tools for querying the Repository Planning Graph. //! //! Provides SearchNode (intent-based discovery), FetchNode (entity details), -//! ExploreRPG (dependency traversal), and TOON serialization for LLM-optimized output. +//! ExploreRPG (dependency traversal), Health analysis, Duplication detection, +//! and TOON serialization for LLM-optimized output. pub mod context; pub mod dataflow; pub mod diff; +pub mod duplication; #[cfg(feature = "embeddings")] pub mod embeddings; pub mod explore; pub mod export; pub mod fetch; +pub mod health; pub mod impact; pub mod paths; pub mod planner; diff --git a/crates/rpg-nav/src/search.rs b/crates/rpg-nav/src/search.rs index 831c3bc..c785deb 100644 --- a/crates/rpg-nav/src/search.rs +++ b/crates/rpg-nav/src/search.rs @@ -231,9 +231,8 @@ pub fn search_with_params(graph: &RPGraph, params: &SearchParams) -> Vec, b: &HashSet<&str>) -> f64 { +/// is used for semantic duplication detection in duplication.rs. 
+pub(crate) fn jaccard_similarity(a: &HashSet<&str>, b: &HashSet<&str>) -> f64 { if a.is_empty() && b.is_empty() { return 0.0; } diff --git a/crates/rpg-nav/src/toon.rs b/crates/rpg-nav/src/toon.rs index 38d3c6a..2ebe598 100644 --- a/crates/rpg-nav/src/toon.rs +++ b/crates/rpg-nav/src/toon.rs @@ -704,6 +704,156 @@ fn clean_score(v: f64) -> f64 { (v * 1_000_000.0).round() / 1_000_000.0 } +// --------------------------------------------------------------------------- +// Health report output +// --------------------------------------------------------------------------- + +use crate::health::HealthReport; + +/// Format a health report as TOON for LLM consumption. +pub fn format_health_report(report: &HealthReport) -> String { + let mut output = String::new(); + + // Summary section + output.push_str("# Code Health Analysis\n\n"); + output.push_str(&format!( + "entities: {} ({} analyzed)\n", + report.summary.total_entities, report.summary.analyzed_entities + )); + output.push_str(&format!( + "dependency_edges: {}\n", + report.summary.total_dependency_edges + )); + output.push_str(&format!( + "avg_instability: {:.3}\n", + report.summary.avg_instability + )); + output.push_str(&format!( + "avg_centrality: {:.4}\n", + report.summary.avg_centrality + )); + output.push_str(&format!( + "god_objects: {}\n", + report.summary.god_object_count + )); + output.push_str(&format!( + "highly_unstable: {}\n", + report.summary.highly_unstable_count + )); + output.push_str(&format!( + "highly_stable: {}\n", + report.summary.highly_stable_count + )); + output.push_str(&format!("hubs: {}\n", report.summary.hub_count)); + + // Top unstable entities + if !report.top_unstable.is_empty() { + output.push_str("\n## Top Unstable Entities (I > 0.7)\n\n"); + for entity in &report.top_unstable { + output.push_str(&format!( + "- {} ({}) | instability={:.3} | in={} out={}\n", + entity.entity_id, + entity.kind, + entity.instability, + entity.in_degree, + entity.out_degree + )); + } + } + + // Top god 
objects + if !report.top_god_objects.is_empty() { + output.push_str("\n## God Object Candidates\n\n"); + for entity in &report.top_god_objects { + output.push_str(&format!( + "- {} ({}) | degree={} | instability={:.3}\n", + entity.entity_id, + entity.kind, + entity.in_degree + entity.out_degree, + entity.instability + )); + } + } + + // Duplication info if present + if let Some(ref dupes) = report.duplicates { + output.push_str("\n## Duplication Hotspots\n\n"); + if dupes.is_empty() { + output.push_str("No token-based clones detected.\n"); + } else { + for group in dupes.iter().take(10) { + output.push_str(&format!( + "- similarity={:.1}% | tokens={} | entities={}\n", + group.similarity * 100.0, + group.duplicated_tokens, + group.entities.len() + )); + } + } + } + + // Semantic duplication info if present + if let Some(ref sem_dupes) = report.semantic_duplicates { + if !sem_dupes.is_empty() { + output.push_str("\n## Semantic Duplication (Conceptual Clones)\n\n"); + output.push_str( + "Entities sharing similar intent (lifted feature overlap). \ + May indicate accidental duplication or a missing abstraction.\n\n", + ); + for group in sem_dupes.iter().take(10) { + output.push_str(&format!( + "- similarity={:.1}% | shared: [{}]\n", + group.similarity * 100.0, + group.shared_features.join(", ") + )); + for (id, file) in group.entities.iter().zip(group.files.iter()) { + output.push_str(&format!(" {} ({})\n", id, file)); + } + } + } + } + + // Recommendations + output.push_str("\n## Recommendations\n\n"); + if report.summary.god_object_count > 0 { + output.push_str(&format!( + "1. **Refactor god objects**: {} entities have high coupling. Consider extracting responsibilities.\n", + report.summary.god_object_count + )); + } + if report.summary.highly_unstable_count > report.summary.analyzed_entities / 3 { + output.push_str(&format!( + "2. **Reduce instability**: {} entities are highly unstable. 
Consider introducing stable abstractions.\n", + report.summary.highly_unstable_count + )); + } + if report.summary.hub_count > 0 { + output.push_str(&format!( + "3. **Review hub entities**: {} entities act as hubs. Ensure they have focused responsibilities.\n", + report.summary.hub_count + )); + } + if let Some(ref sem_dupes) = report.semantic_duplicates { + if !sem_dupes.is_empty() { + output.push_str(&format!( + "4. **Extract shared abstractions**: {} entity pairs share similar intent. \ + Consider introducing a shared interface or helper.\n", + sem_dupes.len() + )); + } + } + if report.summary.god_object_count == 0 + && report.summary.highly_unstable_count == 0 + && report.summary.hub_count == 0 + { + output.push_str( + "✅ No major architectural issues detected. The codebase shows good modularity.\n", + ); + } + + output +} + // --------------------------------------------------------------------------- // Tests // --------------------------------------------------------------------------- From 06813a08491961132f8fae0ff2907d35723533fb Mon Sep 17 00:00:00 2001 From: VooDisss Date: Wed, 25 Feb 2026 12:16:57 +0200 Subject: [PATCH 2/3] feat: Introduce initial duplication detection module and related documentation, configuration, and audit reports. 
--- crates/rpg-nav/src/duplication.rs | 25 ++++++---------- crates/rpg-nav/src/toon.rs | 48 +++++++++++++++---------------- 2 files changed, 32 insertions(+), 41 deletions(-) diff --git a/crates/rpg-nav/src/duplication.rs b/crates/rpg-nav/src/duplication.rs index d7ab017..d341b90 100644 --- a/crates/rpg-nav/src/duplication.rs +++ b/crates/rpg-nav/src/duplication.rs @@ -338,11 +338,11 @@ fn tokenize(source: &str) -> Vec { let mut op = String::new(); op.push(chars.next().unwrap()); // Check for two-char operators - if let Some(&c) = chars.peek() { - if matches!(c, '=' | '&' | '|' | '<' | '>' | '+') { - op.push(c); - chars.next(); - } + if let Some(&c) = chars.peek() + && matches!(c, '=' | '&' | '|' | '<' | '>' | '+') + { + op.push(c); + chars.next(); } tokens.push(Token { kind: TokenType::Operator, @@ -507,7 +507,7 @@ pub fn detect_duplication( } Some(EntityFingerprints { - entity_id: (*id).to_string(), + entity_id: (*id).clone(), file: entity.file.display().to_string(), fps: fingerprints, token_count: tokens.len(), @@ -867,12 +867,7 @@ mod tests { // --- Per-entity token-based detection tests --- /// Helper: create an Entity with specific line range (for detect_duplication tests). 
- fn make_entity_at_lines( - id: &str, - file: &str, - line_start: usize, - line_end: usize, - ) -> Entity { + fn make_entity_at_lines(id: &str, file: &str, line_start: usize, line_end: usize) -> Entity { Entity { id: id.to_string(), kind: EntityKind::Function, @@ -908,11 +903,7 @@ mod tests { // File B: preamble + same function at lines 3-8 let file_b = dir.path().join("b.rs"); - std::fs::write( - &file_b, - format!("// preamble\nuse std::io;\n{}", func_code), - ) - .unwrap(); + std::fs::write(&file_b, format!("// preamble\nuse std::io;\n{}", func_code)).unwrap(); let mut graph = RPGraph::new("rust"); graph.entities.insert( diff --git a/crates/rpg-nav/src/toon.rs b/crates/rpg-nav/src/toon.rs index 2ebe598..65f6f06 100644 --- a/crates/rpg-nav/src/toon.rs +++ b/crates/rpg-nav/src/toon.rs @@ -793,22 +793,22 @@ pub fn format_health_report(report: &HealthReport) -> String { } // Semantic duplication info if present - if let Some(ref sem_dupes) = report.semantic_duplicates { - if !sem_dupes.is_empty() { - output.push_str("\n## Semantic Duplication (Conceptual Clones)\n\n"); - output.push_str( - "Entities sharing similar intent (lifted feature overlap). \ - May indicate accidental duplication or a missing abstraction.\n\n", - ); - for group in sem_dupes.iter().take(10) { - output.push_str(&format!( - "- similarity={:.1}% | shared: [{}]\n", - group.similarity * 100.0, - group.shared_features.join(", ") - )); - for (id, file) in group.entities.iter().zip(group.files.iter()) { - output.push_str(&format!(" {} ({})\n", id, file)); - } + if let Some(ref sem_dupes) = report.semantic_duplicates + && !sem_dupes.is_empty() + { + output.push_str("\n## Semantic Duplication (Conceptual Clones)\n\n"); + output.push_str( + "Entities sharing similar intent (lifted feature overlap). 
\ + May indicate accidental duplication or a missing abstraction.\n\n", + ); + for group in sem_dupes.iter().take(10) { + output.push_str(&format!( + "- similarity={:.1}% | shared: [{}]\n", + group.similarity * 100.0, + group.shared_features.join(", ") + )); + for (id, file) in group.entities.iter().zip(group.files.iter()) { + output.push_str(&format!(" {} ({})\n", id, file)); } } } @@ -833,14 +833,14 @@ pub fn format_health_report(report: &HealthReport) -> String { report.summary.hub_count )); } - if let Some(ref sem_dupes) = report.semantic_duplicates { - if !sem_dupes.is_empty() { - output.push_str(&format!( - "4. **Extract shared abstractions**: {} entity pairs share similar intent. \ - Consider introducing a shared interface or helper.\n", - sem_dupes.len() - )); - } + if let Some(ref sem_dupes) = report.semantic_duplicates + && !sem_dupes.is_empty() + { + output.push_str(&format!( + "4. **Extract shared abstractions**: {} entity pairs share similar intent. \ + Consider introducing a shared interface or helper.\n", + sem_dupes.len() + )); } if report.summary.god_object_count == 0 && report.summary.highly_unstable_count == 0 From fcfe330ef325ccde18401f9255b42f05c07bc0a0 Mon Sep 17 00:00:00 2001 From: VooDisss Date: Wed, 25 Feb 2026 12:28:29 +0200 Subject: [PATCH 3/3] feat: Implement code health analysis with graph metrics and duplication detection, alongside new audit reports, documentation, and configuration files. 
---
 .../src/prompts/server_instructions.md        | 34 +++++++++++++++++++
 1 file changed, 34 insertions(+)

diff --git a/crates/rpg-mcp/src/prompts/server_instructions.md b/crates/rpg-mcp/src/prompts/server_instructions.md
index c358e45..6292eb9 100644
--- a/crates/rpg-mcp/src/prompts/server_instructions.md
+++ b/crates/rpg-mcp/src/prompts/server_instructions.md
@@ -133,6 +133,39 @@ When using the RPG to understand or navigate a codebase (after lifting is comple
 - Use `context_pack` instead of search→fetch→explore chains (1 call vs 3-5, ~44% fewer tokens)
 - Use `impact_radius` for richer reachability analysis with edge paths (1 call vs multi-step explore)
 
+## HEALTH ANALYSIS
+
+Use `analyze_health` to assess architectural quality of the codebase. It computes
+instability, centrality, coupling metrics, and optionally detects code duplication.
+
+**When to use:** After lifting is complete, to identify refactoring targets, god objects,
+unstable modules, and duplicated code.
+
+**Parameters (all optional):**
+- `instability_threshold` (default 0.7) — flag entities with instability above this
+- `god_object_threshold` (default 10) — minimum degree to flag as god object
+- `include_duplication` (default false) — run Rabin-Karp token-based clone detection (reads source files, slower)
+- `include_semantic_duplication` (default false) — run Jaccard feature-based clone detection (in-memory, fast)
+- `semantic_similarity_threshold` (default 0.6) — Jaccard threshold for semantic clones
+
+**Output sections:**
+- Summary: entity count, edges, avg instability/centrality, god objects, hub count
+- God Object Candidates (degree ≥ threshold)
+- Top Unstable Entities (instability above `instability_threshold`, 0.7 by default)
+- Duplication Hotspots (when `include_duplication=true`) — token-level Type-1/Type-2 clones
+- Semantic Duplication (when `include_semantic_duplication=true`) — conceptual clones via lifted features
+- Recommendations for refactoring
+
+**Examples:**
+```json
+{} // baseline health (no 
duplication) +{"include_duplication": true} // + token-based clones +{"include_semantic_duplication": true} // + conceptual clones +{"include_duplication": true, "include_semantic_duplication": true} // both +{"god_object_threshold": 5, "instability_threshold": 0.5} // stricter thresholds +``` + ## TOOLS - **lifting_status**: Dashboard — coverage, per-area progress, unlifted files, NEXT STEP - **build_rpg**: Index the codebase (run once, instant) @@ -148,6 +181,7 @@ When using the RPG to understand or navigate a codebase (after lifting is comple - **context_pack**: Single-call search+fetch+explore. Searches, fetches source, expands neighbors, trims to token budget - **impact_radius**: BFS reachability with edge paths. Answers "what depends on X?" in one call. Traverses DataFlow edges for data lineage analysis - **plan_change**: Change planning — find relevant entities, dependency-safe modification order, impact radius, and related tests +- **analyze_health**: Architectural health analysis — instability, centrality, god objects, duplication detection (token + semantic) - **rpg_info**: Get codebase overview, statistics, and inter-area connectivity - **update_rpg**: Incrementally update after code changes - **reload_rpg**: Reload graph from disk