diff --git a/crates/codegraph-core/src/build_pipeline.rs b/crates/codegraph-core/src/build_pipeline.rs index ad174c48..53a58f14 100644 --- a/crates/codegraph-core/src/build_pipeline.rs +++ b/crates/codegraph-core/src/build_pipeline.rs @@ -24,6 +24,7 @@ use crate::import_edges::{self, ImportEdgeContext}; use crate::import_resolution; use crate::journal; use crate::parallel; +use crate::ast_db::{self, AstInsertNode, FileAstBatch}; use crate::roles_db; use crate::structure; use crate::types::{FileSymbols, ImportResolutionInput}; @@ -46,6 +47,10 @@ pub struct PipelineTiming { pub edges_ms: f64, pub structure_ms: f64, pub roles_ms: f64, + pub ast_ms: f64, + pub complexity_ms: f64, + pub cfg_ms: f64, + pub dataflow_ms: f64, pub finalize_ms: f64, } @@ -77,6 +82,9 @@ pub struct BuildPipelineResult { /// ran (≤5 changed files, >20 existing files). When false, the JS caller /// must run its own structure phase as a post-processing step. pub structure_handled: bool, + /// Whether the Rust pipeline wrote AST/complexity/CFG/dataflow to the DB. + /// When true, the JS caller can skip `runPostNativeAnalysis` entirely. + pub analysis_complete: bool, } /// Normalize path to forward slashes. @@ -175,6 +183,7 @@ pub fn run_pipeline( is_full_build: false, structure_scope: Some(vec![]), structure_handled: true, + analysis_complete: true, }); } @@ -391,6 +400,62 @@ pub fn run_pipeline( } timing.roles_ms = t0.elapsed().as_secs_f64() * 1000.0; + // ── Stage 8b: Analysis persistence (AST, complexity, CFG, dataflow) ── + // Write analysis data from parsed file_symbols directly to DB tables, + // eliminating the JS runPostNativeAnalysis step and its WASM re-parse. 
+ let include_complexity = opts.complexity.unwrap_or(true); + let include_cfg = opts.cfg.unwrap_or(true); + let do_analysis = include_ast || include_dataflow || include_cfg || include_complexity; + + let mut analysis_ok = true; + if do_analysis { + // Determine which files to analyze (excludes reverse-dep files) + let analysis_file_set: HashSet<&str> = match &analysis_scope { + Some(files) => files.iter().map(|s| s.as_str()).collect(), + None => file_symbols.keys().map(|s| s.as_str()).collect(), + }; + + // Build node ID lookup: (file, name, line) -> node_id + let node_id_map = build_analysis_node_map(conn, &analysis_file_set); + + // AST nodes + if include_ast { + let t0 = Instant::now(); + let ast_batches = build_ast_batches(&file_symbols, &analysis_file_set); + if ast_db::do_insert_ast_nodes(conn, &ast_batches).is_err() { + analysis_ok = false; + } + timing.ast_ms = t0.elapsed().as_secs_f64() * 1000.0; + } + + // Complexity metrics + if include_complexity { + let t0 = Instant::now(); + if !write_complexity(conn, &file_symbols, &analysis_file_set, &node_id_map) { + analysis_ok = false; + } + timing.complexity_ms = t0.elapsed().as_secs_f64() * 1000.0; + } + + // CFG blocks + edges + if include_cfg { + let t0 = Instant::now(); + if !write_cfg(conn, &file_symbols, &analysis_file_set, &node_id_map) { + analysis_ok = false; + } + timing.cfg_ms = t0.elapsed().as_secs_f64() * 1000.0; + } + + // Dataflow edges + if include_dataflow { + let t0 = Instant::now(); + if !write_dataflow(conn, &file_symbols, &analysis_file_set) { + analysis_ok = false; + } + timing.dataflow_ms = t0.elapsed().as_secs_f64() * 1000.0; + } + } + // ── Stage 9: Finalize ────────────────────────────────────────────── let t0 = Instant::now(); let (node_count, edge_count) = finalize_build(conn, root_dir); @@ -406,6 +471,10 @@ pub fn run_pipeline( + timing.edges_ms + timing.structure_ms + timing.roles_ms + + timing.ast_ms + + timing.complexity_ms + + timing.cfg_ms + + timing.dataflow_ms + 
timing.finalize_ms; let overhead = total_start.elapsed().as_secs_f64() * 1000.0 - stage_sum; timing.setup_ms += overhead.max(0.0); @@ -422,6 +491,7 @@ pub fn run_pipeline( is_full_build: change_result.is_full_build, structure_scope: changed_file_list.clone(), structure_handled: use_fast_path, + analysis_complete: do_analysis && analysis_ok, }) } @@ -937,6 +1007,403 @@ fn build_and_insert_call_edges( } } +// ── Analysis persistence helpers ───────────────────────────────────────── + +/// Build a lookup map from (file, name, line) to node_id for analysis writes. +fn build_analysis_node_map( + conn: &Connection, + files: &HashSet<&str>, +) -> HashMap<(String, String, u32), i64> { + let mut map = HashMap::new(); + if files.is_empty() { + return map; + } + + // Use a temp table to batch all file lookups into a single join query, + // avoiding N per-file round-trips through prepared-statement execution. + let _ = conn.execute_batch( + "CREATE TEMP TABLE IF NOT EXISTS _analysis_files (file TEXT NOT NULL)", + ); + let _ = conn.execute("DELETE FROM temp._analysis_files", []); + + if let Ok(mut ins) = conn.prepare("INSERT INTO temp._analysis_files (file) VALUES (?1)") { + for file in files { + let _ = ins.execute(rusqlite::params![file]); + } + } + + let mut stmt = match conn.prepare( + "SELECT n.id, n.file, n.name, n.line FROM nodes n \ + INNER JOIN temp._analysis_files af ON n.file = af.file \ + WHERE n.kind != 'file'", + ) { + Ok(s) => s, + Err(_) => return map, + }; + + if let Ok(rows) = stmt.query_map([], |row| { + Ok(( + row.get::<_, i64>(0)?, + row.get::<_, String>(1)?, + row.get::<_, String>(2)?, + row.get::<_, u32>(3)?, + )) + }) { + for row in rows.flatten() { + let (id, file, name, line) = row; + map.insert((file, name, line), id); + } + } + + let _ = conn.execute("DROP TABLE IF EXISTS temp._analysis_files", []); + map +} + +/// Convert FileSymbols AST nodes to FileAstBatch format for `ast_db::do_insert_ast_nodes`. 
+fn build_ast_batches(
+    file_symbols: &HashMap<String, FileSymbols>,
+    analysis_files: &HashSet<&str>,
+) -> Vec<FileAstBatch> {
+    let mut batches = Vec::new();
+    for (file, symbols) in file_symbols {
+        if !analysis_files.contains(file.as_str()) || symbols.ast_nodes.is_empty() {
+            continue;
+        }
+        batches.push(FileAstBatch {
+            file: file.clone(),
+            nodes: symbols
+                .ast_nodes
+                .iter()
+                .map(|n| AstInsertNode {
+                    line: n.line,
+                    kind: n.kind.clone(),
+                    name: n.name.clone(),
+                    text: n.text.clone(),
+                    receiver: n.receiver.clone(),
+                })
+                .collect(),
+        });
+    }
+    batches
+}
+
+/// Write complexity metrics from parsed definitions to the `function_complexity` table.
+fn write_complexity(
+    conn: &Connection,
+    file_symbols: &HashMap<String, FileSymbols>,
+    analysis_files: &HashSet<&str>,
+    node_id_map: &HashMap<(String, String, u32), i64>,
+) -> bool {
+    let tx = match conn.unchecked_transaction() {
+        Ok(tx) => tx,
+        Err(_) => return false,
+    };
+
+    let mut stmt = match tx.prepare(
+        "INSERT OR REPLACE INTO function_complexity \
+         (node_id, cognitive, cyclomatic, max_nesting, \
+          loc, sloc, comment_lines, \
+          halstead_n1, halstead_n2, halstead_big_n1, halstead_big_n2, \
+          halstead_vocabulary, halstead_length, halstead_volume, \
+          halstead_difficulty, halstead_effort, halstead_bugs, \
+          maintainability_index) \
+         VALUES (?1,?2,?3,?4,?5,?6,?7,?8,?9,?10,?11,?12,?13,?14,?15,?16,?17,?18)",
+    ) {
+        Ok(s) => s,
+        Err(_) => return false,
+    };
+
+    fn insert_def_complexity(
+        stmt: &mut rusqlite::Statement,
+        file: &str,
+        def: &crate::types::Definition,
+        node_id_map: &HashMap<(String, String, u32), i64>,
+    ) {
+        if let Some(ref cm) = def.complexity {
+            let key = (file.to_string(), def.name.clone(), def.line);
+            if let Some(&node_id) = node_id_map.get(&key) {
+                let h = cm.halstead.as_ref();
+                let loc = cm.loc.as_ref();
+                let _ = stmt.execute(rusqlite::params![
+                    node_id,
+                    cm.cognitive,
+                    cm.cyclomatic,
+                    cm.max_nesting,
+                    loc.map(|l| l.loc).unwrap_or(0),
+                    loc.map(|l| l.sloc).unwrap_or(0),
+                    loc.map(|l| 
l.comment_lines).unwrap_or(0),
+                    h.map(|h| h.n1).unwrap_or(0),
+                    h.map(|h| h.n2).unwrap_or(0),
+                    h.map(|h| h.big_n1).unwrap_or(0),
+                    h.map(|h| h.big_n2).unwrap_or(0),
+                    h.map(|h| h.vocabulary).unwrap_or(0),
+                    h.map(|h| h.length).unwrap_or(0),
+                    h.map(|h| h.volume).unwrap_or(0.0),
+                    h.map(|h| h.difficulty).unwrap_or(0.0),
+                    h.map(|h| h.effort).unwrap_or(0.0),
+                    h.map(|h| h.bugs).unwrap_or(0.0),
+                    cm.maintainability_index.unwrap_or(0.0),
+                ]);
+            }
+        }
+    }
+
+    for (file, symbols) in file_symbols {
+        if !analysis_files.contains(file.as_str()) {
+            continue;
+        }
+        for def in &symbols.definitions {
+            insert_def_complexity(&mut stmt, file, def, node_id_map);
+            if let Some(ref children) = def.children {
+                for child in children {
+                    insert_def_complexity(&mut stmt, file, child, node_id_map);
+                }
+            }
+        }
+    }
+
+    drop(stmt); // release borrow on tx before commit
+    tx.commit().is_ok()
+}
+
+/// Write CFG blocks and edges from parsed definitions to DB tables.
+fn write_cfg(
+    conn: &Connection,
+    file_symbols: &HashMap<String, FileSymbols>,
+    analysis_files: &HashSet<&str>,
+    node_id_map: &HashMap<(String, String, u32), i64>,
+) -> bool {
+    let tx = match conn.unchecked_transaction() {
+        Ok(tx) => tx,
+        Err(_) => return false,
+    };
+
+    let mut block_stmt = match tx.prepare(
+        "INSERT INTO cfg_blocks \
+         (function_node_id, block_index, block_type, start_line, end_line, label) \
+         VALUES (?1, ?2, ?3, ?4, ?5, ?6)",
+    ) {
+        Ok(s) => s,
+        Err(_) => return false,
+    };
+
+    let mut edge_stmt = match tx.prepare(
+        "INSERT INTO cfg_edges \
+         (function_node_id, source_block_id, target_block_id, kind) \
+         VALUES (?1, ?2, ?3, ?4)",
+    ) {
+        Ok(s) => s,
+        Err(_) => return false,
+    };
+
+    for (file, symbols) in file_symbols {
+        if !analysis_files.contains(file.as_str()) {
+            continue;
+        }
+        for def in &symbols.definitions {
+            write_def_cfg(
+                &tx, &mut block_stmt, &mut edge_stmt,
+                file, def, node_id_map,
+            );
+            if let Some(ref children) = def.children {
+                for child in children {
+                    write_def_cfg(
+                        &tx, &mut 
block_stmt, &mut edge_stmt, + file, child, node_id_map, + ); + } + } + } + } + + drop(block_stmt); + drop(edge_stmt); + tx.commit().is_ok() +} + +/// Write CFG data for a single definition. +fn write_def_cfg( + tx: &rusqlite::Transaction, + block_stmt: &mut rusqlite::Statement, + edge_stmt: &mut rusqlite::Statement, + file: &str, + def: &crate::types::Definition, + node_id_map: &HashMap<(String, String, u32), i64>, +) { + let cfg = match &def.cfg { + Some(c) if !c.blocks.is_empty() => c, + _ => return, + }; + let key = (file.to_string(), def.name.clone(), def.line); + let node_id = match node_id_map.get(&key) { + Some(&id) => id, + None => return, + }; + + // Insert blocks and track DB IDs for edge resolution + let mut block_db_ids: HashMap = HashMap::new(); + for block in &cfg.blocks { + if block_stmt + .execute(rusqlite::params![ + node_id, + block.index, + &block.block_type, + block.start_line, + block.end_line, + &block.label, + ]) + .is_ok() + { + block_db_ids.insert(block.index, tx.last_insert_rowid()); + } + } + + // Insert edges using resolved block DB IDs + for edge in &cfg.edges { + if let (Some(&src), Some(&tgt)) = ( + block_db_ids.get(&edge.source_index), + block_db_ids.get(&edge.target_index), + ) { + let _ = edge_stmt.execute(rusqlite::params![node_id, src, tgt, &edge.kind]); + } + } +} + +/// Write dataflow edges from parsed FileSymbols to the `dataflow` table. +/// Resolves function names to node IDs using the DB, mirroring the JS +/// `makeNodeResolver` logic (prefer same-file match, fall back to global). 
+fn write_dataflow(
+    conn: &Connection,
+    file_symbols: &HashMap<String, FileSymbols>,
+    analysis_files: &HashSet<&str>,
+) -> bool {
+    let tx = match conn.unchecked_transaction() {
+        Ok(tx) => tx,
+        Err(_) => return false,
+    };
+
+    let mut insert_stmt = match tx.prepare(
+        "INSERT INTO dataflow \
+         (source_id, target_id, kind, param_index, expression, line, confidence) \
+         VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7)",
+    ) {
+        Ok(s) => s,
+        Err(_) => return false,
+    };
+
+    let mut local_stmt = match tx.prepare(
+        "SELECT id FROM nodes WHERE name = ?1 AND file = ?2 \
+         AND kind IN ('function','method') LIMIT 1",
+    ) {
+        Ok(s) => s,
+        Err(_) => return false,
+    };
+
+    let mut global_stmt = match tx.prepare(
+        "SELECT id FROM nodes WHERE name = ?1 \
+         AND kind IN ('function','method') \
+         ORDER BY file, line LIMIT 1",
+    ) {
+        Ok(s) => s,
+        Err(_) => return false,
+    };
+
+    for (file, symbols) in file_symbols {
+        if !analysis_files.contains(file.as_str()) {
+            continue;
+        }
+        let data = match &symbols.dataflow {
+            Some(d) => d,
+            None => continue,
+        };
+
+        // argFlows → flows_to edges
+        for flow in &data.arg_flows {
+            let caller = match &flow.caller_func {
+                Some(name) => name.as_str(),
+                None => continue,
+            };
+            let src = resolve_dataflow_node(&mut local_stmt, &mut global_stmt, caller, file);
+            let tgt = resolve_dataflow_node(&mut local_stmt, &mut global_stmt, &flow.callee_name, file);
+            if let (Some(src), Some(tgt)) = (src, tgt) {
+                let _ = insert_stmt.execute(rusqlite::params![
+                    src,
+                    tgt,
+                    "flows_to",
+                    flow.arg_index,
+                    &flow.expression,
+                    flow.line,
+                    flow.confidence,
+                ]);
+            }
+        }
+
+        // assignments → returns edges
+        for assignment in &data.assignments {
+            let consumer = match &assignment.caller_func {
+                Some(name) => name.as_str(),
+                None => continue,
+            };
+            let producer = resolve_dataflow_node(&mut local_stmt, &mut global_stmt, &assignment.source_call_name, file);
+            let consumer_id = resolve_dataflow_node(&mut local_stmt, &mut global_stmt, consumer, file);
+            if let (Some(producer), 
Some(consumer_id)) = (producer, consumer_id) {
+                let _ = insert_stmt.execute(rusqlite::params![
+                    producer,
+                    consumer_id,
+                    "returns",
+                    Option::<i64>::None,
+                    &assignment.expression,
+                    assignment.line,
+                    1.0_f64,
+                ]);
+            }
+        }
+
+        // mutations → mutates edges (only for param bindings)
+        for mutation in &data.mutations {
+            if mutation.binding_type.as_deref() != Some("param") {
+                continue;
+            }
+            let func = match &mutation.func_name {
+                Some(name) => name.as_str(),
+                None => continue,
+            };
+            if let Some(node_id) = resolve_dataflow_node(&mut local_stmt, &mut global_stmt, func, file) {
+                let _ = insert_stmt.execute(rusqlite::params![
+                    node_id,
+                    node_id,
+                    "mutates",
+                    Option::<i64>::None,
+                    &mutation.mutating_expr,
+                    mutation.line,
+                    1.0_f64,
+                ]);
+            }
+        }
+    }
+
+    drop(insert_stmt);
+    drop(local_stmt);
+    drop(global_stmt);
+    tx.commit().is_ok()
+}
+
+/// Resolve a function name to a node ID, trying same-file first then global.
+/// Mirrors the JS `makeNodeResolver` logic from `features/dataflow.ts`.
+fn resolve_dataflow_node(
+    local_stmt: &mut rusqlite::Statement,
+    global_stmt: &mut rusqlite::Statement,
+    name: &str,
+    file: &str,
+) -> Option<i64> {
+    if let Ok(id) = local_stmt.query_row(rusqlite::params![name, file], |r| r.get::<_, i64>(0)) {
+        return Some(id);
+    }
+    global_stmt
+        .query_row(rusqlite::params![name], |r| r.get::<_, i64>(0))
+        .ok()
+}
+
 /// Current time in milliseconds since epoch.
 fn now_ms() -> f64 {
     std::time::SystemTime::now()
diff --git a/crates/codegraph-core/src/config.rs b/crates/codegraph-core/src/config.rs
index eba0df07..af2805bc 100644
--- a/crates/codegraph-core/src/config.rs
+++ b/crates/codegraph-core/src/config.rs
@@ -63,6 +63,10 @@ pub struct BuildOpts {
     #[serde(default)]
     pub ast: Option<bool>,
 
+    /// Whether to include complexity metrics.
+    #[serde(default)]
+    pub complexity: Option<bool>,
+
     /// Whether to include CFG analysis.
    #[serde(default)]
    pub cfg: Option<bool>,
diff --git a/crates/codegraph-core/src/lib.rs b/crates/codegraph-core/src/lib.rs
index 5fbe317d..52f9ae41 100644
--- a/crates/codegraph-core/src/lib.rs
+++ b/crates/codegraph-core/src/lib.rs
@@ -67,6 +67,20 @@ pub fn parse_files(
     )
 }
 
+/// Parse multiple files in parallel with ALL analysis data extracted in a single pass.
+/// Always includes: symbols, AST nodes, complexity, CFG, and dataflow.
+/// Eliminates the need for any downstream re-parse (WASM or native standalone).
+#[napi]
+pub fn parse_files_full(
+    file_paths: Vec<String>,
+    root_dir: String,
+) -> Vec<FileSymbols> {
+    parallel::parse_files_parallel_full(
+        &file_paths,
+        &root_dir,
+    )
+}
+
 /// Resolve a single import path.
 #[napi]
 pub fn resolve_import(
diff --git a/crates/codegraph-core/src/parallel.rs b/crates/codegraph-core/src/parallel.rs
index 3a8bcba7..65c472b1 100644
--- a/crates/codegraph-core/src/parallel.rs
+++ b/crates/codegraph-core/src/parallel.rs
@@ -10,6 +10,8 @@ use crate::types::FileSymbols;
 /// Parse multiple files in parallel using rayon.
 /// Each thread creates its own Parser (cheap; Language objects are Send+Sync).
 /// Failed files are silently skipped (matches WASM behavior).
+/// All analysis data (symbols, AST nodes, complexity, CFG, dataflow) is always
+/// extracted in a single parse pass — no separate re-parse needed downstream.
 /// When `include_dataflow` is false, dataflow extraction is skipped for performance.
 /// When `include_ast_nodes` is false, AST node walking is skipped for performance.
 pub fn parse_files_parallel(
@@ -40,6 +42,35 @@
         .collect()
 }
 
+/// Parse multiple files in parallel, always extracting ALL analysis data:
+/// symbols, AST nodes, complexity, CFG, and dataflow in a single parse pass.
+/// This eliminates the need for any downstream re-parse (WASM or native standalone).
+pub fn parse_files_parallel_full( + file_paths: &[String], + _root_dir: &str, +) -> Vec { + file_paths + .par_iter() + .filter_map(|file_path| { + let lang = LanguageKind::from_extension(file_path)?; + let source = fs::read(file_path).ok()?; + let line_count = source.iter().filter(|&&b| b == b'\n').count() as u32 + 1; + + let mut parser = Parser::new(); + parser.set_language(&lang.tree_sitter_language()).ok()?; + + let tree = parser.parse(&source, None)?; + // Always include AST nodes + let mut symbols = + extract_symbols_with_opts(lang, &tree, &source, file_path, true); + // Always extract dataflow + symbols.dataflow = extract_dataflow(&tree, &source, lang.lang_id_str()); + symbols.line_count = Some(line_count); + Some(symbols) + }) + .collect() +} + /// Parse a single file and return its symbols. /// When `include_dataflow` is false, dataflow extraction is skipped for performance. /// When `include_ast_nodes` is false, AST node walking is skipped for performance. diff --git a/package-lock.json b/package-lock.json index e42cc31e..b53ccb7d 100644 --- a/package-lock.json +++ b/package-lock.json @@ -1314,9 +1314,6 @@ "cpu": [ "arm64" ], - "libc": [ - "glibc" - ], "license": "Apache-2.0", "optional": true, "os": [ @@ -1330,9 +1327,6 @@ "cpu": [ "x64" ], - "libc": [ - "glibc" - ], "license": "Apache-2.0", "optional": true, "os": [ @@ -1346,9 +1340,6 @@ "cpu": [ "x64" ], - "libc": [ - "musl" - ], "license": "Apache-2.0", "optional": true, "os": [ @@ -7292,6 +7283,7 @@ "resolved": "git+ssh://git@github.com/gleam-lang/tree-sitter-gleam.git#0153f8b875cd02034b553f3a84a2f5ee67a80364", "integrity": "sha512-BEC6Ti8xkVezSjitXVg6y+Hzin9VaoG+lcVGy73QCGB66wZBb2UCWbxvrdaBpioNyruYvDyxSPk/NECzT7QWKw==", "dev": true, + "hasInstallScript": true, "license": "Apache-2.0", "dependencies": { "nan": "^2.18.0" diff --git a/scripts/benchmark.ts b/scripts/benchmark.ts index 5b2de8a8..554e89fa 100644 --- a/scripts/benchmark.ts +++ b/scripts/benchmark.ts @@ -37,40 +37,29 @@ if (!isWorker()) { 
process.exit(1); } + function formatEngineResult(data) { + if (!data) return null; + return { + buildTimeMs: data.buildTimeMs, + queryTimeMs: data.queryTimeMs, + nodes: data.nodes, + edges: data.edges, + dbSizeBytes: data.dbSizeBytes, + perFile: data.perFile, + noopRebuildMs: data.noopRebuildMs, + oneFileRebuildMs: data.oneFileRebuildMs, + oneFilePhases: data.oneFilePhases, + queries: data.queries, + phases: data.phases, + }; + } + const result = { version, date: new Date().toISOString().slice(0, 10), files: primary.files, - wasm: wasm - ? { - buildTimeMs: wasm.buildTimeMs, - queryTimeMs: wasm.queryTimeMs, - nodes: wasm.nodes, - edges: wasm.edges, - dbSizeBytes: wasm.dbSizeBytes, - perFile: wasm.perFile, - noopRebuildMs: wasm.noopRebuildMs, - oneFileRebuildMs: wasm.oneFileRebuildMs, - oneFilePhases: wasm.oneFilePhases, - queries: wasm.queries, - phases: wasm.phases, - } - : null, - native: native - ? { - buildTimeMs: native.buildTimeMs, - queryTimeMs: native.queryTimeMs, - nodes: native.nodes, - edges: native.edges, - dbSizeBytes: native.dbSizeBytes, - perFile: native.perFile, - noopRebuildMs: native.noopRebuildMs, - oneFileRebuildMs: native.oneFileRebuildMs, - oneFilePhases: native.oneFilePhases, - queries: native.queries, - phases: native.phases, - } - : null, + wasm: formatEngineResult(wasm), + native: formatEngineResult(native), }; console.log(JSON.stringify(result, null, 2)); diff --git a/src/ast-analysis/engine.ts b/src/ast-analysis/engine.ts index 5c5c69a3..f92af4a9 100644 --- a/src/ast-analysis/engine.ts +++ b/src/ast-analysis/engine.ts @@ -666,6 +666,79 @@ async function delegateToBuildFunctions( } } +// ─── Native full-analysis fast path ──────────────────────────────────── + +/** + * Check whether all files already have complete analysis data from the native + * parse pass (parseFilesFull). When true, no WASM re-parse or JS visitor walk + * is needed — the engine can skip directly to DB persistence. 
+ */ +function allNativeDataComplete( + fileSymbols: Map, + opts: AnalysisOpts, +): boolean { + const doAst = opts.ast !== false; + const doComplexity = opts.complexity !== false; + const doCfg = opts.cfg !== false; + const doDataflow = opts.dataflow !== false; + + for (const [relPath, symbols] of fileSymbols) { + // If any file has a WASM tree, it was parsed by WASM — not native full + if (symbols._tree) return false; + + const ext = path.extname(relPath).toLowerCase(); + const langId = symbols._langId || ''; + + // AST nodes: native must have produced them + if ( + doAst && + !Array.isArray(symbols.astNodes) && + (WALK_EXTENSIONS.has(ext) || AST_TYPE_MAPS.has(langId)) + ) { + debug(`allNativeDataComplete: ${relPath} missing astNodes`); + return false; + } + + // Dataflow: native must have produced it + if ( + doDataflow && + !symbols.dataflow && + (DATAFLOW_EXTENSIONS.has(ext) || DATAFLOW_RULES.has(langId)) + ) { + debug(`allNativeDataComplete: ${relPath} missing dataflow`); + return false; + } + + const defs = symbols.definitions || []; + for (const def of defs) { + if (!hasFuncBody(def)) continue; + + // Complexity: every function must already have it + if ( + doComplexity && + !def.complexity && + (COMPLEXITY_EXTENSIONS.has(ext) || COMPLEXITY_RULES.has(langId)) + ) { + debug(`allNativeDataComplete: ${relPath}:${def.name} missing complexity`); + return false; + } + + // CFG: every function must already have blocks + if ( + doCfg && + def.cfg !== null && + !Array.isArray(def.cfg?.blocks) && + (CFG_EXTENSIONS.has(ext) || CFG_RULES.has(langId)) + ) { + debug(`allNativeDataComplete: ${relPath}:${def.name} missing cfg blocks`); + return false; + } + } + } + + return fileSymbols.size > 0; +} + // ─── Public API ────────────────────────────────────────────────────────── export async function runAnalyses( @@ -686,6 +759,16 @@ export async function runAnalyses( const extToLang = buildExtToLangMap(); + // Fast path: when all files were parsed by the native engine with 
full analysis + // (parseFilesFull), all data is already present — skip WASM re-parse and JS + // visitor walks entirely, go straight to DB persistence. + if (allNativeDataComplete(fileSymbols, opts)) { + debug('native full-analysis fast path: all data present, skipping WASM/visitor passes'); + if (doComplexity && doCfg) reconcileCfgCyclomatic(fileSymbols); + await delegateToBuildFunctions(db, fileSymbols, rootDir, opts, engineOpts, timing); + return timing; + } + // Native analysis pass: try Rust standalone functions before WASM fallback. // This fills in complexity/CFG/dataflow for files that the native parse pipeline // missed, avoiding the need to parse with WASM + run JS visitors. diff --git a/src/domain/graph/builder/pipeline.ts b/src/domain/graph/builder/pipeline.ts index fe3d9970..668576fb 100644 --- a/src/domain/graph/builder/pipeline.ts +++ b/src/domain/graph/builder/pipeline.ts @@ -23,7 +23,13 @@ import { loadNative } from '../../../infrastructure/native.js'; import { semverCompare } from '../../../infrastructure/update-check.js'; import { toErrorMessage } from '../../../shared/errors.js'; import { CODEGRAPH_VERSION } from '../../../shared/version.js'; -import type { BuildGraphOpts, BuildResult, Definition, ExtractorOutput } from '../../../types.js'; +import type { + BetterSqlite3Database, + BuildGraphOpts, + BuildResult, + Definition, + ExtractorOutput, +} from '../../../types.js'; import { getActiveEngine } from '../../parser.js'; import { setWorkspaces } from '../resolve.js'; import { PipelineContext } from './context.js'; @@ -120,15 +126,11 @@ function setupPipeline(ctx: PipelineContext): void { const native = enginePref !== 'wasm' ? loadNative() : null; ctx.nativeAvailable = !!native?.NativeDatabase; - // Native-first: use only rusqlite for the entire pipeline (no better-sqlite3). - // This eliminates the dual-connection WAL corruption problem and enables all - // native fast-paths (bulkInsertNodes, classifyRolesFull, etc.). 
- // Fallback: if native is unavailable or FORCE_JS is set, use better-sqlite3. - if ( - ctx.nativeAvailable && - native?.NativeDatabase && - process.env.CODEGRAPH_FORCE_JS_PIPELINE !== '1' - ) { + // When native is available, use a NativeDbProxy backed by a single rusqlite + // connection. This eliminates the dual-connection WAL corruption problem. + // The Rust orchestrator handles the full pipeline; the proxy is used for any + // JS post-processing (e.g. structure fallback on large builds). + if (ctx.nativeAvailable && native?.NativeDatabase) { try { const dir = path.dirname(ctx.dbPath); if (!fs.existsSync(dir)) fs.mkdirSync(dir, { recursive: true }); @@ -264,13 +266,14 @@ interface NativeOrchestratorResult { structureScope?: string[]; /** Whether the Rust pipeline handled the structure phase (small-incremental fast path). */ structureHandled?: boolean; + /** Whether the Rust pipeline wrote AST/complexity/CFG/dataflow to DB. */ + analysisComplete?: boolean; } // ── Native orchestrator helpers ─────────────────────────────────────── /** Determine whether the native orchestrator should be skipped. Returns a reason string, or null if it should run. */ function shouldSkipNativeOrchestrator(ctx: PipelineContext): string | null { - if (process.env.CODEGRAPH_FORCE_JS_PIPELINE === '1') return 'CODEGRAPH_FORCE_JS_PIPELINE=1'; if (ctx.forceFullRebuild) return 'forceFullRebuild'; // v3.9.0 addon had buggy incremental purge (wrong SQL on analysis tables, // scoped removal over-detection). Fixed in v3.9.1 by PR #865. Gate on @@ -452,7 +455,11 @@ async function runPostNativeStructure( return performance.now() - structureStart; } -/** Run AST/complexity/CFG/dataflow analysis after native orchestrator. */ +/** + * JS fallback for AST/complexity/CFG/dataflow analysis after native orchestrator. + * Used when the Rust addon doesn't include analysis persistence (older addon + * version) or when analysis failed on the Rust side. 
+ */ async function runPostNativeAnalysis( ctx: PipelineContext, allFileSymbols: Map, @@ -472,45 +479,50 @@ async function runPostNativeAnalysis( analysisFileSymbols = allFileSymbols; } - // In native-first mode, nativeDb is already open — no reopen needed. - if (!ctx.nativeFirstProxy) { - const native = loadNative(); - if (native?.NativeDatabase) { + // Reopen nativeDb for analysis features (suspend/resume WAL pattern). + const native = loadNative(); + if (native?.NativeDatabase) { + try { + ctx.nativeDb = native.NativeDatabase.openReadWrite(ctx.dbPath); + if (ctx.engineOpts) ctx.engineOpts.nativeDb = ctx.nativeDb; + } catch { + ctx.nativeDb = undefined; + if (ctx.engineOpts) ctx.engineOpts.nativeDb = undefined; + } + } + + // Wire up WAL checkpoint callbacks for the analysis engine + if (ctx.nativeDb && ctx.engineOpts) { + ctx.engineOpts.suspendJsDb = () => { + ctx.db.pragma('wal_checkpoint(TRUNCATE)'); + }; + ctx.engineOpts.resumeJsDb = () => { try { - // Checkpoint JS WAL before opening native connection so both - // connections see the same DB state (structure writes are flushed). 
- ctx.db.pragma('wal_checkpoint(TRUNCATE)'); - ctx.nativeDb = native.NativeDatabase.openReadWrite(ctx.dbPath); - if (ctx.engineOpts) { - ctx.engineOpts.nativeDb = ctx.nativeDb; - ctx.engineOpts.suspendJsDb = () => { - ctx.db.pragma('wal_checkpoint(TRUNCATE)'); - }; - ctx.engineOpts.resumeJsDb = () => { - try { - ctx.nativeDb?.exec('PRAGMA wal_checkpoint(TRUNCATE)'); - } catch (e) { - debug( - `resumeJsDb: WAL checkpoint failed (nativeDb may already be closed): ${toErrorMessage(e)}`, - ); - } - }; - } - } catch { - ctx.nativeDb = undefined; - if (ctx.engineOpts) ctx.engineOpts.nativeDb = undefined; + ctx.nativeDb?.exec('PRAGMA wal_checkpoint(TRUNCATE)'); + } catch (e) { + debug( + `resumeJsDb: WAL checkpoint failed (nativeDb may already be closed): ${toErrorMessage(e)}`, + ); } - } + }; } try { - const { runAnalyses: runAnalysesFn } = await import('../../../ast-analysis/engine.js'); + const { runAnalyses: runAnalysesFn } = (await import('../../../ast-analysis/engine.js')) as { + runAnalyses: ( + db: BetterSqlite3Database, + fileSymbols: Map, + rootDir: string, + opts: Record, + engineOpts?: Record, + ) => Promise<{ astMs?: number; complexityMs?: number; cfgMs?: number; dataflowMs?: number }>; + }; const result = await runAnalysesFn( ctx.db, analysisFileSymbols, ctx.rootDir, - ctx.opts, - ctx.engineOpts, + ctx.opts as Record, + ctx.engineOpts as Record | undefined, ); timing.astMs = result.astMs ?? 0; timing.complexityMs = result.complexityMs ?? 
0; @@ -520,8 +532,8 @@ async function runPostNativeAnalysis( warn(`Analysis phases failed after native build: ${toErrorMessage(err)}`); } - // Close nativeDb after analyses (skip in native-first — single connection stays open) - if (ctx.nativeDb && !ctx.nativeFirstProxy) { + // Close nativeDb after analyses + if (ctx.nativeDb) { try { ctx.nativeDb.exec('PRAGMA wal_checkpoint(TRUNCATE)'); } catch { @@ -533,7 +545,11 @@ async function runPostNativeAnalysis( /* ignore close errors */ } ctx.nativeDb = undefined; - if (ctx.engineOpts) ctx.engineOpts.nativeDb = undefined; + if (ctx.engineOpts) { + ctx.engineOpts.nativeDb = undefined; + ctx.engineOpts.suspendJsDb = undefined; + ctx.engineOpts.resumeJsDb = undefined; + } } return timing; @@ -635,35 +651,40 @@ async function tryNativeOrchestrator( ); // ── Post-native structure + analysis ────────────────────────────── - let analysisTiming = { astMs: 0, complexityMs: 0, cfgMs: 0, dataflowMs: 0 }; + let analysisTiming = { + astMs: +(p.astMs ?? 0), + complexityMs: +(p.complexityMs ?? 0), + cfgMs: +(p.cfgMs ?? 0), + dataflowMs: +(p.dataflowMs ?? 0), + }; let structurePatchMs = 0; - const needsAnalysis = - ctx.opts.ast !== false || - ctx.opts.complexity !== false || - ctx.opts.cfg !== false || - ctx.opts.dataflow !== false; // Skip JS structure when the Rust pipeline's small-incremental fast path // already handled it. For full builds and large incrementals where Rust // skipped structure, we must run the JS fallback. const needsStructure = !result.structureHandled; - - if (needsAnalysis || needsStructure) { - // Always hand off to better-sqlite3 for JS post-processing. - // The NativeDbProxy has per-statement napi serialization overhead that - // makes structure/analysis phases significantly slower than direct - // better-sqlite3. Native bulk-insert methods (bulkInsertCfg, etc.) - // are wired through engineOpts.nativeDb in runPostNativeAnalysis. 
- if (ctx.nativeFirstProxy) ctx.nativeFirstProxy = false; - if (!handoffWalAfterNativeBuild(ctx)) { + // When the Rust addon doesn't include analysis persistence (older addon + // version or analysis failed), fall back to JS-side analysis. + const needsAnalysisFallback = + !result.analysisComplete && + (ctx.opts.ast !== false || + ctx.opts.complexity !== false || + ctx.opts.cfg !== false || + ctx.opts.dataflow !== false); + + if (needsStructure || needsAnalysisFallback) { + // When analysis fallback is needed, handoff to better-sqlite3 — the + // analysis engine uses the suspend/resume WAL pattern that requires a + // real better-sqlite3 connection, not the NativeDbProxy. + if (needsAnalysisFallback && ctx.nativeFirstProxy) { + closeNativeDb(ctx, 'pre-analysis-fallback'); + ctx.db = openDb(ctx.dbPath); + ctx.nativeFirstProxy = false; + } else if (!ctx.nativeFirstProxy && !handoffWalAfterNativeBuild(ctx)) { // DB reopen failed — return partial result return formatNativeTimingResult(p, 0, analysisTiming); } - // When structure was handled by Rust, we only need changed files for - // analysis — no need to load the entire graph from DB. When structure - // was NOT handled, we need all files to build the complete directory tree. - const scopeFiles = needsStructure ? 
undefined : result.changedFiles; - const fileSymbols = reconstructFileSymbolsFromDb(ctx, scopeFiles); + const fileSymbols = reconstructFileSymbolsFromDb(ctx); if (needsStructure) { structurePatchMs = await runPostNativeStructure( @@ -674,7 +695,7 @@ async function tryNativeOrchestrator( ); } - if (needsAnalysis) { + if (needsAnalysisFallback) { analysisTiming = await runPostNativeAnalysis(ctx, fileSymbols, result.changedFiles); } } @@ -686,30 +707,7 @@ async function tryNativeOrchestrator( // ── Pipeline stages execution ─────────────────────────────────────────── async function runPipelineStages(ctx: PipelineContext): Promise { - // ── Native-first mode ──────────────────────────────────────────────── - // When ctx.nativeFirstProxy is true, ctx.db is a NativeDbProxy backed by - // the single rusqlite connection (ctx.nativeDb). No dual-connection WAL - // dance is needed — every stage uses the same connection transparently. - if (ctx.nativeFirstProxy) { - // Ensure engineOpts.nativeDb is set so stages can use dedicated native methods. - if (ctx.engineOpts) { - ctx.engineOpts.nativeDb = ctx.nativeDb; - } - - await collectFiles(ctx); - await detectChanges(ctx); - if (ctx.earlyExit) return; - await parseFiles(ctx); - await insertNodes(ctx); - await resolveImports(ctx); - await buildEdges(ctx); - await buildStructure(ctx); - await runAnalyses(ctx); - await finalize(ctx); - return; - } - - // ── Legacy dual-connection mode (WASM / fallback) ──────────────────── + // ── WASM / fallback dual-connection mode ───────────────────────────── // NativeDatabase is deferred — not opened during setup. collectFiles and // detectChanges only need better-sqlite3. If no files changed, we exit // early without ever opening the native connection, saving ~5ms. @@ -717,6 +715,13 @@ async function runPipelineStages(ctx: PipelineContext): Promise { // suspend it now to avoid dual-connection WAL corruption during stages. 
if (ctx.db && ctx.nativeDb) { suspendNativeDb(ctx, 'pre-collect'); + // When nativeFirstProxy is true, ctx.db is a NativeDbProxy wrapping the + // now-closed NativeDatabase. Replace it with a real better-sqlite3 + // connection so the JS pipeline stages can operate normally. + if (ctx.nativeFirstProxy) { + ctx.db = openDb(ctx.dbPath); + ctx.nativeFirstProxy = false; + } } await collectFiles(ctx); diff --git a/src/domain/parser.ts b/src/domain/parser.ts index 97272262..57bd116e 100644 --- a/src/domain/parser.ts +++ b/src/domain/parser.ts @@ -780,7 +780,7 @@ export async function parseFileAuto( const { native } = resolveEngine(opts); if (native) { - const result = native.parseFile(filePath, source, !!opts.dataflow, opts.ast !== false); + const result = native.parseFile(filePath, source, true, true); if (!result) return null; const patched = patchNativeResult(result); // Always backfill typeMap for TS/TSX from WASM — native parser's type @@ -878,7 +878,11 @@ export async function parseFilesAuto( if (!native) return parseFilesWasm(filePaths, rootDir); const result = new Map(); - const nativeResults = native.parseFiles(filePaths, rootDir, !!opts.dataflow, opts.ast !== false); + // Always extract all analysis data (dataflow + AST nodes) during native parse. + // This eliminates the need for any downstream WASM re-parse or native standalone calls. + const nativeResults = native.parseFilesFull + ? native.parseFilesFull(filePaths, rootDir) + : native.parseFiles(filePaths, rootDir, true, true); const needsTypeMap: { filePath: string; relPath: string }[] = []; for (const r of nativeResults) { if (!r) continue; diff --git a/src/features/ast.ts b/src/features/ast.ts index 56119380..c31b6690 100644 --- a/src/features/ast.ts +++ b/src/features/ast.ts @@ -115,8 +115,8 @@ function tryNativeBulkInsert( receiver: n.receiver ?? 
'', })), }); - } else if (symbols.calls || symbols._tree) { - return false; // needs JS fallback + } else if (symbols._tree) { + return false; // has WASM tree not yet processed — needs JS fallback } } diff --git a/src/features/complexity.ts b/src/features/complexity.ts index 9d481057..509d0347 100644 --- a/src/features/complexity.ts +++ b/src/features/complexity.ts @@ -545,6 +545,10 @@ function collectNativeBulkRows( const rows: Array> = []; for (const [relPath, symbols] of fileSymbols) { + const ext = path.extname(relPath).toLowerCase(); + const langId = symbols._langId || ''; + const langSupported = COMPLEXITY_EXTENSIONS.has(ext) || COMPLEXITY_RULES.has(langId); + for (const def of symbols.definitions) { if (def.kind !== 'function' && def.kind !== 'method') continue; if (!def.line) continue; @@ -554,6 +558,9 @@ function collectNativeBulkRows( // of the native bulk-insert path for every TypeScript codebase (#846). if (!def.complexity) { if (def.name.includes('.') || !def.endLine || def.endLine <= def.line) continue; + // Languages without complexity rules will never have data — skip them + // rather than bailing out of the entire native bulk path. + if (!langSupported) continue; return null; // genuine function body missing complexity — needs JS fallback } const nodeId = getFunctionNodeId(db, def.name, relPath, def.line); diff --git a/src/types.ts b/src/types.ts index 8c6fc7fc..d8d7b00c 100644 --- a/src/types.ts +++ b/src/types.ts @@ -1874,6 +1874,7 @@ export type StmtCache = WeakMap,