diff --git a/packages/cli/src/repowise/cli/commands/update_cmd.py b/packages/cli/src/repowise/cli/commands/update_cmd.py index a687d0cb..a3570a4a 100644 --- a/packages/cli/src/repowise/cli/commands/update_cmd.py +++ b/packages/cli/src/repowise/cli/commands/update_cmd.py @@ -2,7 +2,6 @@ from __future__ import annotations -import asyncio import time from pathlib import Path from typing import Any @@ -103,64 +102,22 @@ def _resolve_index_only_mode( async def _persist_partial_health(session: Any, repo_id: str, report: Any) -> None: """Upsert health findings + metrics for the changed-files subset. - Unlike ``persist_pipeline_result`` (which delete-then-inserts the - whole repo), this writer only touches rows whose ``file_path`` is in - the partial report — so unchanged files keep their existing findings - and metrics across an incremental ``repowise update``. + Delegates to :mod:`repowise.core.pipeline.incremental` — the logic moved + to core so workspace updates can reuse the incremental path. """ - from repowise.core.persistence.crud import ( - upsert_health_findings, - upsert_health_metrics, - ) - - changed_paths = sorted({m.file_path for m in report.metrics or []}) - if not changed_paths: - return - await upsert_health_metrics(session, repo_id, report.metrics or []) - await upsert_health_findings( - session, repo_id, list(report.findings or []), file_paths=changed_paths - ) - # Per-function blame rollup for the changed files (keeps git_function_blame - # current between full indexes; FULL git tier only — empty otherwise). - fn_blame_rows = getattr(report, "function_blame_rows", None) - if fn_blame_rows: - from repowise.core.persistence.crud import upsert_git_function_blame_bulk + from repowise.core.pipeline.incremental import persist_partial_health - await upsert_git_function_blame_bulk(session, repo_id, fn_blame_rows) + await persist_partial_health(session, repo_id, report) async def _persist_incremental_commits(session: Any, repo_id: str, repo_path: Any) -> None: """Capture + upsert ``git_commits`` rows for commits new since the last index. - Foundation 1 only populated the per-commit table on the full orchestrator - index; without this, the commits/change-risk surface goes stale between full - re-indexes. Bounds the walk to commits newer than the newest persisted - ``committed_at`` (one ``git log`` pass) and upserts (idempotent on sha). + Delegates to :mod:`repowise.core.pipeline.incremental`. """ - from repowise.core.ingestion.git_indexer import GitIndexer - from repowise.core.persistence.crud import ( - get_latest_commit_committed_at, - upsert_git_commits_bulk, - ) + from repowise.core.pipeline.incremental import persist_incremental_commits - cfg = load_config(repo_path) - indexer = GitIndexer( - repo_path, - commit_limit=cfg.get("commit_limit"), - follow_renames=cfg.get("follow_renames", False), - ) - newest = await get_latest_commit_committed_at(session, repo_id) - since_ts: int | None = None - if newest is not None: - # SQLite drops tzinfo, so a naive read must be interpreted as UTC (the - # column is stored tz-aware) rather than local time. - from datetime import UTC - - dt = newest if newest.tzinfo is not None else newest.replace(tzinfo=UTC) - since_ts = int(dt.timestamp()) - rows = await asyncio.to_thread(indexer.capture_new_commit_rows, since_ts=since_ts) - if rows: - await upsert_git_commits_bulk(session, repo_id, rows) + await persist_incremental_commits(session, repo_id, repo_path) # --------------------------------------------------------------------------- @@ -326,13 +283,9 @@ def _refresh_workspace_editor_project_files( def _build_filtered_changed_paths(file_diffs: list, exclude_patterns: list[str]) -> list[str]: """Extract paths from file_diffs, filtering out excluded patterns.""" - paths = [fd.path for fd in file_diffs] - if not exclude_patterns: - return paths - import pathspec + from repowise.core.pipeline.incremental import build_filtered_changed_paths - spec = pathspec.PathSpec.from_lines("gitwildmatch", exclude_patterns) - return [p for p in paths if not spec.match_file(p)] + return build_filtered_changed_paths(file_diffs, exclude_patterns) def _build_repo_graph( @@ -345,92 +298,22 @@ def _build_repo_graph( ) -> tuple[list, dict[str, bytes], Any, Any, int]: """Traverse + parse the repo and build the graph (+ framework-aware edges). - Shared by the incremental rebuild path (:func:`_rebuild_graph_and_git`) and - the config-triggered re-score path (:func:`_run_full_health_rescore`) so both - build the same graph from the same parser and the same synthetic edge step. - - Files that fail to read/parse are skipped and reported as a count rather than - swallowed silently. ``source_map`` is populated only when ``collect_sources`` - is set (the re-score path doesn't need the raw bytes). - - Returns ``(parsed_files, source_map, graph_builder, repo_structure, - file_count)``. + Delegates to :mod:`repowise.core.pipeline.incremental` — the logic moved + to core so workspace updates can reuse the incremental path. Shared by + the incremental rebuild path (:func:`_rebuild_graph_and_git`) and the + config-triggered re-score path (:func:`_run_full_health_rescore`). """ - from pathlib import Path as PathlibPath - - from repowise.core.ingestion import ASTParser, FileTraverser, GraphBuilder, compute_content_hash + from repowise.core.pipeline.incremental import build_repo_graph - traverser = FileTraverser( + return build_repo_graph( repo_path, - extra_exclude_patterns=exclude_patterns or None, - include_submodules=include_submodules, - include_nested_repos=include_nested_repos, - ) - file_infos = list(traverser.traverse()) - repo_structure = traverser.get_repo_structure() - - # Content-hash parse cache: an incremental update re-ingests the whole - # repo, but only the changed files actually need a tree-sitter parse. - # Best-effort — any cache failure falls back to a full parse. - parse_cache = None - try: - from repowise.core.ingestion.parse_cache import ParseCache - - parse_cache = ParseCache(PathlibPath(repo_path) / ".repowise") - parse_cache.load() - except Exception: - parse_cache = None - - parser: Any = None # constructed lazily — every-file-cached updates skip query compilation - parsed_files: list = [] - source_map: dict[str, bytes] = {} - graph_builder = GraphBuilder( - repo_path, - exclude_patterns=exclude_patterns, - centrality_cache_dir=PathlibPath(repo_path) / ".repowise", + exclude_patterns, + collect_sources=collect_sources, include_submodules=include_submodules, include_nested_repos=include_nested_repos, + log=console.print, ) - skipped = 0 - for fi in file_infos: - try: - source = PathlibPath(fi.abs_path).read_bytes() - content_hash = compute_content_hash(source) - parsed = parse_cache.get(fi, content_hash) if parse_cache is not None else None - if parsed is None: - if parser is None: - parser = ASTParser() - parsed = parser.parse_file(fi, source) - if parse_cache is not None: - parse_cache.put(parsed, content_hash) - except Exception: - skipped += 1 - continue - parsed_files.append(parsed) - if collect_sources: - source_map[fi.path] = source - graph_builder.add_file(parsed) - graph_builder.build() - if parse_cache is not None: - parse_cache.save() - - if skipped: - console.print(f"[yellow]Skipped {skipped} file(s) that failed to parse.[/yellow]") - - # Add framework-aware synthetic edges (conftest, Django, FastAPI, Flask). - try: - from repowise.core.generation.editor_files.tech_stack import detect_tech_stack - - tech_items = detect_tech_stack(repo_path) - fw_count = graph_builder.add_framework_edges([item.name for item in tech_items]) - if fw_count: - console.print(f"Framework edges added: [cyan]{fw_count}[/cyan]") - except Exception: - pass # framework edge detection is best-effort - - return parsed_files, source_map, graph_builder, repo_structure, len(file_infos) - def _rebuild_graph_and_git( repo_path: Any, @@ -454,55 +337,26 @@ def _rebuild_graph_and_git( silently drop its submodule files on every incremental update. Missing keys fall back to False (legacy behavior). + Delegates to :mod:`repowise.core.pipeline.incremental` — the logic moved + to core so workspace updates can reuse the incremental path. + Returns ``(parsed_files, source_map, graph_builder, repo_structure, file_count, git_meta_map)``. """ - # Full re-ingest for graph (needed for cascade analysis) - parsed_files, source_map, graph_builder, repo_structure, file_count = _build_repo_graph( - repo_path, - exclude_patterns, - collect_sources=True, - include_submodules=include_submodules, - include_nested_repos=include_nested_repos, - ) - - # Re-index git metadata for changed files - git_meta_map: dict[str, dict] = {} - try: - from repowise.core.ingestion.git_indexer import GitIndexer - from repowise.core.ingestion.git_indexer.tiers import GitIndexTier + from repowise.core.pipeline.incremental import rebuild_graph_and_git - try: - tier = GitIndexTier(git_tier) if git_tier else GitIndexTier.FULL - except ValueError: - tier = GitIndexTier.FULL - _commit_limit = cfg.get("commit_limit") - _follow_renames = cfg.get("follow_renames", False) - git_indexer = GitIndexer( + return run_async( + rebuild_graph_and_git( repo_path, - commit_limit=_commit_limit, - follow_renames=_follow_renames, - exclude_patterns=exclude_patterns or None, - tier=tier, + file_diffs, + cfg, + exclude_patterns, + git_tier=git_tier, + include_submodules=include_submodules, + include_nested_repos=include_nested_repos, + log=console.print, ) - changed_paths = _build_filtered_changed_paths(file_diffs, exclude_patterns) - updated_meta = run_async(git_indexer.index_changed_files(changed_paths)) - git_meta_map = {m["file_path"]: m for m in updated_meta} - graph_builder.update_co_change_edges(git_meta_map) - except Exception as exc: - console.print(f"[yellow]Git re-index skipped: {exc}[/yellow]") - - # Pre-compute centrality/community metrics with the init path's fan-out - # parallelism. Without this, persist_graph_nodes computes the same - # metrics lazily one-by-one. Runs after the co-change edge refresh so - # the cached subgraphs reflect the final structure. Best-effort: every - # metric still falls back to lazy computation. - try: - run_async(graph_builder.compute_metrics_parallel()) - except Exception as exc: - console.print(f"[yellow]Metric pre-computation skipped: {exc}[/yellow]") - - return parsed_files, source_map, graph_builder, repo_structure, file_count, git_meta_map + ) def _run_partial_analysis( @@ -514,61 +368,22 @@ def _run_partial_analysis( ) -> tuple[Any, Any]: """Run partial code-health + dead-code analysis for the changed files. + Delegates to :mod:`repowise.core.pipeline.incremental` — the logic moved + to core so workspace updates can reuse the incremental path. + Returns ``(partial_health_report, dead_code_report)`` — either may be ``None`` if its analysis failed (both are best-effort). """ - # Run partial code-health analysis up front so both the index-only - # and full paths can upsert findings/metrics for changed files only. - # The full file-list is needed because duplication is cross-file — - # but only files in ``changed_paths`` produce new findings/metrics. - partial_health_report = None - try: - from repowise.core.analysis.health import HealthAnalyzer - from repowise.core.analysis.health.config import HealthConfig - - _health_analyzer = HealthAnalyzer( - graph_builder.graph(), - git_meta_map=git_meta_map, - parsed_files=parsed_files, - duplication_cache_dir=Path(repo_path) / ".repowise", - ) - _health_changed = {fd.path for fd in file_diffs if fd.status in ("added", "modified")} - if _health_changed: - _hcfg = HealthConfig.load(repo_path) - _analyzer_config = ( - _hcfg.to_analyzer_config([pf.file_info.path for pf in parsed_files]) - if (_hcfg.disabled_biomarkers or _hcfg.rules) - else None - ) - partial_health_report = _health_analyzer.analyze( - _analyzer_config, changed_files=_health_changed - ) - console.print( - f"Health analysis (partial): [cyan]{len(_health_changed)} files[/cyan], " - f"[yellow]{len(partial_health_report.findings)} findings[/yellow]" - ) - except Exception as exc: - console.print(f"[yellow]Health analysis skipped: {exc}[/yellow]") + from repowise.core.pipeline.incremental import run_partial_analysis - # Run partial dead-code analysis up front so both branches can - # persist its results. Previously this sat below the ``if index_only`` - # short-circuit, which left the closure's reference to - # ``dead_code_report`` unbound and crashed every ``--index-only`` run. - dead_code_report = None - try: - from repowise.core.analysis.dead_code import DeadCodeAnalyzer - - _analyzer_partial = DeadCodeAnalyzer(graph_builder.graph(), git_meta_map) - _changed_paths_partial = [fd.path for fd in file_diffs] - dead_code_report = _analyzer_partial.analyze_partial(_changed_paths_partial) - if dead_code_report.total_findings: - console.print( - f"Dead code findings (partial): [yellow]{dead_code_report.total_findings}[/yellow]" - ) - except Exception as exc: - console.print(f"[yellow]Dead code analysis skipped: {exc}[/yellow]") - - return partial_health_report, dead_code_report + return run_partial_analysis( + repo_path, + graph_builder, + git_meta_map, + parsed_files, + file_diffs, + log=console.print, + ) def _persist_index_only_update( @@ -583,76 +398,24 @@ def _persist_index_only_update( changed_paths: list[str], ) -> None: """Persist the index-only update (graph + git + dead-code + health), save - state, and print the completion line. No LLM regeneration.""" - - async def _persist_index_only() -> None: - from repowise.cli.helpers import get_db_url_for_repo - from repowise.core.persistence import ( - create_engine, - create_session_factory, - get_session, - init_db, - upsert_repository, - ) - - url = get_db_url_for_repo(repo_path) - engine = create_engine(url) - await init_db(engine) - sf = create_session_factory(engine) + state, and print the completion line. No LLM regeneration. - async with get_session(sf) as session: - repo = await upsert_repository(session, name=repo_path.name, local_path=str(repo_path)) - repo_id = repo.id - - if git_meta_map: - try: - from repowise.core.persistence.crud import ( - recompute_git_percentiles, - upsert_git_metadata_bulk, - ) - - await upsert_git_metadata_bulk(session, repo_id, list(git_meta_map.values())) - await recompute_git_percentiles(session, repo_id) - except Exception as exc: - console.print(f"[yellow]Git persist skipped: {exc}[/yellow]") - - try: - await _persist_incremental_commits(session, repo_id, repo_path) - except Exception as exc: - console.print(f"[yellow]Commit capture skipped: {exc}[/yellow]") - - if dead_code_report is not None: - try: - from repowise.core.persistence.crud import ( - upsert_dead_code_findings, - ) - - await upsert_dead_code_findings( - session, repo_id, dead_code_report.findings, file_paths=changed_paths - ) - except Exception as exc: - console.print(f"[yellow]Dead-code persist skipped: {exc}[/yellow]") - - if partial_health_report is not None: - try: - await _persist_partial_health(session, repo_id, partial_health_report) - except Exception as exc: - console.print(f"[yellow]Health persist skipped: {exc}[/yellow]") - - # Re-persist graph_nodes so symbol-level PageRank / - # betweenness / community ids stay in sync with the - # current graph build. Without this, ``repowise update`` - # leaves stale per-symbol metrics from the original init - # and the UI shows "Not indexed in graph" for every - # symbol on existing repos. - try: - from repowise.core.pipeline.persist import persist_graph_nodes - - await persist_graph_nodes(session, repo_id, graph_builder) - except Exception as exc: - console.print(f"[yellow]Graph nodes persist skipped: {exc}[/yellow]") + DB persistence delegates to :mod:`repowise.core.pipeline.incremental`; + state-file updates and console reporting stay here. + """ + from repowise.core.pipeline.incremental import persist_incremental_index - run_async(_persist_index_only()) + run_async( + persist_incremental_index( + repo_path, + graph_builder, + git_meta_map, + dead_code_report, + partial_health_report, + changed_paths, + log=console.print, + ) + ) from repowise.cli.helpers import config_fingerprint save_state( diff --git a/packages/core/src/repowise/core/pipeline/incremental.py b/packages/core/src/repowise/core/pipeline/incremental.py new file mode 100644 index 00000000..aa5a0b75 --- /dev/null +++ b/packages/core/src/repowise/core/pipeline/incremental.py @@ -0,0 +1,438 @@ +"""Incremental (changed-files) index refresh. + +The orchestration that `repowise update` runs for an already-indexed repo: +re-ingest the graph (parse-cache backed), re-index git metadata for the +changed files only, run partial health/dead-code analysis, and upsert the +results — without the full pipeline's delete-then-insert persistence or LLM +generation. + +Extracted from the CLI update command so workspace updates can route +already-indexed member repos through the same incremental path instead of +re-running the full init pipeline per repo. The CLI keeps thin wrappers +that delegate here. + +Progress/diagnostic messages go through an optional ``log`` callback (the +CLI passes ``console.print``; messages use rich markup). When ``log`` is +omitted the messages are dropped — every one of them annotates a +best-effort step that already degrades gracefully. +""" + +from __future__ import annotations + +import asyncio +from collections.abc import Callable +from pathlib import Path +from typing import Any + +LogFn = Callable[[str], None] + + +def _noop_log(message: str) -> None: # pragma: no cover - trivial + return None + + +def build_filtered_changed_paths(file_diffs: list, exclude_patterns: list[str]) -> list[str]: + """Extract paths from file_diffs, filtering out excluded patterns.""" + paths = [fd.path for fd in file_diffs] + if not exclude_patterns: + return paths + import pathspec + + spec = pathspec.PathSpec.from_lines("gitwildmatch", exclude_patterns) + return [p for p in paths if not spec.match_file(p)] + + +def build_repo_graph( + repo_path: Any, + exclude_patterns: list[str], + *, + collect_sources: bool = False, + include_submodules: bool = False, + include_nested_repos: bool = False, + log: LogFn | None = None, +) -> tuple[list, dict[str, bytes], Any, Any, int]: + """Traverse + parse the repo and build the graph (+ framework-aware edges). + + Shared by the incremental rebuild path (:func:`rebuild_graph_and_git`) and + the config-triggered re-score path so both build the same graph from the + same parser and the same synthetic edge step. + + Files that fail to read/parse are skipped and reported as a count rather than + swallowed silently. ``source_map`` is populated only when ``collect_sources`` + is set (the re-score path doesn't need the raw bytes). + + Returns ``(parsed_files, source_map, graph_builder, repo_structure, + file_count)``. + """ + from repowise.core.ingestion import ASTParser, FileTraverser, GraphBuilder, compute_content_hash + + log = log or _noop_log + + traverser = FileTraverser( + repo_path, + extra_exclude_patterns=exclude_patterns or None, + include_submodules=include_submodules, + include_nested_repos=include_nested_repos, + ) + file_infos = list(traverser.traverse()) + repo_structure = traverser.get_repo_structure() + + # Content-hash parse cache: an incremental update re-ingests the whole + # repo, but only the changed files actually need a tree-sitter parse. + # Best-effort — any cache failure falls back to a full parse. + parse_cache = None + try: + from repowise.core.ingestion.parse_cache import ParseCache + + parse_cache = ParseCache(Path(repo_path) / ".repowise") + parse_cache.load() + except Exception: + parse_cache = None + + parser: Any = None # constructed lazily — every-file-cached updates skip query compilation + parsed_files: list = [] + source_map: dict[str, bytes] = {} + graph_builder = GraphBuilder( + repo_path, + exclude_patterns=exclude_patterns, + centrality_cache_dir=Path(repo_path) / ".repowise", + include_submodules=include_submodules, + include_nested_repos=include_nested_repos, + ) + + skipped = 0 + for fi in file_infos: + try: + source = Path(fi.abs_path).read_bytes() + content_hash = compute_content_hash(source) + parsed = parse_cache.get(fi, content_hash) if parse_cache is not None else None + if parsed is None: + if parser is None: + parser = ASTParser() + parsed = parser.parse_file(fi, source) + if parse_cache is not None: + parse_cache.put(parsed, content_hash) + except Exception: + skipped += 1 + continue + parsed_files.append(parsed) + if collect_sources: + source_map[fi.path] = source + graph_builder.add_file(parsed) + graph_builder.build() + if parse_cache is not None: + parse_cache.save() + + if skipped: + log(f"[yellow]Skipped {skipped} file(s) that failed to parse.[/yellow]") + + # Add framework-aware synthetic edges (conftest, Django, FastAPI, Flask). + try: + from repowise.core.generation.editor_files.tech_stack import detect_tech_stack + + tech_items = detect_tech_stack(repo_path) + fw_count = graph_builder.add_framework_edges([item.name for item in tech_items]) + if fw_count: + log(f"Framework edges added: [cyan]{fw_count}[/cyan]") + except Exception: + pass # framework edge detection is best-effort + + return parsed_files, source_map, graph_builder, repo_structure, len(file_infos) + + +async def rebuild_graph_and_git( + repo_path: Any, + file_diffs: list, + cfg: dict, + exclude_patterns: list[str], + *, + git_tier: str | None = None, + include_submodules: bool = False, + include_nested_repos: bool = False, + log: LogFn | None = None, +) -> tuple[list, dict[str, bytes], Any, Any, int, dict[str, dict]]: + """Re-traverse + parse the repo, rebuild the graph (+ framework edges), and + re-index git metadata for the changed files. + + ``git_tier`` is the persisted ``state.json:git_tier`` value: a fast-mode + (ESSENTIAL) repo must not pay per-file blame on every update for signals + its index never had. Unknown/missing values fall back to FULL, matching + the historical behavior for legacy state files. + + ``include_submodules`` / ``include_nested_repos`` are likewise read from + state.json: a repo indexed with ``init --include-submodules`` must not + silently drop its submodule files on every incremental update. Missing + keys fall back to False (legacy behavior). + + Returns ``(parsed_files, source_map, graph_builder, repo_structure, + file_count, git_meta_map)``. + """ + log = log or _noop_log + + # Full re-ingest for graph (needed for cascade analysis) + parsed_files, source_map, graph_builder, repo_structure, file_count = build_repo_graph( + repo_path, + exclude_patterns, + collect_sources=True, + include_submodules=include_submodules, + include_nested_repos=include_nested_repos, + log=log, + ) + + # Re-index git metadata for changed files + git_meta_map: dict[str, dict] = {} + try: + from repowise.core.ingestion.git_indexer import GitIndexer + from repowise.core.ingestion.git_indexer.tiers import GitIndexTier + + try: + tier = GitIndexTier(git_tier) if git_tier else GitIndexTier.FULL + except ValueError: + tier = GitIndexTier.FULL + _commit_limit = cfg.get("commit_limit") + _follow_renames = cfg.get("follow_renames", False) + git_indexer = GitIndexer( + repo_path, + commit_limit=_commit_limit, + follow_renames=_follow_renames, + exclude_patterns=exclude_patterns or None, + tier=tier, + ) + changed_paths = build_filtered_changed_paths(file_diffs, exclude_patterns) + updated_meta = await git_indexer.index_changed_files(changed_paths) + git_meta_map = {m["file_path"]: m for m in updated_meta} + graph_builder.update_co_change_edges(git_meta_map) + except Exception as exc: + log(f"[yellow]Git re-index skipped: {exc}[/yellow]") + + # Pre-compute centrality/community metrics with the init path's fan-out + # parallelism. Without this, persist_graph_nodes computes the same + # metrics lazily one-by-one. Runs after the co-change edge refresh so + # the cached subgraphs reflect the final structure. Best-effort: every + # metric still falls back to lazy computation. + try: + await graph_builder.compute_metrics_parallel() + except Exception as exc: + log(f"[yellow]Metric pre-computation skipped: {exc}[/yellow]") + + return parsed_files, source_map, graph_builder, repo_structure, file_count, git_meta_map + + +def run_partial_analysis( + repo_path: Any, + graph_builder: Any, + git_meta_map: dict, + parsed_files: list, + file_diffs: list, + *, + log: LogFn | None = None, +) -> tuple[Any, Any]: + """Run partial code-health + dead-code analysis for the changed files. + + Returns ``(partial_health_report, dead_code_report)`` — either may be + ``None`` if its analysis failed (both are best-effort). + """ + log = log or _noop_log + + # Run partial code-health analysis up front so both the index-only + # and full paths can upsert findings/metrics for changed files only. + # The full file-list is needed because duplication is cross-file — + # but only files in ``changed_paths`` produce new findings/metrics. + partial_health_report = None + try: + from repowise.core.analysis.health import HealthAnalyzer + from repowise.core.analysis.health.config import HealthConfig + + _health_analyzer = HealthAnalyzer( + graph_builder.graph(), + git_meta_map=git_meta_map, + parsed_files=parsed_files, + duplication_cache_dir=Path(repo_path) / ".repowise", + ) + _health_changed = {fd.path for fd in file_diffs if fd.status in ("added", "modified")} + if _health_changed: + _hcfg = HealthConfig.load(repo_path) + _analyzer_config = ( + _hcfg.to_analyzer_config([pf.file_info.path for pf in parsed_files]) + if (_hcfg.disabled_biomarkers or _hcfg.rules) + else None + ) + partial_health_report = _health_analyzer.analyze( + _analyzer_config, changed_files=_health_changed + ) + log( + f"Health analysis (partial): [cyan]{len(_health_changed)} files[/cyan], " + f"[yellow]{len(partial_health_report.findings)} findings[/yellow]" + ) + except Exception as exc: + log(f"[yellow]Health analysis skipped: {exc}[/yellow]") + + # Run partial dead-code analysis up front so both branches can + # persist its results. Previously this sat below the ``if index_only`` + # short-circuit, which left the closure's reference to + # ``dead_code_report`` unbound and crashed every ``--index-only`` run. + dead_code_report = None + try: + from repowise.core.analysis.dead_code import DeadCodeAnalyzer + + _analyzer_partial = DeadCodeAnalyzer(graph_builder.graph(), git_meta_map) + _changed_paths_partial = [fd.path for fd in file_diffs] + dead_code_report = _analyzer_partial.analyze_partial(_changed_paths_partial) + if dead_code_report.total_findings: + log(f"Dead code findings (partial): [yellow]{dead_code_report.total_findings}[/yellow]") + except Exception as exc: + log(f"[yellow]Dead code analysis skipped: {exc}[/yellow]") + + return partial_health_report, dead_code_report + + +async def persist_partial_health(session: Any, repo_id: str, report: Any) -> None: + """Upsert health findings + metrics for the changed-files subset. + + Unlike ``persist_pipeline_result`` (which delete-then-inserts the + whole repo), this writer only touches rows whose ``file_path`` is in + the partial report — so unchanged files keep their existing findings + and metrics across an incremental ``repowise update``. + """ + from repowise.core.persistence.crud import ( + upsert_health_findings, + upsert_health_metrics, + ) + + changed_paths = sorted({m.file_path for m in report.metrics or []}) + if not changed_paths: + return + await upsert_health_metrics(session, repo_id, report.metrics or []) + await upsert_health_findings( + session, repo_id, list(report.findings or []), file_paths=changed_paths + ) + # Per-function blame rollup for the changed files (keeps git_function_blame + # current between full indexes; FULL git tier only — empty otherwise). + fn_blame_rows = getattr(report, "function_blame_rows", None) + if fn_blame_rows: + from repowise.core.persistence.crud import upsert_git_function_blame_bulk + + await upsert_git_function_blame_bulk(session, repo_id, fn_blame_rows) + + +async def persist_incremental_commits(session: Any, repo_id: str, repo_path: Any) -> None: + """Capture + upsert ``git_commits`` rows for commits new since the last index. + + Foundation 1 only populated the per-commit table on the full orchestrator + index; without this, the commits/change-risk surface goes stale between full + re-indexes. Bounds the walk to commits newer than the newest persisted + ``committed_at`` (one ``git log`` pass) and upserts (idempotent on sha). + """ + from repowise.core.ingestion.git_indexer import GitIndexer + from repowise.core.persistence.crud import ( + get_latest_commit_committed_at, + upsert_git_commits_bulk, + ) + from repowise.core.repo_config import load_repo_config + + cfg = load_repo_config(repo_path) + indexer = GitIndexer( + repo_path, + commit_limit=cfg.get("commit_limit"), + follow_renames=cfg.get("follow_renames", False), + ) + newest = await get_latest_commit_committed_at(session, repo_id) + since_ts: int | None = None + if newest is not None: + # SQLite drops tzinfo, so a naive read must be interpreted as UTC (the + # column is stored tz-aware) rather than local time. + from datetime import UTC + + dt = newest if newest.tzinfo is not None else newest.replace(tzinfo=UTC) + since_ts = int(dt.timestamp()) + rows = await asyncio.to_thread(indexer.capture_new_commit_rows, since_ts=since_ts) + if rows: + await upsert_git_commits_bulk(session, repo_id, rows) + + +async def persist_incremental_index( + repo_path: Any, + graph_builder: Any, + git_meta_map: dict, + dead_code_report: Any, + partial_health_report: Any, + changed_paths: list[str], + *, + log: LogFn | None = None, +) -> None: + """Persist an incremental index refresh (graph + git + dead-code + health). + + Upsert-only: unchanged files keep their existing rows, unlike + ``persist_pipeline_result``'s delete-then-insert. State-file updates stay + with the caller — this writes the DB only. + """ + from repowise.core.persistence import ( + create_engine, + create_session_factory, + get_session, + init_db, + upsert_repository, + ) + from repowise.core.persistence.database import resolve_db_url + + log = log or _noop_log + + url = resolve_db_url(repo_path) + engine = create_engine(url) + try: + await init_db(engine) + sf = create_session_factory(engine) + + async with get_session(sf) as session: + repo = await upsert_repository(session, name=repo_path.name, local_path=str(repo_path)) + repo_id = repo.id + + if git_meta_map: + try: + from repowise.core.persistence.crud import ( + recompute_git_percentiles, + upsert_git_metadata_bulk, + ) + + await upsert_git_metadata_bulk(session, repo_id, list(git_meta_map.values())) + await recompute_git_percentiles(session, repo_id) + except Exception as exc: + log(f"[yellow]Git persist skipped: {exc}[/yellow]") + + try: + await persist_incremental_commits(session, repo_id, repo_path) + except Exception as exc: + log(f"[yellow]Commit capture skipped: {exc}[/yellow]") + + if dead_code_report is not None: + try: + from repowise.core.persistence.crud import ( + upsert_dead_code_findings, + ) + + await upsert_dead_code_findings( + session, repo_id, dead_code_report.findings, file_paths=changed_paths + ) + except Exception as exc: + log(f"[yellow]Dead-code persist skipped: {exc}[/yellow]") + + if partial_health_report is not None: + try: + await persist_partial_health(session, repo_id, partial_health_report) + except Exception as exc: + log(f"[yellow]Health persist skipped: {exc}[/yellow]") + + # Re-persist graph_nodes so symbol-level PageRank / + # betweenness / community ids stay in sync with the + # current graph build. Without this, ``repowise update`` + # leaves stale per-symbol metrics from the original init + # and the UI shows "Not indexed in graph" for every + # symbol on existing repos. + try: + from repowise.core.pipeline.persist import persist_graph_nodes + + await persist_graph_nodes(session, repo_id, graph_builder) + except Exception as exc: + log(f"[yellow]Graph nodes persist skipped: {exc}[/yellow]") + finally: + await engine.dispose() diff --git a/packages/core/src/repowise/core/workspace/update.py b/packages/core/src/repowise/core/workspace/update.py index ec1e968e..79dc8ee8 100644 --- a/packages/core/src/repowise/core/workspace/update.py +++ b/packages/core/src/repowise/core/workspace/update.py @@ -135,19 +135,42 @@ def count_commits_between(repo_path: Path, base: str, head: str) -> int: return 0 -def read_state_commit(repo_path: Path) -> str | None: - """Return ``last_sync_commit`` from ``/.repowise/state.json`` or None.""" - import json as _json +def commit_exists(repo_path: Path, sha: str) -> bool: + """Return True when *sha* resolves to a commit in *repo_path*. + The incremental path must verify this itself: ``ChangeDetector`` returns + an **empty** diff for unresolvable refs, which would masquerade as "no + changes" and let the caller bump ``last_sync_commit`` past commits that + were never indexed (e.g. after a rebase or an aggressive ``git gc``). + """ + try: + result = subprocess.run( + ["git", "cat-file", "-e", f"{sha}^{{commit}}"], + cwd=str(repo_path), + capture_output=True, + timeout=10, + ) + return result.returncode == 0 + except Exception: + return False + + +def read_repo_state(repo_path: Path) -> dict[str, Any]: + """Return the parsed ``/.repowise/state.json``, or ``{}``.""" state_path = repo_path / ".repowise" / "state.json" if not state_path.is_file(): - return None + return {} try: data = _json.loads(state_path.read_text(encoding="utf-8")) - sha = data.get("last_sync_commit") - return str(sha) if sha else None + return data if isinstance(data, dict) else {} except Exception: - return None + return {} + + +def read_state_commit(repo_path: Path) -> str | None: + """Return ``last_sync_commit`` from ``/.repowise/state.json`` or None.""" + sha = read_repo_state(repo_path).get("last_sync_commit") + return str(sha) if sha else None def sync_workspace_state_from_disk( @@ -213,6 +236,106 @@ def check_repo_staleness( # --------------------------------------------------------------------------- +async def _incremental_repo_update( + repo_path: Path, + *, + state: dict[str, Any], + base_ref: str, + exclude_patterns: list[str] | None = None, +) -> RepoUpdateResult | None: + """Refresh an already-indexed repo through the incremental update path. + + Mirrors the single-repo ``repowise update --index-only`` flow: diff + ``base_ref..HEAD``, rebuild the graph (parse-cache backed), re-index git + metadata for the changed files only, run partial health/dead-code + analysis, and upsert the results. State-file updates stay with the + caller (``update_workspace``'s ``_update_one``). + + Returns ``None`` when the diff contains deletions or renames: the + incremental persistence is upsert-only, so rows for removed paths would + linger in graph/health tables forever. The full pipeline's + delete-then-insert persistence prunes them — the caller runs it instead. + + Raises on failure — the caller falls back to the full pipeline. + """ + from ..ingestion.change_detector import ChangeDetector + from ..pipeline.incremental import ( + persist_incremental_index, + rebuild_graph_and_git, + run_partial_analysis, + ) + from ..pipeline.phases.git import drop_transient_git_signals + from ..repo_config import load_repo_config + + alias = repo_path.name + head = get_head_commit(repo_path) or "HEAD" + + detector = ChangeDetector(repo_path) + file_diffs = detector.get_changed_files(base_ref, head) + if not file_diffs: + # New commits but nothing the index cares about changed (merge/empty + # commits, or every change excluded). Report success so the caller + # bumps ``last_sync_commit`` instead of re-diffing forever. + return RepoUpdateResult(alias=alias, updated=True) + + if any(fd.status in ("deleted", "renamed") for fd in file_diffs): + # Upsert-only persistence can't remove rows for paths that no longer + # exist; hand off to the full pipeline so its prune pass cleans up. + _log.info( + "workspace_update: %s has deleted/renamed files — using the full " + "pipeline so stale index rows are pruned", + alias, + ) + return None + + # Per-repo config, like the single-repo update path. The workspace-level + # ``exclude_patterns`` (when provided) apply on top. + cfg = load_repo_config(repo_path) + merged_excludes = list(cfg.get("exclude_patterns") or []) + for pattern in exclude_patterns or []: + if pattern not in merged_excludes: + merged_excludes.append(pattern) + + parsed_files, _source_map, graph_builder, _structure, file_count, git_meta_map = ( + await rebuild_graph_and_git( + repo_path, + file_diffs, + cfg, + merged_excludes, + git_tier=state.get("git_tier"), + include_submodules=bool(state.get("include_submodules", False)), + include_nested_repos=bool(state.get("include_nested_repos", False)), + log=_log.info, + ) + ) + + partial_health_report, dead_code_report = run_partial_analysis( + repo_path, graph_builder, git_meta_map, parsed_files, file_diffs, log=_log.info + ) + + # Partial health has consumed the per-file ``BlameIndex``; drop it before + # the metadata reaches persistence so the transient, non-serializable + # object can never leak downstream (mirrors the CLI update path). + drop_transient_git_signals(list(git_meta_map.values())) + + await persist_incremental_index( + repo_path, + graph_builder, + git_meta_map, + dead_code_report, + partial_health_report, + [fd.path for fd in file_diffs], + log=_log.info, + ) + + return RepoUpdateResult( + alias=alias, + updated=True, + file_count=file_count, + symbol_count=sum(len(pf.symbols) for pf in parsed_files), + ) + + async def update_single_repo_index( repo_path: Path, *, @@ -220,12 +343,14 @@ async def update_single_repo_index( exclude_patterns: list[str] | None = None, progress: Any | None = None, ) -> RepoUpdateResult: - """Re-run the ingestion pipeline (index-only) for a single repo. + """Refresh the index for a single repo. - This refreshes graph, git stats, dead code, and decisions — everything - except wiki pages. Used by workspace update when no LLM provider is set. + Already-indexed repos (a persisted ``last_sync_commit`` whose commit + still resolves, plus an existing ``wiki.db``) go through the incremental + update path — changed-files diff, partial analysis, upsert persistence. + Never-indexed repos, and any incremental failure, run the full + ingestion pipeline instead (index-only — no wiki pages). """ - from ..pipeline import run_pipeline from ..persistence import ( create_engine, create_session_factory, @@ -234,15 +359,42 @@ async def update_single_repo_index( upsert_repository, ) from ..persistence.database import resolve_db_url + from ..pipeline import run_pipeline from ..pipeline.persist import persist_pipeline_result alias = repo_path.name + state = read_repo_state(repo_path) + base_ref = state.get("last_sync_commit") + + if ( + base_ref + and (repo_path / ".repowise" / "wiki.db").is_file() + and commit_exists(repo_path, str(base_ref)) + ): + try: + incremental_result = await _incremental_repo_update( + repo_path, + state=state, + base_ref=str(base_ref), + exclude_patterns=exclude_patterns, + ) + if incremental_result is not None: + return incremental_result + # None → the diff needs the full pipeline's prune pass. + except Exception: + _log.warning( + "Incremental update failed for %s — falling back to the full pipeline", + repo_path, + exc_info=True, + ) try: result = await run_pipeline( repo_path, commit_depth=commit_depth, exclude_patterns=exclude_patterns, + include_submodules=bool(state.get("include_submodules", False)), + include_nested_repos=bool(state.get("include_nested_repos", False)), generate_docs=False, progress=progress, ) diff --git a/tests/unit/workspace/test_incremental_update.py b/tests/unit/workspace/test_incremental_update.py new file mode 100644 index 00000000..12e3737c --- /dev/null +++ b/tests/unit/workspace/test_incremental_update.py @@ -0,0 +1,289 @@ +"""Workspace updates route already-indexed repos through the incremental path. + +``update_single_repo_index`` previously re-ran the full init pipeline for +every stale repo. Already-indexed repos (persisted ``last_sync_commit`` that +still resolves + existing ``wiki.db``) now take the incremental update path — +changed-files diff, partial analysis, upsert persistence — and inherit the +persisted state flags (``git_tier``, ``include_submodules``, +``include_nested_repos``). Never-indexed repos and incremental failures fall +back to the full pipeline. +""" + +from __future__ import annotations + +import asyncio +import json +import subprocess +from pathlib import Path + +import pytest + +from repowise.core.workspace.update import ( + commit_exists, + get_head_commit, + read_repo_state, + update_single_repo_index, +) + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + + +def _git(repo: Path, *args: str) -> str: + result = subprocess.run(["git", *args], cwd=str(repo), capture_output=True, text=True) + return result.stdout.strip() + + +def _make_git_repo(tmp_path: Path, name: str = "repo") -> Path: + repo = tmp_path / name + repo.mkdir(parents=True) + _git(repo, "init") + _git(repo, "config", "user.email", "test@test.com") + _git(repo, "config", "user.name", "Test") + (repo / "a.py").write_text("def alpha():\n return 1\n") + _git(repo, "add", ".") + _git(repo, "commit", "-m", "initial") + return repo + + +def _mark_indexed(repo: Path, commit: str, **extra_state) -> None: + """Simulate a prior index: state.json with last_sync_commit + wiki.db.""" + state_dir = repo / ".repowise" + state_dir.mkdir(parents=True, exist_ok=True) + (state_dir / "state.json").write_text( + json.dumps({"last_sync_commit": commit, **extra_state}), encoding="utf-8" + ) + (state_dir / "wiki.db").touch() + + +def _add_commit(repo: Path, filename: str = "b.py") -> str: + (repo / filename).write_text("def beta():\n return 2\n") + _git(repo, "add", ".") + _git(repo, "commit", "-m", f"add {filename}") + return _git(repo, "rev-parse", "HEAD") + + +@pytest.fixture +def forbid_full_pipeline(monkeypatch): + """Make the full pipeline unreachable so tests prove the incremental + path was taken.""" + import repowise.core.pipeline as pipeline_pkg + + async def _boom(*args, **kwargs): # pragma: no cover - failure path + raise AssertionError("full pipeline must not run for indexed repos") + + monkeypatch.setattr(pipeline_pkg, "run_pipeline", _boom) + + +@pytest.fixture +def stub_full_pipeline(monkeypatch): + """Replace the full pipeline + its persistence with recording stubs.""" + import repowise.core.pipeline as pipeline_pkg + import repowise.core.pipeline.persist as persist_mod + + calls: list[dict] = [] + + class _FakeResult: + repo_name = "stub" + file_count = 7 + symbol_count = 9 + + async def _fake_pipeline(repo_path, **kwargs): + calls.append({"repo_path": repo_path, **kwargs}) + return _FakeResult() + + async def _fake_persist(result, session, repo_id): + return None + + monkeypatch.setattr(pipeline_pkg, "run_pipeline", _fake_pipeline) + monkeypatch.setattr(persist_mod, "persist_pipeline_result", _fake_persist) + return calls + + +# --------------------------------------------------------------------------- +# Routing +# --------------------------------------------------------------------------- + + +def test_indexed_repo_takes_incremental_path(tmp_path, forbid_full_pipeline): + """An already-indexed repo with new commits updates incrementally — + the full pipeline is never invoked.""" + repo = _make_git_repo(tmp_path) + base = get_head_commit(repo) + _mark_indexed(repo, base) + _add_commit(repo, "b.py") + + result = asyncio.run(update_single_repo_index(repo)) + + assert result.error is None + assert result.updated is True + assert result.file_count >= 2 # a.py + b.py (+ any traversed metadata files) + assert result.symbol_count == 2 # alpha + beta + # The upsert path initialized the schema in the pre-existing wiki.db. + assert (repo / ".repowise" / "wiki.db").stat().st_size > 0 + + +def test_no_relevant_changes_still_reports_updated(tmp_path, forbid_full_pipeline): + """An empty commit produces no file diffs; the repo still reports + updated=True so the caller bumps last_sync_commit instead of + re-diffing forever.""" + repo = _make_git_repo(tmp_path) + base = get_head_commit(repo) + _mark_indexed(repo, base) + _git(repo, "commit", "--allow-empty", "-m", "empty") + + result = asyncio.run(update_single_repo_index(repo)) + + assert result.updated is True + assert result.file_count == 0 + + +def test_deleted_file_falls_back_to_full_pipeline(tmp_path, stub_full_pipeline): + """Incremental persistence is upsert-only — it can't prune rows for + removed paths. A diff containing deletions must run the full pipeline + (delete-then-insert) so stale graph/health rows are cleaned up.""" + repo = _make_git_repo(tmp_path) + _add_commit(repo, "b.py") + base = get_head_commit(repo) + _mark_indexed(repo, base) + (repo / "b.py").unlink() + _git(repo, "add", "-A") + _git(repo, "commit", "-m", "remove b.py") + + result = asyncio.run(update_single_repo_index(repo)) + + assert len(stub_full_pipeline) == 1 + assert result.updated is True + + +def test_renamed_file_falls_back_to_full_pipeline(tmp_path, stub_full_pipeline): + """Renames leave the old path behind in upsert-only persistence — + same prune requirement as deletions.""" + repo = _make_git_repo(tmp_path) + base = get_head_commit(repo) + _mark_indexed(repo, base) + _git(repo, "mv", "a.py", "renamed.py") + _git(repo, "commit", "-m", "rename a.py") + + result = asyncio.run(update_single_repo_index(repo)) + + assert len(stub_full_pipeline) == 1 + assert result.updated is True + + +def test_never_indexed_repo_runs_full_pipeline(tmp_path, stub_full_pipeline): + repo = _make_git_repo(tmp_path) + + result = asyncio.run(update_single_repo_index(repo)) + + assert len(stub_full_pipeline) == 1 + assert result.updated is True + assert result.file_count == 7 + + +def test_unresolvable_base_commit_falls_back_to_full_pipeline(tmp_path, stub_full_pipeline): + """A last_sync_commit that no longer resolves (rebase, gc) must not be + treated as 'no changes' — it falls back to the full pipeline.""" + repo = _make_git_repo(tmp_path) + _mark_indexed(repo, "deadbeef" * 5) + _add_commit(repo, "b.py") + + result = asyncio.run(update_single_repo_index(repo)) + + assert len(stub_full_pipeline) == 1 + assert result.updated is True + + +def test_incremental_failure_falls_back_to_full_pipeline(tmp_path, stub_full_pipeline, monkeypatch): + import repowise.core.pipeline.incremental as incremental_mod + + repo = _make_git_repo(tmp_path) + base = get_head_commit(repo) + _mark_indexed(repo, base) + _add_commit(repo, "b.py") + + async def _boom(*args, **kwargs): + raise RuntimeError("incremental exploded") + + monkeypatch.setattr(incremental_mod, "rebuild_graph_and_git", _boom) + + result = asyncio.run(update_single_repo_index(repo)) + + assert len(stub_full_pipeline) == 1 + assert result.updated is True + assert result.error is None + + +def test_incremental_threads_persisted_state_flags(tmp_path, forbid_full_pipeline, monkeypatch): + """git_tier / include_submodules / include_nested_repos from state.json + reach the incremental rebuild.""" + import repowise.core.pipeline.incremental as incremental_mod + + repo = _make_git_repo(tmp_path) + base = get_head_commit(repo) + _mark_indexed( + repo, + base, + git_tier="essential", + include_submodules=True, + include_nested_repos=True, + ) + _add_commit(repo, "b.py") + + captured: dict = {} + + async def _fake_rebuild(repo_path, file_diffs, cfg, exclude_patterns, **kwargs): + captured.update(kwargs) + return [], {}, None, None, 0, {} + + def _fake_analysis(*args, **kwargs): + return None, None + + async def _fake_persist(*args, **kwargs): + return None + + monkeypatch.setattr(incremental_mod, "rebuild_graph_and_git", _fake_rebuild) + monkeypatch.setattr(incremental_mod, "run_partial_analysis", _fake_analysis) + monkeypatch.setattr(incremental_mod, "persist_incremental_index", _fake_persist) + + result = asyncio.run(update_single_repo_index(repo)) + + assert result.updated is True + assert captured["git_tier"] == "essential" + assert captured["include_submodules"] is True + assert captured["include_nested_repos"] is True + + +# --------------------------------------------------------------------------- +# Helpers under test +# --------------------------------------------------------------------------- + + +def test_commit_exists(tmp_path): + repo = _make_git_repo(tmp_path) + head = get_head_commit(repo) + assert commit_exists(repo, head) is True + assert commit_exists(repo, "deadbeef" * 5) is False + + +def test_commit_exists_non_git_dir(tmp_path): + assert commit_exists(tmp_path, "deadbeef" * 5) is False + + +def test_read_repo_state(tmp_path): + repo = tmp_path / "r" + (repo / ".repowise").mkdir(parents=True) + (repo / ".repowise" / "state.json").write_text( + json.dumps({"last_sync_commit": "abc", "include_submodules": True}) + ) + state = read_repo_state(repo) + assert state["last_sync_commit"] == "abc" + assert state["include_submodules"] is True + + +def test_read_repo_state_missing_or_malformed(tmp_path): + assert read_repo_state(tmp_path) == {} + (tmp_path / ".repowise").mkdir() + (tmp_path / ".repowise" / "state.json").write_text("not json{") + assert read_repo_state(tmp_path) == {}