From dc6f81bbd133ce0a82c3f4157d04078b2026850e Mon Sep 17 00:00:00 2001 From: RaghavChamadiya Date: Fri, 5 Jun 2026 16:03:17 +0530 Subject: [PATCH 1/2] fix(update): honor include-submodules across incremental updates MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two related gaps following the pruned-resolver-scans work (#380): 1. `repowise update` ignored submodule/nested-repo flags: the update path constructed FileTraverser and GraphBuilder with defaults, so a repo indexed with `init --include-submodules` silently dropped its submodule files on every incremental update. The `include_submodules` flag is now persisted to state.json at init (same pattern as git_tier) and read back in update_cmd, threaded through _rebuild_graph_and_git / _build_repo_graph into both constructors (incremental rebuild and the config-triggered health re-score). `include_nested_repos` is read back from state.json the same way, but this patch adds no init-side writer for it, so it resolves to False unless state.json already carries the key. Missing keys keep legacy behavior (False). 2. `--include-submodules` was defeated for *initialized* submodules: `.gitmodules` was only parsed when the flag was off, so an initialized submodule (whose `.git` file makes it look like a nested repo) fell through to the nested-git boundary skip and was dropped anyway. The traverser now parses `.gitmodules` unconditionally and exempts opted-in submodules from the nested-git skip — while still applying gitignore/exclude checks and still skipping non-submodule nested repos. _detect_monorepo package scans now mirror GraphBuilder._prune_nested_git semantics. 
--- .../repowise/cli/commands/init_cmd/command.py | 5 ++ .../cli/commands/init_cmd/persistence.py | 4 + .../src/repowise/cli/commands/update_cmd.py | 37 ++++++++- .../src/repowise/core/ingestion/traverser.py | 25 ++++-- .../cli/test_update_include_submodules.py | 76 +++++++++++++++++++ tests/unit/ingestion/test_traverser.py | 54 +++++++++++++ 6 files changed, 189 insertions(+), 12 deletions(-) create mode 100644 tests/unit/cli/test_update_include_submodules.py diff --git a/packages/cli/src/repowise/cli/commands/init_cmd/command.py b/packages/cli/src/repowise/cli/commands/init_cmd/command.py index c1e90281..33d7a29b 100644 --- a/packages/cli/src/repowise/cli/commands/init_cmd/command.py +++ b/packages/cli/src/repowise/cli/commands/init_cmd/command.py @@ -806,6 +806,10 @@ async def _index_with_resume() -> Any: # same tier instead of silently upgrading ESSENTIAL → FULL (issue #341). base_state["run_mode"] = run_mode base_state["git_tier"] = git_tier_for_run_mode(run_mode) + # Record whether submodules were indexed so `repowise update` rebuilds + # the graph with the same boundary semantics (same pattern as git_tier: + # missing → False keeps legacy behavior for old state files). 
+ base_state["include_submodules"] = include_submodules if phase_timings: base_state["phase_timings"] = phase_timings kg = getattr(result, "knowledge_graph_result", None) @@ -835,6 +839,7 @@ async def _index_with_resume() -> Any: commit_limit=commit_limit, resolved_commit_limit=resolved_commit_limit, resolved_reasoning=resolved_reasoning, + include_submodules=include_submodules, ) # ---- Completion panel ---- diff --git a/packages/cli/src/repowise/cli/commands/init_cmd/persistence.py b/packages/cli/src/repowise/cli/commands/init_cmd/persistence.py index 9862be6f..8d9f4ec1 100644 --- a/packages/cli/src/repowise/cli/commands/init_cmd/persistence.py +++ b/packages/cli/src/repowise/cli/commands/init_cmd/persistence.py @@ -177,6 +177,7 @@ def save_full_state_and_config( commit_limit: int | None, resolved_commit_limit: int, resolved_reasoning: str, + include_submodules: bool = False, ) -> None: """Persist state.json + config for a completed full-mode (docs) init run.""" @@ -216,6 +217,9 @@ async def _count_db_pages() -> int: # Full-mode docs runs always index the FULL git tier. state["run_mode"] = "standard" state["git_tier"] = "full" + # Same pattern as git_tier: `repowise update` reads this back so its + # graph rebuild keeps the init run's submodule boundary semantics. 
+ state["include_submodules"] = include_submodules total_tokens = sum(p.total_tokens for p in (result.generated_pages or [])) state["total_tokens"] = total_tokens if phase_timings: diff --git a/packages/cli/src/repowise/cli/commands/update_cmd.py b/packages/cli/src/repowise/cli/commands/update_cmd.py index 1dd532e8..2e5423a2 100644 --- a/packages/cli/src/repowise/cli/commands/update_cmd.py +++ b/packages/cli/src/repowise/cli/commands/update_cmd.py @@ -330,6 +330,8 @@ def _build_repo_graph( exclude_patterns: list[str], *, collect_sources: bool = False, + include_submodules: bool = False, + include_nested_repos: bool = False, ) -> tuple[list, dict[str, bytes], Any, Any, int]: """Traverse + parse the repo and build the graph (+ framework-aware edges). @@ -348,7 +350,12 @@ def _build_repo_graph( from repowise.core.ingestion import ASTParser, FileTraverser, GraphBuilder, compute_content_hash - traverser = FileTraverser(repo_path, extra_exclude_patterns=exclude_patterns or None) + traverser = FileTraverser( + repo_path, + extra_exclude_patterns=exclude_patterns or None, + include_submodules=include_submodules, + include_nested_repos=include_nested_repos, + ) file_infos = list(traverser.traverse()) repo_structure = traverser.get_repo_structure() @@ -371,6 +378,8 @@ def _build_repo_graph( repo_path, exclude_patterns=exclude_patterns, centrality_cache_dir=PathlibPath(repo_path) / ".repowise", + include_submodules=include_submodules, + include_nested_repos=include_nested_repos, ) skipped = 0 @@ -419,6 +428,8 @@ def _rebuild_graph_and_git( cfg: dict, exclude_patterns: list[str], git_tier: str | None = None, + include_submodules: bool = False, + include_nested_repos: bool = False, ) -> tuple[list, dict[str, bytes], Any, Any, int, dict[str, dict]]: """Re-traverse + parse the repo, rebuild the graph (+ framework edges), and re-index git metadata for the changed files. @@ -428,12 +439,21 @@ def _rebuild_graph_and_git( its index never had. 
Unknown/missing values fall back to FULL, matching the historical behavior for legacy state files. + ``include_submodules`` / ``include_nested_repos`` are likewise read from + state.json: a repo indexed with ``init --include-submodules`` must not + silently drop its submodule files on every incremental update. Missing + keys fall back to False (legacy behavior). + Returns ``(parsed_files, source_map, graph_builder, repo_structure, file_count, git_meta_map)``. """ # Full re-ingest for graph (needed for cascade analysis) parsed_files, source_map, graph_builder, repo_structure, file_count = _build_repo_graph( - repo_path, exclude_patterns, collect_sources=True + repo_path, + exclude_patterns, + collect_sources=True, + include_submodules=include_submodules, + include_nested_repos=include_nested_repos, ) # Re-index git metadata for changed files @@ -720,7 +740,10 @@ def _run_full_health_rescore( # Share the rebuild path with the incremental update so both produce the # same graph (same parser, same framework-aware synthetic edges). 
parsed_files, _source_map, graph_builder, _repo_structure, _file_count = _build_repo_graph( - repo_path, exclude_patterns + repo_path, + exclude_patterns, + include_submodules=bool(state.get("include_submodules", False)), + include_nested_repos=bool(state.get("include_nested_repos", False)), ) # Fan-out metric precompute (mirrors _rebuild_graph_and_git) — the @@ -1145,7 +1168,13 @@ def update_command( parsed_files, source_map, graph_builder, repo_structure, file_count, git_meta_map = ( _rebuild_graph_and_git( - repo_path, file_diffs, cfg, exclude_patterns, git_tier=state.get("git_tier") + repo_path, + file_diffs, + cfg, + exclude_patterns, + git_tier=state.get("git_tier"), + include_submodules=bool(state.get("include_submodules", False)), + include_nested_repos=bool(state.get("include_nested_repos", False)), ) ) diff --git a/packages/core/src/repowise/core/ingestion/traverser.py b/packages/core/src/repowise/core/ingestion/traverser.py index b16de7a4..e28de904 100644 --- a/packages/core/src/repowise/core/ingestion/traverser.py +++ b/packages/core/src/repowise/core/ingestion/traverser.py @@ -227,9 +227,11 @@ def __init__( self._dir_ignore_cache: dict[str, pathspec.PathSpec] = { str(self.repo_root): self._extra_ignore, } - self._submodule_paths: frozenset[str] = frozenset() - if not include_submodules: - self._submodule_paths = _parse_gitmodules(self.repo_root) + # Parse .gitmodules unconditionally: when submodules are *included* + # the set is what exempts initialized submodules (whose `.git` file + # makes them look like nested repos) from the nested-git skip below. 
+ self._submodule_paths: frozenset[str] = _parse_gitmodules(self.repo_root) + self._include_submodules = include_submodules self._include_nested_repos = include_nested_repos self.stats = TraversalStats() self._count_lock = threading.Lock() @@ -238,7 +240,7 @@ def __init__( repo_root=str(self.repo_root), max_file_size_kb=max_file_size_kb, extra_exclude_patterns=len(patterns), - submodules_skipped=len(self._submodule_paths), + submodules_skipped=0 if include_submodules else len(self._submodule_paths), include_nested_repos=include_nested_repos, ) @@ -352,15 +354,19 @@ def _should_skip_dir( if dirname in _BLOCKED_DIRS: return True rel_str = rel_path.as_posix() - if rel_str in self._submodule_paths: + is_submodule = rel_str in self._submodule_paths + if is_submodule and not self._include_submodules: self.stats.skipped_submodule += 1 return True # Nested git repos are independent units — stop at the boundary # unless the caller explicitly opted in. Mirrors the workspace # scanner, which already refuses to descend into nested `.git` # markers. Without this, a parent repo that physically contains - # sibling repos gets walked end-to-end. - if not self._include_nested_repos and _is_nested_git_repo(abs_path): + # sibling repos gets walked end-to-end. An *initialized* submodule + # carries a `.git` file and would match here too — submodules that + # were explicitly opted in above are exempt (they still fall through + # to the gitignore/exclude checks below). 
+ if not self._include_nested_repos and not is_submodule and _is_nested_git_repo(abs_path): self.stats.skipped_nested_repo += 1 log.debug("Skipping nested git repo", path=rel_str) return True @@ -479,7 +485,10 @@ def _detect_monorepo(self) -> tuple[list[PackageInfo], bool]: """ packages: list[PackageInfo] = [] seen_paths: set[str] = set() - prune_nested = not self._include_nested_repos + # Mirrors GraphBuilder._prune_nested_git: when submodules or nested + # repos are indexed, package-language/entry-point scans must not + # prune them (both are `.git`-bearing subdirs to fs_walk). + prune_nested = not (self._include_submodules or self._include_nested_repos) for depth in (1, 2): pattern = "/".join(["*"] * depth) + "/*" diff --git a/tests/unit/cli/test_update_include_submodules.py b/tests/unit/cli/test_update_include_submodules.py new file mode 100644 index 00000000..b5e900d6 --- /dev/null +++ b/tests/unit/cli/test_update_include_submodules.py @@ -0,0 +1,76 @@ +"""`repowise update` must honor the persisted submodule flags. + +A repo indexed with ``init --include-submodules`` records +``include_submodules: true`` in state.json; its incremental updates must +rebuild the graph with the same boundary semantics. ``_build_repo_graph`` +previously constructed FileTraverser and GraphBuilder without the flags — +silently dropping submodule files (and their manifests) on every update. +Same class of bug as the ``git_tier`` gap (see test_update_git_tier.py). 
+""" + +from __future__ import annotations + +from repowise.cli.commands.update_cmd import _build_repo_graph, _rebuild_graph_and_git + + +def _init_repo_with_initialized_submodule(tmp_path): + """A parent repo containing an *initialized* submodule (`.git` file).""" + import git as gitpython + + repo = gitpython.Repo.init(tmp_path) + with repo.config_writer() as cw: + cw.set_value("user", "name", "Alice") + cw.set_value("user", "email", "alice@example.com") + (tmp_path / "a.py").write_text("x = 1\n") + (tmp_path / ".gitmodules").write_text( + '[submodule "libs/sub"]\n' + " path = libs/sub\n" + " url = https://github.com/example/sub.git\n" + ) + sub = tmp_path / "libs" / "sub" + sub.mkdir(parents=True) + (sub / ".git").write_text("gitdir: ../../.git/modules/libs/sub\n") + (sub / "mod.py").write_text("y = 2\n") + repo.index.add(["a.py", ".gitmodules"]) + repo.index.commit("feat: add module a + submodule") + repo.close() + + +def _graph_paths(tmp_path, **kwargs) -> set[str]: + parsed_files, _source_map, _builder, _structure, _count = _build_repo_graph( + tmp_path, [], **kwargs + ) + return {p.file_info.path for p in parsed_files} + + +def test_update_graph_keeps_submodule_files_when_flag_set(tmp_path): + """Equivalence with init: a submodule-indexed repo must keep its + submodule files in the update-built graph.""" + _init_repo_with_initialized_submodule(tmp_path) + + paths = _graph_paths(tmp_path, include_submodules=True) + + assert "a.py" in paths + assert "libs/sub/mod.py" in paths + + +def test_update_graph_drops_submodule_files_by_default(tmp_path): + """Legacy behavior: missing state key means submodules stay excluded.""" + _init_repo_with_initialized_submodule(tmp_path) + + paths = _graph_paths(tmp_path) + + assert "a.py" in paths + assert "libs/sub/mod.py" not in paths + + +def test_rebuild_threads_include_submodules(tmp_path): + """The full incremental rebuild (graph + git re-index) honors the flag.""" + _init_repo_with_initialized_submodule(tmp_path) + + 
parsed_files, _sm, _gb, _rs, _fc, _gm = _rebuild_graph_and_git( + tmp_path, [], {}, [], include_submodules=True + ) + + paths = {p.file_info.path for p in parsed_files} + assert "libs/sub/mod.py" in paths diff --git a/tests/unit/ingestion/test_traverser.py b/tests/unit/ingestion/test_traverser.py index 83276dfa..0e83d1be 100644 --- a/tests/unit/ingestion/test_traverser.py +++ b/tests/unit/ingestion/test_traverser.py @@ -473,6 +473,60 @@ def test_include_submodules_flag(self, tmp_path: Path) -> None: paths = [f.path for f in traverser.traverse()] assert any("libs/foo" in p for p in paths) + def test_include_submodules_with_initialized_submodule(self, tmp_path: Path) -> None: + """An *initialized* submodule carries a `.git` file — the nested-git + boundary check must not override the explicit opt-in. + + Regression: ``include_submodules=True`` previously skipped parsing + ``.gitmodules`` entirely, so initialized submodules fell through to + the nested-git skip and were silently dropped anyway. 
+ """ + (tmp_path / ".gitmodules").write_text( + '[submodule "libs/foo"]\n' + " path = libs/foo\n" + " url = https://github.com/example/foo.git\n" + ) + (tmp_path / "libs" / "foo").mkdir(parents=True) + (tmp_path / "libs" / "foo" / ".git").write_text("gitdir: ../../.git/modules/libs/foo\n") + (tmp_path / "libs" / "foo" / "main.py").write_text("pass") + traverser = FileTraverser(tmp_path, include_submodules=True) + paths = [f.path for f in traverser.traverse()] + assert any("libs/foo/main.py" in p for p in paths) + assert traverser.stats.skipped_nested_repo == 0 + + def test_initialized_submodule_skipped_by_default(self, tmp_path: Path) -> None: + (tmp_path / ".gitmodules").write_text( + '[submodule "libs/foo"]\n' + " path = libs/foo\n" + " url = https://github.com/example/foo.git\n" + ) + (tmp_path / "libs" / "foo").mkdir(parents=True) + (tmp_path / "libs" / "foo" / ".git").write_text("gitdir: ../../.git/modules/libs/foo\n") + (tmp_path / "libs" / "foo" / "main.py").write_text("pass") + traverser = FileTraverser(tmp_path) + paths = [f.path for f in traverser.traverse()] + assert not any("libs/foo" in p for p in paths) + assert traverser.stats.skipped_submodule >= 1 + + def test_include_submodules_keeps_other_nested_repos_skipped(self, tmp_path: Path) -> None: + """The submodule opt-in must not widen to arbitrary nested repos.""" + (tmp_path / ".gitmodules").write_text( + '[submodule "libs/foo"]\n' + " path = libs/foo\n" + " url = https://github.com/example/foo.git\n" + ) + (tmp_path / "libs" / "foo").mkdir(parents=True) + (tmp_path / "libs" / "foo" / ".git").write_text("gitdir: ../../.git/modules/libs/foo\n") + (tmp_path / "libs" / "foo" / "main.py").write_text("pass") + (tmp_path / "sibling_repo").mkdir() + (tmp_path / "sibling_repo" / ".git").mkdir() + (tmp_path / "sibling_repo" / "inner.py").write_text("pass") + traverser = FileTraverser(tmp_path, include_submodules=True) + paths = [f.path for f in traverser.traverse()] + assert any("libs/foo/main.py" in p for p 
in paths) + assert not any("sibling_repo" in p for p in paths) + assert traverser.stats.skipped_nested_repo >= 1 + def test_no_gitmodules_file(self, tmp_path: Path) -> None: (tmp_path / "app.py").write_text("pass") traverser = FileTraverser(tmp_path) From 1bc8cf5d7b0259a206678ec6361a3cce0e4d5887 Mon Sep 17 00:00:00 2001 From: RaghavChamadiya Date: Fri, 5 Jun 2026 16:03:17 +0530 Subject: [PATCH 2/2] fix(upgrade): honor persisted submodule flags in the docs re-parse MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit upgrade_to_full re-parses the repo for doc generation via _reparse, which constructed FileTraverser without the submodule flags — a fast index built with --include-submodules would drop submodule files from the ASTs/source fed to generation. Read the persisted state.json flags there too (missing -> False). --- .../cli/src/repowise/cli/commands/upgrade_flow.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/packages/cli/src/repowise/cli/commands/upgrade_flow.py b/packages/cli/src/repowise/cli/commands/upgrade_flow.py index bf5c7c88..5ed3453c 100644 --- a/packages/cli/src/repowise/cli/commands/upgrade_flow.py +++ b/packages/cli/src/repowise/cli/commands/upgrade_flow.py @@ -52,7 +52,16 @@ def _reparse(repo_path: Path, exclude_patterns: list[str]) -> tuple[list[Any], d """ from repowise.core.ingestion import ASTParser, FileTraverser - traverser = FileTraverser(repo_path, extra_exclude_patterns=exclude_patterns or None) + # Honor the persisted submodule semantics of the original index — a + # fast index built with --include-submodules must not drop submodule + # files from the docs re-parse (missing key → False, legacy behavior). 
+ state = load_state(repo_path) + traverser = FileTraverser( + repo_path, + extra_exclude_patterns=exclude_patterns or None, + include_submodules=bool(state.get("include_submodules", False)), + include_nested_repos=bool(state.get("include_nested_repos", False)), + ) file_infos = list(traverser.traverse()) repo_structure = traverser.get_repo_structure()