Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions packages/cli/src/repowise/cli/commands/init_cmd/command.py
Original file line number Diff line number Diff line change
Expand Up @@ -806,6 +806,10 @@ async def _index_with_resume() -> Any:
# same tier instead of silently upgrading ESSENTIAL → FULL (issue #341).
base_state["run_mode"] = run_mode
base_state["git_tier"] = git_tier_for_run_mode(run_mode)
# Record whether submodules were indexed so `repowise update` rebuilds
# the graph with the same boundary semantics (same pattern as git_tier:
# missing → False keeps legacy behavior for old state files).
base_state["include_submodules"] = include_submodules
if phase_timings:
base_state["phase_timings"] = phase_timings
kg = getattr(result, "knowledge_graph_result", None)
Expand Down Expand Up @@ -835,6 +839,7 @@ async def _index_with_resume() -> Any:
commit_limit=commit_limit,
resolved_commit_limit=resolved_commit_limit,
resolved_reasoning=resolved_reasoning,
include_submodules=include_submodules,
)

# ---- Completion panel ----
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -177,6 +177,7 @@ def save_full_state_and_config(
commit_limit: int | None,
resolved_commit_limit: int,
resolved_reasoning: str,
include_submodules: bool = False,
) -> None:
"""Persist state.json + config for a completed full-mode (docs) init run."""

Expand Down Expand Up @@ -216,6 +217,9 @@ async def _count_db_pages() -> int:
# Full-mode docs runs always index the FULL git tier.
state["run_mode"] = "standard"
state["git_tier"] = "full"
# Same pattern as git_tier: `repowise update` reads this back so its
# graph rebuild keeps the init run's submodule boundary semantics.
state["include_submodules"] = include_submodules
total_tokens = sum(p.total_tokens for p in (result.generated_pages or []))
state["total_tokens"] = total_tokens
if phase_timings:
Expand Down
37 changes: 33 additions & 4 deletions packages/cli/src/repowise/cli/commands/update_cmd.py
Original file line number Diff line number Diff line change
Expand Up @@ -330,6 +330,8 @@ def _build_repo_graph(
exclude_patterns: list[str],
*,
collect_sources: bool = False,
include_submodules: bool = False,
include_nested_repos: bool = False,
) -> tuple[list, dict[str, bytes], Any, Any, int]:
"""Traverse + parse the repo and build the graph (+ framework-aware edges).

Expand All @@ -348,7 +350,12 @@ def _build_repo_graph(

from repowise.core.ingestion import ASTParser, FileTraverser, GraphBuilder, compute_content_hash

traverser = FileTraverser(repo_path, extra_exclude_patterns=exclude_patterns or None)
traverser = FileTraverser(
repo_path,
extra_exclude_patterns=exclude_patterns or None,
include_submodules=include_submodules,
include_nested_repos=include_nested_repos,
)
file_infos = list(traverser.traverse())
repo_structure = traverser.get_repo_structure()

Expand All @@ -371,6 +378,8 @@ def _build_repo_graph(
repo_path,
exclude_patterns=exclude_patterns,
centrality_cache_dir=PathlibPath(repo_path) / ".repowise",
include_submodules=include_submodules,
include_nested_repos=include_nested_repos,
)

skipped = 0
Expand Down Expand Up @@ -419,6 +428,8 @@ def _rebuild_graph_and_git(
cfg: dict,
exclude_patterns: list[str],
git_tier: str | None = None,
include_submodules: bool = False,
include_nested_repos: bool = False,
) -> tuple[list, dict[str, bytes], Any, Any, int, dict[str, dict]]:
"""Re-traverse + parse the repo, rebuild the graph (+ framework edges), and
re-index git metadata for the changed files.
Expand All @@ -428,12 +439,21 @@ def _rebuild_graph_and_git(
its index never had. Unknown/missing values fall back to FULL, matching
the historical behavior for legacy state files.

``include_submodules`` / ``include_nested_repos`` are likewise read from
state.json: a repo indexed with ``init --include-submodules`` must not
silently drop its submodule files on every incremental update. Missing
keys fall back to False (legacy behavior).

Returns ``(parsed_files, source_map, graph_builder, repo_structure,
file_count, git_meta_map)``.
"""
# Full re-ingest for graph (needed for cascade analysis)
parsed_files, source_map, graph_builder, repo_structure, file_count = _build_repo_graph(
repo_path, exclude_patterns, collect_sources=True
repo_path,
exclude_patterns,
collect_sources=True,
include_submodules=include_submodules,
include_nested_repos=include_nested_repos,
)

# Re-index git metadata for changed files
Expand Down Expand Up @@ -720,7 +740,10 @@ def _run_full_health_rescore(
# Share the rebuild path with the incremental update so both produce the
# same graph (same parser, same framework-aware synthetic edges).
parsed_files, _source_map, graph_builder, _repo_structure, _file_count = _build_repo_graph(
repo_path, exclude_patterns
repo_path,
exclude_patterns,
include_submodules=bool(state.get("include_submodules", False)),
include_nested_repos=bool(state.get("include_nested_repos", False)),
)

# Fan-out metric precompute (mirrors _rebuild_graph_and_git) — the
Expand Down Expand Up @@ -1145,7 +1168,13 @@ def update_command(

parsed_files, source_map, graph_builder, repo_structure, file_count, git_meta_map = (
_rebuild_graph_and_git(
repo_path, file_diffs, cfg, exclude_patterns, git_tier=state.get("git_tier")
repo_path,
file_diffs,
cfg,
exclude_patterns,
git_tier=state.get("git_tier"),
include_submodules=bool(state.get("include_submodules", False)),
include_nested_repos=bool(state.get("include_nested_repos", False)),
)
)

Expand Down
11 changes: 10 additions & 1 deletion packages/cli/src/repowise/cli/commands/upgrade_flow.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,16 @@ def _reparse(repo_path: Path, exclude_patterns: list[str]) -> tuple[list[Any], d
"""
from repowise.core.ingestion import ASTParser, FileTraverser

traverser = FileTraverser(repo_path, extra_exclude_patterns=exclude_patterns or None)
# Honor the persisted submodule semantics of the original index — a
# fast index built with --include-submodules must not drop submodule
# files from the docs re-parse (missing key → False, legacy behavior).
state = load_state(repo_path)
traverser = FileTraverser(
repo_path,
extra_exclude_patterns=exclude_patterns or None,
include_submodules=bool(state.get("include_submodules", False)),
include_nested_repos=bool(state.get("include_nested_repos", False)),
)
file_infos = list(traverser.traverse())
repo_structure = traverser.get_repo_structure()

Expand Down
25 changes: 17 additions & 8 deletions packages/core/src/repowise/core/ingestion/traverser.py
Original file line number Diff line number Diff line change
Expand Up @@ -227,9 +227,11 @@ def __init__(
self._dir_ignore_cache: dict[str, pathspec.PathSpec] = {
str(self.repo_root): self._extra_ignore,
}
self._submodule_paths: frozenset[str] = frozenset()
if not include_submodules:
self._submodule_paths = _parse_gitmodules(self.repo_root)
# Parse .gitmodules unconditionally: when submodules are *included*
# the set is what exempts initialized submodules (whose `.git` file
# makes them look like nested repos) from the nested-git skip below.
self._submodule_paths: frozenset[str] = _parse_gitmodules(self.repo_root)
self._include_submodules = include_submodules
self._include_nested_repos = include_nested_repos
self.stats = TraversalStats()
self._count_lock = threading.Lock()
Expand All @@ -238,7 +240,7 @@ def __init__(
repo_root=str(self.repo_root),
max_file_size_kb=max_file_size_kb,
extra_exclude_patterns=len(patterns),
submodules_skipped=len(self._submodule_paths),
submodules_skipped=0 if include_submodules else len(self._submodule_paths),
include_nested_repos=include_nested_repos,
)

Expand Down Expand Up @@ -352,15 +354,19 @@ def _should_skip_dir(
if dirname in _BLOCKED_DIRS:
return True
rel_str = rel_path.as_posix()
if rel_str in self._submodule_paths:
is_submodule = rel_str in self._submodule_paths
if is_submodule and not self._include_submodules:
self.stats.skipped_submodule += 1
return True
# Nested git repos are independent units — stop at the boundary
# unless the caller explicitly opted in. Mirrors the workspace
# scanner, which already refuses to descend into nested `.git`
# markers. Without this, a parent repo that physically contains
# sibling repos gets walked end-to-end.
if not self._include_nested_repos and _is_nested_git_repo(abs_path):
# sibling repos gets walked end-to-end. An *initialized* submodule
# carries a `.git` file and would match here too — submodules that
# were explicitly opted in above are exempt (they still fall through
# to the gitignore/exclude checks below).
if not self._include_nested_repos and not is_submodule and _is_nested_git_repo(abs_path):
self.stats.skipped_nested_repo += 1
log.debug("Skipping nested git repo", path=rel_str)
return True
Expand Down Expand Up @@ -479,7 +485,10 @@ def _detect_monorepo(self) -> tuple[list[PackageInfo], bool]:
"""
packages: list[PackageInfo] = []
seen_paths: set[str] = set()
prune_nested = not self._include_nested_repos
# Mirrors GraphBuilder._prune_nested_git: when submodules or nested
# repos are indexed, package-language/entry-point scans must not
# prune them (both are `.git`-bearing subdirs to fs_walk).
prune_nested = not (self._include_submodules or self._include_nested_repos)

for depth in (1, 2):
pattern = "/".join(["*"] * depth) + "/*"
Expand Down
76 changes: 76 additions & 0 deletions tests/unit/cli/test_update_include_submodules.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
"""`repowise update` must honor the persisted submodule flags.

A repo indexed with ``init --include-submodules`` records
``include_submodules: true`` in state.json; its incremental updates must
rebuild the graph with the same boundary semantics. ``_build_repo_graph``
previously constructed FileTraverser and GraphBuilder without the flags —
silently dropping submodule files (and their manifests) on every update.
Same class of bug as the ``git_tier`` gap (see test_update_git_tier.py).
"""

from __future__ import annotations

from repowise.cli.commands.update_cmd import _build_repo_graph, _rebuild_graph_and_git


def _init_repo_with_initialized_submodule(tmp_path):
    """Create a parent repo at *tmp_path* containing an *initialized* submodule.

    ``libs/sub`` is declared in ``.gitmodules`` and carries a ``.git`` *file*
    (a ``gitdir:`` pointer), which is how this fixture models an initialized
    submodule. Only ``a.py`` and ``.gitmodules`` are committed to the parent
    repo; ``libs/sub/mod.py`` exists on disk only.
    """
    import git as gitpython

    repo = gitpython.Repo.init(tmp_path)
    try:
        with repo.config_writer() as cw:
            cw.set_value("user", "name", "Alice")
            cw.set_value("user", "email", "alice@example.com")
        (tmp_path / "a.py").write_text("x = 1\n")
        (tmp_path / ".gitmodules").write_text(
            '[submodule "libs/sub"]\n'
            " path = libs/sub\n"
            " url = https://github.com/example/sub.git\n"
        )
        sub = tmp_path / "libs" / "sub"
        sub.mkdir(parents=True)
        # A `.git` *file* (not a directory) marks the directory as an
        # initialized submodule checkout.
        (sub / ".git").write_text("gitdir: ../../.git/modules/libs/sub\n")
        (sub / "mod.py").write_text("y = 2\n")
        repo.index.add(["a.py", ".gitmodules"])
        repo.index.commit("feat: add module a + submodule")
    finally:
        # Always release GitPython's handles, even when fixture setup fails
        # part-way; otherwise the repo object is never closed.
        repo.close()


def _graph_paths(tmp_path, **kwargs) -> set[str]:
    """Return the set of file paths parsed by ``_build_repo_graph``.

    The graph is built for *tmp_path* with no exclude patterns; any extra
    keyword arguments are forwarded to ``_build_repo_graph`` unchanged.
    """
    graph_result = _build_repo_graph(tmp_path, [], **kwargs)
    # Only the parsed files matter here; the remaining four items are unused.
    parsed_files, _sources, _graph_builder, _repo_structure, _file_count = graph_result

    collected: set[str] = set()
    for parsed in parsed_files:
        collected.add(parsed.file_info.path)
    return collected


def test_update_graph_keeps_submodule_files_when_flag_set(tmp_path):
    """With ``include_submodules=True`` the update-built graph contains both
    the parent repo's file and the submodule's file (equivalence with init)."""
    _init_repo_with_initialized_submodule(tmp_path)

    indexed = _graph_paths(tmp_path, include_submodules=True)

    # Parent file first, then the file living inside the submodule.
    for expected in ("a.py", "libs/sub/mod.py"):
        assert expected in indexed


def test_update_graph_drops_submodule_files_by_default(tmp_path):
    """Without the flag (the legacy default) the submodule's files are left
    out of the graph while the parent repo's files are still present."""
    _init_repo_with_initialized_submodule(tmp_path)

    indexed = _graph_paths(tmp_path)

    assert "a.py" in indexed
    assert not ("libs/sub/mod.py" in indexed)


def test_rebuild_threads_include_submodules(tmp_path):
    """``_rebuild_graph_and_git`` passes ``include_submodules`` through, so
    the submodule file shows up in the parsed files it returns."""
    _init_repo_with_initialized_submodule(tmp_path)

    rebuilt = _rebuild_graph_and_git(tmp_path, [], {}, [], include_submodules=True)
    # Six-item result; only the parsed files are inspected.
    parsed_files, _sources, _builder, _structure, _count, _git_meta = rebuilt

    seen = set()
    for parsed in parsed_files:
        seen.add(parsed.file_info.path)
    assert "libs/sub/mod.py" in seen
54 changes: 54 additions & 0 deletions tests/unit/ingestion/test_traverser.py
Original file line number Diff line number Diff line change
Expand Up @@ -473,6 +473,60 @@ def test_include_submodules_flag(self, tmp_path: Path) -> None:
paths = [f.path for f in traverser.traverse()]
assert any("libs/foo" in p for p in paths)

def test_include_submodules_with_initialized_submodule(self, tmp_path: Path) -> None:
    """Opting in to submodules must win over the nested-git boundary check.

    An *initialized* submodule carries a `.git` file, so it also looks like
    a nested repo. Regression: ``include_submodules=True`` previously skipped
    parsing ``.gitmodules`` entirely, so such submodules fell through to the
    nested-git skip and were silently dropped anyway.
    """
    submodule_dir = tmp_path / "libs" / "foo"
    submodule_dir.mkdir(parents=True)
    (submodule_dir / ".git").write_text("gitdir: ../../.git/modules/libs/foo\n")
    (submodule_dir / "main.py").write_text("pass")
    (tmp_path / ".gitmodules").write_text(
        '[submodule "libs/foo"]\n'
        " path = libs/foo\n"
        " url = https://github.com/example/foo.git\n"
    )

    traverser = FileTraverser(tmp_path, include_submodules=True)
    found = [info.path for info in traverser.traverse()]

    assert any("libs/foo/main.py" in path for path in found)
    assert traverser.stats.skipped_nested_repo == 0

def test_initialized_submodule_skipped_by_default(self, tmp_path: Path) -> None:
    """With default flags an initialized submodule is skipped and counted
    under ``stats.skipped_submodule``."""
    submodule_dir = tmp_path / "libs" / "foo"
    submodule_dir.mkdir(parents=True)
    (submodule_dir / ".git").write_text("gitdir: ../../.git/modules/libs/foo\n")
    (submodule_dir / "main.py").write_text("pass")
    (tmp_path / ".gitmodules").write_text(
        '[submodule "libs/foo"]\n'
        " path = libs/foo\n"
        " url = https://github.com/example/foo.git\n"
    )

    traverser = FileTraverser(tmp_path)
    found = [info.path for info in traverser.traverse()]

    assert not any("libs/foo" in path for path in found)
    assert traverser.stats.skipped_submodule >= 1

def test_include_submodules_keeps_other_nested_repos_skipped(self, tmp_path: Path) -> None:
    """The submodule opt-in must not widen to arbitrary nested repos."""
    # Declared, initialized submodule: should be traversed.
    submodule_dir = tmp_path / "libs" / "foo"
    submodule_dir.mkdir(parents=True)
    (submodule_dir / ".git").write_text("gitdir: ../../.git/modules/libs/foo\n")
    (submodule_dir / "main.py").write_text("pass")
    (tmp_path / ".gitmodules").write_text(
        '[submodule "libs/foo"]\n'
        " path = libs/foo\n"
        " url = https://github.com/example/foo.git\n"
    )

    # Undeclared repo with its own `.git` directory: should stay skipped.
    sibling_dir = tmp_path / "sibling_repo"
    sibling_dir.mkdir()
    (sibling_dir / ".git").mkdir()
    (sibling_dir / "inner.py").write_text("pass")

    traverser = FileTraverser(tmp_path, include_submodules=True)
    found = [info.path for info in traverser.traverse()]

    assert any("libs/foo/main.py" in path for path in found)
    assert not any("sibling_repo" in path for path in found)
    assert traverser.stats.skipped_nested_repo >= 1

def test_no_gitmodules_file(self, tmp_path: Path) -> None:
(tmp_path / "app.py").write_text("pass")
traverser = FileTraverser(tmp_path)
Expand Down
Loading