diff --git a/code_review_graph/incremental.py b/code_review_graph/incremental.py index c2f8d6c..5394b92 100644 --- a/code_review_graph/incremental.py +++ b/code_review_graph/incremental.py @@ -131,6 +131,13 @@ def _is_binary(path: Path) -> bool: _GIT_TIMEOUT = int(os.environ.get("CRG_GIT_TIMEOUT", "30")) # seconds, configurable +# When True, `git ls-files --recurse-submodules` is used so that files +# inside git submodules are included in the graph. Opt-in via env var; +# can also be overridden per-call through function parameters. +_RECURSE_SUBMODULES = os.environ.get( + "CRG_RECURSE_SUBMODULES", "" +).lower() in ("1", "true", "yes") + _SAFE_GIT_REF = re.compile(r"^[A-Za-z0-9_.~^/@{}\-]+$") @@ -185,11 +192,29 @@ def get_staged_and_unstaged(repo_root: Path) -> list[str]: return [] -def get_all_tracked_files(repo_root: Path) -> list[str]: - """Get all files tracked by git.""" +def get_all_tracked_files( + repo_root: Path, + recurse_submodules: bool | None = None, +) -> list[str]: + """Get all files tracked by git. + + Args: + repo_root: Repository root directory. + recurse_submodules: If True, pass ``--recurse-submodules`` to + ``git ls-files`` so that files inside git submodules are + included. When *None* (default), falls back to the + ``CRG_RECURSE_SUBMODULES`` environment variable. + """ + if recurse_submodules is None: + recurse_submodules = _RECURSE_SUBMODULES + + cmd = ["git", "ls-files"] + if recurse_submodules: + cmd.append("--recurse-submodules") + try: result = subprocess.run( - ["git", "ls-files"], + cmd, capture_output=True, text=True, cwd=str(repo_root), @@ -200,14 +225,23 @@ def get_all_tracked_files(repo_root: Path) -> list[str]: return [] -def collect_all_files(repo_root: Path) -> list[str]: - """Collect all parseable files in the repo, respecting ignore patterns.""" +def collect_all_files( + repo_root: Path, + recurse_submodules: bool | None = None, +) -> list[str]: + """Collect all parseable files in the repo, respecting ignore patterns. + + Args: + repo_root: Repository root directory. + recurse_submodules: If True, include files from git submodules. + When *None*, falls back to ``CRG_RECURSE_SUBMODULES`` env var. + """ ignore_patterns = _load_ignore_patterns(repo_root) parser = CodeParser() files = [] # Prefer git ls-files for tracked files - tracked = get_all_tracked_files(repo_root) + tracked = get_all_tracked_files(repo_root, recurse_submodules) if tracked: candidates = tracked else: @@ -259,10 +293,21 @@ def find_dependents(store: GraphStore, file_path: str) -> list[str]: return list(dependents) -def full_build(repo_root: Path, store: GraphStore) -> dict: - """Full rebuild of the entire graph.""" +def full_build( + repo_root: Path, + store: GraphStore, + recurse_submodules: bool | None = None, +) -> dict: + """Full rebuild of the entire graph. + + Args: + repo_root: Repository root directory. + store: Graph database store. + recurse_submodules: If True, include files from git submodules. + When *None*, falls back to ``CRG_RECURSE_SUBMODULES`` env var. + """ parser = CodeParser() - files = collect_all_files(repo_root) + files = collect_all_files(repo_root, recurse_submodules) # Purge stale data from files no longer on disk existing_files = set(store.get_all_files()) diff --git a/code_review_graph/main.py b/code_review_graph/main.py index be412bf..01dafa4 100644 --- a/code_review_graph/main.py +++ b/code_review_graph/main.py @@ -61,6 +61,7 @@ def build_or_update_graph_tool( full_rebuild: bool = False, repo_root: Optional[str] = None, base: str = "HEAD~1", + recurse_submodules: Optional[bool] = None, ) -> dict: """Build or incrementally update the code knowledge graph. @@ -72,9 +73,14 @@ def build_or_update_graph_tool( full_rebuild: If True, re-parse all files. Default: False (incremental). repo_root: Repository root path. Auto-detected from current directory if omitted. base: Git ref to diff against for incremental updates. Default: HEAD~1. + recurse_submodules: If True, include files from git submodules. + When None (default), falls back to CRG_RECURSE_SUBMODULES env var. """ return build_or_update_graph( - full_rebuild=full_rebuild, repo_root=repo_root, base=base + full_rebuild=full_rebuild, + repo_root=repo_root, + base=base, + recurse_submodules=recurse_submodules, ) diff --git a/code_review_graph/tools/build.py b/code_review_graph/tools/build.py index ad832ec..33f7905 100644 --- a/code_review_graph/tools/build.py +++ b/code_review_graph/tools/build.py @@ -16,6 +16,7 @@ def build_or_update_graph( full_rebuild: bool = False, repo_root: str | None = None, base: str = "HEAD~1", + recurse_submodules: bool | None = None, ) -> dict[str, Any]: """Build or incrementally update the code knowledge graph. @@ -24,6 +25,10 @@ def build_or_update_graph( only re-parse files changed since `base`. repo_root: Path to the repository root. Auto-detected if omitted. base: Git ref for incremental diff (default: HEAD~1). + recurse_submodules: If True, include files from git submodules + via ``git ls-files --recurse-submodules``. When None + (default), falls back to the CRG_RECURSE_SUBMODULES + environment variable. Default: disabled. Returns: Summary with files_parsed/updated, node/edge counts, and errors. @@ -31,7 +36,7 @@ def build_or_update_graph( store, root = _get_store(repo_root) try: if full_rebuild: - result = full_build(root, store) + result = full_build(root, store, recurse_submodules) build_result = { "status": "ok", "build_type": "full", diff --git a/tests/test_incremental.py b/tests/test_incremental.py index ec3184f..b8ffdd5 100644 --- a/tests/test_incremental.py +++ b/tests/test_incremental.py @@ -171,6 +171,62 @@ def test_get_all_tracked_files(self, mock_run, tmp_path): result = get_all_tracked_files(tmp_path) assert result == ["a.py", "b.py", "c.go"] + @patch("code_review_graph.incremental.subprocess.run") + def test_get_all_tracked_files_recurse_submodules_param( + self, mock_run, tmp_path + ): + mock_run.return_value = MagicMock( + returncode=0, + stdout="a.py\nsub/b.py\n", + ) + result = get_all_tracked_files(tmp_path, recurse_submodules=True) + assert result == ["a.py", "sub/b.py"] + cmd = mock_run.call_args[0][0] + assert "--recurse-submodules" in cmd + + @patch("code_review_graph.incremental.subprocess.run") + def test_get_all_tracked_files_no_recurse_by_default( + self, mock_run, tmp_path + ): + mock_run.return_value = MagicMock( + returncode=0, + stdout="a.py\n", + ) + result = get_all_tracked_files(tmp_path) + assert result == ["a.py"] + cmd = mock_run.call_args[0][0] + assert "--recurse-submodules" not in cmd + + @patch("code_review_graph.incremental.subprocess.run") + @patch("code_review_graph.incremental._RECURSE_SUBMODULES", True) + def test_get_all_tracked_files_env_var_fallback( + self, mock_run, tmp_path + ): + mock_run.return_value = MagicMock( + returncode=0, + stdout="a.py\nsub/c.py\n", + ) + # None -> falls back to env var (_RECURSE_SUBMODULES=True) + result = get_all_tracked_files(tmp_path, recurse_submodules=None) + assert result == ["a.py", "sub/c.py"] + cmd = mock_run.call_args[0][0] + assert "--recurse-submodules" in cmd + + @patch("code_review_graph.incremental.subprocess.run") + @patch("code_review_graph.incremental._RECURSE_SUBMODULES", True) + def test_get_all_tracked_files_param_overrides_env( + self, mock_run, tmp_path + ): + mock_run.return_value = MagicMock( + returncode=0, + stdout="a.py\n", + ) + # Explicit False overrides env var + result = get_all_tracked_files(tmp_path, recurse_submodules=False) + assert result == ["a.py"] + cmd = mock_run.call_args[0][0] + assert "--recurse-submodules" not in cmd + class TestFullBuild: def test_full_build_parses_files(self, tmp_path): diff --git a/tests/test_integration_git.py b/tests/test_integration_git.py index f71dc20..cabc2f8 100644 --- a/tests/test_integration_git.py +++ b/tests/test_integration_git.py @@ -19,7 +19,9 @@ from code_review_graph.changes import parse_git_diff_ranges from code_review_graph.graph import GraphStore from code_review_graph.incremental import ( + collect_all_files, full_build, + get_all_tracked_files, get_changed_files, incremental_update, ) @@ -146,6 +148,103 @@ def test_base_validation_rejects_injection(git_repo: Path) -> None: # ------------------------------------------------------------------ +# ------------------------------------------------------------------ +# 6. recurse_submodules includes submodule files +# ------------------------------------------------------------------ + + +@pytest.fixture() +def git_repo_with_submodule(tmp_path: Path) -> Path: + """Create a parent repo containing a git submodule with a Python file.""" + # Create the "library" repo that will become a submodule + lib_repo = tmp_path / "lib" + lib_repo.mkdir() + _git(lib_repo, "init") + _git(lib_repo, "config", "user.email", "test@test.com") + _git(lib_repo, "config", "user.name", "Test") + (lib_repo / "util.py").write_text("def helper():\n pass\n") + _git(lib_repo, "add", "util.py") + _git(lib_repo, "commit", "-m", "lib initial") + + # Create the parent repo and add lib as a submodule + parent = tmp_path / "parent" + parent.mkdir() + _git(parent, "init") + _git(parent, "config", "user.email", "test@test.com") + _git(parent, "config", "user.name", "Test") + (parent / "main.py").write_text("def main():\n pass\n") + _git(parent, "add", "main.py") + _git(parent, "commit", "-m", "parent initial") + _git( + parent, "-c", "protocol.file.allow=always", + "submodule", "add", str(lib_repo), "lib", + ) + _git(parent, "commit", "-m", "add lib submodule") + + return parent + + +def test_get_all_tracked_files_without_recurse( + git_repo_with_submodule: Path, +) -> None: + """Without recurse_submodules, submodule files are NOT listed.""" + files = get_all_tracked_files( + git_repo_with_submodule, recurse_submodules=False + ) + assert "main.py" in files + # Submodule entry appears as a gitlink, not as individual files + assert not any(f.startswith("lib/") for f in files) + + +def test_get_all_tracked_files_with_recurse( + git_repo_with_submodule: Path, +) -> None: + """With recurse_submodules=True, submodule files ARE listed.""" + files = get_all_tracked_files( + git_repo_with_submodule, recurse_submodules=True + ) + assert "main.py" in files + assert "lib/util.py" in files + + +def test_collect_all_files_with_recurse( + git_repo_with_submodule: Path, +) -> None: + """collect_all_files with recurse_submodules includes submodule code.""" + files = collect_all_files( + git_repo_with_submodule, recurse_submodules=True + ) + assert "main.py" in files + assert "lib/util.py" in files + + +def test_full_build_with_recurse_submodules( + git_repo_with_submodule: Path, +) -> None: + """full_build with recurse_submodules parses submodule files.""" + db_path = git_repo_with_submodule / ".code-review-graph" / "graph.db" + db_path.parent.mkdir(parents=True, exist_ok=True) + store = GraphStore(db_path) + try: + result = full_build( + git_repo_with_submodule, store, recurse_submodules=True + ) + assert result["files_parsed"] >= 2 # main.py + lib/util.py + assert result["errors"] == [] + + # Verify both parent and submodule nodes exist + parent_nodes = store.get_nodes_by_file( + str(git_repo_with_submodule / "main.py") + ) + sub_nodes = store.get_nodes_by_file( + str(git_repo_with_submodule / "lib" / "util.py") + ) + assert len(parent_nodes) > 0 + assert len(sub_nodes) > 0 + finally: + store.close() + + def test_wiki_page_path_traversal_blocked(tmp_path: Path) -> None: """get_wiki_page must not serve files outside the wiki directory.""" wiki_dir = tmp_path / "wiki"