Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
63 changes: 54 additions & 9 deletions code_review_graph/incremental.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,6 +131,13 @@ def _is_binary(path: Path) -> bool:

_GIT_TIMEOUT = int(os.environ.get("CRG_GIT_TIMEOUT", "30")) # seconds, configurable

# When True, `git ls-files --recurse-submodules` is used so that files
# inside git submodules are included in the graph. Opt-in via env var;
# can also be overridden per-call through function parameters.
_RECURSE_SUBMODULES = os.environ.get(
"CRG_RECURSE_SUBMODULES", ""
).lower() in ("1", "true", "yes")

_SAFE_GIT_REF = re.compile(r"^[A-Za-z0-9_.~^/@{}\-]+$")


Expand Down Expand Up @@ -185,11 +192,29 @@ def get_staged_and_unstaged(repo_root: Path) -> list[str]:
return []


def get_all_tracked_files(repo_root: Path) -> list[str]:
"""Get all files tracked by git."""
def get_all_tracked_files(
repo_root: Path,
recurse_submodules: bool | None = None,
) -> list[str]:
"""Get all files tracked by git.

Args:
repo_root: Repository root directory.
recurse_submodules: If True, pass ``--recurse-submodules`` to
``git ls-files`` so that files inside git submodules are
included. When *None* (default), falls back to the
``CRG_RECURSE_SUBMODULES`` environment variable.
"""
if recurse_submodules is None:
recurse_submodules = _RECURSE_SUBMODULES

cmd = ["git", "ls-files"]
if recurse_submodules:
cmd.append("--recurse-submodules")

try:
result = subprocess.run(
["git", "ls-files"],
cmd,
capture_output=True,
text=True,
cwd=str(repo_root),
Expand All @@ -200,14 +225,23 @@ def get_all_tracked_files(repo_root: Path) -> list[str]:
return []


def collect_all_files(repo_root: Path) -> list[str]:
"""Collect all parseable files in the repo, respecting ignore patterns."""
def collect_all_files(
repo_root: Path,
recurse_submodules: bool | None = None,
) -> list[str]:
"""Collect all parseable files in the repo, respecting ignore patterns.

Args:
repo_root: Repository root directory.
recurse_submodules: If True, include files from git submodules.
When *None*, falls back to ``CRG_RECURSE_SUBMODULES`` env var.
"""
ignore_patterns = _load_ignore_patterns(repo_root)
parser = CodeParser()
files = []

# Prefer git ls-files for tracked files
tracked = get_all_tracked_files(repo_root)
tracked = get_all_tracked_files(repo_root, recurse_submodules)
if tracked:
candidates = tracked
else:
Expand Down Expand Up @@ -259,10 +293,21 @@ def find_dependents(store: GraphStore, file_path: str) -> list[str]:
return list(dependents)


def full_build(repo_root: Path, store: GraphStore) -> dict:
"""Full rebuild of the entire graph."""
def full_build(
repo_root: Path,
store: GraphStore,
recurse_submodules: bool | None = None,
) -> dict:
"""Full rebuild of the entire graph.

Args:
repo_root: Repository root directory.
store: Graph database store.
recurse_submodules: If True, include files from git submodules.
When *None*, falls back to ``CRG_RECURSE_SUBMODULES`` env var.
"""
parser = CodeParser()
files = collect_all_files(repo_root)
files = collect_all_files(repo_root, recurse_submodules)

# Purge stale data from files no longer on disk
existing_files = set(store.get_all_files())
Expand Down
8 changes: 7 additions & 1 deletion code_review_graph/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@ def build_or_update_graph_tool(
full_rebuild: bool = False,
repo_root: Optional[str] = None,
base: str = "HEAD~1",
recurse_submodules: Optional[bool] = None,
) -> dict:
"""Build or incrementally update the code knowledge graph.

Expand All @@ -72,9 +73,14 @@ def build_or_update_graph_tool(
full_rebuild: If True, re-parse all files. Default: False (incremental).
repo_root: Repository root path. Auto-detected from current directory if omitted.
base: Git ref to diff against for incremental updates. Default: HEAD~1.
recurse_submodules: If True, include files from git submodules.
When None (default), falls back to CRG_RECURSE_SUBMODULES env var.
"""
return build_or_update_graph(
full_rebuild=full_rebuild, repo_root=repo_root, base=base
full_rebuild=full_rebuild,
repo_root=repo_root,
base=base,
recurse_submodules=recurse_submodules,
)


Expand Down
7 changes: 6 additions & 1 deletion code_review_graph/tools/build.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ def build_or_update_graph(
full_rebuild: bool = False,
repo_root: str | None = None,
base: str = "HEAD~1",
recurse_submodules: bool | None = None,
) -> dict[str, Any]:
"""Build or incrementally update the code knowledge graph.

Expand All @@ -24,14 +25,18 @@ def build_or_update_graph(
only re-parse files changed since `base`.
repo_root: Path to the repository root. Auto-detected if omitted.
base: Git ref for incremental diff (default: HEAD~1).
recurse_submodules: If True, include files from git submodules
via ``git ls-files --recurse-submodules``. When None
(default), falls back to the CRG_RECURSE_SUBMODULES
environment variable. Default: disabled.

Returns:
Summary with files_parsed/updated, node/edge counts, and errors.
"""
store, root = _get_store(repo_root)
try:
if full_rebuild:
result = full_build(root, store)
result = full_build(root, store, recurse_submodules)
build_result = {
"status": "ok",
"build_type": "full",
Expand Down
56 changes: 56 additions & 0 deletions tests/test_incremental.py
Original file line number Diff line number Diff line change
Expand Up @@ -171,6 +171,62 @@ def test_get_all_tracked_files(self, mock_run, tmp_path):
result = get_all_tracked_files(tmp_path)
assert result == ["a.py", "b.py", "c.go"]

@patch("code_review_graph.incremental.subprocess.run")
def test_get_all_tracked_files_recurse_submodules_param(
self, mock_run, tmp_path
):
mock_run.return_value = MagicMock(
returncode=0,
stdout="a.py\nsub/b.py\n",
)
result = get_all_tracked_files(tmp_path, recurse_submodules=True)
assert result == ["a.py", "sub/b.py"]
cmd = mock_run.call_args[0][0]
assert "--recurse-submodules" in cmd

@patch("code_review_graph.incremental.subprocess.run")
def test_get_all_tracked_files_no_recurse_by_default(
self, mock_run, tmp_path
):
mock_run.return_value = MagicMock(
returncode=0,
stdout="a.py\n",
)
result = get_all_tracked_files(tmp_path)
assert result == ["a.py"]
cmd = mock_run.call_args[0][0]
assert "--recurse-submodules" not in cmd

@patch("code_review_graph.incremental.subprocess.run")
@patch("code_review_graph.incremental._RECURSE_SUBMODULES", True)
def test_get_all_tracked_files_env_var_fallback(
self, mock_run, tmp_path
):
mock_run.return_value = MagicMock(
returncode=0,
stdout="a.py\nsub/c.py\n",
)
# None -> falls back to env var (_RECURSE_SUBMODULES=True)
result = get_all_tracked_files(tmp_path, recurse_submodules=None)
assert result == ["a.py", "sub/c.py"]
cmd = mock_run.call_args[0][0]
assert "--recurse-submodules" in cmd

@patch("code_review_graph.incremental.subprocess.run")
@patch("code_review_graph.incremental._RECURSE_SUBMODULES", True)
def test_get_all_tracked_files_param_overrides_env(
self, mock_run, tmp_path
):
mock_run.return_value = MagicMock(
returncode=0,
stdout="a.py\n",
)
# Explicit False overrides env var
result = get_all_tracked_files(tmp_path, recurse_submodules=False)
assert result == ["a.py"]
cmd = mock_run.call_args[0][0]
assert "--recurse-submodules" not in cmd


class TestFullBuild:
def test_full_build_parses_files(self, tmp_path):
Expand Down
99 changes: 99 additions & 0 deletions tests/test_integration_git.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,9 @@
from code_review_graph.changes import parse_git_diff_ranges
from code_review_graph.graph import GraphStore
from code_review_graph.incremental import (
collect_all_files,
full_build,
get_all_tracked_files,
get_changed_files,
incremental_update,
)
Expand Down Expand Up @@ -146,6 +148,103 @@ def test_base_validation_rejects_injection(git_repo: Path) -> None:
# ------------------------------------------------------------------


# ------------------------------------------------------------------
# 6. recurse_submodules includes submodule files
# ------------------------------------------------------------------


@pytest.fixture()
def git_repo_with_submodule(tmp_path: Path) -> Path:
"""Create a parent repo containing a git submodule with a Python file."""
# Create the "library" repo that will become a submodule
lib_repo = tmp_path / "lib"
lib_repo.mkdir()
_git(lib_repo, "init")
_git(lib_repo, "config", "user.email", "test@test.com")
_git(lib_repo, "config", "user.name", "Test")
(lib_repo / "util.py").write_text("def helper():\n pass\n")
_git(lib_repo, "add", "util.py")
_git(lib_repo, "commit", "-m", "lib initial")

# Create the parent repo and add lib as a submodule
parent = tmp_path / "parent"
parent.mkdir()
_git(parent, "init")
_git(parent, "config", "user.email", "test@test.com")
_git(parent, "config", "user.name", "Test")
(parent / "main.py").write_text("def main():\n pass\n")
_git(parent, "add", "main.py")
_git(parent, "commit", "-m", "parent initial")
_git(
parent, "-c", "protocol.file.allow=always",
"submodule", "add", str(lib_repo), "lib",
)
_git(parent, "commit", "-m", "add lib submodule")

return parent


def test_get_all_tracked_files_without_recurse(
git_repo_with_submodule: Path,
) -> None:
"""Without recurse_submodules, submodule files are NOT listed."""
files = get_all_tracked_files(
git_repo_with_submodule, recurse_submodules=False
)
assert "main.py" in files
# Submodule entry appears as a gitlink, not as individual files
assert not any(f.startswith("lib/") for f in files)


def test_get_all_tracked_files_with_recurse(
git_repo_with_submodule: Path,
) -> None:
"""With recurse_submodules=True, submodule files ARE listed."""
files = get_all_tracked_files(
git_repo_with_submodule, recurse_submodules=True
)
assert "main.py" in files
assert "lib/util.py" in files


def test_collect_all_files_with_recurse(
git_repo_with_submodule: Path,
) -> None:
"""collect_all_files with recurse_submodules includes submodule code."""
files = collect_all_files(
git_repo_with_submodule, recurse_submodules=True
)
assert "main.py" in files
assert "lib/util.py" in files


def test_full_build_with_recurse_submodules(
git_repo_with_submodule: Path,
) -> None:
"""full_build with recurse_submodules parses submodule files."""
db_path = git_repo_with_submodule / ".code-review-graph" / "graph.db"
db_path.parent.mkdir(parents=True, exist_ok=True)
store = GraphStore(db_path)
try:
result = full_build(
git_repo_with_submodule, store, recurse_submodules=True
)
assert result["files_parsed"] >= 2 # main.py + lib/util.py
assert result["errors"] == []

# Verify both parent and submodule nodes exist
parent_nodes = store.get_nodes_by_file(
str(git_repo_with_submodule / "main.py")
)
sub_nodes = store.get_nodes_by_file(
str(git_repo_with_submodule / "lib" / "util.py")
)
assert len(parent_nodes) > 0
assert len(sub_nodes) > 0
finally:
store.close()


def test_wiki_page_path_traversal_blocked(tmp_path: Path) -> None:
"""get_wiki_page must not serve files outside the wiki directory."""
wiki_dir = tmp_path / "wiki"
Expand Down