Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
95 changes: 94 additions & 1 deletion src/mcp_codebase_index/project_indexer.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
import logging
import os
import re
import subprocess
import sys
import time
from pathlib import Path
Expand Down Expand Up @@ -336,7 +337,11 @@ def rebuild_graphs(self) -> None:
# ------------------------------------------------------------------

def _discover_files(self) -> list[str]:
"""Discover files matching include patterns, excluding exclude patterns."""
"""Discover files matching include patterns, excluding exclude patterns.

In git repositories, paths ignored by .gitignore (and .git/info/exclude,
global gitignore) are also excluded via ``git check-ignore``.
"""
root = Path(self.root_path)
matched: set[str] = set()

Expand All @@ -360,6 +365,15 @@ def _discover_files(self) -> list[str]:

matched.add(abs_str)

# In git repos, also honour .gitignore / .git/info/exclude / global
# gitignore by asking git which of the discovered paths it would ignore.
git_ignored = self._get_git_ignored_paths(matched)
if git_ignored:
logger.info(
"Excluding %d git-ignored files from index", len(git_ignored),
)
matched -= git_ignored

return sorted(matched)

def _is_excluded(self, rel_path: str) -> bool:
Expand All @@ -378,6 +392,85 @@ def _is_excluded(self, rel_path: str) -> bool:
return True
return False

@staticmethod
def _find_git_root(path: str) -> str | None:
    """Return the git work-tree root containing *path*, or ``None``.

    Runs ``git rev-parse --show-toplevel`` with *path* as the working
    directory.

    Args:
        path: Directory from which the lookup starts.

    Returns:
        The work-tree root in the platform's native path form, or ``None``
        when git reports an error (e.g. *path* is not inside a work tree),
        git cannot be started, *path* is not a usable directory, or the
        command does not finish within 10 seconds.
    """
    try:
        # Bytes mode: the root is a file-system path, so decode it with the
        # file-system codec rather than the locale's text encoding.
        result = subprocess.run(
            ["git", "rev-parse", "--show-toplevel"],
            cwd=path,
            capture_output=True,
            timeout=10,
        )
    except (OSError, subprocess.TimeoutExpired):
        # OSError covers a missing git binary as well as a cwd that is
        # missing, not a directory, or not accessible.
        return None

    if result.returncode != 0:
        return None

    top_level = os.fsdecode(result.stdout.rstrip(b"\r\n"))
    if not top_level:
        return None
    # git prints forward slashes on every platform; normalise so callers can
    # compare the root against os.sep-separated paths.
    return os.path.normpath(top_level)

def _get_git_ignored_paths(self, paths: set[str]) -> set[str]:
    """Return the subset of *paths* that git considers ignored.

    Paths are grouped by the git repository that contains them and
    ``git check-ignore -z --stdin`` is run once per repository, so per-repo
    ``.gitignore`` files are respected even when the project root spans
    several repositories or is not itself a repository.

    Each path is checked only against its innermost repository: a directory
    holding a ``.git`` entry is treated as a repository boundary, so files of
    a nested checkout or submodule are never handed to the enclosing
    repository.

    Args:
        paths: Absolute file paths to test.

    Returns:
        The ignored paths, spelled exactly as they were passed in. Empty when
        *paths* is empty, no path lies inside a git repository, or git cannot
        be run.
    """
    if not paths:
        return set()

    # directory -> repo root (None means "not inside a repository").
    root_of_dir: dict[str, str | None] = {}
    repos: dict[str, list[str]] = {}  # repo_root -> [abs_paths]

    # Sorted so that grouping does not depend on set iteration order.
    for abs_path in sorted(paths):
        start_dir = os.path.dirname(abs_path)
        trail: list[str] = []
        cursor = start_dir
        while True:
            if cursor in root_of_dir:
                # No repository boundary between start_dir and this already
                # resolved ancestor, so both share the same answer.
                repo_root = root_of_dir[cursor]
                break
            trail.append(cursor)
            parent = os.path.dirname(cursor)
            if parent == cursor or os.path.lexists(os.path.join(cursor, ".git")):
                # Reached a repository boundary or the top of the file
                # system: let git give the authoritative answer.
                repo_root = self._find_git_root(start_dir)
                break
            cursor = parent
        for visited in trail:
            root_of_dir[visited] = repo_root
        if repo_root is not None:
            repos.setdefault(repo_root, []).append(abs_path)

    log = logging.getLogger(__name__)
    ignored: set[str] = set()
    for repo_root, repo_paths in repos.items():
        # NUL-separated bytes: with -z git neither quotes unusual file names
        # in its output nor splits input on newlines inside a name.
        payload = b"".join(os.fsencode(p) + b"\0" for p in repo_paths)
        try:
            result = subprocess.run(
                ["git", "check-ignore", "-z", "--stdin"],
                input=payload,
                cwd=repo_root,
                capture_output=True,
                timeout=30,
            )
        except (OSError, subprocess.TimeoutExpired) as exc:
            log.debug("git check-ignore could not run in %s: %s", repo_root, exc)
            continue

        # git check-ignore exits 0 when at least one path is ignored,
        # 1 when none are ignored, and 128 on error.
        if result.returncode not in (0, 1):
            log.debug(
                "git check-ignore failed in %s (exit %d): %s",
                repo_root,
                result.returncode,
                result.stderr.decode("utf-8", "replace").strip(),
            )
            continue

        ignored.update(
            os.fsdecode(raw) for raw in result.stdout.split(b"\0") if raw
        )

    return ignored

def _read_file(self, abs_path: str) -> str:
"""Read a file as text, trying UTF-8 first then latin-1 as fallback."""
try:
Expand Down
14 changes: 12 additions & 2 deletions src/mcp_codebase_index/server.py
Original file line number Diff line number Diff line change
Expand Up @@ -319,7 +319,9 @@ def _maybe_incremental_update() -> None:
if path in idx.files:
_indexer.remove_file(path)

# Process modifications and additions
# Process modifications and additions — collect candidates first,
# then filter out git-ignored paths in one batch call.
candidates: list[tuple[str, str]] = [] # (rel_path, abs_path)
for path in changeset.modified + changeset.added:
if _indexer._is_excluded(path):
continue
Expand All @@ -328,7 +330,15 @@ def _maybe_incremental_update() -> None:
abs_path = os.path.join(_project_root, path)
if not os.path.isfile(abs_path):
continue
_indexer.reindex_file(path, skip_graph_rebuild=True)
candidates.append((path, abs_path))

git_ignored = _indexer._get_git_ignored_paths(
{abs_path for _, abs_path in candidates},
)
for rel_path, abs_path in candidates:
if abs_path in git_ignored:
continue
_indexer.reindex_file(rel_path, skip_graph_rebuild=True)

# Rebuild cross-file graphs once
_indexer.rebuild_graphs()
Expand Down
161 changes: 161 additions & 0 deletions tests/test_project_indexer.py
Original file line number Diff line number Diff line change
Expand Up @@ -658,6 +658,167 @@ def test_rust_import_resolution(self, rust_project):
assert models_path in imports or utils_path in imports


# ---------------------------------------------------------------------------
# Test: .gitignore / git-ignore filtering
# ---------------------------------------------------------------------------


@pytest.fixture
def git_project(tmp_path):
    """Build a small git work tree whose .gitignore hides some files.

    Layout::

        repo/
            .gitignore   (ignores build/ and *.log)
            src/
                app.py
            build/
                output.py   (git-ignored)
            debug.log       (git-ignored)
            README.md

    Returns the path of ``repo``.
    """
    import subprocess

    repo_dir = tmp_path / "repo"
    repo_dir.mkdir()

    # A real repository is required for git check-ignore to work.
    git_setup = (
        ("init",),
        ("config", "user.email", "test@test.com"),
        ("config", "user.name", "Test"),
    )
    for git_args in git_setup:
        subprocess.run(
            ["git", *git_args],
            cwd=str(repo_dir),
            capture_output=True,
            check=True,
        )

    src_dir = repo_dir / "src"
    build_dir = repo_dir / "build"
    for directory in (src_dir, build_dir):
        directory.mkdir()

    # Target path -> contents; the comments mark what git should ignore.
    contents = {
        repo_dir / ".gitignore": "build/\n*.log\n",
        src_dir / "app.py": "def main():\n pass\n",  # tracked source
        build_dir / "output.py": "x = 1\n",  # ignored via build/
        repo_dir / "debug.log": "some log\n",  # ignored via *.log
        repo_dir / "README.md": "# Repo\n",  # not ignored
    }
    for target, text in contents.items():
        target.write_text(text)

    return repo_dir


class TestGitIgnore:
    """Verify that _get_git_ignored_paths and _discover_files honour
    .gitignore when the project is a git repository."""

    def test_git_ignored_paths_detected(self, git_project):
        """Files under an ignored directory are reported as ignored."""
        from pathlib import Path

        indexer = ProjectIndexer(str(git_project))
        project_dir = Path(str(git_project))

        # Collect every candidate path, i.e. the set before git filtering.
        all_paths: set[str] = {
            str(p)
            for pattern in indexer.include_patterns
            for p in project_dir.glob(pattern)
            if p.is_file()
        }

        ignored = indexer._get_git_ignored_paths(all_paths)
        ignored_names = {os.path.basename(p) for p in ignored}

        assert "output.py" in ignored_names, "build/output.py should be git-ignored"

    def test_discover_files_excludes_git_ignored(self, git_project):
        """A full index run leaves git-ignored files out."""
        indexer = ProjectIndexer(str(git_project))
        idx = indexer.index()

        filenames = {os.path.basename(f) for f in idx.files}
        assert "app.py" in filenames, "src/app.py should be indexed"
        assert "README.md" in filenames, "README.md should be indexed"
        assert "output.py" not in filenames, "build/output.py should be excluded"

    def test_non_git_project_no_filtering(self, sample_project):
        """In a non-git directory, _get_git_ignored_paths returns empty."""
        indexer = ProjectIndexer(str(sample_project))
        ignored = indexer._get_git_ignored_paths({str(sample_project / "README.md")})
        assert ignored == set()

    def test_multi_repo_parent_directory(self, tmp_path):
        """PROJECT_ROOT spans multiple git repos, each with its own .gitignore."""
        import subprocess

        parent = tmp_path / "code"
        parent.mkdir()

        def _init_repo(name, gitignore, src_file, ignored_dir, ignored_file):
            # One repository with a tracked source file and an ignored one.
            repo = parent / name
            repo.mkdir()
            for git_args in (
                ("init",),
                ("config", "user.email", "test@test.com"),
                ("config", "user.name", "Test"),
            ):
                subprocess.run(
                    ["git", *git_args],
                    cwd=str(repo),
                    capture_output=True,
                    check=True,
                )
            (repo / ".gitignore").write_text(gitignore)
            src = repo / "src"
            src.mkdir()
            (src / src_file).write_text("x = 1\n")
            ign = repo / ignored_dir
            ign.mkdir()
            (ign / ignored_file).write_text("x = 1\n")

        _init_repo("alpha", "dist/\n", "main.py", "dist", "bundle.py")
        _init_repo("beta", "build/\n", "app.py", "build", "output.py")
        _init_repo("gamma", ".next/\n", "index.py", ".next", "cache.py")

        indexer = ProjectIndexer(str(parent))
        idx = indexer.index()

        filenames = {os.path.basename(f) for f in idx.files}
        # Source files should be indexed.
        assert "main.py" in filenames
        assert "app.py" in filenames
        assert "index.py" in filenames
        # Git-ignored files should be excluded.
        assert "bundle.py" not in filenames, "alpha/dist/bundle.py should be excluded"
        assert "output.py" not in filenames, "beta/build/output.py should be excluded"
        assert "cache.py" not in filenames, "gamma/.next/cache.py should be excluded"

    def test_gitignore_with_nested_patterns(self, git_project):
        """Additional patterns added to .gitignore are also respected."""
        # Add a new ignore pattern
        gitignore = git_project / ".gitignore"
        gitignore.write_text(gitignore.read_text() + "src/generated/\n")

        # Create the directory with a file
        gen = git_project / "src" / "generated"
        gen.mkdir()
        (gen / "auto.py").write_text("y = 2\n")

        indexer = ProjectIndexer(str(git_project))
        idx = indexer.index()

        filenames = {os.path.basename(f) for f in idx.files}
        assert "auto.py" not in filenames, "generated/auto.py should be excluded"
        assert "app.py" in filenames, "src/app.py should still be indexed"


class TestIntegration:
def test_index_mcp_codebase_index_source(self):
"""Index the actual mcp-codebase-index src directory as an integration test."""
Expand Down