From d9e1b83568bfb453ac526c4e9ad59f7aac80bc50 Mon Sep 17 00:00:00 2001 From: Bas Nijholt Date: Wed, 11 Feb 2026 20:32:16 -0800 Subject: [PATCH 1/4] feat(rag): respect .gitignore patterns when indexing documents MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The RAG proxy now loads .gitignore files from the docs folder and its parent directories, and skips matching files during both initial indexing and file watching. Supports wildcards, directory-only patterns, rooted patterns, negation, nested path patterns, and ** globbing — all using stdlib fnmatch (no new dependencies). --- agent_cli/rag/_indexer.py | 7 +- agent_cli/rag/_indexing.py | 20 ++++- agent_cli/rag/_utils.py | 151 ++++++++++++++++++++++++++++++++++++- tests/rag/test_utils.py | 103 +++++++++++++++++++++++++ 4 files changed, 274 insertions(+), 7 deletions(-) diff --git a/agent_cli/rag/_indexer.py b/agent_cli/rag/_indexer.py index a11ef0c4d..968f052d1 100644 --- a/agent_cli/rag/_indexer.py +++ b/agent_cli/rag/_indexer.py @@ -7,7 +7,7 @@ from agent_cli.core.watch import watch_directory from agent_cli.rag._indexing import index_file, remove_file -from agent_cli.rag._utils import should_ignore_path +from agent_cli.rag._utils import load_gitignore_patterns, should_ignore_path if TYPE_CHECKING: from pathlib import Path @@ -26,6 +26,7 @@ async def watch_docs( ) -> None: """Watch docs folder for changes and update index asynchronously.""" LOGGER.info("📁 Watching folder: %s", docs_folder) + gitignore_patterns = load_gitignore_patterns(docs_folder) await watch_directory( docs_folder, @@ -37,7 +38,9 @@ async def watch_docs( file_hashes, file_mtimes, ), - ignore_filter=should_ignore_path, + ignore_filter=lambda p, base: should_ignore_path( + p, base, gitignore_patterns=gitignore_patterns + ), ) diff --git a/agent_cli/rag/_indexing.py b/agent_cli/rag/_indexing.py index a36da844a..d7d02bdb8 100644 --- a/agent_cli/rag/_indexing.py +++ b/agent_cli/rag/_indexing.py @@ -8,7 +8,13 @@ from typing import TYPE_CHECKING from agent_cli.rag._store import delete_by_file_path, get_all_metadata, upsert_docs -from agent_cli.rag._utils import chunk_text, get_file_hash, load_document_text, should_ignore_path +from agent_cli.rag._utils import ( + chunk_text, + get_file_hash, + load_document_text, + load_gitignore_patterns, + should_ignore_path, +) from agent_cli.rag.models import DocMetadata if TYPE_CHECKING: @@ -172,9 +178,17 @@ def initial_index( processed_files = [] removed_files = [] - # Gather all files first, excluding hidden and common development directories + # Load .gitignore patterns once for the entire scan + gitignore_patterns = load_gitignore_patterns(docs_folder) + if gitignore_patterns: + LOGGER.info("📋 Loaded %d .gitignore patterns", len(gitignore_patterns)) + + # Gather all files first, excluding hidden, common dev dirs, and gitignored paths all_files = [ - p for p in docs_folder.rglob("*") if p.is_file() and not should_ignore_path(p, docs_folder) + p + for p in docs_folder.rglob("*") + if p.is_file() + and not should_ignore_path(p, docs_folder, gitignore_patterns=gitignore_patterns) ] # 1. Index Existing Files in Parallel diff --git a/agent_cli/rag/_utils.py b/agent_cli/rag/_utils.py index a6d14891f..85e42a4da 100644 --- a/agent_cli/rag/_utils.py +++ b/agent_cli/rag/_utils.py @@ -2,6 +2,7 @@ from __future__ import annotations +import fnmatch import hashlib import logging from typing import TYPE_CHECKING @@ -33,7 +34,140 @@ ) -def should_ignore_path(path: Path, base_folder: Path) -> bool: +def _parse_gitignore(gitignore_path: Path) -> list[str]: + """Parse a .gitignore file and return a list of patterns.""" + try: + text = gitignore_path.read_text(errors="ignore") + except OSError: + return [] + patterns = [] + for line in text.splitlines(): + line = line.strip() # noqa: PLW2901 + # Skip empty lines and comments + if not line or line.startswith("#"): + continue + patterns.append(line) + return patterns + + +def _gitignore_pattern_matches(pattern: str, rel_path_str: str, is_dir: bool) -> bool: + """Check if a single gitignore pattern matches a relative path. + + Supports: + - Simple filename patterns (e.g. ``*.log``) + - Directory-only patterns with trailing ``/`` (e.g. ``build/``) + - Rooted patterns with leading ``/`` (e.g. ``/dist``) + - Patterns with ``/`` that match against the full path + - ``**`` for matching across directories + """ + # Directory-only pattern (trailing /) + dir_only = pattern.endswith("/") + if dir_only: + pattern = pattern.rstrip("/") + + # Rooted pattern (leading /) + rooted = pattern.startswith("/") + if rooted: + pattern = pattern.lstrip("/") + + # Convert ** to fnmatch-compatible pattern + # "**/" matches any number of directories + glob_pattern = pattern.replace("**/", "__GLOBSTAR__/") + glob_pattern = glob_pattern.replace("/**", "/__GLOBSTAR__") + glob_pattern = glob_pattern.replace("**", "__GLOBSTAR__") + + # For patterns without /, match against any path component (unless rooted) + if "/" not in pattern and not rooted: + # Simple pattern like "*.log" or "build" — match against each component + parts = rel_path_str.split("/") + # For dir-only patterns, only match directory components (all except last for files) + components_to_check = (parts[:-1] if not is_dir else parts) if dir_only else parts + return any(fnmatch.fnmatch(part, pattern) for part in components_to_check) + + # Pattern contains / — match against full relative path + # Restore ** handling + glob_pattern = glob_pattern.replace("__GLOBSTAR__", "*") + + if rooted: + # Must match from the root + return fnmatch.fnmatch(rel_path_str, glob_pattern) + + # Non-rooted patterns with / can match anywhere in the path + # Try matching from each directory level + parts = rel_path_str.split("/") + for i in range(len(parts)): + sub_path = "/".join(parts[i:]) + if fnmatch.fnmatch(sub_path, glob_pattern): + return True + return False + + +def _matches_gitignore( + rel_path_str: str, + is_dir: bool, + gitignore_patterns: list[str], +) -> bool: + """Check if a path matches gitignore patterns. + + Processes patterns in order; negation patterns (``!``) can un-ignore + previously matched paths. Also checks parent directories: if a + parent directory is ignored, the file inside it is ignored too. + """ + # Build list of paths to check: all parent dirs, then the file/dir itself + parts = rel_path_str.split("/") + paths_to_check = [("/".join(parts[:i]), True) for i in range(1, len(parts))] + paths_to_check.append((rel_path_str, is_dir)) + + ignored = False + for pattern in gitignore_patterns: + if pattern.startswith("!"): + neg_pattern = pattern[1:] + for check_path, check_is_dir in paths_to_check: + if _gitignore_pattern_matches(neg_pattern, check_path, check_is_dir): + ignored = False + break + else: + for check_path, check_is_dir in paths_to_check: + if _gitignore_pattern_matches(pattern, check_path, check_is_dir): + ignored = True + break + return ignored + + +def load_gitignore_patterns(docs_folder: Path) -> list[str]: + """Load .gitignore patterns from the docs folder and its parents. + + Walks up from ``docs_folder`` to the filesystem root, collecting + ``.gitignore`` files. Patterns from parent directories are applied + first (lower priority), then patterns from directories closer to + ``docs_folder`` (higher priority), matching Git's behaviour. + """ + gitignore_files: list[Path] = [] + current = docs_folder.resolve() + while True: + candidate = current / ".gitignore" + if candidate.is_file(): + gitignore_files.append(candidate) + parent = current.parent + if parent == current: + break + current = parent + + # Reverse so parent patterns come first (lower priority) + gitignore_files.reverse() + + all_patterns: list[str] = [] + for gi in gitignore_files: + all_patterns.extend(_parse_gitignore(gi)) + return all_patterns + + +def should_ignore_path( + path: Path, + base_folder: Path, + *, + gitignore_patterns: list[str] | None = None, +) -> bool: """Check if a path should be ignored during indexing. Ignores: @@ -41,10 +175,13 @@ def should_ignore_path(path: Path, base_folder: Path) -> bool: - Common development directories (__pycache__, node_modules, venv, etc.) - .egg-info directories - OS metadata files (Thumbs.db) + - Paths matching .gitignore patterns (when provided) Args: path: The file path to check. base_folder: The base folder for computing relative paths. + gitignore_patterns: Pre-parsed gitignore patterns from + :func:`load_gitignore_patterns`. Returns: True if the path should be ignored, False otherwise. @@ -64,7 +201,17 @@ def should_ignore_path(path: Path, base_folder: Path) -> bool: return True # Check specific file patterns - return path.name in DEFAULT_IGNORE_FILES + if path.name in DEFAULT_IGNORE_FILES: + return True + + # Check gitignore patterns + if gitignore_patterns: + rel_path_str = "/".join(rel_parts) + is_dir = path.is_dir() + if _matches_gitignore(rel_path_str, is_dir, gitignore_patterns): + return True + + return False # Files to read as plain text directly (fast path) diff --git a/tests/rag/test_utils.py b/tests/rag/test_utils.py index 0d0428c09..a6603b5cb 100644 --- a/tests/rag/test_utils.py +++ b/tests/rag/test_utils.py @@ -502,3 +502,106 @@ def test_path_outside_base_folder_raises(self, tmp_path: Path) -> None: other_path = Path("/some/other/path.txt") with pytest.raises(ValueError, match="is not in the subpath"): _utils.should_ignore_path(other_path, tmp_path) + + +class TestGitignoreSupport: + """Tests for .gitignore pattern matching in should_ignore_path.""" + + def test_gitignore_wildcard_pattern(self, tmp_path: Path) -> None: + """Test that *.log pattern ignores log files.""" + (tmp_path / ".gitignore").write_text("*.log\n") + patterns = _utils.load_gitignore_patterns(tmp_path) + f = tmp_path / "debug.log" + f.touch() + assert _utils.should_ignore_path(f, tmp_path, gitignore_patterns=patterns) + + def test_gitignore_does_not_ignore_unmatched(self, tmp_path: Path) -> None: + """Test that files not matching gitignore patterns are kept.""" + (tmp_path / ".gitignore").write_text("*.log\n") + patterns = _utils.load_gitignore_patterns(tmp_path) + f = tmp_path / "readme.md" + f.touch() + assert not _utils.should_ignore_path(f, tmp_path, gitignore_patterns=patterns) + + def test_gitignore_directory_pattern(self, tmp_path: Path) -> None: + """Test that directory-only patterns (trailing /) work.""" + (tmp_path / ".gitignore").write_text("output/\n") + patterns = _utils.load_gitignore_patterns(tmp_path) + out = tmp_path / "output" + out.mkdir() + f = out / "result.txt" + f.touch() + assert _utils.should_ignore_path(f, tmp_path, gitignore_patterns=patterns) + + def test_gitignore_rooted_pattern(self, tmp_path: Path) -> None: + """Test that rooted patterns (leading /) only match at the root.""" + (tmp_path / ".gitignore").write_text("/secret.txt\n") + patterns = _utils.load_gitignore_patterns(tmp_path) + # Root-level match + f = tmp_path / "secret.txt" + f.touch() + assert _utils.should_ignore_path(f, tmp_path, gitignore_patterns=patterns) + # Nested should NOT match + subdir = tmp_path / "sub" + subdir.mkdir() + f2 = subdir / "secret.txt" + f2.touch() + assert not _utils.should_ignore_path(f2, tmp_path, gitignore_patterns=patterns) + + def test_gitignore_negation_pattern(self, tmp_path: Path) -> None: + """Test that negation patterns (!) un-ignore files.""" + (tmp_path / ".gitignore").write_text("*.log\n!important.log\n") + patterns = _utils.load_gitignore_patterns(tmp_path) + f1 = tmp_path / "debug.log" + f1.touch() + assert _utils.should_ignore_path(f1, tmp_path, gitignore_patterns=patterns) + f2 = tmp_path / "important.log" + f2.touch() + assert not _utils.should_ignore_path(f2, tmp_path, gitignore_patterns=patterns) + + def test_gitignore_comment_and_blank_lines(self, tmp_path: Path) -> None: + """Test that comments and blank lines in .gitignore are skipped.""" + (tmp_path / ".gitignore").write_text("# comment\n\n*.tmp\n") + patterns = _utils.load_gitignore_patterns(tmp_path) + assert len(patterns) == 1 + assert patterns[0] == "*.tmp" + + def test_gitignore_nested_directory_pattern(self, tmp_path: Path) -> None: + """Test pattern with path separator matches nested paths.""" + (tmp_path / ".gitignore").write_text("logs/*.log\n") + patterns = _utils.load_gitignore_patterns(tmp_path) + logs = tmp_path / "logs" + logs.mkdir() + f = logs / "app.log" + f.touch() + assert _utils.should_ignore_path(f, tmp_path, gitignore_patterns=patterns) + + def test_gitignore_double_star_pattern(self, tmp_path: Path) -> None: + """Test that ** matches across directories.""" + (tmp_path / ".gitignore").write_text("**/test_output\n") + patterns = _utils.load_gitignore_patterns(tmp_path) + deep = tmp_path / "a" / "b" / "test_output" + deep.mkdir(parents=True) + f = deep / "result.txt" + f.touch() + assert _utils.should_ignore_path(f, tmp_path, gitignore_patterns=patterns) + + def test_no_gitignore_file(self, tmp_path: Path) -> None: + """Test that missing .gitignore returns empty patterns.""" + patterns = _utils.load_gitignore_patterns(tmp_path) + assert patterns == [] + + def test_parent_gitignore_is_loaded(self, tmp_path: Path) -> None: + """Test that .gitignore from parent directories is picked up.""" + (tmp_path / ".gitignore").write_text("*.bak\n") + subdir = tmp_path / "docs" + subdir.mkdir() + patterns = _utils.load_gitignore_patterns(subdir) + assert "*.bak" in patterns + + def test_gitignore_no_patterns_means_no_extra_ignoring(self, tmp_path: Path) -> None: + """Test that passing None patterns behaves like no gitignore.""" + f = tmp_path / "normal.txt" + f.touch() + assert not _utils.should_ignore_path(f, tmp_path, gitignore_patterns=None) + assert not _utils.should_ignore_path(f, tmp_path, gitignore_patterns=[]) From a3d21f5a2ef2aac3b17cd633698a992cc812b867 Mon Sep 17 00:00:00 2001 From: Bas Nijholt Date: Wed, 11 Feb 2026 21:31:52 -0800 Subject: [PATCH 2/4] fix(rag): align gitignore matching semantics --- agent_cli/rag/_indexer.py | 9 +- agent_cli/rag/_utils.py | 236 +++++++++++++++++++++++++------------- tests/rag/test_utils.py | 43 ++++++- 3 files changed, 202 insertions(+), 86 deletions(-) diff --git a/agent_cli/rag/_indexer.py b/agent_cli/rag/_indexer.py index 968f052d1..e35589c89 100644 --- a/agent_cli/rag/_indexer.py +++ b/agent_cli/rag/_indexer.py @@ -3,6 +3,7 @@ from __future__ import annotations import logging +from functools import partial, update_wrapper from typing import TYPE_CHECKING from agent_cli.core.watch import watch_directory @@ -27,6 +28,10 @@ async def watch_docs( """Watch docs folder for changes and update index asynchronously.""" LOGGER.info("📁 Watching folder: %s", docs_folder) gitignore_patterns = load_gitignore_patterns(docs_folder) + ignore_filter = update_wrapper( + partial(should_ignore_path, gitignore_patterns=gitignore_patterns), + should_ignore_path, + ) await watch_directory( docs_folder, @@ -38,9 +43,7 @@ async def watch_docs( file_hashes, file_mtimes, ), - ignore_filter=lambda p, base: should_ignore_path( - p, base, gitignore_patterns=gitignore_patterns - ), + ignore_filter=ignore_filter, ) diff --git a/agent_cli/rag/_utils.py b/agent_cli/rag/_utils.py index 85e42a4da..132c270ad 100644 --- a/agent_cli/rag/_utils.py +++ b/agent_cli/rag/_utils.py @@ -5,6 +5,9 @@ import fnmatch import hashlib import logging +import re +from dataclasses import dataclass +from functools import lru_cache from typing import TYPE_CHECKING if TYPE_CHECKING: @@ -34,107 +37,178 @@ ) -def _parse_gitignore(gitignore_path: Path) -> list[str]: - """Parse a .gitignore file and return a list of patterns.""" +@dataclass(frozen=True) +class GitignorePattern: + """A normalized gitignore pattern plus the source directory context.""" + + pattern: str + negated: bool + dir_only: bool + anchored: bool + has_slash: bool + base_prefix: tuple[str, ...] + + +def _normalize_gitignore_line(line: str) -> tuple[bool, str] | None: + """Normalize one .gitignore line. + + Returns: + ``None`` if the line should be ignored, otherwise ``(negated, pattern)``. + + """ + line = line.strip() + if not line: + return None + if line.startswith((r"\#", r"\!")): + return False, line[1:] + if line.startswith("#"): + return None + + negated = line.startswith("!") + if negated: + line = line[1:] + if not line: + return None + return negated, line + + +def _parse_gitignore(gitignore_path: Path, docs_folder: Path) -> list[GitignorePattern]: + """Parse one .gitignore file into normalized rule objects.""" try: text = gitignore_path.read_text(errors="ignore") except OSError: return [] - patterns = [] - for line in text.splitlines(): - line = line.strip() # noqa: PLW2901 - # Skip empty lines and comments - if not line or line.startswith("#"): + + try: + base_prefix = docs_folder.resolve().relative_to(gitignore_path.parent.resolve()).parts + except ValueError: + return [] + + patterns: list[GitignorePattern] = [] + for raw_line in text.splitlines(): + normalized = _normalize_gitignore_line(raw_line) + if normalized is None: continue - patterns.append(line) + negated, line = normalized + + dir_only = line.endswith("/") + if dir_only: + line = line.rstrip("/") + + anchored = line.startswith("/") + if anchored: + line = line.lstrip("/") + + if not line: + continue + + patterns.append( + GitignorePattern( + pattern=line, + negated=negated, + dir_only=dir_only, + anchored=anchored, + has_slash="/" in line, + base_prefix=base_prefix, + ), + ) return patterns -def _gitignore_pattern_matches(pattern: str, rel_path_str: str, is_dir: bool) -> bool: - """Check if a single gitignore pattern matches a relative path. +@lru_cache(maxsize=512) +def _compile_gitignore_regex(pattern: str) -> re.Pattern[str]: + """Compile a gitignore-like pattern into a regex. - Supports: - - Simple filename patterns (e.g. ``*.log``) - - Directory-only patterns with trailing ``/`` (e.g. ``build/``) - - Rooted patterns with leading ``/`` (e.g. ``/dist``) - - Patterns with ``/`` that match against the full path - - ``**`` for matching across directories + This keeps the key semantics needed here: + - ``*`` does not cross path separators + - ``**`` may cross path separators """ - # Directory-only pattern (trailing /) - dir_only = pattern.endswith("/") - if dir_only: - pattern = pattern.rstrip("/") - - # Rooted pattern (leading /) - rooted = pattern.startswith("/") - if rooted: - pattern = pattern.lstrip("/") - - # Convert ** to fnmatch-compatible pattern - # "**/" matches any number of directories - glob_pattern = pattern.replace("**/", "__GLOBSTAR__/") - glob_pattern = glob_pattern.replace("/**", "/__GLOBSTAR__") - glob_pattern = glob_pattern.replace("**", "__GLOBSTAR__") - - # For patterns without /, match against any path component (unless rooted) - if "/" not in pattern and not rooted: - # Simple pattern like "*.log" or "build" — match against each component - parts = rel_path_str.split("/") - # For dir-only patterns, only match directory components (all except last for files) - components_to_check = (parts[:-1] if not is_dir else parts) if dir_only else parts - return any(fnmatch.fnmatch(part, pattern) for part in components_to_check) - - # Pattern contains / — match against full relative path - # Restore ** handling - glob_pattern = glob_pattern.replace("__GLOBSTAR__", "*") - - if rooted: - # Must match from the root - return fnmatch.fnmatch(rel_path_str, glob_pattern) - - # Non-rooted patterns with / can match anywhere in the path - # Try matching from each directory level - parts = rel_path_str.split("/") - for i in range(len(parts)): - sub_path = "/".join(parts[i:]) - if fnmatch.fnmatch(sub_path, glob_pattern): - return True - return False + regex_parts: list[str] = [] + i = 0 + while i < len(pattern): + char = pattern[i] + if char == "*": + if i + 1 < len(pattern) and pattern[i + 1] == "*": + # Collapse runs like ** or *** + while i + 1 < len(pattern) and pattern[i + 1] == "*": + i += 1 + if i + 1 < len(pattern) and pattern[i + 1] == "/": + regex_parts.append("(?:.*/)?") + i += 1 + else: + regex_parts.append(".*") + else: + regex_parts.append("[^/]*") + elif char == "?": + regex_parts.append("[^/]") + else: + regex_parts.append(re.escape(char)) + i += 1 + + return re.compile(f"^{''.join(regex_parts)}$") + + +def _gitignore_rule_matches( + rule: GitignorePattern, + rel_parts: tuple[str, ...], + is_dir: bool, +) -> bool: + """Check whether one parsed gitignore rule matches one path.""" + if rule.dir_only and not is_dir: + return False + if not rel_parts: + return False + + path_parts = (*rule.base_prefix, *rel_parts) + if not path_parts: + return False + + # No-slash patterns match only the basename at any depth. + # Ancestor directories are handled by `_matches_gitignore`. + if not rule.has_slash and not rule.anchored: + return fnmatch.fnmatchcase(path_parts[-1], rule.pattern) + + rel_path_str = "/".join(path_parts) + return bool(_compile_gitignore_regex(rule.pattern).fullmatch(rel_path_str)) + + +def _is_path_ignored_by_rules( + rel_parts: tuple[str, ...], + is_dir: bool, + gitignore_patterns: list[GitignorePattern], +) -> bool: + """Evaluate gitignore rules for a single path.""" + ignored = False + for rule in gitignore_patterns: + if _gitignore_rule_matches(rule, rel_parts, is_dir): + ignored = not rule.negated + return ignored def _matches_gitignore( rel_path_str: str, is_dir: bool, - gitignore_patterns: list[str], + gitignore_patterns: list[GitignorePattern], ) -> bool: """Check if a path matches gitignore patterns. Processes patterns in order; negation patterns (``!``) can un-ignore - previously matched paths. Also checks parent directories: if a - parent directory is ignored, the file inside it is ignored too. + previously matched paths. Parent directories are evaluated separately: + if any parent directory is ignored, the file inside remains ignored. """ - # Build list of paths to check: all parent dirs, then the file/dir itself - parts = rel_path_str.split("/") - paths_to_check = [("/".join(parts[:i]), True) for i in range(1, len(parts))] - paths_to_check.append((rel_path_str, is_dir)) + parts = tuple(part for part in rel_path_str.split("/") if part) + if not parts: + return False - ignored = False - for pattern in gitignore_patterns: - if pattern.startswith("!"): - neg_pattern = pattern[1:] - for check_path, check_is_dir in paths_to_check: - if _gitignore_pattern_matches(neg_pattern, check_path, check_is_dir): - ignored = False - break - else: - for check_path, check_is_dir in paths_to_check: - if _gitignore_pattern_matches(pattern, check_path, check_is_dir): - ignored = True - break - return ignored + # If any ancestor directory is ignored, this path is ignored too. + for i in range(1, len(parts)): + if _is_path_ignored_by_rules(parts[:i], is_dir=True, gitignore_patterns=gitignore_patterns): + return True + + return _is_path_ignored_by_rules(parts, is_dir, gitignore_patterns) -def load_gitignore_patterns(docs_folder: Path) -> list[str]: +def load_gitignore_patterns(docs_folder: Path) -> list[GitignorePattern]: """Load .gitignore patterns from the docs folder and its parents. Walks up from ``docs_folder`` to the filesystem root, collecting @@ -156,9 +230,9 @@ def load_gitignore_patterns(docs_folder: Path) -> list[str]: # Reverse so parent patterns come first (lower priority) gitignore_files.reverse() - all_patterns: list[str] = [] + all_patterns: list[GitignorePattern] = [] for gi in gitignore_files: - all_patterns.extend(_parse_gitignore(gi)) + all_patterns.extend(_parse_gitignore(gi, docs_folder)) return all_patterns @@ -166,7 +240,7 @@ def should_ignore_path( path: Path, base_folder: Path, *, - gitignore_patterns: list[str] | None = None, + gitignore_patterns: list[GitignorePattern] | None = None, ) -> bool: """Check if a path should be ignored during indexing. diff --git a/tests/rag/test_utils.py b/tests/rag/test_utils.py index a6603b5cb..2f4629b90 100644 --- a/tests/rag/test_utils.py +++ b/tests/rag/test_utils.py @@ -564,7 +564,7 @@ def test_gitignore_comment_and_blank_lines(self, tmp_path: Path) -> None: (tmp_path / ".gitignore").write_text("# comment\n\n*.tmp\n") patterns = _utils.load_gitignore_patterns(tmp_path) assert len(patterns) == 1 - assert patterns[0] == "*.tmp" + assert patterns[0].pattern == "*.tmp" def test_gitignore_nested_directory_pattern(self, tmp_path: Path) -> None: """Test pattern with path separator matches nested paths.""" @@ -576,6 +576,16 @@ def test_gitignore_nested_directory_pattern(self, tmp_path: Path) -> None: f.touch() assert _utils.should_ignore_path(f, tmp_path, gitignore_patterns=patterns) + def test_gitignore_nested_directory_pattern_no_overmatch(self, tmp_path: Path) -> None: + """Test pattern with slash does not over-match deeper directories.""" + (tmp_path / ".gitignore").write_text("logs/*.log\n") + patterns = _utils.load_gitignore_patterns(tmp_path) + nested = tmp_path / "logs" / "sub" + nested.mkdir(parents=True) + f = nested / "app.log" + f.touch() + assert not _utils.should_ignore_path(f, tmp_path, gitignore_patterns=patterns) + def test_gitignore_double_star_pattern(self, tmp_path: Path) -> None: """Test that ** matches across directories.""" (tmp_path / ".gitignore").write_text("**/test_output\n") @@ -597,7 +607,36 @@ def test_parent_gitignore_is_loaded(self, tmp_path: Path) -> None: subdir = tmp_path / "docs" subdir.mkdir() patterns = _utils.load_gitignore_patterns(subdir) - assert "*.bak" in patterns + assert any(p.pattern == "*.bak" for p in patterns) + + def test_parent_rooted_pattern_anchored_at_parent(self, tmp_path: Path) -> None: + """Test rooted parent pattern stays rooted to parent gitignore directory.""" + (tmp_path / ".gitignore").write_text("/foo\n") + subdir = tmp_path / "docs" + subdir.mkdir() + patterns = _utils.load_gitignore_patterns(subdir) + f = subdir / "foo" + f.touch() + assert not _utils.should_ignore_path(f, subdir, gitignore_patterns=patterns) + + def test_gitignore_slash_pattern_is_not_suffix_match(self, tmp_path: Path) -> None: + """Test slash patterns are rooted to gitignore directory, not suffix-matched.""" + (tmp_path / ".gitignore").write_text("foo/bar\n") + patterns = _utils.load_gitignore_patterns(tmp_path) + f = tmp_path / "x" / "foo" / "bar" + f.parent.mkdir(parents=True) + f.touch() + assert not _utils.should_ignore_path(f, tmp_path, gitignore_patterns=patterns) + + def test_negation_cannot_reinclude_file_in_ignored_directory(self, tmp_path: Path) -> None: + """Test !file does not re-include when parent directory remains ignored.""" + (tmp_path / ".gitignore").write_text("logs/\n!logs/keep.txt\n") + patterns = _utils.load_gitignore_patterns(tmp_path) + logs = tmp_path / "logs" + logs.mkdir() + f = logs / "keep.txt" + f.touch() + assert _utils.should_ignore_path(f, tmp_path, gitignore_patterns=patterns) def test_gitignore_no_patterns_means_no_extra_ignoring(self, tmp_path: Path) -> None: """Test that passing None patterns behaves like no gitignore.""" From 33784423eaea331cc9861cde8e95dc258a67bbda Mon Sep 17 00:00:00 2001 From: Bas Nijholt Date: Wed, 11 Feb 2026 21:41:19 -0800 Subject: [PATCH 3/4] fix(rag): scope gitignore loading to repo root --- agent_cli/rag/_utils.py | 30 ++++++++++++++++++++++++++---- tests/rag/test_utils.py | 32 +++++++++++++++++++++++++++++++- 2 files changed, 57 insertions(+), 5 deletions(-) diff --git a/agent_cli/rag/_utils.py b/agent_cli/rag/_utils.py index 132c270ad..1d60491c8 100644 --- a/agent_cli/rag/_utils.py +++ b/agent_cli/rag/_utils.py @@ -208,20 +208,42 @@ def _matches_gitignore( return _is_path_ignored_by_rules(parts, is_dir, gitignore_patterns) +def _find_git_root(start: Path) -> Path | None: + """Find the nearest git repository root at or above ``start``.""" + current = start.resolve() + while True: + if (current / ".git").exists(): + return current + parent = current.parent + if parent == current: + return None + current = parent + + def load_gitignore_patterns(docs_folder: Path) -> list[GitignorePattern]: """Load .gitignore patterns from the docs folder and its parents. - Walks up from ``docs_folder`` to the filesystem root, collecting - ``.gitignore`` files. Patterns from parent directories are applied - first (lower priority), then patterns from directories closer to - ``docs_folder`` (higher priority), matching Git's behaviour. + Walks up from ``docs_folder`` to the git repository root, collecting + ``.gitignore`` files. If ``docs_folder`` is not inside a git repo, + only ``docs_folder/.gitignore`` is considered. + + Patterns from parent directories are applied first (lower priority), + then patterns from directories closer to ``docs_folder`` (higher priority), + matching Git's behaviour. """ gitignore_files: list[Path] = [] current = docs_folder.resolve() + git_root = _find_git_root(current) while True: candidate = current / ".gitignore" if candidate.is_file(): gitignore_files.append(candidate) + + if git_root is None: + break + if current == git_root: + break + parent = current.parent if parent == current: break diff --git a/tests/rag/test_utils.py b/tests/rag/test_utils.py index 2f4629b90..717b10fd5 100644 --- a/tests/rag/test_utils.py +++ b/tests/rag/test_utils.py @@ -602,13 +602,43 @@ def test_no_gitignore_file(self, tmp_path: Path) -> None: assert patterns == [] def test_parent_gitignore_is_loaded(self, tmp_path: Path) -> None: - """Test that .gitignore from parent directories is picked up.""" + """Test that parent .gitignore is loaded when inside a git repo.""" + (tmp_path / ".git").mkdir() (tmp_path / ".gitignore").write_text("*.bak\n") subdir = tmp_path / "docs" subdir.mkdir() patterns = _utils.load_gitignore_patterns(subdir) assert any(p.pattern == "*.bak" for p in patterns) + def test_parent_gitignore_not_loaded_outside_repo_root(self, tmp_path: Path) -> None: + """Test that .gitignore above repository root is ignored.""" + outer = tmp_path / "outer" + repo_root = outer / "repo" + docs = repo_root / "docs" + docs.mkdir(parents=True) + (repo_root / ".git").mkdir() + (outer / ".gitignore").write_text("*.foo\n") + + patterns = _utils.load_gitignore_patterns(docs) + f = docs / "a.foo" + f.touch() + assert not _utils.should_ignore_path(f, docs, gitignore_patterns=patterns) + + def test_not_in_git_repo_only_loads_docs_gitignore(self, tmp_path: Path) -> None: + """Test that parent .gitignore is ignored when not in a git repo.""" + (tmp_path / ".gitignore").write_text("*.foo\n") + docs = tmp_path / "docs" + docs.mkdir() + (docs / ".gitignore").write_text("*.bar\n") + + patterns = _utils.load_gitignore_patterns(docs) + foo = docs / "x.foo" + foo.touch() + bar = docs / "x.bar" + bar.touch() + assert not _utils.should_ignore_path(foo, docs, gitignore_patterns=patterns) + assert _utils.should_ignore_path(bar, docs, gitignore_patterns=patterns) + def test_parent_rooted_pattern_anchored_at_parent(self, tmp_path: Path) -> None: """Test rooted parent pattern stays rooted to parent gitignore directory.""" (tmp_path / ".gitignore").write_text("/foo\n") From dc71dbe11d9181c296b71922e75590f7868d2eff Mon Sep 17 00:00:00 2001 From: Bas Nijholt Date: Wed, 11 Feb 2026 21:48:31 -0800 Subject: [PATCH 4/4] refactor(rag): remove test-specific ignore filter wrapper --- agent_cli/rag/_indexer.py | 8 +++----- tests/rag/test_indexer.py | 12 +++++++++--- 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/agent_cli/rag/_indexer.py b/agent_cli/rag/_indexer.py index e35589c89..1695c4ed9 100644 --- a/agent_cli/rag/_indexer.py +++ b/agent_cli/rag/_indexer.py @@ -3,7 +3,6 @@ from __future__ import annotations import logging -from functools import partial, update_wrapper from typing import TYPE_CHECKING from agent_cli.core.watch import watch_directory @@ -28,10 +27,9 @@ async def watch_docs( """Watch docs folder for changes and update index asynchronously.""" LOGGER.info("📁 Watching folder: %s", docs_folder) gitignore_patterns = load_gitignore_patterns(docs_folder) - ignore_filter = update_wrapper( - partial(should_ignore_path, gitignore_patterns=gitignore_patterns), - should_ignore_path, - ) + + def ignore_filter(path: Path, base_folder: Path) -> bool: + return should_ignore_path(path, base_folder, gitignore_patterns=gitignore_patterns) await watch_directory( docs_folder, diff --git a/tests/rag/test_indexer.py b/tests/rag/test_indexer.py index cedbb74e1..0981039dc 100644 --- a/tests/rag/test_indexer.py +++ b/tests/rag/test_indexer.py @@ -57,12 +57,17 @@ async def fake_watch_directory(_root: Path, handler: Any, **_kwargs) -> None: # @pytest.mark.asyncio async def test_watch_docs_passes_ignore_filter(tmp_path: Path) -> None: - """Test that watch_docs passes the should_ignore_path filter to watch_directory.""" + """Test that watch_docs passes a configured ignore filter to watch_directory.""" mock_collection = MagicMock() docs_folder = tmp_path / "docs" docs_folder.mkdir() + (docs_folder / ".gitignore").write_text("*.log\n") file_hashes: dict[str, str] = {} file_mtimes: dict[str, float] = {} + debug_log = docs_folder / "debug.log" + debug_log.touch() + readme = docs_folder / "readme.md" + readme.touch() async def fake_watch_directory( _root: Path, @@ -71,9 +76,10 @@ async def fake_watch_directory( ignore_filter: Any = None, **_kwargs: Any, ) -> None: - # Verify ignore_filter is provided and is the should_ignore_path function + # Verify ignore_filter is provided and applies loaded .gitignore rules. assert ignore_filter is not None - assert ignore_filter.__name__ == "should_ignore_path" + assert ignore_filter(debug_log, docs_folder) + assert not ignore_filter(readme, docs_folder) with patch( "agent_cli.rag._indexer.watch_directory",