diff --git a/agent_cli/rag/_indexer.py b/agent_cli/rag/_indexer.py index a11ef0c4d..1695c4ed9 100644 --- a/agent_cli/rag/_indexer.py +++ b/agent_cli/rag/_indexer.py @@ -7,7 +7,7 @@ from agent_cli.core.watch import watch_directory from agent_cli.rag._indexing import index_file, remove_file -from agent_cli.rag._utils import should_ignore_path +from agent_cli.rag._utils import load_gitignore_patterns, should_ignore_path if TYPE_CHECKING: from pathlib import Path @@ -26,6 +26,10 @@ async def watch_docs( ) -> None: """Watch docs folder for changes and update index asynchronously.""" LOGGER.info("📁 Watching folder: %s", docs_folder) + gitignore_patterns = load_gitignore_patterns(docs_folder) + + def ignore_filter(path: Path, base_folder: Path) -> bool: + return should_ignore_path(path, base_folder, gitignore_patterns=gitignore_patterns) await watch_directory( docs_folder, @@ -37,7 +41,7 @@ async def watch_docs( file_hashes, file_mtimes, ), - ignore_filter=should_ignore_path, + ignore_filter=ignore_filter, ) diff --git a/agent_cli/rag/_indexing.py b/agent_cli/rag/_indexing.py index a36da844a..d7d02bdb8 100644 --- a/agent_cli/rag/_indexing.py +++ b/agent_cli/rag/_indexing.py @@ -8,7 +8,13 @@ from typing import TYPE_CHECKING from agent_cli.rag._store import delete_by_file_path, get_all_metadata, upsert_docs -from agent_cli.rag._utils import chunk_text, get_file_hash, load_document_text, should_ignore_path +from agent_cli.rag._utils import ( + chunk_text, + get_file_hash, + load_document_text, + load_gitignore_patterns, + should_ignore_path, +) from agent_cli.rag.models import DocMetadata if TYPE_CHECKING: @@ -172,9 +178,17 @@ def initial_index( processed_files = [] removed_files = [] - # Gather all files first, excluding hidden and common development directories + # Load .gitignore patterns once for the entire scan + gitignore_patterns = load_gitignore_patterns(docs_folder) + if gitignore_patterns: + LOGGER.info("📋 Loaded %d .gitignore patterns", len(gitignore_patterns)) + 
+ # Gather all files first, excluding hidden, common dev dirs, and gitignored paths all_files = [ - p for p in docs_folder.rglob("*") if p.is_file() and not should_ignore_path(p, docs_folder) + p + for p in docs_folder.rglob("*") + if p.is_file() + and not should_ignore_path(p, docs_folder, gitignore_patterns=gitignore_patterns) ] # 1. Index Existing Files in Parallel diff --git a/agent_cli/rag/_utils.py b/agent_cli/rag/_utils.py index a6d14891f..1d60491c8 100644 --- a/agent_cli/rag/_utils.py +++ b/agent_cli/rag/_utils.py @@ -2,8 +2,12 @@ from __future__ import annotations +import fnmatch import hashlib import logging +import re +from dataclasses import dataclass +from functools import lru_cache from typing import TYPE_CHECKING if TYPE_CHECKING: @@ -33,7 +37,233 @@ ) -def should_ignore_path(path: Path, base_folder: Path) -> bool: +@dataclass(frozen=True) +class GitignorePattern: + """A normalized gitignore pattern plus the source directory context.""" + + pattern: str + negated: bool + dir_only: bool + anchored: bool + has_slash: bool + base_prefix: tuple[str, ...] + + +def _normalize_gitignore_line(line: str) -> tuple[bool, str] | None: + """Normalize one .gitignore line. + + Returns: + ``None`` if the line should be ignored, otherwise ``(negated, pattern)``. 
+ + """ + line = line.strip() + if not line: + return None + if line.startswith((r"\#", r"\!")): + return False, line[1:] + if line.startswith("#"): + return None + + negated = line.startswith("!") + if negated: + line = line[1:] + if not line: + return None + return negated, line + + +def _parse_gitignore(gitignore_path: Path, docs_folder: Path) -> list[GitignorePattern]: + """Parse one .gitignore file into normalized rule objects.""" + try: + text = gitignore_path.read_text(errors="ignore") + except OSError: + return [] + + try: + base_prefix = docs_folder.resolve().relative_to(gitignore_path.parent.resolve()).parts + except ValueError: + return [] + + patterns: list[GitignorePattern] = [] + for raw_line in text.splitlines(): + normalized = _normalize_gitignore_line(raw_line) + if normalized is None: + continue + negated, line = normalized + + dir_only = line.endswith("/") + if dir_only: + line = line.rstrip("/") + + anchored = line.startswith("/") + if anchored: + line = line.lstrip("/") + + if not line: + continue + + patterns.append( + GitignorePattern( + pattern=line, + negated=negated, + dir_only=dir_only, + anchored=anchored, + has_slash="/" in line, + base_prefix=base_prefix, + ), + ) + return patterns + + +@lru_cache(maxsize=512) +def _compile_gitignore_regex(pattern: str) -> re.Pattern[str]: + """Compile a gitignore-like pattern into a regex. 
+ + This keeps the key semantics needed here: + - ``*`` does not cross path separators + - ``**`` may cross path separators + """ + regex_parts: list[str] = [] + i = 0 + while i < len(pattern): + char = pattern[i] + if char == "*": + if i + 1 < len(pattern) and pattern[i + 1] == "*": + # Collapse runs like ** or *** + while i + 1 < len(pattern) and pattern[i + 1] == "*": + i += 1 + if i + 1 < len(pattern) and pattern[i + 1] == "/": + regex_parts.append("(?:.*/)?") + i += 1 + else: + regex_parts.append(".*") + else: + regex_parts.append("[^/]*") + elif char == "?": + regex_parts.append("[^/]") + else: + regex_parts.append(re.escape(char)) + i += 1 + + return re.compile(f"^{''.join(regex_parts)}$") + + +def _gitignore_rule_matches( + rule: GitignorePattern, + rel_parts: tuple[str, ...], + is_dir: bool, +) -> bool: + """Check whether one parsed gitignore rule matches one path.""" + if rule.dir_only and not is_dir: + return False + if not rel_parts: + return False + + path_parts = (*rule.base_prefix, *rel_parts) + if not path_parts: + return False + + # No-slash patterns match only the basename at any depth. + # Ancestor directories are handled by `_matches_gitignore`. + if not rule.has_slash and not rule.anchored: + return fnmatch.fnmatchcase(path_parts[-1], rule.pattern) + + rel_path_str = "/".join(path_parts) + return bool(_compile_gitignore_regex(rule.pattern).fullmatch(rel_path_str)) + + +def _is_path_ignored_by_rules( + rel_parts: tuple[str, ...], + is_dir: bool, + gitignore_patterns: list[GitignorePattern], +) -> bool: + """Evaluate gitignore rules for a single path.""" + ignored = False + for rule in gitignore_patterns: + if _gitignore_rule_matches(rule, rel_parts, is_dir): + ignored = not rule.negated + return ignored + + +def _matches_gitignore( + rel_path_str: str, + is_dir: bool, + gitignore_patterns: list[GitignorePattern], +) -> bool: + """Check if a path matches gitignore patterns. 
+ + Processes patterns in order; negation patterns (``!``) can un-ignore + previously matched paths. Parent directories are evaluated separately: + if any parent directory is ignored, the file inside remains ignored. + """ + parts = tuple(part for part in rel_path_str.split("/") if part) + if not parts: + return False + + # If any ancestor directory is ignored, this path is ignored too. + for i in range(1, len(parts)): + if _is_path_ignored_by_rules(parts[:i], is_dir=True, gitignore_patterns=gitignore_patterns): + return True + + return _is_path_ignored_by_rules(parts, is_dir, gitignore_patterns) + + +def _find_git_root(start: Path) -> Path | None: + """Find the nearest git repository root at or above ``start``.""" + current = start.resolve() + while True: + if (current / ".git").exists(): + return current + parent = current.parent + if parent == current: + return None + current = parent + + +def load_gitignore_patterns(docs_folder: Path) -> list[GitignorePattern]: + """Load .gitignore patterns from the docs folder and its parents. + + Walks up from ``docs_folder`` to the git repository root, collecting + ``.gitignore`` files. If ``docs_folder`` is not inside a git repo, + only ``docs_folder/.gitignore`` is considered. + + Patterns from parent directories are applied first (lower priority), + then patterns from directories closer to ``docs_folder`` (higher priority), + matching Git's behaviour. 
+ """ + gitignore_files: list[Path] = [] + current = docs_folder.resolve() + git_root = _find_git_root(current) + while True: + candidate = current / ".gitignore" + if candidate.is_file(): + gitignore_files.append(candidate) + + if git_root is None: + break + if current == git_root: + break + + parent = current.parent + if parent == current: + break + current = parent + + # Reverse so parent patterns come first (lower priority) + gitignore_files.reverse() + + all_patterns: list[GitignorePattern] = [] + for gi in gitignore_files: + all_patterns.extend(_parse_gitignore(gi, docs_folder)) + return all_patterns + + +def should_ignore_path( + path: Path, + base_folder: Path, + *, + gitignore_patterns: list[GitignorePattern] | None = None, +) -> bool: """Check if a path should be ignored during indexing. Ignores: @@ -41,10 +271,13 @@ def should_ignore_path(path: Path, base_folder: Path) -> bool: - Common development directories (__pycache__, node_modules, venv, etc.) - .egg-info directories - OS metadata files (Thumbs.db) + - Paths matching .gitignore patterns (when provided) Args: path: The file path to check. base_folder: The base folder for computing relative paths. + gitignore_patterns: Pre-parsed gitignore patterns from + :func:`load_gitignore_patterns`. Returns: True if the path should be ignored, False otherwise. 
@@ -64,7 +297,17 @@ def should_ignore_path(path: Path, base_folder: Path) -> bool: return True # Check specific file patterns - return path.name in DEFAULT_IGNORE_FILES + if path.name in DEFAULT_IGNORE_FILES: + return True + + # Check gitignore patterns + if gitignore_patterns: + rel_path_str = "/".join(rel_parts) + is_dir = path.is_dir() + if _matches_gitignore(rel_path_str, is_dir, gitignore_patterns): + return True + + return False # Files to read as plain text directly (fast path) diff --git a/tests/rag/test_indexer.py b/tests/rag/test_indexer.py index cedbb74e1..0981039dc 100644 --- a/tests/rag/test_indexer.py +++ b/tests/rag/test_indexer.py @@ -57,12 +57,17 @@ async def fake_watch_directory(_root: Path, handler: Any, **_kwargs) -> None: # @pytest.mark.asyncio async def test_watch_docs_passes_ignore_filter(tmp_path: Path) -> None: - """Test that watch_docs passes the should_ignore_path filter to watch_directory.""" + """Test that watch_docs passes a configured ignore filter to watch_directory.""" mock_collection = MagicMock() docs_folder = tmp_path / "docs" docs_folder.mkdir() + (docs_folder / ".gitignore").write_text("*.log\n") file_hashes: dict[str, str] = {} file_mtimes: dict[str, float] = {} + debug_log = docs_folder / "debug.log" + debug_log.touch() + readme = docs_folder / "readme.md" + readme.touch() async def fake_watch_directory( _root: Path, @@ -71,9 +76,10 @@ async def fake_watch_directory( ignore_filter: Any = None, **_kwargs: Any, ) -> None: - # Verify ignore_filter is provided and is the should_ignore_path function + # Verify ignore_filter is provided and applies loaded .gitignore rules. 
class TestGitignoreSupport:
    """Tests for .gitignore pattern matching in should_ignore_path."""

    def test_gitignore_wildcard_pattern(self, tmp_path: Path) -> None:
        """Test that *.log pattern ignores log files."""
        (tmp_path / ".gitignore").write_text("*.log\n")
        patterns = _utils.load_gitignore_patterns(tmp_path)
        f = tmp_path / "debug.log"
        f.touch()
        assert _utils.should_ignore_path(f, tmp_path, gitignore_patterns=patterns)

    def test_gitignore_does_not_ignore_unmatched(self, tmp_path: Path) -> None:
        """Test that files not matching gitignore patterns are kept."""
        (tmp_path / ".gitignore").write_text("*.log\n")
        patterns = _utils.load_gitignore_patterns(tmp_path)
        f = tmp_path / "readme.md"
        f.touch()
        assert not _utils.should_ignore_path(f, tmp_path, gitignore_patterns=patterns)

    def test_gitignore_directory_pattern(self, tmp_path: Path) -> None:
        """Test that directory-only patterns (trailing /) work."""
        (tmp_path / ".gitignore").write_text("output/\n")
        patterns = _utils.load_gitignore_patterns(tmp_path)
        out = tmp_path / "output"
        out.mkdir()
        f = out / "result.txt"
        f.touch()
        assert _utils.should_ignore_path(f, tmp_path, gitignore_patterns=patterns)

    def test_gitignore_rooted_pattern(self, tmp_path: Path) -> None:
        """Test that rooted patterns (leading /) only match at the root."""
        (tmp_path / ".gitignore").write_text("/secret.txt\n")
        patterns = _utils.load_gitignore_patterns(tmp_path)
        # Root-level match
        f = tmp_path / "secret.txt"
        f.touch()
        assert _utils.should_ignore_path(f, tmp_path, gitignore_patterns=patterns)
        # Nested should NOT match
        subdir = tmp_path / "sub"
        subdir.mkdir()
        f2 = subdir / "secret.txt"
        f2.touch()
        assert not _utils.should_ignore_path(f2, tmp_path, gitignore_patterns=patterns)

    def test_gitignore_negation_pattern(self, tmp_path: Path) -> None:
        """Test that negation patterns (!) un-ignore files."""
        (tmp_path / ".gitignore").write_text("*.log\n!important.log\n")
        patterns = _utils.load_gitignore_patterns(tmp_path)
        f1 = tmp_path / "debug.log"
        f1.touch()
        assert _utils.should_ignore_path(f1, tmp_path, gitignore_patterns=patterns)
        f2 = tmp_path / "important.log"
        f2.touch()
        assert not _utils.should_ignore_path(f2, tmp_path, gitignore_patterns=patterns)

    def test_gitignore_comment_and_blank_lines(self, tmp_path: Path) -> None:
        """Test that comments and blank lines in .gitignore are skipped."""
        (tmp_path / ".gitignore").write_text("# comment\n\n*.tmp\n")
        patterns = _utils.load_gitignore_patterns(tmp_path)
        assert len(patterns) == 1
        assert patterns[0].pattern == "*.tmp"

    def test_gitignore_nested_directory_pattern(self, tmp_path: Path) -> None:
        """Test pattern with path separator matches nested paths."""
        (tmp_path / ".gitignore").write_text("logs/*.log\n")
        patterns = _utils.load_gitignore_patterns(tmp_path)
        logs = tmp_path / "logs"
        logs.mkdir()
        f = logs / "app.log"
        f.touch()
        assert _utils.should_ignore_path(f, tmp_path, gitignore_patterns=patterns)

    def test_gitignore_nested_directory_pattern_no_overmatch(self, tmp_path: Path) -> None:
        """Test pattern with slash does not over-match deeper directories."""
        (tmp_path / ".gitignore").write_text("logs/*.log\n")
        patterns = _utils.load_gitignore_patterns(tmp_path)
        nested = tmp_path / "logs" / "sub"
        nested.mkdir(parents=True)
        f = nested / "app.log"
        f.touch()
        assert not _utils.should_ignore_path(f, tmp_path, gitignore_patterns=patterns)

    def test_gitignore_double_star_pattern(self, tmp_path: Path) -> None:
        """Test that ** matches across directories."""
        (tmp_path / ".gitignore").write_text("**/test_output\n")
        patterns = _utils.load_gitignore_patterns(tmp_path)
        deep = tmp_path / "a" / "b" / "test_output"
        deep.mkdir(parents=True)
        f = deep / "result.txt"
        f.touch()
        assert _utils.should_ignore_path(f, tmp_path, gitignore_patterns=patterns)

    def test_no_gitignore_file(self, tmp_path: Path) -> None:
        """Test that missing .gitignore returns empty patterns."""
        patterns = _utils.load_gitignore_patterns(tmp_path)
        assert patterns == []

    def test_parent_gitignore_is_loaded(self, tmp_path: Path) -> None:
        """Test that parent .gitignore is loaded when inside a git repo."""
        (tmp_path / ".git").mkdir()
        (tmp_path / ".gitignore").write_text("*.bak\n")
        subdir = tmp_path / "docs"
        subdir.mkdir()
        patterns = _utils.load_gitignore_patterns(subdir)
        assert any(p.pattern == "*.bak" for p in patterns)

    def test_parent_gitignore_not_loaded_outside_repo_root(self, tmp_path: Path) -> None:
        """Test that .gitignore above repository root is ignored."""
        outer = tmp_path / "outer"
        repo_root = outer / "repo"
        docs = repo_root / "docs"
        docs.mkdir(parents=True)
        (repo_root / ".git").mkdir()
        (outer / ".gitignore").write_text("*.foo\n")

        patterns = _utils.load_gitignore_patterns(docs)
        f = docs / "a.foo"
        f.touch()
        assert not _utils.should_ignore_path(f, docs, gitignore_patterns=patterns)

    def test_not_in_git_repo_only_loads_docs_gitignore(self, tmp_path: Path) -> None:
        """Test that parent .gitignore is ignored when not in a git repo."""
        (tmp_path / ".gitignore").write_text("*.foo\n")
        docs = tmp_path / "docs"
        docs.mkdir()
        (docs / ".gitignore").write_text("*.bar\n")

        patterns = _utils.load_gitignore_patterns(docs)
        foo = docs / "x.foo"
        foo.touch()
        bar = docs / "x.bar"
        bar.touch()
        assert not _utils.should_ignore_path(foo, docs, gitignore_patterns=patterns)
        assert _utils.should_ignore_path(bar, docs, gitignore_patterns=patterns)

    def test_parent_rooted_pattern_anchored_at_parent(self, tmp_path: Path) -> None:
        """Test rooted parent pattern stays rooted to parent gitignore directory."""
        # Parent .gitignore files are only collected inside a git repository,
        # so mark tmp_path as the repo root. Without the marker no rule is
        # loaded and the final assertion would pass vacuously.
        (tmp_path / ".git").mkdir()
        (tmp_path / ".gitignore").write_text("/foo\n")
        subdir = tmp_path / "docs"
        subdir.mkdir()
        patterns = _utils.load_gitignore_patterns(subdir)
        # Guard against the vacuous case: the rooted parent rule must be present.
        assert any(p.pattern == "foo" and p.anchored for p in patterns)
        f = subdir / "foo"
        f.touch()
        assert not _utils.should_ignore_path(f, subdir, gitignore_patterns=patterns)

    def test_gitignore_slash_pattern_is_not_suffix_match(self, tmp_path: Path) -> None:
        """Test slash patterns are rooted to gitignore directory, not suffix-matched."""
        (tmp_path / ".gitignore").write_text("foo/bar\n")
        patterns = _utils.load_gitignore_patterns(tmp_path)
        f = tmp_path / "x" / "foo" / "bar"
        f.parent.mkdir(parents=True)
        f.touch()
        assert not _utils.should_ignore_path(f, tmp_path, gitignore_patterns=patterns)

    def test_negation_cannot_reinclude_file_in_ignored_directory(self, tmp_path: Path) -> None:
        """Test !file does not re-include when parent directory remains ignored."""
        (tmp_path / ".gitignore").write_text("logs/\n!logs/keep.txt\n")
        patterns = _utils.load_gitignore_patterns(tmp_path)
        logs = tmp_path / "logs"
        logs.mkdir()
        f = logs / "keep.txt"
        f.touch()
        assert _utils.should_ignore_path(f, tmp_path, gitignore_patterns=patterns)

    def test_gitignore_no_patterns_means_no_extra_ignoring(self, tmp_path: Path) -> None:
        """Test that None or empty patterns behave like no gitignore."""
        f = tmp_path / "normal.txt"
        f.touch()
        assert not _utils.should_ignore_path(f, tmp_path, gitignore_patterns=None)
        assert not _utils.should_ignore_path(f, tmp_path, gitignore_patterns=[])