Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 6 additions & 2 deletions agent_cli/rag/_indexer.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@

from agent_cli.core.watch import watch_directory
from agent_cli.rag._indexing import index_file, remove_file
from agent_cli.rag._utils import should_ignore_path
from agent_cli.rag._utils import load_gitignore_patterns, should_ignore_path

if TYPE_CHECKING:
from pathlib import Path
Expand All @@ -26,6 +26,10 @@ async def watch_docs(
) -> None:
"""Watch docs folder for changes and update index asynchronously."""
LOGGER.info("📁 Watching folder: %s", docs_folder)
gitignore_patterns = load_gitignore_patterns(docs_folder)

def ignore_filter(path: Path, base_folder: Path) -> bool:
return should_ignore_path(path, base_folder, gitignore_patterns=gitignore_patterns)

await watch_directory(
docs_folder,
Expand All @@ -37,7 +41,7 @@ async def watch_docs(
file_hashes,
file_mtimes,
),
ignore_filter=should_ignore_path,
ignore_filter=ignore_filter,
)


Expand Down
20 changes: 17 additions & 3 deletions agent_cli/rag/_indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,13 @@
from typing import TYPE_CHECKING

from agent_cli.rag._store import delete_by_file_path, get_all_metadata, upsert_docs
from agent_cli.rag._utils import chunk_text, get_file_hash, load_document_text, should_ignore_path
from agent_cli.rag._utils import (
chunk_text,
get_file_hash,
load_document_text,
load_gitignore_patterns,
should_ignore_path,
)
from agent_cli.rag.models import DocMetadata

if TYPE_CHECKING:
Expand Down Expand Up @@ -172,9 +178,17 @@ def initial_index(
processed_files = []
removed_files = []

# Gather all files first, excluding hidden and common development directories
# Load .gitignore patterns once for the entire scan
gitignore_patterns = load_gitignore_patterns(docs_folder)
if gitignore_patterns:
LOGGER.info("📋 Loaded %d .gitignore patterns", len(gitignore_patterns))

# Gather all files first, excluding hidden, common dev dirs, and gitignored paths
all_files = [
p for p in docs_folder.rglob("*") if p.is_file() and not should_ignore_path(p, docs_folder)
p
for p in docs_folder.rglob("*")
if p.is_file()
and not should_ignore_path(p, docs_folder, gitignore_patterns=gitignore_patterns)
]

# 1. Index Existing Files in Parallel
Expand Down
247 changes: 245 additions & 2 deletions agent_cli/rag/_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,12 @@

from __future__ import annotations

import fnmatch
import hashlib
import logging
import re
from dataclasses import dataclass
from functools import lru_cache
from typing import TYPE_CHECKING

if TYPE_CHECKING:
Expand Down Expand Up @@ -33,18 +37,247 @@
)


def should_ignore_path(path: Path, base_folder: Path) -> bool:
@dataclass(frozen=True)
class GitignorePattern:
    """One parsed ``.gitignore`` rule together with its source directory context.

    Instances are immutable (frozen dataclass), so they compare and hash by value.
    """

    # Pattern text with the "!" prefix, leading "/" and trailing "/" removed.
    pattern: str
    # True when the rule re-includes (un-ignores) the paths it matches.
    negated: bool
    # True when the source line ended with "/": the rule applies to directories only.
    dir_only: bool
    # True when the source line started with "/".
    anchored: bool
    # True when the stripped pattern still contains a "/".
    has_slash: bool
    # Path components leading from the .gitignore's directory to the docs folder.
    base_prefix: tuple[str, ...]


def _normalize_gitignore_line(line: str) -> tuple[bool, str] | None:
"""Normalize one .gitignore line.

Returns:
``None`` if the line should be ignored, otherwise ``(negated, pattern)``.

"""
line = line.strip()
if not line:
return None
if line.startswith((r"\#", r"\!")):
return False, line[1:]
if line.startswith("#"):
return None

negated = line.startswith("!")
if negated:
line = line[1:]
if not line:
return None
return negated, line


def _parse_gitignore(gitignore_path: Path, docs_folder: Path) -> list[GitignorePattern]:
    """Parse one .gitignore file into normalized rule objects.

    Args:
        gitignore_path: The ``.gitignore`` file to read.
        docs_folder: The folder being indexed. Its location relative to the
            directory containing ``gitignore_path`` is stored on every rule
            as ``base_prefix``.

    Returns:
        The parsed rules in file order. The list is empty when the file
        cannot be read or when ``docs_folder`` does not live inside the
        directory that contains the ``.gitignore``.

    """
    try:
        # Decode explicitly instead of relying on the platform locale, so
        # non-ASCII patterns parse the same everywhere. "utf-8-sig" also drops
        # a leading byte-order mark that would otherwise be glued to the
        # first pattern; undecodable bytes are skipped as before.
        text = gitignore_path.read_text(encoding="utf-8-sig", errors="ignore")
    except OSError:
        return []

    try:
        base_prefix = docs_folder.resolve().relative_to(gitignore_path.parent.resolve()).parts
    except ValueError:
        # docs_folder is not below this .gitignore's directory.
        return []

    patterns: list[GitignorePattern] = []
    for raw_line in text.splitlines():
        normalized = _normalize_gitignore_line(raw_line)
        if normalized is None:
            continue
        negated, line = normalized

        # A trailing "/" restricts the rule to directories.
        dir_only = line.endswith("/")
        if dir_only:
            line = line.rstrip("/")

        # A leading "/" anchors the rule to the .gitignore's own directory.
        anchored = line.startswith("/")
        if anchored:
            line = line.lstrip("/")

        # Lines such as "/" leave nothing to match once the markers are removed.
        if not line:
            continue

        patterns.append(
            GitignorePattern(
                pattern=line,
                negated=negated,
                dir_only=dir_only,
                anchored=anchored,
                has_slash="/" in line,
                base_prefix=base_prefix,
            ),
        )
    return patterns


@lru_cache(maxsize=512)
def _compile_gitignore_regex(pattern: str) -> re.Pattern[str]:
"""Compile a gitignore-like pattern into a regex.

This keeps the key semantics needed here:
- ``*`` does not cross path separators
- ``**`` may cross path separators
"""
regex_parts: list[str] = []
i = 0
while i < len(pattern):
char = pattern[i]
if char == "*":
if i + 1 < len(pattern) and pattern[i + 1] == "*":
# Collapse runs like ** or ***
while i + 1 < len(pattern) and pattern[i + 1] == "*":
i += 1
if i + 1 < len(pattern) and pattern[i + 1] == "/":
regex_parts.append("(?:.*/)?")
i += 1
else:
regex_parts.append(".*")
else:
regex_parts.append("[^/]*")
elif char == "?":
regex_parts.append("[^/]")
else:
regex_parts.append(re.escape(char))
i += 1

return re.compile(f"^{''.join(regex_parts)}$")


def _gitignore_rule_matches(
rule: GitignorePattern,
rel_parts: tuple[str, ...],
is_dir: bool,
) -> bool:
"""Check whether one parsed gitignore rule matches one path."""
if rule.dir_only and not is_dir:
return False
if not rel_parts:
return False

path_parts = (*rule.base_prefix, *rel_parts)
if not path_parts:
return False

# No-slash patterns match only the basename at any depth.
# Ancestor directories are handled by `_matches_gitignore`.
if not rule.has_slash and not rule.anchored:
return fnmatch.fnmatchcase(path_parts[-1], rule.pattern)

rel_path_str = "/".join(path_parts)
return bool(_compile_gitignore_regex(rule.pattern).fullmatch(rel_path_str))


def _is_path_ignored_by_rules(
rel_parts: tuple[str, ...],
is_dir: bool,
gitignore_patterns: list[GitignorePattern],
) -> bool:
"""Evaluate gitignore rules for a single path."""
ignored = False
for rule in gitignore_patterns:
if _gitignore_rule_matches(rule, rel_parts, is_dir):
ignored = not rule.negated
return ignored


def _matches_gitignore(
rel_path_str: str,
is_dir: bool,
gitignore_patterns: list[GitignorePattern],
) -> bool:
"""Check if a path matches gitignore patterns.

Processes patterns in order; negation patterns (``!``) can un-ignore
previously matched paths. Parent directories are evaluated separately:
if any parent directory is ignored, the file inside remains ignored.
"""
parts = tuple(part for part in rel_path_str.split("/") if part)
if not parts:
return False

# If any ancestor directory is ignored, this path is ignored too.
for i in range(1, len(parts)):
if _is_path_ignored_by_rules(parts[:i], is_dir=True, gitignore_patterns=gitignore_patterns):
return True

return _is_path_ignored_by_rules(parts, is_dir, gitignore_patterns)


def _find_git_root(start: Path) -> Path | None:
"""Find the nearest git repository root at or above ``start``."""
current = start.resolve()
while True:
if (current / ".git").exists():
return current
parent = current.parent
if parent == current:
return None
current = parent


def load_gitignore_patterns(docs_folder: Path) -> list[GitignorePattern]:
    """Collect .gitignore rules that apply to ``docs_folder``.

    Every directory from ``docs_folder`` up to the enclosing git repository
    root is checked for a ``.gitignore`` file. When ``docs_folder`` is not
    inside a git repository, only ``docs_folder/.gitignore`` is considered.

    Rules from outer directories come first in the result (lower priority)
    and rules from directories nearer to ``docs_folder`` come last (higher
    priority), matching Git's behaviour.

    Args:
        docs_folder: The folder whose contents will be filtered.

    Returns:
        All parsed rules, ordered from lowest to highest priority.

    """
    docs_dir = docs_folder.resolve()
    repo_root = _find_git_root(docs_dir)

    # Directories to inspect, innermost first.
    if repo_root is None:
        search_dirs = [docs_dir]
    else:
        search_dirs = []
        for directory in (docs_dir, *docs_dir.parents):
            search_dirs.append(directory)
            if directory == repo_root:
                break

    # Keep only the .gitignore files that exist, outermost first, so that
    # rules closer to docs_folder are applied last.
    candidates = [directory / ".gitignore" for directory in search_dirs]
    gitignore_files = [candidate for candidate in candidates if candidate.is_file()]
    gitignore_files.reverse()

    collected: list[GitignorePattern] = []
    for gitignore_file in gitignore_files:
        collected.extend(_parse_gitignore(gitignore_file, docs_folder))
    return collected


def should_ignore_path(
path: Path,
base_folder: Path,
*,
gitignore_patterns: list[GitignorePattern] | None = None,
) -> bool:
"""Check if a path should be ignored during indexing.

Ignores:
- Any path component starting with '.' (hidden files/dirs)
- Common development directories (__pycache__, node_modules, venv, etc.)
- .egg-info directories
- OS metadata files (Thumbs.db)
- Paths matching .gitignore patterns (when provided)

Args:
path: The file path to check.
base_folder: The base folder for computing relative paths.
gitignore_patterns: Pre-parsed gitignore patterns from
:func:`load_gitignore_patterns`.

Returns:
True if the path should be ignored, False otherwise.
Expand All @@ -64,7 +297,17 @@ def should_ignore_path(path: Path, base_folder: Path) -> bool:
return True

# Check specific file patterns
return path.name in DEFAULT_IGNORE_FILES
if path.name in DEFAULT_IGNORE_FILES:
return True

# Check gitignore patterns
if gitignore_patterns:
rel_path_str = "/".join(rel_parts)
is_dir = path.is_dir()
if _matches_gitignore(rel_path_str, is_dir, gitignore_patterns):
return True

return False


# Files to read as plain text directly (fast path)
Expand Down
12 changes: 9 additions & 3 deletions tests/rag/test_indexer.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,12 +57,17 @@ async def fake_watch_directory(_root: Path, handler: Any, **_kwargs) -> None: #

@pytest.mark.asyncio
async def test_watch_docs_passes_ignore_filter(tmp_path: Path) -> None:
"""Test that watch_docs passes the should_ignore_path filter to watch_directory."""
"""Test that watch_docs passes a configured ignore filter to watch_directory."""
mock_collection = MagicMock()
docs_folder = tmp_path / "docs"
docs_folder.mkdir()
(docs_folder / ".gitignore").write_text("*.log\n")
file_hashes: dict[str, str] = {}
file_mtimes: dict[str, float] = {}
debug_log = docs_folder / "debug.log"
debug_log.touch()
readme = docs_folder / "readme.md"
readme.touch()

async def fake_watch_directory(
_root: Path,
Expand All @@ -71,9 +76,10 @@ async def fake_watch_directory(
ignore_filter: Any = None,
**_kwargs: Any,
) -> None:
# Verify ignore_filter is provided and is the should_ignore_path function
# Verify ignore_filter is provided and applies loaded .gitignore rules.
assert ignore_filter is not None
assert ignore_filter.__name__ == "should_ignore_path"
assert ignore_filter(debug_log, docs_folder)
assert not ignore_filter(readme, docs_folder)

with patch(
"agent_cli.rag._indexer.watch_directory",
Expand Down
Loading