Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
106 changes: 78 additions & 28 deletions src/copilot_usage/vscode_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -111,24 +111,42 @@ def __post_init__(self) -> None:
# ---------------------------------------------------------------------------
# Module-level discovery cache: candidate_root → _VSCodeDiscoveryCache.
# Avoids redundant multi-level glob traversals when the candidate root
# directory and its immediate child session directories have not changed
# (no sessions or window directories added or removed).
# directory and its most recently modified child have not changed.
# The root mtime check catches session-directory additions/removals,
# while the *newest_child* sentinel catches new window directories
# inside existing sessions — keeping steady-state cost at O(1) (two
# stat calls: root + sentinel child).
# NOTE: only changes under the cached newest child are detected by the
# sentinel; modifications to older session directories may go unnoticed
# until the root directory itself changes or the cache is cleared.
# ---------------------------------------------------------------------------


@dataclass(slots=True)
class _VSCodeDiscoveryCache:
"""Cached result of discover_vscode_logs for a given root directory.

*child_ids* stores the ``(name, file_identity)`` pairs of immediate
child directories under the candidate root. This allows the cache to
detect new or modified session sub-directories (e.g. a new ``window*``
directory) whose creation would not update the root directory's own
mtime.
*root_id* (``(st_mtime_ns, st_size)``) catches new or removed session
directories (child additions update parent mtime on Linux/macOS).
*newest_child_path* / *newest_child_id* store the most recently
modified immediate child at population time; re-stat'ing this single
sentinel on a hit detects new window directories added inside an
existing session. *child_ids* is recorded at population time and
retained for diagnostics but is not fully rescanned on cache hits.

**Limitation:** only changes under the cached newest session directory
are detected by the sentinel. If a different (older) session directory
is modified (e.g. a new ``window*/`` appears under a non-newest
session), ``root_id`` will still match and the sentinel stat will also
match, so the cache may return stale ``log_paths`` until the root
directory itself changes or the cache is cleared. This is an accepted
trade-off for O(1) steady-state cost.
"""

root_id: tuple[int, int] # (st_mtime_ns, st_size) of the logs root
child_ids: _ChildIds
newest_child_path: Path | None # most-recently-modified session dir
newest_child_id: tuple[int, int] | None # its identity at population
log_paths: tuple[Path, ...]


Expand Down Expand Up @@ -218,21 +236,43 @@ def discover_vscode_logs(base_path: Path | None = None) -> list[Path]:
return all_logs


def _newest_child_from_ids(
root: Path,
child_ids: _ChildIds,
) -> tuple[Path | None, tuple[int, int] | None]:
"""Return the path and identity of the most recently modified child.

Picks the child with the highest ``st_mtime_ns`` from *child_ids*.
Ties are broken deterministically by child name.
Returns ``(None, None)`` when *child_ids* is empty.
"""
if not child_ids:
return None, None
name, identity = max(child_ids, key=lambda item: (item[1][0], item[0]))
return root / name, identity


def _cached_discover_vscode_logs(base_path: Path | None) -> list[Path]:
"""Return discovered log paths, skipping glob when the root is unchanged.

Each candidate root directory is stat'd and its immediate child
directories are scanned via :func:`_scan_child_ids` (one
``os.scandir`` plus one ``DirEntry.stat`` per child). On a cache
hit (same root ``(st_mtime_ns, st_size)`` *and* same child-directory
identities), the stored paths are reused without re-running the
multi-level glob. On a miss, the glob runs and the cache is
updated.

Steady-state cost per candidate root is therefore O(children) —
one ``stat`` on the root plus one ``scandir`` walk — which is still
significantly cheaper than the deep recursive glob it replaces, but
not constant-time.
Each candidate root directory is stat'd. On a cache hit (same
root ``(st_mtime_ns, st_size)`` *and* same identity of the most
recently modified child), the stored paths are reused without
scanning all child directories or running the multi-level glob.
On a miss, :func:`_scan_child_ids` runs and the glob executes to
repopulate the cache.

The root identity check catches session-directory additions/removals
(child additions update parent mtime on Linux/macOS). The
*newest_child* sentinel catches new window directories inside
existing sessions: adding ``window2/`` under a session dir updates
that session dir's mtime, which the sentinel stat detects.
Steady-state cost is O(1) — two ``stat()`` calls (root + sentinel).

**Limitation:** the sentinel only tracks the most recently modified
child at cache-population time. Changes under a different (older)
session directory will not be detected until the root directory
itself changes or the cache is cleared.

A non-directory candidate is skipped with an empty result, matching
the behaviour of :func:`discover_vscode_logs`.
Expand All @@ -256,19 +296,29 @@ def _cached_discover_vscode_logs(base_path: Path | None) -> list[Path]:
)
continue
root_id: tuple[int, int] = (st.st_mtime_ns, st.st_size)
child_ids = _scan_child_ids(candidate)
cached = _VSCODE_DISCOVERY_CACHE.get(candidate)
if (
cached is not None
and cached.root_id == root_id
and cached.child_ids == child_ids
and (
cached.newest_child_path is None
or safe_file_identity(cached.newest_child_path)
== cached.newest_child_id
)
):
# Root + sentinel child unchanged — reuse cached log paths.
result.extend(cached.log_paths)
continue
# Cache miss — run glob
# Cache miss or root/sentinel changed — scan children and run glob
child_ids = _scan_child_ids(candidate)
newest_path, newest_id = _newest_child_from_ids(candidate, child_ids)
found = sorted(candidate.glob(_GLOB_PATTERN))
_VSCODE_DISCOVERY_CACHE[candidate] = _VSCodeDiscoveryCache(
root_id=root_id, child_ids=child_ids, log_paths=tuple(found)
root_id=root_id,
child_ids=child_ids,
newest_child_path=newest_path,
newest_child_id=newest_id,
log_paths=tuple(found),
)
result.extend(found)
result.sort()
Expand Down Expand Up @@ -543,11 +593,11 @@ def get_vscode_summary(base_path: Path | None = None) -> VSCodeLogSummary:

Discovery uses :func:`_cached_discover_vscode_logs` to avoid
redundant multi-level glob traversals when the candidate root
directories and their immediate child session directories have not
changed on disk. The steady-state discovery cost is O(children)
per candidate root (one ``stat`` on the root plus one ``scandir``
walk), not constant-time — but still much cheaper than the deep
recursive glob it replaces.
directories and their most recently modified child session
directories have not changed on disk. The steady-state discovery
cost is O(1) per candidate root (two ``stat`` calls: one on the root
directory and one on the sentinel child), which is much cheaper than
the deep recursive glob it replaces.

Uses :func:`_get_cached_vscode_requests` so that unchanged log files
are not re-parsed on repeated invocations. A module-level summary
Expand Down
51 changes: 49 additions & 2 deletions tests/copilot_usage/test_vscode_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -2153,6 +2153,8 @@ def test_cache_invalidated_on_root_mtime_change(self, tmp_path: Path) -> None:
_VSCODE_DISCOVERY_CACHE[tmp_path] = _VSCodeDiscoveryCache(
root_id=(cached.root_id[0] + 1_000_000_000, cached.root_id[1]),
child_ids=cached.child_ids,
newest_child_path=cached.newest_child_path,
newest_child_id=cached.newest_child_id,
log_paths=cached.log_paths,
)

Expand Down Expand Up @@ -2192,7 +2194,13 @@ def test_discovery_cache_populated(self, tmp_path: Path) -> None:
def test_new_window_under_existing_session_triggers_rediscovery(
self, tmp_path: Path
) -> None:
"""Adding a window dir under an existing session invalidates the cache."""
"""Adding a window dir under an existing session invalidates the cache.

The discovery cache stores a sentinel (the most recently modified
session directory). When a new ``window*`` directory is created
under that session, the session directory's mtime changes, causing
the sentinel check to miss and trigger a full re-glob.
"""
session_dir = tmp_path / "20260313T211400"
log_dir = session_dir / "window1" / "exthost" / "GitHub.copilot-chat"
log_dir.mkdir(parents=True)
Expand Down Expand Up @@ -2223,7 +2231,8 @@ def _counting_glob(
)

s2 = get_vscode_summary(tmp_path)
assert glob_call_count == 2 # re-globbed due to child change
# Session dir mtime changed → sentinel miss → re-globbed
assert glob_call_count == 2
assert s2.total_requests == 2

def test_non_directory_candidate_skipped(self, tmp_path: Path) -> None:
Expand Down Expand Up @@ -2289,3 +2298,41 @@ def test_missing_candidate_skipped(self, tmp_path: Path) -> None:
result = _cached_discover_vscode_logs(missing)
assert result == []
assert missing not in _VSCODE_DISCOVERY_CACHE


class TestCachedDiscoverSkipsChildScanOnHit:
"""Verify _scan_child_ids is not called on root_id + sentinel cache hits.

After a warm call populates _VSCODE_DISCOVERY_CACHE, a subsequent
call with an unchanged root *and* unchanged sentinel child must
short-circuit without invoking _scan_child_ids — avoiding
O(n_children) stat syscalls on every steady-state invocation.
"""

def test_cached_discover_skips_child_scan_on_root_id_hit(
self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch
) -> None:
"""_scan_child_ids must not be called when root_id matches the cache."""
log_dir = (
tmp_path / "20260313T211400" / "window1" / "exthost" / "GitHub.copilot-chat"
)
log_dir.mkdir(parents=True)
(log_dir / "GitHub Copilot Chat.log").write_text(_make_log_line(req_idx=0))

# Warm the cache with the real implementation.
_cached_discover_vscode_logs(tmp_path)
assert tmp_path in _VSCODE_DISCOVERY_CACHE

# Spy on _scan_child_ids for subsequent calls.
import copilot_usage.vscode_parser as _mod

scan_calls: list[Path] = []
original = _mod._scan_child_ids # pyright: ignore[reportPrivateUsage]

def spy(root: Path) -> frozenset[tuple[str, tuple[int, int]]]:
scan_calls.append(root)
return original(root)

monkeypatch.setattr(_mod, "_scan_child_ids", spy)
_cached_discover_vscode_logs(tmp_path)
assert scan_calls == [], "child scan must be skipped on root_id cache hit"
Loading