diff --git a/src/copilot_usage/vscode_parser.py b/src/copilot_usage/vscode_parser.py index 25cbcd3..b275556 100644 --- a/src/copilot_usage/vscode_parser.py +++ b/src/copilot_usage/vscode_parser.py @@ -111,8 +111,14 @@ def __post_init__(self) -> None: # --------------------------------------------------------------------------- # Module-level discovery cache: candidate_root → _VSCodeDiscoveryCache. # Avoids redundant multi-level glob traversals when the candidate root -# directory and its immediate child session directories have not changed -# (no sessions or window directories added or removed). +# directory and its most recently modified child have not changed. +# The root mtime check catches session-directory additions/removals, +# while the *newest_child* sentinel catches new window directories +# inside existing sessions — keeping steady-state cost at O(1) (two +# stat calls: root + sentinel child). +# NOTE: only changes under the cached newest child are detected by the +# sentinel; modifications to older session directories may go unnoticed +# until the root directory itself changes or the cache is cleared. # --------------------------------------------------------------------------- @@ -120,15 +126,27 @@ def __post_init__(self) -> None: class _VSCodeDiscoveryCache: """Cached result of discover_vscode_logs for a given root directory. - *child_ids* stores the ``(name, file_identity)`` pairs of immediate - child directories under the candidate root. This allows the cache to - detect new or modified session sub-directories (e.g. a new ``window*`` - directory) whose creation would not update the root directory's own - mtime. + *root_id* (``(st_mtime_ns, st_size)``) catches new or removed session + directories (child additions update parent mtime on Linux/macOS). + *newest_child_path* / *newest_child_id* store the most recently + modified immediate child at population time; re-stat'ing this single + sentinel on a hit detects new window directories added inside an + existing session. *child_ids* is recorded at population time and + retained for diagnostics but is not fully rescanned on cache hits. + + **Limitation:** only changes under the cached newest session directory + are detected by the sentinel. If a different (older) session directory + is modified (e.g. a new ``window*/`` appears under a non-newest + session), ``root_id`` will still match and the sentinel stat will also + match, so the cache may return stale ``log_paths`` until the root + directory itself changes or the cache is cleared. This is an accepted + trade-off for O(1) steady-state cost. """ root_id: tuple[int, int] # (st_mtime_ns, st_size) of the logs root child_ids: _ChildIds + newest_child_path: Path | None # most-recently-modified session dir + newest_child_id: tuple[int, int] | None # its identity at population log_paths: tuple[Path, ...] @@ -218,21 +236,43 @@ def discover_vscode_logs(base_path: Path | None = None) -> list[Path]: return all_logs +def _newest_child_from_ids( + root: Path, + child_ids: _ChildIds, +) -> tuple[Path | None, tuple[int, int] | None]: + """Return the path and identity of the most recently modified child. + + Picks the child with the highest ``st_mtime_ns`` from *child_ids*. + Ties are broken deterministically by child name. + Returns ``(None, None)`` when *child_ids* is empty. + """ + if not child_ids: + return None, None + name, identity = max(child_ids, key=lambda item: (item[1][0], item[0])) + return root / name, identity + + def _cached_discover_vscode_logs(base_path: Path | None) -> list[Path]: """Return discovered log paths, skipping glob when the root is unchanged. - Each candidate root directory is stat'd and its immediate child - directories are scanned via :func:`_scan_child_ids` (one - ``os.scandir`` plus one ``DirEntry.stat`` per child). On a cache - hit (same root ``(st_mtime_ns, st_size)`` *and* same child-directory - identities), the stored paths are reused without re-running the - multi-level glob. On a miss, the glob runs and the cache is - updated. - - Steady-state cost per candidate root is therefore O(children) — - one ``stat`` on the root plus one ``scandir`` walk — which is still - significantly cheaper than the deep recursive glob it replaces, but - not constant-time. + Each candidate root directory is stat'd. On a cache hit (same + root ``(st_mtime_ns, st_size)`` *and* same identity of the most + recently modified child), the stored paths are reused without + scanning all child directories or running the multi-level glob. + On a miss, :func:`_scan_child_ids` runs and the glob executes to + repopulate the cache. + + The root identity check catches session-directory additions/removals + (child additions update parent mtime on Linux/macOS). The + *newest_child* sentinel catches new window directories inside + existing sessions: adding ``window2/`` under a session dir updates + that session dir's mtime, which the sentinel stat detects. + Steady-state cost is O(1) — two ``stat()`` calls (root + sentinel). + + **Limitation:** the sentinel only tracks the most recently modified + child at cache-population time. Changes under a different (older) + session directory will not be detected until the root directory + itself changes or the cache is cleared. A non-directory candidate is skipped with an empty result, matching the behaviour of :func:`discover_vscode_logs`. @@ -256,19 +296,29 @@ def _cached_discover_vscode_logs(base_path: Path | None) -> list[Path]: ) continue root_id: tuple[int, int] = (st.st_mtime_ns, st.st_size) - child_ids = _scan_child_ids(candidate) cached = _VSCODE_DISCOVERY_CACHE.get(candidate) if ( cached is not None and cached.root_id == root_id - and cached.child_ids == child_ids + and ( + cached.newest_child_path is None + or safe_file_identity(cached.newest_child_path) + == cached.newest_child_id + ) ): + # Root + sentinel child unchanged — reuse cached log paths. result.extend(cached.log_paths) continue - # Cache miss — run glob + # Cache miss or root/sentinel changed — scan children and run glob + child_ids = _scan_child_ids(candidate) + newest_path, newest_id = _newest_child_from_ids(candidate, child_ids) found = sorted(candidate.glob(_GLOB_PATTERN)) _VSCODE_DISCOVERY_CACHE[candidate] = _VSCodeDiscoveryCache( - root_id=root_id, child_ids=child_ids, log_paths=tuple(found) + root_id=root_id, + child_ids=child_ids, + newest_child_path=newest_path, + newest_child_id=newest_id, + log_paths=tuple(found), ) result.extend(found) result.sort() @@ -543,11 +593,11 @@ def get_vscode_summary(base_path: Path | None = None) -> VSCodeLogSummary: Discovery uses :func:`_cached_discover_vscode_logs` to avoid redundant multi-level glob traversals when the candidate root - directories and their immediate child session directories have not - changed on disk. The steady-state discovery cost is O(children) - per candidate root (one ``stat`` on the root plus one ``scandir`` - walk), not constant-time — but still much cheaper than the deep - recursive glob it replaces. + directories and their most recently modified child session + directories have not changed on disk. The steady-state discovery + cost is O(1) per candidate root (two ``stat`` calls: one on the root + directory and one on the sentinel child), which is much cheaper than + the deep recursive glob it replaces. Uses :func:`_get_cached_vscode_requests` so that unchanged log files are not re-parsed on repeated invocations. A module-level summary diff --git a/tests/copilot_usage/test_vscode_parser.py b/tests/copilot_usage/test_vscode_parser.py index 3122b10..a33d224 100644 --- a/tests/copilot_usage/test_vscode_parser.py +++ b/tests/copilot_usage/test_vscode_parser.py @@ -2153,6 +2153,8 @@ def test_cache_invalidated_on_root_mtime_change(self, tmp_path: Path) -> None: _VSCODE_DISCOVERY_CACHE[tmp_path] = _VSCodeDiscoveryCache( root_id=(cached.root_id[0] + 1_000_000_000, cached.root_id[1]), child_ids=cached.child_ids, + newest_child_path=cached.newest_child_path, + newest_child_id=cached.newest_child_id, log_paths=cached.log_paths, ) @@ -2192,7 +2194,13 @@ def test_discovery_cache_populated(self, tmp_path: Path) -> None: def test_new_window_under_existing_session_triggers_rediscovery( self, tmp_path: Path ) -> None: - """Adding a window dir under an existing session invalidates the cache.""" + """Adding a window dir under an existing session invalidates the cache. + + The discovery cache stores a sentinel (the most recently modified + session directory). When a new ``window*`` directory is created + under that session, the session directory's mtime changes, causing + the sentinel check to miss and trigger a full re-glob. + """ session_dir = tmp_path / "20260313T211400" log_dir = session_dir / "window1" / "exthost" / "GitHub.copilot-chat" log_dir.mkdir(parents=True) @@ -2223,7 +2231,8 @@ def _counting_glob( ) s2 = get_vscode_summary(tmp_path) - assert glob_call_count == 2 # re-globbed due to child change + # Session dir mtime changed → sentinel miss → re-globbed + assert glob_call_count == 2 assert s2.total_requests == 2 def test_non_directory_candidate_skipped(self, tmp_path: Path) -> None: @@ -2289,3 +2298,41 @@ def test_missing_candidate_skipped(self, tmp_path: Path) -> None: result = _cached_discover_vscode_logs(missing) assert result == [] assert missing not in _VSCODE_DISCOVERY_CACHE + + +class TestCachedDiscoverSkipsChildScanOnHit: + """Verify _scan_child_ids is not called on root_id + sentinel cache hits. + + After a warm call populates _VSCODE_DISCOVERY_CACHE, a subsequent + call with an unchanged root *and* unchanged sentinel child must + short-circuit without invoking _scan_child_ids — avoiding + O(n_children) stat syscalls on every steady-state invocation. + """ + + def test_cached_discover_skips_child_scan_on_root_id_hit( + self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch + ) -> None: + """_scan_child_ids must not be called when root_id matches the cache.""" + log_dir = ( + tmp_path / "20260313T211400" / "window1" / "exthost" / "GitHub.copilot-chat" + ) + log_dir.mkdir(parents=True) + (log_dir / "GitHub Copilot Chat.log").write_text(_make_log_line(req_idx=0)) + + # Warm the cache with the real implementation. + _cached_discover_vscode_logs(tmp_path) + assert tmp_path in _VSCODE_DISCOVERY_CACHE + + # Spy on _scan_child_ids for subsequent calls. + import copilot_usage.vscode_parser as _mod + + scan_calls: list[Path] = [] + original = _mod._scan_child_ids # pyright: ignore[reportPrivateUsage] + + def spy(root: Path) -> frozenset[tuple[str, tuple[int, int]]]: + scan_calls.append(root) + return original(root) + + monkeypatch.setattr(_mod, "_scan_child_ids", spy) + _cached_discover_vscode_logs(tmp_path) + assert scan_calls == [], "child scan must be skipped on root_id cache hit"