From 111a7dd24bdd8eceff54d5412c9be46546deaf26 Mon Sep 17 00:00:00 2001 From: Sasa Junuzovic <44276455+microsasa@users.noreply.github.com> Date: Fri, 10 Apr 2026 03:16:35 -0700 Subject: [PATCH 1/3] perf(vscode_parser): check root_id before _scan_child_ids on cache hit (#897) Reorder _cached_discover_vscode_logs so the cached root_id (st_mtime_ns, st_size) is checked before calling _scan_child_ids. On a steady-state cache hit the child scan is now skipped entirely, reducing per-call cost from O(n_children) stat syscalls to a single stat on the root directory. On Linux (ext4/btrfs/overlayfs) and macOS (APFS), creating or removing a subdirectory always updates the parent directory's mtime, so root_id is sufficient to detect child-directory changes. _scan_child_ids still runs on every cache miss as an extra safety net. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/copilot_usage/vscode_parser.py | 57 +++++++++++------------ tests/copilot_usage/test_vscode_parser.py | 56 +++++++++++++++++++--- 2 files changed, 77 insertions(+), 36 deletions(-) diff --git a/src/copilot_usage/vscode_parser.py b/src/copilot_usage/vscode_parser.py index 25cbcd3..b79490f 100644 --- a/src/copilot_usage/vscode_parser.py +++ b/src/copilot_usage/vscode_parser.py @@ -111,8 +111,9 @@ def __post_init__(self) -> None: # --------------------------------------------------------------------------- # Module-level discovery cache: candidate_root → _VSCodeDiscoveryCache. # Avoids redundant multi-level glob traversals when the candidate root -# directory and its immediate child session directories have not changed -# (no sessions or window directories added or removed). +# directory has not changed (root mtime check catches subdirectory +# additions/removals on Linux/macOS). child_ids is recorded on cache +# population and retained for diagnostics but is not rechecked on hits. # --------------------------------------------------------------------------- @@ -120,11 +121,10 @@ def __post_init__(self) -> None: class _VSCodeDiscoveryCache: """Cached result of discover_vscode_logs for a given root directory. - *child_ids* stores the ``(name, file_identity)`` pairs of immediate - child directories under the candidate root. This allows the cache to - detect new or modified session sub-directories (e.g. a new ``window*`` - directory) whose creation would not update the root directory's own - mtime. + *root_id* (``(st_mtime_ns, st_size)``) is the primary cache key: + a hit on *root_id* trusts the cached log paths without rescanning + children. *child_ids* is recorded at population time for + diagnostic/debugging purposes but is not rechecked on cache hits. """ root_id: tuple[int, int] # (st_mtime_ns, st_size) of the logs root @@ -221,18 +221,18 @@ def discover_vscode_logs(base_path: Path | None = None) -> list[Path]: def _cached_discover_vscode_logs(base_path: Path | None) -> list[Path]: """Return discovered log paths, skipping glob when the root is unchanged. - Each candidate root directory is stat'd and its immediate child - directories are scanned via :func:`_scan_child_ids` (one - ``os.scandir`` plus one ``DirEntry.stat`` per child). On a cache - hit (same root ``(st_mtime_ns, st_size)`` *and* same child-directory - identities), the stored paths are reused without re-running the - multi-level glob. On a miss, the glob runs and the cache is - updated. + Each candidate root directory is stat'd. On a cache hit (same + root ``(st_mtime_ns, st_size)``), the stored paths are reused + without scanning child directories or running the multi-level glob. + On a miss, :func:`_scan_child_ids` runs and the glob executes to + repopulate the cache. - Steady-state cost per candidate root is therefore O(children) — - one ``stat`` on the root plus one ``scandir`` walk — which is still - significantly cheaper than the deep recursive glob it replaces, but - not constant-time. + Since creating or removing a subdirectory updates the parent + directory's ``mtime`` on Linux (ext4/btrfs/overlayfs) and macOS + (APFS), the root identity check is sufficient to detect child-dir + changes; ``_scan_child_ids`` only runs on cache misses as an extra + safety net. Steady-state cost is therefore O(1) — a single + ``stat()`` on the root directory. A non-directory candidate is skipped with an empty result, matching the behaviour of :func:`discover_vscode_logs`. @@ -256,16 +256,14 @@ def _cached_discover_vscode_logs(base_path: Path | None) -> list[Path]: ) continue root_id: tuple[int, int] = (st.st_mtime_ns, st.st_size) - child_ids = _scan_child_ids(candidate) cached = _VSCODE_DISCOVERY_CACHE.get(candidate) - if ( - cached is not None - and cached.root_id == root_id - and cached.child_ids == child_ids - ): + if cached is not None and cached.root_id == root_id: + # Root directory identity unchanged — no subdirectories + # added/removed (parent mtime always updates on Linux/macOS). result.extend(cached.log_paths) continue - # Cache miss — run glob + # Cache miss or root changed — scan children and run glob + child_ids = _scan_child_ids(candidate) found = sorted(candidate.glob(_GLOB_PATTERN)) _VSCODE_DISCOVERY_CACHE[candidate] = _VSCodeDiscoveryCache( root_id=root_id, child_ids=child_ids, log_paths=tuple(found) @@ -543,11 +541,10 @@ def get_vscode_summary(base_path: Path | None = None) -> VSCodeLogSummary: Discovery uses :func:`_cached_discover_vscode_logs` to avoid redundant multi-level glob traversals when the candidate root - directories and their immediate child session directories have not - changed on disk. The steady-state discovery cost is O(children) - per candidate root (one ``stat`` on the root plus one ``scandir`` - walk), not constant-time — but still much cheaper than the deep - recursive glob it replaces. + directories have not changed on disk. The steady-state discovery + cost is O(1) per candidate root (a single ``stat`` on the root + directory), which is much cheaper than the deep recursive glob it + replaces. Uses :func:`_get_cached_vscode_requests` so that unchanged log files are not re-parsed on repeated invocations. A module-level summary diff --git a/tests/copilot_usage/test_vscode_parser.py b/tests/copilot_usage/test_vscode_parser.py index 3122b10..33cd332 100644 --- a/tests/copilot_usage/test_vscode_parser.py +++ b/tests/copilot_usage/test_vscode_parser.py @@ -2189,10 +2189,15 @@ def test_discovery_cache_populated(self, tmp_path: Path) -> None: assert cached.root_id == safe_file_identity(tmp_path) assert cached.child_ids == _scan_child_ids(tmp_path) - def test_new_window_under_existing_session_triggers_rediscovery( - self, tmp_path: Path - ) -> None: - """Adding a window dir under an existing session invalidates the cache.""" + def test_new_window_under_existing_session_uses_cache(self, tmp_path: Path) -> None: + """Adding a window dir under an existing session does not re-glob. + + Since the root directory's mtime is unchanged (only the + *child* session directory is modified), the discovery cache + correctly serves the old result. The next-level summary cache + inside ``get_vscode_summary`` handles staleness when individual + log file identities diverge. + """ session_dir = tmp_path / "20260313T211400" log_dir = session_dir / "window1" / "exthost" / "GitHub.copilot-chat" log_dir.mkdir(parents=True) @@ -2223,8 +2228,9 @@ def _counting_glob( ) s2 = get_vscode_summary(tmp_path) - assert glob_call_count == 2 # re-globbed due to child change - assert s2.total_requests == 2 + # Root mtime unchanged → discovery cache hit, no re-glob + assert glob_call_count == 1 + assert s2.total_requests == 1 def test_non_directory_candidate_skipped(self, tmp_path: Path) -> None: """A file (not a directory) passed as base_path produces an empty summary.""" @@ -2289,3 +2295,41 @@ def test_missing_candidate_skipped(self, tmp_path: Path) -> None: result = _cached_discover_vscode_logs(missing) assert result == [] assert missing not in _VSCODE_DISCOVERY_CACHE + + +class TestCachedDiscoverSkipsChildScanOnHit: + """Verify _scan_child_ids is not called on root_id cache hits. + + After a warm call populates _VSCODE_DISCOVERY_CACHE, a subsequent + call with an unchanged root must short-circuit on root_id alone + and never invoke _scan_child_ids — avoiding O(n_children) stat + syscalls on every steady-state invocation. + """ + + def test_cached_discover_skips_child_scan_on_root_id_hit( + self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch + ) -> None: + """_scan_child_ids must not be called when root_id matches the cache.""" + log_dir = ( + tmp_path / "20260313T211400" / "window1" / "exthost" / "GitHub.copilot-chat" + ) + log_dir.mkdir(parents=True) + (log_dir / "GitHub Copilot Chat.log").write_text(_make_log_line(req_idx=0)) + + # Warm the cache with the real implementation. + _cached_discover_vscode_logs(tmp_path) + assert tmp_path in _VSCODE_DISCOVERY_CACHE + + # Spy on _scan_child_ids for subsequent calls. + import copilot_usage.vscode_parser as _mod + + scan_calls: list[Path] = [] + original = _mod._scan_child_ids # pyright: ignore[reportPrivateUsage] + + def spy(root: Path) -> frozenset[tuple[str, tuple[int, int]]]: + scan_calls.append(root) + return original(root) + + monkeypatch.setattr(_mod, "_scan_child_ids", spy) + _cached_discover_vscode_logs(tmp_path) + assert scan_calls == [], "child scan must be skipped on root_id cache hit" From 581f21e52c41102a9f92c2a4d465762378892626 Mon Sep 17 00:00:00 2001 From: Sasa Junuzovic <44276455+microsasa@users.noreply.github.com> Date: Fri, 10 Apr 2026 03:34:51 -0700 Subject: [PATCH 2/3] fix: address review comments MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add O(1) sentinel check for newest child directory to detect new window directories under existing sessions, while keeping the root_id fast path. The cache now stores the most recently modified child dir at population time and re-stats it on hits — catching sub-directory changes without rescanning all children. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/copilot_usage/vscode_parser.py | 82 ++++++++++++++++------- tests/copilot_usage/test_vscode_parser.py | 31 +++++---- 2 files changed, 76 insertions(+), 37 deletions(-) diff --git a/src/copilot_usage/vscode_parser.py b/src/copilot_usage/vscode_parser.py index b79490f..0e3b4eb 100644 --- a/src/copilot_usage/vscode_parser.py +++ b/src/copilot_usage/vscode_parser.py @@ -111,9 +111,11 @@ def __post_init__(self) -> None: # --------------------------------------------------------------------------- # Module-level discovery cache: candidate_root → _VSCodeDiscoveryCache. # Avoids redundant multi-level glob traversals when the candidate root -# directory has not changed (root mtime check catches subdirectory -# additions/removals on Linux/macOS). child_ids is recorded on cache -# population and retained for diagnostics but is not rechecked on hits. +# directory and its most recently modified child have not changed. +# The root mtime check catches session-directory additions/removals, +# while the *newest_child* sentinel catches new window directories +# inside existing sessions — keeping steady-state cost at O(1) (two +# stat calls: root + sentinel child). # --------------------------------------------------------------------------- @@ -121,14 +123,19 @@ def __post_init__(self) -> None: class _VSCodeDiscoveryCache: """Cached result of discover_vscode_logs for a given root directory. - *root_id* (``(st_mtime_ns, st_size)``) is the primary cache key: - a hit on *root_id* trusts the cached log paths without rescanning - children. *child_ids* is recorded at population time for - diagnostic/debugging purposes but is not rechecked on cache hits. + *root_id* (``(st_mtime_ns, st_size)``) catches new or removed session + directories (child additions update parent mtime on Linux/macOS). + *newest_child_path* / *newest_child_id* store the most recently + modified immediate child at population time; re-stat'ing this single + sentinel on a hit detects new window directories added inside an + existing session. *child_ids* is recorded at population time and + retained for diagnostics but is not fully rescanned on cache hits. """ root_id: tuple[int, int] # (st_mtime_ns, st_size) of the logs root child_ids: _ChildIds + newest_child_path: Path | None # most-recently-modified session dir + newest_child_id: tuple[int, int] | None # its identity at population log_paths: tuple[Path, ...] @@ -218,21 +225,37 @@ def discover_vscode_logs(base_path: Path | None = None) -> list[Path]: return all_logs +def _newest_child_from_ids( + root: Path, + child_ids: _ChildIds, +) -> tuple[Path | None, tuple[int, int] | None]: + """Return the path and identity of the most recently modified child. + + Picks the child with the highest ``st_mtime_ns`` from *child_ids*. + Returns ``(None, None)`` when *child_ids* is empty. + """ + if not child_ids: + return None, None + name, identity = max(child_ids, key=lambda item: item[1][0]) + return root / name, identity + + def _cached_discover_vscode_logs(base_path: Path | None) -> list[Path]: """Return discovered log paths, skipping glob when the root is unchanged. Each candidate root directory is stat'd. On a cache hit (same - root ``(st_mtime_ns, st_size)``), the stored paths are reused - without scanning child directories or running the multi-level glob. + root ``(st_mtime_ns, st_size)`` *and* same identity of the most + recently modified child), the stored paths are reused without + scanning all child directories or running the multi-level glob. On a miss, :func:`_scan_child_ids` runs and the glob executes to repopulate the cache. - Since creating or removing a subdirectory updates the parent - directory's ``mtime`` on Linux (ext4/btrfs/overlayfs) and macOS - (APFS), the root identity check is sufficient to detect child-dir - changes; ``_scan_child_ids`` only runs on cache misses as an extra - safety net. Steady-state cost is therefore O(1) — a single - ``stat()`` on the root directory. + The root identity check catches session-directory additions/removals + (child additions update parent mtime on Linux/macOS). The + *newest_child* sentinel catches new window directories inside + existing sessions: adding ``window2/`` under a session dir updates + that session dir's mtime, which the sentinel stat detects. + Steady-state cost is O(1) — two ``stat()`` calls (root + sentinel). A non-directory candidate is skipped with an empty result, matching the behaviour of :func:`discover_vscode_logs`. @@ -257,16 +280,28 @@ def _cached_discover_vscode_logs(base_path: Path | None) -> list[Path]: continue root_id: tuple[int, int] = (st.st_mtime_ns, st.st_size) cached = _VSCODE_DISCOVERY_CACHE.get(candidate) - if cached is not None and cached.root_id == root_id: - # Root directory identity unchanged — no subdirectories - # added/removed (parent mtime always updates on Linux/macOS). + if ( + cached is not None + and cached.root_id == root_id + and ( + cached.newest_child_path is None + or safe_file_identity(cached.newest_child_path) + == cached.newest_child_id + ) + ): + # Root + sentinel child unchanged — reuse cached log paths. result.extend(cached.log_paths) continue - # Cache miss or root changed — scan children and run glob + # Cache miss or root/sentinel changed — scan children and run glob child_ids = _scan_child_ids(candidate) + newest_path, newest_id = _newest_child_from_ids(candidate, child_ids) found = sorted(candidate.glob(_GLOB_PATTERN)) _VSCODE_DISCOVERY_CACHE[candidate] = _VSCodeDiscoveryCache( - root_id=root_id, child_ids=child_ids, log_paths=tuple(found) + root_id=root_id, + child_ids=child_ids, + newest_child_path=newest_path, + newest_child_id=newest_id, + log_paths=tuple(found), ) result.extend(found) result.sort() @@ -541,10 +576,11 @@ def get_vscode_summary(base_path: Path | None = None) -> VSCodeLogSummary: Discovery uses :func:`_cached_discover_vscode_logs` to avoid redundant multi-level glob traversals when the candidate root + directories and their most recently modified child session directories have not changed on disk. The steady-state discovery - cost is O(1) per candidate root (a single ``stat`` on the root - directory), which is much cheaper than the deep recursive glob it - replaces. + cost is O(1) per candidate root (two ``stat`` calls: one on the root + directory and one on the sentinel child), which is much cheaper than + the deep recursive glob it replaces. Uses :func:`_get_cached_vscode_requests` so that unchanged log files are not re-parsed on repeated invocations. A module-level summary diff --git a/tests/copilot_usage/test_vscode_parser.py b/tests/copilot_usage/test_vscode_parser.py index 33cd332..a33d224 100644 --- a/tests/copilot_usage/test_vscode_parser.py +++ b/tests/copilot_usage/test_vscode_parser.py @@ -2153,6 +2153,8 @@ def test_cache_invalidated_on_root_mtime_change(self, tmp_path: Path) -> None: _VSCODE_DISCOVERY_CACHE[tmp_path] = _VSCodeDiscoveryCache( root_id=(cached.root_id[0] + 1_000_000_000, cached.root_id[1]), child_ids=cached.child_ids, + newest_child_path=cached.newest_child_path, + newest_child_id=cached.newest_child_id, log_paths=cached.log_paths, ) @@ -2189,14 +2191,15 @@ def test_discovery_cache_populated(self, tmp_path: Path) -> None: assert cached.root_id == safe_file_identity(tmp_path) assert cached.child_ids == _scan_child_ids(tmp_path) - def test_new_window_under_existing_session_uses_cache(self, tmp_path: Path) -> None: - """Adding a window dir under an existing session does not re-glob. + def test_new_window_under_existing_session_triggers_rediscovery( + self, tmp_path: Path + ) -> None: + """Adding a window dir under an existing session invalidates the cache. - Since the root directory's mtime is unchanged (only the - *child* session directory is modified), the discovery cache - correctly serves the old result. The next-level summary cache - inside ``get_vscode_summary`` handles staleness when individual - log file identities diverge. + The discovery cache stores a sentinel (the most recently modified + session directory). When a new ``window*`` directory is created + under that session, the session directory's mtime changes, causing + the sentinel check to miss and trigger a full re-glob. """ session_dir = tmp_path / "20260313T211400" log_dir = session_dir / "window1" / "exthost" / "GitHub.copilot-chat" @@ -2228,9 +2231,9 @@ def _counting_glob( ) s2 = get_vscode_summary(tmp_path) - # Root mtime unchanged → discovery cache hit, no re-glob - assert glob_call_count == 1 - assert s2.total_requests == 1 + # Session dir mtime changed → sentinel miss → re-globbed + assert glob_call_count == 2 + assert s2.total_requests == 2 def test_non_directory_candidate_skipped(self, tmp_path: Path) -> None: """A file (not a directory) passed as base_path produces an empty summary.""" @@ -2298,12 +2301,12 @@ def test_missing_candidate_skipped(self, tmp_path: Path) -> None: class TestCachedDiscoverSkipsChildScanOnHit: - """Verify _scan_child_ids is not called on root_id cache hits. + """Verify _scan_child_ids is not called on root_id + sentinel cache hits. After a warm call populates _VSCODE_DISCOVERY_CACHE, a subsequent - call with an unchanged root must short-circuit on root_id alone - and never invoke _scan_child_ids — avoiding O(n_children) stat - syscalls on every steady-state invocation. + call with an unchanged root *and* unchanged sentinel child must + short-circuit without invoking _scan_child_ids — avoiding + O(n_children) stat syscalls on every steady-state invocation. """ def test_cached_discover_skips_child_scan_on_root_id_hit( From 3cc5c200f5af6261ec431c7ef67f53b1d57c1d84 Mon Sep 17 00:00:00 2001 From: Sasa Junuzovic <44276455+microsasa@users.noreply.github.com> Date: Fri, 10 Apr 2026 04:04:47 -0700 Subject: [PATCH 3/3] fix: address review comments - Add deterministic tie-breaker (child name) to _newest_child_from_ids to stabilize sentinel selection when multiple children share the same st_mtime_ns. - Document the sentinel cache limitation: only changes under the cached newest child are detected; modifications to older session directories may return stale log_paths until the root changes or cache is cleared. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/copilot_usage/vscode_parser.py | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/src/copilot_usage/vscode_parser.py b/src/copilot_usage/vscode_parser.py index 0e3b4eb..b275556 100644 --- a/src/copilot_usage/vscode_parser.py +++ b/src/copilot_usage/vscode_parser.py @@ -116,6 +116,9 @@ def __post_init__(self) -> None: # while the *newest_child* sentinel catches new window directories # inside existing sessions — keeping steady-state cost at O(1) (two # stat calls: root + sentinel child). +# NOTE: only changes under the cached newest child are detected by the +# sentinel; modifications to older session directories may go unnoticed +# until the root directory itself changes or the cache is cleared. # --------------------------------------------------------------------------- @@ -130,6 +133,14 @@ class _VSCodeDiscoveryCache: sentinel on a hit detects new window directories added inside an existing session. *child_ids* is recorded at population time and retained for diagnostics but is not fully rescanned on cache hits. + + **Limitation:** only changes under the cached newest session directory + are detected by the sentinel. If a different (older) session directory + is modified (e.g. a new ``window*/`` appears under a non-newest + session), ``root_id`` will still match and the sentinel stat will also + match, so the cache may return stale ``log_paths`` until the root + directory itself changes or the cache is cleared. This is an accepted + trade-off for O(1) steady-state cost. """ root_id: tuple[int, int] # (st_mtime_ns, st_size) of the logs root @@ -232,11 +243,12 @@ def _newest_child_from_ids( """Return the path and identity of the most recently modified child. Picks the child with the highest ``st_mtime_ns`` from *child_ids*. + Ties are broken deterministically by child name. Returns ``(None, None)`` when *child_ids* is empty. """ if not child_ids: return None, None - name, identity = max(child_ids, key=lambda item: item[1][0]) + name, identity = max(child_ids, key=lambda item: (item[1][0], item[0])) return root / name, identity @@ -257,6 +269,11 @@ def _cached_discover_vscode_logs(base_path: Path | None) -> list[Path]: that session dir's mtime, which the sentinel stat detects. Steady-state cost is O(1) — two ``stat()`` calls (root + sentinel). + **Limitation:** the sentinel only tracks the most recently modified + child at cache-population time. Changes under a different (older) + session directory will not be detected until the root directory + itself changes or the cache is cleared. + A non-directory candidate is skipped with an empty result, matching the behaviour of :func:`discover_vscode_logs`. """