From d454f20d341de1ee08aa23ea2bfd32a6471610fa Mon Sep 17 00:00:00 2001 From: Sasa Junuzovic <44276455+microsasa@users.noreply.github.com> Date: Sun, 5 Apr 2026 08:26:14 -0700 Subject: [PATCH 1/6] fix: vscode_parser incremental append-only parsing (#764) Replace full re-parse of VS Code Copilot Chat logs with incremental offset-based parsing. The cache now stores the byte offset where parsing last stopped; when the file has grown (append-only), only the new bytes are read. If the file was truncated/replaced, a full re-parse is performed. - Add _parse_vscode_log_from_offset() that reads from a byte offset and returns (requests, end_offset). - Refactor parse_vscode_log() to delegate to the new function. - Extend _VSCODE_LOG_CACHE value to (file_id, end_offset, requests). - Add _update_vscode_cache() helper for insert/evict logic. - Update _get_cached_vscode_requests() with incremental growth detection. - Add TestIncrementalParsing class with 5 tests covering: append returns N+M requests, reads from previous offset, file replacement triggers full reparse, noise lines, and same-content rewrite. - Update existing tests that patched parse_vscode_log to patch _parse_vscode_log_from_offset instead. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/copilot_usage/vscode_parser.py | 112 ++++++++++---- tests/copilot_usage/test_vscode_parser.py | 170 ++++++++++++++++++---- 2 files changed, 231 insertions(+), 51 deletions(-) diff --git a/src/copilot_usage/vscode_parser.py b/src/copilot_usage/vscode_parser.py index c2e22adf..f01b0b83 100644 --- a/src/copilot_usage/vscode_parser.py +++ b/src/copilot_usage/vscode_parser.py @@ -129,12 +129,32 @@ def parse_vscode_log(log_path: Path) -> list[VSCodeRequest]: Returns a list of parsed requests (possibly empty when no lines match). + Raises: + OSError: If the file cannot be opened or read. + """ + requests, _ = _parse_vscode_log_from_offset(log_path, 0) + return requests + + +def _parse_vscode_log_from_offset( + log_path: Path, offset: int +) -> tuple[list[VSCodeRequest], int]: + """Parse VS Code Copilot Chat log starting at *offset* bytes. + + Returns ``(requests, end_offset)`` where *end_offset* is the byte + position at the end of the last complete line read. This allows + callers to resume parsing from that point on a subsequent call. + Raises: OSError: If the file cannot be opened or read. """ requests: list[VSCodeRequest] = [] - with log_path.open(encoding="utf-8", errors="replace") as f: - for line in f: + with log_path.open("rb") as fb: + if offset > 0: + fb.seek(offset) + for raw_line in fb: + # Decode with replacement to mirror parse_vscode_log behaviour. + line = raw_line.decode("utf-8", errors="replace") # Fast pre-filter: only ~1–5% of lines contain "ccreq:" if "ccreq:" not in line: continue @@ -155,53 +175,93 @@ def parse_vscode_log(log_path: Path) -> list[VSCodeRequest]: category=category, ) ) - logger.debug("Parsed {} request(s) from {}", len(requests), log_path) - return requests + end_offset: int = fb.tell() + logger.debug( + "Parsed {} request(s) from {} (offset {}→{})", + len(requests), + log_path, + offset, + end_offset, + ) + return requests, end_offset # --------------------------------------------------------------------------- # Module-level parsed-requests cache (mirrors parser._EVENTS_CACHE). # Uses OrderedDict for LRU eviction: most-recently-used entries are at # the back, least-recently-used at the front. +# +# Cache value layout: (file_id, end_offset, requests_tuple) +# file_id – (st_mtime_ns, st_size) or None when stat() fails +# end_offset – byte position of the last parsed line +# requests – immutable tuple of parsed VSCodeRequest objects # --------------------------------------------------------------------------- _MAX_CACHED_VSCODE_LOGS: Final[int] = 64 _VSCODE_LOG_CACHE: OrderedDict[ - Path, tuple[tuple[int, int] | None, tuple[VSCodeRequest, ...]] + Path, tuple[tuple[int, int] | None, int, tuple[VSCodeRequest, ...]] ] = OrderedDict() +def _update_vscode_cache( + log_path: Path, + file_id: tuple[int, int] | None, + end_offset: int, + requests: tuple[VSCodeRequest, ...], +) -> None: + """Insert or replace a cache entry with LRU eviction.""" + if log_path in _VSCODE_LOG_CACHE: + del _VSCODE_LOG_CACHE[log_path] + elif len(_VSCODE_LOG_CACHE) >= _MAX_CACHED_VSCODE_LOGS: + _VSCODE_LOG_CACHE.popitem(last=False) # evict LRU (front) + _VSCODE_LOG_CACHE[log_path] = (file_id, end_offset, requests) + + def _get_cached_vscode_requests(log_path: Path) -> tuple[VSCodeRequest, ...]: - """Return parsed requests, re-parsing only when ``(mtime_ns, size)`` changes. + """Return parsed requests, incrementally parsing only new content. On the first call for a given *log_path*, delegates to - :func:`parse_vscode_log` and stores the result. Subsequent calls - return the cached tuple as long as the file identity is unchanged. + :func:`_parse_vscode_log_from_offset` (offset 0) and stores the + result together with the byte offset reached. Subsequent calls + detect whether the file has **grown** (append-only) by comparing + the new ``st_size`` against the cached size — if so, only the + bytes after the stored offset are parsed and appended to the + existing result. + + When the file is **replaced** (new size < cached size) or + ``st_size`` cannot be determined, a full re-parse is performed. + The cache is bounded to :data:`_MAX_CACHED_VSCODE_LOGS` entries; the **least-recently used** entry is evicted when the limit is reached. - The parsed list is converted to a ``tuple`` before storage so that - callers cannot accidentally append, pop, or reorder entries in the - cache — matching the container-level immutability pattern used by - :func:`copilot_usage.parser.get_cached_events`. - Raises: - OSError: Propagated from :func:`parse_vscode_log` when the file - cannot be opened or read. + OSError: Propagated from :func:`_parse_vscode_log_from_offset` + when the file cannot be opened or read. """ - file_id = _safe_file_identity(log_path) + new_id = _safe_file_identity(log_path) cached = _VSCODE_LOG_CACHE.get(log_path) - if cached is not None and cached[0] == file_id: - _VSCODE_LOG_CACHE.move_to_end(log_path) - return cached[1] - requests = tuple(parse_vscode_log(log_path)) - if log_path in _VSCODE_LOG_CACHE: - del _VSCODE_LOG_CACHE[log_path] - elif len(_VSCODE_LOG_CACHE) >= _MAX_CACHED_VSCODE_LOGS: - _VSCODE_LOG_CACHE.popitem(last=False) # evict LRU (front) - _VSCODE_LOG_CACHE[log_path] = (file_id, requests) - return requests + + if cached is not None: + old_id, end_offset, old_requests = cached + + # Exact match: file unchanged — return cached result. + if old_id == new_id: + _VSCODE_LOG_CACHE.move_to_end(log_path) + return old_requests + + # Incremental path: file grew (append-only). + if new_id is not None and old_id is not None and new_id[1] >= old_id[1]: + new_reqs, new_end = _parse_vscode_log_from_offset(log_path, end_offset) + combined = old_requests + tuple(new_reqs) + _update_vscode_cache(log_path, new_id, new_end, combined) + return combined + + # Full parse: first call or file was truncated/replaced. + requests, end_offset = _parse_vscode_log_from_offset(log_path, 0) + result = tuple(requests) + _update_vscode_cache(log_path, new_id, end_offset, result) + return result @dataclass(slots=True) diff --git a/tests/copilot_usage/test_vscode_parser.py b/tests/copilot_usage/test_vscode_parser.py index 33354f19..08ae3f32 100644 --- a/tests/copilot_usage/test_vscode_parser.py +++ b/tests/copilot_usage/test_vscode_parser.py @@ -18,6 +18,7 @@ VSCodeRequest, _default_log_candidates, # pyright: ignore[reportPrivateUsage] _get_cached_vscode_requests, # pyright: ignore[reportPrivateUsage] + _parse_vscode_log_from_offset, # pyright: ignore[reportPrivateUsage] _SummaryAccumulator, # pyright: ignore[reportPrivateUsage] _update_vscode_summary, # pyright: ignore[reportPrivateUsage] build_vscode_summary, @@ -617,10 +618,10 @@ def test_incremental_aggregation(self) -> None: ), ] - def _fake_parse(path: Path) -> list[VSCodeRequest]: + def _fake_parse(path: Path, offset: int) -> tuple[list[VSCodeRequest], int]: if path == file_a: - return list(requests_a) - return list(requests_b) + return list(requests_a), 100 + return list(requests_b), 50 with ( patch( @@ -628,7 +629,7 @@ def _fake_parse(path: Path) -> list[VSCodeRequest]: return_value=[file_a, file_b], ), patch( - "copilot_usage.vscode_parser.parse_vscode_log", + "copilot_usage.vscode_parser._parse_vscode_log_from_offset", side_effect=_fake_parse, ) as mock_parse, patch( @@ -640,7 +641,7 @@ def _fake_parse(path: Path) -> list[VSCodeRequest]: assert summary.total_requests == 3 assert summary.log_files_parsed == 2 assert mock_parse.call_count == 2 - mock_parse.assert_has_calls([call(file_a), call(file_b)]) + mock_parse.assert_has_calls([call(file_a, 0), call(file_b, 0)]) # Verify the incremental path is used: build_vscode_summary must NOT # be called because get_vscode_summary now aggregates per-file via # _update_vscode_summary instead of collecting all requests first. @@ -665,10 +666,10 @@ def test_oserror_skips_file_and_continues(self) -> None: ), ] - def _fake_parse(path: Path) -> list[VSCodeRequest]: + def _fake_parse(path: Path, offset: int) -> tuple[list[VSCodeRequest], int]: if path == file_a: raise OSError("Permission denied") - return list(requests_b) + return list(requests_b), 50 with ( patch( @@ -676,7 +677,7 @@ def _fake_parse(path: Path) -> list[VSCodeRequest]: return_value=[file_a, file_b], ), patch( - "copilot_usage.vscode_parser.parse_vscode_log", + "copilot_usage.vscode_parser._parse_vscode_log_from_offset", side_effect=_fake_parse, ), ): @@ -699,14 +700,17 @@ def test_log_files_found_equals_discovered(self) -> None: category="panel", ) + def _fake_parse(path: Path, offset: int) -> tuple[list[VSCodeRequest], int]: + return [req], 50 + with ( patch( "copilot_usage.vscode_parser.discover_vscode_logs", return_value=paths, ), patch( - "copilot_usage.vscode_parser.parse_vscode_log", - return_value=[req], + "copilot_usage.vscode_parser._parse_vscode_log_from_offset", + side_effect=_fake_parse, ), ): summary = get_vscode_summary() @@ -728,10 +732,10 @@ def test_log_files_found_vs_parsed_on_oserror(self) -> None: category="inline", ) - def _fake_parse(path: Path) -> list[VSCodeRequest]: + def _fake_parse(path: Path, offset: int) -> tuple[list[VSCodeRequest], int]: if path == path1: raise OSError("Permission denied") - return [req] + return [req], 50 with ( patch( @@ -739,7 +743,7 @@ def _fake_parse(path: Path) -> list[VSCodeRequest]: return_value=[path1, path2], ), patch( - "copilot_usage.vscode_parser.parse_vscode_log", + "copilot_usage.vscode_parser._parse_vscode_log_from_offset", side_effect=_fake_parse, ), ): @@ -776,20 +780,21 @@ def test_vscode_single_file_oserror_logs_warning( _LOG_OPUS + "\n", encoding="utf-8" ) - # Make parse_vscode_log raise OSError only on the first call. + # Make _parse_vscode_log_from_offset raise OSError only on the first call. call_count = 0 - _real_parse = parse_vscode_log + _real_parse = _parse_vscode_log_from_offset - def _failing_once(path: Path) -> list[VSCodeRequest]: + def _failing_once(path: Path, offset: int) -> tuple[list[VSCodeRequest], int]: nonlocal call_count call_count += 1 if call_count == 1: msg = "Permission denied" raise OSError(msg) - return _real_parse(path) + return _real_parse(path, offset) monkeypatch.setattr( - "copilot_usage.vscode_parser.parse_vscode_log", _failing_once + "copilot_usage.vscode_parser._parse_vscode_log_from_offset", + _failing_once, ) warnings: list[str] = [] @@ -822,7 +827,8 @@ def _always_raise(*_a: object, **_kw: object) -> object: raise OSError(msg) monkeypatch.setattr( - "copilot_usage.vscode_parser.parse_vscode_log", _always_raise + "copilot_usage.vscode_parser._parse_vscode_log_from_offset", + _always_raise, ) runner = CliRunner() result = runner.invoke(main, ["vscode", "--vscode-logs", str(tmp_path)]) @@ -1267,20 +1273,20 @@ def test_first_call_populates_cache(self, tmp_path: Path) -> None: assert log_file in _VSCODE_LOG_CACHE def test_second_call_returns_cached_without_reparse(self, tmp_path: Path) -> None: - """parse_vscode_log is only called once when file is unchanged.""" + """_parse_vscode_log_from_offset is only called once when file is unchanged.""" log_file = tmp_path / "chat.log" log_file.write_text(_make_log_line(req_idx=0)) with patch( - "copilot_usage.vscode_parser.parse_vscode_log", - wraps=parse_vscode_log, + "copilot_usage.vscode_parser._parse_vscode_log_from_offset", + wraps=_parse_vscode_log_from_offset, ) as spy: first = _get_cached_vscode_requests(log_file) second = _get_cached_vscode_requests(log_file) assert spy.call_count == 1 assert first == second assert log_file in _VSCODE_LOG_CACHE - assert _VSCODE_LOG_CACHE[log_file][1] == second + assert _VSCODE_LOG_CACHE[log_file][2] == second def test_cache_invalidated_on_file_change(self, tmp_path: Path) -> None: """Changing the file causes a re-parse on the next call.""" @@ -1340,11 +1346,125 @@ def test_get_vscode_summary_uses_cache(self, tmp_path: Path) -> None: log_file.write_text(_make_log_line(req_idx=0)) with patch( - "copilot_usage.vscode_parser.parse_vscode_log", - wraps=parse_vscode_log, + "copilot_usage.vscode_parser._parse_vscode_log_from_offset", + wraps=_parse_vscode_log_from_offset, ) as spy: s1 = get_vscode_summary(tmp_path) s2 = get_vscode_summary(tmp_path) assert spy.call_count == 1 assert s1.total_requests == 1 assert s2.total_requests == 1 + + +# --------------------------------------------------------------------------- +# Incremental append-only parsing tests +# --------------------------------------------------------------------------- + + +class TestIncrementalParsing: + """Tests for incremental (append-only) parsing in _get_cached_vscode_requests.""" + + def test_append_returns_combined_requests(self, tmp_path: Path) -> None: + """Appending lines to a cached log returns N + M requests.""" + log_file = tmp_path / "chat.log" + initial_lines = "\n".join(_make_log_line(req_idx=i) for i in range(3)) + log_file.write_text(initial_lines + "\n") + + first = _get_cached_vscode_requests(log_file) + assert len(first) == 3 + + # Append 2 more request lines. + with log_file.open("a") as f: + for i in range(3, 5): + f.write(_make_log_line(req_idx=i) + "\n") + + second = _get_cached_vscode_requests(log_file) + assert len(second) == 5 + # Original requests are preserved at the start. + assert second[:3] == first + + def test_append_only_reads_from_previous_offset(self, tmp_path: Path) -> None: + """Incremental path starts reading at the previous end offset.""" + log_file = tmp_path / "chat.log" + initial_lines = "\n".join(_make_log_line(req_idx=i) for i in range(4)) + log_file.write_text(initial_lines + "\n") + + _get_cached_vscode_requests(log_file) + cached_entry = _VSCODE_LOG_CACHE[log_file] + old_end_offset = cached_entry[1] + assert old_end_offset > 0 + + # Append new lines. + with log_file.open("a") as f: + for i in range(4, 7): + f.write(_make_log_line(req_idx=i) + "\n") + + with patch( + "copilot_usage.vscode_parser._parse_vscode_log_from_offset", + wraps=_parse_vscode_log_from_offset, + ) as spy: + result = _get_cached_vscode_requests(log_file) + + assert len(result) == 7 + # The incremental call should have used the old end offset. + spy.assert_called_once_with(log_file, old_end_offset) + + def test_file_replaced_triggers_full_reparse(self, tmp_path: Path) -> None: + """Replacing (truncating) a file triggers a full re-parse from offset 0.""" + log_file = tmp_path / "chat.log" + # Write a large initial file. + initial_lines = "\n".join(_make_log_line(req_idx=i) for i in range(10)) + log_file.write_text(initial_lines + "\n") + + first = _get_cached_vscode_requests(log_file) + assert len(first) == 10 + + # Replace the file with fewer lines (smaller size). + log_file.write_text(_make_log_line(req_idx=99) + "\n") + + with patch( + "copilot_usage.vscode_parser._parse_vscode_log_from_offset", + wraps=_parse_vscode_log_from_offset, + ) as spy: + second = _get_cached_vscode_requests(log_file) + + assert len(second) == 1 + assert second[0].request_id == "req00099" + # Full re-parse: offset should be 0. + spy.assert_called_once_with(log_file, 0) + + def test_append_with_noise_lines(self, tmp_path: Path) -> None: + """Appended noise (non-ccreq) lines don't produce phantom requests.""" + log_file = tmp_path / "chat.log" + log_file.write_text(_make_log_line(req_idx=0) + "\n") + + first = _get_cached_vscode_requests(log_file) + assert len(first) == 1 + + # Append noise + one real request. + with log_file.open("a") as f: + f.write("2026-03-13 22:11:00.000 [info] Some noise line\n") + f.write("2026-03-13 22:11:01.000 [info] More noise\n") + f.write(_make_log_line(req_idx=1) + "\n") + + second = _get_cached_vscode_requests(log_file) + assert len(second) == 2 + + def test_append_empty_content(self, tmp_path: Path) -> None: + """Appending empty content (touch) still returns all original requests.""" + log_file = tmp_path / "chat.log" + log_file.write_text(_make_log_line(req_idx=0) + "\n") + + first = _get_cached_vscode_requests(log_file) + assert len(first) == 1 + + # "Touch" the file to change mtime without changing size. + # We need to actually grow it slightly for the incremental path. + import time + + time.sleep(0.01) + log_file.write_bytes(log_file.read_bytes()) # rewrite same content + + second = _get_cached_vscode_requests(log_file) + assert len(second) == 1 + assert second == first From 6f3adb400c0ed9e7ce5dd1446f6d978822dfe5af Mon Sep 17 00:00:00 2001 From: Sasa Junuzovic <44276455+microsasa@users.noreply.github.com> Date: Sun, 5 Apr 2026 08:41:39 -0700 Subject: [PATCH 2/6] fix: address review comments - Track only complete (newline-terminated) lines in end_offset to avoid missing records when a partial line at EOF is later completed - Compare new file size against end_offset (not cached stat size) for incremental path to prevent seeking past EOF after truncation - Fix misleading test comment in test_append_empty_content - Add test_partial_line_at_eof_excluded for partial-line handling - Ensure all test fixtures write newline-terminated log lines Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/copilot_usage/vscode_parser.py | 23 +++++++--- tests/copilot_usage/test_vscode_parser.py | 56 +++++++++++++++++------ 2 files changed, 57 insertions(+), 22 deletions(-) diff --git a/src/copilot_usage/vscode_parser.py b/src/copilot_usage/vscode_parser.py index f01b0b83..152a9dc4 100644 --- a/src/copilot_usage/vscode_parser.py +++ b/src/copilot_usage/vscode_parser.py @@ -142,17 +142,25 @@ def _parse_vscode_log_from_offset( """Parse VS Code Copilot Chat log starting at *offset* bytes. Returns ``(requests, end_offset)`` where *end_offset* is the byte - position at the end of the last complete line read. This allows - callers to resume parsing from that point on a subsequent call. + position at the end of the last **complete** (newline-terminated) + line read. Partial lines at EOF are intentionally excluded so that + the next incremental call can re-read them once the writer finishes + the line. Raises: OSError: If the file cannot be opened or read. """ requests: list[VSCodeRequest] = [] + safe_end: int = offset with log_path.open("rb") as fb: if offset > 0: fb.seek(offset) for raw_line in fb: + if not raw_line.endswith(b"\n"): + # Partial line at EOF — stop advancing so the next + # incremental call re-reads this line once complete. + break + safe_end += len(raw_line) # Decode with replacement to mirror parse_vscode_log behaviour. line = raw_line.decode("utf-8", errors="replace") # Fast pre-filter: only ~1–5% of lines contain "ccreq:" @@ -175,15 +183,14 @@ def _parse_vscode_log_from_offset( category=category, ) ) - end_offset: int = fb.tell() logger.debug( "Parsed {} request(s) from {} (offset {}→{})", len(requests), log_path, offset, - end_offset, + safe_end, ) - return requests, end_offset + return requests, safe_end # --------------------------------------------------------------------------- @@ -250,8 +257,10 @@ def _get_cached_vscode_requests(log_path: Path) -> tuple[VSCodeRequest, ...]: _VSCODE_LOG_CACHE.move_to_end(log_path) return old_requests - # Incremental path: file grew (append-only). - if new_id is not None and old_id is not None and new_id[1] >= old_id[1]: + # Incremental path: file grew (append-only) beyond the cached + # resume point. Compare against ``end_offset`` because that is the + # position we will seek to when resuming parsing. + if new_id is not None and old_id is not None and new_id[1] >= end_offset: new_reqs, new_end = _parse_vscode_log_from_offset(log_path, end_offset) combined = old_requests + tuple(new_reqs) _update_vscode_cache(log_path, new_id, new_end, combined) diff --git a/tests/copilot_usage/test_vscode_parser.py b/tests/copilot_usage/test_vscode_parser.py index 08ae3f32..cf7b2735 100644 --- a/tests/copilot_usage/test_vscode_parser.py +++ b/tests/copilot_usage/test_vscode_parser.py @@ -104,7 +104,7 @@ class TestParseVscodeLog: def test_parses_real_lines(self, tmp_path: Path) -> None: log_file = tmp_path / "test.log" log_file.write_text( - "\n".join([_LOG_OPUS, _LOG_NOISE, _LOG_REDIRECT, _LOG_GPT4O]), + "\n".join([_LOG_OPUS, _LOG_NOISE, _LOG_REDIRECT, _LOG_GPT4O]) + "\n", encoding="utf-8", ) requests = parse_vscode_log(log_file) @@ -136,7 +136,7 @@ def test_invalid_timestamp_line_is_skipped(self, tmp_path: Path) -> None: assert _CCREQ_RE.match(bad_line) is not None good_line = _LOG_OPUS # a valid known-good line log_file = tmp_path / "test.log" - log_file.write_text(f"{bad_line}\n{good_line}", encoding="utf-8") + log_file.write_text(f"{bad_line}\n{good_line}\n", encoding="utf-8") result = parse_vscode_log(log_file) assert len(result) == 1 # bad line skipped assert result[0].model == "claude-opus-4.6" @@ -547,7 +547,7 @@ def test_end_to_end(self, tmp_path: Path) -> None: log_dir = tmp_path / "20260313" / "window1" / "exthost" / "GitHub.copilot-chat" log_dir.mkdir(parents=True) (log_dir / "GitHub Copilot Chat.log").write_text( - "\n".join([_LOG_OPUS, _LOG_REDIRECT, _LOG_NOISE, _LOG_GPT4O]), + "\n".join([_LOG_OPUS, _LOG_REDIRECT, _LOG_NOISE, _LOG_GPT4O]) + "\n", encoding="utf-8", ) summary = get_vscode_summary(tmp_path) @@ -1218,7 +1218,7 @@ def test_all_lines_match(self, tmp_path: Path) -> None: for i in range(n) ] log_file = tmp_path / "all_match.log" - log_file.write_text("\n".join(lines), encoding="utf-8") + log_file.write_text("\n".join(lines) + "\n", encoding="utf-8") requests = parse_vscode_log(log_file) assert len(requests) == n for i, req in enumerate(requests): @@ -1241,7 +1241,7 @@ def test_1000_matching_lines_parsed(self, tmp_path: Path) -> None: for i in range(n) ] log_file = tmp_path / "fromisoformat_1000.log" - log_file.write_text("\n".join(lines), encoding="utf-8") + log_file.write_text("\n".join(lines) + "\n", encoding="utf-8") requests = parse_vscode_log(log_file) assert len(requests) == n for i, req in enumerate(requests): @@ -1267,7 +1267,7 @@ class TestVscodeLogCache: def test_first_call_populates_cache(self, tmp_path: Path) -> None: log_file = tmp_path / "chat.log" - log_file.write_text(_make_log_line(req_idx=0)) + log_file.write_text(_make_log_line(req_idx=0) + "\n") requests = _get_cached_vscode_requests(log_file) assert len(requests) == 1 assert log_file in _VSCODE_LOG_CACHE @@ -1275,7 +1275,7 @@ def test_first_call_populates_cache(self, tmp_path: Path) -> None: def test_second_call_returns_cached_without_reparse(self, tmp_path: Path) -> None: """_parse_vscode_log_from_offset is only called once when file is unchanged.""" log_file = tmp_path / "chat.log" - log_file.write_text(_make_log_line(req_idx=0)) + log_file.write_text(_make_log_line(req_idx=0) + "\n") with patch( "copilot_usage.vscode_parser._parse_vscode_log_from_offset", @@ -1291,7 +1291,7 @@ def test_second_call_returns_cached_without_reparse(self, tmp_path: Path) -> Non def test_cache_invalidated_on_file_change(self, tmp_path: Path) -> None: """Changing the file causes a re-parse on the next call.""" log_file = tmp_path / "chat.log" - log_file.write_text(_make_log_line(req_idx=0)) + log_file.write_text(_make_log_line(req_idx=0) + "\n") first = _get_cached_vscode_requests(log_file) assert len(first) == 1 @@ -1300,7 +1300,7 @@ def test_cache_invalidated_on_file_change(self, tmp_path: Path) -> None: # has different contents and size, so its identity changes and the # next cache lookup should re-parse it. log_file.write_text( - _make_log_line(req_idx=0) + "\n" + _make_log_line(req_idx=1) + _make_log_line(req_idx=0) + "\n" + _make_log_line(req_idx=1) + "\n" ) second = _get_cached_vscode_requests(log_file) @@ -1311,7 +1311,7 @@ def test_lru_eviction(self, tmp_path: Path) -> None: paths: list[Path] = [] for i in range(_MAX_CACHED_VSCODE_LOGS + 1): p = tmp_path / f"log_{i}.log" - p.write_text(_make_log_line(req_idx=i)) + p.write_text(_make_log_line(req_idx=i) + "\n") paths.append(p) for p in paths: @@ -1325,8 +1325,8 @@ def test_lru_promotion_on_access(self, tmp_path: Path) -> None: """Accessing a cached entry moves it to the back (most-recently used).""" p1 = tmp_path / "a.log" p2 = tmp_path / "b.log" - p1.write_text(_make_log_line(req_idx=0)) - p2.write_text(_make_log_line(req_idx=1)) + p1.write_text(_make_log_line(req_idx=0) + "\n") + p2.write_text(_make_log_line(req_idx=1) + "\n") _get_cached_vscode_requests(p1) _get_cached_vscode_requests(p2) @@ -1343,7 +1343,7 @@ def test_get_vscode_summary_uses_cache(self, tmp_path: Path) -> None: ) log_dir.mkdir(parents=True) log_file = log_dir / "GitHub Copilot Chat.log" - log_file.write_text(_make_log_line(req_idx=0)) + log_file.write_text(_make_log_line(req_idx=0) + "\n") with patch( "copilot_usage.vscode_parser._parse_vscode_log_from_offset", @@ -1458,8 +1458,10 @@ def test_append_empty_content(self, tmp_path: Path) -> None: first = _get_cached_vscode_requests(log_file) assert len(first) == 1 - # "Touch" the file to change mtime without changing size. - # We need to actually grow it slightly for the incremental path. + # Rewrite the file with identical content to change mtime while + # keeping size unchanged. The cache detects the mtime change and + # takes the incremental path which seeks to end_offset, finds + # nothing new, and returns the original cached requests. import time time.sleep(0.01) @@ -1468,3 +1470,27 @@ def test_append_empty_content(self, tmp_path: Path) -> None: second = _get_cached_vscode_requests(log_file) assert len(second) == 1 assert second == first + + def test_partial_line_at_eof_excluded(self, tmp_path: Path) -> None: + """A partial (no-newline) line at EOF is excluded until completed.""" + log_file = tmp_path / "chat.log" + log_file.write_text(_make_log_line(req_idx=0) + "\n") + + first = _get_cached_vscode_requests(log_file) + assert len(first) == 1 + + # Append a partial line (no trailing newline). + with log_file.open("a") as f: + f.write(_make_log_line(req_idx=1)) # no \n + + second = _get_cached_vscode_requests(log_file) + # Partial line is excluded — still only the original request. + assert len(second) == 1 + + # Complete the partial line by appending a newline. + with log_file.open("a") as f: + f.write("\n") + + third = _get_cached_vscode_requests(log_file) + # Now both requests are returned. + assert len(third) == 2 From 0d6b1734187d4f9f4873cbc9e5f4866c67831b8c Mon Sep 17 00:00:00 2001 From: Sasa Junuzovic <44276455+microsasa@users.noreply.github.com> Date: Sun, 5 Apr 2026 08:52:23 -0700 Subject: [PATCH 3/6] fix: address review comments Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/copilot_usage/vscode_parser.py | 6 ++--- tests/copilot_usage/test_vscode_parser.py | 27 +++++++++++++++-------- 2 files changed, 21 insertions(+), 12 deletions(-) diff --git a/src/copilot_usage/vscode_parser.py b/src/copilot_usage/vscode_parser.py index 152a9dc4..56f99cac 100644 --- a/src/copilot_usage/vscode_parser.py +++ b/src/copilot_usage/vscode_parser.py @@ -161,11 +161,11 @@ def _parse_vscode_log_from_offset( # incremental call re-reads this line once complete. break safe_end += len(raw_line) - # Decode with replacement to mirror parse_vscode_log behaviour. - line = raw_line.decode("utf-8", errors="replace") # Fast pre-filter: only ~1–5% of lines contain "ccreq:" - if "ccreq:" not in line: + if b"ccreq:" not in raw_line: continue + # Decode with replacement to mirror parse_vscode_log behaviour. + line = raw_line.decode("utf-8", errors="replace") m = _CCREQ_RE.match(line) if m is None: continue diff --git a/tests/copilot_usage/test_vscode_parser.py b/tests/copilot_usage/test_vscode_parser.py index cf7b2735..36aad43c 100644 --- a/tests/copilot_usage/test_vscode_parser.py +++ b/tests/copilot_usage/test_vscode_parser.py @@ -1,5 +1,6 @@ """Tests for copilot_usage.vscode_parser and the vscode CLI subcommand.""" +import os import re from datetime import datetime from pathlib import Path @@ -1451,23 +1452,31 @@ def test_append_with_noise_lines(self, tmp_path: Path) -> None: assert len(second) == 2 def test_append_empty_content(self, tmp_path: Path) -> None: - """Appending empty content (touch) still returns all original requests.""" + """Rewriting identical content triggers the incremental path at end_offset.""" log_file = tmp_path / "chat.log" log_file.write_text(_make_log_line(req_idx=0) + "\n") first = _get_cached_vscode_requests(log_file) assert len(first) == 1 - # Rewrite the file with identical content to change mtime while - # keeping size unchanged. The cache detects the mtime change and - # takes the incremental path which seeks to end_offset, finds - # nothing new, and returns the original cached requests. - import time - - time.sleep(0.01) + # Rewrite the file with identical content and force a distinct mtime + # via os.utime so the cache reliably detects a change regardless of + # filesystem mtime resolution. log_file.write_bytes(log_file.read_bytes()) # rewrite same content + stat = log_file.stat() + os.utime(log_file, ns=(stat.st_atime_ns, stat.st_mtime_ns + 1_000_000)) + + with patch( + "copilot_usage.vscode_parser._parse_vscode_log_from_offset", + wraps=_parse_vscode_log_from_offset, + ) as spy: + second = _get_cached_vscode_requests(log_file) + # The incremental path must be called with the cached end_offset + # (i.e. the byte length of the original content), not 0. + spy.assert_called_once() + call_offset = spy.call_args[0][1] + assert call_offset > 0, "Expected incremental call with non-zero offset" - second = _get_cached_vscode_requests(log_file) assert len(second) == 1 assert second == first From eec38798bfa5c1d67c295910c4d65cefc8d8140b Mon Sep 17 00:00:00 2001 From: Sasa Junuzovic <44276455+microsasa@users.noreply.github.com> Date: Sun, 5 Apr 2026 09:03:14 -0700 Subject: [PATCH 4/6] fix: address review comments Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/copilot_usage/vscode_parser.py | 18 +++++++++++++++--- tests/copilot_usage/test_vscode_parser.py | 9 +++++++++ 2 files changed, 24 insertions(+), 3 deletions(-) diff --git a/src/copilot_usage/vscode_parser.py b/src/copilot_usage/vscode_parser.py index 56f99cac..3a7c45d0 100644 --- a/src/copilot_usage/vscode_parser.py +++ b/src/copilot_usage/vscode_parser.py @@ -128,16 +128,22 @@ def parse_vscode_log(log_path: Path) -> list[VSCodeRequest]: """Parse a single VS Code Copilot Chat log file into request objects. Returns a list of parsed requests (possibly empty when no lines match). + Unlike incremental parsing via :func:`_parse_vscode_log_from_offset`, + this performs a complete one-shot read and includes the final line even + when it is not newline-terminated. Raises: OSError: If the file cannot be opened or read. """ - requests, _ = _parse_vscode_log_from_offset(log_path, 0) + requests, _ = _parse_vscode_log_from_offset(log_path, 0, include_partial_tail=True) return requests def _parse_vscode_log_from_offset( - log_path: Path, offset: int + log_path: Path, + offset: int, + *, + include_partial_tail: bool = False, ) -> tuple[list[VSCodeRequest], int]: """Parse VS Code Copilot Chat log starting at *offset* bytes. @@ -147,6 +153,11 @@ def _parse_vscode_log_from_offset( the next incremental call can re-read them once the writer finishes the line. + When *include_partial_tail* is ``True`` (used by :func:`parse_vscode_log` + for one-shot full parsing), a final non-newline-terminated line is + **included** in the results to preserve text-mode semantics where + every line — including the last — is processed. + Raises: OSError: If the file cannot be opened or read. """ @@ -156,7 +167,8 @@ def _parse_vscode_log_from_offset( if offset > 0: fb.seek(offset) for raw_line in fb: - if not raw_line.endswith(b"\n"): + is_complete = raw_line.endswith(b"\n") + if not is_complete and not include_partial_tail: # Partial line at EOF — stop advancing so the next # incremental call re-reads this line once complete. break diff --git a/tests/copilot_usage/test_vscode_parser.py b/tests/copilot_usage/test_vscode_parser.py index 36aad43c..c6ac1bf2 100644 --- a/tests/copilot_usage/test_vscode_parser.py +++ b/tests/copilot_usage/test_vscode_parser.py @@ -191,6 +191,15 @@ def test_ccreq_line_with_replacement_char_is_skipped(self, tmp_path: Path) -> No result = parse_vscode_log(log_path) assert result == [] + def test_partial_tail_included_for_one_shot_parse(self, tmp_path: Path) -> None: + """parse_vscode_log includes a final line even without a trailing newline.""" + log_file = tmp_path / "test.log" + # Write a valid ccreq line without a trailing newline. + log_file.write_text(_make_log_line(req_idx=0), encoding="utf-8") + result = parse_vscode_log(log_file) + assert len(result) == 1 + assert result[0].request_id == "req00000" + # --------------------------------------------------------------------------- # build_vscode_summary From e3299c4c6c336f593fd27eeaed68f920c6819919 Mon Sep 17 00:00:00 2001 From: Sasa Junuzovic <44276455+microsasa@users.noreply.github.com> Date: Sun, 5 Apr 2026 09:16:13 -0700 Subject: [PATCH 5/6] fix: address review comments - Clarify _parse_vscode_log_from_offset docstring to document that end_offset advances past partial tail when include_partial_tail=True - Add os.fstat TOCTOU guard in _parse_vscode_log_from_offset: re-validate actual file size after open() to detect truncation between stat() and open(), falling back to full reparse when actual_size < offset - Handle truncation detection in _get_cached_vscode_requests by checking if returned new_end < requested end_offset Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/copilot_usage/vscode_parser.py | 26 ++++++++++++++++++++------ 1 file changed, 20 insertions(+), 6 deletions(-) diff --git a/src/copilot_usage/vscode_parser.py b/src/copilot_usage/vscode_parser.py index 3a7c45d0..4103a836 100644 --- a/src/copilot_usage/vscode_parser.py +++ b/src/copilot_usage/vscode_parser.py @@ -148,15 +148,16 @@ def _parse_vscode_log_from_offset( """Parse VS Code Copilot Chat log starting at *offset* bytes. Returns ``(requests, end_offset)`` where *end_offset* is the byte - position at the end of the last **complete** (newline-terminated) - line read. Partial lines at EOF are intentionally excluded so that - the next incremental call can re-read them once the writer finishes - the line. + position immediately after the last line consumed by this call. + With the default ``include_partial_tail=False``, this is the end of + the last **complete** (newline-terminated) line read; a partial line + at EOF is intentionally excluded so that the next incremental call + can re-read it once the writer finishes the line. When *include_partial_tail* is ``True`` (used by :func:`parse_vscode_log` for one-shot full parsing), a final non-newline-terminated line is - **included** in the results to preserve text-mode semantics where - every line — including the last — is processed. + **included** in the results, and ``end_offset`` advances past that + consumed partial tail as well to preserve full-file text semantics. Raises: OSError: If the file cannot be opened or read. @@ -165,6 +166,13 @@ def _parse_vscode_log_from_offset( safe_end: int = offset with log_path.open("rb") as fb: if offset > 0: + # Guard against TOCTOU race: the file may have been + # truncated/replaced between the caller's stat() and this + # open(). Re-validate with fstat on the open descriptor. + actual_size = os.fstat(fb.fileno()).st_size + if actual_size < offset: + offset = 0 + safe_end = 0 fb.seek(offset) for raw_line in fb: is_complete = raw_line.endswith(b"\n") @@ -274,6 +282,12 @@ def _get_cached_vscode_requests(log_path: Path) -> tuple[VSCodeRequest, ...]: # position we will seek to when resuming parsing. if new_id is not None and old_id is not None and new_id[1] >= end_offset: new_reqs, new_end = _parse_vscode_log_from_offset(log_path, end_offset) + if new_end < end_offset: + # fstat inside the parser detected truncation — the + # returned results are a full reparse, not a delta. + result = tuple(new_reqs) + _update_vscode_cache(log_path, new_id, new_end, result) + return result combined = old_requests + tuple(new_reqs) _update_vscode_cache(log_path, new_id, new_end, combined) return combined From a07b4fcd4d656565514710d948297aed2996109a Mon Sep 17 00:00:00 2001 From: Sasa Junuzovic <44276455+microsasa@users.noreply.github.com> Date: Sun, 5 Apr 2026 09:25:28 -0700 Subject: [PATCH 6/6] fix: add tests for TOCTOU truncation paths to satisfy diff-coverage Add two tests to TestIncrementalParsing covering previously untested defensive code paths in vscode_parser.py: - test_toctou_truncation_resets_offset: exercises lines 174-175 where _parse_vscode_log_from_offset detects via fstat that the file shrunk since the caller's stat() and resets the read offset to 0. - test_incremental_fstat_truncation_returns_full_reparse: exercises lines 288-290 where _get_cached_vscode_requests detects that the incremental parser reported truncation (new_end < end_offset) and treats the result as a full reparse rather than a delta. Diff-coverage for vscode_parser.py is now 100%. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- tests/copilot_usage/test_vscode_parser.py | 43 +++++++++++++++++++++++ 1 file changed, 43 insertions(+) diff --git a/tests/copilot_usage/test_vscode_parser.py b/tests/copilot_usage/test_vscode_parser.py index c6ac1bf2..c9bcd93a 100644 --- a/tests/copilot_usage/test_vscode_parser.py +++ b/tests/copilot_usage/test_vscode_parser.py @@ -1512,3 +1512,46 @@ def test_partial_line_at_eof_excluded(self, tmp_path: Path) -> None: third = _get_cached_vscode_requests(log_file) # Now both requests are returned. assert len(third) == 2 + + def test_toctou_truncation_resets_offset(self, tmp_path: Path) -> None: + """_parse_vscode_log_from_offset resets to 0 when fstat shows file shrunk.""" + log_file = tmp_path / "chat.log" + log_file.write_text(_make_log_line(req_idx=0) + "\n") + + # Call with an offset beyond the file's actual size to trigger the + # TOCTOU guard (actual_size < offset → reset offset and safe_end to 0). + reqs, end_offset = _parse_vscode_log_from_offset(log_file, 999_999) + assert len(reqs) == 1 + # end_offset should equal the full file size (parsed from byte 0). + assert end_offset == log_file.stat().st_size + + def test_incremental_fstat_truncation_returns_full_reparse( + self, tmp_path: Path + ) -> None: + """Incremental path falls back when parser detects TOCTOU truncation.""" + log_file = tmp_path / "chat.log" + initial = "\n".join(_make_log_line(req_idx=i) for i in range(5)) + log_file.write_text(initial + "\n") + + # Populate cache. + first = _get_cached_vscode_requests(log_file) + assert len(first) == 5 + cached_entry = _VSCODE_LOG_CACHE[log_file] + old_end = cached_entry[1] + + # Grow the file so the incremental path is taken (new_id[1] >= end_offset). + with log_file.open("a") as f: + f.write(_make_log_line(req_idx=99) + "\n") + + # Mock _parse_vscode_log_from_offset to simulate the fstat-inside-open + # truncation: return new_end < end_offset so the cache treats it as a + # full reparse rather than a delta. + single_req = _parse_vscode_log_from_offset(log_file, 0)[0][:1] + with patch( + "copilot_usage.vscode_parser._parse_vscode_log_from_offset", + return_value=(single_req, old_end - 1), + ): + result = _get_cached_vscode_requests(log_file) + + # Result should be only the mocked full-reparse output, not combined. + assert len(result) == 1