Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
149 changes: 122 additions & 27 deletions src/copilot_usage/vscode_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,16 +128,64 @@ def parse_vscode_log(log_path: Path) -> list[VSCodeRequest]:
"""Parse a single VS Code Copilot Chat log file into request objects.

Returns a list of parsed requests (possibly empty when no lines match).
Unlike incremental parsing via :func:`_parse_vscode_log_from_offset`,
this performs a complete one-shot read and includes the final line even
when it is not newline-terminated.

Raises:
OSError: If the file cannot be opened or read.
"""
requests, _ = _parse_vscode_log_from_offset(log_path, 0, include_partial_tail=True)
return requests


def _parse_vscode_log_from_offset(
log_path: Path,
offset: int,
*,
include_partial_tail: bool = False,
) -> tuple[list[VSCodeRequest], int]:
"""Parse VS Code Copilot Chat log starting at *offset* bytes.

Returns ``(requests, end_offset)`` where *end_offset* is the byte
position immediately after the last line consumed by this call.
With the default ``include_partial_tail=False``, this is the end of
the last **complete** (newline-terminated) line read; a partial line
at EOF is intentionally excluded so that the next incremental call
can re-read it once the writer finishes the line.

When *include_partial_tail* is ``True`` (used by :func:`parse_vscode_log`
for one-shot full parsing), a final non-newline-terminated line is
**included** in the results, and ``end_offset`` advances past that
consumed partial tail as well to preserve full-file text semantics.

Raises:
OSError: If the file cannot be opened or read.
"""
requests: list[VSCodeRequest] = []
with log_path.open(encoding="utf-8", errors="replace") as f:
for line in f:
safe_end: int = offset
with log_path.open("rb") as fb:
if offset > 0:
# Guard against TOCTOU race: the file may have been
# truncated/replaced between the caller's stat() and this
# open(). Re-validate with fstat on the open descriptor.
actual_size = os.fstat(fb.fileno()).st_size
if actual_size < offset:
offset = 0
safe_end = 0
fb.seek(offset)
for raw_line in fb:
is_complete = raw_line.endswith(b"\n")
if not is_complete and not include_partial_tail:
# Partial line at EOF — stop advancing so the next
# incremental call re-reads this line once complete.
break
safe_end += len(raw_line)
# Fast pre-filter: only ~1–5% of lines contain "ccreq:"
if "ccreq:" not in line:
if b"ccreq:" not in raw_line:
continue
# Decode with replacement to mirror parse_vscode_log behaviour.
line = raw_line.decode("utf-8", errors="replace")
m = _CCREQ_RE.match(line)
if m is None:
continue
Expand All @@ -155,53 +203,100 @@ def parse_vscode_log(log_path: Path) -> list[VSCodeRequest]:
category=category,
)
)
logger.debug("Parsed {} request(s) from {}", len(requests), log_path)
return requests
logger.debug(
"Parsed {} request(s) from {} (offset {}→{})",
len(requests),
log_path,
offset,
safe_end,
)
return requests, safe_end


# ---------------------------------------------------------------------------
# Module-level parsed-requests cache (mirrors parser._EVENTS_CACHE).
# Uses OrderedDict for LRU eviction: most-recently-used entries are at
# the back, least-recently-used at the front.
#
# Cache value layout: (file_id, end_offset, requests_tuple)
# file_id – (st_mtime_ns, st_size) or None when stat() fails
# end_offset – byte position of the last parsed line
# requests – immutable tuple of parsed VSCodeRequest objects
# ---------------------------------------------------------------------------

_MAX_CACHED_VSCODE_LOGS: Final[int] = 64
_VSCODE_LOG_CACHE: OrderedDict[
Path, tuple[tuple[int, int] | None, tuple[VSCodeRequest, ...]]
Path, tuple[tuple[int, int] | None, int, tuple[VSCodeRequest, ...]]
] = OrderedDict()


def _update_vscode_cache(
    log_path: Path,
    file_id: tuple[int, int] | None,
    end_offset: int,
    requests: tuple[VSCodeRequest, ...],
) -> None:
    """Store a cache entry for *log_path*, evicting the LRU entry when full.

    Re-inserting an existing key moves it to the most-recently-used
    position (back of the ``OrderedDict``); inserting a brand-new key
    first makes room by evicting from the least-recently-used front.
    """
    # ``pop`` with a default both removes a stale entry and tells us
    # whether the key was already present (values are always tuples,
    # so ``None`` unambiguously means "absent").
    if _VSCODE_LOG_CACHE.pop(log_path, None) is None:
        # New key: enforce the size bound before inserting.
        while len(_VSCODE_LOG_CACHE) >= _MAX_CACHED_VSCODE_LOGS:
            _VSCODE_LOG_CACHE.popitem(last=False)  # evict LRU (front)
    _VSCODE_LOG_CACHE[log_path] = (file_id, end_offset, requests)


def _get_cached_vscode_requests(log_path: Path) -> tuple[VSCodeRequest, ...]:
"""Return parsed requests, re-parsing only when ``(mtime_ns, size)`` changes.
"""Return parsed requests, incrementally parsing only new content.

On the first call for a given *log_path*, delegates to
:func:`parse_vscode_log` and stores the result. Subsequent calls
return the cached tuple as long as the file identity is unchanged.
:func:`_parse_vscode_log_from_offset` (offset 0) and stores the
result together with the byte offset reached. Subsequent calls
detect whether the file has **grown** (append-only) by comparing
the new ``st_size`` against the cached size — if so, only the
bytes after the stored offset are parsed and appended to the
existing result.

When the file is **replaced** (new size < cached size) or
``st_size`` cannot be determined, a full re-parse is performed.
Comment on lines +252 to +259
Copy link

Copilot AI Apr 5, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The _get_cached_vscode_requests docstring says it detects append-only growth by comparing st_size to the “cached size” and treats replacement as new size < cached size, but the implementation actually compares the current st_size against the cached end_offset (safe parse boundary). Update the wording to reference end_offset (and that truncation/replacement is detected when st_size < end_offset) so the docs match behavior, especially when a previous parse stopped before EOF due to a partial line.

Suggested change
result together with the byte offset reached. Subsequent calls
detect whether the file has **grown** (append-only) by comparing
the new ``st_size`` against the cached size — if so, only the
bytes after the stored offset are parsed and appended to the
existing result.
When the file is **replaced** (new size < cached size) or
``st_size`` cannot be determined, a full re-parse is performed.
result together with the safe byte offset reached (``end_offset``).
Subsequent calls detect whether the file has **grown**
(append-only) by comparing the new ``st_size`` against the cached
``end_offset`` — if so, only the bytes after that stored offset are
parsed and appended to the existing result. This matters when a
previous parse stopped before EOF due to a partial trailing line:
``end_offset`` is the safe resume point, not necessarily the prior
end of file.
When the file is **truncated or replaced** (``st_size < end_offset``)
or ``st_size`` cannot be determined, a full re-parse is performed.

Copilot uses AI. Check for mistakes.

The cache is bounded to :data:`_MAX_CACHED_VSCODE_LOGS` entries;
the **least-recently used** entry is evicted when the limit is
reached.

The parsed list is converted to a ``tuple`` before storage so that
callers cannot accidentally append, pop, or reorder entries in the
cache — matching the container-level immutability pattern used by
:func:`copilot_usage.parser.get_cached_events`.

Raises:
OSError: Propagated from :func:`parse_vscode_log` when the file
cannot be opened or read.
OSError: Propagated from :func:`_parse_vscode_log_from_offset`
when the file cannot be opened or read.
"""
file_id = _safe_file_identity(log_path)
new_id = _safe_file_identity(log_path)
cached = _VSCODE_LOG_CACHE.get(log_path)
if cached is not None and cached[0] == file_id:
_VSCODE_LOG_CACHE.move_to_end(log_path)
return cached[1]
requests = tuple(parse_vscode_log(log_path))
if log_path in _VSCODE_LOG_CACHE:
del _VSCODE_LOG_CACHE[log_path]
elif len(_VSCODE_LOG_CACHE) >= _MAX_CACHED_VSCODE_LOGS:
_VSCODE_LOG_CACHE.popitem(last=False) # evict LRU (front)
_VSCODE_LOG_CACHE[log_path] = (file_id, requests)
return requests

if cached is not None:
old_id, end_offset, old_requests = cached

# Exact match: file unchanged — return cached result.
if old_id == new_id:
_VSCODE_LOG_CACHE.move_to_end(log_path)
return old_requests

# Incremental path: file grew (append-only) beyond the cached
# resume point. Compare against ``end_offset`` because that is the
# position we will seek to when resuming parsing.
if new_id is not None and old_id is not None and new_id[1] >= end_offset:
new_reqs, new_end = _parse_vscode_log_from_offset(log_path, end_offset)
if new_end < end_offset:
# fstat inside the parser detected truncation — the
# returned results are a full reparse, not a delta.
result = tuple(new_reqs)
_update_vscode_cache(log_path, new_id, new_end, result)
return result
combined = old_requests + tuple(new_reqs)
_update_vscode_cache(log_path, new_id, new_end, combined)
return combined

# Full parse: first call or file was truncated/replaced.
requests, end_offset = _parse_vscode_log_from_offset(log_path, 0)
result = tuple(requests)
_update_vscode_cache(log_path, new_id, end_offset, result)
return result


@dataclass(slots=True)
Expand Down
Loading
Loading