-
-
Notifications
You must be signed in to change notification settings - Fork 8
debug: add memory_snapshot WS command for leak diagnosis #935
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -46,6 +46,17 @@ | |
| "CRITICAL": "red", | ||
| } | ||
|
|
||
| # Accepted "on" spellings for ``$ESPHOME_DEBUG_MEMORY``. Anything else | ||
| # (``0`` / ``false`` / empty / unset / a typo) leaves tracemalloc off | ||
| # so a bare ``ESPHOME_DEBUG_MEMORY=0`` doesn't silently turn it on the | ||
| # way a ``bool(os.environ.get(...))`` check would. | ||
| _DEBUG_MEMORY_TRUTHY = frozenset({"1", "true", "yes", "on"}) | ||
|
|
||
|
|
||
def _memory_tracking_enabled_from_env() -> bool:
    """
    Report whether ``$ESPHOME_DEBUG_MEMORY`` holds an accepted "on" spelling.

    The raw value is stripped of surrounding whitespace and lower-cased
    before it is looked up in ``_DEBUG_MEMORY_TRUTHY``; an unset variable
    is treated as the empty string and therefore reads as "off".
    """
    raw_value = os.environ.get("ESPHOME_DEBUG_MEMORY", "")
    normalized = raw_value.strip().lower()
    return normalized in _DEBUG_MEMORY_TRUTHY
|
|
||
|
|
||
| def _setup_logging(log_level: str, log_file: str | None = None) -> None: | ||
| """Set up logging with a coloured console handler and an optional rotating file.""" | ||
|
|
@@ -101,6 +112,16 @@ def _setup_logging(log_level: str, log_file: str | None = None) -> None: | |
|
|
||
| def main() -> None: | ||
| """Run the ESPHome Device Builder.""" | ||
| # Enable tracemalloc as the very first step so the | ||
| # ``debug/memory_snapshot`` WS command (helpers/memory.py) can | ||
| # produce diffs that include the catalog loads and other | ||
| # startup allocations. Off by default — adds per-allocation | ||
| # overhead. | ||
| if _memory_tracking_enabled_from_env(): | ||
| import tracemalloc # noqa: PLC0415 | ||
|
|
||
| tracemalloc.start(25) | ||
|
Comment on lines
+115
to
+123
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. added in b948822 — extracted the gate into `_memory_tracking_enabled_from_env()` |
||
|
|
||
| parser = argparse.ArgumentParser( | ||
| description="ESPHome Device Builder", | ||
| formatter_class=argparse.ArgumentDefaultsHelpFormatter, | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,105 @@ | ||
| """Debug WS commands — memory snapshots for support / leak hunts.""" | ||
|
|
||
| from __future__ import annotations | ||
|
|
||
| import logging | ||
| from typing import TYPE_CHECKING, Any | ||
|
|
||
| from ..helpers import memory | ||
| from ..helpers.api import CommandError, api_command | ||
| from ..models import ErrorCode | ||
|
|
||
| if TYPE_CHECKING: | ||
| from ..device_builder import DeviceBuilder | ||
|
|
||
| _LOGGER = logging.getLogger(__name__) | ||
|
|
||
| _MAX_TOP_N = 200 | ||
| _MAX_BASELINE_NAME_LEN = 100 | ||
|
|
||
|
|
||
def _validate_baseline_name(value: Any, *, field: str) -> str:
    """
    Check that *value* is usable as a baseline name and hand it back.

    A usable name is a ``str`` whose length is between 1 and
    ``_MAX_BASELINE_NAME_LEN`` inclusive. Anything else raises
    ``CommandError`` with ``ErrorCode.INVALID_ARGS``; *field* is only
    used to name the offending argument in the error message.
    """
    if isinstance(value, str) and 0 < len(value) <= _MAX_BASELINE_NAME_LEN:
        return value
    raise CommandError(
        ErrorCode.INVALID_ARGS,
        f"{field} must be a non-empty string of at most {_MAX_BASELINE_NAME_LEN} characters",
    )
|
|
||
|
|
||
class DebugController:
    """Owns the ``debug/*`` WS commands. Stateless beyond ``helpers.memory``."""

    def __init__(self, device_builder: DeviceBuilder) -> None:
        """Keep a reference to the owning *device_builder*."""
        self._db = device_builder

    @api_command("debug/memory_snapshot")
    async def memory_snapshot(
        self,
        *,
        top_n: int = 25,
        save_as: str | None = None,
        compare_with: str | None = None,
        drop_baseline: str | None = None,
        **_kwargs: Any,
    ) -> dict[str, Any]:
        """
        Return process memory stats + the top ``tracemalloc`` allocators.

        First call enables ``tracemalloc`` lazily and returns an empty
        ``top_allocators`` (allocations before this call aren't traced);
        ``save_as`` / ``compare_with`` are validated but not acted on in
        that case because there is no snapshot yet.
        Set ``ESPHOME_DEBUG_MEMORY=1`` at process start to catch
        startup allocations too.

        ``top_n``: number of allocators to return, 1.._MAX_TOP_N.
        ``save_as``: bookmark the snapshot for later ``compare_with``.
        ``compare_with``: diff against a previously-saved baseline.
        ``drop_baseline``: forget the named baseline; succeeds silently
        if it wasn't saved.

        Every argument is validated before any baseline is dropped or
        saved, so a request rejected with ``INVALID_ARGS`` never leaves
        the baseline store half-modified.

        Raises ``CommandError(INVALID_ARGS)`` for an out-of-range or
        non-int ``top_n`` or a malformed baseline name, and
        ``CommandError(NOT_FOUND)`` when ``compare_with`` names a
        baseline that is not stored.
        """
        # ``bool`` is a subclass of ``int``; without the explicit check
        # ``top_n=True`` would be accepted and silently treated as 1.
        top_n_ok = (
            isinstance(top_n, int) and not isinstance(top_n, bool) and 1 <= top_n <= _MAX_TOP_N
        )
        if not top_n_ok:
            raise CommandError(
                ErrorCode.INVALID_ARGS,
                f"top_n must be an int between 1 and {_MAX_TOP_N}",
            )

        # Validate all names up front (same order as they are used
        # below) so nothing is mutated when a later argument is bad.
        if drop_baseline is not None:
            drop_baseline = _validate_baseline_name(drop_baseline, field="drop_baseline")
        if compare_with is not None:
            compare_with = _validate_baseline_name(compare_with, field="compare_with")
        if save_as is not None:
            save_as = _validate_baseline_name(save_as, field="save_as")

        if drop_baseline is not None:
            memory.drop_baseline(drop_baseline)

        if not memory.is_tracking():
            memory.start_tracking()
            _LOGGER.info("Memory tracking enabled via debug/memory_snapshot")
            return {
                "system": memory.system_stats(),
                "top_allocators": [],
                "baseline_names": memory.baseline_names(),
                "note": (
                    "tracemalloc was just enabled — allocations before "
                    "this call aren't traced. Run a build, then call "
                    "again with save_as to bookmark a baseline, and "
                    "again later with compare_with to see what grew."
                ),
            }

        # Resolve the baseline before taking the snapshot so an unknown
        # name fails fast without paying for a full heap snapshot.
        baseline = None
        if compare_with is not None:
            baseline = memory.get_baseline(compare_with)
            if baseline is None:
                raise CommandError(
                    ErrorCode.NOT_FOUND,
                    f"baseline {compare_with!r} not saved; known: {memory.baseline_names()}",
                )

        snapshot = memory.take_snapshot()

        if save_as is not None:
            memory.save_baseline(save_as, snapshot)

        return {
            "system": memory.system_stats(),
            "top_allocators": memory.format_top_allocators(
                snapshot, baseline=baseline, top_n=top_n
            ),
            "baseline_names": memory.baseline_names(),
        }
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,125 @@ | ||
| """ | ||
| Memory-debugging helpers backing the ``debug/memory_snapshot`` WS command. | ||
|
|
||
| Wraps stdlib ``tracemalloc`` plus a small in-memory baseline store | ||
| so support requests asking for a heap diff don't need users to | ||
| attach a profiler — just enable tracking (set | ||
| ``ESPHOME_DEBUG_MEMORY=1`` or call the WS command once), | ||
| ``save_as="before"`` a build, ``compare_with="before"`` after, and | ||
| paste the diff. The baseline store is process-local and lost on | ||
| restart; that's fine for ad-hoc debugging. | ||
| """ | ||
|
|
||
| from __future__ import annotations | ||
|
|
||
| import gc | ||
| import sys | ||
| import tracemalloc | ||
| from typing import Any | ||
|
|
||
| try: | ||
| import resource | ||
| except ImportError: | ||
| # Windows doesn't ship the ``resource`` module. The RSS field | ||
| # is best-effort everywhere and just gets omitted there. | ||
| resource = None # type: ignore[assignment] | ||
|
|
||
| _DEFAULT_FRAMES = 25 | ||
|
|
||
| _baselines: dict[str, tracemalloc.Snapshot] = {} | ||
|
|
||
|
|
||
def start_tracking(frames: int = _DEFAULT_FRAMES) -> None:
    """
    Turn on ``tracemalloc`` tracing, keeping up to *frames* frames per trace.

    Does nothing when tracing is already active, so repeated calls are safe.
    """
    if tracemalloc.is_tracing():
        return
    tracemalloc.start(frames)
|
|
||
|
|
||
def is_tracking() -> bool:
    """Report whether ``tracemalloc`` is tracing allocations right now."""
    tracing = tracemalloc.is_tracing()
    return tracing
|
|
||
|
|
||
def take_snapshot() -> tracemalloc.Snapshot:
    """
    Capture and return a new ``tracemalloc`` snapshot.

    The call is forwarded to ``tracemalloc.take_snapshot()`` without any
    check here; it is the caller's job to make sure tracking is on first.
    """
    snapshot = tracemalloc.take_snapshot()
    return snapshot
|
|
||
|
|
||
def save_baseline(name: str, snapshot: tracemalloc.Snapshot) -> None:
    """
    Remember *snapshot* under *name* so a later ``compare_with`` can find it.

    An existing baseline with the same name is replaced.
    """
    _baselines.update({name: snapshot})
|
Comment on lines
+29
to
+50
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. hmmm not sure I agree with this - this is a debug endpoint |
||
|
|
||
|
|
||
def get_baseline(name: str) -> tracemalloc.Snapshot | None:
    """Look up the baseline saved as *name*; ``None`` when nothing is stored under it."""
    try:
        return _baselines[name]
    except KeyError:
        return None
|
|
||
|
|
||
def baseline_names() -> list[str]:
    """Return the names of all stored baselines in sorted order."""
    names = list(_baselines)
    names.sort()
    return names
|
|
||
|
|
||
def drop_baseline(name: str) -> bool:
    """
    Forget the baseline saved as *name*.

    Returns ``True`` when a snapshot was stored under that name and
    ``False`` when there was nothing to remove.
    """
    removed = _baselines.pop(name, None)
    return removed is not None
|
|
||
|
|
||
def format_top_allocators(
    snapshot: tracemalloc.Snapshot,
    *,
    baseline: tracemalloc.Snapshot | None = None,
    top_n: int = 25,
) -> list[dict[str, Any]]:
    """
    Return the top-*top_n* allocators in *snapshot* (or diff vs *baseline*).

    Each entry: ``{traceback, size_bytes, size_diff_bytes, count, count_diff}``.
    Diff fields are zero when no baseline is supplied. Statistics are
    grouped with ``tracemalloc``'s ``"lineno"`` key, so ``traceback`` is
    a list of ``"<file>:<lineno>"`` strings that holds only the
    allocating line, not the full call stack.

    Entries keep the order ``tracemalloc`` returns them in. A *top_n* of
    zero or less yields an empty list.
    """
    # A negative slice bound would mean "everything except the last k"
    # and return almost the whole table instead of nothing.
    if top_n <= 0:
        return []
    if baseline is None:
        stats = snapshot.statistics("lineno")
    else:
        stats = snapshot.compare_to(baseline, "lineno")
    return [_stat_to_dict(stat) for stat in stats[:top_n]]
|
|
||
|
|
||
def system_stats() -> dict[str, Any]:
    """
    Collect cheap process-wide memory figures; works with tracking on or off.

    Always present: ``gc_counts``, ``sys_allocated_blocks`` and ``tracking``.
    The ``tracemalloc_*`` byte counters are added only while tracing is
    active, and ``max_rss_bytes`` only when ``_max_rss_bytes()`` has a value.
    """
    report: dict[str, Any] = {}
    report["gc_counts"] = list(gc.get_count())
    report["sys_allocated_blocks"] = sys.getallocatedblocks()
    report["tracking"] = tracemalloc.is_tracing()

    if tracemalloc.is_tracing():
        traced_now, traced_peak = tracemalloc.get_traced_memory()
        report["tracemalloc_current_bytes"] = traced_now
        report["tracemalloc_peak_bytes"] = traced_peak
        report["tracemalloc_overhead_bytes"] = tracemalloc.get_tracemalloc_memory()

    rss_peak = _max_rss_bytes()
    if rss_peak is None:
        return report
    report["max_rss_bytes"] = rss_peak
    return report
|
|
||
|
|
||
def _stat_to_dict(stat: Any) -> dict[str, Any]:
    """
    Flatten a ``tracemalloc`` Statistic / StatisticDiff into the wire shape.

    ``size_diff`` and ``count_diff`` are read when *stat* has them and
    default to ``0`` otherwise, so both statistic kinds produce the same
    set of keys.
    """
    # Frames go through ``str()`` untouched. Paths are deliberately not
    # shortened: per the review discussion on this change, this is an
    # authed debug endpoint and full paths are needed to localise a leak.
    frames = [str(frame) for frame in stat.traceback]
    size_delta = getattr(stat, "size_diff", 0)
    count_delta = getattr(stat, "count_diff", 0)
    return {
        "traceback": frames,
        "size_bytes": stat.size,
        "size_diff_bytes": size_delta,
        "count": stat.count,
        "count_diff": count_delta,
    }
|
|
||
|
|
||
def _max_rss_bytes() -> int | None:
    """
    Best-effort peak resident set size of this process, in bytes.

    Returns ``None`` when the ``resource`` module could not be imported.
    """
    if resource is None:
        return None
    usage = resource.getrusage(resource.RUSAGE_SELF)
    peak = int(usage.ru_maxrss)
    # ru_maxrss is already in bytes on macOS; Linux / BSD report KB.
    unit = 1 if sys.platform == "darwin" else 1024
    return peak * unit
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
fixed in b948822 — only `1` / `true` / `yes` / `on` (case-insensitive, whitespace-tolerant) enable it now, so `=0` / empty / `false` / typos leave it off.