Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions docs/API.md
Original file line number Diff line number Diff line change
Expand Up @@ -372,6 +372,9 @@ Same-subnet peers read `remote_build_port` from TXT so a `--remote-build-port` o
|---------|------|----------|-------------|
| `ping` | — | `{pong: true}` | Health check |
| `subscribe_events` | — | Streaming | Subscribe to real-time events |
| `debug/memory_snapshot` | `top_n?`, `save_as?`, `compare_with?`, `drop_baseline?` | `{system, top_allocators, baseline_names, note?}` | Capture a `tracemalloc` snapshot for leak diagnosis |

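**`debug/memory_snapshot` usage:** to bisect a memory leak, start the dashboard with `ESPHOME_DEBUG_MEMORY=1` (`true` / `yes` / `on` are also accepted, case-insensitively) so `tracemalloc` traces every allocation from boot. Call once with `save_as="before"` to bookmark a baseline, reproduce the suspected leak (run a build, browse the UI, …), then call again with `compare_with="before"` to get the top-N allocators ordered by size delta. Without the env var the command still works, but its first invocation only turns tracking on: it returns an empty `top_allocators` plus a `note`, does not apply `save_as` / `compare_with`, and only allocations made after that call are traced. The returned `system` object carries process-wide stats: `gc_counts`, `sys_allocated_blocks`, `tracking`, `tracemalloc_current_bytes` / `tracemalloc_peak_bytes` / `tracemalloc_overhead_bytes` (only while tracking), and `max_rss_bytes` (omitted on platforms without the `resource` module, e.g. Windows).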

**`subscribe_events` initial state:**

Expand Down
21 changes: 21 additions & 0 deletions esphome_device_builder/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,17 @@
"CRITICAL": "red",
}

# Spellings of ``$ESPHOME_DEBUG_MEMORY`` that switch tracemalloc on.
# Every other value (``0`` / ``false`` / empty / unset / a typo) keeps
# it off; a plain ``bool(os.environ.get(...))`` test would instead
# treat ``ESPHOME_DEBUG_MEMORY=0`` as enabled.
_DEBUG_MEMORY_TRUTHY = frozenset(("1", "true", "yes", "on"))


def _memory_tracking_enabled_from_env() -> bool:
    """
    Report whether ``$ESPHOME_DEBUG_MEMORY`` requests memory tracking.

    The value is compared case-insensitively, ignoring surrounding
    whitespace, against ``_DEBUG_MEMORY_TRUTHY``. An unset variable
    counts as "off".
    """
    raw_value = os.environ.get("ESPHOME_DEBUG_MEMORY", "")
    normalized = raw_value.strip().lower()
    return normalized in _DEBUG_MEMORY_TRUTHY


def _setup_logging(log_level: str, log_file: str | None = None) -> None:
"""Set up logging with a coloured console handler and an optional rotating file."""
Expand Down Expand Up @@ -101,6 +112,16 @@ def _setup_logging(log_level: str, log_file: str | None = None) -> None:

def main() -> None:
"""Run the ESPHome Device Builder."""
# Enable tracemalloc as the very first step so the
# ``debug/memory_snapshot`` WS command (helpers/memory.py) can
# produce diffs that include the catalog loads and other
# startup allocations. Off by default — adds per-allocation
# overhead.
if _memory_tracking_enabled_from_env():
import tracemalloc # noqa: PLC0415

tracemalloc.start(25)
Comment on lines +115 to +123
Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

fixed in b948822 — only 1 / true / yes / on (case-insensitive, whitespace-tolerant) enable it now, so =0 / empty / false / typos leave it off.

Comment on lines +115 to +123
Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

added in b948822 — extracted the gate into _memory_tracking_enabled_from_env() and pinned the truthy/falsy table in tests/test_main_cli.py.


parser = argparse.ArgumentParser(
description="ESPHome Device Builder",
formatter_class=argparse.ArgumentDefaultsHelpFormatter,
Expand Down
105 changes: 105 additions & 0 deletions esphome_device_builder/controllers/debug.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
"""Debug WS commands — memory snapshots for support / leak hunts."""

from __future__ import annotations

import logging
from typing import TYPE_CHECKING, Any

from ..helpers import memory
from ..helpers.api import CommandError, api_command
from ..models import ErrorCode

if TYPE_CHECKING:
from ..device_builder import DeviceBuilder

_LOGGER = logging.getLogger(__name__)

_MAX_TOP_N = 200
_MAX_BASELINE_NAME_LEN = 100


def _validate_baseline_name(value: Any, *, field: str) -> str:
    """
    Check that *value* is usable as a baseline name and hand it back.

    A usable name is a ``str`` whose length is between 1 and
    ``_MAX_BASELINE_NAME_LEN`` inclusive. *field* is the argument name
    quoted in the error message.

    Raises ``CommandError`` with ``ErrorCode.INVALID_ARGS`` otherwise.
    """
    # The isinstance test comes first so ``len`` is never called on a
    # non-string payload.
    is_acceptable = isinstance(value, str) and 0 < len(value) <= _MAX_BASELINE_NAME_LEN
    if is_acceptable:
        return value
    raise CommandError(
        ErrorCode.INVALID_ARGS,
        f"{field} must be a non-empty string of at most {_MAX_BASELINE_NAME_LEN} characters",
    )


class DebugController:
    """Owns the ``debug/*`` WS commands. Stateless beyond ``helpers.memory``."""

    def __init__(self, device_builder: DeviceBuilder) -> None:
        self._db = device_builder

    @api_command("debug/memory_snapshot")
    async def memory_snapshot(
        self,
        *,
        top_n: int = 25,
        save_as: str | None = None,
        compare_with: str | None = None,
        drop_baseline: str | None = None,
        **_kwargs: Any,
    ) -> dict[str, Any]:
        """
        Return process memory stats + the top ``tracemalloc`` allocators.

        First call enables ``tracemalloc`` lazily and returns an empty
        ``top_allocators`` (allocations before this call aren't traced);
        ``save_as`` / ``compare_with`` are not applied on that call.
        Set ``ESPHOME_DEBUG_MEMORY=1`` at process start to catch
        startup allocations too.

        ``top_n``: how many allocators to return, 1 to ``_MAX_TOP_N``.
        ``save_as``: bookmark the snapshot for later ``compare_with``.
        ``compare_with``: diff against a previously-saved baseline.
        ``drop_baseline``: forget the named baseline; succeeds silently
        if it wasn't saved.

        All arguments are validated before anything is changed, so a
        request rejected with ``INVALID_ARGS`` never drops or saves a
        baseline. Baseline names must be non-empty strings of at most
        ``_MAX_BASELINE_NAME_LEN`` characters.

        Returns ``{system, top_allocators, baseline_names}`` plus a
        ``note`` on the call that just enabled tracking.

        Raises ``CommandError``: ``INVALID_ARGS`` for a bad ``top_n`` or
        baseline name, ``NOT_FOUND`` when ``compare_with`` names a
        baseline that isn't saved.
        """
        # ``bool`` is an ``int`` subclass, so a bare isinstance check
        # would let ``top_n=True`` through as 1.
        if (
            isinstance(top_n, bool)
            or not isinstance(top_n, int)
            or not 1 <= top_n <= _MAX_TOP_N
        ):
            raise CommandError(
                ErrorCode.INVALID_ARGS,
                f"top_n must be an int between 1 and {_MAX_TOP_N}",
            )

        # Validate every name up front: a malformed ``compare_with`` or
        # ``save_as`` must not be discovered only after ``drop_baseline``
        # has already mutated the store, nor be skipped entirely on the
        # first-call path below.
        if drop_baseline is not None:
            drop_baseline = _validate_baseline_name(drop_baseline, field="drop_baseline")
        if compare_with is not None:
            compare_with = _validate_baseline_name(compare_with, field="compare_with")
        if save_as is not None:
            save_as = _validate_baseline_name(save_as, field="save_as")

        if drop_baseline is not None:
            memory.drop_baseline(drop_baseline)

        if not memory.is_tracking():
            memory.start_tracking()
            _LOGGER.info("Memory tracking enabled via debug/memory_snapshot")
            return {
                "system": memory.system_stats(),
                "top_allocators": [],
                "baseline_names": memory.baseline_names(),
                "note": (
                    "tracemalloc was just enabled — allocations before "
                    "this call aren't traced. Run a build, then call "
                    "again with save_as to bookmark a baseline, and "
                    "again later with compare_with to see what grew."
                ),
            }

        # Resolve the baseline before snapshotting so an unknown name
        # fails without paying for a full snapshot.
        baseline = None
        if compare_with is not None:
            baseline = memory.get_baseline(compare_with)
            if baseline is None:
                raise CommandError(
                    ErrorCode.NOT_FOUND,
                    f"baseline {compare_with!r} not saved; known: {memory.baseline_names()}",
                )

        snapshot = memory.take_snapshot()

        if save_as is not None:
            memory.save_baseline(save_as, snapshot)

        return {
            "system": memory.system_stats(),
            "top_allocators": memory.format_top_allocators(
                snapshot, baseline=baseline, top_n=top_n
            ),
            "baseline_names": memory.baseline_names(),
        }
4 changes: 4 additions & 0 deletions esphome_device_builder/device_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@
has_remote_build_settings_persisted,
load_remote_build_settings,
)
from .controllers.debug import DebugController
from .controllers.devices import DevicesController
from .controllers.editor import EditorController
from .controllers.firmware import FirmwareController
Expand Down Expand Up @@ -256,6 +257,7 @@ def __init__(self, settings: DashboardSettings) -> None:
self.onboarding: OnboardingController | None = None
self.remote_build_offloader: OffloaderController | None = None
self.remote_build_receiver: ReceiverController | None = None
self.debug: DebugController | None = None

# mDNS advertise — populated in start() once we know zeroconf
# is up. Optional: a zeroconf-bind failure leaves this None
Expand Down Expand Up @@ -328,6 +330,7 @@ async def start(self) -> None:
self.onboarding = OnboardingController(self)
self.remote_build_offloader = OffloaderController(self)
self.remote_build_receiver = ReceiverController(self)
self.debug = DebugController(self)
await self.devices.start()
await self.firmware.start()
await self.editor.start()
Expand Down Expand Up @@ -407,6 +410,7 @@ async def start(self) -> None:
self.onboarding,
self.remote_build_offloader,
self.remote_build_receiver,
self.debug,
):
self.command_handlers.update(collect_api_commands(controller))

Expand Down
125 changes: 125 additions & 0 deletions esphome_device_builder/helpers/memory.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,125 @@
"""
Memory-debugging helpers backing the ``debug/memory_snapshot`` WS command.

Wraps stdlib ``tracemalloc`` plus a small in-memory baseline store
so support requests asking for a heap diff don't need users to
attach a profiler — just enable tracking (set
``ESPHOME_DEBUG_MEMORY=1`` or call the WS command once),
``save_as="before"`` a build, ``compare_with="before"`` after, and
paste the diff. The baseline store is process-local and lost on
restart; that's fine for ad-hoc debugging.
"""

from __future__ import annotations

import gc
import sys
import tracemalloc
from typing import Any

try:
import resource
except ImportError:
# Windows doesn't ship the ``resource`` module. The RSS field
# is best-effort everywhere and just gets omitted there.
resource = None # type: ignore[assignment]

# Frame count handed to ``tracemalloc.start`` when the caller gives none.
_DEFAULT_FRAMES = 25

# Named snapshots saved for later diffing. Process-local; lost on restart.
_baselines: dict[str, tracemalloc.Snapshot] = {}


def start_tracking(frames: int = _DEFAULT_FRAMES) -> None:
    """
    Turn on ``tracemalloc`` allocation tracking if it is not already on.

    *frames* is passed to ``tracemalloc.start``. Calling this while
    tracing is already active is a no-op, so the frame count of the
    running trace is left untouched.
    """
    if tracemalloc.is_tracing():
        return
    tracemalloc.start(frames)


def is_tracking() -> bool:
    """Tell whether ``tracemalloc`` is tracing allocations right now."""
    currently_tracing = tracemalloc.is_tracing()
    return currently_tracing


def take_snapshot() -> tracemalloc.Snapshot:
    """
    Capture and return a new ``tracemalloc`` snapshot.

    Tracking must already be on; making sure of that is the caller's job.
    """
    fresh_snapshot = tracemalloc.take_snapshot()
    return fresh_snapshot


def save_baseline(name: str, snapshot: tracemalloc.Snapshot) -> None:
    """
    Remember *snapshot* under *name* so a later call can diff against it.

    Saving under a name that is already in use replaces the earlier
    snapshot.
    """
    _baselines.update({name: snapshot})
Comment on lines +29 to +50
Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

hmmm not sure I agree with this - this is a debug endpoint



def get_baseline(name: str) -> tracemalloc.Snapshot | None:
    """Look up the snapshot saved as *name*; ``None`` when there is none."""
    try:
        return _baselines[name]
    except KeyError:
        return None


def baseline_names() -> list[str]:
    """Return the names of all saved baselines in sorted order."""
    names = list(_baselines)
    names.sort()
    return names


def drop_baseline(name: str) -> bool:
    """
    Forget the baseline saved as *name*.

    Returns ``True`` when a snapshot was stored under that name and
    ``False`` when there was nothing to remove.
    """
    removed = _baselines.pop(name, None)
    return removed is not None


def format_top_allocators(
    snapshot: tracemalloc.Snapshot,
    *,
    baseline: tracemalloc.Snapshot | None = None,
    top_n: int = 25,
) -> list[dict[str, Any]]:
    """
    Summarise the first *top_n* allocation sites of *snapshot*.

    Statistics are grouped by source line (``"lineno"``). With a
    *baseline* the entries come from ``snapshot.compare_to(baseline)``;
    without one they come from ``snapshot.statistics``. The list order
    is whatever ``tracemalloc`` returns; only the first *top_n* entries
    are kept.

    Each entry: ``{traceback, size_bytes, size_diff_bytes, count, count_diff}``.
    The two diff fields are zero when no baseline is given, and
    ``traceback`` is a list of ``"<file>:<lineno>"`` strings.
    """
    if baseline is None:
        ranked = snapshot.statistics("lineno")
    else:
        ranked = snapshot.compare_to(baseline, "lineno")
    return list(map(_stat_to_dict, ranked[:top_n]))


def system_stats() -> dict[str, Any]:
    """
    Collect cheap process-wide memory figures; works with tracking off.

    Always present: ``gc_counts``, ``sys_allocated_blocks`` and
    ``tracking``. The three ``tracemalloc_*_bytes`` fields are added only
    while ``tracemalloc`` is tracing, and ``max_rss_bytes`` only when
    ``_max_rss_bytes`` can supply a value.
    """
    report: dict[str, Any] = {}
    report["gc_counts"] = list(gc.get_count())
    report["sys_allocated_blocks"] = sys.getallocatedblocks()
    report["tracking"] = tracemalloc.is_tracing()

    if tracemalloc.is_tracing():
        traced_now, traced_peak = tracemalloc.get_traced_memory()
        report.update(
            tracemalloc_current_bytes=traced_now,
            tracemalloc_peak_bytes=traced_peak,
            tracemalloc_overhead_bytes=tracemalloc.get_tracemalloc_memory(),
        )

    rss_high_water = _max_rss_bytes()
    if rss_high_water is not None:
        report["max_rss_bytes"] = rss_high_water
    return report


def _stat_to_dict(stat: Any) -> dict[str, Any]:
    """
    Turn a ``tracemalloc`` Statistic / StatisticDiff into its wire shape.

    The result has the keys ``traceback``, ``size_bytes``,
    ``size_diff_bytes``, ``count`` and ``count_diff``. The two diff
    values fall back to ``0`` when *stat* has no ``size_diff`` /
    ``count_diff`` attribute (i.e. a plain Statistic).
    """
    # Frames are stringified as-is rather than shortened to basenames.
    # This is deliberate for a debug endpoint: callers paste the output
    # into bug reports, and the full path is what localises a leak.
    entry: dict[str, Any] = {"traceback": list(map(str, stat.traceback))}
    entry["size_bytes"] = stat.size
    entry["size_diff_bytes"] = getattr(stat, "size_diff", 0)
    entry["count"] = stat.count
    entry["count_diff"] = getattr(stat, "count_diff", 0)
    return entry


def _max_rss_bytes() -> int | None:
    """
    Return the process's peak RSS in bytes, best effort.

    Gives ``None`` when the ``resource`` module is unavailable.
    """
    if resource is None:
        return None
    usage = resource.getrusage(resource.RUSAGE_SELF)
    peak = int(usage.ru_maxrss)
    # ru_maxrss is already in bytes on macOS; Linux / BSD report KB.
    unit = 1 if sys.platform == "darwin" else 1024
    return peak * unit
Loading
Loading