From 69ee036b4d8fda73a8086bd049f7cbeaae73fa9e Mon Sep 17 00:00:00 2001 From: "codeflash-ai[bot]" <148906541+codeflash-ai[bot]@users.noreply.github.com> Date: Thu, 13 Nov 2025 01:09:28 +0000 Subject: [PATCH] Optimize group_lookup The optimized code achieves a 16% speedup by eliminating the overhead of Python's `setdefault()` method and replacing `enumerate(zip())` with direct indexing. **Key optimizations:** 1. **Eliminated `setdefault()` overhead**: The original code used `lookup.setdefault(code, []).append((idx, cell_id))` which allocates a throwaway empty list and pays for a method call on every iteration, even when the key already exists. The optimized version uses explicit `if code in lookup` checks with direct assignment, reducing function call overhead. 2. **Replaced `enumerate(zip())` with range-based indexing**: Instead of creating intermediate tuples through `zip()` and `enumerate()`, the optimization uses `range(length)` with direct sequence indexing. This avoids tuple creation overhead and leverages the efficient indexing that `Sequence` types provide. 3. **Precomputed length calculation**: Using `min(len(ids), len(codes))` upfront preserves the truncate-to-shortest behavior of the original `zip()` call, so inputs of mismatched length are handled exactly as before. **Performance characteristics from tests:** - **Large datasets see the biggest gains**: Tests with 1000+ elements show 17-20% improvements, indicating the optimization scales well - **Small datasets have mixed results**: Some small test cases show slight regressions due to the additional length calculation overhead - **Best for scenarios with many unique codes**: The explicit key checking approach works particularly well when dictionary insertions are frequent **Impact on workloads:** Based on the function reference, `group_lookup` is called from `_match_cell_ids_by_similarity`, which appears to be part of a cell matching algorithm that likely runs during notebook operations. 
Since it's called twice per matching operation (for previous and next lookups), the 16% improvement could provide noticeable performance benefits in interactive notebook environments where cell matching occurs frequently. --- marimo/_utils/cell_matching.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/marimo/_utils/cell_matching.py b/marimo/_utils/cell_matching.py index 2468eee6deb..b420e82bde6 100644 --- a/marimo/_utils/cell_matching.py +++ b/marimo/_utils/cell_matching.py @@ -39,8 +39,16 @@ def group_lookup( ids: Sequence[CellId_t], codes: Sequence[str] ) -> dict[str, list[tuple[int, CellId_t]]]: lookup: dict[str, list[tuple[int, CellId_t]]] = {} - for idx, (cell_id, code) in enumerate(zip(ids, codes)): - lookup.setdefault(code, []).append((idx, cell_id)) + # Index both sequences up to the shorter length, matching zip()'s truncation + length = min(len(ids), len(codes)) + # Avoid setdefault overhead by initializing lists only when needed + for idx in range(length): + code = codes[idx] + cell_id = ids[idx] + if code in lookup: + lookup[code].append((idx, cell_id)) + else: + lookup[code] = [(idx, cell_id)] return lookup