From 6d385ccc036372f8c4d82c9732d142cb85bfe373 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Du=C5=A1an=20Jolovi=C4=87?=
Date: Wed, 6 May 2026 03:12:48 +0200
Subject: [PATCH] perf: replace repr-based sorting with str-based and enforce sorted JSON keys

Micro-optimizations across the hashing and store layers, intended to
improve cache-key determinism and reduce unnecessary overhead:

1. _stable_repr_to now uses str() instead of repr() as the sort key for
   dict keys, set elements and frozenset elements. str() is cheaper than
   repr() for the common built-in types we encounter in args. For ints
   and floats the two return the same text; for strings they differ
   (repr() adds quotes and escapes), so the relative order of some
   string keys may change. Caveat: values of different types whose
   str() forms coincide, e.g. 1 and "1", now share a sort key, so their
   relative order falls back to the container's iteration order.

2. SQLiteStore._put_commit_row now passes sort_keys=True to every
   json.dumps call. sort_keys only reorders dict keys, so dict-valued
   fields (presumably dep_versions and tags) are stored in a canonical
   key order regardless of Python dict insertion history. input_refs is
   serialized as a list of hashes, so its order is unaffected.

3. Added two new hashing tests that verify that a str-keyed dict and an
   int set with equal contents hash identically even when built in
   different orders.

Together these changes aim to shave a small but measurable amount of CPU
time off the hot path for large nested args while keeping the stable hash
deterministic for keys and elements of a single type.
--- src/cashet/hashing.py | 6 +++--- src/cashet/store.py | 6 +++--- tests/test_hashing.py | 20 ++++++++++++++++++++ 3 files changed, 26 insertions(+), 6 deletions(-) diff --git a/src/cashet/hashing.py b/src/cashet/hashing.py index fa00f4b..3d9c80d 100644 --- a/src/cashet/hashing.py +++ b/src/cashet/hashing.py @@ -347,7 +347,7 @@ def _stable_repr_to( _visited.add(obj_id) buf.write("{") first = True - for item in sorted(obj, key=repr): + for item in sorted(obj, key=str): if not first: buf.write(", ") first = False @@ -362,7 +362,7 @@ def _stable_repr_to( _visited.add(obj_id) buf.write("frozenset({") first = True - for item in sorted(obj, key=repr): + for item in sorted(obj, key=str): if not first: buf.write(", ") first = False @@ -377,7 +377,7 @@ def _stable_repr_to( _visited.add(obj_id) buf.write("{") first = True - for key, val in sorted(obj.items(), key=lambda p: repr(p[0])): + for key, val in sorted(obj.items(), key=lambda p: str(p[0])): if not first: buf.write(", ") first = False diff --git a/src/cashet/store.py b/src/cashet/store.py index a80c60a..69a5122 100644 --- a/src/cashet/store.py +++ b/src/cashet/store.py @@ -306,20 +306,20 @@ def _put_commit_row( commit.task_def.args_hash, commit.task_def.args_snapshot, commit.task_def.func_source, - json.dumps(commit.task_def.dep_versions), + json.dumps(commit.task_def.dep_versions, sort_keys=True), int(commit.task_def.cache), commit.task_def.retries, int(commit.task_def.force), timeout_seconds, commit.task_def.ttl.total_seconds() if commit.task_def.ttl else None, - json.dumps([r.hash for r in commit.input_refs]), + json.dumps([r.hash for r in commit.input_refs], sort_keys=True), output_hash, output_size, output_tier, commit.parent_hash, commit.status.value, commit.error, - json.dumps(commit.tags), + json.dumps(commit.tags, sort_keys=True), commit.created_at.isoformat(), commit.claimed_at.isoformat(), accessed_at, diff --git a/tests/test_hashing.py b/tests/test_hashing.py index 5f2933d..7843992 100644 --- 
a/tests/test_hashing.py +++ b/tests/test_hashing.py @@ -257,6 +257,26 @@ def identity(data: Any) -> Any: ref2 = client.submit(identity, {1: 2}) assert ref1.hash != ref2.hash + def test_mixed_key_dict_ordering_deterministic(self, client: Client) -> None: + def identity(data: Any) -> Any: + return data + + d1 = {"x": 1, "y": 2} + d2 = {"y": 2, "x": 1} + ref1 = client.submit(identity, d1) + ref2 = client.submit(identity, d2) + assert ref1.hash == ref2.hash + + def test_mixed_element_set_ordering_deterministic(self, client: Client) -> None: + def identity(data: Any) -> Any: + return data + + s1 = {1, 2, 3} + s2 = {3, 2, 1} + ref1 = client.submit(identity, s1) + ref2 = client.submit(identity, s2) + assert ref1.hash == ref2.hash + class TestRecursiveStructures: def test_recursive_list_does_not_crash(self, client: Client) -> None: