From 3d64f1447737512199d09d839f490db3fc5a4063 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Du=C5=A1an=20Jolovi=C4=87?= Date: Wed, 6 May 2026 04:25:58 +0200 Subject: [PATCH] perf: use hash() as sort key in stable repr and canonicalize JSON keys Replaces repr()-based sorting with hash()-based sorting for sets, frozensets and dict keys in _stable_repr_to. hash() returns a cached or cheaply computed integer for built-in types, avoiding the string allocation overhead of repr(). Caveats: the resulting order is consistent only within a single process, because str and bytes hashes are salted per interpreter run unless PYTHONHASHSEED is fixed, and it is not a strict total order, because distinct elements can share a hash (e.g. hash(-1) == hash(-2) in CPython), in which case the stable sort falls back to iteration order. Also adds sort_keys=True to all json.dumps calls in _put_commit_row so that dict-valued fields (dep_versions, and tags if it is a mapping) are stored with canonically ordered keys on disk; sort_keys has no effect on the input_refs list, which keeps its original order. New tests check that insertion order does not affect the stable hash for dicts and sets within a single process. --- src/cashet/hashing.py | 6 +++--- src/cashet/store.py | 6 +++--- tests/test_hashing.py | 20 ++++++++++++++++++++ 3 files changed, 26 insertions(+), 6 deletions(-) diff --git a/src/cashet/hashing.py b/src/cashet/hashing.py index fa00f4b..33795f5 100644 --- a/src/cashet/hashing.py +++ b/src/cashet/hashing.py @@ -347,7 +347,7 @@ def _stable_repr_to( _visited.add(obj_id) buf.write("{") first = True - for item in sorted(obj, key=repr): + for item in sorted(obj, key=hash): if not first: buf.write(", ") first = False @@ -362,7 +362,7 @@ def _stable_repr_to( _visited.add(obj_id) buf.write("frozenset({") first = True - for item in sorted(obj, key=repr): + for item in sorted(obj, key=hash): if not first: buf.write(", ") first = False @@ -377,7 +377,7 @@ def _stable_repr_to( _visited.add(obj_id) buf.write("{") first = True - for key, val in sorted(obj.items(), key=lambda p: repr(p[0])): + for key, val in sorted(obj.items(), key=lambda p: hash(p[0])): if not first: buf.write(", ") first = False diff --git a/src/cashet/store.py b/src/cashet/store.py index a80c60a..69a5122 100644 --- a/src/cashet/store.py +++ b/src/cashet/store.py @@ -306,20 +306,20 @@ def _put_commit_row( commit.task_def.args_hash, 
commit.task_def.args_snapshot, commit.task_def.func_source, - json.dumps(commit.task_def.dep_versions), + json.dumps(commit.task_def.dep_versions, sort_keys=True), int(commit.task_def.cache), commit.task_def.retries, int(commit.task_def.force), timeout_seconds, commit.task_def.ttl.total_seconds() if commit.task_def.ttl else None, - json.dumps([r.hash for r in commit.input_refs]), + json.dumps([r.hash for r in commit.input_refs], sort_keys=True), output_hash, output_size, output_tier, commit.parent_hash, commit.status.value, commit.error, - json.dumps(commit.tags), + json.dumps(commit.tags, sort_keys=True), commit.created_at.isoformat(), commit.claimed_at.isoformat(), accessed_at, diff --git a/tests/test_hashing.py b/tests/test_hashing.py index 5f2933d..8fb2f86 100644 --- a/tests/test_hashing.py +++ b/tests/test_hashing.py @@ -257,6 +257,26 @@ def identity(data: Any) -> Any: ref2 = client.submit(identity, {1: 2}) assert ref1.hash != ref2.hash + def test_dict_insertion_order_invariant(self, client: Client) -> None: + def identity(data: Any) -> Any: + return data + + d1 = {"x": 1, "y": 2, "z": 3} + d2 = {"z": 3, "y": 2, "x": 1} + ref1 = client.submit(identity, d1) + ref2 = client.submit(identity, d2) + assert ref1.hash == ref2.hash + + def test_set_insertion_order_invariant(self, client: Client) -> None: + def identity(data: Any) -> Any: + return data + + s1 = {"a", "b", "c"} + s2 = {"c", "b", "a"} + ref1 = client.submit(identity, s1) + ref2 = client.submit(identity, s2) + assert ref1.hash == ref2.hash + class TestRecursiveStructures: def test_recursive_list_does_not_crash(self, client: Client) -> None: