Commit 22c3591

Bump to v0.1.1 with simplified endianness handling
1 parent b371006 commit 22c3591

3 files changed: 22 additions & 25 deletions


pyproject.toml

Lines changed: 1 addition & 1 deletion
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
 
 [project]
 name = "numcodecs-tokenize"
-version = "0.1.0"
+version = "0.1.1"
 description = "Tokenization codec for the `numcodecs` buffer compression API"
 readme = "README.md"
 license = "MPL-2.0"

src/numcodecs_tokenize/__init__.py

Lines changed: 11 additions & 24 deletions
@@ -5,7 +5,6 @@
 __all__ = ["TokenizeCodec"]
 
 from io import BytesIO
-from sys import byteorder
 
 import numcodecs.compat
 import numcodecs.registry
@@ -83,32 +82,28 @@ def encode(
         else:
             utype = a.dtype
 
+        assert (dtype.itemsize % utype.itemsize) == 0
+
         # insert padding to align with itemsize
         message.append(
             b"\0" * (utype.itemsize - (sum(len(m) for m in message) % utype.itemsize))
         )
 
         # ensure that the table keys are encoded in little endian binary
         table_keys_array = unique[argsort]
-        table_keys_byteorder = table_keys_array.dtype.byteorder
-        table_keys_byteorder = (
-            table_keys_byteorder
-            if table_keys_byteorder in ("<", ">")
-            else ("<" if (byteorder == "little") else ">")
+        message.append(
+            table_keys_array.astype(table_keys_array.dtype.newbyteorder("<")).tobytes()
         )
-        if table_keys_byteorder != "<":
-            table_keys_array = table_keys_array.byteswap()
-        message.append(table_keys_array.tobytes())
 
         indices = argsortinv[inverse].astype(utype)
-        if table_keys_byteorder != "<":
-            indices = indices.byteswap()
-        message.append(indices.tobytes())
+        message.append(indices.astype(indices.dtype.newbyteorder("<")).tobytes())
 
         encoded_bytes = b"".join(message)
 
         encoded: np.ndarray[tuple[int], np.dtype[np.unsignedinteger]] = np.frombuffer(
-            encoded_bytes, dtype=utype, count=len(encoded_bytes) // utype.itemsize
+            encoded_bytes,
+            dtype=utype.newbyteorder("<"),
+            count=len(encoded_bytes) // utype.itemsize,
         )
 
         return encoded  # type: ignore
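
The encode path above drops the explicit byte-order bookkeeping: instead of resolving "="/"|" byte orders against sys.byteorder and calling byteswap() by hand, the table keys and indices are simply cast to an explicitly little-endian dtype before serialization. A minimal, standalone sketch (illustrative only, not code from this commit) of why the two approaches produce the same bytes:

import sys

import numpy as np

# Illustrative comparison: both routes yield identical little-endian bytes,
# whatever byte order the input array uses.
a = np.arange(4, dtype=np.dtype(np.uint32).newbyteorder(">"))  # big-endian input

# Roughly what the removed lines did: resolve "=" / "|" byte orders against
# the machine order, then byteswap by hand when the data is not little-endian.
order = a.dtype.byteorder
order = order if order in ("<", ">") else ("<" if sys.byteorder == "little" else ">")
old_bytes = (a.byteswap() if order != "<" else a).tobytes()

# What the commit switches to: a single astype to an explicitly little-endian
# dtype, which copies (and swaps) only when actually necessary.
new_bytes = a.astype(a.dtype.newbyteorder("<")).tobytes()

assert old_bytes == new_bytes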
@@ -168,24 +163,16 @@ def decode(
             dtype=_dtype_bits(dtype).newbyteorder("<"),
             count=table_len,
         )
-        dtype_bits_byteorder = _dtype_bits(dtype).byteorder
-        dtype_bits_byteorder = (
-            dtype_bits_byteorder
-            if dtype_bits_byteorder in ("<", ">")
-            else ("<" if (byteorder == "little") else ">")
-        )
-        if dtype_bits_byteorder != "<":
-            table_keys = table_keys.byteswap()
 
         indices = np.frombuffer(
             b_io.read(),
             dtype=utype.newbyteorder("<"),
             count=np.prod(shape, dtype=np.uintp),
         )
-        if dtype_bits_byteorder != "<":
-            indices = indices.byteswap()
 
-        decoded = table_keys[indices].view(dtype).reshape(shape)
+        decoded = (
+            table_keys[indices].astype(_dtype_bits(dtype)).view(dtype).reshape(shape)
+        )
 
         return numcodecs.compat.ndarray_copy(decoded, out)  # type: ignore
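
The decode path mirrors this: the payload is read with np.frombuffer using explicitly little-endian dtypes, and a single astype back to the native bit-pattern dtype replaces the removed sys.byteorder check and manual byteswap. A small standalone sketch of that pattern (illustrative only; the uint64 dtype here merely stands in for what the codec's _dtype_bits helper would return for float64):

import numpy as np

# Serialize three float64 values as little-endian 64-bit bit patterns,
# analogous to how the codec stores its table keys.
bits = np.dtype(np.uint64)  # stand-in for _dtype_bits(np.dtype(np.float64))
stored = np.array([1.5, -2.0, 0.0]).view(bits).astype(bits.newbyteorder("<")).tobytes()

# Read them back: frombuffer with an explicitly little-endian dtype is correct
# on any machine, and astype to the native dtype swaps only when needed before
# the bit patterns are reinterpreted as float64 again.
table_keys = np.frombuffer(stored, dtype=bits.newbyteorder("<"), count=3)
decoded = table_keys.astype(bits).view(np.float64)

assert decoded.tolist() == [1.5, -2.0, 0.0]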

tests/test_tokenize.py

Lines changed: 10 additions & 0 deletions
@@ -24,6 +24,16 @@ def test_roundtrip():
     check_roundtrip(np.zeros(tuple()))
     check_roundtrip(np.zeros((0,)))
     check_roundtrip(np.arange(1000).reshape(10, 10, 10))
+    check_roundtrip(
+        np.arange(1000)
+        .reshape(10, 10, 10)
+        .astype(np.dtype(np.uint32).newbyteorder("<"))
+    )
+    check_roundtrip(
+        np.arange(1000)
+        .reshape(10, 10, 10)
+        .astype(np.dtype(np.uint32).newbyteorder(">"))
+    )
     check_roundtrip(np.array([np.inf, -np.inf, np.nan, -np.nan, 0.0, -0.0]))
     check_roundtrip(
         np.array(
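
The two new test cases feed the same data through the codec with explicitly little- and big-endian input dtypes, exercising the simplified endianness handling end to end. A hedged usage sketch of such a roundtrip (not taken from the repository's test suite; it assumes TokenizeCodec can be constructed without arguments):

import numpy as np

from numcodecs_tokenize import TokenizeCodec

# Roundtrip an array whose dtype carries an explicit, non-native byte order.
codec = TokenizeCodec()
data = (
    np.arange(1000)
    .reshape(10, 10, 10)
    .astype(np.dtype(np.uint32).newbyteorder(">"))
)
decoded = codec.decode(codec.encode(data))
np.testing.assert_array_equal(decoded, data)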
