Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions ciftools/binary/encoding/impl/binary_cif_encoder.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,15 +11,24 @@ def encode_cif_data(self, data: any) -> EncodedCIFData:
encodings: list[EncodingBase] = []

for encoder in self.encoders:
# get EncodedCIFData typeddict with 'data' and 'encoding'
encoded = encoder.encode(data)
# get ref to 'encoding' of that typeddict
added_encodings = encoded["encoding"]

# an encoder that reports no encodings (None or an empty list) is an error
if not added_encodings or not len(added_encodings):
raise ValueError("Encodings must be non-empty.")

# get ref to 'data' of typeddict
data = encoded["data"]

# add 'encoding' to list of encodings
encodings.extend(added_encodings)

# On each subsequent iteration the output of the previous encoder becomes the
# input of the next one, and that encoder's encodings are appended to the list.

if not isinstance(data, bytes):
raise ValueError(
f"The encoding must result in bytes but it was {str(type(data))}. Fix your encoding chain."
Expand Down
206 changes: 191 additions & 15 deletions ciftools/binary/encoding/impl/encoders/integer_packing.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import math
from numba import jit, int32, uint32, njit

import numpy as np
from ciftools.binary.encoding.base.cif_encoder_base import CIFEncoderBase
Expand All @@ -7,7 +8,6 @@
from ciftools.binary.encoding.types import EncodedCIFData
from numpy import int8, int16, uint8, uint16


class IntegerPackingCIFEncoder(CIFEncoderBase):
def encode(self, data: np.ndarray) -> EncodedCIFData:

Expand Down Expand Up @@ -38,22 +38,61 @@ def encode(self, data: np.ndarray) -> EncodedCIFData:

# TODO: figure out if there is a way to implement this
# better & faster with numpy methods.
packed_index = 0
for _v in data:
value = _v
if value >= 0:
while value >= upper_limit:
packed[packed_index] = upper_limit
packed_index += 1
value -= upper_limit
packed = _packing_loop(
data=data,
upper_limit=upper_limit,
lower_limit=lower_limit,
packed=packed
)

byte_array_result = BYTE_ARRAY_CIF_ENCODER.encode(packed)

integer_packing_encoding: IntegerPackingEncoding = {
"kind": EncodingEnun.IntegerPacking,
"isUnsigned": not packing.isSigned,
"srcSize": len(data),
"byteCount": packing.bytesPerElement,
}

return EncodedCIFData(
data=byte_array_result["data"], encoding=[integer_packing_encoding, byte_array_result["encoding"][0]]
)

def encode_optimized(self, data: np.ndarray) -> EncodedCIFData:

# TODO: must be 32bit integer

packing = _determine_packing_optimized(data)
if packing.bytesPerElement == 4:
return BYTE_ARRAY_CIF_ENCODER.encode(data)

# integer packing

if packing.isSigned:
if packing.bytesPerElement == 1:
upper_limit = 0x7F
packed = np.empty(packing.size, dtype=int8)
else:
while value <= lower_limit:
packed[packed_index] = lower_limit
packed_index += 1
value -= lower_limit
upper_limit = 0x7FFF
packed = np.empty(packing.size, dtype=int16)
else:
if packing.bytesPerElement == 1:
upper_limit = 0xFF
packed = np.empty(packing.size, dtype=uint8)
else:
upper_limit = 0xFFFF
packed = np.empty(packing.size, dtype=uint16)

packed[packed_index] = value
packed_index += 1
lower_limit = -upper_limit - 1

# TODO: figure out if there is a way to implement this
# better & faster with numpy methods.
packed = _packing_loop_optimized(
data=data,
upper_limit=upper_limit,
lower_limit=lower_limit,
packed=packed
)

byte_array_result = BYTE_ARRAY_CIF_ENCODER.encode(packed)

Expand All @@ -74,6 +113,46 @@ class _PackingInfo:
size: int
bytesPerElement: int

def _packing_loop(data: np.ndarray, upper_limit, lower_limit, packed: np.ndarray) -> np.ndarray:
packed_index = 0
for _v in data:
value = _v
if value >= 0:
while value >= upper_limit:
packed[packed_index] = upper_limit
packed_index += 1
value -= upper_limit
else:
while value <= lower_limit:
packed[packed_index] = lower_limit
packed_index += 1
value -= lower_limit

packed[packed_index] = value
packed_index += 1

return packed

# Compiled counterpart of `_packing_loop`.
#
# The body that used to live here was a line-for-line copy of `_packing_loop`,
# so the plain-Python function is compiled with numba in nopython mode instead
# of keeping a second copy that could drift out of sync. The call signature
# and result are unchanged: (data, upper_limit, lower_limit, packed) -> packed,
# with `packed` filled from index 0.
_packing_loop_optimized = jit(nopython=True)(_packing_loop)

def _determine_packing(data: np.ndarray) -> _PackingInfo:
# determine sign
Expand Down Expand Up @@ -102,6 +181,35 @@ def _determine_packing(data: np.ndarray) -> _PackingInfo:

return packing

def _determine_packing_optimized(data: np.ndarray) -> _PackingInfo:
    """Pick the element width used to pack ``data``.

    The packed element count is computed for 1-byte and 2-byte elements
    (signed limits when any value is negative, unsigned limits otherwise)
    and compared with keeping every value as a 4-byte element.

    Args:
        data: Source integer array.

    Returns:
        A ``_PackingInfo`` with ``isSigned``, ``size`` and
        ``bytesPerElement`` set for the selected layout.
    """
    has_negative = np.any(data < 0)

    # Signed and unsigned packing use different saturation limits.
    limit8, limit16 = (0x7F, 0x7FFF) if has_negative else (0xFF, 0xFFFF)
    count8 = _packing_size_optimized(data, limit8)
    count16 = _packing_size_optimized(data, limit16)

    info = _PackingInfo()
    info.isSigned = has_negative

    n_values = len(data)
    bytes16 = count16 * 2

    if n_values * 4 < bytes16:
        # Unpacked 4-byte storage is smaller than the 2-byte packing.
        info.size, info.bytesPerElement = n_values, 4
    elif bytes16 < count8:
        info.size, info.bytesPerElement = count16, 2
    else:
        info.size, info.bytesPerElement = count8, 1

    return info

def _packing_size(data: np.ndarray, upper_limit: int) -> int:
lower_limit = -upper_limit - 1
Expand All @@ -121,5 +229,73 @@ def _packing_size(data: np.ndarray, upper_limit: int) -> int:

return size

@jit(nopython=True)
def _packing_size_unsigned_optimized(data: np.ndarray, upper_limit: int) -> int:
    """Count the packed elements produced for ``data`` using only ``upper_limit``.

    Every value contributes ``floor(value / upper_limit)`` entries plus one
    more for the leftover.

    Args:
        data: Source integers. No lower limit is applied here, so this
            presumably expects non-negative values -- confirm with callers.
        upper_limit: Saturation value of a packed element.

    Returns:
        Total number of packed elements.
    """
    total = 0
    for item in data:
        total += math.floor(item / upper_limit) + 1
    return total

@jit(nopython=True)
def _packing_size_signed_optimized(data: np.ndarray, upper_limit: int) -> int:
    """Count the packed elements produced for ``data`` with signed limits.

    Non-negative values are measured against ``upper_limit`` and negative
    values against ``-upper_limit - 1``; each value contributes
    ``floor(value / limit)`` entries plus one more for the leftover.

    Args:
        data: Source integers.
        upper_limit: Positive saturation value of a packed element.

    Returns:
        Total number of packed elements.
    """
    lower_limit = -upper_limit - 1
    total = 0
    for item in data:
        limit = upper_limit if item >= 0 else lower_limit
        total += math.floor(item / limit) + 1
    return total

# Benchmark notes carried over from the original author:
#   * explicit signatures, @njit([(int32[:], int32), (uint32[:], int32)]),
#     work but made no difference in time;
#   * boolean masks with np.sum/np.floor were about 3x slower;
#   * integer floor division (//) in the loop was about 3x slower.
# The plain loop with math.floor below was the fastest variant.
@njit
def _packing_size_optimized(data: np.ndarray, upper_limit: int) -> int:
    """Count the packed elements that ``data`` expands to.

    Non-negative values are measured against ``upper_limit`` and negative
    values against ``-upper_limit - 1``; each value contributes
    ``floor(value / limit)`` entries plus one more for the leftover.

    Args:
        data: Source integers.
        upper_limit: Positive saturation value of a packed element.

    Returns:
        Total number of packed elements.
    """
    lower_limit = -upper_limit - 1
    count = 0

    for item in data:
        if item < 0:
            count += math.floor(item / lower_limit) + 1
        else:
            count += math.floor(item / upper_limit) + 1

    return count

# Module-level encoder instance; the test modules import this object rather
# than constructing IntegerPackingCIFEncoder themselves.
INTEGER_PACKING_CIF_ENCODER = IntegerPackingCIFEncoder()
14 changes: 12 additions & 2 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,2 +1,12 @@
numpy >= 1.11.1
msgpack >= 1.0.3
attrs==22.1.0
iniconfig==1.1.1
msgpack==1.0.4
numpy==1.23.3
packaging==21.3
pluggy==1.0.0
py==1.11.0
py-cpuinfo==8.0.0
pyparsing==3.0.9
pytest==7.1.3
pytest-benchmark==3.4.1
tomli==2.0.1
11 changes: 11 additions & 0 deletions tests/integer_packing.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,3 +26,14 @@ def test(self):
self.assertTrue(np.array_equal(test_arr, decoded))
self.assertEqual(is_unsigned, encoded["encoding"][0]["isUnsigned"])
self.assertEqual(byte_count, encoded["encoding"][0]["byteCount"])

# testing optimized versions too
for test_arr, is_unsigned, byte_count in test_suite:
encoder = INTEGER_PACKING_CIF_ENCODER
encoded = encoder.encode_optimized(test_arr)
decoded = decode_cif_data(encoded)
msgpack.loads(msgpack.dumps(encoded))

self.assertTrue(np.array_equal(test_arr, decoded))
self.assertEqual(is_unsigned, encoded["encoding"][0]["isUnsigned"])
self.assertEqual(byte_count, encoded["encoding"][0]["byteCount"])
65 changes: 65 additions & 0 deletions tests/test_benchmarking_integer_packing.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
from argparse import ArgumentError
import pytest
import msgpack
import numpy as np
from ciftools.binary.encoding.data_types import DataTypeEnum
from ciftools.binary.decoder import decode_cif_data
from ciftools.binary.encoding.impl.binary_cif_encoder import BinaryCIFEncoder
from ciftools.binary.encoding.impl.encoders.byte_array import BYTE_ARRAY_CIF_ENCODER
from numba import jit

from ciftools.binary.encoding.impl.encoders.integer_packing import INTEGER_PACKING_CIF_ENCODER

# TODO:
# Next - next encoder (quantization?)

# NOTE: Later:
# 2. function that produces inputs for decoding (sizes?) (negatives will be there or not?)
# 4. Test decoding - decode and decode optimized?
# dtype of every benchmark input array.
INPUT_DTYPE = 'i4'

# Non-negative inputs drawn from [0, 300) with 2e5, 2e6 and 2e7 elements,
# i.e. roughly 0.8, 8 and 80 MB.
INPUTS_FOR_ENCODING_NO_NEGATIVES = [
    np.random.randint(low=0, high=300, size=element_count, dtype=INPUT_DTYPE)
    for element_count in (2 * 10**5, 2 * 10**6, 2 * 10**7)
]

# Mixed-sign inputs drawn from [-50, 50); the 2e7-element case is left out.
INPUTS_FOR_ENCODING_WITH_NEGATIVES = [
    np.random.randint(low=-50, high=50, size=element_count, dtype=INPUT_DTYPE)
    for element_count in (2 * 10**5, 2 * 10**6)
]

def compute_inputs_for_decoding(inputs_for_encoding: list):
    """Encode each input array with a byte-array-only encoder chain.

    Args:
        inputs_for_encoding: Arrays to encode.

    Returns:
        A list with one encoded result per input, in the same order. The
        list is also printed before it is returned.
    """
    encoded_inputs = [
        # A fresh single-step encoder chain is built for every array.
        BinaryCIFEncoder([BYTE_ARRAY_CIF_ENCODER]).encode_cif_data(raw_array)
        for raw_array in inputs_for_encoding
    ]

    print(encoded_inputs)
    return encoded_inputs

# INPUTS_FOR_DECODING = compute_inputs_for_decoding()

# Values fed to the `optimization` parameter of the benchmark tests; only the
# optimized (True) path is currently listed.
OPTIMIZED = [True]

def int_packing_encoding(encoding_input, optimization):
    """Encode ``encoding_input`` with the integer-packing encoder.

    Args:
        encoding_input: Array to encode.
        optimization: When truthy, ``encode_optimized`` is used; otherwise
            the plain ``encode`` method.

    Returns:
        The encoder's result. The original assigned it to an unused local
        and implicitly returned ``None``; returning it lets
        ``benchmark(...)`` hand the encoded data back to the calling test.
    """
    encoder = INTEGER_PACKING_CIF_ENCODER
    if optimization:
        return encoder.encode_optimized(encoding_input)
    return encoder.encode(encoding_input)


@pytest.mark.parametrize("encoding_input", INPUTS_FOR_ENCODING_NO_NEGATIVES)
@pytest.mark.parametrize("optimization", OPTIMIZED)
def test_integer_packing_encoding_NO_negatives(benchmark, encoding_input, optimization):
    """Benchmark integer-packing encoding of the non-negative inputs."""
    call_kwargs = {"encoding_input": encoding_input, "optimization": optimization}
    benchmark(int_packing_encoding, **call_kwargs)

# @pytest.mark.parametrize("encoding_input", INPUTS_FOR_ENCODING_WITH_NEGATIVES)
# @pytest.mark.parametrize("optimization", OPTIMIZED)
# def test_integer_packing_encoding_WITH_negatives(benchmark, encoding_input, optimization):
# result = benchmark(int_packing_encoding, encoding_input=encoding_input, optimization=optimization)