From c0e5268252e8b20a0d6377928ac7f39468704bec Mon Sep 17 00:00:00 2001 From: Aliaksei Chareshneu Date: Wed, 28 Sep 2022 15:11:13 +0000 Subject: [PATCH 1/7] benchmarking integer packing --- tests/benchmarking_integer_packing.py | 67 +++++++++++++++++++++++++++ 1 file changed, 67 insertions(+) create mode 100644 tests/benchmarking_integer_packing.py diff --git a/tests/benchmarking_integer_packing.py b/tests/benchmarking_integer_packing.py new file mode 100644 index 0000000..3fd8f9f --- /dev/null +++ b/tests/benchmarking_integer_packing.py @@ -0,0 +1,67 @@ +from argparse import ArgumentError +import pytest +import msgpack +import numpy as np +from ciftools.binary.encoding.data_types import DataTypeEnum +from ciftools.binary.decoder import decode_cif_data +from ciftools.binary.encoding.impl.binary_cif_encoder import BinaryCIFEncoder +from ciftools.binary.encoding.impl.encoders.byte_array import BYTE_ARRAY_CIF_ENCODER +from numba import jit + +from ciftools.binary.encoding.impl.encoders.integer_packing import INTEGER_PACKING_CIF_ENCODER + +# TODO: +# Uncomment inputs for encoding +# Uncomment test_integer_packing_encoding_WITH_negatives +# Next - next encoder (quantization?) + +# NOTE: Later: +# 2. function that produces inputs for decoding (sizes?) (negatives will be there or not?) +# 4. Test decoding - decode and decode optimized? +INPUT_DTYPE = 'i4' + +INPUTS_FOR_ENCODING_NO_NEGATIVES = [ + # 0.8, 8, 80 MB + np.random.randint(low=0, high=100, size=(2*10**5), dtype=INPUT_DTYPE), + np.random.randint(low=0, high=100, size=(2*10**6), dtype=INPUT_DTYPE), + np.random.randint(low=0, high=100, size=(2*10**7), dtype=INPUT_DTYPE) +] + +INPUTS_FOR_ENCODING_WITH_NEGATIVES = [ + np.random.randint(low=-50, high=50, size=(2*10**5), dtype=INPUT_DTYPE), + np.random.randint(low=-50, high=50, size=(2*10**6), dtype=INPUT_DTYPE), + np.random.randint(low=-50, high=50, size=(2*10**7), dtype=INPUT_DTYPE) +] + +def compute_inputs_for_decoding(inputs_for_encoding: list): + inputs_for_decoding = [] + for input_arr in inputs_for_encoding: + encoder = BinaryCIFEncoder([BYTE_ARRAY_CIF_ENCODER]) + encoded = encoder.encode_cif_data(input_arr) + inputs_for_decoding.append(encoded) + + print(inputs_for_decoding) + return inputs_for_decoding + +# INPUTS_FOR_DECODING = compute_inputs_for_decoding() + +OPTIMIZED = [False, True] + +def int_packing_encoding(encoding_input, optimization): + encoder = INTEGER_PACKING_CIF_ENCODER + if not optimization: + encoded = encoder.encode(encoding_input) + else: + encoded = encoder.encode_optimized(encoding_input) + + +@pytest.mark.parametrize("encoding_input", INPUTS_FOR_ENCODING_NO_NEGATIVES) +@pytest.mark.parametrize("optimization", OPTIMIZED) +def test_integer_packing_encoding_NO_negatives(benchmark, encoding_input, optimization): + result = benchmark(int_packing_encoding, encoding_input=encoding_input, optimization=optimization) + +# @pytest.mark.parametrize("encoding_input", INPUTS_FOR_ENCODING_WITH_NEGATIVES) +# @pytest.mark.parametrize("optimization", OPTIMIZED) +# def test_integer_packing_encoding_WITH_negatives(benchmark, encoding_input, optimization): +# result = benchmark(int_packing_encoding, encoding_input=encoding_input, optimization=optimization) + From 4e2f6cc6f8a1b9080123e622bd7468ca632bbbc4 Mon Sep 17 00:00:00 2001 From: Aliaksei Chareshneu Date: Wed, 28 Sep 2022 15:11:54 +0000 Subject: [PATCH 2/7] optimized encode (just for loop) for integer packing --- .../encoding/impl/encoders/integer_packing.py | 109 +++++++++++++++--- 1 file changed, 94 insertions(+), 15 deletions(-) diff --git a/ciftools/binary/encoding/impl/encoders/integer_packing.py b/ciftools/binary/encoding/impl/encoders/integer_packing.py index 4bd0a0d..b2b3b16 100644 --- a/ciftools/binary/encoding/impl/encoders/integer_packing.py +++ b/ciftools/binary/encoding/impl/encoders/integer_packing.py @@ -1,4 +1,5 @@ import math +from numba import jit import numpy as np from ciftools.binary.encoding.base.cif_encoder_base import CIFEncoderBase @@ -7,7 +8,6 @@ from ciftools.binary.encoding.types import EncodedCIFData from numpy import int8, int16, uint8, uint16 - class IntegerPackingCIFEncoder(CIFEncoderBase): def encode(self, data: np.ndarray) -> EncodedCIFData: @@ -38,22 +38,61 @@ def encode(self, data: np.ndarray) -> EncodedCIFData: # TODO: figure out if there is a way to implement this # better & faster with numpy methods. - packed_index = 0 - for _v in data: - value = _v - if value >= 0: - while value >= upper_limit: - packed[packed_index] = upper_limit - packed_index += 1 - value -= upper_limit + packed = _packing_loop( + data=data, + upper_limit=upper_limit, + lower_limit=lower_limit, + packed=packed + ) + + byte_array_result = BYTE_ARRAY_CIF_ENCODER.encode(packed) + + integer_packing_encoding: IntegerPackingEncoding = { + "kind": EncodingEnun.IntegerPacking, + "isUnsigned": not packing.isSigned, + "srcSize": len(data), + "byteCount": packing.bytesPerElement, + } + + return EncodedCIFData( + data=byte_array_result["data"], encoding=[integer_packing_encoding, byte_array_result["encoding"][0]] + ) + + def encode_optimized(self, data: np.ndarray) -> EncodedCIFData: + + # TODO: must be 32bit integer + + packing = _determine_packing(data) + if packing.bytesPerElement == 4: + return BYTE_ARRAY_CIF_ENCODER.encode(data) + + # integer packing + + if packing.isSigned: + if packing.bytesPerElement == 1: + upper_limit = 0x7F + packed = np.empty(packing.size, dtype=int8) + else: + upper_limit = 0x7FFF + packed = np.empty(packing.size, dtype=int16) + else: + if packing.bytesPerElement == 1: + upper_limit = 0xFF + packed = np.empty(packing.size, dtype=uint8) else: - while value <= lower_limit: - packed[packed_index] = lower_limit - packed_index += 1 - value -= lower_limit + upper_limit = 0xFFFF + packed = np.empty(packing.size, dtype=uint16) - packed[packed_index] = value - packed_index += 1 + lower_limit = -upper_limit - 1 + + # TODO: figure out if there is a way to implement this + # better & faster with numpy methods. + packed = _packing_loop_optimized( + data=data, + upper_limit=upper_limit, + lower_limit=lower_limit, + packed=packed + ) byte_array_result = BYTE_ARRAY_CIF_ENCODER.encode(packed) @@ -74,6 +113,46 @@ class _PackingInfo: size: int bytesPerElement: int +def _packing_loop(data: np.ndarray, upper_limit, lower_limit, packed: np.ndarray) -> np.ndarray: + packed_index = 0 + for _v in data: + value = _v + if value >= 0: + while value >= upper_limit: + packed[packed_index] = upper_limit + packed_index += 1 + value -= upper_limit + else: + while value <= lower_limit: + packed[packed_index] = lower_limit + packed_index += 1 + value -= lower_limit + + packed[packed_index] = value + packed_index += 1 + + return packed + +@jit(nopython=True) +def _packing_loop_optimized(data: np.ndarray, upper_limit, lower_limit, packed: np.ndarray) -> np.ndarray: + packed_index = 0 + for _v in data: + value = _v + if value >= 0: + while value >= upper_limit: + packed[packed_index] = upper_limit + packed_index += 1 + value -= upper_limit + else: + while value <= lower_limit: + packed[packed_index] = lower_limit + packed_index += 1 + value -= lower_limit + + packed[packed_index] = value + packed_index += 1 + + return packed def _determine_packing(data: np.ndarray) -> _PackingInfo: # determine sign From 65d6958307bb7dd30491143576534399c54e81b8 Mon Sep 17 00:00:00 2001 From: Aliaksei Chareshneu Date: Wed, 28 Sep 2022 15:12:19 +0000 Subject: [PATCH 3/7] explanatory comments to binary cif encoder --- ciftools/binary/encoding/impl/binary_cif_encoder.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/ciftools/binary/encoding/impl/binary_cif_encoder.py b/ciftools/binary/encoding/impl/binary_cif_encoder.py index 8612878..8ee3857 100644 --- a/ciftools/binary/encoding/impl/binary_cif_encoder.py +++ b/ciftools/binary/encoding/impl/binary_cif_encoder.py @@ -11,15 +11,24 @@ def encode_cif_data(self, data: any) -> EncodedCIFData: encodings: list[EncodingBase] = [] for encoder in self.encoders: + # get EncodedCIFData typeddict with 'data' and 'encoding' encoded = encoder.encode(data) + # get ref to 'encoding' of that typeddict added_encodings = encoded["encoding"] + # if 'encoding' is None or 0 - raise Error if not added_encodings or not len(added_encodings): raise ValueError("Encodings must be non-empty.") + # get ref to 'data' of typeddict data = encoded["data"] + + # add 'encoding' to list of encodings encodings.extend(added_encodings) + # on next iteration, already encoded data by the first encoder, + # is encoded by the 2nd, then by 3rd, each time encoding is added to list + if not isinstance(data, bytes): raise ValueError( f"The encoding must result in bytes but it was {str(type(data))}. Fix your encoding chain." From 6008fdf44129258daf21dd21318450a211d7b42e Mon Sep 17 00:00:00 2001 From: Aliaksei Chareshneu Date: Thu, 29 Sep 2022 13:03:23 +0000 Subject: [PATCH 4/7] optimize another for loop with jit --- .../encoding/impl/encoders/integer_packing.py | 46 ++++++++++++++++++- 1 file changed, 45 insertions(+), 1 deletion(-) diff --git a/ciftools/binary/encoding/impl/encoders/integer_packing.py b/ciftools/binary/encoding/impl/encoders/integer_packing.py index b2b3b16..4dabe2a 100644 --- a/ciftools/binary/encoding/impl/encoders/integer_packing.py +++ b/ciftools/binary/encoding/impl/encoders/integer_packing.py @@ -62,7 +62,7 @@ def encode_optimized(self, data: np.ndarray) -> EncodedCIFData: # TODO: must be 32bit integer - packing = _determine_packing(data) + packing = _determine_packing_optimized(data) if packing.bytesPerElement == 4: return BYTE_ARRAY_CIF_ENCODER.encode(data) @@ -181,6 +181,32 @@ def _determine_packing(data: np.ndarray) -> _PackingInfo: return packing +def _determine_packing_optimized(data: np.ndarray) -> _PackingInfo: + # determine sign + is_signed = np.any(data < 0) + + # determine packing size + size8 = _packing_size_optimized(data, 0x7F) if is_signed else _packing_size_optimized(data, 0xFF) + size16 = _packing_size_optimized(data, 0x7FFF) if is_signed else _packing_size_optimized(data, 0xFFFF) + + packing = _PackingInfo() + packing.isSigned = is_signed + + data_len = len(data) + + if data_len * 4 < size16 * 2: + packing.size = data_len + packing.bytesPerElement = 4 + + elif size16 * 2 < size8: + packing.size = size16 + packing.bytesPerElement = 2 + + else: + packing.size = size8 + packing.bytesPerElement = 1 + + return packing def _packing_size(data: np.ndarray, upper_limit: int) -> int: lower_limit = -upper_limit - 1 @@ -200,5 +226,23 @@ def _packing_size(data: np.ndarray, upper_limit: int) -> int: return size +@jit(nopython=True) +def _packing_size_optimized(data: np.ndarray, upper_limit: int) -> int: + lower_limit = -upper_limit - 1 + size = 0 + + for value in data: + if value == 0: + size = size + 1 + elif value > 0: + size = size + math.ceil(value / upper_limit) + if value % upper_limit == 0: + size = size + 1 + else: + size = size + math.ceil(value / lower_limit) + if value % lower_limit == 0: + size = size + 1 + + return size INTEGER_PACKING_CIF_ENCODER = IntegerPackingCIFEncoder() From 40caea616a121ca8fbc1db6d2d530785a3718ec3 Mon Sep 17 00:00:00 2001 From: Aliaksei Chareshneu Date: Thu, 29 Sep 2022 13:03:44 +0000 Subject: [PATCH 5/7] rename --- ...eger_packing.py => test_benchmarking_integer_packing.py} | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) rename tests/{benchmarking_integer_packing.py => test_benchmarking_integer_packing.py} (91%) diff --git a/tests/benchmarking_integer_packing.py b/tests/test_benchmarking_integer_packing.py similarity index 91% rename from tests/benchmarking_integer_packing.py rename to tests/test_benchmarking_integer_packing.py index 3fd8f9f..f2eb853 100644 --- a/tests/benchmarking_integer_packing.py +++ b/tests/test_benchmarking_integer_packing.py @@ -11,8 +11,6 @@ from ciftools.binary.encoding.impl.encoders.integer_packing import INTEGER_PACKING_CIF_ENCODER # TODO: -# Uncomment inputs for encoding -# Uncomment test_integer_packing_encoding_WITH_negatives # Next - next encoder (quantization?) # NOTE: Later: @@ -24,13 +22,13 @@ # 0.8, 8, 80 MB np.random.randint(low=0, high=100, size=(2*10**5), dtype=INPUT_DTYPE), np.random.randint(low=0, high=100, size=(2*10**6), dtype=INPUT_DTYPE), - np.random.randint(low=0, high=100, size=(2*10**7), dtype=INPUT_DTYPE) + # np.random.randint(low=0, high=100, size=(2*10**7), dtype=INPUT_DTYPE) ] INPUTS_FOR_ENCODING_WITH_NEGATIVES = [ np.random.randint(low=-50, high=50, size=(2*10**5), dtype=INPUT_DTYPE), np.random.randint(low=-50, high=50, size=(2*10**6), dtype=INPUT_DTYPE), - np.random.randint(low=-50, high=50, size=(2*10**7), dtype=INPUT_DTYPE) + # np.random.randint(low=-50, high=50, size=(2*10**7), dtype=INPUT_DTYPE) ] def compute_inputs_for_decoding(inputs_for_encoding: list): From 5a043805aa5973cce499f31d31fd27de4a3ec453 Mon Sep 17 00:00:00 2001 From: Aliaksei Chareshneu Date: Thu, 6 Oct 2022 11:57:32 +0000 Subject: [PATCH 6/7] modify tests for optimized version to be tested; find best performing version of packing loop; --- .../encoding/impl/encoders/integer_packing.py | 73 ++++++++++++++++--- tests/integer_packing.py | 11 +++ tests/test_benchmarking_integer_packing.py | 8 +- 3 files changed, 78 insertions(+), 14 deletions(-) diff --git a/ciftools/binary/encoding/impl/encoders/integer_packing.py b/ciftools/binary/encoding/impl/encoders/integer_packing.py index 4dabe2a..77ff500 100644 --- a/ciftools/binary/encoding/impl/encoders/integer_packing.py +++ b/ciftools/binary/encoding/impl/encoders/integer_packing.py @@ -1,5 +1,5 @@ import math -from numba import jit +from numba import jit, int32, uint32, njit import numpy as np from ciftools.binary.encoding.base.cif_encoder_base import CIFEncoderBase @@ -189,6 +189,9 @@ def _determine_packing_optimized(data: np.ndarray) -> _PackingInfo: size8 = _packing_size_optimized(data, 0x7F) if is_signed else _packing_size_optimized(data, 0xFF) size16 = _packing_size_optimized(data, 0x7FFF) if is_signed else _packing_size_optimized(data, 0xFFFF) + # size8 = _packing_size_signed_optimized(data, 0x7F) if is_signed else _packing_size_unsigned_optimized(data, 0xFF) + # size16 = _packing_size_signed_optimized(data, 0x7FFF) if is_signed else _packing_size_unsigned_optimized(data, 0xFFFF) + packing = _PackingInfo() packing.isSigned = is_signed @@ -227,21 +230,71 @@ def _packing_size(data: np.ndarray, upper_limit: int) -> int: return size @jit(nopython=True) +def _packing_size_unsigned_optimized(data: np.ndarray, upper_limit: int) -> int: + # lower_limit = -upper_limit - 1 + size = 0 + + for value in data: + size = size + math.floor(value / upper_limit) + 1 + + return size + +@jit(nopython=True) +def _packing_size_signed_optimized(data: np.ndarray, upper_limit: int) -> int: + lower_limit = -upper_limit - 1 + size = 0 + + for value in data: + if value >= 0: + size = size + math.floor(value / upper_limit) + 1 + else: + size = size + math.floor(value / lower_limit) + 1 + + return size + +# works, but no difference in time +# @njit([(int32[:], int32), (uint32[:], int32)]) +@njit def _packing_size_optimized(data: np.ndarray, upper_limit: int) -> int: lower_limit = -upper_limit - 1 size = 0 + # Fastest for value in data: - if value == 0: - size = size + 1 - elif value > 0: - size = size + math.ceil(value / upper_limit) - if value % upper_limit == 0: - size = size + 1 + if value >= 0: + size = size + math.floor(value / upper_limit) + 1 else: - size = size + math.ceil(value / lower_limit) - if value % lower_limit == 0: - size = size + 1 + size = size + math.floor(value / lower_limit) + 1 + + # Masks - slower x3 times + # positives = data[data >= 0] + # positive_size = np.sum(np.floor(positives / upper_limit)) + len(positives) + + # negatives = data[data < 0] + # negative_size = np.sum(np.floor(negatives / lower_limit)) + len(negatives) + + # size = int(positive_size + negative_size) + + # x3 times slower, // + # for value in data: + # if value >= 0: + # size = size + value // upper_limit + 1 + # else: + # size = size + value // lower_limit + 1 + + + # Original + # for value in data: + # if value == 0: + # size = size + 1 + # elif value > 0: + # size = size + math.ceil(value / upper_limit) + # if value % upper_limit == 0: + # size = size + 1 + # else: + # size = size + math.ceil(value / lower_limit) + # if value % lower_limit == 0: + # size = size + 1 return size diff --git a/tests/integer_packing.py b/tests/integer_packing.py index e3365b3..395214d 100644 --- a/tests/integer_packing.py +++ b/tests/integer_packing.py @@ -26,3 +26,14 @@ def test(self): self.assertTrue(np.array_equal(test_arr, decoded)) self.assertEqual(is_unsigned, encoded["encoding"][0]["isUnsigned"]) self.assertEqual(byte_count, encoded["encoding"][0]["byteCount"]) + + # testing optimized versions too + for test_arr, is_unsigned, byte_count in test_suite: + encoder = INTEGER_PACKING_CIF_ENCODER + encoded = encoder.encode_optimized(test_arr) + decoded = decode_cif_data(encoded) + msgpack.loads(msgpack.dumps(encoded)) + + self.assertTrue(np.array_equal(test_arr, decoded)) + self.assertEqual(is_unsigned, encoded["encoding"][0]["isUnsigned"]) + self.assertEqual(byte_count, encoded["encoding"][0]["byteCount"]) diff --git a/tests/test_benchmarking_integer_packing.py b/tests/test_benchmarking_integer_packing.py index f2eb853..769c697 100644 --- a/tests/test_benchmarking_integer_packing.py +++ b/tests/test_benchmarking_integer_packing.py @@ -20,9 +20,9 @@ INPUTS_FOR_ENCODING_NO_NEGATIVES = [ # 0.8, 8, 80 MB - np.random.randint(low=0, high=100, size=(2*10**5), dtype=INPUT_DTYPE), - np.random.randint(low=0, high=100, size=(2*10**6), dtype=INPUT_DTYPE), - # np.random.randint(low=0, high=100, size=(2*10**7), dtype=INPUT_DTYPE) + np.random.randint(low=0, high=300, size=(2*10**5), dtype=INPUT_DTYPE), + np.random.randint(low=0, high=300, size=(2*10**6), dtype=INPUT_DTYPE), + np.random.randint(low=0, high=300, size=(2*10**7), dtype=INPUT_DTYPE) ] INPUTS_FOR_ENCODING_WITH_NEGATIVES = [ @@ -43,7 +43,7 @@ def compute_inputs_for_decoding(inputs_for_encoding: list): # INPUTS_FOR_DECODING = compute_inputs_for_decoding() -OPTIMIZED = [False, True] +OPTIMIZED = [True] def int_packing_encoding(encoding_input, optimization): encoder = INTEGER_PACKING_CIF_ENCODER From 48f007a1f08a4c586e143b414ba940a0c07404f0 Mon Sep 17 00:00:00 2001 From: Aliaksei Chareshneu Date: Thu, 6 Oct 2022 11:57:54 +0000 Subject: [PATCH 7/7] requirements update --- requirements.txt | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index aade9d2..35ced87 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,2 +1,12 @@ -numpy >= 1.11.1 -msgpack >= 1.0.3 \ No newline at end of file +attrs==22.1.0 +iniconfig==1.1.1 +msgpack==1.0.4 +numpy==1.23.3 +packaging==21.3 +pluggy==1.0.0 +py==1.11.0 +py-cpuinfo==8.0.0 +pyparsing==3.0.9 +pytest==7.1.3 +pytest-benchmark==3.4.1 +tomli==2.0.1