diff --git a/ciftools/binary/decoder.py b/ciftools/binary/decoder.py index 906c719..50f8766 100644 --- a/ciftools/binary/decoder.py +++ b/ciftools/binary/decoder.py @@ -1,3 +1,6 @@ +from numba import jit +from numba.typed import Dict +from numba.core import types import numpy as np from ciftools.binary.data_types import DataType from ciftools.binary.encoded_data import EncodedCIFData @@ -47,12 +50,12 @@ def _decode_delta(data: np.ndarray, encoding: DeltaEncoding) -> np.ndarray: return np.cumsum(result, out=result) -# TODO: JIT -def _decode_integer_packing_signed(data: np.ndarray, encoding: IntegerPackingEncoding) -> np.ndarray: - upper_limit = 0x7F if encoding["byteCount"] == 1 else 0x7FFF +@jit(nopython=True) +def _decode_integer_packing_signed(data: np.ndarray, encoding_byte_count, encoding_src_size) -> np.ndarray: + upper_limit = 0x7F if encoding_byte_count == 1 else 0x7FFF lower_limit = -upper_limit - 1 n = len(data) - output = np.zeros(encoding["srcSize"], dtype="i4") + output = np.zeros(encoding_src_size, dtype="i4") i = 0 j = 0 while i < n: @@ -69,21 +72,29 @@ def _decode_integer_packing_signed(data: np.ndarray, encoding: IntegerPackingEnc return output -# TODO: JIT -def _decode_integer_packing_unsigned(data: np.ndarray, encoding: IntegerPackingEncoding) -> np.ndarray: - upper_limit = 0xFF if encoding["byteCount"] == 1 else 0xFFFF +@jit(nopython=True) +def _decode_integer_packing_unsigned(data: np.ndarray, encoding_byte_count, encoding_src_size) -> np.ndarray: + upper_limit = 0xFF if encoding_byte_count == 1 else 0xFFFF n = len(data) - output = np.zeros(encoding["srcSize"], dtype="i4") + output = np.zeros(encoding_src_size, dtype="i4") i = 0 j = 0 + print(data) while i < n: value = 0 + # 255 t = data[i] + # yes while t == upper_limit: + # 255 value += t + # 1 i += 1 + # 145 t = data[i] + # 255 + 145 = 400 value += t + # 400 output[j] = value i += 1 j += 1 @@ -93,10 +104,24 @@ def _decode_integer_packing_unsigned(data: np.ndarray, encoding: IntegerPackingE def _decode_integer_packing(data: np.ndarray, encoding: IntegerPackingEncoding) -> np.ndarray: if len(data) == encoding["srcSize"]: return data + + # d = dict() + # for k, v in encoding.items(): + # d[k] = v + + + # d1 = Dict.empty( + # key_type=types.py2_string_type, + # value_type=types.int64, + # ) + + # for k, v in encoding.items(): + # d1[k] = v + if encoding["isUnsigned"]: - return _decode_integer_packing_unsigned(data, encoding) + return _decode_integer_packing_unsigned(data, encoding["byteCount"], encoding["srcSize"]) else: - return _decode_integer_packing_signed(data, encoding) + return _decode_integer_packing_signed(data, encoding["byteCount"], encoding["srcSize"]) def _decode_string_array(data: np.ndarray, encoding: StringArrayEncoding) -> np.ndarray: diff --git a/ciftools/binary/encoder.py b/ciftools/binary/encoder.py index 30c067c..35669a5 100644 --- a/ciftools/binary/encoder.py +++ b/ciftools/binary/encoder.py @@ -1,3 +1,4 @@ +import itertools import math import sys from typing import Any, Dict, List, Protocol, Tuple, Union @@ -304,34 +305,96 @@ def encode(self, data: np.ndarray) -> EncodedCIFData: class StringArray(BinaryCIFEncoder): def encode(self, data: Union[np.ndarray, List[str]]) -> EncodedCIFData: + # strings: List[str] = [] + # offsets = [0] + # indices = np.empty(len(data), dtype=" EncodedCIFData: strings: List[str] = [] offsets = [0] indices = np.empty(len(data), dtype=" Tuple[str, np.ndarray, np.ndarray]: + # strings = set(data) + # # mapping of strings to numbers + # str_map = {s: i for i, s in enumerate(strings)} + # string_data = "".join(strings) + + # indices = np.array([str_map[s] for s in data], dtype=' None: +# @jit(nopython=False, forceobj=True) +def _pack_strings_old(data: List[str], indices: np.ndarray, strings: List[str], offsets: List[int]) -> None: acc_len = 0 str_map: Dict[str, int] = dict() diff --git a/tests/_run_all_tests.py b/tests/_run_all_tests.py index fb31d71..9a486c9 100644 --- a/tests/_run_all_tests.py +++ b/tests/_run_all_tests.py @@ -1,15 +1,15 @@ import unittest testmodules = [ - "byte_array", - "delta", - "fixed_point", - "integer_packing", - "interval_quantization", - "run_length", + # "byte_array", + # "delta", + # "fixed_point", + # "integer_packing", + # "interval_quantization", + # "run_length", "string_array", - "_decoding", - "_encoding", + # "_decoding", + # "_encoding", ] suite = unittest.TestSuite() diff --git a/tests/benchmark_string_array.py b/tests/benchmark_string_array.py new file mode 100644 index 0000000..a7b2f81 --- /dev/null +++ b/tests/benchmark_string_array.py @@ -0,0 +1,95 @@ +from argparse import ArgumentError +from ciftools.binary.encoder import STRING_ARRAY +import pytest +import msgpack +import numpy as np +import random, string +from timeit import default_timer as timer + +MAX_LENGTH = 30 + +def _random_string(length): + return ''.join(random.choice(string.ascii_letters) for i in range(length)) + + +# Source: +def _generate_random_strings_list(size: int, n_of_strings: int): + l = [_random_string(random.randint(1, MAX_LENGTH)) for i in range(n_of_strings)] + repeated_l = l * size + return repeated_l + +INPUTS_FOR_ENCODING = {} + +for n_of_strings in [20, 50, 100]: + for size in [10**4, 10**5, 10**6]: + INPUTS_FOR_ENCODING[f'{n_of_strings}_{size}'] = _generate_random_strings_list(size=size, n_of_strings=n_of_strings) + +def encoding(encoding_input, optimization): + encoder = STRING_ARRAY + if not optimization: + encoded = encoder.encode_not_optimized(encoding_input) + else: + encoded = encoder.encode(encoding_input) + +print('not_optimized, not_optimized_2nd, not_optimized_3rd, optimized, optimized_2nd, optimized_3rd') +for case_name, encoding_input in INPUTS_FOR_ENCODING.items(): + + start_not_optimized = timer() + encoding(encoding_input=encoding_input, optimization=False) + stop_not_optimized = timer() + + start_not_optimized_2nd = timer() + encoding(encoding_input=encoding_input, optimization=False) + stop_not_optimized_2nd = timer() + + start_not_optimized_3rd = timer() + encoding(encoding_input=encoding_input, optimization=False) + stop_not_optimized_3rd = timer() + + start_optimized = timer() + encoding(encoding_input=encoding_input, optimization=True) + stop_optimized = timer() + + start_optimized_2nd = timer() + encoding(encoding_input=encoding_input, optimization=True) + stop_optimized_2nd = timer() + + start_optimized_3rd = timer() + encoding(encoding_input=encoding_input, optimization=True) + stop_optimized_3rd = timer() + + not_optimized = stop_not_optimized - start_not_optimized + not_optimized_2nd = stop_not_optimized_2nd - start_not_optimized_2nd + not_optimized_3rd = stop_not_optimized_3rd - start_not_optimized_3rd + + optimized = stop_optimized - start_optimized + optimized_2nd = stop_optimized_2nd - start_optimized_2nd + optimized_3rd = stop_optimized_3rd - start_optimized_3rd + + print(case_name) + print(not_optimized, not_optimized_2nd, not_optimized_3rd, optimized, optimized_2nd, optimized_3rd) + +# print(not_optimized, not_optimized_2nd, not_optimized_3rd, optimized, optimized_2nd, optimized_3rd) +# 0.678212083876133 0.07555452641099691 0.0717478571459651 0.8585679298266768 0.12347683496773243 0.1249676188454032 +# quite close to pytest-benchmark + +# NO JIT +# not_optimized, not_optimized_2nd, not_optimized_3rd, optimized, optimized_2nd, optimized_3rd +# 20_10000 +# 2.671183445956558 0.037990274955518544 0.038723392062820494 0.02901634795125574 0.027696584002114832 0.027359343017451465 +# 20_100000 +# 0.35828070098068565 0.35529211803805083 0.3570930790156126 0.2672198748914525 0.26997689506970346 0.26407589903101325 +# 20_1000000 +# 3.754518774920143 3.6613519359380007 3.590928812045604 2.7633999809622765 2.742795309983194 2.7856404760386795 +# 50_10000 +# 0.08946576493326575 0.08893638011068106 0.08875931904185563 0.05730705999303609 0.05827989405952394 0.05729172995779663 +# 50_100000 +# 0.886467493022792 0.8783201409969479 0.884077426046133 0.7304035819834098 0.7092675910098478 0.7059366019675508 +# 50_1000000 +# 9.578126006992534 9.592585199978203 9.336758888093755 7.011043209931813 6.888902453938499 6.8985374378971756 +# 100_10000 +# 0.17676137096714228 0.1785169130889699 0.17561967996880412 0.12274663499556482 0.12944400194101036 0.12680783797986805 +# 100_100000 +# 1.7786761060124263 1.7187940969597548 1.7390421900199726 1.4047608590917662 1.4061879760120064 1.3963392629520968 +# 100_1000000 +# 17.475244562956505 17.666481979074888 17.797239091945812 13.829927618033253 13.793060117051937 13.868345592985861 \ No newline at end of file