Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
45 changes: 35 additions & 10 deletions ciftools/binary/decoder.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
from numba import jit
from numba.typed import Dict
from numba.core import types
import numpy as np
from ciftools.binary.data_types import DataType
from ciftools.binary.encoded_data import EncodedCIFData
Expand Down Expand Up @@ -47,12 +50,12 @@ def _decode_delta(data: np.ndarray, encoding: DeltaEncoding) -> np.ndarray:
return np.cumsum(result, out=result)


# TODO: JIT
def _decode_integer_packing_signed(data: np.ndarray, encoding: IntegerPackingEncoding) -> np.ndarray:
upper_limit = 0x7F if encoding["byteCount"] == 1 else 0x7FFF
@jit(nopython=True)
def _decode_integer_packing_signed(data: np.ndarray, encoding_byte_count, encoding_src_size) -> np.ndarray:
upper_limit = 0x7F if encoding_byte_count == 1 else 0x7FFF
lower_limit = -upper_limit - 1
n = len(data)
output = np.zeros(encoding["srcSize"], dtype="i4")
output = np.zeros(encoding_src_size, dtype="i4")
i = 0
j = 0
while i < n:
Expand All @@ -69,21 +72,29 @@ def _decode_integer_packing_signed(data: np.ndarray, encoding: IntegerPackingEnc
return output


# TODO: JIT
def _decode_integer_packing_unsigned(data: np.ndarray, encoding: IntegerPackingEncoding) -> np.ndarray:
upper_limit = 0xFF if encoding["byteCount"] == 1 else 0xFFFF
@jit(nopython=True)
def _decode_integer_packing_unsigned(data: np.ndarray, encoding_byte_count, encoding_src_size) -> np.ndarray:
upper_limit = 0xFF if encoding_byte_count == 1 else 0xFFFF
n = len(data)
output = np.zeros(encoding["srcSize"], dtype="i4")
output = np.zeros(encoding_src_size, dtype="i4")
i = 0
j = 0
print(data)
while i < n:
value = 0
# 255
t = data[i]
# yes
while t == upper_limit:
# 255
value += t
# 1
i += 1
# 145
t = data[i]
# 255 + 145 = 400
value += t
# 400
output[j] = value
i += 1
j += 1
Expand All @@ -93,10 +104,24 @@ def _decode_integer_packing_unsigned(data: np.ndarray, encoding: IntegerPackingE
def _decode_integer_packing(data: np.ndarray, encoding: IntegerPackingEncoding) -> np.ndarray:
if len(data) == encoding["srcSize"]:
return data

# d = dict()
# for k, v in encoding.items():
# d[k] = v


# d1 = Dict.empty(
# key_type=types.py2_string_type,
# value_type=types.int64,
# )

# for k, v in encoding.items():
# d1[k] = v

if encoding["isUnsigned"]:
return _decode_integer_packing_unsigned(data, encoding)
return _decode_integer_packing_unsigned(data, encoding["byteCount"], encoding["srcSize"])
else:
return _decode_integer_packing_signed(data, encoding)
return _decode_integer_packing_signed(data, encoding["byteCount"], encoding["srcSize"])


def _decode_string_array(data: np.ndarray, encoding: StringArrayEncoding) -> np.ndarray:
Expand Down
71 changes: 67 additions & 4 deletions ciftools/binary/encoder.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import itertools
import math
import sys
from typing import Any, Dict, List, Protocol, Tuple, Union
Expand Down Expand Up @@ -304,34 +305,96 @@ def encode(self, data: np.ndarray) -> EncodedCIFData:

class StringArray(BinaryCIFEncoder):
    """Encodes an array of strings as a BinaryCIF StringArray: unique strings
    are concatenated into one string delimited by offsets, and the data
    becomes integer indices into that table of unique strings."""

    def encode(self, data: Union[np.ndarray, List[str]]) -> EncodedCIFData:
        """Optimized encoder: packs strings, indices and offsets in one pass
        via ``_pack_strings``.

        :param data: array/list of strings to encode.
        :return: encoded data with a StringArrayEncoding entry.
        """
        string_data, indices, offsets = _pack_strings(data)

        encoded_offsets = _OFFSET_ENCODER.encode(np.array(offsets, dtype="<i4"))
        encoded_data = _DATA_ENCODER.encode(indices)

        encoding: StringArrayEncoding = {
            "dataEncoding": encoded_data["encoding"],
            "kind": EncodingEnun.StringArray,
            "stringData": string_data,
            "offsetEncoding": encoded_offsets["encoding"],
            "offsets": encoded_offsets["data"],  # type: ignore
        }

        return EncodedCIFData(data=encoded_data["data"], encoding=[encoding])

    def encode_not_optimized(self, data: Union[np.ndarray, List[str]]) -> EncodedCIFData:
        """Reference (pre-optimization) implementation, kept so benchmarks can
        compare it against :meth:`encode`; delegates to ``_pack_strings_old``.

        :param data: array/list of strings to encode.
        :return: encoded data with a StringArrayEncoding entry.
        """
        strings: List[str] = []
        offsets = [0]
        indices = np.empty(len(data), dtype="<i4")

        # Fills indices in place and appends to strings/offsets.
        _pack_strings_old(
            data,
            indices,
            strings,
            offsets,
        )

        string_data = "".join(strings)

        encoded_offsets = _OFFSET_ENCODER.encode(np.array(offsets, dtype="<i4"))
        encoded_data = _DATA_ENCODER.encode(indices)

        encoding: StringArrayEncoding = {
            "dataEncoding": encoded_data["encoding"],
            "kind": EncodingEnun.StringArray,
            "stringData": string_data,
            "offsetEncoding": encoded_offsets["encoding"],
            "offsets": encoded_offsets["data"],  # type: ignore
        }

        return EncodedCIFData(data=encoded_data["data"], encoding=[encoding])

# @jit(nopython=True)
def _pack_strings(data: List[str]) -> Tuple[str, np.ndarray, np.ndarray]:
# strings = set(data)
# # mapping of strings to numbers
# str_map = {s: i for i, s in enumerate(strings)}
# string_data = "".join(strings)

# indices = np.array([str_map[s] for s in data], dtype='<i4')
# offset_data = np.empty(len(strings) + 1, dtype='<i4')
# offset_data[0] = 0
# # np.cumsum([len(s) for s in strings], out=offset_data[1:])
# # NOTE: this also result in errors
# # print(f'Strings: {strings}')
# # NOTE: this not
# # print('Strings', strings)
# # temp = np.array([len(s) for s in strings])
# # print(temp)

# offset_data[1:] = np.cumsum(np.array([len(s) for s in strings]))
# return string_data, indices,
strings = set(data)
str_map = {s: i for i, s in enumerate(strings)}
string_data = "".join(strings)

indices = np.array([str_map[s] for s in data], dtype='<i4')
offset_data = np.empty(len(strings) + 1, dtype='<i4')
offset_data[0] = 0
np.cumsum([len(s) for s in strings], out=offset_data[1:])

return string_data, indices, offset_data


# TODO: benchmark if JIT helps here
@jit(nopython=False, forceobj=True)
def _pack_strings(data: List[str], indices: np.ndarray, strings: List[str], offsets: List[int]) -> None:
# @jit(nopython=False, forceobj=True)
def _pack_strings_old(data: List[str], indices: np.ndarray, strings: List[str], offsets: List[int]) -> None:
acc_len = 0
str_map: Dict[str, int] = dict()

Expand Down
16 changes: 8 additions & 8 deletions tests/_run_all_tests.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,15 @@
import unittest

testmodules = [
"byte_array",
"delta",
"fixed_point",
"integer_packing",
"interval_quantization",
"run_length",
# "byte_array",
# "delta",
# "fixed_point",
# "integer_packing",
# "interval_quantization",
# "run_length",
"string_array",
"_decoding",
"_encoding",
# "_decoding",
# "_encoding",
]

suite = unittest.TestSuite()
Expand Down
95 changes: 95 additions & 0 deletions tests/benchmark_string_array.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
from argparse import ArgumentError
from ciftools.binary.encoder import STRING_ARRAY
import pytest
import msgpack
import numpy as np
import random, string
from timeit import default_timer as timer

MAX_LENGTH = 30

def _random_string(length):
return ''.join(random.choice(string.ascii_letters) for i in range(length))


def _generate_random_strings_list(size: int, n_of_strings: int):
    """Return a benchmark input list: *n_of_strings* random strings (each of
    length 1..MAX_LENGTH), with the whole group repeated *size* times, i.e.
    ``size * n_of_strings`` items in total.

    Repetition via list multiplication is intentional: the string-array
    encoder is designed for data with many repeated values.
    """
    uniques = [_random_string(random.randint(1, MAX_LENGTH)) for _ in range(n_of_strings)]
    return uniques * size

# Benchmark cases keyed "<unique string count>_<repetition factor>"; same
# keys and construction order as an explicit nested-loop build.
INPUTS_FOR_ENCODING = {
    f'{n_of_strings}_{size}': _generate_random_strings_list(size=size, n_of_strings=n_of_strings)
    for n_of_strings in (20, 50, 100)
    for size in (10**4, 10**5, 10**6)
}

def encoding(encoding_input, optimization):
    """Run the StringArray encoder over *encoding_input* once.

    :param encoding_input: list of strings to encode.
    :param optimization: True -> optimized ``encode``; False -> the
        reference ``encode_not_optimized`` implementation.
    :return: the encoded result (previously computed but discarded).
    """
    encoder = STRING_ARRAY
    if optimization:
        return encoder.encode(encoding_input)
    return encoder.encode_not_optimized(encoding_input)

print('not_optimized, not_optimized_2nd, not_optimized_3rd, optimized, optimized_2nd, optimized_3rd')
for case_name, encoding_input in INPUTS_FOR_ENCODING.items():
    # Three consecutive runs per variant: the first run of each variant also
    # pays any one-off warm-up cost (e.g. JIT compilation), so the 2nd/3rd
    # timings show the steady-state cost.
    timings = []
    for optimization in (False, False, False, True, True, True):
        start = timer()
        encoding(encoding_input=encoding_input, optimization=optimization)
        stop = timer()
        timings.append(stop - start)

    print(case_name)
    # Same output shape as printing the six named variables individually.
    print(*timings)

# print(not_optimized, not_optimized_2nd, not_optimized_3rd, optimized, optimized_2nd, optimized_3rd)
# 0.678212083876133 0.07555452641099691 0.0717478571459651 0.8585679298266768 0.12347683496773243 0.1249676188454032
# quite close to pytest-benchmark

# NO JIT
# not_optimized, not_optimized_2nd, not_optimized_3rd, optimized, optimized_2nd, optimized_3rd
# 20_10000
# 2.671183445956558 0.037990274955518544 0.038723392062820494 0.02901634795125574 0.027696584002114832 0.027359343017451465
# 20_100000
# 0.35828070098068565 0.35529211803805083 0.3570930790156126 0.2672198748914525 0.26997689506970346 0.26407589903101325
# 20_1000000
# 3.754518774920143 3.6613519359380007 3.590928812045604 2.7633999809622765 2.742795309983194 2.7856404760386795
# 50_10000
# 0.08946576493326575 0.08893638011068106 0.08875931904185563 0.05730705999303609 0.05827989405952394 0.05729172995779663
# 50_100000
# 0.886467493022792 0.8783201409969479 0.884077426046133 0.7304035819834098 0.7092675910098478 0.7059366019675508
# 50_1000000
# 9.578126006992534 9.592585199978203 9.336758888093755 7.011043209931813 6.888902453938499 6.8985374378971756
# 100_10000
# 0.17676137096714228 0.1785169130889699 0.17561967996880412 0.12274663499556482 0.12944400194101036 0.12680783797986805
# 100_100000
# 1.7786761060124263 1.7187940969597548 1.7390421900199726 1.4047608590917662 1.4061879760120064 1.3963392629520968
# 100_1000000
# 17.475244562956505 17.666481979074888 17.797239091945812 13.829927618033253 13.793060117051937 13.868345592985861