diff --git a/.clusterfuzzlite/build.sh b/.clusterfuzzlite/build.sh index 04f057e8..e5338e84 100644 --- a/.clusterfuzzlite/build.sh +++ b/.clusterfuzzlite/build.sh @@ -6,9 +6,9 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. -AVAILABLE_FUZZERS="decompress roundtrip seekable pstream" +AVAILABLE_FUZZERS="decompress roundtrip seekable pstream dict" -LIB_SOURCES="src/lib/zxc_common.c src/lib/zxc_compress.c src/lib/zxc_decompress.c src/lib/zxc_driver.c src/lib/zxc_dispatch.c src/lib/zxc_huffman.c src/lib/zxc_seekable.c src/lib/zxc_pstream.c" +LIB_SOURCES="src/lib/zxc_common.c src/lib/zxc_compress.c src/lib/zxc_decompress.c src/lib/zxc_dict.c src/lib/zxc_driver.c src/lib/zxc_dispatch.c src/lib/zxc_huffman.c src/lib/zxc_seekable.c src/lib/zxc_pstream.c" for fuzzer in $AVAILABLE_FUZZERS; do if [ -z "${FUZZER_TARGET:-}" ] || [ "${FUZZER_TARGET}" == "$fuzzer" ]; then diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index 50b8cee9..ef751096 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -47,7 +47,7 @@ jobs: - name: Clone LZbench run: | # git clone --depth 1 https://github.com/inikep/lzbench "${LZBENCH_DIR}" - git clone -b zxc-0.12.x https://github.com/hellobertrand/lzbench "${LZBENCH_DIR}" + git clone -b zxc-0.12.x-dict https://github.com/hellobertrand/lzbench "${LZBENCH_DIR}" - name: Copy Lib ZXC run: | diff --git a/.github/workflows/fuzzing.yml b/.github/workflows/fuzzing.yml index 8c2a21fb..1ecf083c 100644 --- a/.github/workflows/fuzzing.yml +++ b/.github/workflows/fuzzing.yml @@ -42,7 +42,7 @@ jobs: fail-fast: false matrix: sanitizer: [address, undefined] - fuzzer: [decompress, roundtrip, seekable, pstream] + fuzzer: [decompress, roundtrip, seekable, pstream, dict] steps: - name: Checkout Repository @@ -132,7 +132,7 @@ jobs: CFLAGS="-g -O1 -fprofile-instr-generate -fcoverage-mapping" DEFS="-DZXC_FUNCTION_SUFFIX=_default 
-DZXC_ONLY_DEFAULT" INCLUDES="-I include -I src/lib/vendors" - SOURCES="src/lib/zxc_common.c src/lib/zxc_compress.c src/lib/zxc_decompress.c src/lib/zxc_driver.c src/lib/zxc_dispatch.c src/lib/zxc_huffman.c src/lib/zxc_seekable.c src/lib/zxc_pstream.c" + SOURCES="src/lib/zxc_common.c src/lib/zxc_compress.c src/lib/zxc_decompress.c src/lib/zxc_dict.c src/lib/zxc_driver.c src/lib/zxc_dispatch.c src/lib/zxc_huffman.c src/lib/zxc_seekable.c src/lib/zxc_pstream.c" clang $CFLAGS $INCLUDES $DEFS $SOURCES tests/fuzz_roundtrip.c \ -fsanitize=fuzzer -lm -lpthread -o build-cov/fuzz_roundtrip @@ -146,6 +146,9 @@ jobs: clang $CFLAGS $INCLUDES $DEFS $SOURCES tests/fuzz_pstream.c \ -fsanitize=fuzzer -lm -lpthread -o build-cov/fuzz_pstream + clang $CFLAGS $INCLUDES $DEFS $SOURCES tests/fuzz_dict.c \ + -fsanitize=fuzzer -lm -lpthread -o build-cov/fuzz_dict + - name: Replay Corpora run: | LLVM_PROFILE_FILE="build-cov/roundtrip.profraw" \ diff --git a/CMakeLists.txt b/CMakeLists.txt index 98e8ff91..5bb7bfa3 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -214,6 +214,7 @@ endif() if(CMAKE_SYSTEM_NAME STREQUAL "Emscripten") set(ZXC_CORE_SOURCES src/lib/zxc_common.c + src/lib/zxc_dict.c src/lib/zxc_dispatch.c src/lib/zxc_pstream.c src/lib/zxc_seekable.c @@ -221,6 +222,7 @@ if(CMAKE_SYSTEM_NAME STREQUAL "Emscripten") else() set(ZXC_CORE_SOURCES src/lib/zxc_common.c + src/lib/zxc_dict.c src/lib/zxc_driver.c src/lib/zxc_dispatch.c src/lib/zxc_pstream.c @@ -419,6 +421,7 @@ if(ZXC_BUILD_TESTS) tests/test_seekable_mt.c tests/test_format.c tests/test_misc.c + tests/test_dict.c ) # When building shared libraries, create a static version for tests @@ -429,6 +432,7 @@ if(ZXC_BUILD_TESTS) # and is already pulled in via ${ZXC_VARIANT_OBJECTS} below. 
add_library(zxc_lib_static STATIC src/lib/zxc_common.c + src/lib/zxc_dict.c src/lib/zxc_driver.c src/lib/zxc_dispatch.c src/lib/zxc_pstream.c diff --git a/README.md b/README.md index 30b40a51..973d36d5 100644 --- a/README.md +++ b/README.md @@ -460,6 +460,47 @@ zxc_compress_opts_t opts = { --- +## Dictionary Compression + +For workloads compressed in **small blocks** (4 KB–128 KB), a pre-trained dictionary dramatically improves compression ratio. Because the dictionary prefills the LZ77 sliding window at the *start of each block*, the benefit is per-block: a block only has its own preceding bytes as history, so the smaller the block, the more it leans on the dictionary for representative patterns. This applies whether the input is a single small payload or a large payload split into many small blocks — any time the block size is small enough that early bytes would otherwise lack history to match against. + +**Typical use cases:** JSON API responses, small game assets, structured logs, key-value store records, RPC messages, and any large but homogeneous corpus compressed in small blocks for random access (e.g. seekable archives). + +The dictionary is **embedded in the archive** — there is no separate dictionary file to manage or ship. Decompression reads it from the archive automatically; nothing extra is needed at decode time. + +### Auto-trained dictionary (CLI) + +```bash +# Train a dictionary from the input and embed it — zero configuration +zxc -z -S -B 4K --auto-dict input.json +zxc -d input.json.zxc # no dictionary needed: it is in the archive +``` + +### Embedding a dictionary via the C API + +```c +// Train a dictionary from representative samples... +const void* samples[] = { buf1, buf2, buf3 }; +size_t sizes[] = { len1, len2, len3 }; +uint8_t dict[32768]; +int64_t dict_sz = zxc_train_dict(samples, sizes, 3, dict, sizeof(dict)); + +// ...then pass it to the compressor: it is embedded in the archive. 
+zxc_compress_opts_t copts = { + .level = ZXC_LEVEL_DEFAULT, + .dict = dict, + .dict_size = (size_t)dict_sz, +}; +int64_t compressed_size = zxc_compress(src, src_size, dst, dst_cap, &copts); + +// Decompression needs no dictionary — it is read from the archive. +int64_t original_size = zxc_decompress(compressed, comp_size, out, out_cap, NULL); +``` + +The dictionary is stored as a `ZXC_BLOCK_DICT` block right after the file header (flagged by `HAS_DICTIONARY`). A corrupted or mismatched embedded dictionary surfaces as `ZXC_ERROR_DICT_MISMATCH`. See [FORMAT.md](docs/FORMAT.md) §12 for the full specification. + +--- + ## Usage ### 1. CLI diff --git a/conformance/test_conformance.c b/conformance/test_conformance.c index 42a383f5..147070b4 100644 --- a/conformance/test_conformance.c +++ b/conformance/test_conformance.c @@ -70,6 +70,7 @@ static int has_suffix(const char *s, const char *suffix) /* ---------- valid vector test -------------------------------------------- */ + static int test_valid_vector(const char *zxc_path, const char *expected_path) { size_t comp_sz = 0, expected_sz = 0; @@ -103,8 +104,9 @@ static int test_valid_vector(const char *zxc_path, const char *expected_path) fprintf(stderr, "FAIL: %s OOM\n", zxc_path); ok = 0; } else { + zxc_decompress_opts_t dopts = {0}; int64_t result = zxc_decompress(comp, comp_sz, - output, (size_t)dec_sz, NULL); + output, (size_t)dec_sz, &dopts); if (result < 0) { fprintf(stderr, "FAIL: %s decompress failed: %s\n", zxc_path, zxc_error_name((int)result)); diff --git a/conformance/valid/dict_embedded.expected b/conformance/valid/dict_embedded.expected new file mode 100644 index 00000000..0a810edb --- /dev/null +++ b/conformance/valid/dict_embedded.expected @@ -0,0 +1,400 @@ +{"id":00000,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00001,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00002,"type":"event","level":"INFO","msg":"user logged in","ok":true} 
+{"id":00003,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00004,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00005,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00006,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00007,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00008,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00009,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00010,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00011,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00012,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00013,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00014,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00015,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00016,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00017,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00018,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00019,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00020,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00021,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00022,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00023,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00024,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00025,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00026,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00027,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00028,"type":"event","level":"INFO","msg":"user logged 
in","ok":true} +{"id":00029,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00030,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00031,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00032,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00033,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00034,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00035,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00036,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00037,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00038,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00039,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00040,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00041,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00042,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00043,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00044,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00045,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00046,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00047,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00048,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00049,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00050,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00051,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00052,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00053,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00054,"type":"event","level":"INFO","msg":"user 
logged in","ok":true} +{"id":00055,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00056,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00057,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00058,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00059,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00060,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00061,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00062,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00063,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00064,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00065,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00066,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00067,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00068,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00069,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00070,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00071,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00072,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00073,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00074,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00075,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00076,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00077,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00078,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00079,"type":"event","level":"INFO","msg":"user logged in","ok":true} 
+{"id":00080,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00081,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00082,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00083,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00084,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00085,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00086,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00087,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00088,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00089,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00090,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00091,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00092,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00093,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00094,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00095,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00096,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00097,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00098,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00099,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00100,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00101,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00102,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00103,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00104,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00105,"type":"event","level":"INFO","msg":"user logged 
in","ok":true} +{"id":00106,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00107,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00108,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00109,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00110,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00111,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00112,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00113,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00114,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00115,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00116,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00117,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00118,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00119,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00120,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00121,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00122,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00123,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00124,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00125,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00126,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00127,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00128,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00129,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00130,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00131,"type":"event","level":"INFO","msg":"user 
logged in","ok":true} +{"id":00132,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00133,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00134,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00135,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00136,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00137,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00138,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00139,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00140,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00141,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00142,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00143,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00144,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00145,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00146,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00147,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00148,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00149,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00150,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00151,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00152,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00153,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00154,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00155,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00156,"type":"event","level":"INFO","msg":"user logged in","ok":true} 
+{"id":00157,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00158,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00159,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00160,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00161,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00162,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00163,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00164,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00165,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00166,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00167,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00168,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00169,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00170,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00171,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00172,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00173,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00174,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00175,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00176,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00177,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00178,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00179,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00180,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00181,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00182,"type":"event","level":"INFO","msg":"user logged 
in","ok":true} +{"id":00183,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00184,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00185,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00186,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00187,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00188,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00189,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00190,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00191,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00192,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00193,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00194,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00195,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00196,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00197,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00198,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00199,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00200,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00201,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00202,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00203,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00204,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00205,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00206,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00207,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00208,"type":"event","level":"INFO","msg":"user 
logged in","ok":true} +{"id":00209,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00210,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00211,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00212,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00213,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00214,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00215,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00216,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00217,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00218,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00219,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00220,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00221,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00222,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00223,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00224,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00225,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00226,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00227,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00228,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00229,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00230,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00231,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00232,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00233,"type":"event","level":"INFO","msg":"user logged in","ok":true} 
+{"id":00234,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00235,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00236,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00237,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00238,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00239,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00240,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00241,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00242,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00243,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00244,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00245,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00246,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00247,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00248,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00249,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00250,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00251,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00252,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00253,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00254,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00255,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00256,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00257,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00258,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00259,"type":"event","level":"INFO","msg":"user logged 
in","ok":true} +{"id":00260,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00261,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00262,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00263,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00264,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00265,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00266,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00267,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00268,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00269,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00270,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00271,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00272,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00273,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00274,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00275,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00276,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00277,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00278,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00279,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00280,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00281,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00282,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00283,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00284,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00285,"type":"event","level":"INFO","msg":"user 
logged in","ok":true} +{"id":00286,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00287,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00288,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00289,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00290,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00291,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00292,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00293,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00294,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00295,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00296,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00297,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00298,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00299,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00300,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00301,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00302,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00303,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00304,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00305,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00306,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00307,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00308,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00309,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00310,"type":"event","level":"INFO","msg":"user logged in","ok":true} 
+{"id":00311,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00312,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00313,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00314,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00315,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00316,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00317,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00318,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00319,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00320,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00321,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00322,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00323,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00324,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00325,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00326,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00327,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00328,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00329,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00330,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00331,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00332,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00333,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00334,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00335,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00336,"type":"event","level":"INFO","msg":"user logged 
in","ok":true} +{"id":00337,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00338,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00339,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00340,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00341,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00342,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00343,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00344,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00345,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00346,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00347,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00348,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00349,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00350,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00351,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00352,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00353,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00354,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00355,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00356,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00357,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00358,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00359,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00360,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00361,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00362,"type":"event","level":"INFO","msg":"user 
logged in","ok":true} +{"id":00363,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00364,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00365,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00366,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00367,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00368,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00369,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00370,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00371,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00372,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00373,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00374,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00375,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00376,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00377,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00378,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00379,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00380,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00381,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00382,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00383,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00384,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00385,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00386,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00387,"type":"event","level":"INFO","msg":"user logged in","ok":true} 
+{"id":00388,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00389,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00390,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00391,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00392,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00393,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00394,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00395,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00396,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00397,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00398,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00399,"type":"event","level":"INFO","msg":"user logged in","ok":true} diff --git a/conformance/valid/dict_embedded.zxc b/conformance/valid/dict_embedded.zxc new file mode 100644 index 00000000..ac7c2df8 Binary files /dev/null and b/conformance/valid/dict_embedded.zxc differ diff --git a/conformance/valid/dict_embedded_seekable.expected b/conformance/valid/dict_embedded_seekable.expected new file mode 100644 index 00000000..0a810edb --- /dev/null +++ b/conformance/valid/dict_embedded_seekable.expected @@ -0,0 +1,400 @@ +{"id":00000,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00001,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00002,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00003,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00004,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00005,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00006,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00007,"type":"event","level":"INFO","msg":"user logged in","ok":true} 
+{"id":00008,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00009,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00010,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00011,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00012,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00013,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00014,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00015,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00016,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00017,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00018,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00019,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00020,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00021,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00022,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00023,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00024,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00025,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00026,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00027,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00028,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00029,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00030,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00031,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00032,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00033,"type":"event","level":"INFO","msg":"user logged 
in","ok":true} +{"id":00034,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00035,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00036,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00037,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00038,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00039,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00040,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00041,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00042,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00043,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00044,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00045,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00046,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00047,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00048,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00049,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00050,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00051,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00052,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00053,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00054,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00055,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00056,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00057,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00058,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00059,"type":"event","level":"INFO","msg":"user 
logged in","ok":true} +{"id":00060,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00061,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00062,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00063,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00064,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00065,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00066,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00067,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00068,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00069,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00070,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00071,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00072,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00073,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00074,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00075,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00076,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00077,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00078,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00079,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00080,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00081,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00082,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00083,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00084,"type":"event","level":"INFO","msg":"user logged in","ok":true} 
+{"id":00085,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00086,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00087,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00088,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00089,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00090,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00091,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00092,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00093,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00094,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00095,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00096,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00097,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00098,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00099,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00100,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00101,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00102,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00103,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00104,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00105,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00106,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00107,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00108,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00109,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00110,"type":"event","level":"INFO","msg":"user logged 
in","ok":true} +{"id":00111,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00112,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00113,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00114,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00115,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00116,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00117,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00118,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00119,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00120,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00121,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00122,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00123,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00124,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00125,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00126,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00127,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00128,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00129,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00130,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00131,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00132,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00133,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00134,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00135,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00136,"type":"event","level":"INFO","msg":"user 
logged in","ok":true} +{"id":00137,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00138,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00139,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00140,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00141,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00142,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00143,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00144,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00145,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00146,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00147,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00148,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00149,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00150,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00151,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00152,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00153,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00154,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00155,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00156,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00157,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00158,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00159,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00160,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00161,"type":"event","level":"INFO","msg":"user logged in","ok":true} 
+{"id":00162,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00163,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00164,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00165,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00166,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00167,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00168,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00169,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00170,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00171,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00172,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00173,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00174,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00175,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00176,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00177,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00178,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00179,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00180,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00181,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00182,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00183,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00184,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00185,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00186,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00187,"type":"event","level":"INFO","msg":"user logged 
in","ok":true} +{"id":00188,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00189,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00190,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00191,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00192,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00193,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00194,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00195,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00196,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00197,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00198,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00199,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00200,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00201,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00202,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00203,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00204,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00205,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00206,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00207,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00208,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00209,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00210,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00211,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00212,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00213,"type":"event","level":"INFO","msg":"user 
logged in","ok":true} +{"id":00214,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00215,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00216,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00217,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00218,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00219,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00220,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00221,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00222,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00223,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00224,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00225,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00226,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00227,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00228,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00229,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00230,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00231,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00232,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00233,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00234,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00235,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00236,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00237,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00238,"type":"event","level":"INFO","msg":"user logged in","ok":true} 
+{"id":00239,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00240,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00241,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00242,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00243,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00244,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00245,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00246,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00247,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00248,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00249,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00250,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00251,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00252,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00253,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00254,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00255,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00256,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00257,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00258,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00259,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00260,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00261,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00262,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00263,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00264,"type":"event","level":"INFO","msg":"user logged 
in","ok":true} +{"id":00265,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00266,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00267,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00268,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00269,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00270,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00271,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00272,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00273,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00274,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00275,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00276,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00277,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00278,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00279,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00280,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00281,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00282,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00283,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00284,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00285,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00286,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00287,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00288,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00289,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00290,"type":"event","level":"INFO","msg":"user 
logged in","ok":true} +{"id":00291,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00292,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00293,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00294,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00295,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00296,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00297,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00298,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00299,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00300,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00301,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00302,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00303,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00304,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00305,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00306,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00307,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00308,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00309,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00310,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00311,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00312,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00313,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00314,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00315,"type":"event","level":"INFO","msg":"user logged in","ok":true} 
+{"id":00316,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00317,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00318,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00319,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00320,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00321,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00322,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00323,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00324,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00325,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00326,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00327,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00328,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00329,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00330,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00331,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00332,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00333,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00334,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00335,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00336,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00337,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00338,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00339,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00340,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00341,"type":"event","level":"INFO","msg":"user logged 
in","ok":true} +{"id":00342,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00343,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00344,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00345,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00346,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00347,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00348,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00349,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00350,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00351,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00352,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00353,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00354,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00355,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00356,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00357,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00358,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00359,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00360,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00361,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00362,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00363,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00364,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00365,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00366,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00367,"type":"event","level":"INFO","msg":"user 
logged in","ok":true} +{"id":00368,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00369,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00370,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00371,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00372,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00373,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00374,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00375,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00376,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00377,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00378,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00379,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00380,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00381,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00382,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00383,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00384,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00385,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00386,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00387,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00388,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00389,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00390,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00391,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00392,"type":"event","level":"INFO","msg":"user logged in","ok":true} 
+{"id":00393,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00394,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00395,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00396,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00397,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00398,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00399,"type":"event","level":"INFO","msg":"user logged in","ok":true} diff --git a/conformance/valid/dict_embedded_seekable.zxc b/conformance/valid/dict_embedded_seekable.zxc new file mode 100644 index 00000000..bb56eee5 Binary files /dev/null and b/conformance/valid/dict_embedded_seekable.zxc differ diff --git a/docs/API.md b/docs/API.md index e9ad810d..e6b61db2 100644 --- a/docs/API.md +++ b/docs/API.md @@ -28,6 +28,7 @@ For the on-disk binary format see [`FORMAT.md`](FORMAT.md). - [10. Streaming API](#10-streaming-api) - [10b. Push Streaming API](#10b-push-streaming-api) - [11. Seekable API](#11-seekable-api) +- [11b. Dictionary API](#11b-dictionary-api) - [12. Error Handling](#12-error-handling) - [13. Thread Safety](#13-thread-safety) - [14. Exported Symbols Summary](#14-exported-symbols-summary) @@ -40,7 +41,9 @@ For the on-disk binary format see [`FORMAT.md`](FORMAT.md). 
zxc.h <- freestanding umbrella (no <stdio.h>; kernel-safe) ├── zxc_buffer.h <- Buffer API + Reusable Context API │ └── zxc_export.h <- visibility macros -├── zxc_constants.h <- version macros, compression levels, block sizes +├── zxc_constants.h <- version macros, compression levels, block sizes, dict sizes +├── zxc_dict.h <- Dictionary training, identification +│ └── zxc_export.h ├── zxc_error.h <- error codes + zxc_error_name() │ └── zxc_export.h ├── zxc_opts.h <- compression / decompression options structs @@ -204,7 +207,10 @@ typedef enum { ZXC_ERROR_IO = -11, // file read/write/seek failure ZXC_ERROR_NULL_INPUT = -12, // required pointer is NULL ZXC_ERROR_BAD_BLOCK_TYPE = -13, // unknown block type - ZXC_ERROR_BAD_BLOCK_SIZE = -14 // invalid block size + ZXC_ERROR_BAD_BLOCK_SIZE = -14, // invalid block size + ZXC_ERROR_DICT_REQUIRED = -15, // file requires a dictionary but none provided + ZXC_ERROR_DICT_MISMATCH = -16, // provided dictionary ID does not match header + ZXC_ERROR_DICT_TOO_LARGE = -17 // dictionary exceeds ZXC_DICT_SIZE_MAX } zxc_error_t; ``` @@ -225,6 +231,8 @@ typedef struct { size_t block_size; // Block size in bytes (0 = 512 KB default). int checksum_enabled; // 1 = enable checksums, 0 = disable. int seekable; // 1 = append seek table for random access. + const void* dict; // Pre-trained dictionary content (NULL = none). + size_t dict_size; // Dictionary size in bytes (0 = none, max 64 KB). zxc_progress_callback_t progress_cb; // Optional callback (NULL to disable). void* user_data; // Passed through to progress_cb. } zxc_compress_opts_t; @@ -236,6 +244,8 @@ typedef struct { typedef struct { int n_threads; // Worker thread count (0 = auto-detect). int checksum_enabled; // 1 = verify checksums, 0 = skip. + const void* dict; // Pre-trained dictionary content (NULL = none). + size_t dict_size; // Dictionary size in bytes (0 = none). zxc_progress_callback_t progress_cb; // Optional callback. void* user_data; // Passed through to progress_cb.
} zxc_decompress_opts_t; @@ -670,7 +680,8 @@ is `NULL`. ```c ZXC_EXPORT size_t zxc_static_dctx_workspace_size( - const size_t block_size + const size_t block_size, + const size_t max_dict_size ); ``` @@ -680,7 +691,12 @@ for the given `block_size`. Unlike the compression variant, this size is provisioned worst-case because the decoder cannot predict the per-block literal encoding (RAW / RLE / HUFFMAN) until it sees each block header. -**Returns**: workspace size in bytes, or `0` if `block_size` is invalid. +`max_dict_size` (0 = no dictionary support) reserves room for decoding archives +that embed a dictionary up to that size (≤ `ZXC_DICT_SIZE_MAX`). With `0`, an +archive that embeds a dictionary is rejected with `ZXC_ERROR_DICT_REQUIRED` +(the malloc-free path cannot allocate a dictionary buffer on demand). + +**Returns**: workspace size in bytes, or `0` if a parameter is invalid. ### `zxc_init_static_dctx` @@ -688,14 +704,17 @@ literal encoding (RAW / RLE / HUFFMAN) until it sees each block header. ZXC_EXPORT zxc_dctx* zxc_init_static_dctx( void* workspace, const size_t workspace_size, - const size_t block_size + const size_t block_size, + const size_t max_dict_size ); ``` Initialises a decompression context inside a caller-supplied workspace. `block_size` is **pinned** at init time: feeding the returned handle an archive whose file header declares a different `block_size` returns -`ZXC_ERROR_BAD_BLOCK_SIZE`. +`ZXC_ERROR_BAD_BLOCK_SIZE`. `max_dict_size` must match the value passed to +`zxc_static_dctx_workspace_size`; an archive whose embedded dictionary exceeds +it returns `ZXC_ERROR_DICT_TOO_LARGE`. The returned handle points inside `workspace`; the workspace must remain valid for the lifetime of the handle. `zxc_free_dctx` is a no-op. 
@@ -727,11 +746,11 @@ zxc_free_cctx(cctx); /* no-op for static */ free(cws); /* caller owns the workspace */ /* --- Decompression side --- */ -size_t dws_sz = zxc_static_dctx_workspace_size(BLOCK_SZ); +size_t dws_sz = zxc_static_dctx_workspace_size(BLOCK_SZ, 0); void *dws = NULL; posix_memalign(&dws, 64, dws_sz); -zxc_dctx *dctx = zxc_init_static_dctx(dws, dws_sz, BLOCK_SZ); +zxc_dctx *dctx = zxc_init_static_dctx(dws, dws_sz, BLOCK_SZ, 0); zxc_decompress_dctx(dctx, in, in_sz, out, out_cap, NULL); zxc_free_dctx(dctx); /* no-op */ free(dws); @@ -804,7 +823,7 @@ must know it *before* calling `init`. Four patterns cover every use case: archive at the cost of over-allocation (~4 MB dctx). ```c - size_t dws_sz = zxc_static_dctx_workspace_size(ZXC_BLOCK_SIZE_MAX); + size_t dws_sz = zxc_static_dctx_workspace_size(ZXC_BLOCK_SIZE_MAX, 0); ``` If the workspace pool must stay tight and worst-case sizing is too @@ -1278,6 +1297,42 @@ Returns the encoded byte size of a seek table for `num_blocks` blocks. --- +## 11b. Dictionary API + +Declared in `<zxc_dict.h>` (`zxc_seekable_set_dict` is declared in `<zxc_seekable.h>`). Provides dictionary training and identification. A trained dictionary is passed to the compressor (`zxc_compress_opts_t::dict`) and embedded in the archive; there is no standalone dictionary file format. + +### `zxc_train_dict` + +```c +ZXC_EXPORT int64_t zxc_train_dict( + const void* const* samples, + const size_t* sample_sizes, + size_t n_samples, + void* dict_buf, + size_t dict_capacity // max ZXC_DICT_SIZE_MAX (64KB - 1) +); +``` + +Trains a dictionary from a corpus of representative samples. Returns the size of the trained dictionary, or a negative `zxc_error_t` code. + +### `zxc_dict_id` + +```c +ZXC_EXPORT uint32_t zxc_dict_id(const void* dict, size_t dict_size); +``` + +Returns a deterministic 32-bit hash of the dictionary content. This ID is stored in the ZXC file header and verified against the embedded dictionary at decompression time. Returns 0 for NULL/empty input.
+ +### `zxc_seekable_set_dict` + +```c +ZXC_EXPORT int zxc_seekable_set_dict(zxc_seekable* s, const void* dict, size_t dict_size); +``` + +Attaches a dictionary to a seekable handle for random-access decompression. The content is copied internally. Must be called before any `zxc_seekable_decompress_range()` call. + +--- + ## 12. Error Handling ### `zxc_error_name` @@ -1374,6 +1429,9 @@ The shared library exports **47 symbols** (verified with `nm -gU`): | 49 | `zxc_write_seek_table` | Seekable | `zxc_seekable.h` | | 50 | `zxc_seek_table_size` | Seekable | `zxc_seekable.h` | | 51 | `zxc_error_name` | Error | `zxc_error.h` | +| 52 | `zxc_train_dict` | Dictionary | `zxc_dict.h` | +| 53 | `zxc_dict_id` | Dictionary | `zxc_dict.h` | +| 54 | `zxc_seekable_set_dict` | Seekable | `zxc_seekable.h` | No internal symbols leak into the public ABI. FMV dispatch variants (`_default`, `_neon`, `_avx2`, `_avx512`) are compiled with diff --git a/docs/FORMAT.md b/docs/FORMAT.md index 00ec4269..e37ccc23 100644 --- a/docs/FORMAT.md +++ b/docs/FORMAT.md @@ -63,9 +63,12 @@ Offset Size Field - Valid block sizes are powers of 2 in the range **4 KB – 2 MB**. - **Flags** (`u8`): - Bit 7 (`0x80`): `HAS_CHECKSUM`. + - Bit 6 (`0x40`): `HAS_DICTIONARY` — a pre-trained dictionary is required for decompression. - Bits 0..3: checksum algorithm id (`0` = RapidHash-based folding). - - Bits 4..6: reserved. -- **Reserved**: 7 bytes set to zero. + - Bits 4..5: reserved. +- **Reserved / Dictionary ID**: 7 bytes. + - When `HAS_DICTIONARY` is set: bytes `0x07..0x0A` contain a `dict_id` (`u32` LE), a 32-bit hash of the dictionary content. Bytes `0x0B..0x0D` remain zero. + - When `HAS_DICTIONARY` is clear: all 7 bytes are zero. - **Header CRC16** (`u16`): computed with `zxc_hash16` on the 16-byte header where bytes `0x0E..0x0F` are zeroed. --- @@ -561,7 +564,70 @@ For decoders processing untrusted input (e.g. network data, user uploads): --- -## 12. Summary of Useful Fixed Sizes +## 12. 
Pre-Trained Dictionary Support + +### 12.1 Overview + +A pre-trained dictionary improves compression ratio on small, similar payloads +(e.g. JSON API responses, game assets, structured logs) by prefilling the LZ77 +sliding window at the start of each block. The dictionary is **embedded in the +archive** (as a `ZXC_BLOCK_DICT` block, §12.4) and identified by a 32-bit ID in +the file header — there is no external dictionary file. Decompression reads the +dictionary from the archive itself. + +### 12.2 Mechanism + +The dictionary contains raw byte content (max 65535 bytes, since LZ77 match +offsets are 16-bit). At compression time, the dictionary is logically prepended to +each block's input, seeding the hash tables so the match finder can reference +dictionary content immediately. At decompression time, the dictionary is +prepended to the output buffer so match copies that reference dictionary bytes +resolve naturally via pointer arithmetic. + +Since each block is independent, the dictionary prefill happens per-block. +This preserves O(1) seekable random-access: load the dictionary once, then +decompress any block independently. + +### 12.3 File header encoding + +When `HAS_DICTIONARY` (flag bit 6) is set, the reserved bytes at offsets +`0x07..0x0A` contain the `dict_id` (`u32` LE), and a `ZXC_BLOCK_DICT` block +(§12.4) carrying the dictionary content immediately follows the file header, +before the first data block. A decoder reads that block, verifies +`zxc_dict_id(content) == header.dict_id` (`ZXC_ERROR_DICT_MISMATCH` if not), and +uses the content as the dictionary for every data block. + +Older decoders that do not recognize the `HAS_DICTIONARY` flag will ignore it +(per §10.3: reserved flag bits are ignored), then fail on the unknown +`ZXC_BLOCK_DICT` block type rather than silently producing corrupt output.
+ +### 12.4 Embedded dictionary block (`ZXC_BLOCK_DICT`) + +The dictionary is stored as a standard block placed right after the 16-byte file +header and before the first data block: + +```text +[ block header (8 bytes) ] block_type = 0xFD (ZXC_BLOCK_DICT), comp_size = N +[ dictionary content (N) ] raw bytes (uncompressed), max 65535 +``` + +The block header is the ordinary 8-byte block header (§7), with +`block_type = ZXC_BLOCK_DICT (253)` and `comp_size` equal to the dictionary +content length. The content is raw bytes that prefill the LZ77 window; it is not +compressed. Seekable readers advance the first data block's offset past this +block. The dictionary ID stored in the file header (`dict_id`) is the +deterministic 32-bit hash (`zxc_dict_id`) of this content. + +### 12.5 Dictionary training + +The `zxc_train_dict()` function analyzes a corpus of representative samples to +select byte segments that maximize LZ77 match coverage. The most frequently +matched segments are placed at the end of the dictionary so they produce the +shortest offsets (closest to the block start in the virtual window). + +--- + +## 13. Summary of Useful Fixed Sizes - File header: **16** bytes - Block header: **8** bytes @@ -573,10 +639,17 @@ For decoders processing untrusted input (e.g. network data, user uploads): - GLO descriptors total: **32** bytes - GHI descriptors total: **24** bytes - File footer: **12** bytes +- Embedded dictionary block header (`ZXC_BLOCK_DICT`): **8** bytes (standard block header) + +**Magic word** — little-endian `u32` at offset `0x00`: + +| File | Magic (value) | On-disk bytes (LE) | +|------|---------------|--------------------| +| ZXC archive (`.zxc`) | `0x9CB02EF5` | `F5 2E B0 9C` | --- -## 13. Worked Example (Real Hexdump) +## 14. Worked Example (Real Hexdump) This example was produced with the CLI from a 10-byte input (`Hello ZXC\n`) using: @@ -586,7 +659,7 @@ zxc -z -C -1 sample.txt Generated archive size: **58 bytes**. 
-### 13.1 Full hexdump +### 14.1 Full hexdump ```text 00000000: F5 2E B0 9C 05 13 80 00 00 00 00 00 00 00 B8 90 @@ -595,7 +668,7 @@ Generated archive size: **58 bytes**. 00000030: 00 00 00 00 00 00 90 BB A1 75 ``` -### 13.2 Byte-level decoding +### 14.2 Byte-level decoding #### A) File Header (offset `0x00`, 16 bytes) @@ -665,7 +738,7 @@ global0 = 0 global1 = rotl1(global0) XOR block_crc = block_crc ``` -### 13.3 Structural view with absolute offsets +### 14.3 Structural view with absolute offsets ```text 0x00..0x0F File Header (16) @@ -676,7 +749,7 @@ global1 = rotl1(global0) XOR block_crc = block_crc 0x2E..0x39 File Footer (12) ``` -### 13.4 Seekable Variant (with Seek Table) +### 14.4 Seekable Variant (with Seek Table) Same 10-byte input (`Hello ZXC\n`), compressed with seekable mode enabled: diff --git a/include/zxc.h b/include/zxc.h index 9cd2be41..e8f5d4d0 100644 --- a/include/zxc.h +++ b/include/zxc.h @@ -10,6 +10,7 @@ #include "zxc_buffer.h" // IWYU pragma: keep #include "zxc_constants.h" // IWYU pragma: keep +#include "zxc_dict.h" // IWYU pragma: keep #include "zxc_error.h" // IWYU pragma: keep #include "zxc_opts.h" // IWYU pragma: keep #include "zxc_pstream.h" // IWYU pragma: keep diff --git a/include/zxc_buffer.h b/include/zxc_buffer.h index fb79d555..0309e78f 100644 --- a/include/zxc_buffer.h +++ b/include/zxc_buffer.h @@ -167,6 +167,18 @@ ZXC_EXPORT int64_t zxc_decompress(const void* src, const size_t src_size, void* */ ZXC_EXPORT uint64_t zxc_get_decompressed_size(const void* src, const size_t src_size); +/** + * @brief Returns the dictionary ID stored in a ZXC compressed buffer. + * + * Reads the file header flag and dict_id field without decompressing. + * Returns 0 if the file does not require a dictionary or the buffer is invalid. + * + * @param[in] src Pointer to the compressed data buffer. + * @param[in] src_size Size of the compressed data in bytes. + * @return Dictionary ID, or 0 if no dictionary is required. 
+ */ +ZXC_EXPORT uint32_t zxc_get_dict_id(const void* src, size_t src_size); + /* ========================================================================= */ /* Block-Level API (no file framing) */ /* ========================================================================= */ @@ -561,11 +573,17 @@ ZXC_EXPORT zxc_cctx* zxc_init_static_cctx(void* workspace, const size_t workspac * the decoder cannot predict the per-block literal encoding until it sees * each block header. * - * @param[in] block_size Maximum block size the decoder will encounter - * (must satisfy the regular block-size constraints). - * @return Workspace size in bytes, or 0 if @p block_size is invalid. + * @param[in] block_size Maximum block size the decoder will encounter + * (must satisfy the regular block-size constraints). + * @param[in] max_dict_size Largest embedded dictionary the handle must be able + * to decode (0 = no dictionary support; archives that + * embed a dictionary are then rejected). When > 0, the + * workspace reserves room for a dictionary decode + * buffer; must be <= @ref ZXC_DICT_SIZE_MAX. + * @return Workspace size in bytes, or 0 if a parameter is invalid. */ -ZXC_EXPORT size_t zxc_static_dctx_workspace_size(const size_t block_size); +ZXC_EXPORT size_t zxc_static_dctx_workspace_size(const size_t block_size, + const size_t max_dict_size); /** * @brief Initialises a decompression context inside a caller-supplied @@ -585,11 +603,17 @@ ZXC_EXPORT size_t zxc_static_dctx_workspace_size(const size_t block_size); * @param[in,out] workspace Caller-allocated buffer, cache-line aligned. * @param[in] workspace_size Capacity of @p workspace in bytes. * @param[in] block_size Block size the decoder will accept. + * @param[in] max_dict_size Largest embedded dictionary to support (0 = + * none). Must match the value passed to + * @ref zxc_static_dctx_workspace_size and be + * <= @ref ZXC_DICT_SIZE_MAX. 
An archive whose + * embedded dictionary exceeds this returns + * @ref ZXC_ERROR_DICT_TOO_LARGE. * @return Handle pointing inside @p workspace on success, or @c NULL if - * the workspace is too small or @p block_size is invalid. + * the workspace is too small or a parameter is invalid. */ ZXC_EXPORT zxc_dctx* zxc_init_static_dctx(void* workspace, const size_t workspace_size, - const size_t block_size); + const size_t block_size, const size_t max_dict_size); /** @} */ /* end of static_context_api */ /** @} */ /* end of context_api */ diff --git a/include/zxc_constants.h b/include/zxc_constants.h index 23c46d1a..d22ae35d 100644 --- a/include/zxc_constants.h +++ b/include/zxc_constants.h @@ -64,6 +64,18 @@ #define ZXC_BLOCK_SIZE_MAX (1U << ZXC_BLOCK_SIZE_MAX_LOG2) /** @} */ /* end of block_size */ +/** + * @defgroup dictionary Dictionary + * @brief Constants for pre-trained dictionary support. + * @{ + */ +/** @brief Maximum dictionary content size in bytes (64 KB - 1). + * + * Bounded to 65535 because LZ77 match offsets are 16-bit (max distance 65535): + * a dictionary byte farther back than that could never be referenced. */ +#define ZXC_DICT_SIZE_MAX ((1U << 16) - 1U) +/** @} */ /* end of dictionary */ + /** * @defgroup threading Threading Limits * @brief Bounds on thread-count parameters accepted by the streaming APIs. diff --git a/include/zxc_dict.h b/include/zxc_dict.h new file mode 100644 index 00000000..0d38b34b --- /dev/null +++ b/include/zxc_dict.h @@ -0,0 +1,85 @@ +/* + * ZXC - High-performance lossless compression + * + * Copyright (c) 2025-2026 Bertrand Lebonnois and contributors. + * SPDX-License-Identifier: BSD-3-Clause + */ + +/** + * @file zxc_dict.h + * @brief Pre-trained dictionary API for ZXC compression. + * + * Provides functions to train and identify dictionaries that improve + * compression ratio on small, similar payloads. 
A dictionary is raw byte + * content that prefills the LZ77 sliding window at the start of each block, + * giving the compressor immediate access to representative patterns. + * + * Dictionaries are embedded in the archive (no standalone file format): pass + * trained content to zxc_compress_opts_t::dict and it is stored in the archive, + * so decompression needs no external dictionary. + * + * @code + * // Train a dictionary from a corpus of JSON samples and embed it + * void* dict_buf = malloc(32768); + * int64_t dict_sz = zxc_train_dict(samples, sizes, n, dict_buf, 32768); + * zxc_compress_opts_t opts = { .level = 3, .dict = dict_buf, .dict_size = dict_sz }; + * zxc_compress(src, src_size, dst, dst_capacity, &opts); + * @endcode + */ + +#ifndef ZXC_DICT_H +#define ZXC_DICT_H + +#include <stddef.h> +#include <stdint.h> + +#include "zxc_export.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * @defgroup dict Dictionary + * @brief Pre-trained dictionary training and identification. + * @{ + */ + +/** + * @brief Compute the dictionary ID for the given content. + * + * The ID is a deterministic 32-bit hash of the raw dictionary content. + * It is stored in the ZXC file header so the decoder can verify the embedded + * dictionary matches. + * + * @param[in] dict Pointer to dictionary content. + * @param[in] dict_size Size in bytes. + * @return 32-bit dictionary ID. Returns 0 if @p dict is NULL or @p dict_size is 0. + */ +ZXC_EXPORT uint32_t zxc_dict_id(const void* dict, size_t dict_size); + +/** + * @brief Train a dictionary from a corpus of samples. + * + * Analyzes the samples to select byte sequences that maximize LZ77 match + * coverage. The resulting dictionary content can be passed directly to + * zxc_compress_opts_t::dict (it is then embedded in the archive). + * + * @param[in] samples Array of pointers to sample buffers. + * @param[in] sample_sizes Array of sample sizes in bytes. + * @param[in] n_samples Number of samples.
+ * @param[out] dict_buf Output buffer for trained dictionary content. + * @param[in] dict_capacity Capacity of @p dict_buf (max ZXC_DICT_SIZE_MAX). + * @return Size of the trained dictionary on success, or a negative + * @ref zxc_error_t code. + */ +ZXC_EXPORT int64_t zxc_train_dict(const void* const* samples, const size_t* sample_sizes, + size_t n_samples, void* dict_buf, size_t dict_capacity); + +/** @} */ /* end of dict */ + +#ifdef __cplusplus +} +#endif + +#endif /* ZXC_DICT_H */ diff --git a/include/zxc_error.h b/include/zxc_error.h index 1326e563..8b300686 100644 --- a/include/zxc_error.h +++ b/include/zxc_error.h @@ -66,6 +66,11 @@ typedef enum { ZXC_ERROR_BAD_BLOCK_TYPE = -13, /**< Unknown or unexpected block type. */ ZXC_ERROR_BAD_BLOCK_SIZE = -14, /**< Invalid block size. */ + /* Dictionary errors */ + ZXC_ERROR_DICT_REQUIRED = -15, /**< File requires a dictionary but none was provided. */ + ZXC_ERROR_DICT_MISMATCH = -16, /**< Provided dictionary ID does not match the file header. */ + ZXC_ERROR_DICT_TOO_LARGE = -17, /**< Dictionary exceeds maximum allowed size. */ + } zxc_error_t; /** diff --git a/include/zxc_opts.h b/include/zxc_opts.h index 6517e77c..e52e5d75 100644 --- a/include/zxc_opts.h +++ b/include/zxc_opts.h @@ -63,6 +63,8 @@ typedef struct { of 2, [4KB - 2MB]. */ int checksum_enabled; /**< 1 to enable per-block and global checksums, 0 to disable. */ int seekable; /**< 1 to append a seek table for random-access decompression. */ + const void* dict; /**< Pre-trained dictionary content (NULL = none). */ + size_t dict_size; /**< Dictionary size in bytes (0 = none, max ZXC_DICT_SIZE_MAX). */ zxc_progress_callback_t progress_cb; /**< Optional progress callback (NULL to disable). */ void* user_data; /**< User context pointer passed to progress_cb. */ } zxc_compress_opts_t; @@ -80,6 +82,8 @@ typedef struct { typedef struct { int n_threads; /**< Worker thread count (0 = auto-detect CPU cores). 
*/ int checksum_enabled; /**< 1 to verify per-block and global checksums, 0 to skip. */ + const void* dict; /**< Pre-trained dictionary content (NULL = none). */ + size_t dict_size; /**< Dictionary size in bytes (0 = none). */ zxc_progress_callback_t progress_cb; /**< Optional progress callback (NULL to disable). */ void* user_data; /**< User context pointer passed to progress_cb. */ } zxc_decompress_opts_t; diff --git a/include/zxc_seekable.h b/include/zxc_seekable.h index 84549f3b..05db650b 100644 --- a/include/zxc_seekable.h +++ b/include/zxc_seekable.h @@ -229,6 +229,20 @@ ZXC_EXPORT int64_t zxc_seekable_decompress_range_mt(zxc_seekable* s, void* dst, */ ZXC_EXPORT void zxc_seekable_free(zxc_seekable* s); +/** + * @brief Attach a pre-trained dictionary to a seekable handle. + * + * The dictionary content is copied internally; the caller may free + * @p dict after this call returns. Must be called before any + * zxc_seekable_decompress_range() call. + * + * @param[in] s Seekable handle. + * @param[in] dict Dictionary content. + * @param[in] dict_size Size in bytes (max ZXC_DICT_SIZE_MAX). + * @return @ref ZXC_OK on success, or a negative @ref zxc_error_t code. 
+ */ +ZXC_EXPORT int zxc_seekable_set_dict(zxc_seekable* s, const void* dict, size_t dict_size); + /* ========================================================================= */ /* Seek Table Writer (low-level) */ /* ========================================================================= */ diff --git a/meson.build b/meson.build index a8af39f3..788f64e7 100644 --- a/meson.build +++ b/meson.build @@ -81,6 +81,7 @@ endforeach libzxc_sources = files( 'src/lib/zxc_common.c', + 'src/lib/zxc_dict.c', 'src/lib/zxc_driver.c', 'src/lib/zxc_dispatch.c', 'src/lib/zxc_pstream.c', @@ -137,6 +138,7 @@ install_headers( 'include/zxc.h', 'include/zxc_buffer.h', 'include/zxc_constants.h', + 'include/zxc_dict.h', 'include/zxc_error.h', 'include/zxc_export.h', 'include/zxc_opts.h', @@ -178,6 +180,7 @@ if not meson.is_subproject() 'tests/test_seekable_mt.c', 'tests/test_format.c', 'tests/test_misc.c', + 'tests/test_dict.c', ), include_directories : libzxc_includes, link_with : libzxc_static, diff --git a/src/cli/main.c b/src/cli/main.c index ef5dfeeb..d4ff5f41 100644 --- a/src/cli/main.c +++ b/src/cli/main.c @@ -22,6 +22,8 @@ #include "../../include/zxc_buffer.h" #include "../../include/zxc_constants.h" +#include "../../include/zxc_dict.h" +#include "../../include/zxc_error.h" #include "../../include/zxc_stream.h" #define ZXC_STDIO_BUFFER_SIZE (1024 * 1024) @@ -297,6 +299,7 @@ static int zxc_validate_output_path(const char* path, char* resolved_buffer, siz // CLI Logging Helpers static int g_quiet = 0; static int g_verbose = 0; +static int g_auto_dict = 0; /** * @brief Standard logging function. Respects the global quiet flag. 
@@ -345,23 +348,23 @@ typedef enum { MODE_LIST } zxc_mode_t; -enum { OPT_VERSION = 1000, OPT_HELP }; +enum { OPT_VERSION = 1000, OPT_HELP, OPT_AUTO_DICT }; // Forward declaration for recursive mode static int process_single_file(const char* in_path, const char* out_path_override, zxc_mode_t mode, int num_threads, int keep_input, int force, int to_stdout, int checksum, int level, size_t block_size, int json_output, - int seekable); + int seekable, const void* dict, size_t dict_size); // Forward declaration for processing directory static int process_directory(const char* dir_path, zxc_mode_t mode, int num_threads, int keep_input, int force, int to_stdout, int checksum, int level, size_t block_size, - int json_output, int seekable); + int json_output, int seekable, const void* dict, size_t dict_size); // OS-specific implementation of directory processing static int process_directory(const char* dir_path, zxc_mode_t mode, int num_threads, int keep_input, int force, int to_stdout, int checksum, int level, size_t block_size, - int json_output, int seekable) { + int json_output, int seekable, const void* dict, size_t dict_size) { int overall_ret = 0; #ifdef _WIN32 char search_path[MAX_PATH]; @@ -386,7 +389,7 @@ static int process_directory(const char* dir_path, zxc_mode_t mode, int num_thre if (find_data.dwFileAttributes & FILE_ATTRIBUTE_DIRECTORY) { overall_ret |= process_directory(full_path, mode, num_threads, keep_input, force, to_stdout, checksum, level, block_size, json_output, - seekable); + seekable, dict, dict_size); } else { // Check if it ends with .zxc to skip if compressing to avoid double compression if (mode == MODE_COMPRESS) { @@ -398,7 +401,7 @@ static int process_directory(const char* dir_path, zxc_mode_t mode, int num_thre overall_ret |= process_single_file(full_path, NULL, mode, num_threads, keep_input, force, to_stdout, checksum, level, block_size, json_output, - seekable); + seekable, dict, dict_size); } } while (FindNextFileA(hFind, &find_data) != 
0); @@ -435,7 +438,8 @@ static int process_directory(const char* dir_path, zxc_mode_t mode, int num_thre if (S_ISDIR(st.st_mode)) { overall_ret |= process_directory(full_path, mode, num_threads, keep_input, force, to_stdout, - checksum, level, block_size, json_output, seekable); + checksum, level, block_size, json_output, seekable, + dict, dict_size); } else if (S_ISREG(st.st_mode)) { // Check if it ends with .zxc to skip if compressing to avoid double compression if (mode == MODE_COMPRESS) { @@ -448,7 +452,7 @@ static int process_directory(const char* dir_path, zxc_mode_t mode, int num_thre overall_ret |= process_single_file(full_path, NULL, mode, num_threads, keep_input, force, to_stdout, checksum, level, block_size, json_output, - seekable); + seekable, dict, dict_size); } } free(full_path); @@ -464,9 +468,10 @@ void print_help(const char* app) { "Standard Modes:\n" " -z, --compress Compress FILE {default}\n" " -d, --decompress Decompress FILE (or stdin -> stdout)\n" - " -l, --list List archive information\n" + " -l, --list List archive info\n" " -t, --test Test compressed FILE integrity\n" - " -b, --bench [N] Benchmark in-memory (N=seconds, default 5)\n\n" + " -b, --bench [N] Benchmark in-memory (N=seconds, default 5)\n" + "\n" "Batch Processing:\n" " -m, --multiple Multiple input files\n" " -r, --recursive Operate recursively on directories\n\n" @@ -479,6 +484,7 @@ void print_help(const char* app) { " -T, --threads N Number of threads (0=auto)\n" " -C, --checksum Enable checksum {default}\n" " -N, --no-checksum Disable checksum\n" + " --auto-dict Train a dictionary from the input and embed it (compression)\n" " -S, --seekable Append seek table for random-access decompression\n" " -k, --keep Keep input file\n" " -f, --force Force overwrite\n" @@ -668,6 +674,9 @@ static int zxc_list_archive(const char* path, int json_output) { ((uint32_t)footer[10] << 16) | ((uint32_t)footer[11] << 24); const char* checksum_method = (stored_checksum != 0) ? 
"RapidHash" : "-"; + // Dictionary ID (from header flag bit 6 + bytes 7-10) + const uint32_t dict_id = zxc_get_dict_id(header, ZXC_FILE_HEADER_SIZE); + // Calculate ratio (uncompressed / compressed, e.g., 2.5 means 2.5x compression) const double ratio = (file_size > 0) ? ((double)uncompressed_size / (double)file_size) : 0.0; @@ -676,8 +685,13 @@ static int zxc_list_archive(const char* path, int json_output) { format_size_decimal((uint64_t)file_size, comp_str, sizeof(comp_str)); format_size_decimal((uint64_t)uncompressed_size, uncomp_str, sizeof(uncomp_str)); + char dict_id_str[16]; + if (dict_id) + snprintf(dict_id_str, sizeof(dict_id_str), "0x%08X", dict_id); + else + snprintf(dict_id_str, sizeof(dict_id_str), "-"); + if (json_output) { - // JSON mode printf( "{\n" " \"filename\": \"%s\",\n" @@ -687,10 +701,12 @@ static int zxc_list_archive(const char* path, int json_output) { " \"format_version\": %u,\n" " \"block_size_kb\": %zu,\n" " \"checksum_method\": \"%s\",\n" - " \"checksum_value\": \"0x%08X\"\n" + " \"checksum_value\": \"0x%08X\",\n" + " \"dict_id\": %s%s%s\n" "}\n", path, (long long)file_size, (long long)uncompressed_size, ratio, format_version, - block_units * 4, (stored_checksum != 0) ? "RapidHash" : "none", stored_checksum); + block_units * 4, (stored_checksum != 0) ? "RapidHash" : "none", stored_checksum, + dict_id ? "\"" : "", dict_id ? dict_id_str : "null", dict_id ? "\"" : ""); } else if (g_verbose) { // Verbose mode: detailed vertical layout printf( @@ -702,6 +718,7 @@ static int zxc_list_archive(const char* path, int json_output) { path, format_version, block_units, (stored_checksum != 0) ? 
"RapidHash" : "None"); if (stored_checksum != 0) printf("Checksum Value: 0x%08X\n", stored_checksum); + if (dict_id) printf("Dictionary ID: %s\n", dict_id_str); printf( "-----------------------\n" @@ -711,19 +728,68 @@ static int zxc_list_archive(const char* path, int json_output) { comp_str, uncomp_str, ratio); } else { // Normal mode: table format - printf("\n %12s %12s %5s %-10s %s\n", "Compressed", "Uncompressed", "Ratio", - "Checksum", "Filename"); - printf(" %12s %12s %5.2f %-10s %s\n", comp_str, uncomp_str, ratio, checksum_method, - path); + printf("\n %12s %12s %5s %-10s %-10s %s\n", "Compressed", "Uncompressed", + "Ratio", "Checksum", "Dict ID", "Filename"); + printf(" %12s %12s %5.2f %-10s %-10s %s\n", comp_str, uncomp_str, ratio, + checksum_method, dict_id_str, path); } return 0; } +// --auto-dict: train a dictionary from the whole input file and return it as a +// malloc'd buffer (caller frees), sized to the block size. The trained dict is +// meant to be embedded in the archive. Returns NULL on failure. +static void* cli_auto_train_dict(const char* path, size_t block_size, size_t* out_size) { + *out_size = 0; + FILE* f = fopen(path, "rb"); + if (!f) return NULL; + /* Size the file; a failed seek/tell or a size that does not fit size_t aborts training. */ + long long fsz = -1; + if (fseeko(f, 0, SEEK_END) == 0) fsz = ftello(f); + if (fsz <= 0 || (unsigned long long)fsz > SIZE_MAX || fseeko(f, 0, SEEK_SET) != 0) { + fclose(f); + return NULL; + } + /* Train on the whole file as a single sample (as the former external + * --train-dict workflow did). The trainer samples k-gram frequencies + * representatively across the corpus internally (see + * ZXC_DICT_FREQ_SAMPLE_TARGET), so the full input can be fed without analysing every byte.
*/ + const size_t corpus_sz = (size_t)fsz; + uint8_t* corpus = (uint8_t*)malloc(corpus_sz); + if (!corpus) { + fclose(f); + return NULL; + } + if (fread(corpus, 1, corpus_sz, f) != corpus_sz) { + free(corpus); + fclose(f); + return NULL; + } + fclose(f); + size_t dict_cap = ZXC_DICT_SIZE_MAX; + if (block_size > 0 && block_size < dict_cap) dict_cap = block_size; + uint8_t* dict = (uint8_t*)malloc(dict_cap); + if (!dict) { + free(corpus); + return NULL; + } + const void* samples[1] = {corpus}; + const size_t sizes[1] = {corpus_sz}; + const int64_t dsz = zxc_train_dict(samples, sizes, 1, dict, dict_cap); + free(corpus); + if (dsz <= 0) { + free(dict); + return NULL; + } + *out_size = (size_t)dsz; + return dict; +} + static int process_single_file(const char* in_path, const char* out_path_override, zxc_mode_t mode, int num_threads, int keep_input, int force, int to_stdout, int checksum_enabled, int level, size_t block_size, - int json_output, int seekable) { + int json_output, int seekable, const void* dict, size_t dict_size) { FILE* f_in = stdin; FILE* f_out = stdout; char resolved_in_path[4096] = {0}; @@ -903,6 +969,22 @@ static int process_single_file(const char* in_path, const char* out_path_overrid .operation = (mode == MODE_COMPRESS) ? "Compressing" : "Decompressing", .total_size = total_size}; + /* --auto-dict: train a dictionary from the input and embed it (compress + * only, and only for a real file: a pipe cannot be re-read to train). 
*/ + const void* eff_dict = dict; + size_t eff_dict_size = dict_size; + void* auto_dict_buf = NULL; + if (g_auto_dict && mode == MODE_COMPRESS && !use_stdin) { + auto_dict_buf = cli_auto_train_dict(resolved_in_path, block_size, &eff_dict_size); + if (auto_dict_buf) { + eff_dict = auto_dict_buf; + zxc_log_v("Auto-trained dictionary: %zu bytes (embedded)\n", eff_dict_size); + } else { + eff_dict_size = dict_size; + zxc_log("Warning: --auto-dict training failed; compressing without a dictionary\n"); + } + } + const double t0 = zxc_now(); int64_t bytes; if (mode == MODE_COMPRESS) { @@ -912,6 +994,8 @@ static int process_single_file(const char* in_path, const char* out_path_overrid .block_size = block_size, .checksum_enabled = checksum_enabled, .seekable = seekable, + .dict = eff_dict, + .dict_size = eff_dict_size, .progress_cb = show_progress ? cli_progress_callback : NULL, .user_data = &pctx, }; @@ -920,6 +1004,8 @@ static int process_single_file(const char* in_path, const char* out_path_overrid zxc_decompress_opts_t dopts = { .n_threads = num_threads, .checksum_enabled = checksum_enabled, + .dict = dict, + .dict_size = dict_size, .progress_cb = show_progress ? cli_progress_callback : NULL, .user_data = &pctx, }; @@ -947,6 +1033,7 @@ static int process_single_file(const char* in_path, const char* out_path_overrid free(b1); free(b2); + free(auto_dict_buf); if (bytes >= 0) { if (mode == MODE_INTEGRITY) { @@ -993,7 +1080,7 @@ static int process_single_file(const char* in_path, const char* out_path_overrid " Reason: Integrity check failed (corrupted data or invalid checksum)\n"); } } else { - zxc_log("Operation failed on %s.\n", in_path ? in_path : ""); + zxc_log("Error: %s: %s\n", in_path ? 
in_path : "", zxc_error_name((int)bytes)); if (created_out_file) unlink(resolved_out_path); } overall_ret = 1; @@ -1020,7 +1107,8 @@ int main(int argc, char** argv) { size_t block_size = 0; int seekable = 0; - static const struct option long_options[] = {{"compress", no_argument, 0, 'z'}, + static const struct option long_options[] = { + {"compress", no_argument, 0, 'z'}, {"decompress", no_argument, 0, 'd'}, {"list", no_argument, 0, 'l'}, {"test", no_argument, 0, 't'}, @@ -1040,6 +1128,7 @@ int main(int argc, char** argv) { {"recursive", no_argument, 0, 'r'}, {"block-size", required_argument, 0, 'B'}, {"seekable", no_argument, 0, 'S'}, + {"auto-dict", no_argument, 0, OPT_AUTO_DICT}, {0, 0, 0, 0}}; int opt; @@ -1129,6 +1218,9 @@ int main(int argc, char** argv) { case 'S': seekable = 1; break; + case OPT_AUTO_DICT: + g_auto_dict = 1; + break; case 'r': recursive_mode = 1; multiple_mode = 1; // Recursive implies multiple mode for files processing @@ -1201,12 +1293,18 @@ int main(int argc, char** argv) { checksum = (mode == MODE_BENCHMARK) ? 0 : 1; } + /* Dictionaries are produced internally (--auto-dict) and embedded in the + * archive; the CLI never takes a dictionary as input. */ + void* dict = NULL; + size_t dict_size = 0; + /* * Benchmark Mode * Loads the entire input file into RAM to measure raw algorithm throughput * without disk I/O bottlenecks. */ if (mode == MODE_BENCHMARK) { + free(dict); if (optind >= argc) { zxc_log("Benchmark requires input file.\n"); return 1; @@ -1407,6 +1505,7 @@ int main(int argc, char** argv) { * Displays archive information (compressed size, uncompressed size, ratio). 
*/ if (mode == MODE_LIST) { + free(dict); if (optind >= argc) { zxc_log("List mode requires input file.\n"); return 1; @@ -1432,6 +1531,7 @@ int main(int argc, char** argv) { if (multiple_mode && to_stdout) { zxc_log("Error: cannot write to stdout when using multiple files mode (-m).\n"); + free(dict); return 1; } @@ -1445,11 +1545,13 @@ int main(int argc, char** argv) { // If no files passed but we aren't using stdin, or mode expects files: if (optind >= argc && mode == MODE_INTEGRITY) { zxc_log("Test mode requires at least one input file.\n"); + free(dict); return 1; } if (multiple_mode && optind >= argc) { zxc_log("Multiple files mode requires at least one input file.\n"); + free(dict); return 1; } @@ -1464,7 +1566,7 @@ int main(int argc, char** argv) { zxc_is_directory(current_arg)) { overall_ret |= process_directory(current_arg, mode, num_threads, keep_input, force, to_stdout, checksum, level, block_size, json_output, - seekable); + seekable, dict, dict_size); } else { const char* explicit_out_path = (!multiple_mode && optind + 1 < argc && current_arg && strcmp(current_arg, "-") != 0 && !to_stdout) @@ -1474,12 +1576,13 @@ int main(int argc, char** argv) { overall_ret |= process_single_file(current_arg, explicit_out_path, mode, num_threads, keep_input, force, to_stdout, checksum, level, block_size, json_output, - seekable); + seekable, dict, dict_size); } if (!multiple_mode) { break; // Standard mode only does the first argument as input } } + free(dict); return overall_ret; } diff --git a/src/lib/zxc_common.c b/src/lib/zxc_common.c index d7a72e1a..496cf2a1 100644 --- a/src/lib/zxc_common.c +++ b/src/lib/zxc_common.c @@ -289,7 +289,7 @@ void zxc_cctx_free(zxc_cctx_t* ctx) { * or a negative @ref zxc_error_t code. 
*/ int zxc_write_file_header(uint8_t* RESTRICT dst, const size_t dst_capacity, const size_t chunk_size, - const int has_checksum) { + const int has_checksum, const uint32_t dict_id) { if (UNLIKELY(dst_capacity < ZXC_FILE_HEADER_SIZE)) return ZXC_ERROR_DST_TOO_SMALL; zxc_store_le32(dst, ZXC_MAGIC_WORD); @@ -299,10 +299,13 @@ int zxc_write_file_header(uint8_t* RESTRICT dst, const size_t dst_capacity, cons dst[5] = (uint8_t)zxc_log2_u32((uint32_t)chunk_size); // Flags are at offset 6 - dst[6] = has_checksum ? (ZXC_FILE_FLAG_HAS_CHECKSUM | ZXC_CHECKSUM_RAPIDHASH) : 0; + uint8_t flags = has_checksum ? (ZXC_FILE_FLAG_HAS_CHECKSUM | ZXC_CHECKSUM_RAPIDHASH) : 0; + if (dict_id != 0) flags |= ZXC_FILE_FLAG_HAS_DICTIONARY; + dst[6] = flags; - // Bytes 7-13: Reserved (must be 0, 7 bytes) + // Bytes 7-13: Reserved / dict_id ZXC_MEMSET(dst + 7, 0, 7); + if (dict_id != 0) zxc_store_le32(dst + 7, dict_id); // Bytes 14-15: CRC (16-bit) zxc_store_le16(dst + 14, 0); // Zero out before hashing @@ -325,7 +328,8 @@ int zxc_write_file_header(uint8_t* RESTRICT dst, const size_t dst_capacity, cons * @return @ref ZXC_OK on success, or a negative @ref zxc_error_t code. */ int zxc_read_file_header(const uint8_t* RESTRICT src, const size_t src_size, - size_t* RESTRICT out_block_size, int* RESTRICT out_has_checksum) { + size_t* RESTRICT out_block_size, int* RESTRICT out_has_checksum, + uint32_t* RESTRICT out_dict_id) { if (UNLIKELY(src_size < ZXC_FILE_HEADER_SIZE)) return ZXC_ERROR_SRC_TOO_SMALL; if (UNLIKELY(zxc_le32(src) != ZXC_MAGIC_WORD)) return ZXC_ERROR_BAD_MAGIC; if (UNLIKELY(src[4] != ZXC_FILE_FORMAT_VERSION)) return ZXC_ERROR_BAD_VERSION; @@ -353,6 +357,7 @@ int zxc_read_file_header(const uint8_t* RESTRICT src, const size_t src_size, } // Flags are at offset 6 if (out_has_checksum) *out_has_checksum = (src[6] & ZXC_FILE_FLAG_HAS_CHECKSUM) ? 1 : 0; + if (out_dict_id) *out_dict_id = (src[6] & ZXC_FILE_FLAG_HAS_DICTIONARY) ? 
zxc_le32(src + 7) : 0; return ZXC_OK; } @@ -801,6 +806,12 @@ const char* zxc_error_name(const int code) { return "ZXC_ERROR_BAD_BLOCK_TYPE"; case ZXC_ERROR_BAD_BLOCK_SIZE: return "ZXC_ERROR_BAD_BLOCK_SIZE"; + case ZXC_ERROR_DICT_REQUIRED: + return "ZXC_ERROR_DICT_REQUIRED"; + case ZXC_ERROR_DICT_MISMATCH: + return "ZXC_ERROR_DICT_MISMATCH"; + case ZXC_ERROR_DICT_TOO_LARGE: + return "ZXC_ERROR_DICT_TOO_LARGE"; default: return "ZXC_UNKNOWN_ERROR"; } diff --git a/src/lib/zxc_compress.c b/src/lib/zxc_compress.c index 694d07d6..27336c3d 100644 --- a/src/lib/zxc_compress.c +++ b/src/lib/zxc_compress.c @@ -626,6 +626,7 @@ static ZXC_ALWAYS_INLINE zxc_match_t zxc_lz77_find_best_match( static int zxc_encode_block_num(const zxc_cctx_t* RESTRICT ctx, const uint8_t* RESTRICT src, const size_t src_sz, uint8_t* RESTRICT dst, size_t dst_cap, size_t* RESTRICT out_sz) { + (void)ctx; if (UNLIKELY(src_sz % sizeof(uint32_t) != 0 || src_sz == 0 || dst_cap < ZXC_BLOCK_HEADER_SIZE + ZXC_NUM_HEADER_BINARY_SIZE)) return ZXC_ERROR_DST_TOO_SMALL; @@ -1071,19 +1072,26 @@ static int zxc_lz77_optimal_parse_glo(zxc_cctx_t* RESTRICT ctx, const uint8_t* R zxc_lz77_params_t lzp_opt = zxc_get_lz77_params(level); lzp_opt.use_lazy = 0; // guard + /* When a dictionary is active, src = [dict | block_data]. DP arrays are + * indexed relative to the block start (position dict_sz in src). The + * variable src_base points to the first block byte for literal copies, + * while src remains the base for the match finder (absolute positions). */ + const size_t dict_sz = ctx->dict_size; + const size_t block_sz = src_sz - dict_sz; + const uint8_t* const src_base = src + dict_sz; const uint8_t* const iend = src + src_sz; /* Block too small for any match: emit all as literals. 
*/ - if (UNLIKELY(src_sz < 13)) { - if (src_sz > 0) ZXC_MEMCPY(literals, src, src_sz); - *lit_c_out = src_sz; + if (UNLIKELY(block_sz < 13)) { + if (block_sz > 0) ZXC_MEMCPY(literals, src_base, block_sz); + *lit_c_out = block_sz; *seq_c_out = 0; *extras_sz_out = 0; *max_offset_out = 0; return 0; } - const size_t mflimit_pos = src_sz - 12; + const size_t mflimit_pos = block_sz - 12; - const uint8_t* const mflimit = src + mflimit_pos; + const uint8_t* const mflimit = src_base + mflimit_pos; /* == iend - 12 (skips dict prefix) */ /* DP arrays carved from ctx->opt_scratch: a single allocation lazy- @@ -1125,8 +1133,8 @@ static int zxc_lz77_optimal_parse_glo(zxc_cctx_t* RESTRICT ctx, const uint8_t* R * zxc_estimate_cctx_size(). */ (void)needed; - /* Per-block literal cost: */ - const uint32_t lit_cost = zxc_opt_estimate_lit_bits(src, src_sz, ctx->opt_scratch); + /* Per-block literal cost (sample only block data, not dict prefix): */ + const uint32_t lit_cost = zxc_opt_estimate_lit_bits(src_base, block_sz, ctx->opt_scratch); uint32_t* const dp = (uint32_t*)ctx->opt_scratch; uint16_t* const parent_len = (uint16_t*)(ctx->opt_scratch + sz_dp); @@ -1134,7 +1142,7 @@ static int zxc_lz77_optimal_parse_glo(zxc_cctx_t* RESTRICT ctx, const uint8_t* R uint64_t* const match_end_bits = (uint64_t*)(ctx->opt_scratch + sz_dp + sz_pl + sz_po); dp[0] = 0; - ZXC_MEMSET(dp + 1, 0xFF, src_sz * sizeof(uint32_t)); + ZXC_MEMSET(dp + 1, 0xFF, block_sz * sizeof(uint32_t)); ZXC_MEMSET(parent_len, 0, sz_pl + sz_po + sz_bm); /* Forward DP: visit every position, update reachable successors. @@ -1159,8 +1167,10 @@ static int zxc_lz77_optimal_parse_glo(zxc_cctx_t* RESTRICT ctx, const uint8_t* R /* Match transition: call find_best_match (no lazy, no backtrack via * anchor=ip). Iterate sub-lengths since any L <= max_L matches at the - * same offset and may end at a more useful DP position. */ - const uint8_t* ip = src + p; + * same offset and may end at a more useful DP position. + * ip uses absolute position (src + dict_sz + p) so match finder + * resolves dict references correctly via src as base.
*/ + const uint8_t* ip = src_base + p; const zxc_match_t m = zxc_lz77_find_best_match( src, ip, iend, mflimit, /*anchor=*/ip, hash_table, hash_tags, chain_table, epoch_mark, offset_mask, level, lzp_opt, last_off); @@ -1169,7 +1179,7 @@ static int zxc_lz77_optimal_parse_glo(zxc_cctx_t* RESTRICT ctx, const uint8_t* R const uint32_t off = (uint32_t)(ip - m.ref); if (off > 0 && off <= ZXC_LZ_WINDOW_SIZE) { last_off = off; - const size_t L_max_raw = (m.len > src_sz - p) ? (src_sz - p) : (size_t)m.len; + const size_t L_max_raw = (m.len > block_sz - p) ? (block_sz - p) : (size_t)m.len; const size_t L_max = (L_max_raw > UINT16_MAX) ? UINT16_MAX : L_max_raw; /* The L-iteration cost function is piecewise constant in @@ -1227,7 +1237,7 @@ static int zxc_lz77_optimal_parse_glo(zxc_cctx_t* RESTRICT ctx, const uint8_t* R } /* Last 12 bytes can only be literals (matches must end before iend). */ - for (size_t p = mflimit_pos; p < src_sz; p++) { + for (size_t p = mflimit_pos; p < block_sz; p++) { if (UNLIKELY(dp[p] == UINT32_MAX)) continue; const uint32_t lit_next = dp[p] + lit_cost; if (lit_next < dp[p + 1]) { @@ -1241,7 +1251,7 @@ static int zxc_lz77_optimal_parse_glo(zxc_cctx_t* RESTRICT ctx, const uint8_t* R * runs of unmarked positions and are reconstructed during forward emission * via lit_start tracking, so they need no backtrack storage. */ { - size_t pos = src_sz; + size_t pos = block_sz; while (pos > 0) { const uint32_t L = parent_len[pos]; if (L == 0) { @@ -1273,7 +1283,7 @@ static int zxc_lz77_optimal_parse_glo(zxc_cctx_t* RESTRICT ctx, const uint8_t* R const size_t LL = match_start - lit_start; if (LL > 0) { - ZXC_MEMCPY(literals + lit_c, src + lit_start, LL); + ZXC_MEMCPY(literals + lit_c, src_base + lit_start, LL); lit_c += LL; } const uint32_t ll = (uint32_t)LL; @@ -1301,9 +1311,9 @@ static int zxc_lz77_optimal_parse_glo(zxc_cctx_t* RESTRICT ctx, const uint8_t* R } /* Tail literals after the last match (or all literals if no match). 
*/ - if (lit_start < src_sz) { - const size_t tail = src_sz - lit_start; - ZXC_MEMCPY(literals + lit_c, src + lit_start, tail); + if (lit_start < block_sz) { + const size_t tail = block_sz - lit_start; + ZXC_MEMCPY(literals + lit_c, src_base + lit_start, tail); lit_c += tail; } @@ -1314,6 +1324,64 @@ static int zxc_lz77_optimal_parse_glo(zxc_cctx_t* RESTRICT ctx, const uint8_t* R return 0; } +/** + * @brief Seeds the hash/chain tables from dictionary content prepended to @p src. + * + * When a dictionary is active, @p src is laid out as [dict_content | block_data]. + * This function inserts hash entries for dictionary positions [0, dict_size) so + * the match finder can reference them during block encoding. + * + * @param[in] src Source buffer starting with dictionary content. + * @param[in] dict_size Size of the dictionary prefix in bytes. + * @param[in,out] hash_table Hash table to seed with dictionary positions. + * @param[in,out] hash_tags Tag table for fast match rejection. + * @param[in,out] chain_table Chain table for collision resolution. + * @param[in] epoch_mark Current epoch marker for hash table entries. + * @param[in] offset_mask Position mask for epoch/offset encoding. + * @param[in] level Compression level (controls hash function variant). + */ +static void zxc_lz_seed_dict(const uint8_t* RESTRICT src, const size_t dict_size, + uint32_t* RESTRICT hash_table, uint8_t* RESTRICT hash_tags, + uint16_t* RESTRICT chain_table, const uint32_t epoch_mark, + const uint32_t offset_mask, const int level) { + if (UNLIKELY(dict_size < ZXC_LZ_MIN_MATCH_LEN)) return; + + const int use_hash5 = (level >= 3); + const size_t limit = dict_size - (ZXC_LZ_MIN_MATCH_LEN - 1); + + /* Sparse seeding for the first half, dense for the second half. + * Positions near the end of the dict produce shorter offsets and are + * more likely to yield matches, so they deserve full coverage. 
*/ + const size_t half = limit / 2; + for (size_t i = 0; i < half; i += 4) { + const uint64_t val8 = zxc_le64(src + i); + const uint32_t h = zxc_hash_func(val8, use_hash5); + const uint32_t cur_pos = (uint32_t)i; + const uint8_t tag = (uint8_t)((uint32_t)val8 ^ ((uint32_t)val8 >> 16)); + + hash_table[h] = epoch_mark | cur_pos; + hash_tags[h] = tag; + chain_table[cur_pos & ZXC_LZ_WINDOW_MASK] = 0; + } + for (size_t i = half; i < limit; i++) { + const uint64_t val8 = zxc_le64(src + i); + const uint32_t h = zxc_hash_func(val8, use_hash5); + const uint32_t cur_pos = (uint32_t)i; + const uint8_t tag = (uint8_t)((uint32_t)val8 ^ ((uint32_t)val8 >> 16)); + + const uint32_t raw_head = hash_table[h]; + const uint32_t prev_idx = + ((raw_head & ~offset_mask) == epoch_mark) ? (raw_head & offset_mask) : 0; + + hash_table[h] = epoch_mark | cur_pos; + hash_tags[h] = tag; + + const uint32_t dist = cur_pos - prev_idx; + const uint32_t valid = -((int32_t)((prev_idx != 0) & (dist < ZXC_LZ_WINDOW_SIZE))); + chain_table[cur_pos & ZXC_LZ_WINDOW_MASK] = (uint16_t)(dist & valid); + } +} + /** * @brief Encodes a data block using the General (GLO) compression format. 
* @@ -1365,6 +1433,7 @@ static int zxc_encode_block_glo(zxc_cctx_t* RESTRICT ctx, const uint8_t* RESTRIC const size_t src_sz, uint8_t* RESTRICT dst, size_t dst_cap, size_t* RESTRICT out_sz) { const int level = ctx->compression_level; + const size_t dict_sz = ctx->dict_size; const zxc_lz77_params_t lzp = zxc_get_lz77_params(level); @@ -1377,7 +1446,12 @@ static int zxc_encode_block_glo(zxc_cctx_t* RESTRICT ctx, const uint8_t* RESTRIC const uint32_t offset_bits = ctx->offset_bits; const uint32_t offset_mask = ctx->offset_mask; const uint32_t epoch_mark = ctx->epoch << offset_bits; - const uint8_t *ip = src, *iend = src + src_sz, *anchor = ip, *mflimit = iend - 12; + + if (dict_sz > 0) + zxc_lz_seed_dict(src, dict_sz, ctx->hash_table, ctx->hash_tags, ctx->chain_table, + epoch_mark, offset_mask, level); + + const uint8_t *ip = src + dict_sz, *iend = src + src_sz, *anchor = ip, *mflimit = iend - 12; uint32_t* const hash_table = ctx->hash_table; uint8_t* const hash_tags = ctx->hash_tags; @@ -1980,6 +2054,7 @@ static int zxc_encode_block_ghi(zxc_cctx_t* RESTRICT ctx, const uint8_t* RESTRIC const size_t src_sz, uint8_t* RESTRICT dst, const size_t dst_cap, size_t* RESTRICT const out_sz) { const int level = ctx->compression_level; + const size_t dict_sz = ctx->dict_size; const zxc_lz77_params_t lzp = zxc_get_lz77_params(level); @@ -1992,7 +2067,12 @@ static int zxc_encode_block_ghi(zxc_cctx_t* RESTRICT ctx, const uint8_t* RESTRIC const uint32_t offset_bits = ctx->offset_bits; const uint32_t offset_mask = ctx->offset_mask; const uint32_t epoch_mark = ctx->epoch << offset_bits; - const uint8_t *ip = src, *iend = src + src_sz, *anchor = ip, *mflimit = iend - 12; + + if (dict_sz > 0) + zxc_lz_seed_dict(src, dict_sz, ctx->hash_table, ctx->hash_tags, ctx->chain_table, + epoch_mark, offset_mask, level); + + const uint8_t *ip = src + dict_sz, *iend = src + src_sz, *anchor = ip, *mflimit = iend - 12; uint32_t* const hash_table = ctx->hash_table; uint8_t* const hash_tags = 
ctx->hash_tags; @@ -2251,14 +2331,16 @@ static int zxc_probe_is_numeric(const uint8_t* src, const size_t size) { // cppcheck-suppress unusedFunction int zxc_compress_chunk_wrapper(zxc_cctx_t* RESTRICT ctx, const uint8_t* RESTRICT chunk, const size_t src_sz, uint8_t* RESTRICT dst, const size_t dst_cap) { + const size_t dict_sz = ctx->dict_size; + const size_t block_sz = src_sz - dict_sz; + const uint8_t* block_data = chunk + dict_sz; size_t w = 0; int res = ZXC_OK; - int try_num = zxc_probe_is_numeric(chunk, src_sz); + int try_num = zxc_probe_is_numeric(block_data, block_sz); if (UNLIKELY(try_num)) { - res = zxc_encode_block_num(ctx, chunk, src_sz, dst, dst_cap, &w); - if (res != ZXC_OK || w > (src_sz - (src_sz >> 2))) // w > 75% of src_sz - try_num = 0; // NUM didn't compress well, try GLO/GHI instead + res = zxc_encode_block_num(ctx, block_data, block_sz, dst, dst_cap, &w); + if (res != ZXC_OK || w > (block_sz - (block_sz >> 2))) try_num = 0; } if (LIKELY(!try_num)) { @@ -2268,9 +2350,9 @@ int zxc_compress_chunk_wrapper(zxc_cctx_t* RESTRICT ctx, const uint8_t* RESTRICT res = zxc_encode_block_glo(ctx, chunk, src_sz, dst, dst_cap, &w); } - // Check expansion. W contains Header + Payload. - if (UNLIKELY(res != ZXC_OK || w >= src_sz)) { - res = zxc_encode_block_raw(chunk, src_sz, dst, dst_cap, &w); + // Check expansion against block data size (excluding dict prefix). 
+ if (UNLIKELY(res != ZXC_OK || w >= block_sz)) { + res = zxc_encode_block_raw(block_data, block_sz, dst, dst_cap, &w); if (UNLIKELY(res != ZXC_OK)) return res; } diff --git a/src/lib/zxc_decompress.c b/src/lib/zxc_decompress.c index 0dfb3f49..6a95e843 100644 --- a/src/lib/zxc_decompress.c +++ b/src/lib/zxc_decompress.c @@ -792,7 +792,9 @@ static ZXC_ALWAYS_INLINE int zxc_decode_block_glo_impl(zxc_cctx_t* RESTRICT ctx, // For 1-byte offsets (enc_off==1): validate until 256 bytes written (max 8-bit offset) // For 2-byte offsets (enc_off==0): validate until 65536 bytes written (max 16-bit offset) // After threshold, all offsets are guaranteed valid (can't exceed written bytes) - size_t written = 0; + // When a dictionary is active, dict_size bytes are logically "already written" + // (prepended by the caller), so the SAFE loop may be skipped entirely. + size_t written = ctx->dict_size; // --- SAFE Loop: offset validation until threshold (4x unroll) --- // For 1-byte offsets: bounds check until 256 bytes written @@ -1363,7 +1365,7 @@ static ZXC_ALWAYS_INLINE int zxc_decode_block_glo_impl(zxc_cctx_t* RESTRICT ctx, d_ptr += ll; const uint8_t* match_src = d_ptr - offset; - if (UNLIKELY(match_src < dst)) return ZXC_ERROR_BAD_OFFSET; + if (UNLIKELY(match_src < dst - ctx->dict_size)) return ZXC_ERROR_BAD_OFFSET; if (offset < ml) { for (size_t i = 0; i < ml; i++) d_ptr[i] = match_src[i]; @@ -1463,7 +1465,9 @@ static ZXC_ALWAYS_INLINE int zxc_decode_block_ghi_impl(zxc_cctx_t* RESTRICT ctx, // For 1-byte offsets (enc_off==1): validate until 256 bytes written (max 8-bit offset) // For 2-byte offsets (enc_off==0): validate until 65536 bytes written (max 16-bit offset) // After threshold, all offsets are guaranteed valid (can't exceed written bytes) - size_t written = 0; + // When a dictionary is active, dict_size bytes are logically "already written" + // (prepended by the caller), so the SAFE loop may be skipped entirely. 
+ size_t written = ctx->dict_size; // --- SAFE Loop: offset validation until threshold (4x unroll) --- // Since offset is 16-bit, threshold is 65536. @@ -2000,7 +2004,7 @@ static ZXC_ALWAYS_INLINE int zxc_decode_block_ghi_impl(zxc_cctx_t* RESTRICT ctx, d_ptr += ll; const uint8_t* match_src = d_ptr - offset; - if (UNLIKELY(match_src < dst)) return ZXC_ERROR_BAD_OFFSET; + if (UNLIKELY(match_src < dst - ctx->dict_size)) return ZXC_ERROR_BAD_OFFSET; if (offset < ml) { for (size_t i = 0; i < ml; i++) d_ptr[i] = match_src[i]; diff --git a/src/lib/zxc_dict.c b/src/lib/zxc_dict.c new file mode 100644 index 00000000..71a47309 --- /dev/null +++ b/src/lib/zxc_dict.c @@ -0,0 +1,303 @@ +/* + * ZXC - High-performance lossless compression + * + * Copyright (c) 2025-2026 Bertrand Lebonnois and contributors. + * SPDX-License-Identifier: BSD-3-Clause + */ + +/** + * @file zxc_dict.c + * @brief Pre-trained dictionary: content-ID computation and training. Dictionaries + * are embedded in the archive (see ZXC_BLOCK_DICT); there is no standalone + * dictionary file format. + */ + +#include "../../include/zxc_dict.h" + +#include "zxc_internal.h" + +/* ------------------------------------------------------------------------- + * Dictionary ID + * ------------------------------------------------------------------------- */ + +/** + * @brief Derive the 32-bit content ID of a dictionary. + * + * 0 is reserved to mean "no dictionary" (the file header only sets its + * dictionary flag for a non-zero ID), so a non-empty dictionary whose + * checksum happens to be 0 is remapped to 1. + * + * @param[in] dict Dictionary content (may be NULL). + * @param[in] dict_size Size of @p dict in bytes. + * @return 0 if @p dict is NULL or empty, otherwise a non-zero ID. + */ +uint32_t zxc_dict_id(const void* dict, const size_t dict_size) { + if (UNLIKELY(!dict || dict_size == 0)) return 0; + const uint32_t id = zxc_checksum(dict, dict_size, 0); + return id != 0 ? id : 1; +} + +/* ------------------------------------------------------------------------- + * Dictionary training: k-gram frequency selection + * + * Algorithm: + * 1. Concatenate all samples into a corpus. + * 2. For each position in the corpus, hash the k-gram (k = MIN_MATCH_LEN) + * and count occurrences in a fixed-size hash map. + * 3. Walk the corpus, building candidate segments: each starts at a frequent + * k-gram and extends while neighbours stay frequent.
A segment's score is + * the summed frequency of its k-grams (its coverage of the corpus). + * 4. Greedily fill the dictionary in descending coverage order, BUT account + * for overlap: once a pattern is placed, a single copy serves all future + * LZ matches, so its k-grams are zeroed in the frequency table. Segments + * whose coverage has since collapsed (mostly already in the dict) are + * skipped, so capacity goes to NEW patterns instead of redundant copies. + * ------------------------------------------------------------------------- */ + +static uint32_t zxc_dict_hash(const uint8_t* p) { + uint32_t v = zxc_le32(p); + v ^= (uint32_t)p[4]; + return (v * ZXC_LZ_HASH_PRIME1) >> (32 - ZXC_DICT_HT_BITS); +} + +/** + * @brief Segment descriptor for dictionary training, scored by coverage. + */ +typedef struct { + uint32_t offset; + uint16_t length; + uint32_t score; /**< Summed k-gram frequency (coverage) of the segment. */ +} zxc_dict_seg_t; + +/** + * @brief Restore the min-heap property at @p root over the range @p a[0..n). + * + * Sinks @p a[root] down the binary heap (children at @c 2i+1 / @c 2i+2) until + * both children are @c >= it, comparing on @ref zxc_dict_seg_t::score. The loop + * is iterative (no recursion), so the call stack stays O(1) regardless of @p n. + * + * @param[in,out] a Heap-ordered array; @p a[0..n) is treated as the heap. + * @param[in] root Index of the element to sift down. Must be @c < n. + * @param[in] n Number of valid elements in the heap. + * + * @note Complexity O(log n). + */ +static void zxc_dict_sift_down(zxc_dict_seg_t* RESTRICT a, size_t root, const size_t n) { + for (;;) { + size_t child = 2 * root + 1; + if (child >= n) break; + if (child + 1 < n && a[child + 1].score < a[child].score) child++; + if (a[root].score <= a[child].score) break; + const zxc_dict_seg_t t = a[root]; + a[root] = a[child]; + a[child] = t; + root = child; + } +} + +/** + * @brief Sort @p a[0..n) by @ref zxc_dict_seg_t::score in descending order. 
+ * + * In-place heapsort: a min-heap is built over the whole array, then each + * extracted minimum is swapped to the shrinking tail. Because the smallest + * scores accumulate at the end, the array is left in descending order + * (largest score at index 0), as required by the dictionary fill step. + * + * Replaces a libc @c qsort call for two reasons: + * - **Freestanding/kernel-safe**: no dependency on @c qsort and no indirect + * comparator call (the @c score comparison is inlined in @ref + * zxc_dict_sift_down). + * - **Deterministic**: ordering is fixed by this code rather than by the + * platform's @c qsort, which matters for reproducible dictionary output + * across libc implementations. + * + * Equal scores keep an unspecified-but-deterministic relative order, matching + * the previous comparator that returned 0 on ties (heapsort is not stable). + * + * @param[in,out] a Array of @p n segments, sorted in place. + * @param[in] n Number of segments. @c n < 2 is a no-op. + * + * @note Complexity O(n log n) worst case with no extra allocation. In practice + * this matches or beats @c qsort on the sizes seen here (up to ~65536 + * segments): eliminating the per-comparison indirect call outweighs + * heapsort's weaker cache locality. This is a cold path (dictionary + * training), so absolute speed is not critical. 
+ */ +static void zxc_dict_sort_segs_desc(zxc_dict_seg_t* RESTRICT a, const size_t n) { + if (UNLIKELY(n < 2)) return; + for (size_t i = n / 2; i-- > 0;) zxc_dict_sift_down(a, i, n); + for (size_t end = n; end > 1;) { + end--; + const zxc_dict_seg_t t = a[0]; + a[0] = a[end]; + a[end] = t; + zxc_dict_sift_down(a, 0, end); + } +} + +int64_t zxc_train_dict(const void* const* samples, const size_t* sample_sizes, + const size_t n_samples, void* dict_buf, const size_t dict_capacity) { + if (UNLIKELY(!samples || !sample_sizes || n_samples == 0 || !dict_buf || dict_capacity == 0)) + return ZXC_ERROR_NULL_INPUT; // LCOV_EXCL_LINE + if (UNLIKELY(dict_capacity > ZXC_DICT_SIZE_MAX)) return ZXC_ERROR_DICT_TOO_LARGE; + + /* Step 1: assemble the analysis corpus. The trainer only reads it, so a + * single sample is used in place (no copy) -- important since callers often + * pass one whole file, which can be hundreds of MB. Multiple samples are + * concatenated into an owned buffer (freed via corpus_owned at exit). 
*/ + size_t corpus_size = 0; + for (size_t i = 0; i < n_samples; i++) corpus_size += sample_sizes[i]; + if (UNLIKELY(corpus_size < ZXC_DICT_KGRAM_LEN)) return ZXC_ERROR_SRC_TOO_SMALL; + + const uint8_t* corpus; + uint8_t* corpus_owned = NULL; + if (n_samples == 1) { + corpus = (const uint8_t*)samples[0]; + } else { + corpus_owned = (uint8_t*)ZXC_MALLOC(corpus_size); + if (UNLIKELY(!corpus_owned)) return ZXC_ERROR_MEMORY; + size_t pos = 0; + for (size_t i = 0; i < n_samples; i++) { + if (sample_sizes[i] > 0) ZXC_MEMCPY(corpus_owned + pos, samples[i], sample_sizes[i]); + pos += sample_sizes[i]; + } + corpus = corpus_owned; + } + + /* Step 2: count k-gram frequencies */ + uint16_t* freq = (uint16_t*)ZXC_MALLOC(ZXC_DICT_HT_SIZE * sizeof(uint16_t)); + if (UNLIKELY(!freq)) { + // LCOV_EXCL_START + ZXC_FREE(corpus_owned); + return ZXC_ERROR_MEMORY; + // LCOV_EXCL_STOP + } + ZXC_MEMSET(freq, 0, ZXC_DICT_HT_SIZE * sizeof(uint16_t)); + + /* Count k-gram frequencies on a representative sample of positions, not all + * of them: counting a large corpus in full saturates the 16-bit counters, + * so the segment-extension test never stops and segments balloon into + * filler. Sampling keeps counts unsaturated and spread across the corpus. */ + const size_t kgram_limit = corpus_size - ZXC_DICT_KGRAM_LEN + 1; + size_t freq_stride = kgram_limit / ZXC_DICT_FREQ_SAMPLE_TARGET; + if (freq_stride < 1) freq_stride = 1; + for (size_t i = 0; i < kgram_limit; i += freq_stride) { + const uint32_t h = zxc_dict_hash(corpus + i); + if (freq[h] < UINT16_MAX) freq[h]++; + } + + /* Step 3: build candidate segments, each scored by its coverage. Spread the + * candidate starts across the whole corpus: a fixed k-gram stride exhausts + * the segment budget within the prefix, leaving a large input's later + * content unseen. Segments still extend k-gram by k-gram, so they stay + * contiguous. 
*/ + const size_t max_segs = corpus_size / ZXC_DICT_KGRAM_LEN; + const size_t seg_alloc = (max_segs < ZXC_DICT_MAX_SEGMENTS) ? max_segs : ZXC_DICT_MAX_SEGMENTS; + size_t stride = ZXC_DICT_KGRAM_LEN; + if (seg_alloc > 0 && corpus_size / seg_alloc > stride) stride = corpus_size / seg_alloc; + + zxc_dict_seg_t* segs = (zxc_dict_seg_t*)ZXC_MALLOC(seg_alloc * sizeof(zxc_dict_seg_t)); + if (UNLIKELY(!segs)) { + // LCOV_EXCL_START + ZXC_FREE(freq); + ZXC_FREE(corpus_owned); + return ZXC_ERROR_MEMORY; + // LCOV_EXCL_STOP + } + + size_t n_segs = 0; + for (size_t i = 0; i + ZXC_DICT_KGRAM_LEN <= corpus_size && n_segs < seg_alloc; i += stride) { + const uint32_t h = zxc_dict_hash(corpus + i); + const uint16_t f = freq[h]; + if (f < 2) continue; + + /* Extend the segment as long as the next k-gram is also frequent, and + * accumulate coverage (summed k-gram frequency) as the score. */ + uint32_t coverage = f; + size_t end = i + ZXC_DICT_KGRAM_LEN; + while (end + ZXC_DICT_KGRAM_LEN <= corpus_size && end - i < 4096) { + const uint16_t nf = freq[zxc_dict_hash(corpus + end)]; + if (nf < 2) break; + coverage += nf; + end += ZXC_DICT_KGRAM_LEN; + } + + segs[n_segs].offset = (uint32_t)i; + segs[n_segs].length = (uint16_t)(end - i); + segs[n_segs].score = coverage; + n_segs++; + } + + if (UNLIKELY(n_segs == 0)) { + /* No frequent patterns. Use tail of corpus as dict. */ + const size_t copy = (corpus_size < dict_capacity) ? corpus_size : dict_capacity; + ZXC_MEMCPY(dict_buf, corpus + corpus_size - copy, copy); + ZXC_FREE(freq); + ZXC_FREE(segs); + ZXC_FREE(corpus_owned); + return (int64_t)copy; + } + + /* Step 4: pick segments greedily in descending-coverage order, zeroing each + * pick's k-grams so overlapping patterns aren't copied twice. Picks are + * compacted in place into segs[0..n_sel); placement is step 5. 
*/ + zxc_dict_sort_segs_desc(segs, n_segs); + + uint8_t* out = (uint8_t*)dict_buf; + size_t n_sel = 0; + size_t total = 0; + + for (size_t i = 0; i < n_segs && total < dict_capacity; i++) { + const size_t seg_off = segs[i].offset; + const size_t seg_end = seg_off + segs[i].length; + + /* Recompute coverage from the decrementing table: skip the segment if + * earlier picks have already covered more than half of its k-grams. */ + uint32_t cur = 0; + for (size_t p = seg_off; p + ZXC_DICT_KGRAM_LEN <= seg_end; p += ZXC_DICT_KGRAM_LEN) + cur += freq[zxc_dict_hash(corpus + p)]; + if (cur * 2 < segs[i].score) continue; + + size_t copy = segs[i].length; + if (copy > dict_capacity - total) copy = dict_capacity - total; + + /* One copy in the dictionary serves all future matches: mark this + * segment's k-grams as covered so later segments cover new ground. */ + for (size_t p = seg_off; p + ZXC_DICT_KGRAM_LEN <= seg_end; p += ZXC_DICT_KGRAM_LEN) + freq[zxc_dict_hash(corpus + p)] = 0; + + /* Record the pick (n_sel <= i, so this never clobbers an unread entry). */ + segs[n_sel].offset = (uint32_t)seg_off; + segs[n_sel].length = (uint16_t)copy; + n_sel++; + total += copy; + } + + ZXC_FREE(freq); + + /* Step 5: emit picks in reverse order so the highest-coverage segment ends + * up at the END of the dict. The dict sits just before the data, so bytes + * nearer its end have the smallest match offset: cheapest to encode and the + * last to leave the 16-bit (65535) offset window. + * + * No padding: if the picks don't fill the capacity, the dict is just + * shorter. The old tail-padding only added low-value bytes that raised + * offsets for everything after them. */ + size_t filled = 0; + for (size_t i = n_sel; i-- > 0;) { + ZXC_MEMCPY(out + filled, corpus + segs[i].offset, segs[i].length); + filled += segs[i].length; + } + + /* Nothing selected (every segment subsumed by earlier picks): fall back to + * the corpus tail so the dict is never empty, like the n_segs == 0 path. 
*/ + if (UNLIKELY(filled == 0)) { + const size_t tail = (corpus_size < dict_capacity) ? corpus_size : dict_capacity; + ZXC_MEMCPY(out, corpus + corpus_size - tail, tail); + filled = tail; + } + + ZXC_FREE(segs); + ZXC_FREE(corpus_owned); + return (int64_t)filled; +} diff --git a/src/lib/zxc_dispatch.c b/src/lib/zxc_dispatch.c index d37c31e7..3b8a204a 100644 --- a/src/lib/zxc_dispatch.c +++ b/src/lib/zxc_dispatch.c @@ -15,6 +15,7 @@ * @ref zxc_decompress, @ref zxc_get_decompressed_size). */ +#include "../../include/zxc_dict.h" #include "../../include/zxc_error.h" #include "../../include/zxc_seekable.h" #include "zxc_internal.h" @@ -499,9 +500,14 @@ int64_t zxc_compress(const void* RESTRICT src, const size_t src_size, void* REST const int level = (opts && opts->level > 0) ? opts->level : ZXC_LEVEL_DEFAULT; const size_t block_size = (opts && opts->block_size > 0) ? opts->block_size : ZXC_BLOCK_SIZE_DEFAULT; + const uint8_t* dict = opts ? (const uint8_t*)opts->dict : NULL; + const size_t dict_size = (opts && opts->dict) ? opts->dict_size : 0; + if (UNLIKELY(dict_size > ZXC_DICT_SIZE_MAX)) return ZXC_ERROR_DICT_TOO_LARGE; if (UNLIKELY(!zxc_validate_block_size(block_size))) return ZXC_ERROR_BAD_BLOCK_SIZE; + const uint32_t did = (dict && dict_size > 0) ? zxc_dict_id(dict, dict_size) : 0; + const uint8_t* ip = (const uint8_t*)src; uint8_t* op = (uint8_t*)dst; const uint8_t* op_start = op; @@ -509,15 +515,32 @@ int64_t zxc_compress(const void* RESTRICT src, const size_t src_size, void* REST uint32_t global_hash = 0; zxc_cctx_t ctx; + const size_t eff_chunk = + dict_size > 0 ? zxc_block_size_ceil(dict_size + block_size) : block_size; // LCOV_EXCL_START - if (UNLIKELY(zxc_cctx_init(&ctx, block_size, 1, level, checksum_enabled) != ZXC_OK)) + if (UNLIKELY(zxc_cctx_init(&ctx, eff_chunk, 1, level, checksum_enabled) != ZXC_OK)) return ZXC_ERROR_MEMORY; // LCOV_EXCL_STOP + ctx.dict_size = dict_size; + + /* Dict input buffer: [dict_content | block_data] for the encoder. 
*/ + uint8_t* dict_input = NULL; + if (dict_size > 0) { + dict_input = (uint8_t*)ZXC_MALLOC(dict_size + block_size); + if (UNLIKELY(!dict_input)) { + // LCOV_EXCL_START + zxc_cctx_free(&ctx); + return ZXC_ERROR_MEMORY; + // LCOV_EXCL_STOP + } + ZXC_MEMCPY(dict_input, dict, dict_size); + } const int h_val = - zxc_write_file_header(op, (size_t)(op_end - op), block_size, checksum_enabled); + zxc_write_file_header(op, (size_t)(op_end - op), block_size, checksum_enabled, did); // LCOV_EXCL_START if (UNLIKELY(h_val < 0)) { + ZXC_FREE(dict_input); zxc_cctx_free(&ctx); return h_val; } @@ -531,13 +554,17 @@ int64_t zxc_compress(const void* RESTRICT src, const size_t src_size, void* REST if (seekable) { const size_t block_count = src_size / block_size; if (UNLIKELY(block_count > (size_t)UINT32_MAX - 2)) { + // LCOV_EXCL_START + ZXC_FREE(dict_input); zxc_cctx_free(&ctx); return ZXC_ERROR_BAD_BLOCK_SIZE; + // LCOV_EXCL_STOP } seek_cap = (uint32_t)(block_count + 2); seek_comp = (uint32_t*)ZXC_MALLOC(seek_cap * sizeof(uint32_t)); // LCOV_EXCL_START if (UNLIKELY(!seek_comp)) { + ZXC_FREE(dict_input); zxc_cctx_free(&ctx); return ZXC_ERROR_MEMORY; } @@ -549,8 +576,15 @@ int64_t zxc_compress(const void* RESTRICT src, const size_t src_size, void* REST const size_t chunk_len = (src_size - pos > block_size) ? 
block_size : (src_size - pos); const size_t rem_cap = (size_t)(op_end - op); - const int res = zxc_compress_chunk_wrapper(&ctx, ip + pos, chunk_len, op, rem_cap); + int res; + if (dict_input) { + ZXC_MEMCPY(dict_input + dict_size, ip + pos, chunk_len); + res = zxc_compress_chunk_wrapper(&ctx, dict_input, dict_size + chunk_len, op, rem_cap); + } else { + res = zxc_compress_chunk_wrapper(&ctx, ip + pos, chunk_len, op, rem_cap); + } if (UNLIKELY(res < 0)) { + ZXC_FREE(dict_input); ZXC_FREE(seek_comp); zxc_cctx_free(&ctx); return res; @@ -572,6 +606,7 @@ int64_t zxc_compress(const void* RESTRICT src, const size_t src_size, void* REST seek_cap = seek_cap * 2; uint32_t* nc = (uint32_t*)ZXC_REALLOC(seek_comp, seek_cap * sizeof(uint32_t)); if (UNLIKELY(!nc)) { + ZXC_FREE(dict_input); ZXC_FREE(seek_comp); zxc_cctx_free(&ctx); return ZXC_ERROR_MEMORY; @@ -587,6 +622,7 @@ int64_t zxc_compress(const void* RESTRICT src, const size_t src_size, void* REST pos += chunk_len; } + ZXC_FREE(dict_input); zxc_cctx_free(&ctx); // Write EOF Block @@ -653,6 +689,8 @@ int64_t zxc_decompress(const void* RESTRICT src, const size_t src_size, void* RE } const int checksum_enabled = opts ? opts->checksum_enabled : 0; + const uint8_t* dict = opts ? (const uint8_t*)opts->dict : NULL; + size_t dict_size = (opts && opts->dict) ? 
opts->dict_size : 0; const uint8_t* ip = (const uint8_t*)src; const uint8_t* ip_end = ip + src_size; @@ -663,9 +701,9 @@ int64_t zxc_decompress(const void* RESTRICT src, const size_t src_size, void* RE zxc_cctx_t ctx; int file_has_checksums = 0; - // File header verification and context initialization - if (UNLIKELY(zxc_read_file_header(ip, src_size, &runtime_chunk_size, &file_has_checksums) != - ZXC_OK || + uint32_t header_dict_id = 0; + if (UNLIKELY(zxc_read_file_header(ip, src_size, &runtime_chunk_size, &file_has_checksums, + &header_dict_id) != ZXC_OK || zxc_cctx_init(&ctx, runtime_chunk_size, 0, 0, file_has_checksums && checksum_enabled) != ZXC_OK)) { return ZXC_ERROR_BAD_HEADER; @@ -673,11 +711,56 @@ int64_t zxc_decompress(const void* RESTRICT src, const size_t src_size, void* RE ip += ZXC_FILE_HEADER_SIZE; - // work_buf is sized to runtime_chunk_size + ZXC_DECOMPRESS_TAIL_PAD - // inside zxc_cctx_init (mode == 0). The threshold below must match so - // the fast-path / bounce decision uses the actual work_buf capacity. + /* Dictionary handling. When the archive needs a dictionary it is normally + * embedded as a ZXC_BLOCK_DICT block right after the header; detect it by + * type and use it (skipping the block). Otherwise the caller must supply a + * matching in-memory dictionary. 
*/ + if (header_dict_id != 0) { + zxc_block_header_t dbh; + if ((size_t)(ip_end - ip) >= ZXC_BLOCK_HEADER_SIZE && + zxc_read_block_header(ip, ZXC_BLOCK_HEADER_SIZE, &dbh) == ZXC_OK && + dbh.block_type == ZXC_BLOCK_DICT) { + if (dbh.comp_size == 0 || dbh.comp_size > ZXC_DICT_SIZE_MAX || + (size_t)(ip_end - ip) < ZXC_BLOCK_HEADER_SIZE + dbh.comp_size) { + zxc_cctx_free(&ctx); + return ZXC_ERROR_BAD_HEADER; + } + const uint8_t* edict = ip + ZXC_BLOCK_HEADER_SIZE; + if (zxc_dict_id(edict, dbh.comp_size) != header_dict_id) { + zxc_cctx_free(&ctx); + return ZXC_ERROR_DICT_MISMATCH; + } + dict = edict; + dict_size = dbh.comp_size; + ip += ZXC_BLOCK_HEADER_SIZE + dbh.comp_size; /* skip the embedded dict block */ + } else { + if (!dict || dict_size == 0) { + zxc_cctx_free(&ctx); + return ZXC_ERROR_DICT_REQUIRED; + } + if (zxc_dict_id(dict, dict_size) != header_dict_id) { + zxc_cctx_free(&ctx); + return ZXC_ERROR_DICT_MISMATCH; + } + } + } + ctx.dict_size = dict_size; + const size_t work_sz = runtime_chunk_size + ZXC_DECOMPRESS_TAIL_PAD; + /* Dict decode buffer: [dict_content | decode_space + PAD] */ + uint8_t* dict_dec = NULL; + if (dict_size > 0) { + dict_dec = (uint8_t*)ZXC_MALLOC(dict_size + work_sz); + if (UNLIKELY(!dict_dec)) { + // LCOV_EXCL_START + zxc_cctx_free(&ctx); + return ZXC_ERROR_MEMORY; + // LCOV_EXCL_STOP + } + ZXC_MEMCPY(dict_dec, dict, dict_size); + } + // Block decompression loop uint32_t global_hash = 0; @@ -686,6 +769,7 @@ int64_t zxc_decompress(const void* RESTRICT src, const size_t src_size, void* RE zxc_block_header_t bh; // Read the block header to determine the compressed size if (UNLIKELY(zxc_read_block_header(ip, rem_src, &bh) != ZXC_OK)) { + ZXC_FREE(dict_dec); zxc_cctx_free(&ctx); return ZXC_ERROR_BAD_HEADER; } @@ -696,6 +780,7 @@ int64_t zxc_decompress(const void* RESTRICT src, const size_t src_size, void* RE // even when a seek table is inserted between EOF block and footer. 
// LCOV_EXCL_START if (UNLIKELY(src_size < ZXC_FILE_FOOTER_SIZE)) { + ZXC_FREE(dict_dec); zxc_cctx_free(&ctx); return ZXC_ERROR_SRC_TOO_SMALL; } @@ -705,6 +790,7 @@ int64_t zxc_decompress(const void* RESTRICT src, const size_t src_size, void* RE // Validate source size matches what we decompressed const uint64_t stored_size = zxc_le64(footer); if (UNLIKELY(stored_size != (uint64_t)(op - op_start))) { + ZXC_FREE(dict_dec); zxc_cctx_free(&ctx); return ZXC_ERROR_CORRUPT_DATA; } @@ -713,6 +799,7 @@ int64_t zxc_decompress(const void* RESTRICT src, const size_t src_size, void* RE if (checksum_enabled && file_has_checksums) { const uint32_t stored_hash = zxc_le32(footer + sizeof(uint64_t)); if (UNLIKELY(stored_hash != global_hash)) { + ZXC_FREE(dict_dec); zxc_cctx_free(&ctx); return ZXC_ERROR_BAD_CHECKSUM; } @@ -722,7 +809,21 @@ int64_t zxc_decompress(const void* RESTRICT src, const size_t src_size, void* RE int res; const size_t rem_cap = (size_t)(op_end - op); - if (LIKELY(rem_cap >= work_sz)) { + if (dict_dec) { + /* Dict path: decode into bounce buffer with dict prefix so match + * copies that reference dict content resolve naturally. */ + res = zxc_decompress_chunk_wrapper(&ctx, ip, rem_src, dict_dec + dict_size, work_sz); + if (LIKELY(res > 0)) { + if (UNLIKELY((size_t)res > rem_cap)) { + // LCOV_EXCL_START + ZXC_FREE(dict_dec); + zxc_cctx_free(&ctx); + return ZXC_ERROR_DST_TOO_SMALL; + // LCOV_EXCL_STOP + } + ZXC_MEMCPY(op, dict_dec + dict_size, (size_t)res); + } + } else if (LIKELY(rem_cap >= work_sz)) { // Fast path: decode directly into dst. 
Cap dst_cap to chunk_size + PAD res = zxc_decompress_chunk_wrapper(&ctx, ip, rem_src, op, work_sz); } else { @@ -739,6 +840,7 @@ int64_t zxc_decompress(const void* RESTRICT src, const size_t src_size, void* RE } } if (UNLIKELY(res < 0)) { + ZXC_FREE(dict_dec); zxc_cctx_free(&ctx); return res; } @@ -754,6 +856,7 @@ int64_t zxc_decompress(const void* RESTRICT src, const size_t src_size, void* RE op += res; } + ZXC_FREE(dict_dec); zxc_cctx_free(&ctx); return (int64_t)(op - op_start); } @@ -777,6 +880,16 @@ uint64_t zxc_get_decompressed_size(const void* src, const size_t src_size) { return zxc_le64(footer); } +// cppcheck-suppress unusedFunction +uint32_t zxc_get_dict_id(const void* src, const size_t src_size) { + if (UNLIKELY(!src || src_size < ZXC_FILE_HEADER_SIZE)) return 0; + + const uint8_t* const p = (const uint8_t*)src; + if (UNLIKELY(zxc_le32(p) != ZXC_MAGIC_WORD)) return 0; + + return (p[6] & ZXC_FILE_FLAG_HAS_DICTIONARY) ? zxc_le32(p + 7) : 0; +} + /* * ============================================================================ * REUSABLE CONTEXT API (Opaque) @@ -889,7 +1002,7 @@ int64_t zxc_compress_cctx(zxc_cctx* cctx, const void* RESTRICT src, const size_t uint32_t global_hash = 0; const int h_val = - zxc_write_file_header(op, (size_t)(op_end - op), block_size, checksum_enabled); + zxc_write_file_header(op, (size_t)(op_end - op), block_size, checksum_enabled, 0); if (UNLIKELY(h_val < 0)) return h_val; // LCOV_EXCL_LINE op += h_val; @@ -940,6 +1053,12 @@ struct zxc_dctx_s { int owns_workspace; /* 0 = library-allocated (free in zxc_free_dctx), 1 = caller-supplied static workspace (no-op free, block_size pinned at init) */ + /* Dictionary decode bounce buffer [dict | decode_space]. For a static dctx + this points into the caller's workspace (sized via max_dict_size at init). + For a dynamic dctx it is lazily heap-allocated on the first dict archive + and freed in zxc_free_dctx. NULL/0 until needed. 
*/ + uint8_t* dict_work; + size_t dict_work_cap; }; zxc_dctx* zxc_create_dctx(void) { @@ -953,6 +1072,7 @@ void zxc_free_dctx(zxc_dctx* dctx) { * which we do not own. Free is a no-op; the caller owns the workspace. */ if (dctx->owns_workspace) return; if (dctx->initialized) zxc_cctx_free(&dctx->inner); + ZXC_FREE(dctx->dict_work); /* lazily-allocated dict bounce buffer (dynamic dctx only) */ ZXC_FREE(dctx); } @@ -973,8 +1093,9 @@ int64_t zxc_decompress_dctx(zxc_dctx* dctx, const void* RESTRICT src, const size int file_has_checksums = 0; uint32_t global_hash = 0; - if (UNLIKELY(zxc_read_file_header(ip, src_size, &runtime_chunk_size, &file_has_checksums) != - ZXC_OK)) + uint32_t header_dict_id = 0; + if (UNLIKELY(zxc_read_file_header(ip, src_size, &runtime_chunk_size, &file_has_checksums, + &header_dict_id) != ZXC_OK)) return ZXC_ERROR_BAD_HEADER; /* Static dctx: block_size is locked at workspace init; reject any @@ -1009,6 +1130,43 @@ int64_t zxc_decompress_dctx(zxc_dctx* dctx, const void* RESTRICT src, const size * it stays in sync when chunk_size changes between calls. */ const size_t work_sz = runtime_chunk_size + ZXC_DECOMPRESS_TAIL_PAD; + /* Dictionary: when present it is embedded as a ZXC_BLOCK_DICT block right + * after the header. Decode into a [dict | decode_space] bounce buffer so + * dict back-references resolve. The buffer is the caller's reserved + * workspace region (static dctx, sized via max_dict_size) or a buffer the + * dynamic dctx allocates once and keeps. 
*/ + size_t emb_dict_size = 0; + if (header_dict_id != 0) { + zxc_block_header_t dbh; + if ((size_t)(ip_end - ip) < ZXC_BLOCK_HEADER_SIZE || + zxc_read_block_header(ip, ZXC_BLOCK_HEADER_SIZE, &dbh) != ZXC_OK || + dbh.block_type != ZXC_BLOCK_DICT) + return ZXC_ERROR_DICT_REQUIRED; /* no embedded dictionary to decode with */ + if (dbh.comp_size == 0 || dbh.comp_size > ZXC_DICT_SIZE_MAX || + (size_t)(ip_end - ip) < ZXC_BLOCK_HEADER_SIZE + dbh.comp_size) + return ZXC_ERROR_BAD_HEADER; + const uint8_t* const edict = ip + ZXC_BLOCK_HEADER_SIZE; + if (zxc_dict_id(edict, dbh.comp_size) != header_dict_id) return ZXC_ERROR_DICT_MISMATCH; + + const size_t need = (size_t)dbh.comp_size + work_sz; + if (dctx->dict_work_cap < need) { + /* A static workspace can't grow. Distinguish "dictionary support not + * enabled" (no region reserved) from "reserved region too small" for + * this archive's dictionary. */ + if (dctx->owns_workspace) + return dctx->dict_work ? ZXC_ERROR_DICT_TOO_LARGE : ZXC_ERROR_DICT_REQUIRED; + uint8_t* const nb = (uint8_t*)ZXC_MALLOC(need); + if (UNLIKELY(!nb)) return ZXC_ERROR_MEMORY; + ZXC_FREE(dctx->dict_work); + dctx->dict_work = nb; + dctx->dict_work_cap = need; + } + ZXC_MEMCPY(dctx->dict_work, edict, dbh.comp_size); + emb_dict_size = dbh.comp_size; + ip += ZXC_BLOCK_HEADER_SIZE + dbh.comp_size; /* skip the embedded dict block */ + } + ctx->dict_size = emb_dict_size; + while (ip < ip_end) { const size_t rem_src = (size_t)(ip_end - ip); zxc_block_header_t bh; @@ -1032,7 +1190,16 @@ int64_t zxc_decompress_dctx(zxc_dctx* dctx, const void* RESTRICT src, const size const size_t rem_cap = (size_t)(op_end - op); int res; - if (LIKELY(rem_cap >= work_sz)) { + if (emb_dict_size > 0) { + // Dict path: decode into the [dict | decode_space] bounce buffer so + // match copies referencing dictionary bytes resolve, then copy out. 
+ res = zxc_decompress_chunk_wrapper(ctx, ip, rem_src, dctx->dict_work + emb_dict_size, + work_sz); + if (LIKELY(res > 0)) { + if (UNLIKELY((size_t)res > rem_cap)) return ZXC_ERROR_DST_TOO_SMALL; + ZXC_MEMCPY(op, dctx->dict_work + emb_dict_size, (size_t)res); + } + } else if (LIKELY(rem_cap >= work_sz)) { // Fast path: decode directly into dst (enough padding for wild copies). res = zxc_decompress_chunk_wrapper(ctx, ip, rem_src, op, rem_cap); } else { @@ -1081,8 +1248,13 @@ int64_t zxc_compress_block(zxc_cctx* cctx, const void* RESTRICT src, const size_ (opts && opts->block_size > 0) ? opts->block_size : cctx->stored_block_size; const size_t min_bs = zxc_block_size_ceil(src_size); - /* Always ensure internal buffers can hold src_size. */ - const size_t effective_block_size = (block_size > min_bs) ? block_size : min_bs; + /* Always ensure internal buffers can hold src_size. + * When a dictionary is active, offset_bits must accommodate dict + block. */ + const uint8_t* b_dict = opts ? (const uint8_t*)opts->dict : NULL; + const size_t b_dict_size = (opts && opts->dict) ? opts->dict_size : 0; + const size_t base_block_size = (block_size > min_bs) ? block_size : min_bs; + const size_t effective_block_size = + b_dict_size > 0 ? 
zxc_block_size_ceil(b_dict_size + base_block_size) : base_block_size; cctx->stored_level = level; cctx->stored_block_size = effective_block_size; @@ -1108,8 +1280,21 @@ int64_t zxc_compress_block(zxc_cctx* cctx, const void* RESTRICT src, const size_ cctx->inner.checksum_enabled = checksum_enabled; } - const int res = zxc_compress_chunk_wrapper(&cctx->inner, (const uint8_t*)src, src_size, - (uint8_t*)dst, dst_capacity); + cctx->inner.dict_size = b_dict_size; + + int res; + if (b_dict && b_dict_size > 0) { + uint8_t* combined = (uint8_t*)ZXC_MALLOC(b_dict_size + src_size); + if (UNLIKELY(!combined)) return ZXC_ERROR_MEMORY; + ZXC_MEMCPY(combined, b_dict, b_dict_size); + ZXC_MEMCPY(combined + b_dict_size, src, src_size); + res = zxc_compress_chunk_wrapper(&cctx->inner, combined, b_dict_size + src_size, + (uint8_t*)dst, dst_capacity); + ZXC_FREE(combined); + } else { + res = zxc_compress_chunk_wrapper(&cctx->inner, (const uint8_t*)src, src_size, (uint8_t*)dst, + dst_capacity); + } if (UNLIKELY(res < 0)) return res; return (int64_t)res; } @@ -1148,12 +1333,30 @@ int64_t zxc_decompress_block(zxc_dctx* dctx, const void* RESTRICT src, const siz zxc_cctx_t* const ctx = &dctx->inner; + const uint8_t* dict = opts ? (const uint8_t*)opts->dict : NULL; + const size_t dict_size = (opts && opts->dict) ? opts->dict_size : 0; + ctx->dict_size = dict_size; + /* work_buf was pre-sized to block_size + ZXC_DECOMPRESS_TAIL_PAD inside * the matching zxc_cctx_init call above. 
*/ const size_t work_sz = block_size + ZXC_DECOMPRESS_TAIL_PAD; int res; - if (LIKELY(dst_capacity >= work_sz)) { + if (dict && dict_size > 0) { + uint8_t* dec_buf = (uint8_t*)ZXC_MALLOC(dict_size + work_sz); + if (UNLIKELY(!dec_buf)) return ZXC_ERROR_MEMORY; + ZXC_MEMCPY(dec_buf, dict, dict_size); + res = zxc_decompress_chunk_wrapper(ctx, (const uint8_t*)src, src_size, dec_buf + dict_size, + work_sz); + if (LIKELY(res > 0)) { + if (UNLIKELY((size_t)res > dst_capacity)) { + ZXC_FREE(dec_buf); + return ZXC_ERROR_DST_TOO_SMALL; + } + ZXC_MEMCPY(dst, dec_buf + dict_size, (size_t)res); + } + ZXC_FREE(dec_buf); + } else if (LIKELY(dst_capacity >= work_sz)) { res = zxc_decompress_chunk_wrapper(ctx, (const uint8_t*)src, src_size, (uint8_t*)dst, dst_capacity); } else { @@ -1272,21 +1475,33 @@ zxc_cctx* zxc_init_static_cctx(void* RESTRICT workspace, const size_t workspace_ return cctx; } -size_t zxc_static_dctx_workspace_size(const size_t block_size) { +/* Bytes reserved for the dictionary decode bounce buffer when a static dctx is + * built to support embedded dictionaries up to `max_dict_size`. 0 disables it. + * Layout is [dict (<= max_dict_size) | decode_space (block + TAIL_PAD)]. 
*/ +static size_t zxc_static_dctx_dict_region(const size_t block_size, const size_t max_dict_size) { + if (max_dict_size == 0) return 0; + return ZXC_ALIGN_CL(max_dict_size + block_size + ZXC_DECOMPRESS_TAIL_PAD); +} + +size_t zxc_static_dctx_workspace_size(const size_t block_size, const size_t max_dict_size) { if (UNLIKELY(!zxc_validate_block_size(block_size))) return 0; + if (UNLIKELY(max_dict_size > ZXC_DICT_SIZE_MAX)) return 0; const size_t inner_sz = zxc_cctx_compute_workspace_size(block_size, 0, 0); if (UNLIKELY(inner_sz == 0)) return 0; - return ZXC_STATIC_DCTX_HDR_SIZE + inner_sz; + return ZXC_STATIC_DCTX_HDR_SIZE + inner_sz + + zxc_static_dctx_dict_region(block_size, max_dict_size); } zxc_dctx* zxc_init_static_dctx(void* RESTRICT workspace, const size_t workspace_size, - const size_t block_size) { + const size_t block_size, const size_t max_dict_size) { if (UNLIKELY(!workspace)) return NULL; if (UNLIKELY(!zxc_validate_block_size(block_size))) return NULL; + if (UNLIKELY(max_dict_size > ZXC_DICT_SIZE_MAX)) return NULL; const size_t inner_sz = zxc_cctx_compute_workspace_size(block_size, 0, 0); if (UNLIKELY(inner_sz == 0)) return NULL; - if (UNLIKELY(workspace_size < ZXC_STATIC_DCTX_HDR_SIZE + inner_sz)) return NULL; + const size_t dict_region = zxc_static_dctx_dict_region(block_size, max_dict_size); + if (UNLIKELY(workspace_size < ZXC_STATIC_DCTX_HDR_SIZE + inner_sz + dict_region)) return NULL; zxc_dctx* const dctx = (zxc_dctx*)workspace; ZXC_MEMSET(dctx, 0, sizeof(*dctx)); @@ -1298,6 +1513,10 @@ zxc_dctx* zxc_init_static_dctx(void* RESTRICT workspace, const size_t workspace_ 0) != ZXC_OK)) return NULL; + if (dict_region > 0) { + dctx->dict_work = inner_ws + inner_sz; + dctx->dict_work_cap = dict_region; + } dctx->owns_workspace = 1; dctx->initialized = 1; dctx->last_block_size = block_size; diff --git a/src/lib/zxc_driver.c b/src/lib/zxc_driver.c index e62b360a..1d3092b0 100644 --- a/src/lib/zxc_driver.c +++ b/src/lib/zxc_driver.c @@ -32,6 +32,7 @@ #include 
#include "../../include/zxc_buffer.h" +#include "../../include/zxc_dict.h" #include "../../include/zxc_error.h" #include "../../include/zxc_seekable.h" #include "../../include/zxc_stream.h" @@ -288,6 +289,8 @@ typedef struct { zxc_progress_callback_t progress_cb; void* progress_user_data; uint64_t total_input_bytes; + const uint8_t* dict; + size_t dict_size; } zxc_stream_ctx_t; /** @@ -370,7 +373,10 @@ static void* zxc_stream_worker(void* arg) { ? ctx->checksum_enabled : (ctx->file_has_checksum && ctx->checksum_enabled); - if (zxc_cctx_init(&cctx, ctx->chunk_size, ctx->compression_mode, ctx->compression_level, + const size_t eff_chunk = (ctx->dict_size > 0 && ctx->compression_mode == 1) + ? zxc_block_size_ceil(ctx->dict_size + ctx->chunk_size) + : ctx->chunk_size; + if (zxc_cctx_init(&cctx, eff_chunk, ctx->compression_mode, ctx->compression_level, unified_chk) != ZXC_OK) { // LCOV_EXCL_START zxc_cctx_free(&cctx); @@ -384,6 +390,27 @@ static void* zxc_stream_worker(void* arg) { } cctx.compression_level = ctx->compression_level; + cctx.dict_size = ctx->dict_size; + + /* Per-worker dict buffer for assembling [dict | block_data]. 
*/ + const size_t dsz = ctx->dict_size; + uint8_t* dict_work = NULL; + if (dsz > 0) { + const size_t alloc = dsz + ctx->chunk_size + ZXC_DECOMPRESS_TAIL_PAD; + dict_work = (uint8_t*)ZXC_MALLOC(alloc); + if (UNLIKELY(!dict_work)) { + // LCOV_EXCL_START + zxc_cctx_free(&cctx); + pthread_mutex_lock(&ctx->lock); + ctx->io_error = 1; + pthread_cond_broadcast(&ctx->cond_writer); + pthread_cond_broadcast(&ctx->cond_reader); + pthread_mutex_unlock(&ctx->lock); + return NULL; + // LCOV_EXCL_STOP + } + ZXC_MEMCPY(dict_work, ctx->dict, dsz); + } while (1) { zxc_stream_job_t* job = NULL; @@ -401,7 +428,17 @@ static void* zxc_stream_worker(void* arg) { job = &ctx->jobs[jid]; pthread_mutex_unlock(&ctx->lock); - const int res = ctx->processor(&cctx, job->in_buf, job->in_sz, job->out_buf, job->out_cap); + int res; + if (dict_work && ctx->compression_mode == 1) { + ZXC_MEMCPY(dict_work + dsz, job->in_buf, job->in_sz); + res = ctx->processor(&cctx, dict_work, dsz + job->in_sz, job->out_buf, job->out_cap); + } else if (dict_work && ctx->compression_mode == 0) { + res = ctx->processor(&cctx, job->in_buf, job->in_sz, dict_work + dsz, + ctx->chunk_size + ZXC_DECOMPRESS_TAIL_PAD); + if (LIKELY(res > 0)) ZXC_MEMCPY(job->out_buf, dict_work + dsz, (size_t)res); + } else { + res = ctx->processor(&cctx, job->in_buf, job->in_sz, job->out_buf, job->out_cap); + } pthread_mutex_lock(&ctx->lock); job->result_sz = UNLIKELY(res < 0) ? 
0 : (size_t)res; @@ -415,6 +452,7 @@ static void* zxc_stream_worker(void* arg) { } pthread_mutex_unlock(&ctx->lock); } + ZXC_FREE(dict_work); zxc_cctx_free(&cctx); return NULL; } @@ -563,12 +601,14 @@ static int64_t zxc_stream_engine_run(FILE* f_in, FILE* f_out, const int n_thread const int level, const size_t block_size, const int checksum_enabled, const int seekable, zxc_chunk_processor_t func, - zxc_progress_callback_t progress_cb, void* user_data) { + zxc_progress_callback_t progress_cb, void* user_data, + const uint8_t* dict, size_t dict_size) { zxc_stream_ctx_t ctx; ZXC_MEMSET(&ctx, 0, sizeof(ctx)); size_t runtime_chunk_sz = (block_size > 0) ? block_size : ZXC_BLOCK_SIZE_DEFAULT; int file_has_chk = 0; + uint8_t* embedded_dict = NULL; /* heap copy of an embedded dictionary (decode); freed at exit */ // Try to get input file size for progress tracking (compression mode only) // For decompression, the CLI precomputes the size and passes it via user_data @@ -589,10 +629,36 @@ static int64_t zxc_stream_engine_run(FILE* f_in, FILE* f_out, const int n_thread if (mode == 0) { // Decompression Mode: Read and validate file header uint8_t h[ZXC_FILE_HEADER_SIZE]; + uint32_t header_dict_id = 0; if (UNLIKELY(fread(h, 1, ZXC_FILE_HEADER_SIZE, f_in) != ZXC_FILE_HEADER_SIZE || - zxc_read_file_header(h, ZXC_FILE_HEADER_SIZE, &runtime_chunk_sz, - &file_has_chk) != ZXC_OK)) + zxc_read_file_header(h, ZXC_FILE_HEADER_SIZE, &runtime_chunk_sz, &file_has_chk, + &header_dict_id) != ZXC_OK)) return ZXC_ERROR_BAD_HEADER; + + if (header_dict_id != 0) { + /* A dictionary is present: it is embedded as a ZXC_BLOCK_DICT block + * right after the header. Read it and use it (no external dict). The + * header is already consumed, so this works on pipes too. 
*/ + uint8_t dbh[ZXC_BLOCK_HEADER_SIZE]; + zxc_block_header_t bh; + if (UNLIKELY(fread(dbh, 1, ZXC_BLOCK_HEADER_SIZE, f_in) != ZXC_BLOCK_HEADER_SIZE || + zxc_read_block_header(dbh, ZXC_BLOCK_HEADER_SIZE, &bh) != ZXC_OK || + bh.block_type != ZXC_BLOCK_DICT || bh.comp_size == 0 || + bh.comp_size > ZXC_DICT_SIZE_MAX)) + return ZXC_ERROR_BAD_HEADER; + embedded_dict = (uint8_t*)ZXC_MALLOC(bh.comp_size); + if (UNLIKELY(!embedded_dict)) return ZXC_ERROR_MEMORY; + if (UNLIKELY(fread(embedded_dict, 1, bh.comp_size, f_in) != bh.comp_size)) { + ZXC_FREE(embedded_dict); + return ZXC_ERROR_SRC_TOO_SMALL; + } + if (UNLIKELY(zxc_dict_id(embedded_dict, bh.comp_size) != header_dict_id)) { + ZXC_FREE(embedded_dict); + return ZXC_ERROR_DICT_MISMATCH; + } + dict = embedded_dict; + dict_size = bh.comp_size; + } } int num_threads = (n_threads > 0) ? n_threads : (int)sysconf(_SC_NPROCESSORS_ONLN); @@ -611,6 +677,8 @@ static int64_t zxc_stream_engine_run(FILE* f_in, FILE* f_out, const int n_thread ctx.progress_cb = progress_cb; ctx.progress_user_data = user_data; ctx.total_input_bytes = total_file_size; + ctx.dict = dict; + ctx.dict_size = dict_size; uint32_t d_global_hash = 0; @@ -629,6 +697,7 @@ static int64_t zxc_stream_engine_run(FILE* f_in, FILE* f_out, const int n_thread if (UNLIKELY(!mem_block || per_job_sz > SIZE_MAX / ctx.ring_size)) { // LCOV_EXCL_START ZXC_ALIGNED_FREE(mem_block); + ZXC_FREE(embedded_dict); return ZXC_ERROR_MEMORY; // LCOV_EXCL_STOP } @@ -664,6 +733,7 @@ static int64_t zxc_stream_engine_run(FILE* f_in, FILE* f_out, const int n_thread if (UNLIKELY(!workers)) { // LCOV_EXCL_START ZXC_ALIGNED_FREE(mem_block); + ZXC_FREE(embedded_dict); return ZXC_ERROR_MEMORY; // LCOV_EXCL_STOP } @@ -680,6 +750,7 @@ static int64_t zxc_stream_engine_run(FILE* f_in, FILE* f_out, const int n_thread pthread_mutex_destroy(&ctx.lock); ZXC_FREE(workers); ZXC_ALIGNED_FREE(mem_block); + ZXC_FREE(embedded_dict); return ZXC_ERROR_MEMORY; // LCOV_EXCL_STOP } @@ -703,6 +774,7 @@ static int64_t 
zxc_stream_engine_run(FILE* f_in, FILE* f_out, const int n_thread pthread_mutex_destroy(&ctx.lock); ZXC_FREE(workers); ZXC_ALIGNED_FREE(mem_block); + ZXC_FREE(embedded_dict); return ZXC_ERROR_MEMORY; } // LCOV_EXCL_STOP @@ -710,11 +782,28 @@ static int64_t zxc_stream_engine_run(FILE* f_in, FILE* f_out, const int n_thread if (mode == 1 && f_out) { uint8_t h[ZXC_FILE_HEADER_SIZE]; - zxc_write_file_header(h, ZXC_FILE_HEADER_SIZE, runtime_chunk_sz, checksum_enabled); + zxc_write_file_header(h, ZXC_FILE_HEADER_SIZE, runtime_chunk_sz, checksum_enabled, + (dict && dict_size) ? zxc_dict_id(dict, dict_size) : 0); if (UNLIKELY(fwrite(h, 1, ZXC_FILE_HEADER_SIZE, f_out) != ZXC_FILE_HEADER_SIZE)) ctx.io_error = 1; - w_args.total_bytes = ZXC_FILE_HEADER_SIZE; + + if (dict && dict_size) { + /* A dictionary is always embedded: store it as a ZXC_BLOCK_DICT block + * right after the header ([block header 8][raw dict content]). Data + * blocks follow, so the seekable reader offsets the first past it. + * The HAS_DICTIONARY header flag (set above via dict_id) marks it. 
*/ + uint8_t dbh[ZXC_BLOCK_HEADER_SIZE]; + const zxc_block_header_t dh = {.block_type = ZXC_BLOCK_DICT, + .block_flags = 0, + .reserved = 0, + .comp_size = (uint32_t)dict_size}; + zxc_write_block_header(dbh, ZXC_BLOCK_HEADER_SIZE, &dh); + if (UNLIKELY(fwrite(dbh, 1, ZXC_BLOCK_HEADER_SIZE, f_out) != ZXC_BLOCK_HEADER_SIZE || + fwrite(dict, 1, dict_size, f_out) != dict_size)) + ctx.io_error = 1; + w_args.total_bytes += ZXC_BLOCK_HEADER_SIZE + dict_size; + } } pthread_t writer_th; if (UNLIKELY(pthread_create(&writer_th, NULL, zxc_async_writer, &w_args) != 0)) { @@ -730,6 +819,7 @@ static int64_t zxc_stream_engine_run(FILE* f_in, FILE* f_out, const int n_thread pthread_mutex_destroy(&ctx.lock); ZXC_FREE(workers); ZXC_ALIGNED_FREE(mem_block); + ZXC_FREE(embedded_dict); return ZXC_ERROR_MEMORY; // LCOV_EXCL_STOP } @@ -928,6 +1018,7 @@ static int64_t zxc_stream_engine_run(FILE* f_in, FILE* f_out, const int n_thread ZXC_FREE(w_args.seek_comp); ZXC_FREE(workers); ZXC_ALIGNED_FREE(mem_block); + ZXC_FREE(embedded_dict); if (UNLIKELY(ctx.io_error)) return ZXC_ERROR_IO; @@ -943,13 +1034,16 @@ int64_t zxc_stream_compress(FILE* f_in, FILE* f_out, const zxc_compress_opts_t* const int level = (opts && opts->level > 0) ? opts->level : ZXC_LEVEL_DEFAULT; const size_t block_size = (opts && opts->block_size > 0) ? opts->block_size : ZXC_BLOCK_SIZE_DEFAULT; + const uint8_t* dict = opts ? (const uint8_t*)opts->dict : NULL; + const size_t dict_size = (opts && opts->dict) ? opts->dict_size : 0; zxc_progress_callback_t cb = opts ? opts->progress_cb : NULL; void* ud = opts ? 
opts->user_data : NULL; if (UNLIKELY(!zxc_validate_block_size(block_size))) return ZXC_ERROR_BAD_BLOCK_SIZE; + if (UNLIKELY(dict_size > ZXC_DICT_SIZE_MAX)) return ZXC_ERROR_DICT_TOO_LARGE; return zxc_stream_engine_run(f_in, f_out, n_threads, 1, level, block_size, checksum_enabled, - seekable, zxc_compress_chunk_wrapper, cb, ud); + seekable, zxc_compress_chunk_wrapper, cb, ud, dict, dict_size); } int64_t zxc_stream_decompress(FILE* f_in, FILE* f_out, const zxc_decompress_opts_t* opts) { @@ -957,11 +1051,14 @@ int64_t zxc_stream_decompress(FILE* f_in, FILE* f_out, const zxc_decompress_opts const int n_threads = opts ? opts->n_threads : 0; const int checksum_enabled = opts ? opts->checksum_enabled : 0; + const uint8_t* dict = opts ? (const uint8_t*)opts->dict : NULL; + const size_t dict_size = (opts && opts->dict) ? opts->dict_size : 0; zxc_progress_callback_t cb = opts ? opts->progress_cb : NULL; void* ud = opts ? opts->user_data : NULL; return zxc_stream_engine_run(f_in, f_out, n_threads, 0, 0, 0, checksum_enabled, 0, - (zxc_chunk_processor_t)zxc_decompress_chunk_wrapper, cb, ud); + (zxc_chunk_processor_t)zxc_decompress_chunk_wrapper, cb, ud, dict, + dict_size); } int64_t zxc_stream_get_decompressed_size(FILE* f_in) { diff --git a/src/lib/zxc_internal.h b/src/lib/zxc_internal.h index 9b5c1255..8361a2aa 100644 --- a/src/lib/zxc_internal.h +++ b/src/lib/zxc_internal.h @@ -32,6 +32,7 @@ #include "../../include/zxc_buffer.h" #include "../../include/zxc_constants.h" +#include "../../include/zxc_error.h" #include "../../include/zxc_seekable.h" #include "rapidhash.h" @@ -334,9 +335,27 @@ extern "C" { /** @brief Bit flag in the Flags byte indicating checksum presence (bit 7). */ #define ZXC_FILE_FLAG_HAS_CHECKSUM 0x80U +/** @brief Bit flag in the Flags byte indicating a dictionary is present (bit 6). + * The dictionary is always embedded as a ZXC_BLOCK_DICT block right after the + * file header, so the decoder reads it from the archive itself. 
*/ +#define ZXC_FILE_FLAG_HAS_DICTIONARY 0x40U /** @brief Mask for the checksum algorithm id (bits 0-3). */ #define ZXC_FILE_CHECKSUM_ALGO_MASK 0x0FU +/** @brief K-gram length scanned by the dictionary trainer. Aligned on the LZ + * minimum match length so trained patterns are matchable at encode time. */ +#define ZXC_DICT_KGRAM_LEN ZXC_LZ_MIN_MATCH_LEN +/** @brief Address bits for the dictionary trainer's k-gram frequency table. */ +#define ZXC_DICT_HT_BITS 16 +/** @brief Maximum number of candidate segments the dictionary trainer keeps. */ +#define ZXC_DICT_MAX_SEGMENTS (1U << 16) +/** @brief Target number of sampled k-gram positions for the trainer's frequency + * estimate. Bounds the count so 16-bit counters stay unsaturated on large + * corpora; the trainer strides the corpus to hit roughly this many positions. */ +#define ZXC_DICT_FREQ_SAMPLE_TARGET (1U << 19) +/** @brief Number of buckets in the dictionary trainer's frequency table. */ +#define ZXC_DICT_HT_SIZE (1U << ZXC_DICT_HT_BITS) + /** @brief Block header size: Type(1)+Flags(1)+Reserved(1)+CRC(1)+CompSize(4). */ #define ZXC_BLOCK_HEADER_SIZE 8 /** @brief Size of the per-block checksum field in bytes. */ @@ -739,6 +758,9 @@ static ZXC_ALWAYS_INLINE zxc_lz77_params_t zxc_get_lz77_params(const int level) * Uses Delta Encoding + ZigZag + Bitpacking. * - `ZXC_BLOCK_GHI` (3): General-purpose high-velocity mode using LZ77 with advanced * techniques (lazy matching, step skipping) for maximum ratio. Includes 3 sections descriptors. + * - `ZXC_BLOCK_DICT` (253): Embedded dictionary block. Contains the dictionary data + * for the file, if present. Always placed immediately after the file header when the + * dictionary flag is set. * - `ZXC_BLOCK_SEK` (254): Seek table block. Contains per-block compressed/decompressed sizes * for random-access decompression. Placed between EOF block and file footer. * - `ZXC_BLOCK_EOF` (255): End of file marker. 
@@ -748,6 +770,7 @@ typedef enum { ZXC_BLOCK_GLO = 1, ZXC_BLOCK_NUM = 2, ZXC_BLOCK_GHI = 3, + ZXC_BLOCK_DICT = 253, ZXC_BLOCK_SEK = 254, ZXC_BLOCK_EOF = 255 } zxc_block_type_t; @@ -1584,6 +1607,7 @@ typedef struct { size_t opt_scratch_cap; /**< Current capacity of opt_scratch in bytes. */ int checksum_enabled; /**< 1 if checksum calculation/verification is enabled. */ int compression_level; /**< Compression level. */ + size_t dict_size; /**< Dictionary prefill size (0 = no dictionary). */ /* Block-size derived parameters (computed once at init). */ size_t chunk_size; /**< Effective block size in bytes. */ @@ -1741,12 +1765,13 @@ typedef struct { * @param[in] dst_capacity Total capacity of @p dst in bytes. * @param[in] chunk_size Block size to encode in the header. * @param[in] has_checksum Non-zero if the checksum bit must be set. + * @param[in] dict_id Dictionary ID (0 = no dictionary). * * @return Number of bytes written (@c ZXC_FILE_HEADER_SIZE) on success, * or @c ZXC_ERROR_DST_TOO_SMALL if @p dst_capacity is insufficient. */ int zxc_write_file_header(uint8_t* RESTRICT dst, const size_t dst_capacity, const size_t chunk_size, - const int has_checksum); + const int has_checksum, const uint32_t dict_id); /** * @brief Validates and reads the ZXC file header from @p src. @@ -1760,13 +1785,15 @@ int zxc_write_file_header(uint8_t* RESTRICT dst, const size_t dst_capacity, cons * block size. May be @c NULL. * @param[out] out_has_checksum Optional pointer that receives the checksum * flag. May be @c NULL. + * @param[out] out_dict_id Optional pointer that receives the dictionary + * ID (0 if none). May be @c NULL. * * @return @c ZXC_OK on success, or a negative error code (e.g. * @c ZXC_ERROR_SRC_TOO_SMALL, @c ZXC_ERROR_BAD_MAGIC, * @c ZXC_ERROR_BAD_VERSION). 
*/ int zxc_read_file_header(const uint8_t* RESTRICT src, const size_t src_size, size_t* out_block_size, - int* out_has_checksum); + int* out_has_checksum, uint32_t* out_dict_id); /** * @brief Encodes a block header into @p dst. diff --git a/src/lib/zxc_pstream.c b/src/lib/zxc_pstream.c index c09d251d..815d87a4 100644 --- a/src/lib/zxc_pstream.c +++ b/src/lib/zxc_pstream.c @@ -291,7 +291,7 @@ zxc_cstream* zxc_cstream_create(const zxc_compress_opts_t* opts) { */ static int cs_stage_file_header(zxc_cstream* cs) { const int w = zxc_write_file_header(cs->pending, cs->pending_cap, cs->block_size, - cs->opts.checksum_enabled); + cs->opts.checksum_enabled, 0); if (UNLIKELY(w < 0)) return w; // LCOV_EXCL_LINE cs->pending_len = (size_t)w; cs->pending_pos = 0; @@ -903,7 +903,7 @@ static int ds_handle_need_file_header(zxc_dstream* ds, zxc_inbuf_t* in) { size_t bs = 0; int has_csum = 0; - const int rc = zxc_read_file_header(ds->scratch, ds->scratch_used, &bs, &has_csum); + const int rc = zxc_read_file_header(ds->scratch, ds->scratch_used, &bs, &has_csum, NULL); if (UNLIKELY(rc != ZXC_OK)) return ds_set_error(ds, rc); // LCOV_EXCL_LINE ds->block_size = bs; ds->file_has_checksum = has_csum; diff --git a/src/lib/zxc_seekable.c b/src/lib/zxc_seekable.c index 614d3a67..5845e71c 100644 --- a/src/lib/zxc_seekable.c +++ b/src/lib/zxc_seekable.c @@ -29,6 +29,7 @@ #include "../../include/zxc_seekable.h" +#include "../../include/zxc_dict.h" #include "../../include/zxc_error.h" #include "zxc_internal.h" @@ -166,12 +167,20 @@ struct zxc_seekable_s { * fits in 21 bits. 
*/ uint32_t block_size; int file_has_checksums; + uint32_t expected_dict_id; /* dict_id from the file header; 0 = no dictionary */ /* Reusable decompression context (single-threaded path only) */ zxc_cctx_t dctx; int dctx_initialized; + + /* Dictionary (owned copy, freed in zxc_seekable_free) */ + uint8_t* dict; + size_t dict_size; + uint8_t* dict_work; /* [dict | decode_space] bounce buffer */ }; +static int zxc_seekable_install_dict(zxc_seekable* s, const void* dict, size_t dict_size); + /** * @brief Parses the seek table from raw bytes at the end of the archive. * @@ -192,8 +201,10 @@ static zxc_seekable* zxc_seekable_parse(const uint8_t* data, const size_t data_s /* Step 1: validate file header => block_size */ size_t block_size_sz = 0; int file_has_chk = 0; - if (UNLIKELY(zxc_read_file_header(data, data_size, &block_size_sz, &file_has_chk) != ZXC_OK)) - return NULL; + uint32_t header_dict_id = 0; + if (UNLIKELY(zxc_read_file_header(data, data_size, &block_size_sz, &file_has_chk, + &header_dict_id) != ZXC_OK)) + return NULL; // LCOV_EXCL_LINE const uint32_t block_size = (uint32_t)block_size_sz; if (UNLIKELY(block_size == 0)) return NULL; // LCOV_EXCL_LINE @@ -235,6 +246,7 @@ static zxc_seekable* zxc_seekable_parse(const uint8_t* data, const size_t data_s s->num_blocks = num_blocks; s->block_size = block_size; s->file_has_checksums = file_has_chk; + s->expected_dict_id = header_dict_id; s->src = data; s->src_size = (uint64_t)data_size; @@ -249,11 +261,38 @@ static zxc_seekable* zxc_seekable_parse(const uint8_t* data, const size_t data_s // LCOV_EXCL_STOP s->total_decomp = total_decomp; + /* A dictionary, when present, is embedded as a ZXC_BLOCK_DICT block right + * after the file header. Detect it by the block type (no dedicated flag): + * if the first block is a DICT block, load it and start the data blocks + * past it. Otherwise the dictionary must be supplied via + * zxc_seekable_set_dict (its id is already in expected_dict_id). 
*/ + uint64_t data_start = ZXC_FILE_HEADER_SIZE; + if (header_dict_id != 0 && data_size >= ZXC_FILE_HEADER_SIZE + ZXC_BLOCK_HEADER_SIZE) { + zxc_block_header_t dbh; + if (zxc_read_block_header(data + ZXC_FILE_HEADER_SIZE, ZXC_BLOCK_HEADER_SIZE, &dbh) == + ZXC_OK && + dbh.block_type == ZXC_BLOCK_DICT) { + if (UNLIKELY(dbh.comp_size == 0 || dbh.comp_size > ZXC_DICT_SIZE_MAX || + (uint64_t)ZXC_FILE_HEADER_SIZE + ZXC_BLOCK_HEADER_SIZE + dbh.comp_size > + (uint64_t)data_size)) { + zxc_seekable_free(s); + return NULL; + } + const uint8_t* dcontent = data + ZXC_FILE_HEADER_SIZE + ZXC_BLOCK_HEADER_SIZE; + if (UNLIKELY(zxc_dict_id(dcontent, dbh.comp_size) != header_dict_id || + zxc_seekable_install_dict(s, dcontent, dbh.comp_size) != ZXC_OK)) { + zxc_seekable_free(s); + return NULL; + } + data_start = (uint64_t)ZXC_FILE_HEADER_SIZE + ZXC_BLOCK_HEADER_SIZE + dbh.comp_size; + } + } + /* Parse comp_sizes and build compressed prefix sums. * Validate each comp_size against data_size to prevent prefix-sum overflow * and out-of-bounds reads during decompression. 
*/ const uint8_t* ep = seek_block_start + ZXC_BLOCK_HEADER_SIZE; - uint64_t comp_acc = ZXC_FILE_HEADER_SIZE; /* blocks start after file header */ + uint64_t comp_acc = data_start; /* data blocks start after header (+ embedded dict block) */ for (uint32_t i = 0; i < num_blocks; i++) { s->comp_sizes[i] = zxc_le32(ep); ep += sizeof(uint32_t); @@ -326,8 +365,10 @@ zxc_seekable* zxc_seekable_open_reader(const zxc_reader_t* r) { size_t bs_sz = 0; int fhc = 0; - if (UNLIKELY(zxc_read_file_header(header, ZXC_FILE_HEADER_SIZE, &bs_sz, &fhc) != ZXC_OK)) - return NULL; + uint32_t header_dict_id = 0; + if (UNLIKELY(zxc_read_file_header(header, ZXC_FILE_HEADER_SIZE, &bs_sz, &fhc, + &header_dict_id) != ZXC_OK)) + return NULL; // LCOV_EXCL_LINE const uint32_t bs = (uint32_t)bs_sz; if (UNLIKELY(bs == 0)) return NULL; @@ -388,6 +429,7 @@ zxc_seekable* zxc_seekable_open_reader(const zxc_reader_t* r) { s->num_blocks = num_blocks; s->block_size = bs; s->file_has_checksums = fhc; + s->expected_dict_id = header_dict_id; s->comp_sizes = (uint32_t*)ZXC_CALLOC(num_blocks, sizeof(uint32_t)); s->comp_offsets = (uint64_t*)ZXC_CALLOC((size_t)num_blocks + 1, sizeof(uint64_t)); @@ -400,9 +442,43 @@ zxc_seekable* zxc_seekable_open_reader(const zxc_reader_t* r) { } s->total_decomp = total_decomp; + /* Embedded dictionary (ZXC_BLOCK_DICT right after the header): read it via + * the reader, load it, and start the data blocks after it. */ + uint64_t data_start = ZXC_FILE_HEADER_SIZE; + if (header_dict_id != 0) { + uint8_t dbh_buf[ZXC_BLOCK_HEADER_SIZE]; + zxc_block_header_t dbh; + /* Embedded only if the first block is a DICT block (else external). 
*/ + if (r->read_at(r->ctx, dbh_buf, ZXC_BLOCK_HEADER_SIZE, ZXC_FILE_HEADER_SIZE) == + (int64_t)ZXC_BLOCK_HEADER_SIZE && + zxc_read_block_header(dbh_buf, ZXC_BLOCK_HEADER_SIZE, &dbh) == ZXC_OK && + dbh.block_type == ZXC_BLOCK_DICT) { + if (UNLIKELY(dbh.comp_size == 0 || dbh.comp_size > ZXC_DICT_SIZE_MAX)) { + zxc_seekable_free(s); + return NULL; + } + uint8_t* dtmp = (uint8_t*)ZXC_MALLOC(dbh.comp_size); + if (UNLIKELY(!dtmp)) { + zxc_seekable_free(s); + return NULL; + } + if (UNLIKELY(r->read_at(r->ctx, dtmp, dbh.comp_size, + ZXC_FILE_HEADER_SIZE + ZXC_BLOCK_HEADER_SIZE) != + (int64_t)dbh.comp_size || + zxc_dict_id(dtmp, dbh.comp_size) != header_dict_id || + zxc_seekable_install_dict(s, dtmp, dbh.comp_size) != ZXC_OK)) { + ZXC_FREE(dtmp); + zxc_seekable_free(s); + return NULL; + } + ZXC_FREE(dtmp); + data_start = (uint64_t)ZXC_FILE_HEADER_SIZE + ZXC_BLOCK_HEADER_SIZE + dbh.comp_size; + } + } + /* Parse comp_sizes and build prefix sums; validate against archive size. */ const uint8_t* ep = seek_buf + ZXC_BLOCK_HEADER_SIZE; - uint64_t comp_acc = ZXC_FILE_HEADER_SIZE; + uint64_t comp_acc = data_start; for (uint32_t i = 0; i < num_blocks; i++) { s->comp_sizes[i] = zxc_le32(ep); ep += sizeof(uint32_t); @@ -496,10 +572,12 @@ static int zxc_seek_read_block(const zxc_seekable* s, const uint32_t block_idx, int64_t zxc_seekable_decompress_range(zxc_seekable* s, void* dst, const size_t dst_capacity, const uint64_t offset, const size_t len) { - if (UNLIKELY(!s || !dst)) return ZXC_ERROR_NULL_INPUT; if (UNLIKELY(len == 0)) return 0; + if (UNLIKELY(!s || !dst)) return ZXC_ERROR_NULL_INPUT; if (UNLIKELY(dst_capacity < len)) return ZXC_ERROR_DST_TOO_SMALL; if (UNLIKELY(offset + len > s->total_decomp)) return ZXC_ERROR_SRC_TOO_SMALL; + if (UNLIKELY(s->expected_dict_id != 0 && (!s->dict || s->dict_size == 0))) + return ZXC_ERROR_DICT_REQUIRED; /* Initialize decompression context on first use */ if (!s->dctx_initialized) { @@ -509,6 +587,7 @@ int64_t 
zxc_seekable_decompress_range(zxc_seekable* s, void* dst, const size_t d // LCOV_EXCL_STOP s->dctx_initialized = 1; } + s->dctx.dict_size = s->dict_size; /* work_buf is pre-sized to block_size + ZXC_DECOMPRESS_TAIL_PAD by the * matching zxc_cctx_init above. */ @@ -539,9 +618,12 @@ int64_t zxc_seekable_decompress_range(zxc_seekable* s, void* dst, const size_t d // LCOV_EXCL_STOP } - /* Decompress the block */ - const int dec_res = zxc_decompress_chunk_wrapper(&s->dctx, read_buf, (size_t)read_res, - s->dctx.work_buf, work_sz); + /* Decompress the block: when a dictionary is active, decode into the + * dict_work bounce buffer (which has dict content prepended) so that + * match copies referencing dictionary bytes resolve naturally. */ + uint8_t* dec_dst = s->dict_work ? s->dict_work + s->dict_size : s->dctx.work_buf; + const int dec_res = + zxc_decompress_chunk_wrapper(&s->dctx, read_buf, (size_t)read_res, dec_dst, work_sz); if (UNLIKELY(dec_res < 0)) { // LCOV_EXCL_START ZXC_FREE(read_buf); @@ -561,7 +643,7 @@ int64_t zxc_seekable_decompress_range(zxc_seekable* s, void* dst, const size_t d const size_t avail = (size_t)dec_res - skip; const size_t copy = (avail < remaining) ? 
avail : remaining; - ZXC_MEMCPY(out, s->dctx.work_buf + skip, copy); + ZXC_MEMCPY(out, dec_dst + skip, copy); out += copy; remaining -= copy; } @@ -637,13 +719,29 @@ static void* zxc_seek_mt_worker(void* arg) { return NULL; } // LCOV_EXCL_STOP + dctx.dict_size = s->dict_size; const size_t work_sz = (size_t)s->block_size + ZXC_DECOMPRESS_TAIL_PAD; + /* Thread-local dict bounce buffer: [dict_content | decode_space] */ + uint8_t* dict_work = NULL; + if (s->dict_size > 0 && s->dict) { + dict_work = (uint8_t*)ZXC_MALLOC(s->dict_size + work_sz); + if (UNLIKELY(!dict_work)) { + // LCOV_EXCL_START + zxc_cctx_free(&dctx); + job->result = ZXC_ERROR_MEMORY; + return NULL; + // LCOV_EXCL_STOP + } + ZXC_MEMCPY(dict_work, s->dict, s->dict_size); + } + /* Read compressed block */ const uint32_t csz = s->comp_sizes[bi]; uint8_t* const read_buf = (uint8_t*)ZXC_MALLOC(csz + ZXC_PAD_SIZE); // LCOV_EXCL_START if (UNLIKELY(!read_buf)) { + ZXC_FREE(dict_work); zxc_cctx_free(&dctx); job->result = ZXC_ERROR_MEMORY; return NULL; @@ -654,24 +752,28 @@ static void* zxc_seek_mt_worker(void* arg) { // LCOV_EXCL_START if (UNLIKELY(read_res < 0)) { ZXC_FREE(read_buf); + ZXC_FREE(dict_work); zxc_cctx_free(&dctx); job->result = read_res; return NULL; } // LCOV_EXCL_STOP - /* Decompress */ + /* Decompress: use dict bounce buffer when dictionary is active */ + uint8_t* dec_dst = dict_work ? 
dict_work + s->dict_size : dctx.work_buf; const int dec_res = - zxc_decompress_chunk_wrapper(&dctx, read_buf, (size_t)read_res, dctx.work_buf, work_sz); + zxc_decompress_chunk_wrapper(&dctx, read_buf, (size_t)read_res, dec_dst, work_sz); ZXC_FREE(read_buf); // LCOV_EXCL_START if (UNLIKELY(dec_res < 0)) { + ZXC_FREE(dict_work); zxc_cctx_free(&dctx); job->result = dec_res; return NULL; } if (UNLIKELY((size_t)dec_res < job->skip + job->copy_len)) { + ZXC_FREE(dict_work); zxc_cctx_free(&dctx); job->result = ZXC_ERROR_CORRUPT_DATA; return NULL; @@ -679,8 +781,9 @@ static void* zxc_seek_mt_worker(void* arg) { // LCOV_EXCL_STOP /* Copy the requested portion directly into the caller's output buffer */ - ZXC_MEMCPY(job->dst, dctx.work_buf + job->skip, job->copy_len); + ZXC_MEMCPY(job->dst, dec_dst + job->skip, job->copy_len); + ZXC_FREE(dict_work); zxc_cctx_free(&dctx); job->result = 0; return NULL; @@ -688,10 +791,12 @@ static void* zxc_seek_mt_worker(void* arg) { int64_t zxc_seekable_decompress_range_mt(zxc_seekable* s, void* dst, const size_t dst_capacity, const uint64_t offset, const size_t len, int n_threads) { - if (UNLIKELY(!s || !dst)) return ZXC_ERROR_NULL_INPUT; if (UNLIKELY(len == 0)) return 0; + if (UNLIKELY(!s || !dst)) return ZXC_ERROR_NULL_INPUT; if (UNLIKELY(dst_capacity < len)) return ZXC_ERROR_DST_TOO_SMALL; if (UNLIKELY(offset + len > s->total_decomp)) return ZXC_ERROR_SRC_TOO_SMALL; + if (UNLIKELY(s->expected_dict_id != 0 && (!s->dict || s->dict_size == 0))) + return ZXC_ERROR_DICT_REQUIRED; /* Find block range - O(1) division */ const uint32_t blk_start = zxc_seek_find_block(s->block_size, offset); @@ -808,12 +913,50 @@ int64_t zxc_seekable_decompress_range_mt(zxc_seekable* s, void* dst, const size_ void zxc_seekable_free(zxc_seekable* s) { if (!s) return; if (s->dctx_initialized) zxc_cctx_free(&s->dctx); + ZXC_FREE(s->dict); + ZXC_FREE(s->dict_work); ZXC_FREE(s->comp_sizes); ZXC_FREE(s->comp_offsets); ZXC_FREE(s->owned_reader_ctx); ZXC_FREE(s); } +/* 
Install a dictionary into the handle: owned copy + [dict | decode] bounce + * buffer. No id validation (callers do it where needed). */ +static int zxc_seekable_install_dict(zxc_seekable* s, const void* dict, const size_t dict_size) { + ZXC_FREE(s->dict); + ZXC_FREE(s->dict_work); + s->dict = NULL; + s->dict_work = NULL; + s->dict_size = 0; + + s->dict = (uint8_t*)ZXC_MALLOC(dict_size); + if (UNLIKELY(!s->dict)) return ZXC_ERROR_MEMORY; + ZXC_MEMCPY(s->dict, dict, dict_size); + s->dict_size = dict_size; + + const size_t work_sz = dict_size + (size_t)s->block_size + ZXC_DECOMPRESS_TAIL_PAD; + s->dict_work = (uint8_t*)ZXC_MALLOC(work_sz); + if (UNLIKELY(!s->dict_work)) { + // LCOV_EXCL_START + ZXC_FREE(s->dict); + s->dict = NULL; + s->dict_size = 0; + return ZXC_ERROR_MEMORY; + // LCOV_EXCL_STOP + } + ZXC_MEMCPY(s->dict_work, dict, dict_size); + return ZXC_OK; +} + +int zxc_seekable_set_dict(zxc_seekable* s, const void* dict, const size_t dict_size) { + if (UNLIKELY(!s || !dict || dict_size == 0)) return ZXC_ERROR_NULL_INPUT; + if (UNLIKELY(dict_size > ZXC_DICT_SIZE_MAX)) return ZXC_ERROR_DICT_TOO_LARGE; + if (UNLIKELY(s->expected_dict_id != 0 && zxc_dict_id(dict, dict_size) != s->expected_dict_id)) + return ZXC_ERROR_DICT_MISMATCH; + return zxc_seekable_install_dict(s, dict, dict_size); +} + void zxc_seekable_attach_owned_ctx(zxc_seekable* s, void* ctx) { if (s) s->owned_reader_ctx = ctx; } diff --git a/tests/fuzz_dict.c b/tests/fuzz_dict.c new file mode 100644 index 00000000..a32be656 --- /dev/null +++ b/tests/fuzz_dict.c @@ -0,0 +1,90 @@ +/* + * ZXC - High-performance lossless compression + * + * Copyright (c) 2025-2026 Bertrand Lebonnois and contributors. + * SPDX-License-Identifier: BSD-3-Clause + */ + +/* + * Fuzz target: dictionary roundtrip. + * + * The fuzzer input is split into a dictionary prefix and block data. + * The first 2 bytes encode the dict size (u16 LE, capped at 32 KB). + * The remainder is the block data to compress with that dictionary. 
+ * The roundtrip (compress -> decompress) must produce identical output. + */ + +#include +#include +#include +#include +#include + +#include "../include/zxc_buffer.h" + +#define FUZZ_DICT_MAX_INPUT (256 << 10) /* 256 KiB */ +#define FUZZ_DICT_MAX_DICT (32 << 10) /* 32 KiB */ + +int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) { + static void* comp_buf = NULL; + static size_t comp_cap = 0; + static void* decomp_buf = NULL; + static size_t decomp_cap = 0; + + if (size < 4) return 0; + if (size > FUZZ_DICT_MAX_INPUT) return 0; + + /* First 2 bytes: dict_size (u16 LE, capped). Byte 2: level. */ + size_t dict_size = (size_t)(data[0] | (data[1] << 8)); + if (dict_size > FUZZ_DICT_MAX_DICT) dict_size = FUZZ_DICT_MAX_DICT; + const int level = (data[2] % 6) + 1; + data += 3; + size -= 3; + + if (dict_size >= size) dict_size = size / 2; + const uint8_t* dict = data; + const uint8_t* src = data + dict_size; + const size_t src_size = size - dict_size; + + if (src_size == 0) return 0; + + const uint64_t bound64 = zxc_compress_bound(src_size); + if (bound64 == 0 || bound64 > SIZE_MAX) return 0; + const size_t bound = (size_t)bound64; + if (bound > comp_cap) { + void* nb = realloc(comp_buf, bound); + if (!nb) return 0; + comp_buf = nb; + comp_cap = bound; + } + + zxc_compress_opts_t copts = { + .level = level, + .checksum_enabled = 1, + .dict = dict_size > 0 ? dict : NULL, + .dict_size = dict_size, + }; + const int64_t csize = zxc_compress(src, src_size, comp_buf, bound, &copts); + if (csize < 0) return 0; + + if (src_size > decomp_cap) { + void* nb = realloc(decomp_buf, src_size); + if (!nb) return 0; + decomp_buf = nb; + decomp_cap = src_size; + } + + zxc_decompress_opts_t dopts = { + .checksum_enabled = 1, + .dict = dict_size > 0 ? 
dict : NULL, + .dict_size = dict_size, + }; + const int64_t dsize = zxc_decompress(comp_buf, (size_t)csize, decomp_buf, src_size, &dopts); + + if (dsize >= 0) { + assert((size_t)dsize == src_size); + assert(memcmp(src, decomp_buf, src_size) == 0); + } + + return 0; +} diff --git a/tests/test_cli.sh b/tests/test_cli.sh index b6964f76..1009722f 100755 --- a/tests/test_cli.sh +++ b/tests/test_cli.sh @@ -890,5 +890,76 @@ else log_fail "List command on seekable archive failed" fi +# 25. Dictionary Tests (--auto-dict, embedded) +echo "Testing Dictionary (--auto-dict)..." + +# 25.1 Round-trip with an auto-trained, embedded dictionary. +# The dictionary is trained from the input and stored in the archive; no +# external dictionary file is needed at decompression. +echo " Testing auto-dict round-trip (small blocks)..." +"$ZXC_BIN" -3 -B 4K --auto-dict -c -k "$TEST_FILE_ARG" > "$TEST_DIR/test_dict.zxc" +if [ ! -s "$TEST_DIR/test_dict.zxc" ]; then + log_fail "Auto-dict compression failed" +fi +"$ZXC_BIN" -d -c "$TEST_DIR/test_dict.zxc" > "$TEST_DIR/test_dict.dec" +if cmp -s "$TEST_FILE" "$TEST_DIR/test_dict.dec"; then + log_pass "Auto-dict round-trip (no external dict at decode)" +else + log_fail "Auto-dict round-trip content mismatch" +fi + +# 25.2 List shows dict_id for an embedded-dict archive +echo " Testing list with dict_id..." +OUT=$("$ZXC_BIN" -l "$TEST_DIR/test_dict.zxc") +if [[ "$OUT" == *"Dict ID"* ]] && [[ "$OUT" == *"0x"* ]]; then + log_pass "List shows dict_id" +else + log_fail "List should show dict_id column with 0x value" +fi + +# 25.3 List without dict shows dash +echo " Testing list without dict shows dash..." 
# 25.3 (body) An archive written without a dictionary must list a dash in the
# Dict ID column.
"$ZXC_BIN" -3 -c -k "$TEST_FILE_ARG" > "$TEST_DIR/test_nodict2.zxc"
OUT=$("$ZXC_BIN" -l "$TEST_DIR/test_nodict2.zxc")
if [[ "$OUT" == *"Dict ID"* && "$OUT" == *" - "* ]]; then
    log_pass "List without dict shows dash"
else
    log_fail "List without dict should show dash in Dict ID column"
fi

# 25.4 The JSON listing of a dictionary archive carries a dict_id field.
echo " Testing JSON list with dict_id..."
JSON_OUT=$("$ZXC_BIN" -l -j "$TEST_DIR/test_dict.zxc")
if [[ "$JSON_OUT" == *'"dict_id"'* && "$JSON_OUT" == *"0x"* ]]; then
    log_pass "JSON list shows dict_id"
else
    log_fail "JSON list should contain dict_id field"
fi

# 25.5 Auto-trained dictionary combined with the seekable format.
echo " Testing auto-dict + seekable (--auto-dict -S)..."
"$ZXC_BIN" -3 -B 4K --auto-dict -S -c -k "$TEST_FILE_ARG" > "$TEST_DIR/test_dict_seek.zxc"
"$ZXC_BIN" -d -c "$TEST_DIR/test_dict_seek.zxc" > "$TEST_DIR/test_dict_seek.dec"
if cmp -s "$TEST_FILE" "$TEST_DIR/test_dict_seek.dec"; then
    log_pass "Auto-dict + seekable"
else
    log_fail "Auto-dict + seekable round-trip failed"
fi

# 25.6 Auto-trained dictionary round-trip at every compression level.
echo " Testing auto-dict across all levels..."
DICT_ALL_OK=1
for LEVEL in {1..6}; do
    "$ZXC_BIN" "-${LEVEL}" -B 4K --auto-dict -c -k "$TEST_FILE_ARG" > "$TEST_DIR/test_dict_lvl${LEVEL}.zxc"
    "$ZXC_BIN" -d -c "$TEST_DIR/test_dict_lvl${LEVEL}.zxc" > "$TEST_DIR/test_dict_lvl${LEVEL}.dec"
    if ! cmp -s "$TEST_FILE" "$TEST_DIR/test_dict_lvl${LEVEL}.dec"; then
        DICT_ALL_OK=0
        log_fail "Auto-dict level $LEVEL round-trip failed"
    fi
done
if (( DICT_ALL_OK == 1 )); then
    log_pass "Auto-dict across all levels (1-6)"
fi

echo "All tests passed!"
exit 0 diff --git a/tests/test_common.h b/tests/test_common.h index f76caec0..84a8bcb7 100644 --- a/tests/test_common.h +++ b/tests/test_common.h @@ -81,6 +81,7 @@ int test_static_ctx_size_query(void); int test_static_ctx_workspace_too_small(void); int test_static_ctx_block_size_locked(void); int test_static_ctx_null_inputs(void); +int test_static_ctx_embedded_dict(void); /* Stream API */ int test_null_output_decompression(void); @@ -182,4 +183,21 @@ int test_legacy_header(void); int test_error_name(void); int test_library_info_api(void); +/* Dictionary */ +int test_dict_id_deterministic(void); +int test_dict_get_id_apis(void); +int test_dict_buffer_roundtrip(void); +int test_dict_block_roundtrip(void); +int test_dict_mismatch_error(void); +int test_dict_required_error(void); +int test_dict_no_dict_compat(void); +int test_dict_stream_roundtrip(void); +int test_dict_large_dict_roundtrip(void); +int test_dict_seekable_roundtrip(void); +int test_dict_train_roundtrip(void); +int test_dict_train_no_frequent_patterns(void); +int test_dict_seekable_mt_roundtrip(void); +int test_dict_stream_dict_id_checks(void); +int test_dict_seekable_dict_id_checks(void); + #endif /* ZXC_TEST_COMMON_H */ diff --git a/tests/test_dict.c b/tests/test_dict.c new file mode 100644 index 00000000..4b905bc4 --- /dev/null +++ b/tests/test_dict.c @@ -0,0 +1,870 @@ +/* + * ZXC - High-performance lossless compression + * + * Copyright (c) 2025-2026 Bertrand Lebonnois and contributors. 
+ * SPDX-License-Identifier: BSD-3-Clause + */ + +#include "test_common.h" + +#include "../include/zxc_dict.h" + +static void gen_dict_friendly_data(uint8_t* buf, size_t size, const uint8_t* dict, + size_t dict_size) { + for (size_t i = 0; i < size; i++) { + if (i % 7 < 5 && dict_size > 5) { + size_t off = (i * 31) % (dict_size - 5); + buf[i] = dict[off + (i % 5)]; + } else { + buf[i] = (uint8_t)(i ^ (i >> 8)); + } + } +} + +int test_dict_id_deterministic(void) { + printf("=== TEST: Dict - dict_id is deterministic ===\n"); + + const char* data = "some repeatable dictionary content"; + size_t size = strlen(data); + + uint32_t id1 = zxc_dict_id(data, size); + uint32_t id2 = zxc_dict_id(data, size); + + if (id1 != id2 || id1 == 0) { + printf(" [FAIL] dict_id not deterministic or zero: %u vs %u\n", id1, id2); + return 0; + } + + uint32_t id_null = zxc_dict_id(NULL, 0); + if (id_null != 0) { + printf(" [FAIL] dict_id(NULL, 0) should be 0, got %u\n", id_null); + return 0; + } + + printf("PASS\n\n"); + return 1; +} + +int test_dict_get_id_apis(void) { + printf("=== TEST: Dict - zxc_get_dict_id (archive header) ===\n"); + + const uint8_t dict[] = "dictionary content for get_id test"; + const size_t dict_size = sizeof(dict) - 1; + const uint32_t expected_id = zxc_dict_id(dict, dict_size); + + /* Compress with dict and verify zxc_get_dict_id reads it back */ + const uint8_t src[] = "some data to compress with dict for id test purposes"; + const size_t src_size = sizeof(src) - 1; + size_t comp_bound = (size_t)zxc_compress_bound(src_size); + uint8_t* compressed = (uint8_t*)malloc(comp_bound); + + zxc_compress_opts_t copts = {.level = 1, .dict = dict, .dict_size = dict_size}; + int64_t comp_size = zxc_compress(src, src_size, compressed, comp_bound, &copts); + if (comp_size <= 0) { + printf(" [FAIL] compress returned %lld\n", (long long)comp_size); + free(compressed); + return 0; + } + + uint32_t got_id = zxc_get_dict_id(compressed, (size_t)comp_size); + if (got_id != 
expected_id) { + printf(" [FAIL] zxc_get_dict_id: got 0x%08X, expected 0x%08X\n", got_id, expected_id); + free(compressed); + return 0; + } + printf(" [PASS] zxc_get_dict_id returns 0x%08X\n", got_id); + + /* Compress without dict: should return 0 */ + zxc_compress_opts_t copts2 = {.level = 1}; + int64_t comp2 = zxc_compress(src, src_size, compressed, comp_bound, &copts2); + if (comp2 > 0 && zxc_get_dict_id(compressed, (size_t)comp2) != 0) { + printf(" [FAIL] zxc_get_dict_id should return 0 for no-dict file\n"); + free(compressed); + return 0; + } + printf(" [PASS] zxc_get_dict_id returns 0 for no-dict file\n"); + free(compressed); + + printf("PASS\n\n"); + return 1; +} + +int test_dict_buffer_roundtrip(void) { + printf("=== TEST: Dict - buffer API roundtrip (all levels) ===\n"); + + const uint8_t dict_content[] = + "The quick brown fox jumps over the lazy dog. " + "Lorem ipsum dolor sit amet, consectetur adipiscing elit. " + "Pack my box with five dozen liquor jugs. " + "How vexingly quick daft zebras jump!"; + const size_t dict_size = sizeof(dict_content) - 1; + + const size_t src_size = 4096; + uint8_t* src = (uint8_t*)malloc(src_size); + gen_dict_friendly_data(src, src_size, dict_content, dict_size); + + size_t comp_bound = (size_t)zxc_compress_bound(src_size); + uint8_t* compressed = (uint8_t*)malloc(comp_bound); + uint8_t* decompressed = (uint8_t*)malloc(src_size); + + for (int level = 1; level <= 6; level++) { + zxc_compress_opts_t copts = { + .level = level, + .checksum_enabled = 1, + .dict = dict_content, + .dict_size = dict_size, + }; + int64_t comp_size = zxc_compress(src, src_size, compressed, comp_bound, &copts); + if (comp_size <= 0) { + printf(" [FAIL] level %d: compress returned %lld\n", level, (long long)comp_size); + free(src); + free(compressed); + free(decompressed); + return 0; + } + + zxc_decompress_opts_t dopts = { + .checksum_enabled = 1, + .dict = dict_content, + .dict_size = dict_size, + }; + int64_t dec_size = zxc_decompress(compressed, 
(size_t)comp_size, decompressed, src_size, + &dopts); + if (dec_size != (int64_t)src_size) { + printf(" [FAIL] level %d: decompress returned %lld, expected %zu\n", level, + (long long)dec_size, src_size); + free(src); + free(compressed); + free(decompressed); + return 0; + } + + if (memcmp(src, decompressed, src_size) != 0) { + printf(" [FAIL] level %d: content mismatch\n", level); + free(src); + free(compressed); + free(decompressed); + return 0; + } + printf(" [PASS] level %d: %zu -> %lld bytes\n", level, src_size, (long long)comp_size); + } + + free(src); + free(compressed); + free(decompressed); + printf("PASS\n\n"); + return 1; +} + +int test_dict_block_roundtrip(void) { + printf("=== TEST: Dict - block API roundtrip (all levels) ===\n"); + + const uint8_t dict_content[] = + "The quick brown fox jumps over the lazy dog. " + "Lorem ipsum dolor sit amet, consectetur adipiscing elit. " + "Pack my box with five dozen liquor jugs."; + const size_t dict_size = sizeof(dict_content) - 1; + + const size_t src_size = 4096; + uint8_t* src = (uint8_t*)malloc(src_size); + gen_dict_friendly_data(src, src_size, dict_content, dict_size); + + const size_t comp_bound = (size_t)zxc_compress_block_bound(src_size); + uint8_t* compressed = (uint8_t*)malloc(comp_bound); + uint8_t* decompressed = (uint8_t*)malloc(src_size); + zxc_cctx* cctx = zxc_create_cctx(NULL); + zxc_dctx* dctx = zxc_create_dctx(); + + int result = 0; + if (!src || !compressed || !decompressed || !cctx || !dctx) { + printf(" [FAIL] allocation failed\n"); + goto cleanup; + } + + for (int level = 1; level <= 6; level++) { + zxc_compress_opts_t copts = { + .level = level, + .checksum_enabled = 1, + .dict = dict_content, + .dict_size = dict_size, + }; + int64_t comp_size = + zxc_compress_block(cctx, src, src_size, compressed, comp_bound, &copts); + if (comp_size <= 0) { + printf(" [FAIL] level %d: compress_block returned %lld\n", level, + (long long)comp_size); + goto cleanup; + } + + zxc_decompress_opts_t dopts = { 
+ .checksum_enabled = 1, + .dict = dict_content, + .dict_size = dict_size, + }; + int64_t dec_size = zxc_decompress_block(dctx, compressed, (size_t)comp_size, decompressed, + src_size, &dopts); + if (dec_size != (int64_t)src_size || memcmp(src, decompressed, src_size) != 0) { + printf(" [FAIL] level %d: block roundtrip mismatch (dec_size=%lld)\n", level, + (long long)dec_size); + goto cleanup; + } + printf(" [PASS] level %d: %zu -> %lld bytes\n", level, src_size, (long long)comp_size); + } + + result = 1; + +cleanup: + zxc_free_cctx(cctx); /* safe with NULL */ + zxc_free_dctx(dctx); /* safe with NULL */ + free(src); + free(compressed); + free(decompressed); + if (result) printf("PASS\n\n"); + return result; +} + +int test_dict_mismatch_error(void) { + printf("=== TEST: Dict - dict_id mismatch error ===\n"); + + const uint8_t dict[] = "correct dictionary content"; + const uint8_t wrong_dict[] = "wrong dictionary contentz"; + const size_t dict_size = sizeof(dict) - 1; + + const uint8_t src[] = "some data to compress with dict"; + const size_t src_size = sizeof(src) - 1; + + size_t comp_bound = (size_t)zxc_compress_bound(src_size); + uint8_t* compressed = (uint8_t*)malloc(comp_bound); + + zxc_compress_opts_t copts = {.level = 3, .dict = dict, .dict_size = dict_size}; + int64_t comp_size = zxc_compress(src, src_size, compressed, comp_bound, &copts); + if (comp_size <= 0) { + printf(" [FAIL] compress failed: %lld\n", (long long)comp_size); + free(compressed); + return 0; + } + + uint8_t decompressed[256]; + zxc_decompress_opts_t dopts = {.dict = wrong_dict, .dict_size = sizeof(wrong_dict) - 1}; + int64_t rc = zxc_decompress(compressed, (size_t)comp_size, decompressed, sizeof(decompressed), + &dopts); + if (rc != ZXC_ERROR_DICT_MISMATCH) { + printf(" [FAIL] expected DICT_MISMATCH, got %lld (%s)\n", (long long)rc, + zxc_error_name((int)rc)); + free(compressed); + return 0; + } + + free(compressed); + printf("PASS\n\n"); + return 1; +} + +int test_dict_required_error(void) 
{ + printf("=== TEST: Dict - dict required error ===\n"); + + const uint8_t dict[] = "required dictionary"; + const size_t dict_size = sizeof(dict) - 1; + + const uint8_t src[] = "data needing a dict"; + const size_t src_size = sizeof(src) - 1; + + size_t comp_bound = (size_t)zxc_compress_bound(src_size); + uint8_t* compressed = (uint8_t*)malloc(comp_bound); + + zxc_compress_opts_t copts = {.level = 3, .dict = dict, .dict_size = dict_size}; + int64_t comp_size = zxc_compress(src, src_size, compressed, comp_bound, &copts); + if (comp_size <= 0) { + printf(" [FAIL] compress failed: %lld\n", (long long)comp_size); + free(compressed); + return 0; + } + + uint8_t decompressed[256]; + zxc_decompress_opts_t dopts = {0}; + int64_t rc = zxc_decompress(compressed, (size_t)comp_size, decompressed, sizeof(decompressed), + &dopts); + if (rc != ZXC_ERROR_DICT_REQUIRED) { + printf(" [FAIL] expected DICT_REQUIRED, got %lld (%s)\n", (long long)rc, + zxc_error_name((int)rc)); + free(compressed); + return 0; + } + + free(compressed); + printf("PASS\n\n"); + return 1; +} + +int test_dict_no_dict_compat(void) { + printf("=== TEST: Dict - no-dict files decompress normally ===\n"); + + const uint8_t src[] = "data compressed without any dictionary at all, just normal data"; + const size_t src_size = sizeof(src) - 1; + + size_t comp_bound = (size_t)zxc_compress_bound(src_size); + uint8_t* compressed = (uint8_t*)malloc(comp_bound); + + zxc_compress_opts_t copts = {.level = 3, .checksum_enabled = 1}; + int64_t comp_size = zxc_compress(src, src_size, compressed, comp_bound, &copts); + if (comp_size <= 0) { + printf(" [FAIL] compress failed\n"); + free(compressed); + return 0; + } + + uint8_t decompressed[256]; + zxc_decompress_opts_t dopts = {.checksum_enabled = 1}; + int64_t dec_size = zxc_decompress(compressed, (size_t)comp_size, decompressed, + sizeof(decompressed), &dopts); + if (dec_size != (int64_t)src_size || memcmp(src, decompressed, src_size) != 0) { + printf(" [FAIL] roundtrip 
without dict failed\n"); + free(compressed); + return 0; + } + + free(compressed); + printf("PASS\n\n"); + return 1; +} + +int test_dict_stream_roundtrip(void) { + printf("=== TEST: Dict - stream API roundtrip ===\n"); + + const uint8_t dict_content[] = + "The quick brown fox jumps over the lazy dog. " + "Lorem ipsum dolor sit amet, consectetur adipiscing elit."; + const size_t dict_size = sizeof(dict_content) - 1; + + const size_t src_size = 8192; + uint8_t* src = (uint8_t*)malloc(src_size); + gen_dict_friendly_data(src, src_size, dict_content, dict_size); + + FILE* f_src = tmpfile(); + FILE* f_comp = tmpfile(); + FILE* f_dec = tmpfile(); + if (!f_src || !f_comp || !f_dec) { + printf(" [FAIL] tmpfile() failed\n"); + free(src); + return 0; + } + + fwrite(src, 1, src_size, f_src); + rewind(f_src); + + zxc_compress_opts_t copts = { + .level = ZXC_LEVEL_DEFAULT, + .checksum_enabled = 1, + .dict = dict_content, + .dict_size = dict_size, + }; + int64_t comp_sz = zxc_stream_compress(f_src, f_comp, &copts); + if (comp_sz <= 0) { + printf(" [FAIL] stream_compress returned %lld\n", (long long)comp_sz); + fclose(f_src); + fclose(f_comp); + fclose(f_dec); + free(src); + return 0; + } + + rewind(f_comp); + zxc_decompress_opts_t dopts = { + .checksum_enabled = 1, + .dict = dict_content, + .dict_size = dict_size, + }; + int64_t dec_sz = zxc_stream_decompress(f_comp, f_dec, &dopts); + if (dec_sz != (int64_t)src_size) { + printf(" [FAIL] stream_decompress returned %lld, expected %zu\n", (long long)dec_sz, + src_size); + fclose(f_src); + fclose(f_comp); + fclose(f_dec); + free(src); + return 0; + } + + rewind(f_dec); + uint8_t* result = (uint8_t*)malloc(src_size); + fread(result, 1, src_size, f_dec); + int ok = (memcmp(src, result, src_size) == 0); + + fclose(f_src); + fclose(f_comp); + fclose(f_dec); + free(result); + free(src); + + if (!ok) { + printf(" [FAIL] content mismatch\n"); + return 0; + } + + printf("PASS\n\n"); + return 1; +} + +int test_dict_large_dict_roundtrip(void) 
{ + printf("=== TEST: Dict - large dict (32KB) with small blocks (4KB) ===\n"); + + uint8_t* dict = (uint8_t*)malloc(32768); + for (size_t i = 0; i < 32768; i++) dict[i] = (uint8_t)(i * 7 + 13); + const size_t dict_size = 32768; + + const size_t src_size = 4096; + uint8_t* src = (uint8_t*)malloc(src_size); + gen_dict_friendly_data(src, src_size, dict, dict_size); + + size_t comp_bound = (size_t)zxc_compress_bound(src_size); + uint8_t* compressed = (uint8_t*)malloc(comp_bound); + uint8_t* decompressed = (uint8_t*)malloc(src_size); + + for (int level = 1; level <= 6; level++) { + zxc_compress_opts_t copts = { + .level = level, .checksum_enabled = 1, .dict = dict, .dict_size = dict_size, + }; + int64_t comp_size = zxc_compress(src, src_size, compressed, comp_bound, &copts); + if (comp_size <= 0) { + printf(" [FAIL] level %d: compress returned %lld (%s)\n", level, (long long)comp_size, + zxc_error_name((int)comp_size)); + free(src); free(compressed); free(decompressed); free(dict); + return 0; + } + zxc_decompress_opts_t dopts = {.checksum_enabled = 1, .dict = dict, .dict_size = dict_size}; + int64_t dec_size = zxc_decompress(compressed, (size_t)comp_size, decompressed, src_size, + &dopts); + if (dec_size != (int64_t)src_size || memcmp(src, decompressed, src_size) != 0) { + printf(" [FAIL] level %d: dec_size=%lld err=%s\n", level, (long long)dec_size, + dec_size < 0 ? 
zxc_error_name((int)dec_size) : "content mismatch"); + free(src); free(compressed); free(decompressed); free(dict); + return 0; + } + printf(" [PASS] level %d\n", level); + } + + free(src); free(compressed); free(decompressed); free(dict); + printf("PASS\n\n"); + return 1; +} + +int test_dict_train_roundtrip(void) { + printf("=== TEST: Dict - train then compress/decompress ===\n"); + + const char* json_samples[] = { + "{\"id\":1,\"name\":\"alice\",\"email\":\"alice@example.com\",\"active\":true}", + "{\"id\":2,\"name\":\"bob\",\"email\":\"bob@example.com\",\"active\":false}", + "{\"id\":3,\"name\":\"carol\",\"email\":\"carol@example.com\",\"active\":true}", + "{\"id\":4,\"name\":\"dave\",\"email\":\"dave@example.com\",\"active\":true}", + "{\"id\":5,\"name\":\"eve\",\"email\":\"eve@example.com\",\"active\":false}", + "{\"id\":6,\"name\":\"frank\",\"email\":\"frank@example.com\",\"active\":true}", + "{\"id\":7,\"name\":\"grace\",\"email\":\"grace@example.com\",\"active\":false}", + "{\"id\":8,\"name\":\"hank\",\"email\":\"hank@example.com\",\"active\":true}", + }; + const size_t n_samples = sizeof(json_samples) / sizeof(json_samples[0]); + const void* sample_ptrs[8]; + size_t sample_sizes[8]; + for (size_t i = 0; i < n_samples; i++) { + sample_ptrs[i] = json_samples[i]; + sample_sizes[i] = strlen(json_samples[i]); + } + + uint8_t dict_buf[4096]; + int64_t dict_sz = + zxc_train_dict(sample_ptrs, sample_sizes, n_samples, dict_buf, sizeof(dict_buf)); + if (dict_sz <= 0) { + printf(" [FAIL] train_dict returned %lld\n", (long long)dict_sz); + return 0; + } + printf(" trained dict: %lld bytes\n", (long long)dict_sz); + + const char* test_input = + "{\"id\":99,\"name\":\"zara\",\"email\":\"zara@example.com\",\"active\":true}"; + const size_t src_size = strlen(test_input); + + size_t comp_bound = (size_t)zxc_compress_bound(src_size); + uint8_t* compressed = (uint8_t*)malloc(comp_bound); + + zxc_compress_opts_t copts = { + .level = ZXC_LEVEL_DEFAULT, + .checksum_enabled = 1, 
+ .dict = dict_buf, + .dict_size = (size_t)dict_sz, + }; + int64_t comp_size = zxc_compress(test_input, src_size, compressed, comp_bound, &copts); + if (comp_size <= 0) { + printf(" [FAIL] compress returned %lld\n", (long long)comp_size); + free(compressed); + return 0; + } + + zxc_compress_opts_t copts_nodict = {.level = ZXC_LEVEL_DEFAULT, .checksum_enabled = 1}; + uint8_t* comp_nodict = (uint8_t*)malloc(comp_bound); + int64_t comp_nodict_sz = + zxc_compress(test_input, src_size, comp_nodict, comp_bound, &copts_nodict); + printf(" with dict: %lld bytes, without: %lld bytes (input: %zu)\n", (long long)comp_size, + (long long)comp_nodict_sz, src_size); + free(comp_nodict); + + uint8_t decompressed[256]; + zxc_decompress_opts_t dopts = { + .checksum_enabled = 1, + .dict = dict_buf, + .dict_size = (size_t)dict_sz, + }; + int64_t dec_size = zxc_decompress(compressed, (size_t)comp_size, decompressed, + sizeof(decompressed), &dopts); + free(compressed); + + if (dec_size != (int64_t)src_size || memcmp(test_input, decompressed, src_size) != 0) { + printf(" [FAIL] roundtrip mismatch\n"); + return 0; + } + + printf("PASS\n\n"); + return 1; +} + +int test_dict_train_no_frequent_patterns(void) { + printf("=== TEST: Dict - train fallback when no frequent k-grams ===\n"); + + /* A strictly increasing byte sequence has all-distinct 5-grams, so no + * k-gram repeats and the trainer finds zero scorable segments. This forces + * the n_segs == 0 fallback: copy the tail of the corpus into the dict. */ + uint8_t corpus[64]; + for (size_t i = 0; i < sizeof(corpus); i++) corpus[i] = (uint8_t)i; + + const void* sample_ptrs[1] = {corpus}; + const size_t sample_sizes[1] = {sizeof(corpus)}; + + /* Case 1: capacity >= corpus_size -> copy == corpus_size, dict == whole corpus. 
*/ + uint8_t dict_big[256]; + int64_t sz = zxc_train_dict(sample_ptrs, sample_sizes, 1, dict_big, sizeof(dict_big)); + if (sz != (int64_t)sizeof(corpus)) { + printf(" [FAIL] expected %zu bytes (full corpus), got %lld\n", sizeof(corpus), + (long long)sz); + return 0; + } + if (memcmp(dict_big, corpus, sizeof(corpus)) != 0) { + printf(" [FAIL] dict content does not match corpus tail\n"); + return 0; + } + printf(" [PASS] full-corpus fallback (%lld bytes)\n", (long long)sz); + + /* Case 2: capacity < corpus_size -> copy == capacity, dict == last `cap` bytes. */ + const size_t cap = 16; + uint8_t dict_small[16]; + sz = zxc_train_dict(sample_ptrs, sample_sizes, 1, dict_small, cap); + if (sz != (int64_t)cap) { + printf(" [FAIL] expected %zu bytes (capped), got %lld\n", cap, (long long)sz); + return 0; + } + if (memcmp(dict_small, corpus + sizeof(corpus) - cap, cap) != 0) { + printf(" [FAIL] capped dict does not match corpus tail\n"); + return 0; + } + printf(" [PASS] capped tail fallback (%lld bytes)\n", (long long)sz); + + printf("PASS\n\n"); + return 1; +} + +int test_dict_seekable_roundtrip(void) { + printf("=== TEST: Dict - seekable API roundtrip ===\n"); + + const uint8_t dict_content[] = + "The quick brown fox jumps over the lazy dog. 
" + "Lorem ipsum dolor sit amet, consectetur adipiscing elit."; + const size_t dict_size = sizeof(dict_content) - 1; + + const size_t src_size = 8192; + uint8_t* src = (uint8_t*)malloc(src_size); + gen_dict_friendly_data(src, src_size, dict_content, dict_size); + + size_t comp_bound = (size_t)zxc_compress_bound(src_size); + uint8_t* compressed = (uint8_t*)malloc(comp_bound); + + zxc_compress_opts_t copts = { + .level = ZXC_LEVEL_DEFAULT, + .checksum_enabled = 1, + .seekable = 1, + .dict = dict_content, + .dict_size = dict_size, + }; + int64_t comp_size = zxc_compress(src, src_size, compressed, comp_bound, &copts); + if (comp_size <= 0) { + printf(" [FAIL] compress returned %lld\n", (long long)comp_size); + free(src); + free(compressed); + return 0; + } + + zxc_seekable* s = zxc_seekable_open(compressed, (size_t)comp_size); + if (!s) { + printf(" [FAIL] seekable_open returned NULL\n"); + free(src); + free(compressed); + return 0; + } + + int rc = zxc_seekable_set_dict(s, dict_content, dict_size); + if (rc != ZXC_OK) { + printf(" [FAIL] seekable_set_dict returned %d\n", rc); + zxc_seekable_free(s); + free(src); + free(compressed); + return 0; + } + + uint8_t* decompressed = (uint8_t*)malloc(src_size); + int64_t dec_size = zxc_seekable_decompress_range(s, decompressed, src_size, 0, src_size); + if (dec_size != (int64_t)src_size) { + printf(" [FAIL] decompress_range returned %lld, expected %zu\n", (long long)dec_size, + src_size); + zxc_seekable_free(s); + free(src); + free(compressed); + free(decompressed); + return 0; + } + + int ok = (memcmp(src, decompressed, src_size) == 0); + zxc_seekable_free(s); + free(decompressed); + free(src); + free(compressed); + + if (!ok) { + printf(" [FAIL] content mismatch\n"); + return 0; + } + + printf("PASS\n\n"); + return 1; +} + +int test_dict_seekable_mt_roundtrip(void) { + printf("=== TEST: Dict - seekable MT roundtrip ===\n"); + + const uint8_t dict_content[] = + "The quick brown fox jumps over the lazy dog. 
" + "Lorem ipsum dolor sit amet, consectetur adipiscing elit."; + const size_t dict_size = sizeof(dict_content) - 1; + + /* Use 32KB of data with 4KB blocks = 8 blocks, enough for MT */ + const size_t src_size = 32768; + uint8_t* src = (uint8_t*)malloc(src_size); + gen_dict_friendly_data(src, src_size, dict_content, dict_size); + + size_t comp_bound = (size_t)zxc_compress_bound(src_size); + uint8_t* compressed = (uint8_t*)malloc(comp_bound); + + zxc_compress_opts_t copts = { + .level = ZXC_LEVEL_DEFAULT, + .block_size = 4096, + .checksum_enabled = 1, + .seekable = 1, + .dict = dict_content, + .dict_size = dict_size, + }; + int64_t comp_size = zxc_compress(src, src_size, compressed, comp_bound, &copts); + if (comp_size <= 0) { + printf(" [FAIL] compress returned %lld\n", (long long)comp_size); + free(src); + free(compressed); + return 0; + } + + zxc_seekable* s = zxc_seekable_open(compressed, (size_t)comp_size); + if (!s) { + printf(" [FAIL] seekable_open returned NULL\n"); + free(src); + free(compressed); + return 0; + } + zxc_seekable_set_dict(s, dict_content, dict_size); + + /* Full range MT decompress */ + uint8_t* decompressed = (uint8_t*)malloc(src_size); + int64_t dec_size = zxc_seekable_decompress_range_mt(s, decompressed, src_size, 0, src_size, 4); + if (dec_size != (int64_t)src_size) { + printf(" [FAIL] decompress_range_mt returned %lld (%s)\n", (long long)dec_size, + dec_size < 0 ? 
zxc_error_name((int)dec_size) : "size mismatch"); + zxc_seekable_free(s); + free(src); + free(compressed); + free(decompressed); + return 0; + } + + int ok = (memcmp(src, decompressed, src_size) == 0); + if (!ok) { + for (size_t i = 0; i < src_size; i++) { + if (src[i] != decompressed[i]) { + printf(" [FAIL] content mismatch at byte %zu\n", i); + break; + } + } + } + + /* Also test a sub-range across block boundaries */ + if (ok) { + int64_t sub = zxc_seekable_decompress_range_mt(s, decompressed, 8192, 4000, 8192, 4); + ok = (sub == 8192 && memcmp(src + 4000, decompressed, 8192) == 0); + if (!ok) printf(" [FAIL] sub-range MT mismatch\n"); + } + + zxc_seekable_free(s); + free(decompressed); + free(src); + free(compressed); + + if (!ok) return 0; + printf("PASS\n\n"); + return 1; +} + +static const uint8_t k_dict_a[] = + "The quick brown fox jumps over the lazy dog. " + "Lorem ipsum dolor sit amet, consectetur adipiscing elit."; +static const uint8_t k_dict_b[] = + "A completely unrelated dictionary payload hashing to a different dict_id value."; + +// A stream archive compressed with a dictionary embeds it, so it must +// decompress correctly WITHOUT any external dictionary supplied at decode. 
+int test_dict_stream_dict_id_checks(void) { + printf("=== TEST: Dict - stream embeds dict (decodes with no external dict) ===\n"); + + const size_t src_size = 8192; + uint8_t* src = (uint8_t*)malloc(src_size); + gen_dict_friendly_data(src, src_size, k_dict_a, sizeof(k_dict_a) - 1); + + FILE* f_src = tmpfile(); + FILE* f_comp = tmpfile(); + FILE* f_dec = tmpfile(); + int ok = (f_src && f_comp && f_dec); + if (!ok) printf(" [FAIL] tmpfile() failed\n"); + + if (ok) { + fwrite(src, 1, src_size, f_src); + rewind(f_src); + zxc_compress_opts_t copts = {.level = ZXC_LEVEL_DEFAULT, + .checksum_enabled = 1, + .dict = k_dict_a, + .dict_size = sizeof(k_dict_a) - 1}; + if (zxc_stream_compress(f_src, f_comp, &copts) <= 0) { + printf(" [FAIL] stream_compress failed\n"); + ok = 0; + } + } + + if (ok) { + rewind(f_comp); + /* No dict supplied: it must come from the embedded block. */ + zxc_decompress_opts_t dopts = {.checksum_enabled = 1}; + int64_t rc = zxc_stream_decompress(f_comp, f_dec, &dopts); + if (rc != (int64_t)src_size) { + printf(" [FAIL] embedded decode returned %lld, expected %zu\n", (long long)rc, + src_size); + ok = 0; + } + } + + if (ok) { + rewind(f_dec); + uint8_t* got = (uint8_t*)malloc(src_size); + ok = (fread(got, 1, src_size, f_dec) == src_size && memcmp(got, src, src_size) == 0); + if (!ok) printf(" [FAIL] embedded roundtrip mismatch\n"); + free(got); + } + + if (f_src) fclose(f_src); + if (f_comp) fclose(f_comp); + if (f_dec) fclose(f_dec); + free(src); + if (!ok) return 0; + printf("PASS\n\n"); + return 1; +} + +int test_dict_seekable_dict_id_checks(void) { + printf("=== TEST: Dict - seekable decode rejects missing/wrong dict ===\n"); + + const size_t src_size = 8192; + uint8_t* src = (uint8_t*)malloc(src_size); + gen_dict_friendly_data(src, src_size, k_dict_a, sizeof(k_dict_a) - 1); + + size_t comp_bound = (size_t)zxc_compress_bound(src_size); + uint8_t* compressed = (uint8_t*)malloc(comp_bound); + zxc_compress_opts_t copts = {.level = ZXC_LEVEL_DEFAULT, 
+ .checksum_enabled = 1, + .seekable = 1, + .dict = k_dict_a, + .dict_size = sizeof(k_dict_a) - 1}; + int64_t comp_size = zxc_compress(src, src_size, compressed, comp_bound, &copts); + + uint8_t* out = (uint8_t*)malloc(src_size); + int ok = 1; + + if (comp_size <= 0) { + printf(" [FAIL] seekable compress failed\n"); + ok = 0; + } + + // 1. Wrong dict via set_dict must be rejected up front. + if (ok) { + zxc_seekable* s = zxc_seekable_open(compressed, (size_t)comp_size); + if (!s) { + printf(" [FAIL] seekable_open returned NULL\n"); + ok = 0; + } else { + int rc = zxc_seekable_set_dict(s, k_dict_b, sizeof(k_dict_b) - 1); + if (rc != ZXC_ERROR_DICT_MISMATCH) { + printf(" [FAIL] set_dict(wrong): expected DICT_MISMATCH, got %d (%s)\n", rc, + zxc_error_name(rc)); + ok = 0; + } + zxc_seekable_free(s); + } + } + + // 2. Decoding without any dict must be rejected, not silently corrupt + // (single-threaded and multi-threaded entry points). + if (ok) { + zxc_seekable* s = zxc_seekable_open(compressed, (size_t)comp_size); + if (!s) { + printf(" [FAIL] seekable_open returned NULL\n"); + ok = 0; + } else { + int64_t st = zxc_seekable_decompress_range(s, out, src_size, 0, src_size); + int64_t mt = zxc_seekable_decompress_range_mt(s, out, src_size, 0, src_size, 4); + if (st != ZXC_ERROR_DICT_REQUIRED || mt != ZXC_ERROR_DICT_REQUIRED) { + printf(" [FAIL] no-dict decode: expected DICT_REQUIRED, got st=%lld mt=%lld\n", + (long long)st, (long long)mt); + ok = 0; + } + zxc_seekable_free(s); + } + } + + // 3. Correct dict still works (guard against over-rejection). 
+ if (ok) { + zxc_seekable* s = zxc_seekable_open(compressed, (size_t)comp_size); + if (s && zxc_seekable_set_dict(s, k_dict_a, sizeof(k_dict_a) - 1) == ZXC_OK && + zxc_seekable_decompress_range(s, out, src_size, 0, src_size) == (int64_t)src_size && + memcmp(src, out, src_size) == 0) { + // expected + } else { + printf(" [FAIL] correct dict roundtrip regressed\n"); + ok = 0; + } + zxc_seekable_free(s); + } + + free(out); + free(src); + free(compressed); + if (!ok) return 0; + printf("PASS\n\n"); + return 1; +} diff --git a/tests/test_format.c b/tests/test_format.c index afb6794a..59ae1d3f 100644 --- a/tests/test_format.c +++ b/tests/test_format.c @@ -451,7 +451,7 @@ int test_legacy_header() { size_t block_size = 0; int has_checksum = -1; - int rc = zxc_read_file_header(hdr, sizeof(hdr), &block_size, &has_checksum); + int rc = zxc_read_file_header(hdr, sizeof(hdr), &block_size, &has_checksum, NULL); if (rc != ZXC_OK) { printf(" [FAIL] zxc_read_file_header returned %d (%s)\n", rc, zxc_error_name(rc)); @@ -475,7 +475,7 @@ int test_legacy_header() { hdr[14] = (uint8_t)(crc & 0xFF); hdr[15] = (uint8_t)(crc >> 8); - rc = zxc_read_file_header(hdr, sizeof(hdr), &block_size, &has_checksum); + rc = zxc_read_file_header(hdr, sizeof(hdr), &block_size, &has_checksum, NULL); if (rc != ZXC_ERROR_BAD_BLOCK_SIZE) { printf(" [FAIL] invalid code 99: expected %d, got %d\n", ZXC_ERROR_BAD_BLOCK_SIZE, rc); return 0; diff --git a/tests/test_main.c b/tests/test_main.c index 5bad7b43..cbe5e963 100644 --- a/tests/test_main.c +++ b/tests/test_main.c @@ -72,6 +72,7 @@ static const test_entry_t g_tests[] = { TEST_CASE(test_static_ctx_block_size_locked), TEST_CASE(test_static_ctx_null_inputs), TEST_CASE(test_static_ctx_roundtrip_all_levels), + TEST_CASE(test_static_ctx_embedded_dict), /* --- Stream API --- */ TEST_CASE(test_null_output_decompression), @@ -113,6 +114,23 @@ static const test_entry_t g_tests[] = { TEST_CASE(test_error_name), TEST_CASE(test_library_info_api), + /* --- Dictionary 
--- */ + TEST_CASE(test_dict_id_deterministic), + TEST_CASE(test_dict_get_id_apis), + TEST_CASE(test_dict_buffer_roundtrip), + TEST_CASE(test_dict_block_roundtrip), + TEST_CASE(test_dict_mismatch_error), + TEST_CASE(test_dict_required_error), + TEST_CASE(test_dict_no_dict_compat), + TEST_CASE(test_dict_stream_roundtrip), + TEST_CASE(test_dict_large_dict_roundtrip), + TEST_CASE(test_dict_seekable_roundtrip), + TEST_CASE(test_dict_train_roundtrip), + TEST_CASE(test_dict_train_no_frequent_patterns), + TEST_CASE(test_dict_seekable_mt_roundtrip), + TEST_CASE(test_dict_stream_dict_id_checks), + TEST_CASE(test_dict_seekable_dict_id_checks), + /* --- Seekable (single-threaded) --- */ TEST_CASE(test_seekable_table_sizes), TEST_CASE(test_seekable_table_write), diff --git a/tests/test_misc.c b/tests/test_misc.c index 55a43eb5..71bc38f3 100644 --- a/tests/test_misc.c +++ b/tests/test_misc.c @@ -29,6 +29,9 @@ int test_error_name() { {ZXC_ERROR_NULL_INPUT, "ZXC_ERROR_NULL_INPUT"}, {ZXC_ERROR_BAD_BLOCK_TYPE, "ZXC_ERROR_BAD_BLOCK_TYPE"}, {ZXC_ERROR_BAD_BLOCK_SIZE, "ZXC_ERROR_BAD_BLOCK_SIZE"}, + {ZXC_ERROR_DICT_REQUIRED, "ZXC_ERROR_DICT_REQUIRED"}, + {ZXC_ERROR_DICT_MISMATCH, "ZXC_ERROR_DICT_MISMATCH"}, + {ZXC_ERROR_DICT_TOO_LARGE, "ZXC_ERROR_DICT_TOO_LARGE"}, }; const int n = sizeof(cases) / sizeof(cases[0]); diff --git a/tests/test_static_ctx.c b/tests/test_static_ctx.c index df5f7dcc..0a18ecfc 100644 --- a/tests/test_static_ctx.c +++ b/tests/test_static_ctx.c @@ -83,7 +83,7 @@ int test_static_ctx_roundtrip_all_levels(void) { } /* Size + init the dctx workspace. 
*/ - const size_t dctx_ws_sz = zxc_static_dctx_workspace_size(block_size); + const size_t dctx_ws_sz = zxc_static_dctx_workspace_size(block_size, 0); if (dctx_ws_sz == 0) { printf(" [FAIL] level %d: dctx_ws_sz == 0\n", lvl); goto fail; @@ -93,7 +93,7 @@ int test_static_ctx_roundtrip_all_levels(void) { printf(" [FAIL] level %d: aligned_alloc(dctx_ws)\n", lvl); goto fail; } - zxc_dctx* const dctx = zxc_init_static_dctx(dctx_ws, dctx_ws_sz, block_size); + zxc_dctx* const dctx = zxc_init_static_dctx(dctx_ws, dctx_ws_sz, block_size, 0); if (!dctx) { printf(" [FAIL] level %d: zxc_init_static_dctx returned NULL\n", lvl); test_aligned_free(dctx_ws); @@ -135,7 +135,7 @@ int test_static_ctx_size_query(void) { printf(" [FAIL] cctx_size(0) should be 0\n"); return 0; } - if (zxc_static_dctx_workspace_size(0) != 0) { + if (zxc_static_dctx_workspace_size(0, 0) != 0) { printf(" [FAIL] dctx_size(0) should be 0\n"); return 0; } @@ -269,7 +269,7 @@ int test_static_ctx_null_inputs(void) { printf(" [FAIL] init_static_cctx(NULL opts) should fail\n"); return 0; } - if (zxc_init_static_dctx(NULL, 65536, 4096) != NULL) { + if (zxc_init_static_dctx(NULL, 65536, 4096, 0) != NULL) { printf(" [FAIL] init_static_dctx(NULL workspace) should fail\n"); return 0; } @@ -279,3 +279,99 @@ int test_static_ctx_null_inputs(void) { printf(" [PASS] NULL inputs rejected; NULL free is idempotent\n"); return 1; } + /* Test: compresses a dictionary-derived buffer with zxc_stream_compress (dict supplied through the options), then decodes the archive three ways: (a) with a dctx from zxc_create_dctx, (b) with a static dctx sized using ZXC_DICT_SIZE_MAX, (c) with a static dctx sized using 0 dictionary bytes, which is expected to return ZXC_ERROR_DICT_REQUIRED. Returns 1 when every stage passes, 0 otherwise. */ +int test_static_ctx_embedded_dict(void) { + printf("=== TEST: Static ctx - decode embedded dictionary ===\n"); + const size_t bs = 65536; /* block size used for compression and for sizing and initialising both static workspaces */ + static const uint8_t dict[] = + "The quick brown fox jumps over the lazy dog. keys: \"id\",\"name\",\"email\"."; + const size_t dsz = sizeof(dict) - 1; /* dictionary length without the terminating NUL of the string literal */ + const size_t src_size = 200000; + uint8_t* src = (uint8_t*)malloc(src_size); + int ok = (src != NULL); /* ok guards each later stage; once it is 0 the guarded stages are skipped and the function returns 0 */ + /* Dict-referencing data so the encoder actually emits matches into the dict. 
*/ + if (ok) + for (size_t i = 0; i < src_size; i++) src[i] = dict[i % dsz]; + + /* Build an embedded-dictionary archive with the stream compressor. */ + FILE* fs = ok ? tmpfile() : NULL; + FILE* fc = ok ? tmpfile() : NULL; + uint8_t* comp = NULL; + long csz = 0; + if (!fs || !fc) ok = 0; + if (ok) { + fwrite(src, 1, src_size, fs); /* NOTE(review): the fwrite return value is not checked */ + rewind(fs); + zxc_compress_opts_t copts = {.level = ZXC_LEVEL_DEFAULT, + .block_size = bs, + .checksum_enabled = 1, + .dict = dict, + .dict_size = dsz}; + if (zxc_stream_compress(fs, fc, &copts) <= 0) { + printf(" [FAIL] stream_compress\n"); + ok = 0; + } + } + if (ok) { + fseek(fc, 0, SEEK_END); + csz = ftell(fc); /* NOTE(review): the ftell result is not checked for -1 before the size_t casts below */ + rewind(fc); + comp = (uint8_t*)malloc((size_t)csz); + ok = (comp && fread(comp, 1, (size_t)csz, fc) == (size_t)csz); + } + if (fs) fclose(fs); + if (fc) fclose(fc); /* both temp files are closed here whether or not ok is still set */ + uint8_t* out = (uint8_t*)malloc(src_size); + if (!out) ok = 0; + + /* (a) Dynamic dctx: lazily allocates the dict bounce buffer. */ + if (ok) { + zxc_dctx* d = zxc_create_dctx(); + int64_t r = zxc_decompress_dctx(d, comp, (size_t)csz, out, src_size, NULL); + if (r != (int64_t)src_size || memcmp(out, src, src_size) != 0) { + printf(" [FAIL] dynamic dctx embedded decode (r=%lld)\n", (long long)r); + ok = 0; + } else { + printf(" [PASS] dynamic dctx decodes embedded dict\n"); + } + zxc_free_dctx(d); + } + + /* (b) Static dctx sized for a dictionary. */ + if (ok) { + const size_t ws_sz = zxc_static_dctx_workspace_size(bs, ZXC_DICT_SIZE_MAX); + void* ws = ws_sz ? test_aligned_alloc(64, ws_sz) : NULL; + zxc_dctx* d = ws ? zxc_init_static_dctx(ws, ws_sz, bs, ZXC_DICT_SIZE_MAX) : NULL; + int64_t r = d ? 
zxc_decompress_dctx(d, comp, (size_t)csz, out, src_size, NULL) : -1; /* -1 when workspace sizing, allocation or init failed; it then fails the size comparison below */ + if (r != (int64_t)src_size || memcmp(out, src, src_size) != 0) { + printf(" [FAIL] static dctx (with dict room) decode (r=%lld)\n", (long long)r); + ok = 0; + } else { + printf(" [PASS] static dctx (max_dict_size) decodes embedded dict\n"); + } + if (ws) test_aligned_free(ws); /* only ws is released; d is not freed separately */ + } + + /* (c) Static dctx WITHOUT dict room must reject cleanly, not corrupt. */ + if (ok) { + const size_t ws_sz = zxc_static_dctx_workspace_size(bs, 0); + void* ws = ws_sz ? test_aligned_alloc(64, ws_sz) : NULL; + zxc_dctx* d = ws ? zxc_init_static_dctx(ws, ws_sz, bs, 0) : NULL; + int64_t r = d ? zxc_decompress_dctx(d, comp, (size_t)csz, out, src_size, NULL) : -999; /* -999 marks a setup failure; presumably distinct from ZXC_ERROR_DICT_REQUIRED - TODO confirm */ + if (r != ZXC_ERROR_DICT_REQUIRED) { + printf(" [FAIL] static dctx (no dict room) should return DICT_REQUIRED, got %lld\n", + (long long)r); + ok = 0; + } else { + printf(" [PASS] static dctx (no dict room) rejects cleanly\n"); + } + if (ws) test_aligned_free(ws); + } + + free(comp); + free(out); + free(src); + if (!ok) return 0; + printf("PASS\n\n"); + return 1; +} diff --git a/wrappers/rust/zxc-sys/build.rs b/wrappers/rust/zxc-sys/build.rs index b3530f7f..276bf430 100644 --- a/wrappers/rust/zxc-sys/build.rs +++ b/wrappers/rust/zxc-sys/build.rs @@ -167,8 +167,9 @@ fn main() { .include(src_lib.join("vendors")) .define("ZXC_STATIC_DEFINE", None) .file(src_lib.join("zxc_common.c")) - .file(src_lib.join("zxc_driver.c")) + .file(src_lib.join("zxc_dict.c")) .file(src_lib.join("zxc_dispatch.c")) + .file(src_lib.join("zxc_driver.c")) .file(src_lib.join("zxc_seekable.c")) .file(src_lib.join("zxc_pstream.c")) .opt_level(3)