From a9208de816522124cee6c71683d9cc303e0d1c94 Mon Sep 17 00:00:00 2001 From: hellobertrand <5901952+hellobertrand@users.noreply.github.com> Date: Wed, 27 May 2026 14:27:38 +0200 Subject: [PATCH 01/47] feat: Add dictionary error codes Introduces specific error types for dictionary-related issues, such as missing, mismatched, or oversized dictionaries. These errors are essential for robust dictionary support. --- include/zxc_error.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/include/zxc_error.h b/include/zxc_error.h index 1326e563..8b300686 100644 --- a/include/zxc_error.h +++ b/include/zxc_error.h @@ -66,6 +66,11 @@ typedef enum { ZXC_ERROR_BAD_BLOCK_TYPE = -13, /**< Unknown or unexpected block type. */ ZXC_ERROR_BAD_BLOCK_SIZE = -14, /**< Invalid block size. */ + /* Dictionary errors */ + ZXC_ERROR_DICT_REQUIRED = -15, /**< File requires a dictionary but none was provided. */ + ZXC_ERROR_DICT_MISMATCH = -16, /**< Provided dictionary ID does not match the file header. */ + ZXC_ERROR_DICT_TOO_LARGE = -17, /**< Dictionary exceeds maximum allowed size. */ + } zxc_error_t; /** From 879cc668b23a620b3618f9557ae1c0805298d8a2 Mon Sep 17 00:00:00 2001 From: hellobertrand <5901952+hellobertrand@users.noreply.github.com> Date: Wed, 27 May 2026 14:32:06 +0200 Subject: [PATCH 02/47] feat: Add pre-trained dictionary support Introduces a comprehensive API for creating, managing, and utilizing pre-trained dictionaries. This includes functions for training, saving to the new `.zxd` file format, loading, and computing unique dictionary IDs. Dictionaries enhance compression ratios for small, similar data by pre-filling the LZ77 sliding window. The dictionary ID is integrated into the ZXC file header, enabling decoders to verify the correct dictionary is provided at decompression time. 
--- CMakeLists.txt | 3 + include/zxc.h | 1 + include/zxc_constants.h | 9 +++ include/zxc_dict.h | 128 ++++++++++++++++++++++++++++++++++++++++ src/lib/zxc_dict.c | 116 ++++++++++++++++++++++++++++++++++++ src/lib/zxc_internal.h | 18 +++++- 6 files changed, 273 insertions(+), 2 deletions(-) create mode 100644 include/zxc_dict.h create mode 100644 src/lib/zxc_dict.c diff --git a/CMakeLists.txt b/CMakeLists.txt index 98e8ff91..481fe264 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -214,6 +214,7 @@ endif() if(CMAKE_SYSTEM_NAME STREQUAL "Emscripten") set(ZXC_CORE_SOURCES src/lib/zxc_common.c + src/lib/zxc_dict.c src/lib/zxc_dispatch.c src/lib/zxc_pstream.c src/lib/zxc_seekable.c @@ -221,6 +222,7 @@ if(CMAKE_SYSTEM_NAME STREQUAL "Emscripten") else() set(ZXC_CORE_SOURCES src/lib/zxc_common.c + src/lib/zxc_dict.c src/lib/zxc_driver.c src/lib/zxc_dispatch.c src/lib/zxc_pstream.c @@ -429,6 +431,7 @@ if(ZXC_BUILD_TESTS) # and is already pulled in via ${ZXC_VARIANT_OBJECTS} below. add_library(zxc_lib_static STATIC src/lib/zxc_common.c + src/lib/zxc_dict.c src/lib/zxc_driver.c src/lib/zxc_dispatch.c src/lib/zxc_pstream.c diff --git a/include/zxc.h b/include/zxc.h index 9cd2be41..e8f5d4d0 100644 --- a/include/zxc.h +++ b/include/zxc.h @@ -10,6 +10,7 @@ #include "zxc_buffer.h" // IWYU pragma: keep #include "zxc_constants.h" // IWYU pragma: keep +#include "zxc_dict.h" // IWYU pragma: keep #include "zxc_error.h" // IWYU pragma: keep #include "zxc_opts.h" // IWYU pragma: keep #include "zxc_pstream.h" // IWYU pragma: keep diff --git a/include/zxc_constants.h b/include/zxc_constants.h index 23c46d1a..adc3cd9d 100644 --- a/include/zxc_constants.h +++ b/include/zxc_constants.h @@ -64,6 +64,15 @@ #define ZXC_BLOCK_SIZE_MAX (1U << ZXC_BLOCK_SIZE_MAX_LOG2) /** @} */ /* end of block_size */ +/** + * @defgroup dictionary Dictionary + * @brief Constants for pre-trained dictionary support. + * @{ + */ +/** @brief Maximum dictionary content size in bytes (64 KB, bounded by LZ window). 
*/
+#define ZXC_DICT_SIZE_MAX (1U << 16)
+/** @} */ /* end of dictionary */
+
 /**
  * @defgroup threading Threading Limits
  * @brief Bounds on thread-count parameters accepted by the streaming APIs.
diff --git a/include/zxc_dict.h b/include/zxc_dict.h
new file mode 100644
index 00000000..c250ed15
--- /dev/null
+++ b/include/zxc_dict.h
@@ -0,0 +1,128 @@
+/*
+ * ZXC - High-performance lossless compression
+ *
+ * Copyright (c) 2025-2026 Bertrand Lebonnois and contributors.
+ * SPDX-License-Identifier: BSD-3-Clause
+ */
+
+/**
+ * @file zxc_dict.h
+ * @brief Pre-trained dictionary API for ZXC compression.
+ *
+ * Provides functions to train, save, load, and identify dictionaries that
+ * improve compression ratio on small, similar payloads. Dictionaries are
+ * stored as external `.zxd` files and referenced by a 32-bit ID in the
+ * ZXC file header.
+ *
+ * A dictionary contains raw byte content that prefills the LZ77 sliding
+ * window at the start of each block, giving the compressor immediate
+ * access to representative patterns without waiting for them to appear
+ * in the input stream.
+ *
+ * @code
+ * // Train a dictionary from a corpus of JSON samples
+ * void* dict_buf = malloc(32768);
+ * int64_t dict_sz = zxc_train_dict(samples, sizes, n, dict_buf, 32768);
+ *
+ * // Save to .zxd file
+ * void* zxd = malloc(zxc_dict_save_bound(dict_sz));
+ * int64_t zxd_sz = zxc_dict_save(dict_buf, dict_sz, zxd, zxc_dict_save_bound(dict_sz));
+ *
+ * // Use for compression
+ * zxc_compress_opts_t opts = { .level = 3, .dict = dict_buf, .dict_size = dict_sz };
+ * zxc_compress(src, src_size, dst, dst_capacity, &opts);
+ * @endcode
+ */
+
+#ifndef ZXC_DICT_H
+#define ZXC_DICT_H
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include "zxc_export.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * @defgroup dict Dictionary
+ * @brief Pre-trained dictionary training, serialization, and identification.
+ * @{
+ */
+
+/**
+ * @brief Compute the dictionary ID for the given content.
+ *
+ * The ID is a deterministic 32-bit hash of the raw dictionary content.
+ * It is stored in the ZXC file header so the decoder can verify that
+ * the correct dictionary is provided at decompression time.
+ *
+ * @param[in] dict       Pointer to dictionary content.
+ * @param[in] dict_size  Size in bytes.
+ * @return 32-bit dictionary ID. Returns 0 if @p dict is NULL or @p dict_size is 0.
+ */
+ZXC_EXPORT uint32_t zxc_dict_id(const void* dict, size_t dict_size);
+
+/**
+ * @brief Load and validate a `.zxd` dictionary file from a memory buffer.
+ *
+ * On success, @p content_out points into the input buffer (zero-copy).
+ * The caller must keep @p buf alive while the content pointer is in use.
+ *
+ * @param[in]  buf               Buffer containing the .zxd file.
+ * @param[in]  buf_size          Size of @p buf in bytes.
+ * @param[out] content_out       Receives a pointer to the dictionary content.
+ * @param[out] content_size_out  Receives the content size in bytes.
+ * @param[out] dict_id_out       Receives the dictionary ID (may be NULL).
+ * @return @ref ZXC_OK on success, or a negative @ref zxc_error_t code.
+ */
+ZXC_EXPORT int zxc_dict_load(const void* buf, size_t buf_size, const void** content_out,
+                             size_t* content_size_out, uint32_t* dict_id_out);
+
+/**
+ * @brief Serialize dictionary content to the `.zxd` file format.
+ *
+ * @param[in]  content       Raw dictionary content.
+ * @param[in]  content_size  Size of @p content in bytes (at most 65535: the header field is u16).
+ * @param[out] buf           Output buffer for the .zxd file.
+ * @param[in]  buf_capacity  Capacity of @p buf.
+ * @return Number of bytes written on success, or a negative @ref zxc_error_t code.
+ */
+ZXC_EXPORT int64_t zxc_dict_save(const void* content, size_t content_size, void* buf,
+                                 size_t buf_capacity);
+
+/**
+ * @brief Returns the maximum .zxd file size for a given content size.
+ *
+ * @param[in] content_size  Size of the dictionary content.
+ * @return Total .zxd file size (header + content).
+ */
+ZXC_EXPORT size_t zxc_dict_save_bound(size_t content_size);
+
+/**
+ * @brief Train a dictionary from a corpus of samples.
+ *
+ * Analyzes the samples to select byte sequences that maximize LZ77 match
+ * coverage. The resulting dictionary content can be passed directly to
+ * zxc_compress_opts_t::dict or serialized with zxc_dict_save().
+ *
+ * @param[in]  samples        Array of pointers to sample buffers.
+ * @param[in]  sample_sizes   Array of sample sizes in bytes.
+ * @param[in]  n_samples      Number of samples.
+ * @param[out] dict_buf       Output buffer for trained dictionary content.
+ * @param[in]  dict_capacity  Capacity of @p dict_buf (max ZXC_DICT_SIZE_MAX).
+ * @return Size of the trained dictionary on success, or a negative
+ *         @ref zxc_error_t code.
+ */
+ZXC_EXPORT int64_t zxc_train_dict(const void* const* samples, const size_t* sample_sizes,
+                                  size_t n_samples, void* dict_buf, size_t dict_capacity);
+
+/** @} */ /* end of dict */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* ZXC_DICT_H */
diff --git a/src/lib/zxc_dict.c b/src/lib/zxc_dict.c
new file mode 100644
index 00000000..1a7f249f
--- /dev/null
+++ b/src/lib/zxc_dict.c
@@ -0,0 +1,116 @@
+/*
+ * ZXC - High-performance lossless compression
+ *
+ * Copyright (c) 2025-2026 Bertrand Lebonnois and contributors.
+ * SPDX-License-Identifier: BSD-3-Clause
+ */
+
+/**
+ * @file zxc_dict.c
+ * @brief Pre-trained dictionary: ID computation, .zxd serialization, and training.
+ */
+
+#include "../../include/zxc_dict.h"
+
+#include "zxc_internal.h"
+
+/* -------------------------------------------------------------------------
+ * Dictionary ID
+ * ------------------------------------------------------------------------- */
+
+uint32_t zxc_dict_id(const void* dict, const size_t dict_size) {
+    if (UNLIKELY(!dict || dict_size == 0)) return 0; /* no content, no ID */
+    return zxc_checksum(dict, dict_size, 0);
+}
+
+/* -------------------------------------------------------------------------
+ * .zxd format: save / load / bound
+ *
+ * Layout (ZXC_DICT_HEADER_SIZE = 16 bytes + content):
+ *   0x00  4  Magic (0x9CB0D1C7 LE)
+ *   0x04  1  Version (1)
+ *   0x05  1  Flags (reserved, 0)
+ *   0x06  2  Content size (u16 LE, so at most 65535 bytes can be described)
+ *   0x08  4  dict_id (u32 LE)
+ *   0x0C  4  Header CRC32 (rapidhash-folded, computed with this field zeroed)
+ *   0x10  N  Content bytes
+ * ------------------------------------------------------------------------- */
+
+size_t zxc_dict_save_bound(const size_t content_size) {
+    return ZXC_DICT_HEADER_SIZE + content_size;
+}
+
+int64_t zxc_dict_save(const void* content, const size_t content_size, void* buf,
+                      const size_t buf_capacity) {
+    if (UNLIKELY(!content || content_size == 0)) return ZXC_ERROR_NULL_INPUT;
+    if (UNLIKELY(content_size > ZXC_DICT_SIZE_MAX || content_size > UINT16_MAX))
+        return ZXC_ERROR_DICT_TOO_LARGE; /* 65536 would wrap to 0 in the u16 size field */
+
+    const size_t total = ZXC_DICT_HEADER_SIZE + content_size;
+    if (UNLIKELY(!buf || buf_capacity < total)) return ZXC_ERROR_DST_TOO_SMALL;
+
+    uint8_t* dst = (uint8_t*)buf;
+
+    zxc_store_le32(dst + 0, ZXC_DICT_MAGIC);
+    dst[4] = ZXC_DICT_VERSION;
+    dst[5] = 0; /* flags: reserved */
+    zxc_store_le16(dst + 6, (uint16_t)content_size);
+    zxc_store_le32(dst + 8, zxc_dict_id(content, content_size));
+
+    zxc_store_le32(dst + 12, 0); /* the header CRC is computed with its own field zeroed */
+    const uint32_t crc = zxc_checksum(dst, ZXC_DICT_HEADER_SIZE, 0);
+    zxc_store_le32(dst + 12, crc);
+
+    ZXC_MEMCPY(dst + ZXC_DICT_HEADER_SIZE, content, content_size);
+
+    return (int64_t)total;
+}
+
+int zxc_dict_load(const void* buf, const size_t buf_size, const void** content_out,
+                  size_t* content_size_out, uint32_t* dict_id_out) {
+    if (UNLIKELY(!buf || !content_out || !content_size_out)) return ZXC_ERROR_NULL_INPUT;
+    if (UNLIKELY(buf_size < ZXC_DICT_HEADER_SIZE)) return ZXC_ERROR_SRC_TOO_SMALL;
+
+    const uint8_t* src = (const uint8_t*)buf;
+
+    if (zxc_le32(src) != ZXC_DICT_MAGIC) return ZXC_ERROR_BAD_MAGIC;
+    if (src[4] != ZXC_DICT_VERSION) return ZXC_ERROR_BAD_VERSION;
+
+    const size_t content_size = zxc_le16(src + 6);
+    if (content_size == 0) return ZXC_ERROR_CORRUPT_DATA;
+    if (content_size > ZXC_DICT_SIZE_MAX) return ZXC_ERROR_DICT_TOO_LARGE;
+    if (buf_size < ZXC_DICT_HEADER_SIZE + content_size) return ZXC_ERROR_SRC_TOO_SMALL;
+
+    /* Verify header CRC32 (recomputed over a copy whose CRC field is zeroed) */
+    uint8_t temp[ZXC_DICT_HEADER_SIZE];
+    ZXC_MEMCPY(temp, src, ZXC_DICT_HEADER_SIZE);
+    zxc_store_le32(temp + 12, 0);
+    const uint32_t expected_crc = zxc_checksum(temp, ZXC_DICT_HEADER_SIZE, 0);
+    if (UNLIKELY(zxc_le32(src + 12) != expected_crc)) return ZXC_ERROR_BAD_HEADER;
+
+    /* Verify dict_id matches content */
+    const uint8_t* content = src + ZXC_DICT_HEADER_SIZE;
+    const uint32_t id = zxc_dict_id(content, content_size);
+    if (UNLIKELY(zxc_le32(src + 8) != id)) return ZXC_ERROR_BAD_CHECKSUM;
+
+    *content_out = content; /* zero-copy: points into the caller's buffer */
+    *content_size_out = content_size;
+    if (dict_id_out) *dict_id_out = id;
+
+    return ZXC_OK;
+}
+
+/* -------------------------------------------------------------------------
+ * Dictionary training (step 9 — stub for now)
+ * ------------------------------------------------------------------------- */
+
+int64_t zxc_train_dict(const void* const* samples, const size_t* sample_sizes,
+                       const size_t n_samples, void* dict_buf, const size_t dict_capacity) {
+    (void)samples;
+    (void)sample_sizes;
+    (void)n_samples;
+    (void)dict_buf;
+    (void)dict_capacity;
+    /* TODO: implement training algorithm; until then every call reports an error */
+    return ZXC_ERROR_NULL_INPUT;
+}
diff --git a/src/lib/zxc_internal.h b/src/lib/zxc_internal.h
index 
9b5c1255..c4680ebb 100644 --- a/src/lib/zxc_internal.h +++ b/src/lib/zxc_internal.h @@ -32,6 +32,7 @@ #include "../../include/zxc_buffer.h" #include "../../include/zxc_constants.h" +#include "../../include/zxc_error.h" #include "../../include/zxc_seekable.h" #include "rapidhash.h" @@ -334,9 +335,18 @@ extern "C" { /** @brief Bit flag in the Flags byte indicating checksum presence (bit 7). */ #define ZXC_FILE_FLAG_HAS_CHECKSUM 0x80U +/** @brief Bit flag in the Flags byte indicating a dictionary is required (bit 6). */ +#define ZXC_FILE_FLAG_HAS_DICTIONARY 0x40U /** @brief Mask for the checksum algorithm id (bits 0-3). */ #define ZXC_FILE_CHECKSUM_ALGO_MASK 0x0FU +/** @brief Magic word identifying ZXC dictionary files (.zxd). */ +#define ZXC_DICT_MAGIC 0x9CB0D1C7U +/** @brief Current dictionary file format version. */ +#define ZXC_DICT_VERSION 1 +/** @brief Size of the .zxd file header in bytes. */ +#define ZXC_DICT_HEADER_SIZE 16 + /** @brief Block header size: Type(1)+Flags(1)+Reserved(1)+CRC(1)+CompSize(4). */ #define ZXC_BLOCK_HEADER_SIZE 8 /** @brief Size of the per-block checksum field in bytes. */ @@ -1584,6 +1594,7 @@ typedef struct { size_t opt_scratch_cap; /**< Current capacity of opt_scratch in bytes. */ int checksum_enabled; /**< 1 if checksum calculation/verification is enabled. */ int compression_level; /**< Compression level. */ + size_t dict_size; /**< Dictionary prefill size (0 = no dictionary). */ /* Block-size derived parameters (computed once at init). */ size_t chunk_size; /**< Effective block size in bytes. */ @@ -1741,12 +1752,13 @@ typedef struct { * @param[in] dst_capacity Total capacity of @p dst in bytes. * @param[in] chunk_size Block size to encode in the header. * @param[in] has_checksum Non-zero if the checksum bit must be set. + * @param[in] dict_id Dictionary ID (0 = no dictionary). * * @return Number of bytes written (@c ZXC_FILE_HEADER_SIZE) on success, * or @c ZXC_ERROR_DST_TOO_SMALL if @p dst_capacity is insufficient. 
*/ int zxc_write_file_header(uint8_t* RESTRICT dst, const size_t dst_capacity, const size_t chunk_size, - const int has_checksum); + const int has_checksum, const uint32_t dict_id); /** * @brief Validates and reads the ZXC file header from @p src. @@ -1760,13 +1772,15 @@ int zxc_write_file_header(uint8_t* RESTRICT dst, const size_t dst_capacity, cons * block size. May be @c NULL. * @param[out] out_has_checksum Optional pointer that receives the checksum * flag. May be @c NULL. + * @param[out] out_dict_id Optional pointer that receives the dictionary + * ID (0 if none). May be @c NULL. * * @return @c ZXC_OK on success, or a negative error code (e.g. * @c ZXC_ERROR_SRC_TOO_SMALL, @c ZXC_ERROR_BAD_MAGIC, * @c ZXC_ERROR_BAD_VERSION). */ int zxc_read_file_header(const uint8_t* RESTRICT src, const size_t src_size, size_t* out_block_size, - int* out_has_checksum); + int* out_has_checksum, uint32_t* out_dict_id); /** * @brief Encodes a block header into @p dst. From c24bc8c0d07c48fd4a235c75e00301006d307ef0 Mon Sep 17 00:00:00 2001 From: hellobertrand <5901952+hellobertrand@users.noreply.github.com> Date: Wed, 27 May 2026 14:39:17 +0200 Subject: [PATCH 03/47] feat: Implement dictionary integration in zxc file format and LZ77 Extends `zxc_compress_opts_t` and `zxc_decompress_opts_t` with dictionary parameters. The ZXC file header is updated to include a dictionary ID and a flag, enabling decoders to verify the correct dictionary is used. The LZ77 compression engine is enhanced to seed its hash tables with dictionary content, while the decompressor is adjusted to logically pre-fill its sliding window, facilitating matches against dictionary data for improved compression ratios. 
--- include/zxc_opts.h | 4 ++ src/lib/zxc_common.c | 19 +++++-- src/lib/zxc_compress.c | 106 +++++++++++++++++++++++++++++++-------- src/lib/zxc_decompress.c | 8 ++- src/lib/zxc_dispatch.c | 12 ++--- src/lib/zxc_driver.c | 6 +-- src/lib/zxc_pstream.c | 4 +- src/lib/zxc_seekable.c | 5 +- tests/test_format.c | 4 +- 9 files changed, 126 insertions(+), 42 deletions(-) diff --git a/include/zxc_opts.h b/include/zxc_opts.h index 6517e77c..e52e5d75 100644 --- a/include/zxc_opts.h +++ b/include/zxc_opts.h @@ -63,6 +63,8 @@ typedef struct { of 2, [4KB - 2MB]. */ int checksum_enabled; /**< 1 to enable per-block and global checksums, 0 to disable. */ int seekable; /**< 1 to append a seek table for random-access decompression. */ + const void* dict; /**< Pre-trained dictionary content (NULL = none). */ + size_t dict_size; /**< Dictionary size in bytes (0 = none, max ZXC_DICT_SIZE_MAX). */ zxc_progress_callback_t progress_cb; /**< Optional progress callback (NULL to disable). */ void* user_data; /**< User context pointer passed to progress_cb. */ } zxc_compress_opts_t; @@ -80,6 +82,8 @@ typedef struct { typedef struct { int n_threads; /**< Worker thread count (0 = auto-detect CPU cores). */ int checksum_enabled; /**< 1 to verify per-block and global checksums, 0 to skip. */ + const void* dict; /**< Pre-trained dictionary content (NULL = none). */ + size_t dict_size; /**< Dictionary size in bytes (0 = none). */ zxc_progress_callback_t progress_cb; /**< Optional progress callback (NULL to disable). */ void* user_data; /**< User context pointer passed to progress_cb. */ } zxc_decompress_opts_t; diff --git a/src/lib/zxc_common.c b/src/lib/zxc_common.c index d7a72e1a..496cf2a1 100644 --- a/src/lib/zxc_common.c +++ b/src/lib/zxc_common.c @@ -289,7 +289,7 @@ void zxc_cctx_free(zxc_cctx_t* ctx) { * or a negative @ref zxc_error_t code. 
*/ int zxc_write_file_header(uint8_t* RESTRICT dst, const size_t dst_capacity, const size_t chunk_size, - const int has_checksum) { + const int has_checksum, const uint32_t dict_id) { if (UNLIKELY(dst_capacity < ZXC_FILE_HEADER_SIZE)) return ZXC_ERROR_DST_TOO_SMALL; zxc_store_le32(dst, ZXC_MAGIC_WORD); @@ -299,10 +299,13 @@ int zxc_write_file_header(uint8_t* RESTRICT dst, const size_t dst_capacity, cons dst[5] = (uint8_t)zxc_log2_u32((uint32_t)chunk_size); // Flags are at offset 6 - dst[6] = has_checksum ? (ZXC_FILE_FLAG_HAS_CHECKSUM | ZXC_CHECKSUM_RAPIDHASH) : 0; + uint8_t flags = has_checksum ? (ZXC_FILE_FLAG_HAS_CHECKSUM | ZXC_CHECKSUM_RAPIDHASH) : 0; + if (dict_id != 0) flags |= ZXC_FILE_FLAG_HAS_DICTIONARY; + dst[6] = flags; - // Bytes 7-13: Reserved (must be 0, 7 bytes) + // Bytes 7-13: Reserved / dict_id ZXC_MEMSET(dst + 7, 0, 7); + if (dict_id != 0) zxc_store_le32(dst + 7, dict_id); // Bytes 14-15: CRC (16-bit) zxc_store_le16(dst + 14, 0); // Zero out before hashing @@ -325,7 +328,8 @@ int zxc_write_file_header(uint8_t* RESTRICT dst, const size_t dst_capacity, cons * @return @ref ZXC_OK on success, or a negative @ref zxc_error_t code. */ int zxc_read_file_header(const uint8_t* RESTRICT src, const size_t src_size, - size_t* RESTRICT out_block_size, int* RESTRICT out_has_checksum) { + size_t* RESTRICT out_block_size, int* RESTRICT out_has_checksum, + uint32_t* RESTRICT out_dict_id) { if (UNLIKELY(src_size < ZXC_FILE_HEADER_SIZE)) return ZXC_ERROR_SRC_TOO_SMALL; if (UNLIKELY(zxc_le32(src) != ZXC_MAGIC_WORD)) return ZXC_ERROR_BAD_MAGIC; if (UNLIKELY(src[4] != ZXC_FILE_FORMAT_VERSION)) return ZXC_ERROR_BAD_VERSION; @@ -353,6 +357,7 @@ int zxc_read_file_header(const uint8_t* RESTRICT src, const size_t src_size, } // Flags are at offset 6 if (out_has_checksum) *out_has_checksum = (src[6] & ZXC_FILE_FLAG_HAS_CHECKSUM) ? 1 : 0; + if (out_dict_id) *out_dict_id = (src[6] & ZXC_FILE_FLAG_HAS_DICTIONARY) ? 
zxc_le32(src + 7) : 0; return ZXC_OK; } @@ -801,6 +806,12 @@ const char* zxc_error_name(const int code) { return "ZXC_ERROR_BAD_BLOCK_TYPE"; case ZXC_ERROR_BAD_BLOCK_SIZE: return "ZXC_ERROR_BAD_BLOCK_SIZE"; + case ZXC_ERROR_DICT_REQUIRED: + return "ZXC_ERROR_DICT_REQUIRED"; + case ZXC_ERROR_DICT_MISMATCH: + return "ZXC_ERROR_DICT_MISMATCH"; + case ZXC_ERROR_DICT_TOO_LARGE: + return "ZXC_ERROR_DICT_TOO_LARGE"; default: return "ZXC_UNKNOWN_ERROR"; } diff --git a/src/lib/zxc_compress.c b/src/lib/zxc_compress.c index 694d07d6..0da1ff08 100644 --- a/src/lib/zxc_compress.c +++ b/src/lib/zxc_compress.c @@ -1071,19 +1071,26 @@ static int zxc_lz77_optimal_parse_glo(zxc_cctx_t* RESTRICT ctx, const uint8_t* R zxc_lz77_params_t lzp_opt = zxc_get_lz77_params(level); lzp_opt.use_lazy = 0; // guard + /* When a dictionary is active, src = [dict | block_data]. DP arrays are + * indexed relative to the block start (position dict_sz in src). The + * variable src_base points to the first block byte for literal copies, + * while src remains the base for the match finder (absolute positions). */ + const size_t dict_sz = ctx->dict_size; + const size_t block_sz = src_sz - dict_sz; + const uint8_t* const src_base = src + dict_sz; const uint8_t* const iend = src + src_sz; /* Block too small for any match: emit all as literals. */ - if (UNLIKELY(src_sz < 13)) { - if (src_sz > 0) ZXC_MEMCPY(literals, src, src_sz); - *lit_c_out = src_sz; + if (UNLIKELY(block_sz < 13)) { + if (block_sz > 0) ZXC_MEMCPY(literals, src_base, block_sz); + *lit_c_out = block_sz; *seq_c_out = 0; *extras_sz_out = 0; *max_offset_out = 0; return 0; } - const size_t mflimit_pos = src_sz - 12; + const size_t mflimit_pos = block_sz - 12; const uint8_t* const mflimit = src + mflimit_pos; /* DP arrays carved from ctx->opt_scratch: a single allocation lazy- @@ -1125,8 +1132,8 @@ static int zxc_lz77_optimal_parse_glo(zxc_cctx_t* RESTRICT ctx, const uint8_t* R * zxc_estimate_cctx_size(). 
*/ (void)needed; - /* Per-block literal cost: */ - const uint32_t lit_cost = zxc_opt_estimate_lit_bits(src, src_sz, ctx->opt_scratch); + /* Per-block literal cost (sample only block data, not dict prefix): */ + const uint32_t lit_cost = zxc_opt_estimate_lit_bits(src_base, block_sz, ctx->opt_scratch); uint32_t* const dp = (uint32_t*)ctx->opt_scratch; uint16_t* const parent_len = (uint16_t*)(ctx->opt_scratch + sz_dp); @@ -1134,7 +1141,7 @@ static int zxc_lz77_optimal_parse_glo(zxc_cctx_t* RESTRICT ctx, const uint8_t* R uint64_t* const match_end_bits = (uint64_t*)(ctx->opt_scratch + sz_dp + sz_pl + sz_po); dp[0] = 0; - ZXC_MEMSET(dp + 1, 0xFF, src_sz * sizeof(uint32_t)); + ZXC_MEMSET(dp + 1, 0xFF, block_sz * sizeof(uint32_t)); ZXC_MEMSET(parent_len, 0, sz_pl + sz_po + sz_bm); /* Forward DP: visit every position, update reachable successors. @@ -1159,17 +1166,19 @@ static int zxc_lz77_optimal_parse_glo(zxc_cctx_t* RESTRICT ctx, const uint8_t* R /* Match transition: call find_best_match (no lazy, no backtrack via * anchor=ip). Iterate sub-lengths since any L <= max_L matches at the - * same offset and may end at a more useful DP position. */ - const uint8_t* ip = src + p; - const zxc_match_t m = zxc_lz77_find_best_match( - src, ip, iend, mflimit, /*anchor=*/ip, hash_table, hash_tags, chain_table, epoch_mark, - offset_mask, level, lzp_opt, last_off); + * same offset and may end at a more useful DP position. + * ip uses absolute position (src + dict_sz + p) so match finder + * resolves dict references correctly via src as base. */ + const uint8_t* ip = src_base + p; + const zxc_match_t m = + zxc_lz77_find_best_match(src, ip, iend, mflimit, /*anchor=*/ip, hash_table, hash_tags, + chain_table, epoch_mark, offset_mask, level, lzp_opt, last_off); if (m.ref) { const uint32_t off = (uint32_t)(ip - m.ref); if (off > 0 && off <= ZXC_LZ_WINDOW_SIZE) { last_off = off; - const size_t L_max_raw = (m.len > src_sz - p) ? 
(src_sz - p) : (size_t)m.len; + const size_t L_max_raw = (m.len > block_sz - p) ? (block_sz - p) : (size_t)m.len; const size_t L_max = (L_max_raw > UINT16_MAX) ? UINT16_MAX : L_max_raw; /* The L-iteration cost function is piecewise constant in @@ -1227,7 +1236,7 @@ static int zxc_lz77_optimal_parse_glo(zxc_cctx_t* RESTRICT ctx, const uint8_t* R } /* Last 12 bytes can only be literals (matches must end before iend). */ - for (size_t p = mflimit_pos; p < src_sz; p++) { + for (size_t p = mflimit_pos; p < block_sz; p++) { if (UNLIKELY(dp[p] == UINT32_MAX)) continue; const uint32_t lit_next = dp[p] + lit_cost; if (lit_next < dp[p + 1]) { @@ -1241,7 +1250,7 @@ static int zxc_lz77_optimal_parse_glo(zxc_cctx_t* RESTRICT ctx, const uint8_t* R * runs of unmarked positions and are reconstructed during forward emission * via lit_start tracking, so they need no backtrack storage. */ { - size_t pos = src_sz; + size_t pos = block_sz; while (pos > 0) { const uint32_t L = parent_len[pos]; if (L == 0) { @@ -1273,7 +1282,7 @@ static int zxc_lz77_optimal_parse_glo(zxc_cctx_t* RESTRICT ctx, const uint8_t* R const size_t LL = match_start - lit_start; if (LL > 0) { - ZXC_MEMCPY(literals + lit_c, src + lit_start, LL); + ZXC_MEMCPY(literals + lit_c, src_base + lit_start, LL); lit_c += LL; } const uint32_t ll = (uint32_t)LL; @@ -1301,9 +1310,9 @@ static int zxc_lz77_optimal_parse_glo(zxc_cctx_t* RESTRICT ctx, const uint8_t* R } /* Tail literals after the last match (or all literals if no match). */ - if (lit_start < src_sz) { - const size_t tail = src_sz - lit_start; - ZXC_MEMCPY(literals + lit_c, src + lit_start, tail); + if (lit_start < block_sz) { + const size_t tail = block_sz - lit_start; + ZXC_MEMCPY(literals + lit_c, src_base + lit_start, tail); lit_c += tail; } @@ -1314,6 +1323,49 @@ static int zxc_lz77_optimal_parse_glo(zxc_cctx_t* RESTRICT ctx, const uint8_t* R return 0; } +/** + * @brief Seeds the hash/chain tables from dictionary content prepended to @p src. 
+ *
+ * The caller lays @p src out as [dict_content | block_data]. Every dictionary
+ * position that still has a minimum-length match inside the dictionary is
+ * hashed and linked into the tables so the match finder can reference it.
+ *
+ * @param[in]     src          Buffer that begins with the dictionary content.
+ * @param[in]     dict_size    Length of the dictionary prefix in bytes.
+ * @param[in,out] hash_table   Head table; each bucket receives epoch_mark | position.
+ * @param[in,out] hash_tags    Per-bucket tag byte stored alongside the head entry.
+ * @param[in,out] chain_table  Per-position distance to the previous entry of the bucket.
+ * @param[in]     epoch_mark   Epoch bits marking head entries as current.
+ * @param[in]     offset_mask  Mask selecting the position bits of a head entry.
+ * @param[in]     level        Compression level (level >= 3 selects the hash5 variant).
+ */
+static void zxc_lz_seed_dict(const uint8_t* RESTRICT src, const size_t dict_size,
+                             uint32_t* RESTRICT hash_table, uint8_t* RESTRICT hash_tags,
+                             uint16_t* RESTRICT chain_table, const uint32_t epoch_mark,
+                             const uint32_t offset_mask, const int level) {
+    if (UNLIKELY(dict_size < ZXC_LZ_MIN_MATCH_LEN)) return;
+
+    const int hash5 = (level >= 3);
+    const size_t n_pos = dict_size - (ZXC_LZ_MIN_MATCH_LEN - 1);
+    for (size_t i = 0; i < n_pos; i++) {
+        const uint64_t window = zxc_le64(src + i);
+        const uint32_t lo = (uint32_t)window;
+        const uint32_t bucket = zxc_hash_func(window, hash5);
+        const uint32_t pos = (uint32_t)i;
+        /* A head written under another epoch is stale: treat it as position 0 (none). */
+        const uint32_t head = hash_table[bucket];
+        const uint32_t prev = ((head & ~offset_mask) == epoch_mark) ? (head & offset_mask) : 0;
+
+        hash_table[bucket] = epoch_mark | pos;
+        hash_tags[bucket] = (uint8_t)(lo ^ (lo >> 16));
+
+        /* Link only to a real predecessor that lies inside the LZ window. */
+        const uint32_t gap = pos - prev;
+        const int linked = (prev != 0) && (gap < ZXC_LZ_WINDOW_SIZE);
+        chain_table[pos & ZXC_LZ_WINDOW_MASK] = linked ? (uint16_t)gap : 0;
+    }
+}
+
 /**
  * @brief Encodes a data block using the General (GLO) compression format.
* @@ -1365,6 +1417,7 @@ static int zxc_encode_block_glo(zxc_cctx_t* RESTRICT ctx, const uint8_t* RESTRIC const size_t src_sz, uint8_t* RESTRICT dst, size_t dst_cap, size_t* RESTRICT out_sz) { const int level = ctx->compression_level; + const size_t dict_sz = ctx->dict_size; const zxc_lz77_params_t lzp = zxc_get_lz77_params(level); @@ -1377,7 +1430,12 @@ static int zxc_encode_block_glo(zxc_cctx_t* RESTRICT ctx, const uint8_t* RESTRIC const uint32_t offset_bits = ctx->offset_bits; const uint32_t offset_mask = ctx->offset_mask; const uint32_t epoch_mark = ctx->epoch << offset_bits; - const uint8_t *ip = src, *iend = src + src_sz, *anchor = ip, *mflimit = iend - 12; + + if (dict_sz > 0) + zxc_lz_seed_dict(src, dict_sz, ctx->hash_table, ctx->hash_tags, ctx->chain_table, + epoch_mark, offset_mask, level); + + const uint8_t *ip = src + dict_sz, *iend = src + src_sz, *anchor = ip, *mflimit = iend - 12; uint32_t* const hash_table = ctx->hash_table; uint8_t* const hash_tags = ctx->hash_tags; @@ -1980,6 +2038,7 @@ static int zxc_encode_block_ghi(zxc_cctx_t* RESTRICT ctx, const uint8_t* RESTRIC const size_t src_sz, uint8_t* RESTRICT dst, const size_t dst_cap, size_t* RESTRICT const out_sz) { const int level = ctx->compression_level; + const size_t dict_sz = ctx->dict_size; const zxc_lz77_params_t lzp = zxc_get_lz77_params(level); @@ -1992,7 +2051,12 @@ static int zxc_encode_block_ghi(zxc_cctx_t* RESTRICT ctx, const uint8_t* RESTRIC const uint32_t offset_bits = ctx->offset_bits; const uint32_t offset_mask = ctx->offset_mask; const uint32_t epoch_mark = ctx->epoch << offset_bits; - const uint8_t *ip = src, *iend = src + src_sz, *anchor = ip, *mflimit = iend - 12; + + if (dict_sz > 0) + zxc_lz_seed_dict(src, dict_sz, ctx->hash_table, ctx->hash_tags, ctx->chain_table, + epoch_mark, offset_mask, level); + + const uint8_t *ip = src + dict_sz, *iend = src + src_sz, *anchor = ip, *mflimit = iend - 12; uint32_t* const hash_table = ctx->hash_table; uint8_t* const hash_tags = 
ctx->hash_tags; diff --git a/src/lib/zxc_decompress.c b/src/lib/zxc_decompress.c index 0dfb3f49..fe786b27 100644 --- a/src/lib/zxc_decompress.c +++ b/src/lib/zxc_decompress.c @@ -792,7 +792,9 @@ static ZXC_ALWAYS_INLINE int zxc_decode_block_glo_impl(zxc_cctx_t* RESTRICT ctx, // For 1-byte offsets (enc_off==1): validate until 256 bytes written (max 8-bit offset) // For 2-byte offsets (enc_off==0): validate until 65536 bytes written (max 16-bit offset) // After threshold, all offsets are guaranteed valid (can't exceed written bytes) - size_t written = 0; + // When a dictionary is active, dict_size bytes are logically "already written" + // (prepended by the caller), so the SAFE loop may be skipped entirely. + size_t written = ctx->dict_size; // --- SAFE Loop: offset validation until threshold (4x unroll) --- // For 1-byte offsets: bounds check until 256 bytes written @@ -1463,7 +1465,9 @@ static ZXC_ALWAYS_INLINE int zxc_decode_block_ghi_impl(zxc_cctx_t* RESTRICT ctx, // For 1-byte offsets (enc_off==1): validate until 256 bytes written (max 8-bit offset) // For 2-byte offsets (enc_off==0): validate until 65536 bytes written (max 16-bit offset) // After threshold, all offsets are guaranteed valid (can't exceed written bytes) - size_t written = 0; + // When a dictionary is active, dict_size bytes are logically "already written" + // (prepended by the caller), so the SAFE loop may be skipped entirely. + size_t written = ctx->dict_size; // --- SAFE Loop: offset validation until threshold (4x unroll) --- // Since offset is 16-bit, threshold is 65536. 
diff --git a/src/lib/zxc_dispatch.c b/src/lib/zxc_dispatch.c index d37c31e7..c83c24b2 100644 --- a/src/lib/zxc_dispatch.c +++ b/src/lib/zxc_dispatch.c @@ -515,7 +515,7 @@ int64_t zxc_compress(const void* RESTRICT src, const size_t src_size, void* REST // LCOV_EXCL_STOP const int h_val = - zxc_write_file_header(op, (size_t)(op_end - op), block_size, checksum_enabled); + zxc_write_file_header(op, (size_t)(op_end - op), block_size, checksum_enabled, 0); // LCOV_EXCL_START if (UNLIKELY(h_val < 0)) { zxc_cctx_free(&ctx); @@ -664,8 +664,8 @@ int64_t zxc_decompress(const void* RESTRICT src, const size_t src_size, void* RE int file_has_checksums = 0; // File header verification and context initialization - if (UNLIKELY(zxc_read_file_header(ip, src_size, &runtime_chunk_size, &file_has_checksums) != - ZXC_OK || + if (UNLIKELY(zxc_read_file_header(ip, src_size, &runtime_chunk_size, &file_has_checksums, + NULL) != ZXC_OK || zxc_cctx_init(&ctx, runtime_chunk_size, 0, 0, file_has_checksums && checksum_enabled) != ZXC_OK)) { return ZXC_ERROR_BAD_HEADER; @@ -889,7 +889,7 @@ int64_t zxc_compress_cctx(zxc_cctx* cctx, const void* RESTRICT src, const size_t uint32_t global_hash = 0; const int h_val = - zxc_write_file_header(op, (size_t)(op_end - op), block_size, checksum_enabled); + zxc_write_file_header(op, (size_t)(op_end - op), block_size, checksum_enabled, 0); if (UNLIKELY(h_val < 0)) return h_val; // LCOV_EXCL_LINE op += h_val; @@ -973,8 +973,8 @@ int64_t zxc_decompress_dctx(zxc_dctx* dctx, const void* RESTRICT src, const size int file_has_checksums = 0; uint32_t global_hash = 0; - if (UNLIKELY(zxc_read_file_header(ip, src_size, &runtime_chunk_size, &file_has_checksums) != - ZXC_OK)) + if (UNLIKELY(zxc_read_file_header(ip, src_size, &runtime_chunk_size, &file_has_checksums, + NULL) != ZXC_OK)) return ZXC_ERROR_BAD_HEADER; /* Static dctx: block_size is locked at workspace init; reject any diff --git a/src/lib/zxc_driver.c b/src/lib/zxc_driver.c index e62b360a..ecee1482 100644 --- 
a/src/lib/zxc_driver.c +++ b/src/lib/zxc_driver.c @@ -590,8 +590,8 @@ static int64_t zxc_stream_engine_run(FILE* f_in, FILE* f_out, const int n_thread // Decompression Mode: Read and validate file header uint8_t h[ZXC_FILE_HEADER_SIZE]; if (UNLIKELY(fread(h, 1, ZXC_FILE_HEADER_SIZE, f_in) != ZXC_FILE_HEADER_SIZE || - zxc_read_file_header(h, ZXC_FILE_HEADER_SIZE, &runtime_chunk_sz, - &file_has_chk) != ZXC_OK)) + zxc_read_file_header(h, ZXC_FILE_HEADER_SIZE, &runtime_chunk_sz, &file_has_chk, + NULL) != ZXC_OK)) return ZXC_ERROR_BAD_HEADER; } @@ -710,7 +710,7 @@ static int64_t zxc_stream_engine_run(FILE* f_in, FILE* f_out, const int n_thread if (mode == 1 && f_out) { uint8_t h[ZXC_FILE_HEADER_SIZE]; - zxc_write_file_header(h, ZXC_FILE_HEADER_SIZE, runtime_chunk_sz, checksum_enabled); + zxc_write_file_header(h, ZXC_FILE_HEADER_SIZE, runtime_chunk_sz, checksum_enabled, 0); if (UNLIKELY(fwrite(h, 1, ZXC_FILE_HEADER_SIZE, f_out) != ZXC_FILE_HEADER_SIZE)) ctx.io_error = 1; diff --git a/src/lib/zxc_pstream.c b/src/lib/zxc_pstream.c index c09d251d..815d87a4 100644 --- a/src/lib/zxc_pstream.c +++ b/src/lib/zxc_pstream.c @@ -291,7 +291,7 @@ zxc_cstream* zxc_cstream_create(const zxc_compress_opts_t* opts) { */ static int cs_stage_file_header(zxc_cstream* cs) { const int w = zxc_write_file_header(cs->pending, cs->pending_cap, cs->block_size, - cs->opts.checksum_enabled); + cs->opts.checksum_enabled, 0); if (UNLIKELY(w < 0)) return w; // LCOV_EXCL_LINE cs->pending_len = (size_t)w; cs->pending_pos = 0; @@ -903,7 +903,7 @@ static int ds_handle_need_file_header(zxc_dstream* ds, zxc_inbuf_t* in) { size_t bs = 0; int has_csum = 0; - const int rc = zxc_read_file_header(ds->scratch, ds->scratch_used, &bs, &has_csum); + const int rc = zxc_read_file_header(ds->scratch, ds->scratch_used, &bs, &has_csum, NULL); if (UNLIKELY(rc != ZXC_OK)) return ds_set_error(ds, rc); // LCOV_EXCL_LINE ds->block_size = bs; ds->file_has_checksum = has_csum; diff --git a/src/lib/zxc_seekable.c 
b/src/lib/zxc_seekable.c index 614d3a67..4513dde3 100644 --- a/src/lib/zxc_seekable.c +++ b/src/lib/zxc_seekable.c @@ -192,7 +192,8 @@ static zxc_seekable* zxc_seekable_parse(const uint8_t* data, const size_t data_s /* Step 1: validate file header => block_size */ size_t block_size_sz = 0; int file_has_chk = 0; - if (UNLIKELY(zxc_read_file_header(data, data_size, &block_size_sz, &file_has_chk) != ZXC_OK)) + if (UNLIKELY(zxc_read_file_header(data, data_size, &block_size_sz, &file_has_chk, NULL) != + ZXC_OK)) return NULL; const uint32_t block_size = (uint32_t)block_size_sz; if (UNLIKELY(block_size == 0)) return NULL; // LCOV_EXCL_LINE @@ -326,7 +327,7 @@ zxc_seekable* zxc_seekable_open_reader(const zxc_reader_t* r) { size_t bs_sz = 0; int fhc = 0; - if (UNLIKELY(zxc_read_file_header(header, ZXC_FILE_HEADER_SIZE, &bs_sz, &fhc) != ZXC_OK)) + if (UNLIKELY(zxc_read_file_header(header, ZXC_FILE_HEADER_SIZE, &bs_sz, &fhc, NULL) != ZXC_OK)) return NULL; const uint32_t bs = (uint32_t)bs_sz; if (UNLIKELY(bs == 0)) return NULL; diff --git a/tests/test_format.c b/tests/test_format.c index afb6794a..59ae1d3f 100644 --- a/tests/test_format.c +++ b/tests/test_format.c @@ -451,7 +451,7 @@ int test_legacy_header() { size_t block_size = 0; int has_checksum = -1; - int rc = zxc_read_file_header(hdr, sizeof(hdr), &block_size, &has_checksum); + int rc = zxc_read_file_header(hdr, sizeof(hdr), &block_size, &has_checksum, NULL); if (rc != ZXC_OK) { printf(" [FAIL] zxc_read_file_header returned %d (%s)\n", rc, zxc_error_name(rc)); @@ -475,7 +475,7 @@ int test_legacy_header() { hdr[14] = (uint8_t)(crc & 0xFF); hdr[15] = (uint8_t)(crc >> 8); - rc = zxc_read_file_header(hdr, sizeof(hdr), &block_size, &has_checksum); + rc = zxc_read_file_header(hdr, sizeof(hdr), &block_size, &has_checksum, NULL); if (rc != ZXC_ERROR_BAD_BLOCK_SIZE) { printf(" [FAIL] invalid code 99: expected %d, got %d\n", ZXC_ERROR_BAD_BLOCK_SIZE, rc); return 0; From 3862f48fa1e27e70bdd023e9a323c78d7bbd165f Mon Sep 17 00:00:00 
2001 From: hellobertrand <5901952+hellobertrand@users.noreply.github.com> Date: Wed, 27 May 2026 14:47:09 +0200 Subject: [PATCH 04/47] feat: Enable dictionary processing in compression and decompression Prepares input buffers for dictionary-aware compression by prefixing data blocks with dictionary content. This allows the core compression logic to operate on the data block while referencing the dictionary. For decompression, a dictionary-prefixed bounce buffer facilitates dictionary lookups during decoding. Includes dictionary ID validation during file header processing to ensure correct dictionary usage. --- src/lib/zxc_compress.c | 17 +++++---- src/lib/zxc_dispatch.c | 80 +++++++++++++++++++++++++++++++++++++----- 2 files changed, 82 insertions(+), 15 deletions(-) diff --git a/src/lib/zxc_compress.c b/src/lib/zxc_compress.c index 0da1ff08..309638fa 100644 --- a/src/lib/zxc_compress.c +++ b/src/lib/zxc_compress.c @@ -2315,14 +2315,17 @@ static int zxc_probe_is_numeric(const uint8_t* src, const size_t size) { // cppcheck-suppress unusedFunction int zxc_compress_chunk_wrapper(zxc_cctx_t* RESTRICT ctx, const uint8_t* RESTRICT chunk, const size_t src_sz, uint8_t* RESTRICT dst, const size_t dst_cap) { + const size_t dict_sz = ctx->dict_size; + const size_t block_sz = src_sz - dict_sz; + const uint8_t* block_data = chunk + dict_sz; size_t w = 0; int res = ZXC_OK; - int try_num = zxc_probe_is_numeric(chunk, src_sz); + int try_num = zxc_probe_is_numeric(block_data, block_sz); if (UNLIKELY(try_num)) { - res = zxc_encode_block_num(ctx, chunk, src_sz, dst, dst_cap, &w); - if (res != ZXC_OK || w > (src_sz - (src_sz >> 2))) // w > 75% of src_sz - try_num = 0; // NUM didn't compress well, try GLO/GHI instead + res = zxc_encode_block_num(ctx, block_data, block_sz, dst, dst_cap, &w); + if (res != ZXC_OK || w > (block_sz - (block_sz >> 2))) + try_num = 0; } if (LIKELY(!try_num)) { @@ -2332,9 +2335,9 @@ int zxc_compress_chunk_wrapper(zxc_cctx_t* RESTRICT ctx, const uint8_t* 
RESTRICT res = zxc_encode_block_glo(ctx, chunk, src_sz, dst, dst_cap, &w); } - // Check expansion. W contains Header + Payload. - if (UNLIKELY(res != ZXC_OK || w >= src_sz)) { - res = zxc_encode_block_raw(chunk, src_sz, dst, dst_cap, &w); + // Check expansion against block data size (excluding dict prefix). + if (UNLIKELY(res != ZXC_OK || w >= block_sz)) { + res = zxc_encode_block_raw(block_data, block_sz, dst, dst_cap, &w); if (UNLIKELY(res != ZXC_OK)) return res; } diff --git a/src/lib/zxc_dispatch.c b/src/lib/zxc_dispatch.c index c83c24b2..d6e2ed24 100644 --- a/src/lib/zxc_dispatch.c +++ b/src/lib/zxc_dispatch.c @@ -15,6 +15,7 @@ * @ref zxc_decompress, @ref zxc_get_decompressed_size). */ +#include "../../include/zxc_dict.h" #include "../../include/zxc_error.h" #include "../../include/zxc_seekable.h" #include "zxc_internal.h" @@ -499,9 +500,14 @@ int64_t zxc_compress(const void* RESTRICT src, const size_t src_size, void* REST const int level = (opts && opts->level > 0) ? opts->level : ZXC_LEVEL_DEFAULT; const size_t block_size = (opts && opts->block_size > 0) ? opts->block_size : ZXC_BLOCK_SIZE_DEFAULT; + const uint8_t* dict = opts ? (const uint8_t*)opts->dict : NULL; + const size_t dict_size = (opts && opts->dict) ? opts->dict_size : 0; + if (UNLIKELY(dict_size > ZXC_DICT_SIZE_MAX)) return ZXC_ERROR_DICT_TOO_LARGE; if (UNLIKELY(!zxc_validate_block_size(block_size))) return ZXC_ERROR_BAD_BLOCK_SIZE; + const uint32_t did = (dict && dict_size > 0) ? zxc_dict_id(dict, dict_size) : 0; + const uint8_t* ip = (const uint8_t*)src; uint8_t* op = (uint8_t*)dst; const uint8_t* op_start = op; @@ -513,9 +519,21 @@ int64_t zxc_compress(const void* RESTRICT src, const size_t src_size, void* REST if (UNLIKELY(zxc_cctx_init(&ctx, block_size, 1, level, checksum_enabled) != ZXC_OK)) return ZXC_ERROR_MEMORY; // LCOV_EXCL_STOP + ctx.dict_size = dict_size; + + /* Dict input buffer: [dict_content | block_data] for the encoder. 
*/ + uint8_t* dict_input = NULL; + if (dict_size > 0) { + dict_input = (uint8_t*)ZXC_MALLOC(dict_size + block_size); + if (UNLIKELY(!dict_input)) { + zxc_cctx_free(&ctx); + return ZXC_ERROR_MEMORY; + } + ZXC_MEMCPY(dict_input, dict, dict_size); + } const int h_val = - zxc_write_file_header(op, (size_t)(op_end - op), block_size, checksum_enabled, 0); + zxc_write_file_header(op, (size_t)(op_end - op), block_size, checksum_enabled, did); // LCOV_EXCL_START if (UNLIKELY(h_val < 0)) { zxc_cctx_free(&ctx); @@ -549,8 +567,15 @@ int64_t zxc_compress(const void* RESTRICT src, const size_t src_size, void* REST const size_t chunk_len = (src_size - pos > block_size) ? block_size : (src_size - pos); const size_t rem_cap = (size_t)(op_end - op); - const int res = zxc_compress_chunk_wrapper(&ctx, ip + pos, chunk_len, op, rem_cap); + int res; + if (dict_input) { + ZXC_MEMCPY(dict_input + dict_size, ip + pos, chunk_len); + res = zxc_compress_chunk_wrapper(&ctx, dict_input, dict_size + chunk_len, op, rem_cap); + } else { + res = zxc_compress_chunk_wrapper(&ctx, ip + pos, chunk_len, op, rem_cap); + } if (UNLIKELY(res < 0)) { + ZXC_FREE(dict_input); ZXC_FREE(seek_comp); zxc_cctx_free(&ctx); return res; @@ -572,6 +597,7 @@ int64_t zxc_compress(const void* RESTRICT src, const size_t src_size, void* REST seek_cap = seek_cap * 2; uint32_t* nc = (uint32_t*)ZXC_REALLOC(seek_comp, seek_cap * sizeof(uint32_t)); if (UNLIKELY(!nc)) { + ZXC_FREE(dict_input); ZXC_FREE(seek_comp); zxc_cctx_free(&ctx); return ZXC_ERROR_MEMORY; @@ -587,6 +613,7 @@ int64_t zxc_compress(const void* RESTRICT src, const size_t src_size, void* REST pos += chunk_len; } + ZXC_FREE(dict_input); zxc_cctx_free(&ctx); // Write EOF Block @@ -653,6 +680,8 @@ int64_t zxc_decompress(const void* RESTRICT src, const size_t src_size, void* RE } const int checksum_enabled = opts ? opts->checksum_enabled : 0; + const uint8_t* dict = opts ? (const uint8_t*)opts->dict : NULL; + const size_t dict_size = (opts && opts->dict) ? 
opts->dict_size : 0; const uint8_t* ip = (const uint8_t*)src; const uint8_t* ip_end = ip + src_size; @@ -663,21 +692,42 @@ int64_t zxc_decompress(const void* RESTRICT src, const size_t src_size, void* RE zxc_cctx_t ctx; int file_has_checksums = 0; - // File header verification and context initialization + uint32_t header_dict_id = 0; if (UNLIKELY(zxc_read_file_header(ip, src_size, &runtime_chunk_size, &file_has_checksums, - NULL) != ZXC_OK || + &header_dict_id) != ZXC_OK || zxc_cctx_init(&ctx, runtime_chunk_size, 0, 0, file_has_checksums && checksum_enabled) != ZXC_OK)) { return ZXC_ERROR_BAD_HEADER; } + /* Dictionary validation */ + if (header_dict_id != 0) { + if (!dict || dict_size == 0) { + zxc_cctx_free(&ctx); + return ZXC_ERROR_DICT_REQUIRED; + } + if (zxc_dict_id(dict, dict_size) != header_dict_id) { + zxc_cctx_free(&ctx); + return ZXC_ERROR_DICT_MISMATCH; + } + } + ctx.dict_size = dict_size; + ip += ZXC_FILE_HEADER_SIZE; - // work_buf is sized to runtime_chunk_size + ZXC_DECOMPRESS_TAIL_PAD - // inside zxc_cctx_init (mode == 0). The threshold below must match so - // the fast-path / bounce decision uses the actual work_buf capacity. const size_t work_sz = runtime_chunk_size + ZXC_DECOMPRESS_TAIL_PAD; + /* Dict decode buffer: [dict_content | decode_space + PAD] */ + uint8_t* dict_dec = NULL; + if (dict_size > 0) { + dict_dec = (uint8_t*)ZXC_MALLOC(dict_size + work_sz); + if (UNLIKELY(!dict_dec)) { + zxc_cctx_free(&ctx); + return ZXC_ERROR_MEMORY; + } + ZXC_MEMCPY(dict_dec, dict, dict_size); + } + // Block decompression loop uint32_t global_hash = 0; @@ -722,7 +772,19 @@ int64_t zxc_decompress(const void* RESTRICT src, const size_t src_size, void* RE int res; const size_t rem_cap = (size_t)(op_end - op); - if (LIKELY(rem_cap >= work_sz)) { + if (dict_dec) { + /* Dict path: decode into bounce buffer with dict prefix so match + * copies that reference dict content resolve naturally. 
*/ + res = zxc_decompress_chunk_wrapper(&ctx, ip, rem_src, dict_dec + dict_size, work_sz); + if (LIKELY(res > 0)) { + if (UNLIKELY((size_t)res > rem_cap)) { + ZXC_FREE(dict_dec); + zxc_cctx_free(&ctx); + return ZXC_ERROR_DST_TOO_SMALL; + } + ZXC_MEMCPY(op, dict_dec + dict_size, (size_t)res); + } + } else if (LIKELY(rem_cap >= work_sz)) { // Fast path: decode directly into dst. Cap dst_cap to chunk_size + PAD res = zxc_decompress_chunk_wrapper(&ctx, ip, rem_src, op, work_sz); } else { @@ -739,6 +801,7 @@ int64_t zxc_decompress(const void* RESTRICT src, const size_t src_size, void* RE } } if (UNLIKELY(res < 0)) { + ZXC_FREE(dict_dec); zxc_cctx_free(&ctx); return res; } @@ -754,6 +817,7 @@ int64_t zxc_decompress(const void* RESTRICT src, const size_t src_size, void* RE op += res; } + ZXC_FREE(dict_dec); zxc_cctx_free(&ctx); return (int64_t)(op - op_start); } From 3a23529879c915eba4012c697e43bbad2c8d6404 Mon Sep 17 00:00:00 2001 From: hellobertrand <5901952+hellobertrand@users.noreply.github.com> Date: Wed, 27 May 2026 14:47:37 +0200 Subject: [PATCH 05/47] test: Add comprehensive dictionary test suite This suite validates the core dictionary functionalities, including: - Saving and loading dictionaries using the `.zxd` format. - Deterministic generation of dictionary IDs. - Roundtrip compression/decompression with dictionaries across all levels. - Correct error reporting for dictionary mismatches and missing dictionaries. - Compatibility for decompressing non-dictionary-aware streams. This ensures the dictionary feature is robust and correctly integrated. 
--- CMakeLists.txt | 1 + src/lib/zxc_dict.c | 2 +- src/lib/zxc_driver.c | 53 ++++++- tests/test_common.h | 9 ++ tests/test_dict.c | 338 +++++++++++++++++++++++++++++++++++++++++++ tests/test_main.c | 9 ++ 6 files changed, 406 insertions(+), 6 deletions(-) create mode 100644 tests/test_dict.c diff --git a/CMakeLists.txt b/CMakeLists.txt index 481fe264..5bb7bfa3 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -421,6 +421,7 @@ if(ZXC_BUILD_TESTS) tests/test_seekable_mt.c tests/test_format.c tests/test_misc.c + tests/test_dict.c ) # When building shared libraries, create a static version for tests diff --git a/src/lib/zxc_dict.c b/src/lib/zxc_dict.c index 1a7f249f..ec05b153 100644 --- a/src/lib/zxc_dict.c +++ b/src/lib/zxc_dict.c @@ -101,7 +101,7 @@ int zxc_dict_load(const void* buf, const size_t buf_size, const void** content_o } /* ------------------------------------------------------------------------- - * Dictionary training (étape 9 — stub for now) + * Dictionary training * ------------------------------------------------------------------------- */ int64_t zxc_train_dict(const void* const* samples, const size_t* sample_sizes, diff --git a/src/lib/zxc_driver.c b/src/lib/zxc_driver.c index ecee1482..8b7c8b27 100644 --- a/src/lib/zxc_driver.c +++ b/src/lib/zxc_driver.c @@ -32,6 +32,7 @@ #include #include "../../include/zxc_buffer.h" +#include "../../include/zxc_dict.h" #include "../../include/zxc_error.h" #include "../../include/zxc_seekable.h" #include "../../include/zxc_stream.h" @@ -288,6 +289,8 @@ typedef struct { zxc_progress_callback_t progress_cb; void* progress_user_data; uint64_t total_input_bytes; + const uint8_t* dict; + size_t dict_size; } zxc_stream_ctx_t; /** @@ -384,6 +387,25 @@ static void* zxc_stream_worker(void* arg) { } cctx.compression_level = ctx->compression_level; + cctx.dict_size = ctx->dict_size; + + /* Per-worker dict buffer for assembling [dict | block_data]. 
*/ + const size_t dsz = ctx->dict_size; + uint8_t* dict_work = NULL; + if (dsz > 0) { + const size_t alloc = dsz + ctx->chunk_size + ZXC_DECOMPRESS_TAIL_PAD; + dict_work = (uint8_t*)ZXC_MALLOC(alloc); + if (UNLIKELY(!dict_work)) { + zxc_cctx_free(&cctx); + pthread_mutex_lock(&ctx->lock); + ctx->io_error = 1; + pthread_cond_broadcast(&ctx->cond_writer); + pthread_cond_broadcast(&ctx->cond_reader); + pthread_mutex_unlock(&ctx->lock); + return NULL; + } + ZXC_MEMCPY(dict_work, ctx->dict, dsz); + } while (1) { zxc_stream_job_t* job = NULL; @@ -401,7 +423,17 @@ static void* zxc_stream_worker(void* arg) { job = &ctx->jobs[jid]; pthread_mutex_unlock(&ctx->lock); - const int res = ctx->processor(&cctx, job->in_buf, job->in_sz, job->out_buf, job->out_cap); + int res; + if (dict_work && ctx->compression_mode == 1) { + ZXC_MEMCPY(dict_work + dsz, job->in_buf, job->in_sz); + res = ctx->processor(&cctx, dict_work, dsz + job->in_sz, job->out_buf, job->out_cap); + } else if (dict_work && ctx->compression_mode == 0) { + res = ctx->processor(&cctx, job->in_buf, job->in_sz, dict_work + dsz, + ctx->chunk_size + ZXC_DECOMPRESS_TAIL_PAD); + if (LIKELY(res > 0)) ZXC_MEMCPY(job->out_buf, dict_work + dsz, (size_t)res); + } else { + res = ctx->processor(&cctx, job->in_buf, job->in_sz, job->out_buf, job->out_cap); + } pthread_mutex_lock(&ctx->lock); job->result_sz = UNLIKELY(res < 0) ? 
0 : (size_t)res; @@ -415,6 +447,7 @@ static void* zxc_stream_worker(void* arg) { } pthread_mutex_unlock(&ctx->lock); } + ZXC_FREE(dict_work); zxc_cctx_free(&cctx); return NULL; } @@ -563,7 +596,8 @@ static int64_t zxc_stream_engine_run(FILE* f_in, FILE* f_out, const int n_thread const int level, const size_t block_size, const int checksum_enabled, const int seekable, zxc_chunk_processor_t func, - zxc_progress_callback_t progress_cb, void* user_data) { + zxc_progress_callback_t progress_cb, void* user_data, + const uint8_t* dict, const size_t dict_size) { zxc_stream_ctx_t ctx; ZXC_MEMSET(&ctx, 0, sizeof(ctx)); @@ -611,6 +645,8 @@ static int64_t zxc_stream_engine_run(FILE* f_in, FILE* f_out, const int n_thread ctx.progress_cb = progress_cb; ctx.progress_user_data = user_data; ctx.total_input_bytes = total_file_size; + ctx.dict = dict; + ctx.dict_size = dict_size; uint32_t d_global_hash = 0; @@ -710,7 +746,8 @@ static int64_t zxc_stream_engine_run(FILE* f_in, FILE* f_out, const int n_thread if (mode == 1 && f_out) { uint8_t h[ZXC_FILE_HEADER_SIZE]; - zxc_write_file_header(h, ZXC_FILE_HEADER_SIZE, runtime_chunk_sz, checksum_enabled, 0); + zxc_write_file_header(h, ZXC_FILE_HEADER_SIZE, runtime_chunk_sz, checksum_enabled, + (dict && dict_size) ? zxc_dict_id(dict, dict_size) : 0); if (UNLIKELY(fwrite(h, 1, ZXC_FILE_HEADER_SIZE, f_out) != ZXC_FILE_HEADER_SIZE)) ctx.io_error = 1; @@ -943,13 +980,16 @@ int64_t zxc_stream_compress(FILE* f_in, FILE* f_out, const zxc_compress_opts_t* const int level = (opts && opts->level > 0) ? opts->level : ZXC_LEVEL_DEFAULT; const size_t block_size = (opts && opts->block_size > 0) ? opts->block_size : ZXC_BLOCK_SIZE_DEFAULT; + const uint8_t* dict = opts ? (const uint8_t*)opts->dict : NULL; + const size_t dict_size = (opts && opts->dict) ? opts->dict_size : 0; zxc_progress_callback_t cb = opts ? opts->progress_cb : NULL; void* ud = opts ? 
opts->user_data : NULL; if (UNLIKELY(!zxc_validate_block_size(block_size))) return ZXC_ERROR_BAD_BLOCK_SIZE; + if (UNLIKELY(dict_size > ZXC_DICT_SIZE_MAX)) return ZXC_ERROR_DICT_TOO_LARGE; return zxc_stream_engine_run(f_in, f_out, n_threads, 1, level, block_size, checksum_enabled, - seekable, zxc_compress_chunk_wrapper, cb, ud); + seekable, zxc_compress_chunk_wrapper, cb, ud, dict, dict_size); } int64_t zxc_stream_decompress(FILE* f_in, FILE* f_out, const zxc_decompress_opts_t* opts) { @@ -957,11 +997,14 @@ int64_t zxc_stream_decompress(FILE* f_in, FILE* f_out, const zxc_decompress_opts const int n_threads = opts ? opts->n_threads : 0; const int checksum_enabled = opts ? opts->checksum_enabled : 0; + const uint8_t* dict = opts ? (const uint8_t*)opts->dict : NULL; + const size_t dict_size = (opts && opts->dict) ? opts->dict_size : 0; zxc_progress_callback_t cb = opts ? opts->progress_cb : NULL; void* ud = opts ? opts->user_data : NULL; return zxc_stream_engine_run(f_in, f_out, n_threads, 0, 0, 0, checksum_enabled, 0, - (zxc_chunk_processor_t)zxc_decompress_chunk_wrapper, cb, ud); + (zxc_chunk_processor_t)zxc_decompress_chunk_wrapper, cb, ud, + dict, dict_size); } int64_t zxc_stream_get_decompressed_size(FILE* f_in) { diff --git a/tests/test_common.h b/tests/test_common.h index f76caec0..31c11659 100644 --- a/tests/test_common.h +++ b/tests/test_common.h @@ -182,4 +182,13 @@ int test_legacy_header(void); int test_error_name(void); int test_library_info_api(void); +/* Dictionary */ +int test_dict_zxd_roundtrip(void); +int test_dict_id_deterministic(void); +int test_dict_buffer_roundtrip(void); +int test_dict_mismatch_error(void); +int test_dict_required_error(void); +int test_dict_no_dict_compat(void); +int test_dict_stream_roundtrip(void); + #endif /* ZXC_TEST_COMMON_H */ diff --git a/tests/test_dict.c b/tests/test_dict.c new file mode 100644 index 00000000..0daacaee --- /dev/null +++ b/tests/test_dict.c @@ -0,0 +1,338 @@ +/* + * ZXC - High-performance lossless 
compression + * + * Copyright (c) 2025-2026 Bertrand Lebonnois and contributors. + * SPDX-License-Identifier: BSD-3-Clause + */ + +#include "test_common.h" + +#include "../include/zxc_dict.h" + +static void gen_dict_friendly_data(uint8_t* buf, size_t size, const uint8_t* dict, + size_t dict_size) { + for (size_t i = 0; i < size; i++) { + if (i % 7 < 5 && dict_size > 5) { + size_t off = (i * 31) % (dict_size - 5); + buf[i] = dict[off + (i % 5)]; + } else { + buf[i] = (uint8_t)(i ^ (i >> 8)); + } + } +} + +int test_dict_zxd_roundtrip(void) { + printf("=== TEST: Dict - .zxd save/load roundtrip ===\n"); + + const char* content = "hello dict content for testing zxd format!"; + const size_t content_size = strlen(content); + + size_t bound = zxc_dict_save_bound(content_size); + uint8_t* zxd = (uint8_t*)malloc(bound); + int64_t written = zxc_dict_save(content, content_size, zxd, bound); + if (written < 0) { + printf(" [FAIL] zxc_dict_save returned %lld\n", (long long)written); + free(zxd); + return 0; + } + + const void* loaded_content = NULL; + size_t loaded_size = 0; + uint32_t loaded_id = 0; + int rc = zxc_dict_load(zxd, (size_t)written, &loaded_content, &loaded_size, &loaded_id); + if (rc != ZXC_OK) { + printf(" [FAIL] zxc_dict_load returned %d (%s)\n", rc, zxc_error_name(rc)); + free(zxd); + return 0; + } + + if (loaded_size != content_size || memcmp(loaded_content, content, content_size) != 0) { + printf(" [FAIL] content mismatch after load\n"); + free(zxd); + return 0; + } + + uint32_t expected_id = zxc_dict_id(content, content_size); + if (loaded_id != expected_id) { + printf(" [FAIL] dict_id mismatch: got %u, expected %u\n", loaded_id, expected_id); + free(zxd); + return 0; + } + + free(zxd); + printf("PASS\n\n"); + return 1; +} + +int test_dict_id_deterministic(void) { + printf("=== TEST: Dict - dict_id is deterministic ===\n"); + + const char* data = "some repeatable dictionary content"; + size_t size = strlen(data); + + uint32_t id1 = zxc_dict_id(data, size); + 
uint32_t id2 = zxc_dict_id(data, size); + + if (id1 != id2 || id1 == 0) { + printf(" [FAIL] dict_id not deterministic or zero: %u vs %u\n", id1, id2); + return 0; + } + + uint32_t id_null = zxc_dict_id(NULL, 0); + if (id_null != 0) { + printf(" [FAIL] dict_id(NULL, 0) should be 0, got %u\n", id_null); + return 0; + } + + printf("PASS\n\n"); + return 1; +} + +int test_dict_buffer_roundtrip(void) { + printf("=== TEST: Dict - buffer API roundtrip (all levels) ===\n"); + + const uint8_t dict_content[] = + "The quick brown fox jumps over the lazy dog. " + "Lorem ipsum dolor sit amet, consectetur adipiscing elit. " + "Pack my box with five dozen liquor jugs. " + "How vexingly quick daft zebras jump!"; + const size_t dict_size = sizeof(dict_content) - 1; + + const size_t src_size = 4096; + uint8_t* src = (uint8_t*)malloc(src_size); + gen_dict_friendly_data(src, src_size, dict_content, dict_size); + + size_t comp_bound = (size_t)zxc_compress_bound(src_size); + uint8_t* compressed = (uint8_t*)malloc(comp_bound); + uint8_t* decompressed = (uint8_t*)malloc(src_size); + + for (int level = 1; level <= 6; level++) { + zxc_compress_opts_t copts = { + .level = level, + .checksum_enabled = 1, + .dict = dict_content, + .dict_size = dict_size, + }; + int64_t comp_size = zxc_compress(src, src_size, compressed, comp_bound, &copts); + if (comp_size <= 0) { + printf(" [FAIL] level %d: compress returned %lld\n", level, (long long)comp_size); + free(src); + free(compressed); + free(decompressed); + return 0; + } + + zxc_decompress_opts_t dopts = { + .checksum_enabled = 1, + .dict = dict_content, + .dict_size = dict_size, + }; + int64_t dec_size = zxc_decompress(compressed, (size_t)comp_size, decompressed, src_size, + &dopts); + if (dec_size != (int64_t)src_size) { + printf(" [FAIL] level %d: decompress returned %lld, expected %zu\n", level, + (long long)dec_size, src_size); + free(src); + free(compressed); + free(decompressed); + return 0; + } + + if (memcmp(src, decompressed, src_size) != 
0) { + printf(" [FAIL] level %d: content mismatch\n", level); + free(src); + free(compressed); + free(decompressed); + return 0; + } + printf(" [PASS] level %d: %zu -> %lld bytes\n", level, src_size, (long long)comp_size); + } + + free(src); + free(compressed); + free(decompressed); + printf("PASS\n\n"); + return 1; +} + +int test_dict_mismatch_error(void) { + printf("=== TEST: Dict - dict_id mismatch error ===\n"); + + const uint8_t dict[] = "correct dictionary content"; + const uint8_t wrong_dict[] = "wrong dictionary contentz"; + const size_t dict_size = sizeof(dict) - 1; + + const uint8_t src[] = "some data to compress with dict"; + const size_t src_size = sizeof(src) - 1; + + size_t comp_bound = (size_t)zxc_compress_bound(src_size); + uint8_t* compressed = (uint8_t*)malloc(comp_bound); + + zxc_compress_opts_t copts = {.level = 3, .dict = dict, .dict_size = dict_size}; + int64_t comp_size = zxc_compress(src, src_size, compressed, comp_bound, &copts); + if (comp_size <= 0) { + printf(" [FAIL] compress failed: %lld\n", (long long)comp_size); + free(compressed); + return 0; + } + + uint8_t decompressed[256]; + zxc_decompress_opts_t dopts = {.dict = wrong_dict, .dict_size = sizeof(wrong_dict) - 1}; + int64_t rc = zxc_decompress(compressed, (size_t)comp_size, decompressed, sizeof(decompressed), + &dopts); + if (rc != ZXC_ERROR_DICT_MISMATCH) { + printf(" [FAIL] expected DICT_MISMATCH, got %lld (%s)\n", (long long)rc, + zxc_error_name((int)rc)); + free(compressed); + return 0; + } + + free(compressed); + printf("PASS\n\n"); + return 1; +} + +int test_dict_required_error(void) { + printf("=== TEST: Dict - dict required error ===\n"); + + const uint8_t dict[] = "required dictionary"; + const size_t dict_size = sizeof(dict) - 1; + + const uint8_t src[] = "data needing a dict"; + const size_t src_size = sizeof(src) - 1; + + size_t comp_bound = (size_t)zxc_compress_bound(src_size); + uint8_t* compressed = (uint8_t*)malloc(comp_bound); + + zxc_compress_opts_t copts = 
{.level = 3, .dict = dict, .dict_size = dict_size}; + int64_t comp_size = zxc_compress(src, src_size, compressed, comp_bound, &copts); + if (comp_size <= 0) { + printf(" [FAIL] compress failed: %lld\n", (long long)comp_size); + free(compressed); + return 0; + } + + uint8_t decompressed[256]; + zxc_decompress_opts_t dopts = {0}; + int64_t rc = zxc_decompress(compressed, (size_t)comp_size, decompressed, sizeof(decompressed), + &dopts); + if (rc != ZXC_ERROR_DICT_REQUIRED) { + printf(" [FAIL] expected DICT_REQUIRED, got %lld (%s)\n", (long long)rc, + zxc_error_name((int)rc)); + free(compressed); + return 0; + } + + free(compressed); + printf("PASS\n\n"); + return 1; +} + +int test_dict_no_dict_compat(void) { + printf("=== TEST: Dict - no-dict files decompress normally ===\n"); + + const uint8_t src[] = "data compressed without any dictionary at all, just normal data"; + const size_t src_size = sizeof(src) - 1; + + size_t comp_bound = (size_t)zxc_compress_bound(src_size); + uint8_t* compressed = (uint8_t*)malloc(comp_bound); + + zxc_compress_opts_t copts = {.level = 3, .checksum_enabled = 1}; + int64_t comp_size = zxc_compress(src, src_size, compressed, comp_bound, &copts); + if (comp_size <= 0) { + printf(" [FAIL] compress failed\n"); + free(compressed); + return 0; + } + + uint8_t decompressed[256]; + zxc_decompress_opts_t dopts = {.checksum_enabled = 1}; + int64_t dec_size = zxc_decompress(compressed, (size_t)comp_size, decompressed, + sizeof(decompressed), &dopts); + if (dec_size != (int64_t)src_size || memcmp(src, decompressed, src_size) != 0) { + printf(" [FAIL] roundtrip without dict failed\n"); + free(compressed); + return 0; + } + + free(compressed); + printf("PASS\n\n"); + return 1; +} + +int test_dict_stream_roundtrip(void) { + printf("=== TEST: Dict - stream API roundtrip ===\n"); + + const uint8_t dict_content[] = + "The quick brown fox jumps over the lazy dog. 
" + "Lorem ipsum dolor sit amet, consectetur adipiscing elit."; + const size_t dict_size = sizeof(dict_content) - 1; + + const size_t src_size = 8192; + uint8_t* src = (uint8_t*)malloc(src_size); + gen_dict_friendly_data(src, src_size, dict_content, dict_size); + + FILE* f_src = tmpfile(); + FILE* f_comp = tmpfile(); + FILE* f_dec = tmpfile(); + if (!f_src || !f_comp || !f_dec) { + printf(" [FAIL] tmpfile() failed\n"); + free(src); + return 0; + } + + fwrite(src, 1, src_size, f_src); + rewind(f_src); + + zxc_compress_opts_t copts = { + .level = ZXC_LEVEL_DEFAULT, + .checksum_enabled = 1, + .dict = dict_content, + .dict_size = dict_size, + }; + int64_t comp_sz = zxc_stream_compress(f_src, f_comp, &copts); + if (comp_sz <= 0) { + printf(" [FAIL] stream_compress returned %lld\n", (long long)comp_sz); + fclose(f_src); + fclose(f_comp); + fclose(f_dec); + free(src); + return 0; + } + + rewind(f_comp); + zxc_decompress_opts_t dopts = { + .checksum_enabled = 1, + .dict = dict_content, + .dict_size = dict_size, + }; + int64_t dec_sz = zxc_stream_decompress(f_comp, f_dec, &dopts); + if (dec_sz != (int64_t)src_size) { + printf(" [FAIL] stream_decompress returned %lld, expected %zu\n", (long long)dec_sz, + src_size); + fclose(f_src); + fclose(f_comp); + fclose(f_dec); + free(src); + return 0; + } + + rewind(f_dec); + uint8_t* result = (uint8_t*)malloc(src_size); + fread(result, 1, src_size, f_dec); + int ok = (memcmp(src, result, src_size) == 0); + + fclose(f_src); + fclose(f_comp); + fclose(f_dec); + free(result); + free(src); + + if (!ok) { + printf(" [FAIL] content mismatch\n"); + return 0; + } + + printf("PASS\n\n"); + return 1; +} diff --git a/tests/test_main.c b/tests/test_main.c index 5bad7b43..6ae40fed 100644 --- a/tests/test_main.c +++ b/tests/test_main.c @@ -113,6 +113,15 @@ static const test_entry_t g_tests[] = { TEST_CASE(test_error_name), TEST_CASE(test_library_info_api), + /* --- Dictionary --- */ + TEST_CASE(test_dict_zxd_roundtrip), + 
TEST_CASE(test_dict_id_deterministic), + TEST_CASE(test_dict_buffer_roundtrip), + TEST_CASE(test_dict_mismatch_error), + TEST_CASE(test_dict_required_error), + TEST_CASE(test_dict_no_dict_compat), + TEST_CASE(test_dict_stream_roundtrip), + /* --- Seekable (single-threaded) --- */ TEST_CASE(test_seekable_table_sizes), TEST_CASE(test_seekable_table_write), From b012695503582101b2940599b9cb339bb6e977c4 Mon Sep 17 00:00:00 2001 From: hellobertrand <5901952+hellobertrand@users.noreply.github.com> Date: Wed, 27 May 2026 15:09:07 +0200 Subject: [PATCH 06/47] feat: Implement dictionary training and integrate across all interfaces Introduces a k-gram frequency based algorithm for `zxc_train_dict`, enabling users to generate effective dictionaries from sample data. This commit also fully integrates dictionary handling into the command-line interface (via the new `-D` option), block-level compression/decompression, and the seekable API. This completes the dictionary feature by providing the necessary training capability and ensuring dictionaries are correctly processed across all compression and decompression paths, including robust buffer management for dictionary-aware chunk processing. --- include/zxc_constants.h | 2 + include/zxc_seekable.h | 14 ++++ src/cli/main.c | 82 ++++++++++++++++++---- src/lib/zxc_compress.c | 3 +- src/lib/zxc_dict.c | 148 ++++++++++++++++++++++++++++++++++++--- src/lib/zxc_dispatch.c | 39 ++++++++++- src/lib/zxc_driver.c | 4 +- src/lib/zxc_internal.h | 2 - src/lib/zxc_seekable.c | 43 ++++++++++-- tests/test_common.h | 2 + tests/test_dict.c | 151 ++++++++++++++++++++++++++++++++++++++++ tests/test_main.c | 2 + 12 files changed, 459 insertions(+), 33 deletions(-) diff --git a/include/zxc_constants.h b/include/zxc_constants.h index adc3cd9d..e7931854 100644 --- a/include/zxc_constants.h +++ b/include/zxc_constants.h @@ -71,6 +71,8 @@ */ /** @brief Maximum dictionary content size in bytes (64 KB, bounded by LZ window). 
*/ #define ZXC_DICT_SIZE_MAX (1U << 16) +/** @brief Size of the .zxd dictionary file header in bytes. */ +#define ZXC_DICT_HEADER_SIZE 16 /** @} */ /* end of dictionary */ /** diff --git a/include/zxc_seekable.h b/include/zxc_seekable.h index 84549f3b..05db650b 100644 --- a/include/zxc_seekable.h +++ b/include/zxc_seekable.h @@ -229,6 +229,20 @@ ZXC_EXPORT int64_t zxc_seekable_decompress_range_mt(zxc_seekable* s, void* dst, */ ZXC_EXPORT void zxc_seekable_free(zxc_seekable* s); +/** + * @brief Attach a pre-trained dictionary to a seekable handle. + * + * The dictionary content is copied internally; the caller may free + * @p dict after this call returns. Must be called before any + * zxc_seekable_decompress_range() call. + * + * @param[in] s Seekable handle. + * @param[in] dict Dictionary content. + * @param[in] dict_size Size in bytes (max ZXC_DICT_SIZE_MAX). + * @return @ref ZXC_OK on success, or a negative @ref zxc_error_t code. + */ +ZXC_EXPORT int zxc_seekable_set_dict(zxc_seekable* s, const void* dict, size_t dict_size); + /* ========================================================================= */ /* Seek Table Writer (low-level) */ /* ========================================================================= */ diff --git a/src/cli/main.c b/src/cli/main.c index ef5dfeeb..76d171a3 100644 --- a/src/cli/main.c +++ b/src/cli/main.c @@ -22,6 +22,8 @@ #include "../../include/zxc_buffer.h" #include "../../include/zxc_constants.h" +#include "../../include/zxc_dict.h" +#include "../../include/zxc_error.h" #include "../../include/zxc_stream.h" #define ZXC_STDIO_BUFFER_SIZE (1024 * 1024) @@ -351,17 +353,17 @@ enum { OPT_VERSION = 1000, OPT_HELP }; static int process_single_file(const char* in_path, const char* out_path_override, zxc_mode_t mode, int num_threads, int keep_input, int force, int to_stdout, int checksum, int level, size_t block_size, int json_output, - int seekable); + int seekable, const void* dict, size_t dict_size); // Forward declaration for 
processing directory static int process_directory(const char* dir_path, zxc_mode_t mode, int num_threads, int keep_input, int force, int to_stdout, int checksum, int level, size_t block_size, - int json_output, int seekable); + int json_output, int seekable, const void* dict, size_t dict_size); // OS-specific implementation of directory processing static int process_directory(const char* dir_path, zxc_mode_t mode, int num_threads, int keep_input, int force, int to_stdout, int checksum, int level, size_t block_size, - int json_output, int seekable) { + int json_output, int seekable, const void* dict, size_t dict_size) { int overall_ret = 0; #ifdef _WIN32 char search_path[MAX_PATH]; @@ -386,7 +388,7 @@ static int process_directory(const char* dir_path, zxc_mode_t mode, int num_thre if (find_data.dwFileAttributes & FILE_ATTRIBUTE_DIRECTORY) { overall_ret |= process_directory(full_path, mode, num_threads, keep_input, force, to_stdout, checksum, level, block_size, json_output, - seekable); + seekable, dict, dict_size); } else { // Check if it ends with .zxc to skip if compressing to avoid double compression if (mode == MODE_COMPRESS) { @@ -398,7 +400,7 @@ static int process_directory(const char* dir_path, zxc_mode_t mode, int num_thre overall_ret |= process_single_file(full_path, NULL, mode, num_threads, keep_input, force, to_stdout, checksum, level, block_size, json_output, - seekable); + seekable, dict, dict_size); } } while (FindNextFileA(hFind, &find_data) != 0); @@ -435,7 +437,8 @@ static int process_directory(const char* dir_path, zxc_mode_t mode, int num_thre if (S_ISDIR(st.st_mode)) { overall_ret |= process_directory(full_path, mode, num_threads, keep_input, force, to_stdout, - checksum, level, block_size, json_output, seekable); + checksum, level, block_size, json_output, seekable, + dict, dict_size); } else if (S_ISREG(st.st_mode)) { // Check if it ends with .zxc to skip if compressing to avoid double compression if (mode == MODE_COMPRESS) { @@ -448,7 +451,7 
@@ static int process_directory(const char* dir_path, zxc_mode_t mode, int num_thre overall_ret |= process_single_file(full_path, NULL, mode, num_threads, keep_input, force, to_stdout, checksum, level, block_size, json_output, - seekable); + seekable, dict, dict_size); } } free(full_path); @@ -479,6 +482,7 @@ void print_help(const char* app) { " -T, --threads N Number of threads (0=auto)\n" " -C, --checksum Enable checksum {default}\n" " -N, --no-checksum Disable checksum\n" + " -D, --dict FILE Use pre-trained dictionary (.zxd) for compression/decompression\n" " -S, --seekable Append seek table for random-access decompression\n" " -k, --keep Keep input file\n" " -f, --force Force overwrite\n" @@ -723,7 +727,7 @@ static int zxc_list_archive(const char* path, int json_output) { static int process_single_file(const char* in_path, const char* out_path_override, zxc_mode_t mode, int num_threads, int keep_input, int force, int to_stdout, int checksum_enabled, int level, size_t block_size, - int json_output, int seekable) { + int json_output, int seekable, const void* dict, size_t dict_size) { FILE* f_in = stdin; FILE* f_out = stdout; char resolved_in_path[4096] = {0}; @@ -912,6 +916,8 @@ static int process_single_file(const char* in_path, const char* out_path_overrid .block_size = block_size, .checksum_enabled = checksum_enabled, .seekable = seekable, + .dict = dict, + .dict_size = dict_size, .progress_cb = show_progress ? cli_progress_callback : NULL, .user_data = &pctx, }; @@ -920,6 +926,8 @@ static int process_single_file(const char* in_path, const char* out_path_overrid zxc_decompress_opts_t dopts = { .n_threads = num_threads, .checksum_enabled = checksum_enabled, + .dict = dict, + .dict_size = dict_size, .progress_cb = show_progress ? 
cli_progress_callback : NULL, .user_data = &pctx, }; @@ -1019,8 +1027,10 @@ int main(int argc, char** argv) { int json_output = 0; size_t block_size = 0; int seekable = 0; + const char* dict_path = NULL; - static const struct option long_options[] = {{"compress", no_argument, 0, 'z'}, + static const struct option long_options[] = {{"dict", required_argument, 0, 'D'}, + {"compress", no_argument, 0, 'z'}, {"decompress", no_argument, 0, 'd'}, {"list", no_argument, 0, 'l'}, {"test", no_argument, 0, 't'}, @@ -1045,7 +1055,7 @@ int main(int argc, char** argv) { int opt; int multiple_mode = 0; int recursive_mode = 0; - while ((opt = getopt_long(argc, argv, "123456b::B:cCdfhjklmrNqST:tvVz", long_options, NULL)) != + while ((opt = getopt_long(argc, argv, "123456b::B:cCdD:fhjklmrNqST:tvVz", long_options, NULL)) != -1) { switch (opt) { case 'z': @@ -1129,6 +1139,9 @@ int main(int argc, char** argv) { case 'S': seekable = 1; break; + case 'D': + dict_path = optarg; + break; case 'r': recursive_mode = 1; multiple_mode = 1; // Recursive implies multiple mode for files processing @@ -1201,6 +1214,51 @@ int main(int argc, char** argv) { checksum = (mode == MODE_BENCHMARK) ? 
0 : 1; } + /* Load dictionary file (.zxd) if requested */ + void* dict = NULL; + size_t dict_size = 0; + if (dict_path) { + FILE* f_dict = fopen(dict_path, "rb"); + if (!f_dict) { + fprintf(stderr, "Error: cannot open dictionary '%s': %s\n", dict_path, strerror(errno)); + return 1; + } + fseeko(f_dict, 0, SEEK_END); + const long long fsize = ftello(f_dict); + fseeko(f_dict, 0, SEEK_SET); + if (fsize <= 0 || (size_t)fsize > ZXC_DICT_SIZE_MAX + ZXC_DICT_HEADER_SIZE) { + fprintf(stderr, "Error: dictionary file '%s' has invalid size\n", dict_path); + fclose(f_dict); + return 1; + } + uint8_t* zxd_buf = (uint8_t*)malloc((size_t)fsize); + if (!zxd_buf || fread(zxd_buf, 1, (size_t)fsize, f_dict) != (size_t)fsize) { + fprintf(stderr, "Error: failed to read dictionary '%s'\n", dict_path); + free(zxd_buf); + fclose(f_dict); + return 1; + } + fclose(f_dict); + + const void* content = NULL; + size_t content_size = 0; + const int rc = zxc_dict_load(zxd_buf, (size_t)fsize, &content, &content_size, NULL); + if (rc != ZXC_OK) { + fprintf(stderr, "Error: invalid dictionary '%s': %s\n", dict_path, + zxc_error_name(rc)); + free(zxd_buf); + return 1; + } + dict = malloc(content_size); + if (!dict) { + free(zxd_buf); + return 1; + } + memcpy(dict, content, content_size); + dict_size = content_size; + free(zxd_buf); + } + /* * Benchmark Mode * Loads the entire input file into RAM to measure raw algorithm throughput @@ -1464,7 +1522,7 @@ int main(int argc, char** argv) { zxc_is_directory(current_arg)) { overall_ret |= process_directory(current_arg, mode, num_threads, keep_input, force, to_stdout, checksum, level, block_size, json_output, - seekable); + seekable, dict, dict_size); } else { const char* explicit_out_path = (!multiple_mode && optind + 1 < argc && current_arg && strcmp(current_arg, "-") != 0 && !to_stdout) @@ -1474,7 +1532,7 @@ int main(int argc, char** argv) { overall_ret |= process_single_file(current_arg, explicit_out_path, mode, num_threads, keep_input, force, to_stdout, 
checksum, level, block_size, json_output, - seekable); + seekable, dict, dict_size); } if (!multiple_mode) { diff --git a/src/lib/zxc_compress.c b/src/lib/zxc_compress.c index 309638fa..3b2f38f0 100644 --- a/src/lib/zxc_compress.c +++ b/src/lib/zxc_compress.c @@ -2324,8 +2324,7 @@ int zxc_compress_chunk_wrapper(zxc_cctx_t* RESTRICT ctx, const uint8_t* RESTRICT if (UNLIKELY(try_num)) { res = zxc_encode_block_num(ctx, block_data, block_sz, dst, dst_cap, &w); - if (res != ZXC_OK || w > (block_sz - (block_sz >> 2))) - try_num = 0; + if (res != ZXC_OK || w > (block_sz - (block_sz >> 2))) try_num = 0; } if (LIKELY(!try_num)) { diff --git a/src/lib/zxc_dict.c b/src/lib/zxc_dict.c index ec05b153..bd362fdc 100644 --- a/src/lib/zxc_dict.c +++ b/src/lib/zxc_dict.c @@ -101,16 +101,148 @@ int zxc_dict_load(const void* buf, const size_t buf_size, const void** content_o } /* ------------------------------------------------------------------------- - * Dictionary training + * Dictionary training: k-gram frequency selection + * + * Algorithm: + * 1. Concatenate all samples into a corpus. + * 2. For each position in the corpus, hash the k-gram (k = MIN_MATCH_LEN) + * and count occurrences in a fixed-size hash map. + * 3. Walk the corpus a second time: for each position, look up the k-gram + * frequency and greedily select segments whose k-grams have the highest + * frequency x length score. + * 4. The most frequent segments are placed at the END of the dictionary + * so they produce shorter offsets (closer to the block start). 
* ------------------------------------------------------------------------- */ +#define ZXC_DICT_KGRAM_LEN ZXC_LZ_MIN_MATCH_LEN +#define ZXC_DICT_HT_BITS 16 +#define ZXC_DICT_HT_SIZE (1U << ZXC_DICT_HT_BITS) +#define ZXC_DICT_HT_MASK (ZXC_DICT_HT_SIZE - 1U) + +static uint32_t zxc_dict_hash(const uint8_t* p) { + uint32_t v = zxc_le32(p); + v ^= (uint32_t)p[4]; + return (v * ZXC_LZ_HASH_PRIME1) >> (32 - ZXC_DICT_HT_BITS); +} + +/** + * @brief Segment descriptor for dictionary training, scored by coverage. + */ +typedef struct { + uint32_t offset; + uint16_t length; + uint16_t score; +} zxc_dict_seg_t; + +static int zxc_seg_cmp_desc(const void* a, const void* b) { + const zxc_dict_seg_t* sa = (const zxc_dict_seg_t*)a; + const zxc_dict_seg_t* sb = (const zxc_dict_seg_t*)b; + if (sa->score != sb->score) return (sa->score < sb->score) ? 1 : -1; + return 0; +} + int64_t zxc_train_dict(const void* const* samples, const size_t* sample_sizes, const size_t n_samples, void* dict_buf, const size_t dict_capacity) { - (void)samples; - (void)sample_sizes; - (void)n_samples; - (void)dict_buf; - (void)dict_capacity; - /* TODO: implement training algorithm */ - return ZXC_ERROR_NULL_INPUT; + if (UNLIKELY(!samples || !sample_sizes || n_samples == 0 || !dict_buf || dict_capacity == 0)) + return ZXC_ERROR_NULL_INPUT; + if (UNLIKELY(dict_capacity > ZXC_DICT_SIZE_MAX)) return ZXC_ERROR_DICT_TOO_LARGE; + + /* Step 1: concatenate samples */ + size_t corpus_size = 0; + for (size_t i = 0; i < n_samples; i++) corpus_size += sample_sizes[i]; + if (UNLIKELY(corpus_size < ZXC_DICT_KGRAM_LEN)) return ZXC_ERROR_SRC_TOO_SMALL; + + uint8_t* corpus = (uint8_t*)ZXC_MALLOC(corpus_size); + if (UNLIKELY(!corpus)) return ZXC_ERROR_MEMORY; + { + size_t pos = 0; + for (size_t i = 0; i < n_samples; i++) { + if (sample_sizes[i] > 0) ZXC_MEMCPY(corpus + pos, samples[i], sample_sizes[i]); + pos += sample_sizes[i]; + } + } + + /* Step 2: count k-gram frequencies */ + uint16_t* freq = 
(uint16_t*)ZXC_MALLOC(ZXC_DICT_HT_SIZE * sizeof(uint16_t)); + if (UNLIKELY(!freq)) { + ZXC_FREE(corpus); + return ZXC_ERROR_MEMORY; + } + ZXC_MEMSET(freq, 0, ZXC_DICT_HT_SIZE * sizeof(uint16_t)); + + const size_t kgram_limit = corpus_size - ZXC_DICT_KGRAM_LEN + 1; + for (size_t i = 0; i < kgram_limit; i++) { + const uint32_t h = zxc_dict_hash(corpus + i); + if (freq[h] < UINT16_MAX) freq[h]++; + } + + /* Step 3: score segments: stride by k-gram length to avoid overlap, + * collect top-scoring segments. */ + const size_t stride = ZXC_DICT_KGRAM_LEN; + const size_t max_segs = corpus_size / stride; + const size_t seg_alloc = (max_segs < 65536) ? max_segs : 65536; + + zxc_dict_seg_t* segs = (zxc_dict_seg_t*)ZXC_MALLOC(seg_alloc * sizeof(zxc_dict_seg_t)); + if (UNLIKELY(!segs)) { + ZXC_FREE(freq); + ZXC_FREE(corpus); + return ZXC_ERROR_MEMORY; + } + + size_t n_segs = 0; + for (size_t i = 0; i + ZXC_DICT_KGRAM_LEN <= corpus_size && n_segs < seg_alloc; i += stride) { + const uint32_t h = zxc_dict_hash(corpus + i); + const uint16_t f = freq[h]; + if (f < 2) continue; + + /* Extend the segment as long as the next k-gram is also frequent. */ + size_t end = i + ZXC_DICT_KGRAM_LEN; + while (end + ZXC_DICT_KGRAM_LEN <= corpus_size && end - i < 255) { + const uint16_t nf = freq[zxc_dict_hash(corpus + end)]; + if (nf < 2) break; + end += ZXC_DICT_KGRAM_LEN; + } + + segs[n_segs].offset = (uint32_t)i; + segs[n_segs].length = (uint16_t)(end - i); + segs[n_segs].score = f; + n_segs++; + } + + ZXC_FREE(freq); + + if (UNLIKELY(n_segs == 0)) { + /* No frequent patterns. Use tail of corpus as dict. */ + const size_t copy = (corpus_size < dict_capacity) ? corpus_size : dict_capacity; + ZXC_MEMCPY(dict_buf, corpus + corpus_size - copy, copy); + ZXC_FREE(segs); + ZXC_FREE(corpus); + return (int64_t)copy; + } + + /* Step 4: sort by score descending, fill dict from end (most frequent last + * = shortest offsets from block start). 
*/ + qsort(segs, n_segs, sizeof(zxc_dict_seg_t), zxc_seg_cmp_desc); + + uint8_t* out = (uint8_t*)dict_buf; + size_t filled = 0; + + for (size_t i = 0; i < n_segs && filled < dict_capacity; i++) { + size_t copy = segs[i].length; + if (copy > dict_capacity - filled) copy = dict_capacity - filled; + ZXC_MEMCPY(out + filled, corpus + segs[i].offset, copy); + filled += copy; + } + + /* If we haven't filled the capacity, pad with tail of corpus. */ + if (filled < dict_capacity) { + const size_t pad = dict_capacity - filled; + const size_t tail = (corpus_size > pad) ? pad : corpus_size; + ZXC_MEMCPY(out + filled, corpus + corpus_size - tail, tail); + filled += tail; + } + + ZXC_FREE(segs); + ZXC_FREE(corpus); + return (int64_t)filled; } diff --git a/src/lib/zxc_dispatch.c b/src/lib/zxc_dispatch.c index d6e2ed24..6d667272 100644 --- a/src/lib/zxc_dispatch.c +++ b/src/lib/zxc_dispatch.c @@ -1172,8 +1172,23 @@ int64_t zxc_compress_block(zxc_cctx* cctx, const void* RESTRICT src, const size_ cctx->inner.checksum_enabled = checksum_enabled; } - const int res = zxc_compress_chunk_wrapper(&cctx->inner, (const uint8_t*)src, src_size, - (uint8_t*)dst, dst_capacity); + const uint8_t* dict = opts ? (const uint8_t*)opts->dict : NULL; + const size_t dict_size = (opts && opts->dict) ? 
opts->dict_size : 0; + cctx->inner.dict_size = dict_size; + + int res; + if (dict && dict_size > 0) { + uint8_t* combined = (uint8_t*)ZXC_MALLOC(dict_size + src_size); + if (UNLIKELY(!combined)) return ZXC_ERROR_MEMORY; + ZXC_MEMCPY(combined, dict, dict_size); + ZXC_MEMCPY(combined + dict_size, src, src_size); + res = zxc_compress_chunk_wrapper(&cctx->inner, combined, dict_size + src_size, + (uint8_t*)dst, dst_capacity); + ZXC_FREE(combined); + } else { + res = zxc_compress_chunk_wrapper(&cctx->inner, (const uint8_t*)src, src_size, (uint8_t*)dst, + dst_capacity); + } if (UNLIKELY(res < 0)) return res; return (int64_t)res; } @@ -1212,12 +1227,30 @@ int64_t zxc_decompress_block(zxc_dctx* dctx, const void* RESTRICT src, const siz zxc_cctx_t* const ctx = &dctx->inner; + const uint8_t* dict = opts ? (const uint8_t*)opts->dict : NULL; + const size_t dict_size = (opts && opts->dict) ? opts->dict_size : 0; + ctx->dict_size = dict_size; + /* work_buf was pre-sized to block_size + ZXC_DECOMPRESS_TAIL_PAD inside * the matching zxc_cctx_init call above. 
*/ const size_t work_sz = block_size + ZXC_DECOMPRESS_TAIL_PAD; int res; - if (LIKELY(dst_capacity >= work_sz)) { + if (dict && dict_size > 0) { + uint8_t* dec_buf = (uint8_t*)ZXC_MALLOC(dict_size + work_sz); + if (UNLIKELY(!dec_buf)) return ZXC_ERROR_MEMORY; + ZXC_MEMCPY(dec_buf, dict, dict_size); + res = zxc_decompress_chunk_wrapper(ctx, (const uint8_t*)src, src_size, dec_buf + dict_size, + work_sz); + if (LIKELY(res > 0)) { + if (UNLIKELY((size_t)res > dst_capacity)) { + ZXC_FREE(dec_buf); + return ZXC_ERROR_DST_TOO_SMALL; + } + ZXC_MEMCPY(dst, dec_buf + dict_size, (size_t)res); + } + ZXC_FREE(dec_buf); + } else if (LIKELY(dst_capacity >= work_sz)) { res = zxc_decompress_chunk_wrapper(ctx, (const uint8_t*)src, src_size, (uint8_t*)dst, dst_capacity); } else { diff --git a/src/lib/zxc_driver.c b/src/lib/zxc_driver.c index 8b7c8b27..7eac691e 100644 --- a/src/lib/zxc_driver.c +++ b/src/lib/zxc_driver.c @@ -1003,8 +1003,8 @@ int64_t zxc_stream_decompress(FILE* f_in, FILE* f_out, const zxc_decompress_opts void* ud = opts ? opts->user_data : NULL; return zxc_stream_engine_run(f_in, f_out, n_threads, 0, 0, 0, checksum_enabled, 0, - (zxc_chunk_processor_t)zxc_decompress_chunk_wrapper, cb, ud, - dict, dict_size); + (zxc_chunk_processor_t)zxc_decompress_chunk_wrapper, cb, ud, dict, + dict_size); } int64_t zxc_stream_get_decompressed_size(FILE* f_in) { diff --git a/src/lib/zxc_internal.h b/src/lib/zxc_internal.h index c4680ebb..cb9667fe 100644 --- a/src/lib/zxc_internal.h +++ b/src/lib/zxc_internal.h @@ -344,8 +344,6 @@ extern "C" { #define ZXC_DICT_MAGIC 0x9CB0D1C7U /** @brief Current dictionary file format version. */ #define ZXC_DICT_VERSION 1 -/** @brief Size of the .zxd file header in bytes. */ -#define ZXC_DICT_HEADER_SIZE 16 /** @brief Block header size: Type(1)+Flags(1)+Reserved(1)+CRC(1)+CompSize(4). 
*/ #define ZXC_BLOCK_HEADER_SIZE 8 diff --git a/src/lib/zxc_seekable.c b/src/lib/zxc_seekable.c index 4513dde3..cde91711 100644 --- a/src/lib/zxc_seekable.c +++ b/src/lib/zxc_seekable.c @@ -170,6 +170,11 @@ struct zxc_seekable_s { /* Reusable decompression context (single-threaded path only) */ zxc_cctx_t dctx; int dctx_initialized; + + /* Dictionary (owned copy, freed in zxc_seekable_free) */ + uint8_t* dict; + size_t dict_size; + uint8_t* dict_work; /* [dict | decode_space] bounce buffer */ }; /** @@ -510,6 +515,7 @@ int64_t zxc_seekable_decompress_range(zxc_seekable* s, void* dst, const size_t d // LCOV_EXCL_STOP s->dctx_initialized = 1; } + s->dctx.dict_size = s->dict_size; /* work_buf is pre-sized to block_size + ZXC_DECOMPRESS_TAIL_PAD by the * matching zxc_cctx_init above. */ @@ -540,9 +546,12 @@ int64_t zxc_seekable_decompress_range(zxc_seekable* s, void* dst, const size_t d // LCOV_EXCL_STOP } - /* Decompress the block */ - const int dec_res = zxc_decompress_chunk_wrapper(&s->dctx, read_buf, (size_t)read_res, - s->dctx.work_buf, work_sz); + /* Decompress the block: when a dictionary is active, decode into the + * dict_work bounce buffer (which has dict content prepended) so that + * match copies referencing dictionary bytes resolve naturally. */ + uint8_t* dec_dst = s->dict_work ? s->dict_work + s->dict_size : s->dctx.work_buf; + const int dec_res = + zxc_decompress_chunk_wrapper(&s->dctx, read_buf, (size_t)read_res, dec_dst, work_sz); if (UNLIKELY(dec_res < 0)) { // LCOV_EXCL_START ZXC_FREE(read_buf); @@ -562,7 +571,7 @@ int64_t zxc_seekable_decompress_range(zxc_seekable* s, void* dst, const size_t d const size_t avail = (size_t)dec_res - skip; const size_t copy = (avail < remaining) ? 
avail : remaining; - ZXC_MEMCPY(out, s->dctx.work_buf + skip, copy); + ZXC_MEMCPY(out, dec_dst + skip, copy); out += copy; remaining -= copy; } @@ -809,12 +818,38 @@ int64_t zxc_seekable_decompress_range_mt(zxc_seekable* s, void* dst, const size_ void zxc_seekable_free(zxc_seekable* s) { if (!s) return; if (s->dctx_initialized) zxc_cctx_free(&s->dctx); + ZXC_FREE(s->dict); + ZXC_FREE(s->dict_work); ZXC_FREE(s->comp_sizes); ZXC_FREE(s->comp_offsets); ZXC_FREE(s->owned_reader_ctx); ZXC_FREE(s); } +int zxc_seekable_set_dict(zxc_seekable* s, const void* dict, const size_t dict_size) { + if (UNLIKELY(!s || !dict || dict_size == 0)) return ZXC_ERROR_NULL_INPUT; + if (UNLIKELY(dict_size > ZXC_DICT_SIZE_MAX)) return ZXC_ERROR_DICT_TOO_LARGE; + + ZXC_FREE(s->dict); + ZXC_FREE(s->dict_work); + + s->dict = (uint8_t*)ZXC_MALLOC(dict_size); + if (UNLIKELY(!s->dict)) return ZXC_ERROR_MEMORY; + ZXC_MEMCPY(s->dict, dict, dict_size); + s->dict_size = dict_size; + + const size_t work_sz = dict_size + (size_t)s->block_size + ZXC_DECOMPRESS_TAIL_PAD; + s->dict_work = (uint8_t*)ZXC_MALLOC(work_sz); + if (UNLIKELY(!s->dict_work)) { + ZXC_FREE(s->dict); + s->dict = NULL; + s->dict_size = 0; + return ZXC_ERROR_MEMORY; + } + ZXC_MEMCPY(s->dict_work, dict, dict_size); + return ZXC_OK; +} + void zxc_seekable_attach_owned_ctx(zxc_seekable* s, void* ctx) { if (s) s->owned_reader_ctx = ctx; } diff --git a/tests/test_common.h b/tests/test_common.h index 31c11659..73144c53 100644 --- a/tests/test_common.h +++ b/tests/test_common.h @@ -190,5 +190,7 @@ int test_dict_mismatch_error(void); int test_dict_required_error(void); int test_dict_no_dict_compat(void); int test_dict_stream_roundtrip(void); +int test_dict_seekable_roundtrip(void); +int test_dict_train_roundtrip(void); #endif /* ZXC_TEST_COMMON_H */ diff --git a/tests/test_dict.c b/tests/test_dict.c index 0daacaee..cd0b7b62 100644 --- a/tests/test_dict.c +++ b/tests/test_dict.c @@ -336,3 +336,154 @@ int test_dict_stream_roundtrip(void) { 
printf("PASS\n\n"); return 1; } + +int test_dict_train_roundtrip(void) { + printf("=== TEST: Dict - train then compress/decompress ===\n"); + + const char* json_samples[] = { + "{\"id\":1,\"name\":\"alice\",\"email\":\"alice@example.com\",\"active\":true}", + "{\"id\":2,\"name\":\"bob\",\"email\":\"bob@example.com\",\"active\":false}", + "{\"id\":3,\"name\":\"carol\",\"email\":\"carol@example.com\",\"active\":true}", + "{\"id\":4,\"name\":\"dave\",\"email\":\"dave@example.com\",\"active\":true}", + "{\"id\":5,\"name\":\"eve\",\"email\":\"eve@example.com\",\"active\":false}", + "{\"id\":6,\"name\":\"frank\",\"email\":\"frank@example.com\",\"active\":true}", + "{\"id\":7,\"name\":\"grace\",\"email\":\"grace@example.com\",\"active\":false}", + "{\"id\":8,\"name\":\"hank\",\"email\":\"hank@example.com\",\"active\":true}", + }; + const size_t n_samples = sizeof(json_samples) / sizeof(json_samples[0]); + const void* sample_ptrs[8]; + size_t sample_sizes[8]; + for (size_t i = 0; i < n_samples; i++) { + sample_ptrs[i] = json_samples[i]; + sample_sizes[i] = strlen(json_samples[i]); + } + + uint8_t dict_buf[4096]; + int64_t dict_sz = + zxc_train_dict(sample_ptrs, sample_sizes, n_samples, dict_buf, sizeof(dict_buf)); + if (dict_sz <= 0) { + printf(" [FAIL] train_dict returned %lld\n", (long long)dict_sz); + return 0; + } + printf(" trained dict: %lld bytes\n", (long long)dict_sz); + + const char* test_input = + "{\"id\":99,\"name\":\"zara\",\"email\":\"zara@example.com\",\"active\":true}"; + const size_t src_size = strlen(test_input); + + size_t comp_bound = (size_t)zxc_compress_bound(src_size); + uint8_t* compressed = (uint8_t*)malloc(comp_bound); + + zxc_compress_opts_t copts = { + .level = ZXC_LEVEL_DEFAULT, + .checksum_enabled = 1, + .dict = dict_buf, + .dict_size = (size_t)dict_sz, + }; + int64_t comp_size = zxc_compress(test_input, src_size, compressed, comp_bound, &copts); + if (comp_size <= 0) { + printf(" [FAIL] compress returned %lld\n", (long long)comp_size); + 
free(compressed); + return 0; + } + + zxc_compress_opts_t copts_nodict = {.level = ZXC_LEVEL_DEFAULT, .checksum_enabled = 1}; + uint8_t* comp_nodict = (uint8_t*)malloc(comp_bound); + int64_t comp_nodict_sz = + zxc_compress(test_input, src_size, comp_nodict, comp_bound, &copts_nodict); + printf(" with dict: %lld bytes, without: %lld bytes (input: %zu)\n", (long long)comp_size, + (long long)comp_nodict_sz, src_size); + free(comp_nodict); + + uint8_t decompressed[256]; + zxc_decompress_opts_t dopts = { + .checksum_enabled = 1, + .dict = dict_buf, + .dict_size = (size_t)dict_sz, + }; + int64_t dec_size = zxc_decompress(compressed, (size_t)comp_size, decompressed, + sizeof(decompressed), &dopts); + free(compressed); + + if (dec_size != (int64_t)src_size || memcmp(test_input, decompressed, src_size) != 0) { + printf(" [FAIL] roundtrip mismatch\n"); + return 0; + } + + printf("PASS\n\n"); + return 1; +} + +int test_dict_seekable_roundtrip(void) { + printf("=== TEST: Dict - seekable API roundtrip ===\n"); + + const uint8_t dict_content[] = + "The quick brown fox jumps over the lazy dog. 
" + "Lorem ipsum dolor sit amet, consectetur adipiscing elit."; + const size_t dict_size = sizeof(dict_content) - 1; + + const size_t src_size = 8192; + uint8_t* src = (uint8_t*)malloc(src_size); + gen_dict_friendly_data(src, src_size, dict_content, dict_size); + + size_t comp_bound = (size_t)zxc_compress_bound(src_size); + uint8_t* compressed = (uint8_t*)malloc(comp_bound); + + zxc_compress_opts_t copts = { + .level = ZXC_LEVEL_DEFAULT, + .checksum_enabled = 1, + .seekable = 1, + .dict = dict_content, + .dict_size = dict_size, + }; + int64_t comp_size = zxc_compress(src, src_size, compressed, comp_bound, &copts); + if (comp_size <= 0) { + printf(" [FAIL] compress returned %lld\n", (long long)comp_size); + free(src); + free(compressed); + return 0; + } + + zxc_seekable* s = zxc_seekable_open(compressed, (size_t)comp_size); + if (!s) { + printf(" [FAIL] seekable_open returned NULL\n"); + free(src); + free(compressed); + return 0; + } + + int rc = zxc_seekable_set_dict(s, dict_content, dict_size); + if (rc != ZXC_OK) { + printf(" [FAIL] seekable_set_dict returned %d\n", rc); + zxc_seekable_free(s); + free(src); + free(compressed); + return 0; + } + + uint8_t* decompressed = (uint8_t*)malloc(src_size); + int64_t dec_size = zxc_seekable_decompress_range(s, decompressed, src_size, 0, src_size); + if (dec_size != (int64_t)src_size) { + printf(" [FAIL] decompress_range returned %lld, expected %zu\n", (long long)dec_size, + src_size); + zxc_seekable_free(s); + free(src); + free(compressed); + free(decompressed); + return 0; + } + + int ok = (memcmp(src, decompressed, src_size) == 0); + zxc_seekable_free(s); + free(decompressed); + free(src); + free(compressed); + + if (!ok) { + printf(" [FAIL] content mismatch\n"); + return 0; + } + + printf("PASS\n\n"); + return 1; +} diff --git a/tests/test_main.c b/tests/test_main.c index 6ae40fed..a26d10b7 100644 --- a/tests/test_main.c +++ b/tests/test_main.c @@ -121,6 +121,8 @@ static const test_entry_t g_tests[] = { 
TEST_CASE(test_dict_required_error), TEST_CASE(test_dict_no_dict_compat), TEST_CASE(test_dict_stream_roundtrip), + TEST_CASE(test_dict_seekable_roundtrip), + TEST_CASE(test_dict_train_roundtrip), /* --- Seekable (single-threaded) --- */ TEST_CASE(test_seekable_table_sizes), From 04c37a2f1c7792b43e839493b77c47a0521f6f81 Mon Sep 17 00:00:00 2001 From: hellobertrand <5901952+hellobertrand@users.noreply.github.com> Date: Wed, 27 May 2026 15:15:38 +0200 Subject: [PATCH 07/47] docs: Document dictionary file format and usage This commit adds comprehensive documentation for the pre-trained dictionary feature. It formalizes the `.zxd` dictionary file format, updates the ZXC file header specification to include dictionary details, and provides user-facing guidance on training and using dictionaries in the `README.md`. --- README.md | 50 +++++++++++++++++++++++++++++ docs/API.md | 87 ++++++++++++++++++++++++++++++++++++++++++++++++-- docs/FORMAT.md | 85 +++++++++++++++++++++++++++++++++++++++++++----- 3 files changed, 212 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index 30b40a51..6b3e9a3a 100644 --- a/README.md +++ b/README.md @@ -460,6 +460,56 @@ zxc_compress_opts_t opts = { --- +## Dictionary Compression + +For workloads consisting of many **small, similar payloads** (< 64 KB each), a pre-trained dictionary dramatically improves compression ratio. The dictionary prefills the LZ77 sliding window at the start of each block, giving the match finder immediate access to representative patterns. + +**Typical use cases:** JSON API responses, small game assets, structured logs, key-value store records, RPC messages. 
+ +### Training a dictionary + +```bash +# Train a dictionary from a corpus of similar files +zxc --train-dict corpus.zxd samples/*.json +``` + +```c +// C API +const void* samples[] = { buf1, buf2, buf3 }; +size_t sizes[] = { len1, len2, len3 }; +uint8_t dict[32768]; +int64_t dict_sz = zxc_train_dict(samples, sizes, 3, dict, sizeof(dict)); +``` + +### Compressing with a dictionary + +```bash +# CLI +zxc -z -D corpus.zxd input.json +zxc -d -D corpus.zxd input.json.zxc +``` + +```c +// C API — compression +zxc_compress_opts_t copts = { + .level = ZXC_LEVEL_DEFAULT, + .dict = dict_content, + .dict_size = dict_sz, +}; +int64_t compressed_size = zxc_compress(src, src_size, dst, dst_cap, &copts); + +// C API — decompression (same dictionary required) +zxc_decompress_opts_t dopts = { + .dict = dict_content, + .dict_size = dict_sz, +}; +int64_t original_size = zxc_decompress(compressed, comp_size, out, out_cap, &dopts); +``` + +The dictionary is stored as an external `.zxd` file and referenced by a 32-bit ID in the ZXC file header. Decompressing without the matching dictionary returns `ZXC_ERROR_DICT_REQUIRED` or `ZXC_ERROR_DICT_MISMATCH`. See [FORMAT.md](docs/FORMAT.md) §12 for the full specification. + +--- + ## Usage ### 1. CLI diff --git a/docs/API.md b/docs/API.md index e9ad810d..4e350bbd 100644 --- a/docs/API.md +++ b/docs/API.md @@ -28,6 +28,7 @@ For the on-disk binary format see [`FORMAT.md`](FORMAT.md). - [10. Streaming API](#10-streaming-api) - [10b. Push Streaming API](#10b-push-streaming-api) - [11. Seekable API](#11-seekable-api) +- [11b. Dictionary API](#11b-dictionary-api) - [12. Error Handling](#12-error-handling) - [13. Thread Safety](#13-thread-safety) - [14. Exported Symbols Summary](#14-exported-symbols-summary) @@ -40,7 +41,9 @@ For the on-disk binary format see [`FORMAT.md`](FORMAT.md). 
zxc.h <- freestanding umbrella (no ; kernel-safe) ├── zxc_buffer.h <- Buffer API + Reusable Context API │ └── zxc_export.h <- visibility macros -├── zxc_constants.h <- version macros, compression levels, block sizes +├── zxc_constants.h <- version macros, compression levels, block sizes, dict sizes +├── zxc_dict.h <- Dictionary training, save/load, identification +│ └── zxc_export.h ├── zxc_error.h <- error codes + zxc_error_name() │ └── zxc_export.h ├── zxc_opts.h <- compression / decompression options structs @@ -204,7 +207,10 @@ typedef enum { ZXC_ERROR_IO = -11, // file read/write/seek failure ZXC_ERROR_NULL_INPUT = -12, // required pointer is NULL ZXC_ERROR_BAD_BLOCK_TYPE = -13, // unknown block type - ZXC_ERROR_BAD_BLOCK_SIZE = -14 // invalid block size + ZXC_ERROR_BAD_BLOCK_SIZE = -14, // invalid block size + ZXC_ERROR_DICT_REQUIRED = -15, // file requires a dictionary but none provided + ZXC_ERROR_DICT_MISMATCH = -16, // provided dictionary ID does not match header + ZXC_ERROR_DICT_TOO_LARGE = -17 // dictionary exceeds ZXC_DICT_SIZE_MAX } zxc_error_t; ``` @@ -225,6 +231,8 @@ typedef struct { size_t block_size; // Block size in bytes (0 = 512 KB default). int checksum_enabled; // 1 = enable checksums, 0 = disable. int seekable; // 1 = append seek table for random access. + const void* dict; // Pre-trained dictionary content (NULL = none). + size_t dict_size; // Dictionary size in bytes (0 = none, max 64 KB). zxc_progress_callback_t progress_cb; // Optional callback (NULL to disable). void* user_data; // Passed through to progress_cb. } zxc_compress_opts_t; @@ -236,6 +244,8 @@ typedef struct { typedef struct { int n_threads; // Worker thread count (0 = auto-detect). int checksum_enabled; // 1 = verify checksums, 0 = skip. + const void* dict; // Pre-trained dictionary content (NULL = none). + size_t dict_size; // Dictionary size in bytes (0 = none). zxc_progress_callback_t progress_cb; // Optional callback. void* user_data; // Passed through to progress_cb. 
} zxc_decompress_opts_t; @@ -1278,6 +1288,73 @@ Returns the encoded byte size of a seek table for `num_blocks` blocks. --- +## 11b. Dictionary API + +Declared in `<zxc_dict.h>`. Provides dictionary training, serialization (`.zxd` format), and identification. + +### `zxc_train_dict` + +```c +ZXC_EXPORT int64_t zxc_train_dict( + const void* const* samples, + const size_t* sample_sizes, + size_t n_samples, + void* dict_buf, + size_t dict_capacity // max ZXC_DICT_SIZE_MAX (64 KB) +); +``` + +Trains a dictionary from a corpus of representative samples. Returns the size of the trained dictionary, or a negative `zxc_error_t` code. + +### `zxc_dict_id` + +```c +ZXC_EXPORT uint32_t zxc_dict_id(const void* dict, size_t dict_size); +``` + +Returns a deterministic 32-bit hash of the dictionary content. This ID is stored in the ZXC file header and verified at decompression time. Returns 0 for NULL/empty input. + +### `zxc_dict_save` + +```c +ZXC_EXPORT int64_t zxc_dict_save( + const void* content, size_t content_size, + void* buf, size_t buf_capacity +); +``` + +Serializes dictionary content to the `.zxd` file format. Use `zxc_dict_save_bound(content_size)` to compute the required buffer capacity. + +### `zxc_dict_load` + +```c +ZXC_EXPORT int zxc_dict_load( + const void* buf, size_t buf_size, + const void** content_out, size_t* content_size_out, + uint32_t* dict_id_out // may be NULL +); +``` + +Validates and parses a `.zxd` file from memory. On success, `content_out` points into the input buffer (zero-copy). Returns `ZXC_OK` or a negative error code. + +### `zxc_dict_save_bound` + +```c +ZXC_EXPORT size_t zxc_dict_save_bound(size_t content_size); +``` + +Returns the maximum `.zxd` file size for a given content size (`ZXC_DICT_HEADER_SIZE + content_size`). + +### `zxc_seekable_set_dict` + +```c +ZXC_EXPORT int zxc_seekable_set_dict(zxc_seekable* s, const void* dict, size_t dict_size); +``` + +Attaches a dictionary to a seekable handle for random-access decompression.
The content is copied internally. Must be called before any `zxc_seekable_decompress_range()` call. + +--- + ## 12. Error Handling ### `zxc_error_name` @@ -1374,6 +1451,12 @@ The shared library exports **47 symbols** (verified with `nm -gU`): | 49 | `zxc_write_seek_table` | Seekable | `zxc_seekable.h` | | 50 | `zxc_seek_table_size` | Seekable | `zxc_seekable.h` | | 51 | `zxc_error_name` | Error | `zxc_error.h` | +| 52 | `zxc_train_dict` | Dictionary | `zxc_dict.h` | +| 53 | `zxc_dict_id` | Dictionary | `zxc_dict.h` | +| 54 | `zxc_dict_save` | Dictionary | `zxc_dict.h` | +| 55 | `zxc_dict_load` | Dictionary | `zxc_dict.h` | +| 56 | `zxc_dict_save_bound` | Dictionary | `zxc_dict.h` | +| 57 | `zxc_seekable_set_dict` | Seekable | `zxc_seekable.h` | No internal symbols leak into the public ABI. FMV dispatch variants (`_default`, `_neon`, `_avx2`, `_avx512`) are compiled with diff --git a/docs/FORMAT.md b/docs/FORMAT.md index 00ec4269..72db7b16 100644 --- a/docs/FORMAT.md +++ b/docs/FORMAT.md @@ -63,9 +63,12 @@ Offset Size Field - Valid block sizes are powers of 2 in the range **4 KB – 2 MB**. - **Flags** (`u8`): - Bit 7 (`0x80`): `HAS_CHECKSUM`. + - Bit 6 (`0x40`): `HAS_DICTIONARY` — a pre-trained dictionary is required for decompression. - Bits 0..3: checksum algorithm id (`0` = RapidHash-based folding). - - Bits 4..6: reserved. -- **Reserved**: 7 bytes set to zero. + - Bits 4..5: reserved. +- **Reserved / Dictionary ID**: 7 bytes. + - When `HAS_DICTIONARY` is set: bytes `0x07..0x0A` contain a `dict_id` (`u32` LE), a 32-bit hash of the dictionary content. Bytes `0x0B..0x0D` remain zero. + - When `HAS_DICTIONARY` is clear: all 7 bytes are zero. - **Header CRC16** (`u16`): computed with `zxc_hash16` on the 16-byte header where bytes `0x0E..0x0F` are zeroed. --- @@ -561,7 +564,72 @@ For decoders processing untrusted input (e.g. network data, user uploads): --- -## 12. Summary of Useful Fixed Sizes +## 12. 
Pre-Trained Dictionary Support + +### 12.1 Overview + +A pre-trained dictionary improves compression ratio on small, similar payloads +(e.g. JSON API responses, game assets, structured logs) by prefilling the LZ77 +sliding window at the start of each block. The dictionary is an external file +(`.zxd` format) referenced by a 32-bit ID in the ZXC file header. + +### 12.2 Mechanism + +The dictionary contains raw byte content (max 64 KB, bounded by the 64 KB LZ +sliding window). At compression time, the dictionary is logically prepended to +each block's input, seeding the hash tables so the match finder can reference +dictionary content immediately. At decompression time, the dictionary is +prepended to the output buffer so match copies that reference dictionary bytes +resolve naturally via pointer arithmetic. + +Since each block is independent, the dictionary prefill happens per-block. +This preserves O(1) seekable random-access: load the dictionary once, then +decompress any block independently. + +### 12.3 File header encoding + +When `HAS_DICTIONARY` (flag bit 6) is set, the reserved bytes at offsets +`0x07..0x0A` contain the `dict_id` (`u32` LE). A decoder **MUST**: +1. Verify that a dictionary is provided (`ZXC_ERROR_DICT_REQUIRED` if not). +2. Verify that `zxc_dict_id(dict, dict_size) == header.dict_id` + (`ZXC_ERROR_DICT_MISMATCH` if not). + +Older decoders that do not recognize the `HAS_DICTIONARY` flag will ignore it +(per §10.3: reserved flag bits are ignored). However, blocks compressed with a +dictionary contain match offsets that reference dictionary content; decoding +without the dictionary produces corrupt output. Per-block and global checksums +(when enabled) will detect this corruption. 
+ +### 12.4 Dictionary file format (`.zxd`) + +Dictionaries are stored as standalone `.zxd` files with the following layout: + +```text +Offset Size Field +0x00 4 Magic Word (0x9CB0D1C7 LE) +0x04 1 Dictionary format version (currently 1) +0x05 1 Flags (reserved, must be 0) +0x06 2 Content size (u16 LE, max 65535) +0x08 4 dict_id (u32 LE, hash of content) +0x0C 4 Header CRC32 (computed with this field zeroed) +0x10 N Dictionary content (raw bytes) +``` + +- **Magic Word**: `0x9CB0D1C7`. Allows immediate rejection of non-dictionary files. +- **dict_id**: deterministic 32-bit hash (RapidHash-folded) of the content bytes. Must match the `dict_id` stored in any ZXC file header that references this dictionary. +- **Header CRC32**: RapidHash-folded checksum of the 16-byte header with bytes `0x0C..0x0F` zeroed before hashing. +- **Content**: raw bytes that prefill the LZ77 window. Not compressed. + +### 12.5 Dictionary training + +The `zxc_train_dict()` function analyzes a corpus of representative samples to +select byte segments that maximize LZ77 match coverage. The most frequently +matched segments are placed at the end of the dictionary so they produce the +shortest offsets (closest to the block start in the virtual window). + +--- + +## 13. Summary of Useful Fixed Sizes - File header: **16** bytes - Block header: **8** bytes @@ -573,10 +641,11 @@ For decoders processing untrusted input (e.g. network data, user uploads): - GLO descriptors total: **32** bytes - GHI descriptors total: **24** bytes - File footer: **12** bytes +- Dictionary file header (`.zxd`): **16** bytes --- -## 13. Worked Example (Real Hexdump) +## 14. Worked Example (Real Hexdump) This example was produced with the CLI from a 10-byte input (`Hello ZXC\n`) using: @@ -586,7 +655,7 @@ zxc -z -C -1 sample.txt Generated archive size: **58 bytes**. 
-### 13.1 Full hexdump +### 14.1 Full hexdump ```text 00000000: F5 2E B0 9C 05 13 80 00 00 00 00 00 00 00 B8 90 @@ -595,7 +664,7 @@ Generated archive size: **58 bytes**. 00000030: 00 00 00 00 00 00 90 BB A1 75 ``` -### 13.2 Byte-level decoding +### 14.2 Byte-level decoding #### A) File Header (offset `0x00`, 16 bytes) @@ -665,7 +734,7 @@ global0 = 0 global1 = rotl1(global0) XOR block_crc = block_crc ``` -### 13.3 Structural view with absolute offsets +### 14.3 Structural view with absolute offsets ```text 0x00..0x0F File Header (16) @@ -676,7 +745,7 @@ global1 = rotl1(global0) XOR block_crc = block_crc 0x2E..0x39 File Footer (12) ``` -### 13.4 Seekable Variant (with Seek Table) +### 14.4 Seekable Variant (with Seek Table) Same 10-byte input (`Hello ZXC\n`), compressed with seekable mode enabled: From 82d4ebdbe78c24a50c4f45eedd5d69b47303658f Mon Sep 17 00:00:00 2001 From: hellobertrand <5901952+hellobertrand@users.noreply.github.com> Date: Wed, 27 May 2026 16:27:30 +0200 Subject: [PATCH 08/47] feat: Improve dictionary buffer management and match validation Correct buffer allocation for compression contexts to properly account for dictionaries, preventing potential overflows when dictionary content is prefixed to data blocks. Extend the valid match range in the decompressor to include dictionary data, resolving `ZXC_ERROR_BAD_OFFSET` for valid references into the dictionary. A new test case validates robust handling of large dictionaries with smaller blocks. 
--- src/lib/zxc_decompress.c | 4 ++-- src/lib/zxc_dispatch.c | 27 +++++++++++++++---------- src/lib/zxc_driver.c | 5 ++++- tests/test_common.h | 1 + tests/test_dict.c | 43 ++++++++++++++++++++++++++++++++++++++++ tests/test_main.c | 1 + 6 files changed, 67 insertions(+), 14 deletions(-) diff --git a/src/lib/zxc_decompress.c b/src/lib/zxc_decompress.c index fe786b27..6a95e843 100644 --- a/src/lib/zxc_decompress.c +++ b/src/lib/zxc_decompress.c @@ -1365,7 +1365,7 @@ static ZXC_ALWAYS_INLINE int zxc_decode_block_glo_impl(zxc_cctx_t* RESTRICT ctx, d_ptr += ll; const uint8_t* match_src = d_ptr - offset; - if (UNLIKELY(match_src < dst)) return ZXC_ERROR_BAD_OFFSET; + if (UNLIKELY(match_src < dst - ctx->dict_size)) return ZXC_ERROR_BAD_OFFSET; if (offset < ml) { for (size_t i = 0; i < ml; i++) d_ptr[i] = match_src[i]; @@ -2004,7 +2004,7 @@ static ZXC_ALWAYS_INLINE int zxc_decode_block_ghi_impl(zxc_cctx_t* RESTRICT ctx, d_ptr += ll; const uint8_t* match_src = d_ptr - offset; - if (UNLIKELY(match_src < dst)) return ZXC_ERROR_BAD_OFFSET; + if (UNLIKELY(match_src < dst - ctx->dict_size)) return ZXC_ERROR_BAD_OFFSET; if (offset < ml) { for (size_t i = 0; i < ml; i++) d_ptr[i] = match_src[i]; diff --git a/src/lib/zxc_dispatch.c b/src/lib/zxc_dispatch.c index 6d667272..e203440e 100644 --- a/src/lib/zxc_dispatch.c +++ b/src/lib/zxc_dispatch.c @@ -515,8 +515,10 @@ int64_t zxc_compress(const void* RESTRICT src, const size_t src_size, void* REST uint32_t global_hash = 0; zxc_cctx_t ctx; + const size_t eff_chunk = + dict_size > 0 ? zxc_block_size_ceil(dict_size + block_size) : block_size; // LCOV_EXCL_START - if (UNLIKELY(zxc_cctx_init(&ctx, block_size, 1, level, checksum_enabled) != ZXC_OK)) + if (UNLIKELY(zxc_cctx_init(&ctx, eff_chunk, 1, level, checksum_enabled) != ZXC_OK)) return ZXC_ERROR_MEMORY; // LCOV_EXCL_STOP ctx.dict_size = dict_size; @@ -1145,8 +1147,13 @@ int64_t zxc_compress_block(zxc_cctx* cctx, const void* RESTRICT src, const size_ (opts && opts->block_size > 0) ? 
opts->block_size : cctx->stored_block_size; const size_t min_bs = zxc_block_size_ceil(src_size); - /* Always ensure internal buffers can hold src_size. */ - const size_t effective_block_size = (block_size > min_bs) ? block_size : min_bs; + /* Always ensure internal buffers can hold src_size. + * When a dictionary is active, offset_bits must accommodate dict + block. */ + const uint8_t* b_dict = opts ? (const uint8_t*)opts->dict : NULL; + const size_t b_dict_size = (opts && opts->dict) ? opts->dict_size : 0; + const size_t base_block_size = (block_size > min_bs) ? block_size : min_bs; + const size_t effective_block_size = + b_dict_size > 0 ? zxc_block_size_ceil(b_dict_size + base_block_size) : base_block_size; cctx->stored_level = level; cctx->stored_block_size = effective_block_size; @@ -1172,17 +1179,15 @@ int64_t zxc_compress_block(zxc_cctx* cctx, const void* RESTRICT src, const size_ cctx->inner.checksum_enabled = checksum_enabled; } - const uint8_t* dict = opts ? (const uint8_t*)opts->dict : NULL; - const size_t dict_size = (opts && opts->dict) ? 
opts->dict_size : 0; - cctx->inner.dict_size = dict_size; + cctx->inner.dict_size = b_dict_size; int res; - if (dict && dict_size > 0) { - uint8_t* combined = (uint8_t*)ZXC_MALLOC(dict_size + src_size); + if (b_dict && b_dict_size > 0) { + uint8_t* combined = (uint8_t*)ZXC_MALLOC(b_dict_size + src_size); if (UNLIKELY(!combined)) return ZXC_ERROR_MEMORY; - ZXC_MEMCPY(combined, dict, dict_size); - ZXC_MEMCPY(combined + dict_size, src, src_size); - res = zxc_compress_chunk_wrapper(&cctx->inner, combined, dict_size + src_size, + ZXC_MEMCPY(combined, b_dict, b_dict_size); + ZXC_MEMCPY(combined + b_dict_size, src, src_size); + res = zxc_compress_chunk_wrapper(&cctx->inner, combined, b_dict_size + src_size, (uint8_t*)dst, dst_capacity); ZXC_FREE(combined); } else { diff --git a/src/lib/zxc_driver.c b/src/lib/zxc_driver.c index 7eac691e..c97b9586 100644 --- a/src/lib/zxc_driver.c +++ b/src/lib/zxc_driver.c @@ -373,7 +373,10 @@ static void* zxc_stream_worker(void* arg) { ? ctx->checksum_enabled : (ctx->file_has_checksum && ctx->checksum_enabled); - if (zxc_cctx_init(&cctx, ctx->chunk_size, ctx->compression_mode, ctx->compression_level, + const size_t eff_chunk = (ctx->dict_size > 0 && ctx->compression_mode == 1) + ? 
zxc_block_size_ceil(ctx->dict_size + ctx->chunk_size) + : ctx->chunk_size; + if (zxc_cctx_init(&cctx, eff_chunk, ctx->compression_mode, ctx->compression_level, unified_chk) != ZXC_OK) { // LCOV_EXCL_START zxc_cctx_free(&cctx); diff --git a/tests/test_common.h b/tests/test_common.h index 73144c53..aa1dc721 100644 --- a/tests/test_common.h +++ b/tests/test_common.h @@ -190,6 +190,7 @@ int test_dict_mismatch_error(void); int test_dict_required_error(void); int test_dict_no_dict_compat(void); int test_dict_stream_roundtrip(void); +int test_dict_large_dict_roundtrip(void); int test_dict_seekable_roundtrip(void); int test_dict_train_roundtrip(void); diff --git a/tests/test_dict.c b/tests/test_dict.c index cd0b7b62..5272cce9 100644 --- a/tests/test_dict.c +++ b/tests/test_dict.c @@ -337,6 +337,49 @@ int test_dict_stream_roundtrip(void) { return 1; } +int test_dict_large_dict_roundtrip(void) { + printf("=== TEST: Dict - large dict (32KB) with small blocks (4KB) ===\n"); + + uint8_t* dict = (uint8_t*)malloc(32768); + for (size_t i = 0; i < 32768; i++) dict[i] = (uint8_t)(i * 7 + 13); + const size_t dict_size = 32768; + + const size_t src_size = 4096; + uint8_t* src = (uint8_t*)malloc(src_size); + gen_dict_friendly_data(src, src_size, dict, dict_size); + + size_t comp_bound = (size_t)zxc_compress_bound(src_size); + uint8_t* compressed = (uint8_t*)malloc(comp_bound); + uint8_t* decompressed = (uint8_t*)malloc(src_size); + + for (int level = 1; level <= 6; level++) { + zxc_compress_opts_t copts = { + .level = level, .checksum_enabled = 1, .dict = dict, .dict_size = dict_size, + }; + int64_t comp_size = zxc_compress(src, src_size, compressed, comp_bound, &copts); + if (comp_size <= 0) { + printf(" [FAIL] level %d: compress returned %lld (%s)\n", level, (long long)comp_size, + zxc_error_name((int)comp_size)); + free(src); free(compressed); free(decompressed); free(dict); + return 0; + } + zxc_decompress_opts_t dopts = {.checksum_enabled = 1, .dict = dict, .dict_size = 
dict_size}; + int64_t dec_size = zxc_decompress(compressed, (size_t)comp_size, decompressed, src_size, + &dopts); + if (dec_size != (int64_t)src_size || memcmp(src, decompressed, src_size) != 0) { + printf(" [FAIL] level %d: dec_size=%lld err=%s\n", level, (long long)dec_size, + dec_size < 0 ? zxc_error_name((int)dec_size) : "content mismatch"); + free(src); free(compressed); free(decompressed); free(dict); + return 0; + } + printf(" [PASS] level %d\n", level); + } + + free(src); free(compressed); free(decompressed); free(dict); + printf("PASS\n\n"); + return 1; +} + int test_dict_train_roundtrip(void) { printf("=== TEST: Dict - train then compress/decompress ===\n"); diff --git a/tests/test_main.c b/tests/test_main.c index a26d10b7..2008ae67 100644 --- a/tests/test_main.c +++ b/tests/test_main.c @@ -121,6 +121,7 @@ static const test_entry_t g_tests[] = { TEST_CASE(test_dict_required_error), TEST_CASE(test_dict_no_dict_compat), TEST_CASE(test_dict_stream_roundtrip), + TEST_CASE(test_dict_large_dict_roundtrip), TEST_CASE(test_dict_seekable_roundtrip), TEST_CASE(test_dict_train_roundtrip), From 30dd9b1a8ab30f9ee320741f0fec93f2ff30bc8d Mon Sep 17 00:00:00 2001 From: hellobertrand <5901952+hellobertrand@users.noreply.github.com> Date: Wed, 27 May 2026 16:36:11 +0200 Subject: [PATCH 09/47] feat: Align dictionary header checksum with zxc file header Refactors the dictionary header to use a 16-bit `zxc_hash16` checksum. This change standardizes the integrity check mechanism for dictionaries, aligning it with the method used for the main ZXC file header. The previous 32-bit CRC32 field is replaced by the 2-byte `CRC16` and a 2-byte reserved field to maintain overall header size. 
--- docs/FORMAT.md | 5 +++-- src/lib/zxc_dict.c | 21 ++++++++++++--------- 2 files changed, 15 insertions(+), 11 deletions(-) diff --git a/docs/FORMAT.md b/docs/FORMAT.md index 72db7b16..5dec2bfe 100644 --- a/docs/FORMAT.md +++ b/docs/FORMAT.md @@ -611,13 +611,14 @@ Offset Size Field 0x05 1 Flags (reserved, must be 0) 0x06 2 Content size (u16 LE, max 65535) 0x08 4 dict_id (u32 LE, hash of content) -0x0C 4 Header CRC32 (computed with this field zeroed) +0x0C 2 Header CRC16 (zxc_hash16, computed with bytes 0x0C-0x0F zeroed) +0x0E 2 Reserved (0) 0x10 N Dictionary content (raw bytes) ``` - **Magic Word**: `0x9CB0D1C7`. Allows immediate rejection of non-dictionary files. - **dict_id**: deterministic 32-bit hash (RapidHash-folded) of the content bytes. Must match the `dict_id` stored in any ZXC file header that references this dictionary. -- **Header CRC32**: RapidHash-folded checksum of the 16-byte header with bytes `0x0C..0x0F` zeroed before hashing. +- **Header CRC16**: `zxc_hash16` checksum of the 16-byte header with bytes `0x0C..0x0F` zeroed before hashing — same method as the ZXC file header. - **Content**: raw bytes that prefill the LZ77 window. Not compressed. 
### 12.5 Dictionary training diff --git a/src/lib/zxc_dict.c b/src/lib/zxc_dict.c index bd362fdc..0f5f2dcb 100644 --- a/src/lib/zxc_dict.c +++ b/src/lib/zxc_dict.c @@ -32,7 +32,8 @@ uint32_t zxc_dict_id(const void* dict, const size_t dict_size) { * 0x05 1 Flags (reserved, 0) * 0x06 2 Content size (u16 LE) * 0x08 4 dict_id (u32 LE) - * 0x0C 4 Header CRC32 (rapidhash-folded, computed with this field zeroed) + * 0x0C 2 Header CRC16 (zxc_hash16, computed with bytes 0x0C-0x0F zeroed) + * 0x0E 2 Reserved (0) * 0x10 N Content bytes * ------------------------------------------------------------------------- */ @@ -56,10 +57,11 @@ int64_t zxc_dict_save(const void* content, const size_t content_size, void* buf, zxc_store_le16(dst + 6, (uint16_t)content_size); zxc_store_le32(dst + 8, zxc_dict_id(content, content_size)); - /* CRC32 of header with CRC field zeroed */ - zxc_store_le32(dst + 12, 0); - const uint32_t crc = zxc_checksum(dst, ZXC_DICT_HEADER_SIZE, 0); - zxc_store_le32(dst + 12, crc); + /* CRC16 of header (same method as ZXC file header) with CRC field zeroed */ + zxc_store_le16(dst + 12, 0); + zxc_store_le16(dst + 14, 0); + const uint16_t crc = zxc_hash16(dst); + zxc_store_le16(dst + 12, crc); ZXC_MEMCPY(dst + ZXC_DICT_HEADER_SIZE, content, content_size); @@ -81,12 +83,13 @@ int zxc_dict_load(const void* buf, const size_t buf_size, const void** content_o if (content_size > ZXC_DICT_SIZE_MAX) return ZXC_ERROR_DICT_TOO_LARGE; if (buf_size < ZXC_DICT_HEADER_SIZE + content_size) return ZXC_ERROR_SRC_TOO_SMALL; - /* Verify header CRC32 */ + /* Verify header CRC16 (same method as ZXC file header) */ uint8_t temp[ZXC_DICT_HEADER_SIZE]; ZXC_MEMCPY(temp, src, ZXC_DICT_HEADER_SIZE); - zxc_store_le32(temp + 12, 0); - const uint32_t expected_crc = zxc_checksum(temp, ZXC_DICT_HEADER_SIZE, 0); - if (UNLIKELY(zxc_le32(src + 12) != expected_crc)) return ZXC_ERROR_BAD_HEADER; + zxc_store_le16(temp + 12, 0); + zxc_store_le16(temp + 14, 0); + const uint16_t expected_crc = 
zxc_hash16(temp); + if (UNLIKELY(zxc_le16(src + 12) != expected_crc)) return ZXC_ERROR_BAD_HEADER; /* Verify dict_id matches content */ const uint8_t* content = src + ZXC_DICT_HEADER_SIZE; From b9f9499f2011f6cb27d0fc84c7f8732b34422f73 Mon Sep 17 00:00:00 2001 From: hellobertrand <5901952+hellobertrand@users.noreply.github.com> Date: Wed, 27 May 2026 16:43:22 +0200 Subject: [PATCH 10/47] feat: Optimize dictionary seeding for improved compression Refines the `zxc_lz_seed_dict` function to use a sparse seeding strategy for the first half of the dictionary and a dense strategy for the second. This approach prioritizes seeding positions closer to the end of the dictionary, as they are more likely to produce shorter offsets and effective matches, enhancing overall compression efficiency. Also removes redundant comments in `zxc_dict.c` and adds `UNLIKELY` hints to dictionary loading error checks for minor performance guidance. --- src/lib/zxc_compress.c | 17 ++++++++++++++++- src/lib/zxc_dict.c | 9 +++------ 2 files changed, 19 insertions(+), 7 deletions(-) diff --git a/src/lib/zxc_compress.c b/src/lib/zxc_compress.c index 3b2f38f0..123466ec 100644 --- a/src/lib/zxc_compress.c +++ b/src/lib/zxc_compress.c @@ -1347,7 +1347,22 @@ static void zxc_lz_seed_dict(const uint8_t* RESTRICT src, const size_t dict_size const int use_hash5 = (level >= 3); const size_t limit = dict_size - (ZXC_LZ_MIN_MATCH_LEN - 1); - for (size_t i = 0; i < limit; i++) { + + /* Sparse seeding for the first half, dense for the second half. + * Positions near the end of the dict produce shorter offsets and are + * more likely to yield matches, so they deserve full coverage. 
*/ + const size_t half = limit / 2; + for (size_t i = 0; i < half; i += 4) { + const uint64_t val8 = zxc_le64(src + i); + const uint32_t h = zxc_hash_func(val8, use_hash5); + const uint32_t cur_pos = (uint32_t)i; + const uint8_t tag = (uint8_t)((uint32_t)val8 ^ ((uint32_t)val8 >> 16)); + + hash_table[h] = epoch_mark | cur_pos; + hash_tags[h] = tag; + chain_table[cur_pos & ZXC_LZ_WINDOW_MASK] = 0; + } + for (size_t i = half; i < limit; i++) { const uint64_t val8 = zxc_le64(src + i); const uint32_t h = zxc_hash_func(val8, use_hash5); const uint32_t cur_pos = (uint32_t)i; diff --git a/src/lib/zxc_dict.c b/src/lib/zxc_dict.c index 0f5f2dcb..7a3621f4 100644 --- a/src/lib/zxc_dict.c +++ b/src/lib/zxc_dict.c @@ -56,8 +56,6 @@ int64_t zxc_dict_save(const void* content, const size_t content_size, void* buf, dst[5] = 0; /* flags: reserved */ zxc_store_le16(dst + 6, (uint16_t)content_size); zxc_store_le32(dst + 8, zxc_dict_id(content, content_size)); - - /* CRC16 of header (same method as ZXC file header) with CRC field zeroed */ zxc_store_le16(dst + 12, 0); zxc_store_le16(dst + 14, 0); const uint16_t crc = zxc_hash16(dst); @@ -79,11 +77,10 @@ int zxc_dict_load(const void* buf, const size_t buf_size, const void** content_o if (src[4] != ZXC_DICT_VERSION) return ZXC_ERROR_BAD_VERSION; const size_t content_size = zxc_le16(src + 6); - if (content_size == 0) return ZXC_ERROR_CORRUPT_DATA; - if (content_size > ZXC_DICT_SIZE_MAX) return ZXC_ERROR_DICT_TOO_LARGE; - if (buf_size < ZXC_DICT_HEADER_SIZE + content_size) return ZXC_ERROR_SRC_TOO_SMALL; + if (UNLIKELY(content_size == 0)) return ZXC_ERROR_CORRUPT_DATA; + if (UNLIKELY(content_size > ZXC_DICT_SIZE_MAX)) return ZXC_ERROR_DICT_TOO_LARGE; + if (UNLIKELY(buf_size < ZXC_DICT_HEADER_SIZE + content_size)) return ZXC_ERROR_SRC_TOO_SMALL; - /* Verify header CRC16 (same method as ZXC file header) */ uint8_t temp[ZXC_DICT_HEADER_SIZE]; ZXC_MEMCPY(temp, src, ZXC_DICT_HEADER_SIZE); zxc_store_le16(temp + 12, 0); From 
63f78ac99e05f724d0b29db5214ed4507ff92765 Mon Sep 17 00:00:00 2001 From: hellobertrand <5901952+hellobertrand@users.noreply.github.com> Date: Wed, 27 May 2026 17:13:49 +0200 Subject: [PATCH 11/47] feat: Expose dictionary ID retrieval APIs and CLI display Introduces `zxc_get_dict_id` to extract the dictionary ID from ZXC compressed buffers and `zxc_dict_get_id` for `.zxd` dictionary files. These functions allow quickly determining the dictionary used without full decompression or validation. The CLI's `zxc -l` (list) command is updated to display the dictionary ID for files compressed with a dictionary, or a dash for files without. This information is also included in the JSON output. This enhances the inspectability of ZXC files. --- include/zxc_buffer.h | 12 ++++ include/zxc_dict.h | 12 ++++ src/cli/main.c | 25 +++++--- src/lib/zxc_dict.c | 7 +++ src/lib/zxc_dispatch.c | 10 +++ tests/test_cli.sh | 137 +++++++++++++++++++++++++++++++++++++++++ tests/test_common.h | 1 + tests/test_dict.c | 71 +++++++++++++++++++++ tests/test_main.c | 1 + 9 files changed, 269 insertions(+), 7 deletions(-) diff --git a/include/zxc_buffer.h b/include/zxc_buffer.h index fb79d555..86f85c8a 100644 --- a/include/zxc_buffer.h +++ b/include/zxc_buffer.h @@ -167,6 +167,18 @@ ZXC_EXPORT int64_t zxc_decompress(const void* src, const size_t src_size, void* */ ZXC_EXPORT uint64_t zxc_get_decompressed_size(const void* src, const size_t src_size); +/** + * @brief Returns the dictionary ID stored in a ZXC compressed buffer. + * + * Reads the file header flag and dict_id field without decompressing. + * Returns 0 if the file does not require a dictionary or the buffer is invalid. + * + * @param[in] src Pointer to the compressed data buffer. + * @param[in] src_size Size of the compressed data in bytes. + * @return Dictionary ID, or 0 if no dictionary is required. 
+ */ +ZXC_EXPORT uint32_t zxc_get_dict_id(const void* src, size_t src_size); + /* ========================================================================= */ /* Block-Level API (no file framing) */ /* ========================================================================= */ diff --git a/include/zxc_dict.h b/include/zxc_dict.h index c250ed15..b5818fdc 100644 --- a/include/zxc_dict.h +++ b/include/zxc_dict.h @@ -101,6 +101,18 @@ ZXC_EXPORT int64_t zxc_dict_save(const void* content, size_t content_size, void* */ ZXC_EXPORT size_t zxc_dict_save_bound(size_t content_size); +/** + * @brief Returns the dictionary ID stored in a `.zxd` file buffer. + * + * Reads the dict_id field from the .zxd header without validating the full + * file. Returns 0 if the buffer is too small or the magic word doesn't match. + * + * @param[in] buf Buffer containing the .zxd file. + * @param[in] buf_size Size of @p buf in bytes. + * @return Dictionary ID, or 0 if the buffer is not a valid .zxd file. + */ +ZXC_EXPORT uint32_t zxc_dict_get_id(const void* buf, size_t buf_size); + /** * @brief Train a dictionary from a corpus of samples. * diff --git a/src/cli/main.c b/src/cli/main.c index 76d171a3..9699d2a8 100644 --- a/src/cli/main.c +++ b/src/cli/main.c @@ -672,6 +672,9 @@ static int zxc_list_archive(const char* path, int json_output) { ((uint32_t)footer[10] << 16) | ((uint32_t)footer[11] << 24); const char* checksum_method = (stored_checksum != 0) ? "RapidHash" : "-"; + // Dictionary ID (from header flag bit 6 + bytes 7-10) + const uint32_t dict_id = zxc_get_dict_id(header, ZXC_FILE_HEADER_SIZE); + // Calculate ratio (uncompressed / compressed, e.g., 2.5 means 2.5x compression) const double ratio = (file_size > 0) ? 
((double)uncompressed_size / (double)file_size) : 0.0; @@ -680,8 +683,13 @@ static int zxc_list_archive(const char* path, int json_output) { format_size_decimal((uint64_t)file_size, comp_str, sizeof(comp_str)); format_size_decimal((uint64_t)uncompressed_size, uncomp_str, sizeof(uncomp_str)); + char dict_id_str[16]; + if (dict_id) + snprintf(dict_id_str, sizeof(dict_id_str), "0x%08X", dict_id); + else + snprintf(dict_id_str, sizeof(dict_id_str), "-"); + if (json_output) { - // JSON mode printf( "{\n" " \"filename\": \"%s\",\n" @@ -691,10 +699,12 @@ static int zxc_list_archive(const char* path, int json_output) { " \"format_version\": %u,\n" " \"block_size_kb\": %zu,\n" " \"checksum_method\": \"%s\",\n" - " \"checksum_value\": \"0x%08X\"\n" + " \"checksum_value\": \"0x%08X\",\n" + " \"dict_id\": %s%s%s\n" "}\n", path, (long long)file_size, (long long)uncompressed_size, ratio, format_version, - block_units * 4, (stored_checksum != 0) ? "RapidHash" : "none", stored_checksum); + block_units * 4, (stored_checksum != 0) ? "RapidHash" : "none", stored_checksum, + dict_id ? "\"" : "", dict_id ? dict_id_str : "null", dict_id ? "\"" : ""); } else if (g_verbose) { // Verbose mode: detailed vertical layout printf( @@ -706,6 +716,7 @@ static int zxc_list_archive(const char* path, int json_output) { path, format_version, block_units, (stored_checksum != 0) ? 
"RapidHash" : "None"); if (stored_checksum != 0) printf("Checksum Value: 0x%08X\n", stored_checksum); + if (dict_id) printf("Dictionary ID: %s\n", dict_id_str); printf( "-----------------------\n" @@ -715,10 +726,10 @@ static int zxc_list_archive(const char* path, int json_output) { comp_str, uncomp_str, ratio); } else { // Normal mode: table format - printf("\n %12s %12s %5s %-10s %s\n", "Compressed", "Uncompressed", "Ratio", - "Checksum", "Filename"); - printf(" %12s %12s %5.2f %-10s %s\n", comp_str, uncomp_str, ratio, checksum_method, - path); + printf("\n %12s %12s %5s %-10s %-10s %s\n", "Compressed", "Uncompressed", + "Ratio", "Checksum", "Dict ID", "Filename"); + printf(" %12s %12s %5.2f %-10s %-10s %s\n", comp_str, uncomp_str, ratio, + checksum_method, dict_id_str, path); } return 0; diff --git a/src/lib/zxc_dict.c b/src/lib/zxc_dict.c index 7a3621f4..2ac8a5ec 100644 --- a/src/lib/zxc_dict.c +++ b/src/lib/zxc_dict.c @@ -37,6 +37,13 @@ uint32_t zxc_dict_id(const void* dict, const size_t dict_size) { * 0x10 N Content bytes * ------------------------------------------------------------------------- */ +uint32_t zxc_dict_get_id(const void* buf, const size_t buf_size) { + if (UNLIKELY(!buf || buf_size < ZXC_DICT_HEADER_SIZE)) return 0; + const uint8_t* p = (const uint8_t*)buf; + if (UNLIKELY(zxc_le32(p) != ZXC_DICT_MAGIC)) return 0; + return zxc_le32(p + 8); +} + size_t zxc_dict_save_bound(const size_t content_size) { return ZXC_DICT_HEADER_SIZE + content_size; } diff --git a/src/lib/zxc_dispatch.c b/src/lib/zxc_dispatch.c index e203440e..9efb5ba2 100644 --- a/src/lib/zxc_dispatch.c +++ b/src/lib/zxc_dispatch.c @@ -843,6 +843,16 @@ uint64_t zxc_get_decompressed_size(const void* src, const size_t src_size) { return zxc_le64(footer); } +// cppcheck-suppress unusedFunction +uint32_t zxc_get_dict_id(const void* src, const size_t src_size) { + if (UNLIKELY(!src || src_size < ZXC_FILE_HEADER_SIZE)) return 0; + + const uint8_t* const p = (const uint8_t*)src; + if 
(UNLIKELY(zxc_le32(p) != ZXC_MAGIC_WORD)) return 0; + + return (p[6] & ZXC_FILE_FLAG_HAS_DICTIONARY) ? zxc_le32(p + 7) : 0; +} + /* * ============================================================================ * REUSABLE CONTEXT API (Opaque) diff --git a/tests/test_cli.sh b/tests/test_cli.sh index b6964f76..71e7ef21 100755 --- a/tests/test_cli.sh +++ b/tests/test_cli.sh @@ -890,5 +890,142 @@ else log_fail "List command on seekable archive failed" fi +# 25. Dictionary Tests (-D) +echo "Testing Dictionary (-D)..." + +# 25.1 Create a .zxd dictionary from repetitive content +echo " Creating test dictionary..." +# Build a small helper to create a .zxd (uses the C API) +DICT_HELPER="$TEST_DIR/make_dict" +cat > "$TEST_DIR/make_dict.c" <<'DICTEOF' +#include +#include +#include "zxc.h" +#include "zxc_dict.h" +int main(int argc, char** argv) { + const char* content = + "Lorem ipsum dolor sit amet, consectetur adipiscing elit. " + "Sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. " + "Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris."; + size_t sz = strlen(content); + uint8_t buf[1024]; + int64_t w = zxc_dict_save(content, sz, buf, sizeof(buf)); + if (w <= 0) return 1; + FILE* f = fopen(argv[1], "wb"); + fwrite(buf, 1, (size_t)w, f); + fclose(f); + return 0; +} +DICTEOF + +# Compile helper (find include/lib relative to the binary) +ZXC_DIR=$(dirname "$(dirname "$ZXC_BIN")") +cc -O0 -I"${ZXC_DIR}/include" -o "$DICT_HELPER" "$TEST_DIR/make_dict.c" \ + "${ZXC_DIR}/build/libzxc.a" -lpthread 2>/dev/null + +if [ ! -f "$DICT_HELPER" ]; then + echo " [SKIP] Could not compile dict helper (missing dev headers)" +else + +DICT_FILE="$TEST_DIR/test.zxd" +"$DICT_HELPER" "$DICT_FILE" +if [ ! -f "$DICT_FILE" ]; then + log_fail "Dictionary creation failed" +fi +log_pass "Dictionary .zxd created" + +# 25.2 Round-trip with dictionary +echo " Testing dict round-trip..." +"$ZXC_BIN" -3 -D "$DICT_FILE" -c -k "$TEST_FILE_ARG" > "$TEST_DIR/test_dict.zxc" +if [ ! 
-s "$TEST_DIR/test_dict.zxc" ]; then + log_fail "Dict compression failed" +fi +"$ZXC_BIN" -d -D "$DICT_FILE" -c "$TEST_DIR/test_dict.zxc" > "$TEST_DIR/test_dict.dec" +if cmp -s "$TEST_FILE" "$TEST_DIR/test_dict.dec"; then + log_pass "Dict round-trip (-D)" +else + log_fail "Dict round-trip content mismatch" +fi + +# 25.3 List shows dict_id +echo " Testing list with dict_id..." +OUT=$("$ZXC_BIN" -l "$TEST_DIR/test_dict.zxc") +if [[ "$OUT" == *"Dict ID"* ]] && [[ "$OUT" == *"0x"* ]]; then + log_pass "List shows dict_id" +else + log_fail "List should show dict_id column with 0x value" +fi + +# 25.4 List without dict shows dash +echo " Testing list without dict shows dash..." +"$ZXC_BIN" -3 -c -k "$TEST_FILE_ARG" > "$TEST_DIR/test_nodict2.zxc" +OUT=$("$ZXC_BIN" -l "$TEST_DIR/test_nodict2.zxc") +if [[ "$OUT" == *"Dict ID"* ]] && [[ "$OUT" == *" - "* ]]; then + log_pass "List without dict shows dash" +else + log_fail "List without dict should show dash in Dict ID column" +fi + +# 25.5 JSON list shows dict_id field +echo " Testing JSON list with dict_id..." +JSON_OUT=$("$ZXC_BIN" -l -j "$TEST_DIR/test_dict.zxc") +if [[ "$JSON_OUT" == *'"dict_id"'* ]] && [[ "$JSON_OUT" == *"0x"* ]]; then + log_pass "JSON list shows dict_id" +else + log_fail "JSON list should contain dict_id field" +fi + +# 25.6 Decompression without dict should fail +echo " Testing decompress without required dict..." +set +e +"$ZXC_BIN" -d -c "$TEST_DIR/test_dict.zxc" > /dev/null 2>&1 +RET=$? +set -e +if [ $RET -ne 0 ]; then + log_pass "Decompress without dict correctly fails" +else + log_fail "Decompress without required dict should fail" +fi + +# 25.7 Dict with seekable +echo " Testing dict + seekable (-D -S)..." 
+"$ZXC_BIN" -3 -D "$DICT_FILE" -S -c -k "$TEST_FILE_ARG" > "$TEST_DIR/test_dict_seek.zxc" +"$ZXC_BIN" -d -D "$DICT_FILE" -c "$TEST_DIR/test_dict_seek.zxc" > "$TEST_DIR/test_dict_seek.dec" +if cmp -s "$TEST_FILE" "$TEST_DIR/test_dict_seek.dec"; then + log_pass "Dict + seekable (-D -S)" +else + log_fail "Dict + seekable round-trip failed" +fi + +# 25.8 Dict with all levels +echo " Testing dict across all levels..." +DICT_ALL_OK=1 +for LEVEL in 1 2 3 4 5 6; do + "$ZXC_BIN" -$LEVEL -D "$DICT_FILE" -c -k "$TEST_FILE_ARG" > "$TEST_DIR/test_dict_lvl${LEVEL}.zxc" + "$ZXC_BIN" -d -D "$DICT_FILE" -c "$TEST_DIR/test_dict_lvl${LEVEL}.zxc" > "$TEST_DIR/test_dict_lvl${LEVEL}.dec" + if ! cmp -s "$TEST_FILE" "$TEST_DIR/test_dict_lvl${LEVEL}.dec"; then + DICT_ALL_OK=0 + log_fail "Dict level $LEVEL round-trip failed" + fi +done +if [ "$DICT_ALL_OK" -eq 1 ]; then + log_pass "Dict across all levels (1-6)" +fi + +# 25.9 Invalid dict file should fail +echo " Testing invalid dict file..." +echo "not a valid dict" > "$TEST_DIR/bad.zxd" +set +e +"$ZXC_BIN" -3 -D "$TEST_DIR/bad.zxd" -c "$TEST_FILE_ARG" > /dev/null 2>&1 +RET=$? +set -e +if [ $RET -ne 0 ]; then + log_pass "Invalid dict file rejected" +else + log_fail "Invalid dict file should be rejected" +fi + +fi # end of dict helper check + echo "All tests passed!" 
exit 0 diff --git a/tests/test_common.h b/tests/test_common.h index aa1dc721..8d42929c 100644 --- a/tests/test_common.h +++ b/tests/test_common.h @@ -185,6 +185,7 @@ int test_library_info_api(void); /* Dictionary */ int test_dict_zxd_roundtrip(void); int test_dict_id_deterministic(void); +int test_dict_get_id_apis(void); int test_dict_buffer_roundtrip(void); int test_dict_mismatch_error(void); int test_dict_required_error(void); diff --git a/tests/test_dict.c b/tests/test_dict.c index 5272cce9..d753fe0e 100644 --- a/tests/test_dict.c +++ b/tests/test_dict.c @@ -88,6 +88,77 @@ int test_dict_id_deterministic(void) { return 1; } +int test_dict_get_id_apis(void) { + printf("=== TEST: Dict - zxc_get_dict_id / zxc_dict_get_id ===\n"); + + const uint8_t dict[] = "dictionary content for get_id test"; + const size_t dict_size = sizeof(dict) - 1; + const uint32_t expected_id = zxc_dict_id(dict, dict_size); + + /* Compress with dict and verify zxc_get_dict_id reads it back */ + const uint8_t src[] = "some data to compress with dict for id test purposes"; + const size_t src_size = sizeof(src) - 1; + size_t comp_bound = (size_t)zxc_compress_bound(src_size); + uint8_t* compressed = (uint8_t*)malloc(comp_bound); + + zxc_compress_opts_t copts = {.level = 1, .dict = dict, .dict_size = dict_size}; + int64_t comp_size = zxc_compress(src, src_size, compressed, comp_bound, &copts); + if (comp_size <= 0) { + printf(" [FAIL] compress returned %lld\n", (long long)comp_size); + free(compressed); + return 0; + } + + uint32_t got_id = zxc_get_dict_id(compressed, (size_t)comp_size); + if (got_id != expected_id) { + printf(" [FAIL] zxc_get_dict_id: got 0x%08X, expected 0x%08X\n", got_id, expected_id); + free(compressed); + return 0; + } + printf(" [PASS] zxc_get_dict_id returns 0x%08X\n", got_id); + + /* Compress without dict: should return 0 */ + zxc_compress_opts_t copts2 = {.level = 1}; + int64_t comp2 = zxc_compress(src, src_size, compressed, comp_bound, &copts2); + if (comp2 > 0 && 
zxc_get_dict_id(compressed, (size_t)comp2) != 0) { + printf(" [FAIL] zxc_get_dict_id should return 0 for no-dict file\n"); + free(compressed); + return 0; + } + printf(" [PASS] zxc_get_dict_id returns 0 for no-dict file\n"); + free(compressed); + + /* Save to .zxd and verify zxc_dict_get_id */ + size_t zxd_bound = zxc_dict_save_bound(dict_size); + uint8_t* zxd = (uint8_t*)malloc(zxd_bound); + int64_t zxd_size = zxc_dict_save(dict, dict_size, zxd, zxd_bound); + if (zxd_size <= 0) { + printf(" [FAIL] zxc_dict_save returned %lld\n", (long long)zxd_size); + free(zxd); + return 0; + } + + uint32_t zxd_id = zxc_dict_get_id(zxd, (size_t)zxd_size); + if (zxd_id != expected_id) { + printf(" [FAIL] zxc_dict_get_id: got 0x%08X, expected 0x%08X\n", zxd_id, expected_id); + free(zxd); + return 0; + } + printf(" [PASS] zxc_dict_get_id returns 0x%08X\n", zxd_id); + + /* Invalid buffer should return 0 */ + if (zxc_dict_get_id("bad", 3) != 0) { + printf(" [FAIL] zxc_dict_get_id should return 0 for invalid buffer\n"); + free(zxd); + return 0; + } + printf(" [PASS] zxc_dict_get_id returns 0 for invalid buffer\n"); + + free(zxd); + printf("PASS\n\n"); + return 1; +} + int test_dict_buffer_roundtrip(void) { printf("=== TEST: Dict - buffer API roundtrip (all levels) ===\n"); diff --git a/tests/test_main.c b/tests/test_main.c index 2008ae67..ebd74e19 100644 --- a/tests/test_main.c +++ b/tests/test_main.c @@ -116,6 +116,7 @@ static const test_entry_t g_tests[] = { /* --- Dictionary --- */ TEST_CASE(test_dict_zxd_roundtrip), TEST_CASE(test_dict_id_deterministic), + TEST_CASE(test_dict_get_id_apis), TEST_CASE(test_dict_buffer_roundtrip), TEST_CASE(test_dict_mismatch_error), TEST_CASE(test_dict_required_error), From 9100bad0015ae67f2c2dddcd27f842679328eaf4 Mon Sep 17 00:00:00 2001 From: hellobertrand <5901952+hellobertrand@users.noreply.github.com> Date: Wed, 27 May 2026 17:20:39 +0200 Subject: [PATCH 12/47] feat: Introduce --train-dict CLI command This new option enables users to train 
custom dictionaries from multiple input files and save them in the `.zxd` format, completing the dictionary workflow in the command-line interface. --- src/cli/main.c | 118 +++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 114 insertions(+), 4 deletions(-) diff --git a/src/cli/main.c b/src/cli/main.c index 9699d2a8..440592d6 100644 --- a/src/cli/main.c +++ b/src/cli/main.c @@ -344,10 +344,11 @@ typedef enum { MODE_DECOMPRESS, MODE_BENCHMARK, MODE_INTEGRITY, - MODE_LIST + MODE_LIST, + MODE_TRAIN_DICT } zxc_mode_t; -enum { OPT_VERSION = 1000, OPT_HELP }; +enum { OPT_VERSION = 1000, OPT_HELP, OPT_TRAIN_DICT }; // Forward declaration for recursive mode static int process_single_file(const char* in_path, const char* out_path_override, zxc_mode_t mode, @@ -469,7 +470,8 @@ void print_help(const char* app) { " -d, --decompress Decompress FILE (or stdin -> stdout)\n" " -l, --list List archive information\n" " -t, --test Test compressed FILE integrity\n" - " -b, --bench [N] Benchmark in-memory (N=seconds, default 5)\n\n" + " -b, --bench [N] Benchmark in-memory (N=seconds, default 5)\n" + " --train-dict FILE Train a dictionary from input files\n\n" "Batch Processing:\n" " -m, --multiple Multiple input files\n" " -r, --recursive Operate recursively on directories\n\n" @@ -1039,8 +1041,10 @@ int main(int argc, char** argv) { size_t block_size = 0; int seekable = 0; const char* dict_path = NULL; + const char* train_dict_path = NULL; - static const struct option long_options[] = {{"dict", required_argument, 0, 'D'}, + static const struct option long_options[] = {{"train-dict", required_argument, 0, OPT_TRAIN_DICT}, + {"dict", required_argument, 0, 'D'}, {"compress", no_argument, 0, 'z'}, {"decompress", no_argument, 0, 'd'}, {"list", no_argument, 0, 'l'}, @@ -1153,6 +1157,10 @@ int main(int argc, char** argv) { case 'D': dict_path = optarg; break; + case OPT_TRAIN_DICT: + mode = MODE_TRAIN_DICT; + train_dict_path = optarg; + break; case 'r': recursive_mode = 1; 
multiple_mode = 1; // Recursive implies multiple mode for files processing @@ -1270,6 +1278,108 @@ int main(int argc, char** argv) { free(zxd_buf); } + /* + * Train Dictionary Mode + * Reads input files as samples, trains a dictionary, saves as .zxd. + */ + if (mode == MODE_TRAIN_DICT) { + if (optind >= argc) { + fprintf(stderr, "Error: --train-dict requires input files as training samples.\n"); + free(dict); + return 1; + } + const int n_files = argc - optind; + const void** samples = (const void**)malloc((size_t)n_files * sizeof(void*)); + size_t* sample_sizes = (size_t*)malloc((size_t)n_files * sizeof(size_t)); + if (!samples || !sample_sizes) { + fprintf(stderr, "Error: memory allocation failed\n"); + free(samples); + free(sample_sizes); + free(dict); + return 1; + } + int n_loaded = 0; + for (int i = optind; i < argc; i++) { + FILE* sf = fopen(argv[i], "rb"); + if (!sf) { + fprintf(stderr, "Warning: cannot open '%s', skipping\n", argv[i]); + continue; + } + fseeko(sf, 0, SEEK_END); + size_t sz = (size_t)ftello(sf); + fseeko(sf, 0, SEEK_SET); + if (sz == 0) { fclose(sf); continue; } + uint8_t* buf = (uint8_t*)malloc(sz); + if (!buf) { fclose(sf); continue; } + fread(buf, 1, sz, sf); + fclose(sf); + samples[n_loaded] = buf; + sample_sizes[n_loaded] = sz; + n_loaded++; + } + if (n_loaded == 0) { + fprintf(stderr, "Error: no valid samples loaded\n"); + free(samples); + free(sample_sizes); + free(dict); + return 1; + } + + size_t dict_cap = 32768; + if (block_size > 0 && block_size < dict_cap) dict_cap = block_size; + uint8_t* dict_buf = (uint8_t*)malloc(dict_cap); + if (!dict_buf) { + fprintf(stderr, "Error: memory allocation failed\n"); + for (int i = 0; i < n_loaded; i++) free((void*)samples[i]); + free(samples); + free(sample_sizes); + free(dict); + return 1; + } + + int64_t dict_sz = zxc_train_dict(samples, sample_sizes, (size_t)n_loaded, + dict_buf, dict_cap); + for (int i = 0; i < n_loaded; i++) free((void*)samples[i]); + free(samples); + free(sample_sizes); 
+ + if (dict_sz <= 0) { + fprintf(stderr, "Error: training failed: %s\n", zxc_error_name((int)dict_sz)); + free(dict_buf); + free(dict); + return 1; + } + + size_t zxd_bound = zxc_dict_save_bound((size_t)dict_sz); + uint8_t* zxd = (uint8_t*)malloc(zxd_bound); + int64_t zxd_sz = zxc_dict_save(dict_buf, (size_t)dict_sz, zxd, zxd_bound); + free(dict_buf); + if (zxd_sz <= 0) { + fprintf(stderr, "Error: dict save failed: %s\n", zxc_error_name((int)zxd_sz)); + free(zxd); + free(dict); + return 1; + } + + FILE* out = fopen(train_dict_path, "wb"); + if (!out) { + fprintf(stderr, "Error: cannot create '%s': %s\n", train_dict_path, strerror(errno)); + free(zxd); + free(dict); + return 1; + } + const uint32_t trained_id = zxc_dict_get_id(zxd, (size_t)zxd_sz); + fwrite(zxd, 1, (size_t)zxd_sz, out); + fclose(out); + free(zxd); + + fprintf(stderr, "Trained dictionary: %lld bytes from %d samples -> %s (dict_id: 0x%08X)\n", + (long long)dict_sz, n_loaded, train_dict_path, trained_id); + + free(dict); + return 0; + } + /* * Benchmark Mode * Loads the entire input file into RAM to measure raw algorithm throughput From 927c78a9ae9982f75fb9bab72b185781d6052fec Mon Sep 17 00:00:00 2001 From: hellobertrand <5901952+hellobertrand@users.noreply.github.com> Date: Wed, 27 May 2026 17:21:14 +0200 Subject: [PATCH 13/47] feat: Add dictionary roundtrip fuzzer Introduces `fuzz_dict` to test the integrity of dictionary-based compression and decompression. This fuzzer uses the input to construct a dictionary and data, then verifies that compressing and decompressing with the dictionary results in identical output. 
--- .clusterfuzzlite/build.sh | 4 +- .github/workflows/fuzzing.yml | 7 ++- tests/fuzz_dict.c | 90 +++++++++++++++++++++++++++++++++++ 3 files changed, 97 insertions(+), 4 deletions(-) create mode 100644 tests/fuzz_dict.c diff --git a/.clusterfuzzlite/build.sh b/.clusterfuzzlite/build.sh index 04f057e8..e5338e84 100644 --- a/.clusterfuzzlite/build.sh +++ b/.clusterfuzzlite/build.sh @@ -6,9 +6,9 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. -AVAILABLE_FUZZERS="decompress roundtrip seekable pstream" +AVAILABLE_FUZZERS="decompress roundtrip seekable pstream dict" -LIB_SOURCES="src/lib/zxc_common.c src/lib/zxc_compress.c src/lib/zxc_decompress.c src/lib/zxc_driver.c src/lib/zxc_dispatch.c src/lib/zxc_huffman.c src/lib/zxc_seekable.c src/lib/zxc_pstream.c" +LIB_SOURCES="src/lib/zxc_common.c src/lib/zxc_compress.c src/lib/zxc_decompress.c src/lib/zxc_dict.c src/lib/zxc_driver.c src/lib/zxc_dispatch.c src/lib/zxc_huffman.c src/lib/zxc_seekable.c src/lib/zxc_pstream.c" for fuzzer in $AVAILABLE_FUZZERS; do if [ -z "${FUZZER_TARGET:-}" ] || [ "${FUZZER_TARGET}" == "$fuzzer" ]; then diff --git a/.github/workflows/fuzzing.yml b/.github/workflows/fuzzing.yml index 8c2a21fb..1ecf083c 100644 --- a/.github/workflows/fuzzing.yml +++ b/.github/workflows/fuzzing.yml @@ -42,7 +42,7 @@ jobs: fail-fast: false matrix: sanitizer: [address, undefined] - fuzzer: [decompress, roundtrip, seekable, pstream] + fuzzer: [decompress, roundtrip, seekable, pstream, dict] steps: - name: Checkout Repository @@ -132,7 +132,7 @@ jobs: CFLAGS="-g -O1 -fprofile-instr-generate -fcoverage-mapping" DEFS="-DZXC_FUNCTION_SUFFIX=_default -DZXC_ONLY_DEFAULT" INCLUDES="-I include -I src/lib/vendors" - SOURCES="src/lib/zxc_common.c src/lib/zxc_compress.c src/lib/zxc_decompress.c src/lib/zxc_driver.c src/lib/zxc_dispatch.c src/lib/zxc_huffman.c src/lib/zxc_seekable.c src/lib/zxc_pstream.c" + SOURCES="src/lib/zxc_common.c 
src/lib/zxc_compress.c src/lib/zxc_decompress.c src/lib/zxc_dict.c src/lib/zxc_driver.c src/lib/zxc_dispatch.c src/lib/zxc_huffman.c src/lib/zxc_seekable.c src/lib/zxc_pstream.c" clang $CFLAGS $INCLUDES $DEFS $SOURCES tests/fuzz_roundtrip.c \ -fsanitize=fuzzer -lm -lpthread -o build-cov/fuzz_roundtrip @@ -146,6 +146,9 @@ jobs: clang $CFLAGS $INCLUDES $DEFS $SOURCES tests/fuzz_pstream.c \ -fsanitize=fuzzer -lm -lpthread -o build-cov/fuzz_pstream + clang $CFLAGS $INCLUDES $DEFS $SOURCES tests/fuzz_dict.c \ + -fsanitize=fuzzer -lm -lpthread -o build-cov/fuzz_dict + - name: Replay Corpora run: | LLVM_PROFILE_FILE="build-cov/roundtrip.profraw" \ diff --git a/tests/fuzz_dict.c b/tests/fuzz_dict.c new file mode 100644 index 00000000..a32be656 --- /dev/null +++ b/tests/fuzz_dict.c @@ -0,0 +1,90 @@ +/* + * ZXC - High-performance lossless compression + * + * Copyright (c) 2025-2026 Bertrand Lebonnois and contributors. + * SPDX-License-Identifier: BSD-3-Clause + */ + +/* + * Fuzz target: dictionary roundtrip. + * + * The fuzzer input is split into a dictionary prefix and block data. + * The first 2 bytes encode the dict size (u16 LE, capped at 32 KB). + * The remainder is the block data to compress with that dictionary. + * The roundtrip (compress -> decompress) must produce identical output. + */ + +#include +#include +#include +#include +#include + +#include "../include/zxc_buffer.h" + +#define FUZZ_DICT_MAX_INPUT (256 << 10) /* 256 KiB */ +#define FUZZ_DICT_MAX_DICT (32 << 10) /* 32 KiB */ + +int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) { + static void* comp_buf = NULL; + static size_t comp_cap = 0; + static void* decomp_buf = NULL; + static size_t decomp_cap = 0; + + if (size < 4) return 0; + if (size > FUZZ_DICT_MAX_INPUT) return 0; + + /* First 2 bytes: dict_size (u16 LE, capped). Byte 2: level. 
*/ + size_t dict_size = (size_t)(data[0] | (data[1] << 8)); + if (dict_size > FUZZ_DICT_MAX_DICT) dict_size = FUZZ_DICT_MAX_DICT; + const int level = (data[2] % 6) + 1; + data += 3; + size -= 3; + + if (dict_size >= size) dict_size = size / 2; + const uint8_t* dict = data; + const uint8_t* src = data + dict_size; + const size_t src_size = size - dict_size; + + if (src_size == 0) return 0; + + const uint64_t bound64 = zxc_compress_bound(src_size); + if (bound64 == 0 || bound64 > SIZE_MAX) return 0; + const size_t bound = (size_t)bound64; + if (bound > comp_cap) { + void* nb = realloc(comp_buf, bound); + if (!nb) return 0; + comp_buf = nb; + comp_cap = bound; + } + + zxc_compress_opts_t copts = { + .level = level, + .checksum_enabled = 1, + .dict = dict_size > 0 ? dict : NULL, + .dict_size = dict_size, + }; + const int64_t csize = zxc_compress(src, src_size, comp_buf, bound, &copts); + if (csize < 0) return 0; + + if (src_size > decomp_cap) { + void* nb = realloc(decomp_buf, src_size); + if (!nb) return 0; + decomp_buf = nb; + decomp_cap = src_size; + } + + zxc_decompress_opts_t dopts = { + .checksum_enabled = 1, + .dict = dict_size > 0 ? dict : NULL, + .dict_size = dict_size, + }; + const int64_t dsize = zxc_decompress(comp_buf, (size_t)csize, decomp_buf, src_size, &dopts); + + if (dsize >= 0) { + assert((size_t)dsize == src_size); + assert(memcmp(src, decomp_buf, src_size) == 0); + } + + return 0; +} From 6912a422a26ef4e12542ab1c3854a8942542af01 Mon Sep 17 00:00:00 2001 From: hellobertrand <5901952+hellobertrand@users.noreply.github.com> Date: Wed, 27 May 2026 17:42:46 +0200 Subject: [PATCH 14/47] feat: Enable conformance testing for dictionary-compressed files The conformance test suite now supports zxc archives that require an external dictionary for decompression. The `test_valid_vector` function is enhanced to automatically detect the dictionary ID from the zxc archive and search for a matching `.zxd` dictionary file in the same directory. 
If found, the dictionary is loaded and supplied to the decompressor. New valid vector test cases are added to validate dictionary-based compression. --- conformance/test_conformance.c | 78 +++++++++++++++++- conformance/valid/dict_text.zxd | Bin 0 -> 207 bytes conformance/valid/text_1k_dict.expected | 1 + conformance/valid/text_1k_dict.zxc | Bin 0 -> 758 bytes .../valid/text_1k_dict_seekable.expected | 1 + conformance/valid/text_1k_dict_seekable.zxc | Bin 0 -> 770 bytes 6 files changed, 79 insertions(+), 1 deletion(-) create mode 100644 conformance/valid/dict_text.zxd create mode 100644 conformance/valid/text_1k_dict.expected create mode 100644 conformance/valid/text_1k_dict.zxc create mode 100644 conformance/valid/text_1k_dict_seekable.expected create mode 100644 conformance/valid/text_1k_dict_seekable.zxc diff --git a/conformance/test_conformance.c b/conformance/test_conformance.c index 42a383f5..70346f7f 100644 --- a/conformance/test_conformance.c +++ b/conformance/test_conformance.c @@ -18,6 +18,7 @@ #endif #include "../include/zxc_buffer.h" +#include "../include/zxc_dict.h" #include "../include/zxc_error.h" /* ---------- helpers ------------------------------------------------------ */ @@ -70,6 +71,63 @@ static int has_suffix(const char *s, const char *suffix) /* ---------- valid vector test -------------------------------------------- */ +/** + * @brief Searches for a .zxd dictionary file in the same directory as @p zxc_path + * whose dict_id matches @p target_id. Returns the loaded content (caller frees). 
+ */ +static uint8_t *find_dict_for_id(const char *zxc_path, uint32_t target_id, + const void **content_out, size_t *content_size_out) +{ + /* Derive directory from zxc_path */ + char dir[512]; + strncpy(dir, zxc_path, sizeof(dir) - 1); + dir[sizeof(dir) - 1] = '\0'; + char *sep = strrchr(dir, '/'); + if (sep) *(sep + 1) = '\0'; else strcpy(dir, "./"); + +#ifdef _WIN32 + char pattern[512]; + snprintf(pattern, sizeof(pattern), "%s*.zxd", dir); + WIN32_FIND_DATAA fd; + HANDLE hf = FindFirstFileA(pattern, &fd); + if (hf == INVALID_HANDLE_VALUE) return NULL; + do { + char path[512]; + snprintf(path, sizeof(path), "%s%s", dir, fd.cFileName); + size_t sz = 0; + uint8_t *buf = read_file(path, &sz); + if (buf && zxc_dict_get_id(buf, sz) == target_id) { + if (zxc_dict_load(buf, sz, content_out, content_size_out, NULL) == 0) { + FindClose(hf); + return buf; + } + } + free(buf); + } while (FindNextFileA(hf, &fd)); + FindClose(hf); +#else + DIR *dp = opendir(dir); + if (!dp) return NULL; + struct dirent *ent; + while ((ent = readdir(dp)) != NULL) { + if (!has_suffix(ent->d_name, ".zxd")) continue; + char path[512]; + snprintf(path, sizeof(path), "%s%s", dir, ent->d_name); + size_t sz = 0; + uint8_t *buf = read_file(path, &sz); + if (buf && zxc_dict_get_id(buf, sz) == target_id) { + if (zxc_dict_load(buf, sz, content_out, content_size_out, NULL) == 0) { + closedir(dp); + return buf; + } + } + free(buf); + } + closedir(dp); +#endif + return NULL; +} + static int test_valid_vector(const char *zxc_path, const char *expected_path) { size_t comp_sz = 0, expected_sz = 0; @@ -87,6 +145,21 @@ static int test_valid_vector(const char *zxc_path, const char *expected_path) return 0; } + /* Auto-detect dictionary: if the archive has a dict_id, find the .zxd */ + const void *dict = NULL; + size_t dict_size = 0; + uint8_t *dict_buf = NULL; + uint32_t did = zxc_get_dict_id(comp, comp_sz); + if (did != 0) { + dict_buf = find_dict_for_id(zxc_path, did, &dict, &dict_size); + if (!dict_buf) { + 
fprintf(stderr, "FAIL: %s requires dict 0x%08X but no matching .zxd found\n", + zxc_path, did); + free(comp); free(expected); + return 0; + } + } + int ok = 1; uint64_t dec_sz = zxc_get_decompressed_size(comp, comp_sz); @@ -103,8 +176,10 @@ static int test_valid_vector(const char *zxc_path, const char *expected_path) fprintf(stderr, "FAIL: %s OOM\n", zxc_path); ok = 0; } else { + zxc_decompress_opts_t dopts = {0}; + if (dict) { dopts.dict = dict; dopts.dict_size = dict_size; } int64_t result = zxc_decompress(comp, comp_sz, - output, (size_t)dec_sz, NULL); + output, (size_t)dec_sz, &dopts); if (result < 0) { fprintf(stderr, "FAIL: %s decompress failed: %s\n", zxc_path, zxc_error_name((int)result)); @@ -121,6 +196,7 @@ static int test_valid_vector(const char *zxc_path, const char *expected_path) } } + free(dict_buf); free(comp); free(expected); return ok; diff --git a/conformance/valid/dict_text.zxd b/conformance/valid/dict_text.zxd new file mode 100644 index 0000000000000000000000000000000000000000..b5735f652ad8fd57a2c816c33bc73faaba8452fb GIT binary patch literal 207 zcmXBM!EHh@5QO1JCv(UVy-K*D6tLJUG?LeT*t>`p=z${WAjh1O7ScdUh$C(@hyVNh zdOmi`$MSRhzQ6r0%b|5Oa2sLBt+b9ohnV^vsnwy$)uTf)H;%-5h8FsYqvjbv8N*tJ zt8Fu8P0Y+u9pNZ)nmp8Bw8G?E1yXnkS@G`BN}D!}|KmEZ*dCYQKyAZ&Wav$MqKmUt ML@8pay=0E{4;!>i)&Kwi literal 0 HcmV?d00001 diff --git a/conformance/valid/text_1k_dict.expected b/conformance/valid/text_1k_dict.expected new file mode 100644 index 00000000..e519353a --- /dev/null +++ b/conformance/valid/text_1k_dict.expected @@ -0,0 +1 @@ +Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum. 
Curabitur pretium tincidunt lacus. Nulla gravida orci a odio. Nullam varius, turpis et commodo pharetra, est eros bibendum elit, nec luctus magna felis sollicitudin mauris. Integer in mauris eu nibh euismod gravida. Duis ac tellus et risus vulputate vehicula. Donec lobortis risus a elit. Etiam tempor. Ut ullamcorper, ligula ut dictum pharetra, nisi nunc fringilla magna, in commodo elit erat nec turpis. Ut pharetra augue nec augue. Nam elit agna, endrerit sit amet, tincidunt ac. \ No newline at end of file diff --git a/conformance/valid/text_1k_dict.zxc b/conformance/valid/text_1k_dict.zxc new file mode 100644 index 0000000000000000000000000000000000000000..9aba3e7abc189ebbcdd0b3f3eae091c7a13313d7 GIT binary patch literal 758 zcmX|9!Ds(BMH5y$A|*R9Hq3ya!A`K}7T-cu8$f&1}I;_oTaPiQq*r2QPkr z@gR5$2>Al>q<8;7`~o+6j@9eNhN7x#s_VUaHGk@p_v=@_{CxWTjWOo!>%y3?r7^dz z>1|ts^ONIjIiKy$$}XwT296_xBp!hdfDo)72H$w~F4DeWs$oYs0!kv_Ac5FHpfS*l zTtk5qXkzYr853&BoO(@Bj0BeZUIr3qrDIIsr0)?%40@4J!|s8lF_8p_>=Jluk*wBP z8M#622F^PegXe}^nm`n-QZOt1h_EjVzHQkuLPL>4{yO+R!=vK?xKSDjdYt+oLk;&N z;FJ|vTT)^fAhEODqo)8PqlGSDyx$D|?9(&^Dv0Ldtd}*f>LN z8*s8jrQjn>d8!&IP}XDujx6+Ilf+gN5n?85l~JQZ4Jd(?!ULD6qX3|~IiAkcbJUH% zij{>#D(ezjA80UFIb_DF3hJ)>62M2uTprCe+V@;FwDsjzi_^vL)5T)zi79{m*eQak_RQmx_0MLr zI4%y0cjcD(R2&sIO;bIuZdYgJv+`xNS3c~Rb7J1(!o;o8-Wix)N@ Su6_G>rhliN@f{5rv@RHh|+S!7c?n!so62VKr9K84e z#)IH3Amj_gliu?O;upBltFe07*ick=b#=X0uli4Y@?q`T*PqW%-x_1yy(x_ORvL5X zhMu-`bFuP1TaIVDx3Vkhvw`EtAc+T{eINwu``$NRy^FLbm}=M-j)0N~I7lFN5NHTA zBiB&i1R9w8PKJbM6eEGVdk>{nMk%$2>wF=&}VRUu`&5^R{E zmJK*wqEhe?COlD%6ew#l21gb;u}R`o0})~-Yn4%>P4y^&mBJ&JsHFg)x;dWA)N|B{ zz>1ZHL@MhNS|8|Uu5!qXRTb1(`6Y_io)bshNsN6MD2$=wRx@D_fLSNPEW~uywgL*J#}f)zH>g4;E*O-)D=(<}*|N`mtREQ|+3kCu?8K zMsZvm7VpbV^SL-GZkwihS>3J9%NOPAYPW=PtC*UH=9XDEN9I*|e{@{lgu~JG^~;ww c9*@3%I@jN+tNd%sNd4)l{^F0-GG_AqKTm@FRsaA1 literal 0 HcmV?d00001 From 21c615c2a795ad8f06da459059a8f0f9e71bf789 Mon Sep 17 00:00:00 2001 From: hellobertrand <5901952+hellobertrand@users.noreply.github.com> Date: Wed, 27 May 
2026 17:42:57 +0200 Subject: [PATCH 15/47] feat: Integrate dictionary component into build --- meson.build | 2 ++ 1 file changed, 2 insertions(+) diff --git a/meson.build b/meson.build index a8af39f3..ab86a3da 100644 --- a/meson.build +++ b/meson.build @@ -81,6 +81,7 @@ endforeach libzxc_sources = files( 'src/lib/zxc_common.c', + 'src/lib/zxc_dict.c', 'src/lib/zxc_driver.c', 'src/lib/zxc_dispatch.c', 'src/lib/zxc_pstream.c', @@ -137,6 +138,7 @@ install_headers( 'include/zxc.h', 'include/zxc_buffer.h', 'include/zxc_constants.h', + 'include/zxc_dict.h', 'include/zxc_error.h', 'include/zxc_export.h', 'include/zxc_opts.h', From c26c3b7b81dfd74125e70004a33d36411d59a6f7 Mon Sep 17 00:00:00 2001 From: hellobertrand <5901952+hellobertrand@users.noreply.github.com> Date: Wed, 27 May 2026 17:57:35 +0200 Subject: [PATCH 16/47] feat: Enable dictionary support for seekable multi-threaded decompression The `zxc_seek_mt_worker` function now allocates a thread-local "bounce buffer" for dictionary-based decompression. This buffer concatenates the dictionary content with the necessary working space for the decompressor, ensuring each worker can decompress chunks independently using the provided dictionary. A new test case `test_dict_seekable_mt_roundtrip` has been added to validate this functionality, covering full-range and sub-range multi-threaded decompression with a dictionary. 
--- src/lib/zxc_seekable.c | 25 +++++++++++-- tests/test_common.h | 1 + tests/test_dict.c | 81 ++++++++++++++++++++++++++++++++++++++++++ tests/test_main.c | 1 + 4 files changed, 105 insertions(+), 3 deletions(-) diff --git a/src/lib/zxc_seekable.c b/src/lib/zxc_seekable.c index cde91711..a62edf63 100644 --- a/src/lib/zxc_seekable.c +++ b/src/lib/zxc_seekable.c @@ -647,13 +647,27 @@ static void* zxc_seek_mt_worker(void* arg) { return NULL; } // LCOV_EXCL_STOP + dctx.dict_size = s->dict_size; const size_t work_sz = (size_t)s->block_size + ZXC_DECOMPRESS_TAIL_PAD; + /* Thread-local dict bounce buffer: [dict_content | decode_space] */ + uint8_t* dict_work = NULL; + if (s->dict_size > 0 && s->dict) { + dict_work = (uint8_t*)ZXC_MALLOC(s->dict_size + work_sz); + if (UNLIKELY(!dict_work)) { + zxc_cctx_free(&dctx); + job->result = ZXC_ERROR_MEMORY; + return NULL; + } + ZXC_MEMCPY(dict_work, s->dict, s->dict_size); + } + /* Read compressed block */ const uint32_t csz = s->comp_sizes[bi]; uint8_t* const read_buf = (uint8_t*)ZXC_MALLOC(csz + ZXC_PAD_SIZE); // LCOV_EXCL_START if (UNLIKELY(!read_buf)) { + ZXC_FREE(dict_work); zxc_cctx_free(&dctx); job->result = ZXC_ERROR_MEMORY; return NULL; @@ -664,24 +678,28 @@ static void* zxc_seek_mt_worker(void* arg) { // LCOV_EXCL_START if (UNLIKELY(read_res < 0)) { ZXC_FREE(read_buf); + ZXC_FREE(dict_work); zxc_cctx_free(&dctx); job->result = read_res; return NULL; } // LCOV_EXCL_STOP - /* Decompress */ + /* Decompress — use dict bounce buffer when dictionary is active */ + uint8_t* dec_dst = dict_work ? 
dict_work + s->dict_size : dctx.work_buf; const int dec_res = - zxc_decompress_chunk_wrapper(&dctx, read_buf, (size_t)read_res, dctx.work_buf, work_sz); + zxc_decompress_chunk_wrapper(&dctx, read_buf, (size_t)read_res, dec_dst, work_sz); ZXC_FREE(read_buf); // LCOV_EXCL_START if (UNLIKELY(dec_res < 0)) { + ZXC_FREE(dict_work); zxc_cctx_free(&dctx); job->result = dec_res; return NULL; } if (UNLIKELY((size_t)dec_res < job->skip + job->copy_len)) { + ZXC_FREE(dict_work); zxc_cctx_free(&dctx); job->result = ZXC_ERROR_CORRUPT_DATA; return NULL; @@ -689,8 +707,9 @@ static void* zxc_seek_mt_worker(void* arg) { // LCOV_EXCL_STOP /* Copy the requested portion directly into the caller's output buffer */ - ZXC_MEMCPY(job->dst, dctx.work_buf + job->skip, job->copy_len); + ZXC_MEMCPY(job->dst, dec_dst + job->skip, job->copy_len); + ZXC_FREE(dict_work); zxc_cctx_free(&dctx); job->result = 0; return NULL; diff --git a/tests/test_common.h b/tests/test_common.h index 8d42929c..a727bd37 100644 --- a/tests/test_common.h +++ b/tests/test_common.h @@ -194,5 +194,6 @@ int test_dict_stream_roundtrip(void); int test_dict_large_dict_roundtrip(void); int test_dict_seekable_roundtrip(void); int test_dict_train_roundtrip(void); +int test_dict_seekable_mt_roundtrip(void); #endif /* ZXC_TEST_COMMON_H */ diff --git a/tests/test_dict.c b/tests/test_dict.c index d753fe0e..14887847 100644 --- a/tests/test_dict.c +++ b/tests/test_dict.c @@ -601,3 +601,84 @@ int test_dict_seekable_roundtrip(void) { printf("PASS\n\n"); return 1; } + +int test_dict_seekable_mt_roundtrip(void) { + printf("=== TEST: Dict - seekable MT roundtrip ===\n"); + + const uint8_t dict_content[] = + "The quick brown fox jumps over the lazy dog. 
" + "Lorem ipsum dolor sit amet, consectetur adipiscing elit."; + const size_t dict_size = sizeof(dict_content) - 1; + + /* Use 32KB of data with 4KB blocks = 8 blocks, enough for MT */ + const size_t src_size = 32768; + uint8_t* src = (uint8_t*)malloc(src_size); + gen_dict_friendly_data(src, src_size, dict_content, dict_size); + + size_t comp_bound = (size_t)zxc_compress_bound(src_size); + uint8_t* compressed = (uint8_t*)malloc(comp_bound); + + zxc_compress_opts_t copts = { + .level = ZXC_LEVEL_DEFAULT, + .block_size = 4096, + .checksum_enabled = 1, + .seekable = 1, + .dict = dict_content, + .dict_size = dict_size, + }; + int64_t comp_size = zxc_compress(src, src_size, compressed, comp_bound, &copts); + if (comp_size <= 0) { + printf(" [FAIL] compress returned %lld\n", (long long)comp_size); + free(src); + free(compressed); + return 0; + } + + zxc_seekable* s = zxc_seekable_open(compressed, (size_t)comp_size); + if (!s) { + printf(" [FAIL] seekable_open returned NULL\n"); + free(src); + free(compressed); + return 0; + } + zxc_seekable_set_dict(s, dict_content, dict_size); + + /* Full range MT decompress */ + uint8_t* decompressed = (uint8_t*)malloc(src_size); + int64_t dec_size = zxc_seekable_decompress_range_mt(s, decompressed, src_size, 0, src_size, 4); + if (dec_size != (int64_t)src_size) { + printf(" [FAIL] decompress_range_mt returned %lld (%s)\n", (long long)dec_size, + dec_size < 0 ? 
zxc_error_name((int)dec_size) : "size mismatch"); + zxc_seekable_free(s); + free(src); + free(compressed); + free(decompressed); + return 0; + } + + int ok = (memcmp(src, decompressed, src_size) == 0); + if (!ok) { + for (size_t i = 0; i < src_size; i++) { + if (src[i] != decompressed[i]) { + printf(" [FAIL] content mismatch at byte %zu\n", i); + break; + } + } + } + + /* Also test a sub-range across block boundaries */ + if (ok) { + int64_t sub = zxc_seekable_decompress_range_mt(s, decompressed, 8192, 4000, 8192, 4); + ok = (sub == 8192 && memcmp(src + 4000, decompressed, 8192) == 0); + if (!ok) printf(" [FAIL] sub-range MT mismatch\n"); + } + + zxc_seekable_free(s); + free(decompressed); + free(src); + free(compressed); + + if (!ok) return 0; + printf("PASS\n\n"); + return 1; +} diff --git a/tests/test_main.c b/tests/test_main.c index ebd74e19..762684c4 100644 --- a/tests/test_main.c +++ b/tests/test_main.c @@ -125,6 +125,7 @@ static const test_entry_t g_tests[] = { TEST_CASE(test_dict_large_dict_roundtrip), TEST_CASE(test_dict_seekable_roundtrip), TEST_CASE(test_dict_train_roundtrip), + TEST_CASE(test_dict_seekable_mt_roundtrip), /* --- Seekable (single-threaded) --- */ TEST_CASE(test_seekable_table_sizes), From 0c5be2fe1837a48a70afc309a9c22821d5042ab9 Mon Sep 17 00:00:00 2001 From: hellobertrand <5901952+hellobertrand@users.noreply.github.com> Date: Wed, 27 May 2026 18:12:45 +0200 Subject: [PATCH 17/47] refactor: Make `dirent` pointer const-correct in dictionary lookup --- conformance/test_conformance.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conformance/test_conformance.c b/conformance/test_conformance.c index 70346f7f..7d6b61f0 100644 --- a/conformance/test_conformance.c +++ b/conformance/test_conformance.c @@ -108,7 +108,7 @@ static uint8_t *find_dict_for_id(const char *zxc_path, uint32_t target_id, #else DIR *dp = opendir(dir); if (!dp) return NULL; - struct dirent *ent; + const struct dirent *ent; while ((ent = readdir(dp)) != 
NULL) { if (!has_suffix(ent->d_name, ".zxd")) continue; char path[512]; From 217e3cb8e8a6f5166797e991903cec2a4c7909e4 Mon Sep 17 00:00:00 2001 From: hellobertrand <5901952+hellobertrand@users.noreply.github.com> Date: Wed, 27 May 2026 18:13:10 +0200 Subject: [PATCH 18/47] refactor: Use `zxc --train-dict` for dictionary test setup Previously, the dictionary test created a dictionary using a custom C helper compiled on the fly. This update replaces that with a direct call to the `zxc --train-dict` CLI command. This change makes the test more realistic by utilizing the public API for dictionary training and removes the external dependency on `cc` for test execution, simplifying the test environment. --- tests/test_cli.sh | 48 +++++++++-------------------------------------- 1 file changed, 9 insertions(+), 39 deletions(-) diff --git a/tests/test_cli.sh b/tests/test_cli.sh index 71e7ef21..7066b57c 100755 --- a/tests/test_cli.sh +++ b/tests/test_cli.sh @@ -893,46 +893,18 @@ fi # 25. Dictionary Tests (-D) echo "Testing Dictionary (-D)..." -# 25.1 Create a .zxd dictionary from repetitive content -echo " Creating test dictionary..." -# Build a small helper to create a .zxd (uses the C API) -DICT_HELPER="$TEST_DIR/make_dict" -cat > "$TEST_DIR/make_dict.c" <<'DICTEOF' -#include -#include -#include "zxc.h" -#include "zxc_dict.h" -int main(int argc, char** argv) { - const char* content = - "Lorem ipsum dolor sit amet, consectetur adipiscing elit. " - "Sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. 
" - "Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris."; - size_t sz = strlen(content); - uint8_t buf[1024]; - int64_t w = zxc_dict_save(content, sz, buf, sizeof(buf)); - if (w <= 0) return 1; - FILE* f = fopen(argv[1], "wb"); - fwrite(buf, 1, (size_t)w, f); - fclose(f); - return 0; -} -DICTEOF - -# Compile helper (find include/lib relative to the binary) -ZXC_DIR=$(dirname "$(dirname "$ZXC_BIN")") -cc -O0 -I"${ZXC_DIR}/include" -o "$DICT_HELPER" "$TEST_DIR/make_dict.c" \ - "${ZXC_DIR}/build/libzxc.a" -lpthread 2>/dev/null - -if [ ! -f "$DICT_HELPER" ]; then - echo " [SKIP] Could not compile dict helper (missing dev headers)" -else - +# 25.1 Train a dictionary using --train-dict +echo " Training dictionary from test data..." +# Create a few sample files for training +for i in 1 2 3 4 5; do + cp "$TEST_FILE" "$TEST_DIR/sample_${i}.txt" +done DICT_FILE="$TEST_DIR/test.zxd" -"$DICT_HELPER" "$DICT_FILE" +"$ZXC_BIN" --train-dict "$DICT_FILE" "$TEST_DIR"/sample_*.txt 2>/dev/null if [ ! -f "$DICT_FILE" ]; then - log_fail "Dictionary creation failed" + log_fail "Dictionary training failed" fi -log_pass "Dictionary .zxd created" +log_pass "Dictionary trained via --train-dict" # 25.2 Round-trip with dictionary echo " Testing dict round-trip..." @@ -1025,7 +997,5 @@ else log_fail "Invalid dict file should be rejected" fi -fi # end of dict helper check - echo "All tests passed!" exit 0 From 30519666e0f16d78e098694b6e6a4c011986cf95 Mon Sep 17 00:00:00 2001 From: hellobertrand <5901952+hellobertrand@users.noreply.github.com> Date: Wed, 27 May 2026 18:15:19 +0200 Subject: [PATCH 19/47] fix: Free dictionary allocations on all exit paths Ensures that dictionary memory, allocated as part of the new dictionary support, is properly released in error handling branches within the compression/decompression functions and upon CLI exit. This prevents memory leaks. 
--- src/cli/main.c | 1 + src/lib/zxc_dispatch.c | 5 +++++ 2 files changed, 6 insertions(+) diff --git a/src/cli/main.c b/src/cli/main.c index 440592d6..8f9e8abd 100644 --- a/src/cli/main.c +++ b/src/cli/main.c @@ -1660,5 +1660,6 @@ int main(int argc, char** argv) { break; // Standard mode only does the first argument as input } } + free(dict); return overall_ret; } diff --git a/src/lib/zxc_dispatch.c b/src/lib/zxc_dispatch.c index 9efb5ba2..3a45dc24 100644 --- a/src/lib/zxc_dispatch.c +++ b/src/lib/zxc_dispatch.c @@ -538,6 +538,7 @@ int64_t zxc_compress(const void* RESTRICT src, const size_t src_size, void* REST zxc_write_file_header(op, (size_t)(op_end - op), block_size, checksum_enabled, did); // LCOV_EXCL_START if (UNLIKELY(h_val < 0)) { + ZXC_FREE(dict_input); zxc_cctx_free(&ctx); return h_val; } @@ -738,6 +739,7 @@ int64_t zxc_decompress(const void* RESTRICT src, const size_t src_size, void* RE zxc_block_header_t bh; // Read the block header to determine the compressed size if (UNLIKELY(zxc_read_block_header(ip, rem_src, &bh) != ZXC_OK)) { + ZXC_FREE(dict_dec); zxc_cctx_free(&ctx); return ZXC_ERROR_BAD_HEADER; } @@ -748,6 +750,7 @@ int64_t zxc_decompress(const void* RESTRICT src, const size_t src_size, void* RE // even when a seek table is inserted between EOF block and footer. 
// LCOV_EXCL_START if (UNLIKELY(src_size < ZXC_FILE_FOOTER_SIZE)) { + ZXC_FREE(dict_dec); zxc_cctx_free(&ctx); return ZXC_ERROR_SRC_TOO_SMALL; } @@ -757,6 +760,7 @@ int64_t zxc_decompress(const void* RESTRICT src, const size_t src_size, void* RE // Validate source size matches what we decompressed const uint64_t stored_size = zxc_le64(footer); if (UNLIKELY(stored_size != (uint64_t)(op - op_start))) { + ZXC_FREE(dict_dec); zxc_cctx_free(&ctx); return ZXC_ERROR_CORRUPT_DATA; } @@ -765,6 +769,7 @@ int64_t zxc_decompress(const void* RESTRICT src, const size_t src_size, void* RE if (checksum_enabled && file_has_checksums) { const uint32_t stored_hash = zxc_le32(footer + sizeof(uint64_t)); if (UNLIKELY(stored_hash != global_hash)) { + ZXC_FREE(dict_dec); zxc_cctx_free(&ctx); return ZXC_ERROR_BAD_CHECKSUM; } From deffbaad05c6546f0f85a19d9cf8a2b358b33ba0 Mon Sep 17 00:00:00 2001 From: hellobertrand <5901952+hellobertrand@users.noreply.github.com> Date: Wed, 27 May 2026 18:16:36 +0200 Subject: [PATCH 20/47] fix: Enlarge path buffers for dictionary lookup Doubles the buffer size to accommodate longer directory and file names, preventing potential path truncation issues during dictionary file discovery. 
--- conformance/test_conformance.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/conformance/test_conformance.c b/conformance/test_conformance.c index 7d6b61f0..f518f9f7 100644 --- a/conformance/test_conformance.c +++ b/conformance/test_conformance.c @@ -92,7 +92,7 @@ static uint8_t *find_dict_for_id(const char *zxc_path, uint32_t target_id, HANDLE hf = FindFirstFileA(pattern, &fd); if (hf == INVALID_HANDLE_VALUE) return NULL; do { - char path[512]; + char path[1024]; snprintf(path, sizeof(path), "%s%s", dir, fd.cFileName); size_t sz = 0; uint8_t *buf = read_file(path, &sz); @@ -111,7 +111,7 @@ static uint8_t *find_dict_for_id(const char *zxc_path, uint32_t target_id, const struct dirent *ent; while ((ent = readdir(dp)) != NULL) { if (!has_suffix(ent->d_name, ".zxd")) continue; - char path[512]; + char path[1024]; snprintf(path, sizeof(path), "%s%s", dir, ent->d_name); size_t sz = 0; uint8_t *buf = read_file(path, &sz); From 96f5f1cf1873d2c8be514eae32e3c7d4859c0210 Mon Sep 17 00:00:00 2001 From: hellobertrand <5901952+hellobertrand@users.noreply.github.com> Date: Wed, 27 May 2026 18:16:49 +0200 Subject: [PATCH 21/47] fix: Silence unused 'ctx' parameter warning The `ctx` parameter is present for future dictionary context usage in `zxc_encode_block_num` but is currently unused. This silences a compiler warning. 
--- src/lib/zxc_compress.c | 1 + 1 file changed, 1 insertion(+) diff --git a/src/lib/zxc_compress.c b/src/lib/zxc_compress.c index 123466ec..6c6b3cce 100644 --- a/src/lib/zxc_compress.c +++ b/src/lib/zxc_compress.c @@ -626,6 +626,7 @@ static ZXC_ALWAYS_INLINE zxc_match_t zxc_lz77_find_best_match( static int zxc_encode_block_num(const zxc_cctx_t* RESTRICT ctx, const uint8_t* RESTRICT src, const size_t src_sz, uint8_t* RESTRICT dst, size_t dst_cap, size_t* RESTRICT out_sz) { + (void)ctx; if (UNLIKELY(src_sz % sizeof(uint32_t) != 0 || src_sz == 0 || dst_cap < ZXC_BLOCK_HEADER_SIZE + ZXC_NUM_HEADER_BINARY_SIZE)) return ZXC_ERROR_DST_TOO_SMALL; From cacd89b9f659a4a77fb1cbc8308767ff14d41183 Mon Sep 17 00:00:00 2001 From: hellobertrand <5901952+hellobertrand@users.noreply.github.com> Date: Wed, 27 May 2026 18:17:20 +0200 Subject: [PATCH 22/47] feat: Include dictionary test --- meson.build | 1 + 1 file changed, 1 insertion(+) diff --git a/meson.build b/meson.build index ab86a3da..788f64e7 100644 --- a/meson.build +++ b/meson.build @@ -180,6 +180,7 @@ if not meson.is_subproject() 'tests/test_seekable_mt.c', 'tests/test_format.c', 'tests/test_misc.c', + 'tests/test_dict.c', ), include_directories : libzxc_includes, link_with : libzxc_static, From e238dfc9a8eae44b65d765cfa2bef545df5f67d5 Mon Sep 17 00:00:00 2001 From: hellobertrand <5901952+hellobertrand@users.noreply.github.com> Date: Wed, 27 May 2026 18:26:17 +0200 Subject: [PATCH 23/47] fix: Expand dictionary memory freeing to all CLI and lib error paths --- src/cli/main.c | 5 +++++ src/lib/zxc_dispatch.c | 1 + 2 files changed, 6 insertions(+) diff --git a/src/cli/main.c b/src/cli/main.c index 8f9e8abd..6ba5c6de 100644 --- a/src/cli/main.c +++ b/src/cli/main.c @@ -1386,6 +1386,7 @@ int main(int argc, char** argv) { * without disk I/O bottlenecks. 
*/ if (mode == MODE_BENCHMARK) { + free(dict); if (optind >= argc) { zxc_log("Benchmark requires input file.\n"); return 1; @@ -1586,6 +1587,7 @@ int main(int argc, char** argv) { * Displays archive information (compressed size, uncompressed size, ratio). */ if (mode == MODE_LIST) { + free(dict); if (optind >= argc) { zxc_log("List mode requires input file.\n"); return 1; @@ -1611,6 +1613,7 @@ int main(int argc, char** argv) { if (multiple_mode && to_stdout) { zxc_log("Error: cannot write to stdout when using multiple files mode (-m).\n"); + free(dict); return 1; } @@ -1624,11 +1627,13 @@ int main(int argc, char** argv) { // If no files passed but we aren't using stdin, or mode expects files: if (optind >= argc && mode == MODE_INTEGRITY) { zxc_log("Test mode requires at least one input file.\n"); + free(dict); return 1; } if (multiple_mode && optind >= argc) { zxc_log("Multiple files mode requires at least one input file.\n"); + free(dict); return 1; } diff --git a/src/lib/zxc_dispatch.c b/src/lib/zxc_dispatch.c index 3a45dc24..14346e27 100644 --- a/src/lib/zxc_dispatch.c +++ b/src/lib/zxc_dispatch.c @@ -559,6 +559,7 @@ int64_t zxc_compress(const void* RESTRICT src, const size_t src_size, void* REST seek_comp = (uint32_t*)ZXC_MALLOC(seek_cap * sizeof(uint32_t)); // LCOV_EXCL_START if (UNLIKELY(!seek_comp)) { + ZXC_FREE(dict_input); zxc_cctx_free(&ctx); return ZXC_ERROR_MEMORY; } From 15863ecaabc8fdf13abff2f03c1d810d17a2af80 Mon Sep 17 00:00:00 2001 From: hellobertrand <5901952+hellobertrand@users.noreply.github.com> Date: Wed, 27 May 2026 18:26:41 +0200 Subject: [PATCH 24/47] feat: Validate dictionary and input file paths in CLI Introduces a validation step for paths used by the dictionary and compression input files. This prevents issues from malformed paths and improves the CLI's robustness. Invalid dictionary paths now cause an immediate exit, while invalid input files are skipped with a warning. 
--- src/cli/main.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/src/cli/main.c b/src/cli/main.c index 6ba5c6de..0c2f5413 100644 --- a/src/cli/main.c +++ b/src/cli/main.c @@ -1237,7 +1237,12 @@ int main(int argc, char** argv) { void* dict = NULL; size_t dict_size = 0; if (dict_path) { - FILE* f_dict = fopen(dict_path, "rb"); + char resolved_dict[4096]; + if (zxc_validate_input_path(dict_path, resolved_dict, sizeof(resolved_dict)) != 0) { + fprintf(stderr, "Error: invalid dictionary path '%s': %s\n", dict_path, strerror(errno)); + return 1; + } + FILE* f_dict = fopen(resolved_dict, "rb"); if (!f_dict) { fprintf(stderr, "Error: cannot open dictionary '%s': %s\n", dict_path, strerror(errno)); return 1; @@ -1300,7 +1305,12 @@ int main(int argc, char** argv) { } int n_loaded = 0; for (int i = optind; i < argc; i++) { - FILE* sf = fopen(argv[i], "rb"); + char resolved[4096]; + if (zxc_validate_input_path(argv[i], resolved, sizeof(resolved)) != 0) { + fprintf(stderr, "Warning: invalid path '%s', skipping\n", argv[i]); + continue; + } + FILE* sf = fopen(resolved, "rb"); if (!sf) { fprintf(stderr, "Warning: cannot open '%s', skipping\n", argv[i]); continue; From 87e5c49c71449a9e0ecda55df1883de699fb2b19 Mon Sep 17 00:00:00 2001 From: hellobertrand <5901952+hellobertrand@users.noreply.github.com> Date: Wed, 27 May 2026 18:47:18 +0200 Subject: [PATCH 25/47] fix: Free dictionary input buffer on block count overflow Ensures that the `dict_input` memory is properly freed when the number of blocks exceeds the maximum allowed, preventing a memory leak in this error path during compression. 
--- src/lib/zxc_dispatch.c | 1 + 1 file changed, 1 insertion(+) diff --git a/src/lib/zxc_dispatch.c b/src/lib/zxc_dispatch.c index 14346e27..6a7a7f67 100644 --- a/src/lib/zxc_dispatch.c +++ b/src/lib/zxc_dispatch.c @@ -552,6 +552,7 @@ int64_t zxc_compress(const void* RESTRICT src, const size_t src_size, void* REST if (seekable) { const size_t block_count = src_size / block_size; if (UNLIKELY(block_count > (size_t)UINT32_MAX - 2)) { + ZXC_FREE(dict_input); zxc_cctx_free(&ctx); return ZXC_ERROR_BAD_BLOCK_SIZE; } From fb3bb271970314a1e77807568e501742e9ee4507 Mon Sep 17 00:00:00 2001 From: hellobertrand <5901952+hellobertrand@users.noreply.github.com> Date: Wed, 27 May 2026 18:48:26 +0200 Subject: [PATCH 26/47] feat: Set explicit file permissions for dictionary creation on Unix On Unix-like systems, switch from `fopen` to `open` and `fdopen` when creating the training dictionary. This enables explicit control over file permissions, ensuring the dictionary file is created with predictable and secure access rights (owner R/W, group R, others R). --- src/cli/main.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/src/cli/main.c b/src/cli/main.c index 0c2f5413..921d06c8 100644 --- a/src/cli/main.c +++ b/src/cli/main.c @@ -1371,7 +1371,16 @@ int main(int argc, char** argv) { return 1; } - FILE* out = fopen(train_dict_path, "wb"); + FILE* out; +#ifdef _WIN32 + out = fopen(train_dict_path, "wb"); +#else + { + const int fd = open(train_dict_path, O_CREAT | O_WRONLY | O_TRUNC, + S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH); + out = (fd != -1) ? 
fdopen(fd, "wb") : NULL; + } +#endif if (!out) { fprintf(stderr, "Error: cannot create '%s': %s\n", train_dict_path, strerror(errno)); free(zxd); From c38cc3bc76299f91c6adeef118d7f1591fdf1444 Mon Sep 17 00:00:00 2001 From: hellobertrand <5901952+hellobertrand@users.noreply.github.com> Date: Thu, 28 May 2026 20:45:20 +0200 Subject: [PATCH 27/47] fix: Exclude error handling and memory failure paths from LCOV coverage Adds LCOV exclusion markers to various error handling and memory allocation failure paths across the library. These paths are often difficult to trigger reliably in unit tests but represent valid error conditions that are correctly handled. Excluding them ensures more accurate code coverage metrics without skewing results for untestable conditions. --- src/lib/zxc_dict.c | 6 +++++- src/lib/zxc_dispatch.c | 8 ++++++++ src/lib/zxc_driver.c | 2 ++ src/lib/zxc_seekable.c | 10 +++++++--- tests/test_common.h | 1 + tests/test_dict.c | 44 ++++++++++++++++++++++++++++++++++++++++++ tests/test_main.c | 1 + tests/test_misc.c | 3 +++ 8 files changed, 71 insertions(+), 4 deletions(-) diff --git a/src/lib/zxc_dict.c b/src/lib/zxc_dict.c index 2ac8a5ec..15603086 100644 --- a/src/lib/zxc_dict.c +++ b/src/lib/zxc_dict.c @@ -151,7 +151,7 @@ static int zxc_seg_cmp_desc(const void* a, const void* b) { int64_t zxc_train_dict(const void* const* samples, const size_t* sample_sizes, const size_t n_samples, void* dict_buf, const size_t dict_capacity) { if (UNLIKELY(!samples || !sample_sizes || n_samples == 0 || !dict_buf || dict_capacity == 0)) - return ZXC_ERROR_NULL_INPUT; + return ZXC_ERROR_NULL_INPUT; // LCOV_EXCL_LINE if (UNLIKELY(dict_capacity > ZXC_DICT_SIZE_MAX)) return ZXC_ERROR_DICT_TOO_LARGE; /* Step 1: concatenate samples */ @@ -172,8 +172,10 @@ int64_t zxc_train_dict(const void* const* samples, const size_t* sample_sizes, /* Step 2: count k-gram frequencies */ uint16_t* freq = (uint16_t*)ZXC_MALLOC(ZXC_DICT_HT_SIZE * sizeof(uint16_t)); if (UNLIKELY(!freq)) { + // 
LCOV_EXCL_START ZXC_FREE(corpus); return ZXC_ERROR_MEMORY; + // LCOV_EXCL_STOP } ZXC_MEMSET(freq, 0, ZXC_DICT_HT_SIZE * sizeof(uint16_t)); @@ -191,9 +193,11 @@ int64_t zxc_train_dict(const void* const* samples, const size_t* sample_sizes, zxc_dict_seg_t* segs = (zxc_dict_seg_t*)ZXC_MALLOC(seg_alloc * sizeof(zxc_dict_seg_t)); if (UNLIKELY(!segs)) { + // LCOV_EXCL_START ZXC_FREE(freq); ZXC_FREE(corpus); return ZXC_ERROR_MEMORY; + // LCOV_EXCL_STOP } size_t n_segs = 0; diff --git a/src/lib/zxc_dispatch.c b/src/lib/zxc_dispatch.c index 6a7a7f67..5e933afe 100644 --- a/src/lib/zxc_dispatch.c +++ b/src/lib/zxc_dispatch.c @@ -528,8 +528,10 @@ int64_t zxc_compress(const void* RESTRICT src, const size_t src_size, void* REST if (dict_size > 0) { dict_input = (uint8_t*)ZXC_MALLOC(dict_size + block_size); if (UNLIKELY(!dict_input)) { + // LCOV_EXCL_START zxc_cctx_free(&ctx); return ZXC_ERROR_MEMORY; + // LCOV_EXCL_STOP } ZXC_MEMCPY(dict_input, dict, dict_size); } @@ -552,9 +554,11 @@ int64_t zxc_compress(const void* RESTRICT src, const size_t src_size, void* REST if (seekable) { const size_t block_count = src_size / block_size; if (UNLIKELY(block_count > (size_t)UINT32_MAX - 2)) { + // LCOV_EXCL_START ZXC_FREE(dict_input); zxc_cctx_free(&ctx); return ZXC_ERROR_BAD_BLOCK_SIZE; + // LCOV_EXCL_STOP } seek_cap = (uint32_t)(block_count + 2); seek_comp = (uint32_t*)ZXC_MALLOC(seek_cap * sizeof(uint32_t)); @@ -727,8 +731,10 @@ int64_t zxc_decompress(const void* RESTRICT src, const size_t src_size, void* RE if (dict_size > 0) { dict_dec = (uint8_t*)ZXC_MALLOC(dict_size + work_sz); if (UNLIKELY(!dict_dec)) { + // LCOV_EXCL_START zxc_cctx_free(&ctx); return ZXC_ERROR_MEMORY; + // LCOV_EXCL_STOP } ZXC_MEMCPY(dict_dec, dict, dict_size); } @@ -787,9 +793,11 @@ int64_t zxc_decompress(const void* RESTRICT src, const size_t src_size, void* RE res = zxc_decompress_chunk_wrapper(&ctx, ip, rem_src, dict_dec + dict_size, work_sz); if (LIKELY(res > 0)) { if (UNLIKELY((size_t)res > rem_cap)) { + // 
LCOV_EXCL_START ZXC_FREE(dict_dec); zxc_cctx_free(&ctx); return ZXC_ERROR_DST_TOO_SMALL; + // LCOV_EXCL_STOP } ZXC_MEMCPY(op, dict_dec + dict_size, (size_t)res); } diff --git a/src/lib/zxc_driver.c b/src/lib/zxc_driver.c index c97b9586..c7fd07c8 100644 --- a/src/lib/zxc_driver.c +++ b/src/lib/zxc_driver.c @@ -399,6 +399,7 @@ static void* zxc_stream_worker(void* arg) { const size_t alloc = dsz + ctx->chunk_size + ZXC_DECOMPRESS_TAIL_PAD; dict_work = (uint8_t*)ZXC_MALLOC(alloc); if (UNLIKELY(!dict_work)) { + // LCOV_EXCL_START zxc_cctx_free(&cctx); pthread_mutex_lock(&ctx->lock); ctx->io_error = 1; @@ -406,6 +407,7 @@ static void* zxc_stream_worker(void* arg) { pthread_cond_broadcast(&ctx->cond_reader); pthread_mutex_unlock(&ctx->lock); return NULL; + // LCOV_EXCL_STOP } ZXC_MEMCPY(dict_work, ctx->dict, dsz); } diff --git a/src/lib/zxc_seekable.c b/src/lib/zxc_seekable.c index a62edf63..af633d8f 100644 --- a/src/lib/zxc_seekable.c +++ b/src/lib/zxc_seekable.c @@ -199,7 +199,7 @@ static zxc_seekable* zxc_seekable_parse(const uint8_t* data, const size_t data_s int file_has_chk = 0; if (UNLIKELY(zxc_read_file_header(data, data_size, &block_size_sz, &file_has_chk, NULL) != ZXC_OK)) - return NULL; + return NULL; // LCOV_EXCL_LINE const uint32_t block_size = (uint32_t)block_size_sz; if (UNLIKELY(block_size == 0)) return NULL; // LCOV_EXCL_LINE @@ -333,7 +333,7 @@ zxc_seekable* zxc_seekable_open_reader(const zxc_reader_t* r) { size_t bs_sz = 0; int fhc = 0; if (UNLIKELY(zxc_read_file_header(header, ZXC_FILE_HEADER_SIZE, &bs_sz, &fhc, NULL) != ZXC_OK)) - return NULL; + return NULL; // LCOV_EXCL_LINE const uint32_t bs = (uint32_t)bs_sz; if (UNLIKELY(bs == 0)) return NULL; @@ -655,9 +655,11 @@ static void* zxc_seek_mt_worker(void* arg) { if (s->dict_size > 0 && s->dict) { dict_work = (uint8_t*)ZXC_MALLOC(s->dict_size + work_sz); if (UNLIKELY(!dict_work)) { + // LCOV_EXCL_START zxc_cctx_free(&dctx); job->result = ZXC_ERROR_MEMORY; return NULL; + // LCOV_EXCL_STOP } 
ZXC_MEMCPY(dict_work, s->dict, s->dict_size); } @@ -685,7 +687,7 @@ static void* zxc_seek_mt_worker(void* arg) { } // LCOV_EXCL_STOP - /* Decompress — use dict bounce buffer when dictionary is active */ + /* Decompress: use dict bounce buffer when dictionary is active */ uint8_t* dec_dst = dict_work ? dict_work + s->dict_size : dctx.work_buf; const int dec_res = zxc_decompress_chunk_wrapper(&dctx, read_buf, (size_t)read_res, dec_dst, work_sz); @@ -860,10 +862,12 @@ int zxc_seekable_set_dict(zxc_seekable* s, const void* dict, const size_t dict_s const size_t work_sz = dict_size + (size_t)s->block_size + ZXC_DECOMPRESS_TAIL_PAD; s->dict_work = (uint8_t*)ZXC_MALLOC(work_sz); if (UNLIKELY(!s->dict_work)) { + // LCOV_EXCL_START ZXC_FREE(s->dict); s->dict = NULL; s->dict_size = 0; return ZXC_ERROR_MEMORY; + // LCOV_EXCL_STOP } ZXC_MEMCPY(s->dict_work, dict, dict_size); return ZXC_OK; diff --git a/tests/test_common.h b/tests/test_common.h index a727bd37..3139e3a6 100644 --- a/tests/test_common.h +++ b/tests/test_common.h @@ -194,6 +194,7 @@ int test_dict_stream_roundtrip(void); int test_dict_large_dict_roundtrip(void); int test_dict_seekable_roundtrip(void); int test_dict_train_roundtrip(void); +int test_dict_train_no_frequent_patterns(void); int test_dict_seekable_mt_roundtrip(void); #endif /* ZXC_TEST_COMMON_H */ diff --git a/tests/test_dict.c b/tests/test_dict.c index 14887847..110fbde5 100644 --- a/tests/test_dict.c +++ b/tests/test_dict.c @@ -528,6 +528,50 @@ int test_dict_train_roundtrip(void) { return 1; } +int test_dict_train_no_frequent_patterns(void) { + printf("=== TEST: Dict - train fallback when no frequent k-grams ===\n"); + + /* A strictly increasing byte sequence has all-distinct 5-grams, so no + * k-gram repeats and the trainer finds zero scorable segments. This forces + * the n_segs == 0 fallback: copy the tail of the corpus into the dict. 
*/ + uint8_t corpus[64]; + for (size_t i = 0; i < sizeof(corpus); i++) corpus[i] = (uint8_t)i; + + const void* sample_ptrs[1] = {corpus}; + const size_t sample_sizes[1] = {sizeof(corpus)}; + + /* Case 1: capacity >= corpus_size -> copy == corpus_size, dict == whole corpus. */ + uint8_t dict_big[256]; + int64_t sz = zxc_train_dict(sample_ptrs, sample_sizes, 1, dict_big, sizeof(dict_big)); + if (sz != (int64_t)sizeof(corpus)) { + printf(" [FAIL] expected %zu bytes (full corpus), got %lld\n", sizeof(corpus), + (long long)sz); + return 0; + } + if (memcmp(dict_big, corpus, sizeof(corpus)) != 0) { + printf(" [FAIL] dict content does not match corpus tail\n"); + return 0; + } + printf(" [PASS] full-corpus fallback (%lld bytes)\n", (long long)sz); + + /* Case 2: capacity < corpus_size -> copy == capacity, dict == last `cap` bytes. */ + const size_t cap = 16; + uint8_t dict_small[16]; + sz = zxc_train_dict(sample_ptrs, sample_sizes, 1, dict_small, cap); + if (sz != (int64_t)cap) { + printf(" [FAIL] expected %zu bytes (capped), got %lld\n", cap, (long long)sz); + return 0; + } + if (memcmp(dict_small, corpus + sizeof(corpus) - cap, cap) != 0) { + printf(" [FAIL] capped dict does not match corpus tail\n"); + return 0; + } + printf(" [PASS] capped tail fallback (%lld bytes)\n", (long long)sz); + + printf("PASS\n\n"); + return 1; +} + int test_dict_seekable_roundtrip(void) { printf("=== TEST: Dict - seekable API roundtrip ===\n"); diff --git a/tests/test_main.c b/tests/test_main.c index 762684c4..78ab7c97 100644 --- a/tests/test_main.c +++ b/tests/test_main.c @@ -125,6 +125,7 @@ static const test_entry_t g_tests[] = { TEST_CASE(test_dict_large_dict_roundtrip), TEST_CASE(test_dict_seekable_roundtrip), TEST_CASE(test_dict_train_roundtrip), + TEST_CASE(test_dict_train_no_frequent_patterns), TEST_CASE(test_dict_seekable_mt_roundtrip), /* --- Seekable (single-threaded) --- */ diff --git a/tests/test_misc.c b/tests/test_misc.c index 55a43eb5..71bc38f3 100644 --- a/tests/test_misc.c 
+++ b/tests/test_misc.c @@ -29,6 +29,9 @@ int test_error_name() { {ZXC_ERROR_NULL_INPUT, "ZXC_ERROR_NULL_INPUT"}, {ZXC_ERROR_BAD_BLOCK_TYPE, "ZXC_ERROR_BAD_BLOCK_TYPE"}, {ZXC_ERROR_BAD_BLOCK_SIZE, "ZXC_ERROR_BAD_BLOCK_SIZE"}, + {ZXC_ERROR_DICT_REQUIRED, "ZXC_ERROR_DICT_REQUIRED"}, + {ZXC_ERROR_DICT_MISMATCH, "ZXC_ERROR_DICT_MISMATCH"}, + {ZXC_ERROR_DICT_TOO_LARGE, "ZXC_ERROR_DICT_TOO_LARGE"}, }; const int n = sizeof(cases) / sizeof(cases[0]); From 1ea271f9ff165f42b6814bf39e61bc20dee6f1a7 Mon Sep 17 00:00:00 2001 From: hellobertrand <5901952+hellobertrand@users.noreply.github.com> Date: Thu, 28 May 2026 21:00:30 +0200 Subject: [PATCH 28/47] feat: Implement custom heapsort for reproducible dictionary training Replaces `libc`'s `qsort` with an in-place heapsort implementation for sorting dictionary segments during training. This change removes a dependency on `libc`, making the library more suitable for freestanding environments. It also ensures deterministic dictionary output across different platforms and `libc` versions by providing a fixed sorting algorithm. --- src/lib/zxc_dict.c | 71 ++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 65 insertions(+), 6 deletions(-) diff --git a/src/lib/zxc_dict.c b/src/lib/zxc_dict.c index 15603086..ae86e68a 100644 --- a/src/lib/zxc_dict.c +++ b/src/lib/zxc_dict.c @@ -141,11 +141,70 @@ typedef struct { uint16_t score; } zxc_dict_seg_t; -static int zxc_seg_cmp_desc(const void* a, const void* b) { - const zxc_dict_seg_t* sa = (const zxc_dict_seg_t*)a; - const zxc_dict_seg_t* sb = (const zxc_dict_seg_t*)b; - if (sa->score != sb->score) return (sa->score < sb->score) ? 1 : -1; - return 0; +/** + * @brief Restore the min-heap property at @p root over the range @p a[0..n). + * + * Sinks @p a[root] down the binary heap (children at @c 2i+1 / @c 2i+2) until + * both children are @c >= it, comparing on @ref zxc_dict_seg_t::score. The loop + * is iterative (no recursion), so the call stack stays O(1) regardless of @p n. 
+ * + * @param[in,out] a Heap-ordered array; @p a[0..n) is treated as the heap. + * @param[in] root Index of the element to sift down. Must be @c < n. + * @param[in] n Number of valid elements in the heap. + * + * @note Complexity O(log n). + */ +static void zxc_dict_sift_down(zxc_dict_seg_t* RESTRICT a, size_t root, const size_t n) { + for (;;) { + size_t child = 2 * root + 1; + if (child >= n) break; + if (child + 1 < n && a[child + 1].score < a[child].score) child++; + if (a[root].score <= a[child].score) break; + const zxc_dict_seg_t t = a[root]; + a[root] = a[child]; + a[child] = t; + root = child; + } +} + +/** + * @brief Sort @p a[0..n) by @ref zxc_dict_seg_t::score in descending order. + * + * In-place heapsort: a min-heap is built over the whole array, then each + * extracted minimum is swapped to the shrinking tail. Because the smallest + * scores accumulate at the end, the array is left in descending order + * (largest score at index 0), as required by the dictionary fill step. + * + * Replaces a libc @c qsort call for two reasons: + * - **Freestanding/kernel-safe**: no dependency on @c qsort and no indirect + * comparator call (the @c score comparison is inlined in @ref + * zxc_dict_sift_down). + * - **Deterministic**: ordering is fixed by this code rather than by the + * platform's @c qsort, which matters for reproducible dictionary output + * across libc implementations. + * + * Equal scores keep an unspecified-but-deterministic relative order, matching + * the previous comparator that returned 0 on ties (heapsort is not stable). + * + * @param[in,out] a Array of @p n segments, sorted in place. + * @param[in] n Number of segments. @c n < 2 is a no-op. + * + * @note Complexity O(n log n) worst case with no extra allocation. In practice + * this matches or beats @c qsort on the sizes seen here (up to ~65536 + * segments): eliminating the per-comparison indirect call outweighs + * heapsort's weaker cache locality. 
This is a cold path (dictionary + * training), so absolute speed is not critical. + */ +static void zxc_dict_sort_segs_desc(zxc_dict_seg_t* RESTRICT a, const size_t n) { + if (UNLIKELY(n < 2)) return; + for (size_t i = n / 2; i-- > 0;) zxc_dict_sift_down(a, i, n); + for (size_t end = n; end > 1;) { + end--; + const zxc_dict_seg_t t = a[0]; + a[0] = a[end]; + a[end] = t; + zxc_dict_sift_down(a, 0, end); + } } int64_t zxc_train_dict(const void* const* samples, const size_t* sample_sizes, @@ -233,7 +292,7 @@ int64_t zxc_train_dict(const void* const* samples, const size_t* sample_sizes, /* Step 4: sort by score descending, fill dict from end (most frequent last * = shortest offsets from block start). */ - qsort(segs, n_segs, sizeof(zxc_dict_seg_t), zxc_seg_cmp_desc); + zxc_dict_sort_segs_desc(segs, n_segs); uint8_t* out = (uint8_t*)dict_buf; size_t filled = 0; From 642b41861213931d1f4cd4eb7d5d3fe18794af26 Mon Sep 17 00:00:00 2001 From: hellobertrand <5901952+hellobertrand@users.noreply.github.com> Date: Thu, 28 May 2026 21:08:30 +0200 Subject: [PATCH 29/47] feat: Add roundtrip test for dictionary block APIs This test verifies `zxc_compress_block` and `zxc_decompress_block` functionality with a dictionary, covering all compression levels from 1 to 6. It ensures data integrity and correct behavior for block-level dictionary-based compression and decompression. 
--- tests/test_common.h | 1 + tests/test_dict.c | 67 +++++++++++++++++++++++++++++++++++++++++++++ tests/test_main.c | 1 + 3 files changed, 69 insertions(+) diff --git a/tests/test_common.h b/tests/test_common.h index 3139e3a6..7d6c86eb 100644 --- a/tests/test_common.h +++ b/tests/test_common.h @@ -187,6 +187,7 @@ int test_dict_zxd_roundtrip(void); int test_dict_id_deterministic(void); int test_dict_get_id_apis(void); int test_dict_buffer_roundtrip(void); +int test_dict_block_roundtrip(void); int test_dict_mismatch_error(void); int test_dict_required_error(void); int test_dict_no_dict_compat(void); diff --git a/tests/test_dict.c b/tests/test_dict.c index 110fbde5..d4c52f4f 100644 --- a/tests/test_dict.c +++ b/tests/test_dict.c @@ -226,6 +226,73 @@ int test_dict_buffer_roundtrip(void) { return 1; } +int test_dict_block_roundtrip(void) { + printf("=== TEST: Dict - block API roundtrip (all levels) ===\n"); + + const uint8_t dict_content[] = + "The quick brown fox jumps over the lazy dog. " + "Lorem ipsum dolor sit amet, consectetur adipiscing elit.
" + "Pack my box with five dozen liquor jugs."; + const size_t dict_size = sizeof(dict_content) - 1; + + const size_t src_size = 4096; + uint8_t* src = (uint8_t*)malloc(src_size); + const size_t comp_bound = (size_t)zxc_compress_block_bound(src_size); + uint8_t* compressed = (uint8_t*)malloc(comp_bound); + uint8_t* decompressed = (uint8_t*)malloc(src_size); + zxc_cctx* cctx = zxc_create_cctx(NULL); + zxc_dctx* dctx = zxc_create_dctx(); + + int result = 0; + if (!src || !compressed || !decompressed || !cctx || !dctx) { + printf(" [FAIL] allocation failed\n"); + goto cleanup; + } + + /* Fill src only after the allocation check: doing it earlier writes through NULL on OOM. */ + gen_dict_friendly_data(src, src_size, dict_content, dict_size); + for (int level = 1; level <= 6; level++) { + zxc_compress_opts_t copts = { + .level = level, + .checksum_enabled = 1, + .dict = dict_content, + .dict_size = dict_size, + }; + int64_t comp_size = + zxc_compress_block(cctx, src, src_size, compressed, comp_bound, &copts); + if (comp_size <= 0) { + printf(" [FAIL] level %d: compress_block returned %lld\n", level, + (long long)comp_size); + goto cleanup; + } + + zxc_decompress_opts_t dopts = { + .checksum_enabled = 1, + .dict = dict_content, + .dict_size = dict_size, + }; + int64_t dec_size = zxc_decompress_block(dctx, compressed, (size_t)comp_size, decompressed, + src_size, &dopts); + if (dec_size != (int64_t)src_size || memcmp(src, decompressed, src_size) != 0) { + printf(" [FAIL] level %d: block roundtrip mismatch (dec_size=%lld)\n", level, + (long long)dec_size); + goto cleanup; + } + printf(" [PASS] level %d: %zu -> %lld bytes\n", level, src_size, (long long)comp_size); + } + + result = 1; + +cleanup: + zxc_free_cctx(cctx); /* safe with NULL */ + zxc_free_dctx(dctx); /* safe with NULL */ + free(src); + free(compressed); + free(decompressed); + if (result) printf("PASS\n\n"); + return result; +} + int test_dict_mismatch_error(void) { printf("=== TEST: Dict - dict_id mismatch error ===\n"); diff --git a/tests/test_main.c b/tests/test_main.c index 78ab7c97..e72417df 100644
--- a/tests/test_main.c +++ b/tests/test_main.c @@ -118,6 +118,7 @@ static const test_entry_t g_tests[] = { TEST_CASE(test_dict_id_deterministic), TEST_CASE(test_dict_get_id_apis), TEST_CASE(test_dict_buffer_roundtrip), + TEST_CASE(test_dict_block_roundtrip), TEST_CASE(test_dict_mismatch_error), TEST_CASE(test_dict_required_error), TEST_CASE(test_dict_no_dict_compat), From 5acc3a0c1e1da047627584cd737d406fbe56dab6 Mon Sep 17 00:00:00 2001 From: hellobertrand <5901952+hellobertrand@users.noreply.github.com> Date: Thu, 28 May 2026 22:25:10 +0200 Subject: [PATCH 30/47] refactor: Move dictionary configuration macros to internal header Relocates `ZXC_DICT_KGRAM_LEN`, `ZXC_DICT_HT_BITS`, and `ZXC_DICT_HT_SIZE` from `zxc_dict.c` to `zxc_internal.h`. This centralizes shared dictionary-related constants, making them accessible throughout the library and improving code organization. --- src/lib/zxc_dict.c | 5 ----- src/lib/zxc_internal.h | 7 +++++++ 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/src/lib/zxc_dict.c b/src/lib/zxc_dict.c index ae86e68a..2b4211db 100644 --- a/src/lib/zxc_dict.c +++ b/src/lib/zxc_dict.c @@ -121,11 +121,6 @@ int zxc_dict_load(const void* buf, const size_t buf_size, const void** content_o * so they produce shorter offsets (closer to the block start). * ------------------------------------------------------------------------- */ -#define ZXC_DICT_KGRAM_LEN ZXC_LZ_MIN_MATCH_LEN -#define ZXC_DICT_HT_BITS 16 -#define ZXC_DICT_HT_SIZE (1U << ZXC_DICT_HT_BITS) -#define ZXC_DICT_HT_MASK (ZXC_DICT_HT_SIZE - 1U) - static uint32_t zxc_dict_hash(const uint8_t* p) { uint32_t v = zxc_le32(p); v ^= (uint32_t)p[4]; diff --git a/src/lib/zxc_internal.h b/src/lib/zxc_internal.h index cb9667fe..ea10786e 100644 --- a/src/lib/zxc_internal.h +++ b/src/lib/zxc_internal.h @@ -344,6 +344,13 @@ extern "C" { #define ZXC_DICT_MAGIC 0x9CB0D1C7U /** @brief Current dictionary file format version. 
*/ #define ZXC_DICT_VERSION 1 +/** @brief K-gram length scanned by the dictionary trainer. Aligned on the LZ + * minimum match length so trained patterns are matchable at encode time. */ +#define ZXC_DICT_KGRAM_LEN ZXC_LZ_MIN_MATCH_LEN +/** @brief Address bits for the dictionary trainer's k-gram frequency table. */ +#define ZXC_DICT_HT_BITS 16 +/** @brief Number of buckets in the dictionary trainer's frequency table. */ +#define ZXC_DICT_HT_SIZE (1U << ZXC_DICT_HT_BITS) /** @brief Block header size: Type(1)+Flags(1)+Reserved(1)+CRC(1)+CompSize(4). */ #define ZXC_BLOCK_HEADER_SIZE 8 From 8c500e13d7cf7ca89f18c9d106c2de3a4b138d95 Mon Sep 17 00:00:00 2001 From: hellobertrand <5901952+hellobertrand@users.noreply.github.com> Date: Thu, 28 May 2026 22:35:21 +0200 Subject: [PATCH 31/47] docs: Document zxc dictionary file format and magic word Adds a new section to `FORMAT.md` that details the `.zxd` dictionary file format. This includes the unique magic word for dictionary files and a comprehensive worked example with a hexdump and byte-level decoding of the dictionary header and content. This provides essential guidance for implementers working with zxc dictionaries. --- docs/FORMAT.md | 51 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 51 insertions(+) diff --git a/docs/FORMAT.md b/docs/FORMAT.md index 5dec2bfe..ffe00316 100644 --- a/docs/FORMAT.md +++ b/docs/FORMAT.md @@ -644,6 +644,13 @@ shortest offsets (closest to the block start in the virtual window). - File footer: **12** bytes - Dictionary file header (`.zxd`): **16** bytes +**Magic words** — both are little-endian `u32` at offset `0x00` and deliberately share the `0x9CB0...` family prefix, so check the full value (or the file extension) to tell them apart: + +| File | Magic (value) | On-disk bytes (LE) | +|------|---------------|--------------------| +| ZXC archive (`.zxc`) | `0x9CB02EF5` | `F5 2E B0 9C` | +| ZXC dictionary (`.zxd`) | `0x9CB0D1C7` | `C7 D1 B0 9C` | + --- ## 14. 
Worked Example (Real Hexdump) @@ -819,3 +826,47 @@ Seek table entry at `0x36`: ``` > **Compatibility note**: The SEK block is inserted between the EOF block and the file footer. The footer always remains the **last 12 bytes of the file**, so decoders that locate the footer from the end of the file (e.g. `src + src_size - 12` for buffer APIs, or `fseek(END - 12)` for file APIs) work unchanged with seekable archives. However, **streaming decoders** that read the footer sequentially immediately after the EOF block must be updated to detect and skip the SEK block. In practice, all ZXC decoders since v0.9.0 handle both seekable and non-seekable archives transparently. + +--- + +## 15. Worked Example: Dictionary File (`.zxd` Hexdump) + +A minimal dictionary whose content is the 5 ASCII bytes `hello`. Total file size: **21 bytes** (16-byte header + 5-byte content). This is the on-disk form produced by `zxc_dict_save()` (see §12.4). + +### 15.1 Full hexdump + +```text +00000000: C7 D1 B0 9C 01 00 05 00 17 0F 72 9A 4A D9 00 00 +00000010: 68 65 6C 6C 6F +``` + +### 15.2 Byte-level decoding + +#### A) Dictionary Header (offset `0x00`, 16 bytes) + +```text +C7 D1 B0 9C | 01 | 00 | 05 00 | 17 0F 72 9A | 4A D9 | 00 00 +``` + +- `C7 D1 B0 9C` -> magic word (LE) = `0x9CB0D1C7` (`.zxd` dictionary). +- `01` -> dictionary format version 1. +- `00` -> flags (reserved, must be 0). +- `05 00` -> content size (LE) = `5` bytes. +- `17 0F 72 9A` -> `dict_id` (LE) = `0x9A720F17`. Must match the `dict_id` stored in the file header of any `.zxc` archive compressed with this dictionary. +- `4A D9` -> header CRC16 (LE) = `0xD94A`, computed over the 16-byte header with bytes `0x0C..0x0F` zeroed (same method as the ZXC file header). +- `00 00` -> reserved. + +#### B) Dictionary Content (offset `0x10`, 5 bytes) + +```text +68 65 6C 6C 6F +``` + +ASCII: `hello`. Raw bytes that prefill the LZ77 window — not compressed. 
+ +### 15.3 Structural view with absolute offsets + +```text +0x00..0x0F Dictionary Header (16) +0x10..0x14 Dictionary Content (5) +``` From c2a9dea4a0609a7c0f3bd1fce819ac3a32e2c737 Mon Sep 17 00:00:00 2001 From: hellobertrand <5901952+hellobertrand@users.noreply.github.com> Date: Thu, 28 May 2026 22:45:16 +0200 Subject: [PATCH 32/47] feat: Include zxc_dict.c in zxc-sys build Ensures core dictionary logic is compiled for the Rust wrapper. --- wrappers/rust/zxc-sys/build.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/wrappers/rust/zxc-sys/build.rs b/wrappers/rust/zxc-sys/build.rs index b3530f7f..276bf430 100644 --- a/wrappers/rust/zxc-sys/build.rs +++ b/wrappers/rust/zxc-sys/build.rs @@ -167,8 +167,9 @@ fn main() { .include(src_lib.join("vendors")) .define("ZXC_STATIC_DEFINE", None) .file(src_lib.join("zxc_common.c")) - .file(src_lib.join("zxc_driver.c")) + .file(src_lib.join("zxc_dict.c")) .file(src_lib.join("zxc_dispatch.c")) + .file(src_lib.join("zxc_driver.c")) .file(src_lib.join("zxc_seekable.c")) .file(src_lib.join("zxc_pstream.c")) .opt_level(3) From 1bb30a47d00b685e6cd1159158b420b92ebf0458 Mon Sep 17 00:00:00 2001 From: hellobertrand <5901952+hellobertrand@users.noreply.github.com> Date: Fri, 29 May 2026 11:17:55 +0200 Subject: [PATCH 33/47] feat: Enhance dictionary training by accounting for pattern overlap Refines the dictionary training algorithm to avoid redundant patterns. When a segment is added to the dictionary, its k-grams are marked as covered in the frequency table. This ensures that subsequent dictionary picks prioritize novel patterns, maximizing the dictionary's coverage of the corpus with unique content. 
--- src/lib/zxc_dict.c | 55 ++++++++++++++++++++++++++++++++-------------- 1 file changed, 39 insertions(+), 16 deletions(-) diff --git a/src/lib/zxc_dict.c b/src/lib/zxc_dict.c index 2b4211db..3c00fa19 100644 --- a/src/lib/zxc_dict.c +++ b/src/lib/zxc_dict.c @@ -114,11 +114,14 @@ int zxc_dict_load(const void* buf, const size_t buf_size, const void** content_o * 1. Concatenate all samples into a corpus. * 2. For each position in the corpus, hash the k-gram (k = MIN_MATCH_LEN) * and count occurrences in a fixed-size hash map. - * 3. Walk the corpus a second time: for each position, look up the k-gram - * frequency and greedily select segments whose k-grams have the highest - * frequency x length score. - * 4. The most frequent segments are placed at the END of the dictionary - * so they produce shorter offsets (closer to the block start). + * 3. Walk the corpus, building candidate segments: each starts at a frequent + * k-gram and extends while neighbours stay frequent. A segment's score is + * the summed frequency of its k-grams (its coverage of the corpus). + * 4. Greedily fill the dictionary in descending coverage order, BUT account + * for overlap: once a pattern is placed, a single copy serves all future + * LZ matches, so its k-grams are zeroed in the frequency table. Segments + * whose coverage has since collapsed (mostly already in the dict) are + * skipped, so capacity goes to NEW patterns instead of redundant copies. * ------------------------------------------------------------------------- */ static uint32_t zxc_dict_hash(const uint8_t* p) { @@ -133,7 +136,7 @@ static uint32_t zxc_dict_hash(const uint8_t* p) { typedef struct { uint32_t offset; uint16_t length; - uint16_t score; + uint32_t score; /**< Summed k-gram frequency (coverage) of the segment. 
*/ } zxc_dict_seg_t; /** @@ -239,8 +242,8 @@ int64_t zxc_train_dict(const void* const* samples, const size_t* sample_sizes, if (freq[h] < UINT16_MAX) freq[h]++; } - /* Step 3: score segments: stride by k-gram length to avoid overlap, - * collect top-scoring segments. */ + /* Step 3: build candidate segments. Stride by the k-gram length so + * candidate starts don't overlap; each segment is scored by its coverage. */ const size_t stride = ZXC_DICT_KGRAM_LEN; const size_t max_segs = corpus_size / stride; const size_t seg_alloc = (max_segs < 65536) ? max_segs : 65536; @@ -260,45 +263,65 @@ int64_t zxc_train_dict(const void* const* samples, const size_t* sample_sizes, const uint16_t f = freq[h]; if (f < 2) continue; - /* Extend the segment as long as the next k-gram is also frequent. */ + /* Extend the segment as long as the next k-gram is also frequent, and + * accumulate coverage (summed k-gram frequency) as the score. */ + uint32_t coverage = f; size_t end = i + ZXC_DICT_KGRAM_LEN; - while (end + ZXC_DICT_KGRAM_LEN <= corpus_size && end - i < 255) { + while (end + ZXC_DICT_KGRAM_LEN <= corpus_size && end - i < 4096) { const uint16_t nf = freq[zxc_dict_hash(corpus + end)]; if (nf < 2) break; + coverage += nf; end += ZXC_DICT_KGRAM_LEN; } segs[n_segs].offset = (uint32_t)i; segs[n_segs].length = (uint16_t)(end - i); - segs[n_segs].score = f; + segs[n_segs].score = coverage; n_segs++; } - ZXC_FREE(freq); - if (UNLIKELY(n_segs == 0)) { /* No frequent patterns. Use tail of corpus as dict. */ const size_t copy = (corpus_size < dict_capacity) ? corpus_size : dict_capacity; ZXC_MEMCPY(dict_buf, corpus + corpus_size - copy, copy); + ZXC_FREE(freq); ZXC_FREE(segs); ZXC_FREE(corpus); return (int64_t)copy; } - /* Step 4: sort by score descending, fill dict from end (most frequent last - * = shortest offsets from block start). */ + /* Step 4: sort by coverage descending, then greedily fill with overlap + * accounting. 
The frequency table is decremented as segments are placed so + * that already-covered patterns are not copied into the dictionary twice. */ zxc_dict_sort_segs_desc(segs, n_segs); uint8_t* out = (uint8_t*)dict_buf; size_t filled = 0; for (size_t i = 0; i < n_segs && filled < dict_capacity; i++) { + const size_t seg_off = segs[i].offset; + const size_t seg_end = seg_off + segs[i].length; + + /* Recompute coverage from the decrementing table: skip the segment if + * earlier picks have already covered more than half of its k-grams. */ + uint32_t cur = 0; + for (size_t p = seg_off; p + ZXC_DICT_KGRAM_LEN <= seg_end; p += ZXC_DICT_KGRAM_LEN) + cur += freq[zxc_dict_hash(corpus + p)]; + if (cur * 2 < segs[i].score) continue; + size_t copy = segs[i].length; if (copy > dict_capacity - filled) copy = dict_capacity - filled; - ZXC_MEMCPY(out + filled, corpus + segs[i].offset, copy); + ZXC_MEMCPY(out + filled, corpus + seg_off, copy); filled += copy; + + /* One copy in the dictionary serves all future matches: mark this + * segment's k-grams as covered so later segments cover new ground. */ + for (size_t p = seg_off; p + ZXC_DICT_KGRAM_LEN <= seg_end; p += ZXC_DICT_KGRAM_LEN) + freq[zxc_dict_hash(corpus + p)] = 0; } + ZXC_FREE(freq); + /* If we haven't filled the capacity, pad with tail of corpus. */ if (filled < dict_capacity) { const size_t pad = dict_capacity - filled; From d4bc6d2d5efafc2c2ee2e3ecfbc2c230d63fb7cd Mon Sep 17 00:00:00 2001 From: hellobertrand <5901952+hellobertrand@users.noreply.github.com> Date: Mon, 1 Jun 2026 10:16:27 +0200 Subject: [PATCH 34/47] feat: Use dictionary-enabled LZbench in benchmark workflow Updates the LZbench clone command to fetch the `zxc-0.12.x-dict` branch. This specific branch includes modifications to LZbench required for benchmarking ZXC's dictionary compression functionality, enabling performance evaluation of this new feature. 
--- .github/workflows/benchmark.yml | 2 +- src/lib/zxc_compress.c | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index 50b8cee9..ef751096 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -47,7 +47,7 @@ jobs: - name: Clone LZbench run: | # git clone --depth 1 https://github.com/inikep/lzbench "${LZBENCH_DIR}" - git clone -b zxc-0.12.x https://github.com/hellobertrand/lzbench "${LZBENCH_DIR}" + git clone -b zxc-0.12.x-dict https://github.com/hellobertrand/lzbench "${LZBENCH_DIR}" - name: Copy Lib ZXC run: | diff --git a/src/lib/zxc_compress.c b/src/lib/zxc_compress.c index 6c6b3cce..27336c3d 100644 --- a/src/lib/zxc_compress.c +++ b/src/lib/zxc_compress.c @@ -1171,9 +1171,9 @@ static int zxc_lz77_optimal_parse_glo(zxc_cctx_t* RESTRICT ctx, const uint8_t* R * ip uses absolute position (src + dict_sz + p) so match finder * resolves dict references correctly via src as base. */ const uint8_t* ip = src_base + p; - const zxc_match_t m = - zxc_lz77_find_best_match(src, ip, iend, mflimit, /*anchor=*/ip, hash_table, hash_tags, - chain_table, epoch_mark, offset_mask, level, lzp_opt, last_off); + const zxc_match_t m = zxc_lz77_find_best_match( + src, ip, iend, mflimit, /*anchor=*/ip, hash_table, hash_tags, chain_table, epoch_mark, + offset_mask, level, lzp_opt, last_off); if (m.ref) { const uint32_t off = (uint32_t)(ip - m.ref); From f79014b3d60bd45a0b843b84d92a066313ed4162 Mon Sep 17 00:00:00 2001 From: hellobertrand <5901952+hellobertrand@users.noreply.github.com> Date: Mon, 1 Jun 2026 11:27:06 +0200 Subject: [PATCH 35/47] docs: Clarify dictionary compression benefits and use cases Updates the README to explain that dictionary compression's advantages stem from its interaction with small compression blocks, rather than solely small individual payloads. 
This clarifies the underlying mechanism and expands the documented use cases to include large, homogeneous corpora compressed for random access. --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 6b3e9a3a..e6ec26ac 100644 --- a/README.md +++ b/README.md @@ -462,9 +462,9 @@ zxc_compress_opts_t opts = { ## Dictionary Compression -For workloads consisting of many **small, similar payloads** (< 64 KB each), a pre-trained dictionary dramatically improves compression ratio. The dictionary prefills the LZ77 sliding window at the start of each block, giving the match finder immediate access to representative patterns. +For workloads compressed in **small blocks** (4 KB–128 KB), a pre-trained dictionary dramatically improves compression ratio. Because the dictionary prefills the LZ77 sliding window at the *start of each block*, the benefit is per-block: a block only has its own preceding bytes as history, so the smaller the block, the more it leans on the dictionary for representative patterns. This applies whether the input is a single small payload or a large payload split into many small blocks — any time the block size is small enough that early bytes would otherwise lack history to match against. -**Typical use cases:** JSON API responses, small game assets, structured logs, key-value store records, RPC messages. +**Typical use cases:** JSON API responses, small game assets, structured logs, key-value store records, RPC messages, and any large but homogeneous corpus compressed in small blocks for random access (e.g. seekable archives). 
### Training a dictionary From 621d9d35f52d1f6363ef8f8918a1e2fcf0e126d2 Mon Sep 17 00:00:00 2001 From: hellobertrand <5901952+hellobertrand@users.noreply.github.com> Date: Mon, 1 Jun 2026 11:35:08 +0200 Subject: [PATCH 36/47] fix: Align maximum dictionary size with 16-bit field constraints The previous `ZXC_DICT_SIZE_MAX` value of 65536 (64 KB) exceeded the capacity of 16-bit fields used for dictionary content size in the `.zxd` header and LZ77 match offsets. This update corrects the maximum to 65535 (64KB - 1) to prevent potential truncation or misinterpretation, and updates documentation and default CLI usage accordingly. --- docs/API.md | 2 +- include/zxc_constants.h | 8 ++++++-- src/cli/main.c | 2 +- 3 files changed, 8 insertions(+), 4 deletions(-) diff --git a/docs/API.md b/docs/API.md index 4e350bbd..2228f383 100644 --- a/docs/API.md +++ b/docs/API.md @@ -1300,7 +1300,7 @@ ZXC_EXPORT int64_t zxc_train_dict( const size_t* sample_sizes, size_t n_samples, void* dict_buf, - size_t dict_capacity // max ZXC_DICT_SIZE_MAX (64 KB) + size_t dict_capacity // max ZXC_DICT_SIZE_MAX (64KB - 1) ); ``` diff --git a/include/zxc_constants.h b/include/zxc_constants.h index e7931854..a01f0a91 100644 --- a/include/zxc_constants.h +++ b/include/zxc_constants.h @@ -69,8 +69,12 @@ * @brief Constants for pre-trained dictionary support. * @{ */ -/** @brief Maximum dictionary content size in bytes (64 KB, bounded by LZ window). */ -#define ZXC_DICT_SIZE_MAX (1U << 16) +/** @brief Maximum dictionary content size in bytes (64 KB - 1). + * + * Bounded to a 16-bit value (65535) by two constraints that both cap at the + * same number: the `.zxd` header stores the content size in a 16-bit field, and + * LZ77 match offsets are 16-bit (max distance 65535). */ +#define ZXC_DICT_SIZE_MAX ((1U << 16) - 1U) /** @brief Size of the .zxd dictionary file header in bytes. 
*/ #define ZXC_DICT_HEADER_SIZE 16 /** @} */ /* end of dictionary */ diff --git a/src/cli/main.c b/src/cli/main.c index 921d06c8..b46b053b 100644 --- a/src/cli/main.c +++ b/src/cli/main.c @@ -1335,7 +1335,7 @@ int main(int argc, char** argv) { return 1; } - size_t dict_cap = 32768; + size_t dict_cap = ZXC_DICT_SIZE_MAX; if (block_size > 0 && block_size < dict_cap) dict_cap = block_size; uint8_t* dict_buf = (uint8_t*)malloc(dict_cap); if (!dict_buf) { From 1765dad11e9f00905d21e96ce634b28614eca6e6 Mon Sep 17 00:00:00 2001 From: hellobertrand <5901952+hellobertrand@users.noreply.github.com> Date: Mon, 1 Jun 2026 12:15:46 +0200 Subject: [PATCH 37/47] feat: Optimize dictionary segment placement for lower match offsets The dictionary training process now first selects segments based on coverage, then emits them in reverse order of selection into the final dictionary. This places the highest-coverage segments at the dictionary's end, granting them the smallest possible LZ77 match offsets. Lower offsets are cheaper to encode and remain within common offset windows longer. This change also removes the general tail-padding, which previously could increase offsets for useful entries. Tail-padding is now only used as a fallback if no valuable segments are selected. --- src/lib/zxc_dict.c | 47 ++++++++++++++++++++++++++++++++-------------- 1 file changed, 33 insertions(+), 14 deletions(-) diff --git a/src/lib/zxc_dict.c b/src/lib/zxc_dict.c index 3c00fa19..99ea1c86 100644 --- a/src/lib/zxc_dict.c +++ b/src/lib/zxc_dict.c @@ -290,15 +290,16 @@ int64_t zxc_train_dict(const void* const* samples, const size_t* sample_sizes, return (int64_t)copy; } - /* Step 4: sort by coverage descending, then greedily fill with overlap - * accounting. The frequency table is decremented as segments are placed so - * that already-covered patterns are not copied into the dictionary twice. 
*/ + /* Step 4: pick segments greedily in descending-coverage order, zeroing each + * pick's k-grams so overlapping patterns aren't copied twice. Picks are + * compacted in place into segs[0..n_sel); placement is step 5. */ zxc_dict_sort_segs_desc(segs, n_segs); uint8_t* out = (uint8_t*)dict_buf; - size_t filled = 0; + size_t n_sel = 0; + size_t total = 0; - for (size_t i = 0; i < n_segs && filled < dict_capacity; i++) { + for (size_t i = 0; i < n_segs && total < dict_capacity; i++) { const size_t seg_off = segs[i].offset; const size_t seg_end = seg_off + segs[i].length; @@ -310,24 +311,42 @@ int64_t zxc_train_dict(const void* const* samples, const size_t* sample_sizes, if (cur * 2 < segs[i].score) continue; size_t copy = segs[i].length; - if (copy > dict_capacity - filled) copy = dict_capacity - filled; - ZXC_MEMCPY(out + filled, corpus + seg_off, copy); - filled += copy; + if (copy > dict_capacity - total) copy = dict_capacity - total; /* One copy in the dictionary serves all future matches: mark this * segment's k-grams as covered so later segments cover new ground. */ for (size_t p = seg_off; p + ZXC_DICT_KGRAM_LEN <= seg_end; p += ZXC_DICT_KGRAM_LEN) freq[zxc_dict_hash(corpus + p)] = 0; + + /* Record the pick (n_sel <= i, so this never clobbers an unread entry). */ + segs[n_sel].offset = (uint32_t)seg_off; + segs[n_sel].length = (uint16_t)copy; + n_sel++; + total += copy; } ZXC_FREE(freq); - /* If we haven't filled the capacity, pad with tail of corpus. */ - if (filled < dict_capacity) { - const size_t pad = dict_capacity - filled; - const size_t tail = (corpus_size > pad) ? pad : corpus_size; - ZXC_MEMCPY(out + filled, corpus + corpus_size - tail, tail); - filled += tail; + /* Step 5: emit picks in reverse order so the highest-coverage segment ends + * up at the END of the dict. The dict sits just before the data, so bytes + * nearer its end have the smallest match offset: cheapest to encode and the + * last to leave the 16-bit (65535) offset window. 
+ * + * No padding: if the picks don't fill the capacity, the dict is just + * shorter. The old tail-padding only added low-value bytes that raised + * offsets for everything after them. */ + size_t filled = 0; + for (size_t i = n_sel; i-- > 0;) { + ZXC_MEMCPY(out + filled, corpus + segs[i].offset, segs[i].length); + filled += segs[i].length; + } + + /* Nothing selected (every segment subsumed by earlier picks): fall back to + * the corpus tail so the dict is never empty, like the n_segs == 0 path. */ + if (UNLIKELY(filled == 0)) { + const size_t tail = (corpus_size < dict_capacity) ? corpus_size : dict_capacity; + ZXC_MEMCPY(out, corpus + corpus_size - tail, tail); + filled = tail; } ZXC_FREE(segs); From 07462bb36993a5943cabfdb62c84e72fff21dd40 Mon Sep 17 00:00:00 2001 From: hellobertrand <5901952+hellobertrand@users.noreply.github.com> Date: Mon, 1 Jun 2026 14:50:05 +0200 Subject: [PATCH 38/47] feat: Validate dictionary ID in file header during decompression Introduces checks to ensure that if a file was compressed with a dictionary, the correct dictionary is provided for decompression. The dictionary ID from the file header is compared against the provided dictionary's ID. This prevents silent data corruption and returns specific errors (ZXC_ERROR_DICT_REQUIRED or ZXC_ERROR_DICT_MISMATCH) if the dictionary is missing or mismatched. This applies to both stream and seekable decompression paths. 
--- src/lib/zxc_driver.c | 9 ++- src/lib/zxc_seekable.c | 23 +++++-- tests/test_common.h | 2 + tests/test_dict.c | 147 +++++++++++++++++++++++++++++++++++++++++ tests/test_main.c | 2 + 5 files changed, 177 insertions(+), 6 deletions(-) diff --git a/src/lib/zxc_driver.c b/src/lib/zxc_driver.c index c7fd07c8..8750f3ea 100644 --- a/src/lib/zxc_driver.c +++ b/src/lib/zxc_driver.c @@ -628,10 +628,17 @@ static int64_t zxc_stream_engine_run(FILE* f_in, FILE* f_out, const int n_thread if (mode == 0) { // Decompression Mode: Read and validate file header uint8_t h[ZXC_FILE_HEADER_SIZE]; + uint32_t header_dict_id = 0; if (UNLIKELY(fread(h, 1, ZXC_FILE_HEADER_SIZE, f_in) != ZXC_FILE_HEADER_SIZE || zxc_read_file_header(h, ZXC_FILE_HEADER_SIZE, &runtime_chunk_sz, &file_has_chk, - NULL) != ZXC_OK)) + &header_dict_id) != ZXC_OK)) return ZXC_ERROR_BAD_HEADER; + + if (header_dict_id != 0) { + if (UNLIKELY(!dict || dict_size == 0)) return ZXC_ERROR_DICT_REQUIRED; + if (UNLIKELY(zxc_dict_id(dict, dict_size) != header_dict_id)) + return ZXC_ERROR_DICT_MISMATCH; + } } int num_threads = (n_threads > 0) ? n_threads : (int)sysconf(_SC_NPROCESSORS_ONLN); diff --git a/src/lib/zxc_seekable.c b/src/lib/zxc_seekable.c index af633d8f..d3336630 100644 --- a/src/lib/zxc_seekable.c +++ b/src/lib/zxc_seekable.c @@ -29,6 +29,7 @@ #include "../../include/zxc_seekable.h" +#include "../../include/zxc_dict.h" #include "../../include/zxc_error.h" #include "zxc_internal.h" @@ -166,6 +167,7 @@ struct zxc_seekable_s { * fits in 21 bits. 
*/ uint32_t block_size; int file_has_checksums; + uint32_t expected_dict_id; /* dict_id from the file header; 0 = no dictionary */ /* Reusable decompression context (single-threaded path only) */ zxc_cctx_t dctx; @@ -197,8 +199,9 @@ static zxc_seekable* zxc_seekable_parse(const uint8_t* data, const size_t data_s /* Step 1: validate file header => block_size */ size_t block_size_sz = 0; int file_has_chk = 0; - if (UNLIKELY(zxc_read_file_header(data, data_size, &block_size_sz, &file_has_chk, NULL) != - ZXC_OK)) + uint32_t header_dict_id = 0; + if (UNLIKELY(zxc_read_file_header(data, data_size, &block_size_sz, &file_has_chk, + &header_dict_id) != ZXC_OK)) return NULL; // LCOV_EXCL_LINE const uint32_t block_size = (uint32_t)block_size_sz; if (UNLIKELY(block_size == 0)) return NULL; // LCOV_EXCL_LINE @@ -241,6 +244,7 @@ static zxc_seekable* zxc_seekable_parse(const uint8_t* data, const size_t data_s s->num_blocks = num_blocks; s->block_size = block_size; s->file_has_checksums = file_has_chk; + s->expected_dict_id = header_dict_id; s->src = data; s->src_size = (uint64_t)data_size; @@ -332,7 +336,9 @@ zxc_seekable* zxc_seekable_open_reader(const zxc_reader_t* r) { size_t bs_sz = 0; int fhc = 0; - if (UNLIKELY(zxc_read_file_header(header, ZXC_FILE_HEADER_SIZE, &bs_sz, &fhc, NULL) != ZXC_OK)) + uint32_t header_dict_id = 0; + if (UNLIKELY(zxc_read_file_header(header, ZXC_FILE_HEADER_SIZE, &bs_sz, &fhc, + &header_dict_id) != ZXC_OK)) return NULL; // LCOV_EXCL_LINE const uint32_t bs = (uint32_t)bs_sz; if (UNLIKELY(bs == 0)) return NULL; @@ -394,6 +400,7 @@ zxc_seekable* zxc_seekable_open_reader(const zxc_reader_t* r) { s->num_blocks = num_blocks; s->block_size = bs; s->file_has_checksums = fhc; + s->expected_dict_id = header_dict_id; s->comp_sizes = (uint32_t*)ZXC_CALLOC(num_blocks, sizeof(uint32_t)); s->comp_offsets = (uint64_t*)ZXC_CALLOC((size_t)num_blocks + 1, sizeof(uint64_t)); @@ -502,10 +509,12 @@ static int zxc_seek_read_block(const zxc_seekable* s, const uint32_t 
block_idx, int64_t zxc_seekable_decompress_range(zxc_seekable* s, void* dst, const size_t dst_capacity, const uint64_t offset, const size_t len) { - if (UNLIKELY(!s || !dst)) return ZXC_ERROR_NULL_INPUT; if (UNLIKELY(len == 0)) return 0; + if (UNLIKELY(!s || !dst)) return ZXC_ERROR_NULL_INPUT; if (UNLIKELY(dst_capacity < len)) return ZXC_ERROR_DST_TOO_SMALL; if (UNLIKELY(offset + len > s->total_decomp)) return ZXC_ERROR_SRC_TOO_SMALL; + if (UNLIKELY(s->expected_dict_id != 0 && (!s->dict || s->dict_size == 0))) + return ZXC_ERROR_DICT_REQUIRED; /* Initialize decompression context on first use */ if (!s->dctx_initialized) { @@ -719,10 +728,12 @@ static void* zxc_seek_mt_worker(void* arg) { int64_t zxc_seekable_decompress_range_mt(zxc_seekable* s, void* dst, const size_t dst_capacity, const uint64_t offset, const size_t len, int n_threads) { - if (UNLIKELY(!s || !dst)) return ZXC_ERROR_NULL_INPUT; if (UNLIKELY(len == 0)) return 0; + if (UNLIKELY(!s || !dst)) return ZXC_ERROR_NULL_INPUT; if (UNLIKELY(dst_capacity < len)) return ZXC_ERROR_DST_TOO_SMALL; if (UNLIKELY(offset + len > s->total_decomp)) return ZXC_ERROR_SRC_TOO_SMALL; + if (UNLIKELY(s->expected_dict_id != 0 && (!s->dict || s->dict_size == 0))) + return ZXC_ERROR_DICT_REQUIRED; /* Find block range - O(1) division */ const uint32_t blk_start = zxc_seek_find_block(s->block_size, offset); @@ -850,6 +861,8 @@ void zxc_seekable_free(zxc_seekable* s) { int zxc_seekable_set_dict(zxc_seekable* s, const void* dict, const size_t dict_size) { if (UNLIKELY(!s || !dict || dict_size == 0)) return ZXC_ERROR_NULL_INPUT; if (UNLIKELY(dict_size > ZXC_DICT_SIZE_MAX)) return ZXC_ERROR_DICT_TOO_LARGE; + if (UNLIKELY(s->expected_dict_id != 0 && zxc_dict_id(dict, dict_size) != s->expected_dict_id)) + return ZXC_ERROR_DICT_MISMATCH; ZXC_FREE(s->dict); ZXC_FREE(s->dict_work); diff --git a/tests/test_common.h b/tests/test_common.h index 7d6c86eb..1e4402f7 100644 --- a/tests/test_common.h +++ b/tests/test_common.h @@ -197,5 +197,7 @@ 
int test_dict_seekable_roundtrip(void); int test_dict_train_roundtrip(void); int test_dict_train_no_frequent_patterns(void); int test_dict_seekable_mt_roundtrip(void); +int test_dict_stream_dict_id_checks(void); +int test_dict_seekable_dict_id_checks(void); #endif /* ZXC_TEST_COMMON_H */ diff --git a/tests/test_dict.c b/tests/test_dict.c index d4c52f4f..b9ecb246 100644 --- a/tests/test_dict.c +++ b/tests/test_dict.c @@ -793,3 +793,150 @@ int test_dict_seekable_mt_roundtrip(void) { printf("PASS\n\n"); return 1; } + +static const uint8_t k_dict_a[] = + "The quick brown fox jumps over the lazy dog. " + "Lorem ipsum dolor sit amet, consectetur adipiscing elit."; +static const uint8_t k_dict_b[] = + "A completely unrelated dictionary payload hashing to a different dict_id value."; + +// Compress `src` with dict A to a tmpfile, then try to stream-decompress it with +// `dec_dict` (NULL = none) and assert the decoder returns `want_err`. +static int stream_dict_error_case(const char* label, const uint8_t* dec_dict, size_t dec_dict_size, + int64_t want_err) { + const size_t src_size = 8192; + uint8_t* src = (uint8_t*)malloc(src_size); + gen_dict_friendly_data(src, src_size, k_dict_a, sizeof(k_dict_a) - 1); + + FILE* f_src = tmpfile(); + FILE* f_comp = tmpfile(); + FILE* f_dec = tmpfile(); + int ok = 1; + if (!f_src || !f_comp || !f_dec) { + printf(" [FAIL] %s: tmpfile() failed\n", label); + ok = 0; + } + + if (ok) { + fwrite(src, 1, src_size, f_src); + rewind(f_src); + zxc_compress_opts_t copts = {.level = ZXC_LEVEL_DEFAULT, + .checksum_enabled = 1, + .dict = k_dict_a, + .dict_size = sizeof(k_dict_a) - 1}; + if (zxc_stream_compress(f_src, f_comp, &copts) <= 0) { + printf(" [FAIL] %s: stream_compress failed\n", label); + ok = 0; + } + } + + if (ok) { + rewind(f_comp); + zxc_decompress_opts_t dopts = { + .checksum_enabled = 1, .dict = dec_dict, .dict_size = dec_dict_size}; + int64_t rc = zxc_stream_decompress(f_comp, f_dec, &dopts); + if (rc != want_err) { + printf(" [FAIL] 
%s: expected %s, got %lld (%s)\n", label, zxc_error_name((int)want_err), + (long long)rc, zxc_error_name((int)rc)); + ok = 0; + } + } + + if (f_src) fclose(f_src); + if (f_comp) fclose(f_comp); + if (f_dec) fclose(f_dec); + free(src); + return ok; +} + +int test_dict_stream_dict_id_checks(void) { + printf("=== TEST: Dict - stream decode rejects missing/wrong dict ===\n"); + int ok = stream_dict_error_case("missing dict", NULL, 0, ZXC_ERROR_DICT_REQUIRED); + ok &= stream_dict_error_case("wrong dict", k_dict_b, sizeof(k_dict_b) - 1, + ZXC_ERROR_DICT_MISMATCH); + if (!ok) return 0; + printf("PASS\n\n"); + return 1; +} + +int test_dict_seekable_dict_id_checks(void) { + printf("=== TEST: Dict - seekable decode rejects missing/wrong dict ===\n"); + + const size_t src_size = 8192; + uint8_t* src = (uint8_t*)malloc(src_size); + gen_dict_friendly_data(src, src_size, k_dict_a, sizeof(k_dict_a) - 1); + + size_t comp_bound = (size_t)zxc_compress_bound(src_size); + uint8_t* compressed = (uint8_t*)malloc(comp_bound); + zxc_compress_opts_t copts = {.level = ZXC_LEVEL_DEFAULT, + .checksum_enabled = 1, + .seekable = 1, + .dict = k_dict_a, + .dict_size = sizeof(k_dict_a) - 1}; + int64_t comp_size = zxc_compress(src, src_size, compressed, comp_bound, &copts); + + uint8_t* out = (uint8_t*)malloc(src_size); + int ok = 1; + + if (comp_size <= 0) { + printf(" [FAIL] seekable compress failed\n"); + ok = 0; + } + + // 1. Wrong dict via set_dict must be rejected up front. + if (ok) { + zxc_seekable* s = zxc_seekable_open(compressed, (size_t)comp_size); + if (!s) { + printf(" [FAIL] seekable_open returned NULL\n"); + ok = 0; + } else { + int rc = zxc_seekable_set_dict(s, k_dict_b, sizeof(k_dict_b) - 1); + if (rc != ZXC_ERROR_DICT_MISMATCH) { + printf(" [FAIL] set_dict(wrong): expected DICT_MISMATCH, got %d (%s)\n", rc, + zxc_error_name(rc)); + ok = 0; + } + zxc_seekable_free(s); + } + } + + // 2. 
Decoding without any dict must be rejected, not silently corrupt + // (single-threaded and multi-threaded entry points). + if (ok) { + zxc_seekable* s = zxc_seekable_open(compressed, (size_t)comp_size); + if (!s) { + printf(" [FAIL] seekable_open returned NULL\n"); + ok = 0; + } else { + int64_t st = zxc_seekable_decompress_range(s, out, src_size, 0, src_size); + int64_t mt = zxc_seekable_decompress_range_mt(s, out, src_size, 0, src_size, 4); + if (st != ZXC_ERROR_DICT_REQUIRED || mt != ZXC_ERROR_DICT_REQUIRED) { + printf(" [FAIL] no-dict decode: expected DICT_REQUIRED, got st=%lld mt=%lld\n", + (long long)st, (long long)mt); + ok = 0; + } + zxc_seekable_free(s); + } + } + + // 3. Correct dict still works (guard against over-rejection). + if (ok) { + zxc_seekable* s = zxc_seekable_open(compressed, (size_t)comp_size); + if (s && zxc_seekable_set_dict(s, k_dict_a, sizeof(k_dict_a) - 1) == ZXC_OK && + zxc_seekable_decompress_range(s, out, src_size, 0, src_size) == (int64_t)src_size && + memcmp(src, out, src_size) == 0) { + // expected + } else { + printf(" [FAIL] correct dict roundtrip regressed\n"); + ok = 0; + } + zxc_seekable_free(s); + } + + free(out); + free(src); + free(compressed); + if (!ok) return 0; + printf("PASS\n\n"); + return 1; +} diff --git a/tests/test_main.c b/tests/test_main.c index e72417df..cef3c09a 100644 --- a/tests/test_main.c +++ b/tests/test_main.c @@ -128,6 +128,8 @@ static const test_entry_t g_tests[] = { TEST_CASE(test_dict_train_roundtrip), TEST_CASE(test_dict_train_no_frequent_patterns), TEST_CASE(test_dict_seekable_mt_roundtrip), + TEST_CASE(test_dict_stream_dict_id_checks), + TEST_CASE(test_dict_seekable_dict_id_checks), /* --- Seekable (single-threaded) --- */ TEST_CASE(test_seekable_table_sizes), From 0564dad2260b70e4accff20c0ec02900542091be Mon Sep 17 00:00:00 2001 From: hellobertrand <5901952+hellobertrand@users.noreply.github.com> Date: Mon, 1 Jun 2026 14:52:27 +0200 Subject: [PATCH 39/47] feat: List information for zxc dictionary 
files The `-l, --list` command now supports inspecting `.zxd` dictionary files. When a dictionary file is provided, the CLI will report its `Dict ID` and `Content size` in both human-readable and JSON formats. This allows users to verify dictionary integrity and retrieve the dictionary ID, which is essential for matching dictionaries with compressed archives. --- src/cli/main.c | 58 ++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 56 insertions(+), 2 deletions(-) diff --git a/src/cli/main.c b/src/cli/main.c index b46b053b..ca20bb40 100644 --- a/src/cli/main.c +++ b/src/cli/main.c @@ -468,7 +468,7 @@ void print_help(const char* app) { "Standard Modes:\n" " -z, --compress Compress FILE {default}\n" " -d, --decompress Decompress FILE (or stdin -> stdout)\n" - " -l, --list List archive information\n" + " -l, --list List archive or dictionary info\n" " -t, --test Test compressed FILE integrity\n" " -b, --bench [N] Benchmark in-memory (N=seconds, default 5)\n" " --train-dict FILE Train a dictionary from input files\n\n" @@ -617,6 +617,37 @@ static void cli_progress_callback(uint64_t bytes_processed, uint64_t bytes_total * @param[in] json_output If 1, output JSON format. * @return 0 on success, 1 on error. */ +// Report a .zxd dictionary file: its dict_id (to match against a .zxc's +// "Dict ID") and content size. `buf` holds the whole .zxd file. 
+static int zxc_list_dict(const char* path, const uint8_t* buf, size_t buf_size, long long file_size, + int json_output) { + const void* content = NULL; + size_t content_size = 0; + uint32_t id = 0; + const int rc = zxc_dict_load(buf, buf_size, &content, &content_size, &id); + if (rc != ZXC_OK) { + fprintf(stderr, "Error: invalid dictionary '%s': %s\n", path, zxc_error_name(rc)); + return 1; + } + if (json_output) { + printf("{\n" + " \"type\": \"dictionary\",\n" + " \"filename\": \"%s\",\n" + " \"dict_id\": \"0x%08X\",\n" + " \"content_size_bytes\": %zu,\n" + " \"file_size_bytes\": %lld\n" + "}\n", + path, id, content_size, file_size); + } else { + printf("\n Dictionary file (.zxd)\n" + " Dict ID: 0x%08X\n" + " Content size: %zu bytes\n" + " File: %s\n", + id, content_size, path); + } + return 0; +} + static int zxc_list_archive(const char* path, int json_output) { char resolved_path[4096]; if (zxc_validate_input_path(path, resolved_path, sizeof(resolved_path)) != 0) { @@ -638,6 +669,29 @@ static int zxc_list_archive(const char* path, int json_output) { } const long long file_size = ftello(f); + // A .zxd dictionary file has its own magic word; recognise it and report + // its dict_id (for matching against a .zxc's "Dict ID") instead of failing + // as a non-archive. The upper bound is the largest possible .zxd file. 
+ if (file_size >= (long long)ZXC_DICT_HEADER_SIZE && + file_size <= (long long)zxc_dict_save_bound(ZXC_DICT_SIZE_MAX)) { + uint8_t probe[ZXC_DICT_HEADER_SIZE]; + if (fseeko(f, 0, SEEK_SET) == 0 && + fread(probe, 1, ZXC_DICT_HEADER_SIZE, f) == ZXC_DICT_HEADER_SIZE && + zxc_dict_get_id(probe, ZXC_DICT_HEADER_SIZE) != 0) { + uint8_t* dbuf = (uint8_t*)malloc((size_t)file_size); + int r = 1; + if (dbuf && fseeko(f, 0, SEEK_SET) == 0 && + fread(dbuf, 1, (size_t)file_size, f) == (size_t)file_size) + r = zxc_list_dict(path, dbuf, (size_t)file_size, file_size, json_output); + else + fprintf(stderr, "Error: Cannot read '%s'\n", path); + free(dbuf); + fclose(f); + return r; + } + fseeko(f, 0, SEEK_SET); + } + // Use public API to get decompressed size const int64_t uncompressed_size = zxc_stream_get_decompressed_size(f); if (uncompressed_size < 0) { @@ -1014,7 +1068,7 @@ static int process_single_file(const char* in_path, const char* out_path_overrid " Reason: Integrity check failed (corrupted data or invalid checksum)\n"); } } else { - zxc_log("Operation failed on %s.\n", in_path ? in_path : ""); + zxc_log("Error: %s: %s\n", in_path ? in_path : "", zxc_error_name((int)bytes)); if (created_out_file) unlink(resolved_out_path); } overall_ret = 1; From a4ffe78d8c15f5ee61f6de8bcd06e21c61997c13 Mon Sep 17 00:00:00 2001 From: hellobertrand <5901952+hellobertrand@users.noreply.github.com> Date: Mon, 1 Jun 2026 15:23:31 +0200 Subject: [PATCH 40/47] feat: Improve dictionary training for large corpora Previously, k-gram frequency counters could saturate on large inputs, preventing effective segment selection. Candidate segment generation was also biased towards the corpus's prefix, ignoring later content. This change introduces sampling for k-gram frequencies to keep counters unsaturated. It also adapts the segment generation stride to distribute candidate segment starts more evenly across the entire corpus, ensuring better representation and segment quality for large datasets. 
--- src/lib/zxc_dict.c | 22 ++++++++++++++++------ src/lib/zxc_internal.h | 6 ++++++ 2 files changed, 22 insertions(+), 6 deletions(-) diff --git a/src/lib/zxc_dict.c b/src/lib/zxc_dict.c index 99ea1c86..fd0f582f 100644 --- a/src/lib/zxc_dict.c +++ b/src/lib/zxc_dict.c @@ -236,17 +236,27 @@ int64_t zxc_train_dict(const void* const* samples, const size_t* sample_sizes, } ZXC_MEMSET(freq, 0, ZXC_DICT_HT_SIZE * sizeof(uint16_t)); + /* Count k-gram frequencies on a representative sample of positions, not all + * of them: counting a large corpus in full saturates the 16-bit counters, + * so the segment-extension test never stops and segments balloon into + * filler. Sampling keeps counts unsaturated and spread across the corpus. */ const size_t kgram_limit = corpus_size - ZXC_DICT_KGRAM_LEN + 1; - for (size_t i = 0; i < kgram_limit; i++) { + size_t freq_stride = kgram_limit / ZXC_DICT_FREQ_SAMPLE_TARGET; + if (freq_stride < 1) freq_stride = 1; + for (size_t i = 0; i < kgram_limit; i += freq_stride) { const uint32_t h = zxc_dict_hash(corpus + i); if (freq[h] < UINT16_MAX) freq[h]++; } - /* Step 3: build candidate segments. Stride by the k-gram length so - * candidate starts don't overlap; each segment is scored by its coverage. */ - const size_t stride = ZXC_DICT_KGRAM_LEN; - const size_t max_segs = corpus_size / stride; - const size_t seg_alloc = (max_segs < 65536) ? max_segs : 65536; + /* Step 3: build candidate segments, each scored by its coverage. Spread the + * candidate starts across the whole corpus: a fixed k-gram stride exhausts + * the segment budget within the prefix, leaving a large input's later + * content unseen. Segments still extend k-gram by k-gram, so they stay + * contiguous. */ + const size_t max_segs = corpus_size / ZXC_DICT_KGRAM_LEN; + const size_t seg_alloc = (max_segs < ZXC_DICT_MAX_SEGMENTS) ? 
max_segs : ZXC_DICT_MAX_SEGMENTS; + size_t stride = ZXC_DICT_KGRAM_LEN; + if (seg_alloc > 0 && corpus_size / seg_alloc > stride) stride = corpus_size / seg_alloc; zxc_dict_seg_t* segs = (zxc_dict_seg_t*)ZXC_MALLOC(seg_alloc * sizeof(zxc_dict_seg_t)); if (UNLIKELY(!segs)) { diff --git a/src/lib/zxc_internal.h b/src/lib/zxc_internal.h index ea10786e..49d42a4f 100644 --- a/src/lib/zxc_internal.h +++ b/src/lib/zxc_internal.h @@ -349,6 +349,12 @@ extern "C" { #define ZXC_DICT_KGRAM_LEN ZXC_LZ_MIN_MATCH_LEN /** @brief Address bits for the dictionary trainer's k-gram frequency table. */ #define ZXC_DICT_HT_BITS 16 +/** @brief Maximum number of candidate segments the dictionary trainer keeps. */ +#define ZXC_DICT_MAX_SEGMENTS (1U << 16) +/** @brief Target number of sampled k-gram positions for the trainer's frequency + * estimate. Bounds the count so 16-bit counters stay unsaturated on large + * corpora; the trainer strides the corpus to hit roughly this many positions. */ +#define ZXC_DICT_FREQ_SAMPLE_TARGET (1U << 19) /** @brief Number of buckets in the dictionary trainer's frequency table. */ #define ZXC_DICT_HT_SIZE (1U << ZXC_DICT_HT_BITS) From 88622da59b9c2060412060822b8407daa0ba01ba Mon Sep 17 00:00:00 2001 From: hellobertrand <5901952+hellobertrand@users.noreply.github.com> Date: Mon, 1 Jun 2026 16:51:37 +0200 Subject: [PATCH 41/47] feat: Embed dictionaries directly into zxc archives This refactors dictionary handling to embed trained dictionaries directly within ZXC archives, eliminating the need for separate `.zxd` dictionary files. This simplifies deployment and ensures archives are self-contained for decompression. Key changes include: - Removal of the standalone `.zxd` file format and its associated API (save/load, header, magic, etc.). - Dictionaries are now stored as a `ZXC_BLOCK_DICT` block immediately after the ZXC file header when `ZXC_FILE_FLAG_HAS_DICTIONARY` is set. 
- The CLI is updated: - The `-D, --dict` option for providing external dictionary files is removed. - A new `--auto-dict` option is added to train a dictionary from input files and embed it automatically during compression. - The `list` command no longer supports inspecting `.zxd` files. - Decompression logic (stream and seekable) automatically reads and validates the embedded dictionary, removing the need to provide an external dictionary at decode time. --- include/zxc_constants.h | 7 +- include/zxc_dict.h | 79 ++-------- src/cli/main.c | 315 ++++++++++------------------------------ src/lib/zxc_dict.c | 88 +---------- src/lib/zxc_driver.c | 50 ++++++- src/lib/zxc_internal.h | 12 +- src/lib/zxc_seekable.c | 87 ++++++++++- tests/test_common.h | 1 - tests/test_dict.c | 116 +++------------ tests/test_main.c | 1 - 10 files changed, 250 insertions(+), 506 deletions(-) diff --git a/include/zxc_constants.h b/include/zxc_constants.h index a01f0a91..d22ae35d 100644 --- a/include/zxc_constants.h +++ b/include/zxc_constants.h @@ -71,12 +71,9 @@ */ /** @brief Maximum dictionary content size in bytes (64 KB - 1). * - * Bounded to a 16-bit value (65535) by two constraints that both cap at the - * same number: the `.zxd` header stores the content size in a 16-bit field, and - * LZ77 match offsets are 16-bit (max distance 65535). */ + * Bounded to 65535 because LZ77 match offsets are 16-bit (max distance 65535): + * a dictionary byte farther back than that could never be referenced. */ #define ZXC_DICT_SIZE_MAX ((1U << 16) - 1U) -/** @brief Size of the .zxd dictionary file header in bytes. */ -#define ZXC_DICT_HEADER_SIZE 16 /** @} */ /* end of dictionary */ /** diff --git a/include/zxc_dict.h b/include/zxc_dict.h index b5818fdc..0d38b34b 100644 --- a/include/zxc_dict.h +++ b/include/zxc_dict.h @@ -9,26 +9,19 @@ * @file zxc_dict.h * @brief Pre-trained dictionary API for ZXC compression. 
* - * Provides functions to train, save, load, and identify dictionaries that - * improve compression ratio on small, similar payloads. Dictionaries are - * stored as external `.zxd` files and referenced by a 32-bit ID in the - * ZXC file header. + * Provides functions to train and identify dictionaries that improve + * compression ratio on small, similar payloads. A dictionary is raw byte + * content that prefills the LZ77 sliding window at the start of each block, + * giving the compressor immediate access to representative patterns. * - * A dictionary contains raw byte content that prefills the LZ77 sliding - * window at the start of each block, giving the compressor immediate - * access to representative patterns without waiting for them to appear - * in the input stream. + * Dictionaries are embedded in the archive (no standalone file format): pass + * trained content to zxc_compress_opts_t::dict and it is stored in the archive, + * so decompression needs no external dictionary. * * @code - * // Train a dictionary from a corpus of JSON samples + * // Train a dictionary from a corpus of JSON samples and embed it * void* dict_buf = malloc(32768); * int64_t dict_sz = zxc_train_dict(samples, sizes, n, dict_buf, 32768); - * - * // Save to .zxd file - * void* zxd = malloc(zxc_dict_save_bound(dict_sz)); - * int64_t zxd_sz = zxc_dict_save(dict_buf, dict_sz, zxd, ...); - * - * // Use for compression * zxc_compress_opts_t opts = { .level = 3, .dict = dict_buf, .dict_size = dict_sz }; * zxc_compress(src, src_size, dst, dst_capacity, &opts); * @endcode @@ -48,7 +41,7 @@ extern "C" { /** * @defgroup dict Dictionary - * @brief Pre-trained dictionary training, serialization, and identification. + * @brief Pre-trained dictionary training and identification. * @{ */ @@ -56,8 +49,8 @@ extern "C" { * @brief Compute the dictionary ID for the given content. * * The ID is a deterministic 32-bit hash of the raw dictionary content. 
- * It is stored in the ZXC file header so the decoder can verify that - * the correct dictionary is provided at decompression time. + * It is stored in the ZXC file header so the decoder can verify the embedded + * dictionary matches. * * @param[in] dict Pointer to dictionary content. * @param[in] dict_size Size in bytes. @@ -65,60 +58,12 @@ extern "C" { */ ZXC_EXPORT uint32_t zxc_dict_id(const void* dict, size_t dict_size); -/** - * @brief Load and validate a `.zxd` dictionary file from a memory buffer. - * - * On success, @p content_out points into the input buffer (zero-copy). - * The caller must keep @p buf alive while the content pointer is in use. - * - * @param[in] buf Buffer containing the .zxd file. - * @param[in] buf_size Size of @p buf in bytes. - * @param[out] content_out Receives a pointer to the dictionary content. - * @param[out] content_size_out Receives the content size in bytes. - * @param[out] dict_id_out Receives the dictionary ID (may be NULL). - * @return @ref ZXC_OK on success, or a negative @ref zxc_error_t code. - */ -ZXC_EXPORT int zxc_dict_load(const void* buf, size_t buf_size, const void** content_out, - size_t* content_size_out, uint32_t* dict_id_out); - -/** - * @brief Serialize dictionary content to the `.zxd` file format. - * - * @param[in] content Raw dictionary content. - * @param[in] content_size Size of @p content in bytes (max ZXC_DICT_SIZE_MAX). - * @param[out] buf Output buffer for the .zxd file. - * @param[in] buf_capacity Capacity of @p buf. - * @return Number of bytes written on success, or a negative @ref zxc_error_t code. - */ -ZXC_EXPORT int64_t zxc_dict_save(const void* content, size_t content_size, void* buf, - size_t buf_capacity); - -/** - * @brief Returns the maximum .zxd file size for a given content size. - * - * @param[in] content_size Size of the dictionary content. - * @return Total .zxd file size (header + content). 
- */ -ZXC_EXPORT size_t zxc_dict_save_bound(size_t content_size); - -/** - * @brief Returns the dictionary ID stored in a `.zxd` file buffer. - * - * Reads the dict_id field from the .zxd header without validating the full - * file. Returns 0 if the buffer is too small or the magic word doesn't match. - * - * @param[in] buf Buffer containing the .zxd file. - * @param[in] buf_size Size of @p buf in bytes. - * @return Dictionary ID, or 0 if the buffer is not a valid .zxd file. - */ -ZXC_EXPORT uint32_t zxc_dict_get_id(const void* buf, size_t buf_size); - /** * @brief Train a dictionary from a corpus of samples. * * Analyzes the samples to select byte sequences that maximize LZ77 match * coverage. The resulting dictionary content can be passed directly to - * zxc_compress_opts_t::dict or serialized with zxc_dict_save(). + * zxc_compress_opts_t::dict (it is then embedded in the archive). * * @param[in] samples Array of pointers to sample buffers. * @param[in] sample_sizes Array of sample sizes in bytes. diff --git a/src/cli/main.c b/src/cli/main.c index ca20bb40..9fda8b1c 100644 --- a/src/cli/main.c +++ b/src/cli/main.c @@ -299,6 +299,7 @@ static int zxc_validate_output_path(const char* path, char* resolved_buffer, siz // CLI Logging Helpers static int g_quiet = 0; static int g_verbose = 0; +static int g_auto_dict = 0; /* --auto-dict: train a dictionary from the input and embed it */ /** * @brief Standard logging function. Respects the global quiet flag. 
@@ -344,11 +345,10 @@ typedef enum { MODE_DECOMPRESS, MODE_BENCHMARK, MODE_INTEGRITY, - MODE_LIST, - MODE_TRAIN_DICT + MODE_LIST } zxc_mode_t; -enum { OPT_VERSION = 1000, OPT_HELP, OPT_TRAIN_DICT }; +enum { OPT_VERSION = 1000, OPT_HELP, OPT_AUTO_DICT }; // Forward declaration for recursive mode static int process_single_file(const char* in_path, const char* out_path_override, zxc_mode_t mode, @@ -468,10 +468,10 @@ void print_help(const char* app) { "Standard Modes:\n" " -z, --compress Compress FILE {default}\n" " -d, --decompress Decompress FILE (or stdin -> stdout)\n" - " -l, --list List archive or dictionary info\n" + " -l, --list List archive info\n" " -t, --test Test compressed FILE integrity\n" " -b, --bench [N] Benchmark in-memory (N=seconds, default 5)\n" - " --train-dict FILE Train a dictionary from input files\n\n" + "\n" "Batch Processing:\n" " -m, --multiple Multiple input files\n" " -r, --recursive Operate recursively on directories\n\n" @@ -484,7 +484,7 @@ void print_help(const char* app) { " -T, --threads N Number of threads (0=auto)\n" " -C, --checksum Enable checksum {default}\n" " -N, --no-checksum Disable checksum\n" - " -D, --dict FILE Use pre-trained dictionary (.zxd) for compression/decompression\n" + " --auto-dict Train a dictionary from the input and embed it (compression)\n" " -S, --seekable Append seek table for random-access decompression\n" " -k, --keep Keep input file\n" " -f, --force Force overwrite\n" @@ -617,37 +617,6 @@ static void cli_progress_callback(uint64_t bytes_processed, uint64_t bytes_total * @param[in] json_output If 1, output JSON format. * @return 0 on success, 1 on error. */ -// Report a .zxd dictionary file: its dict_id (to match against a .zxc's -// "Dict ID") and content size. `buf` holds the whole .zxd file. 
-static int zxc_list_dict(const char* path, const uint8_t* buf, size_t buf_size, long long file_size, - int json_output) { - const void* content = NULL; - size_t content_size = 0; - uint32_t id = 0; - const int rc = zxc_dict_load(buf, buf_size, &content, &content_size, &id); - if (rc != ZXC_OK) { - fprintf(stderr, "Error: invalid dictionary '%s': %s\n", path, zxc_error_name(rc)); - return 1; - } - if (json_output) { - printf("{\n" - " \"type\": \"dictionary\",\n" - " \"filename\": \"%s\",\n" - " \"dict_id\": \"0x%08X\",\n" - " \"content_size_bytes\": %zu,\n" - " \"file_size_bytes\": %lld\n" - "}\n", - path, id, content_size, file_size); - } else { - printf("\n Dictionary file (.zxd)\n" - " Dict ID: 0x%08X\n" - " Content size: %zu bytes\n" - " File: %s\n", - id, content_size, path); - } - return 0; -} - static int zxc_list_archive(const char* path, int json_output) { char resolved_path[4096]; if (zxc_validate_input_path(path, resolved_path, sizeof(resolved_path)) != 0) { @@ -669,29 +638,6 @@ static int zxc_list_archive(const char* path, int json_output) { } const long long file_size = ftello(f); - // A .zxd dictionary file has its own magic word; recognise it and report - // its dict_id (for matching against a .zxc's "Dict ID") instead of failing - // as a non-archive. The upper bound is the largest possible .zxd file. 
- if (file_size >= (long long)ZXC_DICT_HEADER_SIZE && - file_size <= (long long)zxc_dict_save_bound(ZXC_DICT_SIZE_MAX)) { - uint8_t probe[ZXC_DICT_HEADER_SIZE]; - if (fseeko(f, 0, SEEK_SET) == 0 && - fread(probe, 1, ZXC_DICT_HEADER_SIZE, f) == ZXC_DICT_HEADER_SIZE && - zxc_dict_get_id(probe, ZXC_DICT_HEADER_SIZE) != 0) { - uint8_t* dbuf = (uint8_t*)malloc((size_t)file_size); - int r = 1; - if (dbuf && fseeko(f, 0, SEEK_SET) == 0 && - fread(dbuf, 1, (size_t)file_size, f) == (size_t)file_size) - r = zxc_list_dict(path, dbuf, (size_t)file_size, file_size, json_output); - else - fprintf(stderr, "Error: Cannot read '%s'\n", path); - free(dbuf); - fclose(f); - return r; - } - fseeko(f, 0, SEEK_SET); - } - // Use public API to get decompressed size const int64_t uncompressed_size = zxc_stream_get_decompressed_size(f); if (uncompressed_size < 0) { @@ -791,6 +737,53 @@ static int zxc_list_archive(const char* path, int json_output) { return 0; } +// --auto-dict: train a dictionary from (a bounded prefix of) the input file and +// return it as a malloc'd buffer (caller frees), sized to the block size. The +// trained dict is meant to be embedded in the archive. Returns NULL on failure. +static void* cli_auto_train_dict(const char* path, size_t block_size, size_t* out_size) { + *out_size = 0; + FILE* f = fopen(path, "rb"); + if (!f) return NULL; + fseeko(f, 0, SEEK_END); + const long long fsz = ftello(f); + fseeko(f, 0, SEEK_SET); + if (fsz <= 0) { + fclose(f); + return NULL; + } + /* Train on up to 16 MB of the input; the trainer samples across it. */ + const size_t cap = (size_t)16u << 20; + const size_t corpus_sz = ((size_t)fsz < cap) ? 
(size_t)fsz : cap; + uint8_t* corpus = (uint8_t*)malloc(corpus_sz); + if (!corpus) { + fclose(f); + return NULL; + } + const size_t got = fread(corpus, 1, corpus_sz, f); + fclose(f); + if (got != corpus_sz) { + free(corpus); + return NULL; + } + size_t dict_cap = ZXC_DICT_SIZE_MAX; + if (block_size > 0 && block_size < dict_cap) dict_cap = block_size; + uint8_t* dict = (uint8_t*)malloc(dict_cap); + if (!dict) { + free(corpus); + return NULL; + } + const void* samples[1] = {corpus}; + const size_t sizes[1] = {corpus_sz}; + const int64_t dsz = zxc_train_dict(samples, sizes, 1, dict, dict_cap); + free(corpus); + if (dsz <= 0) { + free(dict); + return NULL; + } + *out_size = (size_t)dsz; + return dict; +} + static int process_single_file(const char* in_path, const char* out_path_override, zxc_mode_t mode, int num_threads, int keep_input, int force, int to_stdout, int checksum_enabled, int level, size_t block_size, @@ -974,6 +967,22 @@ static int process_single_file(const char* in_path, const char* out_path_overrid .operation = (mode == MODE_COMPRESS) ? "Compressing" : "Decompressing", .total_size = total_size}; + /* --auto-dict: train a dictionary from the input and embed it (compress + * only, and only for a real file — a pipe cannot be re-read to train). 
*/ + const void* eff_dict = dict; + size_t eff_dict_size = dict_size; + void* auto_dict_buf = NULL; + if (g_auto_dict && mode == MODE_COMPRESS && !use_stdin) { + auto_dict_buf = cli_auto_train_dict(resolved_in_path, block_size, &eff_dict_size); + if (auto_dict_buf) { + eff_dict = auto_dict_buf; + zxc_log_v("Auto-trained dictionary: %zu bytes (embedded)\n", eff_dict_size); + } else { + eff_dict_size = dict_size; + zxc_log("Warning: --auto-dict training failed; compressing without a dictionary\n"); + } + } + const double t0 = zxc_now(); int64_t bytes; if (mode == MODE_COMPRESS) { @@ -983,8 +992,8 @@ static int process_single_file(const char* in_path, const char* out_path_overrid .block_size = block_size, .checksum_enabled = checksum_enabled, .seekable = seekable, - .dict = dict, - .dict_size = dict_size, + .dict = eff_dict, + .dict_size = eff_dict_size, .progress_cb = show_progress ? cli_progress_callback : NULL, .user_data = &pctx, }; @@ -1022,6 +1031,7 @@ static int process_single_file(const char* in_path, const char* out_path_overrid free(b1); free(b2); + free(auto_dict_buf); if (bytes >= 0) { if (mode == MODE_INTEGRITY) { @@ -1094,11 +1104,8 @@ int main(int argc, char** argv) { int json_output = 0; size_t block_size = 0; int seekable = 0; - const char* dict_path = NULL; - const char* train_dict_path = NULL; - static const struct option long_options[] = {{"train-dict", required_argument, 0, OPT_TRAIN_DICT}, - {"dict", required_argument, 0, 'D'}, + static const struct option long_options[] = { {"compress", no_argument, 0, 'z'}, {"decompress", no_argument, 0, 'd'}, {"list", no_argument, 0, 'l'}, @@ -1119,12 +1126,13 @@ int main(int argc, char** argv) { {"recursive", no_argument, 0, 'r'}, {"block-size", required_argument, 0, 'B'}, {"seekable", no_argument, 0, 'S'}, + {"auto-dict", no_argument, 0, OPT_AUTO_DICT}, {0, 0, 0, 0}}; int opt; int multiple_mode = 0; int recursive_mode = 0; - while ((opt = getopt_long(argc, argv, "123456b::B:cCdD:fhjklmrNqST:tvVz", 
long_options, NULL)) != + while ((opt = getopt_long(argc, argv, "123456b::B:cCdfhjklmrNqST:tvVz", long_options, NULL)) != -1) { switch (opt) { case 'z': @@ -1208,12 +1216,8 @@ int main(int argc, char** argv) { case 'S': seekable = 1; break; - case 'D': - dict_path = optarg; - break; - case OPT_TRAIN_DICT: - mode = MODE_TRAIN_DICT; - train_dict_path = optarg; + case OPT_AUTO_DICT: + g_auto_dict = 1; break; case 'r': recursive_mode = 1; @@ -1287,171 +1291,10 @@ int main(int argc, char** argv) { checksum = (mode == MODE_BENCHMARK) ? 0 : 1; } - /* Load dictionary file (.zxd) if requested */ + /* Dictionaries are produced internally (--auto-dict) and embedded in the + * archive; the CLI never takes a dictionary as input. */ void* dict = NULL; size_t dict_size = 0; - if (dict_path) { - char resolved_dict[4096]; - if (zxc_validate_input_path(dict_path, resolved_dict, sizeof(resolved_dict)) != 0) { - fprintf(stderr, "Error: invalid dictionary path '%s': %s\n", dict_path, strerror(errno)); - return 1; - } - FILE* f_dict = fopen(resolved_dict, "rb"); - if (!f_dict) { - fprintf(stderr, "Error: cannot open dictionary '%s': %s\n", dict_path, strerror(errno)); - return 1; - } - fseeko(f_dict, 0, SEEK_END); - const long long fsize = ftello(f_dict); - fseeko(f_dict, 0, SEEK_SET); - if (fsize <= 0 || (size_t)fsize > ZXC_DICT_SIZE_MAX + ZXC_DICT_HEADER_SIZE) { - fprintf(stderr, "Error: dictionary file '%s' has invalid size\n", dict_path); - fclose(f_dict); - return 1; - } - uint8_t* zxd_buf = (uint8_t*)malloc((size_t)fsize); - if (!zxd_buf || fread(zxd_buf, 1, (size_t)fsize, f_dict) != (size_t)fsize) { - fprintf(stderr, "Error: failed to read dictionary '%s'\n", dict_path); - free(zxd_buf); - fclose(f_dict); - return 1; - } - fclose(f_dict); - - const void* content = NULL; - size_t content_size = 0; - const int rc = zxc_dict_load(zxd_buf, (size_t)fsize, &content, &content_size, NULL); - if (rc != ZXC_OK) { - fprintf(stderr, "Error: invalid dictionary '%s': %s\n", dict_path, - 
zxc_error_name(rc)); - free(zxd_buf); - return 1; - } - dict = malloc(content_size); - if (!dict) { - free(zxd_buf); - return 1; - } - memcpy(dict, content, content_size); - dict_size = content_size; - free(zxd_buf); - } - - /* - * Train Dictionary Mode - * Reads input files as samples, trains a dictionary, saves as .zxd. - */ - if (mode == MODE_TRAIN_DICT) { - if (optind >= argc) { - fprintf(stderr, "Error: --train-dict requires input files as training samples.\n"); - free(dict); - return 1; - } - const int n_files = argc - optind; - const void** samples = (const void**)malloc((size_t)n_files * sizeof(void*)); - size_t* sample_sizes = (size_t*)malloc((size_t)n_files * sizeof(size_t)); - if (!samples || !sample_sizes) { - fprintf(stderr, "Error: memory allocation failed\n"); - free(samples); - free(sample_sizes); - free(dict); - return 1; - } - int n_loaded = 0; - for (int i = optind; i < argc; i++) { - char resolved[4096]; - if (zxc_validate_input_path(argv[i], resolved, sizeof(resolved)) != 0) { - fprintf(stderr, "Warning: invalid path '%s', skipping\n", argv[i]); - continue; - } - FILE* sf = fopen(resolved, "rb"); - if (!sf) { - fprintf(stderr, "Warning: cannot open '%s', skipping\n", argv[i]); - continue; - } - fseeko(sf, 0, SEEK_END); - size_t sz = (size_t)ftello(sf); - fseeko(sf, 0, SEEK_SET); - if (sz == 0) { fclose(sf); continue; } - uint8_t* buf = (uint8_t*)malloc(sz); - if (!buf) { fclose(sf); continue; } - fread(buf, 1, sz, sf); - fclose(sf); - samples[n_loaded] = buf; - sample_sizes[n_loaded] = sz; - n_loaded++; - } - if (n_loaded == 0) { - fprintf(stderr, "Error: no valid samples loaded\n"); - free(samples); - free(sample_sizes); - free(dict); - return 1; - } - - size_t dict_cap = ZXC_DICT_SIZE_MAX; - if (block_size > 0 && block_size < dict_cap) dict_cap = block_size; - uint8_t* dict_buf = (uint8_t*)malloc(dict_cap); - if (!dict_buf) { - fprintf(stderr, "Error: memory allocation failed\n"); - for (int i = 0; i < n_loaded; i++) free((void*)samples[i]); 
- free(samples); - free(sample_sizes); - free(dict); - return 1; - } - - int64_t dict_sz = zxc_train_dict(samples, sample_sizes, (size_t)n_loaded, - dict_buf, dict_cap); - for (int i = 0; i < n_loaded; i++) free((void*)samples[i]); - free(samples); - free(sample_sizes); - - if (dict_sz <= 0) { - fprintf(stderr, "Error: training failed: %s\n", zxc_error_name((int)dict_sz)); - free(dict_buf); - free(dict); - return 1; - } - - size_t zxd_bound = zxc_dict_save_bound((size_t)dict_sz); - uint8_t* zxd = (uint8_t*)malloc(zxd_bound); - int64_t zxd_sz = zxc_dict_save(dict_buf, (size_t)dict_sz, zxd, zxd_bound); - free(dict_buf); - if (zxd_sz <= 0) { - fprintf(stderr, "Error: dict save failed: %s\n", zxc_error_name((int)zxd_sz)); - free(zxd); - free(dict); - return 1; - } - - FILE* out; -#ifdef _WIN32 - out = fopen(train_dict_path, "wb"); -#else - { - const int fd = open(train_dict_path, O_CREAT | O_WRONLY | O_TRUNC, - S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH); - out = (fd != -1) ? fdopen(fd, "wb") : NULL; - } -#endif - if (!out) { - fprintf(stderr, "Error: cannot create '%s': %s\n", train_dict_path, strerror(errno)); - free(zxd); - free(dict); - return 1; - } - const uint32_t trained_id = zxc_dict_get_id(zxd, (size_t)zxd_sz); - fwrite(zxd, 1, (size_t)zxd_sz, out); - fclose(out); - free(zxd); - - fprintf(stderr, "Trained dictionary: %lld bytes from %d samples -> %s (dict_id: 0x%08X)\n", - (long long)dict_sz, n_loaded, train_dict_path, trained_id); - - free(dict); - return 0; - } /* * Benchmark Mode diff --git a/src/lib/zxc_dict.c b/src/lib/zxc_dict.c index fd0f582f..9ba5e5b8 100644 --- a/src/lib/zxc_dict.c +++ b/src/lib/zxc_dict.c @@ -7,7 +7,9 @@ /** * @file zxc_dict.c - * @brief Pre-trained dictionary: ID computation, .zxd serialization, and training. + * @brief Pre-trained dictionary: content-ID computation and training. Dictionaries + * are embedded in the archive (see ZXC_BLOCK_DICT); there is no standalone + * dictionary file format. 
*/ #include "../../include/zxc_dict.h" @@ -23,90 +25,6 @@ uint32_t zxc_dict_id(const void* dict, const size_t dict_size) { return zxc_checksum(dict, dict_size, 0); } -/* ------------------------------------------------------------------------- - * .zxd format: save / load / bound - * - * Layout (ZXC_DICT_HEADER_SIZE = 16 bytes + content): - * 0x00 4 Magic (0x9CB0D1C7 LE) - * 0x04 1 Version (1) - * 0x05 1 Flags (reserved, 0) - * 0x06 2 Content size (u16 LE) - * 0x08 4 dict_id (u32 LE) - * 0x0C 2 Header CRC16 (zxc_hash16, computed with bytes 0x0C-0x0F zeroed) - * 0x0E 2 Reserved (0) - * 0x10 N Content bytes - * ------------------------------------------------------------------------- */ - -uint32_t zxc_dict_get_id(const void* buf, const size_t buf_size) { - if (UNLIKELY(!buf || buf_size < ZXC_DICT_HEADER_SIZE)) return 0; - const uint8_t* p = (const uint8_t*)buf; - if (UNLIKELY(zxc_le32(p) != ZXC_DICT_MAGIC)) return 0; - return zxc_le32(p + 8); -} - -size_t zxc_dict_save_bound(const size_t content_size) { - return ZXC_DICT_HEADER_SIZE + content_size; -} - -int64_t zxc_dict_save(const void* content, const size_t content_size, void* buf, - const size_t buf_capacity) { - if (UNLIKELY(!content || content_size == 0)) return ZXC_ERROR_NULL_INPUT; - if (UNLIKELY(content_size > ZXC_DICT_SIZE_MAX)) return ZXC_ERROR_DICT_TOO_LARGE; - - const size_t total = ZXC_DICT_HEADER_SIZE + content_size; - if (UNLIKELY(buf_capacity < total)) return ZXC_ERROR_DST_TOO_SMALL; - - uint8_t* dst = (uint8_t*)buf; - - zxc_store_le32(dst + 0, ZXC_DICT_MAGIC); - dst[4] = ZXC_DICT_VERSION; - dst[5] = 0; /* flags: reserved */ - zxc_store_le16(dst + 6, (uint16_t)content_size); - zxc_store_le32(dst + 8, zxc_dict_id(content, content_size)); - zxc_store_le16(dst + 12, 0); - zxc_store_le16(dst + 14, 0); - const uint16_t crc = zxc_hash16(dst); - zxc_store_le16(dst + 12, crc); - - ZXC_MEMCPY(dst + ZXC_DICT_HEADER_SIZE, content, content_size); - - return (int64_t)total; -} - -int zxc_dict_load(const void* 
buf, const size_t buf_size, const void** content_out, - size_t* content_size_out, uint32_t* dict_id_out) { - if (UNLIKELY(!buf || !content_out || !content_size_out)) return ZXC_ERROR_NULL_INPUT; - if (UNLIKELY(buf_size < ZXC_DICT_HEADER_SIZE)) return ZXC_ERROR_SRC_TOO_SMALL; - - const uint8_t* src = (const uint8_t*)buf; - - if (zxc_le32(src) != ZXC_DICT_MAGIC) return ZXC_ERROR_BAD_MAGIC; - if (src[4] != ZXC_DICT_VERSION) return ZXC_ERROR_BAD_VERSION; - - const size_t content_size = zxc_le16(src + 6); - if (UNLIKELY(content_size == 0)) return ZXC_ERROR_CORRUPT_DATA; - if (UNLIKELY(content_size > ZXC_DICT_SIZE_MAX)) return ZXC_ERROR_DICT_TOO_LARGE; - if (UNLIKELY(buf_size < ZXC_DICT_HEADER_SIZE + content_size)) return ZXC_ERROR_SRC_TOO_SMALL; - - uint8_t temp[ZXC_DICT_HEADER_SIZE]; - ZXC_MEMCPY(temp, src, ZXC_DICT_HEADER_SIZE); - zxc_store_le16(temp + 12, 0); - zxc_store_le16(temp + 14, 0); - const uint16_t expected_crc = zxc_hash16(temp); - if (UNLIKELY(zxc_le16(src + 12) != expected_crc)) return ZXC_ERROR_BAD_HEADER; - - /* Verify dict_id matches content */ - const uint8_t* content = src + ZXC_DICT_HEADER_SIZE; - const uint32_t id = zxc_dict_id(content, content_size); - if (UNLIKELY(zxc_le32(src + 8) != id)) return ZXC_ERROR_BAD_CHECKSUM; - - *content_out = content; - *content_size_out = content_size; - if (dict_id_out) *dict_id_out = id; - - return ZXC_OK; -} - /* ------------------------------------------------------------------------- * Dictionary training: k-gram frequency selection * diff --git a/src/lib/zxc_driver.c b/src/lib/zxc_driver.c index 8750f3ea..1d3092b0 100644 --- a/src/lib/zxc_driver.c +++ b/src/lib/zxc_driver.c @@ -602,12 +602,13 @@ static int64_t zxc_stream_engine_run(FILE* f_in, FILE* f_out, const int n_thread const int checksum_enabled, const int seekable, zxc_chunk_processor_t func, zxc_progress_callback_t progress_cb, void* user_data, - const uint8_t* dict, const size_t dict_size) { + const uint8_t* dict, size_t dict_size) { zxc_stream_ctx_t 
ctx; ZXC_MEMSET(&ctx, 0, sizeof(ctx)); size_t runtime_chunk_sz = (block_size > 0) ? block_size : ZXC_BLOCK_SIZE_DEFAULT; int file_has_chk = 0; + uint8_t* embedded_dict = NULL; /* heap copy of an embedded dictionary (decode); freed at exit */ // Try to get input file size for progress tracking (compression mode only) // For decompression, the CLI precomputes the size and passes it via user_data @@ -635,9 +636,28 @@ static int64_t zxc_stream_engine_run(FILE* f_in, FILE* f_out, const int n_thread return ZXC_ERROR_BAD_HEADER; if (header_dict_id != 0) { - if (UNLIKELY(!dict || dict_size == 0)) return ZXC_ERROR_DICT_REQUIRED; - if (UNLIKELY(zxc_dict_id(dict, dict_size) != header_dict_id)) + /* A dictionary is present: it is embedded as a ZXC_BLOCK_DICT block + * right after the header. Read it and use it (no external dict). The + * header is already consumed, so this works on pipes too. */ + uint8_t dbh[ZXC_BLOCK_HEADER_SIZE]; + zxc_block_header_t bh; + if (UNLIKELY(fread(dbh, 1, ZXC_BLOCK_HEADER_SIZE, f_in) != ZXC_BLOCK_HEADER_SIZE || + zxc_read_block_header(dbh, ZXC_BLOCK_HEADER_SIZE, &bh) != ZXC_OK || + bh.block_type != ZXC_BLOCK_DICT || bh.comp_size == 0 || + bh.comp_size > ZXC_DICT_SIZE_MAX)) + return ZXC_ERROR_BAD_HEADER; + embedded_dict = (uint8_t*)ZXC_MALLOC(bh.comp_size); + if (UNLIKELY(!embedded_dict)) return ZXC_ERROR_MEMORY; + if (UNLIKELY(fread(embedded_dict, 1, bh.comp_size, f_in) != bh.comp_size)) { + ZXC_FREE(embedded_dict); + return ZXC_ERROR_SRC_TOO_SMALL; + } + if (UNLIKELY(zxc_dict_id(embedded_dict, bh.comp_size) != header_dict_id)) { + ZXC_FREE(embedded_dict); return ZXC_ERROR_DICT_MISMATCH; + } + dict = embedded_dict; + dict_size = bh.comp_size; } } @@ -677,6 +697,7 @@ static int64_t zxc_stream_engine_run(FILE* f_in, FILE* f_out, const int n_thread if (UNLIKELY(!mem_block || per_job_sz > SIZE_MAX / ctx.ring_size)) { // LCOV_EXCL_START ZXC_ALIGNED_FREE(mem_block); + ZXC_FREE(embedded_dict); return ZXC_ERROR_MEMORY; // LCOV_EXCL_STOP } @@ -712,6 
+733,7 @@ static int64_t zxc_stream_engine_run(FILE* f_in, FILE* f_out, const int n_thread if (UNLIKELY(!workers)) { // LCOV_EXCL_START ZXC_ALIGNED_FREE(mem_block); + ZXC_FREE(embedded_dict); return ZXC_ERROR_MEMORY; // LCOV_EXCL_STOP } @@ -728,6 +750,7 @@ static int64_t zxc_stream_engine_run(FILE* f_in, FILE* f_out, const int n_thread pthread_mutex_destroy(&ctx.lock); ZXC_FREE(workers); ZXC_ALIGNED_FREE(mem_block); + ZXC_FREE(embedded_dict); return ZXC_ERROR_MEMORY; // LCOV_EXCL_STOP } @@ -751,6 +774,7 @@ static int64_t zxc_stream_engine_run(FILE* f_in, FILE* f_out, const int n_thread pthread_mutex_destroy(&ctx.lock); ZXC_FREE(workers); ZXC_ALIGNED_FREE(mem_block); + ZXC_FREE(embedded_dict); return ZXC_ERROR_MEMORY; } // LCOV_EXCL_STOP @@ -762,8 +786,24 @@ static int64_t zxc_stream_engine_run(FILE* f_in, FILE* f_out, const int n_thread (dict && dict_size) ? zxc_dict_id(dict, dict_size) : 0); if (UNLIKELY(fwrite(h, 1, ZXC_FILE_HEADER_SIZE, f_out) != ZXC_FILE_HEADER_SIZE)) ctx.io_error = 1; - w_args.total_bytes = ZXC_FILE_HEADER_SIZE; + + if (dict && dict_size) { + /* A dictionary is always embedded: store it as a ZXC_BLOCK_DICT block + * right after the header ([block header 8][raw dict content]). Data + * blocks follow, so the seekable reader offsets the first past it. + * The HAS_DICTIONARY header flag (set above via dict_id) marks it. 
*/ + uint8_t dbh[ZXC_BLOCK_HEADER_SIZE]; + const zxc_block_header_t dh = {.block_type = ZXC_BLOCK_DICT, + .block_flags = 0, + .reserved = 0, + .comp_size = (uint32_t)dict_size}; + zxc_write_block_header(dbh, ZXC_BLOCK_HEADER_SIZE, &dh); + if (UNLIKELY(fwrite(dbh, 1, ZXC_BLOCK_HEADER_SIZE, f_out) != ZXC_BLOCK_HEADER_SIZE || + fwrite(dict, 1, dict_size, f_out) != dict_size)) + ctx.io_error = 1; + w_args.total_bytes += ZXC_BLOCK_HEADER_SIZE + dict_size; + } } pthread_t writer_th; if (UNLIKELY(pthread_create(&writer_th, NULL, zxc_async_writer, &w_args) != 0)) { @@ -779,6 +819,7 @@ static int64_t zxc_stream_engine_run(FILE* f_in, FILE* f_out, const int n_thread pthread_mutex_destroy(&ctx.lock); ZXC_FREE(workers); ZXC_ALIGNED_FREE(mem_block); + ZXC_FREE(embedded_dict); return ZXC_ERROR_MEMORY; // LCOV_EXCL_STOP } @@ -977,6 +1018,7 @@ static int64_t zxc_stream_engine_run(FILE* f_in, FILE* f_out, const int n_thread ZXC_FREE(w_args.seek_comp); ZXC_FREE(workers); ZXC_ALIGNED_FREE(mem_block); + ZXC_FREE(embedded_dict); if (UNLIKELY(ctx.io_error)) return ZXC_ERROR_IO; diff --git a/src/lib/zxc_internal.h b/src/lib/zxc_internal.h index 49d42a4f..8361a2aa 100644 --- a/src/lib/zxc_internal.h +++ b/src/lib/zxc_internal.h @@ -335,15 +335,13 @@ extern "C" { /** @brief Bit flag in the Flags byte indicating checksum presence (bit 7). */ #define ZXC_FILE_FLAG_HAS_CHECKSUM 0x80U -/** @brief Bit flag in the Flags byte indicating a dictionary is required (bit 6). */ +/** @brief Bit flag in the Flags byte indicating a dictionary is present (bit 6). + * The dictionary is always embedded as a ZXC_BLOCK_DICT block right after the + * file header, so the decoder reads it from the archive itself. */ #define ZXC_FILE_FLAG_HAS_DICTIONARY 0x40U /** @brief Mask for the checksum algorithm id (bits 0-3). */ #define ZXC_FILE_CHECKSUM_ALGO_MASK 0x0FU -/** @brief Magic word identifying ZXC dictionary files (.zxd). */ -#define ZXC_DICT_MAGIC 0x9CB0D1C7U -/** @brief Current dictionary file format version. 
*/ -#define ZXC_DICT_VERSION 1 /** @brief K-gram length scanned by the dictionary trainer. Aligned on the LZ * minimum match length so trained patterns are matchable at encode time. */ #define ZXC_DICT_KGRAM_LEN ZXC_LZ_MIN_MATCH_LEN @@ -760,6 +758,9 @@ static ZXC_ALWAYS_INLINE zxc_lz77_params_t zxc_get_lz77_params(const int level) * Uses Delta Encoding + ZigZag + Bitpacking. * - `ZXC_BLOCK_GHI` (3): General-purpose high-velocity mode using LZ77 with advanced * techniques (lazy matching, step skipping) for maximum ratio. Includes 3 sections descriptors. + * - `ZXC_BLOCK_DICT` (253): Embedded dictionary block. Contains the dictionary data + * for the file, if present. Always placed immediately after the file header when the + * dictionary flag is set. * - `ZXC_BLOCK_SEK` (254): Seek table block. Contains per-block compressed/decompressed sizes * for random-access decompression. Placed between EOF block and file footer. * - `ZXC_BLOCK_EOF` (255): End of file marker. @@ -769,6 +770,7 @@ typedef enum { ZXC_BLOCK_GLO = 1, ZXC_BLOCK_NUM = 2, ZXC_BLOCK_GHI = 3, + ZXC_BLOCK_DICT = 253, ZXC_BLOCK_SEK = 254, ZXC_BLOCK_EOF = 255 } zxc_block_type_t; diff --git a/src/lib/zxc_seekable.c b/src/lib/zxc_seekable.c index d3336630..5845e71c 100644 --- a/src/lib/zxc_seekable.c +++ b/src/lib/zxc_seekable.c @@ -179,6 +179,8 @@ struct zxc_seekable_s { uint8_t* dict_work; /* [dict | decode_space] bounce buffer */ }; +static int zxc_seekable_install_dict(zxc_seekable* s, const void* dict, size_t dict_size); + /** * @brief Parses the seek table from raw bytes at the end of the archive. * @@ -259,11 +261,38 @@ static zxc_seekable* zxc_seekable_parse(const uint8_t* data, const size_t data_s // LCOV_EXCL_STOP s->total_decomp = total_decomp; + /* A dictionary, when present, is embedded as a ZXC_BLOCK_DICT block right + * after the file header. Detect it by the block type (no dedicated flag): + * if the first block is a DICT block, load it and start the data blocks + * past it. 
Otherwise the dictionary must be supplied via + * zxc_seekable_set_dict (its id is already in expected_dict_id). */ + uint64_t data_start = ZXC_FILE_HEADER_SIZE; + if (header_dict_id != 0 && data_size >= ZXC_FILE_HEADER_SIZE + ZXC_BLOCK_HEADER_SIZE) { + zxc_block_header_t dbh; + if (zxc_read_block_header(data + ZXC_FILE_HEADER_SIZE, ZXC_BLOCK_HEADER_SIZE, &dbh) == + ZXC_OK && + dbh.block_type == ZXC_BLOCK_DICT) { + if (UNLIKELY(dbh.comp_size == 0 || dbh.comp_size > ZXC_DICT_SIZE_MAX || + (uint64_t)ZXC_FILE_HEADER_SIZE + ZXC_BLOCK_HEADER_SIZE + dbh.comp_size > + (uint64_t)data_size)) { + zxc_seekable_free(s); + return NULL; + } + const uint8_t* dcontent = data + ZXC_FILE_HEADER_SIZE + ZXC_BLOCK_HEADER_SIZE; + if (UNLIKELY(zxc_dict_id(dcontent, dbh.comp_size) != header_dict_id || + zxc_seekable_install_dict(s, dcontent, dbh.comp_size) != ZXC_OK)) { + zxc_seekable_free(s); + return NULL; + } + data_start = (uint64_t)ZXC_FILE_HEADER_SIZE + ZXC_BLOCK_HEADER_SIZE + dbh.comp_size; + } + } + /* Parse comp_sizes and build compressed prefix sums. * Validate each comp_size against data_size to prevent prefix-sum overflow * and out-of-bounds reads during decompression. */ const uint8_t* ep = seek_block_start + ZXC_BLOCK_HEADER_SIZE; - uint64_t comp_acc = ZXC_FILE_HEADER_SIZE; /* blocks start after file header */ + uint64_t comp_acc = data_start; /* data blocks start after header (+ embedded dict block) */ for (uint32_t i = 0; i < num_blocks; i++) { s->comp_sizes[i] = zxc_le32(ep); ep += sizeof(uint32_t); @@ -413,9 +442,43 @@ zxc_seekable* zxc_seekable_open_reader(const zxc_reader_t* r) { } s->total_decomp = total_decomp; + /* Embedded dictionary (ZXC_BLOCK_DICT right after the header): read it via + * the reader, load it, and start the data blocks after it. */ + uint64_t data_start = ZXC_FILE_HEADER_SIZE; + if (header_dict_id != 0) { + uint8_t dbh_buf[ZXC_BLOCK_HEADER_SIZE]; + zxc_block_header_t dbh; + /* Embedded only if the first block is a DICT block (else external). 
*/ + if (r->read_at(r->ctx, dbh_buf, ZXC_BLOCK_HEADER_SIZE, ZXC_FILE_HEADER_SIZE) == + (int64_t)ZXC_BLOCK_HEADER_SIZE && + zxc_read_block_header(dbh_buf, ZXC_BLOCK_HEADER_SIZE, &dbh) == ZXC_OK && + dbh.block_type == ZXC_BLOCK_DICT) { + if (UNLIKELY(dbh.comp_size == 0 || dbh.comp_size > ZXC_DICT_SIZE_MAX)) { + zxc_seekable_free(s); + return NULL; + } + uint8_t* dtmp = (uint8_t*)ZXC_MALLOC(dbh.comp_size); + if (UNLIKELY(!dtmp)) { + zxc_seekable_free(s); + return NULL; + } + if (UNLIKELY(r->read_at(r->ctx, dtmp, dbh.comp_size, + ZXC_FILE_HEADER_SIZE + ZXC_BLOCK_HEADER_SIZE) != + (int64_t)dbh.comp_size || + zxc_dict_id(dtmp, dbh.comp_size) != header_dict_id || + zxc_seekable_install_dict(s, dtmp, dbh.comp_size) != ZXC_OK)) { + ZXC_FREE(dtmp); + zxc_seekable_free(s); + return NULL; + } + ZXC_FREE(dtmp); + data_start = (uint64_t)ZXC_FILE_HEADER_SIZE + ZXC_BLOCK_HEADER_SIZE + dbh.comp_size; + } + } + /* Parse comp_sizes and build prefix sums; validate against archive size. */ const uint8_t* ep = seek_buf + ZXC_BLOCK_HEADER_SIZE; - uint64_t comp_acc = ZXC_FILE_HEADER_SIZE; + uint64_t comp_acc = data_start; for (uint32_t i = 0; i < num_blocks; i++) { s->comp_sizes[i] = zxc_le32(ep); ep += sizeof(uint32_t); @@ -858,14 +921,14 @@ void zxc_seekable_free(zxc_seekable* s) { ZXC_FREE(s); } -int zxc_seekable_set_dict(zxc_seekable* s, const void* dict, const size_t dict_size) { - if (UNLIKELY(!s || !dict || dict_size == 0)) return ZXC_ERROR_NULL_INPUT; - if (UNLIKELY(dict_size > ZXC_DICT_SIZE_MAX)) return ZXC_ERROR_DICT_TOO_LARGE; - if (UNLIKELY(s->expected_dict_id != 0 && zxc_dict_id(dict, dict_size) != s->expected_dict_id)) - return ZXC_ERROR_DICT_MISMATCH; - +/* Install a dictionary into the handle: owned copy + [dict | decode] bounce + * buffer. No id validation (callers do it where needed). 
*/ +static int zxc_seekable_install_dict(zxc_seekable* s, const void* dict, const size_t dict_size) { ZXC_FREE(s->dict); ZXC_FREE(s->dict_work); + s->dict = NULL; + s->dict_work = NULL; + s->dict_size = 0; s->dict = (uint8_t*)ZXC_MALLOC(dict_size); if (UNLIKELY(!s->dict)) return ZXC_ERROR_MEMORY; @@ -886,6 +949,14 @@ int zxc_seekable_set_dict(zxc_seekable* s, const void* dict, const size_t dict_s return ZXC_OK; } +int zxc_seekable_set_dict(zxc_seekable* s, const void* dict, const size_t dict_size) { + if (UNLIKELY(!s || !dict || dict_size == 0)) return ZXC_ERROR_NULL_INPUT; + if (UNLIKELY(dict_size > ZXC_DICT_SIZE_MAX)) return ZXC_ERROR_DICT_TOO_LARGE; + if (UNLIKELY(s->expected_dict_id != 0 && zxc_dict_id(dict, dict_size) != s->expected_dict_id)) + return ZXC_ERROR_DICT_MISMATCH; + return zxc_seekable_install_dict(s, dict, dict_size); +} + void zxc_seekable_attach_owned_ctx(zxc_seekable* s, void* ctx) { if (s) s->owned_reader_ctx = ctx; } diff --git a/tests/test_common.h b/tests/test_common.h index 1e4402f7..826e6fa7 100644 --- a/tests/test_common.h +++ b/tests/test_common.h @@ -183,7 +183,6 @@ int test_error_name(void); int test_library_info_api(void); /* Dictionary */ -int test_dict_zxd_roundtrip(void); int test_dict_id_deterministic(void); int test_dict_get_id_apis(void); int test_dict_buffer_roundtrip(void); diff --git a/tests/test_dict.c b/tests/test_dict.c index b9ecb246..4b905bc4 100644 --- a/tests/test_dict.c +++ b/tests/test_dict.c @@ -21,49 +21,6 @@ static void gen_dict_friendly_data(uint8_t* buf, size_t size, const uint8_t* dic } } -int test_dict_zxd_roundtrip(void) { - printf("=== TEST: Dict - .zxd save/load roundtrip ===\n"); - - const char* content = "hello dict content for testing zxd format!"; - const size_t content_size = strlen(content); - - size_t bound = zxc_dict_save_bound(content_size); - uint8_t* zxd = (uint8_t*)malloc(bound); - int64_t written = zxc_dict_save(content, content_size, zxd, bound); - if (written < 0) { - printf(" [FAIL] 
zxc_dict_save returned %lld\n", (long long)written); - free(zxd); - return 0; - } - - const void* loaded_content = NULL; - size_t loaded_size = 0; - uint32_t loaded_id = 0; - int rc = zxc_dict_load(zxd, (size_t)written, &loaded_content, &loaded_size, &loaded_id); - if (rc != ZXC_OK) { - printf(" [FAIL] zxc_dict_load returned %d (%s)\n", rc, zxc_error_name(rc)); - free(zxd); - return 0; - } - - if (loaded_size != content_size || memcmp(loaded_content, content, content_size) != 0) { - printf(" [FAIL] content mismatch after load\n"); - free(zxd); - return 0; - } - - uint32_t expected_id = zxc_dict_id(content, content_size); - if (loaded_id != expected_id) { - printf(" [FAIL] dict_id mismatch: got %u, expected %u\n", loaded_id, expected_id); - free(zxd); - return 0; - } - - free(zxd); - printf("PASS\n\n"); - return 1; -} - int test_dict_id_deterministic(void) { printf("=== TEST: Dict - dict_id is deterministic ===\n"); @@ -89,7 +46,7 @@ int test_dict_id_deterministic(void) { } int test_dict_get_id_apis(void) { - printf("=== TEST: Dict - zxc_get_dict_id / zxc_dict_get_id ===\n"); + printf("=== TEST: Dict - zxc_get_dict_id (archive header) ===\n"); const uint8_t dict[] = "dictionary content for get_id test"; const size_t dict_size = sizeof(dict) - 1; @@ -128,33 +85,6 @@ int test_dict_get_id_apis(void) { printf(" [PASS] zxc_get_dict_id returns 0 for no-dict file\n"); free(compressed); - /* Save to .zxd and verify zxc_dict_get_id */ - size_t zxd_bound = zxc_dict_save_bound(dict_size); - uint8_t* zxd = (uint8_t*)malloc(zxd_bound); - int64_t zxd_size = zxc_dict_save(dict, dict_size, zxd, zxd_bound); - if (zxd_size <= 0) { - printf(" [FAIL] zxc_dict_save returned %lld\n", (long long)zxd_size); - free(zxd); - return 0; - } - - uint32_t zxd_id = zxc_dict_get_id(zxd, (size_t)zxd_size); - if (zxd_id != expected_id) { - printf(" [FAIL] zxc_dict_get_id: got 0x%08X, expected 0x%08X\n", zxd_id, expected_id); - free(zxd); - return 0; - } - printf(" [PASS] zxc_dict_get_id returns 
0x%08X\n", zxd_id); - - /* Invalid buffer should return 0 */ - if (zxc_dict_get_id("bad", 3) != 0) { - printf(" [FAIL] zxc_dict_get_id should return 0 for invalid buffer\n"); - free(zxd); - return 0; - } - printf(" [PASS] zxc_dict_get_id returns 0 for invalid buffer\n"); - - free(zxd); printf("PASS\n\n"); return 1; } @@ -800,10 +730,11 @@ static const uint8_t k_dict_a[] = static const uint8_t k_dict_b[] = "A completely unrelated dictionary payload hashing to a different dict_id value."; -// Compress `src` with dict A to a tmpfile, then try to stream-decompress it with -// `dec_dict` (NULL = none) and assert the decoder returns `want_err`. -static int stream_dict_error_case(const char* label, const uint8_t* dec_dict, size_t dec_dict_size, - int64_t want_err) { +// A stream archive compressed with a dictionary embeds it, so it must +// decompress correctly WITHOUT any external dictionary supplied at decode. +int test_dict_stream_dict_id_checks(void) { + printf("=== TEST: Dict - stream embeds dict (decodes with no external dict) ===\n"); + const size_t src_size = 8192; uint8_t* src = (uint8_t*)malloc(src_size); gen_dict_friendly_data(src, src_size, k_dict_a, sizeof(k_dict_a) - 1); @@ -811,11 +742,8 @@ static int stream_dict_error_case(const char* label, const uint8_t* dec_dict, si FILE* f_src = tmpfile(); FILE* f_comp = tmpfile(); FILE* f_dec = tmpfile(); - int ok = 1; - if (!f_src || !f_comp || !f_dec) { - printf(" [FAIL] %s: tmpfile() failed\n", label); - ok = 0; - } + int ok = (f_src && f_comp && f_dec); + if (!ok) printf(" [FAIL] tmpfile() failed\n"); if (ok) { fwrite(src, 1, src_size, f_src); @@ -825,35 +753,35 @@ static int stream_dict_error_case(const char* label, const uint8_t* dec_dict, si .dict = k_dict_a, .dict_size = sizeof(k_dict_a) - 1}; if (zxc_stream_compress(f_src, f_comp, &copts) <= 0) { - printf(" [FAIL] %s: stream_compress failed\n", label); + printf(" [FAIL] stream_compress failed\n"); ok = 0; } } if (ok) { rewind(f_comp); - zxc_decompress_opts_t 
dopts = { - .checksum_enabled = 1, .dict = dec_dict, .dict_size = dec_dict_size}; + /* No dict supplied: it must come from the embedded block. */ + zxc_decompress_opts_t dopts = {.checksum_enabled = 1}; int64_t rc = zxc_stream_decompress(f_comp, f_dec, &dopts); - if (rc != want_err) { - printf(" [FAIL] %s: expected %s, got %lld (%s)\n", label, zxc_error_name((int)want_err), - (long long)rc, zxc_error_name((int)rc)); + if (rc != (int64_t)src_size) { + printf(" [FAIL] embedded decode returned %lld, expected %zu\n", (long long)rc, + src_size); ok = 0; } } + if (ok) { + rewind(f_dec); + uint8_t* got = (uint8_t*)malloc(src_size); + ok = (fread(got, 1, src_size, f_dec) == src_size && memcmp(got, src, src_size) == 0); + if (!ok) printf(" [FAIL] embedded roundtrip mismatch\n"); + free(got); + } + if (f_src) fclose(f_src); if (f_comp) fclose(f_comp); if (f_dec) fclose(f_dec); free(src); - return ok; -} - -int test_dict_stream_dict_id_checks(void) { - printf("=== TEST: Dict - stream decode rejects missing/wrong dict ===\n"); - int ok = stream_dict_error_case("missing dict", NULL, 0, ZXC_ERROR_DICT_REQUIRED); - ok &= stream_dict_error_case("wrong dict", k_dict_b, sizeof(k_dict_b) - 1, - ZXC_ERROR_DICT_MISMATCH); if (!ok) return 0; printf("PASS\n\n"); return 1; diff --git a/tests/test_main.c b/tests/test_main.c index cef3c09a..9ec273ed 100644 --- a/tests/test_main.c +++ b/tests/test_main.c @@ -114,7 +114,6 @@ static const test_entry_t g_tests[] = { TEST_CASE(test_library_info_api), /* --- Dictionary --- */ - TEST_CASE(test_dict_zxd_roundtrip), TEST_CASE(test_dict_id_deterministic), TEST_CASE(test_dict_get_id_apis), TEST_CASE(test_dict_buffer_roundtrip), From 1f081994af09f966105d1c4dfe34b7967d7c62f3 Mon Sep 17 00:00:00 2001 From: hellobertrand <5901952+hellobertrand@users.noreply.github.com> Date: Mon, 1 Jun 2026 17:02:11 +0200 Subject: [PATCH 42/47] test: Remove external dictionary loading from conformance tests The conformance tests no longer need to search for and load 
external .zxd dictionary files. This change aligns with the new architecture where dictionaries are embedded directly within ZXC archives, making external dictionary provision obsolete for decompression. The `dict_text.zxd` test file is also removed as it is no longer used. --- conformance/test_conformance.c | 74 ------------------ conformance/valid/dict_text.zxd | Bin 207 -> 0 bytes conformance/valid/text_1k_dict.expected | 1 - conformance/valid/text_1k_dict.zxc | Bin 758 -> 0 bytes .../valid/text_1k_dict_seekable.expected | 1 - conformance/valid/text_1k_dict_seekable.zxc | Bin 770 -> 0 bytes 6 files changed, 76 deletions(-) delete mode 100644 conformance/valid/dict_text.zxd delete mode 100644 conformance/valid/text_1k_dict.expected delete mode 100644 conformance/valid/text_1k_dict.zxc delete mode 100644 conformance/valid/text_1k_dict_seekable.expected delete mode 100644 conformance/valid/text_1k_dict_seekable.zxc diff --git a/conformance/test_conformance.c b/conformance/test_conformance.c index f518f9f7..147070b4 100644 --- a/conformance/test_conformance.c +++ b/conformance/test_conformance.c @@ -18,7 +18,6 @@ #endif #include "../include/zxc_buffer.h" -#include "../include/zxc_dict.h" #include "../include/zxc_error.h" /* ---------- helpers ------------------------------------------------------ */ @@ -71,62 +70,6 @@ static int has_suffix(const char *s, const char *suffix) /* ---------- valid vector test -------------------------------------------- */ -/** - * @brief Searches for a .zxd dictionary file in the same directory as @p zxc_path - * whose dict_id matches @p target_id. Returns the loaded content (caller frees). 
- */ -static uint8_t *find_dict_for_id(const char *zxc_path, uint32_t target_id, - const void **content_out, size_t *content_size_out) -{ - /* Derive directory from zxc_path */ - char dir[512]; - strncpy(dir, zxc_path, sizeof(dir) - 1); - dir[sizeof(dir) - 1] = '\0'; - char *sep = strrchr(dir, '/'); - if (sep) *(sep + 1) = '\0'; else strcpy(dir, "./"); - -#ifdef _WIN32 - char pattern[512]; - snprintf(pattern, sizeof(pattern), "%s*.zxd", dir); - WIN32_FIND_DATAA fd; - HANDLE hf = FindFirstFileA(pattern, &fd); - if (hf == INVALID_HANDLE_VALUE) return NULL; - do { - char path[1024]; - snprintf(path, sizeof(path), "%s%s", dir, fd.cFileName); - size_t sz = 0; - uint8_t *buf = read_file(path, &sz); - if (buf && zxc_dict_get_id(buf, sz) == target_id) { - if (zxc_dict_load(buf, sz, content_out, content_size_out, NULL) == 0) { - FindClose(hf); - return buf; - } - } - free(buf); - } while (FindNextFileA(hf, &fd)); - FindClose(hf); -#else - DIR *dp = opendir(dir); - if (!dp) return NULL; - const struct dirent *ent; - while ((ent = readdir(dp)) != NULL) { - if (!has_suffix(ent->d_name, ".zxd")) continue; - char path[1024]; - snprintf(path, sizeof(path), "%s%s", dir, ent->d_name); - size_t sz = 0; - uint8_t *buf = read_file(path, &sz); - if (buf && zxc_dict_get_id(buf, sz) == target_id) { - if (zxc_dict_load(buf, sz, content_out, content_size_out, NULL) == 0) { - closedir(dp); - return buf; - } - } - free(buf); - } - closedir(dp); -#endif - return NULL; -} static int test_valid_vector(const char *zxc_path, const char *expected_path) { @@ -145,21 +88,6 @@ static int test_valid_vector(const char *zxc_path, const char *expected_path) return 0; } - /* Auto-detect dictionary: if the archive has a dict_id, find the .zxd */ - const void *dict = NULL; - size_t dict_size = 0; - uint8_t *dict_buf = NULL; - uint32_t did = zxc_get_dict_id(comp, comp_sz); - if (did != 0) { - dict_buf = find_dict_for_id(zxc_path, did, &dict, &dict_size); - if (!dict_buf) { - fprintf(stderr, "FAIL: %s 
requires dict 0x%08X but no matching .zxd found\n", - zxc_path, did); - free(comp); free(expected); - return 0; - } - } - int ok = 1; uint64_t dec_sz = zxc_get_decompressed_size(comp, comp_sz); @@ -177,7 +105,6 @@ static int test_valid_vector(const char *zxc_path, const char *expected_path) ok = 0; } else { zxc_decompress_opts_t dopts = {0}; - if (dict) { dopts.dict = dict; dopts.dict_size = dict_size; } int64_t result = zxc_decompress(comp, comp_sz, output, (size_t)dec_sz, &dopts); if (result < 0) { @@ -196,7 +123,6 @@ static int test_valid_vector(const char *zxc_path, const char *expected_path) } } - free(dict_buf); free(comp); free(expected); return ok; diff --git a/conformance/valid/dict_text.zxd b/conformance/valid/dict_text.zxd deleted file mode 100644 index b5735f652ad8fd57a2c816c33bc73faaba8452fb..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 207 zcmXBM!EHh@5QO1JCv(UVy-K*D6tLJUG?LeT*t>`p=z${WAjh1O7ScdUh$C(@hyVNh zdOmi`$MSRhzQ6r0%b|5Oa2sLBt+b9ohnV^vsnwy$)uTf)H;%-5h8FsYqvjbv8N*tJ zt8Fu8P0Y+u9pNZ)nmp8Bw8G?E1yXnkS@G`BN}D!}|KmEZ*dCYQKyAZ&Wav$MqKmUt ML@8pay=0E{4;!>i)&Kwi diff --git a/conformance/valid/text_1k_dict.expected b/conformance/valid/text_1k_dict.expected deleted file mode 100644 index e519353a..00000000 --- a/conformance/valid/text_1k_dict.expected +++ /dev/null @@ -1 +0,0 @@ -Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum. Curabitur pretium tincidunt lacus. Nulla gravida orci a odio. Nullam varius, turpis et commodo pharetra, est eros bibendum elit, nec luctus magna felis sollicitudin mauris. 
Integer in mauris eu nibh euismod gravida. Duis ac tellus et risus vulputate vehicula. Donec lobortis risus a elit. Etiam tempor. Ut ullamcorper, ligula ut dictum pharetra, nisi nunc fringilla magna, in commodo elit erat nec turpis. Ut pharetra augue nec augue. Nam elit agna, endrerit sit amet, tincidunt ac. \ No newline at end of file diff --git a/conformance/valid/text_1k_dict.zxc b/conformance/valid/text_1k_dict.zxc deleted file mode 100644 index 9aba3e7abc189ebbcdd0b3f3eae091c7a13313d7..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 758 zcmX|9!Ds(BMH5y$A|*R9Hq3ya!A`K}7T-cu8$f&1}I;_oTaPiQq*r2QPkr z@gR5$2>Al>q<8;7`~o+6j@9eNhN7x#s_VUaHGk@p_v=@_{CxWTjWOo!>%y3?r7^dz z>1|ts^ONIjIiKy$$}XwT296_xBp!hdfDo)72H$w~F4DeWs$oYs0!kv_Ac5FHpfS*l zTtk5qXkzYr853&BoO(@Bj0BeZUIr3qrDIIsr0)?%40@4J!|s8lF_8p_>=Jluk*wBP z8M#622F^PegXe}^nm`n-QZOt1h_EjVzHQkuLPL>4{yO+R!=vK?xKSDjdYt+oLk;&N z;FJ|vTT)^fAhEODqo)8PqlGSDyx$D|?9(&^Dv0Ldtd}*f>LN z8*s8jrQjn>d8!&IP}XDujx6+Ilf+gN5n?85l~JQZ4Jd(?!ULD6qX3|~IiAkcbJUH% zij{>#D(ezjA80UFIb_DF3hJ)>62M2uTprCe+V@;FwDsjzi_^vL)5T)zi79{m*eQak_RQmx_0MLr zI4%y0cjcD(R2&sIO;bIuZdYgJv+`xNS3c~Rb7J1(!o;o8-Wix)N@ Su6_G>rhliN@f{5rv@RHh|+S!7c?n!so62VKr9K84e z#)IH3Amj_gliu?O;upBltFe07*ick=b#=X0uli4Y@?q`T*PqW%-x_1yy(x_ORvL5X zhMu-`bFuP1TaIVDx3Vkhvw`EtAc+T{eINwu``$NRy^FLbm}=M-j)0N~I7lFN5NHTA zBiB&i1R9w8PKJbM6eEGVdk>{nMk%$2>wF=&}VRUu`&5^R{E zmJK*wqEhe?COlD%6ew#l21gb;u}R`o0})~-Yn4%>P4y^&mBJ&JsHFg)x;dWA)N|B{ zz>1ZHL@MhNS|8|Uu5!qXRTb1(`6Y_io)bshNsN6MD2$=wRx@D_fLSNPEW~uywgL*J#}f)zH>g4;E*O-)D=(<}*|N`mtREQ|+3kCu?8K zMsZvm7VpbV^SL-GZkwihS>3J9%NOPAYPW=PtC*UH=9XDEN9I*|e{@{lgu~JG^~;ww c9*@3%I@jN+tNd%sNd4)l{^F0-GG_AqKTm@FRsaA1 From 88ae87e55e15adc0123d256c2027eb95e4f9cabe Mon Sep 17 00:00:00 2001 From: hellobertrand <5901952+hellobertrand@users.noreply.github.com> Date: Mon, 1 Jun 2026 17:15:06 +0200 Subject: [PATCH 43/47] feat: Embed dictionaries directly into zxc archives This refactors dictionary handling to embed trained dictionaries directly 
within ZXC archives, eliminating the need for separate `.zxd` dictionary files. This simplifies deployment and ensures archives are self-contained for decompression. Key changes include: - Removal of the standalone `.zxd` file format and its associated API (save/load, header, magic, etc.). - Dictionaries are now stored as a `ZXC_BLOCK_DICT` block immediately after the ZXC file header when `ZXC_FILE_FLAG_HAS_DICTIONARY` is set. - The CLI is updated: - The `-D, --dict` option for providing external dictionary files is removed. - A new `--auto-dict` option is added to train a dictionary from input files and embed it automatically during compression. - The `list` command no longer supports inspecting `.zxd` files. - Decompression logic (stream and seekable) automatically reads and validates the embedded dictionary, removing the need to provide an external dictionary at decode time. - The static (malloc-free) decompression path now explicitly rejects archives that contain an embedded dictionary. --- README.md | 39 +++++++---------- docs/API.md | 40 ++---------------- docs/FORMAT.md | 96 +++++++++++------------------------------- src/lib/zxc_dispatch.c | 52 +++++++++++++++++------ tests/test_cli.sh | 90 ++++++++++++--------------------------- 5 files changed, 109 insertions(+), 208 deletions(-) diff --git a/README.md b/README.md index e6ec26ac..973d36d5 100644 --- a/README.md +++ b/README.md @@ -466,47 +466,38 @@ For workloads compressed in **small blocks** (4 KB–128 KB), a pre-trained dict **Typical use cases:** JSON API responses, small game assets, structured logs, key-value store records, RPC messages, and any large but homogeneous corpus compressed in small blocks for random access (e.g. seekable archives). -### Training a dictionary +The dictionary is **embedded in the archive** — there is no separate dictionary file to manage or ship. Decompression reads it from the archive automatically; nothing extra is needed at decode time. 
+ +### Auto-trained dictionary (CLI) ```bash -# Train a dictionary from a corpus of similar files -zxc --train-dict corpus.zxd samples/*.json +# Train a dictionary from the input and embed it — zero configuration +zxc -z -S -B 4K --auto-dict input.json +zxc -d input.json.zxc # no dictionary needed: it is in the archive ``` +### Embedding a dictionary via the C API + ```c -// C API +// Train a dictionary from representative samples... const void* samples[] = { buf1, buf2, buf3 }; size_t sizes[] = { len1, len2, len3 }; uint8_t dict[32768]; int64_t dict_sz = zxc_train_dict(samples, sizes, 3, dict, sizeof(dict)); -``` -### Compressing with a dictionary - -```bash -# CLI -zxc -z -D corpus.zxd input.json -zxc -d -D corpus.zxd input.json.zxc -``` - -```c -// C API — compression +// ...then pass it to the compressor: it is embedded in the archive. zxc_compress_opts_t copts = { .level = ZXC_LEVEL_DEFAULT, - .dict = dict_content, - .dict_size = dict_sz, + .dict = dict, + .dict_size = (size_t)dict_sz, }; int64_t compressed_size = zxc_compress(src, src_size, dst, dst_cap, &copts); -// C API — decompression (same dictionary required) -zxc_decompress_opts_t dopts = { - .dict = dict_content, - .dict_size = dict_sz, -}; -int64_t original_size = zxc_decompress(compressed, comp_size, out, out_cap, &dopts); +// Decompression needs no dictionary — it is read from the archive. +int64_t original_size = zxc_decompress(compressed, comp_size, out, out_cap, NULL); ``` -The dictionary is stored as an external `.zxd` file and referenced by a 32-bit ID in the ZXC file header. Decompressing without the matching dictionary returns `ZXC_ERROR_DICT_REQUIRED` or `ZXC_ERROR_DICT_MISMATCH`. See [FORMAT.md](docs/FORMAT.md) §12 for the full specification. +The dictionary is stored as a `ZXC_BLOCK_DICT` block right after the file header (flagged by `HAS_DICTIONARY`). A corrupted or mismatched embedded dictionary surfaces as `ZXC_ERROR_DICT_MISMATCH`. 
See [FORMAT.md](docs/FORMAT.md) §12 for the full specification. --- diff --git a/docs/API.md b/docs/API.md index 2228f383..0caae299 100644 --- a/docs/API.md +++ b/docs/API.md @@ -1290,7 +1290,7 @@ Returns the encoded byte size of a seek table for `num_blocks` blocks. ## 11b. Dictionary API -Declared in `<zxc_dict.h>`. Provides dictionary training, serialization (`.zxd` format), and identification. +Declared in `<zxc_dict.h>`. Provides dictionary training and identification. A trained dictionary is passed to the compressor (`zxc_compress_opts_t::dict`) and embedded in the archive; there is no standalone dictionary file format. ### `zxc_train_dict` @@ -1312,38 +1312,7 @@ Trains a dictionary from a corpus of representative samples. Returns the size of ZXC_EXPORT uint32_t zxc_dict_id(const void* dict, size_t dict_size); ``` -Returns a deterministic 32-bit hash of the dictionary content. This ID is stored in the ZXC file header and verified at decompression time. Returns 0 for NULL/empty input. - -### `zxc_dict_save` - -```c -ZXC_EXPORT int64_t zxc_dict_save( - const void* content, size_t content_size, - void* buf, size_t buf_capacity -); -``` - -Serializes dictionary content to the `.zxd` file format. Use `zxc_dict_save_bound(content_size)` to compute the required buffer capacity. - -### `zxc_dict_load` - -```c -ZXC_EXPORT int zxc_dict_load( - const void* buf, size_t buf_size, - const void** content_out, size_t* content_size_out, - uint32_t* dict_id_out // may be NULL -); -``` - -Validates and parses a `.zxd` file from memory. On success, `content_out` points into the input buffer (zero-copy). Returns `ZXC_OK` or a negative error code. - -### `zxc_dict_save_bound` - -```c -ZXC_EXPORT size_t zxc_dict_save_bound(size_t content_size); -``` - -Returns the maximum `.zxd` file size for a given content size (`ZXC_DICT_HEADER_SIZE + content_size`). +Returns a deterministic 32-bit hash of the dictionary content.
This ID is stored in the ZXC file header and verified against the embedded dictionary at decompression time. Returns 0 for NULL/empty input. ### `zxc_seekable_set_dict` @@ -1453,10 +1422,7 @@ The shared library exports **47 symbols** (verified with `nm -gU`): | 51 | `zxc_error_name` | Error | `zxc_error.h` | | 52 | `zxc_train_dict` | Dictionary | `zxc_dict.h` | | 53 | `zxc_dict_id` | Dictionary | `zxc_dict.h` | -| 54 | `zxc_dict_save` | Dictionary | `zxc_dict.h` | -| 55 | `zxc_dict_load` | Dictionary | `zxc_dict.h` | -| 56 | `zxc_dict_save_bound` | Dictionary | `zxc_dict.h` | -| 57 | `zxc_seekable_set_dict` | Seekable | `zxc_seekable.h` | +| 54 | `zxc_seekable_set_dict` | Seekable | `zxc_seekable.h` | No internal symbols leak into the public ABI. FMV dispatch variants (`_default`, `_neon`, `_avx2`, `_avx512`) are compiled with diff --git a/docs/FORMAT.md b/docs/FORMAT.md index ffe00316..e37ccc23 100644 --- a/docs/FORMAT.md +++ b/docs/FORMAT.md @@ -570,8 +570,10 @@ For decoders processing untrusted input (e.g. network data, user uploads): A pre-trained dictionary improves compression ratio on small, similar payloads (e.g. JSON API responses, game assets, structured logs) by prefilling the LZ77 -sliding window at the start of each block. The dictionary is an external file -(`.zxd` format) referenced by a 32-bit ID in the ZXC file header. +sliding window at the start of each block. The dictionary is **embedded in the +archive** (as a `ZXC_BLOCK_DICT` block, §12.4) and identified by a 32-bit ID in +the file header — there is no external dictionary file. Decompression reads the +dictionary from the archive itself. ### 12.2 Mechanism @@ -589,37 +591,32 @@ decompress any block independently. ### 12.3 File header encoding When `HAS_DICTIONARY` (flag bit 6) is set, the reserved bytes at offsets -`0x07..0x0A` contain the `dict_id` (`u32` LE). A decoder **MUST**: -1. Verify that a dictionary is provided (`ZXC_ERROR_DICT_REQUIRED` if not). -2. 
Verify that `zxc_dict_id(dict, dict_size) == header.dict_id` - (`ZXC_ERROR_DICT_MISMATCH` if not). +`0x07..0x0A` contain the `dict_id` (`u32` LE), and a `ZXC_BLOCK_DICT` block +(§12.4) carrying the dictionary content immediately follows the file header, +before the first data block. A decoder reads that block, verifies +`zxc_dict_id(content) == header.dict_id` (`ZXC_ERROR_DICT_MISMATCH` if not), and +uses the content as the dictionary for every data block. Older decoders that do not recognize the `HAS_DICTIONARY` flag will ignore it -(per §10.3: reserved flag bits are ignored). However, blocks compressed with a -dictionary contain match offsets that reference dictionary content; decoding -without the dictionary produces corrupt output. Per-block and global checksums -(when enabled) will detect this corruption. +(per §10.3: reserved flag bits are ignored), then fail on the unknown +`ZXC_BLOCK_DICT` block type rather than silently producing corrupt output. -### 12.4 Dictionary file format (`.zxd`) +### 12.4 Embedded dictionary block (`ZXC_BLOCK_DICT`) -Dictionaries are stored as standalone `.zxd` files with the following layout: +The dictionary is stored as a standard block placed right after the 16-byte file +header and before the first data block: ```text -Offset Size Field -0x00 4 Magic Word (0x9CB0D1C7 LE) -0x04 1 Dictionary format version (currently 1) -0x05 1 Flags (reserved, must be 0) -0x06 2 Content size (u16 LE, max 65535) -0x08 4 dict_id (u32 LE, hash of content) -0x0C 2 Header CRC16 (zxc_hash16, computed with bytes 0x0C-0x0F zeroed) -0x0E 2 Reserved (0) -0x10 N Dictionary content (raw bytes) +[ block header (8 bytes) ] block_type = 0xFD (ZXC_BLOCK_DICT), comp_size = N +[ dictionary content (N) ] raw bytes (uncompressed), max 65535 ``` -- **Magic Word**: `0x9CB0D1C7`. Allows immediate rejection of non-dictionary files. -- **dict_id**: deterministic 32-bit hash (RapidHash-folded) of the content bytes. 
Must match the `dict_id` stored in any ZXC file header that references this dictionary. -- **Header CRC16**: `zxc_hash16` checksum of the 16-byte header with bytes `0x0C..0x0F` zeroed before hashing — same method as the ZXC file header. -- **Content**: raw bytes that prefill the LZ77 window. Not compressed. +The block header is the ordinary 8-byte block header (§7), with +`block_type = ZXC_BLOCK_DICT (253)` and `comp_size` equal to the dictionary +content length. The content is raw bytes that prefill the LZ77 window; it is not +compressed. Seekable readers advance the first data block's offset past this +block. The dictionary ID stored in the file header (`dict_id`) is the +deterministic 32-bit hash (`zxc_dict_id`) of this content. ### 12.5 Dictionary training @@ -642,14 +639,13 @@ shortest offsets (closest to the block start in the virtual window). - GLO descriptors total: **32** bytes - GHI descriptors total: **24** bytes - File footer: **12** bytes -- Dictionary file header (`.zxd`): **16** bytes +- Embedded dictionary block header (`ZXC_BLOCK_DICT`): **8** bytes (standard block header) -**Magic words** — both are little-endian `u32` at offset `0x00` and deliberately share the `0x9CB0...` family prefix, so check the full value (or the file extension) to tell them apart: +**Magic word** — little-endian `u32` at offset `0x00`: | File | Magic (value) | On-disk bytes (LE) | |------|---------------|--------------------| | ZXC archive (`.zxc`) | `0x9CB02EF5` | `F5 2E B0 9C` | -| ZXC dictionary (`.zxd`) | `0x9CB0D1C7` | `C7 D1 B0 9C` | --- @@ -826,47 +822,3 @@ Seek table entry at `0x36`: ``` > **Compatibility note**: The SEK block is inserted between the EOF block and the file footer. The footer always remains the **last 12 bytes of the file**, so decoders that locate the footer from the end of the file (e.g. `src + src_size - 12` for buffer APIs, or `fseek(END - 12)` for file APIs) work unchanged with seekable archives. 
However, **streaming decoders** that read the footer sequentially immediately after the EOF block must be updated to detect and skip the SEK block. In practice, all ZXC decoders since v0.9.0 handle both seekable and non-seekable archives transparently. - ---- - -## 15. Worked Example: Dictionary File (`.zxd` Hexdump) - -A minimal dictionary whose content is the 5 ASCII bytes `hello`. Total file size: **21 bytes** (16-byte header + 5-byte content). This is the on-disk form produced by `zxc_dict_save()` (see §12.4). - -### 15.1 Full hexdump - -```text -00000000: C7 D1 B0 9C 01 00 05 00 17 0F 72 9A 4A D9 00 00 -00000010: 68 65 6C 6C 6F -``` - -### 15.2 Byte-level decoding - -#### A) Dictionary Header (offset `0x00`, 16 bytes) - -```text -C7 D1 B0 9C | 01 | 00 | 05 00 | 17 0F 72 9A | 4A D9 | 00 00 -``` - -- `C7 D1 B0 9C` -> magic word (LE) = `0x9CB0D1C7` (`.zxd` dictionary). -- `01` -> dictionary format version 1. -- `00` -> flags (reserved, must be 0). -- `05 00` -> content size (LE) = `5` bytes. -- `17 0F 72 9A` -> `dict_id` (LE) = `0x9A720F17`. Must match the `dict_id` stored in the file header of any `.zxc` archive compressed with this dictionary. -- `4A D9` -> header CRC16 (LE) = `0xD94A`, computed over the 16-byte header with bytes `0x0C..0x0F` zeroed (same method as the ZXC file header). -- `00 00` -> reserved. - -#### B) Dictionary Content (offset `0x10`, 5 bytes) - -```text -68 65 6C 6C 6F -``` - -ASCII: `hello`. Raw bytes that prefill the LZ77 window — not compressed. - -### 15.3 Structural view with absolute offsets - -```text -0x00..0x0F Dictionary Header (16) -0x10..0x14 Dictionary Content (5) -``` diff --git a/src/lib/zxc_dispatch.c b/src/lib/zxc_dispatch.c index 5e933afe..7c85e9ca 100644 --- a/src/lib/zxc_dispatch.c +++ b/src/lib/zxc_dispatch.c @@ -690,7 +690,7 @@ int64_t zxc_decompress(const void* RESTRICT src, const size_t src_size, void* RE const int checksum_enabled = opts ? opts->checksum_enabled : 0; const uint8_t* dict = opts ? 
(const uint8_t*)opts->dict : NULL; - const size_t dict_size = (opts && opts->dict) ? opts->dict_size : 0; + size_t dict_size = (opts && opts->dict) ? opts->dict_size : 0; const uint8_t* ip = (const uint8_t*)src; const uint8_t* ip_end = ip + src_size; @@ -709,21 +709,43 @@ int64_t zxc_decompress(const void* RESTRICT src, const size_t src_size, void* RE return ZXC_ERROR_BAD_HEADER; } - /* Dictionary validation */ + ip += ZXC_FILE_HEADER_SIZE; + + /* Dictionary handling. When the archive needs a dictionary it is normally + * embedded as a ZXC_BLOCK_DICT block right after the header; detect it by + * type and use it (skipping the block). Otherwise the caller must supply a + * matching in-memory dictionary. */ if (header_dict_id != 0) { - if (!dict || dict_size == 0) { - zxc_cctx_free(&ctx); - return ZXC_ERROR_DICT_REQUIRED; - } - if (zxc_dict_id(dict, dict_size) != header_dict_id) { - zxc_cctx_free(&ctx); - return ZXC_ERROR_DICT_MISMATCH; + zxc_block_header_t dbh; + if ((size_t)(ip_end - ip) >= ZXC_BLOCK_HEADER_SIZE && + zxc_read_block_header(ip, ZXC_BLOCK_HEADER_SIZE, &dbh) == ZXC_OK && + dbh.block_type == ZXC_BLOCK_DICT) { + if (dbh.comp_size == 0 || dbh.comp_size > ZXC_DICT_SIZE_MAX || + (size_t)(ip_end - ip) < ZXC_BLOCK_HEADER_SIZE + dbh.comp_size) { + zxc_cctx_free(&ctx); + return ZXC_ERROR_BAD_HEADER; + } + const uint8_t* edict = ip + ZXC_BLOCK_HEADER_SIZE; + if (zxc_dict_id(edict, dbh.comp_size) != header_dict_id) { + zxc_cctx_free(&ctx); + return ZXC_ERROR_DICT_MISMATCH; + } + dict = edict; + dict_size = dbh.comp_size; + ip += ZXC_BLOCK_HEADER_SIZE + dbh.comp_size; /* skip the embedded dict block */ + } else { + if (!dict || dict_size == 0) { + zxc_cctx_free(&ctx); + return ZXC_ERROR_DICT_REQUIRED; + } + if (zxc_dict_id(dict, dict_size) != header_dict_id) { + zxc_cctx_free(&ctx); + return ZXC_ERROR_DICT_MISMATCH; + } } } ctx.dict_size = dict_size; - ip += ZXC_FILE_HEADER_SIZE; - const size_t work_sz = runtime_chunk_size + ZXC_DECOMPRESS_TAIL_PAD; /* Dict decode 
buffer: [dict_content | decode_space + PAD] */ @@ -1064,10 +1086,16 @@ int64_t zxc_decompress_dctx(zxc_dctx* dctx, const void* RESTRICT src, const size int file_has_checksums = 0; uint32_t global_hash = 0; + uint32_t header_dict_id = 0; if (UNLIKELY(zxc_read_file_header(ip, src_size, &runtime_chunk_size, &file_has_checksums, - NULL) != ZXC_OK)) + &header_dict_id) != ZXC_OK)) return ZXC_ERROR_BAD_HEADER; + /* The static (malloc-free) decode path does not support dictionaries, which + * need a heap-allocated decode buffer. Reject such archives up front rather + * than misinterpreting the embedded dictionary block as data. */ + if (UNLIKELY(header_dict_id != 0)) return ZXC_ERROR_DICT_REQUIRED; + /* Static dctx: block_size is locked at workspace init; reject any * archive whose declared block_size would require a re-partition. */ if (UNLIKELY(dctx->owns_workspace && runtime_chunk_size != dctx->last_block_size)) diff --git a/tests/test_cli.sh b/tests/test_cli.sh index 7066b57c..1009722f 100755 --- a/tests/test_cli.sh +++ b/tests/test_cli.sh @@ -890,36 +890,25 @@ else log_fail "List command on seekable archive failed" fi -# 25. Dictionary Tests (-D) -echo "Testing Dictionary (-D)..." - -# 25.1 Train a dictionary using --train-dict -echo " Training dictionary from test data..." -# Create a few sample files for training -for i in 1 2 3 4 5; do - cp "$TEST_FILE" "$TEST_DIR/sample_${i}.txt" -done -DICT_FILE="$TEST_DIR/test.zxd" -"$ZXC_BIN" --train-dict "$DICT_FILE" "$TEST_DIR"/sample_*.txt 2>/dev/null -if [ ! -f "$DICT_FILE" ]; then - log_fail "Dictionary training failed" -fi -log_pass "Dictionary trained via --train-dict" - -# 25.2 Round-trip with dictionary -echo " Testing dict round-trip..." -"$ZXC_BIN" -3 -D "$DICT_FILE" -c -k "$TEST_FILE_ARG" > "$TEST_DIR/test_dict.zxc" +# 25. Dictionary Tests (--auto-dict, embedded) +echo "Testing Dictionary (--auto-dict)..." + +# 25.1 Round-trip with an auto-trained, embedded dictionary. 
+# The dictionary is trained from the input and stored in the archive; no +# external dictionary file is needed at decompression. +echo " Testing auto-dict round-trip (small blocks)..." +"$ZXC_BIN" -3 -B 4K --auto-dict -c -k "$TEST_FILE_ARG" > "$TEST_DIR/test_dict.zxc" if [ ! -s "$TEST_DIR/test_dict.zxc" ]; then - log_fail "Dict compression failed" + log_fail "Auto-dict compression failed" fi -"$ZXC_BIN" -d -D "$DICT_FILE" -c "$TEST_DIR/test_dict.zxc" > "$TEST_DIR/test_dict.dec" +"$ZXC_BIN" -d -c "$TEST_DIR/test_dict.zxc" > "$TEST_DIR/test_dict.dec" if cmp -s "$TEST_FILE" "$TEST_DIR/test_dict.dec"; then - log_pass "Dict round-trip (-D)" + log_pass "Auto-dict round-trip (no external dict at decode)" else - log_fail "Dict round-trip content mismatch" + log_fail "Auto-dict round-trip content mismatch" fi -# 25.3 List shows dict_id +# 25.2 List shows dict_id for an embedded-dict archive echo " Testing list with dict_id..." OUT=$("$ZXC_BIN" -l "$TEST_DIR/test_dict.zxc") if [[ "$OUT" == *"Dict ID"* ]] && [[ "$OUT" == *"0x"* ]]; then @@ -928,7 +917,7 @@ else log_fail "List should show dict_id column with 0x value" fi -# 25.4 List without dict shows dash +# 25.3 List without dict shows dash echo " Testing list without dict shows dash..." "$ZXC_BIN" -3 -c -k "$TEST_FILE_ARG" > "$TEST_DIR/test_nodict2.zxc" OUT=$("$ZXC_BIN" -l "$TEST_DIR/test_nodict2.zxc") @@ -938,7 +927,7 @@ else log_fail "List without dict should show dash in Dict ID column" fi -# 25.5 JSON list shows dict_id field +# 25.4 JSON list shows dict_id field echo " Testing JSON list with dict_id..." JSON_OUT=$("$ZXC_BIN" -l -j "$TEST_DIR/test_dict.zxc") if [[ "$JSON_OUT" == *'"dict_id"'* ]] && [[ "$JSON_OUT" == *"0x"* ]]; then @@ -947,54 +936,29 @@ else log_fail "JSON list should contain dict_id field" fi -# 25.6 Decompression without dict should fail -echo " Testing decompress without required dict..." -set +e -"$ZXC_BIN" -d -c "$TEST_DIR/test_dict.zxc" > /dev/null 2>&1 -RET=$? 
-set -e -if [ $RET -ne 0 ]; then - log_pass "Decompress without dict correctly fails" -else - log_fail "Decompress without required dict should fail" -fi - -# 25.7 Dict with seekable -echo " Testing dict + seekable (-D -S)..." -"$ZXC_BIN" -3 -D "$DICT_FILE" -S -c -k "$TEST_FILE_ARG" > "$TEST_DIR/test_dict_seek.zxc" -"$ZXC_BIN" -d -D "$DICT_FILE" -c "$TEST_DIR/test_dict_seek.zxc" > "$TEST_DIR/test_dict_seek.dec" +# 25.5 Auto-dict + seekable +echo " Testing auto-dict + seekable (--auto-dict -S)..." +"$ZXC_BIN" -3 -B 4K --auto-dict -S -c -k "$TEST_FILE_ARG" > "$TEST_DIR/test_dict_seek.zxc" +"$ZXC_BIN" -d -c "$TEST_DIR/test_dict_seek.zxc" > "$TEST_DIR/test_dict_seek.dec" if cmp -s "$TEST_FILE" "$TEST_DIR/test_dict_seek.dec"; then - log_pass "Dict + seekable (-D -S)" + log_pass "Auto-dict + seekable" else - log_fail "Dict + seekable round-trip failed" + log_fail "Auto-dict + seekable round-trip failed" fi -# 25.8 Dict with all levels -echo " Testing dict across all levels..." +# 25.6 Auto-dict across all levels +echo " Testing auto-dict across all levels..." DICT_ALL_OK=1 for LEVEL in 1 2 3 4 5 6; do - "$ZXC_BIN" -$LEVEL -D "$DICT_FILE" -c -k "$TEST_FILE_ARG" > "$TEST_DIR/test_dict_lvl${LEVEL}.zxc" - "$ZXC_BIN" -d -D "$DICT_FILE" -c "$TEST_DIR/test_dict_lvl${LEVEL}.zxc" > "$TEST_DIR/test_dict_lvl${LEVEL}.dec" + "$ZXC_BIN" -$LEVEL -B 4K --auto-dict -c -k "$TEST_FILE_ARG" > "$TEST_DIR/test_dict_lvl${LEVEL}.zxc" + "$ZXC_BIN" -d -c "$TEST_DIR/test_dict_lvl${LEVEL}.zxc" > "$TEST_DIR/test_dict_lvl${LEVEL}.dec" if ! cmp -s "$TEST_FILE" "$TEST_DIR/test_dict_lvl${LEVEL}.dec"; then DICT_ALL_OK=0 - log_fail "Dict level $LEVEL round-trip failed" + log_fail "Auto-dict level $LEVEL round-trip failed" fi done if [ "$DICT_ALL_OK" -eq 1 ]; then - log_pass "Dict across all levels (1-6)" -fi - -# 25.9 Invalid dict file should fail -echo " Testing invalid dict file..." 
-echo "not a valid dict" > "$TEST_DIR/bad.zxd" -set +e -"$ZXC_BIN" -3 -D "$TEST_DIR/bad.zxd" -c "$TEST_FILE_ARG" > /dev/null 2>&1 -RET=$? -set -e -if [ $RET -ne 0 ]; then - log_pass "Invalid dict file rejected" -else - log_fail "Invalid dict file should be rejected" + log_pass "Auto-dict across all levels (1-6)" fi echo "All tests passed!" From 00e132058ebe98e16785403b20de217942073906 Mon Sep 17 00:00:00 2001 From: hellobertrand <5901952+hellobertrand@users.noreply.github.com> Date: Mon, 1 Jun 2026 17:26:39 +0200 Subject: [PATCH 44/47] feat: Enable static decompression contexts to support embedded dictionaries Previously, static `zxc_dctx` instances would reject archives containing embedded dictionaries. This change extends the static context API by introducing a `max_dict_size` parameter. This allows callers to reserve dedicated space within their supplied workspace for a dictionary decode bounce buffer, making the static decompression path fully compatible with archives that embed dictionaries. The decompression logic is updated to utilize this reserved space when an embedded dictionary is encountered. --- include/zxc_buffer.h | 26 +++++++++---- src/cli/main.c | 4 +- src/lib/zxc_dispatch.c | 82 ++++++++++++++++++++++++++++++++++++----- tests/test_static_ctx.c | 8 ++-- 4 files changed, 97 insertions(+), 23 deletions(-) diff --git a/include/zxc_buffer.h b/include/zxc_buffer.h index 86f85c8a..0309e78f 100644 --- a/include/zxc_buffer.h +++ b/include/zxc_buffer.h @@ -573,11 +573,17 @@ ZXC_EXPORT zxc_cctx* zxc_init_static_cctx(void* workspace, const size_t workspac * the decoder cannot predict the per-block literal encoding until it sees * each block header. * - * @param[in] block_size Maximum block size the decoder will encounter - * (must satisfy the regular block-size constraints). - * @return Workspace size in bytes, or 0 if @p block_size is invalid. 
- */ -ZXC_EXPORT size_t zxc_static_dctx_workspace_size(const size_t block_size); + * @param[in] block_size Maximum block size the decoder will encounter + * (must satisfy the regular block-size constraints). + * @param[in] max_dict_size Largest embedded dictionary the handle must be able + * to decode (0 = no dictionary support; archives that + * embed a dictionary are then rejected). When > 0, the + * workspace reserves room for a dictionary decode + * buffer; must be <= @ref ZXC_DICT_SIZE_MAX. + * @return Workspace size in bytes, or 0 if a parameter is invalid. + */ +ZXC_EXPORT size_t zxc_static_dctx_workspace_size(const size_t block_size, + const size_t max_dict_size); /** * @brief Initialises a decompression context inside a caller-supplied @@ -597,11 +603,17 @@ ZXC_EXPORT size_t zxc_static_dctx_workspace_size(const size_t block_size); * @param[in,out] workspace Caller-allocated buffer, cache-line aligned. * @param[in] workspace_size Capacity of @p workspace in bytes. * @param[in] block_size Block size the decoder will accept. + * @param[in] max_dict_size Largest embedded dictionary to support (0 = + * none). Must match the value passed to + * @ref zxc_static_dctx_workspace_size and be + * <= @ref ZXC_DICT_SIZE_MAX. An archive whose + * embedded dictionary exceeds this returns + * @ref ZXC_ERROR_DICT_TOO_LARGE. * @return Handle pointing inside @p workspace on success, or @c NULL if - * the workspace is too small or @p block_size is invalid. + * the workspace is too small or a parameter is invalid. 
*/ ZXC_EXPORT zxc_dctx* zxc_init_static_dctx(void* workspace, const size_t workspace_size, - const size_t block_size); + const size_t block_size, const size_t max_dict_size); /** @} */ /* end of static_context_api */ /** @} */ /* end of context_api */ diff --git a/src/cli/main.c b/src/cli/main.c index 9fda8b1c..c4e4f340 100644 --- a/src/cli/main.c +++ b/src/cli/main.c @@ -299,7 +299,7 @@ static int zxc_validate_output_path(const char* path, char* resolved_buffer, siz // CLI Logging Helpers static int g_quiet = 0; static int g_verbose = 0; -static int g_auto_dict = 0; /* --auto-dict: train a dictionary from the input and embed it */ +static int g_auto_dict = 0; /** * @brief Standard logging function. Respects the global quiet flag. @@ -968,7 +968,7 @@ static int process_single_file(const char* in_path, const char* out_path_overrid .total_size = total_size}; /* --auto-dict: train a dictionary from the input and embed it (compress - * only, and only for a real file — a pipe cannot be re-read to train). */ + * only, and only for a real file: a pipe cannot be re-read to train). */ const void* eff_dict = dict; size_t eff_dict_size = dict_size; void* auto_dict_buf = NULL; diff --git a/src/lib/zxc_dispatch.c b/src/lib/zxc_dispatch.c index 7c85e9ca..23880332 100644 --- a/src/lib/zxc_dispatch.c +++ b/src/lib/zxc_dispatch.c @@ -1053,6 +1053,12 @@ struct zxc_dctx_s { int owns_workspace; /* 0 = library-allocated (free in zxc_free_dctx), 1 = caller-supplied static workspace (no-op free, block_size pinned at init) */ + /* Dictionary decode bounce buffer [dict | decode_space]. For a static dctx + this points into the caller's workspace (sized via max_dict_size at init). + For a dynamic dctx it is lazily heap-allocated on the first dict archive + and freed in zxc_free_dctx. NULL/0 until needed. */ + uint8_t* dict_work; + size_t dict_work_cap; }; zxc_dctx* zxc_create_dctx(void) { @@ -1066,6 +1072,7 @@ void zxc_free_dctx(zxc_dctx* dctx) { * which we do not own. 
Free is a no-op; the caller owns the workspace. */ if (dctx->owns_workspace) return; if (dctx->initialized) zxc_cctx_free(&dctx->inner); + ZXC_FREE(dctx->dict_work); /* lazily-allocated dict bounce buffer (dynamic dctx only) */ ZXC_FREE(dctx); } @@ -1091,11 +1098,6 @@ int64_t zxc_decompress_dctx(zxc_dctx* dctx, const void* RESTRICT src, const size &header_dict_id) != ZXC_OK)) return ZXC_ERROR_BAD_HEADER; - /* The static (malloc-free) decode path does not support dictionaries, which - * need a heap-allocated decode buffer. Reject such archives up front rather - * than misinterpreting the embedded dictionary block as data. */ - if (UNLIKELY(header_dict_id != 0)) return ZXC_ERROR_DICT_REQUIRED; - /* Static dctx: block_size is locked at workspace init; reject any * archive whose declared block_size would require a re-partition. */ if (UNLIKELY(dctx->owns_workspace && runtime_chunk_size != dctx->last_block_size)) @@ -1128,6 +1130,41 @@ int64_t zxc_decompress_dctx(zxc_dctx* dctx, const void* RESTRICT src, const size * it stays in sync when chunk_size changes between calls. */ const size_t work_sz = runtime_chunk_size + ZXC_DECOMPRESS_TAIL_PAD; + /* Dictionary: when present it is embedded as a ZXC_BLOCK_DICT block right + * after the header. Decode into a [dict | decode_space] bounce buffer so + * dict back-references resolve. The buffer is the caller's reserved + * workspace region (static dctx, sized via max_dict_size) or a buffer the + * dynamic dctx allocates once and keeps. 
*/ + size_t emb_dict_size = 0; + if (header_dict_id != 0) { + zxc_block_header_t dbh; + if ((size_t)(ip_end - ip) < ZXC_BLOCK_HEADER_SIZE || + zxc_read_block_header(ip, ZXC_BLOCK_HEADER_SIZE, &dbh) != ZXC_OK || + dbh.block_type != ZXC_BLOCK_DICT) + return ZXC_ERROR_DICT_REQUIRED; /* no embedded dictionary to decode with */ + if (dbh.comp_size == 0 || dbh.comp_size > ZXC_DICT_SIZE_MAX || + (size_t)(ip_end - ip) < ZXC_BLOCK_HEADER_SIZE + dbh.comp_size) + return ZXC_ERROR_BAD_HEADER; + const uint8_t* const edict = ip + ZXC_BLOCK_HEADER_SIZE; + if (zxc_dict_id(edict, dbh.comp_size) != header_dict_id) return ZXC_ERROR_DICT_MISMATCH; + + const size_t need = (size_t)dbh.comp_size + work_sz; + if (dctx->dict_work_cap < need) { + /* Static workspace can't grow: the caller must reserve enough via + * max_dict_size. Dynamic dctx allocates (and keeps) the buffer. */ + if (dctx->owns_workspace) return ZXC_ERROR_DICT_TOO_LARGE; + uint8_t* const nb = (uint8_t*)ZXC_MALLOC(need); + if (UNLIKELY(!nb)) return ZXC_ERROR_MEMORY; + ZXC_FREE(dctx->dict_work); + dctx->dict_work = nb; + dctx->dict_work_cap = need; + } + ZXC_MEMCPY(dctx->dict_work, edict, dbh.comp_size); + emb_dict_size = dbh.comp_size; + ip += ZXC_BLOCK_HEADER_SIZE + dbh.comp_size; /* skip the embedded dict block */ + } + ctx->dict_size = emb_dict_size; + while (ip < ip_end) { const size_t rem_src = (size_t)(ip_end - ip); zxc_block_header_t bh; @@ -1151,7 +1188,16 @@ int64_t zxc_decompress_dctx(zxc_dctx* dctx, const void* RESTRICT src, const size const size_t rem_cap = (size_t)(op_end - op); int res; - if (LIKELY(rem_cap >= work_sz)) { + if (emb_dict_size > 0) { + // Dict path: decode into the [dict | decode_space] bounce buffer so + // match copies referencing dictionary bytes resolve, then copy out. 
+ res = zxc_decompress_chunk_wrapper(ctx, ip, rem_src, dctx->dict_work + emb_dict_size, + work_sz); + if (LIKELY(res > 0)) { + if (UNLIKELY((size_t)res > rem_cap)) return ZXC_ERROR_DST_TOO_SMALL; + ZXC_MEMCPY(op, dctx->dict_work + emb_dict_size, (size_t)res); + } + } else if (LIKELY(rem_cap >= work_sz)) { // Fast path: decode directly into dst (enough padding for wild copies). res = zxc_decompress_chunk_wrapper(ctx, ip, rem_src, op, rem_cap); } else { @@ -1427,21 +1473,33 @@ zxc_cctx* zxc_init_static_cctx(void* RESTRICT workspace, const size_t workspace_ return cctx; } -size_t zxc_static_dctx_workspace_size(const size_t block_size) { +/* Bytes reserved for the dictionary decode bounce buffer when a static dctx is + * built to support embedded dictionaries up to `max_dict_size`. 0 disables it. + * Layout is [dict (<= max_dict_size) | decode_space (block + TAIL_PAD)]. */ +static size_t zxc_static_dctx_dict_region(const size_t block_size, const size_t max_dict_size) { + if (max_dict_size == 0) return 0; + return ZXC_ALIGN_CL(max_dict_size + block_size + ZXC_DECOMPRESS_TAIL_PAD); +} + +size_t zxc_static_dctx_workspace_size(const size_t block_size, const size_t max_dict_size) { if (UNLIKELY(!zxc_validate_block_size(block_size))) return 0; + if (UNLIKELY(max_dict_size > ZXC_DICT_SIZE_MAX)) return 0; const size_t inner_sz = zxc_cctx_compute_workspace_size(block_size, 0, 0); if (UNLIKELY(inner_sz == 0)) return 0; - return ZXC_STATIC_DCTX_HDR_SIZE + inner_sz; + return ZXC_STATIC_DCTX_HDR_SIZE + inner_sz + + zxc_static_dctx_dict_region(block_size, max_dict_size); } zxc_dctx* zxc_init_static_dctx(void* RESTRICT workspace, const size_t workspace_size, - const size_t block_size) { + const size_t block_size, const size_t max_dict_size) { if (UNLIKELY(!workspace)) return NULL; if (UNLIKELY(!zxc_validate_block_size(block_size))) return NULL; + if (UNLIKELY(max_dict_size > ZXC_DICT_SIZE_MAX)) return NULL; const size_t inner_sz = zxc_cctx_compute_workspace_size(block_size, 0, 0); if 
(UNLIKELY(inner_sz == 0)) return NULL; - if (UNLIKELY(workspace_size < ZXC_STATIC_DCTX_HDR_SIZE + inner_sz)) return NULL; + const size_t dict_region = zxc_static_dctx_dict_region(block_size, max_dict_size); + if (UNLIKELY(workspace_size < ZXC_STATIC_DCTX_HDR_SIZE + inner_sz + dict_region)) return NULL; zxc_dctx* const dctx = (zxc_dctx*)workspace; ZXC_MEMSET(dctx, 0, sizeof(*dctx)); @@ -1453,6 +1511,10 @@ zxc_dctx* zxc_init_static_dctx(void* RESTRICT workspace, const size_t workspace_ 0) != ZXC_OK)) return NULL; + if (dict_region > 0) { + dctx->dict_work = inner_ws + inner_sz; + dctx->dict_work_cap = dict_region; + } dctx->owns_workspace = 1; dctx->initialized = 1; dctx->last_block_size = block_size; diff --git a/tests/test_static_ctx.c b/tests/test_static_ctx.c index df5f7dcc..ef529a59 100644 --- a/tests/test_static_ctx.c +++ b/tests/test_static_ctx.c @@ -83,7 +83,7 @@ int test_static_ctx_roundtrip_all_levels(void) { } /* Size + init the dctx workspace. */ - const size_t dctx_ws_sz = zxc_static_dctx_workspace_size(block_size); + const size_t dctx_ws_sz = zxc_static_dctx_workspace_size(block_size, 0); if (dctx_ws_sz == 0) { printf(" [FAIL] level %d: dctx_ws_sz == 0\n", lvl); goto fail; @@ -93,7 +93,7 @@ int test_static_ctx_roundtrip_all_levels(void) { printf(" [FAIL] level %d: aligned_alloc(dctx_ws)\n", lvl); goto fail; } - zxc_dctx* const dctx = zxc_init_static_dctx(dctx_ws, dctx_ws_sz, block_size); + zxc_dctx* const dctx = zxc_init_static_dctx(dctx_ws, dctx_ws_sz, block_size, 0); if (!dctx) { printf(" [FAIL] level %d: zxc_init_static_dctx returned NULL\n", lvl); test_aligned_free(dctx_ws); @@ -135,7 +135,7 @@ int test_static_ctx_size_query(void) { printf(" [FAIL] cctx_size(0) should be 0\n"); return 0; } - if (zxc_static_dctx_workspace_size(0) != 0) { + if (zxc_static_dctx_workspace_size(0, 0) != 0) { printf(" [FAIL] dctx_size(0) should be 0\n"); return 0; } @@ -269,7 +269,7 @@ int test_static_ctx_null_inputs(void) { printf(" [FAIL] init_static_cctx(NULL opts) 
should fail\n"); return 0; } - if (zxc_init_static_dctx(NULL, 65536, 4096) != NULL) { + if (zxc_init_static_dctx(NULL, 65536, 4096, 0) != NULL) { printf(" [FAIL] init_static_dctx(NULL workspace) should fail\n"); return 0; } From 1e51dee44097c7ee897a5f6a3c75fc5293bc58c0 Mon Sep 17 00:00:00 2001 From: hellobertrand <5901952+hellobertrand@users.noreply.github.com> Date: Mon, 1 Jun 2026 17:40:41 +0200 Subject: [PATCH 45/47] feat: Add max_dict_size to static dctx API for embedded dictionary support This change introduces the `max_dict_size` parameter to `zxc_static_dctx_workspace_size` and `zxc_init_static_dctx`. Callers can now specify and reserve dedicated space within pre-allocated workspaces for handling embedded dictionaries in static decompression contexts. Error handling is refined to distinguish between `ZXC_ERROR_DICT_REQUIRED` (when `max_dict_size` is 0 but an embedded dictionary is present) and `ZXC_ERROR_DICT_TOO_LARGE` (when the embedded dictionary exceeds the configured `max_dict_size`). API documentation and examples are updated, and new test cases validate dictionary decoding and error conditions for static contexts. --- docs/API.md | 23 +++++++--- src/lib/zxc_dispatch.c | 8 ++-- tests/test_common.h | 1 + tests/test_main.c | 1 + tests/test_static_ctx.c | 96 +++++++++++++++++++++++++++++++++++++++++ 5 files changed, 119 insertions(+), 10 deletions(-) diff --git a/docs/API.md b/docs/API.md index 0caae299..e6b61db2 100644 --- a/docs/API.md +++ b/docs/API.md @@ -680,7 +680,8 @@ is `NULL`. ```c ZXC_EXPORT size_t zxc_static_dctx_workspace_size( - const size_t block_size + const size_t block_size, + const size_t max_dict_size ); ``` @@ -690,7 +691,12 @@ for the given `block_size`. Unlike the compression variant, this size is provisioned worst-case because the decoder cannot predict the per-block literal encoding (RAW / RLE / HUFFMAN) until it sees each block header. -**Returns**: workspace size in bytes, or `0` if `block_size` is invalid. 
+`max_dict_size` (0 = no dictionary support) reserves room for decoding archives +that embed a dictionary up to that size (≤ `ZXC_DICT_SIZE_MAX`). With `0`, an +archive that embeds a dictionary is rejected with `ZXC_ERROR_DICT_REQUIRED` +(the malloc-free path cannot allocate a dictionary buffer on demand). + +**Returns**: workspace size in bytes, or `0` if a parameter is invalid. ### `zxc_init_static_dctx` @@ -698,14 +704,17 @@ literal encoding (RAW / RLE / HUFFMAN) until it sees each block header. ZXC_EXPORT zxc_dctx* zxc_init_static_dctx( void* workspace, const size_t workspace_size, - const size_t block_size + const size_t block_size, + const size_t max_dict_size ); ``` Initialises a decompression context inside a caller-supplied workspace. `block_size` is **pinned** at init time: feeding the returned handle an archive whose file header declares a different `block_size` returns -`ZXC_ERROR_BAD_BLOCK_SIZE`. +`ZXC_ERROR_BAD_BLOCK_SIZE`. `max_dict_size` must match the value passed to +`zxc_static_dctx_workspace_size`; an archive whose embedded dictionary exceeds +it returns `ZXC_ERROR_DICT_TOO_LARGE`. The returned handle points inside `workspace`; the workspace must remain valid for the lifetime of the handle. `zxc_free_dctx` is a no-op. @@ -737,11 +746,11 @@ zxc_free_cctx(cctx); /* no-op for static */ free(cws); /* caller owns the workspace */ /* --- Decompression side --- */ -size_t dws_sz = zxc_static_dctx_workspace_size(BLOCK_SZ); +size_t dws_sz = zxc_static_dctx_workspace_size(BLOCK_SZ, 0); void *dws = NULL; posix_memalign(&dws, 64, dws_sz); -zxc_dctx *dctx = zxc_init_static_dctx(dws, dws_sz, BLOCK_SZ); +zxc_dctx *dctx = zxc_init_static_dctx(dws, dws_sz, BLOCK_SZ, 0); zxc_decompress_dctx(dctx, in, in_sz, out, out_cap, NULL); zxc_free_dctx(dctx); /* no-op */ free(dws); @@ -814,7 +823,7 @@ must know it *before* calling `init`. Four patterns cover every use case: archive at the cost of over-allocation (~4 MB dctx). 
```c - size_t dws_sz = zxc_static_dctx_workspace_size(ZXC_BLOCK_SIZE_MAX); + size_t dws_sz = zxc_static_dctx_workspace_size(ZXC_BLOCK_SIZE_MAX, 0); ``` If the workspace pool must stay tight and worst-case sizing is too diff --git a/src/lib/zxc_dispatch.c b/src/lib/zxc_dispatch.c index 23880332..3b8a204a 100644 --- a/src/lib/zxc_dispatch.c +++ b/src/lib/zxc_dispatch.c @@ -1150,9 +1150,11 @@ int64_t zxc_decompress_dctx(zxc_dctx* dctx, const void* RESTRICT src, const size const size_t need = (size_t)dbh.comp_size + work_sz; if (dctx->dict_work_cap < need) { - /* Static workspace can't grow: the caller must reserve enough via - * max_dict_size. Dynamic dctx allocates (and keeps) the buffer. */ - if (dctx->owns_workspace) return ZXC_ERROR_DICT_TOO_LARGE; + /* A static workspace can't grow. Distinguish "dictionary support not + * enabled" (no region reserved) from "reserved region too small" for + * this archive's dictionary. */ + if (dctx->owns_workspace) + return dctx->dict_work ? ZXC_ERROR_DICT_TOO_LARGE : ZXC_ERROR_DICT_REQUIRED; uint8_t* const nb = (uint8_t*)ZXC_MALLOC(need); if (UNLIKELY(!nb)) return ZXC_ERROR_MEMORY; ZXC_FREE(dctx->dict_work); diff --git a/tests/test_common.h b/tests/test_common.h index 826e6fa7..84a8bcb7 100644 --- a/tests/test_common.h +++ b/tests/test_common.h @@ -81,6 +81,7 @@ int test_static_ctx_size_query(void); int test_static_ctx_workspace_too_small(void); int test_static_ctx_block_size_locked(void); int test_static_ctx_null_inputs(void); +int test_static_ctx_embedded_dict(void); /* Stream API */ int test_null_output_decompression(void); diff --git a/tests/test_main.c b/tests/test_main.c index 9ec273ed..cbe5e963 100644 --- a/tests/test_main.c +++ b/tests/test_main.c @@ -72,6 +72,7 @@ static const test_entry_t g_tests[] = { TEST_CASE(test_static_ctx_block_size_locked), TEST_CASE(test_static_ctx_null_inputs), TEST_CASE(test_static_ctx_roundtrip_all_levels), + TEST_CASE(test_static_ctx_embedded_dict), /* --- Stream API --- */ 
TEST_CASE(test_null_output_decompression), diff --git a/tests/test_static_ctx.c b/tests/test_static_ctx.c index ef529a59..0a18ecfc 100644 --- a/tests/test_static_ctx.c +++ b/tests/test_static_ctx.c @@ -279,3 +279,99 @@ int test_static_ctx_null_inputs(void) { printf(" [PASS] NULL inputs rejected; NULL free is idempotent\n"); return 1; } + +int test_static_ctx_embedded_dict(void) { + printf("=== TEST: Static ctx - decode embedded dictionary ===\n"); + const size_t bs = 65536; + static const uint8_t dict[] = + "The quick brown fox jumps over the lazy dog. keys: \"id\",\"name\",\"email\"."; + const size_t dsz = sizeof(dict) - 1; + const size_t src_size = 200000; + uint8_t* src = (uint8_t*)malloc(src_size); + int ok = (src != NULL); + /* Dict-referencing data so the encoder actually emits matches into the dict. */ + if (ok) + for (size_t i = 0; i < src_size; i++) src[i] = dict[i % dsz]; + + /* Build an embedded-dictionary archive with the stream compressor. */ + FILE* fs = ok ? tmpfile() : NULL; + FILE* fc = ok ? tmpfile() : NULL; + uint8_t* comp = NULL; + long csz = 0; + if (!fs || !fc) ok = 0; + if (ok) { + fwrite(src, 1, src_size, fs); + rewind(fs); + zxc_compress_opts_t copts = {.level = ZXC_LEVEL_DEFAULT, + .block_size = bs, + .checksum_enabled = 1, + .dict = dict, + .dict_size = dsz}; + if (zxc_stream_compress(fs, fc, &copts) <= 0) { + printf(" [FAIL] stream_compress\n"); + ok = 0; + } + } + if (ok) { + fseek(fc, 0, SEEK_END); + csz = ftell(fc); + rewind(fc); + comp = (uint8_t*)malloc((size_t)csz); + ok = (comp && fread(comp, 1, (size_t)csz, fc) == (size_t)csz); + } + if (fs) fclose(fs); + if (fc) fclose(fc); + uint8_t* out = (uint8_t*)malloc(src_size); + if (!out) ok = 0; + + /* (a) Dynamic dctx: lazily allocates the dict bounce buffer. 
*/ + if (ok) { + zxc_dctx* d = zxc_create_dctx(); + int64_t r = zxc_decompress_dctx(d, comp, (size_t)csz, out, src_size, NULL); + if (r != (int64_t)src_size || memcmp(out, src, src_size) != 0) { + printf(" [FAIL] dynamic dctx embedded decode (r=%lld)\n", (long long)r); + ok = 0; + } else { + printf(" [PASS] dynamic dctx decodes embedded dict\n"); + } + zxc_free_dctx(d); + } + + /* (b) Static dctx sized for a dictionary. */ + if (ok) { + const size_t ws_sz = zxc_static_dctx_workspace_size(bs, ZXC_DICT_SIZE_MAX); + void* ws = ws_sz ? test_aligned_alloc(64, ws_sz) : NULL; + zxc_dctx* d = ws ? zxc_init_static_dctx(ws, ws_sz, bs, ZXC_DICT_SIZE_MAX) : NULL; + int64_t r = d ? zxc_decompress_dctx(d, comp, (size_t)csz, out, src_size, NULL) : -1; + if (r != (int64_t)src_size || memcmp(out, src, src_size) != 0) { + printf(" [FAIL] static dctx (with dict room) decode (r=%lld)\n", (long long)r); + ok = 0; + } else { + printf(" [PASS] static dctx (max_dict_size) decodes embedded dict\n"); + } + if (ws) test_aligned_free(ws); + } + + /* (c) Static dctx WITHOUT dict room must reject cleanly, not corrupt. */ + if (ok) { + const size_t ws_sz = zxc_static_dctx_workspace_size(bs, 0); + void* ws = ws_sz ? test_aligned_alloc(64, ws_sz) : NULL; + zxc_dctx* d = ws ? zxc_init_static_dctx(ws, ws_sz, bs, 0) : NULL; + int64_t r = d ? 
zxc_decompress_dctx(d, comp, (size_t)csz, out, src_size, NULL) : -999; + if (r != ZXC_ERROR_DICT_REQUIRED) { + printf(" [FAIL] static dctx (no dict room) should return DICT_REQUIRED, got %lld\n", + (long long)r); + ok = 0; + } else { + printf(" [PASS] static dctx (no dict room) rejects cleanly\n"); + } + if (ws) test_aligned_free(ws); + } + + free(comp); + free(out); + free(src); + if (!ok) return 0; + printf("PASS\n\n"); + return 1; +} From 0acd51448c8213137298452f495f35f57c22e526 Mon Sep 17 00:00:00 2001 From: hellobertrand <5901952+hellobertrand@users.noreply.github.com> Date: Mon, 1 Jun 2026 17:41:11 +0200 Subject: [PATCH 46/47] test: Add conformance tests for embedded dictionaries Introduces new .zxc and .expected files to validate decompression of archives with embedded dictionaries for both stream and seekable contexts. --- conformance/valid/dict_embedded.expected | 400 ++++++++++++++++++ conformance/valid/dict_embedded.zxc | Bin 0 -> 2792 bytes .../valid/dict_embedded_seekable.expected | 400 ++++++++++++++++++ conformance/valid/dict_embedded_seekable.zxc | Bin 0 -> 2832 bytes 4 files changed, 800 insertions(+) create mode 100644 conformance/valid/dict_embedded.expected create mode 100644 conformance/valid/dict_embedded.zxc create mode 100644 conformance/valid/dict_embedded_seekable.expected create mode 100644 conformance/valid/dict_embedded_seekable.zxc diff --git a/conformance/valid/dict_embedded.expected b/conformance/valid/dict_embedded.expected new file mode 100644 index 00000000..0a810edb --- /dev/null +++ b/conformance/valid/dict_embedded.expected @@ -0,0 +1,400 @@ +{"id":00000,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00001,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00002,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00003,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00004,"type":"event","level":"INFO","msg":"user logged in","ok":true} 
+{"id":00005,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00006,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00007,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00008,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00009,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00010,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00011,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00012,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00013,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00014,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00015,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00016,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00017,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00018,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00019,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00020,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00021,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00022,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00023,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00024,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00025,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00026,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00027,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00028,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00029,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00030,"type":"event","level":"INFO","msg":"user logged 
in","ok":true} +{"id":00031,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00032,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00033,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00034,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00035,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00036,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00037,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00038,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00039,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00040,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00041,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00042,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00043,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00044,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00045,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00046,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00047,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00048,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00049,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00050,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00051,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00052,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00053,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00054,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00055,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00056,"type":"event","level":"INFO","msg":"user 
logged in","ok":true} +{"id":00057,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00058,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00059,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00060,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00061,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00062,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00063,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00064,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00065,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00066,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00067,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00068,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00069,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00070,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00071,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00072,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00073,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00074,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00075,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00076,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00077,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00078,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00079,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00080,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00081,"type":"event","level":"INFO","msg":"user logged in","ok":true} 
+{"id":00082,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00083,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00084,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00085,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00086,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00087,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00088,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00089,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00090,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00091,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00092,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00093,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00094,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00095,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00096,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00097,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00098,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00099,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00100,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00101,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00102,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00103,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00104,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00105,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00106,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00107,"type":"event","level":"INFO","msg":"user logged 
in","ok":true} +{"id":00108,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00109,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00110,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00111,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00112,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00113,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00114,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00115,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00116,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00117,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00118,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00119,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00120,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00121,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00122,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00123,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00124,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00125,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00126,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00127,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00128,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00129,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00130,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00131,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00132,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00133,"type":"event","level":"INFO","msg":"user 
logged in","ok":true} +{"id":00134,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00135,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00136,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00137,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00138,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00139,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00140,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00141,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00142,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00143,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00144,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00145,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00146,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00147,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00148,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00149,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00150,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00151,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00152,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00153,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00154,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00155,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00156,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00157,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00158,"type":"event","level":"INFO","msg":"user logged in","ok":true} 
+{"id":00159,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00160,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00161,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00162,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00163,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00164,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00165,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00166,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00167,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00168,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00169,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00170,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00171,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00172,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00173,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00174,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00175,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00176,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00177,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00178,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00179,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00180,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00181,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00182,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00183,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00184,"type":"event","level":"INFO","msg":"user logged 
in","ok":true} +{"id":00185,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00186,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00187,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00188,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00189,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00190,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00191,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00192,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00193,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00194,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00195,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00196,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00197,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00198,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00199,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00200,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00201,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00202,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00203,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00204,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00205,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00206,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00207,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00208,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00209,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00210,"type":"event","level":"INFO","msg":"user 
logged in","ok":true} +{"id":00211,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00212,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00213,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00214,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00215,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00216,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00217,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00218,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00219,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00220,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00221,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00222,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00223,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00224,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00225,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00226,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00227,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00228,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00229,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00230,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00231,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00232,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00233,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00234,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00235,"type":"event","level":"INFO","msg":"user logged in","ok":true} 
+{"id":00236,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00237,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00238,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00239,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00240,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00241,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00242,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00243,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00244,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00245,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00246,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00247,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00248,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00249,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00250,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00251,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00252,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00253,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00254,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00255,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00256,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00257,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00258,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00259,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00260,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00261,"type":"event","level":"INFO","msg":"user logged 
in","ok":true} +{"id":00262,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00263,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00264,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00265,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00266,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00267,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00268,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00269,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00270,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00271,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00272,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00273,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00274,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00275,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00276,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00277,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00278,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00279,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00280,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00281,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00282,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00283,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00284,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00285,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00286,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00287,"type":"event","level":"INFO","msg":"user 
logged in","ok":true} +{"id":00288,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00289,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00290,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00291,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00292,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00293,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00294,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00295,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00296,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00297,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00298,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00299,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00300,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00301,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00302,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00303,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00304,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00305,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00306,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00307,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00308,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00309,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00310,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00311,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00312,"type":"event","level":"INFO","msg":"user logged in","ok":true} 
+{"id":00313,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00314,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00315,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00316,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00317,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00318,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00319,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00320,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00321,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00322,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00323,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00324,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00325,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00326,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00327,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00328,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00329,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00330,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00331,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00332,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00333,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00334,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00335,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00336,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00337,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00338,"type":"event","level":"INFO","msg":"user logged 
in","ok":true} +{"id":00339,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00340,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00341,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00342,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00343,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00344,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00345,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00346,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00347,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00348,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00349,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00350,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00351,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00352,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00353,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00354,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00355,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00356,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00357,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00358,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00359,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00360,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00361,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00362,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00363,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00364,"type":"event","level":"INFO","msg":"user 
logged in","ok":true} +{"id":00365,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00366,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00367,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00368,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00369,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00370,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00371,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00372,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00373,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00374,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00375,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00376,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00377,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00378,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00379,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00380,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00381,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00382,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00383,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00384,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00385,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00386,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00387,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00388,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00389,"type":"event","level":"INFO","msg":"user logged in","ok":true} 
+{"id":00390,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00391,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00392,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00393,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00394,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00395,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00396,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00397,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00398,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00399,"type":"event","level":"INFO","msg":"user logged in","ok":true} diff --git a/conformance/valid/dict_embedded.zxc b/conformance/valid/dict_embedded.zxc new file mode 100644 index 0000000000000000000000000000000000000000..ac7c2df8cee705232f27bf2e4d3156e6a2ec1393 GIT binary patch literal 2792 zcmcJQKTH%s6vpR{gMbn@0Rt5BCmV%G@NQ=Ic4oO~wvd445-rrqQZ7h1IRgiYs00t( zh7cPQ8WR$t715Yj*qBI%m4%I^v7)e`GBz50Z+B;1_m)%ioR_yd-`itz_v3qSKBu2Q zYHZzpyU;O02%&Gj5i*k?sC*Gu7K`+r79n_v&pkKlib_OJoD$6Um2S zz2Qr;IvzAdYkQ4*jnP^yErltgYdIOMM09_4^y?0=F(DTuus92;SPl9b>zLDiDpndb zv(!XVLQuhlaZfnXv7BLz(O=`R%MQrdnGn8MWLV7jOCwhUaHS+hCXpw#ApFZSAD<8r zMzK=(t7k3${wV+(g#3t*&Oj=FJ-&_tR650|#b|}KaOOIWG3NQrF%v>)Hej$_i)rAx z-+4{Z>?GI7SONoKsX#Dx%1%WA9n@vfzfYM6A{J4NV#kT{v+F=92l)hPjE1PCeXS@d z+bOLv!I|({lu3vc*L3ZiR!4arUG`zu(JUp`s?eZTLbUaLa98b zcpq7O*LDsFo`hUOI$`g1`&ve2w2b)oGgFqHiiq}|kRmHpJSzqua;Xn;!*Q&ei$4j(mbr9YGa*lEZ+2q$bkwvY9K ziN~cITzdB7IPKaES{)8+FbvK$=wRCgrNtqvavgjWI=wqXb|BhA{hKGR>+Ze{Z9_xw z$AmgM6w5HfsO!|T7Z2qkFmwkW@)?5!?Jgazs1>*9DcfxDtikc*s2GGV`>wk$n}hS; zYg@$kM<<5wMM;)}u3b-J{J#U^xZAS_oqnfwUabM8O@Oo62^VZrMXkfvSjWUch^cg& zAuU5176!LG2f|?+J|=t)aI?uODJ3veAL1C>ug`{J$9Q_+<>mhG@C{%kx97dh)=1am EA5)jZH~;_u literal 0 HcmV?d00001 diff --git a/conformance/valid/dict_embedded_seekable.expected 
b/conformance/valid/dict_embedded_seekable.expected new file mode 100644 index 00000000..0a810edb --- /dev/null +++ b/conformance/valid/dict_embedded_seekable.expected @@ -0,0 +1,400 @@ +{"id":00000,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00001,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00002,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00003,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00004,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00005,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00006,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00007,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00008,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00009,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00010,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00011,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00012,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00013,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00014,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00015,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00016,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00017,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00018,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00019,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00020,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00021,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00022,"type":"event","level":"INFO","msg":"user logged in","ok":true} 
+{"id":00023,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00024,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00025,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00026,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00027,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00028,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00029,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00030,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00031,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00032,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00033,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00034,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00035,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00036,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00037,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00038,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00039,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00040,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00041,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00042,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00043,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00044,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00045,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00046,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00047,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00048,"type":"event","level":"INFO","msg":"user logged 
in","ok":true} +{"id":00049,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00050,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00051,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00052,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00053,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00054,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00055,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00056,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00057,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00058,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00059,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00060,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00061,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00062,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00063,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00064,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00065,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00066,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00067,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00068,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00069,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00070,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00071,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00072,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00073,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00074,"type":"event","level":"INFO","msg":"user 
logged in","ok":true} +{"id":00075,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00076,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00077,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00078,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00079,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00080,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00081,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00082,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00083,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00084,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00085,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00086,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00087,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00088,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00089,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00090,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00091,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00092,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00093,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00094,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00095,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00096,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00097,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00098,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00099,"type":"event","level":"INFO","msg":"user logged in","ok":true} 
+{"id":00100,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00101,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00102,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00103,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00104,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00105,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00106,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00107,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00108,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00109,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00110,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00111,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00112,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00113,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00114,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00115,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00116,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00117,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00118,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00119,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00120,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00121,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00122,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00123,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00124,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00125,"type":"event","level":"INFO","msg":"user logged 
in","ok":true} +{"id":00126,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00127,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00128,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00129,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00130,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00131,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00132,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00133,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00134,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00135,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00136,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00137,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00138,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00139,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00140,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00141,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00142,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00143,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00144,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00145,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00146,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00147,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00148,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00149,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00150,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00151,"type":"event","level":"INFO","msg":"user 
logged in","ok":true} +{"id":00152,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00153,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00154,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00155,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00156,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00157,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00158,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00159,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00160,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00161,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00162,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00163,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00164,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00165,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00166,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00167,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00168,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00169,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00170,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00171,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00172,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00173,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00174,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00175,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00176,"type":"event","level":"INFO","msg":"user logged in","ok":true} 
+{"id":00177,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00178,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00179,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00180,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00181,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00182,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00183,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00184,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00185,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00186,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00187,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00188,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00189,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00190,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00191,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00192,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00193,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00194,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00195,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00196,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00197,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00198,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00199,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00200,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00201,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00202,"type":"event","level":"INFO","msg":"user logged 
in","ok":true} +{"id":00203,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00204,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00205,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00206,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00207,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00208,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00209,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00210,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00211,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00212,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00213,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00214,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00215,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00216,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00217,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00218,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00219,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00220,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00221,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00222,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00223,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00224,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00225,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00226,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00227,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00228,"type":"event","level":"INFO","msg":"user 
logged in","ok":true} +{"id":00229,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00230,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00231,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00232,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00233,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00234,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00235,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00236,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00237,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00238,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00239,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00240,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00241,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00242,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00243,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00244,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00245,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00246,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00247,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00248,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00249,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00250,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00251,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00252,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00253,"type":"event","level":"INFO","msg":"user logged in","ok":true} 
+{"id":00254,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00255,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00256,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00257,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00258,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00259,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00260,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00261,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00262,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00263,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00264,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00265,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00266,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00267,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00268,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00269,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00270,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00271,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00272,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00273,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00274,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00275,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00276,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00277,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00278,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00279,"type":"event","level":"INFO","msg":"user logged 
in","ok":true} +{"id":00280,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00281,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00282,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00283,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00284,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00285,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00286,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00287,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00288,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00289,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00290,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00291,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00292,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00293,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00294,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00295,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00296,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00297,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00298,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00299,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00300,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00301,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00302,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00303,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00304,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00305,"type":"event","level":"INFO","msg":"user 
logged in","ok":true} +{"id":00306,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00307,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00308,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00309,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00310,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00311,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00312,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00313,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00314,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00315,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00316,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00317,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00318,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00319,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00320,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00321,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00322,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00323,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00324,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00325,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00326,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00327,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00328,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00329,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00330,"type":"event","level":"INFO","msg":"user logged in","ok":true} 
+{"id":00331,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00332,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00333,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00334,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00335,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00336,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00337,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00338,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00339,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00340,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00341,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00342,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00343,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00344,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00345,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00346,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00347,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00348,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00349,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00350,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00351,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00352,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00353,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00354,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00355,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00356,"type":"event","level":"INFO","msg":"user logged 
in","ok":true} +{"id":00357,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00358,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00359,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00360,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00361,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00362,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00363,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00364,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00365,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00366,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00367,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00368,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00369,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00370,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00371,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00372,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00373,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00374,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00375,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00376,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00377,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00378,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00379,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00380,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00381,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00382,"type":"event","level":"INFO","msg":"user 
logged in","ok":true} +{"id":00383,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00384,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00385,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00386,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00387,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00388,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00389,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00390,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00391,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00392,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00393,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00394,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00395,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00396,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00397,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00398,"type":"event","level":"INFO","msg":"user logged in","ok":true} +{"id":00399,"type":"event","level":"INFO","msg":"user logged in","ok":true} diff --git a/conformance/valid/dict_embedded_seekable.zxc b/conformance/valid/dict_embedded_seekable.zxc new file mode 100644 index 0000000000000000000000000000000000000000..bb56eee5a201b43b887cbe029d32913dea273ef5 GIT binary patch literal 2832 zcmcJRPe>F|9LL|Tt7u6B8tNf+8A?gE-S^(?ycw5bh-57m>QJ3J=CX;K8@dWviLEvl zkuHIkK#)2~5_ISi#>y zpFV7E-+D9GH9`oXuf7s86(Qtu`R0whno;w!`N^`HP>XO;1byy&_5#T3(*+PS)A^}* zu~aDJ$KvCYP%cfVnex<3{?370>iC$Np=x7ZLn#|hH0)iRHt=1nVebO!amZx>e+Ls1 zuAR{XGit137TuJ3fwpo=wX`k_(`JM&aa*&~3wo@--?I8~D9&8HO#QqLy}C%&AU=|u zFIJmAM{DDLOR%=vyxSbC)zgO1jo?^}1}gzQkj{SD1~xk6oB=G(KuT6auEsj%q??kJ zLiIG&k(3ZraG~85wqcw0u)^rCanNB0BD;~H~6eAVMk$MpR<(`XA 
zhybHdDcsSwl6&_AfDJ)@KuD(`CBQyc2LVbQW7K4{!rC}<9LE@|`pwoOLMRqsupEmi z;JV*=P15WpSIKAu17XQPFjmZp1p)2XWx>zKbOaHKs7|r#Sb6grP|83)MjBZkwWO;h zC22dU6(%?n)fQz2#FAsWPEKp1!W7xI08=6gS*7K?hscD-4jFbgVFZ7DqjbCHy?Q96 z=Oph#^KU!O0>R^uD@e!hyqV;tnfBK$vM{`mSC^5bsMZ_;Xa4vg9RHh z1X*oqkW4$!b|V;93@;1WpTaWEw zJ#WY3h7;VV+K=OI*QwFkuvwF)ajrlITMj5C3}KDy;8E!G-W1t}==Ak(j=XHR`__~- z1;HN?^5~E({S1SyllPuIFy?`wv+*IHQAp6vP+>+bxdl(zR*k1MjweS&BYe?y-94k# zJO8U~i}?QN#BjZ6kVUU+SECsJ@4z_f^z43jzZ2UpR)Eqvz*+Bx3ENas>vA>LF>xcr zl)6JROwG_t3~p8(2#2lzm~c73Vv%K1ieSh-#BFS^K1+md Date: Mon, 1 Jun 2026 20:28:24 +0200 Subject: [PATCH 47/47] feat: Enhance dictionary training with full input and copy avoidance The `--auto-dict` option, which automatically trains and embeds a dictionary, now processes the entire input file instead of a capped 16MB prefix. This allows for the generation of more representative dictionaries. Additionally, the `zxc_train_dict` function is optimized to avoid unnecessary memory copies when training from a single input sample (e.g., an entire file). This improves performance and reduces memory usage for large training corpora. --- src/cli/main.c | 20 +++++++++++--------- src/lib/zxc_dict.c | 26 +++++++++++++++++--------- 2 files changed, 28 insertions(+), 18 deletions(-) diff --git a/src/cli/main.c b/src/cli/main.c index c4e4f340..d4ff5f41 100644 --- a/src/cli/main.c +++ b/src/cli/main.c @@ -737,9 +737,9 @@ static int zxc_list_archive(const char* path, int json_output) { return 0; } -// --auto-dict: train a dictionary from (a bounded prefix of) the input file and -// return it as a malloc'd buffer (caller frees), sized to the block size. The -// trained dict is meant to be embedded in the archive. Returns NULL on failure. +// --auto-dict: train a dictionary from the whole input file and return it as a +// malloc'd buffer (caller frees), sized to the block size. The trained dict is +// meant to be embedded in the archive. Returns NULL on failure. 
static void* cli_auto_train_dict(const char* path, size_t block_size, size_t* out_size) { *out_size = 0; FILE* f = fopen(path, "rb"); @@ -751,20 +751,22 @@ static void* cli_auto_train_dict(const char* path, size_t block_size, size_t* ou fclose(f); return NULL; } - /* Train on up to 16 MB of the input; the trainer samples across it. */ - const size_t cap = (size_t)16u << 20; - const size_t corpus_sz = ((size_t)fsz < cap) ? (size_t)fsz : cap; + /* Train on the whole file as a single sample (same as the external + * --train-dict workflow used to). The trainer samples k-gram frequencies + * representatively across the corpus internally (see ZXC_DICT_FREQ_SAMPLE_ + * TARGET), so the full input can be fed without analysing every byte. */ + const size_t corpus_sz = (size_t)fsz; uint8_t* corpus = (uint8_t*)malloc(corpus_sz); if (!corpus) { fclose(f); return NULL; } - const size_t got = fread(corpus, 1, corpus_sz, f); - fclose(f); - if (got != corpus_sz) { + if (fread(corpus, 1, corpus_sz, f) != corpus_sz) { free(corpus); + fclose(f); return NULL; } + fclose(f); size_t dict_cap = ZXC_DICT_SIZE_MAX; if (block_size > 0 && block_size < dict_cap) dict_cap = block_size; uint8_t* dict = (uint8_t*)malloc(dict_cap); diff --git a/src/lib/zxc_dict.c b/src/lib/zxc_dict.c index 9ba5e5b8..71a47309 100644 --- a/src/lib/zxc_dict.c +++ b/src/lib/zxc_dict.c @@ -129,26 +129,34 @@ int64_t zxc_train_dict(const void* const* samples, const size_t* sample_sizes, return ZXC_ERROR_NULL_INPUT; // LCOV_EXCL_LINE if (UNLIKELY(dict_capacity > ZXC_DICT_SIZE_MAX)) return ZXC_ERROR_DICT_TOO_LARGE; - /* Step 1: concatenate samples */ + /* Step 1: assemble the analysis corpus. The trainer only reads it, so a + * single sample is used in place (no copy) -- important since callers often + * pass one whole file, which can be hundreds of MB. Multiple samples are + * concatenated into an owned buffer (freed via corpus_owned at exit). 
*/ size_t corpus_size = 0; for (size_t i = 0; i < n_samples; i++) corpus_size += sample_sizes[i]; if (UNLIKELY(corpus_size < ZXC_DICT_KGRAM_LEN)) return ZXC_ERROR_SRC_TOO_SMALL; - uint8_t* corpus = (uint8_t*)ZXC_MALLOC(corpus_size); - if (UNLIKELY(!corpus)) return ZXC_ERROR_MEMORY; - { + const uint8_t* corpus; + uint8_t* corpus_owned = NULL; + if (n_samples == 1) { + corpus = (const uint8_t*)samples[0]; + } else { + corpus_owned = (uint8_t*)ZXC_MALLOC(corpus_size); + if (UNLIKELY(!corpus_owned)) return ZXC_ERROR_MEMORY; size_t pos = 0; for (size_t i = 0; i < n_samples; i++) { - if (sample_sizes[i] > 0) ZXC_MEMCPY(corpus + pos, samples[i], sample_sizes[i]); + if (sample_sizes[i] > 0) ZXC_MEMCPY(corpus_owned + pos, samples[i], sample_sizes[i]); pos += sample_sizes[i]; } + corpus = corpus_owned; } /* Step 2: count k-gram frequencies */ uint16_t* freq = (uint16_t*)ZXC_MALLOC(ZXC_DICT_HT_SIZE * sizeof(uint16_t)); if (UNLIKELY(!freq)) { // LCOV_EXCL_START - ZXC_FREE(corpus); + ZXC_FREE(corpus_owned); return ZXC_ERROR_MEMORY; // LCOV_EXCL_STOP } @@ -180,7 +188,7 @@ int64_t zxc_train_dict(const void* const* samples, const size_t* sample_sizes, if (UNLIKELY(!segs)) { // LCOV_EXCL_START ZXC_FREE(freq); - ZXC_FREE(corpus); + ZXC_FREE(corpus_owned); return ZXC_ERROR_MEMORY; // LCOV_EXCL_STOP } @@ -214,7 +222,7 @@ int64_t zxc_train_dict(const void* const* samples, const size_t* sample_sizes, ZXC_MEMCPY(dict_buf, corpus + corpus_size - copy, copy); ZXC_FREE(freq); ZXC_FREE(segs); - ZXC_FREE(corpus); + ZXC_FREE(corpus_owned); return (int64_t)copy; } @@ -278,6 +286,6 @@ int64_t zxc_train_dict(const void* const* samples, const size_t* sample_sizes, } ZXC_FREE(segs); - ZXC_FREE(corpus); + ZXC_FREE(corpus_owned); return (int64_t)filled; }