diff --git a/.gitignore b/.gitignore index c14a76d..3c39e5b 100644 --- a/.gitignore +++ b/.gitignore @@ -58,3 +58,6 @@ Cargo.toml.original.txt *.profraw *.profdata fuzz-*.log + +# Local-only dev dependency overrides (build against in-tree zen sources) +.cargo/ diff --git a/CHANGELOG.md b/CHANGELOG.md index 6b7f43a..9b1c631 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,157 @@ All notable changes to zencodec are documented here. ### Added +- **Cross-codec color-emission policy** — + `resolve_color_emit(&SourceColor, &EncodeCapabilities, ColorEmitPolicy) -> ColorEmitPlan`, + a pure `no_std` decision of which color carriers (ICC vs CICP) to write for a + target, with no CMS and no codec dependencies. The `color` module is private; + the types are re-exported at the crate root (`zencodec::ColorEmitPolicy`, …). + - `ColorEmitPolicy { Compatibility, Balanced (default), Compact, Verbatim, Custom(ColorEmitFields) }`; + `ColorEmitPlan { cicp: Option<Cicp>, icc: IccDisposition }`; + `IccDisposition { KeepSource, SynthesizeFrom(Cicp), Drop }`. Handles the + grayscale/CMYK terminal states and never emits a redundant `SynthesizeFrom(sRGB)`. + (Names carry the emit direction so they can't be confused with the decode-side + `SourceColor`.) + - `ColorEmitFields::new` makes `ColorEmitPolicy::Custom` constructible downstream. + - `EncodeCapabilities` gains `cicp_is_valid_carrier` (standardized carrier — + JXL/AVIF/HEIC `nclx`, PNG `cICP`) and `cicp_safe_sole_carrier` (safe CICP-only, + JXL) (+ `with_*`); `IccRetention` gains `DropIfCicpRepresentable`, + `DropIfCicpSafeSoleCarrier`. The plan lowers to `zenpixels_convert`'s + `finalize_for_output_with` (`icc_profile_for_primaries` materializes a + `SynthesizeFrom` from a `const fn` table — no CMS, never a silent drop). 
+ - `EncodePolicy` now bundles the output-emission policy: `color: + Option<ColorEmitPolicy>` and `metadata: Option<MetadataPolicy>` (+ `with_color`, + `with_metadata_policy`, `resolve_color`, `resolve_metadata`), so encode and + transcode select the color carrier and metadata retention through one object — + the codec reads `color`, the pipeline applies `metadata` via `Metadata::filtered`. + Its docs reframe the legacy `embed_*` flags as a coarse best-effort codec gate. + `MetadataPolicy` is now `Copy` so it can be bundled by value. + - `helpers::set_exif_orientation` rewrites a blob's EXIF orientation tag inline + (offset-preserving) so a baked-upright pixel buffer and its embedded tag can't + disagree (the double-rotation hazard). Applied by the pipeline, not by the + color resolver. + - `exif::ByteOrder` is module-scoped (a TIFF/EXIF header detail), not re-exported + at the crate root. + - Design + rejected alternatives: `docs/color-emit-model.md`. +- **EXIF string-field editing** — `Exif::set_copyright` / `set_artist` set (insert + or replace) the IFD0 rights tags, materialized through the existing canonical + `Exif::to_bytes` (offsets recomputed, byte-exact fixpoint preserved). The new + `exif::TextEncoding` (re-exported at the crate root) lets the caller pick the + TIFF field type explicitly: `Ascii` (Exif 2.x, type 2 — carries UTF-8 bytes + de-facto, most compatible) or `Utf8` (Exif 3.0 / CIPA DC-008-2023, type 129 — + spec-conformant Unicode, thinly read). Explicit over auto-upgrade because + auto-promoting non-ASCII to type 129 would silently produce strings most + readers can't parse. `Entry` value bytes are now `Cow` so parsed entries stay + zero-copy while edited ones are owned; the `copyright()` / `artist()` / + `*_bytes()` accessors now borrow `&self`. EXIF tag/type numbers in the parser + are named constants (no bare hex), and the `ExifPolicy` timestamps category is + `datetimes` (plural — it covers DateTime / Original / Digitized / OffsetTime* / + SubSecTime*). 
(f4b9f1b) +- **Explicit embed-time metadata policy on `Metadata`** — `Metadata` gains + `policy: Option<MetadataPolicy>` (**no implicit default** — privacy is an + explicit choice) and `with_policy()`. `Metadata::for_embedding()` returns + `Option<Metadata>`: the policy-filtered metadata once a policy is set, else + `None` — a codec embeds nothing (fail-safe: a forgotten policy strips, never + leaks). It's the hook a codec calls inside its existing `EncodeJob::with_metadata` + with no EXIF logic and no trait/signature change. Carried bytes stay untouched + until then (bring-your-own-EXIF-library round-trips still see originals); + `From<&ImageInfo>` sets `None`; `filtered()`'s output is `Some(PreserveExact)` so + re-embedding can't double-strip. `MetadataPolicy` has **no `Default`** — callers + name a policy explicitly (`Web` recommended). `EncodePolicy::strip_all` / + `preserve_all` carry a real `MetadataPolicy` through the reliable + `resolve_metadata` channel (`Custom(DISCARD_ALL)` / `PreserveExact`) instead of + the advisory `embed_*` flags that no-op on codecs without `with_policy`. + `Metadata` is `#[non_exhaustive]`; `size_of` 104 → 120 on 64-bit. (b832cdc, 73c5799) +- **EXIF privacy hardening for partial-strip policies** — `MakerNote` (0x927C) is + dropped whenever `gps` **or** `camera` is stripped (it can embed GPS/serials and + can't be selectively scrubbed); `SubIFDs` (0x014A, an unmodeled sub-IFD pointer) + is dropped on a rewrite rather than left dangling; IFD1 (thumbnail-directory) + entries are filtered by the same per-category rules as IFD0 (a keep-thumbnail + policy previously kept their Make/Model/DateTime); and `exif::retain` now fails + **safe** for a >4 GiB blob under a stripping policy (drop, not pass-through). The + `Web`/`ColorAndRotation` presets were already safe — these close gaps for + hand-rolled `Custom` policies. 
(d8a2fae) +- **From-scratch EXIF construction** — `Exif::new(TextEncoding)` (+ `Default`, + which uses `Ascii`) starts an empty little-endian tree, completing the + `parse`/`new` → edit → `to_bytes` flow so you can build a blob with no source: + `Exif::new(TextEncoding::Ascii)` → `set_copyright(…)` → `to_bytes()` (raw TIFF; + the codec adds the APP1 `Exif\0\0` framing). The `TextEncoding` is required — the + Exif 2.x ASCII (type 2) vs Exif 3.0 UTF-8 (type 129) choice is a blob property + used by `set_copyright`/`set_artist` (type 129 is read by almost nothing, so it + can't be a silent default). (b7acd9f, 73c5799) +- **`Metadata::with_copyright(&str)` / `with_artist(&str)`** — one-liner rights + stamping that builds an EXIF blob if there is none and merges into a parseable + existing one (keeping other tags), replacing an unparseable one. Written ASCII + (Exif 2.x, most compatible); for UTF-8/Exif 3.0 or other tags, build via + `exif::Exif` + `with_exif`. (1051288) + +## [0.1.21] - 2026-05-29 + +### Added + +- **Field-level metadata retention** — `Metadata::filtered(&MetadataPolicy)`, + the shared filter for re-encode / recompress pipelines: keep what a + downstream image needs, strip the rest, without callers hand-parsing EXIF. + - `MetadataPolicy`: `PreserveExact` (keep all, incl. a redundant sRGB ICC), + `Preserve` (keep all but drop a redundant sRGB ICC), `Web` (**default** — + ICC non-sRGB + EXIF orientation/rights + CICP/HDR; drop the rest of EXIF + and all XMP), `ColorAndRotation` (only what places pixels: ICC non-sRGB + + CICP/HDR + orientation), and `Custom(MetadataFields)`. + - `MetadataFields` (`#[non_exhaustive]`, `with_*` builders): `icc: + IccRetention` (`#[non_exhaustive]`; `Drop` / `KeepNonSrgb` / `Keep` — + three-way sRGB handling), `exif: ExifPolicy`, and `xmp` / `cicp` / `hdr: + Retention`. + - `exif::Retention` (`#[non_exhaustive]`; `Keep` / `Discard`, query via + `keeps`/`discards`) — explicit per-field intent, no `bool`-direction + ambiguity. 
+ - Every disposition type (`MetadataPolicy`, `IccRetention`, `Retention`) and + every record (`Metadata`, `MetadataFields`, `ExifPolicy`) is + `#[non_exhaustive]` with builder construction, so new policies, ICC modes, + EXIF categories, retention fields, and `Metadata` fields land additively — + the surface never needs a semver-major break (see the module's *Forward + compatibility* docs). +- **Structured EXIF** (`zencodec::exif`) — `Exif<'a>` parses a TIFF/EXIF blob + into a borrowing IFD tree (zero-copy; thumbnails/values are never copied), + `Exif::filtered(&ExifPolicy)` prunes by category, and `Exif::to_bytes` + re-serializes a valid TIFF with recomputed offsets. `ExifPolicy` + (`#[non_exhaustive]`, `with_*` builders) has seven categories: `orientation`, + `rights`, `thumbnail`, `gps`, `datetimes`, `camera`, `other` — so e.g. + "drop only the thumbnail" or "strip GPS" is one field. `exif::retain` is the + `Cow` entry point: borrows the source unchanged when nothing is dropped + (so `Metadata::filtered` is a cheap `Arc` clone), allocates only on a real + rewrite. Bounds-checked, no panics on untrusted input; preserves byte order + and `Exif\0\0` framing. (`helpers::parse_exif_orientation` now delegates + here.) 
+ - Hardened (adversarial review + 80M+ fuzz executions across four targets): + the serializer **deduplicates aliased out-of-line values** so a malformed + IFD pointing many entries at one blob can't amplify the rewrite ~1000× + (DoS); Copyright/Artist accessors read both **ASCII (type 2) and UTF-8 + (type 129, Exif 3.0)** per CIPA DC-008 (a UTF-8-typed field was previously + dropped as unknown), expose raw bytes (`copyright_bytes` / `artist_bytes`) + alongside the lossy-UTF-8 text view, and a pruning rewrite preserves field + bytes **and TIFF type** verbatim (never transcoded — neither corrupted nor + "corrected"); EXIF categories were corrected per the spec's tag tables — + the Exif-IFD creator/owner *name* tags (CameraOwnerName 0xA430, Photographer + 0xA437, ImageEditor 0xA438) are attribution (`rights`, kept by a copyright + policy — they were previously stripped as "other"), and firmware / editing- + software / unique-ID tags are device identity (`camera`); the thumbnail + length tag is read as SHORT *or* LONG (real cameras use SHORT — was silently + dropping valid thumbnails); + structural sub-IFD pointers too short to hold an offset are preserved + (peek-before-remove) instead of dropping the sub-IFD; and `retain` passes a + >4 GiB blob through untouched rather than risk `u32` offset truncation. + - Robust error model: `Exif::parse` returns `None` on structural failure but + **gracefully skips** an individual unreadable / unknown-type / out-of-bounds + entry (and salvages a truncated entry table) — one bad or future-typed + entry no longer discards the whole IFD; `retain` **fails safe** (drops EXIF + it can't parse under a stripping policy rather than leaking it through); and + `to_bytes` is **canonical** (a byte-exact fixpoint), so filtering is + idempotent (a fuzz-found non-idempotence, now a regression seed). 
+ - Test infrastructure: differential tests against `kamadak-exif` + (`tests/exif_differential.rs`), four libFuzzer targets (`fuzz/` — parse, + roundtrip, filter, and `Metadata::filtered`), a stable regression harness + with a committed crash seed (`tests/fuzz_regression.rs`), and a zero-copy + benchmark over 1 KiB–1 MiB thumbnails (`benches/exif_filter.rs`). - `ThreadingPolicy::resolve_thread_count()` — cross-codec shared helper that translates a [`ThreadingPolicy`] to the integer thread count that native-threaded encoder libraries (rav1e/ravif, dav1d/rav1d, libwebp, etc.) diff --git a/CLAUDE.md b/CLAUDE.md index e66d394..3c29e4b 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -66,4 +66,27 @@ Tiny, stable crate defining the common interface that all zen* codecs implement: ## Known Issues -(none) +Three bugs verified during the cross-codec color/metadata scenario-matrix +research (2026-06-01). The first is in this crate; the other two are recorded +here as cross-repo findings (do NOT edit those repos from here — flag to the +owner). Full design context: [`docs/color-emit-model.md`](docs/color-emit-model.md). + +1. **Double-rotation hazard — FIXED (this crate, `src/metadata.rs`).** When a + decoder bakes orientation upright it sets `Metadata::orientation = Identity` + while the EXIF blob still carries the original `Orientation` tag (e.g. `6`); a + consumer that re-applied the tag would rotate twice. `Metadata::filtered` now + reconciles them — it rewrites the embedded tag to match the authoritative + `orientation` field via `helpers::set_exif_orientation` (offset-preserving, + fires only on a mismatch so the matched case keeps the zero-copy `Arc` clone). + Regression: `filtered_reconciles_baked_orientation_tag`. + +2. **AVIF descriptor-CICP override (zenavif, `src/codec.rs:824-831`).** + `apply_descriptor_color` overrides a metadata-set CICP unconditionally, + ignoring a CICP explicitly provided via `Metadata`. 
It should check for a + caller-supplied CICP before overriding from the pixel descriptor. + +3. **Missing signal-range conversion kernels (zenpixels-convert).** No + `Narrow <-> Full` range conversion kernels exist, so a range mismatch refuses + zero-copy but can relabel without rescaling — a black-crush risk. Needs + `ConvertStep::{Expand,Contract}NarrowToFull`. Until then, range must be + preserved verbatim, never relabeled. diff --git a/Cargo.toml b/Cargo.toml index 8848e9a..9f05f97 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "zencodec" -version = "0.1.20" +version = "0.1.21" edition = "2024" rust-version = "1.88" license = "Apache-2.0 OR MIT" @@ -21,13 +21,24 @@ include = [ name = "zencodec" [dependencies] +# Published manifest uses crates.io versions (CI + `cargo publish` build the +# real artifact). Local development builds against the in-tree zen sources via +# a gitignored `.cargo/config.toml` `paths` override — see CONTRIBUTING note. zenpixels = { version = "0.2.10", features = ["icc"] } almost-enough = { version = "0.4.4", default-features = false, features = ["alloc"] } enough = "0.4.4" -whereat = { version = "0.1.5"} +whereat = { version = "0.1.5" } [dev-dependencies] +# Differential-test oracle for the EXIF parser (tests/exif_differential.rs). +# Pure-Rust, BSD-2-Clause; only built for tests, never shipped. +kamadak-exif = "0.6.1" +zenbench = "0.1.8" thiserror = "2" walkdir = "2.5.0" rayon = "1.10.0" moxcms = { version = "0.8.1", features = ["options"] } + +[[bench]] +name = "exif_filter" +harness = false diff --git a/README.md b/README.md index 8c42418..705459d 100644 --- a/README.md +++ b/README.md @@ -77,6 +77,35 @@ let pixels = decoded.into_buffer(); **Pixel types from `zenpixels`.** All pixel interchange types (`PixelSlice`, `PixelBuffer`, `PixelDescriptor`, etc.) are defined in the `zenpixels` crate. All zen\* crates depend on `zenpixels` directly. 
+## Metadata Retention + +Re-encode and recompress pipelines need to decide what metadata survives. `Metadata::filtered` applies a `MetadataPolicy`, so callers never hand-parse EXIF: + +```rust,ignore +use zencodec::{MetadataPolicy, MetadataFields, IccRetention, exif::{ExifPolicy, Retention}}; + +// Decode → filter → re-encode. `Web` (the recommended preset) keeps the ICC profile +// (unless a redundant sRGB), EXIF orientation + rights, and CICP/HDR color +// signaling — and strips GPS, timestamps, camera info, thumbnail, and XMP. +let kept = decoded_meta.filtered(&MetadataPolicy::Web); + +// Presets: PreserveExact (keep all, incl. duplicate sRGB), Preserve (drop dup +// sRGB), Web, ColorAndRotation (only what places pixels), Custom. +let minimal = decoded_meta.filtered(&MetadataPolicy::ColorAndRotation); + +// Per-field control — drop only the thumbnail, keep everything else: +let policy = MetadataPolicy::Custom( + MetadataFields::KEEP_ALL.with_exif(ExifPolicy::KEEP_ALL.with_thumbnail(Retention::Discard)), +); +let no_thumb = decoded_meta.filtered(&policy); +``` + +`MetadataFields` encapsulates EXIF in an `ExifPolicy` with seven keep/discard categories — `orientation`, `rights`, `thumbnail`, `gps`, `datetimes`, `camera`, `other` — and three-way ICC handling (`IccRetention::{Drop, KeepNonSrgb, Keep}`). EXIF passes through byte-unchanged (zero-copy) when no category is dropped, and is rewritten — offsets recomputed — only when pruning. CICP/HDR are color *signaling* (dropping them changes displayed pixels), so the presets keep them; a `Custom` policy can drop them. The structured parser/editor is public as [`zencodec::exif::Exif`](https://docs.rs/zencodec) (`parse` → `filtered`/edit → `to_bytes`) for direct EXIF work — including setting Copyright/Artist (`set_copyright` / `set_artist`, with a `TextEncoding` choice of Exif 2.x ASCII or Exif 3.0 UTF-8). 
+ +**Privacy is an explicit choice.** `Metadata` carries an `Option<MetadataPolicy>` — there's **no implicit default**: you choose retention with `with_policy(MetadataPolicy::Web)` (privacy-safe) or `PreserveExact` (verbatim). `Metadata::for_embedding()` returns `Option<Metadata>` — the filtered metadata a codec embeds, or `None` when no policy was chosen, which a codec treats as "embed nothing." So a forgotten policy **strips, never leaks**. A codec calls it inside its existing `with_metadata` (no trait change). The carried bytes stay untouched until embed, so you can still pull `metadata.exif` out, edit it with any EXIF library, and put it back via `with_exif`. + +To **stamp** rights in one line, call `Metadata::none().with_copyright("© 2026 You")`, which builds (or merges into) the EXIF blob (ASCII). To build it directly, use `Exif::new(TextEncoding::Ascii).set_copyright(…)` → `to_bytes()`; `Exif::new` requires the Exif 2.x-vs-3.0 field-type choice (type 129 is read by almost nothing, so it's never a silent default). + ## What's in this crate | Module | Contents | @@ -84,8 +113,9 @@ let pixels = decoded.into_buffer(); | `zencodec::encode` | `EncoderConfig`, `EncodeJob`, `Encoder`, `AnimationFrameEncoder`, `EncodeOutput`, `EncodeCapabilities`, `EncodePolicy`, `best_encode_format`, dyn dispatch traits (`DynEncoderConfig`, `DynEncodeJob`, `DynEncoder`, `DynAnimationFrameEncoder`) | | `zencodec::decode` | `DecoderConfig`, `DecodeJob`, `Decode`, `StreamingDecode`, `AnimationFrameDecoder`, `DecodeOutput`, `DecodeCapabilities`, `DecodePolicy`, `DecodeRowSink`, `SinkError`, `OutputInfo`, `SourceEncodingDetails`, `negotiate_pixel_format`, `is_format_available`, dyn dispatch traits (`DynDecoderConfig`, `DynDecodeJob`, `DynDecoder`, `DynStreamingDecoder`, `DynAnimationFrameDecoder`) | | `zencodec::gainmap` | `GainMapInfo`, `GainMapParams`, `GainMapChannel`, `GainMapDirection`, `GainMapPresence`, `Iso21496Format` (with variants `JxlJhgm`, `AvifTmap`, `JpegApp2BodyWithUrn`), `ISO_21496_1_URN`, 
`ISO_21496_1_PRIMARY_APP2_BODY`, `serialize_iso21496_fmt` / `serialize_iso21496_fmt_into` / `parse_iso21496_fmt`, `GainMapParseError` — cross-codec gain map types and wire-format helpers (ISO 21496-1) | -| `zencodec::helpers` | Codec implementation helpers (not consumer API) — shared boilerplate for trait implementors | -| root | `ImageFormat`, `ImageFormatDefinition`, `ImageFormatRegistry` (format detection via `ImageFormatRegistry::detect()`), `ImageInfo`, `Metadata`, `Orientation`, `OrientationHint`, `ResourceLimits`, `LimitExceeded`, `ThreadingPolicy`, `UnsupportedOperation`, `CodecErrorExt`, `find_cause`, `Unsupported`, `Extensions`, `AnimationFrame`, `OwnedAnimationFrame`, `Cicp`, `ContentLightLevel`, `MasteringDisplay`, `StopToken`, `Unstoppable` | +| `zencodec::exif` | Structured EXIF/TIFF: `Exif` (borrowing parse → prune → serialize), `ExifPolicy` (7 keep/discard categories), `Retention`, `ByteOrder`, `retain` | +| `zencodec::helpers` | Codec implementation helpers (not consumer API) — shared boilerplate for trait implementors, plus the lightweight `parse_exif_orientation` accessor | +| root | `ImageFormat`, `ImageFormatDefinition`, `ImageFormatRegistry` (format detection via `ImageFormatRegistry::detect()`), `ImageInfo`, `Metadata`, `MetadataPolicy`, `MetadataFields`, `IccRetention`, `Exif`, `ExifPolicy`, `Retention`, `ByteOrder`, `Orientation`, `OrientationHint`, `ResourceLimits`, `LimitExceeded`, `ThreadingPolicy`, `UnsupportedOperation`, `CodecErrorExt`, `find_cause`, `Unsupported`, `Extensions`, `AnimationFrame`, `OwnedAnimationFrame`, `Cicp`, `ContentLightLevel`, `MasteringDisplay`, `StopToken`, `Unstoppable` | zencodec has no feature flags. The full API is always available. diff --git a/benches/exif_filter.rs b/benches/exif_filter.rs new file mode 100644 index 0000000..0ddb812 --- /dev/null +++ b/benches/exif_filter.rs @@ -0,0 +1,85 @@ +//! EXIF filter benchmark — validates the zero-copy claim as thumbnail size +//! grows (1 KB → 1 MB). +//! +//! 
The point: a passthrough filter (`retain` with a keep-everything policy) +//! borrows the source and must NOT scale with thumbnail size, while a pruning +//! rewrite (and `to_bytes`) copies the thumbnail and scales linearly. If +//! `retain_passthrough` ever tracks thumbnail size, the zero-copy `Cow` +//! contract has regressed. +//! +//! Run: `cargo bench --bench exif_filter` + +use zenbench::prelude::*; + +use zencodec::exif::{Exif, ExifPolicy, Retention, retain}; + +/// A little-endian EXIF blob: IFD0 (orientation) → IFD1 (JPEG thumbnail of +/// `thumb_len` bytes). Thumbnail bytes are synthesized at runtime (never +/// committed). +fn exif_with_thumbnail(thumb_len: usize) -> Vec { + let mut v = vec![b'I', b'I', 0x2A, 0x00]; + v.extend_from_slice(&8u32.to_le_bytes()); + // IFD0 @8: 1 entry (orientation), next → IFD1 @26. + v.extend_from_slice(&1u16.to_le_bytes()); + v.extend_from_slice(&0x0112u16.to_le_bytes()); // Orientation + v.extend_from_slice(&3u16.to_le_bytes()); // SHORT + v.extend_from_slice(&1u32.to_le_bytes()); + v.extend_from_slice(&[6, 0, 0, 0]); // Rotate90 + v.extend_from_slice(&26u32.to_le_bytes()); // next = IFD1 + // IFD1 @26: 0x0201 (offset=56), 0x0202 (length), next=0. + v.extend_from_slice(&2u16.to_le_bytes()); + v.extend_from_slice(&0x0201u16.to_le_bytes()); + v.extend_from_slice(&4u16.to_le_bytes()); // LONG + v.extend_from_slice(&1u32.to_le_bytes()); + v.extend_from_slice(&56u32.to_le_bytes()); // thumbnail offset + v.extend_from_slice(&0x0202u16.to_le_bytes()); + v.extend_from_slice(&4u16.to_le_bytes()); // LONG + v.extend_from_slice(&1u32.to_le_bytes()); + v.extend_from_slice(&(thumb_len as u32).to_le_bytes()); + v.extend_from_slice(&0u32.to_le_bytes()); // next IFD + // Thumbnail @56. 
+ v.extend(core::iter::repeat_n(0xABu8, thumb_len)); + v +} + +fn build_group(suite: &mut Suite, thumb_len: usize, label: &'static str) { + let blob = exif_with_thumbnail(thumb_len); + // A pruning policy that KEEPS the thumbnail but drops GPS → forces a + // rewrite that must copy the thumbnail. + let prune = ExifPolicy::KEEP_ALL.with_gps(Retention::Discard); + + suite.group(label, move |g| { + g.throughput(Throughput::Bytes(thumb_len as u64)); + g.throughput_unit("thumb-byte"); + + let b1 = blob.clone(); + g.bench("parse", move |b| { + b.iter(|| zenbench::black_box(Exif::parse(&b1))) + }); + + // Zero-copy passthrough: should be flat across thumbnail size. + let b2 = blob.clone(); + g.bench("retain_passthrough", move |b| { + b.iter(|| zenbench::black_box(retain(&b2, &ExifPolicy::KEEP_ALL))) + }); + + // Pruning rewrite: copies the thumbnail → scales with size. + let b3 = blob.clone(); + g.bench("retain_prune_rewrite", move |b| { + b.iter(|| zenbench::black_box(retain(&b3, &prune))) + }); + + let b4 = blob.clone(); + g.bench("parse_then_to_bytes", move |b| { + b.iter(|| zenbench::black_box(Exif::parse(&b4).map(|x| x.to_bytes()))) + }); + }); +} + +fn bench_exif_filter(suite: &mut Suite) { + build_group(suite, 1 << 10, "thumb_1KiB"); + build_group(suite, 64 << 10, "thumb_64KiB"); + build_group(suite, 1 << 20, "thumb_1MiB"); +} + +zenbench::main!(bench_exif_filter); diff --git a/benchmarks/exif_filter_2026-05-29.txt b/benchmarks/exif_filter_2026-05-29.txt new file mode 100644 index 0000000..99085bd --- /dev/null +++ b/benchmarks/exif_filter_2026-05-29.txt @@ -0,0 +1,57 @@ +# zencodec EXIF filter benchmark — zero-copy validation +# Date: 2026-05-29 Branch: feat/metadata-policy (PR #17) +# Host: AMD Ryzen 9 7950X, WSL2 (no -C target-cpu=native) +# Command: cargo bench --bench exif_filter +# Takeaway: parse + retain(passthrough) are FLAT in thumbnail size +# (zero-copy borrow); only an actual prune-rewrite / to_bytes copies +# the thumbnail and scales linearly. 
retain_passthrough: 6.2ns @1KiB, +# 6.2ns @64KiB, 13ns @1MiB. rewrite: 282ns / 1083ns / 42447ns. + +═══════════════════════════════════════════════════════════════ + zenbench 1780106647-1dce52 + git: a7836166ecf9a58abe3c572c67dee764d33af93d +═══════════════════════════════════════════════════════════════ + + thumb_1KiB 4 rounds × 3K calls ⚠ only 4 rounds + mean ±mad ns 95% CI vs base iB/s + ├─ parse 48.2 ±1.4ns [47.0–49.3]ns 19.8G + ├─ retain_passthrough 6.2 ±0.1ns [-89.8%–-84.6%] 154G + ├─ retain_prune_rewrite 282.3 ±23.8ns [+425.3%–+565.2%] 3.38G [1] + ╰─ parse_then_to_bytes 247.2 ±17.5ns [+387.8%–+437.9%] 3.86G + + retain_passthrough █████████████████████████████████████████████ 154 GiB/s + parse ██████ 19.8 GiB/s + parse_then_to_bytes █ 3.86 GiB/s + retain_prune_rewrite █ 3.38 GiB/s + [1] drift r=-0.80 — later rounds faster + + thumb_64KiB 4 rounds × 956 calls ⚠ only 4 rounds + mean ±mad ns 95% CI vs base iB/s + ├─ parse 47.4 ±0.2ns [46.8–48.3]ns 1288G + ├─ retain_passthrough 6.2 ±0.0ns [-87.3%–-86.1%] 9842G [1] + ├─ retain_prune_rewrite 1083.2 ±3.6ns [+2165.3%–+2217.8%] 56.3G + ╰─ parse_then_to_bytes 1062.2 ±13.7ns [+2141.8%–+2176.6%] 57.5G + + retain_passthrough █████████████████████████████████████████████ 9842 GiB/s + parse ██████ 1288 GiB/s + parse_then_to_bytes █ 57.5 GiB/s + retain_prune_rewrite █ 56.3 GiB/s + [1] drift r=1.00 — later rounds slower + + thumb_1MiB 4 rounds × 26 calls ⚠ only 4 rounds + mean ±mad ns 95% CI vs base iB/s + ├─ parse 76.6 ±8.2ns [61.7–100.9]ns 12748G [1] + ├─ retain_passthrough 13.4 ±0.5ns [-83.7%–-74.6%] 73091G [2] + ├─ retain_prune_rewrite 42447.0 ±1242.0ns [+53219.1%–+58015.0%] 23.0G + ╰─ parse_then_to_bytes 42638.1 ±1893.0ns [+52586.9%–+59165.3%] 22.9G + + retain_passthrough ████████████████████████████████████████████ 73091 GiB/s + parse ████████ 12748 GiB/s + retain_prune_rewrite █ 23.0 GiB/s + parse_then_to_bytes █ 22.9 GiB/s + [1] CV=32% + [2] CV=38% + + total: 367.2s (353 noisy rounds) 
+═══════════════════════════════════════════════════════════════ + filter: cargo bench -- --group=NAME format: --format=llm|csv|md|json diff --git a/docs/color-emit-model.md b/docs/color-emit-model.md new file mode 100644 index 0000000..cd204a2 --- /dev/null +++ b/docs/color-emit-model.md @@ -0,0 +1,132 @@ +# Color emission model (grounded design) + +Status: **canonical.** This records the *minimal* shared color surface and — just +as importantly — the designs that were tried, dogfooded, adversarially reviewed, +and **rejected**, so they don't get rebuilt. Companion analysis: +[`cross-codec-color-metadata.md`](cross-codec-color-metadata.md). + +## Thesis + +The only thing that genuinely needs to be **shared** across codecs is a *pure +color-carrier policy*: given a source's color (`SourceColor`) and a target's +capabilities (`EncodeCapabilities`), decide which carriers to write (ICC vs +CICP). Everything else — pixel+metadata materialization, specialized +coefficient-domain transcodes, the decode→re-encode orchestration — already has +a home and must **not** be pulled into a grand "emit model" or a cross-codec +trait. + +This was reached the hard way: an over-built `EmitFacts`/`EmitIntent`/`EmitPlan` +"scenario" model + a `TranscodeEncoder` trait were dogfooded into 5 codecs and +adversarially reviewed; the review + a full read of zenpixels/zenpipe killed +them (see *Rejected designs*). The grounded surface is ~360 lines with **zero +codec dependencies**. 
+ +## The shared surface — `zencodec::color` + +```rust +pub fn resolve_color_emit( + src: &SourceColor, // what the source file signalled (cicp / icc / channel_count) + target: &EncodeCapabilities, // which carriers the target format has + their quality + policy: ColorEmitPolicy, +) -> ColorEmitPlan; // { cicp: Option<Cicp>, icc: IccDisposition } + +pub enum ColorEmitPolicy { Compatibility, Balanced /*default*/, Compact, Verbatim, Custom(ColorEmitFields) } +pub enum IccDisposition { KeepSource, SynthesizeFrom(Cicp), Drop } +pub struct ColorEmitFields { icc: IccRetention, cicp: CicpEmission } // ::new(icc, cicp) +pub enum CicpEmission { WhereValidCarrier /*default*/, WhereverSupported, Never } +``` + +Pure, `no_std`, **no CMS, no codec deps**. It emits a *plan*; the bytes are +materialized one layer up. `SourceColor` is the type the pipeline actually +produces (decode → `ImageInfo.source_color`; the bridge to encode is a flat +`Metadata`). The resolver also handles the grayscale/CMYK terminal states +(suppress CICP, keep ICC) and never emits a redundant `SynthesizeFrom(sRGB)`. + +### Capabilities (three flags drive it) + +- `cicp()` — has a CICP carrier slot at all. +- `cicp_is_valid_carrier()` — the carrier is standardized/honored, so CICP is + emitted by default (JXL enum, AVIF/HEIC `nclx`, **PNG `cICP`**). Distinct from + authority — PNG isn't the decode authority but is a valid carrier. +- `cicp_safe_sole_carrier()` — safe to ship CICP-only and drop the ICC (JXL only; + AVIF/HEIC/PNG keep the ICC alongside). + +## Lowering the plan (where the bytes happen) + +A codec or the pipeline lowers `ColorEmitPlan` to bytes through **zenpixels-convert's +`finalize_for_output_with`** — which already converts pixels *and* emits matching +`OutputMetadata` atomically (pixels and embedded color cannot diverge): + +- `ColorEmitPlan.cicp` → the format's native CICP carrier. +- `IccDisposition::KeepSource` → `OutputProfile::SameAsOrigin` (re-embed source ICC). 
+- `IccDisposition::SynthesizeFrom(cicp)` → `zenpixels_convert::icc_profile_for_primaries` + (a `const fn` table of bundled profiles — **no CMS, no allocation**; returns + `None` for BT.709/sRGB so the assumed default is never embedded). +- `IccDisposition::Drop` → no ICC. + +So "synthesize an ICC" can never silently lose color and never needs a CMS in the +codec — it's a table lookup. + +## Orientation (separate, tiny) + +The double-rotation hazard (a decoder bakes orientation upright but the embedded +EXIF blob still says `Rotate90`) is closed by +`helpers::set_exif_orientation(blob, value)` — an offset-preserving inline rewrite +of the 0x0112 tag. It's applied by the **pipeline**, which knows when it baked +orientation. It is *not* part of color policy and not a "unified plan". + +## Transcodes (pairwise, self-contained — not shared) + +Specialized lossless/coefficient transcodes are **not** a generic capability: + +- **JPEG → JPEG** (orient / recompress): entirely inside zenjpeg + (`zenjpeg::lossless`, `zenjpeg::recompress`). +- **JPEG → JXL** (lossless embed): inside jxl-encoder via jbrd (its own + `JpegData` parser — the JXL spec's recompression feature, **needs no zenjpeg**). + +These preserve metadata verbatim, so they don't even call the color resolver. +The set of real pairs is tiny and well-known. The **dispatch** belongs in +**zenpipe**, which already depends on every codec — a small finite table of known +pairs calling those functions directly, plus `resolve_color_emit` on the +decode→re-encode path. No codec ever learns about another. + +zenpipe already has the sketch: `try_lossless_jpeg` (in `lossless.rs`, currently +only called from tests) is the precedent to wire and generalize. **That's a later +piece**, tracked separately. 
+ +## Rejected designs (do not rebuild) + +- **`EmitFacts { Fresh | Decoded | Passthrough }` + `PixelFidelity`** — nothing in + the pipeline produces a `ColorOrigin`/fidelity: decode attaches no color to the + buffer; provenance lives in `ImageInfo.source_color` and the carrier is a flat + `Metadata`. A codec `Encoder` only ever sees `with_metadata(Metadata)`, so it + could only ever build `Fresh` — the scenario machinery was dead code. The + `PixelDescriptor` already *is* the current gamut, so deriving `Reauthored` was + redundant. `resolve_color_emit(&SourceColor, …)` takes the type that flows. +- **`TranscodeEncoder` trait in zencodec** — a generic "output codec transcodes + from source-format X" trait forces every output codec to *ingest* every input + format (JXL←JPEG needs JPEG parsing; PNG←? needs zenpng; …) → **every codec + depends on every other codec**. The real pairs are ~2 and each self-contained. + zenpipe (deps-all) dispatches; no trait. +- **`EmitIntent` unifying color + metadata + orientation into one knob** — + aesthetic, not grounded. `MetadataPolicy` (#17) and `ColorEmitPolicy` are fine + apart; orientation is a one-helper correctness fix, not a policy axis. +- **A resolver that produces final `Metadata` bytes** — a third metadata producer + alongside `Metadata::filtered` and `OutputMetadata`. Atomicity is already + `finalize_for_output_with`'s job. + +## What landed (the surviving red-team fixes) + +The 5-codec dogfood + adversarial review found real defects; the ones that +survived the grounding, all small and all on the `resolve_color_emit` shape: + +1. `ColorEmitFields::new` / `CicpEmission` are constructible → `ColorEmitPolicy::Custom` + is actually reachable downstream. +2. `cicp_is_valid_carrier` tier → PNG/WebP emit cICP under Balanced instead of + laundering wide-gamut color through a synthesized ICC. +3. No redundant `SynthesizeFrom(sRGB)` (the canned table returns `None` for sRGB). +4. 
`set_exif_orientation` for the double-rotation hazard. + +The `SynthesizeFrom`-silently-drops-color critical dissolves under lowering — +`icc_profile_for_primaries` always materializes a non-sRGB profile, never a CMS, +never a silent drop. diff --git a/docs/spec.md b/docs/spec.md index 37a5643..42eca38 100644 --- a/docs/spec.md +++ b/docs/spec.md @@ -456,10 +456,148 @@ Non-color metadata blobs. Fields: `exif: Option>`, `xmp: Option> Owned metadata for encode/decode roundtrip. Fields: `icc_profile`, `exif`, `xmp` (`Option<Arc<[u8]>>`), `cicp`, `content_light_level`, `mastering_display` (Copy), -`orientation`. `#[non_exhaustive]`. - -Methods: builder pattern (`with_icc()`, etc.), `transfer_function()`, -`color_primaries()`, `is_empty()`. `From<&ImageInfo>` conversion. +`orientation`, and `policy: Option<MetadataPolicy>`. `#[non_exhaustive]`. + +Methods: builder pattern (`with_icc()`, `with_policy()`, etc.), +`with_copyright(&str)` / `with_artist(&str)` (build-or-merge the rights tag into +the EXIF blob, ASCII), `transfer_function()`, `color_primaries()`, `is_empty()`, +`filtered(&MetadataPolicy) -> Metadata`, `for_embedding() -> Option<Metadata>`. +`From<&ImageInfo>` conversion. + +**Embed-time policy (explicit privacy decision).** `policy: Option<MetadataPolicy>` +carries *intent only* — the raw `exif`/`xmp`/`icc_profile` bytes are untouched (so +an inspect / bring-your-own EXIF-library round-trip still sees the originals) until +a codec materializes them. There is **no implicit default**: `policy` is `None` +until the caller chooses one via `with_policy(…)`. `Metadata::for_embedding()` +returns `Option<Metadata>` — `Some(self.filtered(&p))` once a policy `p` is set, +else `None`, which a codec treats as "embed nothing." That's fail-safe: a +forgotten policy **strips, never leaks**. It's the hook a codec calls inside its +existing `EncodeJob::with_metadata` (works for every codec; no trait/signature +change). `filtered()`'s output is marked `Some(PreserveExact)`, so re-embedding +never double-strips.
`Web` is the recommended privacy-safe choice; +`with_policy(MetadataPolicy::PreserveExact)` embeds verbatim. + +### `MetadataPolicy` / `MetadataFields` / `IccRetention` + +Field-level retention policy for `Metadata::filtered()` — the shared metadata +filter for re-encode / recompress pipelines. + +`MetadataPolicy` (`#[non_exhaustive]`, **no `Default`** — retention is a privacy +decision the caller must make explicitly; `Web` is the recommended choice): +- `PreserveExact` — keep everything, byte-faithfully (incl. a redundant sRGB ICC). +- `Preserve` — keep everything, but drop a redundant sRGB ICC. +- `Web` (recommended) — ICC (unless redundant sRGB) + EXIF orientation/rights + + CICP/HDR; drop the rest of EXIF (GPS, timestamps, camera, thumbnail) and XMP. +- `ColorAndRotation` — only what places pixels: ICC (non-sRGB) + CICP/HDR + + EXIF orientation. Drops attribution, XMP, other EXIF. +- `Custom(MetadataFields)` — explicit per-field control. + +`MetadataFields` (`Copy`, `#[non_exhaustive]`, `with_*` builders + `KEEP_ALL` / +`DISCARD_ALL` consts): `icc: IccRetention`, `exif: ExifPolicy`, and `xmp` / +`cicp` / `hdr: Retention`. `MetadataPolicy::fields()` resolves a policy. + +`IccRetention`: `Drop` / `KeepNonSrgb` (drop only a redundant sRGB, +`zenpixels::icc::is_common_srgb`) / `Keep` (byte-faithful). + +CICP / HDR are color *signaling* (dropping them changes displayed pixels), so +the presets keep them; only a `Custom` policy can drop them. Gain maps are not +part of `Metadata` (they live at the encode-request layer) and are unaffected. + +### `exif::Exif` / `ExifPolicy` / `Retention` + +Structured EXIF model (`zencodec::exif`). `Exif<'a>` (`parse` **or** `new` → +`filtered` / edit → `to_bytes`) borrows the source — entry values and the +thumbnail are never copied (entry values are `Cow`, borrowed on parse, owned when +injected by an edit).
`Exif::new(TextEncoding)` (and `Default`, which uses `Ascii`) +starts an empty little-endian tree for building from scratch — e.g. stamp a +Copyright on an image that had no EXIF: `Exif::new(TextEncoding::Ascii)` → +`set_copyright(…)` → `to_bytes()` (raw TIFF; the codec adds the APP1 `Exif\0\0` +framing). The `TextEncoding` is **required** at `new` — it's the Exif 2.x ASCII +(type 2) vs Exif 3.0 UTF-8 (type 129) compat choice, a blob property used by all +string writes (type 129 is read by almost nothing today, so it can't be a silent +default). Read accessors: `orientation()`, `copyright()` / `artist()` (lossy-UTF-8 +text *view*, borrowing `&self`), `copyright_bytes()` / `artist_bytes()` (raw +field bytes), `has_thumbnail()`, `has_gps()`. Edit accessors: `set_copyright(&str)` +/ `set_artist(&str)` insert-or-replace the IFD0 tag using the blob's `TextEncoding` +(materialized on the next `to_bytes`). `to_bytes()` re-serializes a valid TIFF +with recomputed offsets, preserving byte order and `Exif\0\0` framing; it is a +byte-exact fixpoint, so filtering and editing stay idempotent. + +`TextEncoding` (`#[non_exhaustive]`) — the EXIF text convention a write uses: +`Ascii` (Exif 2.x, TIFF type 2; carries UTF-8 bytes de-facto — most compatible, +the recommended default) or `Utf8` (Exif 3.0 / CIPA DC-008-2023, TIFF type 129; +spec-conformant Unicode, thin reader support). Both write the same UTF-8 bytes, +NUL-terminated; they differ only in the declared TIFF type. Re-exported at the +crate root. + +Encoding (read side): Copyright/Artist may be ASCII (type 2, 7-bit) **or UTF-8 +(type 129, Exif 3.0)**; non-ASCII bytes stuffed into a type-2 field are the +non-conformant-but-common case. zencodec reads both — `copyright` / `artist` +give a lossy-UTF-8 display view, `*_bytes` give the exact bytes. A pruning +rewrite **never transcodes**: it preserves the value bytes **and TIFF type** +verbatim (a field is neither corrupted nor "corrected"). 
Writing is the only +path that mints new bytes, and the caller picks the type via `TextEncoding`. + +`ExifPolicy` (`Copy`, `#[non_exhaustive]`, `with_*` builders) — seven keep/drop +categories of `Retention`: `orientation`, `rights` (copyright + artist), +`thumbnail`, `gps`, `datetimes`, `camera`, `other`. Consts: `KEEP_ALL`, +`DISCARD_ALL`, `ATTRIBUTED_ORIENTATION`, `ORIENTATION_ONLY`. + +`Retention` (`Keep` / `Discard`) — explicit per-field intent. + +`exif::retain(&[u8], &ExifPolicy) -> Option<Cow<'_, [u8]>>` — `Cow::Borrowed` when +nothing is dropped (so `Metadata::filtered` is a cheap `Arc` clone), +`Cow::Owned` on a rewrite, `None` when all EXIF is discarded. + +`helpers::parse_exif_orientation` is a lightweight orientation accessor that +delegates here. Limitation: a partial rewrite that *keeps* `MakerNote` (0x927C) +relocates it without fixing its maker-specific internal offsets — keep all EXIF +(no prune) for byte-exact MakerNote. + +Privacy (partial-strip policies): `MakerNote` is dropped whenever `gps` **or** +`camera` is stripped (it's opaque and can embed GPS/serials); `SubIFDs` (0x014A, +an unmodeled sub-IFD pointer) is dropped on a rewrite rather than left dangling; +IFD1 (thumbnail directory) entries are filtered by the same per-category rules as +IFD0, so a keep-thumbnail policy doesn't leak the Make/Model/DateTime it carries. +The `Web`/`ColorAndRotation` presets drop `gps`/`camera`/`thumbnail`/`other`, so +they were already safe; these close the gaps for hand-rolled `Custom` policies. +Cross-carrier caveat: XMP can duplicate GPS/identity — a policy that keeps XMP +ships it even when the EXIF copy is stripped.
+ +Hardening: bounds-checked, no panics on untrusted input (32M+ fuzz executions); +the serializer dedups aliased out-of-line values to prevent rewrite +memory-amplification; ASCII accessors require the ASCII/UTF-8 TIFF type; thumbnail +length is read as SHORT or LONG; under a stripping policy `retain` fails **safe** +— unparseable or >4 GiB blobs are dropped, never passed through unfiltered. +Validated by differential tests vs `kamadak-exif`, libFuzzer targets, and a +1 KiB–1 MiB-thumbnail zero-copy benchmark. + +#### EXIF write / edit path + +Editing a parsed blob is supported for the string rights fields: `set_copyright` +/ `set_artist` insert-or-replace the IFD0 tag, then `to_bytes` re-serializes +through the canonical serializer (offsets recomputed, fixpoint preserved). +Mechanism: `Entry.value` is `Cow<'a, [u8]>`, so parsed entries stay borrowed +(zero-copy) while injected ones are owned. + +The caller picks the TIFF type explicitly via `TextEncoding` (`Ascii` = type 2, +`Utf8` = type 129) rather than the writer auto-upgrading to type 129 for +non-ASCII. This is deliberate: type 129 has thin reader support (ExifTool reads +it; kamadak / Pillow / most do not), so an auto-upgrade would silently produce +copyright strings most tools can't read. `Ascii` writes the string's UTF-8 bytes +into the type-2 field (the de-facto interchange form — maximally compatible); +`Utf8` is the spec-conformant choice when the consumer is known to handle it. +Both are NUL-terminated with the count including the NUL. + +Still planned (additive, semver-minor; deferred until a concrete consumer): + +- `exif::Builder` for richer from-scratch construction (today `Exif::new` starts an + empty tree and only the string rights fields can be set), and setters for non-string fields. +- For broad copyright readability, also writing XMP `dc:rights` (universally + UTF-8) is the most portable option and a likely companion feature.
+- Orientation editing already exists as a byte-level, offset-preserving rewrite + (`helpers::set_exif_orientation`), used by the pipeline to reconcile a + baked-upright buffer's `Metadata::orientation` with its embedded tag. ### `OutputInfo` diff --git a/fuzz/.gitignore b/fuzz/.gitignore new file mode 100644 index 0000000..7be2c00 --- /dev/null +++ b/fuzz/.gitignore @@ -0,0 +1,8 @@ +# Working corpus + crash artifacts live in block storage (/mnt/v/fuzzes/zencodec/), +# not git. Only fuzz_targets/, Cargo.toml, Cargo.lock, and tiny regression/ seeds +# are tracked. +target/ +corpus/ +artifacts/ +coverage/ +*.profraw diff --git a/fuzz/Cargo.toml b/fuzz/Cargo.toml new file mode 100644 index 0000000..e2a2aa7 --- /dev/null +++ b/fuzz/Cargo.toml @@ -0,0 +1,42 @@ +[package] +name = "zencodec-fuzz" +version = "0.0.0" +publish = false +edition = "2024" + +[package.metadata] +cargo-fuzz = true + +[dependencies] +libfuzzer-sys = "0.4" + +[dependencies.zencodec] +path = ".." + +[[bin]] +name = "exif_parse" +path = "fuzz_targets/exif_parse.rs" +test = false +doc = false +bench = false + +[[bin]] +name = "exif_roundtrip" +path = "fuzz_targets/exif_roundtrip.rs" +test = false +doc = false +bench = false + +[[bin]] +name = "exif_filter" +path = "fuzz_targets/exif_filter.rs" +test = false +doc = false +bench = false + +[[bin]] +name = "metadata_filtered" +path = "fuzz_targets/metadata_filtered.rs" +test = false +doc = false +bench = false diff --git a/fuzz/fuzz_targets/exif_filter.rs b/fuzz/fuzz_targets/exif_filter.rs new file mode 100644 index 0000000..fc4f120 --- /dev/null +++ b/fuzz/fuzz_targets/exif_filter.rs @@ -0,0 +1,31 @@ +#![no_main] +//! `Exif::filtered(policy).to_bytes()` and `exif::retain` must not panic for +//! any input or policy. The first byte seeds the 7-category policy bitmask. 
+use libfuzzer_sys::fuzz_target; +use zencodec::exif::{Exif, ExifPolicy, Retention, retain}; + +fn ret(bit: bool) -> Retention { + if bit { Retention::Keep } else { Retention::Discard } +} + +fuzz_target!(|data: &[u8]| { + let (cfg, rest) = match data.split_first() { + Some((c, r)) => (*c, r), + None => return, + }; + let policy = ExifPolicy::DISCARD_ALL + .with_orientation(ret(cfg & 0x01 != 0)) + .with_rights(ret(cfg & 0x02 != 0)) + .with_thumbnail(ret(cfg & 0x04 != 0)) + .with_gps(ret(cfg & 0x08 != 0)) + .with_datetimes(ret(cfg & 0x10 != 0)) + .with_camera(ret(cfg & 0x20 != 0)) + .with_other(ret(cfg & 0x40 != 0)); + if let Some(x) = Exif::parse(rest) { + let pruned = x.filtered(&policy); + let bytes = pruned.to_bytes(); + // Re-parsing the pruned, re-serialized blob must not panic (result is discarded). + let _ = Exif::parse(&bytes); + } + let _ = retain(rest, &policy); +}); diff --git a/fuzz/fuzz_targets/exif_parse.rs b/fuzz/fuzz_targets/exif_parse.rs new file mode 100644 index 0000000..0c5ec06 --- /dev/null +++ b/fuzz/fuzz_targets/exif_parse.rs @@ -0,0 +1,14 @@ +#![no_main] +//! `Exif::parse` and every accessor must not panic on arbitrary input. +use libfuzzer_sys::fuzz_target; +use zencodec::exif::Exif; + +fuzz_target!(|data: &[u8]| { + if let Some(x) = Exif::parse(data) { + let _ = x.orientation(); + let _ = x.copyright(); + let _ = x.artist(); + let _ = x.has_gps(); + let _ = x.has_thumbnail(); + } +}); diff --git a/fuzz/fuzz_targets/exif_roundtrip.rs b/fuzz/fuzz_targets/exif_roundtrip.rs new file mode 100644 index 0000000..de26fb2 --- /dev/null +++ b/fuzz/fuzz_targets/exif_roundtrip.rs @@ -0,0 +1,17 @@ +#![no_main] +//! `parse → to_bytes → parse` must round-trip: the serializer always produces +//! a re-parseable TIFF, and the key accessors are preserved.
+use libfuzzer_sys::fuzz_target; +use zencodec::exif::Exif; + +fuzz_target!(|data: &[u8]| { + if let Some(x) = Exif::parse(data) { + let bytes = x.to_bytes(); + let y = Exif::parse(&bytes).expect("serializer output must re-parse"); + assert_eq!(x.orientation(), y.orientation(), "orientation drift"); + assert_eq!(x.copyright(), y.copyright(), "copyright drift"); + assert_eq!(x.artist(), y.artist(), "artist drift"); + assert_eq!(x.has_gps(), y.has_gps(), "gps presence drift"); + assert_eq!(x.has_thumbnail(), y.has_thumbnail(), "thumbnail presence drift"); + } +}); diff --git a/fuzz/fuzz_targets/metadata_filtered.rs b/fuzz/fuzz_targets/metadata_filtered.rs new file mode 100644 index 0000000..ae81751 --- /dev/null +++ b/fuzz/fuzz_targets/metadata_filtered.rs @@ -0,0 +1,40 @@ +#![no_main] +//! `Metadata::filtered` integration: build Metadata from arbitrary EXIF/XMP/ICC +//! bytes and apply every policy. Must never panic, and the result must be +//! internally consistent (re-filtering is idempotent under the same policy). 
+use libfuzzer_sys::fuzz_target; +use zencodec::exif::{ExifPolicy, Retention}; +use zencodec::{IccRetention, Metadata, MetadataFields, MetadataPolicy}; + +fuzz_target!(|data: &[u8]| { + let (sel, rest) = match data.split_first() { + Some(x) => x, + None => return, + }; + let meta = Metadata::none() + .with_exif(rest.to_vec()) + .with_xmp(rest.to_vec()) + .with_icc(rest.to_vec()); + + let policy = match sel % 6 { + 0 => MetadataPolicy::PreserveExact, + 1 => MetadataPolicy::Preserve, + 2 => MetadataPolicy::Web, + 3 => MetadataPolicy::ColorAndRotation, + 4 => MetadataPolicy::Custom( + MetadataFields::KEEP_ALL.with_exif(ExifPolicy::KEEP_ALL.with_gps(Retention::Discard)), + ), + _ => MetadataPolicy::Custom( + MetadataFields::DISCARD_ALL + .with_icc(IccRetention::KeepNonSrgb) + .with_exif(ExifPolicy::ATTRIBUTED_ORIENTATION), + ), + }; + + let out = meta.filtered(&policy); + // Idempotence: filtering the result again with the same policy is stable + // (catches a filter that produces output it can't re-process). 
+ let out2 = out.filtered(&policy); + assert_eq!(out.exif, out2.exif, "filtered EXIF not idempotent"); + assert_eq!(out.orientation, out2.orientation); +}); diff --git a/fuzz/regression/crash-idempotence-10a1002a b/fuzz/regression/crash-idempotence-10a1002a new file mode 100644 index 0000000..386b444 Binary files /dev/null and b/fuzz/regression/crash-idempotence-10a1002a differ diff --git a/src/capabilities.rs b/src/capabilities.rs index cc8cef4..6af9f3d 100644 --- a/src/capabilities.rs +++ b/src/capabilities.rs @@ -102,6 +102,9 @@ pub struct EncodeCapabilities { exif: bool, xmp: bool, cicp: bool, + // CICP carrier quality (distinct from `cicp` = "has a CICP carrier slot") + cicp_is_valid_carrier: bool, + cicp_safe_sole_carrier: bool, // Operation support stop: bool, animation: bool, @@ -143,6 +146,8 @@ impl EncodeCapabilities { exif: false, xmp: false, cicp: false, + cicp_is_valid_carrier: false, + cicp_safe_sole_carrier: false, stop: false, animation: false, push_rows: false, @@ -181,6 +186,27 @@ impl EncodeCapabilities { pub const fn cicp(&self) -> bool { self.cicp } + /// Whether this format has a standardized, real-world-honored CICP carrier — + /// so CICP can be emitted as a color signal by default. + /// + /// True for JXL codestream enum, AVIF/HEIC `nclx`, and PNG `cICP`. Distinct + /// from [`cicp`](Self::cicp) (= "has a CICP carrier slot at all") and from + /// [`cicp_safe_sole_carrier`](Self::cicp_safe_sole_carrier) (= "safe to ship + /// CICP *only* and drop the ICC"). Gates CICP emission under + /// [`CicpEmission::WhereValidCarrier`](crate::color::CicpEmission::WhereValidCarrier). + pub const fn cicp_is_valid_carrier(&self) -> bool { + self.cicp_is_valid_carrier + } + /// Whether it is safe in practice to ship CICP as the *sole* color carrier + /// and drop a redundant ICC profile for this format. 
+ /// + /// Stricter than [`cicp_is_valid_carrier`](Self::cicp_is_valid_carrier): a + /// format can have a valid CICP carrier yet still need an ICC kept for + /// real-world tool compatibility. As of 2026 this is true only for JXL + /// (matches libjxl's `want_icc=false` default); AVIF/HEIC/PNG keep the ICC. + pub const fn cicp_safe_sole_carrier(&self) -> bool { + self.cicp_safe_sole_carrier + } /// Whether `with_stop` on encode jobs is respected (not a no-op). pub const fn stop(&self) -> bool { self.stop @@ -316,6 +342,18 @@ impl EncodeCapabilities { self.cicp = v; self } + /// Set whether this format has a standardized CICP carrier. + /// See [`cicp_is_valid_carrier`](Self::cicp_is_valid_carrier). + pub const fn with_cicp_is_valid_carrier(mut self, v: bool) -> Self { + self.cicp_is_valid_carrier = v; + self + } + /// Set whether CICP is safe as the sole color carrier (drop redundant ICC). + /// See [`cicp_safe_sole_carrier`](Self::cicp_safe_sole_carrier). + pub const fn with_cicp_safe_sole_carrier(mut self, v: bool) -> Self { + self.cicp_safe_sole_carrier = v; + self + } /// Set whether cooperative cancellation via [`Stop`](enough::Stop) is supported. pub const fn with_stop(mut self, v: bool) -> Self { self.stop = v; diff --git a/src/color.rs b/src/color.rs new file mode 100644 index 0000000..4fbb671 --- /dev/null +++ b/src/color.rs @@ -0,0 +1,451 @@ +//! Internal: color-signaling emission policy (ICC profile vs CICP code points). +//! +//! This module is private — its types (`ColorEmitPolicy`, `ColorEmitFields`, +//! `ColorEmitPlan`, `IccDisposition`, `CicpEmission`) and `resolve_color_emit` +//! are re-exported at the crate root. The public overview lives on +//! `ColorEmitPolicy`; the full per-format design is in `docs/color-emit-model.md`. 
+ +use zenpixels::icc; +use zenpixels::{Cicp, ColorModel}; + +use crate::capabilities::EncodeCapabilities; +use crate::info::SourceColor; +use crate::metadata::IccRetention; + +/// How an image's color *description* (ICC profile vs CICP code points) is +/// emitted when encoding or transcoding — the obvious, intent-named knob. +/// +/// This is orthogonal to which *pixels* are written. Containers differ in which +/// color carriers they have and in how reliably real-world decoders honor each +/// one, so emitting "the right" color description is a per-target decision. +/// +/// # Presets +/// +/// Pick an intent — the same meaning whether encoding from pixels or transcoding +/// from another file: +/// +/// - [`Compatibility`](ColorEmitPolicy::Compatibility) — always embed an ICC; add CICP where reliable. +/// - [`Balanced`](ColorEmitPolicy::Balanced) (**default**) — emit CICP where the format has a +/// standardized CICP carrier, drop a redundant ICC only where CICP is safe as the sole carrier +/// (JXL today) or the ICC is plain sRGB. +/// - [`Compact`](ColorEmitPolicy::Compact) — smallest: prefer CICP wherever the format carries it, drop the ICC. +/// - [`Verbatim`](ColorEmitPolicy::Verbatim) — carry the source's signals unchanged. +/// - [`Custom`](ColorEmitPolicy::Custom) — explicit [`ColorEmitFields`] for power users. +/// +/// # The resolver: [`resolve_color_emit`] +/// +/// [`resolve_color_emit`] reconciles a [`SourceColor`] against a target's +/// [`EncodeCapabilities`] under a `ColorEmitPolicy` and returns a [`ColorEmitPlan`] — +/// a pure description of what to emit. This crate is `no_std` and carries no +/// CMS, so the plan only describes intent ([`IccDisposition::SynthesizeFrom`], +/// etc.); the bytes are materialized one layer up. 
+/// +/// # Lowering the plan +/// +/// A codec (or the pipeline) lowers a [`ColorEmitPlan`] to the bytes it writes — for +/// the pixel-encode path, through `zenpixels_convert`'s atomic +/// `finalize_for_output_with` (which guarantees pixels and embedded color cannot +/// diverge): +/// +/// - [`ColorEmitPlan::cicp`] → the format's native CICP carrier (JXL enum color, +/// AVIF/HEIC `nclx`, PNG `cICP`). +/// - [`IccDisposition::KeepSource`] → re-embed the source ICC bytes +/// (`OutputProfile::SameAsOrigin`). +/// - [`IccDisposition::SynthesizeFrom`]`(cicp)` → fetch a bundled profile via +/// `zenpixels_convert::icc_profile_for_primaries` (a `const fn` table — **no CMS**; +/// it returns `None` for BT.709/sRGB, so the assumed default is never embedded). +/// - [`IccDisposition::Drop`] → emit no ICC. +/// +/// Orientation/EXIF reconciliation is separate: when a pipeline bakes orientation +/// upright it rewrites the source EXIF orientation tag with +/// [`helpers::set_exif_orientation`](crate::helpers::set_exif_orientation) so the +/// tag and the pixels can't disagree (the double-rotation hazard). +#[derive(Clone, Copy, Debug, Default, PartialEq, Eq)] +#[non_exhaustive] +pub enum ColorEmitPolicy { + /// Widest compatibility: keep the source ICC profile, or synthesize one from a + /// non-sRGB CICP when the source had none; add CICP where the format has a + /// standardized CICP carrier. Largest color overhead. + Compatibility, + /// **Default.** Emit CICP where the format has a standardized carrier and drop a + /// redundant ICC only where CICP is safe as the *sole* carrier + /// ([`cicp_safe_sole_carrier`](EncodeCapabilities::cicp_safe_sole_carrier) — + /// JXL today) or, when CICP is emitted, the ICC is plain sRGB. Otherwise keep the ICC. + #[default] + Balanced, + /// Smallest color overhead: prefer CICP wherever the format can carry it at + /// all, and drop the ICC whenever CICP can describe the color.
+ Compact, + /// Carry the source's color signals through unchanged — derive and strip + /// nothing. For transcodes that must preserve exactly what was there. + Verbatim, + /// Explicit mechanism control. + Custom(ColorEmitFields), +} + +/// Whether CICP is emitted, behind [`ColorEmitPolicy::Custom`]. +#[derive(Clone, Copy, Debug, Default, PartialEq, Eq)] +#[non_exhaustive] +pub enum CicpEmission { + /// Emit CICP where the format has a standardized, real-world-honored CICP + /// carrier ([`cicp_is_valid_carrier`](EncodeCapabilities::cicp_is_valid_carrier)): + /// JXL/AVIF/HEIC `nclx`, and PNG `cICP`. The default. Distinct from + /// "drop the ICC" — a valid carrier (PNG) still keeps the ICC alongside. + #[default] + WhereValidCarrier, + /// Emit CICP wherever the format has *any* carrier slot, even a non-standard + /// or emergent one. + WhereverSupported, + /// Never emit CICP (ICC-only output). + Never, +} + +/// Mechanism fields behind [`ColorEmitPolicy::Custom`]. +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +#[non_exhaustive] +pub struct ColorEmitFields { + /// When to drop the ICC profile. + pub icc: IccRetention, + /// Whether to emit CICP. + pub cicp: CicpEmission, +} + +impl Default for ColorEmitFields { + fn default() -> Self { + Self { + icc: IccRetention::DropIfCicpSafeSoleCarrier, + cicp: CicpEmission::WhereValidCarrier, + } + } +} + +impl ColorEmitFields { + /// Construct explicit color-emission fields for [`ColorEmitPolicy::Custom`]. + /// + /// `ColorEmitFields` is `#[non_exhaustive]`, so downstream crates cannot build it + /// with a struct literal — use this constructor (or [`Default`]) so + /// [`ColorEmitPolicy::Custom`] is actually reachable. + pub const fn new(icc: IccRetention, cicp: CicpEmission) -> Self { + Self { icc, cicp } + } +} + +impl ColorEmitPolicy { + /// Resolve a preset to its mechanism fields. 
+ pub const fn fields(&self) -> ColorEmitFields { + match self { + Self::Compatibility => ColorEmitFields { + icc: IccRetention::Keep, + cicp: CicpEmission::WhereValidCarrier, + }, + Self::Balanced => ColorEmitFields { + icc: IccRetention::DropIfCicpSafeSoleCarrier, + cicp: CicpEmission::WhereValidCarrier, + }, + Self::Compact => ColorEmitFields { + icc: IccRetention::DropIfCicpRepresentable, + cicp: CicpEmission::WhereverSupported, + }, + Self::Verbatim => ColorEmitFields { + icc: IccRetention::Keep, + cicp: CicpEmission::WhereValidCarrier, + }, + Self::Custom(f) => *f, + } + } +} + +/// What to do with the ICC profile channel for one encode. +/// +/// The bytes are materialized by the codec adapter / pipeline layer, not here. +#[derive(Clone, Debug, PartialEq, Eq)] +#[non_exhaustive] +pub enum IccDisposition { + /// Embed the source ICC bytes verbatim. + KeepSource, + /// Embed an ICC synthesized from this CICP (target has no CICP carrier, or + /// the policy wants an ICC alongside). The caller materializes the bytes. + SynthesizeFrom(Cicp), + /// Emit no ICC profile. + Drop, +} + +/// A resolved plan for emitting an image's color description on encode. +/// +/// Produced by [`resolve_color_emit`]. Deliberately minimal: it carries the +/// ICC/CICP decision, which is what current transcode needs. `#[non_exhaustive]` +/// so range/rendering-intent/HDR/gain-map dispositions and a warnings channel +/// can be added back additively when a consumer needs them. +#[derive(Clone, Debug, PartialEq)] +#[non_exhaustive] +pub struct ColorEmitPlan { + /// CICP to write to the target's native carrier, if any. + pub cicp: Option<Cicp>, + /// Disposition of the ICC profile channel. + pub icc: IccDisposition, +} + +/// The CICP that describes this source's color as code points, if any: +/// the explicit CICP, else derived from the ICC (`cicp` tag, then corpus).
+fn representable_cicp(src: &SourceColor) -> Option<Cicp> { + if let Some(c) = src.cicp { + return Some(c); + } + let icc_bytes = src.icc_profile.as_ref()?; + icc::extract_cicp(icc_bytes) + .or_else(|| icc::identify_common(icc_bytes).and_then(|id| id.to_cicp())) +} + +/// Reconcile a source's color description against a target's capabilities under +/// a [`ColorEmitPolicy`], returning what to emit. +/// +/// Pure and `no_std`. Decides ICC vs CICP emission, including the grayscale / +/// CMYK terminal states (where CICP is inapplicable and the ICC must be kept). +pub fn resolve_color_emit( + src: &SourceColor, + target: &EncodeCapabilities, + policy: ColorEmitPolicy, +) -> ColorEmitPlan { + let fields = policy.fields(); + let src_has_icc = src.icc_profile.is_some(); + + // Grayscale / CMYK: CICP is RGB-centric and cannot describe these. Keep the + // ICC (the only valid color description) and suppress CICP — emitting an RGB + // CICP over gray/CMYK pixels would recolor them. + let model = src + .icc_profile + .as_deref() + .and_then(icc::profile_color_space); + let is_gray = matches!(model, Some(ColorModel::Gray)) || src.channel_count == Some(1); + let is_cmyk = matches!(model, Some(ColorModel::Cmyk)); + if is_gray || is_cmyk { + return ColorEmitPlan { + cicp: None, + icc: if src_has_icc { + IccDisposition::KeepSource + } else { + IccDisposition::Drop + }, + }; + } + + let repr_cicp = representable_cicp(src); + let cicp_represents = repr_cicp.is_some(); + let has_carrier = target.cicp(); + let is_valid_carrier = target.cicp_is_valid_carrier(); + let sole_safe = target.cicp_safe_sole_carrier(); + let icc_is_srgb = src.icc_profile.as_deref().is_some_and(icc::is_common_srgb); + + // Whether to emit CICP.
+ let emit_cicp = match policy { + ColorEmitPolicy::Verbatim => has_carrier && src.cicp.is_some(), + _ => match fields.cicp { + CicpEmission::Never => false, + CicpEmission::WhereValidCarrier => has_carrier && is_valid_carrier && cicp_represents, + CicpEmission::WhereverSupported => has_carrier && cicp_represents, + }, + }; + let cicp_out = if emit_cicp { + if policy == ColorEmitPolicy::Verbatim { + src.cicp + } else { + repr_cicp + } + } else { + None + }; + + // Whether to drop the ICC. + let drop_by_rule = match fields.icc { + IccRetention::Drop => true, + IccRetention::Keep => false, + IccRetention::KeepNonSrgb => icc_is_srgb, + IccRetention::DropIfCicpRepresentable => emit_cicp && cicp_represents, + IccRetention::DropIfCicpSafeSoleCarrier => emit_cicp && sole_safe && cicp_represents, + }; + // Balanced additionally sheds a redundant sRGB ICC even where CICP isn't the + // sole carrier (the most common pure-weight case). + let drop_icc = match policy { + ColorEmitPolicy::Balanced => drop_by_rule || (emit_cicp && icc_is_srgb), + _ => drop_by_rule, + }; + + // sRGB is the universally-assumed default: the canned-profile table has no + // sRGB ICC to synthesize (`zenpixels_convert::icc_profile_for_primaries` + // returns `None` for BT.709), so a `SynthesizeFrom(sRGB)` directive would + // lower to nothing. Don't emit it — drop instead. + let synth_worthwhile = cicp_represents && repr_cicp != Some(Cicp::SRGB); + + let icc = if src_has_icc { + if drop_icc { + IccDisposition::Drop + } else { + IccDisposition::KeepSource + } + } else if !emit_cicp && synth_worthwhile && policy != ColorEmitPolicy::Verbatim { + // No source ICC and CICP isn't carrying the color (target is ICC-only): + // synthesize an ICC so the (non-default) color isn't lost. + IccDisposition::SynthesizeFrom(repr_cicp.expect("synth_worthwhile")) + } else if matches!(policy, ColorEmitPolicy::Compatibility) && synth_worthwhile { + // Compatibility wants an ICC present alongside CICP (non-sRGB only). 
+ IccDisposition::SynthesizeFrom(repr_cicp.expect("synth_worthwhile")) + } else { + IccDisposition::Drop + }; + + ColorEmitPlan { + cicp: cicp_out, + icc, + } +} + +#[cfg(test)] +mod tests { + use super::*; + use zenpixels::ColorAuthority; + + // Capability fixtures matching the 2026 reliability findings. + fn caps_jxl() -> EncodeCapabilities { + EncodeCapabilities::new() + .with_icc(true) + .with_cicp(true) + .with_cicp_is_valid_carrier(true) + .with_cicp_safe_sole_carrier(true) + } + fn caps_avif() -> EncodeCapabilities { + EncodeCapabilities::new() + .with_icc(true) + .with_cicp(true) + .with_cicp_is_valid_carrier(true) + .with_cicp_safe_sole_carrier(false) + } + fn caps_jpeg() -> EncodeCapabilities { + // No CICP carrier at all. + EncodeCapabilities::new().with_icc(true) + } + fn caps_png() -> EncodeCapabilities { + // PNG cICP: a standardized-but-emergent carrier — valid, not sole-safe. + EncodeCapabilities::new() + .with_icc(true) + .with_cicp(true) + .with_cicp_is_valid_carrier(true) + .with_cicp_safe_sole_carrier(false) + } + + fn src_cicp(c: Cicp) -> SourceColor { + SourceColor::default() + .with_cicp(c) + .with_color_authority(ColorAuthority::Cicp) + .with_channel_count(3) + } + + #[test] + fn jxl_balanced_strips_representable_icc() { + // JXL (sole-safe): CICP present + an ICC whose color CICP represents → + // emit CICP, drop the ICC (matches libjxl's want_icc=false default). + let src = SourceColor::default() + .with_cicp(Cicp::SRGB) + .with_icc_profile(alloc::vec![0u8; 132]) + .with_channel_count(3); + let plan = resolve_color_emit(&src, &caps_jxl(), ColorEmitPolicy::Balanced); + assert_eq!(plan.cicp, Some(Cicp::SRGB)); + assert_eq!(plan.icc, IccDisposition::Drop); + } + + #[test] + fn avif_balanced_keeps_nonsrgb_icc_alongside_cicp() { + // AVIF (not sole-safe): a non-sRGB ICC is kept alongside CICP. (The + // redundant-sRGB drop needs a corpus-recognized profile and is covered + // by the conformance suite, which has a real sRGB profile via `cms`.) 
+ let p3 = src_cicp(Cicp::DISPLAY_P3).with_icc_profile(alloc::vec![0u8; 132]); + let plan = resolve_color_emit(&p3, &caps_avif(), ColorEmitPolicy::Balanced); + assert_eq!(plan.cicp, Some(Cicp::DISPLAY_P3)); + assert_eq!(plan.icc, IccDisposition::KeepSource); + } + + #[test] + fn jpeg_synthesizes_icc_from_cicp() { + // CICP-only source → JPEG (no CICP carrier): synthesize an ICC. + let src = src_cicp(Cicp::DISPLAY_P3); + let plan = resolve_color_emit(&src, &caps_jpeg(), ColorEmitPolicy::Balanced); + assert_eq!(plan.cicp, None); + assert_eq!(plan.icc, IccDisposition::SynthesizeFrom(Cicp::DISPLAY_P3)); + } + + #[test] + fn compact_strips_icc_on_avif() { + // Compact drops the ICC wherever CICP represents the color, even on AVIF. + let p3 = src_cicp(Cicp::DISPLAY_P3).with_icc_profile(alloc::vec![0u8; 132]); + let plan = resolve_color_emit(&p3, &caps_avif(), ColorEmitPolicy::Compact); + assert_eq!(plan.cicp, Some(Cicp::DISPLAY_P3)); + assert_eq!(plan.icc, IccDisposition::Drop); + } + + #[test] + fn compatibility_always_keeps_or_synthesizes_icc() { + // CICP-only source, AVIF, Compatibility → CICP emitted AND an ICC synthesized. + let src = src_cicp(Cicp::DISPLAY_P3); + let plan = resolve_color_emit(&src, &caps_avif(), ColorEmitPolicy::Compatibility); + assert_eq!(plan.cicp, Some(Cicp::DISPLAY_P3)); + assert_eq!(plan.icc, IccDisposition::SynthesizeFrom(Cicp::DISPLAY_P3)); + } + + #[test] + fn grayscale_keeps_icc_suppresses_cicp() { + // A 1-channel source: CICP is inapplicable; keep ICC, suppress CICP. + let src = SourceColor::default() + .with_icc_profile(alloc::vec![0u8; 132]) + .with_channel_count(1); + let plan = resolve_color_emit(&src, &caps_avif(), ColorEmitPolicy::Balanced); + assert_eq!(plan.cicp, None); + assert_eq!(plan.icc, IccDisposition::KeepSource); + } + + #[test] + fn verbatim_passes_source_through() { + // Verbatim keeps both, derives nothing. 
+ let src = src_cicp(Cicp::DISPLAY_P3).with_icc_profile(alloc::vec![0u8; 132]); + let plan = resolve_color_emit(&src, &caps_avif(), ColorEmitPolicy::Verbatim); + assert_eq!(plan.cicp, Some(Cicp::DISPLAY_P3)); + assert_eq!(plan.icc, IccDisposition::KeepSource); + } + + #[test] + fn default_policy_is_balanced() { + assert_eq!(ColorEmitPolicy::default(), ColorEmitPolicy::Balanced); + } + + #[test] + fn png_emits_cicp_keeps_icc_under_balanced() { + // PNG: standardized cICP carrier but not sole-safe → emit cICP AND keep + // iCCP. Regression for the missing valid-carrier tier — a non-authority + // carrier must still emit CICP under Balanced. + let p3 = src_cicp(Cicp::DISPLAY_P3).with_icc_profile(alloc::vec![0u8; 132]); + let plan = resolve_color_emit(&p3, &caps_png(), ColorEmitPolicy::Balanced); + assert_eq!(plan.cicp, Some(Cicp::DISPLAY_P3)); + assert_eq!(plan.icc, IccDisposition::KeepSource); + } + + #[test] + fn srgb_only_source_does_not_synthesize_redundant_icc() { + // CICP-only sRGB → JPEG (no carrier): sRGB is the assumed default and the + // canned table has no sRGB profile → drop, never SynthesizeFrom(sRGB). + let src = src_cicp(Cicp::SRGB); + let plan = resolve_color_emit(&src, &caps_jpeg(), ColorEmitPolicy::Balanced); + assert_eq!(plan.cicp, None); + assert_eq!(plan.icc, IccDisposition::Drop); + } + + #[test] + fn custom_policy_is_constructible() { + // ColorEmitFields::new makes ColorEmitPolicy::Custom reachable from downstream. + let policy = ColorEmitPolicy::Custom(ColorEmitFields::new( + IccRetention::Keep, + CicpEmission::Never, + )); + let p3 = src_cicp(Cicp::DISPLAY_P3).with_icc_profile(alloc::vec![0u8; 132]); + let plan = resolve_color_emit(&p3, &caps_avif(), policy); + assert_eq!(plan.cicp, None); // CicpEmission::Never + assert_eq!(plan.icc, IccDisposition::KeepSource); // IccRetention::Keep + } +} diff --git a/src/exif.rs b/src/exif.rs new file mode 100644 index 0000000..d3248d8 --- /dev/null +++ b/src/exif.rs @@ -0,0 +1,1983 @@ +//! 
Structured, borrowing EXIF/TIFF model: parse → inspect/prune → serialize. +//! +//! [`Exif::parse`] reads a TIFF/EXIF blob into a tree of IFDs whose entry +//! values *borrow* the source bytes (zero-copy — a multi-KB thumbnail is never +//! copied during parsing or pruning). [`Exif::filtered`] prunes the tree by +//! [`ExifPolicy`] category, and [`Exif::to_bytes`] re-serializes a valid TIFF, +//! recomputing all offsets. [`retain`](crate::exif::retain) is the `Cow` convenience used by +//! [`Metadata::filtered`](crate::Metadata::filtered): it borrows the source +//! unchanged when nothing is dropped and allocates only on a real rewrite. +//! +//! Spec: TIFF 6.0 (Adobe, 1992) + EXIF 2.32 (CIPA DC-008). The structural +//! pointer tags — Exif IFD (0x8769), GPS IFD (0x8825), and the JPEG thumbnail +//! pointers (0x0201/0x0202) — are modeled as tree edges, not entries, and +//! re-synthesized with fresh offsets on serialize. +//! +//! Error model (no panics on untrusted input — every read is bounds-checked): +//! - **Structural failure → `None`.** A bad byte-order mark, wrong magic, +//! IFD0 offset past EOF, or an over-cap entry count (`MAX_IFD_ENTRIES`) makes +//! `Exif::parse` return `None`. +//! - **Graceful per-entry degradation.** A single unreadable, unknown-type, or +//! out-of-bounds entry is *skipped*, and a truncated entry table salvages the +//! entries read so far — one malformed (or future-typed) entry never discards +//! the rest of the IFD's metadata. Skipped entries are dropped on a rewrite. +//! - **Fail-safe filtering.** [`retain`](crate::exif::retain) drops EXIF it can't parse under a +//! stripping policy (rather than passing it through and risking a leak); see +//! its docs. +//! +//! Known limitation: rewriting (any partial prune) relocates the `MakerNote` +//! blob (0x927C, the `camera` category), whose maker-specific *internal* +//! offsets cannot always be fixed up. Pipelines needing byte-exact MakerNote +//! 
should keep all EXIF (no prune), in which case the source passes through +//! untouched. Uncompressed (StripOffsets) thumbnails are dropped-only — kept +//! correctly only in the no-prune passthrough. + +use alloc::borrow::Cow; +use alloc::vec::Vec; +use zenpixels::Orientation; + +// ── TIFF/EXIF tag numbers (canonical, no bare hex in the logic below) ──────── +// Names follow TIFF 6.0 / EXIF (CIPA DC-008). Tag values are stable across spec +// revisions; the comment notes the revision that introduced the less-common ones. + +// IFD0 / TIFF baseline. +const TAG_MAKE: u16 = 0x010F; +const TAG_MODEL: u16 = 0x0110; +const TAG_ORIENTATION: u16 = 0x0112; +const TAG_SOFTWARE: u16 = 0x0131; +const TAG_DATETIME: u16 = 0x0132; // a.k.a. ModifyDate +const TAG_ARTIST: u16 = 0x013B; +const TAG_HOST_COMPUTER: u16 = 0x013C; +const TAG_COPYRIGHT: u16 = 0x8298; + +// Structural pointers + JPEG thumbnail (modeled as tree edges, not entries). +const TAG_EXIF_IFD: u16 = 0x8769; +const TAG_GPS_IFD: u16 = 0x8825; +const TAG_INTEROP_IFD: u16 = 0xA005; +const TAG_THUMB_OFFSET: u16 = 0x0201; // JPEGInterchangeFormat +const TAG_THUMB_LENGTH: u16 = 0x0202; // JPEGInterchangeFormatLength +// SubIFDs (TIFF/DNG) — an array of offsets to nested IFDs (alt/full-res images +// with their own EXIF/GPS). NOT modeled here, so its offsets can't be fixed up +// on a rewrite; dropped during filtering rather than left dangling. +const TAG_SUBIFDS: u16 = 0x014A; + +// Exif sub-IFD: capture timestamps (the `datetimes` category). +const TAG_DATETIME_ORIGINAL: u16 = 0x9003; +const TAG_DATETIME_DIGITIZED: u16 = 0x9004; +const TAG_OFFSET_TIME: u16 = 0x9010; // Exif 2.31+ +const TAG_OFFSET_TIME_ORIGINAL: u16 = 0x9011; // Exif 2.31+ +const TAG_OFFSET_TIME_DIGITIZED: u16 = 0x9012; // Exif 2.31+ +const TAG_SUBSEC_TIME: u16 = 0x9290; +const TAG_SUBSEC_TIME_ORIGINAL: u16 = 0x9291; +const TAG_SUBSEC_TIME_DIGITIZED: u16 = 0x9292; + +// Exif sub-IFD: device / capture identity (the `camera` category). 
+const TAG_MAKER_NOTE: u16 = 0x927C; +const TAG_IMAGE_UNIQUE_ID: u16 = 0xA420; +const TAG_BODY_SERIAL_NUMBER: u16 = 0xA431; +const TAG_LENS_SPECIFICATION: u16 = 0xA432; +const TAG_LENS_MAKE: u16 = 0xA433; +const TAG_LENS_MODEL: u16 = 0xA434; +const TAG_LENS_SERIAL_NUMBER: u16 = 0xA435; + +// Creator / rights-holder *name* tags (the `rights` category, alongside +// Copyright + Artist). CameraOwnerName is Exif 2.3+; Photographer / ImageEditor +// are Exif 3.0 (CIPA DC-008-2023). +const TAG_CAMERA_OWNER_NAME: u16 = 0xA430; +const TAG_PHOTOGRAPHER: u16 = 0xA437; // Exif 3.0 +const TAG_IMAGE_EDITOR: u16 = 0xA438; // Exif 3.0 + +// Exif 3.0 software-identity tags (the `camera` category). +const TAG_CAMERA_FIRMWARE: u16 = 0xA439; // Exif 3.0 +const TAG_RAW_DEVELOPING_SOFTWARE: u16 = 0xA43A; // Exif 3.0 +const TAG_IMAGE_EDITING_SOFTWARE: u16 = 0xA43B; // Exif 3.0 +const TAG_METADATA_EDITING_SOFTWARE: u16 = 0xA43C; // Exif 3.0 + +// ── TIFF field types (TIFF 6.0 §2; type 129 is Exif 3.0) ───────────────────── +const TIFF_BYTE: u16 = 1; +const TIFF_ASCII: u16 = 2; +const TIFF_SHORT: u16 = 3; +const TIFF_LONG: u16 = 4; +const TIFF_RATIONAL: u16 = 5; +const TIFF_SBYTE: u16 = 6; +const TIFF_UNDEFINED: u16 = 7; +const TIFF_SSHORT: u16 = 8; +const TIFF_SLONG: u16 = 9; +const TIFF_SRATIONAL: u16 = 10; +const TIFF_FLOAT: u16 = 11; +const TIFF_DOUBLE: u16 = 12; +const TIFF_IFD: u16 = 13; +/// Exif 3.0 (CIPA DC-008-2023) type 129 = UTF-8 string (8-bit bytes, +/// NUL-terminated, count includes the NUL). The spec-conformant way to store +/// Unicode in an IFD field — see [`TextEncoding`]. +const TIFF_UTF8: u16 = 129; + +const TIFF_HEADER_SIZE: usize = 8; +const MAX_IFD_ENTRIES: u16 = 1000; +const EXIF_PREFIX: &[u8] = b"Exif\0\0"; + +/// TIFF byte order, preserved across a parse → serialize round-trip. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum ByteOrder { + /// Little-endian (`II`, Intel). + Little, + /// Big-endian (`MM`, Motorola). 
+ Big, +} + +/// Which EXIF text convention a string field ([`Copyright`](Exif::set_copyright), +/// [`Artist`](Exif::set_artist)) is written with. EXIF has two ways to carry +/// text and a writer must pick one — there is no universally-read Unicode field. +/// +/// Both variants write the **same UTF-8 bytes** (the `&str` you pass), NUL- +/// terminated; they differ only in the declared TIFF field type. For pure-ASCII +/// text the two outputs are identical except for that type tag. +/// +/// `#[non_exhaustive]`: a future text convention can be added without a breaking +/// change. The variants are still constructible by name. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +#[non_exhaustive] +pub enum TextEncoding { + /// **Exif 2.x** — TIFF `ASCII` (type 2). The spec says 7-bit ASCII, but the + /// de-facto real-world convention (and what this writes) is UTF-8 bytes + /// carried in the ASCII field — non-conformant for non-ASCII, yet read + /// correctly by essentially every tool (kamadak-exif, Pillow, ExifTool, …). + /// The most compatible choice, so the recommended default. + Ascii, + /// **Exif 3.0** (CIPA DC-008-2023) — TIFF `UTF-8` (type 129). The spec- + /// conformant Unicode type, but reader support is still thin (ExifTool reads + /// it; many libraries do not). Prefer [`Ascii`](Self::Ascii) unless the + /// consumer is known to understand type 129. + Utf8, +} + +/// EXIF category an IFD0/Exif-IFD entry belongs to. GPS and thumbnail are +/// modeled structurally (whole sub-IFD), not per-entry. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +enum Category { + Orientation, + Rights, + Datetimes, + Camera, + Other, +} + +fn classify(tag: u16) -> Category { + match tag { + TAG_ORIENTATION => Category::Orientation, + // Attribution / rights-holder. 
Copyright (the rights *notice*), Artist + // (creator), plus the Exif-IFD creator/owner *name* tags + // (CameraOwnerName, Photographer, ImageEditor) — the spec says Artist + // mirrors one of these, so they're the same "who made / holds rights" + // class a copyright-preserving policy keeps. + TAG_COPYRIGHT + | TAG_ARTIST + | TAG_CAMERA_OWNER_NAME + | TAG_PHOTOGRAPHER + | TAG_IMAGE_EDITOR => Category::Rights, + // DateTime, DateTimeOriginal/Digitized, sub-sec + offset-time variants. + TAG_DATETIME + | TAG_DATETIME_ORIGINAL + | TAG_DATETIME_DIGITIZED + | TAG_OFFSET_TIME + | TAG_OFFSET_TIME_ORIGINAL + | TAG_OFFSET_TIME_DIGITIZED + | TAG_SUBSEC_TIME + | TAG_SUBSEC_TIME_ORIGINAL + | TAG_SUBSEC_TIME_DIGITIZED => Category::Datetimes, + // Device / software identity: Make, Model, Software, HostComputer, + // MakerNote, body/lens serials + lens make/model, ImageUniqueID, and the + // firmware / developing / editing software tags. + TAG_MAKE + | TAG_MODEL + | TAG_SOFTWARE + | TAG_HOST_COMPUTER + | TAG_MAKER_NOTE + | TAG_IMAGE_UNIQUE_ID + | TAG_BODY_SERIAL_NUMBER + | TAG_LENS_SPECIFICATION + | TAG_LENS_MAKE + | TAG_LENS_MODEL + | TAG_LENS_SERIAL_NUMBER + | TAG_CAMERA_FIRMWARE + | TAG_RAW_DEVELOPING_SOFTWARE + | TAG_IMAGE_EDITING_SOFTWARE + | TAG_METADATA_EDITING_SOFTWARE => Category::Camera, + _ => Category::Other, + } +} + +/// One IFD entry. Value bytes are [`Cow`]: **borrowed** from the source blob on +/// [`parse`](Exif::parse) (zero-copy — a multi-KB thumbnail is never copied) and +/// **owned** for an entry injected by an edit ([`set_copyright`](Exif::set_copyright)). +#[derive(Debug, Clone, PartialEq, Eq)] +struct Entry<'a> { + tag: u16, + kind: u16, + count: u32, + /// Resolved value bytes (`count × type_size`), in source byte order. + value: Cow<'a, [u8]>, + /// Byte offset of `value` within the TIFF (post-prefix): `e + 8` for an + /// inline value, or the out-of-line pointer. Lets an in-place tag rewrite + /// (e.g. 
[`set_orientation`]) reuse this parse instead of re-walking the IFD. + /// Meaningful only for a parsed (borrowed) entry; `0` for an injected one + /// (which is always re-serialized by [`to_bytes`](Exif::to_bytes), never + /// rewritten in place). + value_offset: usize, +} + +/// A parsed EXIF/TIFF tree borrowing from the source bytes. +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct Exif<'a> { + order: ByteOrder, + had_prefix: bool, + ifd0: Vec<Entry<'a>>, + exif_ifd: Option<Vec<Entry<'a>>>, + gps_ifd: Option<Vec<Entry<'a>>>, + ifd1: Option<Vec<Entry<'a>>>, + thumbnail: Option<&'a [u8]>, + /// Field type used when *writing* a string tag ([`set_copyright`](Self::set_copyright) + /// / [`set_artist`](Self::set_artist)) — the Exif 2.x-vs-3.0 compatibility + /// choice. Set by [`new`](Self::new); parsing defaults it to + /// [`TextEncoding::Ascii`] (it is not stored in the TIFF, so it does not + /// survive a parse round-trip). + text_encoding: TextEncoding, +} + +/// TIFF/Exif type size in bytes, or `None` for an unknown type. +fn type_size(kind: u16) -> Option<usize> { + Some(match kind { + TIFF_BYTE | TIFF_ASCII | TIFF_SBYTE | TIFF_UNDEFINED | TIFF_UTF8 => 1, + TIFF_SHORT | TIFF_SSHORT => 2, + TIFF_LONG | TIFF_SLONG | TIFF_FLOAT | TIFF_IFD => 4, + TIFF_RATIONAL | TIFF_SRATIONAL | TIFF_DOUBLE => 8, + _ => return None, + }) +} + +fn rd16(d: &[u8], o: usize, order: ByteOrder) -> Option<u16> { + let b = d.get(o..o.checked_add(2)?)?; // checked end: `o` is an untrusted offset, and `o + 2` can overflow `usize` on 32-bit targets + Some(match order { + ByteOrder::Big => u16::from_be_bytes([b[0], b[1]]), + ByteOrder::Little => u16::from_le_bytes([b[0], b[1]]), + }) +} + +fn rd32(d: &[u8], o: usize, order: ByteOrder) -> Option<u32> { + let b = d.get(o..o.checked_add(4)?)?; // checked end, same reason as `rd16`: an overflowing offset reads as `None`, never a panic + Some(match order { + ByteOrder::Big => u32::from_be_bytes([b[0], b[1], b[2], b[3]]), + ByteOrder::Little => u32::from_le_bytes([b[0], b[1], b[2], b[3]]), + }) +} + +/// Parse one IFD at `off`, resolving each entry's value slice. Returns the +/// entries and the next-IFD offset (0 = none). Structural pointer tags are +/// left in place for the caller to extract.
+fn parse_ifd(tiff: &[u8], off: usize, order: ByteOrder) -> Option<(Vec>, u32)> { + let count = rd16(tiff, off, order)?; + if count > MAX_IFD_ENTRIES { + return None; // DoS cap — reject the whole IFD + } + let entries_start = off.checked_add(2)?; + let mut entries = Vec::new(); + for i in 0..count as usize { + // Graceful degradation: a truncated entry table salvages the entries + // read so far (stop), and an individual unreadable/unknown-type/ + // out-of-bounds entry is skipped — one malformed or future-typed entry + // doesn't discard all of the IFD's metadata. + let Some(e) = i.checked_mul(12).and_then(|o| entries_start.checked_add(o)) else { + break; + }; + if e.checked_add(12).is_none_or(|end| end > tiff.len()) { + break; // truncated table + } + if let Some(entry) = resolve_entry(tiff, e, order) { + entries.push(entry); + } + } + // The next-IFD offset is structurally required, but tolerate a blob that + // ends right after the last entry (treat as "no further IFD"). + let next = rd32(tiff, entries_start.checked_add(count as usize * 12)?, order).unwrap_or(0); + Some((entries, next)) +} + +/// Read one 12-byte IFD entry at `e` (assumed within bounds) and resolve its +/// value slice. `None` for an unknown TIFF type, an overflowing +/// `count × type_size`, or an out-of-bounds out-of-line value — the caller +/// skips such an entry rather than failing the whole IFD. +fn resolve_entry(tiff: &[u8], e: usize, order: ByteOrder) -> Option> { + let tag = rd16(tiff, e, order)?; + let kind = rd16(tiff, e + 2, order)?; + let cnt = rd32(tiff, e + 4, order)?; + let tsize = type_size(kind)?; + let byte_len = (cnt as usize).checked_mul(tsize)?; + let (value, value_offset) = if byte_len <= 4 { + (tiff.get(e + 8..e + 8 + byte_len)?, e + 8) + } else { + let voff = rd32(tiff, e + 8, order)? 
as usize; + (tiff.get(voff..voff.checked_add(byte_len)?)?, voff) + }; + Some(Entry { + tag, + kind, + count: cnt, + value: Cow::Borrowed(value), + value_offset, + }) +} + +/// Extract a structural pointer tag's offset, removing it from `entries`. +fn take_pointer(entries: &mut Vec>, tag: u16, order: ByteOrder) -> Option { + let pos = entries.iter().position(|e| e.tag == tag)?; + // Peek before removing: a malformed pointer with a < 4-byte value isn't a + // usable offset, so leave it in place (it round-trips as a normal entry) + // rather than silently dropping the whole sub-IFD it nominally points at. + let b = entries[pos].value.get(0..4)?; + let off = match order { + ByteOrder::Big => u32::from_be_bytes([b[0], b[1], b[2], b[3]]), + ByteOrder::Little => u32::from_le_bytes([b[0], b[1], b[2], b[3]]), + } as usize; + entries.remove(pos); + Some(off) +} + +impl<'a> Default for Exif<'a> { + /// An empty EXIF tree with the compatible [`TextEncoding::Ascii`] default — + /// see [`Exif::new`]. + fn default() -> Self { + Self::new(TextEncoding::Ascii) + } +} + +impl<'a> Exif<'a> { + /// Start an empty EXIF tree to build from scratch — e.g. to stamp a + /// Copyright on an image that carried no EXIF. Little-endian, no `Exif\0\0` + /// prefix. + /// + /// `text_encoding` is the **required** Exif 2.x-vs-3.0 compatibility choice + /// for any string field this blob writes, because it can't be defaulted + /// safely: [`TextEncoding::Utf8`] (type 129) is unreadable by most tools, so + /// pick [`TextEncoding::Ascii`] (UTF-8 bytes in a type-2 field — the + /// compatible de-facto form) unless every consumer is known to handle + /// type 129. ([`Exif::default()`](Default) uses `Ascii`.) Set fields with + /// [`set_copyright`](Self::set_copyright) / [`set_artist`](Self::set_artist), + /// then [`to_bytes`](Self::to_bytes) (a raw TIFF — the JPEG/codec layer adds + /// the APP1 `Exif\0\0` framing). 
+ /// + /// ``` + /// use zencodec::exif::{Exif, TextEncoding}; + /// let mut exif = Exif::new(TextEncoding::Ascii); // compatible default + /// exif.set_copyright("© 2026 Lilith"); + /// let blob = exif.to_bytes(); + /// assert_eq!(Exif::parse(&blob).unwrap().copyright().unwrap(), "© 2026 Lilith"); + /// ``` + pub fn new(text_encoding: TextEncoding) -> Self { + Exif { + order: ByteOrder::Little, + had_prefix: false, + ifd0: Vec::new(), + exif_ifd: None, + gps_ifd: None, + ifd1: None, + thumbnail: None, + text_encoding, + } + } + + /// Parse a TIFF/EXIF blob (optionally `Exif\0\0`-prefixed). Returns `None` + /// for malformed input. Zero-copy: entry values borrow `data`. + pub fn parse(data: &'a [u8]) -> Option { + let had_prefix = + data.len() >= EXIF_PREFIX.len() && data[..EXIF_PREFIX.len()] == *EXIF_PREFIX; + let tiff = if had_prefix { + &data[EXIF_PREFIX.len()..] + } else { + data + }; + if tiff.len() < TIFF_HEADER_SIZE { + return None; + } + let order = match [tiff[0], tiff[1]] { + [b'M', b'M'] => ByteOrder::Big, + [b'I', b'I'] => ByteOrder::Little, + _ => return None, + }; + if rd16(tiff, 2, order)? != 42 { + return None; + } + let ifd0_off = rd32(tiff, 4, order)? as usize; + let (mut ifd0, next) = parse_ifd(tiff, ifd0_off, order)?; + + // Extract sub-IFD pointers as tree edges. The Interop IFD (0xA005) is + // not modeled — strip its pointer so a rewrite can't leave a dangling + // offset (it survives only via the no-prune passthrough). + let exif_ifd = take_pointer(&mut ifd0, TAG_EXIF_IFD, order).and_then(|o| { + parse_ifd(tiff, o, order).map(|(mut e, _)| { + take_pointer(&mut e, TAG_INTEROP_IFD, order); + e + }) + }); + let gps_ifd = take_pointer(&mut ifd0, TAG_GPS_IFD, order) + .and_then(|o| parse_ifd(tiff, o, order).map(|(e, _)| e)); + + // IFD1 (thumbnail directory) follows IFD0's next pointer. 
The JPEG + // thumbnail offset (0x0201) and length (0x0202) are peeked first and + // only removed once the thumbnail is actually captured — so a thumbnail + // whose length is encoded as SHORT (spec-permitted, common in real + // cameras) is preserved, not silently dropped, and a malformed pair + // round-trips as ordinary entries instead of vanishing. + let (mut ifd1, mut thumbnail) = (None, None); + if next != 0 + && let Some((mut entries, _)) = parse_ifd(tiff, next as usize, order) + { + let toff = entries + .iter() + .find(|e| e.tag == TAG_THUMB_OFFSET) + .and_then(|e| read_uint(e, order)); + let tlen = entries + .iter() + .find(|e| e.tag == TAG_THUMB_LENGTH) + .and_then(|e| read_uint(e, order)); + if let (Some(o), Some(l)) = (toff, tlen) + && let Some(t) = tiff.get(o as usize..(o as usize).checked_add(l as usize)?) + { + thumbnail = Some(t); + entries.retain(|e| e.tag != TAG_THUMB_OFFSET && e.tag != TAG_THUMB_LENGTH); + } + ifd1 = Some(entries); + } + + Some(Exif { + order, + had_prefix, + ifd0, + exif_ifd, + gps_ifd, + ifd1, + thumbnail, + // Not stored in the TIFF; edits to a parsed blob default to the + // compatible ASCII (type-2) form unless rebuilt via `Exif::new`. + text_encoding: TextEncoding::Ascii, + }) + } + + /// The EXIF Orientation tag (0x0112), if present and valid. + pub fn orientation(&self) -> Option { + let e = self.ifd0.iter().find(|e| e.tag == TAG_ORIENTATION)?; + let raw = read_uint(e, self.order)?; + Orientation::from_exif(u8::try_from(raw).ok()?) + } + + /// The Copyright tag (0x8298) as text — a **lossy view** of + /// [`copyright_bytes`](Self::copyright_bytes). See the [encoding + /// note](#encoding). + /// + /// This is the copyright *notice* (rights statement). The rights-holder / + /// creator *name* is a separate concept — the [`artist`](Self::artist) tag + /// and the Exif-IFD CameraOwnerName / Photographer / ImageEditor tags (all + /// in the [`rights`](ExifPolicy::rights) category). 
The Copyright field has + /// historically held two NUL-separated segments (photographer copyright, + /// then editor copyright); this returns the first segment. A second segment, + /// if present, is preserved byte-for-byte on a rewrite but not surfaced + /// separately. + pub fn copyright(&self) -> Option<Cow<'_, str>> { + ascii_value(&self.ifd0, TAG_COPYRIGHT) + } + + /// The Artist tag (0x013B) as text — a **lossy view** of + /// [`artist_bytes`](Self::artist_bytes). See the [encoding note](#encoding). + pub fn artist(&self) -> Option<Cow<'_, str>> { + ascii_value(&self.ifd0, TAG_ARTIST) + } + + /// The raw Copyright (0x8298) value bytes up to the first NUL (terminator + /// stripped) — those bytes exactly as stored, with no decoding. + /// + /// # Encoding + /// + /// Per Exif / CIPA DC-008 (Table 6), Copyright and Artist are stored as + /// **ASCII (type 2, NUL-terminated 7-bit)**; Exif 3.0 (CIPA DC-008-2023) + /// added **UTF-8 (type 129)** as the spec-conformant way to carry Unicode in + /// these fields. A type-2 field that nonetheless contains non-ASCII bytes + /// (UTF-8 / Latin-1 stuffed into an ASCII field — common in the wild) is the + /// non-conformant case. zencodec reads both string types and + /// [`copyright`](Self::copyright) / [`artist`](Self::artist) decode them as + /// UTF-8 lossily (invalid sequences → U+FFFD) for a display string, while + /// these `*_bytes` accessors return the exact bytes. A pruning rewrite + /// **never transcodes** — it preserves the value bytes **and TIFF type** + /// verbatim, so a field is neither corrupted nor "corrected". Writing a field + /// is explicit and the only path that mints new bytes: + /// [`set_copyright`](Self::set_copyright) / [`set_artist`](Self::set_artist) + /// write the TIFF type fixed by the blob's [`TextEncoding`] — chosen at + /// [`new`](Self::new), `Ascii` for a parsed blob (Exif 2.x ASCII vs Exif 3.0 UTF-8).
+ /// + /// Non-ASCII bytes in a type-2 field are **not** stripped: before type 129 + /// existed (Exif 2.32), the de-facto way to carry non-ASCII here was + /// undeclared UTF-8, so decoding as UTF-8 recovers the common case — + /// stripping the high bytes would corrupt it. A field that actually used a + /// legacy code page (Latin-1, Shift-JIS) decodes lossily (→ U+FFFD); read + /// `*_bytes` and apply your own decoder for those. + pub fn copyright_bytes(&self) -> Option<&[u8]> { + ascii_bytes(&self.ifd0, TAG_COPYRIGHT) + } + + /// The raw Artist (0x013B) value bytes, NUL-terminator stripped. See + /// [`copyright_bytes`](Self::copyright_bytes) for the encoding note. + pub fn artist_bytes(&self) -> Option<&[u8]> { + ascii_bytes(&self.ifd0, TAG_ARTIST) + } + + /// Whether an embedded thumbnail is present. + pub fn has_thumbnail(&self) -> bool { + self.thumbnail.is_some() + } + + /// Whether a GPS sub-IFD is present. + pub fn has_gps(&self) -> bool { + self.gps_ifd.is_some() + } + + /// Set (insert or replace) the IFD0 Copyright tag (0x8298) to `text`. + /// + /// The TIFF field type is this blob's [`text_encoding`](Self::new) (Exif 2.x + /// ASCII type 2, or Exif 3.0 UTF-8 type 129) — chosen once at [`new`](Self::new), + /// or [`TextEncoding::Ascii`] for a parsed blob. The value is written + /// NUL-terminated (count includes the NUL); an existing Copyright entry is + /// replaced in place (keeping IFD order), otherwise a new one is appended. + /// Materialized on the next [`to_bytes`](Self::to_bytes); the injected value + /// is owned, so the output is independent of any source. + /// + /// To *remove* the field instead, [`filtered`](Self::filtered) with a policy + /// that discards [`rights`](ExifPolicy::rights). `text` is written as-is (its + /// UTF-8 bytes); an embedded NUL truncates the field when later read. 
+ pub fn set_copyright(&mut self, text: &str) { + set_ifd0_string(&mut self.ifd0, TAG_COPYRIGHT, text, self.text_encoding); + } + + /// Set (insert or replace) the IFD0 Artist tag (0x013B) to `text`. See + /// [`set_copyright`](Self::set_copyright) for encoding and replace semantics. + pub fn set_artist(&mut self, text: &str) { + set_ifd0_string(&mut self.ifd0, TAG_ARTIST, text, self.text_encoding); + } + + /// Prune the tree by `policy`, returning a new view. Surviving parsed entries + /// still borrow the original source (no payload copy); an owned (edited) entry is cloned. + pub fn filtered(&self, policy: &ExifPolicy) -> Exif<'a> { + let keep = |e: &&Entry<'a>| match e.tag { + // Unmodeled sub-IFD pointer (only Exif/GPS are modeled; the Interop pointer is + // stripped from the Exif IFD at parse). Its offset can't be recomputed on a rewrite, + // so keeping it would emit a dangling offset / orphaned sub-IFD — drop it instead. + TAG_SUBIFDS => false, + // MakerNote is opaque and routinely embeds GPS coordinates + serials; + // it can only be dropped wholesale. Strip it whenever EITHER camera or + // GPS is being removed — otherwise a gps-strip could still leak the + // location carried inside the maker block. + TAG_MAKER_NOTE => policy.camera.keeps() && policy.gps.keeps(), + tag => policy.keeps(classify(tag)), + }; + let ifd0 = self.ifd0.iter().filter(keep).cloned().collect(); + let exif_ifd = self + .exif_ifd + .as_ref() + .map(|d| d.iter().filter(keep).cloned().collect::<Vec<_>>()) + .filter(|d: &Vec<_>| !d.is_empty()); + let gps_ifd = match policy.gps { + Retention::Keep => self.gps_ifd.clone(), + Retention::Discard => None, + }; + // IFD1 (thumbnail directory) carries its own Make/Model/DateTime/etc.; + // run it through the same per-category filter as IFD0 so "keep thumbnail, + // drop camera/datetimes" doesn't leak those via the thumbnail dir. The + // IFD1 wrapper is kept (possibly empty) to hold the thumbnail pointers + // that `to_bytes` synthesizes.
+ let (ifd1, thumbnail) = match policy.thumbnail { + Retention::Keep => ( + self.ifd1 + .as_ref() + .map(|d| d.iter().filter(keep).cloned().collect::>()), + self.thumbnail, + ), + Retention::Discard => (None, None), + }; + Exif { + order: self.order, + had_prefix: self.had_prefix, + ifd0, + exif_ifd, + gps_ifd, + ifd1, + thumbnail, + text_encoding: self.text_encoding, + } + } + + /// Serialize to a valid TIFF, recomputing every offset. Preserves the + /// source byte order and `Exif\0\0` framing. + pub fn to_bytes(&self) -> Vec { + let mut out = Vec::new(); + if self.had_prefix { + out.extend_from_slice(EXIF_PREFIX); + } + // All stored offsets are TIFF-relative (origin = the byte after any + // `Exif\0\0` prefix), so they're computed from `TIFF_HEADER_SIZE`, not + // from the prefixed buffer position. + + // Synthesized structural pointers per IFD (tag → kind always LONG). + let ifd0_ptrs = { + let mut v = Vec::new(); + if self.exif_ifd.is_some() { + v.push(TAG_EXIF_IFD); + } + if self.gps_ifd.is_some() { + v.push(TAG_GPS_IFD); + } + v + }; + let ifd1_ptrs: &[u16] = if self.thumbnail.is_some() { + &[TAG_THUMB_OFFSET, TAG_THUMB_LENGTH] + } else { + &[] + }; + + // Block sizes (entry table + ext data), pointers counted in the table. + // `ext_size` is the *deduplicated* out-of-line size, matching what + // `write_ifd` emits. 
+ let sz = |entries: &[Entry<'a>], nptr: usize| -> (usize, usize) { + let table = 2 + 12 * (entries.len() + nptr) + 4; + (table, ext_size(entries)) + }; + + let (t0, x0) = sz(&self.ifd0, ifd0_ptrs.len()); + let mut cursor = TIFF_HEADER_SIZE + t0 + x0; + let exif_off = self.exif_ifd.as_ref().map(|d| { + let o = cursor; + let (t, x) = sz(d, 0); + cursor += t + x; + o + }); + let gps_off = self.gps_ifd.as_ref().map(|d| { + let o = cursor; + let (t, x) = sz(d, 0); + cursor += t + x; + o + }); + let ifd1_off = self.ifd1.as_ref().map(|d| { + let o = cursor; + let (t, x) = sz(d, ifd1_ptrs.len()); + cursor += t + x; + o + }); + let thumb_off = self.thumbnail.map(|t| { + let o = cursor; + cursor += t.len(); + o + }); + + // Header. + match self.order { + ByteOrder::Little => out.extend_from_slice(b"II"), + ByteOrder::Big => out.extend_from_slice(b"MM"), + } + self.put16(&mut out, 42); + self.put32(&mut out, (TIFF_HEADER_SIZE) as u32); + + // IFD0 (with Exif/GPS pointers; next → IFD1). + let mut ifd0_ptr_vals = Vec::new(); + if let Some(o) = exif_off { + ifd0_ptr_vals.push((TAG_EXIF_IFD, o as u32)); + } + if let Some(o) = gps_off { + ifd0_ptr_vals.push((TAG_GPS_IFD, o as u32)); + } + self.write_ifd( + &mut out, + &self.ifd0, + &ifd0_ptr_vals, + TIFF_HEADER_SIZE + t0, + ifd1_off.unwrap_or(0) as u32, + ); + + if let Some(d) = &self.exif_ifd { + let eb = exif_off.unwrap() + 2 + 12 * d.len() + 4; + self.write_ifd(&mut out, d, &[], eb, 0); + } + if let Some(d) = &self.gps_ifd { + let eb = gps_off.unwrap() + 2 + 12 * d.len() + 4; + self.write_ifd(&mut out, d, &[], eb, 0); + } + if let Some(d) = &self.ifd1 { + let mut pv = Vec::new(); + if let (Some(to), Some(t)) = (thumb_off, self.thumbnail) { + pv.push((TAG_THUMB_OFFSET, to as u32)); + pv.push((TAG_THUMB_LENGTH, t.len() as u32)); + } + let eb = ifd1_off.unwrap() + 2 + 12 * (d.len() + ifd1_ptrs.len()) + 4; + self.write_ifd(&mut out, d, &pv, eb, 0); + } + if let Some(t) = self.thumbnail { + out.extend_from_slice(t); + } + out + } + 
+ fn put16(&self, out: &mut Vec, v: u16) { + match self.order { + ByteOrder::Big => out.extend_from_slice(&v.to_be_bytes()), + ByteOrder::Little => out.extend_from_slice(&v.to_le_bytes()), + } + } + + fn put32(&self, out: &mut Vec, v: u32) { + match self.order { + ByteOrder::Big => out.extend_from_slice(&v.to_be_bytes()), + ByteOrder::Little => out.extend_from_slice(&v.to_le_bytes()), + } + } + + /// Write one IFD: `ptr_vals` are synthesized LONG entries (tag → value); + /// `ext_base` is the TIFF-relative offset where this IFD's out-of-line + /// values begin; `next` is the next-IFD offset. + /// + /// Entries are written tag-sorted, and out-of-line values are laid out in + /// that **same** order (and deduplicated by alias). Matching the layout + /// order to the write order makes `to_bytes` *canonical* — re-serializing + /// its own output is a byte-exact fixpoint, so filtering is idempotent. + fn write_ifd( + &self, + out: &mut Vec, + entries: &[Entry<'a>], + ptr_vals: &[(u16, u32)], + ext_base: usize, + next: u32, + ) { + // Merge real entries and synthesized pointers, sorted by tag. + enum Item<'b, 'a> { + Real(&'b Entry<'a>), + Ptr(u16, u32), + } + let mut items: Vec = entries + .iter() + .map(Item::Real) + .chain(ptr_vals.iter().map(|&(t, v)| Item::Ptr(t, v))) + .collect(); + items.sort_by_key(|it| match it { + Item::Real(e) => e.tag, + Item::Ptr(t, _) => *t, + }); + + self.put16(out, items.len() as u16); + // Lay out ext data in this (tag-sorted) write order, deduping aliases. 
+ let mut ext = Vec::new(); + let mut placed: Vec<(usize, usize, usize)> = Vec::new(); // (ptr, len, ext_off) + for it in &items { + match it { + Item::Ptr(tag, val) => { + self.put16(out, *tag); + self.put16(out, TIFF_LONG); + self.put32(out, 1); + self.put32(out, *val); + } + Item::Real(e) => { + self.put16(out, e.tag); + self.put16(out, e.kind); + self.put32(out, e.count); + if e.value.len() <= 4 { + let mut v = [0u8; 4]; + v[..e.value.len()].copy_from_slice(&e.value); + out.extend_from_slice(&v); + } else { + let (ptr, len) = (e.value.as_ptr() as usize, e.value.len()); + let off = if let Some(&(.., o)) = + placed.iter().find(|&&(p, l, _)| p == ptr && l == len) + { + o + } else { + let o = ext.len(); + ext.extend_from_slice(&e.value); + if ext.len() % 2 == 1 { + ext.push(0); + } + placed.push((ptr, len, o)); + o + }; + self.put32(out, (ext_base + off) as u32); + } + } + } + } + self.put32(out, next); + out.extend_from_slice(&ext); + } +} + +/// Total out-of-line (>4-byte) byte size for one IFD, **deduplicating values +/// that alias the same source bytes** (so the count matches what `write_ifd` +/// emits). Order-independent — used by the layout pre-pass to size IFD blocks. +/// +/// Dedup defends against a serializer memory-amplification DoS: a malformed +/// IFD can point hundreds of entries at one out-of-line blob (parse is +/// zero-copy, so they alias). Without dedup, `to_bytes` would copy that blob +/// once per entry — up to ~1000× blowup. With dedup the rewritten output is +/// bounded by the source size. Entries that merely have *equal content* at +/// *different* source locations are not merged (only true aliases are). 
+fn ext_size(entries: &[Entry<'_>]) -> usize { + let mut total = 0usize; + let mut placed: Vec<(usize, usize)> = Vec::new(); // (ptr, len) + for e in entries { + if e.value.len() <= 4 { + continue; + } + let key = (e.value.as_ptr() as usize, e.value.len()); + if !placed.contains(&key) { + total += align2(e.value.len()); + placed.push(key); + } + } + total +} + +fn align2(n: usize) -> usize { + n + (n & 1) +} + +/// Read a SHORT or LONG value as `u32` (for thumbnail length / offset). +fn read_uint(e: &Entry<'_>, order: ByteOrder) -> Option { + match e.kind { + TIFF_SHORT => rd16(&e.value, 0, order).map(u32::from), + TIFF_LONG => rd32(&e.value, 0, order), + _ => None, + } +} + +/// Rewrite the IFD0 Orientation tag (0x0112) to `value` in place, returning a +/// new blob — or `None` if the blob is malformed or carries no SHORT/LONG +/// Orientation tag. +/// +/// Reuses [`Exif::parse`] to locate the tag's inline value (via the entry's +/// recorded [`value_offset`](Entry::value_offset)) rather than re-walking the +/// IFD, then overwrites those bytes — offset-preserving, so the rest of the blob +/// (offsets, thumbnail, all other tags) is byte-identical. Orientation is always +/// inline (a 1-element SHORT/LONG fits the 4-byte value field). +pub(crate) fn set_orientation(data: &[u8], value: Orientation) -> Option> { + let exif = Exif::parse(data)?; + let entry = exif.ifd0.iter().find(|e| e.tag == TAG_ORIENTATION)?; + let size = match entry.kind { + TIFF_SHORT => 2usize, + TIFF_LONG => 4, + _ => return None, // non-integer orientation carrier — leave untouched + }; + // `value_offset` is relative to the TIFF; account for the optional prefix. 
+ let base = if exif.had_prefix { + EXIF_PREFIX.len() + } else { + 0 + }; + let off = base.checked_add(entry.value_offset)?; + let v = u32::from(value as u8); + let mut out = data.to_vec(); + let dst = out.get_mut(off..off.checked_add(size)?)?; + match exif.order { + ByteOrder::Big => dst.copy_from_slice(&v.to_be_bytes()[4 - size..]), + ByteOrder::Little => dst.copy_from_slice(&v.to_le_bytes()[..size]), + } + Some(out) +} + +/// The raw bytes of a string-typed entry — ASCII (type 2) or UTF-8 (type 129, +/// Exif 2.32) — up to (not incl.) the first NUL. `None` if the tag is absent, +/// not a string type (a wrong-type field is ignored, not reinterpreted), or +/// empty. +fn ascii_bytes<'e>(entries: &'e [Entry<'_>], tag: u16) -> Option<&'e [u8]> { + let e = entries.iter().find(|e| e.tag == tag)?; + if e.kind != TIFF_ASCII && e.kind != TIFF_UTF8 { + return None; + } + let value: &[u8] = &e.value; + let end = value.iter().position(|&b| b == 0).unwrap_or(value.len()); + let bytes = &value[..end]; + if bytes.is_empty() { None } else { Some(bytes) } +} + +/// Lossy-UTF-8 *view* of [`ascii_bytes`]. EXIF type-2 is spec'd 7-bit ASCII; +/// real files embed UTF-8/Latin-1, so valid UTF-8 (incl. ASCII) is borrowed +/// and invalid sequences (e.g. raw Latin-1 `0xA9`) become U+FFFD (owned). The +/// result is always a valid `str`; it is a read-only view and is never written +/// back (the filter preserves the original bytes verbatim). +fn ascii_value<'e>(entries: &'e [Entry<'_>], tag: u16) -> Option> { + let bytes = ascii_bytes(entries, tag)?; + Some(match core::str::from_utf8(bytes) { + Ok(s) => Cow::Borrowed(s), + Err(_) => Cow::Owned(alloc::string::String::from_utf8_lossy(bytes).into_owned()), + }) +} + +/// Insert-or-replace a NUL-terminated string entry (Copyright, Artist, …) in an +/// IFD. The value is `text`'s UTF-8 bytes plus a trailing NUL (count includes +/// the NUL); the TIFF type is ASCII (2) or UTF-8 (129) per [`TextEncoding`]. 
The +/// owned `Cow` makes the entry independent of any source blob. An existing entry +/// with the same tag is overwritten in place (preserving IFD order); otherwise +/// the new entry is appended. +fn set_ifd0_string<'a>(entries: &mut Vec>, tag: u16, text: &str, encoding: TextEncoding) { + let kind = match encoding { + TextEncoding::Ascii => TIFF_ASCII, + TextEncoding::Utf8 => TIFF_UTF8, + }; + let mut bytes = text.as_bytes().to_vec(); + bytes.push(0); // NUL terminator; TIFF string count includes it. + let entry = Entry { + tag, + kind, + count: bytes.len() as u32, + value: Cow::Owned(bytes), + value_offset: 0, // injected entry: re-serialized by to_bytes, never rewritten in place + }; + match entries.iter_mut().find(|e| e.tag == tag) { + Some(slot) => *slot = entry, + None => entries.push(entry), + } +} + +/// Keep-or-discard for a single metadata field. Explicit (no `bool`-direction +/// ambiguity). +/// +/// `#[non_exhaustive]`: a future disposition (e.g. anonymize-in-place) can be +/// added without a breaking change. Query via [`keeps`](Self::keeps) / +/// [`discards`](Self::discards) rather than matching the variant, so callers +/// stay correct as variants are added. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +#[non_exhaustive] +pub enum Retention { + /// Keep the field. + Keep, + /// Discard the field. + Discard, +} + +impl Retention { + /// `true` if the field is kept. + #[inline] + #[must_use] + pub const fn keeps(self) -> bool { + matches!(self, Retention::Keep) + } + + /// `true` if the field is dropped. + #[inline] + #[must_use] + pub const fn discards(self) -> bool { + matches!(self, Retention::Discard) + } +} + +/// Per-category EXIF retention. Categories not matched by a specific field +/// fall under [`other`](Self::other). +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +#[non_exhaustive] +pub struct ExifPolicy { + /// Orientation tag (0x0112). + pub orientation: Retention, + /// Rights: Copyright (0x8298) + Artist (0x013B). 
+ pub rights: Retention, + /// Embedded thumbnail (IFD1 + its image data). + pub thumbnail: Retention, + /// GPS sub-IFD (location). + pub gps: Retention, + /// Capture timestamps (DateTime / Original / Digitized + sub-sec/offset). + pub datetimes: Retention, + /// Camera/device identity (Make, Model, Software, lens, serial, MakerNote). + pub camera: Retention, + /// Everything else (dimensions, exposure settings, …). + pub other: Retention, +} + +impl ExifPolicy { + /// Keep every category. + pub const KEEP_ALL: Self = Self { + orientation: Retention::Keep, + rights: Retention::Keep, + thumbnail: Retention::Keep, + gps: Retention::Keep, + datetimes: Retention::Keep, + camera: Retention::Keep, + other: Retention::Keep, + }; + /// Discard every category (drops EXIF entirely). + pub const DISCARD_ALL: Self = Self { + orientation: Retention::Discard, + rights: Retention::Discard, + thumbnail: Retention::Discard, + gps: Retention::Discard, + datetimes: Retention::Discard, + camera: Retention::Discard, + other: Retention::Discard, + }; + /// Keep only orientation + rights (the web default). + pub const ATTRIBUTED_ORIENTATION: Self = Self { + orientation: Retention::Keep, + rights: Retention::Keep, + ..Self::DISCARD_ALL + }; + /// Keep only orientation. + pub const ORIENTATION_ONLY: Self = Self { + orientation: Retention::Keep, + ..Self::DISCARD_ALL + }; + + /// Set the orientation category. (Builder — this type is `#[non_exhaustive]`, + /// so downstream crates tweak it from a const via `with_*` rather than + /// struct-update syntax.) + #[must_use] + pub const fn with_orientation(mut self, r: Retention) -> Self { + self.orientation = r; + self + } + /// Set the rights (copyright/artist) category. + #[must_use] + pub const fn with_rights(mut self, r: Retention) -> Self { + self.rights = r; + self + } + /// Set the thumbnail category. + #[must_use] + pub const fn with_thumbnail(mut self, r: Retention) -> Self { + self.thumbnail = r; + self + } + /// Set the GPS category. 
+ #[must_use] + pub const fn with_gps(mut self, r: Retention) -> Self { + self.gps = r; + self + } + /// Set the timestamps category. + #[must_use] + pub const fn with_datetimes(mut self, r: Retention) -> Self { + self.datetimes = r; + self + } + /// Set the camera/device-identity category. + #[must_use] + pub const fn with_camera(mut self, r: Retention) -> Self { + self.camera = r; + self + } + /// Set the "everything else" category. + #[must_use] + pub const fn with_other(mut self, r: Retention) -> Self { + self.other = r; + self + } + + fn keeps(&self, c: Category) -> bool { + match c { + Category::Orientation => self.orientation.keeps(), + Category::Rights => self.rights.keeps(), + Category::Datetimes => self.datetimes.keeps(), + Category::Camera => self.camera.keeps(), + Category::Other => self.other.keeps(), + } + } + + /// Whether every category is kept (→ source passes through unchanged). + pub fn keeps_everything(&self) -> bool { + self.orientation.keeps() + && self.rights.keeps() + && self.thumbnail.keeps() + && self.gps.keeps() + && self.datetimes.keeps() + && self.camera.keeps() + && self.other.keeps() + } + + /// Whether every category is discarded (→ EXIF dropped entirely). + pub fn discards_everything(&self) -> bool { + *self == Self::DISCARD_ALL + } +} + +/// Apply an [`ExifPolicy`] to a TIFF/EXIF blob, returning the retained bytes. +/// +/// - Keep-everything policy → [`Cow::Borrowed`] (source unchanged, zero-copy). +/// - Partial policy on parseable EXIF → [`Cow::Owned`] rewrite (or `None` if +/// nothing survives). +/// - Discard-everything policy → `None`. +/// - **Unparseable EXIF under a stripping policy → `None` (fail-safe).** If the +/// blob can't be parsed, the requested strip can't be verified, so the EXIF +/// is dropped rather than passed through — a passthrough could leak GPS / +/// camera data a lenient viewer might still read. (Orientation is unaffected: +/// it's carried separately on [`Metadata`](crate::Metadata).) 
A +/// keep-everything policy never reaches this path. +/// - **Oversize (> 4 GiB) under a stripping policy → `None` (fail-safe).** Such a +/// blob can't be safely rewritten (offsets are `u32`), so it is dropped rather +/// than passed through unfiltered — same reasoning as the unparseable case. +pub fn retain<'a>(src: &'a [u8], policy: &ExifPolicy) -> Option> { + if policy.discards_everything() { + return None; + } + if policy.keeps_everything() { + return Some(Cow::Borrowed(src)); + } + // A valid TIFF is ≤ 4 GiB (offsets are `u32`); a larger blob can't be + // rewritten without risking offset truncation. We've already returned for + // keep-everything above, so reaching here means a stripping policy — fail + // SAFE (drop) rather than pass the original through unfiltered, which would + // leak exactly the GPS/camera data the policy asked to remove. (Matches the + // unparseable-input fail-safe below; a >4 GiB EXIF blob is itself anomalous.) + if src.len() > u32::MAX as usize { + return None; + } + match Exif::parse(src) { + Some(exif) => { + let pruned = exif.filtered(policy); + if pruned.ifd0.is_empty() + && pruned.exif_ifd.is_none() + && pruned.gps_ifd.is_none() + && pruned.ifd1.is_none() + { + None + } else { + Some(Cow::Owned(pruned.to_bytes())) + } + } + // Unparseable EXIF under a stripping policy: we can't verify the strip, + // so fail safe and drop it (orientation survives on `Metadata`). + None => None, + } +} + +#[cfg(test)] +mod tests { + use super::*; + use alloc::vec; + + fn e(tag: u16, kind: u16, count: u32, value: &[u8]) -> Entry<'_> { + Entry { + tag, + kind, + count, + value: Cow::Borrowed(value), + value_offset: 0, // synthetic entries; only the rewrite path needs a real offset + } + } + + /// A full tree: IFD0 (Make + Orientation + Copyright) + Exif-IFD + /// (DateTimeOriginal) + GPS-IFD + IFD1 thumbnail. Built directly (the test + /// module sees private fields) and serialized by `to_bytes`. 
+ fn sample(order: ByteOrder, had_prefix: bool) -> alloc::vec::Vec { + let ori: alloc::vec::Vec = match order { + ByteOrder::Little => 6u16.to_le_bytes().to_vec(), + ByteOrder::Big => 6u16.to_be_bytes().to_vec(), + }; + // Leak-free: build owned then serialize; borrows live within this fn. + let exif = Exif { + order, + had_prefix, + ifd0: vec![ + e(TAG_MAKE, TIFF_ASCII, 4, b"Cam\0"), // Make (camera) + e(TAG_ORIENTATION, TIFF_SHORT, 1, &ori), // Orientation=Rotate90 + e(TAG_COPYRIGHT, TIFF_ASCII, 7, b"(c) Me\0"), // Copyright (out-of-line) + ], + exif_ifd: Some(vec![e(TAG_DATETIME_ORIGINAL, TIFF_ASCII, 5, b"2020\0")]), + gps_ifd: Some(vec![e(0x0001, TIFF_ASCII, 2, b"N\0")]), // GPSLatitudeRef + ifd1: Some(vec![]), + thumbnail: Some(&[0xFF, 0xD8, 0xFF, 0xD9]), + text_encoding: TextEncoding::Ascii, + }; + exif.to_bytes() + } + + #[test] + fn round_trip_full_tree_little_endian() { + let bytes = sample(ByteOrder::Little, false); + let x = Exif::parse(&bytes).expect("parses"); + assert_eq!(x.order, ByteOrder::Little); + assert_eq!(x.orientation(), Some(Orientation::Rotate90)); + assert_eq!(x.copyright().unwrap(), "(c) Me"); + assert!(x.has_gps()); + assert!(x.has_thumbnail()); + // Idempotent: re-serializing and re-parsing is stable. 
+ let bytes2 = x.to_bytes(); + let x2 = Exif::parse(&bytes2).expect("re-parses"); + assert_eq!(x2.orientation(), Some(Orientation::Rotate90)); + assert_eq!(x2.copyright().unwrap(), "(c) Me"); + assert!(x2.has_gps() && x2.has_thumbnail()); + } + + #[test] + fn round_trip_big_endian() { + let bytes = sample(ByteOrder::Big, false); + let x = Exif::parse(&bytes).expect("parses"); + assert_eq!(x.order, ByteOrder::Big); + assert_eq!(x.orientation(), Some(Orientation::Rotate90)); + assert_eq!(x.copyright().unwrap(), "(c) Me"); + assert!(x.has_gps() && x.has_thumbnail()); + } + + #[test] + fn round_trip_with_exif_prefix_and_subifds() { + // Exercises the offset fix: sub-IFD pointers must be TIFF-relative even + // with an `Exif\0\0` prefix present. + let bytes = sample(ByteOrder::Little, true); + assert_eq!(&bytes[..6], b"Exif\0\0"); + let x = Exif::parse(&bytes).expect("parses"); + assert_eq!(x.orientation(), Some(Orientation::Rotate90)); + assert_eq!(x.copyright().unwrap(), "(c) Me"); + assert!(x.has_gps()); + assert!(x.has_thumbnail()); + } + + #[test] + fn drop_gps_keeps_everything_else() { + let bytes = sample(ByteOrder::Little, false); + let x = Exif::parse(&bytes).unwrap(); + let p = ExifPolicy { + gps: Retention::Discard, + ..ExifPolicy::KEEP_ALL + }; + let out = x.filtered(&p).to_bytes(); + let y = Exif::parse(&out).unwrap(); + assert!(!y.has_gps()); + assert!(y.has_thumbnail()); + assert_eq!(y.orientation(), Some(Orientation::Rotate90)); + assert_eq!(y.copyright().unwrap(), "(c) Me"); + } + + #[test] + fn drop_thumbnail_keeps_everything_else() { + let bytes = sample(ByteOrder::Little, false); + let x = Exif::parse(&bytes).unwrap(); + let p = ExifPolicy { + thumbnail: Retention::Discard, + ..ExifPolicy::KEEP_ALL + }; + let out = x.filtered(&p).to_bytes(); + let y = Exif::parse(&out).unwrap(); + assert!(!y.has_thumbnail()); + assert!(y.has_gps()); + assert_eq!(y.orientation(), Some(Orientation::Rotate90)); + assert_eq!(y.copyright().unwrap(), "(c) Me"); + } + + 
#[test] + fn attributed_orientation_drops_camera_datetime_gps_thumbnail() { + let bytes = sample(ByteOrder::Little, false); + let x = Exif::parse(&bytes).unwrap(); + let out = x.filtered(&ExifPolicy::ATTRIBUTED_ORIENTATION).to_bytes(); + let y = Exif::parse(&out).unwrap(); + assert_eq!(y.orientation(), Some(Orientation::Rotate90)); + assert_eq!(y.copyright().unwrap(), "(c) Me"); + assert!(!y.has_gps()); + assert!(!y.has_thumbnail()); + // Camera (Make) and DateTimeOriginal gone. + assert!(!out.windows(2).any(|w| w == TAG_MAKE.to_le_bytes())); + assert!( + !out.windows(2) + .any(|w| w == TAG_DATETIME_ORIGINAL.to_le_bytes()) + ); + } + + #[test] + fn retain_cow_behaviour() { + let bytes = sample(ByteOrder::Little, false); + // Keep everything → borrows source (zero-copy). + assert!(matches!( + retain(&bytes, &ExifPolicy::KEEP_ALL), + Some(Cow::Borrowed(_)) + )); + // Prune → owns a rewritten buffer. + let p = ExifPolicy { + gps: Retention::Discard, + ..ExifPolicy::KEEP_ALL + }; + assert!(matches!(retain(&bytes, &p), Some(Cow::Owned(_)))); + // Discard everything → None. + assert!(retain(&bytes, &ExifPolicy::DISCARD_ALL).is_none()); + } + + #[test] + fn out_of_line_copyright_relocates_correctly() { + // A long copyright forces out-of-line storage; after a prune that + // shifts layout, it must still resolve. + let long = b"Copyright 2026 Lilith, all rights reserved worldwide.\0"; + let exif = Exif { + order: ByteOrder::Little, + had_prefix: false, + ifd0: vec![ + e(0x010F, 2, 4, b"Cam\0"), + e(TAG_COPYRIGHT, 2, long.len() as u32, long), + ], + exif_ifd: None, + gps_ifd: None, + ifd1: None, + thumbnail: None, + text_encoding: TextEncoding::Ascii, + }; + let pruned = exif.filtered(&ExifPolicy { + camera: Retention::Discard, + ..ExifPolicy::KEEP_ALL + }); + let out = pruned.to_bytes(); + let y = Exif::parse(&out).unwrap(); + assert_eq!( + y.copyright().unwrap(), + "Copyright 2026 Lilith, all rights reserved worldwide." 
+ ); + } + + #[test] + fn malformed_inputs_return_none() { + assert!(Exif::parse(b"").is_none()); + assert!(Exif::parse(b"garbage").is_none()); + assert!(Exif::parse(&[0u8; 7]).is_none()); + // Good header, IFD0 offset past EOF. + let mut bad = b"II".to_vec(); + bad.extend_from_slice(&42u16.to_le_bytes()); + bad.extend_from_slice(&9999u32.to_le_bytes()); + assert!(Exif::parse(&bad).is_none()); + } + + #[test] + fn excessive_entry_count_rejected() { + let mut bad = b"II".to_vec(); + bad.extend_from_slice(&42u16.to_le_bytes()); + bad.extend_from_slice(&8u32.to_le_bytes()); + bad.extend_from_slice(&60000u16.to_le_bytes()); // > MAX_IFD_ENTRIES + assert!(Exif::parse(&bad).is_none()); + } + + // ── Edge cases (mined from reference EXIF parsers) ────────────────────── + + /// Little-endian TIFF, IFD0 at offset 8, given 12-byte entries + tail. + fn le_ifd0(entries: &[[u8; 12]], next: u32, tail: &[u8]) -> alloc::vec::Vec { + let mut v = vec![b'I', b'I', 0x2A, 0x00]; + v.extend_from_slice(&8u32.to_le_bytes()); + v.extend_from_slice(&(entries.len() as u16).to_le_bytes()); + for e in entries { + v.extend_from_slice(e); + } + v.extend_from_slice(&next.to_le_bytes()); + v.extend_from_slice(tail); + v + } + + /// One little-endian IFD entry with an inline (≤4-byte) value. + fn entry_inline(tag: u16, kind: u16, count: u32, val: [u8; 4]) -> [u8; 12] { + let mut e = [0u8; 12]; + e[0..2].copy_from_slice(&tag.to_le_bytes()); + e[2..4].copy_from_slice(&kind.to_le_bytes()); + e[4..8].copy_from_slice(&count.to_le_bytes()); + e[8..12].copy_from_slice(&val); + e + } + + /// One little-endian IFD entry whose value lives out-of-line at `offset`. + fn entry_offset(tag: u16, kind: u16, count: u32, offset: u32) -> [u8; 12] { + entry_inline(tag, kind, count, offset.to_le_bytes()) + } + + #[test] + fn orientation_as_rational_is_rejected() { + // Orientation declared RATIONAL (type 5) → orientation() returns None + // (only SHORT/LONG accepted), but the blob still parses. 
+ let e = entry_offset(TAG_ORIENTATION, 5, 1, 26); // 8-byte value in tail + let blob = le_ifd0(&[e], 0, &[1, 0, 0, 0, 1, 0, 0, 0]); + let x = Exif::parse(&blob).expect("parses"); + assert_eq!(x.orientation(), None); + } + + #[test] + fn ascii_no_nul_terminator_uses_whole_value() { + let e = entry_inline(TAG_COPYRIGHT, TIFF_ASCII, 4, *b"abcd"); // no NUL + let blob = le_ifd0(&[e], 0, &[]); + assert_eq!(Exif::parse(&blob).unwrap().copyright().unwrap(), "abcd"); + } + + #[test] + fn ascii_embedded_nul_truncates() { + let e = entry_offset(TAG_COPYRIGHT, TIFF_ASCII, 6, 26); + let blob = le_ifd0(&[e], 0, b"ab\0cd\0"); + assert_eq!(Exif::parse(&blob).unwrap().copyright().unwrap(), "ab"); + } + + #[test] + fn ascii_leading_nul_is_none() { + let e = entry_inline(TAG_COPYRIGHT, TIFF_ASCII, 1, [0, 0, 0, 0]); + let blob = le_ifd0(&[e], 0, &[]); + assert!(Exif::parse(&blob).unwrap().copyright().is_none()); + } + + #[test] + fn latin1_copyright_decodes_lossy() { + // 0xA9 is Latin-1 ©, invalid UTF-8 → U+FFFD via lossy decode. + let e = entry_offset(TAG_COPYRIGHT, TIFF_ASCII, 5, 26); + let blob = le_ifd0(&[e], 0, b"\xA9 Me\0"); + assert_eq!( + Exif::parse(&blob).unwrap().copyright().unwrap(), + "\u{FFFD} Me" + ); + } + + #[test] + fn utf8_copyright_decodes_borrowed() { + // 0xC2 0xA9 is UTF-8 © → valid, returned borrowed. + let e = entry_offset(TAG_COPYRIGHT, TIFF_ASCII, 6, 26); + let blob = le_ifd0(&[e], 0, b"\xC2\xA9 Me\0"); + let x = Exif::parse(&blob).unwrap(); + assert_eq!(x.copyright().unwrap(), "© Me"); + assert!(matches!(x.copyright().unwrap(), Cow::Borrowed(_))); + } + + #[test] + fn copyright_wrong_type_is_ignored() { + // Copyright tag declared SHORT (not ASCII) → copyright() returns None + // rather than reinterpreting the bytes as a string. 
+ let e = entry_inline(TAG_COPYRIGHT, TIFF_SHORT, 1, [6, 0, 0, 0]); + let blob = le_ifd0(&[e], 0, &[]); + assert!(Exif::parse(&blob).unwrap().copyright().is_none()); + } + + #[test] + fn count_type_size_overflow_entry_skipped_others_survive() { + // An entry whose count × type_size overflows (or is absurdly large) is + // skipped, not fatal — sibling entries still parse (graceful + // degradation). Orientation here survives the bad RATIONAL entry. + let ori = entry_inline(TAG_ORIENTATION, TIFF_SHORT, 1, [6, 0, 0, 0]); + let bad = entry_offset(0x0111, 5, 0x8000_0000, 100); // huge RATIONAL + let blob = le_ifd0(&[ori, bad], 0, &[]); + let x = Exif::parse(&blob).expect("salvages the good entry"); + assert_eq!(x.orientation(), Some(Orientation::Rotate90)); + } + + #[test] + fn unknown_tiff_type_entry_skipped() { + // A future/unknown TIFF type (99) is skipped, not fatal. + let ori = entry_inline(TAG_ORIENTATION, TIFF_SHORT, 1, [6, 0, 0, 0]); + let weird = entry_inline(0x9999, 99, 1, [1, 2, 3, 4]); + let blob = le_ifd0(&[ori, weird], 0, &[]); + let x = Exif::parse(&blob).expect("salvages the good entry"); + assert_eq!(x.orientation(), Some(Orientation::Rotate90)); + } + + #[test] + fn truncated_entry_table_salvages_prior_entries() { + // IFD claims 2 entries but only 1 (+ a stub) is present: the readable + // entry survives, the truncated one is dropped, no panic. + let mut v = vec![b'I', b'I', 0x2A, 0x00]; + v.extend_from_slice(&8u32.to_le_bytes()); + v.extend_from_slice(&2u16.to_le_bytes()); // claims 2 entries + v.extend_from_slice(&entry_inline(TAG_ORIENTATION, TIFF_SHORT, 1, [6, 0, 0, 0])); + v.extend_from_slice(&[0xAA, 0xBB]); // truncated 2nd entry (only 2 of 12 bytes) + let x = Exif::parse(&v).expect("salvages the readable entry"); + assert_eq!(x.orientation(), Some(Orientation::Rotate90)); + } + + #[test] + fn unparseable_exif_under_stripping_policy_drops_fail_safe() { + // Can't parse → can't verify the strip → drop (fail-safe), not pass + // through. 
(A keep-everything policy never reaches this path.) + assert!( + retain( + b"not a valid tiff blob", + &ExifPolicy::ATTRIBUTED_ORIENTATION + ) + .is_none() + ); + // Keep-everything still passes through unchanged. + let garbage = b"not a valid tiff blob"; + assert!(matches!( + retain(garbage, &ExifPolicy::KEEP_ALL), + Some(Cow::Borrowed(_)) + )); + } + + #[test] + fn exif_pointer_to_invalid_offset_is_swallowed() { + // Exif-IFD pointer past EOF → exif_ifd None, IFD0 still readable. + let ori = entry_inline(TAG_ORIENTATION, TIFF_SHORT, 1, [6, 0, 0, 0]); + let ptr = entry_inline(TAG_EXIF_IFD, TIFF_LONG, 1, 0xFFFFu32.to_le_bytes()); + let blob = le_ifd0(&[ori, ptr], 0, &[]); + let x = Exif::parse(&blob).expect("parses"); + assert_eq!(x.orientation(), Some(Orientation::Rotate90)); + assert!(!x.has_gps()); + } + + #[test] + fn exif_pointer_cycle_to_ifd0_terminates() { + // Exif-IFD pointer points back at IFD0 (offset 8). zencodec re-parses + // IFD0 once as the child (no recursion into child pointers) → must + // terminate, no hang/stack-overflow. 
+ let ori = entry_inline(TAG_ORIENTATION, TIFF_SHORT, 1, [6, 0, 0, 0]); + let ptr = entry_inline(TAG_EXIF_IFD, TIFF_LONG, 1, 8u32.to_le_bytes()); + let blob = le_ifd0(&[ori, ptr], 0, &[]); + let x = Exif::parse(&blob).expect("parses + terminates"); + assert_eq!(x.orientation(), Some(Orientation::Rotate90)); + } + + #[test] + fn huge_thumbnail_1mb_round_trips_and_borrows() { + let big = vec![0xABu8; 1 << 20]; // 1 MiB thumbnail + let exif = Exif { + order: ByteOrder::Little, + had_prefix: false, + ifd0: vec![e(TAG_ORIENTATION, TIFF_SHORT, 1, &[6, 0])], + exif_ifd: None, + gps_ifd: None, + ifd1: Some(vec![]), + thumbnail: Some(&big), + text_encoding: TextEncoding::Ascii, + }; + let bytes = exif.to_bytes(); + let y = Exif::parse(&bytes).expect("parses"); + assert!(y.has_thumbnail()); + assert_eq!(y.thumbnail.unwrap().len(), 1 << 20); + assert_eq!(y.thumbnail.unwrap(), &big[..]); + // Keeping everything borrows the source — no copy of the 1 MiB payload. + assert!(matches!( + retain(&bytes, &ExifPolicy::KEEP_ALL), + Some(Cow::Borrowed(_)) + )); + } + + // ── Regressions for adversarial-review findings ───────────────────────── + + /// #4: a thumbnail whose length tag (0x0202) is encoded as SHORT (common in + /// real cameras) must be recognized, not silently dropped. + #[test] + fn thumbnail_length_as_short_is_recognized() { + // IFD0 @8 (orientation, next→IFD1@26); IFD1 @26 (0x0201 LONG offset=56, + // 0x0202 SHORT length=4); thumbnail bytes @56. + let mut v = vec![b'I', b'I', 0x2A, 0x00]; + v.extend_from_slice(&8u32.to_le_bytes()); + // IFD0: 1 entry, next = 26. + v.extend_from_slice(&1u16.to_le_bytes()); + v.extend_from_slice(&entry_inline(TAG_ORIENTATION, TIFF_SHORT, 1, [6, 0, 0, 0])); + v.extend_from_slice(&26u32.to_le_bytes()); + // IFD1 @26: 2 entries, next = 0. 
+ v.extend_from_slice(&2u16.to_le_bytes()); + v.extend_from_slice(&entry_inline( + TAG_THUMB_OFFSET, + TIFF_LONG, + 1, + 56u32.to_le_bytes(), + )); + v.extend_from_slice(&entry_inline(TAG_THUMB_LENGTH, TIFF_SHORT, 1, [4, 0, 0, 0])); // SHORT! + v.extend_from_slice(&0u32.to_le_bytes()); + // Thumbnail @56. + v.extend_from_slice(&[0xFF, 0xD8, 0xFF, 0xD9]); + + let x = Exif::parse(&v).expect("parses"); + assert!( + x.has_thumbnail(), + "SHORT-length thumbnail must be recognized" + ); + assert_eq!(x.thumbnail.unwrap(), &[0xFF, 0xD8, 0xFF, 0xD9]); + assert_eq!(x.orientation(), Some(Orientation::Rotate90)); + } + + /// #1: many entries aliasing one out-of-line blob must not amplify the + /// rewritten output (`ext_size` dedups true aliases). + #[test] + fn aliased_out_of_line_values_do_not_amplify() { + // 40 "other"-category entries, ASCII count=100, all pointing at one + // 100-byte blob in the tail (aliased after zero-copy parse). + let n = 40u32; + let blob_len = 100u32; + let tail_off = 8 + 2 + 12 * n + 4; + let entries: Vec<[u8; 12]> = (0..n) + .map(|i| entry_offset(0x1000 + i as u16, TIFF_ASCII, blob_len, tail_off)) + .collect(); + let src = le_ifd0(&entries, 0, &vec![0x41u8; blob_len as usize]); + + let x = Exif::parse(&src).expect("parses"); + // Force a rewrite while keeping the aliased "other" entries. + let policy = ExifPolicy::KEEP_ALL.with_gps(Retention::Discard); + let out = x.filtered(&policy).to_bytes(); + // Deduped: one shared blob, not 40 copies. Without dedup this would be + // ~40 × 100 = 4000+ bytes of ext; with it, ~100. + assert!( + out.len() < src.len() + 64, + "amplification: {} vs src {}", + out.len(), + src.len() + ); + assert!(Exif::parse(&out).is_some(), "rewritten output re-parses"); + } + + /// #3: a structural sub-IFD pointer too short to hold a 4-byte offset is + /// preserved as an ordinary entry, not silently dropped. 
+ #[test] + fn short_subifd_pointer_is_preserved() { + let ori = entry_inline(TAG_ORIENTATION, TIFF_SHORT, 1, [6, 0, 0, 0]); + // 0x8769 as BYTE/count-1 → 1-byte value, not a usable pointer. + let bad_ptr = entry_inline(TAG_EXIF_IFD, 1, 1, [1, 0, 0, 0]); + let src = le_ifd0(&[ori, bad_ptr], 0, &[]); + let x = Exif::parse(&src).expect("parses"); + assert_eq!(x.orientation(), Some(Orientation::Rotate90)); + let out = x.to_bytes(); + // The 0x8769 entry survived the round-trip (LE tag bytes 0x69, 0x87). + assert!(out.windows(2).any(|w| w == TAG_EXIF_IFD.to_le_bytes())); + } + + /// Encoding: a non-ASCII (Latin-1) copyright is exposed raw via + /// `copyright_bytes`, viewed lossily via `copyright`, and — critically — + /// survives a pruning rewrite **byte-exact** (no transcode, no corruption, + /// no "fixing" to ASCII). + #[test] + fn non_ascii_copyright_preserved_byte_exact() { + // 0xA9 = Latin-1 ©, invalid UTF-8. + let e = entry_offset(TAG_COPYRIGHT, TIFF_ASCII, 5, 26); + let src = le_ifd0(&[e], 0, b"\xA9 Me\0"); + let x = Exif::parse(&src).expect("parses"); + assert_eq!(x.copyright_bytes(), Some(&b"\xA9 Me"[..])); // exact bytes + assert_eq!(x.copyright().unwrap(), "\u{FFFD} Me"); // lossy view + + // Force a rewrite that keeps rights (copyright). Bytes must be preserved + // verbatim — NOT transcoded to the UTF-8 of U+FFFD. + let policy = ExifPolicy::KEEP_ALL.with_gps(Retention::Discard); + let out = x.filtered(&policy).to_bytes(); + let y = Exif::parse(&out).expect("re-parses"); + assert_eq!( + y.copyright_bytes(), + Some(&b"\xA9 Me"[..]), + "Latin-1 copyright must round-trip byte-exact, not transcode" + ); + } + + /// Canonicalization (regression for a fuzz-found non-idempotence): + /// `to_bytes` is a byte-exact fixpoint even when input entries are not + /// tag-sorted and carry out-of-line values — the ext layout follows the + /// tag-sorted write order, so re-serializing the output reproduces it and + /// filtering is idempotent. 
+ #[test] + fn to_bytes_is_canonical_fixpoint() { + let va = [0xAAu8; 10]; + let vb = [0xBBu8; 10]; + // Descending tag order (unsorted), both out-of-line (>4 bytes). + let x = Exif { + order: ByteOrder::Little, + had_prefix: false, + ifd0: vec![ + e(0x0200, TIFF_ASCII, 10, &va), + e(0x0100, TIFF_ASCII, 10, &vb), + ], + exif_ifd: None, + gps_ifd: None, + ifd1: None, + thumbnail: None, + text_encoding: TextEncoding::Ascii, + }; + let b1 = x.to_bytes(); + let b2 = Exif::parse(&b1).expect("re-parses").to_bytes(); + assert_eq!(b1, b2, "to_bytes must be a byte-exact fixpoint (canonical)"); + } + + /// Exif 2.32 / CIPA DC-008 Table 6: Copyright/Artist may be **UTF-8** + /// (type 129), not just ASCII (type 2). A UTF-8-typed Copyright must parse, + /// read as Unicode, and round-trip (type + bytes preserved) — not get + /// dropped as an unknown type. + #[test] + fn utf8_typed_copyright_parses_and_round_trips() { + // "© Me\0" = C2 A9 20 4D 65 00 (6 bytes incl. NUL), stored out-of-line. + let e = entry_offset(TAG_COPYRIGHT, TIFF_UTF8, 6, 26); + let blob = le_ifd0(&[e], 0, b"\xC2\xA9 Me\0"); + let x = Exif::parse(&blob).expect("parses UTF-8-typed copyright"); + assert_eq!(x.copyright().unwrap(), "© Me"); + assert!(matches!(x.copyright().unwrap(), Cow::Borrowed(_))); // valid UTF-8 + let out = x.to_bytes(); + let y = Exif::parse(&out).expect("re-parses"); + assert_eq!(y.copyright().unwrap(), "© Me"); + assert_eq!(y.copyright_bytes(), Some(&b"\xC2\xA9 Me"[..])); + // The UTF-8 type (129 → LE 0x81,0x00) survived the round-trip. + assert!(out.windows(2).any(|w| w == TIFF_UTF8.to_le_bytes())); + } + + /// Filtering is idempotent: re-filtering the result with the same policy + /// yields byte-identical EXIF (relies on the canonical `to_bytes`). 
+ #[test] + fn filtering_is_idempotent() { + let src = sample(ByteOrder::Little, false); + let policy = ExifPolicy::KEEP_ALL.with_gps(Retention::Discard); + let once = match retain(&src, &policy) { + Some(c) => c.into_owned(), + None => return, + }; + let twice = retain(&once, &policy).map(|c| c.into_owned()); + assert_eq!(Some(once), twice, "filtering must be idempotent"); + } + + /// Attribution vs device identity: the Exif-IFD creator/owner *name* tags + /// (Photographer 0xA437, etc.) are `Rights` — a rights-keeping policy keeps + /// them — while device tags (BodySerialNumber 0xA431) are `Camera` and get + /// stripped. (Regression for the copyright-owner-vs-string classification.) + #[test] + fn attribution_tags_kept_device_tags_dropped() { + let exif = Exif { + order: ByteOrder::Little, + had_prefix: false, + ifd0: vec![e(TAG_ORIENTATION, TIFF_SHORT, 1, &[6, 0])], + exif_ifd: Some(vec![ + e(TAG_BODY_SERIAL_NUMBER, TIFF_ASCII, 4, b"SN1\0"), // → Camera + e(TAG_PHOTOGRAPHER, TIFF_ASCII, 4, b"Me\0\0"), // → Rights + ]), + gps_ifd: None, + ifd1: None, + thumbnail: None, + text_encoding: TextEncoding::Ascii, + }; + // Web keeps orientation + rights, drops camera/device. + let out = exif + .filtered(&ExifPolicy::ATTRIBUTED_ORIENTATION) + .to_bytes(); + assert!( + out.windows(2).any(|w| w == TAG_PHOTOGRAPHER.to_le_bytes()), + "Photographer (attribution) must survive a rights policy" + ); + assert!( + !out.windows(2) + .any(|w| w == TAG_BODY_SERIAL_NUMBER.to_le_bytes()), + "BodySerialNumber (device identity) must be stripped" + ); + } + + // ── Editing: set_copyright / set_artist (Exif 2.x ASCII vs Exif 3.0 UTF-8) ─ + + /// One orientation-only IFD0 blob, for insert tests. + fn orientation_only() -> alloc::vec::Vec { + le_ifd0( + &[entry_inline(TAG_ORIENTATION, TIFF_SHORT, 1, [6, 0, 0, 0])], + 0, + &[], + ) + } + + /// Insert a Copyright into a blob that had none, as Exif 2.x ASCII (type 2). + /// Round-trips through serialize → parse; the other tags are untouched. 
+ #[test] + fn set_copyright_inserts_ascii_type2() { + let blob = orientation_only(); + let mut x = Exif::parse(&blob).unwrap(); + assert!(x.copyright().is_none()); + x.set_copyright("(c) 2026 Lilith"); + let out = x.to_bytes(); + let y = Exif::parse(&out).unwrap(); + assert_eq!(y.copyright().unwrap(), "(c) 2026 Lilith"); + assert_eq!(y.copyright_bytes(), Some(&b"(c) 2026 Lilith"[..])); + assert_eq!(y.orientation(), Some(Orientation::Rotate90)); // unchanged + // Stored as ASCII (type 2 → LE 0x02,0x00). + assert!(out.windows(2).any(|w| w == TIFF_ASCII.to_le_bytes())); + } + + /// Set a Copyright as Exif 3.0 UTF-8 (type 129); the declared type survives. + #[test] + fn set_copyright_utf8_writes_type129() { + // Exif 3.0 / type-129 blob (the explicit opt-in). + let mut x = Exif::new(TextEncoding::Utf8); + x.set_copyright("© 2026 Lilith"); + let out = x.to_bytes(); + let y = Exif::parse(&out).unwrap(); + assert_eq!(y.copyright().unwrap(), "© 2026 Lilith"); + // UTF-8 type (129 → LE 0x81,0x00) is what was written. + assert!(out.windows(2).any(|w| w == TIFF_UTF8.to_le_bytes())); + } + + /// Setting Copyright replaces an existing entry in place (no duplicate). + #[test] + fn set_copyright_replaces_existing() { + let src = sample(ByteOrder::Little, false); // has "(c) Me" + let mut x = Exif::parse(&src).unwrap(); + assert_eq!(x.copyright().unwrap(), "(c) Me"); + x.set_copyright("(c) New Owner"); + let out = x.to_bytes(); + let y = Exif::parse(&out).unwrap(); + assert_eq!(y.copyright().unwrap(), "(c) New Owner"); + let n = y.ifd0.iter().filter(|e| e.tag == TAG_COPYRIGHT).count(); + assert_eq!(n, 1, "must replace, not duplicate"); + } + + /// Exif 2.x ASCII shoehorns UTF-8 bytes into the type-2 field (the de-facto + /// convention): the bytes are the string's UTF-8, the type stays 2, and the + /// value reads back as the same Unicode. 
+ #[test] + fn set_copyright_ascii_carries_utf8_bytes_defacto() { + let blob = orientation_only(); + let mut x = Exif::parse(&blob).unwrap(); + x.set_copyright("© Лилит"); // non-ASCII into type 2 + let out = x.to_bytes(); + let y = Exif::parse(&out).unwrap(); + assert_eq!(y.copyright_bytes(), Some("© Лилит".as_bytes())); + assert_eq!(y.copyright().unwrap(), "© Лилит"); // valid UTF-8 → reads back + assert!(out.windows(2).any(|w| w == TIFF_ASCII.to_le_bytes())); + } + + /// `set_artist` mirrors `set_copyright` and lands in the `rights` category + /// (kept by a rights-keeping policy, dropped when rights are discarded). + #[test] + fn set_artist_round_trips_and_is_rights() { + let blob = orientation_only(); + let mut x = Exif::parse(&blob).unwrap(); + x.set_artist("Lilith"); + let out = x.to_bytes(); + let y = Exif::parse(&out).unwrap(); + assert_eq!(y.artist().unwrap(), "Lilith"); + let kept = y.filtered(&ExifPolicy::ATTRIBUTED_ORIENTATION).to_bytes(); + assert!(Exif::parse(&kept).unwrap().artist().is_some()); + let dropped = y + .filtered(&ExifPolicy::KEEP_ALL.with_rights(Retention::Discard)) + .to_bytes(); + assert!(Exif::parse(&dropped).unwrap().artist().is_none()); + } + + /// An edited (owned) entry survives a layout-shifting rewrite: the value is + /// owned, not aliased to a source offset, so the serializer relocates it like + /// any other out-of-line value. 
+ #[test] + fn edited_copyright_survives_filter_rewrite() { + let src = sample(ByteOrder::Little, false); + let mut x = Exif::parse(&src).unwrap(); + let long = "Copyright 2026 Lilith River — all rights reserved worldwide."; + x.set_copyright(long); // long → out-of-line + let out = x + .filtered(&ExifPolicy::KEEP_ALL.with_gps(Retention::Discard)) + .to_bytes(); + let y = Exif::parse(&out).unwrap(); + assert_eq!(y.copyright().unwrap(), long); + assert!(!y.has_gps()); + assert_eq!(y.orientation(), Some(Orientation::Rotate90)); + } + + /// Editing then serializing stays canonical (a byte-exact fixpoint), so an + /// edited blob filters idempotently like a parsed one. + #[test] + fn edited_to_bytes_is_canonical_fixpoint() { + let blob = orientation_only(); + let mut x = Exif::parse(&blob).unwrap(); + x.set_copyright("(c) Me"); + let b1 = x.to_bytes(); + let b2 = Exif::parse(&b1).unwrap().to_bytes(); + assert_eq!(b1, b2, "edited output must be a canonical fixpoint"); + } + + // ── Privacy hardening (MakerNote / SubIFDs / IFD1) ─────────────────────── + + /// MakerNote (0x927C) is opaque and can embed GPS/serials; it must drop when + /// GPS is stripped even if the `camera` category is kept (it can't be + /// selectively scrubbed), and stay when both camera and gps are kept. + #[test] + fn makernote_dropped_when_gps_stripped_even_if_camera_kept() { + let exif = Exif { + order: ByteOrder::Little, + had_prefix: false, + ifd0: vec![ + e(TAG_ORIENTATION, TIFF_SHORT, 1, &[6, 0]), + e(TAG_MAKER_NOTE, TIFF_UNDEFINED, 8, b"MAKER\0\0\0"), + ], + exif_ifd: None, + gps_ifd: None, + ifd1: None, + thumbnail: None, + text_encoding: TextEncoding::Ascii, + }; + // Keep camera, drop GPS → MakerNote must be gone (could carry location). 
+ let stripped = exif + .filtered(&ExifPolicy::KEEP_ALL.with_gps(Retention::Discard)) + .to_bytes(); + assert!( + !stripped + .windows(2) + .any(|w| w == TAG_MAKER_NOTE.to_le_bytes()), + "MakerNote must be stripped when GPS is dropped" + ); + // Both camera and gps kept (drop only `other`) → MakerNote survives. + let kept = exif + .filtered(&ExifPolicy::KEEP_ALL.with_other(Retention::Discard)) + .to_bytes(); + assert!( + kept.windows(2).any(|w| w == TAG_MAKER_NOTE.to_le_bytes()), + "MakerNote kept when both camera and gps are kept" + ); + } + + /// An unmodeled SubIFDs pointer (0x014A) is dropped on a rewrite rather than + /// left as a dangling offset; the rest of IFD0 survives. + #[test] + fn subifds_pointer_dropped_on_rewrite() { + let exif = Exif { + order: ByteOrder::Little, + had_prefix: false, + ifd0: vec![ + e(TAG_ORIENTATION, TIFF_SHORT, 1, &[6, 0]), + e(TAG_SUBIFDS, TIFF_LONG, 1, &[0x40, 0, 0, 0]), + ], + exif_ifd: None, + gps_ifd: None, + ifd1: None, + thumbnail: None, + text_encoding: TextEncoding::Ascii, + }; + let out = exif + .filtered(&ExifPolicy::KEEP_ALL.with_gps(Retention::Discard)) + .to_bytes(); + assert!( + !out.windows(2).any(|w| w == TAG_SUBIFDS.to_le_bytes()), + "SubIFDs pointer must be dropped on rewrite (would dangle)" + ); + assert_eq!( + Exif::parse(&out).unwrap().orientation(), + Some(Orientation::Rotate90) + ); + } + + /// IFD1 (thumbnail dir) entries obey categories: keep the thumbnail image but + /// drop camera → the thumbnail survives while IFD1's Make is stripped. 
+ #[test] + fn ifd1_entries_filtered_by_category() { + let exif = Exif { + order: ByteOrder::Little, + had_prefix: false, + ifd0: vec![e(TAG_ORIENTATION, TIFF_SHORT, 1, &[6, 0])], + exif_ifd: None, + gps_ifd: None, + ifd1: Some(vec![e(TAG_MAKE, TIFF_ASCII, 4, b"Cam\0")]), // camera tag in IFD1 + thumbnail: Some(&[0xFF, 0xD8, 0xFF, 0xD9]), + text_encoding: TextEncoding::Ascii, + }; + let out = exif + .filtered(&ExifPolicy::KEEP_ALL.with_camera(Retention::Discard)) + .to_bytes(); + let y = Exif::parse(&out).unwrap(); + assert!(y.has_thumbnail(), "thumbnail image must be kept"); + assert!( + !out.windows(2).any(|w| w == TAG_MAKE.to_le_bytes()), + "IFD1 camera tag (Make) must be stripped" + ); + } + + // ── From-scratch construction (Exif::new) ──────────────────────────────── + + /// Build a fresh EXIF from nothing: new → set_copyright → to_bytes → parse. + #[test] + fn new_from_scratch_copyright_round_trips() { + let mut exif = Exif::new(TextEncoding::Ascii); + assert!(exif.copyright().is_none()); + exif.set_copyright("(c) 2026 Lilith"); + let blob = exif.to_bytes(); + let y = Exif::parse(&blob).expect("fresh blob parses"); + assert_eq!(y.copyright().unwrap(), "(c) 2026 Lilith"); + assert!(!y.has_gps() && !y.has_thumbnail()); + // Copyright is `rights`, so it survives even the web preset. + let kept = y.filtered(&ExifPolicy::ATTRIBUTED_ORIENTATION).to_bytes(); + assert_eq!( + Exif::parse(&kept).unwrap().copyright().unwrap(), + "(c) 2026 Lilith" + ); + } + + /// `Exif::default()` == empty `new()`, and an empty blob round-trips. 
+ #[test] + fn new_default_empty_round_trips() { + let blob = Exif::default().to_bytes(); + let y = Exif::parse(&blob).expect("empty blob parses"); + assert!(y.copyright().is_none() && !y.has_gps() && !y.has_thumbnail()); + } +} diff --git a/src/gainmap.rs b/src/gainmap.rs index 9919b31..8b8c205 100644 --- a/src/gainmap.rs +++ b/src/gainmap.rs @@ -41,9 +41,9 @@ use crate::info::Cicp; /// /// | Context | Zencodec produces/consumes | Framing the caller adds on top | /// |---------|----------------------------|--------------------------------| -/// | JPEG APP2 (secondary) | [`JpegApp2BodyWithUrn`] bytes — URN + payload | `FF E2` marker + `u16 BE` length (length counts itself + these bytes) | -/// | AVIF `tmap` item | [`AvifTmap`] bytes — this *is* the tmap item payload | ISOBMFF item framing (`iinf` / `iloc` / `iref`) pointing at these bytes | -/// | JXL `jhgm` box | [`JpegApp2`] bytes go into the bundle's `gain_map_metadata` field only — **no URN** | A whole `JxlGainMapBundle` (`jhgm_version u8` + `gain_map_metadata_size u16 BE` + **payload** + color encoding + alt ICC + JXL codestream), then the ISOBMFF `jhgm` box around the bundle | +/// | JPEG APP2 (secondary) | [`JpegApp2BodyWithUrn`](Iso21496Format::JpegApp2BodyWithUrn) bytes — URN + payload | `FF E2` marker + `u16 BE` length (length counts itself + these bytes) | +/// | AVIF `tmap` item | [`AvifTmap`](Iso21496Format::AvifTmap) bytes — this *is* the tmap item payload | ISOBMFF item framing (`iinf` / `iloc` / `iref`) pointing at these bytes | +/// | JXL `jhgm` box | [`JpegApp2`](Iso21496Format::JpegApp2) bytes go into the bundle's `gain_map_metadata` field only — **no URN** | A whole `JxlGainMapBundle` (`jhgm_version u8` + `gain_map_metadata_size u16 BE` + **payload** + color encoding + alt ICC + JXL codestream), then the ISOBMFF `jhgm` box around the bundle | /// /// **Note on JXL:** `jhgm` is *not* just "an ISOBMFF box around the ISO /// payload". 
It's a structured bundle with its own header and trailing @@ -795,7 +795,7 @@ pub enum GainMapParseError { MinExceedsMax { channel: usize, min: f64, max: f64 }, /// A value is NaN or infinity. NonFiniteValue { field: &'static str }, - /// Input to [`parse_iso21496_with_urn`] did not begin with the ISO 21496-1 URN. + /// Input to [`parse_iso21496_fmt`] did not begin with the ISO 21496-1 URN. UrnMismatch, } diff --git a/src/helpers/exif.rs b/src/helpers/exif.rs index 65cd561..29cd05e 100644 --- a/src/helpers/exif.rs +++ b/src/helpers/exif.rs @@ -1,59 +1,17 @@ -//! Minimal EXIF orientation parser. +//! Lightweight EXIF orientation accessor. //! -//! Parses the EXIF Orientation tag (0x0112 / TIFF tag 274) from TIFF-structured -//! EXIF data. Handles both raw TIFF bytes and JPEG APP1 style (`Exif\0\0` prefix). -//! -//! Spec references: -//! - TIFF 6.0 specification (Adobe, 1992): IFD structure, byte order, tag 274 -//! - EXIF 2.32 (CIPA DC-008-Translation-2019): Orientation tag semantics -//! - TIFF/EP (ISO 12234-2): Same orientation tag definition -//! -//! # Design -//! -//! This parser is intentionally minimal — it extracts only the orientation tag. -//! For full EXIF parsing (make, model, GPS, dates), use `zencodecs::exif::parse_exif`. -//! -//! Safety properties: -//! - Every byte read is bounds-checked (returns `None` on truncation) -//! - IFD entry count capped at 1000 to prevent DoS from malformed data -//! - No recursion, no heap allocation, `no_std` compatible -//! - Handles both big-endian (Motorola/MM) and little-endian (Intel/II) -//! - Accepts TIFF SHORT (type 3) and LONG (type 4) for the orientation value -//! - Validates orientation value is in 1..=8 -//! - Does NOT follow EXIF sub-IFD pointers (orientation is always in IFD0) +//! A thin convenience over the structured [`crate::exif`] parser: extracts just +//! the Orientation tag (0x0112) from a TIFF/EXIF blob. For anything richer +//! 
(copyright, GPS, thumbnail, pruning, re-serialization) use [`crate::exif::Exif`]. use zenpixels::Orientation; -/// EXIF Orientation tag (TIFF tag 274 / 0x0112). -const TAG_ORIENTATION: u16 = 0x0112; -/// TIFF type SHORT (unsigned 16-bit integer). -const TIFF_SHORT: u16 = 3; -/// TIFF type LONG (unsigned 32-bit integer). -const TIFF_LONG: u16 = 4; -/// Maximum IFD entries to scan before giving up (DoS protection). -const MAX_IFD_ENTRIES: u16 = 1000; -/// Minimum TIFF header size: byte order (2) + magic (2) + IFD0 offset (4). -const TIFF_HEADER_SIZE: usize = 8; - -/// Parse the EXIF orientation from TIFF-structured EXIF data. -/// -/// Accepts either: -/// - **Raw TIFF bytes** starting with byte order mark (`II` or `MM`) -/// - **JPEG APP1 style** with `Exif\0\0` prefix followed by TIFF data -/// - **HEIF EXIF item** with 4-byte offset header — strip this before calling -/// -/// Returns the [`Orientation`] if the tag is found and valid (1-8), -/// or `None` for missing/invalid/truncated data. -/// -/// # Spec compliance +/// Parse the EXIF Orientation tag (0x0112) from a TIFF/EXIF blob. /// -/// - Validates TIFF byte order mark and magic number (42) -/// - Walks IFD0 entries up to a fixed cap (1000 entries) -/// - Accepts both SHORT (2-byte) and LONG (4-byte) orientation values, -/// per TIFF 6.0 which recommends SHORT but doesn't forbid LONG -/// - Exploits IFD tag sort order for early exit (tags are sorted ascending) -/// - Correctly handles the IFD value/offset field: values ≤4 bytes are -/// stored inline at the entry's value field, not at an external offset +/// Accepts raw TIFF bytes or a JPEG APP1 `Exif\0\0`-prefixed blob, both byte +/// orders, and SHORT or LONG values. Delegates to [`crate::exif::Exif`], so it +/// is fully bounds-checked and never panics on malformed input. Returns `None` +/// if the blob is malformed or carries no valid Orientation tag. 
/// /// # Examples /// @@ -61,130 +19,39 @@ const TIFF_HEADER_SIZE: usize = 8; /// use zencodec::helpers::parse_exif_orientation; /// use zenpixels::Orientation; /// -/// // Minimal valid TIFF with orientation tag (little-endian) -/// let mut tiff = vec![ -/// b'I', b'I', // byte order: little-endian -/// 42, 0, // TIFF magic -/// 8, 0, 0, 0, // IFD0 offset = 8 -/// 1, 0, // 1 IFD entry -/// 0x12, 0x01, // tag = 0x0112 (Orientation) -/// 3, 0, // type = SHORT -/// 1, 0, 0, 0, // count = 1 -/// 6, 0, 0, 0, // value = 6 (Rotate90) +/// let tiff = vec![ +/// b'I', b'I', 42, 0, 8, 0, 0, 0, // header: LE, magic 42, IFD0 @ 8 +/// 1, 0, // 1 entry +/// 0x12, 0x01, 3, 0, 1, 0, 0, 0, // tag 0x0112, SHORT, count 1 +/// 6, 0, 0, 0, // value 6 (Rotate90) +/// 0, 0, 0, 0, // next IFD = 0 /// ]; -/// assert_eq!( -/// parse_exif_orientation(&tiff), -/// Some(Orientation::Rotate90), -/// ); -/// -/// // Also works with Exif\0\0 prefix (JPEG APP1 style) -/// let mut app1 = b"Exif\0\0".to_vec(); -/// app1.extend_from_slice(&tiff); -/// assert_eq!( -/// parse_exif_orientation(&app1), -/// Some(Orientation::Rotate90), -/// ); +/// assert_eq!(parse_exif_orientation(&tiff), Some(Orientation::Rotate90)); /// ``` pub fn parse_exif_orientation(data: &[u8]) -> Option { - // Strip optional Exif\0\0 prefix (JPEG APP1 style). - let tiff = if data.len() >= 6 && data[..6] == *b"Exif\0\0" { - &data[6..] - } else { - data - }; - - if tiff.len() < TIFF_HEADER_SIZE { - return None; - } - - // Determine byte order from TIFF header. - let be = match [tiff[0], tiff[1]] { - [b'M', b'M'] => true, // Motorola byte order (big-endian) - [b'I', b'I'] => false, // Intel byte order (little-endian) - _ => return None, - }; - - // Verify TIFF magic number (42). - if rd16(tiff, 2, be)? != 42 { - return None; - } - - // Read IFD0 offset and validate. - let ifd0 = rd32(tiff, 4, be)? as usize; - let entry_count = rd16(tiff, ifd0, be)?; - - // Cap entry count to prevent DoS from malformed data. 
- if entry_count > MAX_IFD_ENTRIES { - return None; - } - - let entries_start = ifd0.checked_add(2)?; - - // Walk IFD0 entries looking for orientation tag. - for i in 0..entry_count as usize { - let off = entries_start.checked_add(i.checked_mul(12)?)?; - - // Each IFD entry is 12 bytes: tag(2) + type(2) + count(4) + value(4) - if off.checked_add(12)? > tiff.len() { - break; - } - - let tag = rd16(tiff, off, be)?; - - // IFD entries are sorted by tag number (TIFF 6.0 §2). - // If we've passed 0x0112, it's not here. - if tag > TAG_ORIENTATION { - break; - } - if tag != TAG_ORIENTATION { - continue; - } - - let type_id = rd16(tiff, off + 2, be)?; - let count = rd32(tiff, off + 4, be)?; - - // Orientation must be a single value. - if count < 1 { - return None; - } - - // Read the value. Per TIFF 6.0 §2: if the value fits in 4 bytes, - // it's stored inline at offset+8. Orientation is SHORT (2 bytes) - // or occasionally LONG (4 bytes) — both fit inline. - let raw = match type_id { - TIFF_SHORT => rd16(tiff, off + 8, be)? as u32, - TIFF_LONG => rd32(tiff, off + 8, be)?, - _ => return None, - }; - - // Orientation values are 1-8. - if raw > 8 { - return None; - } - return Orientation::from_exif(raw as u8); - } - - None + crate::exif::Exif::parse(data)?.orientation() } -/// Read a u16 from `data` at `offset` with bounds checking. -fn rd16(data: &[u8], offset: usize, big_endian: bool) -> Option { - let b = data.get(offset..offset + 2)?; - Some(if big_endian { - u16::from_be_bytes([b[0], b[1]]) - } else { - u16::from_le_bytes([b[0], b[1]]) - }) -} - -/// Read a u32 from `data` at `offset` with bounds checking. -fn rd32(data: &[u8], offset: usize, big_endian: bool) -> Option { - let b = data.get(offset..offset + 4)?; - Some(if big_endian { - u32::from_be_bytes([b[0], b[1], b[2], b[3]]) - } else { - u32::from_le_bytes([b[0], b[1], b[2], b[3]]) - }) +/// Rewrite the EXIF Orientation tag (0x0112) in a TIFF/EXIF blob to `value`, +/// returning a new blob. 
+/// +/// The orientation value is stored inline (SHORT or LONG), so the canonical +/// [`crate::exif::Exif`] parser locates it and the byte is overwritten in place +/// — no TIFF offsets are recomputed, so the rest of the blob is byte-identical. +/// Accepts raw TIFF bytes or a JPEG APP1 `Exif\0\0`-prefixed blob, both byte +/// orders. +/// +/// Returns `None` if the blob is malformed or carries no Orientation tag — the +/// caller should then leave the blob unchanged. This is the byte-level half of +/// closing the double-rotation hazard: when a decoder bakes orientation upright, +/// the structured field says `Identity` but the embedded blob still says e.g. +/// `Rotate90`; rewriting the tag to `1` keeps them in agreement. +/// +/// Reuses the same IFD walker as [`parse_exif_orientation`] rather than a second +/// hand-rolled scanner, so the two can't diverge on prefix/byte-order/type/bounds +/// handling. +pub fn set_exif_orientation(data: &[u8], value: Orientation) -> Option> { + crate::exif::set_orientation(data, value) } #[cfg(test)] @@ -192,269 +59,129 @@ mod tests { use super::*; use alloc::vec::Vec; - /// Build a minimal TIFF with one IFD entry for orientation. - fn make_tiff(big_endian: bool, orientation: u16, type_id: u16) -> Vec { - let mut buf = Vec::new(); - let w16 = |buf: &mut Vec, v: u16| { - if big_endian { - buf.extend_from_slice(&v.to_be_bytes()); + /// Minimal TIFF with one orientation entry (SHORT or LONG), either order. 
+ fn tiff(order_be: bool, value: u32, type_id: u16) -> Vec { + let mut v = Vec::new(); + let w16 = |v: &mut Vec, x: u16| { + v.extend_from_slice(&if order_be { + x.to_be_bytes() } else { - buf.extend_from_slice(&v.to_le_bytes()); - } + x.to_le_bytes() + }) }; - let w32 = |buf: &mut Vec, v: u32| { - if big_endian { - buf.extend_from_slice(&v.to_be_bytes()); + let w32 = |v: &mut Vec, x: u32| { + v.extend_from_slice(&if order_be { + x.to_be_bytes() } else { - buf.extend_from_slice(&v.to_le_bytes()); - } + x.to_le_bytes() + }) }; - - // Header - if big_endian { - buf.extend_from_slice(b"MM"); - } else { - buf.extend_from_slice(b"II"); - } - w16(&mut buf, 42); // magic - w32(&mut buf, 8); // IFD0 offset - - // IFD0 - w16(&mut buf, 1); // 1 entry - w16(&mut buf, TAG_ORIENTATION); // tag - w16(&mut buf, type_id); // type - w32(&mut buf, 1); // count - if type_id == TIFF_LONG { - w32(&mut buf, orientation as u32); // value (LONG) + v.extend_from_slice(if order_be { b"MM" } else { b"II" }); + w16(&mut v, 42); + w32(&mut v, 8); + w16(&mut v, 1); // 1 entry + w16(&mut v, 0x0112); + w16(&mut v, type_id); + w32(&mut v, 1); + // Inline value: SHORT occupies the leading bytes of the 4-byte field. 
+ if type_id == 3 { + w16(&mut v, value as u16); + w16(&mut v, 0); } else { - w16(&mut buf, orientation); // value (SHORT) - w16(&mut buf, 0); // padding + w32(&mut v, value); } - - buf + w32(&mut v, 0); // next IFD + v } - // ── Basic parsing ────────────────────────────────────────────────── - #[test] - fn identity_little_endian() { - let tiff = make_tiff(false, 1, TIFF_SHORT); - assert_eq!(parse_exif_orientation(&tiff), Some(Orientation::Identity)); + fn short_little_endian() { + assert_eq!( + parse_exif_orientation(&tiff(false, 6, 3)), + Some(Orientation::Rotate90) + ); } #[test] - fn rotate90_big_endian() { - let tiff = make_tiff(true, 6, TIFF_SHORT); - assert_eq!(parse_exif_orientation(&tiff), Some(Orientation::Rotate90)); + fn short_big_endian() { + assert_eq!( + parse_exif_orientation(&tiff(true, 6, 3)), + Some(Orientation::Rotate90) + ); } #[test] - fn all_orientations() { - let expected = [ - (1, Orientation::Identity), - (2, Orientation::FlipH), - (3, Orientation::Rotate180), - (4, Orientation::FlipV), - (5, Orientation::Transpose), - (6, Orientation::Rotate90), - (7, Orientation::Transverse), - (8, Orientation::Rotate270), - ]; - for (val, orient) in expected { - let tiff = make_tiff(false, val, TIFF_SHORT); - assert_eq!(parse_exif_orientation(&tiff), Some(orient), "value={val}"); - } + fn long_type_both_orders() { + assert_eq!( + parse_exif_orientation(&tiff(false, 8, 4)), + Some(Orientation::Rotate270) + ); + assert_eq!( + parse_exif_orientation(&tiff(true, 8, 4)), + Some(Orientation::Rotate270) + ); } - // ── APP1 prefix handling ─────────────────────────────────────────── - #[test] fn with_exif_prefix() { - let tiff = make_tiff(false, 6, TIFF_SHORT); - let mut app1 = b"Exif\0\0".to_vec(); - app1.extend_from_slice(&tiff); - assert_eq!(parse_exif_orientation(&app1), Some(Orientation::Rotate90)); - } - - #[test] - fn raw_tiff_without_prefix() { - let tiff = make_tiff(true, 3, TIFF_SHORT); - assert_eq!(parse_exif_orientation(&tiff), 
Some(Orientation::Rotate180)); - } - - // ── LONG type support ────────────────────────────────────────────── - - #[test] - fn orientation_as_long() { - let tiff = make_tiff(false, 8, TIFF_LONG); - assert_eq!(parse_exif_orientation(&tiff), Some(Orientation::Rotate270)); + let mut blob = b"Exif\0\0".to_vec(); + blob.extend_from_slice(&tiff(false, 6, 3)); + assert_eq!(parse_exif_orientation(&blob), Some(Orientation::Rotate90)); } #[test] - fn orientation_as_long_big_endian() { - let tiff = make_tiff(true, 5, TIFF_LONG); - assert_eq!(parse_exif_orientation(&tiff), Some(Orientation::Transpose)); - } - - // ── Invalid/edge cases ───────────────────────────────────────────── - - #[test] - fn empty_input() { + fn invalid_inputs_return_none() { + assert_eq!(parse_exif_orientation(b"garbage"), None); assert_eq!(parse_exif_orientation(&[]), None); + assert_eq!(parse_exif_orientation(&[0u8; 7]), None); + // Orientation value out of range (9) → no valid orientation. + assert_eq!(parse_exif_orientation(&tiff(false, 9, 3)), None); + assert_eq!(parse_exif_orientation(&tiff(false, 0, 3)), None); + } + + #[test] + fn set_orientation_roundtrips_all_orders_and_types() { + for be in [false, true] { + for &type_id in &[3u16, 4] { + // Start at Rotate90 (6), rewrite to Identity (1), read back. 
+ let blob = tiff(be, 6, type_id); + assert_eq!(parse_exif_orientation(&blob), Some(Orientation::Rotate90)); + let rewritten = + set_exif_orientation(&blob, Orientation::Identity).expect("tag present"); + assert_eq!(rewritten.len(), blob.len()); // offsets unchanged + assert_eq!( + parse_exif_orientation(&rewritten), + Some(Orientation::Identity) + ); + } + } } #[test] - fn too_short() { - assert_eq!(parse_exif_orientation(&[0x49, 0x49, 42, 0]), None); - } - - #[test] - fn bad_byte_order() { - let mut tiff = make_tiff(false, 1, TIFF_SHORT); - tiff[0] = b'X'; - assert_eq!(parse_exif_orientation(&tiff), None); - } - - #[test] - fn bad_magic() { - let mut tiff = make_tiff(false, 1, TIFF_SHORT); - tiff[2] = 0; - tiff[3] = 0; // magic = 0 instead of 42 - assert_eq!(parse_exif_orientation(&tiff), None); - } - - #[test] - fn orientation_value_0_invalid() { - let tiff = make_tiff(false, 0, TIFF_SHORT); - assert_eq!(parse_exif_orientation(&tiff), None); - } - - #[test] - fn orientation_value_9_invalid() { - let tiff = make_tiff(false, 9, TIFF_SHORT); - assert_eq!(parse_exif_orientation(&tiff), None); - } - - #[test] - fn orientation_value_255_invalid() { - let tiff = make_tiff(false, 255, TIFF_SHORT); - assert_eq!(parse_exif_orientation(&tiff), None); - } - - #[test] - fn wrong_type_rejected() { - // TIFF type 2 (ASCII) is not valid for orientation - let tiff = make_tiff(false, 6, 2); - assert_eq!(parse_exif_orientation(&tiff), None); - } - - #[test] - fn ifd_offset_beyond_data() { - let mut tiff = make_tiff(false, 1, TIFF_SHORT); - // Set IFD0 offset to beyond data length - tiff[4] = 0xFF; - tiff[5] = 0xFF; - tiff[6] = 0; - tiff[7] = 0; - assert_eq!(parse_exif_orientation(&tiff), None); - } - - #[test] - fn truncated_ifd_entry() { - let tiff = make_tiff(false, 6, TIFF_SHORT); - // Truncate to just after IFD entry count, before the entry data - assert_eq!(parse_exif_orientation(&tiff[..12]), None); - } - - // ── Tag sorting / early exit 
─────────────────────────────────────── - - #[test] - fn orientation_after_other_tags() { - // Build TIFF with ImageWidth (0x0100) before Orientation (0x0112) - let mut buf = Vec::new(); - buf.extend_from_slice(b"II"); // little-endian - buf.extend_from_slice(&42u16.to_le_bytes()); - buf.extend_from_slice(&8u32.to_le_bytes()); // IFD0 offset - - buf.extend_from_slice(&2u16.to_le_bytes()); // 2 entries - - // Entry 1: ImageWidth (0x0100) = 640 - buf.extend_from_slice(&0x0100u16.to_le_bytes()); - buf.extend_from_slice(&TIFF_SHORT.to_le_bytes()); - buf.extend_from_slice(&1u32.to_le_bytes()); - buf.extend_from_slice(&640u16.to_le_bytes()); - buf.extend_from_slice(&0u16.to_le_bytes()); - - // Entry 2: Orientation (0x0112) = 6 - buf.extend_from_slice(&TAG_ORIENTATION.to_le_bytes()); - buf.extend_from_slice(&TIFF_SHORT.to_le_bytes()); - buf.extend_from_slice(&1u32.to_le_bytes()); - buf.extend_from_slice(&6u16.to_le_bytes()); - buf.extend_from_slice(&0u16.to_le_bytes()); - - assert_eq!(parse_exif_orientation(&buf), Some(Orientation::Rotate90)); - } - - #[test] - fn early_exit_on_higher_tag() { - // Build TIFF with only ImageDescription (0x010E) — past 0x0112 in sort order? No, 0x010E < 0x0112. - // Use XResolution (0x011A) which is > 0x0112 to test early exit. 
- let mut buf = Vec::new(); - buf.extend_from_slice(b"II"); - buf.extend_from_slice(&42u16.to_le_bytes()); - buf.extend_from_slice(&8u32.to_le_bytes()); - - buf.extend_from_slice(&1u16.to_le_bytes()); // 1 entry - - // Entry: XResolution (0x011A) — tag > TAG_ORIENTATION - buf.extend_from_slice(&0x011Au16.to_le_bytes()); - buf.extend_from_slice(&TIFF_SHORT.to_le_bytes()); - buf.extend_from_slice(&1u32.to_le_bytes()); - buf.extend_from_slice(&72u16.to_le_bytes()); - buf.extend_from_slice(&0u16.to_le_bytes()); - - // Should return None (orientation not present, early exit) - assert_eq!(parse_exif_orientation(&buf), None); - } - - // ── DoS protection ───────────────────────────────────────────────── - - #[test] - fn excessive_entry_count_rejected() { - let mut tiff = make_tiff(false, 6, TIFF_SHORT); - // Set entry count to 1001 (> MAX_IFD_ENTRIES) - tiff[8] = 0xE9; - tiff[9] = 0x03; // 1001 in LE - assert_eq!(parse_exif_orientation(&tiff), None); - } - - #[test] - fn max_entry_count_accepted() { - let mut tiff = make_tiff(false, 6, TIFF_SHORT); - // Set entry count to 1000 (= MAX_IFD_ENTRIES) — accepted but - // will break on bounds check since we don't have 1000 entries - tiff[8] = 0xE8; - tiff[9] = 0x03; // 1000 in LE - // Won't find orientation (entry is at index 0 but tag bytes are - // now part of the "count" field area) — just shouldn't panic - let _ = parse_exif_orientation(&tiff); - } - - // ── Exif\0\0 prefix edge cases ──────────────────────────────────── - - #[test] - fn exif_prefix_only_no_tiff() { - assert_eq!(parse_exif_orientation(b"Exif\0\0"), None); - } - - #[test] - fn exif_prefix_truncated() { - assert_eq!(parse_exif_orientation(b"Exif\0"), None); - } - - #[test] - fn exif_prefix_with_garbage() { - let data = b"Exif\0\0GARBAGE".to_vec(); - assert_eq!(parse_exif_orientation(&data), None); + fn set_orientation_with_exif_prefix() { + let mut blob = b"Exif\0\0".to_vec(); + blob.extend_from_slice(&tiff(false, 6, 3)); + let out = 
set_exif_orientation(&blob, Orientation::Rotate180).expect("tag present"); + assert_eq!(parse_exif_orientation(&out), Some(Orientation::Rotate180)); + } + + #[test] + fn set_orientation_absent_tag_or_garbage_is_none() { + // No 0x0112 entry: a minimal IFD with a different tag. + let mut v = b"II".to_vec(); + v.extend_from_slice(&42u16.to_le_bytes()); + v.extend_from_slice(&8u32.to_le_bytes()); + v.extend_from_slice(&1u16.to_le_bytes()); // 1 entry + v.extend_from_slice(&0x010Fu16.to_le_bytes()); // Make tag, not orientation + v.extend_from_slice(&3u16.to_le_bytes()); + v.extend_from_slice(&1u32.to_le_bytes()); + v.extend_from_slice(&[0, 0, 0, 0]); + v.extend_from_slice(&0u32.to_le_bytes()); + assert_eq!(set_exif_orientation(&v, Orientation::Identity), None); + assert_eq!( + set_exif_orientation(b"garbage", Orientation::Identity), + None + ); + assert_eq!(set_exif_orientation(&[], Orientation::Identity), None); } } diff --git a/src/helpers/mod.rs b/src/helpers/mod.rs index ca7605b..bc77490 100644 --- a/src/helpers/mod.rs +++ b/src/helpers/mod.rs @@ -6,7 +6,7 @@ //! //! # Submodules //! -//! - [`icc`]: ICC profile identification and pixel descriptor derivation. +//! - `icc`: ICC profile identification and pixel descriptor derivation. use alloc::borrow::Cow; @@ -19,7 +19,7 @@ use crate::traits::{AnimationFrameDecoder, Decode, DecodeJob}; mod exif; mod icc; -pub use exif::parse_exif_orientation; +pub use exif::{parse_exif_orientation, set_exif_orientation}; pub use icc::descriptor_for_decoded_pixels_v2; #[allow(deprecated)] pub use icc::{ diff --git a/src/lib.rs b/src/lib.rs index d3c112d..d6c33db 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -38,9 +38,14 @@ extern crate alloc; whereat::define_at_crate_info!(); mod capabilities; +/// Cross-codec color-signaling emission policy (ICC vs CICP). See +/// `docs/color-emit-model.md`. +mod color; mod cost; mod detect; mod error; +/// Structured EXIF/TIFF parsing, pruning, and serialization. 
+pub mod exif; mod extensions; mod format; /// Cross-codec gain map types (ISO 21496-1). @@ -48,7 +53,7 @@ pub mod gainmap; /// Codec implementation helpers (not consumer API). pub mod helpers; /// Lightweight ICC profile inspection (tag extraction, no full parse). -pub mod icc; +mod icc; mod info; mod limits; mod metadata; @@ -63,6 +68,14 @@ mod traits; // Public root: shared types used by both encode and decode // ========================================================================= +pub use color::{ + CicpEmission, ColorEmitFields, ColorEmitPlan, ColorEmitPolicy, IccDisposition, + resolve_color_emit, +}; +// `ByteOrder` is intentionally NOT re-exported at the root: it is a TIFF/EXIF +// header detail used only within the `exif` module, and the bare name is too +// generic for the crate root. Reach it as `exif::ByteOrder`. +pub use exif::{Exif, ExifPolicy, Retention, TextEncoding}; pub use extensions::Extensions; pub use format::{ImageFormat, ImageFormatDefinition, ImageFormatRegistry}; pub use gainmap::{ @@ -76,7 +89,7 @@ pub use info::{ ResolutionUnit, SourceColor, Supplements, }; pub use limits::{LimitExceeded, ResourceLimits, ThreadingPolicy}; -pub use metadata::Metadata; +pub use metadata::{IccRetention, Metadata, MetadataFields, MetadataPolicy}; pub use orientation::{Orientation, OrientationHint}; pub use output::{AnimationFrame, OwnedAnimationFrame}; pub use zenpixels::ColorAuthority; diff --git a/src/limits.rs b/src/limits.rs index 754a88a..a10e777 100644 --- a/src/limits.rs +++ b/src/limits.rs @@ -25,7 +25,7 @@ /// /// # Controlling thread count /// -/// Thread count is controlled externally via [`rayon::ThreadPool::install()`]: +/// Thread count is controlled externally via `rayon::ThreadPool::install()`: /// /// ```ignore /// use rayon::ThreadPoolBuilder; diff --git a/src/metadata.rs b/src/metadata.rs index 00d9d57..03c6fcb 100644 --- a/src/metadata.rs +++ b/src/metadata.rs @@ -2,10 +2,39 @@ //! //! 
[`Metadata`] carries ICC, EXIF, XMP, CICP, HDR, and orientation data //! using `Arc<[u8]>` for byte buffers (cheap cloning via ref-count bump). +//! +//! # Forward compatibility +//! +//! This surface is shaped so it never needs a semver-major break. Every +//! growable record ([`Metadata`], [`MetadataFields`], [`ExifPolicy`]) is +//! `#[non_exhaustive]` and built from a constructor plus `with_*` setters, and +//! every disposition enum ([`MetadataPolicy`], [`IccRetention`], +//! [`Retention`](crate::exif::Retention)) is `#[non_exhaustive]` — so new +//! record fields and new enum variants both land additively, and downstream +//! cannot struct-literal or exhaustively match these types. Query +//! [`Retention`](crate::exif::Retention) via +//! [`keeps`](crate::exif::Retention::keeps) / `discards` rather than matching, +//! so callers stay correct as variants are added. +//! +//! Anticipated additive growth (each a new field or variant, never a break): +//! partial-XMP retention beside the whole-segment `xmp` switch, gain-map and +//! depth-map retention, new [`ExifPolicy`] categories, new [`IccRetention`] +//! modes, and new color-signaling fields on [`Metadata`] / +//! [`SourceColor`](crate::SourceColor). +//! +//! The known cross-codec carrier gaps (imazen/zenpipe#36 — +//! `Metadata::orientation` emission, decode-side EXIF-orientation +//! normalization, CICP wiring for native-carrier formats, and AVIF EXIF-blob +//! preservation) are fixable as behavioral changes in the codec adapters: +//! [`Metadata`] already models every value those fixes produce +//! ([`orientation`](Metadata::orientation), [`cicp`](Metadata::cicp), +//! [`exif`](Metadata::exif)), so none require a type, field, or signature +//! change here. 
use alloc::sync::Arc; use crate::Orientation; +use crate::exif::{Exif, ExifPolicy, Retention}; use crate::info::{Cicp, ContentLightLevel, MasteringDisplay}; use zenpixels::{ColorPrimaries, TransferFunction}; @@ -31,12 +60,25 @@ pub struct Metadata { pub mastering_display: Option<MasteringDisplay>, /// EXIF orientation. pub orientation: Orientation, + /// Embed-time retention policy — the **explicit** privacy decision for how + /// [`for_embedding`](Self::for_embedding) prunes this metadata before a codec + /// writes it. `None` (the constructed default) means *no policy chosen*: the + /// caller must set one via [`with_policy`](Self::with_policy), and until they + /// do, [`for_embedding`](Self::for_embedding) returns `None` so a codec + /// embeds nothing (fail-safe — a forgotten policy strips, never leaks). + /// [`MetadataPolicy::Web`] is the recommended privacy-safe choice; + /// [`PreserveExact`](MetadataPolicy::PreserveExact) embeds verbatim. + /// + /// Intent only — the carried `exif`/`xmp`/`icc_profile` bytes are untouched + /// until [`for_embedding`](Self::for_embedding) applies the policy, so + /// inspection / external-EXIF-library round-trips still see the originals. + pub policy: Option<MetadataPolicy>, } // Metadata contains 3× Option<Arc<[u8]>> (fat pointers), so size varies by // pointer width. Catch unexpected growth from new fields or alignment changes. #[cfg(target_pointer_width = "64")] -const _: () = assert!(core::mem::size_of::<Metadata>() == 104); +const _: () = assert!(core::mem::size_of::<Metadata>() == 120); impl Metadata { /// Create empty metadata. @@ -73,6 +115,43 @@ impl Metadata { self } + /// Set the EXIF Copyright tag, creating an EXIF blob if there is none and + /// merging into the existing one otherwise. + /// + /// Written as ASCII (Exif 2.x, the most widely-read form). For UTF-8 (Exif + /// 3.0) or other tags, build the blob via [`exif::Exif`](crate::exif::Exif) + /// and pass it to [`with_exif`](Self::with_exif). 
Unparseable existing EXIF + /// is replaced with a fresh blob carrying just this field. + #[must_use] + pub fn with_copyright(self, copyright: &str) -> Self { + self.set_exif_string(copyright, |e, t| e.set_copyright(t)) + } + + /// Set the EXIF Artist tag. See [`with_copyright`](Self::with_copyright) for + /// encoding and merge semantics. + #[must_use] + pub fn with_artist(self, artist: &str) -> Self { + self.set_exif_string(artist, |e, t| e.set_artist(t)) + } + + /// Shared helper for [`with_copyright`](Self::with_copyright)/[`with_artist`](Self::with_artist): parse the existing + /// EXIF (or start fresh via [`Exif::new`]), apply `set`, and re-serialize. + fn set_exif_string(mut self, text: &str, set: impl FnOnce(&mut Exif<'_>, &str)) -> Self { + let bytes = { + // `exif` borrows `self.exif` here; `to_bytes` copies values out, so + // `bytes` is owned and the borrow ends before we reassign below. + let mut exif = self + .exif + .as_deref() + .and_then(Exif::parse) + .unwrap_or_default(); + set(&mut exif, text); + exif.to_bytes() + }; + self.exif = Some(Arc::from(bytes)); + self + } + /// Set the XMP metadata. /// /// Accepts `Vec<u8>`, `&[u8]`, or `Arc<[u8]>`. @@ -105,6 +184,39 @@ impl Metadata { self } + /// Set the embed-time retention [`policy`](Self::policy) — the explicit + /// privacy choice. [`MetadataPolicy::Web`] is the recommended privacy-safe + /// option; [`PreserveExact`](MetadataPolicy::PreserveExact) embeds verbatim. + /// There is no implicit default: without this call, + /// [`for_embedding`](Self::for_embedding) yields `None`. + #[must_use] + pub fn with_policy(mut self, policy: MetadataPolicy) -> Self { + self.policy = Some(policy); + self + } + + /// The metadata a codec should actually embed: `self` pruned by its + /// explicitly-set [`policy`](Self::policy), or `None` if no policy was chosen. + /// + /// `None` is fail-safe: a codec treats it as "embed nothing," so a forgotten + /// [`with_policy`](Self::with_policy) strips all metadata rather than leaking + /// it. 
This is the zencodec filtering hook a codec calls inside its + /// `EncodeJob::with_metadata`, with no EXIF logic of its own: + /// + /// ```ignore + /// fn with_metadata(mut self, meta: Metadata) -> Self { + /// self.metadata = meta.for_embedding(); // Option<Metadata>: None ⇒ embed nothing + /// self + /// } + /// ``` + /// + /// A returned `Some` carries [`MetadataPolicy::PreserveExact`], so it is + /// already final — re-embedding it never strips twice. + #[must_use] + pub fn for_embedding(&self) -> Option<Metadata> { + self.policy.map(|p| self.filtered(&p)) + } + /// Whether any metadata is present. pub fn is_empty(&self) -> bool { self.icc_profile.is_none() @@ -135,6 +247,317 @@ impl Metadata { .map(|c| c.color_primaries_enum()) .unwrap_or(ColorPrimaries::Bt709) } + + /// Apply a retention [`MetadataPolicy`], returning a filtered copy. + /// + /// The shared field-level metadata filter for re-encode / recompress + /// pipelines: keep what a downstream image needs, strip the rest, without + /// callers hand-parsing EXIF. + /// + /// - **ICC** is three-way ([`IccRetention`]): keep as-is, keep only when + /// it isn't a redundant sRGB ([`zenpixels::icc::is_common_srgb`]), or drop. + /// - **EXIF** is pruned by category via [`ExifPolicy`]. The source blob + /// passes through unchanged (zero-copy `Arc` clone) when no category is + /// dropped and the embedded orientation already matches the field, and is + /// rewritten — offsets recomputed — only when pruning. + /// - **Orientation** is reconciled: the embedded EXIF orientation tag is + /// rewritten to match the authoritative [`orientation`](Metadata::orientation) + /// field, so a baked-upright image (field `Identity`, blob still rotated) + /// cannot be double-rotated by a consumer that re-applies the tag. + /// - **CICP** and **HDR** light-level/mastering are color *signaling* (they + /// change how pixels display); the presets keep them, a + /// [`Custom`](MetadataPolicy::Custom) policy can drop them. 
+ /// + /// # HDR signaling and gain maps — keep these consistent with the pixels + /// + /// CICP (`transfer_characteristics`, `color_primaries`, `matrix_coefficients`) + /// and the HDR `ContentLightLevel` / `MasteringDisplay` describe **how the + /// stored pixels are to be interpreted**. They are not free-floating notes: + /// a decoder uses CICP transfer (e.g. PQ or HLG) to linearize, and uses + /// CLLI/MDCV to tone-map for the target display. If they disagree with the + /// actual pixels, the image renders **wrong** (clipped highlights, wrong + /// gamut, double tone-mapping). + /// + /// A **gain map** is a *separate plane* (not a field of [`Metadata`] — it + /// lives at the encode-request / codec-output layer with its + /// [`GainMapInfo`](crate::GainMapInfo)). The base image, its HDR signaling, + /// and the gain map together reconstruct the HDR rendition. That coupling + /// is the hazard: + /// + /// - **Dropping or flattening the gain map (HDR → SDR) without also fixing + /// the signaling leaves invalid metadata.** If you tone-map to an SDR + /// base and discard the gain map, but leave `transfer_characteristics = + /// PQ/HLG` and an MDCV describing a 1000-nit mastering display, a + /// conformant decoder will treat your SDR pixels as HDR and tone-map them + /// a second time — visibly wrong. When the gain map goes, the HDR + /// signaling that described the HDR rendition must go (or be rewritten to + /// match the SDR base: `transfer` → sRGB, drop CLLI/MDCV). + /// - Conversely, **stripping CICP/HDR while keeping a gain map** orphans the + /// gain map (the decoder no longer knows the base is HDR-relative), so the + /// HDR rendition is lost or misrendered. + /// + /// `filtered` **cannot see the gain map** (it isn't in `Metadata`), so it + /// cannot enforce this — the consistency is the **caller's responsibility** + /// at the layer that owns the gain map. 
Practical guidance: + /// + /// - Keeping the gain map untouched → keep CICP/HDR (`Web` / `Preserve`). + /// - Flattening to SDR and dropping the gain map → drop HDR here (a + /// [`Custom`](MetadataPolicy::Custom) policy with `hdr: Discard`, and set + /// the encoder's CICP to the SDR transfer) so the signaling matches the + /// pixels you actually wrote. + /// + /// `cicp` and `hdr` are deliberately *separate* retention flags so this + /// SDR-flatten case is expressible (drop HDR light-level/mastering while + /// keeping CICP primaries). + #[must_use] + pub fn filtered(&self, policy: &MetadataPolicy) -> Metadata { + let f = policy.fields(); + let mut out = Metadata::none(); + + // ICC — three-way; only KeepNonSrgb drops a redundant sRGB profile. + out.icc_profile = match f.icc { + IccRetention::Drop => None, + // Target-blind retention keeps the profile; the CICP-conditional + // drop is resolved against a concrete target in + // `color::resolve_color_emit`, which `filtered` does not see. + IccRetention::Keep + | IccRetention::DropIfCicpRepresentable + | IccRetention::DropIfCicpSafeSoleCarrier => self.icc_profile.clone(), + IccRetention::KeepNonSrgb => self + .icc_profile + .as_ref() + .filter(|icc| !zenpixels::icc::is_common_srgb(icc)) + .cloned(), + }; + + // Orientation field (codecs may apply it without re-reading the EXIF). + out.orientation = if f.exif.orientation.keeps() { + self.orientation + } else { + Orientation::Identity + }; + + // Color signaling. + if f.cicp.keeps() { + out.cicp = self.cicp; + } + if f.hdr.keeps() { + out.content_light_level = self.content_light_level; + out.mastering_display = self.mastering_display; + } + + // XMP (whole-segment). + if f.xmp.keeps() { + out.xmp = self.xmp.clone(); + } + + // EXIF — pruned by category; `Arc` clone when nothing is dropped. + out.exif = self + .exif + .as_ref() + .and_then(|src| match crate::exif::retain(src, &f.exif)? 
{ + alloc::borrow::Cow::Borrowed(_) => Some(src.clone()), + alloc::borrow::Cow::Owned(v) => Some(Arc::from(v)), + }); + + // Reconcile the embedded EXIF orientation tag with the authoritative + // `out.orientation` field. A decoder that bakes orientation upright sets + // the field to Identity while the source blob still carries the original + // tag (e.g. Rotate90); left alone, a consumer that re-applies the EXIF tag + // would rotate twice. Rewriting the tag to match closes that. Only fires + // on a mismatch, so the matched/common case keeps the zero-copy `Arc` + // clone above; absent or tag-less blobs are left untouched. + let want = out.orientation; + let reconciled = out + .exif + .as_deref() + .filter(|e| parse_exif_orientation(e) != Some(want)) + .and_then(|e| crate::helpers::set_exif_orientation(e, want)); + if let Some(v) = reconciled { + out.exif = Some(Arc::from(v)); + } + // The result is already pruned to `policy`; mark it final so a later + // `for_embedding` / re-filter is a no-op rather than stripping again. + out.policy = Some(MetadataPolicy::PreserveExact); + out + } +} + +/// How to treat the ICC profile when filtering [`Metadata`]. +/// +/// `#[non_exhaustive]`: ICC handling can gain dispositions (e.g. a future +/// convert-to-sRGB or keep-if-display-referred mode) without a breaking +/// change. Match with a `_` arm. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +#[non_exhaustive] +pub enum IccRetention { + /// Always drop the profile. + Drop, + /// Keep the profile unless it is a redundant sRGB profile — the common + /// choice (sRGB is the assumed default, so embedding it is pure weight). + KeepNonSrgb, + /// Keep the profile as-is, even a redundant sRGB one (byte-faithful). + Keep, + /// Drop the profile when it maps to a CICP expressible as code points + /// (sRGB / Display-P3 / BT.2020 / BT.2100…) — i.e. CICP fully describes the + /// color. 
**Target-aware**: only takes effect in + /// [`color::resolve_color_emit`](crate::color::resolve_color_emit), where the + /// target's CICP carrier is known. In the target-blind [`Metadata::filtered`] + /// path it conservatively keeps the profile. + DropIfCicpRepresentable, + /// Drop the profile only when the target format's CICP is safe as the sole + /// color carrier ([`EncodeCapabilities::cicp_safe_sole_carrier`](crate::encode::EncodeCapabilities::cicp_safe_sole_carrier) + /// — JXL today) and CICP represents the color. Like + /// [`DropIfCicpRepresentable`](Self::DropIfCicpRepresentable), this is + /// target-aware and keeps the profile in [`Metadata::filtered`]. + DropIfCicpSafeSoleCarrier, +} + +/// Per-field metadata retention for [`MetadataPolicy::Custom`]. +/// +/// EXIF is encapsulated in [`ExifPolicy`] (pruned by category); the remaining +/// fields use [`Retention`] (explicit `Keep`/`Discard`). This type is +/// `#[non_exhaustive]` (new fields can be added without a breaking change), so +/// downstream crates build from [`KEEP_ALL`](Self::KEEP_ALL) / +/// [`DISCARD_ALL`](Self::DISCARD_ALL) via the `with_*` builders rather than +/// struct-update syntax. Drop only GPS, keep all else: +/// +/// ``` +/// use zencodec::{MetadataFields, exif::{ExifPolicy, Retention}}; +/// let fields = MetadataFields::KEEP_ALL +/// .with_exif(ExifPolicy::KEEP_ALL.with_gps(Retention::Discard)); +/// ``` +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +#[non_exhaustive] +pub struct MetadataFields { + /// ICC color profile. + pub icc: IccRetention, + /// EXIF, pruned by category. + pub exif: ExifPolicy, + /// XMP, whole-segment. The XMP packet (RDF/XML) can carry GPS + /// (`exif:GPS*`), edit history (`photoshop:History`, `xmpMM:History`), + /// and C2PA provenance (`xmpMM` manifests), so the presets that strip + /// privacy/bloat ([`Web`](MetadataPolicy::Web)) discard it wholesale while + /// keeping EXIF rights. + /// + /// Partial XMP (e.g. 
keep `dc:rights`/`dc:creator`, drop GPS + history + + /// C2PA) is a planned future addition — it needs an RDF/XML parser, so it + /// is deferred rather than half-done. It will arrive as a *new* + /// `MetadataFields` field (this struct is `#[non_exhaustive]`, so adding + /// one is non-breaking); `xmp` will remain the whole-segment master switch. + pub xmp: Retention, + /// CICP color signaling. + pub cicp: Retention, + /// HDR `ContentLightLevel` + `MasteringDisplay`. + pub hdr: Retention, +} + +impl MetadataFields { + /// Keep every field (ICC kept as-is, including a redundant sRGB). + pub const KEEP_ALL: Self = Self { + icc: IccRetention::Keep, + exif: ExifPolicy::KEEP_ALL, + xmp: Retention::Keep, + cicp: Retention::Keep, + hdr: Retention::Keep, + }; + /// Discard every field. + pub const DISCARD_ALL: Self = Self { + icc: IccRetention::Drop, + exif: ExifPolicy::DISCARD_ALL, + xmp: Retention::Discard, + cicp: Retention::Discard, + hdr: Retention::Discard, + }; + + /// Set ICC retention. (Builder — this type is `#[non_exhaustive]`.) + #[must_use] + pub const fn with_icc(mut self, r: IccRetention) -> Self { + self.icc = r; + self + } + /// Set the EXIF retention policy. + #[must_use] + pub const fn with_exif(mut self, p: ExifPolicy) -> Self { + self.exif = p; + self + } + /// Set XMP retention. + #[must_use] + pub const fn with_xmp(mut self, r: Retention) -> Self { + self.xmp = r; + self + } + /// Set CICP retention. + #[must_use] + pub const fn with_cicp(mut self, r: Retention) -> Self { + self.cicp = r; + self + } + /// Set HDR (light-level/mastering) retention. + #[must_use] + pub const fn with_hdr(mut self, r: Retention) -> Self { + self.hdr = r; + self + } +} + +/// Field-level metadata retention policy applied by [`Metadata::filtered`]. +/// +/// `Copy` (all variants, including `Custom(MetadataFields)`, are `Copy`) so it +/// can be bundled by value into [`EncodePolicy`](crate::encode::EncodePolicy). 
+/// +/// **No `Default`.** Metadata retention is a privacy decision, so callers must +/// name a policy explicitly — there is no implicit fallback. [`Web`](Self::Web) +/// is the recommended privacy-safe choice for publishing. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +#[non_exhaustive] +pub enum MetadataPolicy { + /// Keep everything the source carried, byte-faithfully — including a + /// redundant sRGB ICC profile. + PreserveExact, + /// Keep everything, but drop a redundant sRGB ICC profile. + Preserve, + /// The web-publish set (recommended for publishing): keep the ICC profile + /// (unless a redundant sRGB), EXIF orientation + rights (copyright/artist), + /// and CICP / HDR color signaling. Drop the rest of EXIF (GPS, timestamps, + /// camera/device identity, thumbnail) and all XMP. + Web, + /// Keep only what places pixels on screen: the ICC profile (unless a + /// redundant sRGB), CICP / HDR color signaling, and EXIF orientation. + /// Drops attribution, XMP, and all other EXIF. + ColorAndRotation, + /// Explicit per-field control via [`MetadataFields`]. + Custom(MetadataFields), +} + +impl MetadataPolicy { + /// Resolve the policy to its concrete per-field retention set. 
+ #[must_use] + pub fn fields(&self) -> MetadataFields { + match self { + Self::PreserveExact => MetadataFields::KEEP_ALL, + Self::Preserve => MetadataFields { + icc: IccRetention::KeepNonSrgb, + ..MetadataFields::KEEP_ALL + }, + Self::Web => MetadataFields { + icc: IccRetention::KeepNonSrgb, + exif: ExifPolicy::ATTRIBUTED_ORIENTATION, + xmp: Retention::Discard, + cicp: Retention::Keep, + hdr: Retention::Keep, + }, + Self::ColorAndRotation => MetadataFields { + icc: IccRetention::KeepNonSrgb, + exif: ExifPolicy::ORIENTATION_ONLY, + xmp: Retention::Discard, + cicp: Retention::Keep, + hdr: Retention::Keep, + }, + Self::Custom(f) => *f, + } + } } impl From<&crate::ImageInfo> for Metadata { @@ -147,6 +570,12 @@ impl From<&crate::ImageInfo> for Metadata { content_light_level: info.source_color.content_light_level, mastering_display: info.source_color.mastering_display, orientation: info.orientation, + // No implicit policy: the raw bytes are carried for inspection, but + // the caller must choose retention via `with_policy` before encoding. + // Until then `for_embedding` yields `None` (embed nothing — fail-safe, + // never a silent leak). `Web` is the usual privacy-safe choice; + // `PreserveExact` for a verbatim transcode. + policy: None, } } } @@ -386,4 +815,361 @@ mod tests { // Explicit FlipH must win over the EXIF blob's Rotate90. assert_eq!(meta.orientation, Orientation::FlipH); } + + // ── MetadataPolicy / filtered ────────────────────────────────────────── + + use crate::exif::Exif; + + /// LE TIFF source: Make (0x010F, camera) + Orientation (0x0112) + Copyright + /// (0x8298, out-of-line), tag-sorted. `prefix` adds `Exif\0\0` framing. 
+ fn src_exif(orientation: u16, copyright: &str, prefix: bool) -> alloc::vec::Vec<u8> { + use alloc::vec::Vec; + let mut cw = copyright.as_bytes().to_vec(); + cw.push(0); // assumes > 4 bytes with the NUL (copyright ≥ 4 chars): always written out-of-line + let n: u16 = 3; + let ifd_size = 2 + 12 * n as usize + 4; + let ext_off = 8 + ifd_size; + + let mut t = Vec::new(); + t.extend_from_slice(b"II"); + t.extend_from_slice(&42u16.to_le_bytes()); + t.extend_from_slice(&8u32.to_le_bytes()); + t.extend_from_slice(&n.to_le_bytes()); + // Make 0x010F ASCII "Cam\0" (4 bytes, inline) — camera category. + t.extend_from_slice(&0x010Fu16.to_le_bytes()); + t.extend_from_slice(&2u16.to_le_bytes()); + t.extend_from_slice(&4u32.to_le_bytes()); + t.extend_from_slice(b"Cam\0"); + // Orientation 0x0112 SHORT (inline). + t.extend_from_slice(&0x0112u16.to_le_bytes()); + t.extend_from_slice(&3u16.to_le_bytes()); + t.extend_from_slice(&1u32.to_le_bytes()); + t.extend_from_slice(&u32::from(orientation).to_le_bytes()); + // Copyright 0x8298 ASCII (out-of-line). + t.extend_from_slice(&0x8298u16.to_le_bytes()); + t.extend_from_slice(&2u16.to_le_bytes()); + t.extend_from_slice(&(cw.len() as u32).to_le_bytes()); + t.extend_from_slice(&(ext_off as u32).to_le_bytes()); + t.extend_from_slice(&0u32.to_le_bytes()); // next-IFD offset + t.extend_from_slice(&cw); + + if prefix { + let mut out = Vec::with_capacity(6 + t.len()); + out.extend_from_slice(b"Exif\0\0"); + out.extend_from_slice(&t); + out + } else { + t + } + } + + /// True if the tag's little-endian bytes appear anywhere in the blob (a 2-byte window scan, not an IFD walk). + fn has_tag(blob: &[u8], tag: u16) -> bool { + blob.windows(2).any(|w| w == tag.to_le_bytes()) + } + + #[test] + fn no_policy_means_for_embedding_none() { + // Force-explicit: no policy chosen ⇒ for_embedding yields None, so a + // codec embeds nothing (fail-safe — a forgotten policy strips, not leaks). 
+ let meta = Metadata::none().with_exif(src_exif(6, "(c) Me", false)); + assert_eq!(meta.policy, None); + assert!(meta.for_embedding().is_none()); + } + + #[test] + fn policy_fields_resolution() { + assert_eq!( + MetadataPolicy::PreserveExact.fields(), + MetadataFields::KEEP_ALL + ); + assert_eq!( + MetadataPolicy::PreserveExact.fields().icc, + IccRetention::Keep + ); + assert_eq!( + MetadataPolicy::Preserve.fields().icc, + IccRetention::KeepNonSrgb + ); + assert_eq!(MetadataPolicy::Preserve.fields().exif, ExifPolicy::KEEP_ALL); + + let web = MetadataPolicy::Web.fields(); + assert_eq!(web.icc, IccRetention::KeepNonSrgb); + assert_eq!(web.exif, ExifPolicy::ATTRIBUTED_ORIENTATION); + assert_eq!(web.xmp, Retention::Discard); + assert_eq!(web.cicp, Retention::Keep); + assert_eq!(web.hdr, Retention::Keep); + + let car = MetadataPolicy::ColorAndRotation.fields(); + assert_eq!(car.exif, ExifPolicy::ORIENTATION_ONLY); + assert_eq!(car.cicp, Retention::Keep); + + let custom = MetadataFields { + xmp: Retention::Keep, + ..MetadataFields::DISCARD_ALL + }; + assert_eq!(MetadataPolicy::Custom(custom).fields(), custom); + } + + #[test] + fn icc_three_way_retention() { + let icc = alloc::vec![0xABu8; 256]; // arbitrary → not recognized as sRGB + let meta = Metadata::none().with_icc(icc.clone()); + // KeepNonSrgb keeps a non-sRGB profile (Web/Preserve). + assert_eq!( + meta.filtered(&MetadataPolicy::Web).icc_profile.as_deref(), + Some(icc.as_slice()) + ); + // Keep keeps it too (PreserveExact). + assert_eq!( + meta.filtered(&MetadataPolicy::PreserveExact) + .icc_profile + .as_deref(), + Some(icc.as_slice()) + ); + // Drop removes it. 
+ let drop = MetadataFields { + icc: IccRetention::Drop, + ..MetadataFields::KEEP_ALL + }; + assert!( + meta.filtered(&MetadataPolicy::Custom(drop)) + .icc_profile + .is_none() + ); + } + + #[test] + fn web_keeps_orientation_rights_drops_camera_and_xmp() { + let src = src_exif(6, "(c) 2026 Lilith", false); + let meta = Metadata::none() + .with_exif(src.clone()) + .with_xmp(alloc::vec![1, 2, 3]) + .with_cicp(Cicp::SRGB) + .with_content_light_level(ContentLightLevel { + max_content_light_level: 1000, + max_frame_average_light_level: 400, + }); + assert_eq!(meta.orientation, Orientation::Rotate90); + + let out = meta.filtered(&MetadataPolicy::Web); + let e = out.exif.as_deref().expect("rewritten EXIF"); + let ex = Exif::parse(e).expect("parses"); + assert_eq!(ex.orientation(), Some(Orientation::Rotate90)); + assert_eq!(ex.copyright().unwrap(), "(c) 2026 Lilith"); + // Camera (Make 0x010F) dropped; output is smaller than the source. + assert!(!has_tag(e, 0x010F)); + assert!(e.len() < src.len()); + assert_eq!(out.orientation, Orientation::Rotate90); + assert!(out.xmp.is_none()); + assert_eq!(out.cicp, Some(Cicp::SRGB)); + assert!(out.content_light_level.is_some()); + } + + #[test] + fn preserve_exact_passes_exif_through_byte_identical() { + let src = src_exif(6, "(c) Owner", false); + let meta = Metadata::none() + .with_exif(src.clone()) + .with_xmp(alloc::vec![9, 9]) + .with_icc(alloc::vec![0xABu8; 200]); + let out = meta.filtered(&MetadataPolicy::PreserveExact); + assert_eq!(out.exif.as_deref(), Some(src.as_slice())); + assert!(has_tag(out.exif.as_deref().unwrap(), 0x010F)); // camera kept + assert_eq!(out.xmp.as_deref(), Some([9, 9].as_slice())); + assert!(out.icc_profile.is_some()); + } + + #[test] + fn color_and_rotation_keeps_orientation_drops_rights() { + let src = src_exif(8, "(c) Owner", false); + let meta = Metadata::none().with_exif(src).with_cicp(Cicp::SRGB); + let out = meta.filtered(&MetadataPolicy::ColorAndRotation); + let e = 
out.exif.as_deref().expect("EXIF"); + let ex = Exif::parse(e).expect("parses"); + assert_eq!(ex.orientation(), Some(Orientation::Rotate270)); + assert!(ex.copyright().is_none()); // rights dropped + assert!(!has_tag(e, 0x010F)); // camera dropped + assert_eq!(out.cicp, Some(Cicp::SRGB)); + } + + // ── Embed-time policy carried on Metadata (for_embedding) ──────────────── + + #[test] + fn default_policy_is_none() { + // No implicit policy — the caller must choose one explicitly. + assert_eq!(Metadata::none().policy, None); + assert_eq!(Metadata::default().policy, None); + } + + #[test] + fn with_policy_sets_policy() { + let m = Metadata::none().with_policy(MetadataPolicy::PreserveExact); + assert_eq!(m.policy, Some(MetadataPolicy::PreserveExact)); + } + + #[test] + fn for_embedding_with_web_strips_camera_keeps_rights() { + // The codec-facing hook: an explicit Web policy strips camera identity + // while keeping orientation + rights. + let src = src_exif(6, "(c) Me", false); + let meta = Metadata::none() + .with_exif(src) + .with_policy(MetadataPolicy::Web); + let embed = meta.for_embedding().expect("policy set ⇒ Some"); + let e = embed.exif.as_deref().expect("EXIF"); + let ex = Exif::parse(e).expect("parses"); + assert_eq!(ex.orientation(), Some(Orientation::Rotate90)); + assert_eq!(ex.copyright().unwrap(), "(c) Me"); + assert!(!has_tag(e, 0x010F), "camera (Make) must be stripped"); + } + + #[test] + fn for_embedding_preserve_exact_is_verbatim() { + let src = src_exif(6, "(c) Me", false); + let meta = Metadata::none() + .with_exif(src.clone()) + .with_policy(MetadataPolicy::PreserveExact); + let embed = meta.for_embedding().expect("policy set ⇒ Some"); + assert_eq!(embed.exif.as_deref(), Some(src.as_slice()), "verbatim"); + assert!( + has_tag(embed.exif.as_deref().unwrap(), 0x010F), + "camera kept" + ); + } + + #[test] + fn for_embedding_output_is_marked_final_no_double_strip() { + let src = src_exif(6, "(c) Me", false); + let once = Metadata::none() + 
.with_exif(src) + .with_policy(MetadataPolicy::Web) + .for_embedding() + .expect("policy set ⇒ Some"); // Web-filtered + assert_eq!(once.policy, Some(MetadataPolicy::PreserveExact)); + // Re-embedding the already-filtered metadata is a no-op, not a re-strip. + let twice = once.for_embedding().expect("PreserveExact ⇒ Some"); + assert_eq!(twice.exif, once.exif); + let ex = Exif::parse(twice.exif.as_deref().unwrap()).unwrap(); + assert_eq!(ex.copyright().unwrap(), "(c) Me"); + } + + // ── with_copyright / with_artist sugar (build/merge an EXIF blob) ───────── + + #[test] + fn with_copyright_creates_blob_from_nothing() { + let meta = Metadata::none().with_copyright("(c) 2026 Lilith"); + let e = meta.exif.as_deref().expect("EXIF created"); + assert_eq!( + Exif::parse(e).unwrap().copyright().unwrap(), + "(c) 2026 Lilith" + ); + } + + #[test] + fn with_copyright_merges_into_existing() { + // src carries Make (camera), Orientation=6 (Rotate90), Copyright "old". + let src = src_exif(6, "old", false); + let meta = Metadata::none() + .with_exif(src) + .with_copyright("(c) New Owner"); + let e = meta.exif.as_deref().expect("EXIF"); + let x = Exif::parse(e).unwrap(); + assert_eq!(x.copyright().unwrap(), "(c) New Owner"); // replaced + assert_eq!(x.orientation(), Some(Orientation::Rotate90)); // preserved + assert!(has_tag(e, 0x010F), "Make preserved on merge"); + assert_eq!(meta.orientation, Orientation::Rotate90); // with_exif synced it + } + + #[test] + fn with_artist_creates_blob() { + let meta = Metadata::none().with_artist("Lilith"); + let e = meta.exif.as_deref().expect("EXIF"); + assert_eq!(Exif::parse(e).unwrap().artist().unwrap(), "Lilith"); + } + + #[test] + fn filtered_reconciles_baked_orientation_tag() { + // Simulate a decoder that baked orientation upright: the structured field + // is Identity, but the source EXIF blob still carries Rotate90 (6). 
+ let blob = src_exif(6, "(c) Owner", false); + let meta = Metadata::none() + .with_exif(blob) // parses 6 → field = Rotate90 + .with_orientation(Orientation::Identity); // baked: field reset to Identity + assert_eq!(meta.orientation, Orientation::Identity); + // The unfiltered blob still says Rotate90 — the divergence. + assert_eq!( + parse_exif_orientation(meta.exif.as_deref().unwrap()), + Some(Orientation::Rotate90) + ); + + // filtered() rewrites the embedded tag to match the authoritative field, + // so the emitted metadata is self-consistent (no double-rotation). + let out = meta.filtered(&MetadataPolicy::PreserveExact); + assert_eq!(out.orientation, Orientation::Identity); + assert_eq!( + parse_exif_orientation(out.exif.as_deref().unwrap()), + Some(Orientation::Identity), + "baked-upright blob must be rewritten to Identity, not left at Rotate90" + ); + } + + #[test] + fn custom_drop_only_camera_keeps_rest() { + let src = src_exif(6, "(c) Owner", false); + let fields = MetadataFields { + exif: ExifPolicy { + camera: Retention::Discard, + ..ExifPolicy::KEEP_ALL + }, + ..MetadataFields::KEEP_ALL + }; + let out = Metadata::none() + .with_exif(src) + .filtered(&MetadataPolicy::Custom(fields)); + let e = out.exif.as_deref().expect("EXIF"); + let ex = Exif::parse(e).expect("parses"); + assert_eq!(ex.orientation(), Some(Orientation::Rotate90)); + assert_eq!(ex.copyright().unwrap(), "(c) Owner"); + assert!(!has_tag(e, 0x010F)); // only camera dropped + } + + #[test] + fn dropping_orientation_resets_field_to_identity() { + let meta = Metadata::none().with_orientation(Orientation::Rotate90); + let fields = MetadataFields { + exif: ExifPolicy { + orientation: Retention::Discard, + ..ExifPolicy::KEEP_ALL + }, + ..MetadataFields::KEEP_ALL + }; + let out = meta.filtered(&MetadataPolicy::Custom(fields)); + assert_eq!(out.orientation, Orientation::Identity); + } + + #[test] + fn exif_prefix_preserved_through_rewrite() { + let src = src_exif(6, "(c) Owner", true); // Exif\0\0 
prefix + let out = Metadata::none() + .with_exif(src) + .filtered(&MetadataPolicy::Web); + let e = out.exif.as_deref().expect("EXIF"); + assert_eq!(&e[..6], b"Exif\0\0"); + let ex = Exif::parse(e).expect("parses"); + assert_eq!(ex.orientation(), Some(Orientation::Rotate90)); + assert_eq!(ex.copyright().unwrap(), "(c) Owner"); + } + + #[test] + fn filtered_empty_metadata_is_empty() { + for p in [ + MetadataPolicy::PreserveExact, + MetadataPolicy::Preserve, + MetadataPolicy::Web, + MetadataPolicy::ColorAndRotation, + ] { + assert!(Metadata::none().filtered(&p).is_empty()); + } + } } diff --git a/src/policy.rs b/src/policy.rs index c29d907..cb614f4 100644 --- a/src/policy.rs +++ b/src/policy.rs @@ -224,29 +224,55 @@ impl DecodePolicy { } } -/// Encode metadata policy. +/// Output-emission policy for an encode or transcode: which color carrier to +/// emit, which metadata to retain, and a coarse per-channel embed gate. One +/// object, three concerns that apply at different stages. /// -/// Controls which metadata an encoder embeds in the output. -/// All fields default to `None`, meaning the codec uses its own default. -/// `Some(true)` explicitly allows embedding; `Some(false)` explicitly strips. +/// - [`color`](Self::color) — color-carrier emission (ICC bytes vs CICP code +/// points). The codec reads it during encode via +/// [`resolve_color`](Self::resolve_color) and feeds it to +/// [`resolve_color_emit`](crate::resolve_color_emit). `None` defers to the +/// codec's default. +/// - [`metadata`](Self::metadata) — field-level retention (which EXIF tags, a +/// redundant sRGB ICC, XMP, CICP/HDR signaling). Applied by the pipeline or +/// caller via [`Metadata::filtered`](crate::Metadata::filtered) *before* the +/// record reaches the codec, so it is always honored; codecs do not read this +/// field. `None` leaves the record unfiltered. 
+/// - [`embed_icc`](Self::embed_icc) / [`embed_exif`](Self::embed_exif) / +/// [`embed_xmp`](Self::embed_xmp) — a coarse, best-effort per-channel embed +/// gate handed to the codec via +/// [`EncodeJob::with_policy`](crate::encode::EncodeJob::with_policy). +/// Tri-state (`None` = codec default, `Some(true/false)` = embed/strip), +/// whole-channel only. Best-effort: the `with_policy` default is a no-op, so a +/// codec that does not implement it silently ignores this gate. For reliable +/// retention use `metadata`, not these. /// /// # Example /// /// ``` /// use zencodec::encode::EncodePolicy; +/// use zencodec::{ColorEmitPolicy, MetadataPolicy}; /// -/// // Strip all metadata from output -/// let policy = EncodePolicy::strip_all(); -/// -/// // Or fine-grained: keep ICC, strip EXIF/XMP +/// // Smallest output: prefer compact color carriers, keep only color + rotation. /// let policy = EncodePolicy::none() -/// .with_embed_icc(true) -/// .with_embed_exif(false) -/// .with_embed_xmp(false); +/// .with_color(ColorEmitPolicy::Compact) +/// .with_metadata_policy(MetadataPolicy::ColorAndRotation); +/// +/// // Coarse legacy gate: ask the codec to strip every metadata channel. +/// let policy = EncodePolicy::strip_all(); /// ``` #[derive(Clone, Copy, Debug, Default, PartialEq, Eq)] #[non_exhaustive] pub struct EncodePolicy { + /// Color-carrier emission policy (ICC bytes vs CICP code points). `None` + /// defers to the codec's default. The codec reads it during encode via + /// [`resolve_color`](EncodePolicy::resolve_color). + pub color: Option<crate::ColorEmitPolicy>, + /// Field-level metadata retention. `None` leaves the record unfiltered. + /// Applied by the pipeline/caller via + /// [`Metadata::filtered`](crate::Metadata::filtered) before encode; codecs + /// do not read this field. + pub metadata: Option<crate::MetadataPolicy>, /// Embed ICC color profiles in the output. pub embed_icc: Option<bool>, /// Embed EXIF metadata in the output. 
@@ -255,12 +281,17 @@ pub struct EncodePolicy { pub embed_xmp: Option<bool>, } -const _: () = assert!(core::mem::size_of::<EncodePolicy>() == 3); +// No longer a 3-byte gate: EncodePolicy now bundles the color + metadata +// policies. Keep a loose upper bound to catch accidental bloat (e.g. a field +// that pulls in a Vec/Arc). +const _: () = assert!(core::mem::size_of::<EncodePolicy>() <= 32); impl EncodePolicy { /// No preferences — codec uses its own defaults. pub const fn none() -> Self { Self { + color: None, + metadata: None, embed_icc: None, embed_exif: None, embed_xmp: None, @@ -270,6 +301,14 @@ impl EncodePolicy { /// Strip all metadata from output. pub const fn strip_all() -> Self { Self { + color: None, + // Carry a real discard policy through the reliable metadata channel + // (`Metadata::filtered` / `resolve_metadata`), not only the advisory + // `embed_*` flags — the latter silently no-op on codecs that don't + // implement `with_policy`, so a strip via flags alone could leak. + metadata: Some(crate::MetadataPolicy::Custom( + crate::MetadataFields::DISCARD_ALL, + )), embed_icc: Some(false), embed_exif: Some(false), embed_xmp: Some(false), @@ -279,6 +318,8 @@ impl EncodePolicy { /// Preserve all metadata in output. pub const fn preserve_all() -> Self { Self { + color: None, + metadata: Some(crate::MetadataPolicy::PreserveExact), embed_icc: Some(true), embed_exif: Some(true), embed_xmp: Some(true), @@ -326,6 +367,37 @@ impl EncodePolicy { None => default, } } + + /// Set the color-carrier emission policy. + pub const fn with_color(mut self, policy: crate::ColorEmitPolicy) -> Self { + self.color = Some(policy); + self + } + + /// Set the field-level metadata retention policy. + pub const fn with_metadata_policy(mut self, policy: crate::MetadataPolicy) -> Self { + self.metadata = Some(policy); + self + } + + /// Resolve the color-carrier emission policy, falling back to `default` — + /// codecs pass their own default here (e.g.
+ /// [`ColorEmitPolicy::Balanced`](crate::ColorEmitPolicy::Balanced)), so a + /// caller that set nothing keeps the codec's behavior. + pub const fn resolve_color(&self, default: crate::ColorEmitPolicy) -> crate::ColorEmitPolicy { + match self.color { + Some(p) => p, + None => default, + } + } + + /// Resolve the metadata retention policy, falling back to `default`. + pub const fn resolve_metadata(&self, default: crate::MetadataPolicy) -> crate::MetadataPolicy { + match self.metadata { + Some(p) => p, + None => default, + } + } } #[cfg(test)] @@ -392,6 +464,13 @@ mod tests { assert_eq!(p.embed_icc, Some(false)); assert_eq!(p.embed_exif, Some(false)); assert_eq!(p.embed_xmp, Some(false)); + // Reliable channel: strip_all carries a real discard policy, so a + // pipeline applying `resolve_metadata` actually strips even when the + // advisory embed_* flags are a no-op on the codec. + assert_eq!( + p.resolve_metadata(crate::MetadataPolicy::Web), + crate::MetadataPolicy::Custom(crate::MetadataFields::DISCARD_ALL) + ); } #[test] @@ -400,6 +479,10 @@ mod tests { assert_eq!(p.embed_icc, Some(true)); assert_eq!(p.embed_exif, Some(true)); assert_eq!(p.embed_xmp, Some(true)); + assert_eq!( + p.resolve_metadata(crate::MetadataPolicy::Web), + crate::MetadataPolicy::PreserveExact + ); } #[test] diff --git a/tests/exif_differential.rs b/tests/exif_differential.rs new file mode 100644 index 0000000..a36264a --- /dev/null +++ b/tests/exif_differential.rs @@ -0,0 +1,200 @@ +//! Differential tests: parse the same EXIF blobs with `zencodec::exif::Exif` +//! and the mature `kamadak-exif` crate, and assert the accessor outputs agree. +//! +//! Scope: well-formed blobs where parity is meaningful (orientation as +//! SHORT/LONG, copyright/artist inline + out-of-line, both byte orders). The +//! oracle's raw path doesn't strip the `Exif\0\0` prefix, so the harness does. +//! Behavioral seams where zencodec is deliberately lenient (missing next-IFD +//! 
offset, child next pointers, >8 IFD chains) are out of scope here — those +//! are covered by the in-crate unit tests and fuzzing. + +use exif::{In, Tag, Value}; +use zencodec::exif::Exif; + +/// Read orientation + copyright + artist from the oracle (`kamadak-exif`). +fn oracle(blob: &[u8]) -> Option<(Option<u32>, Option<String>, Option<String>)> { + let tiff: &[u8] = blob.strip_prefix(b"Exif\0\0").unwrap_or(blob); + let (fields, _le) = exif::parse_exif(tiff).ok()?; + let get = |t: Tag| { + fields + .iter() + .find(|f| f.tag == t && f.ifd_num == In::PRIMARY) + }; + let orientation = get(Tag::Orientation).and_then(|f| f.value.get_uint(0)); + let ascii = |t: Tag| -> Option<String> { + match &get(t)?.value { + Value::Ascii(v) if !v.is_empty() && !v[0].is_empty() => { + Some(String::from_utf8_lossy(&v[0]).into_owned()) + } + _ => None, + } + }; + Some((orientation, ascii(Tag::Copyright), ascii(Tag::Artist))) +} + +/// Build a well-formed TIFF (`be` = big-endian). IFD0 at offset 8 with an +/// orientation entry (SHORT or LONG) plus optional copyright/artist ASCII +/// entries (out-of-line when > 4 bytes). Tag-sorted: 0x0112 < 0x013B < 0x8298. +fn build( + be: bool, + orientation: u16, + ori_long: bool, + copyright: Option<&str>, + artist: Option<&str>, +) -> Vec<u8> { + let w16 = |v: &mut Vec<u8>, x: u16| { + v.extend_from_slice(&if be { x.to_be_bytes() } else { x.to_le_bytes() }) + }; + let w32 = |v: &mut Vec<u8>, x: u32| { + v.extend_from_slice(&if be { x.to_be_bytes() } else { x.to_le_bytes() }) + }; + + // Collect entries as (tag, type, count, inline-or-offset value bytes). + struct E { + tag: u16, + kind: u16, + count: u32, + inline: Option<[u8; 4]>, + ext: Vec<u8>, + } + let mut entries: Vec<E> = Vec::new(); + + // Orientation.
+ if ori_long { + let mut v = [0u8; 4]; + v.copy_from_slice(&if be { + u32::from(orientation).to_be_bytes() + } else { + u32::from(orientation).to_le_bytes() + }); + entries.push(E { + tag: 0x0112, + kind: 4, + count: 1, + inline: Some(v), + ext: Vec::new(), + }); + } else { + let mut v = [0u8; 4]; + let b = if be { + orientation.to_be_bytes() + } else { + orientation.to_le_bytes() + }; + v[..2].copy_from_slice(&b); + entries.push(E { + tag: 0x0112, + kind: 3, + count: 1, + inline: Some(v), + ext: Vec::new(), + }); + } + // Artist (0x013B) then Copyright (0x8298) — ASCII, NUL-terminated. + let push_ascii = |entries: &mut Vec<E>, tag: u16, s: &str| { + let mut bytes = s.as_bytes().to_vec(); + bytes.push(0); + if bytes.len() <= 4 { + let mut v = [0u8; 4]; + v[..bytes.len()].copy_from_slice(&bytes); + entries.push(E { + tag, + kind: 2, + count: bytes.len() as u32, + inline: Some(v), + ext: Vec::new(), + }); + } else { + entries.push(E { + tag, + kind: 2, + count: bytes.len() as u32, + inline: None, + ext: bytes, + }); + } + }; + if let Some(a) = artist { + push_ascii(&mut entries, 0x013B, a); + } + if let Some(c) = copyright { + push_ascii(&mut entries, 0x8298, c); + } + + let n = entries.len(); + let ext_base = 8 + 2 + 12 * n + 4; // header + count + entries + next-IFD + + let mut v = Vec::new(); + v.extend_from_slice(if be { b"MM" } else { b"II" }); + w16(&mut v, 42); + w32(&mut v, 8); + w16(&mut v, n as u16); + let mut ext = Vec::new(); + for e in &entries { + w16(&mut v, e.tag); + w16(&mut v, e.kind); + w32(&mut v, e.count); + match &e.inline { + Some(b) => v.extend_from_slice(b), + None => { + w32(&mut v, (ext_base + ext.len()) as u32); + ext.extend_from_slice(&e.ext); + if ext.len() % 2 == 1 { + ext.push(0); + } + } + } + } + w32(&mut v, 0); // next-IFD offset + v.extend_from_slice(&ext); + v +} + +#[test] +fn differential_orientation_copyright_artist() { + let mut compared = 0usize; + for &be in &[false, true] { + for &ori_long in &[false, true] { + for ori in
1u16..=8 { + for copyright in [None, Some("(c)"), Some("Copyright 2026 Lilith")] { + for artist in [None, Some("Me"), Some("Lilith Ver{}er")] { + let blob = build(be, ori, ori_long, copyright, artist); + + // zencodec must always parse a well-formed blob. + let x = Exif::parse(&blob).expect("zencodec parses well-formed blob"); + let zen = ( + x.orientation().map(|o| u32::from(o.to_exif())), + x.copyright().map(|c| c.into_owned()), + x.artist().map(|a| a.into_owned()), + ); + + // Oracle: where it agrees to parse, accessor outputs must match. + if let Some(orc) = oracle(&blob) { + assert_eq!( + zen.0, orc.0, + "orientation mismatch (be={be}, long={ori_long}, ori={ori})" + ); + assert_eq!(zen.1, orc.1, "copyright mismatch ({copyright:?})"); + assert_eq!(zen.2, orc.2, "artist mismatch ({artist:?})"); + compared += 1; + } + } + } + } + } + } + // Sanity: the oracle actually parsed a substantial share, so the assertions ran. + assert!(compared >= 100, "too few oracle comparisons: {compared}"); +} + +#[test] +fn differential_exif_prefix_framing() { + let bare = build(false, 6, false, Some("Copyright 2026"), None); + let mut prefixed = b"Exif\0\0".to_vec(); + prefixed.extend_from_slice(&bare); + + let x = Exif::parse(&prefixed).expect("parses prefixed"); + let orc = oracle(&prefixed).expect("oracle parses (after prefix strip)"); + assert_eq!(x.orientation().map(|o| u32::from(o.to_exif())), orc.0); + assert_eq!(x.copyright().map(|c| c.into_owned()), orc.1); +} diff --git a/tests/fuzz_regression.rs b/tests/fuzz_regression.rs new file mode 100644 index 0000000..71d9acc --- /dev/null +++ b/tests/fuzz_regression.rs @@ -0,0 +1,110 @@ +//! Fuzz crash regression suite. +//! +//! Runs every file in `fuzz/regression/` through the same logic as the +//! `exif_parse`, `exif_roundtrip`, and `exif_filter` fuzz targets, but as a +//! regular `cargo test` — no nightly toolchain needed. Each seed is a +//! previously-found crash that has been fixed; a failure here is a regression. +//! 
+//! To add a seed: drop the (minimized) crash file into `fuzz/regression/` with +//! a `crash-` name. The working corpus and unminimized artifacts live in +//! block storage (`/mnt/v/fuzzes/zencodec/`), not git. + +use std::fs; +use std::path::PathBuf; + +use zencodec::exif::{Exif, ExifPolicy, Retention}; + +fn regression_dir() -> PathBuf { + PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("fuzz/regression") +} + +fn ret(bit: bool) -> Retention { + if bit { + Retention::Keep + } else { + Retention::Discard + } +} + +/// Mirror of `exif_parse`: parse + accessors never panic. +fn run_parse(data: &[u8]) { + if let Some(x) = Exif::parse(data) { + let _ = x.orientation(); + let _ = x.copyright(); + let _ = x.artist(); + let _ = x.has_gps(); + let _ = x.has_thumbnail(); + } +} + +/// Mirror of `exif_roundtrip`: serializer output re-parses, accessors preserved. +fn run_roundtrip(data: &[u8]) { + if let Some(x) = Exif::parse(data) { + let bytes = x.to_bytes(); + let y = Exif::parse(&bytes).expect("serializer output must re-parse"); + assert_eq!(x.orientation(), y.orientation()); + assert_eq!(x.copyright(), y.copyright()); + assert_eq!(x.artist(), y.artist()); + assert_eq!(x.has_gps(), y.has_gps()); + assert_eq!(x.has_thumbnail(), y.has_thumbnail()); + } +} + +/// Mirror of `exif_filter`: prune + serialize + retain never panic. +fn run_filter(data: &[u8]) { + let (cfg, rest) = match data.split_first() { + Some((c, r)) => (*c, r), + None => return, + }; + let policy = ExifPolicy::DISCARD_ALL + .with_orientation(ret(cfg & 0x01 != 0)) + .with_rights(ret(cfg & 0x02 != 0)) + .with_thumbnail(ret(cfg & 0x04 != 0)) + .with_gps(ret(cfg & 0x08 != 0)) + .with_datetimes(ret(cfg & 0x10 != 0)) + .with_camera(ret(cfg & 0x20 != 0)) + .with_other(ret(cfg & 0x40 != 0)); + if let Some(x) = Exif::parse(rest) { + let bytes = x.filtered(&policy).to_bytes(); + // Canonical / idempotent: re-filtering the output is a byte-exact + // fixpoint (regression for a fuzz-found non-idempotence). 
+ if let Some(y) = Exif::parse(&bytes) { + assert_eq!( + bytes, + y.filtered(&policy).to_bytes(), + "filter not idempotent" + ); + } + } + let _ = zencodec::exif::retain(rest, &policy); +} + +#[test] +fn fuzz_regression_seeds() { + let dir = regression_dir(); + let entries = match fs::read_dir(&dir) { + Ok(e) => e, + Err(_) => return, // no seeds yet — nothing to regress + }; + let mut count = 0; + for entry in entries.flatten() { + let path = entry.path(); + if !path.is_file() { + continue; + } + // Skip dotfiles (e.g. .gitkeep). + if path + .file_name() + .and_then(|n| n.to_str()) + .is_some_and(|n| n.starts_with('.')) + { + continue; + } + let data = fs::read(&path).expect("read seed"); + run_parse(&data); + run_roundtrip(&data); + run_filter(&data); + count += 1; + } + eprintln!("fuzz_regression: replayed {count} seed(s)"); +}