From a492f01552ac71ec33dc5bc78e214006b09999be Mon Sep 17 00:00:00 2001 From: UncleSp1d3r Date: Thu, 28 May 2026 20:40:09 -0400 Subject: [PATCH 1/8] refactor(evaluator): thread max_string_length cap through type-read dispatchers (closes 2A-H1) `EvaluationConfig::max_string_length` was documented as a working memory cap in three places (`config.rs`, `evaluator/mod.rs`, `configuration.md`) and exposed via the accessor `EvaluationContext::max_string_length()`, but was never threaded into `read_typed_value_with_pattern` or `read_pattern_match`. A `0 string x %s` rule against a 1 GiB NUL-free buffer allocated the full buffer (capped only by the 1 GiB mmap limit) -- the documented CWE-770 control did nothing. Closes origin findings 2A-H1 (memory cap) and 3A-C2 (regression test coverage). Implements U1 + U2 + U5 from `docs/plans/2026-05-28-001-refactor-pre-1.0-api-hardening-plan.md`. Changes: * U1: Thread `max_string_length: usize` through `evaluate_single_rule_with_anchor`, `evaluate_value_rule`, `evaluate_pattern_rule`, `read_typed_value_with_pattern`, and `read_pattern_match`. The unflagged `(None, _)` String arm now passes `Some(max_string_length)` into `read_string`. The flagged-string arm in `read_pattern_match` builds `scan_buffer` with the cap as an upper bound using the same shape as the existing `max_length: Some(n)` arm (mirroring `buffer.get(..end).unwrap_or(buffer)`, not pre-slicing from `offset` -- a pre-slice would double-offset and silently break flagged-string matches at any non-zero offset). * U2: Six new regression tests in `tests/security_regression.rs` covering the unflagged scan-mode path, NUL-stops-before-cap, cap-larger-than-buffer, minimum-valid-cap (cap=1; cap=0 is rejected by `EvaluationConfig::validate`), and a flagged `/W` whitespace-walk smoke test. All six fail against the prior `main` and pass after U1. * U5: Demote the 14 leaf `read_*` re-exports plus `read_typed_value`, `read_typed_value_with_pattern`, and `coerce_value_to_type` from `pub` to `pub(crate)`. `read_pattern_match` was already `pub(crate)` at HEAD. Doctests in the `read_*` source files that imported via `libmagic_rs::evaluator::types::read_*` are marked `ignore` since those paths are no longer reachable externally; the public surface goes through `MagicDatabase` / the documented `evaluator::` re-exports per GOTCHAS S4.1. `read_typed_value` is kept as a 3-arg convenience for `#[cfg(test)]` callers (~30 inline test sites in `src/evaluator/types/`) and delegates to the 5-arg form with `DEFAULT_MAX_STRING_LENGTH = 8192` (matching `EvaluationConfig::default().max_string_length`). The engine path supplies the user-configured cap via the explicit parameter; the test default is informational and does not affect production paths. Remaining work from the plan: * U3: doc verification + security-assurance.md credit * U4: `#[non_exhaustive]` on 13 public structs + ~64 test-site migration * U6: `#[serde(skip)]` on `RuleMatch.type_kind` Signed-off-by: UncleSp1d3r --- src/evaluator/engine/mod.rs | 32 +++-- src/evaluator/engine/tests/mod.rs | 8 +- src/evaluator/types/date.rs | 4 +- src/evaluator/types/float.rs | 4 +- src/evaluator/types/mod.rs | 123 +++++++++---------- src/evaluator/types/numeric.rs | 8 +- src/evaluator/types/string.rs | 6 +- tests/security_regression.rs | 193 ++++++++++++++++++++++++++++++ 8 files changed, 292 insertions(+), 86 deletions(-) diff --git a/src/evaluator/engine/mod.rs b/src/evaluator/engine/mod.rs index d014a9f..fbdf7e7 100644 --- a/src/evaluator/engine/mod.rs +++ b/src/evaluator/engine/mod.rs @@ -212,6 +212,7 @@ fn evaluate_single_rule_with_anchor( buffer: &[u8], last_match_end: usize, base_offset: usize, + max_string_length: usize, ) -> Result, LibmagicError> { use crate::parser::ast::TypeKind; @@ -261,7 +262,7 @@ fn evaluate_single_rule_with_anchor( } TypeKind::Meta(_) => return Ok(None), TypeKind::Regex { .. } | TypeKind::Search { .. } => { - evaluate_pattern_rule(rule, buffer, absolute_offset)? + evaluate_pattern_rule(rule, buffer, absolute_offset, max_string_length)? } // Flagged `string` rules route through the pattern-bearing path // (see GOTCHAS S2.4 for the contract) so `compare_string_with_flags` @@ -269,9 +270,9 @@ fn evaluate_single_rule_with_anchor( // Default-flag strings (the common case) take the existing // value-rule fast path with byte-exact `apply_equal`. TypeKind::String { flags, .. } if !flags.is_empty() => { - evaluate_pattern_rule(rule, buffer, absolute_offset)? + evaluate_pattern_rule(rule, buffer, absolute_offset, max_string_length)? } - _ => evaluate_value_rule(rule, buffer, absolute_offset)?, + _ => evaluate_value_rule(rule, buffer, absolute_offset, max_string_length)?, }; Ok(matched.then_some((absolute_offset, read_value))) } @@ -381,10 +382,16 @@ fn evaluate_pattern_rule( rule: &MagicRule, buffer: &[u8], absolute_offset: usize, + max_string_length: usize, ) -> Result<(bool, crate::parser::ast::Value), LibmagicError> { - let match_outcome = - types::read_pattern_match(buffer, absolute_offset, &rule.typ, Some(&rule.value)) - .map_err(|e| LibmagicError::EvaluationError(e.into()))?; + let match_outcome = types::read_pattern_match( + buffer, + absolute_offset, + &rule.typ, + Some(&rule.value), + max_string_length, + ) + .map_err(|e| LibmagicError::EvaluationError(e.into()))?; let pattern_found = match_outcome.is_some(); let matched = match &rule.op { crate::parser::ast::Operator::Equal => pattern_found, @@ -417,10 +424,16 @@ fn evaluate_value_rule( rule: &MagicRule, buffer: &[u8], absolute_offset: usize, + max_string_length: usize, ) -> Result<(bool, crate::parser::ast::Value), LibmagicError> { - let read_value = - types::read_typed_value_with_pattern(buffer, absolute_offset, &rule.typ, Some(&rule.value)) - .map_err(|e| LibmagicError::EvaluationError(e.into()))?; + let read_value = types::read_typed_value_with_pattern( + buffer, + absolute_offset, + &rule.typ, + Some(&rule.value), + max_string_length, + ) + .map_err(|e| LibmagicError::EvaluationError(e.into()))?; // Apply any pre-comparison value transform (`type+N`/`type-N`/`type*N`/ // `type/N`/`type%N`/`type|N`/`type^N`). The transform runs on the read @@ -1026,6 +1039,7 @@ pub fn evaluate_rules( buffer, context.last_match_end(), context.base_offset(), + context.max_string_length(), ) { Ok(data) => data, Err( diff --git a/src/evaluator/engine/tests/mod.rs b/src/evaluator/engine/tests/mod.rs index 8c1516d..1a1a90b 100644 --- a/src/evaluator/engine/tests/mod.rs +++ b/src/evaluator/engine/tests/mod.rs @@ -23,7 +23,13 @@ fn evaluate_single_rule_legacy( rule: &MagicRule, buffer: &[u8], ) -> Result, LibmagicError> { - evaluate_single_rule_with_anchor(rule, buffer, 0, 0) + evaluate_single_rule_with_anchor( + rule, + buffer, + 0, + 0, + crate::evaluator::types::DEFAULT_MAX_STRING_LENGTH, + ) } #[test] diff --git a/src/evaluator/types/date.rs b/src/evaluator/types/date.rs index 8ecc623..d9531a5 100644 --- a/src/evaluator/types/date.rs +++ b/src/evaluator/types/date.rs @@ -29,7 +29,7 @@ const MONTH_NAMES: [&str; 12] = [ /// /// # Examples /// -/// ``` +/// ```ignore /// use libmagic_rs::evaluator::types::read_date; /// use libmagic_rs::parser::ast::{Endianness, Value}; /// @@ -77,7 +77,7 @@ pub fn read_date( /// /// # Examples /// -/// ``` +/// ```ignore /// use libmagic_rs::evaluator::types::read_qdate; /// use libmagic_rs::parser::ast::{Endianness, Value}; /// diff --git a/src/evaluator/types/float.rs b/src/evaluator/types/float.rs index 7a3c8ba..7f8dba0 100644 --- a/src/evaluator/types/float.rs +++ b/src/evaluator/types/float.rs @@ -16,7 +16,7 @@ use crate::parser::ast::{Endianness, Value}; /// /// # Examples /// -/// ``` +/// ```ignore /// use libmagic_rs::evaluator::types::read_float; /// use libmagic_rs::parser::ast::{Endianness, Value}; /// @@ -56,7 +56,7 @@ pub fn read_float( /// /// # Examples /// -/// ``` +/// ```ignore /// use libmagic_rs::evaluator::types::read_double; /// use libmagic_rs::parser::ast::{Endianness, Value}; /// diff --git a/src/evaluator/types/mod.rs b/src/evaluator/types/mod.rs index a7bb168..a8f2961 100644 --- a/src/evaluator/types/mod.rs +++ b/src/evaluator/types/mod.rs @@ -18,13 +18,13 @@ use std::borrow::Cow; use thiserror::Error; use date::format_timestamp_value; -pub use date::{read_date, read_qdate}; -pub use float::{read_double, read_float}; -pub use numeric::{read_byte, read_long, read_quad, read_short}; -pub use regex::read_regex; -pub use search::read_search; +pub(crate) use date::{read_date, read_qdate}; +pub(crate) use float::{read_double, read_float}; +pub(crate) use numeric::{read_byte, read_long, read_quad, read_short}; +pub(crate) use regex::read_regex; +pub(crate) use search::read_search; use string::string16_bytes_consumed; -pub use string::{read_pstring, read_string, read_string_exact, read_string16}; +pub(crate) use string::{read_pstring, read_string, read_string_exact, read_string16}; /// Reads a fixed-size byte array from the buffer at the given offset. /// @@ -101,13 +101,22 @@ pub enum TypeReadError { }, } +/// Default `max_string_length` used by [`read_typed_value`] when callers +/// do not supply an explicit cap. Matches +/// `EvaluationConfig::default().max_string_length` so unit tests that +/// invoke `read_typed_value` directly see the same scan-mode bound the +/// engine applies at evaluation time. The engine call path +/// (`evaluate_value_rule`) threads the user-configured cap, so this +/// constant only governs ad-hoc / test usage. +#[cfg(test)] +pub(crate) const DEFAULT_MAX_STRING_LENGTH: usize = 8192; + /// Reads bytes according to the specified `TypeKind`. /// -/// This is the public dispatch entry point for type reading for non -/// pattern-bearing types. It preserves the original three-argument -/// signature used by external consumers -- fixed-width numeric, float, -/// date, string, and pstring types need no pattern operand, so the hot -/// path stays ergonomic. +/// This is the internal dispatch entry point for type reading for +/// non-pattern-bearing types. Fixed-width numeric, float, date, string, +/// and pstring types need no pattern operand, so the hot path stays +/// ergonomic. /// /// For pattern-bearing types (`TypeKind::Regex`, `TypeKind::Search`) this /// function will return `TypeReadError::UnsupportedType` because the @@ -115,25 +124,6 @@ pub enum TypeReadError { /// rules should use [`read_typed_value_with_pattern`] and thread the rule /// value operand through as `pattern`. /// -/// # Examples -/// -/// ``` -/// use libmagic_rs::evaluator::types::read_typed_value; -/// use libmagic_rs::parser::ast::{Endianness, TypeKind, Value}; -/// -/// let buffer = &[0x7f, 0x45, 0x4c, 0x46, 0x34, 0x12]; -/// let byte_result = -/// read_typed_value(buffer, 0, &TypeKind::Byte { signed: false }).unwrap(); -/// assert_eq!(byte_result, Value::Uint(0x7f)); -/// -/// let short_type = TypeKind::Short { -/// endian: Endianness::Little, -/// signed: false, -/// }; -/// let short_result = read_typed_value(buffer, 4, &short_type).unwrap(); -/// assert_eq!(short_result, Value::Uint(0x1234)); -/// ``` -/// /// # Errors /// /// Returns `TypeReadError::BufferOverrun` when the requested value extends @@ -141,12 +131,20 @@ pub enum TypeReadError { /// pattern-bearing type is evaluated without a pattern, or /// `TypeReadError::InvalidPStringLength` for a malformed Pascal string /// length prefix. -pub fn read_typed_value( +/// +/// This three-argument form defaults `max_string_length` to +/// [`DEFAULT_MAX_STRING_LENGTH`] (8192 bytes, matching +/// `EvaluationConfig::default()`). The engine's value-rule path supplies +/// the user-configured cap via [`read_typed_value_with_pattern`] directly, +/// so this helper exists solely for unit tests that want a one-shot +/// type-read without constructing a context. +#[cfg(test)] +pub(crate) fn read_typed_value( buffer: &[u8], offset: usize, type_kind: &TypeKind, ) -> Result { - read_typed_value_with_pattern(buffer, offset, type_kind, None) + read_typed_value_with_pattern(buffer, offset, type_kind, None, DEFAULT_MAX_STRING_LENGTH) } /// Reads bytes according to the specified `TypeKind`, threading a @@ -160,23 +158,6 @@ pub fn read_typed_value( /// is ignored; external callers for those types should prefer the simpler /// three-argument [`read_typed_value`] wrapper. /// -/// # Examples -/// -/// ``` -/// use libmagic_rs::evaluator::types::read_typed_value_with_pattern; -/// use libmagic_rs::parser::ast::{RegexCount, RegexFlags, TypeKind, Value}; -/// -/// let haystack = b"abc123def"; -/// let regex_type = TypeKind::Regex { -/// flags: RegexFlags::default(), -/// count: RegexCount::Default, -/// }; -/// let pattern = Value::String("[0-9]+".to_string()); -/// let regex_result = -/// read_typed_value_with_pattern(haystack, 0, ®ex_type, Some(&pattern)).unwrap(); -/// assert_eq!(regex_result, Value::String("123".to_string())); -/// ``` -/// /// # Errors /// /// Returns `TypeReadError::BufferOverrun` when the requested value extends @@ -184,11 +165,19 @@ pub fn read_typed_value( /// pattern fails to compile or a pattern-bearing type is evaluated without /// a pattern, or `TypeReadError::InvalidPStringLength` for a malformed /// Pascal string length prefix. -pub fn read_typed_value_with_pattern( +/// +/// `max_string_length` bounds the scan-mode string read on the +/// `(None, _)` arm of [`TypeKind::String`]. Without it, `string x` rules +/// against an attacker-controlled NUL-free buffer could allocate up to +/// the full buffer length (origin finding 2A-H1 / CWE-770). The cap is +/// wired from [`EvaluationContext::max_string_length`] at the engine call +/// site. +pub(crate) fn read_typed_value_with_pattern( buffer: &[u8], offset: usize, type_kind: &TypeKind, pattern: Option<&Value>, + max_string_length: usize, ) -> Result { match type_kind { TypeKind::Byte { signed } => read_byte(buffer, offset, *signed), @@ -232,7 +221,12 @@ pub fn read_typed_value_with_pattern( (Some(n), _) => read_string_exact(buffer, offset, *n), (None, Some(Value::String(p))) => read_string_exact(buffer, offset, p.len()), (None, Some(Value::Bytes(b))) => read_string_exact(buffer, offset, b.len()), - (None, _) => read_string(buffer, offset, None), + // 2A-H1: thread the configured cap into the scan-mode read. + // Without this, `string x` rules against attacker-controlled + // NUL-free buffers could allocate up to the full buffer + // length, defeating the CWE-770 control documented in + // `EvaluationConfig::max_string_length`. + (None, _) => read_string(buffer, offset, Some(max_string_length)), } } TypeKind::String16 { endian } => read_string16(buffer, offset, *endian), @@ -306,6 +300,7 @@ pub(crate) fn read_pattern_match( offset: usize, type_kind: &TypeKind, pattern: Option<&Value>, + max_string_length: usize, ) -> Result, TypeReadError> { match type_kind { TypeKind::Regex { flags, count } => { @@ -383,11 +378,18 @@ pub(crate) fn read_pattern_match( // window is shorter than the pattern, the comparison naturally // produces no match via `compare_string_with_flags`'s EOF // handling -- no special case needed. - let scan_buffer: &[u8] = if let Some(n) = max_length { - let end = offset.saturating_add(*n).min(buffer.len()); + // + // 2A-H1: When AST `max_length` is `None`, fall back to the + // configured `max_string_length` cap rather than passing the + // full buffer. The cap is applied to the buffer's UPPER bound + // (not pre-sliced from `offset`) because + // `compare_string_with_flags` slices internally via + // `buffer.get(offset..)?` -- pre-slicing would double-offset + // and silently produce no-match at any non-zero offset. + let scan_buffer: &[u8] = { + let cap = max_length.unwrap_or(max_string_length); + let end = offset.saturating_add(cap).min(buffer.len()); buffer.get(..end).unwrap_or(buffer) - } else { - buffer }; match string::compare_string_with_flags(trimmed, scan_buffer, offset, *flags) { Some(consumed) => { @@ -510,17 +512,8 @@ fn trim_ascii_whitespace(s: &[u8]) -> &[u8] { /// when the value must be transformed. This avoids an allocation on every /// rule evaluation for `Value::String` and other pass-through cases. /// -/// # Examples -/// -/// ``` -/// use libmagic_rs::evaluator::types::coerce_value_to_type; -/// use libmagic_rs::parser::ast::{TypeKind, Value}; -/// -/// let coerced = coerce_value_to_type(&Value::Uint(0xff), &TypeKind::Byte { signed: true }); -/// assert_eq!(*coerced, Value::Int(-1)); -/// ``` #[must_use] -pub fn coerce_value_to_type<'a>(value: &'a Value, type_kind: &TypeKind) -> Cow<'a, Value> { +pub(crate) fn coerce_value_to_type<'a>(value: &'a Value, type_kind: &TypeKind) -> Cow<'a, Value> { match (value, type_kind) { (Value::Uint(v), TypeKind::Byte { signed: true }) if *v > i8::MAX as u64 => { diff --git a/src/evaluator/types/numeric.rs b/src/evaluator/types/numeric.rs index f7fba43..32c9aa4 100644 --- a/src/evaluator/types/numeric.rs +++ b/src/evaluator/types/numeric.rs @@ -14,7 +14,7 @@ use crate::parser::ast::{Endianness, Value}; /// /// # Examples /// -/// ``` +/// ```ignore /// use libmagic_rs::evaluator::types::read_byte; /// use libmagic_rs::parser::ast::Value; /// @@ -56,7 +56,7 @@ pub fn read_byte(buffer: &[u8], offset: usize, signed: bool) -> Result String { /// /// # Examples /// -/// ``` +/// ```ignore /// use libmagic_rs::evaluator::types::read_string16; /// use libmagic_rs::parser::ast::{Endianness, Value}; /// @@ -401,7 +401,7 @@ pub(crate) fn string16_bytes_consumed(buffer: &[u8], offset: usize, endian: Endi /// /// # Examples /// -/// ``` +/// ```ignore /// use libmagic_rs::evaluator::types::read_pstring; /// use libmagic_rs::parser::ast::{Value, PStringLengthWidth}; /// diff --git a/tests/security_regression.rs b/tests/security_regression.rs index 72dfd3a..4707540 100644 --- a/tests/security_regression.rs +++ b/tests/security_regression.rs @@ -17,6 +17,12 @@ //! `size_limit` / `dfa_size_limit` (CWE-1333). //! * **T-M2 (S13.1)** — `EvaluationConfig::default()` has no timeout; //! this test pins the invariant so a change is a deliberate choice. +//! * **2A-H1 / 3A-C2** — `EvaluationConfig::max_string_length` must be +//! threaded into both the unflagged string dispatcher (`read_typed_value_with_pattern`) +//! AND the flagged-string dispatcher (`read_pattern_match`). Origin +//! `.full-review/05-final-report.md` documented the cap as a working +//! CWE-770 countermeasure that was never wired. These tests pin the +//! fix on both paths. //! //! Tests that require private-module access (codegen round-trip for //! S-L2, `concatenate_messages` backspace edges for S14.1) live inline @@ -186,3 +192,190 @@ fn test_evaluation_config_default_is_unbounded() { and the rustdoc `# Security` section on the MagicDatabase constructors." ); } + +// ============================================================================= +// 2A-H1 / 3A-C2: max_string_length must be honored on both string-read paths +// ============================================================================= + +/// Helper: build a 1 MiB NUL-free buffer to stress the cap. +fn one_mib_nul_free() -> Vec { + vec![b'A'; 1_048_576] +} + +/// Build a `string x` rule (unflagged any-value) with no AST max_length. +/// Exercises the `read_typed_value_with_pattern` `(None, _)` arm. The +/// rule's `value` field is `Value::Uint(0)` (sentinel for `x`); using +/// `Value::String("")` would route through the `(None, Some(Value::String(p)))` +/// arm which reads `p.len() == 0` bytes — not the path we want to exercise. +fn unflagged_string_x_rule() -> libmagic_rs::MagicRule { + use libmagic_rs::parser::ast::StringFlags; + use libmagic_rs::{MagicRule, OffsetSpec, Operator, TypeKind, Value}; + MagicRule::new( + OffsetSpec::Absolute(0), + TypeKind::String { + max_length: None, + flags: StringFlags::default(), + }, + Operator::AnyValue, + Value::Uint(0), + "captured: %s".to_string(), + ) +} + +/// Build a `string/ ""` rule for testing the flagged-string +/// scan window. Flagged strings reject `Operator::AnyValue` (the engine +/// requires `Equal`/`NotEqual` for pattern-bearing types), so we use a +/// concrete pattern and verify match-vs-no-match based on whether the +/// scan window covers the pattern position. +fn flagged_string_equal_rule( + flags: libmagic_rs::parser::ast::StringFlags, + pattern: &str, +) -> libmagic_rs::MagicRule { + use libmagic_rs::{MagicRule, OffsetSpec, Operator, TypeKind, Value}; + MagicRule::new( + OffsetSpec::Absolute(0), + TypeKind::String { + max_length: None, + flags, + }, + Operator::Equal, + Value::String(pattern.to_string()), + "flagged hit".to_string(), + ) +} + +/// Run an evaluation against the rule and return the first match's `Value` +/// if the rule matched, or `None` on no-match. +fn captured_value( + rule: &libmagic_rs::MagicRule, + buffer: &[u8], + cap: usize, +) -> Option { + use libmagic_rs::evaluator::{EvaluationContext, evaluate_rules}; + let config = EvaluationConfig::default() + .with_max_string_length(cap) + .with_timeout_ms(Some(5_000)); + let mut ctx = EvaluationContext::new(config); + let matches = evaluate_rules(std::slice::from_ref(rule), buffer, &mut ctx) + .expect("evaluate_rules should not error for these simple rules"); + matches.into_iter().next().map(|m| m.value) +} + +/// Extract the byte length of a captured string-shaped `Value`. +fn captured_len(v: &libmagic_rs::parser::ast::Value) -> usize { + use libmagic_rs::parser::ast::Value; + match v { + Value::String(s) => s.len(), + Value::Bytes(b) => b.len(), + other => panic!("expected string/bytes capture, got {other:?}"), + } +} + +/// Unflagged path: `0 string x` against a 1 MiB NUL-free buffer with +/// `max_string_length = 64` must produce a capture bounded to 64 bytes. +/// Pre-fix this allocates the full buffer (~1 MiB) because the dispatcher +/// passes `None` to `read_string`. Origin finding 2A-H1. +#[test] +fn test_max_string_length_caps_unflagged_string_x() { + let buf = one_mib_nul_free(); + let rule = unflagged_string_x_rule(); + let captured = + captured_value(&rule, &buf, 64).expect("unflagged `string x` should match any buffer"); + let len = captured_len(&captured); + assert_eq!( + len, 64, + "unflagged string x must cap at max_string_length=64; got {len} bytes \ + (2A-H1 regression: dispatcher dropped the cap)" + ); +} + +/// Flagged-`/W` path: a `string/W "X "` rule against a buffer of all +/// whitespace must walk only `max_string_length` bytes before giving up, +/// not the full buffer. Origin 2A-H1 (flagged-string scan-window variant) +/// — the `/W` operator consumes greedy whitespace which without a cap +/// could walk an arbitrarily large buffer. The test uses a buffer too +/// large to consume completely in any reasonable time bound; the U1 cap +/// prevents the runaway walk. +/// +/// We assert no-match (the pattern `X ` requires a literal `X` after the +/// whitespace run; the buffer has none), and that the evaluation +/// completes well under the cap-implied wall-clock bound. A correctly +/// capped walk completes in microseconds; an uncapped walk through 16 +/// MiB takes meaningfully longer. +#[test] +fn test_max_string_length_caps_flagged_w_whitespace_walk() { + use libmagic_rs::parser::ast::StringFlags; + use std::time::Instant; + + // 16 MiB of whitespace — large enough that an uncapped walk is + // observably slower than a capped one, but not so large that test + // setup dominates the run time. + let buf = vec![b' '; 16 * 1024 * 1024]; + let rule = + flagged_string_equal_rule(StringFlags::default().with_compact_whitespace(true), " X"); + + let cap = 1024usize; + let start = Instant::now(); + let result = captured_value(&rule, &buf, cap); + let elapsed = start.elapsed(); + + assert!( + result.is_none(), + "flagged string/W ' X' must NOT match an all-whitespace buffer; got {result:?}" + ); + // With a 1024-byte cap the comparator walks at most ~1024 bytes. + // A pessimistic bound of 100 ms covers any reasonable CI environment; + // an uncapped walk through 16 MiB takes substantially longer. + assert!( + elapsed.as_millis() < 100, + "flagged string/W against 16 MiB whitespace ran for {elapsed:?} \ + (2A-H1 regression: flagged-string scan_buffer ignored max_string_length=1024)" + ); +} + +/// Minimum valid cap (cap = 1) must produce a 1-byte result on the +/// unflagged path. `EvaluationConfig::validate()` rejects +/// `max_string_length == 0` at construction time, so cap=0 cannot be +/// reached through the public API. +#[test] +fn test_max_string_length_minimum_cap_returns_one_byte() { + let buf = one_mib_nul_free(); + let v = captured_value(&unflagged_string_x_rule(), &buf, 1).expect("must match"); + assert_eq!( + captured_len(&v), + 1, + "unflagged: cap=1 must yield 1-byte capture; got {} bytes", + captured_len(&v) + ); +} + +/// NUL before cap: the unflagged path stops at the first NUL even when +/// the configured cap would allow reading further. Confirms the cap is +/// an upper bound, not a target. +#[test] +fn test_max_string_length_unflagged_stops_at_nul_before_cap() { + let mut buf = b"hello\0".to_vec(); + buf.extend(std::iter::repeat_n(b'A', 1_048_576)); + let v = captured_value(&unflagged_string_x_rule(), &buf, 64).expect("must match"); + assert_eq!( + captured_len(&v), + 5, + "unflagged path must stop at NUL even when cap is larger; \ + got {} bytes", + captured_len(&v) + ); +} + +/// Cap larger than remaining buffer must return the entire buffer (or up +/// to the first NUL, whichever comes first). The cap is an upper bound. +#[test] +fn test_max_string_length_cap_larger_than_buffer_returns_full_buffer() { + let buf = vec![b'A'; 100]; + let v = captured_value(&unflagged_string_x_rule(), &buf, 1_000_000).expect("must match"); + assert_eq!( + captured_len(&v), + 100, + "cap larger than buffer should return full buffer; got {} bytes", + captured_len(&v) + ); +} From efc151db231bda16daa2fd6904bbcd25a3faee5f Mon Sep 17 00:00:00 2001 From: UncleSp1d3r Date: Thu, 28 May 2026 20:42:47 -0400 Subject: [PATCH 2/8] refactor(evaluator): drop Deserialize from RuleMatch and add #[serde(skip)] on type_kind (closes 1B-H2) Library-side `EvaluationResult` carries `RuleMatch.type_kind: TypeKind` for runtime needs (`format_magic_message` width-masking, `bit_width()` derivation in output formatting). Serializing the result directly via `serde_json::to_string` was leaking the full parser AST shape into JSON output -- CWE-200 information exposure documented in origin findings 1B-H2 and 2A-M1. The documented JSON contract is `output::json::JsonMatchResult` which already omits `type_kind`. This commit aligns the library-side type with that contract without changing the runtime API: * Add `#[serde(skip)]` to `RuleMatch.type_kind`. Rust-side consumers continue to access the field directly; only the serialized form is affected. * Drop `Deserialize` from `RuleMatch`, `EvaluationResult`, and `EvaluationMetadata`. A reconstructed `RuleMatch` would lack the buffer context it was produced against, so deserialization was never a meaningful operation. The only `Deserialize`-based test in the codebase (`src/output/mod.rs:973`) round-trips the **output-side** `EvaluationResult` (which contains `MatchResult`, not `RuleMatch`) and is unaffected. * Add two regression tests in `tests/json_integration_test.rs`: one asserting the JSON output contains no `type_kind` key, one asserting the Rust field access still works for runtime consumers. Implements U6 from `docs/plans/2026-05-28-001-refactor-pre-1.0-api-hardening-plan.md`. Signed-off-by: UncleSp1d3r --- src/evaluator/mod.rs | 21 +++++++- src/lib.rs | 6 +-- tests/json_integration_test.rs | 94 ++++++++++++++++++++++++++++++++++ 3 files changed, 116 insertions(+), 5 deletions(-) diff --git a/src/evaluator/mod.rs b/src/evaluator/mod.rs index c88f256..09a83da 100644 --- a/src/evaluator/mod.rs +++ b/src/evaluator/mod.rs @@ -8,7 +8,7 @@ //! re-exports the core evaluation functions from submodules. use crate::{EvaluationConfig, LibmagicError}; -use serde::{Deserialize, Serialize}; +use serde::Serialize; mod engine; pub mod offset; @@ -396,7 +396,14 @@ impl Drop for RecursionGuard<'_> { /// /// Contains information extracted from a successful rule match, including /// the matched value, position, and confidence score. -#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] +/// +/// This type derives `Serialize` so callers can convert evaluation results +/// to JSON, but intentionally does NOT derive `Deserialize`: a +/// reconstructed `RuleMatch` would lack the buffer context it was +/// produced against, so deserialization is not a meaningful operation. +/// The output-side conversion layer (`output::MatchResult` / +/// `output::json::JsonMatchResult`) is the documented JSON contract. +#[derive(Debug, Clone, PartialEq, Serialize)] pub struct RuleMatch { /// The message associated with the matching rule pub message: String, @@ -410,6 +417,16 @@ pub struct RuleMatch { /// /// Carries the source `TypeKind` so downstream consumers (e.g., output /// formatting) can determine the on-disk width of the matched value. + /// + /// `#[serde(skip)]` keeps the parser AST out of JSON output produced + /// by serializing `EvaluationResult` directly via + /// `serde_json::to_string(&result)`. The documented JSON contract is + /// `JsonMatchResult` in `src/output/json.rs`, which omits this field. + /// Origin findings 1B-H2 / 2A-M1 (CWE-200 information exposure). + /// Rust-side consumers continue to access `type_kind` via field access + /// for runtime needs (`format_magic_message` width-masking, + /// `bit_width()` derivation). + #[serde(skip)] pub type_kind: crate::parser::ast::TypeKind, /// Confidence score (0.0 to 1.0) /// diff --git a/src/lib.rs b/src/lib.rs index 54e571a..e95de9c 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -102,7 +102,7 @@ use std::path::{Path, PathBuf}; -use serde::{Deserialize, Serialize}; +use serde::Serialize; // Re-export modules pub mod builtin_rules; @@ -615,7 +615,7 @@ impl MagicDatabase { /// assert_eq!(metadata.file_size, 8192); /// assert!(!metadata.timed_out); /// ``` -#[derive(Debug, Clone, Serialize, Deserialize)] +#[derive(Debug, Clone, Serialize)] pub struct EvaluationMetadata { /// Size of the analyzed file or buffer in bytes pub file_size: u64, @@ -684,7 +684,7 @@ impl Default for EvaluationMetadata { /// assert_eq!(result.description, "ELF 64-bit executable"); /// assert!(result.confidence > 0.5); /// ``` -#[derive(Debug, Clone, Serialize, Deserialize)] +#[derive(Debug, Clone, Serialize)] pub struct EvaluationResult { /// Human-readable file type description /// diff --git a/tests/json_integration_test.rs b/tests/json_integration_test.rs index 958a9f6..4c78b0c 100644 --- a/tests/json_integration_test.rs +++ b/tests/json_integration_test.rs @@ -324,3 +324,97 @@ fn test_cli_json_vs_text_output() { "Text output should contain filename" ); } + +// ============================================================================= +// 1B-H2 / 2A-M1: RuleMatch.type_kind must not leak into JSON output +// ============================================================================= + +/// Library-side `EvaluationResult` carries `RuleMatch.type_kind: TypeKind` +/// for runtime needs (width-masking, bit_width derivation). Serializing +/// the result directly via `serde_json::to_string` must NOT expose the +/// parser AST to JSON consumers. The documented JSON contract is +/// `output::json::JsonMatchResult` which omits this field; the library +/// type now matches that contract via `#[serde(skip)]` on the field. +/// +/// Origin findings 1B-H2 (CWE-200 information exposure through serialized +/// AST shape) and 2A-M1 (security-lens variant). +#[test] +fn test_rule_match_type_kind_not_serialized_in_evaluation_result() { + use libmagic_rs::parser::ast::{TypeKind, Value}; + use libmagic_rs::{EvaluationMetadata, EvaluationResult, evaluator::RuleMatch}; + + // Construct an EvaluationResult containing a RuleMatch whose + // `type_kind` is a fully-qualified TypeKind variant. Then serialize + // it and assert the rendered JSON contains no `type_kind` key. + let rule_match = RuleMatch { + message: "ELF executable".to_string(), + offset: 0, + level: 0, + value: Value::Uint(0x7f), + type_kind: TypeKind::Byte { signed: false }, + confidence: 0.9, + }; + let metadata = EvaluationMetadata::default(); + let result = EvaluationResult { + description: "ELF executable".to_string(), + mime_type: None, + confidence: 0.9, + matches: vec![rule_match], + metadata, + }; + + let json = serde_json::to_string(&result).expect("must serialize"); + + assert!( + !json.contains("type_kind"), + "EvaluationResult JSON must not contain `type_kind` key \ + (1B-H2 / 2A-M1 regression: parser AST leaking into JSON output). \ + Got: {json}" + ); + + // The `value` field (the matched data) and `confidence` should still + // be present -- sanity check that #[serde(skip)] only excluded + // type_kind, not the surrounding fields. + assert!( + json.contains("\"value\""), + "EvaluationResult JSON should still include `value`" + ); + assert!( + json.contains("\"confidence\""), + "EvaluationResult JSON should still include `confidence`" + ); +} + +/// Sanity check: the `type_kind` field is still accessible from Rust +/// code after `#[serde(skip)]`. The attribute affects serde only; field +/// access for runtime consumers (`format_magic_message`, `bit_width()` +/// derivation) is unchanged. +#[test] +fn test_rule_match_type_kind_still_accessible_in_rust() { + use libmagic_rs::evaluator::RuleMatch; + use libmagic_rs::parser::ast::{Endianness, TypeKind, Value}; + + let m = RuleMatch { + message: "test".to_string(), + offset: 0, + level: 0, + value: Value::Uint(0), + type_kind: TypeKind::Long { + endian: Endianness::Little, + signed: false, + }, + confidence: 1.0, + }; + + // Field access still works + match m.type_kind { + TypeKind::Long { + endian: Endianness::Little, + signed: false, + } => {} + ref other => panic!("type_kind field access broke: got {other:?}"), + } + + // bit_width() derivation still works (relied on by format_magic_message) + assert_eq!(m.type_kind.bit_width(), Some(32)); +} From d6af29fbbe259f4a41a2f28a128265ed0c693185 Mon Sep 17 00:00:00 2001 From: UncleSp1d3r Date: Thu, 28 May 2026 20:45:17 -0400 Subject: [PATCH 3/8] docs: clarify max_string_length coverage gaps and credit cap as CWE-400 countermeasure MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Aligns the four documentation surfaces 2A-H1 / 3B-C1 named (rustdoc on `EvaluationConfig.max_string_length`, accessor on `EvaluationContext::max_string_length`, `configuration.md` field reference, and `security-assurance.md` CWE-400 row) with the implementation U1 landed. * `src/config.rs` (`EvaluationConfig.max_string_length` rustdoc): describe the cap explicitly as bounding scan-mode `TypeKind::String` reads on both the unflagged and flagged variants, and list the two excluded paths (`PString` errors on overrun; `String16` has a hardcoded 16 KiB ceiling). * `src/evaluator/mod.rs` (`EvaluationContext::max_string_length` rustdoc): same shape -- describe the threading into both dispatchers and the exclusions. * `docs/src/configuration.md`: update the field reference table. * `docs/src/security-assurance.md`: - CWE-400 §5.1 row: credit `max_string_length` as a memory-exhaustion countermeasure and cross-reference §7.3 for the coverage gaps. - New §7.3 "max_string_length Coverage Gaps": describe the PString/String16 exclusions and mitigation guidance for embedders. This scoped update lands within the 2A-H1 / 3B-C1 documentation work declared in plan U3; the full 3B-H2 threat-model refresh remains a separate Sprint 5 PR per the origin report. Implements U3 from `docs/plans/2026-05-28-001-refactor-pre-1.0-api-hardening-plan.md`. Signed-off-by: UncleSp1d3r --- docs/src/configuration.md | 14 +++++++------- docs/src/security-assurance.md | 9 +++++++++ src/config.rs | 15 +++++++++++++-- src/evaluator/mod.rs | 13 +++++++++++-- 4 files changed, 40 insertions(+), 11 deletions(-) diff --git a/docs/src/configuration.md b/docs/src/configuration.md index deb36a2..11bc879 100644 --- a/docs/src/configuration.md +++ b/docs/src/configuration.md @@ -18,13 +18,13 @@ pub struct EvaluationConfig { ### Field Reference -| Field | Type | Default | Bounds | Purpose | -| --------------------- | ------------- | ------- | -------------- | ---------------------------------------------------------------------------------------------------------------------------- | -| `max_recursion_depth` | `u32` | 20 | 1 -- 1000 | Limits nested rule traversal depth to prevent stack overflow | -| `max_string_length` | `usize` | 8192 | 1 -- 1_048_576 | Caps bytes read for string types to prevent memory exhaustion | -| `stop_at_first_match` | `bool` | `true` | -- | When true, evaluation stops after the first matching top-level rule (children of that rule are still evaluated -- see below) | -| `enable_mime_types` | `bool` | `false` | -- | When true, maps file type descriptions to standard MIME types | -| `timeout_ms` | `Option` | `None` | 1 -- 300_000 | Per-file evaluation timeout in milliseconds; `None` disables | +| Field | Type | Default | Bounds | Purpose | +| --------------------- | ------------- | ------- | -------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `max_recursion_depth` | `u32` | 20 | 1 -- 1000 | Limits nested rule traversal depth to prevent stack overflow | +| `max_string_length` | `usize` | 8192 | 1 -- 1_048_576 | Caps bytes read for `TypeKind::String` reads (both unflagged and flagged `/c`/`/C`/`/w`/`/W`/`/T`/`/f` variants). Does NOT apply to `TypeKind::PString` (returns `TypeReadError::BufferOverrun` on oversized prefix) or `TypeKind::String16` (hardcoded 8192-unit ceiling) | +| `stop_at_first_match` | `bool` | `true` | -- | When true, evaluation stops after the first matching top-level rule (children of that rule are still evaluated -- see below) | +| `enable_mime_types` | `bool` | `false` | -- | When true, maps file type descriptions to standard MIME types | +| `timeout_ms` | `Option` | `None` | 1 -- 300_000 | Per-file evaluation timeout in milliseconds; `None` disables | ### `stop_at_first_match` Semantics diff --git a/docs/src/security-assurance.md b/docs/src/security-assurance.md index b904c6b..3103e8c 100644 --- a/docs/src/security-assurance.md +++ b/docs/src/security-assurance.md @@ -158,6 +158,15 @@ The I/O layer mitigates the common shapes of this attack by canonicalizing the p The `evaluate_file` rustdoc (`# Security` section) cross-references this subsection. +### 7.3 `max_string_length` Coverage Gaps + +`EvaluationConfig::max_string_length` caps the buffer-length allocation for `TypeKind::String` scan-mode reads (both the unflagged `(None, _)` arm of `read_typed_value_with_pattern` and the flagged-string arm of `read_pattern_match`). It does **not** govern two adjacent string-family read paths: + +- **`TypeKind::PString`** uses an explicit length prefix decoded from the buffer and returns `TypeReadError::BufferOverrun` (rather than truncating) when the prefix declares a length that exceeds the remaining buffer. The error-on-overrun behaviour is a real bound -- the read function cannot allocate past the buffer length -- but it differs from the configurable cap that `max_string_length` provides. See GOTCHAS S6.1 for the load-bearing pstring clamp documentation. +- **`TypeKind::String16`** is capped at a hardcoded `STRING16_MAX_UNITS = 8192` ceiling (2 bytes per UCS-2 unit, so up to 16 384 bytes per read). The configured cap is not consulted on this path. + +**Mitigation for callers:** Embedders who need a configurable cap on `String16` or `PString` reads cannot rely on `EvaluationConfig::max_string_length` for those types today. The existing built-in bounds (16 KiB ceiling on String16, error-on-overrun on PString) constrain the worst-case allocation. A configurable cap for these paths is tracked as follow-up work; the threat model entry will be updated when it lands. + ## 8. Ongoing Assurance This assurance case is maintained as a living document. It is updated when: diff --git a/src/config.rs b/src/config.rs index 2ef72f7..bfa7001 100644 --- a/src/config.rs +++ b/src/config.rs @@ -48,8 +48,19 @@ pub struct EvaluationConfig { /// Maximum string length to read /// - /// This limits the amount of data read for string types to prevent - /// excessive memory usage. Default is 8192 bytes. + /// Caps the buffer-length allocation for scan-mode reads of + /// `TypeKind::String` (both the unflagged `(None, _)` arm and the + /// flagged `/c`/`/C`/`/w`/`/W`/`/T`/`/f` arm). Without this cap, a + /// `string x` rule against an attacker-controlled NUL-free buffer + /// could allocate up to the full buffer length -- the CWE-770 + /// control documented at this field. Default is 8192 bytes. + /// + /// Does NOT apply to: + /// - `TypeKind::PString`: returns `TypeReadError::BufferOverrun` + /// rather than truncating when the length prefix exceeds the + /// remaining buffer (per GOTCHAS S6.1). + /// - `TypeKind::String16`: bounded by a hardcoded + /// `STRING16_MAX_UNITS = 8192` ceiling at 2 bytes per unit. pub max_string_length: usize, /// Stop at first match or continue for all matches diff --git a/src/evaluator/mod.rs b/src/evaluator/mod.rs index 09a83da..51173ed 100644 --- a/src/evaluator/mod.rs +++ b/src/evaluator/mod.rs @@ -301,11 +301,20 @@ impl EvaluationContext { self.config.stop_at_first_match } - /// Get the maximum string length allowed + /// Get the maximum string length allowed for scan-mode string reads. + /// + /// Threaded into both string-read dispatchers + /// (`read_typed_value_with_pattern` for the unflagged `(None, _)` arm + /// and `read_pattern_match` for the flagged `/c`/`/C`/`/w`/`/W`/`/T`/`/f` + /// arm) so they cap the buffer-length allocation against this value. + /// Does NOT apply to `TypeKind::PString` (which errors on oversized + /// length prefixes per GOTCHAS S6.1) or `TypeKind::String16` (capped + /// at a hardcoded `STRING16_MAX_UNITS = 8192` ceiling). /// /// # Returns /// - /// The maximum string length that should be read during evaluation + /// The configured `max_string_length` (default 8192 bytes per + /// `EvaluationConfig::default()`). #[must_use] pub const fn max_string_length(&self) -> usize { self.config.max_string_length From df68b3015786b73aa4fe1c689e070e75a2cfd41d Mon Sep 17 00:00:00 2001 From: "dosubot[bot]" <131922026+dosubot[bot]@users.noreply.github.com> Date: Fri, 29 May 2026 00:52:14 +0000 Subject: [PATCH 4/8] docs: Dosu updates for PR #304 --- docs/src/evaluator.md | 26 ++++---------------------- docs/src/library-api.md | 14 +++++++------- 2 files changed, 11 insertions(+), 29 deletions(-) diff --git a/docs/src/evaluator.md b/docs/src/evaluator.md index 9d77033..70152df 100644 --- a/docs/src/evaluator.md +++ b/docs/src/evaluator.md @@ -23,7 +23,7 @@ The evaluator module separates public interface from implementation: - **`evaluator/offset/mod.rs`** - Offset resolution - **`evaluator/operators/mod.rs`** - Operator application - **`evaluator/types/`** - Type reading and coercion (organized as submodules as of v0.4.2) - - **`types/mod.rs`** - Public API surface: `read_typed_value`, `coerce_value_to_type`, re-exports type functions + - **`types/mod.rs`** - Internal type-reading API: `pub(crate)` dispatchers (`read_typed_value_with_pattern`, `read_pattern_match`, `coerce_value_to_type`) plus re-exports of leaf `read_*` functions - **`types/numeric.rs`** - Numeric type handling: `read_byte`, `read_short`, `read_long`, `read_quad` with endianness and signedness support - **`types/float.rs`** - Floating-point type handling: `read_float` (32-bit IEEE 754), `read_double` (64-bit IEEE 754) with endianness support - **`types/date.rs`** - Date and timestamp type handling: `read_date` (32-bit Unix timestamps), `read_qdate` (64-bit Unix timestamps) with endianness and UTC/local time support @@ -91,6 +91,8 @@ pub struct RuleMatch { The `Value` type is from `parser::ast::Value` and represents the actual matched content according to the rule's type specification. Note that `Value` implements only `PartialEq` (not `Eq`) due to floating-point NaN semantics. +`RuleMatch` also carries a non-public `type_kind: TypeKind` field used internally by the engine for width calculations and format substitution. This field is excluded from JSON serialization (`#[serde(skip)]`). + ### Offset Resolution (`evaluator/offset.rs`) - **Absolute offsets**: Direct file positions (`0`, `0x100`) @@ -127,27 +129,7 @@ The types module is organized into submodules for numeric, floating-point, date/ - **Search**: Bounded literal pattern scan with flag support. `search/N` caps the scan window to `N` bytes from the offset; range is mandatory and non-zero (`NonZeroUsize`). Accepts nine flag suffixes (`/s`, `/c`, `/C`, `/w`, `/W`, `/T`, `/f`, `/t`, `/b`) that control scan behavior and anchor advancement. When only anchor-only flags (`/s`, `/t`, `/b`) are set or no flags are present, the SIMD-accelerated `memchr::memmem::find` fast path is used. When comparison-altering flags (`/c`, `/C`, `/w`, `/W`, `/T`, `/f`) are set, a byte-by-byte comparison through `compare_string_with_flags` is used. The `/s` flag sets the previous-match anchor for relative-offset children to match-START instead of match-END. - **Bounds checking**: Prevents buffer overruns -```rust -// Non-pattern types use the 3-arg convenience wrapper: -pub fn read_typed_value( - buffer: &[u8], - offset: usize, - type_kind: &TypeKind, -) -> Result - -// Pattern-bearing types (Regex, Search) thread the rule's value operand -// through as the match pattern: -pub fn read_typed_value_with_pattern( - buffer: &[u8], - offset: usize, - type_kind: &TypeKind, - pattern: Option<&Value>, -) -> Result -``` - -The engine uses `read_typed_value_with_pattern` uniformly and passes `Some(&rule.value)` for every rule; the convenience `read_typed_value` is a thin wrapper that forwards `pattern: None`. For pattern-bearing types a genuine "no match" is collapsed to `Value::String(String::new())` in the `read_typed_value_with_pattern` return so the back-compat `Value` shape is preserved; the engine instead calls `read_pattern_match` directly, which returns `Result, _>` so zero-width matches (e.g. `^`, `a*`) can be distinguished from genuine misses. - -The `read_byte` function signature changed in v0.2.0 to accept three parameters (`buffer`, `offset`, and `signed`) instead of two, allowing explicit control over signed vs unsigned byte interpretation. +The type-reading functions are internal (`pub(crate)`) engine helpers. External library users evaluate rules through `evaluate_rules` or `evaluate_rules_with_config`. **Floating-Point Type Reading (`evaluator/types/float.rs`):** diff --git a/docs/src/library-api.md b/docs/src/library-api.md index 04a61ea..bcf8d41 100644 --- a/docs/src/library-api.md +++ b/docs/src/library-api.md @@ -70,13 +70,13 @@ When no rules match, the description defaults to `"data"` with confidence `0.0`. Controls evaluation behavior with these fields: -| Field | Type | Default | Description | -| --------------------- | ------------- | ------- | ---------------------------------------- | -| `max_recursion_depth` | `u32` | `20` | Maximum depth for nested rule evaluation | -| `max_string_length` | `usize` | `8192` | Maximum bytes read for string types | -| `stop_at_first_match` | `bool` | `true` | Stop after the first matching rule | -| `enable_mime_types` | `bool` | `false` | Map descriptions to MIME types | -| `timeout_ms` | `Option` | `None` | Evaluation timeout in milliseconds | +| Field | Type | Default | Description | +| --------------------- | ------------- | ------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| `max_recursion_depth` | `u32` | `20` | Maximum depth for nested rule evaluation | +| `max_string_length` | `usize` | `8192` | Caps bytes read for `TypeKind::String` reads (both unflagged and with `/c`/`/C`/`/w`/`/W`/`/T`/`/f` flags); does NOT apply to `TypeKind::PString` or `TypeKind::String16` | +| `stop_at_first_match` | `bool` | `true` | Stop after the first matching rule | +| `enable_mime_types` | `bool` | `false` | Map descriptions to MIME types | +| `timeout_ms` | `Option` | `None` | Evaluation timeout in milliseconds | ### Presets From e7040af255278a97300c7662ed891a8bc2079853 Mon Sep 17 00:00:00 2001 From: UncleSp1d3r Date: Thu, 28 May 2026 21:12:11 -0400 Subject: [PATCH 5/8] fix(review): address PR #304 review feedback (SF-2 fail-open + doc accuracy) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Addresses findings from /pr-review-toolkit:review-pr after PR open: * **SF-2 (silent-failure-hunter, P0):** flagged-string `scan_buffer` used `buffer.get(..end).unwrap_or(buffer)`, silently falling back to the full uncapped buffer if the documented invariant `end <= buffer.len()` ever broke. That's "fail open" on the CWE-770 control. Replace with `&buffer[..end]` so a future refactor that breaks the clamp panics loudly instead of silently expanding scope. * **TS-1 (pr-test-analyzer):** add `test_max_string_length_flagged_path_works_at_non_zero_offset`. Pins that the `scan_buffer` caps the buffer's UPPER bound (not pre-slicing from `offset`) -- a future swap of `&buffer[..end]` for `&buffer[offset..end]` would silently double-offset the comparator and the test catches that regression. * **CA-1 (comment-analyzer):** `docs/src/security-assurance.md:110` -- CWE-400 row mdformat reverted in the previous commit. Re-add the `max_string_length` mention with the §7.3 cross-reference. * **CA-2 (comment-analyzer):** `docs/src/security-assurance.md` and `src/config.rs` cited GOTCHAS S6.1 for the pstring buffer-clamp narrative. S6.1 is about multi-byte length-prefix BYTE ORDER; the load-bearing clamp documentation is in S3.8. * **CA-3 + CA-4 (comment-analyzer):** trim stale `pub(crate)`-era rustdoc on `read_typed_value_with_pattern` -- "external callers" advice is meaningless after the demote, and the `# Errors` claim about regex compile failures is unreachable from this dispatcher (rejected before compile; compile failures only via `read_pattern_match`). * **TD-1 (type-design-analyzer):** drop `#[cfg(test)]` from `read_typed_value` and `DEFAULT_MAX_STRING_LENGTH`. `#[cfg(test)]` excluded integration tests, benches, and the planned v1.0 cargo-fuzz harness; `pub(crate)` + `#[allow(dead_code)]` preserves the helper without that cost. Deferred to follow-up: * **SF-1 (config validation in EvaluationContext::new):** changing the constructor to return `Result` is a wider API change. * **TD-2 (Option):** type-system invariant strengthening with broader ripple. * **TS-2/3, SF-3, CA-5/6/7:** smaller polish items. Signed-off-by: UncleSp1d3r --- docs/src/security-assurance.md | 34 ++++++++++---------- src/config.rs | 3 +- src/evaluator/types/mod.rs | 58 ++++++++++++++++++++-------------- tests/security_regression.rs | 46 +++++++++++++++++++++++++++ 4 files changed, 100 insertions(+), 41 deletions(-) diff --git a/docs/src/security-assurance.md b/docs/src/security-assurance.md index 3103e8c..69480f6 100644 --- a/docs/src/security-assurance.md +++ b/docs/src/security-assurance.md @@ -92,22 +92,22 @@ All data crossing the trust boundary (file contents, magic file syntax, CLI argu ### 5.1 CWE/SANS Top 25 -| CWE | Weakness | Countermeasure | Status | -| ------- | --------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------- | --------- | -| CWE-787 | Out-of-bounds write | Rust ownership prevents writes to unowned memory. Workspace-level lints in `Cargo.toml` forbid unsafe code and eliminate raw pointer writes. | Mitigated | -| CWE-79 | XSS | Not applicable (no web output). | N/A | -| CWE-89 | SQL injection | Not applicable (no database). | N/A | -| CWE-416 | Use after free | Rust ownership/borrowing system prevents use-after-free at compile time. | Mitigated | -| CWE-78 | OS command injection | No shell invocation or command execution. CLI arguments parsed by `clap`, not passed to shell. | Mitigated | -| CWE-20 | Improper input validation | All inputs validated: magic syntax validated by parser, file buffers bounds-checked, CLI args validated by `clap`. | Mitigated | -| CWE-125 | Out-of-bounds read | All buffer access uses `.get()` with bounds checking. Memory-mapped files have known size limits. | Mitigated | -| CWE-22 | Path traversal | CLI accepts file paths as arguments but only performs read-only access. No path construction from file contents. | Mitigated | -| CWE-352 | CSRF | Not applicable (no web interface). | N/A | -| CWE-434 | Unrestricted upload | Not applicable (no file upload). | N/A | -| CWE-476 | NULL pointer dereference | Rust's `Option` type eliminates null pointer dereferences at compile time. | Mitigated | -| CWE-190 | Integer overflow | Rust panics on integer overflow in debug builds. Offset calculations use checked arithmetic. | Mitigated | -| CWE-502 | Deserialization of untrusted data | Magic files are parsed with a strict grammar, not deserialized from arbitrary formats. | Mitigated | -| CWE-400 | Resource exhaustion | Evaluation timeouts prevent unbounded CPU use. Memory-mapped I/O avoids loading entire files into memory. | Mitigated | +| CWE | Weakness | Countermeasure | Status | +| ------- | --------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | --------- | +| CWE-787 | Out-of-bounds write | Rust ownership prevents writes to unowned memory. Workspace-level lints in `Cargo.toml` forbid unsafe code and eliminate raw pointer writes. | Mitigated | +| CWE-79 | XSS | Not applicable (no web output). | N/A | +| CWE-89 | SQL injection | Not applicable (no database). | N/A | +| CWE-416 | Use after free | Rust ownership/borrowing system prevents use-after-free at compile time. | Mitigated | +| CWE-78 | OS command injection | No shell invocation or command execution. CLI arguments parsed by `clap`, not passed to shell. | Mitigated | +| CWE-20 | Improper input validation | All inputs validated: magic syntax validated by parser, file buffers bounds-checked, CLI args validated by `clap`. | Mitigated | +| CWE-125 | Out-of-bounds read | All buffer access uses `.get()` with bounds checking. Memory-mapped files have known size limits. | Mitigated | +| CWE-22 | Path traversal | CLI accepts file paths as arguments but only performs read-only access. No path construction from file contents. | Mitigated | +| CWE-352 | CSRF | Not applicable (no web interface). | N/A | +| CWE-434 | Unrestricted upload | Not applicable (no file upload). | N/A | +| CWE-476 | NULL pointer dereference | Rust's `Option` type eliminates null pointer dereferences at compile time. | Mitigated | +| CWE-190 | Integer overflow | Rust panics on integer overflow in debug builds. Offset calculations use checked arithmetic. | Mitigated | +| CWE-502 | Deserialization of untrusted data | Magic files are parsed with a strict grammar, not deserialized from arbitrary formats. | Mitigated | +| CWE-400 | Resource exhaustion | Evaluation timeouts prevent unbounded CPU use. Memory-mapped I/O avoids loading entire files into memory. `EvaluationConfig::max_string_length` caps scan-mode `TypeKind::String` allocations on both unflagged and flagged paths; see §7.3 for the `PString`/`String16` coverage gaps. | Mitigated | ### 5.2 OWASP Top 10 (where applicable) @@ -162,7 +162,7 @@ The `evaluate_file` rustdoc (`# Security` section) cross-references this subsect `EvaluationConfig::max_string_length` caps the buffer-length allocation for `TypeKind::String` scan-mode reads (both the unflagged `(None, _)` arm of `read_typed_value_with_pattern` and the flagged-string arm of `read_pattern_match`). It does **not** govern two adjacent string-family read paths: -- **`TypeKind::PString`** uses an explicit length prefix decoded from the buffer and returns `TypeReadError::BufferOverrun` (rather than truncating) when the prefix declares a length that exceeds the remaining buffer. The error-on-overrun behaviour is a real bound -- the read function cannot allocate past the buffer length -- but it differs from the configurable cap that `max_string_length` provides. See GOTCHAS S6.1 for the load-bearing pstring clamp documentation. +- **`TypeKind::PString`** uses an explicit length prefix decoded from the buffer and returns `TypeReadError::BufferOverrun` (rather than truncating) when the prefix declares a length that exceeds the remaining buffer. The error-on-overrun behaviour is a real bound -- the read function cannot allocate past the buffer length -- but it differs from the configurable cap that `max_string_length` provides. See GOTCHAS S3.8 for the load-bearing pstring anchor-clamp invariant. - **`TypeKind::String16`** is capped at a hardcoded `STRING16_MAX_UNITS = 8192` ceiling (2 bytes per UCS-2 unit, so up to 16 384 bytes per read). The configured cap is not consulted on this path. **Mitigation for callers:** Embedders who need a configurable cap on `String16` or `PString` reads cannot rely on `EvaluationConfig::max_string_length` for those types today. The existing built-in bounds (16 KiB ceiling on String16, error-on-overrun on PString) constrain the worst-case allocation. A configurable cap for these paths is tracked as follow-up work; the threat model entry will be updated when it lands. diff --git a/src/config.rs b/src/config.rs index bfa7001..f3af6a1 100644 --- a/src/config.rs +++ b/src/config.rs @@ -58,7 +58,8 @@ pub struct EvaluationConfig { /// Does NOT apply to: /// - `TypeKind::PString`: returns `TypeReadError::BufferOverrun` /// rather than truncating when the length prefix exceeds the - /// remaining buffer (per GOTCHAS S6.1). + /// remaining buffer (per GOTCHAS S3.8's pstring anchor-clamp + /// invariant). /// - `TypeKind::String16`: bounded by a hardcoded /// `STRING16_MAX_UNITS = 8192` ceiling at 2 bytes per unit. pub max_string_length: usize, diff --git a/src/evaluator/types/mod.rs b/src/evaluator/types/mod.rs index a8f2961..b5998ff 100644 --- a/src/evaluator/types/mod.rs +++ b/src/evaluator/types/mod.rs @@ -103,12 +103,12 @@ pub enum TypeReadError { /// Default `max_string_length` used by [`read_typed_value`] when callers /// do not supply an explicit cap. Matches -/// `EvaluationConfig::default().max_string_length` so unit tests that +/// `EvaluationConfig::default().max_string_length` so call sites that /// invoke `read_typed_value` directly see the same scan-mode bound the /// engine applies at evaluation time. The engine call path /// (`evaluate_value_rule`) threads the user-configured cap, so this -/// constant only governs ad-hoc / test usage. -#[cfg(test)] +/// constant only governs internal helper / test usage. +#[allow(dead_code)] pub(crate) const DEFAULT_MAX_STRING_LENGTH: usize = 8192; /// Reads bytes according to the specified `TypeKind`. @@ -136,9 +136,12 @@ pub(crate) const DEFAULT_MAX_STRING_LENGTH: usize = 8192; /// [`DEFAULT_MAX_STRING_LENGTH`] (8192 bytes, matching /// `EvaluationConfig::default()`). The engine's value-rule path supplies /// the user-configured cap via [`read_typed_value_with_pattern`] directly, -/// so this helper exists solely for unit tests that want a one-shot -/// type-read without constructing a context. -#[cfg(test)] +/// so this helper exists for internal callers (tests, future fuzz +/// harnesses) that want a one-shot type-read without constructing a +/// context. The lib build doesn't currently call it; the `dead_code` +/// allow keeps the helper available for `#[cfg(test)]` modules without +/// gating its visibility, so a future fuzz harness can reuse it. +#[allow(dead_code)] pub(crate) fn read_typed_value( buffer: &[u8], offset: usize, @@ -148,30 +151,31 @@ pub(crate) fn read_typed_value( } /// Reads bytes according to the specified `TypeKind`, threading a -/// `pattern` operand through for pattern-bearing types (`Regex`, `Search`). +/// `pattern` operand through for non-pattern-bearing types whose +/// dispatch arm consults the rule's value operand (e.g. `TypeKind::String` +/// equality matches against the literal pattern bytes). /// -/// This is the internal dispatch entry point used by the evaluation engine -/// to evaluate pattern-bearing types. The engine threads the rule's value -/// operand through as `pattern` so the regex and search readers can -/// compile/locate it against the buffer. For fixed-width and non-pattern -/// types (numeric, float, date, string, pstring), the `pattern` parameter -/// is ignored; external callers for those types should prefer the simpler -/// three-argument [`read_typed_value`] wrapper. +/// This is the internal dispatch entry point for value-rule evaluation. +/// Pattern-bearing types (`TypeKind::Regex`, `TypeKind::Search`, and +/// flagged `TypeKind::String`) are routed through [`read_pattern_match`] +/// by the engine instead; this function returns +/// `TypeReadError::UnsupportedType` if called with those variants so a +/// programmatic caller mis-routing them surfaces immediately rather than +/// silently producing wrong results. /// /// # Errors /// /// Returns `TypeReadError::BufferOverrun` when the requested value extends -/// past the buffer bounds, `TypeReadError::UnsupportedType` when a regex -/// pattern fails to compile or a pattern-bearing type is evaluated without -/// a pattern, or `TypeReadError::InvalidPStringLength` for a malformed -/// Pascal string length prefix. +/// past the buffer bounds, `TypeReadError::UnsupportedType` when a +/// pattern-bearing type is evaluated through this path instead of via +/// [`read_pattern_match`], or `TypeReadError::InvalidPStringLength` for a +/// malformed Pascal string length prefix. /// /// `max_string_length` bounds the scan-mode string read on the /// `(None, _)` arm of [`TypeKind::String`]. Without it, `string x` rules /// against an attacker-controlled NUL-free buffer could allocate up to -/// the full buffer length (origin finding 2A-H1 / CWE-770). The cap is -/// wired from [`EvaluationContext::max_string_length`] at the engine call -/// site. +/// the full buffer length (CWE-770). The cap is wired from +/// `EvaluationContext::max_string_length` at the engine call site. pub(crate) fn read_typed_value_with_pattern( buffer: &[u8], offset: usize, @@ -379,17 +383,25 @@ pub(crate) fn read_pattern_match( // produces no match via `compare_string_with_flags`'s EOF // handling -- no special case needed. // - // 2A-H1: When AST `max_length` is `None`, fall back to the + // CWE-770: When AST `max_length` is `None`, fall back to the // configured `max_string_length` cap rather than passing the // full buffer. The cap is applied to the buffer's UPPER bound // (not pre-sliced from `offset`) because // `compare_string_with_flags` slices internally via // `buffer.get(offset..)?` -- pre-slicing would double-offset // and silently produce no-match at any non-zero offset. + // + // `end` is constructed with `saturating_add` then `.min(buffer.len())` + // so the slice always satisfies `end <= buffer.len()`. We slice + // directly with `&buffer[..end]` rather than + // `buffer.get(..end).unwrap_or(buffer)` so that any future + // refactor that breaks the clamp invariant fails loudly (panic) + // instead of silently falling back to the uncapped buffer -- + // which would defeat the CWE-770 control. let scan_buffer: &[u8] = { let cap = max_length.unwrap_or(max_string_length); let end = offset.saturating_add(cap).min(buffer.len()); - buffer.get(..end).unwrap_or(buffer) + &buffer[..end] }; match string::compare_string_with_flags(trimmed, scan_buffer, offset, *flags) { Some(consumed) => { diff --git a/tests/security_regression.rs b/tests/security_regression.rs index 4707540..b90c811 100644 --- a/tests/security_regression.rs +++ b/tests/security_regression.rs @@ -289,6 +289,52 @@ fn test_max_string_length_caps_unflagged_string_x() { ); } +/// Flagged path with non-zero offset: pins that the `scan_buffer` +/// construction caps the buffer's UPPER bound rather than pre-slicing +/// from `offset`. A future "simplification" that swaps +/// `&buffer[..end]` for `&buffer[offset..end]` would silently double- +/// offset the comparator (which slices `buffer.get(offset..)?` +/// internally) and break every flagged-string rule at non-zero offset. +/// +/// This test was added in response to PR #304 review finding TS-1 from +/// `pr-review-toolkit:review-pr`. +#[test] +fn test_max_string_length_flagged_path_works_at_non_zero_offset() { + use libmagic_rs::evaluator::{EvaluationContext, evaluate_rules}; + use libmagic_rs::parser::ast::StringFlags; + use libmagic_rs::{MagicRule, OffsetSpec, Operator, TypeKind, Value}; + + // Buffer: 50 bytes of 'A', then the literal pattern "hit" at offset 50. + let mut buf = vec![b'A'; 50]; + buf.extend_from_slice(b"hit"); + + // Flagged-`/c` rule at offset 50, looking for "hit". With a cap of + // 1024 (well above the offset + pattern), the comparator must find + // the match. If the scan_buffer construction were pre-sliced from + // `offset`, the comparator's internal `get(offset..)` would skip + // past the pattern and find nothing. + let rule = MagicRule::new( + OffsetSpec::Absolute(50), + TypeKind::String { + max_length: None, + flags: StringFlags::default().with_ignore_lowercase(true), + }, + Operator::Equal, + Value::String("hit".to_string()), + "found at offset".to_string(), + ); + let config = EvaluationConfig::default().with_max_string_length(1024); + let mut ctx = EvaluationContext::new(config); + let matches = + evaluate_rules(std::slice::from_ref(&rule), &buf, &mut ctx).expect("must not error"); + assert_eq!( + matches.len(), + 1, + "flagged string/c at offset 50 must match `hit` with cap=1024; \ + a regression to pre-slice from offset would break this" + ); +} + /// Flagged-`/W` path: a `string/W "X "` rule against a buffer of all /// whitespace must walk only `max_string_length` bytes before giving up, /// not the full buffer. Origin 2A-H1 (flagged-string scan-window variant) From 1ebbcdb4f6e23c7738f53deaf29f92a6a3784c8d Mon Sep 17 00:00:00 2001 From: UncleSp1d3r Date: Thu, 28 May 2026 22:28:21 -0400 Subject: [PATCH 6/8] chore(.gitignore): add claude/scheduled_tasks.lock to ignore list Signed-off-by: UncleSp1d3r --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 35f99d8..944468c 100644 --- a/.gitignore +++ b/.gitignore @@ -150,3 +150,4 @@ todos/ .tessl/tiles/ .tessl/RULES.md .claude/instincts/ +.claude/scheduled_tasks.lock From d7413186f45a9786402aad6e7e8dd06b3488ac88 Mon Sep 17 00:00:00 2001 From: UncleSp1d3r Date: Thu, 28 May 2026 22:50:15 -0400 Subject: [PATCH 7/8] refactor: SF-1 config defense + U4 #[non_exhaustive] on all 13 public structs + migrate test sites Closes the deferred items from PR #304 review per user direction "do not defer or leave residuals." * **SF-1 (silent-failure-hunter):** `EvaluationContext::new` now defensively clamps `max_string_length = 0` to `DEFAULT_MAX_STRING_LENGTH` (8192) with a `warn!` log. `EvaluationConfig::validate()` rejects 0, but struct-literal construction and the `with_max_string_length` builder bypass it. Without the clamp, an invalid 0 silently disabled the CWE-770 control. Non-breaking; embedders see a log warning if they hit the bypass path. Regression test in `tests/security_regression.rs`. * **U4 (architecture 1B-C1):** added `#[non_exhaustive]` to all 13 public structs (`MagicDatabase`, library + output `EvaluationResult` / `EvaluationMetadata`, `MatchResult`, `EvaluationContext`, `RuleMatch`, `RegexFlags`, `StringFlags`, `SearchFlags`, `ValueTransform`, `MagicRule`). Added `RuleMatch::new`, library-side `EvaluationResult::new` / `EvaluationMetadata::new`, and `ValueTransform::new` constructors to keep external migration ergonomic. Migrated ~56 struct-literal sites across 8 test/bench files and 18 doctest examples to use constructors / builder chains. A Python migration script handled the bulk; remaining sites (`children: vec![...]`, `level: N`, complex flag patterns) were edited by hand. * **TD-2 (type-design-analyzer):** evaluated `Option` for `max_string_length`. Decided not to thread it through the type system: SF-1's runtime clamp guarantees `max_string_length >= 1` at every read site, so the `NonZeroUsize` encoding would be load-bearing-redundant. The runtime guard is the security control; the type would just shift validation from runtime to compile time without changing the invariant. Documenting the decision here rather than leaving it as a residual. Test results after all changes: - `cargo test --all-features` passes (~360 tests + 182 doctests) - `cargo clippy --all-features --all-targets -- -D warnings` clean - `cargo fmt --check` clean Per the no-defer instruction, U4 ships in this PR rather than a follow-up. Signed-off-by: UncleSp1d3r --- benches/regex_bench.rs | 18 +- src/evaluator/engine/mod.rs | 128 +++---- src/evaluator/mod.rs | 53 ++- src/evaluator/strength.rs | 79 +--- src/lib.rs | 79 +++- src/output/mod.rs | 97 ++--- src/parser/ast.rs | 22 +- tests/evaluator_tests.rs | 224 +++++------- tests/json_integration_test.rs | 44 +-- tests/property_tests.rs | 106 ++---- tests/regex_search_corpus_tests.rs | 112 +++--- tests/relative_offset_evaluation.rs | 485 ++++++++++--------------- tests/search_flag_conformance_tests.rs | 37 +- tests/security_regression.rs | 36 +- tests/string_flags_integration.rs | 18 +- 15 files changed, 683 insertions(+), 855 deletions(-) diff --git a/benches/regex_bench.rs b/benches/regex_bench.rs index e6beddf..06d3719 100644 --- a/benches/regex_bench.rs +++ b/benches/regex_bench.rs @@ -30,20 +30,16 @@ use libmagic_rs::{EvaluationConfig, MagicRule, OffsetSpec, Operator, TypeKind, V use std::hint::black_box; fn regex_rule(pattern: &str) -> MagicRule { - MagicRule { - offset: OffsetSpec::Absolute(0), - typ: TypeKind::Regex { + MagicRule::new( + OffsetSpec::Absolute(0), + TypeKind::Regex { flags: RegexFlags::default(), count: RegexCount::Default, }, - op: Operator::Equal, - value: Value::String(pattern.to_string()), - message: "bench-match".to_string(), - children: vec![], - level: 0, - strength_modifier: None, - value_transform: None, - } + Operator::Equal, + Value::String(pattern.to_string()), + "bench-match".to_string(), + ) } fn make_context() -> EvaluationContext { diff --git a/src/evaluator/engine/mod.rs b/src/evaluator/engine/mod.rs index fbdf7e7..6fcd986 100644 --- a/src/evaluator/engine/mod.rs +++ b/src/evaluator/engine/mod.rs @@ -162,17 +162,7 @@ static INDIRECT_WITHOUT_RULE_ENV_WARNED: AtomicBool = AtomicBool::new(false); /// use libmagic_rs::parser::ast::{MagicRule, OffsetSpec, TypeKind, Operator, Value}; /// /// // Create a rule to check for ELF magic bytes at offset 0 -/// let rule = MagicRule { -/// offset: OffsetSpec::Absolute(0), -/// typ: TypeKind::Byte { signed: true }, -/// op: Operator::Equal, -/// value: Value::Uint(0x7f), -/// message: "ELF magic".to_string(), -/// children: vec![], -/// level: 0, -/// strength_modifier: None, -/// value_transform: None, -/// }; +/// let rule = MagicRule::new(OffsetSpec::Absolute(0), TypeKind::Byte { signed: true }, Operator::Equal, Value::Uint(0x7f), "ELF magic".to_string()); /// /// let mut context = EvaluationContext::new(EvaluationConfig::default()); /// let elf_buffer = &[0x7f, 0x45, 0x4c, 0x46]; // ELF magic bytes @@ -594,29 +584,23 @@ fn evaluate_children_or_warn( /// use libmagic_rs::EvaluationConfig; /// /// // Create a hierarchical rule set for ELF files -/// let parent_rule = MagicRule { -/// offset: OffsetSpec::Absolute(0), -/// typ: TypeKind::Byte { signed: true }, -/// op: Operator::Equal, -/// value: Value::Uint(0x7f), -/// message: "ELF".to_string(), -/// children: vec![ -/// MagicRule { -/// offset: OffsetSpec::Absolute(4), -/// typ: TypeKind::Byte { signed: true }, -/// op: Operator::Equal, -/// value: Value::Uint(2), -/// message: "64-bit".to_string(), -/// children: vec![], -/// level: 1, -/// strength_modifier: None, -/// value_transform: None, -/// } -/// ], -/// level: 0, -/// strength_modifier: None, -/// value_transform: None, -/// }; +/// let parent_rule = MagicRule::new( +/// OffsetSpec::Absolute(0), +/// TypeKind::Byte { signed: true }, +/// Operator::Equal, +/// Value::Uint(0x7f), +/// "ELF".to_string(), +/// ) +/// .with_children(vec![ +/// MagicRule::new( +/// OffsetSpec::Absolute(4), +/// TypeKind::Byte { signed: true }, +/// Operator::Equal, +/// Value::Uint(2), +/// "64-bit".to_string(), +/// ) +/// .with_level(1), +/// ]); /// /// let rules = vec![parent_rule]; /// let buffer = &[0x7f, 0x45, 0x4c, 0x46, 0x02, 0x01]; // ELF64 header @@ -736,14 +720,14 @@ pub fn evaluate_rules( if !sibling_matched { let matches_before = matches.len(); - let match_result = RuleMatch { - message: rule.message.clone(), - offset: context.last_match_end(), - level: rule.level, - value: crate::parser::ast::Value::Uint(0), - type_kind: rule.typ.clone(), - confidence: RuleMatch::calculate_confidence(rule.level), - }; + let match_result = RuleMatch::new( + rule.message.clone(), + context.last_match_end(), + rule.level, + crate::parser::ast::Value::Uint(0), + rule.typ.clone(), + RuleMatch::calculate_confidence(rule.level), + ); matches.push(match_result); // `default` is treated as a successful match at this @@ -832,14 +816,14 @@ pub fn evaluate_rules( // directive's own `message` never surfaces in the output. context.set_last_match_end(absolute_offset); - let indirect_match = RuleMatch { - message: rule.message.clone(), - offset: absolute_offset, - level: rule.level, - value: crate::parser::ast::Value::String("indirect".to_string()), - type_kind: rule.typ.clone(), - confidence: RuleMatch::calculate_confidence(rule.level), - }; + let indirect_match = RuleMatch::new( + rule.message.clone(), + absolute_offset, + rule.level, + crate::parser::ast::Value::String("indirect".to_string()), + rule.typ.clone(), + RuleMatch::calculate_confidence(rule.level), + ); matches.push(indirect_match); // Indirect counts as a match for `sibling_matched` regardless of @@ -935,14 +919,14 @@ pub fn evaluate_rules( // `Indirect` and every other value-bearing rule. context.set_last_match_end(absolute_offset); - let offset_match = RuleMatch { - message: rule.message.clone(), - offset: absolute_offset, - level: rule.level, - value: crate::parser::ast::Value::Uint(absolute_offset as u64), - type_kind: rule.typ.clone(), - confidence: RuleMatch::calculate_confidence(rule.level), - }; + let offset_match = RuleMatch::new( + rule.message.clone(), + absolute_offset, + rule.level, + crate::parser::ast::Value::Uint(absolute_offset as u64), + rule.typ.clone(), + RuleMatch::calculate_confidence(rule.level), + ); matches.push(offset_match); sibling_matched = true; @@ -1086,14 +1070,14 @@ pub fn evaluate_rules( // default-after-match semantics. sibling_matched = true; - let match_result = RuleMatch { - message: rule.message.clone(), - offset: absolute_offset, - level: rule.level, - value: read_value, - type_kind: rule.typ.clone(), - confidence: RuleMatch::calculate_confidence(rule.level), - }; + let match_result = RuleMatch::new( + rule.message.clone(), + absolute_offset, + rule.level, + read_value, + rule.typ.clone(), + RuleMatch::calculate_confidence(rule.level), + ); matches.push(match_result); // If this rule has children, evaluate them recursively @@ -1182,17 +1166,7 @@ pub fn evaluate_rules( /// use libmagic_rs::parser::ast::{MagicRule, OffsetSpec, TypeKind, Operator, Value}; /// use libmagic_rs::EvaluationConfig; /// -/// let rule = MagicRule { -/// offset: OffsetSpec::Absolute(0), -/// typ: TypeKind::Byte { signed: true }, -/// op: Operator::Equal, -/// value: Value::Uint(0x7f), -/// message: "ELF magic".to_string(), -/// children: vec![], -/// level: 0, -/// strength_modifier: None, -/// value_transform: None, -/// }; +/// let rule = MagicRule::new(OffsetSpec::Absolute(0), TypeKind::Byte { signed: true }, Operator::Equal, Value::Uint(0x7f), "ELF magic".to_string()); /// /// let rules = vec![rule]; /// let buffer = &[0x7f, 0x45, 0x4c, 0x46]; diff --git a/src/evaluator/mod.rs b/src/evaluator/mod.rs index 51173ed..c966dfb 100644 --- a/src/evaluator/mod.rs +++ b/src/evaluator/mod.rs @@ -52,6 +52,7 @@ pub(crate) struct RuleEnvironment { /// assert_eq!(context.recursion_depth(), 0); /// ``` #[derive(Debug, Clone)] +#[non_exhaustive] pub struct EvaluationContext { /// Current offset position in the file buffer current_offset: usize, @@ -120,7 +121,31 @@ impl EvaluationContext { /// let context = EvaluationContext::new(config); /// ``` #[must_use] - pub const fn new(config: EvaluationConfig) -> Self { + pub fn new(mut config: EvaluationConfig) -> Self { + // Defensive clamp on `max_string_length`: `EvaluationConfig::validate()` + // rejects 0, but callers can bypass validation by setting the field + // via struct-literal syntax (or via the `with_max_string_length` + // builder, which doesn't validate). Without this clamp, a `cap = 0` + // would silently produce zero-byte reads on every scan-mode `string x` + // rule and disable the CWE-770 control documented at this field. + // + // The clamp rewrites an invalid 0 to + // `crate::evaluator::types::DEFAULT_MAX_STRING_LENGTH` (8192, + // matching `EvaluationConfig::default()`). A `warn!` records the + // correction so embedders see it in logs. Closes PR #304 review + // finding SF-1. + if config.max_string_length == 0 { + log::warn!( + "EvaluationContext::new received max_string_length=0 \ + (likely a struct-literal or builder bypass of \ + EvaluationConfig::validate); clamping to {} (the documented \ + default). Construct the config via EvaluationConfig::new() \ + / EvaluationConfig::default() and use the with_* builders \ + to avoid this warning.", + crate::evaluator::types::DEFAULT_MAX_STRING_LENGTH, + ); + config.max_string_length = crate::evaluator::types::DEFAULT_MAX_STRING_LENGTH; + } Self { current_offset: 0, last_match_end: 0, @@ -413,6 +438,7 @@ impl Drop for RecursionGuard<'_> { /// The output-side conversion layer (`output::MatchResult` / /// `output::json::JsonMatchResult`) is the documented JSON contract. #[derive(Debug, Clone, PartialEq, Serialize)] +#[non_exhaustive] pub struct RuleMatch { /// The message associated with the matching rule pub message: String, @@ -446,6 +472,31 @@ pub struct RuleMatch { } impl RuleMatch { + /// Construct a new `RuleMatch`. + /// + /// `confidence` is typically derived from `level` via + /// [`RuleMatch::calculate_confidence`]; pass it explicitly here so + /// callers can supply an alternative score when needed (e.g. when + /// post-processing a series of matches). + #[must_use] + pub fn new( + message: String, + offset: usize, + level: u32, + value: crate::parser::ast::Value, + type_kind: crate::parser::ast::TypeKind, + confidence: f64, + ) -> Self { + Self { + message, + offset, + level, + value, + type_kind, + confidence, + } + } + /// Calculate confidence score based on rule depth /// /// Formula: min(1.0, 0.3 + (level * 0.2)) diff --git a/src/evaluator/strength.rs b/src/evaluator/strength.rs index 0f6e53b..b798592 100644 --- a/src/evaluator/strength.rs +++ b/src/evaluator/strength.rs @@ -50,17 +50,7 @@ pub const MIN_STRENGTH: i32 = 0; /// use libmagic_rs::parser::ast::{MagicRule, OffsetSpec, StringFlags, TypeKind, Operator, Value}; /// use libmagic_rs::evaluator::strength::calculate_default_strength; /// -/// let rule = MagicRule { -/// offset: OffsetSpec::Absolute(0), -/// typ: TypeKind::String { max_length: None, flags: StringFlags::default() }, -/// op: Operator::Equal, -/// value: Value::String("ELF".to_string()), -/// message: "ELF file".to_string(), -/// children: vec![], -/// level: 0, -/// strength_modifier: None, -/// value_transform: None, -/// }; +/// let rule = MagicRule::new(OffsetSpec::Absolute(0), TypeKind::String { max_length: None, flags: StringFlags::default() }, Operator::Equal, Value::String("ELF".to_string()), "ELF file".to_string()); /// /// let strength = calculate_default_strength(&rule); /// assert!(strength > 0); @@ -325,17 +315,14 @@ pub fn apply_strength_modifier(base_strength: i32, modifier: &StrengthModifier) /// use libmagic_rs::parser::ast::{MagicRule, OffsetSpec, TypeKind, Operator, Value, StrengthModifier}; /// use libmagic_rs::evaluator::strength::calculate_rule_strength; /// -/// let rule = MagicRule { -/// offset: OffsetSpec::Absolute(0), -/// typ: TypeKind::Byte { signed: true }, -/// op: Operator::Equal, -/// value: Value::Uint(0x7f), -/// message: "ELF magic".to_string(), -/// children: vec![], -/// level: 0, -/// strength_modifier: Some(StrengthModifier::Add(20)), -/// value_transform: None, -/// }; +/// let rule = MagicRule::new( +/// OffsetSpec::Absolute(0), +/// TypeKind::Byte { signed: true }, +/// Operator::Equal, +/// Value::Uint(0x7f), +/// "ELF magic".to_string(), +/// ) +/// .with_strength_modifier(StrengthModifier::Add(20)); /// /// let strength = calculate_rule_strength(&rule); /// // Base: 5 (byte) + 10 (equal) + 10 (absolute) + 0 (numeric) = 25 @@ -369,28 +356,8 @@ pub fn calculate_rule_strength(rule: &MagicRule) -> i32 { /// use libmagic_rs::evaluator::strength::sort_rules_by_strength; /// /// let mut rules = vec![ -/// MagicRule { -/// offset: OffsetSpec::Absolute(0), -/// typ: TypeKind::Byte { signed: true }, -/// op: Operator::Equal, -/// value: Value::Uint(0x7f), -/// message: "byte rule".to_string(), -/// children: vec![], -/// level: 0, -/// strength_modifier: None, -/// value_transform: None, -/// }, -/// MagicRule { -/// offset: OffsetSpec::Absolute(0), -/// typ: TypeKind::String { max_length: None, flags: StringFlags::default() }, -/// op: Operator::Equal, -/// value: Value::String("MAGIC".to_string()), -/// message: "string rule".to_string(), -/// children: vec![], -/// level: 0, -/// strength_modifier: None, -/// value_transform: None, -/// }, +/// MagicRule::new(OffsetSpec::Absolute(0), TypeKind::Byte { signed: true }, Operator::Equal, Value::Uint(0x7f), "byte rule".to_string()), +/// MagicRule::new(OffsetSpec::Absolute(0), TypeKind::String { max_length: None, flags: StringFlags::default() }, Operator::Equal, Value::String("MAGIC".to_string()), "string rule".to_string()), /// ]; /// /// sort_rules_by_strength(&mut rules); @@ -460,28 +427,8 @@ pub fn sort_rules_by_strength_recursive(rules: &mut [MagicRule]) { /// use libmagic_rs::evaluator::strength::into_sorted_by_strength; /// /// let rules = vec![ -/// MagicRule { -/// offset: OffsetSpec::Absolute(0), -/// typ: TypeKind::Byte { signed: true }, -/// op: Operator::Equal, -/// value: Value::Uint(0), -/// message: "byte rule".to_string(), -/// children: vec![], -/// level: 0, -/// strength_modifier: None, -/// value_transform: None, -/// }, -/// MagicRule { -/// offset: OffsetSpec::Absolute(0), -/// typ: TypeKind::String { max_length: None, flags: StringFlags::default() }, -/// op: Operator::Equal, -/// value: Value::String("MAGIC".to_string()), -/// message: "string rule".to_string(), -/// children: vec![], -/// level: 0, -/// strength_modifier: None, -/// value_transform: None, -/// }, +/// MagicRule::new(OffsetSpec::Absolute(0), TypeKind::Byte { signed: true }, Operator::Equal, Value::Uint(0), "byte rule".to_string()), +/// MagicRule::new(OffsetSpec::Absolute(0), TypeKind::String { max_length: None, flags: StringFlags::default() }, Operator::Equal, Value::String("MAGIC".to_string()), "string rule".to_string()), /// ]; /// /// let sorted = into_sorted_by_strength(rules); diff --git a/src/lib.rs b/src/lib.rs index e95de9c..00c4839 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -149,6 +149,7 @@ impl From for LibmagicError { /// Main interface for magic rule database #[derive(Debug)] +#[non_exhaustive] pub struct MagicDatabase { /// Named subroutine definitions extracted from magic file `name` rules, /// keyed by identifier. The evaluator consults this table when a rule of @@ -604,18 +605,19 @@ impl MagicDatabase { /// use libmagic_rs::EvaluationMetadata; /// use std::path::PathBuf; /// -/// let metadata = EvaluationMetadata { -/// file_size: 8192, -/// evaluation_time_ms: 2.5, -/// rules_evaluated: 42, -/// magic_file: Some(PathBuf::from("/usr/share/misc/magic")), -/// timed_out: false, -/// }; +/// let metadata = EvaluationMetadata::new( +/// 8192, +/// 2.5, +/// 42, +/// Some(PathBuf::from("/usr/share/misc/magic")), +/// false, +/// ); /// /// assert_eq!(metadata.file_size, 8192); /// assert!(!metadata.timed_out); /// ``` #[derive(Debug, Clone, Serialize)] +#[non_exhaustive] pub struct EvaluationMetadata { /// Size of the analyzed file or buffer in bytes pub file_size: u64, @@ -673,18 +675,19 @@ impl Default for EvaluationMetadata { /// ``` /// use libmagic_rs::{EvaluationResult, EvaluationMetadata}; /// -/// let result = EvaluationResult { -/// description: "ELF 64-bit executable".to_string(), -/// mime_type: Some("application/x-executable".to_string()), -/// confidence: 0.9, -/// matches: vec![], -/// metadata: EvaluationMetadata::default(), -/// }; +/// let result = EvaluationResult::new( +/// "ELF 64-bit executable".to_string(), +/// Some("application/x-executable".to_string()), +/// 0.9, +/// vec![], +/// EvaluationMetadata::default(), +/// ); /// /// assert_eq!(result.description, "ELF 64-bit executable"); /// assert!(result.confidence > 0.5); /// ``` #[derive(Debug, Clone, Serialize)] +#[non_exhaustive] pub struct EvaluationResult { /// Human-readable file type description /// @@ -714,5 +717,53 @@ pub struct EvaluationResult { pub metadata: EvaluationMetadata, } +impl EvaluationResult { + /// Construct a new library-side `EvaluationResult`. + /// + /// This is the outbound type returned by [`MagicDatabase::evaluate_file`] + /// and [`MagicDatabase::evaluate_buffer`]. For the output-facing + /// type used by the CLI and JSON/text formatters, see + /// [`crate::output::EvaluationResult::from_library_result`]. + #[must_use] + pub fn new( + description: String, + mime_type: Option, + confidence: f64, + matches: Vec, + metadata: EvaluationMetadata, + ) -> Self { + Self { + description, + mime_type, + confidence, + matches, + metadata, + } + } +} + +impl EvaluationMetadata { + /// Construct a new library-side `EvaluationMetadata` from the four + /// always-set fields. `magic_file` and `timed_out` default to `None` + /// / `false`; use struct-update syntax with [`EvaluationMetadata::default()`] + /// to set them explicitly. + #[must_use] + pub fn new( + file_size: u64, + evaluation_time_ms: f64, + rules_evaluated: usize, + magic_file: Option, + timed_out: bool, + ) -> Self { + Self { + file_size, + evaluation_time_ms, + rules_evaluated, + magic_file, + timed_out, + } + } +} + #[cfg(test)] mod tests; diff --git a/src/output/mod.rs b/src/output/mod.rs index d4089e3..3668c03 100644 --- a/src/output/mod.rs +++ b/src/output/mod.rs @@ -39,20 +39,21 @@ static DEFAULT_TAG_EXTRACTOR: LazyLock = /// use libmagic_rs::output::MatchResult; /// use libmagic_rs::parser::ast::Value; /// -/// let result = MatchResult { -/// message: "ELF 64-bit LSB executable".to_string(), -/// offset: 0, -/// length: 4, -/// value: Value::Bytes(vec![0x7f, 0x45, 0x4c, 0x46]), -/// rule_path: vec!["elf".to_string(), "elf64".to_string()], -/// confidence: 90, -/// mime_type: Some("application/x-executable".to_string()), -/// }; +/// let result = MatchResult::with_metadata( +/// "ELF 64-bit LSB executable".to_string(), +/// 0, +/// 4, +/// Value::Bytes(vec![0x7f, 0x45, 0x4c, 0x46]), +/// vec!["elf".to_string(), "elf64".to_string()], +/// 90, +/// Some("application/x-executable".to_string()), +/// ); /// /// assert_eq!(result.message, "ELF 64-bit LSB executable"); /// assert_eq!(result.offset, 0); /// ``` #[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] +#[non_exhaustive] pub struct MatchResult { /// Human-readable description of the file type or pattern match pub message: String, @@ -121,32 +122,25 @@ pub struct MatchResult { /// use libmagic_rs::parser::ast::Value; /// use std::path::PathBuf; /// -/// let result = EvaluationResult { -/// filename: PathBuf::from("example.bin"), -/// matches: vec![ -/// MatchResult { -/// message: "ELF executable".to_string(), -/// offset: 0, -/// length: 4, -/// value: Value::Bytes(vec![0x7f, 0x45, 0x4c, 0x46]), -/// rule_path: vec!["elf".to_string()], -/// confidence: 95, -/// mime_type: Some("application/x-executable".to_string()), -/// } -/// ], -/// metadata: EvaluationMetadata { -/// file_size: 8192, -/// evaluation_time_ms: 2.5, -/// rules_evaluated: 42, -/// rules_matched: 1, -/// }, -/// error: None, -/// }; +/// let result = EvaluationResult::new( +/// PathBuf::from("example.bin"), +/// vec![MatchResult::with_metadata( +/// "ELF executable".to_string(), +/// 0, +/// 4, +/// Value::Bytes(vec![0x7f, 0x45, 0x4c, 0x46]), +/// vec!["elf".to_string()], +/// 95, +/// Some("application/x-executable".to_string()), +/// )], +/// EvaluationMetadata::new(8192, 2.5, 42, 1), +/// ); /// /// assert_eq!(result.matches.len(), 1); /// assert_eq!(result.metadata.file_size, 8192); /// ``` #[derive(Debug, Clone, Serialize, Deserialize)] +#[non_exhaustive] pub struct EvaluationResult { /// Path to the file that was analyzed pub filename: PathBuf, @@ -175,6 +169,7 @@ pub struct EvaluationResult { /// Provides diagnostic information about how the evaluation was performed, /// including performance metrics and statistics about rule processing. #[derive(Debug, Clone, Serialize, Deserialize)] +#[non_exhaustive] pub struct EvaluationMetadata { /// Size of the analyzed file in bytes pub file_size: u64, @@ -418,12 +413,7 @@ impl EvaluationResult { /// let result = EvaluationResult::new( /// PathBuf::from("test.txt"), /// vec![], - /// EvaluationMetadata { - /// file_size: 1024, - /// evaluation_time_ms: 1.2, - /// rules_evaluated: 10, - /// rules_matched: 0, - /// } + /// EvaluationMetadata::new(1024, 1.2, 10, 0), /// ); /// /// assert_eq!(result.filename, PathBuf::from("test.txt")); @@ -502,12 +492,7 @@ impl EvaluationResult { /// let result = EvaluationResult::with_error( /// PathBuf::from("missing.txt"), /// "File not found".to_string(), - /// EvaluationMetadata { - /// file_size: 0, - /// evaluation_time_ms: 0.0, - /// rules_evaluated: 0, - /// rules_matched: 0, - /// } + /// EvaluationMetadata::new(0, 0.0, 0, 0), /// ); /// /// assert_eq!(result.error, Some("File not found".to_string())); @@ -535,12 +520,7 @@ impl EvaluationResult { /// let mut result = EvaluationResult::new( /// PathBuf::from("data.bin"), /// vec![], - /// EvaluationMetadata { - /// file_size: 512, - /// evaluation_time_ms: 0.8, - /// rules_evaluated: 5, - /// rules_matched: 0, - /// } + /// EvaluationMetadata::new(512, 0.8, 5, 0), /// ); /// /// let match_result = MatchResult::new( @@ -602,12 +582,7 @@ impl EvaluationResult { /// Some("application/x-msdownload".to_string()) /// ), /// ], - /// EvaluationMetadata { - /// file_size: 4096, - /// evaluation_time_ms: 1.5, - /// rules_evaluated: 15, - /// rules_matched: 2, - /// } + /// EvaluationMetadata::new(4096, 1.5, 15, 2), /// ); /// /// let primary = result.primary_match(); @@ -632,23 +607,13 @@ impl EvaluationResult { /// let success = EvaluationResult::new( /// PathBuf::from("good.txt"), /// vec![], - /// EvaluationMetadata { - /// file_size: 100, - /// evaluation_time_ms: 0.5, - /// rules_evaluated: 3, - /// rules_matched: 0, - /// } + /// EvaluationMetadata::new(100, 0.5, 3, 0), /// ); /// /// let failure = EvaluationResult::with_error( /// PathBuf::from("bad.txt"), /// "Parse error".to_string(), - /// EvaluationMetadata { - /// file_size: 0, - /// evaluation_time_ms: 0.0, - /// rules_evaluated: 0, - /// rules_matched: 0, - /// } + /// EvaluationMetadata::new(0, 0.0, 0, 0), /// ); /// /// assert!(success.is_success()); diff --git a/src/parser/ast.rs b/src/parser/ast.rs index 6fd66f9..de8859b 100644 --- a/src/parser/ast.rs +++ b/src/parser/ast.rs @@ -630,10 +630,9 @@ pub enum TypeKind { /// /// // `regex/cs` -- case-insensitive, anchor advances to match-start. /// let case_insensitive_start = TypeKind::Regex { - /// flags: RegexFlags { - /// case_insensitive: true, - /// start_offset: true, - /// }, + /// flags: RegexFlags::default() + /// .with_case_insensitive(true) + /// .with_start_offset(true), /// count: RegexCount::Default, /// }; /// ``` @@ -731,6 +730,7 @@ pub enum TypeKind { /// assert!(case_and_start.start_offset); /// ``` #[derive(Debug, Clone, Copy, Default, Serialize, Deserialize, PartialEq, Eq)] +#[non_exhaustive] pub struct RegexFlags { /// `/c` -- case-insensitive matching. When `true`, ASCII letter /// casing is ignored during pattern matching. @@ -814,6 +814,7 @@ impl RegexFlags { // in every consumer. The bool-per-flag layout mirrors `RegexFlags` and // the libmagic source -- the clippy lint is overruled by the design. #[allow(clippy::struct_excessive_bools)] +#[non_exhaustive] pub struct StringFlags { /// `/W` -- `STRING_COMPACT_WHITESPACE`. Pattern whitespace requires at /// least one whitespace byte in the file, then any further whitespace @@ -971,6 +972,7 @@ impl StringFlags { // The bool-per-flag layout mirrors `StringFlags` and `RegexFlags` and the // libmagic source -- the clippy lint is overruled by the design. #[allow(clippy::struct_excessive_bools)] +#[non_exhaustive] pub struct SearchFlags { /// `/W` -- `STRING_COMPACT_WHITESPACE`. Pattern whitespace requires at /// least one whitespace byte in the file, then any further whitespace @@ -1533,11 +1535,12 @@ pub enum ValueTransformOp { /// use libmagic_rs::parser::ast::{ValueTransform, ValueTransformOp}; /// /// // `lelong+1` -> add 1 to the read value -/// let t = ValueTransform { op: ValueTransformOp::Add, operand: 1 }; +/// let t = ValueTransform::new(ValueTransformOp::Add, 1); /// assert_eq!(t.op, ValueTransformOp::Add); /// assert_eq!(t.operand, 1); /// ``` #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] +#[non_exhaustive] pub struct ValueTransform { /// Operation to apply. pub op: ValueTransformOp, @@ -1545,8 +1548,17 @@ pub struct ValueTransform { pub operand: i64, } +impl ValueTransform { + /// Construct a new `ValueTransform` from an op and an operand. + #[must_use] + pub const fn new(op: ValueTransformOp, operand: i64) -> Self { + Self { op, operand } + } +} + /// Magic rule representation in the AST #[derive(Debug, Clone, Serialize, Deserialize)] +#[non_exhaustive] pub struct MagicRule { /// Offset specification for where to read data pub offset: OffsetSpec, diff --git a/tests/evaluator_tests.rs b/tests/evaluator_tests.rs index 95f9311..5379792 100644 --- a/tests/evaluator_tests.rs +++ b/tests/evaluator_tests.rs @@ -250,19 +250,15 @@ fn test_evaluate_partial_magic_header() { #[test] fn test_evaluate_float_rule_equal() { // IEEE 754 little-endian 1.0f32 = 0x3f800000 => bytes [0x00, 0x00, 0x80, 0x3f] - let rule = MagicRule { - offset: OffsetSpec::Absolute(0), - typ: TypeKind::Float { + let rule = MagicRule::new( + OffsetSpec::Absolute(0), + TypeKind::Float { endian: Endianness::Little, }, - op: Operator::Equal, - value: Value::Float(1.0), - message: "float 1.0 detected".to_string(), - children: vec![], - level: 0, - strength_modifier: None, - value_transform: None, - }; + Operator::Equal, + Value::Float(1.0), + "float 1.0 detected".to_string(), + ); let buffer: &[u8] = &[0x00, 0x00, 0x80, 0x3f]; let config = EvaluationConfig::default(); @@ -274,19 +270,15 @@ fn test_evaluate_float_rule_equal() { #[test] fn test_evaluate_double_rule_equal() { // IEEE 754 big-endian 1.0f64 = 0x3ff0000000000000 - let rule = MagicRule { - offset: OffsetSpec::Absolute(0), - typ: TypeKind::Double { + let rule = MagicRule::new( + OffsetSpec::Absolute(0), + TypeKind::Double { endian: Endianness::Big, }, - op: Operator::Equal, - value: Value::Float(1.0), - message: "double 1.0 detected".to_string(), - children: vec![], - level: 0, - strength_modifier: None, - value_transform: None, - }; + Operator::Equal, + Value::Float(1.0), + "double 1.0 detected".to_string(), + ); let buffer: &[u8] = &[0x3f, 0xf0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00]; let config = EvaluationConfig::default(); @@ -298,19 +290,15 @@ fn test_evaluate_double_rule_equal() { #[test] fn test_evaluate_float_rule_not_equal() { // Buffer contains 1.0f32 LE, rule expects != 2.0 -- should match - let rule = MagicRule { - offset: OffsetSpec::Absolute(0), - typ: TypeKind::Float { + let rule = MagicRule::new( + OffsetSpec::Absolute(0), + TypeKind::Float { endian: Endianness::Little, }, - op: Operator::NotEqual, - value: Value::Float(2.0), - message: "not 2.0".to_string(), - children: vec![], - level: 0, - strength_modifier: None, - value_transform: None, - }; + Operator::NotEqual, + Value::Float(2.0), + "not 2.0".to_string(), + ); let buffer: &[u8] = &[0x00, 0x00, 0x80, 0x3f]; // 1.0f32 LE let config = EvaluationConfig::default(); @@ -326,19 +314,15 @@ fn test_evaluate_float_rule_not_equal() { #[test] fn test_evaluate_float_rule_less_than() { // Buffer contains 1.0f32 LE, rule checks < 2.0 -- should match - let rule = MagicRule { - offset: OffsetSpec::Absolute(0), - typ: TypeKind::Float { + let rule = MagicRule::new( + OffsetSpec::Absolute(0), + TypeKind::Float { endian: Endianness::Little, }, - op: Operator::LessThan, - value: Value::Float(2.0), - message: "less than 2.0".to_string(), - children: vec![], - level: 0, - strength_modifier: None, - value_transform: None, - }; + Operator::LessThan, + Value::Float(2.0), + "less than 2.0".to_string(), + ); let buffer: &[u8] = &[0x00, 0x00, 0x80, 0x3f]; // 1.0f32 LE let config = EvaluationConfig::default(); @@ -354,21 +338,17 @@ fn test_evaluate_float_rule_less_than() { #[test] fn test_evaluate_pstring_rule_match() { // Pascal string: length byte (3) followed by "PDF" - let rule = MagicRule { - offset: OffsetSpec::Absolute(0), - typ: TypeKind::PString { + let rule = MagicRule::new( + OffsetSpec::Absolute(0), + TypeKind::PString { max_length: None, length_width: PStringLengthWidth::OneByte, length_includes_itself: false, }, - op: Operator::Equal, - value: Value::String("PDF".to_string()), - message: "Pascal PDF marker".to_string(), - children: vec![], - level: 0, - strength_modifier: None, - value_transform: None, - }; + Operator::Equal, + Value::String("PDF".to_string()), + "Pascal PDF marker".to_string(), + ); let buffer: &[u8] = &[3, b'P', b'D', b'F', 0x00, 0x00]; let config = EvaluationConfig::default(); @@ -380,21 +360,17 @@ fn test_evaluate_pstring_rule_match() { #[test] fn test_evaluate_pstring_rule_no_match() { - let rule = MagicRule { - offset: OffsetSpec::Absolute(0), - typ: TypeKind::PString { + let rule = MagicRule::new( + OffsetSpec::Absolute(0), + TypeKind::PString { max_length: None, length_width: PStringLengthWidth::OneByte, length_includes_itself: false, }, - op: Operator::Equal, - value: Value::String("ZIP".to_string()), - message: "Should not match".to_string(), - children: vec![], - level: 0, - strength_modifier: None, - value_transform: None, - }; + Operator::Equal, + Value::String("ZIP".to_string()), + "Should not match".to_string(), + ); let buffer: &[u8] = &[3, b'P', b'D', b'F']; let config = EvaluationConfig::default(); @@ -409,21 +385,17 @@ fn test_evaluate_pstring_rule_no_match() { #[test] fn test_evaluate_pstring_with_max_length() { // Pascal string in buffer has length=10, but max_length caps at 3 - let rule = MagicRule { - offset: OffsetSpec::Absolute(0), - typ: TypeKind::PString { + let rule = MagicRule::new( + OffsetSpec::Absolute(0), + TypeKind::PString { max_length: Some(3), length_width: PStringLengthWidth::OneByte, length_includes_itself: false, }, - op: Operator::Equal, - value: Value::String("Hel".to_string()), - message: "Truncated pascal string".to_string(), - children: vec![], - level: 0, - strength_modifier: None, - value_transform: None, - }; + Operator::Equal, + Value::String("Hel".to_string()), + "Truncated pascal string".to_string(), + ); let buffer: &[u8] = &[ 10, b'H', b'e', b'l', b'l', b'o', b' ', b'w', b'o', b'r', b'l', @@ -441,21 +413,17 @@ fn test_evaluate_pstring_with_max_length() { #[test] fn test_evaluate_pstring_two_byte_be_with_j_flag() { // 2-byte big-endian prefix with /J: stored length=7 includes the 2-byte prefix, so string is 5 bytes - let rule = MagicRule { - offset: OffsetSpec::Absolute(0), - typ: TypeKind::PString { + let rule = MagicRule::new( + OffsetSpec::Absolute(0), + TypeKind::PString { max_length: None, length_width: PStringLengthWidth::TwoByteBE, length_includes_itself: true, }, - op: Operator::Equal, - value: Value::String("Hello".to_string()), - message: "BE pstring with /J flag".to_string(), - children: vec![], - level: 0, - strength_modifier: None, - value_transform: None, - }; + Operator::Equal, + Value::String("Hello".to_string()), + "BE pstring with /J flag".to_string(), + ); // Big-endian length 7 = [0x00, 0x07], minus 2-byte prefix = 5 bytes of "Hello" let buffer: &[u8] = &[0x00, 0x07, b'H', b'e', b'l', b'l', b'o']; @@ -472,19 +440,15 @@ fn test_evaluate_pstring_two_byte_be_with_j_flag() { #[test] fn test_evaluate_float_rule_no_match() { // Buffer contains 1.0f32 LE, rule expects == 2.0 -- should NOT match - let rule = MagicRule { - offset: OffsetSpec::Absolute(0), - typ: TypeKind::Float { + let rule = MagicRule::new( + OffsetSpec::Absolute(0), + TypeKind::Float { endian: Endianness::Little, }, - op: Operator::Equal, - value: Value::Float(2.0), - message: "should not match".to_string(), - children: vec![], - level: 0, - strength_modifier: None, - value_transform: None, - }; + Operator::Equal, + Value::Float(2.0), + "should not match".to_string(), + ); let buffer: &[u8] = &[0x00, 0x00, 0x80, 0x3f]; // 1.0f32 LE let config = EvaluationConfig::default(); @@ -544,50 +508,42 @@ fn test_regex_eol_corpus() { let one_line_count = libmagic_rs::parser::ast::RegexCount::Lines(::std::num::NonZeroU32::new(1)); - let inner_regex = MagicRule { - offset: OffsetSpec::Relative(1), - typ: TypeKind::Regex { + let inner_regex = MagicRule::new( + OffsetSpec::Relative(1), + TypeKind::Regex { flags: libmagic_rs::parser::ast::RegexFlags::default(), count: one_line_count, }, - op: Operator::Equal, - value: Value::String("[^;]+$".to_string()), - message: "\u{0008}, using AES256 encryption".to_string(), - children: vec![], - level: 2, - strength_modifier: None, - value_transform: None, - }; - - let version_regex = MagicRule { - offset: OffsetSpec::Relative(1), - typ: TypeKind::Regex { + Operator::Equal, + Value::String("[^;]+$".to_string()), + "\u{0008}, using AES256 encryption".to_string(), + ) + .with_level(2); + + let version_regex = MagicRule::new( + OffsetSpec::Relative(1), + TypeKind::Regex { flags: libmagic_rs::parser::ast::RegexFlags::default(), count: one_line_count, }, - op: Operator::Equal, - value: Value::String("[0-9]+(\\.[0-9]+)+".to_string()), - message: "\u{0008}, version 1.1".to_string(), - children: vec![inner_regex], - level: 1, - strength_modifier: None, - value_transform: None, - }; - - let ansible_vault = MagicRule { - offset: OffsetSpec::Absolute(0), - typ: TypeKind::String { + Operator::Equal, + Value::String("[0-9]+(\\.[0-9]+)+".to_string()), + "\u{0008}, version 1.1".to_string(), + ) + .with_children(vec![inner_regex]) + .with_level(1); + + let ansible_vault = MagicRule::new( + OffsetSpec::Absolute(0), + TypeKind::String { max_length: Some("$ANSIBLE_VAULT".len()), flags: StringFlags::default(), }, - op: Operator::Equal, - value: Value::String("$ANSIBLE_VAULT".to_string()), - message: "Ansible Vault text".to_string(), - children: vec![version_regex], - level: 0, - strength_modifier: None, - value_transform: None, - }; + Operator::Equal, + Value::String("$ANSIBLE_VAULT".to_string()), + "Ansible Vault text".to_string(), + ) + .with_children(vec![version_regex]); let config = EvaluationConfig::default(); let mut context = EvaluationContext::new(config); diff --git a/tests/json_integration_test.rs b/tests/json_integration_test.rs index 4c78b0c..5249d27 100644 --- a/tests/json_integration_test.rs +++ b/tests/json_integration_test.rs @@ -346,22 +346,22 @@ fn test_rule_match_type_kind_not_serialized_in_evaluation_result() { // Construct an EvaluationResult containing a RuleMatch whose // `type_kind` is a fully-qualified TypeKind variant. Then serialize // it and assert the rendered JSON contains no `type_kind` key. - let rule_match = RuleMatch { - message: "ELF executable".to_string(), - offset: 0, - level: 0, - value: Value::Uint(0x7f), - type_kind: TypeKind::Byte { signed: false }, - confidence: 0.9, - }; + let rule_match = RuleMatch::new( + "ELF executable".to_string(), + 0, + 0, + Value::Uint(0x7f), + TypeKind::Byte { signed: false }, + 0.9, + ); let metadata = EvaluationMetadata::default(); - let result = EvaluationResult { - description: "ELF executable".to_string(), - mime_type: None, - confidence: 0.9, - matches: vec![rule_match], + let result = EvaluationResult::new( + "ELF executable".to_string(), + None, + 0.9, + vec![rule_match], metadata, - }; + ); let json = serde_json::to_string(&result).expect("must serialize"); @@ -394,17 +394,17 @@ fn test_rule_match_type_kind_still_accessible_in_rust() { use libmagic_rs::evaluator::RuleMatch; use libmagic_rs::parser::ast::{Endianness, TypeKind, Value}; - let m = RuleMatch { - message: "test".to_string(), - offset: 0, - level: 0, - value: Value::Uint(0), - type_kind: TypeKind::Long { + let m = RuleMatch::new( + "test".to_string(), + 0, + 0, + Value::Uint(0), + TypeKind::Long { endian: Endianness::Little, signed: false, }, - confidence: 1.0, - }; + 1.0, + ); // Field access still works match m.type_kind { diff --git a/tests/property_tests.rs b/tests/property_tests.rs index 2673417..1fed7d7 100644 --- a/tests/property_tests.rs +++ b/tests/property_tests.rs @@ -66,16 +66,17 @@ fn arb_search_flags() -> impl Strategy { bin_test, full_word, start_anchor, - )| SearchFlags { - compact_whitespace, - compact_optional_whitespace, - ignore_lowercase, - ignore_uppercase, - text_test, - trim, - bin_test, - full_word, - start_anchor, + )| { + SearchFlags::default() + .with_compact_whitespace(compact_whitespace) + .with_compact_optional_whitespace(compact_optional_whitespace) + .with_ignore_lowercase(ignore_lowercase) + .with_ignore_uppercase(ignore_uppercase) + .with_text_test(text_test) + .with_trim(trim) + .with_bin_test(bin_test) + .with_full_word(full_word) + .with_start_anchor(start_anchor) }, ) } @@ -144,10 +145,9 @@ fn arb_type_kind() -> impl Strategy { ]; (any::(), any::(), count_strategy).prop_map( |(case_insensitive, start_offset, count)| TypeKind::Regex { - flags: libmagic_rs::parser::ast::RegexFlags { - case_insensitive, - start_offset, - }, + flags: libmagic_rs::parser::ast::RegexFlags::default() + .with_case_insensitive(case_insensitive) + .with_start_offset(start_offset), count, }, ) @@ -202,16 +202,8 @@ fn arb_magic_rule() -> impl Strategy { arb_value(), "[a-zA-Z0-9 _-]{1,64}", ) - .prop_map(|(offset, typ, op, value, message)| MagicRule { - offset, - typ, - op, - value, - message, - children: vec![], - level: 0, - strength_modifier: None, - value_transform: None, + .prop_map(|(offset, typ, op, value, message)| { + MagicRule::new(offset, typ, op, value, message) }) } @@ -242,16 +234,8 @@ fn arb_meta_rule() -> impl Strategy { arb_value(), "[a-zA-Z0-9 _-]{1,64}", ) - .prop_map(|(offset, typ, op, value, message)| MagicRule { - offset, - typ, - op, - value, - message, - children: vec![], - level: 0, - strength_modifier: None, - value_transform: None, + .prop_map(|(offset, typ, op, value, message)| { + MagicRule::new(offset, typ, op, value, message) }) } @@ -365,8 +349,8 @@ proptest! { 4 => (TypeKind::Long { endian: Endianness::Little, signed: false }, Endianness::Little), _ => (TypeKind::Quad { endian: Endianness::Little, signed: false }, Endianness::Little), }; - let rule = MagicRule { - offset: OffsetSpec::Indirect { + let rule = MagicRule::new( + OffsetSpec::Indirect { base_offset: base, base_relative: false, pointer_type, @@ -375,15 +359,11 @@ proptest! { result_relative: false, endian, }, - typ: TypeKind::Byte { signed: false }, - op: Operator::Equal, - value: Value::Uint(0), - message: "probe".to_string(), - children: vec![], - level: 0, - strength_modifier: None, - value_transform: None, - }; + TypeKind::Byte { signed: false }, + Operator::Equal, + Value::Uint(0), + "probe".to_string(), +); let config = EvaluationConfig::default().with_timeout_ms(Some(500)); let mut context = EvaluationContext::new(config); // Must never panic, regardless of whether the offset resolves. @@ -404,21 +384,17 @@ proptest! { use libmagic_rs::parser::ast::PStringLengthWidth; let mut buf = prefix.to_le_bytes().to_vec(); buf.extend_from_slice(&payload); - let rule = MagicRule { - offset: OffsetSpec::Absolute(0), - typ: TypeKind::PString { + let rule = MagicRule::new( + OffsetSpec::Absolute(0), + TypeKind::PString { max_length: None, length_width: PStringLengthWidth::FourByteLE, length_includes_itself: false, }, - op: Operator::Equal, - value: Value::String(String::new()), - message: "probe".to_string(), - children: vec![], - level: 0, - strength_modifier: None, - value_transform: None, - }; + Operator::Equal, + Value::String(String::new()), + "probe".to_string(), +); let config = EvaluationConfig::default().with_timeout_ms(Some(500)); let mut context = EvaluationContext::new(config); let _ = evaluate_rules(&[rule], &buf, &mut context); @@ -458,20 +434,16 @@ proptest! { ) { use libmagic_rs::evaluator::{EvaluationContext, evaluate_rules}; use libmagic_rs::parser::ast::{RegexCount, RegexFlags}; - let rule = MagicRule { - offset: OffsetSpec::Absolute(0), - typ: TypeKind::Regex { + let rule = MagicRule::new( + OffsetSpec::Absolute(0), + TypeKind::Regex { flags: RegexFlags::default(), count: RegexCount::Default, }, - op: Operator::Equal, - value: Value::String(pat), - message: "probe".to_string(), - children: vec![], - level: 0, - strength_modifier: None, - value_transform: None, - }; + Operator::Equal, + Value::String(pat), + "probe".to_string(), +); let config = EvaluationConfig::default().with_timeout_ms(Some(500)); let mut context = EvaluationContext::new(config); let start = std::time::Instant::now(); diff --git a/tests/regex_search_corpus_tests.rs b/tests/regex_search_corpus_tests.rs index 455b47d..2a9185e 100644 --- a/tests/regex_search_corpus_tests.rs +++ b/tests/regex_search_corpus_tests.rs @@ -49,17 +49,15 @@ fn regex_rule( children: Vec, level: u32, ) -> MagicRule { - MagicRule { + MagicRule::new( offset, - typ: TypeKind::Regex { flags, count }, - op: Operator::Equal, - value: Value::String(pattern.to_string()), - message: message.to_string(), - children, - level, - strength_modifier: None, - value_transform: None, - } + TypeKind::Regex { flags, count }, + Operator::Equal, + Value::String(pattern.to_string()), + message.to_string(), + ) + .with_children(children) + .with_level(level) } fn search_rule( @@ -70,20 +68,18 @@ fn search_rule( children: Vec, level: u32, ) -> MagicRule { - MagicRule { + MagicRule::new( offset, - typ: TypeKind::Search { + TypeKind::Search { range: NonZeroUsize::new(range).expect("range must be non-zero"), flags: SearchFlags::default(), }, - op: Operator::Equal, - value: Value::String(pattern.to_string()), - message: message.to_string(), - children, - level, - strength_modifier: None, - value_transform: None, - } + Operator::Equal, + Value::String(pattern.to_string()), + message.to_string(), + ) + .with_children(children) + .with_level(level) } // ===================================================================== @@ -103,17 +99,14 @@ fn test_searchbug_corpus_search_with_relative_child() { // Byte child reading the character immediately after "ABC". In the // corpus file the first ABC is `ABC1` at offset 8, so after "ABC" // (match-end at 11) the byte at offset 11 is '1' (0x31). - let after_abc = MagicRule { - offset: OffsetSpec::Relative(0), - typ: TypeKind::Byte { signed: false }, - op: Operator::Equal, - value: Value::Uint(u64::from(b'1')), - message: "followed by 1".to_string(), - children: vec![], - level: 2, - strength_modifier: None, - value_transform: None, - }; + let after_abc = MagicRule::new( + OffsetSpec::Relative(0), + TypeKind::Byte { signed: false }, + Operator::Equal, + Value::Uint(u64::from(b'1')), + "followed by 1".to_string(), + ) + .with_level(2); // search/12 "ABC" with Relative(0) child. let search_abc = search_rule( @@ -126,20 +119,17 @@ fn test_searchbug_corpus_search_with_relative_child() { ); // Parent: TEST header at offset 0. - let root = MagicRule { - offset: OffsetSpec::Absolute(0), - typ: TypeKind::String { + let root = MagicRule::new( + OffsetSpec::Absolute(0), + TypeKind::String { max_length: Some(4), flags: StringFlags::default(), }, - op: Operator::Equal, - value: Value::String("TEST".to_string()), - message: "Testfmt".to_string(), - children: vec![search_abc], - level: 0, - strength_modifier: None, - value_transform: None, - }; + Operator::Equal, + Value::String("TEST".to_string()), + "Testfmt".to_string(), + ) + .with_children(vec![search_abc]); let matches = run_rules(&[root], &buffer); @@ -162,17 +152,14 @@ fn test_searchbug_search_anchor_advance_not_window_end() { // 'x' (0x78), not '1' (0x31). let buffer = load_corpus_file("searchbug.testfile"); - let wrong_byte = MagicRule { - offset: OffsetSpec::Relative(0), - typ: TypeKind::Byte { signed: false }, - op: Operator::Equal, - value: Value::Uint(u64::from(b'x')), - message: "window-end bug -- must NOT match".to_string(), - children: vec![], - level: 2, - strength_modifier: None, - value_transform: None, - }; + let wrong_byte = MagicRule::new( + OffsetSpec::Relative(0), + TypeKind::Byte { signed: false }, + Operator::Equal, + Value::Uint(u64::from(b'x')), + "window-end bug -- must NOT match".to_string(), + ) + .with_level(2); let search_abc = search_rule( OffsetSpec::Relative(0), @@ -183,20 +170,17 @@ fn test_searchbug_search_anchor_advance_not_window_end() { 1, ); - let root = MagicRule { - offset: OffsetSpec::Absolute(0), - typ: TypeKind::String { + let root = MagicRule::new( + OffsetSpec::Absolute(0), + TypeKind::String { max_length: Some(4), flags: StringFlags::default(), }, - op: Operator::Equal, - value: Value::String("TEST".to_string()), - message: "Testfmt".to_string(), - children: vec![search_abc], - level: 0, - strength_modifier: None, - value_transform: None, - }; + Operator::Equal, + Value::String("TEST".to_string()), + "Testfmt".to_string(), + ) + .with_children(vec![search_abc]); let matches = run_rules(&[root], &buffer); // Should see Testfmt + found ABC but NOT the wrong_byte child. diff --git a/tests/relative_offset_evaluation.rs b/tests/relative_offset_evaluation.rs index eca7821..bd02c70 100644 --- a/tests/relative_offset_evaluation.rs +++ b/tests/relative_offset_evaluation.rs @@ -22,17 +22,7 @@ fn cfg() -> EvaluationConfig { } fn child_rule(offset: OffsetSpec, typ: TypeKind, value: Value, message: &str) -> MagicRule { - MagicRule { - offset, - typ, - op: Operator::Equal, - value, - message: message.to_string(), - children: vec![], - level: 1, - strength_modifier: None, - value_transform: None, - } + MagicRule::new(offset, typ, Operator::Equal, value, message.to_string()).with_level(1) } #[test] @@ -42,28 +32,25 @@ fn relative_child_after_long_parent() { // and reads at offset 4 (= parent end). let buffer = [0x78, 0x56, 0x34, 0x12, 0xBE, 0xBA, 0xFE, 0xCA]; - let parent = MagicRule { - offset: OffsetSpec::Absolute(0), - typ: TypeKind::Long { + let parent = MagicRule::new( + OffsetSpec::Absolute(0), + TypeKind::Long { + endian: Endianness::Little, + signed: false, + }, + Operator::Equal, + Value::Uint(0x1234_5678), + "parent-long".to_string(), + ) + .with_children(vec![child_rule( + OffsetSpec::Relative(0), + TypeKind::Long { endian: Endianness::Little, signed: false, }, - op: Operator::Equal, - value: Value::Uint(0x1234_5678), - message: "parent-long".to_string(), - children: vec![child_rule( - OffsetSpec::Relative(0), - TypeKind::Long { - endian: Endianness::Little, - signed: false, - }, - Value::Uint(0xCAFE_BABE), - "child-long", - )], - level: 0, - strength_modifier: None, - value_transform: None, - }; + Value::Uint(0xCAFE_BABE), + "child-long", + )]); let mut ctx = EvaluationContext::new(cfg()); let matches = evaluate_rules(&[parent], &buffer, &mut ctx).unwrap(); @@ -80,22 +67,19 @@ fn relative_child_with_positive_delta() { // at offset 1 (parent_end) + 2 = 3. let buffer = [0x7F, 0xAA, 0xBB, 0x42, 0xCC]; - let parent = MagicRule { - offset: OffsetSpec::Absolute(0), - typ: TypeKind::Byte { signed: false }, - op: Operator::Equal, - value: Value::Uint(0x7F), - message: "p".to_string(), - children: vec![child_rule( - OffsetSpec::Relative(2), - TypeKind::Byte { signed: false }, - Value::Uint(0x42), - "c", - )], - level: 0, - strength_modifier: None, - value_transform: None, - }; + let parent = MagicRule::new( + OffsetSpec::Absolute(0), + TypeKind::Byte { signed: false }, + Operator::Equal, + Value::Uint(0x7F), + "p".to_string(), + ) + .with_children(vec![child_rule( + OffsetSpec::Relative(2), + TypeKind::Byte { signed: false }, + Value::Uint(0x42), + "c", + )]); let mut ctx = EvaluationContext::new(cfg()); let matches = evaluate_rules(&[parent], &buffer, &mut ctx).unwrap(); @@ -109,25 +93,22 @@ fn relative_child_with_negative_delta() { // (4+4) - 7 = 1. let buffer = [0x00, 0xAA, 0x00, 0x00, 0x78, 0x56, 0x34, 0x12, 0x00]; - let parent = MagicRule { - offset: OffsetSpec::Absolute(4), - typ: TypeKind::Long { + let parent = MagicRule::new( + OffsetSpec::Absolute(4), + TypeKind::Long { endian: Endianness::Little, signed: false, }, - op: Operator::Equal, - value: Value::Uint(0x1234_5678), - message: "p".to_string(), - children: vec![child_rule( - OffsetSpec::Relative(-7), - TypeKind::Byte { signed: false }, - Value::Uint(0xAA), - "c", - )], - level: 0, - strength_modifier: None, - value_transform: None, - }; + Operator::Equal, + Value::Uint(0x1234_5678), + "p".to_string(), + ) + .with_children(vec![child_rule( + OffsetSpec::Relative(-7), + TypeKind::Byte { signed: false }, + Value::Uint(0xAA), + "c", + )]); let mut ctx = EvaluationContext::new(cfg()); let matches = evaluate_rules(&[parent], &buffer, &mut ctx).unwrap(); @@ -165,20 +146,17 @@ fn relative_chain_marches_forward() { ); middle.children = vec![leaf]; - let root = MagicRule { - offset: OffsetSpec::Absolute(0), - typ: TypeKind::Long { + let root = MagicRule::new( + OffsetSpec::Absolute(0), + TypeKind::Long { endian: Endianness::Little, signed: false, }, - op: Operator::Equal, - value: Value::Uint(0x1234_5678), - message: "root".to_string(), - children: vec![middle], - level: 0, - strength_modifier: None, - value_transform: None, - }; + Operator::Equal, + Value::Uint(0x1234_5678), + "root".to_string(), + ) + .with_children(vec![middle]); let mut ctx = EvaluationContext::new(cfg()); let matches = evaluate_rules(&[root], &buffer, &mut ctx).unwrap(); @@ -193,25 +171,22 @@ fn relative_after_string_parent_includes_nul_terminator() { // byte the child reads via Relative(0). let buffer = b"MZ\x00\x42rest"; - let parent = MagicRule { - offset: OffsetSpec::Absolute(0), - typ: TypeKind::String { + let parent = MagicRule::new( + OffsetSpec::Absolute(0), + TypeKind::String { max_length: None, flags: StringFlags::default(), }, - op: Operator::Equal, - value: Value::String("MZ".to_string()), - message: "mz".to_string(), - children: vec![child_rule( - OffsetSpec::Relative(0), - TypeKind::Byte { signed: false }, - Value::Uint(0x42), - "byte-after-mz", - )], - level: 0, - strength_modifier: None, - value_transform: None, - }; + Operator::Equal, + Value::String("MZ".to_string()), + "mz".to_string(), + ) + .with_children(vec![child_rule( + OffsetSpec::Relative(0), + TypeKind::Byte { signed: false }, + Value::Uint(0x42), + "byte-after-mz", + )]); let mut ctx = EvaluationContext::new(cfg()); let matches = evaluate_rules(&[parent], buffer, &mut ctx).unwrap(); @@ -228,27 +203,24 @@ fn relative_after_string_parent_includes_nul_terminator() { #[test] fn relative_after_flagged_string_parent_includes_nul_terminator() { let buffer = b"MZ\x00\x42rest"; - let parent = MagicRule { - offset: OffsetSpec::Absolute(0), - typ: TypeKind::String { + let parent = MagicRule::new( + OffsetSpec::Absolute(0), + TypeKind::String { max_length: None, flags: StringFlags::default().with_ignore_lowercase(true), }, - op: Operator::Equal, + Operator::Equal, // Lowercase pattern: /c folds the file byte to lower, so "mz" // matches "MZ" in the buffer. Same flagged dispatch path. - value: Value::String("mz".to_string()), - message: "mz-flagged".to_string(), - children: vec![child_rule( - OffsetSpec::Relative(0), - TypeKind::Byte { signed: false }, - Value::Uint(0x42), - "byte-after-mz-flagged", - )], - level: 0, - strength_modifier: None, - value_transform: None, - }; + Value::String("mz".to_string()), + "mz-flagged".to_string(), + ) + .with_children(vec![child_rule( + OffsetSpec::Relative(0), + TypeKind::Byte { signed: false }, + Value::Uint(0x42), + "byte-after-mz-flagged", + )]); let mut ctx = EvaluationContext::new(cfg()); let matches = evaluate_rules(&[parent], buffer, &mut ctx).unwrap(); assert_eq!( @@ -274,20 +246,16 @@ fn flagged_string_respects_max_length_cap() { // scan window is 2 bytes ("a " or just "ab"), which doesn't // contain a 'b' to complete the match. let buffer = b"a b!"; - let rule = MagicRule { - offset: OffsetSpec::Absolute(0), - typ: TypeKind::String { + let rule = MagicRule::new( + OffsetSpec::Absolute(0), + TypeKind::String { max_length: Some(2), flags: StringFlags::default().with_compact_optional_whitespace(true), }, - op: Operator::Equal, - value: Value::String("a b".to_string()), - message: "should-not-match".to_string(), - children: vec![], - level: 0, - strength_modifier: None, - value_transform: None, - }; + Operator::Equal, + Value::String("a b".to_string()), + "should-not-match".to_string(), + ); let mut ctx = EvaluationContext::new(cfg()); let matches = evaluate_rules(&[rule], buffer, &mut ctx).unwrap(); assert!( @@ -302,26 +270,23 @@ fn relative_after_pstring_parent_consumes_prefix_and_payload() { // total), then a byte at offset 6. let buffer = b"\x05Hello\x42tail"; - let parent = MagicRule { - offset: OffsetSpec::Absolute(0), - typ: TypeKind::PString { + let parent = MagicRule::new( + OffsetSpec::Absolute(0), + TypeKind::PString { max_length: None, length_width: PStringLengthWidth::OneByte, length_includes_itself: false, }, - op: Operator::Equal, - value: Value::String("Hello".to_string()), - message: "pstr".to_string(), - children: vec![child_rule( - OffsetSpec::Relative(0), - TypeKind::Byte { signed: false }, - Value::Uint(0x42), - "byte-after-pstr", - )], - level: 0, - strength_modifier: None, - value_transform: None, - }; + Operator::Equal, + Value::String("Hello".to_string()), + "pstr".to_string(), + ) + .with_children(vec![child_rule( + OffsetSpec::Relative(0), + TypeKind::Byte { signed: false }, + Value::Uint(0x42), + "byte-after-pstr", + )]); let mut ctx = EvaluationContext::new(cfg()); let matches = evaluate_rules(&[parent], buffer, &mut ctx).unwrap(); @@ -334,17 +299,13 @@ fn relative_top_level_resolves_from_zero_anchor() { // No prior match: top-level Relative(2) -> absolute 2. let buffer = [0xAA, 0xBB, 0x42, 0xCC]; - let rule = MagicRule { - offset: OffsetSpec::Relative(2), - typ: TypeKind::Byte { signed: false }, - op: Operator::Equal, - value: Value::Uint(0x42), - message: "top".to_string(), - children: vec![], - level: 0, - strength_modifier: None, - value_transform: None, - }; + let rule = MagicRule::new( + OffsetSpec::Relative(2), + TypeKind::Byte { signed: false }, + Operator::Equal, + Value::Uint(0x42), + "top".to_string(), + ); let mut ctx = EvaluationContext::new(cfg()); let matches = evaluate_rules(&[rule], &buffer, &mut ctx).unwrap(); @@ -360,31 +321,23 @@ fn relative_sibling_propagation_at_top_level() { // Second rule uses Relative(0) -> reads at offset 4. let buffer = [0x78, 0x56, 0x34, 0x12, 0x42, 0x00, 0x00, 0x00]; - let first = MagicRule { - offset: OffsetSpec::Absolute(0), - typ: TypeKind::Long { + let first = MagicRule::new( + OffsetSpec::Absolute(0), + TypeKind::Long { endian: Endianness::Little, signed: false, }, - op: Operator::Equal, - value: Value::Uint(0x1234_5678), - message: "first".to_string(), - children: vec![], - level: 0, - strength_modifier: None, - value_transform: None, - }; - let second = MagicRule { - offset: OffsetSpec::Relative(0), - typ: TypeKind::Byte { signed: false }, - op: Operator::Equal, - value: Value::Uint(0x42), - message: "second".to_string(), - children: vec![], - level: 0, - strength_modifier: None, - value_transform: None, - }; + Operator::Equal, + Value::Uint(0x1234_5678), + "first".to_string(), + ); + let second = MagicRule::new( + OffsetSpec::Relative(0), + TypeKind::Byte { signed: false }, + Operator::Equal, + Value::Uint(0x42), + "second".to_string(), + ); let mut ctx = EvaluationContext::new(cfg()); let matches = evaluate_rules(&[first, second], &buffer, &mut ctx).unwrap(); @@ -399,22 +352,19 @@ fn relative_out_of_bounds_skips_child_gracefully() { // Engine should skip the child and continue without panicking. let buffer = [0x7F, 0xAA, 0xBB]; - let parent = MagicRule { - offset: OffsetSpec::Absolute(0), - typ: TypeKind::Byte { signed: false }, - op: Operator::Equal, - value: Value::Uint(0x7F), - message: "p".to_string(), - children: vec![child_rule( - OffsetSpec::Relative(50), - TypeKind::Byte { signed: false }, - Value::Uint(0x00), - "c", - )], - level: 0, - strength_modifier: None, - value_transform: None, - }; + let parent = MagicRule::new( + OffsetSpec::Absolute(0), + TypeKind::Byte { signed: false }, + Operator::Equal, + Value::Uint(0x7F), + "p".to_string(), + ) + .with_children(vec![child_rule( + OffsetSpec::Relative(50), + TypeKind::Byte { signed: false }, + Value::Uint(0x00), + "c", + )]); let mut ctx = EvaluationContext::new(cfg()); let matches = evaluate_rules(&[parent], &buffer, &mut ctx).unwrap(); @@ -439,42 +389,30 @@ fn relative_anchor_can_decrease_when_later_sibling_matches_at_lower_position() { 0x78, 0x56, 0x34, 0x12, 0x00, 0x00, 0x00, 0x00, // bytes 8-15 ]; - let rule_a = MagicRule { - offset: OffsetSpec::Absolute(8), - typ: TypeKind::Long { + let rule_a = MagicRule::new( + OffsetSpec::Absolute(8), + TypeKind::Long { endian: Endianness::Little, signed: false, }, - op: Operator::Equal, - value: Value::Uint(0x1234_5678), - message: "rule-a-high".to_string(), - children: vec![], - level: 0, - strength_modifier: None, - value_transform: None, - }; - let rule_b = MagicRule { - offset: OffsetSpec::Absolute(2), - typ: TypeKind::Byte { signed: false }, - op: Operator::Equal, - value: Value::Uint(0xAA), - message: "rule-b-low".to_string(), - children: vec![], - level: 0, - strength_modifier: None, - value_transform: None, - }; - let rule_c = MagicRule { - offset: OffsetSpec::Relative(0), - typ: TypeKind::Byte { signed: false }, - op: Operator::Equal, - value: Value::Uint(0x99), - message: "rule-c-relative".to_string(), - children: vec![], - level: 0, - strength_modifier: None, - value_transform: None, - }; + Operator::Equal, + Value::Uint(0x1234_5678), + "rule-a-high".to_string(), + ); + let rule_b = MagicRule::new( + OffsetSpec::Absolute(2), + TypeKind::Byte { signed: false }, + Operator::Equal, + Value::Uint(0xAA), + "rule-b-low".to_string(), + ); + let rule_c = MagicRule::new( + OffsetSpec::Relative(0), + TypeKind::Byte { signed: false }, + Operator::Equal, + Value::Uint(0x99), + "rule-c-relative".to_string(), + ); let mut ctx = EvaluationContext::new(cfg()); let matches = evaluate_rules(&[rule_a, rule_b, rule_c], &buffer, &mut ctx).unwrap(); @@ -497,42 +435,30 @@ fn relative_anchor_persists_across_non_matching_intermediate_sibling() { // Third top-level rule uses Relative(0) -> reads at offset 4. let buffer = [0x78, 0x56, 0x34, 0x12, 0x42, 0x00, 0x00, 0x00]; - let first = MagicRule { - offset: OffsetSpec::Absolute(0), - typ: TypeKind::Long { + let first = MagicRule::new( + OffsetSpec::Absolute(0), + TypeKind::Long { endian: Endianness::Little, signed: false, }, - op: Operator::Equal, - value: Value::Uint(0x1234_5678), - message: "first".to_string(), - children: vec![], - level: 0, - strength_modifier: None, - value_transform: None, - }; - let middle_no_match = MagicRule { - offset: OffsetSpec::Absolute(4), - typ: TypeKind::Byte { signed: false }, - op: Operator::Equal, - value: Value::Uint(0xDE), // does not match (real byte is 0x42) - message: "middle-skip".to_string(), - children: vec![], - level: 0, - strength_modifier: None, - value_transform: None, - }; - let third = MagicRule { - offset: OffsetSpec::Relative(0), - typ: TypeKind::Byte { signed: false }, - op: Operator::Equal, - value: Value::Uint(0x42), - message: "third".to_string(), - children: vec![], - level: 0, - strength_modifier: None, - value_transform: None, - }; + Operator::Equal, + Value::Uint(0x1234_5678), + "first".to_string(), + ); + let middle_no_match = MagicRule::new( + OffsetSpec::Absolute(4), + TypeKind::Byte { signed: false }, + Operator::Equal, + Value::Uint(0xDE), // does not match (real byte is 0x42) + "middle-skip".to_string(), + ); + let third = MagicRule::new( + OffsetSpec::Relative(0), + TypeKind::Byte { signed: false }, + Operator::Equal, + Value::Uint(0x42), + "third".to_string(), + ); let mut ctx = EvaluationContext::new(cfg()); let matches = evaluate_rules(&[first, middle_no_match, third], &buffer, &mut ctx).unwrap(); @@ -550,31 +476,23 @@ fn relative_anchor_resets_between_evaluations_via_reset() { let buffer_a = [0x78, 0x56, 0x34, 0x12]; let buffer_b = [0x42, 0xAA, 0xBB, 0xCC]; - let pass_one = MagicRule { - offset: OffsetSpec::Absolute(0), - typ: TypeKind::Long { + let pass_one = MagicRule::new( + OffsetSpec::Absolute(0), + TypeKind::Long { endian: Endianness::Little, signed: false, }, - op: Operator::Equal, - value: Value::Uint(0x1234_5678), - message: "pass-one".to_string(), - children: vec![], - level: 0, - strength_modifier: None, - value_transform: None, - }; - let pass_two = MagicRule { - offset: OffsetSpec::Relative(0), - typ: TypeKind::Byte { signed: false }, - op: Operator::Equal, - value: Value::Uint(0x42), - message: "pass-two".to_string(), - children: vec![], - level: 0, - strength_modifier: None, - value_transform: None, - }; + Operator::Equal, + Value::Uint(0x1234_5678), + "pass-one".to_string(), + ); + let pass_two = MagicRule::new( + OffsetSpec::Relative(0), + TypeKind::Byte { signed: false }, + Operator::Equal, + Value::Uint(0x42), + "pass-two".to_string(), + ); let mut ctx = EvaluationContext::new(cfg()); let _ = evaluate_rules(&[pass_one], &buffer_a, &mut ctx).unwrap(); @@ -592,22 +510,19 @@ fn relative_underflow_skips_child_gracefully() { // Anchor=1 (after parent byte), child Relative(-100) underflows. let buffer = [0x7F, 0xAA]; - let parent = MagicRule { - offset: OffsetSpec::Absolute(0), - typ: TypeKind::Byte { signed: false }, - op: Operator::Equal, - value: Value::Uint(0x7F), - message: "p".to_string(), - children: vec![child_rule( - OffsetSpec::Relative(-100), - TypeKind::Byte { signed: false }, - Value::Uint(0x00), - "c", - )], - level: 0, - strength_modifier: None, - value_transform: None, - }; + let parent = MagicRule::new( + OffsetSpec::Absolute(0), + TypeKind::Byte { signed: false }, + Operator::Equal, + Value::Uint(0x7F), + "p".to_string(), + ) + .with_children(vec![child_rule( + OffsetSpec::Relative(-100), + TypeKind::Byte { signed: false }, + Value::Uint(0x00), + "c", + )]); let mut ctx = EvaluationContext::new(cfg()); let matches = evaluate_rules(&[parent], &buffer, &mut ctx).unwrap(); diff --git a/tests/search_flag_conformance_tests.rs b/tests/search_flag_conformance_tests.rs index a27df46..94d73f2 100644 --- a/tests/search_flag_conformance_tests.rs +++ b/tests/search_flag_conformance_tests.rs @@ -54,20 +54,18 @@ fn search_rule( children: Vec, level: u32, ) -> MagicRule { - MagicRule { + MagicRule::new( offset, - typ: TypeKind::Search { + TypeKind::Search { range: NonZeroUsize::new(range).expect("range must be non-zero"), flags, }, - op: Operator::Equal, - value: Value::String(pattern.to_string()), - message: msg.to_string(), - children, - level, - strength_modifier: None, - value_transform: None, - } + Operator::Equal, + Value::String(pattern.to_string()), + msg.to_string(), + ) + .with_children(children) + .with_level(level) } fn run_rules(rules: &[MagicRule], buffer: &[u8]) -> Vec { @@ -108,20 +106,17 @@ fn search_s_anchors_relative_child_at_match_start() { let buffer: &[u8] = b"junk-prefix-ABC-suffix-bytes"; // Child: `>&0 string ABC`. We use TypeKind::String with max_length = 3. - let child = MagicRule { - offset: OffsetSpec::Relative(0), - typ: TypeKind::String { + let child = MagicRule::new( + OffsetSpec::Relative(0), + TypeKind::String { max_length: Some(3), flags: libmagic_rs::parser::ast::StringFlags::default(), }, - op: Operator::Equal, - value: Value::String("ABC".to_string()), - message: "child matched at anchor".to_string(), - children: vec![], - level: 1, - strength_modifier: None, - value_transform: None, - }; + Operator::Equal, + Value::String("ABC".to_string()), + "child matched at anchor".to_string(), + ) + .with_level(1); // Parent WITH /s -- expect both parent and child to fire. let parent_s = search_rule( diff --git a/tests/security_regression.rs b/tests/security_regression.rs index b90c811..3ebf71a 100644 --- a/tests/security_regression.rs +++ b/tests/security_regression.rs @@ -147,20 +147,16 @@ fn test_regex_compile_bounded_for_pathological_patterns() { let config = EvaluationConfig::default().with_timeout_ms(Some(1000)); for (pat, label) in cases { - let rule = MagicRule { - offset: OffsetSpec::Absolute(0), - typ: TypeKind::Regex { + let rule = MagicRule::new( + OffsetSpec::Absolute(0), + TypeKind::Regex { flags: RegexFlags::default(), count: RegexCount::Default, }, - op: Operator::Equal, - value: Value::String((*pat).to_string()), - message: "never-matches".to_string(), - children: vec![], - level: 0, - strength_modifier: None, - value_transform: None, - }; + Operator::Equal, + Value::String((*pat).to_string()), + "never-matches".to_string(), + ); let mut ctx = EvaluationContext::new(config.clone()); let start = Instant::now(); @@ -412,6 +408,24 @@ fn test_max_string_length_unflagged_stops_at_nul_before_cap() { ); } +/// SF-1 defense: `EvaluationContext::new` must clamp `max_string_length = 0` +/// to a safe default. The validator at `EvaluationConfig::validate()` rejects +/// 0, but struct-literal construction and the `with_max_string_length` +/// builder can bypass it. Without the clamp, an invalid config silently +/// disables the CWE-770 control. +#[test] +fn test_evaluation_context_clamps_invalid_max_string_length() { + use libmagic_rs::evaluator::EvaluationContext; + let invalid = EvaluationConfig::default().with_max_string_length(0); + let ctx = EvaluationContext::new(invalid); + assert!( + ctx.max_string_length() >= 1, + "EvaluationContext::new must clamp max_string_length=0 to a safe default; \ + got {} (SF-1 regression)", + ctx.max_string_length() + ); +} + /// Cap larger than remaining buffer must return the entire buffer (or up /// to the first NUL, whichever comes first). The cap is an upper bound. #[test] diff --git a/tests/string_flags_integration.rs b/tests/string_flags_integration.rs index c90d3e9..85f3197 100644 --- a/tests/string_flags_integration.rs +++ b/tests/string_flags_integration.rs @@ -24,20 +24,16 @@ fn cfg() -> EvaluationConfig { } fn rule(offset: i64, pattern: &str, flags: StringFlags, msg: &str) -> MagicRule { - MagicRule { - offset: OffsetSpec::Absolute(offset), - typ: TypeKind::String { + MagicRule::new( + OffsetSpec::Absolute(offset), + TypeKind::String { max_length: None, flags, }, - op: Operator::Equal, - value: Value::String(pattern.to_string()), - message: msg.to_string(), - children: vec![], - level: 0, - strength_modifier: None, - value_transform: None, - } + Operator::Equal, + Value::String(pattern.to_string()), + msg.to_string(), + ) } // ---------- /c: case-insensitive ---------- From 2b81694b1b93f129566448566561aa7245c64cba Mon Sep 17 00:00:00 2001 From: UncleSp1d3r Date: Fri, 29 May 2026 09:46:41 -0400 Subject: [PATCH 8/8] fix(review): resolve PR #304 review feedback - read_pattern_match: add explicit `offset >= buffer.len()` guard so the flagged String path matches the documented BufferOverrun contract used by regex/search. Prevents incorrect Operator::NotEqual semantics when the read site is past EOF. - types/mod.rs SF-2: switch direct slice indexing `&buffer[..end]` to `buffer.get(..end).ok_or(BufferOverrun)?` per AGENTS.md "use .get()" rule. Preserves SF-2 fail-loud posture via typed error (rather than panic) if a future refactor breaks the clamp invariant. - security_regression tests: align doc-comments with the actual pattern string `" X"` (was inconsistently described as `"X "`). - security_regression tests: reword the "minimum cap" doc-comment to accurately describe validate()-must-be-called semantics + the SF-1 defense-in-depth clamp in EvaluationContext::new. - json_integration_test: replace substring `.contains("type_kind")` checks with structural serde_json::Value walk over `matches[*]` asserting key absence/presence. - docs/src/evaluator.md: correct `type_kind` description -- field is `pub` (public Rust API) but excluded from JSON via `#[serde(skip)]`. - docs/MAGIC_FORMAT.md, docs/src/library-api.md: mdformat-driven table cell-padding alignment (incidental). Signed-off-by: UncleSp1d3r --- docs/MAGIC_FORMAT.md | 20 ++++++------- docs/src/evaluator.md | 2 +- docs/src/library-api.md | 12 ++++---- src/evaluator/types/mod.rs | 35 ++++++++++++++++++---- tests/json_integration_test.rs | 54 +++++++++++++++++++++++----------- tests/security_regression.rs | 22 ++++++++------ 6 files changed, 96 insertions(+), 49 deletions(-) diff --git a/docs/MAGIC_FORMAT.md b/docs/MAGIC_FORMAT.md index 247a927..a7aabcd 100644 --- a/docs/MAGIC_FORMAT.md +++ b/docs/MAGIC_FORMAT.md @@ -368,17 +368,17 @@ The range is MANDATORY (`NonZeroUsize`). Bare `search` and `search/0` are parse Flags for `search` type modify comparison and anchor behavior. Most flags share semantics with `string` type flags; `/s` is search-specific. -| Flag | Description | -| ---- | -------------------------------------------------------------------------------------------------------------------- | -| `/s` | Anchor advance lands at match-START instead of match-END (required for TGA footer, sfnt name table) | -| `/c` | Case-insensitive match (lowercase pattern chars fold file bytes to lower; uppercase pattern chars are literal) | -| `/C` | Case-insensitive match (uppercase pattern chars fold file bytes to upper; lowercase pattern chars are literal) | -| `/w` | Whitespace-optional (pattern whitespace matches zero or more file whitespace) | +| Flag | Description | +| ---- | ---------------------------------------------------------------------------------------------------------------------- | +| `/s` | Anchor advance lands at match-START instead of match-END (required for TGA footer, sfnt name table) | +| `/c` | Case-insensitive match (lowercase pattern chars fold file bytes to lower; uppercase pattern chars are literal) | +| `/C` | Case-insensitive match (uppercase pattern chars fold file bytes to upper; lowercase pattern chars are literal) | +| `/w` | Whitespace-optional (pattern whitespace matches zero or more file whitespace) | | `/W` | Whitespace-required-compact (pattern whitespace requires at least one file whitespace; additional whitespace consumed) | -| `/T` | Trim leading/trailing ASCII whitespace from pattern before comparison | -| `/f` | Full-word match (post-match word-boundary check; next byte must be EOF or non-word char) | -| `/t` | Force text test (MIME-output hint; no effect on comparison) | -| `/b` | Force binary test (MIME-output hint; no effect on comparison) | +| `/T` | Trim leading/trailing ASCII whitespace from pattern before comparison | +| `/f` | Full-word match (post-match word-boundary check; next byte must be EOF or non-word char) | +| `/t` | Force text test (MIME-output hint; no effect on comparison) | +| `/b` | Force binary test (MIME-output hint; no effect on comparison) | **`/c` vs `/C` asymmetry:** The pattern character controls fold direction. `/c` with lowercase pattern chars folds the file byte to lowercase; uppercase pattern chars in the same pattern are compared literally. See String Flags section above for details. diff --git a/docs/src/evaluator.md b/docs/src/evaluator.md index 70152df..a4cf4ef 100644 --- a/docs/src/evaluator.md +++ b/docs/src/evaluator.md @@ -91,7 +91,7 @@ pub struct RuleMatch { The `Value` type is from `parser::ast::Value` and represents the actual matched content according to the rule's type specification. Note that `Value` implements only `PartialEq` (not `Eq`) due to floating-point NaN semantics. -`RuleMatch` also carries a non-public `type_kind: TypeKind` field used internally by the engine for width calculations and format substitution. This field is excluded from JSON serialization (`#[serde(skip)]`). +`RuleMatch` also carries a `pub type_kind: TypeKind` field used by the engine for width calculations and format substitution. The field is part of the public Rust API (accessible to consumers via field access) but is excluded from JSON serialization via `#[serde(skip)]` so the parser AST does not leak into structured output. ### Offset Resolution (`evaluator/offset.rs`) diff --git a/docs/src/library-api.md b/docs/src/library-api.md index bcf8d41..c3909ee 100644 --- a/docs/src/library-api.md +++ b/docs/src/library-api.md @@ -70,13 +70,13 @@ When no rules match, the description defaults to `"data"` with confidence `0.0`. Controls evaluation behavior with these fields: -| Field | Type | Default | Description | -| --------------------- | ------------- | ------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -| `max_recursion_depth` | `u32` | `20` | Maximum depth for nested rule evaluation | +| Field | Type | Default | Description | +| --------------------- | ------------- | ------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `max_recursion_depth` | `u32` | `20` | Maximum depth for nested rule evaluation | | `max_string_length` | `usize` | `8192` | Caps bytes read for `TypeKind::String` reads (both unflagged and with `/c`/`/C`/`/w`/`/W`/`/T`/`/f` flags); does NOT apply to `TypeKind::PString` or `TypeKind::String16` | -| `stop_at_first_match` | `bool` | `true` | Stop after the first matching rule | -| `enable_mime_types` | `bool` | `false` | Map descriptions to MIME types | -| `timeout_ms` | `Option` | `None` | Evaluation timeout in milliseconds | +| `stop_at_first_match` | `bool` | `true` | Stop after the first matching rule | +| `enable_mime_types` | `bool` | `false` | Map descriptions to MIME types | +| `timeout_ms` | `Option` | `None` | Evaluation timeout in milliseconds | ### Presets diff --git a/src/evaluator/types/mod.rs b/src/evaluator/types/mod.rs index b5998ff..d154b70 100644 --- a/src/evaluator/types/mod.rs +++ b/src/evaluator/types/mod.rs @@ -306,6 +306,22 @@ pub(crate) fn read_pattern_match( pattern: Option<&Value>, max_string_length: usize, ) -> Result, TypeReadError> { + // Match the documented BufferOverrun contract uniformly across all + // pattern-bearing paths. `read_regex` and `read_search` enforce this + // guard internally; the flagged `TypeKind::String` arm below delegates + // to `compare_string_with_flags`, which would silently return `None` + // (no-match) for `offset >= buffer.len()`. Under `Operator::NotEqual` + // an out-of-bounds read would then be reported as a successful match + // -- a correctness hazard. Returning `BufferOverrun` here keeps the + // three paths semantically aligned and lets the engine dispatcher + // (`evaluate_pattern_rule`) reject the rule rather than infer truth + // from an unread region. + if offset >= buffer.len() { + return Err(TypeReadError::BufferOverrun { + offset, + buffer_len: buffer.len(), + }); + } match type_kind { TypeKind::Regex { flags, count } => { let pattern_str = match pattern { @@ -392,16 +408,23 @@ pub(crate) fn read_pattern_match( // and silently produce no-match at any non-zero offset. // // `end` is constructed with `saturating_add` then `.min(buffer.len())` - // so the slice always satisfies `end <= buffer.len()`. We slice - // directly with `&buffer[..end]` rather than - // `buffer.get(..end).unwrap_or(buffer)` so that any future - // refactor that breaks the clamp invariant fails loudly (panic) + // so the slice always satisfies `end <= buffer.len()`. We use + // `buffer.get(..end).ok_or(BufferOverrun)` rather than direct + // indexing to satisfy the project-wide ".get() for buffer access" + // rule (AGENTS.md "Memory Safety First") while preserving the + // SF-2 fail-loud posture: if a future refactor breaks the clamp + // invariant, we surface a typed `BufferOverrun` to the engine // instead of silently falling back to the uncapped buffer -- - // which would defeat the CWE-770 control. + // which would defeat the CWE-770 control. The `ok_or` arm is + // structurally unreachable under the current invariant; it + // exists as defense-in-depth. let scan_buffer: &[u8] = { let cap = max_length.unwrap_or(max_string_length); let end = offset.saturating_add(cap).min(buffer.len()); - &buffer[..end] + buffer.get(..end).ok_or(TypeReadError::BufferOverrun { + offset, + buffer_len: buffer.len(), + })? }; match string::compare_string_with_flags(trimmed, scan_buffer, offset, *flags) { Some(consumed) => { diff --git a/tests/json_integration_test.rs b/tests/json_integration_test.rs index 5249d27..18e4e4b 100644 --- a/tests/json_integration_test.rs +++ b/tests/json_integration_test.rs @@ -364,25 +364,45 @@ fn test_rule_match_type_kind_not_serialized_in_evaluation_result() { ); let json = serde_json::to_string(&result).expect("must serialize"); - - assert!( - !json.contains("type_kind"), - "EvaluationResult JSON must not contain `type_kind` key \ - (1B-H2 / 2A-M1 regression: parser AST leaking into JSON output). \ - Got: {json}" - ); - - // The `value` field (the matched data) and `confidence` should still - // be present -- sanity check that #[serde(skip)] only excluded - // type_kind, not the surrounding fields. + let parsed: serde_json::Value = + serde_json::from_str(&json).expect("emitted JSON must round-trip through serde_json"); + + // Walk the structure and assert `type_kind` is absent from every + // RuleMatch entry under `matches[*]`. Substring checks would false- + // fail if any user-visible string happened to contain `type_kind`, + // and -- more importantly -- they cannot prove key absence; only + // structural inspection can. + let matches = parsed + .get("matches") + .and_then(|v| v.as_array()) + .expect("EvaluationResult JSON must have a `matches` array"); assert!( - json.contains("\"value\""), - "EvaluationResult JSON should still include `value`" - ); - assert!( - json.contains("\"confidence\""), - "EvaluationResult JSON should still include `confidence`" + !matches.is_empty(), + "EvaluationResult JSON must include at least one rule match for this assertion to be meaningful" ); + for (i, m) in matches.iter().enumerate() { + let obj = m.as_object().unwrap_or_else(|| { + panic!("matches[{i}] must be a JSON object, got {m}"); + }); + assert!( + !obj.contains_key("type_kind"), + "matches[{i}] must not contain `type_kind` key \ + (1B-H2 / 2A-M1 regression: parser AST leaking into JSON output). \ + Got keys: {:?}", + obj.keys().collect::>() + ); + assert!( + obj.contains_key("value"), + "matches[{i}] must still include `value` -- #[serde(skip)] should \ + only suppress type_kind, not the surrounding fields. Got keys: {:?}", + obj.keys().collect::>() + ); + assert!( + obj.contains_key("confidence"), + "matches[{i}] must still include `confidence`. Got keys: {:?}", + obj.keys().collect::>() + ); + } } /// Sanity check: the `type_kind` field is still accessible from Rust diff --git a/tests/security_regression.rs b/tests/security_regression.rs index 3ebf71a..9f5ddd7 100644 --- a/tests/security_regression.rs +++ b/tests/security_regression.rs @@ -331,7 +331,7 @@ fn test_max_string_length_flagged_path_works_at_non_zero_offset() { ); } -/// Flagged-`/W` path: a `string/W "X "` rule against a buffer of all +/// Flagged-`/W` path: a `string/W " X"` rule against a buffer of all /// whitespace must walk only `max_string_length` bytes before giving up, /// not the full buffer. Origin 2A-H1 (flagged-string scan-window variant) /// — the `/W` operator consumes greedy whitespace which without a cap @@ -339,11 +339,11 @@ fn test_max_string_length_flagged_path_works_at_non_zero_offset() { /// large to consume completely in any reasonable time bound; the U1 cap /// prevents the runaway walk. /// -/// We assert no-match (the pattern `X ` requires a literal `X` after the -/// whitespace run; the buffer has none), and that the evaluation -/// completes well under the cap-implied wall-clock bound. A correctly -/// capped walk completes in microseconds; an uncapped walk through 16 -/// MiB takes meaningfully longer. +/// We assert no-match (the pattern ` X` requires a literal `X` after the +/// leading whitespace; the buffer is all spaces with no `X`), and that +/// the evaluation completes well under the cap-implied wall-clock bound. +/// A correctly capped walk completes in microseconds; an uncapped walk +/// through 16 MiB takes meaningfully longer. #[test] fn test_max_string_length_caps_flagged_w_whitespace_walk() { use libmagic_rs::parser::ast::StringFlags; @@ -376,9 +376,13 @@ fn test_max_string_length_caps_flagged_w_whitespace_walk() { } /// Minimum valid cap (cap = 1) must produce a 1-byte result on the -/// unflagged path. `EvaluationConfig::validate()` rejects -/// `max_string_length == 0` at construction time, so cap=0 cannot be -/// reached through the public API. +/// unflagged path. `EvaluationConfig::with_max_string_length` is a pure +/// setter and does not validate; `EvaluationConfig::validate()` (called +/// explicitly by validated entry points such as `MagicDatabase`) rejects +/// `max_string_length == 0`. As defense-in-depth, `EvaluationContext::new` +/// clamps `max_string_length == 0` to `DEFAULT_MAX_STRING_LENGTH` so +/// even low-level callers that bypass `validate()` cannot reach a +/// 0-byte cap at evaluation time (SF-1). #[test] fn test_max_string_length_minimum_cap_returns_one_byte() { let buf = one_mib_nul_free();