From 74708602bbbbdd743f5bd3103734b7e41b241cc7 Mon Sep 17 00:00:00 2001 From: Xin Huang Date: Thu, 4 Jun 2026 18:20:58 -0700 Subject: [PATCH 01/13] feat(datafusion): push down isnan over NaN-preserving numeric expressions Previously `isnan(...)` was only pushed down to Iceberg when its argument was a bare column; complex numeric arguments such as `isnan(qux + 1)` were silently dropped. This resolves the argument through NaN-preserving transformations - negation, `abs`, numeric casts, and arithmetic with finite literals - down to a single column reference, so `isnan()` is pushed down as ` IS NAN`. Filter pushdown is reported as Inexact, so DataFusion re-applies the original predicate after scanning. Every supported transformation keeps NaN-ness exactly equivalent (result is NaN iff the column is NaN), so both `isnan(...)` and `NOT isnan(...)` stay correct. Unsound cases (`x * 0`, `x / 0`, `c / x`, multi-column expressions) are explicitly rejected and covered by tests. Closes #2154 --- .../src/physical_plan/expr_to_predicate.rs | 203 ++++++++++++++++-- 1 file changed, 189 insertions(+), 14 deletions(-) diff --git a/crates/integrations/datafusion/src/physical_plan/expr_to_predicate.rs b/crates/integrations/datafusion/src/physical_plan/expr_to_predicate.rs index 17c9416d54..ef60c0fae4 100644 --- a/crates/integrations/datafusion/src/physical_plan/expr_to_predicate.rs +++ b/crates/integrations/datafusion/src/physical_plan/expr_to_predicate.rs @@ -19,7 +19,7 @@ use std::vec; use datafusion::arrow::datatypes::DataType; use datafusion::logical_expr::expr::ScalarFunction; -use datafusion::logical_expr::{Expr, Like, Operator}; +use datafusion::logical_expr::{BinaryExpr, Expr, Like, Operator}; use datafusion::scalar::ScalarValue; use iceberg::expr::{BinaryExpression, Predicate, PredicateOperator, Reference, UnaryExpression}; use iceberg::spec::{Datum, PrimitiveLiteral}; @@ -225,17 +225,123 @@ fn to_iceberg_operation(op: Operator) -> OpTransformedResult { /// identified by name at runtime, so we need to handle them here. fn scalar_function_to_iceberg_predicate(func_name: &str, args: &[Expr]) -> TransformedResult { match func_name { - // TODO: support complex expression arguments to scalar functions - "isnan" if args.len() == 1 => { - let operand = to_iceberg_predicate(&args[0]); - match operand { - TransformedResult::Column(r) => TransformedResult::Predicate(Predicate::Unary( - UnaryExpression::new(PredicateOperator::IsNan, r), - )), - _ => TransformedResult::NotTransformed, + "isnan" if args.len() == 1 => match resolve_nan_preserving_reference(&args[0]) { + Some(r) => TransformedResult::Predicate(Predicate::Unary(UnaryExpression::new( + PredicateOperator::IsNan, + r, + ))), + None => TransformedResult::NotTransformed, + }, + _ => TransformedResult::NotTransformed, + } +} + +/// Attempts to resolve a numeric expression argument down to a single column +/// [`Reference`] such that `isnan(arg)` is logically equivalent to +/// `isnan(reference)`. +/// +/// Filter pushdown is reported as `Inexact` (see +/// [`IcebergTableProvider::supports_filters_pushdown`]), so DataFusion +/// re-applies the original predicate after scanning. We therefore only need the +/// pushed-down predicate to be implied by the original filter (it may match +/// extra rows, but must never drop a matching one). Every transformation handled +/// here preserves NaN-ness *exactly* — the result is NaN if and only if the +/// wrapped column is NaN — so both `isnan(arg)` and `NOT isnan(arg)` are sound: +/// +/// * negation: `-x` is NaN iff `x` is NaN +/// * `abs(x)`: `abs(x)` is NaN iff `x` is NaN +/// * casts between numeric types preserve NaN +/// * `x + c`, `c + x`, `x - c`, `c - x` for a finite literal `c` +/// * `x * c`, `c * x`, `x / c` for a finite, non-zero literal `c` +/// +/// Multiplication/division by zero and `c / x` are intentionally rejected: e.g. +/// `x * 0` is NaN when `x` is `±inf`, so it does not imply `x` is NaN. +/// +/// [`IcebergTableProvider::supports_filters_pushdown`]: crate::table::IcebergTableProvider +fn resolve_nan_preserving_reference(expr: &Expr) -> Option { + match expr { + Expr::Column(column) => Some(Reference::new(column.name())), + Expr::Negative(inner) => resolve_nan_preserving_reference(inner), + Expr::Cast(cast) => { + // Casts to date truncate the value and are not numeric, so they + // cannot be treated as NaN-preserving. + if cast.data_type == DataType::Date32 || cast.data_type == DataType::Date64 { + return None; } + resolve_nan_preserving_reference(&cast.expr) } - _ => TransformedResult::NotTransformed, + Expr::ScalarFunction(ScalarFunction { func, args }) + if func.name() == "abs" && args.len() == 1 => + { + resolve_nan_preserving_reference(&args[0]) + } + Expr::BinaryExpr(binary) => resolve_nan_preserving_binary(binary), + _ => None, + } +} + +/// Resolves the column reference from an arithmetic expression that combines a +/// single column with a finite literal while preserving NaN-ness. See +/// [`resolve_nan_preserving_reference`] for the soundness argument. +fn resolve_nan_preserving_binary(binary: &BinaryExpr) -> Option { + // `nonzero` requires the literal to be non-zero (`x * 0` / `x / 0` are NaN + // for infinite `x`). `literal_allowed_left` indicates whether the literal may + // sit on the left operand: `c + x`, `c - x`, `c * x` all preserve NaN-ness, + // but `c / x` does not (e.g. `0 / 0` is NaN while `0` is not). + let (nonzero, literal_allowed_left) = match binary.op { + Operator::Plus | Operator::Minus => (false, true), + Operator::Multiply => (true, true), + Operator::Divide => (true, false), + _ => return None, + }; + + let is_literal = |expr: &Expr| { + if nonzero { + finite_nonzero_literal(expr).is_some() + } else { + finite_literal(expr).is_some() + } + }; + + if is_literal(&binary.right) { + // + resolve_nan_preserving_reference(&binary.left) + } else if literal_allowed_left && is_literal(&binary.left) { + // + resolve_nan_preserving_reference(&binary.right) + } else { + None + } +} + +/// Returns the literal's numeric value if `expr` is a finite numeric literal. +fn finite_literal(expr: &Expr) -> Option { + match expr { + Expr::Literal(value, _) => scalar_value_as_f64(value).filter(|v| v.is_finite()), + _ => None, + } +} + +/// Returns the literal's numeric value if `expr` is a finite, non-zero numeric literal. +fn finite_nonzero_literal(expr: &Expr) -> Option { + finite_literal(expr).filter(|v| *v != 0.0) +} + +/// Extracts a numeric [`ScalarValue`] as an `f64`, used only to inspect +/// finiteness and sign of literals (precision loss is irrelevant here). +fn scalar_value_as_f64(value: &ScalarValue) -> Option { + match value { + ScalarValue::Int8(Some(v)) => Some(*v as f64), + ScalarValue::Int16(Some(v)) => Some(*v as f64), + ScalarValue::Int32(Some(v)) => Some(*v as f64), + ScalarValue::Int64(Some(v)) => Some(*v as f64), + ScalarValue::UInt8(Some(v)) => Some(*v as f64), + ScalarValue::UInt16(Some(v)) => Some(*v as f64), + ScalarValue::UInt32(Some(v)) => Some(*v as f64), + ScalarValue::UInt64(Some(v)) => Some(*v as f64), + ScalarValue::Float32(Some(v)) => Some(*v as f64), + ScalarValue::Float64(Some(v)) => Some(*v), + _ => None, } } @@ -732,11 +838,80 @@ mod tests { assert_eq!(predicate, expected_predicate); } + #[test] + fn test_predicate_conversion_with_isnan_negation() { + // -x is NaN iff x is NaN + let predicate = convert_to_iceberg_predicate("isnan(-qux)").unwrap(); + assert_eq!(predicate, Reference::new("qux").is_nan()); + + let predicate = convert_to_iceberg_predicate("NOT isnan(-qux)").unwrap(); + assert_eq!(predicate, !Reference::new("qux").is_nan()); + } + + #[test] + fn test_predicate_conversion_with_isnan_abs() { + // abs(x) is NaN iff x is NaN + let predicate = convert_to_iceberg_predicate("isnan(abs(qux))").unwrap(); + assert_eq!(predicate, Reference::new("qux").is_nan()); + } + + #[test] + fn test_predicate_conversion_with_isnan_additive() { + // x + c, c + x, x - c, c - x are NaN iff x is NaN (for finite c) + for sql in [ + "isnan(qux + 1)", + "isnan(1 + qux)", + "isnan(qux - 1)", + "isnan(1 - qux)", + "isnan(qux + 1.5)", + ] { + let predicate = convert_to_iceberg_predicate(sql).unwrap(); + assert_eq!(predicate, Reference::new("qux").is_nan(), "sql: {sql}"); + } + } + + #[test] + fn test_predicate_conversion_with_isnan_multiplicative() { + // x * c, c * x, x / c are NaN iff x is NaN (for finite non-zero c) + for sql in ["isnan(qux * 2)", "isnan(2 * qux)", "isnan(qux / 2)"] { + let predicate = convert_to_iceberg_predicate(sql).unwrap(); + assert_eq!(predicate, Reference::new("qux").is_nan(), "sql: {sql}"); + } + } + + #[test] + fn test_predicate_conversion_with_isnan_nested_expr() { + // Nested NaN-preserving transformations resolve to the inner column + let predicate = convert_to_iceberg_predicate("isnan(-(abs(qux) + 1) * 3)").unwrap(); + assert_eq!(predicate, Reference::new("qux").is_nan()); + } + + #[test] + fn test_predicate_conversion_with_isnan_and_other_complex_condition() { + let sql = "isnan(qux + 1) AND foo > 1"; + let predicate = convert_to_iceberg_predicate(sql).unwrap(); + let expected_predicate = Predicate::and( + Reference::new("qux").is_nan(), + Reference::new("foo").greater_than(Datum::long(1)), + ); + assert_eq!(predicate, expected_predicate); + } + #[test] fn test_predicate_conversion_with_isnan_unsupported_arg() { - // isnan on a complex expression (not a bare column) cannot be pushed down - let sql = "isnan(qux + 1)"; - let predicate = convert_to_iceberg_predicate(sql); - assert_eq!(predicate, None); + // Multiplying/dividing by zero does not preserve NaN-ness: `x * 0` is NaN + // when `x` is ±inf, so it cannot be pushed down. + assert_eq!(convert_to_iceberg_predicate("isnan(qux * 0)"), None); + assert_eq!(convert_to_iceberg_predicate("isnan(qux / 0)"), None); + + // `c / x` is not NaN-preserving (e.g. `0 / 0` is NaN while `0` is not). + assert_eq!(convert_to_iceberg_predicate("isnan(1 / qux)"), None); + + // Expressions referencing more than one column cannot be reduced to a + // single column reference. + assert_eq!(convert_to_iceberg_predicate("isnan(qux + foo)"), None); + + // Unknown scalar functions are not pushed down. + assert_eq!(convert_to_iceberg_predicate("isnan(sqrt(qux))"), None); } } From a10973dad2cac89e468db832881af52e8657a923 Mon Sep 17 00:00:00 2001 From: Xin Huang Date: Thu, 4 Jun 2026 18:26:13 -0700 Subject: [PATCH 02/13] refactor(datafusion): clarify NaN-preserving arithmetic resolution Replace the terse `(nonzero, literal_allowed_left)` tuple with explicit per-operator handling, and simplify the literal helpers to return booleans (`is_finite_literal` / `is_finite_nonzero_literal`). No behavior change. --- .../src/physical_plan/expr_to_predicate.rs | 85 +++++++++++-------- 1 file changed, 51 insertions(+), 34 deletions(-) diff --git a/crates/integrations/datafusion/src/physical_plan/expr_to_predicate.rs b/crates/integrations/datafusion/src/physical_plan/expr_to_predicate.rs index ef60c0fae4..49013a13a5 100644 --- a/crates/integrations/datafusion/src/physical_plan/expr_to_predicate.rs +++ b/crates/integrations/datafusion/src/physical_plan/expr_to_predicate.rs @@ -284,51 +284,68 @@ fn resolve_nan_preserving_reference(expr: &Expr) -> Option { /// single column with a finite literal while preserving NaN-ness. See /// [`resolve_nan_preserving_reference`] for the soundness argument. fn resolve_nan_preserving_binary(binary: &BinaryExpr) -> Option { - // `nonzero` requires the literal to be non-zero (`x * 0` / `x / 0` are NaN - // for infinite `x`). `literal_allowed_left` indicates whether the literal may - // sit on the left operand: `c + x`, `c - x`, `c * x` all preserve NaN-ness, - // but `c / x` does not (e.g. `0 / 0` is NaN while `0` is not). - let (nonzero, literal_allowed_left) = match binary.op { - Operator::Plus | Operator::Minus => (false, true), - Operator::Multiply => (true, true), - Operator::Divide => (true, false), - _ => return None, - }; + let (left, right) = (&binary.left, &binary.right); + match binary.op { + // `x + c`, `c + x`, `x - c` and `c - x` are NaN iff `x` is NaN, for any + // finite literal `c`. The column may be on either side. + Operator::Plus | Operator::Minus => { + if is_finite_literal(right) { + resolve_nan_preserving_reference(left) + } else if is_finite_literal(left) { + resolve_nan_preserving_reference(right) + } else { + None + } + } - let is_literal = |expr: &Expr| { - if nonzero { - finite_nonzero_literal(expr).is_some() - } else { - finite_literal(expr).is_some() + // `x * c` and `c * x` are NaN iff `x` is NaN, but only when `c` is + // non-zero: `±inf * 0` is NaN even though `±inf` is not. The column may + // be on either side. + Operator::Multiply => { + if is_finite_nonzero_literal(right) { + resolve_nan_preserving_reference(left) + } else if is_finite_nonzero_literal(left) { + resolve_nan_preserving_reference(right) + } else { + None + } + } + + // `x / c` is NaN iff `x` is NaN, for a finite non-zero literal `c`. + // `c / x` is rejected because it is not NaN-preserving (e.g. `0 / 0` is + // NaN while `0` is not), so the column must be the dividend (left side). + Operator::Divide => { + if is_finite_nonzero_literal(right) { + resolve_nan_preserving_reference(left) + } else { + None + } } - }; - if is_literal(&binary.right) { - // - resolve_nan_preserving_reference(&binary.left) - } else if literal_allowed_left && is_literal(&binary.left) { - // - resolve_nan_preserving_reference(&binary.right) - } else { - None + _ => None, } } -/// Returns the literal's numeric value if `expr` is a finite numeric literal. -fn finite_literal(expr: &Expr) -> Option { +/// Returns `true` if `expr` is a finite numeric literal (not infinite or NaN). +fn is_finite_literal(expr: &Expr) -> bool { + matches!(literal_as_f64(expr), Some(v) if v.is_finite()) +} + +/// Returns `true` if `expr` is a finite, non-zero numeric literal. +fn is_finite_nonzero_literal(expr: &Expr) -> bool { + matches!(literal_as_f64(expr), Some(v) if v.is_finite() && v != 0.0) +} + +/// Returns the value of `expr` as an `f64` if it is a numeric literal. Used only +/// to inspect the finiteness and sign of literals (precision loss is irrelevant). +fn literal_as_f64(expr: &Expr) -> Option { match expr { - Expr::Literal(value, _) => scalar_value_as_f64(value).filter(|v| v.is_finite()), + Expr::Literal(value, _) => scalar_value_as_f64(value), _ => None, } } -/// Returns the literal's numeric value if `expr` is a finite, non-zero numeric literal. -fn finite_nonzero_literal(expr: &Expr) -> Option { - finite_literal(expr).filter(|v| *v != 0.0) -} - -/// Extracts a numeric [`ScalarValue`] as an `f64`, used only to inspect -/// finiteness and sign of literals (precision loss is irrelevant here). +/// Extracts a numeric [`ScalarValue`] as an `f64`. fn scalar_value_as_f64(value: &ScalarValue) -> Option { match value { ScalarValue::Int8(Some(v)) => Some(*v as f64), From b289e4fac1da7dc11e8a690f4f44c062a37a7862 Mon Sep 17 00:00:00 2001 From: Xin Huang Date: Thu, 4 Jun 2026 18:34:59 -0700 Subject: [PATCH 03/13] refactor(datafusion): defer abs scalar-function arg to a follow-up Narrow this PR's scope to references, negation, numeric casts and arithmetic with finite literals. Resolving scalar-function arguments such as `abs(x)` is left for a separate PR. `isnan(abs(x))` is no longer pushed down (covered by the unsupported-args test). --- .../src/physical_plan/expr_to_predicate.rs | 21 +++++++------------ 1 file changed, 7 insertions(+), 14 deletions(-) diff --git a/crates/integrations/datafusion/src/physical_plan/expr_to_predicate.rs b/crates/integrations/datafusion/src/physical_plan/expr_to_predicate.rs index 49013a13a5..6947448cde 100644 --- a/crates/integrations/datafusion/src/physical_plan/expr_to_predicate.rs +++ b/crates/integrations/datafusion/src/physical_plan/expr_to_predicate.rs @@ -249,7 +249,6 @@ fn scalar_function_to_iceberg_predicate(func_name: &str, args: &[Expr]) -> Trans /// wrapped column is NaN — so both `isnan(arg)` and `NOT isnan(arg)` are sound: /// /// * negation: `-x` is NaN iff `x` is NaN -/// * `abs(x)`: `abs(x)` is NaN iff `x` is NaN /// * casts between numeric types preserve NaN /// * `x + c`, `c + x`, `x - c`, `c - x` for a finite literal `c` /// * `x * c`, `c * x`, `x / c` for a finite, non-zero literal `c` @@ -257,6 +256,9 @@ fn scalar_function_to_iceberg_predicate(func_name: &str, args: &[Expr]) -> Trans /// Multiplication/division by zero and `c / x` are intentionally rejected: e.g. /// `x * 0` is NaN when `x` is `±inf`, so it does not imply `x` is NaN. /// +/// Scalar-function arguments (e.g. `abs(x)`) are intentionally left for a +/// follow-up; only references, negation, casts and arithmetic are resolved here. +/// /// [`IcebergTableProvider::supports_filters_pushdown`]: crate::table::IcebergTableProvider fn resolve_nan_preserving_reference(expr: &Expr) -> Option { match expr { @@ -270,11 +272,6 @@ fn resolve_nan_preserving_reference(expr: &Expr) -> Option { } resolve_nan_preserving_reference(&cast.expr) } - Expr::ScalarFunction(ScalarFunction { func, args }) - if func.name() == "abs" && args.len() == 1 => - { - resolve_nan_preserving_reference(&args[0]) - } Expr::BinaryExpr(binary) => resolve_nan_preserving_binary(binary), _ => None, } @@ -865,13 +862,6 @@ mod tests { assert_eq!(predicate, !Reference::new("qux").is_nan()); } - #[test] - fn test_predicate_conversion_with_isnan_abs() { - // abs(x) is NaN iff x is NaN - let predicate = convert_to_iceberg_predicate("isnan(abs(qux))").unwrap(); - assert_eq!(predicate, Reference::new("qux").is_nan()); - } - #[test] fn test_predicate_conversion_with_isnan_additive() { // x + c, c + x, x - c, c - x are NaN iff x is NaN (for finite c) @@ -899,7 +889,7 @@ mod tests { #[test] fn test_predicate_conversion_with_isnan_nested_expr() { // Nested NaN-preserving transformations resolve to the inner column - let predicate = convert_to_iceberg_predicate("isnan(-(abs(qux) + 1) * 3)").unwrap(); + let predicate = convert_to_iceberg_predicate("isnan(-(qux + 1) * 3)").unwrap(); assert_eq!(predicate, Reference::new("qux").is_nan()); } @@ -928,6 +918,9 @@ mod tests { // single column reference. assert_eq!(convert_to_iceberg_predicate("isnan(qux + foo)"), None); + // Scalar-function arguments (e.g. abs) are left for a follow-up. + assert_eq!(convert_to_iceberg_predicate("isnan(abs(qux))"), None); + // Unknown scalar functions are not pushed down. assert_eq!(convert_to_iceberg_predicate("isnan(sqrt(qux))"), None); } From b33d08dfb51523763cd25d1448ccc2585f11f5ed Mon Sep 17 00:00:00 2001 From: Xin Huang Date: Thu, 4 Jun 2026 18:36:26 -0700 Subject: [PATCH 04/13] docs(datafusion): add TODO for deferred scalar-function arg support --- .../datafusion/src/physical_plan/expr_to_predicate.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/crates/integrations/datafusion/src/physical_plan/expr_to_predicate.rs b/crates/integrations/datafusion/src/physical_plan/expr_to_predicate.rs index 6947448cde..5e6db5d789 100644 --- a/crates/integrations/datafusion/src/physical_plan/expr_to_predicate.rs +++ b/crates/integrations/datafusion/src/physical_plan/expr_to_predicate.rs @@ -273,6 +273,8 @@ fn resolve_nan_preserving_reference(expr: &Expr) -> Option { resolve_nan_preserving_reference(&cast.expr) } Expr::BinaryExpr(binary) => resolve_nan_preserving_binary(binary), + // TODO(#2154): resolve NaN-preserving scalar-function arguments such as + // `abs(x)` (and similar, e.g. `signum`). _ => None, } } From 4c922913fe124814bc4357e75414fd959d553bd1 Mon Sep 17 00:00:00 2001 From: Xin Huang Date: Fri, 5 Jun 2026 11:23:23 -0700 Subject: [PATCH 05/13] feat(datafusion): support abs argument and fold literal helper Re-add `abs(x)` as a NaN-preserving argument to `isnan`, and merge the literal f64 extraction into a single `literal_as_f64` helper. No other behavior change. --- .../src/physical_plan/expr_to_predicate.rs | 34 +++++++++---------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/crates/integrations/datafusion/src/physical_plan/expr_to_predicate.rs b/crates/integrations/datafusion/src/physical_plan/expr_to_predicate.rs index 5e6db5d789..cfadfcb555 100644 --- a/crates/integrations/datafusion/src/physical_plan/expr_to_predicate.rs +++ b/crates/integrations/datafusion/src/physical_plan/expr_to_predicate.rs @@ -249,6 +249,7 @@ fn scalar_function_to_iceberg_predicate(func_name: &str, args: &[Expr]) -> Trans /// wrapped column is NaN — so both `isnan(arg)` and `NOT isnan(arg)` are sound: /// /// * negation: `-x` is NaN iff `x` is NaN +/// * `abs(x)`: `abs(x)` is NaN iff `x` is NaN /// * casts between numeric types preserve NaN /// * `x + c`, `c + x`, `x - c`, `c - x` for a finite literal `c` /// * `x * c`, `c * x`, `x / c` for a finite, non-zero literal `c` @@ -256,9 +257,6 @@ fn scalar_function_to_iceberg_predicate(func_name: &str, args: &[Expr]) -> Trans /// Multiplication/division by zero and `c / x` are intentionally rejected: e.g. /// `x * 0` is NaN when `x` is `±inf`, so it does not imply `x` is NaN. /// -/// Scalar-function arguments (e.g. `abs(x)`) are intentionally left for a -/// follow-up; only references, negation, casts and arithmetic are resolved here. -/// /// [`IcebergTableProvider::supports_filters_pushdown`]: crate::table::IcebergTableProvider fn resolve_nan_preserving_reference(expr: &Expr) -> Option { match expr { @@ -272,9 +270,12 @@ fn resolve_nan_preserving_reference(expr: &Expr) -> Option { } resolve_nan_preserving_reference(&cast.expr) } + Expr::ScalarFunction(ScalarFunction { func, args }) + if func.name() == "abs" && args.len() == 1 => + { + resolve_nan_preserving_reference(&args[0]) + } Expr::BinaryExpr(binary) => resolve_nan_preserving_binary(binary), - // TODO(#2154): resolve NaN-preserving scalar-function arguments such as - // `abs(x)` (and similar, e.g. `signum`). _ => None, } } @@ -338,14 +339,9 @@ fn is_finite_nonzero_literal(expr: &Expr) -> bool { /// Returns the value of `expr` as an `f64` if it is a numeric literal. Used only /// to inspect the finiteness and sign of literals (precision loss is irrelevant). fn literal_as_f64(expr: &Expr) -> Option { - match expr { - Expr::Literal(value, _) => scalar_value_as_f64(value), - _ => None, - } -} - -/// Extracts a numeric [`ScalarValue`] as an `f64`. -fn scalar_value_as_f64(value: &ScalarValue) -> Option { + let Expr::Literal(value, _) = expr else { + return None; + }; match value { ScalarValue::Int8(Some(v)) => Some(*v as f64), ScalarValue::Int16(Some(v)) => Some(*v as f64), @@ -864,6 +860,13 @@ mod tests { assert_eq!(predicate, !Reference::new("qux").is_nan()); } + #[test] + fn test_predicate_conversion_with_isnan_abs() { + // abs(x) is NaN iff x is NaN + let predicate = convert_to_iceberg_predicate("isnan(abs(qux))").unwrap(); + assert_eq!(predicate, Reference::new("qux").is_nan()); + } + #[test] fn test_predicate_conversion_with_isnan_additive() { // x + c, c + x, x - c, c - x are NaN iff x is NaN (for finite c) @@ -891,7 +894,7 @@ mod tests { #[test] fn test_predicate_conversion_with_isnan_nested_expr() { // Nested NaN-preserving transformations resolve to the inner column - let predicate = convert_to_iceberg_predicate("isnan(-(qux + 1) * 3)").unwrap(); + let predicate = convert_to_iceberg_predicate("isnan(-(abs(qux) + 1) * 3)").unwrap(); assert_eq!(predicate, Reference::new("qux").is_nan()); } @@ -920,9 +923,6 @@ mod tests { // single column reference. assert_eq!(convert_to_iceberg_predicate("isnan(qux + foo)"), None); - // Scalar-function arguments (e.g. abs) are left for a follow-up. - assert_eq!(convert_to_iceberg_predicate("isnan(abs(qux))"), None); - // Unknown scalar functions are not pushed down. assert_eq!(convert_to_iceberg_predicate("isnan(sqrt(qux))"), None); } From b18874f1be6ca265565e2884b1f7d2629fc93363 Mon Sep 17 00:00:00 2001 From: Xin Huang Date: Fri, 5 Jun 2026 11:27:58 -0700 Subject: [PATCH 06/13] refactor(datafusion): use ScalarValue::cast_to for literal f64 extraction Replace the hand-rolled numeric type match with DataFusion's `ScalarValue::cast_to(Float64)`. Less code and covers all numeric literal types (e.g. decimals) without enumerating them. No behavior change for the supported cases. --- .../src/physical_plan/expr_to_predicate.rs | 18 +++++------------- 1 file changed, 5 insertions(+), 13 deletions(-) diff --git a/crates/integrations/datafusion/src/physical_plan/expr_to_predicate.rs b/crates/integrations/datafusion/src/physical_plan/expr_to_predicate.rs index cfadfcb555..f184c9eb58 100644 --- a/crates/integrations/datafusion/src/physical_plan/expr_to_predicate.rs +++ b/crates/integrations/datafusion/src/physical_plan/expr_to_predicate.rs @@ -336,23 +336,15 @@ fn is_finite_nonzero_literal(expr: &Expr) -> bool { matches!(literal_as_f64(expr), Some(v) if v.is_finite() && v != 0.0) } -/// Returns the value of `expr` as an `f64` if it is a numeric literal. Used only -/// to inspect the finiteness and sign of literals (precision loss is irrelevant). +/// Returns the value of `expr` as an `f64` if it is a numeric literal, delegating +/// the numeric conversion to DataFusion's [`ScalarValue::cast_to`]. Used only to +/// inspect the finiteness and sign of literals (precision loss is irrelevant). fn literal_as_f64(expr: &Expr) -> Option { let Expr::Literal(value, _) = expr else { return None; }; - match value { - ScalarValue::Int8(Some(v)) => Some(*v as f64), - ScalarValue::Int16(Some(v)) => Some(*v as f64), - ScalarValue::Int32(Some(v)) => Some(*v as f64), - ScalarValue::Int64(Some(v)) => Some(*v as f64), - ScalarValue::UInt8(Some(v)) => Some(*v as f64), - ScalarValue::UInt16(Some(v)) => Some(*v as f64), - ScalarValue::UInt32(Some(v)) => Some(*v as f64), - ScalarValue::UInt64(Some(v)) => Some(*v as f64), - ScalarValue::Float32(Some(v)) => Some(*v as f64), - ScalarValue::Float64(Some(v)) => Some(*v), + match value.cast_to(&DataType::Float64).ok()? { + ScalarValue::Float64(Some(v)) => Some(v), _ => None, } } From 3675bd1bd310a409fdd90cdde36b2706a9eabddb Mon Sep 17 00:00:00 2001 From: Xin Huang Date: Fri, 5 Jun 2026 11:29:25 -0700 Subject: [PATCH 07/13] refactor(datafusion): build IsNan via Reference::is_nan builder Use the existing `Reference::is_nan()` builder instead of constructing the `Predicate::Unary(UnaryExpression::new(...))` by hand. --- .../datafusion/src/physical_plan/expr_to_predicate.rs | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/crates/integrations/datafusion/src/physical_plan/expr_to_predicate.rs b/crates/integrations/datafusion/src/physical_plan/expr_to_predicate.rs index f184c9eb58..7cecc1e44a 100644 --- a/crates/integrations/datafusion/src/physical_plan/expr_to_predicate.rs +++ b/crates/integrations/datafusion/src/physical_plan/expr_to_predicate.rs @@ -226,10 +226,7 @@ fn to_iceberg_operation(op: Operator) -> OpTransformedResult { fn scalar_function_to_iceberg_predicate(func_name: &str, args: &[Expr]) -> TransformedResult { match func_name { "isnan" if args.len() == 1 => match resolve_nan_preserving_reference(&args[0]) { - Some(r) => TransformedResult::Predicate(Predicate::Unary(UnaryExpression::new( - PredicateOperator::IsNan, - r, - ))), + Some(r) => TransformedResult::Predicate(r.is_nan()), None => TransformedResult::NotTransformed, }, _ => TransformedResult::NotTransformed, From 8814873d74c55554323b3bd43c664ad156e5e5d3 Mon Sep 17 00:00:00 2001 From: Xin Huang Date: Fri, 5 Jun 2026 11:34:42 -0700 Subject: [PATCH 08/13] refactor(datafusion): collapse literal helpers into a single finite_literal Fold the finiteness check into `finite_literal` and inline the non-zero check at the multiply/divide call sites, removing the two thin boolean wrappers. --- .../src/physical_plan/expr_to_predicate.rs | 31 +++++++------------ 1 file changed, 11 insertions(+), 20 deletions(-) diff --git a/crates/integrations/datafusion/src/physical_plan/expr_to_predicate.rs b/crates/integrations/datafusion/src/physical_plan/expr_to_predicate.rs index 7cecc1e44a..e6a89c6d06 100644 --- a/crates/integrations/datafusion/src/physical_plan/expr_to_predicate.rs +++ b/crates/integrations/datafusion/src/physical_plan/expr_to_predicate.rs @@ -286,9 +286,9 @@ fn resolve_nan_preserving_binary(binary: &BinaryExpr) -> Option { // `x + c`, `c + x`, `x - c` and `c - x` are NaN iff `x` is NaN, for any // finite literal `c`. The column may be on either side. Operator::Plus | Operator::Minus => { - if is_finite_literal(right) { + if finite_literal(right).is_some() { resolve_nan_preserving_reference(left) - } else if is_finite_literal(left) { + } else if finite_literal(left).is_some() { resolve_nan_preserving_reference(right) } else { None @@ -299,9 +299,9 @@ fn resolve_nan_preserving_binary(binary: &BinaryExpr) -> Option { // non-zero: `±inf * 0` is NaN even though `±inf` is not. The column may // be on either side. Operator::Multiply => { - if is_finite_nonzero_literal(right) { + if matches!(finite_literal(right), Some(c) if c != 0.0) { resolve_nan_preserving_reference(left) - } else if is_finite_nonzero_literal(left) { + } else if matches!(finite_literal(left), Some(c) if c != 0.0) { resolve_nan_preserving_reference(right) } else { None @@ -312,7 +312,7 @@ fn resolve_nan_preserving_binary(binary: &BinaryExpr) -> Option { // `c / x` is rejected because it is not NaN-preserving (e.g. `0 / 0` is // NaN while `0` is not), so the column must be the dividend (left side). Operator::Divide => { - if is_finite_nonzero_literal(right) { + if matches!(finite_literal(right), Some(c) if c != 0.0) { resolve_nan_preserving_reference(left) } else { None @@ -323,25 +323,16 @@ fn resolve_nan_preserving_binary(binary: &BinaryExpr) -> Option { } } -/// Returns `true` if `expr` is a finite numeric literal (not infinite or NaN). -fn is_finite_literal(expr: &Expr) -> bool { - matches!(literal_as_f64(expr), Some(v) if v.is_finite()) -} - -/// Returns `true` if `expr` is a finite, non-zero numeric literal. -fn is_finite_nonzero_literal(expr: &Expr) -> bool { - matches!(literal_as_f64(expr), Some(v) if v.is_finite() && v != 0.0) -} - -/// Returns the value of `expr` as an `f64` if it is a numeric literal, delegating -/// the numeric conversion to DataFusion's [`ScalarValue::cast_to`]. Used only to -/// inspect the finiteness and sign of literals (precision loss is irrelevant). -fn literal_as_f64(expr: &Expr) -> Option { +/// Returns the value of `expr` as an `f64` if it is a finite numeric literal +/// (i.e. not a non-literal, non-numeric, or infinite/NaN value). The numeric +/// conversion is delegated to DataFusion's [`ScalarValue::cast_to`]; the value +/// is only used to inspect finiteness and sign (precision loss is irrelevant). +fn finite_literal(expr: &Expr) -> Option { let Expr::Literal(value, _) = expr else { return None; }; match value.cast_to(&DataType::Float64).ok()? { - ScalarValue::Float64(Some(v)) => Some(v), + ScalarValue::Float64(Some(v)) if v.is_finite() => Some(v), _ => None, } } From a7e81bc6d16b2f703ac53dd51d1f390763507f37 Mon Sep 17 00:00:00 2001 From: Xin Huang Date: Thu, 11 Jun 2026 17:57:52 -0700 Subject: [PATCH 09/13] docs(datafusion): clarify IEEE-754 rationale in arithmetic comments Address review feedback: explain the multiply/divide rejection using explicit IEEE-754 facts (inf is not NaN but inf * 0 is NaN; 0 is not NaN but 0 / 0 is NaN) instead of the previous terse wording. --- .../src/physical_plan/expr_to_predicate.rs | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/crates/integrations/datafusion/src/physical_plan/expr_to_predicate.rs b/crates/integrations/datafusion/src/physical_plan/expr_to_predicate.rs index e6a89c6d06..b8d2076097 100644 --- a/crates/integrations/datafusion/src/physical_plan/expr_to_predicate.rs +++ b/crates/integrations/datafusion/src/physical_plan/expr_to_predicate.rs @@ -296,8 +296,10 @@ fn resolve_nan_preserving_binary(binary: &BinaryExpr) -> Option { } // `x * c` and `c * x` are NaN iff `x` is NaN, but only when `c` is - // non-zero: `±inf * 0` is NaN even though `±inf` is not. The column may - // be on either side. + // non-zero. Per IEEE-754: + // - inf is not NaN + // - inf * 0 is NaN + // so multiplying by zero is rejected. The column may be on either side. Operator::Multiply => { if matches!(finite_literal(right), Some(c) if c != 0.0) { resolve_nan_preserving_reference(left) @@ -309,8 +311,11 @@ fn resolve_nan_preserving_binary(binary: &BinaryExpr) -> Option { } // `x / c` is NaN iff `x` is NaN, for a finite non-zero literal `c`. - // `c / x` is rejected because it is not NaN-preserving (e.g. `0 / 0` is - // NaN while `0` is not), so the column must be the dividend (left side). + // `c / x` is rejected and the column must be the dividend (left side). + // Per IEEE-754: + // - 0 is not NaN + // - 0 / 0 is NaN + // so `c / x` is not NaN-preserving. Operator::Divide => { if matches!(finite_literal(right), Some(c) if c != 0.0) { resolve_nan_preserving_reference(left) From 517f91b3ad4f40f979b9bba9fd59c45708a9907d Mon Sep 17 00:00:00 2001 From: Xin Huang Date: Fri, 12 Jun 2026 14:44:22 -0700 Subject: [PATCH 10/13] docs(datafusion): note why two-column arithmetic is not resolved for isnan pushdown --- .../datafusion/src/physical_plan/expr_to_predicate.rs | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/crates/integrations/datafusion/src/physical_plan/expr_to_predicate.rs b/crates/integrations/datafusion/src/physical_plan/expr_to_predicate.rs index b8d2076097..d6e0b354f1 100644 --- a/crates/integrations/datafusion/src/physical_plan/expr_to_predicate.rs +++ b/crates/integrations/datafusion/src/physical_plan/expr_to_predicate.rs @@ -280,6 +280,14 @@ fn resolve_nan_preserving_reference(expr: &Expr) -> Option { /// Resolves the column reference from an arithmetic expression that combines a /// single column with a finite literal while preserving NaN-ness. See /// [`resolve_nan_preserving_reference`] for the soundness argument. +/// +/// Expressions with column references on both sides (e.g. `(x + 1) * (x - 2)`) +/// are not supported. Handling them safely would require both operands to +/// resolve to the *same* column (`x + y` cannot be expressed as a single +/// `col IS NAN`) and the operator combination itself to be NaN-preserving: +/// `(x + 1) * (x - 2)` is NaN iff `x` is NaN, but `(x + 1) - (x - 2)` is NaN +/// for `x = inf` (`inf - inf`) even though `x` is not. Left for a potential +/// follow-up. fn resolve_nan_preserving_binary(binary: &BinaryExpr) -> Option { let (left, right) = (&binary.left, &binary.right); match binary.op { From 674fb441b4e38cb7aca37b632922630432760128 Mon Sep 17 00:00:00 2001 From: Xin Huang Date: Fri, 12 Jun 2026 15:03:14 -0700 Subject: [PATCH 11/13] docs(datafusion): link two-column arithmetic follow-up to issue 2154 --- .../datafusion/src/physical_plan/expr_to_predicate.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/integrations/datafusion/src/physical_plan/expr_to_predicate.rs b/crates/integrations/datafusion/src/physical_plan/expr_to_predicate.rs index d6e0b354f1..9a0db552ae 100644 --- a/crates/integrations/datafusion/src/physical_plan/expr_to_predicate.rs +++ b/crates/integrations/datafusion/src/physical_plan/expr_to_predicate.rs @@ -287,7 +287,7 @@ fn resolve_nan_preserving_reference(expr: &Expr) -> Option { /// `col IS NAN`) and the operator combination itself to be NaN-preserving: /// `(x + 1) * (x - 2)` is NaN iff `x` is NaN, but `(x + 1) - (x - 2)` is NaN /// for `x = inf` (`inf - inf`) even though `x` is not. Left for a potential -/// follow-up. +/// follow-up, tracked in . fn resolve_nan_preserving_binary(binary: &BinaryExpr) -> Option { let (left, right) = (&binary.left, &binary.right); match binary.op { From 366c47267d88fc88f8c184728b2cbbf40396a649 Mon Sep 17 00:00:00 2001 From: Xin Huang Date: Fri, 12 Jun 2026 15:53:00 -0700 Subject: [PATCH 12/13] docs(datafusion): use TODO(issue-2154) form for two-column arithmetic follow-up --- .../datafusion/src/physical_plan/expr_to_predicate.rs | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/crates/integrations/datafusion/src/physical_plan/expr_to_predicate.rs b/crates/integrations/datafusion/src/physical_plan/expr_to_predicate.rs index 9a0db552ae..c84f3cd4ef 100644 --- a/crates/integrations/datafusion/src/physical_plan/expr_to_predicate.rs +++ b/crates/integrations/datafusion/src/physical_plan/expr_to_predicate.rs @@ -286,8 +286,10 @@ fn resolve_nan_preserving_reference(expr: &Expr) -> Option { /// resolve to the *same* column (`x + y` cannot be expressed as a single /// `col IS NAN`) and the operator combination itself to be NaN-preserving: /// `(x + 1) * (x - 2)` is NaN iff `x` is NaN, but `(x + 1) - (x - 2)` is NaN -/// for `x = inf` (`inf - inf`) even though `x` is not. Left for a potential -/// follow-up, tracked in . +/// for `x = inf` (`inf - inf`) even though `x` is not. +/// +/// TODO(): support +/// NaN-preserving expressions with column references on both sides. fn resolve_nan_preserving_binary(binary: &BinaryExpr) -> Option { let (left, right) = (&binary.left, &binary.right); match binary.op { From 5743c7a92a5edbe7e8d7f3dd2076ccc612649492 Mon Sep 17 00:00:00 2001 From: Xin Huang Date: Fri, 12 Jun 2026 15:54:36 -0700 Subject: [PATCH 13/13] docs(datafusion): match repo TODO style for issue-2154 follow-up note --- .../datafusion/src/physical_plan/expr_to_predicate.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/crates/integrations/datafusion/src/physical_plan/expr_to_predicate.rs b/crates/integrations/datafusion/src/physical_plan/expr_to_predicate.rs index c84f3cd4ef..f0e6639243 100644 --- a/crates/integrations/datafusion/src/physical_plan/expr_to_predicate.rs +++ b/crates/integrations/datafusion/src/physical_plan/expr_to_predicate.rs @@ -288,8 +288,8 @@ fn resolve_nan_preserving_reference(expr: &Expr) -> Option { /// `(x + 1) * (x - 2)` is NaN iff `x` is NaN, but `(x + 1) - (x - 2)` is NaN /// for `x = inf` (`inf - inf`) even though `x` is not. /// -/// TODO(): support -/// NaN-preserving expressions with column references on both sides. +/// TODO: support NaN-preserving expressions with column references on both +/// sides, see . fn resolve_nan_preserving_binary(binary: &BinaryExpr) -> Option { let (left, right) = (&binary.left, &binary.right); match binary.op {