From 9b3f482f93343bfbb7a0bd0190d39840af3ee559 Mon Sep 17 00:00:00 2001 From: HairstonE Date: Thu, 11 Jun 2026 13:04:45 -0400 Subject: [PATCH 1/3] keep one column of the mark so it stays qualified + unit test --- .../optimizer/src/optimize_projections/mod.rs | 69 ++++++++++++++++++- 1 file changed, 67 insertions(+), 2 deletions(-) diff --git a/datafusion/optimizer/src/optimize_projections/mod.rs b/datafusion/optimizer/src/optimize_projections/mod.rs index b9f22a3f9e52d..9fd8d896002f6 100644 --- a/datafusion/optimizer/src/optimize_projections/mod.rs +++ b/datafusion/optimizer/src/optimize_projections/mod.rs @@ -780,12 +780,12 @@ fn split_join_requirements( // The mark column is synthetic (produced by the join itself), // so discard it and route only to the left child. let (left_indices, _mark) = indices.split_off(left_len); - (left_indices, RequiredIndices::new()) + (left_indices, RequiredIndices::new().append(&[0])) } JoinType::RightMark => { // Same as LeftMark, but for the right child. let (right_indices, _mark) = indices.split_off(right_len); - (RequiredIndices::new(), right_indices) + (RequiredIndices::new().append(&[0]), right_indices) } // All requirements can be re-routed to left child directly. JoinType::LeftAnti | JoinType::LeftSemi => (indices, RequiredIndices::new()), @@ -2462,6 +2462,71 @@ mod tests { ) } + // Regression test for https://github.com/apache/datafusion/issues/22477 + // `= ANY` / `<> ALL` decorrelate into several stacked LeftMark joins whose + // (pushed-down) filters reference no right column. OptimizeProjections must + // not prune each mark join's right child to zero columns: a zero-column + // child has no table reference, so `mark_field` would emit an unqualified + // `mark` and the stacked marks would collide on the duplicate bare name. + // Here each mark stays qualified (`__correlated_sq_N.mark`) and the plan + // optimizes without error. 
+ #[test] + fn optimize_projections_stacked_mark_joins_keep_qualified_mark() -> Result<()> { + let person = test_table_scan_with_name("person")?; + + let aliased_scan = |table: &str, alias: &str| -> Result<LogicalPlan> { + LogicalPlanBuilder::from(test_table_scan_with_name(table)?) + .project(vec![col(format!("{table}.a"))])? + .alias(alias)? + .build() + }; + + // Three stacked LeftMark joins with trivially-`true` filters (the + // filter-less shape left behind by push_down_filter), feeding a filter + // that references all three marks. + let plan = LogicalPlanBuilder::from(person) + .join_on( + aliased_scan("s1", "__correlated_sq_1")?, + JoinType::LeftMark, + vec![lit(true)], + )? + .join_on( + aliased_scan("s2", "__correlated_sq_2")?, + JoinType::LeftMark, + vec![lit(true)], + )? + .join_on( + aliased_scan("s3", "__correlated_sq_3")?, + JoinType::LeftMark, + vec![lit(true)], + )? + .filter( + col("__correlated_sq_1.mark") + .or(col("__correlated_sq_2.mark")) + .and(not(col("__correlated_sq_3.mark"))), + )? + .project(vec![col("person.a")])? 
+ .build()?; + + assert_optimized_plan_equal!( + plan, + @r" + Projection: person.a + Filter: (__correlated_sq_1.mark OR __correlated_sq_2.mark) AND NOT __correlated_sq_3.mark + LeftMark Join: Filter: Boolean(true) + LeftMark Join: Filter: Boolean(true) + LeftMark Join: Filter: Boolean(true) + TableScan: person projection=[a] + SubqueryAlias: __correlated_sq_1 + TableScan: s1 projection=[a] + SubqueryAlias: __correlated_sq_2 + TableScan: s2 projection=[a] + SubqueryAlias: __correlated_sq_3 + TableScan: s3 projection=[a] + " + ) + } + fn observe(_plan: &LogicalPlan, _rule: &dyn OptimizerRule) {} fn optimize(plan: LogicalPlan) -> Result<LogicalPlan> { From b4a30efc9b2509af1e9a8519d25e9307eca1afb4 Mon Sep 17 00:00:00 2001 From: HairstonE Date: Thu, 11 Jun 2026 13:05:01 -0400 Subject: [PATCH 2/3] .slt regression coverage --- .../sqllogictest/test_files/subquery.slt | 26 +++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/datafusion/sqllogictest/test_files/subquery.slt b/datafusion/sqllogictest/test_files/subquery.slt index 25f124f217cbf..01cbcafa202e8 100644 --- a/datafusion/sqllogictest/test_files/subquery.slt +++ b/datafusion/sqllogictest/test_files/subquery.slt @@ -1598,6 +1598,32 @@ logical_plan 21)----------Projection: column1 AS v 22)------------Values: (Int64(5)), (Int64(NULL)) +# Regression test for https://github.com/apache/datafusion/issues/22477 +# `= ANY` / `<> ALL` over the SAME table decorrelate into several stacked +# LeftMark joins. After push_down_filter the join filters reference no right +# column, so the mark joins are filter-less. optimize_projections must not prune +# each mark join's right child to zero columns (which would make the synthetic +# `mark` unqualified and collide across the stack). Previously failed planning +# with "Schema contains duplicate unqualified field name mark". 
+statement ok +create table set_cmp_self(id int, age int) as values (1, 20), (2, 30), (3, 40); + +# every row's age is in {20, 30, 40}, so `= ANY` holds for all rows +query I rowsort +select id from set_cmp_self where age = any(select age from set_cmp_self); +---- +1 +2 +3 + +# each row's age equals itself in the set, so `<> ALL` is false for every row +query I +select id from set_cmp_self where age <> all(select age from set_cmp_self); +---- + +statement count 0 +drop table set_cmp_self; + # correlated_recursive_scalar_subquery_with_level_3_exists_subquery_referencing_level1_relation query TT explain select c_custkey from customer From 6dd438d659a6853960323dd4044f3fae5dce9175 Mon Sep 17 00:00:00 2001 From: HairstonE Date: Thu, 11 Jun 2026 13:15:44 -0400 Subject: [PATCH 3/3] cutting comments on unit and .slt tests --- .../optimizer/src/optimize_projections/mod.rs | 13 ++----------- datafusion/sqllogictest/test_files/subquery.slt | 11 ++--------- 2 files changed, 4 insertions(+), 20 deletions(-) diff --git a/datafusion/optimizer/src/optimize_projections/mod.rs b/datafusion/optimizer/src/optimize_projections/mod.rs index 9fd8d896002f6..da7dd13598204 100644 --- a/datafusion/optimizer/src/optimize_projections/mod.rs +++ b/datafusion/optimizer/src/optimize_projections/mod.rs @@ -2462,14 +2462,8 @@ mod tests { ) } - // Regression test for https://github.com/apache/datafusion/issues/22477 - // `= ANY` / `<> ALL` decorrelate into several stacked LeftMark joins whose - // (pushed-down) filters reference no right column. OptimizeProjections must - // not prune each mark join's right child to zero columns: a zero-column - // child has no table reference, so `mark_field` would emit an unqualified - // `mark` and the stacked marks would collide on the duplicate bare name. - // Here each mark stays qualified (`__correlated_sq_N.mark`) and the plan - // optimizes without error. 
+ // Stacked filter-less LeftMark joins (from `= ANY` / `<> ALL`) must keep + // each `mark` qualified so they don't collide. #[test] fn optimize_projections_stacked_mark_joins_keep_qualified_mark() -> Result<()> { let person = test_table_scan_with_name("person")?; @@ -2481,9 +2475,6 @@ mod tests { .build() }; - // Three stacked LeftMark joins with trivially-`true` filters (the - // filter-less shape left behind by push_down_filter), feeding a filter - // that references all three marks. let plan = LogicalPlanBuilder::from(person) .join_on( aliased_scan("s1", "__correlated_sq_1")?, diff --git a/datafusion/sqllogictest/test_files/subquery.slt b/datafusion/sqllogictest/test_files/subquery.slt index 01cbcafa202e8..89e495b3335b4 100644 --- a/datafusion/sqllogictest/test_files/subquery.slt +++ b/datafusion/sqllogictest/test_files/subquery.slt @@ -1598,17 +1598,11 @@ logical_plan 21)----------Projection: column1 AS v 22)------------Values: (Int64(5)), (Int64(NULL)) -# Regression test for https://github.com/apache/datafusion/issues/22477 -# `= ANY` / `<> ALL` over the SAME table decorrelate into several stacked -# LeftMark joins. After push_down_filter the join filters reference no right -# column, so the mark joins are filter-less. optimize_projections must not prune -# each mark join's right child to zero columns (which would make the synthetic -# `mark` unqualified and collide across the stack). Previously failed planning -# with "Schema contains duplicate unqualified field name mark". +# same-table `= ANY` / `<> ALL` must plan without +# "duplicate unqualified field name mark". 
statement ok create table set_cmp_self(id int, age int) as values (1, 20), (2, 30), (3, 40); -# every row's age is in {20, 30, 40}, so `= ANY` holds for all rows query I rowsort select id from set_cmp_self where age = any(select age from set_cmp_self); ---- @@ -1616,7 +1610,6 @@ select id from set_cmp_self where age = any(select age from set_cmp_self); 2 3 -# each row's age equals itself in the set, so `<> ALL` is false for every row query I select id from set_cmp_self where age <> all(select age from set_cmp_self); ----