@@ -611,10 +611,14 @@ fn estimate_inner_join_cardinality(
611611 // The algorithm here is partly based on the non-histogram selectivity estimation
612612 // from Spark's Catalyst optimizer.
613613 let mut join_selectivity = Precision :: Absent ;
614+ let mut has_any_distinct = false ;
614615 for ( left_stat, right_stat) in left_column_statistics
615616 . iter ( )
616617 . zip ( right_column_statistics. iter ( ) )
617618 {
619+ has_any_distinct |= left_stat. distinct_count . get_value ( ) . is_some ( )
620+ || right_stat. distinct_count . get_value ( ) . is_some ( ) ;
621+
618622 let left_max_distinct = max_distinct_count ( & left_num_rows, left_stat) ;
619623 let right_max_distinct = max_distinct_count ( & right_num_rows, right_stat) ;
620624 let max_distinct = left_max_distinct. max ( & right_max_distinct) ;
@@ -631,17 +635,27 @@ fn estimate_inner_join_cardinality(
631635 // of the two inputs and normalizing it by the selectivity factor.
632636 let left_num_rows = left_stats. num_rows . get_value ( ) ?;
633637 let right_num_rows = right_stats. num_rows . get_value ( ) ?;
638+
639+ // When no actual distinct count stats are available, the selectivity
640+ // denominator falls back to max(num_rows_left, num_rows_right), which
641+ // gives cardinality = min(L, R). This severely underestimates FK joins
642+ // (e.g. warehouse(5) ⋈ catalog_sales(1.4M) → 5 instead of 1.4M).
643+ // In this case, use max(L, R) directly as a better heuristic.
644+ if !has_any_distinct {
645+ return Some ( Precision :: Inexact ( * left_num_rows. max ( right_num_rows) ) ) ;
646+ }
647+
634648 match join_selectivity {
635649 Precision :: Exact ( value) if value > 0 => {
636650 Some ( Precision :: Exact ( ( left_num_rows * right_num_rows) / value) )
637651 }
638652 Precision :: Inexact ( value) if value > 0 => {
639653 Some ( Precision :: Inexact ( ( left_num_rows * right_num_rows) / value) )
640654 }
641- // Since we don't have any information about the selectivity (which is derived
642- // from the number of distinct rows information) we can give up here for now .
643- // And let other passes handle this (otherwise we would need to produce an
644- // overestimation using just the cartesian product).
655+ // Selectivity is zero (one side has no non-null values), so the join
656+ // produces no rows.
657+ Precision :: Exact ( 0 ) => Some ( Precision :: Exact ( 0 ) ) ,
658+ Precision :: Inexact ( 0 ) => Some ( Precision :: Inexact ( 0 ) ) ,
645659 _ => None ,
646660 }
647661}
@@ -2159,22 +2173,24 @@ mod tests {
21592173 Some ( Inexact ( 10 ) ) ,
21602174 ) ,
21612175 // range(left) > range(right)
2176+ // Without distinct stats, use max(L, R) = max(10, 10) = 10
21622177 (
21632178 ( 10 , Inexact ( 6 ) , Inexact ( 10 ) , Absent , Absent ) ,
21642179 ( 10 , Inexact ( 8 ) , Inexact ( 10 ) , Absent , Absent ) ,
2165- Some ( Inexact ( 20 ) ) ,
2180+ Some ( Inexact ( 10 ) ) ,
21662181 ) ,
21672182 // range(right) > range(left)
21682183 (
21692184 ( 10 , Inexact ( 8 ) , Inexact ( 10 ) , Absent , Absent ) ,
21702185 ( 10 , Inexact ( 6 ) , Inexact ( 10 ) , Absent , Absent ) ,
2171- Some ( Inexact ( 20 ) ) ,
2186+ Some ( Inexact ( 10 ) ) ,
21722187 ) ,
21732188 // range(left) > len(left), range(right) > len(right)
2189+ // Without distinct stats, min(10, 20) = 10, so (10*20)/10 = 20
21742190 (
21752191 ( 10 , Inexact ( 1 ) , Inexact ( 15 ) , Absent , Absent ) ,
21762192 ( 20 , Inexact ( 1 ) , Inexact ( 40 ) , Absent , Absent ) ,
2177- Some ( Inexact ( 10 ) ) ,
2193+ Some ( Inexact ( 20 ) ) ,
21782194 ) ,
21792195 // Distinct count matches the range
21802196 (
@@ -2201,6 +2217,7 @@ mod tests {
22012217 Some ( Inexact ( 20 ) ) ,
22022218 ) ,
22032219 // min(left) < 0 (range(left) > range(right))
2220+ // Without distinct stats, use max(L, R) = max(10, 10) = 10
22042221 (
22052222 ( 10 , Inexact ( -5 ) , Inexact ( 5 ) , Absent , Absent ) ,
22062223 ( 10 , Inexact ( 1 ) , Inexact ( 5 ) , Absent , Absent ) ,
@@ -2222,10 +2239,11 @@ mod tests {
22222239 Some ( Inexact ( 10 ) ) ,
22232240 ) ,
22242241 // range(left) = 1, range(right) = 1
2242+ // Without distinct stats, use max(L, R) = 10
22252243 (
22262244 ( 10 , Inexact ( 1 ) , Inexact ( 1 ) , Absent , Absent ) ,
22272245 ( 10 , Inexact ( 1 ) , Inexact ( 1 ) , Absent , Absent ) ,
2228- Some ( Inexact ( 100 ) ) ,
2246+ Some ( Inexact ( 10 ) ) ,
22292247 ) ,
22302248 //
22312249 // Edge cases
@@ -2275,17 +2293,18 @@ mod tests {
22752293 ( 10 , Inexact ( 0 ) , Inexact ( 10 ) , Absent , Absent ) ,
22762294 Some ( Inexact ( 0 ) ) ,
22772295 ) ,
2278- // distinct(left) = 0, distinct(right) = 0
2296+ // distinct(left) = 0, distinct(right) = 0: no matching keys possible
22792297 (
22802298 ( 10 , Inexact ( 1 ) , Inexact ( 10 ) , Inexact ( 0 ) , Absent ) ,
22812299 ( 10 , Inexact ( 1 ) , Inexact ( 10 ) , Inexact ( 0 ) , Absent ) ,
2282- None ,
2300+ Some ( Inexact ( 0 ) ) ,
22832301 ) ,
22842302 // Inexact row count < exact null count with absent distinct count
2303+ // Without distinct stats, use max(L, R) = max(0, 10) = 10
22852304 (
22862305 ( 0 , Inexact ( 1 ) , Inexact ( 10 ) , Absent , Exact ( 5 ) ) ,
22872306 ( 10 , Inexact ( 1 ) , Inexact ( 10 ) , Absent , Absent ) ,
2288- Some ( Inexact ( 0 ) ) ,
2307+ Some ( Inexact ( 10 ) ) ,
22892308 ) ,
22902309 ] ;
22912310
0 commit comments