diff --git a/crates/paimon/src/predicate_stats.rs b/crates/paimon/src/predicate_stats.rs index cb44c729..248801ae 100644 --- a/crates/paimon/src/predicate_stats.rs +++ b/crates/paimon/src/predicate_stats.rs @@ -59,7 +59,8 @@ pub(crate) fn data_leaf_may_match( PredicateOperator::IsNotNull => { return all_null != Some(true); } - PredicateOperator::In | PredicateOperator::NotIn => { + PredicateOperator::In => {} + PredicateOperator::NotIn => { return true; } PredicateOperator::Eq @@ -95,6 +96,10 @@ pub(crate) fn data_leaf_may_match( }; match op { + PredicateOperator::In => literals.iter().any(|literal| { + !matches!(literal.partial_cmp(&min_value), Some(Ordering::Less)) + && !matches!(literal.partial_cmp(&max_value), Some(Ordering::Greater)) + }), PredicateOperator::Eq => { !matches!(literal.partial_cmp(&min_value), Some(Ordering::Less)) && !matches!(literal.partial_cmp(&max_value), Some(Ordering::Greater)) @@ -112,10 +117,7 @@ pub(crate) fn data_leaf_may_match( Some(Ordering::Less | Ordering::Equal) ), PredicateOperator::GtEq => !matches!(max_value.partial_cmp(literal), Some(Ordering::Less)), - PredicateOperator::IsNull - | PredicateOperator::IsNotNull - | PredicateOperator::In - | PredicateOperator::NotIn => true, + PredicateOperator::IsNull | PredicateOperator::IsNotNull | PredicateOperator::NotIn => true, } } diff --git a/crates/paimon/src/table/table_scan.rs b/crates/paimon/src/table/table_scan.rs index 273a5c91..d7682869 100644 --- a/crates/paimon/src/table/table_scan.rs +++ b/crates/paimon/src/table/table_scan.rs @@ -1185,6 +1185,80 @@ mod tests { )); } + #[test] + fn test_data_file_matches_in_prunes_when_all_literals_out_of_range() { + let fields = int_field(); + let file = test_data_file_meta( + int_stats_row(Some(10)), + int_stats_row(Some(20)), + vec![Some(0)], + 5, + ); + let predicate = PredicateBuilder::new(&fields) + .is_in("id", vec![Datum::Int(1), Datum::Int(30)]) + .unwrap(); + + assert!(!data_file_matches_predicates( + &file, + &[predicate], + TEST_SCHEMA_ID, + &test_schema_fields(), + )); + } + + #[test] + fn test_data_file_matches_in_keeps_when_any_literal_in_range() { + let fields = int_field(); + let file = test_data_file_meta( + int_stats_row(Some(10)), + int_stats_row(Some(20)), + vec![Some(0)], + 5, + ); + let predicate = PredicateBuilder::new(&fields) + .is_in("id", vec![Datum::Int(1), Datum::Int(15), Datum::Int(30)]) + .unwrap(); + + assert!(data_file_matches_predicates( + &file, + &[predicate], + TEST_SCHEMA_ID, + &test_schema_fields(), + )); + } + + #[test] + fn test_data_file_matches_in_prunes_all_null_file() { + let fields = int_field(); + let file = test_data_file_meta(int_stats_row(None), int_stats_row(None), vec![Some(5)], 5); + let predicate = PredicateBuilder::new(&fields) + .is_in("id", vec![Datum::Int(10)]) + .unwrap(); + + assert!(!data_file_matches_predicates( + &file, + &[predicate], + TEST_SCHEMA_ID, + &test_schema_fields(), + )); + } + + #[test] + fn test_data_file_matches_in_with_corrupt_stats_fails_open() { + let fields = int_field(); + let file = test_data_file_meta(Vec::new(), Vec::new(), vec![Some(0)], 5); + let predicate = PredicateBuilder::new(&fields) + .is_in("id", vec![Datum::Int(30)]) + .unwrap(); + + assert!(data_file_matches_predicates( + &file, + &[predicate], + TEST_SCHEMA_ID, + &test_schema_fields(), + )); + } + #[test] fn test_data_file_matches_is_null_prunes_when_null_count_is_zero() { let fields = int_field();