From 1037b86764418f34486ec2f5d128766524bab220 Mon Sep 17 00:00:00 2001 From: Gang Wu Date: Thu, 11 Jun 2026 15:47:22 +0800 Subject: [PATCH 1/6] GH-50156: [C++][Parquet] Ignore min/max for unknown column order --- cpp/src/parquet/metadata.cc | 134 ++++++++++++++++++++++------- cpp/src/parquet/metadata_test.cc | 98 +++++++++++++++++++++ cpp/src/parquet/page_index.cc | 9 ++ cpp/src/parquet/page_index_test.cc | 52 +++++++++++ cpp/src/parquet/statistics_test.cc | 7 ++ cpp/src/parquet/types.cc | 1 + cpp/src/parquet/types.h | 10 ++- 7 files changed, 279 insertions(+), 32 deletions(-) diff --git a/cpp/src/parquet/metadata.cc b/cpp/src/parquet/metadata.cc index 46cd1e442461..718c18706334 100644 --- a/cpp/src/parquet/metadata.cc +++ b/cpp/src/parquet/metadata.cc @@ -90,36 +90,111 @@ std::string ParquetVersionToString(ParquetVersion::type ver) { return "UNKNOWN"; } +namespace { + +enum class StatsMinMaxMode { + // Ignore min/max fields because their ordering is unknown or unsupported. + kDiscard, + // Use legacy min/max fields for files without column orders. + kLegacy, + // Use min_value/max_value fields with the column's well-defined order. + kNormal, +}; + +inline StatsMinMaxMode GetStatsMinMaxMode(const ColumnDescriptor& descr) { + switch (descr.column_order().get_order()) { + case ColumnOrder::TYPE_DEFINED_ORDER: + return descr.sort_order() != SortOrder::UNKNOWN ? StatsMinMaxMode::kNormal + : StatsMinMaxMode::kDiscard; + case ColumnOrder::UNDEFINED: + return descr.sort_order() != SortOrder::UNKNOWN ? StatsMinMaxMode::kLegacy + : StatsMinMaxMode::kDiscard; + case ColumnOrder::UNKNOWN: + return StatsMinMaxMode::kDiscard; + } + return StatsMinMaxMode::kDiscard; +} + +} // namespace + +static EncodedStatistics EncodedStatisticsFromThrift(const format::Statistics& statistics, + StatsMinMaxMode min_max) { + EncodedStatistics out; + + switch (min_max) { + case StatsMinMaxMode::kNormal: + if (statistics.__isset.max_value) { + out.set_max(statistics.max_value); + if (statistics.__isset.is_max_value_exact) { + out.is_max_value_exact = statistics.is_max_value_exact; + } + } + if (statistics.__isset.min_value) { + out.set_min(statistics.min_value); + if (statistics.__isset.is_min_value_exact) { + out.is_min_value_exact = statistics.is_min_value_exact; + } + } + break; + case StatsMinMaxMode::kLegacy: + if (statistics.__isset.max) { + out.set_max(statistics.max); + } + if (statistics.__isset.min) { + out.set_min(statistics.min); + } + break; + case StatsMinMaxMode::kDiscard: + break; + } + if (statistics.__isset.null_count) { + out.set_null_count(statistics.null_count); + } + if (statistics.__isset.distinct_count) { + out.set_distinct_count(statistics.distinct_count); + } + + return out; +} + template static std::shared_ptr MakeTypedColumnStats( const format::ColumnMetaData& metadata, const ColumnDescriptor* descr, ::arrow::MemoryPool* pool) { - std::optional min_exact = - metadata.statistics.__isset.is_min_value_exact - ? std::optional(metadata.statistics.is_min_value_exact) - : std::nullopt; - std::optional max_exact = - metadata.statistics.__isset.is_max_value_exact - ? std::optional(metadata.statistics.is_max_value_exact) - : std::nullopt; - // If ColumnOrder is defined, return max_value and min_value - if (descr->column_order().get_order() == ColumnOrder::TYPE_DEFINED_ORDER) { - return MakeStatistics( - descr, metadata.statistics.min_value, metadata.statistics.max_value, - metadata.num_values - metadata.statistics.null_count, - metadata.statistics.null_count, metadata.statistics.distinct_count, - metadata.statistics.__isset.max_value && metadata.statistics.__isset.min_value, - metadata.statistics.__isset.null_count, - metadata.statistics.__isset.distinct_count, min_exact, max_exact, pool); - } - // Default behavior + const auto& statistics = metadata.statistics; + const std::string kEmpty = ""; + const std::string* encoded_min = &kEmpty; + const std::string* encoded_max = &kEmpty; + bool has_min_max = false; + std::optional min_exact = std::nullopt; + std::optional max_exact = std::nullopt; + + switch (GetStatsMinMaxMode(*descr)) { + case StatsMinMaxMode::kNormal: + encoded_min = &statistics.min_value; + encoded_max = &statistics.max_value; + has_min_max = statistics.__isset.max_value && statistics.__isset.min_value; + min_exact = statistics.__isset.is_min_value_exact + ? std::optional(statistics.is_min_value_exact) + : std::nullopt; + max_exact = statistics.__isset.is_max_value_exact + ? std::optional(statistics.is_max_value_exact) + : std::nullopt; + break; + case StatsMinMaxMode::kLegacy: + encoded_min = &statistics.min; + encoded_max = &statistics.max; + has_min_max = statistics.__isset.max && statistics.__isset.min; + break; + case StatsMinMaxMode::kDiscard: + break; + } + return MakeStatistics( - descr, metadata.statistics.min, metadata.statistics.max, - metadata.num_values - metadata.statistics.null_count, - metadata.statistics.null_count, metadata.statistics.distinct_count, - metadata.statistics.__isset.max && metadata.statistics.__isset.min, - metadata.statistics.__isset.null_count, metadata.statistics.__isset.distinct_count, - min_exact, max_exact, pool); + descr, *encoded_min, *encoded_max, metadata.num_values - statistics.null_count, + statistics.null_count, statistics.distinct_count, has_min_max, + statistics.__isset.null_count, statistics.__isset.distinct_count, min_exact, + max_exact, pool); } namespace { @@ -337,11 +412,8 @@ class ColumnChunkMetaData::ColumnChunkMetaDataImpl { const std::lock_guard guard(stats_mutex_); if (possible_encoded_stats_ == nullptr) { possible_encoded_stats_ = - std::make_shared(FromThrift(column_metadata_->statistics)); - if (descr_->sort_order() == SortOrder::UNKNOWN) { - // If the column SortOrder is Unknown we can't trust max/min. - possible_encoded_stats_->ClearMinMax(); - } + std::make_shared(EncodedStatisticsFromThrift( + column_metadata_->statistics, GetStatsMinMaxMode(*descr_))); } } return writer_version_->HasCorrectStatistics(type(), *possible_encoded_stats_, @@ -1037,7 +1109,7 @@ class FileMetaData::FileMetaDataImpl { if (column_order.__isset.TYPE_ORDER) { column_orders.push_back(ColumnOrder::type_defined_); } else { - column_orders.push_back(ColumnOrder::undefined_); + column_orders.push_back(ColumnOrder::unknown_); } } } else { diff --git a/cpp/src/parquet/metadata_test.cc b/cpp/src/parquet/metadata_test.cc index 572f053179cd..b2a09be1eefb 100644 --- a/cpp/src/parquet/metadata_test.cc +++ b/cpp/src/parquet/metadata_test.cc @@ -275,6 +275,104 @@ TEST(Metadata, TestBuildAccess) { ASSERT_TRUE(f_accessor_1->Equals(*f_accessor->Subset({2, 0}))); } +namespace { + +std::string EncodeInt32(int32_t value) { + return std::string(reinterpret_cast(&value), sizeof(value)); +} + +constexpr int32_t kLegacyMin = 100, kLegacyMax = 200; + +std::string SerializeMetadata(const format::FileMetaData& thrift_metadata) { + std::string out; + ThriftSerializer{}.SerializeToString(&thrift_metadata, &out); + return out; +} + +std::shared_ptr ParseMetadata(std::string serialized_metadata) { + uint32_t decoded_len = static_cast(serialized_metadata.size()); + return FileMetaData::Make(serialized_metadata.data(), &decoded_len); +} + +format::FileMetaData SingleInt32MetadataWithStats() { + format::FileMetaData metadata; + format::SchemaElement root, leaf; + schema::NodeVector fields = {schema::Int32("int_col", Repetition::REQUIRED)}; + schema::GroupNode::Make("schema", Repetition::REPEATED, fields)->ToParquet(&root); + fields.back()->ToParquet(&leaf); + metadata.schema = {std::move(root), std::move(leaf)}; + + auto& column = metadata.row_groups.emplace_back().columns.emplace_back(); + column.__isset.meta_data = true; + auto& column_metadata = column.meta_data; + column_metadata.__set_type(format::Type::INT32); + column_metadata.__isset.statistics = true; + auto& statistics = column_metadata.statistics; + statistics.__set_min(EncodeInt32(kLegacyMin)); + statistics.__set_max(EncodeInt32(kLegacyMax)); + statistics.__set_min_value(EncodeInt32(kLegacyMin)); + statistics.__set_max_value(EncodeInt32(kLegacyMax)); + metadata.column_orders.emplace_back().__set_TYPE_ORDER(format::TypeDefinedOrder{}); + metadata.__isset.column_orders = true; + return metadata; +} + +std::unique_ptr GetOnlyColumnChunk( + const FileMetaData& metadata, ColumnOrder::type expected_order) { + EXPECT_EQ(expected_order, metadata.schema()->Column(0)->column_order().get_order()); + return metadata.RowGroup(0)->ColumnChunk(0); +} + +void AssertColumnChunkHasNoMinMax(const FileMetaData& metadata, + ColumnOrder::type expected_order) { + auto column = GetOnlyColumnChunk(metadata, expected_order); + ASSERT_NE(nullptr, column->encoded_statistics()); + EXPECT_FALSE(column->encoded_statistics()->has_min); + EXPECT_FALSE(column->encoded_statistics()->has_max); + ASSERT_NE(nullptr, column->statistics()); + EXPECT_FALSE(column->statistics()->HasMinMax()); +} + +void AssertColumnChunkMinMax(const FileMetaData& metadata, + ColumnOrder::type expected_order, int32_t min, int32_t max) { + auto column = GetOnlyColumnChunk(metadata, expected_order); + const std::string encoded_min = EncodeInt32(min); + const std::string encoded_max = EncodeInt32(max); + + ASSERT_NE(nullptr, column->encoded_statistics()); + EXPECT_EQ(encoded_min, column->encoded_statistics()->min()); + EXPECT_EQ(encoded_max, column->encoded_statistics()->max()); + ASSERT_NE(nullptr, column->statistics()); + EXPECT_EQ(encoded_min, column->statistics()->EncodeMin()); + EXPECT_EQ(encoded_max, column->statistics()->EncodeMax()); +} + +} // namespace + +TEST(Metadata, UnknownColumnOrderIgnoresMinMax) { + std::string serialized_metadata = SerializeMetadata(SingleInt32MetadataWithStats()); + const std::string kTypeDefinedOrder("\x1c\x00", 2); + const std::string kUnsupportedOrder("\x2c\x00", 2); + const auto pos = serialized_metadata.find(kTypeDefinedOrder); + ASSERT_NE(std::string::npos, pos); + serialized_metadata.replace(pos, kTypeDefinedOrder.size(), kUnsupportedOrder); + + auto metadata = ParseMetadata(serialized_metadata); + AssertColumnChunkHasNoMinMax(*metadata, ColumnOrder::UNKNOWN); +} + +TEST(Metadata, MissingColumnOrderUsesLegacyMinMax) { + format::FileMetaData thrift_metadata = SingleInt32MetadataWithStats(); + thrift_metadata.column_orders.clear(); + thrift_metadata.__isset.column_orders = false; + auto& statistics = thrift_metadata.row_groups.at(0).columns.at(0).meta_data.statistics; + statistics.__set_min_value(EncodeInt32(kLegacyMin - 100)); + statistics.__set_max_value(EncodeInt32(kLegacyMax + 100)); + + auto metadata = ParseMetadata(SerializeMetadata(thrift_metadata)); + AssertColumnChunkMinMax(*metadata, ColumnOrder::UNDEFINED, kLegacyMin, kLegacyMax); +} + TEST(Metadata, TestV1Version) { // PARQUET-839 parquet::schema::NodeVector fields; diff --git a/cpp/src/parquet/page_index.cc b/cpp/src/parquet/page_index.cc index 7434f2828da2..9556356be84e 100644 --- a/cpp/src/parquet/page_index.cc +++ b/cpp/src/parquet/page_index.cc @@ -38,6 +38,12 @@ namespace parquet { namespace { +inline bool CanTrustPageIndexMinMax(const ColumnDescriptor& descr) { + const auto column_order = descr.column_order().get_order(); + return column_order != ColumnOrder::UNKNOWN && column_order != ColumnOrder::UNDEFINED && + descr.sort_order() != SortOrder::UNKNOWN; +} + template void Decode(std::unique_ptr::Decoder>& decoder, const std::string& input, std::vector* output, @@ -973,6 +979,9 @@ std::unique_ptr ColumnIndex::Make(const ColumnDescriptor& descr, // Guard against UB when moving column_index throw ParquetException("Invalid ColumnIndex boundary_order"); } + if (!CanTrustPageIndexMinMax(descr)) { + return nullptr; + } switch (descr.physical_type()) { case Type::BOOLEAN: return std::make_unique>(descr, diff --git a/cpp/src/parquet/page_index_test.cc b/cpp/src/parquet/page_index_test.cc index 3a7308c1c6bc..df6ec6c6b098 100644 --- a/cpp/src/parquet/page_index_test.cc +++ b/cpp/src/parquet/page_index_test.cc @@ -561,6 +561,58 @@ void TestWriteTypedColumnIndex(schema::NodePtr node, } } +template +std::shared_ptr SerializedColumnIndex(const T& min, const T& max) { + auto encode = [](const T& value) { + return std::string(reinterpret_cast(&value), sizeof(value)); + }; + format::ColumnIndex column_index; + column_index.__set_null_pages({false}); + column_index.__set_min_values({encode(min)}); + column_index.__set_max_values({encode(max)}); + column_index.__set_boundary_order(format::BoundaryOrder::UNORDERED); + + auto sink = CreateOutputStream(); + ThriftSerializer{}.Serialize(&column_index, sink.get()); + PARQUET_ASSIGN_OR_THROW(auto buffer, sink->Finish()); + return buffer; +} + +void AssertColumnIndexIgnored(const ColumnDescriptor& read_descr, + const std::shared_ptr& buffer) { + auto column_index = + ColumnIndex::Make(read_descr, buffer->data(), static_cast(buffer->size()), + default_reader_properties()); + ASSERT_EQ(nullptr, column_index); +} + +void AssertColumnIndexIgnoredWithColumnOrder(ColumnOrder column_order) { + auto node = schema::Int32("c1"); + auto buffer = SerializedColumnIndex(/*min=*/1, /*max=*/2); + + std::static_pointer_cast(node)->SetColumnOrder(column_order); + ColumnDescriptor read_descr(node, /*max_definition_level=*/1, + /*max_repetition_level=*/0); + AssertColumnIndexIgnored(read_descr, buffer); +} + +TEST(PageIndex, ReadColumnIndexWithUnsupportedColumnOrder) { + AssertColumnIndexIgnoredWithColumnOrder(ColumnOrder::unknown_); + AssertColumnIndexIgnoredWithColumnOrder(ColumnOrder::undefined_); +} + +TEST(PageIndex, ReadColumnIndexWithUnknownSortOrder) { + auto node = schema::PrimitiveNode::Make("c1", Repetition::REQUIRED, Type::INT96); + ColumnDescriptor descr(node, /*max_definition_level=*/0, /*max_repetition_level=*/0); + ASSERT_EQ(SortOrder::UNKNOWN, descr.sort_order()); + + Int96 min{{1, 2, 3}}; + Int96 max{{4, 5, 6}}; + auto buffer = SerializedColumnIndex(min, max); + + AssertColumnIndexIgnored(descr, buffer); +} + TEST(PageIndex, WriteInt32ColumnIndex) { auto encode = [=](int32_t value) { return std::string(reinterpret_cast(&value), sizeof(int32_t)); diff --git a/cpp/src/parquet/statistics_test.cc b/cpp/src/parquet/statistics_test.cc index 905502cb0a57..0d300b856c2f 100644 --- a/cpp/src/parquet/statistics_test.cc +++ b/cpp/src/parquet/statistics_test.cc @@ -1660,6 +1660,13 @@ TEST(TestStatisticsSortOrder, UNKNOWN) { ASSERT_EQ(1, enc_stats->null_count); ASSERT_FALSE(enc_stats->is_max_value_exact.has_value()); ASSERT_FALSE(enc_stats->is_min_value_exact.has_value()); + + // Unknown sort order should not cause min/max to be set + std::shared_ptr stats = column_chunk->statistics(); + ASSERT_NE(nullptr, stats); + ASSERT_FALSE(stats->HasMinMax()); + ASSERT_TRUE(stats->HasNullCount()); + ASSERT_EQ(1, stats->null_count()); } // Test statistics for binary column with UNSIGNED sort order diff --git a/cpp/src/parquet/types.cc b/cpp/src/parquet/types.cc index fb4eb92a7544..ff5bd4eafb90 100644 --- a/cpp/src/parquet/types.cc +++ b/cpp/src/parquet/types.cc @@ -444,6 +444,7 @@ SortOrder::type GetSortOrder(const std::shared_ptr& logical_t ColumnOrder ColumnOrder::undefined_ = ColumnOrder(ColumnOrder::UNDEFINED); ColumnOrder ColumnOrder::type_defined_ = ColumnOrder(ColumnOrder::TYPE_DEFINED_ORDER); +ColumnOrder ColumnOrder::unknown_ = ColumnOrder(ColumnOrder::UNKNOWN); // Static methods for LogicalType class diff --git a/cpp/src/parquet/types.h b/cpp/src/parquet/types.h index ad4df5119e75..846a6cfaf9ce 100644 --- a/cpp/src/parquet/types.h +++ b/cpp/src/parquet/types.h @@ -599,7 +599,14 @@ bool PageCanUseChecksum(PageType::type pageType); class ColumnOrder { public: - enum type { UNDEFINED, TYPE_DEFINED_ORDER }; + enum type { + // File metadata has no column order, only legacy min/max in stats are defined. + UNDEFINED, + // File metadata uses TypeDefinedOrder from the Parquet format. + TYPE_DEFINED_ORDER, + // Column order value unsupported by this reader. + UNKNOWN + }; explicit ColumnOrder(ColumnOrder::type column_order) : column_order_(column_order) {} // Default to Type Defined Order ColumnOrder() : column_order_(type::TYPE_DEFINED_ORDER) {} @@ -607,6 +614,7 @@ class ColumnOrder { static ColumnOrder undefined_; static ColumnOrder type_defined_; + static ColumnOrder unknown_; private: ColumnOrder::type column_order_; From 72a4ffb065e24a373e327fde0aad897e37488327 Mon Sep 17 00:00:00 2001 From: Gang Wu Date: Thu, 11 Jun 2026 16:01:36 +0800 Subject: [PATCH 2/6] Potential fix for pull request finding Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com> --- cpp/src/parquet/metadata_test.cc | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/cpp/src/parquet/metadata_test.cc b/cpp/src/parquet/metadata_test.cc index b2a09be1eefb..92cedd1b8501 100644 --- a/cpp/src/parquet/metadata_test.cc +++ b/cpp/src/parquet/metadata_test.cc @@ -350,14 +350,14 @@ void AssertColumnChunkMinMax(const FileMetaData& metadata, } // namespace TEST(Metadata, UnknownColumnOrderIgnoresMinMax) { - std::string serialized_metadata = SerializeMetadata(SingleInt32MetadataWithStats()); - const std::string kTypeDefinedOrder("\x1c\x00", 2); - const std::string kUnsupportedOrder("\x2c\x00", 2); - const auto pos = serialized_metadata.find(kTypeDefinedOrder); - ASSERT_NE(std::string::npos, pos); - serialized_metadata.replace(pos, kTypeDefinedOrder.size(), kUnsupportedOrder); - - auto metadata = ParseMetadata(serialized_metadata); + format::FileMetaData thrift_metadata = SingleInt32MetadataWithStats(); + // Simulate an unsupported ColumnOrder value: unknown union fields are skipped by Thrift, + // leaving an entry with no known field set. + thrift_metadata.column_orders.clear(); + thrift_metadata.column_orders.emplace_back(); + thrift_metadata.__isset.column_orders = true; + + auto metadata = ParseMetadata(SerializeMetadata(thrift_metadata)); AssertColumnChunkHasNoMinMax(*metadata, ColumnOrder::UNKNOWN); } From 3888b2f8a0b8907e56e6a5a709e399ee069a6454 Mon Sep 17 00:00:00 2001 From: Gang Wu Date: Thu, 11 Jun 2026 16:16:55 +0800 Subject: [PATCH 3/6] Apply suggestion from @wgtmac --- cpp/src/parquet/metadata_test.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/parquet/metadata_test.cc b/cpp/src/parquet/metadata_test.cc index 92cedd1b8501..ac45be1fac38 100644 --- a/cpp/src/parquet/metadata_test.cc +++ b/cpp/src/parquet/metadata_test.cc @@ -351,8 +351,8 @@ void AssertColumnChunkMinMax(const FileMetaData& metadata, TEST(Metadata, UnknownColumnOrderIgnoresMinMax) { format::FileMetaData thrift_metadata = SingleInt32MetadataWithStats(); - // Simulate an unsupported ColumnOrder value: unknown union fields are skipped by Thrift, - // leaving an entry with no known field set. + // Simulate an unsupported ColumnOrder value: unknown union fields are skipped by + // Thrift, leaving an entry with no known field set. thrift_metadata.column_orders.clear(); thrift_metadata.column_orders.emplace_back(); thrift_metadata.__isset.column_orders = true; From af4a4bc40fba1d5a06734fdcd82883ea795e3f9f Mon Sep 17 00:00:00 2001 From: Gang Wu Date: Thu, 11 Jun 2026 16:17:21 +0800 Subject: [PATCH 4/6] Apply suggestion from @wgtmac --- cpp/src/parquet/metadata.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/parquet/metadata.cc b/cpp/src/parquet/metadata.cc index 718c18706334..c29a8f2dbf8e 100644 --- a/cpp/src/parquet/metadata.cc +++ b/cpp/src/parquet/metadata.cc @@ -101,7 +101,7 @@ enum class StatsMinMaxMode { kNormal, }; -inline StatsMinMaxMode GetStatsMinMaxMode(const ColumnDescriptor& descr) { +StatsMinMaxMode GetStatsMinMaxMode(const ColumnDescriptor& descr) { switch (descr.column_order().get_order()) { case ColumnOrder::TYPE_DEFINED_ORDER: return descr.sort_order() != SortOrder::UNKNOWN ? StatsMinMaxMode::kNormal From ef0b430ecd2f7fa928fbddeb69bd189819893f6c Mon Sep 17 00:00:00 2001 From: Gang Wu Date: Thu, 11 Jun 2026 16:17:37 +0800 Subject: [PATCH 5/6] Apply suggestion from @wgtmac --- cpp/src/parquet/page_index.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/parquet/page_index.cc b/cpp/src/parquet/page_index.cc index 9556356be84e..a1ec4f75d39c 100644 --- a/cpp/src/parquet/page_index.cc +++ b/cpp/src/parquet/page_index.cc @@ -38,7 +38,7 @@ namespace parquet { namespace { -inline bool CanTrustPageIndexMinMax(const ColumnDescriptor& descr) { +bool CanTrustPageIndexMinMax(const ColumnDescriptor& descr) { const auto column_order = descr.column_order().get_order(); return column_order != ColumnOrder::UNKNOWN && column_order != ColumnOrder::UNDEFINED && descr.sort_order() != SortOrder::UNKNOWN; From c73c0a3f1eaa289bfb354af61f5a049889001abd Mon Sep 17 00:00:00 2001 From: Gang Wu Date: Fri, 12 Jun 2026 14:04:53 +0800 Subject: [PATCH 6/6] add can_use_stats() and revert nullable column index --- cpp/src/parquet/metadata.cc | 88 ++++++------------------------ cpp/src/parquet/page_index.cc | 9 --- cpp/src/parquet/page_index_test.cc | 52 ------------------ cpp/src/parquet/schema.h | 13 +++++ cpp/src/parquet/schema_test.cc | 26 +++++++++ cpp/src/parquet/thrift_internal.h | 29 +++++++--- 6 files changed, 79 insertions(+), 138 deletions(-) diff --git a/cpp/src/parquet/metadata.cc b/cpp/src/parquet/metadata.cc index c29a8f2dbf8e..2b901b39de58 100644 --- a/cpp/src/parquet/metadata.cc +++ b/cpp/src/parquet/metadata.cc @@ -92,75 +92,26 @@ std::string ParquetVersionToString(ParquetVersion::type ver) { namespace { -enum class StatsMinMaxMode { - // Ignore min/max fields because their ordering is unknown or unsupported. - kDiscard, - // Use legacy min/max fields for files without column orders. - kLegacy, - // Use min_value/max_value fields with the column's well-defined order. - kNormal, -}; - -StatsMinMaxMode GetStatsMinMaxMode(const ColumnDescriptor& descr) { +StatisticsMinMaxField GetStatisticsMinMaxField(const ColumnDescriptor& descr) { switch (descr.column_order().get_order()) { case ColumnOrder::TYPE_DEFINED_ORDER: - return descr.sort_order() != SortOrder::UNKNOWN ? StatsMinMaxMode::kNormal - : StatsMinMaxMode::kDiscard; + return descr.sort_order() != SortOrder::UNKNOWN + ? StatisticsMinMaxField::kMinValueMaxValue + : StatisticsMinMaxField::kInvalid; case ColumnOrder::UNDEFINED: - return descr.sort_order() != SortOrder::UNKNOWN ? StatsMinMaxMode::kLegacy - : StatsMinMaxMode::kDiscard; + return descr.sort_order() == SortOrder::SIGNED + ? StatisticsMinMaxField::kLegacyMinMax + : StatisticsMinMaxField::kInvalid; case ColumnOrder::UNKNOWN: - return StatsMinMaxMode::kDiscard; - } - return StatsMinMaxMode::kDiscard; -} - -} // namespace - -static EncodedStatistics EncodedStatisticsFromThrift(const format::Statistics& statistics, - StatsMinMaxMode min_max) { - EncodedStatistics out; - - switch (min_max) { - case StatsMinMaxMode::kNormal: - if (statistics.__isset.max_value) { - out.set_max(statistics.max_value); - if (statistics.__isset.is_max_value_exact) { - out.is_max_value_exact = statistics.is_max_value_exact; - } - } - if (statistics.__isset.min_value) { - out.set_min(statistics.min_value); - if (statistics.__isset.is_min_value_exact) { - out.is_min_value_exact = statistics.is_min_value_exact; - } - } - break; - case StatsMinMaxMode::kLegacy: - if (statistics.__isset.max) { - out.set_max(statistics.max); - } - if (statistics.__isset.min) { - out.set_min(statistics.min); - } - break; - case StatsMinMaxMode::kDiscard: - break; + return StatisticsMinMaxField::kInvalid; } - if (statistics.__isset.null_count) { - out.set_null_count(statistics.null_count); - } - if (statistics.__isset.distinct_count) { - out.set_distinct_count(statistics.distinct_count); - } - - return out; + return StatisticsMinMaxField::kInvalid; } template -static std::shared_ptr MakeTypedColumnStats( - const format::ColumnMetaData& metadata, const ColumnDescriptor* descr, - ::arrow::MemoryPool* pool) { +std::shared_ptr MakeTypedColumnStats(const format::ColumnMetaData& metadata, + const ColumnDescriptor* descr, + ::arrow::MemoryPool* pool) { const auto& statistics = metadata.statistics; const std::string kEmpty = ""; const std::string* encoded_min = &kEmpty; @@ -169,8 +120,8 @@ static std::shared_ptr MakeTypedColumnStats( std::optional min_exact = std::nullopt; std::optional max_exact = std::nullopt; - switch (GetStatsMinMaxMode(*descr)) { - case StatsMinMaxMode::kNormal: + switch (GetStatisticsMinMaxField(*descr)) { + case StatisticsMinMaxField::kMinValueMaxValue: encoded_min = &statistics.min_value; encoded_max = &statistics.max_value; has_min_max = statistics.__isset.max_value && statistics.__isset.min_value; @@ -181,12 +132,12 @@ static std::shared_ptr MakeTypedColumnStats( ? std::optional(statistics.is_max_value_exact) : std::nullopt; break; - case StatsMinMaxMode::kLegacy: + case StatisticsMinMaxField::kLegacyMinMax: encoded_min = &statistics.min; encoded_max = &statistics.max; has_min_max = statistics.__isset.max && statistics.__isset.min; break; - case StatsMinMaxMode::kDiscard: + case StatisticsMinMaxField::kInvalid: break; } @@ -197,8 +148,6 @@ static std::shared_ptr MakeTypedColumnStats( max_exact, pool); } -namespace { - std::shared_ptr MakeColumnGeometryStats( const format::ColumnMetaData& metadata, const ColumnDescriptor* descr) { if (metadata.__isset.geospatial_statistics) { @@ -411,9 +360,8 @@ class ColumnChunkMetaData::ColumnChunkMetaDataImpl { { const std::lock_guard guard(stats_mutex_); if (possible_encoded_stats_ == nullptr) { - possible_encoded_stats_ = - std::make_shared(EncodedStatisticsFromThrift( - column_metadata_->statistics, GetStatsMinMaxMode(*descr_))); + possible_encoded_stats_ = std::make_shared( + FromThrift(column_metadata_->statistics, GetStatisticsMinMaxField(*descr_))); } } return writer_version_->HasCorrectStatistics(type(), *possible_encoded_stats_, diff --git a/cpp/src/parquet/page_index.cc b/cpp/src/parquet/page_index.cc index a1ec4f75d39c..7434f2828da2 100644 --- a/cpp/src/parquet/page_index.cc +++ b/cpp/src/parquet/page_index.cc @@ -38,12 +38,6 @@ namespace parquet { namespace { -bool CanTrustPageIndexMinMax(const ColumnDescriptor& descr) { - const auto column_order = descr.column_order().get_order(); - return column_order != ColumnOrder::UNKNOWN && column_order != ColumnOrder::UNDEFINED && - descr.sort_order() != SortOrder::UNKNOWN; -} - template void Decode(std::unique_ptr::Decoder>& decoder, const std::string& input, std::vector* output, @@ -979,9 +973,6 @@ std::unique_ptr ColumnIndex::Make(const ColumnDescriptor& descr, // Guard against UB when moving column_index throw ParquetException("Invalid ColumnIndex boundary_order"); } - if (!CanTrustPageIndexMinMax(descr)) { - return nullptr; - } switch (descr.physical_type()) { case Type::BOOLEAN: return std::make_unique>(descr, diff --git a/cpp/src/parquet/page_index_test.cc b/cpp/src/parquet/page_index_test.cc index df6ec6c6b098..3a7308c1c6bc 100644 --- a/cpp/src/parquet/page_index_test.cc +++ b/cpp/src/parquet/page_index_test.cc @@ -561,58 +561,6 @@ void TestWriteTypedColumnIndex(schema::NodePtr node, } } -template -std::shared_ptr SerializedColumnIndex(const T& min, const T& max) { - auto encode = [](const T& value) { - return std::string(reinterpret_cast(&value), sizeof(value)); - }; - format::ColumnIndex column_index; - column_index.__set_null_pages({false}); - column_index.__set_min_values({encode(min)}); - column_index.__set_max_values({encode(max)}); - column_index.__set_boundary_order(format::BoundaryOrder::UNORDERED); - - auto sink = CreateOutputStream(); - ThriftSerializer{}.Serialize(&column_index, sink.get()); - PARQUET_ASSIGN_OR_THROW(auto buffer, sink->Finish()); - return buffer; -} - -void AssertColumnIndexIgnored(const ColumnDescriptor& read_descr, - const std::shared_ptr& buffer) { - auto column_index = - ColumnIndex::Make(read_descr, buffer->data(), static_cast(buffer->size()), - default_reader_properties()); - ASSERT_EQ(nullptr, column_index); -} - -void AssertColumnIndexIgnoredWithColumnOrder(ColumnOrder column_order) { - auto node = schema::Int32("c1"); - auto buffer = SerializedColumnIndex(/*min=*/1, /*max=*/2); - - std::static_pointer_cast(node)->SetColumnOrder(column_order); - ColumnDescriptor read_descr(node, /*max_definition_level=*/1, - /*max_repetition_level=*/0); - AssertColumnIndexIgnored(read_descr, buffer); -} - -TEST(PageIndex, ReadColumnIndexWithUnsupportedColumnOrder) { - AssertColumnIndexIgnoredWithColumnOrder(ColumnOrder::unknown_); - AssertColumnIndexIgnoredWithColumnOrder(ColumnOrder::undefined_); -} - -TEST(PageIndex, ReadColumnIndexWithUnknownSortOrder) { - auto node = schema::PrimitiveNode::Make("c1", Repetition::REQUIRED, Type::INT96); - ColumnDescriptor descr(node, /*max_definition_level=*/0, /*max_repetition_level=*/0); - ASSERT_EQ(SortOrder::UNKNOWN, descr.sort_order()); - - Int96 min{{1, 2, 3}}; - Int96 max{{4, 5, 6}}; - auto buffer = SerializedColumnIndex(min, max); - - AssertColumnIndexIgnored(descr, buffer); -} - TEST(PageIndex, WriteInt32ColumnIndex) { auto encode = [=](int32_t value) { return std::string(reinterpret_cast(&value), sizeof(int32_t)); diff --git a/cpp/src/parquet/schema.h b/cpp/src/parquet/schema.h index 1addc73bd367..52c6b5544e32 100644 --- a/cpp/src/parquet/schema.h +++ b/cpp/src/parquet/schema.h @@ -382,6 +382,19 @@ class PARQUET_EXPORT ColumnDescriptor { return la ? GetSortOrder(la, pt) : GetSortOrder(converted_type(), pt); } + // Whether ColumnOrder-governed min/max values have a supported ordering. + bool can_use_stats() const { + switch (column_order().get_order()) { + case ColumnOrder::TYPE_DEFINED_ORDER: + return sort_order() != SortOrder::UNKNOWN; + case ColumnOrder::UNDEFINED: + return sort_order() == SortOrder::SIGNED; + case ColumnOrder::UNKNOWN: + return false; + } + return false; + } + const std::string& name() const { return primitive_node_->name(); } const std::shared_ptr path() const; diff --git a/cpp/src/parquet/schema_test.cc b/cpp/src/parquet/schema_test.cc index 2950a7df70f8..425f34ff61db 100644 --- a/cpp/src/parquet/schema_test.cc +++ b/cpp/src/parquet/schema_test.cc @@ -660,6 +660,32 @@ TEST(TestColumnDescriptor, TestAttrs) { ASSERT_EQ(expected_descr, descr2.ToString()); } +TEST(TestColumnDescriptor, CanUseStats) { + NodePtr node = Int32("name"); + ColumnDescriptor descr(node, 0, 0); + // Type-defined column order is usable when the type has a known sort order. + EXPECT_TRUE(descr.can_use_stats()); + + auto primitive_node = std::static_pointer_cast(node); + primitive_node->SetColumnOrder(ColumnOrder::undefined_); + // Missing column order falls back to legacy min/max, which are signed. + EXPECT_TRUE(ColumnDescriptor(node, 0, 0).can_use_stats()); + + primitive_node->SetColumnOrder(ColumnOrder::unknown_); + // Unsupported column order means min/max ordering is unknown to this reader. + EXPECT_FALSE(ColumnDescriptor(node, 0, 0).can_use_stats()); + + node = PrimitiveNode::Make("name", Repetition::REQUIRED, Type::BYTE_ARRAY); + primitive_node = std::static_pointer_cast(node); + primitive_node->SetColumnOrder(ColumnOrder::undefined_); + // Legacy min/max are signed, so they cannot represent unsigned byte ordering. + EXPECT_FALSE(ColumnDescriptor(node, 0, 0).can_use_stats()); + + node = PrimitiveNode::Make("name", Repetition::REQUIRED, Type::INT96); + // INT96 has no defined sort order in the Parquet type-defined ordering. + EXPECT_FALSE(ColumnDescriptor(node, 0, 0).can_use_stats()); +} + class TestSchemaDescriptor : public ::testing::Test { public: void setUp() {} diff --git a/cpp/src/parquet/thrift_internal.h b/cpp/src/parquet/thrift_internal.h index 5d14ae4e289c..e6d69308edb7 100644 --- a/cpp/src/parquet/thrift_internal.h +++ b/cpp/src/parquet/thrift_internal.h @@ -252,12 +252,21 @@ static inline AadMetadata FromThrift(format::AesGcmCtrV1 aesGcmCtrV1) { aesGcmCtrV1.supply_aad_prefix}; } -static inline EncodedStatistics FromThrift(const format::Statistics& stats) { +// Selects which thrift Statistics min/max fields should populate EncodedStatistics. +enum class StatisticsMinMaxField { + // Do not populate min/max, because the ordering is undefined or unsupported. + kInvalid, + // Populate min/max from the min_value/max_value fields. + kMinValueMaxValue, + // Populate min/max from the legacy min/max fields. + kLegacyMinMax, +}; + +static inline EncodedStatistics FromThrift(const format::Statistics& stats, + StatisticsMinMaxField min_max) { EncodedStatistics out; - // Use the new V2 min-max statistics over the former one if it is filled - if (stats.__isset.max_value || stats.__isset.min_value) { - // TODO: check if the column_order is TYPE_DEFINED_ORDER. + if (min_max == StatisticsMinMaxField::kMinValueMaxValue) { if (stats.__isset.max_value) { out.set_max(stats.max_value); if (stats.__isset.is_max_value_exact) { @@ -270,9 +279,7 @@ static inline EncodedStatistics FromThrift(const format::Statistics& stats) { out.is_min_value_exact = stats.is_min_value_exact; } } - } else if (stats.__isset.max || stats.__isset.min) { - // TODO: check created_by to see if it is corrupted for some types. - // TODO: check if the sort_order is SIGNED. + } else if (min_max == StatisticsMinMaxField::kLegacyMinMax) { if (stats.__isset.max) { out.set_max(stats.max); } @@ -290,6 +297,14 @@ static inline EncodedStatistics FromThrift(const format::Statistics& stats) { return out; } +static inline EncodedStatistics FromThrift(const format::Statistics& stats) { + // Use the new V2 min-max statistics over the former one if it is filled. + if (stats.__isset.max_value || stats.__isset.min_value) { + return FromThrift(stats, StatisticsMinMaxField::kMinValueMaxValue); + } + return FromThrift(stats, StatisticsMinMaxField::kLegacyMinMax); +} + static inline geospatial::EncodedGeoStatistics FromThrift( const format::GeospatialStatistics& geo_stats) { geospatial::EncodedGeoStatistics out;