diff --git a/cpp/src/parquet/metadata.cc b/cpp/src/parquet/metadata.cc index 46cd1e442461..2b901b39de58 100644 --- a/cpp/src/parquet/metadata.cc +++ b/cpp/src/parquet/metadata.cc @@ -90,40 +90,64 @@ std::string ParquetVersionToString(ParquetVersion::type ver) { return "UNKNOWN"; } +namespace { + +StatisticsMinMaxField GetStatisticsMinMaxField(const ColumnDescriptor& descr) { + switch (descr.column_order().get_order()) { + case ColumnOrder::TYPE_DEFINED_ORDER: + return descr.sort_order() != SortOrder::UNKNOWN + ? StatisticsMinMaxField::kMinValueMaxValue + : StatisticsMinMaxField::kInvalid; + case ColumnOrder::UNDEFINED: + return descr.sort_order() == SortOrder::SIGNED + ? StatisticsMinMaxField::kLegacyMinMax + : StatisticsMinMaxField::kInvalid; + case ColumnOrder::UNKNOWN: + return StatisticsMinMaxField::kInvalid; + } + return StatisticsMinMaxField::kInvalid; +} + template -static std::shared_ptr MakeTypedColumnStats( - const format::ColumnMetaData& metadata, const ColumnDescriptor* descr, - ::arrow::MemoryPool* pool) { - std::optional min_exact = - metadata.statistics.__isset.is_min_value_exact - ? std::optional(metadata.statistics.is_min_value_exact) - : std::nullopt; - std::optional max_exact = - metadata.statistics.__isset.is_max_value_exact - ? std::optional(metadata.statistics.is_max_value_exact) - : std::nullopt; - // If ColumnOrder is defined, return max_value and min_value - if (descr->column_order().get_order() == ColumnOrder::TYPE_DEFINED_ORDER) { - return MakeStatistics( - descr, metadata.statistics.min_value, metadata.statistics.max_value, - metadata.num_values - metadata.statistics.null_count, - metadata.statistics.null_count, metadata.statistics.distinct_count, - metadata.statistics.__isset.max_value && metadata.statistics.__isset.min_value, - metadata.statistics.__isset.null_count, - metadata.statistics.__isset.distinct_count, min_exact, max_exact, pool); - } - // Default behavior +std::shared_ptr MakeTypedColumnStats(const format::ColumnMetaData& metadata, + const ColumnDescriptor* descr, + ::arrow::MemoryPool* pool) { + const auto& statistics = metadata.statistics; + const std::string kEmpty = ""; + const std::string* encoded_min = &kEmpty; + const std::string* encoded_max = &kEmpty; + bool has_min_max = false; + std::optional min_exact = std::nullopt; + std::optional max_exact = std::nullopt; + + switch (GetStatisticsMinMaxField(*descr)) { + case StatisticsMinMaxField::kMinValueMaxValue: + encoded_min = &statistics.min_value; + encoded_max = &statistics.max_value; + has_min_max = statistics.__isset.max_value && statistics.__isset.min_value; + min_exact = statistics.__isset.is_min_value_exact + ? std::optional(statistics.is_min_value_exact) + : std::nullopt; + max_exact = statistics.__isset.is_max_value_exact + ? std::optional(statistics.is_max_value_exact) + : std::nullopt; + break; + case StatisticsMinMaxField::kLegacyMinMax: + encoded_min = &statistics.min; + encoded_max = &statistics.max; + has_min_max = statistics.__isset.max && statistics.__isset.min; + break; + case StatisticsMinMaxField::kInvalid: + break; + } + return MakeStatistics( - descr, metadata.statistics.min, metadata.statistics.max, - metadata.num_values - metadata.statistics.null_count, - metadata.statistics.null_count, metadata.statistics.distinct_count, - metadata.statistics.__isset.max && metadata.statistics.__isset.min, - metadata.statistics.__isset.null_count, metadata.statistics.__isset.distinct_count, - min_exact, max_exact, pool); + descr, *encoded_min, *encoded_max, metadata.num_values - statistics.null_count, + statistics.null_count, statistics.distinct_count, has_min_max, + statistics.__isset.null_count, statistics.__isset.distinct_count, min_exact, + max_exact, pool); } -namespace { - std::shared_ptr MakeColumnGeometryStats( const format::ColumnMetaData& metadata, const ColumnDescriptor* descr) { if (metadata.__isset.geospatial_statistics) { @@ -336,12 +360,8 @@ class ColumnChunkMetaData::ColumnChunkMetaDataImpl { { const std::lock_guard guard(stats_mutex_); if (possible_encoded_stats_ == nullptr) { - possible_encoded_stats_ = - std::make_shared(FromThrift(column_metadata_->statistics)); - if (descr_->sort_order() == SortOrder::UNKNOWN) { - // If the column SortOrder is Unknown we can't trust max/min. - possible_encoded_stats_->ClearMinMax(); - } + possible_encoded_stats_ = std::make_shared( + FromThrift(column_metadata_->statistics, GetStatisticsMinMaxField(*descr_))); } } return writer_version_->HasCorrectStatistics(type(), *possible_encoded_stats_, @@ -1037,7 +1057,7 @@ class FileMetaData::FileMetaDataImpl { if (column_order.__isset.TYPE_ORDER) { column_orders.push_back(ColumnOrder::type_defined_); } else { - column_orders.push_back(ColumnOrder::undefined_); + column_orders.push_back(ColumnOrder::unknown_); } } } else { diff --git a/cpp/src/parquet/metadata_test.cc b/cpp/src/parquet/metadata_test.cc index 572f053179cd..ac45be1fac38 100644 --- a/cpp/src/parquet/metadata_test.cc +++ b/cpp/src/parquet/metadata_test.cc @@ -275,6 +275,104 @@ TEST(Metadata, TestBuildAccess) { ASSERT_TRUE(f_accessor_1->Equals(*f_accessor->Subset({2, 0}))); } +namespace { + +std::string EncodeInt32(int32_t value) { + return std::string(reinterpret_cast(&value), sizeof(value)); +} + +constexpr int32_t kLegacyMin = 100, kLegacyMax = 200; + +std::string SerializeMetadata(const format::FileMetaData& thrift_metadata) { + std::string out; + ThriftSerializer{}.SerializeToString(&thrift_metadata, &out); + return out; +} + +std::shared_ptr ParseMetadata(std::string serialized_metadata) { + uint32_t decoded_len = static_cast(serialized_metadata.size()); + return FileMetaData::Make(serialized_metadata.data(), &decoded_len); +} + +format::FileMetaData SingleInt32MetadataWithStats() { + format::FileMetaData metadata; + format::SchemaElement root, leaf; + schema::NodeVector fields = {schema::Int32("int_col", Repetition::REQUIRED)}; + schema::GroupNode::Make("schema", Repetition::REPEATED, fields)->ToParquet(&root); + fields.back()->ToParquet(&leaf); + metadata.schema = {std::move(root), std::move(leaf)}; + + auto& column = metadata.row_groups.emplace_back().columns.emplace_back(); + column.__isset.meta_data = true; + auto& column_metadata = column.meta_data; + column_metadata.__set_type(format::Type::INT32); + column_metadata.__isset.statistics = true; + auto& statistics = column_metadata.statistics; + statistics.__set_min(EncodeInt32(kLegacyMin)); + statistics.__set_max(EncodeInt32(kLegacyMax)); + statistics.__set_min_value(EncodeInt32(kLegacyMin)); + statistics.__set_max_value(EncodeInt32(kLegacyMax)); + metadata.column_orders.emplace_back().__set_TYPE_ORDER(format::TypeDefinedOrder{}); + metadata.__isset.column_orders = true; + return metadata; +} + +std::unique_ptr GetOnlyColumnChunk( + const FileMetaData& metadata, ColumnOrder::type expected_order) { + EXPECT_EQ(expected_order, metadata.schema()->Column(0)->column_order().get_order()); + return metadata.RowGroup(0)->ColumnChunk(0); +} + +void AssertColumnChunkHasNoMinMax(const FileMetaData& metadata, + ColumnOrder::type expected_order) { + auto column = GetOnlyColumnChunk(metadata, expected_order); + ASSERT_NE(nullptr, column->encoded_statistics()); + EXPECT_FALSE(column->encoded_statistics()->has_min); + EXPECT_FALSE(column->encoded_statistics()->has_max); + ASSERT_NE(nullptr, column->statistics()); + EXPECT_FALSE(column->statistics()->HasMinMax()); +} + +void AssertColumnChunkMinMax(const FileMetaData& metadata, + ColumnOrder::type expected_order, int32_t min, int32_t max) { + auto column = GetOnlyColumnChunk(metadata, expected_order); + const std::string encoded_min = EncodeInt32(min); + const std::string encoded_max = EncodeInt32(max); + + ASSERT_NE(nullptr, column->encoded_statistics()); + EXPECT_EQ(encoded_min, column->encoded_statistics()->min()); + EXPECT_EQ(encoded_max, column->encoded_statistics()->max()); + ASSERT_NE(nullptr, column->statistics()); + EXPECT_EQ(encoded_min, column->statistics()->EncodeMin()); + EXPECT_EQ(encoded_max, column->statistics()->EncodeMax()); +} + +} // namespace + +TEST(Metadata, UnknownColumnOrderIgnoresMinMax) { + format::FileMetaData thrift_metadata = SingleInt32MetadataWithStats(); + // Simulate an unsupported ColumnOrder value: unknown union fields are skipped by + // Thrift, leaving an entry with no known field set. + thrift_metadata.column_orders.clear(); + thrift_metadata.column_orders.emplace_back(); + thrift_metadata.__isset.column_orders = true; + + auto metadata = ParseMetadata(SerializeMetadata(thrift_metadata)); + AssertColumnChunkHasNoMinMax(*metadata, ColumnOrder::UNKNOWN); +} + +TEST(Metadata, MissingColumnOrderUsesLegacyMinMax) { + format::FileMetaData thrift_metadata = SingleInt32MetadataWithStats(); + thrift_metadata.column_orders.clear(); + thrift_metadata.__isset.column_orders = false; + auto& statistics = thrift_metadata.row_groups.at(0).columns.at(0).meta_data.statistics; + statistics.__set_min_value(EncodeInt32(kLegacyMin - 100)); + statistics.__set_max_value(EncodeInt32(kLegacyMax + 100)); + + auto metadata = ParseMetadata(SerializeMetadata(thrift_metadata)); + AssertColumnChunkMinMax(*metadata, ColumnOrder::UNDEFINED, kLegacyMin, kLegacyMax); +} + TEST(Metadata, TestV1Version) { // PARQUET-839 parquet::schema::NodeVector fields; diff --git a/cpp/src/parquet/schema.h b/cpp/src/parquet/schema.h index 1addc73bd367..52c6b5544e32 100644 --- a/cpp/src/parquet/schema.h +++ b/cpp/src/parquet/schema.h @@ -382,6 +382,19 @@ class PARQUET_EXPORT ColumnDescriptor { return la ? GetSortOrder(la, pt) : GetSortOrder(converted_type(), pt); } + // Whether ColumnOrder-governed min/max values have a supported ordering. + bool can_use_stats() const { + switch (column_order().get_order()) { + case ColumnOrder::TYPE_DEFINED_ORDER: + return sort_order() != SortOrder::UNKNOWN; + case ColumnOrder::UNDEFINED: + return sort_order() == SortOrder::SIGNED; + case ColumnOrder::UNKNOWN: + return false; + } + return false; + } + const std::string& name() const { return primitive_node_->name(); } const std::shared_ptr path() const; diff --git a/cpp/src/parquet/schema_test.cc b/cpp/src/parquet/schema_test.cc index 2950a7df70f8..425f34ff61db 100644 --- a/cpp/src/parquet/schema_test.cc +++ b/cpp/src/parquet/schema_test.cc @@ -660,6 +660,32 @@ TEST(TestColumnDescriptor, TestAttrs) { ASSERT_EQ(expected_descr, descr2.ToString()); } +TEST(TestColumnDescriptor, CanUseStats) { + NodePtr node = Int32("name"); + ColumnDescriptor descr(node, 0, 0); + // Type-defined column order is usable when the type has a known sort order. + EXPECT_TRUE(descr.can_use_stats()); + + auto primitive_node = std::static_pointer_cast(node); + primitive_node->SetColumnOrder(ColumnOrder::undefined_); + // Missing column order falls back to legacy min/max, which are signed. + EXPECT_TRUE(ColumnDescriptor(node, 0, 0).can_use_stats()); + + primitive_node->SetColumnOrder(ColumnOrder::unknown_); + // Unsupported column order means min/max ordering is unknown to this reader. + EXPECT_FALSE(ColumnDescriptor(node, 0, 0).can_use_stats()); + + node = PrimitiveNode::Make("name", Repetition::REQUIRED, Type::BYTE_ARRAY); + primitive_node = std::static_pointer_cast(node); + primitive_node->SetColumnOrder(ColumnOrder::undefined_); + // Legacy min/max are signed, so they cannot represent unsigned byte ordering. + EXPECT_FALSE(ColumnDescriptor(node, 0, 0).can_use_stats()); + + node = PrimitiveNode::Make("name", Repetition::REQUIRED, Type::INT96); + // INT96 has no defined sort order in the Parquet type-defined ordering. + EXPECT_FALSE(ColumnDescriptor(node, 0, 0).can_use_stats()); +} + class TestSchemaDescriptor : public ::testing::Test { public: void setUp() {} diff --git a/cpp/src/parquet/statistics_test.cc b/cpp/src/parquet/statistics_test.cc index 905502cb0a57..0d300b856c2f 100644 --- a/cpp/src/parquet/statistics_test.cc +++ b/cpp/src/parquet/statistics_test.cc @@ -1660,6 +1660,13 @@ TEST(TestStatisticsSortOrder, UNKNOWN) { ASSERT_EQ(1, enc_stats->null_count); ASSERT_FALSE(enc_stats->is_max_value_exact.has_value()); ASSERT_FALSE(enc_stats->is_min_value_exact.has_value()); + + // Unknown sort order should not cause min/max to be set + std::shared_ptr stats = column_chunk->statistics(); + ASSERT_NE(nullptr, stats); + ASSERT_FALSE(stats->HasMinMax()); + ASSERT_TRUE(stats->HasNullCount()); + ASSERT_EQ(1, stats->null_count()); } // Test statistics for binary column with UNSIGNED sort order diff --git a/cpp/src/parquet/thrift_internal.h b/cpp/src/parquet/thrift_internal.h index 5d14ae4e289c..e6d69308edb7 100644 --- a/cpp/src/parquet/thrift_internal.h +++ b/cpp/src/parquet/thrift_internal.h @@ -252,12 +252,21 @@ static inline AadMetadata FromThrift(format::AesGcmCtrV1 aesGcmCtrV1) { aesGcmCtrV1.supply_aad_prefix}; } -static inline EncodedStatistics FromThrift(const format::Statistics& stats) { +// Selects which thrift Statistics min/max fields should populate EncodedStatistics. +enum class StatisticsMinMaxField { + // Do not populate min/max, because the ordering is undefined or unsupported. + kInvalid, + // Populate min/max from the min_value/max_value fields. + kMinValueMaxValue, + // Populate min/max from the legacy min/max fields. + kLegacyMinMax, +}; + +static inline EncodedStatistics FromThrift(const format::Statistics& stats, + StatisticsMinMaxField min_max) { EncodedStatistics out; - // Use the new V2 min-max statistics over the former one if it is filled - if (stats.__isset.max_value || stats.__isset.min_value) { - // TODO: check if the column_order is TYPE_DEFINED_ORDER. + if (min_max == StatisticsMinMaxField::kMinValueMaxValue) { if (stats.__isset.max_value) { out.set_max(stats.max_value); if (stats.__isset.is_max_value_exact) { @@ -270,9 +279,7 @@ static inline EncodedStatistics FromThrift(const format::Statistics& stats) { out.is_min_value_exact = stats.is_min_value_exact; } } - } else if (stats.__isset.max || stats.__isset.min) { - // TODO: check created_by to see if it is corrupted for some types. - // TODO: check if the sort_order is SIGNED. + } else if (min_max == StatisticsMinMaxField::kLegacyMinMax) { if (stats.__isset.max) { out.set_max(stats.max); } @@ -290,6 +297,14 @@ static inline EncodedStatistics FromThrift(const format::Statistics& stats) { return out; } +static inline EncodedStatistics FromThrift(const format::Statistics& stats) { + // Use the new V2 min-max statistics over the former one if it is filled. + if (stats.__isset.max_value || stats.__isset.min_value) { + return FromThrift(stats, StatisticsMinMaxField::kMinValueMaxValue); + } + return FromThrift(stats, StatisticsMinMaxField::kLegacyMinMax); +} + static inline geospatial::EncodedGeoStatistics FromThrift( const format::GeospatialStatistics& geo_stats) { geospatial::EncodedGeoStatistics out; diff --git a/cpp/src/parquet/types.cc b/cpp/src/parquet/types.cc index fb4eb92a7544..ff5bd4eafb90 100644 --- a/cpp/src/parquet/types.cc +++ b/cpp/src/parquet/types.cc @@ -444,6 +444,7 @@ SortOrder::type GetSortOrder(const std::shared_ptr& logical_t ColumnOrder ColumnOrder::undefined_ = ColumnOrder(ColumnOrder::UNDEFINED); ColumnOrder ColumnOrder::type_defined_ = ColumnOrder(ColumnOrder::TYPE_DEFINED_ORDER); +ColumnOrder ColumnOrder::unknown_ = ColumnOrder(ColumnOrder::UNKNOWN); // Static methods for LogicalType class diff --git a/cpp/src/parquet/types.h b/cpp/src/parquet/types.h index ad4df5119e75..846a6cfaf9ce 100644 --- a/cpp/src/parquet/types.h +++ b/cpp/src/parquet/types.h @@ -599,7 +599,14 @@ bool PageCanUseChecksum(PageType::type pageType); class ColumnOrder { public: - enum type { UNDEFINED, TYPE_DEFINED_ORDER }; + enum type { + // File metadata has no column order, only legacy min/max in stats are defined. + UNDEFINED, + // File metadata uses TypeDefinedOrder from the Parquet format. + TYPE_DEFINED_ORDER, + // Column order value unsupported by this reader. + UNKNOWN + }; explicit ColumnOrder(ColumnOrder::type column_order) : column_order_(column_order) {} // Default to Type Defined Order ColumnOrder() : column_order_(type::TYPE_DEFINED_ORDER) {} @@ -607,6 +614,7 @@ class ColumnOrder { static ColumnOrder undefined_; static ColumnOrder type_defined_; + static ColumnOrder unknown_; private: ColumnOrder::type column_order_;