From 98d87479368bd30c073fb179302350938f4bab63 Mon Sep 17 00:00:00 2001
From: Divjot Arora
Date: Tue, 9 Jun 2026 19:03:42 +0000
Subject: [PATCH 1/3] Introduce chronological ordering for INT96 timestamps
---
README.md | 2 +-
src/main/thrift/parquet.thrift | 36 ++++++++++++++++++++++++++--------
2 files changed, 29 insertions(+), 9 deletions(-)
diff --git a/README.md b/README.md
index 1ce553e8..064084a2 100644
--- a/README.md
+++ b/README.md
@@ -160,7 +160,7 @@ Column Index, and Data Page). These statistics are according to a sort order,
which is defined for each column in the file footer. Parquet supports common
sort orders for logical and primitive types and also special orders for types
with potentially ambiguous semantics (e.g., NaN ordering for floating point
-types). The details are documented in the
+types, chronological ordering for INT96 timestamps). The details are documented in the
[Thrift definition](src/main/thrift/parquet.thrift) in the `ColumnOrder` union.
## Nested Encoding
diff --git a/src/main/thrift/parquet.thrift b/src/main/thrift/parquet.thrift
index fe259d61..ab91b104 100644
--- a/src/main/thrift/parquet.thrift
+++ b/src/main/thrift/parquet.thrift
@@ -1061,6 +1061,9 @@ struct TypeDefinedOrder {}
/** Empty struct to signal IEEE 754 total order for floating point types */
struct IEEE754TotalOrder {}
+/** Empty struct to signal chronological ordering of physical type INT96 */
+struct Int96TimestampOrder {}
+
/**
* Union to specify the order used for the min_value and max_value fields for a
* column. This union takes the role of an enhanced enum that allows rich
@@ -1071,6 +1074,8 @@ struct IEEE754TotalOrder {}
* physical type (if there is no logical type).
* * IEEE754TotalOrder - the floating point column uses IEEE 754 total order.
*
+ * * Int96TimestampOrder - the INT96 column uses chronological timestamp order.
+ *
* If the reader does not support the value of this union, min and max stats
* for this column should be ignored.
*/
@@ -1108,20 +1113,16 @@ union ColumnOrder {
* BOOLEAN - false, true
* INT32 - signed comparison
* INT64 - signed comparison
- * INT96 (only used for legacy timestamps) - undefined(+)
+ * INT96 (only used for legacy timestamps) - undefined or signed comparison of the represented value (+)
* FLOAT - signed comparison of the represented value (*)
* DOUBLE - signed comparison of the represented value (*)
* BYTE_ARRAY - unsigned byte-wise comparison
* FIXED_LEN_BYTE_ARRAY - unsigned byte-wise comparison
*
* (+) While the INT96 type has been deprecated, at the time of writing it is
- * still used in many legacy systems. If a Parquet implementation chooses
- * to write statistics for INT96 columns, it is recommended to order them
- * according to the legacy rules:
- * - compare the last 4 bytes (days) as a little-endian 32-bit signed integer
- * - if equal last 4 bytes, compare the first 8 bytes as a little-endian
- * 64-bit signed integer (nanos)
- * See https://github.com/apache/parquet-format/issues/502 for more details
+ * still used in many legacy systems. It is recommended that writers use
+ * INT96_TIMESTAMP_ORDER for this type. If TYPE_ORDER is used for an INT96
+ * column, readers should ignore statistics for that column.
*
* (*) Because TYPE_ORDER is ambiguous for floating point types due to
* underspecified handling of NaN and -0/+0, it is recommended that writers
@@ -1195,6 +1196,18 @@ union ColumnOrder {
* or max_values indicates that all non-null values are NaN.
*/
2: IEEE754TotalOrder IEEE_754_TOTAL_ORDER;
+
+ /**
+ * The INT96 timestamp type is ordered chronologically. Only columns of
+ * physical type INT96 may use this ordering.
+ *
+ * When writing statistics for columns with INT96_TIMESTAMP_ORDER, two values
+ * must be compared as follows:
+ * - Compare the last 4 bytes (days) as a little-endian 32-bit signed integer
+ * - If equal last 4 bytes, compare the first 8 bytes (nanos) as a
+ * little-endinan 64-bit signed integer.
+ */
+ 3: Int96TimestampOrder INT96_TIMESTAMP_ORDER;
}
struct PageLocation {
@@ -1278,6 +1291,13 @@ struct ColumnIndex {
* - If the order of this column is IEEE754_TOTAL_ORDER, then min_values[i]
* and max_values[i] of that page must be set to the smallest and largest
* NaN values as defined by IEEE 754 total order.
+ *
+ * For columns of physical type INT96, the writer must do the following:
+ * - If the order of this column is not INT96_TIMESTAMP_ORDER, then a column
+ * index must not be written for this column chunk.
+ * - If the order of this column is INT96_TIMESTAMP_ORDER, then min_values[i]
+ * and max_values[i] of that page must be set to the smallest and largest
+ * values as defined by the INT96 chronological timestamp ordering.
*/
2: required list min_values
3: required list max_values
From e248175cdd74bf280690a0c2e5f85ed00532475e Mon Sep 17 00:00:00 2001
From: Divjot Arora
Date: Wed, 10 Jun 2026 09:39:02 +0000
Subject: [PATCH 2/3] clarify int96 stats are optional
---
src/main/thrift/parquet.thrift | 8 +++++---
1 file changed, 5 insertions(+), 3 deletions(-)
diff --git a/src/main/thrift/parquet.thrift b/src/main/thrift/parquet.thrift
index ab91b104..2ebe4f41 100644
--- a/src/main/thrift/parquet.thrift
+++ b/src/main/thrift/parquet.thrift
@@ -1120,9 +1120,11 @@ union ColumnOrder {
* FIXED_LEN_BYTE_ARRAY - unsigned byte-wise comparison
*
* (+) While the INT96 type has been deprecated, at the time of writing it is
- * still used in many legacy systems. It is recommended that writers use
- * INT96_TIMESTAMP_ORDER for this type. If TYPE_ORDER is used for an INT96
- * column, readers should ignore statistics for that column.
+ * still used in many legacy systems. It is optional for writers to emit
+ * statistics for INT96 columns. Writers that emit them for such columns
+ * should use INT96_TIMESTAMP_ORDER for this type. If TYPE_ORDER is
+ * used for an INT96 column, readers should ignore statistics for that
+ * column.
*
* (*) Because TYPE_ORDER is ambiguous for floating point types due to
* underspecified handling of NaN and -0/+0, it is recommended that writers
From d86d8c7606b4964eb23f91a95830d8a69cae3877 Mon Sep 17 00:00:00 2001
From: Divjot Arora
Date: Thu, 11 Jun 2026 07:44:14 +0000
Subject: [PATCH 3/3] fix typo
---
src/main/thrift/parquet.thrift | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/src/main/thrift/parquet.thrift b/src/main/thrift/parquet.thrift
index 2ebe4f41..1b787536 100644
--- a/src/main/thrift/parquet.thrift
+++ b/src/main/thrift/parquet.thrift
@@ -1207,7 +1207,7 @@ union ColumnOrder {
* must be compared as follows:
* - Compare the last 4 bytes (days) as a little-endian 32-bit signed integer
* - If equal last 4 bytes, compare the first 8 bytes (nanos) as a
- * little-endinan 64-bit signed integer.
+ * little-endian 64-bit signed integer.
*/
3: Int96TimestampOrder INT96_TIMESTAMP_ORDER;
}