From 5d4a666bfc8b6162ee0723cd4f0806cf07f22c0e Mon Sep 17 00:00:00 2001
From: Nikolay Antonov
Date: Wed, 4 Feb 2026 14:33:04 +0500
Subject: [PATCH 1/5] parquet-1.15.2

---
 server/build.gradle          | 3 ++-
 server/gradle.properties     | 2 +-
 server/pxf-hdfs/build.gradle | 1 +
 3 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/server/build.gradle b/server/build.gradle
index 499a0b72..a1b6275c 100644
--- a/server/build.gradle
+++ b/server/build.gradle
@@ -164,7 +164,7 @@ configure(javaProjects) {
             }
 
             // Parquet dependencies
-            dependency("org.apache.parquet:parquet-format:2.7.0")
+            dependency("org.apache.parquet:parquet-format:2.11.0")
             dependencySet(group:"org.apache.parquet", version:"${parquetVersion}") {
                 entry("parquet-column")
                 entry("parquet-common")
@@ -173,6 +173,7 @@ configure(javaProjects) {
                 entry("parquet-hadoop")
                 entry("parquet-jackson")
                 entry("parquet-pig")
+                entry("parquet-format-structures")
             }
 
             // Thrift dependencies
diff --git a/server/gradle.properties b/server/gradle.properties
index eb6191df..960ef7c9 100644
--- a/server/gradle.properties
+++ b/server/gradle.properties
@@ -23,7 +23,7 @@ hiveVersion=2.3.8
 hiveStorageApiVersion=2.7.3
 hbaseVersion=1.3.2
 junitVersion=4.11
-parquetVersion=1.11.1
+parquetVersion=1.15.2
 awsJavaSdk=1.12.261
 springBootVersion=2.7.18
 org.gradle.daemon=true
diff --git a/server/pxf-hdfs/build.gradle b/server/pxf-hdfs/build.gradle
index 673e528e..9705fb6f 100644
--- a/server/pxf-hdfs/build.gradle
+++ b/server/pxf-hdfs/build.gradle
@@ -38,6 +38,7 @@ dependencies {
     implementation("org.apache.hadoop:hadoop-hdfs") { transitive = false }
     implementation("org.apache.hadoop:hadoop-hdfs-client") { transitive = false }
     implementation("org.apache.parquet:parquet-format") { transitive = false }
+    implementation("org.apache.parquet:parquet-format-structures") { transitive = false }
     implementation("org.apache.parquet:parquet-column") { transitive = false }
     implementation("org.apache.parquet:parquet-common") { transitive = false }
     implementation("org.apache.parquet:parquet-encoding") { transitive = false }

From 7d17b9d208633472b5ca5906e5a2a81b30de52cd Mon Sep 17 00:00:00 2001
From: Nikolay Antonov
Date: Wed, 4 Feb 2026 16:06:13 +0500
Subject: [PATCH 2/5] parquet-1.14.4

---
 server/gradle.properties | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/server/gradle.properties b/server/gradle.properties
index 960ef7c9..0696a23e 100644
--- a/server/gradle.properties
+++ b/server/gradle.properties
@@ -23,7 +23,7 @@ hiveVersion=2.3.8
 hiveStorageApiVersion=2.7.3
 hbaseVersion=1.3.2
 junitVersion=4.11
-parquetVersion=1.15.2
+parquetVersion=1.14.4
 awsJavaSdk=1.12.261
 springBootVersion=2.7.18
 org.gradle.daemon=true

From e72f3de5da108cdcf143784ba30ab0d1e5440539 Mon Sep 17 00:00:00 2001
From: Nikolay Antonov
Date: Wed, 4 Feb 2026 20:19:50 +0500
Subject: [PATCH 3/5] 1.13.1

---
 server/gradle.properties | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/server/gradle.properties b/server/gradle.properties
index 0696a23e..2a9a452b 100644
--- a/server/gradle.properties
+++ b/server/gradle.properties
@@ -23,7 +23,7 @@ hiveVersion=2.3.8
 hiveStorageApiVersion=2.7.3
 hbaseVersion=1.3.2
 junitVersion=4.11
-parquetVersion=1.14.4
+parquetVersion=1.13.1
 awsJavaSdk=1.12.261
 springBootVersion=2.7.18
 org.gradle.daemon=true

From 9edd2e860a7c036cc6f475d6a2ea4ebeb4426237 Mon Sep 17 00:00:00 2001
From: Nikolay Antonov
Date: Thu, 5 Feb 2026 14:33:36 +0500
Subject: [PATCH 4/5] 1.12.3 =/

---
 server/gradle.properties | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/server/gradle.properties b/server/gradle.properties
index 2a9a452b..42da880a 100644
--- a/server/gradle.properties
+++ b/server/gradle.properties
@@ -23,7 +23,7 @@ hiveVersion=2.3.8
 hiveStorageApiVersion=2.7.3
 hbaseVersion=1.3.2
 junitVersion=4.11
-parquetVersion=1.13.1
+parquetVersion=1.12.3
 awsJavaSdk=1.12.261
 springBootVersion=2.7.18
 org.gradle.daemon=true

From c50e2ce6f82d515a0970d36e58b8e370e3b94d5f Mon Sep 17 00:00:00 2001
From: Nikolay Antonov
Date: Thu, 5 Feb 2026 23:10:15 +0500
Subject: [PATCH 5/5] Add tests for Parquet compression

---
 .../features/parquet/ParquetWriteTest.java | 15 +++++++++++++++
 docs/content/hdfs_parquet.html.md.erb      |  4 ++--
 2 files changed, 17 insertions(+), 2 deletions(-)

diff --git a/automation/src/test/java/org/apache/cloudberry/pxf/automation/features/parquet/ParquetWriteTest.java b/automation/src/test/java/org/apache/cloudberry/pxf/automation/features/parquet/ParquetWriteTest.java
index 358d2233..18218eac 100644
--- a/automation/src/test/java/org/apache/cloudberry/pxf/automation/features/parquet/ParquetWriteTest.java
+++ b/automation/src/test/java/org/apache/cloudberry/pxf/automation/features/parquet/ParquetWriteTest.java
@@ -195,6 +195,21 @@ public void parquetWritePrimitivesGZipClassName() throws Exception {
         runWritePrimitivesScenario("pxf_parquet_write_primitives_gzip_classname", "pxf_parquet_read_primitives_gzip_classname", "parquet_write_primitives_gzip_classname", new String[]{"COMPRESSION_CODEC=org.apache.hadoop.io.compress.GzipCodec"});
     }
 
+    @Test(groups = {"features", "gpdb", "security", "hcfs"})
+    public void parquetWritePrimitivesSnappy() throws Exception {
+        runWritePrimitivesScenario("pxf_parquet_write_primitives_snappy", "pxf_parquet_read_primitives_snappy", "parquet_write_primitives_snappy", new String[]{"COMPRESSION_CODEC=snappy"});
+    }
+
+    @Test(groups = {"features", "gpdb", "security", "hcfs"})
+    public void parquetWritePrimitivesUncompressed() throws Exception {
+        runWritePrimitivesScenario("pxf_parquet_write_primitives_uncompressed", "pxf_parquet_read_primitives_uncompressed", "parquet_write_primitives_uncompressed", new String[]{"COMPRESSION_CODEC=uncompressed"});
+    }
+
+    @Test(groups = {"features", "gpdb", "security", "hcfs"})
+    public void parquetWritePrimitivesZStd() throws Exception {
+        runWritePrimitivesScenario("pxf_parquet_write_primitives_zstd", "pxf_parquet_read_primitives_zstd", "parquet_write_primitives_zstd", new String[]{"COMPRESSION_CODEC=zstd"});
+    }
+
     // Numeric precision not defined, test writing data precision in [1, 38]. All the data should be written correctly.
     @Test(groups = {"features", "gpdb", "security", "hcfs"})
     public void parquetWriteUndefinedPrecisionNumeric() throws Exception {
diff --git a/docs/content/hdfs_parquet.html.md.erb b/docs/content/hdfs_parquet.html.md.erb
index 26ee4817..9ad05b78 100644
--- a/docs/content/hdfs_parquet.html.md.erb
+++ b/docs/content/hdfs_parquet.html.md.erb
@@ -23,7 +23,7 @@ under the License.
 
 Use the PXF HDFS connector to read and write Parquet-format data. This section describes how to read and write HDFS files that are stored in Parquet format, including how to create, query, and insert into external tables that reference files in the HDFS data store.
 
-PXF supports reading or writing Parquet files compressed with these codecs: `snappy`, `gzip`, and `lzo`.
+PXF supports reading or writing Parquet files compressed with these codecs: `snappy`, `gzip`, and `zstd`.
 
 PXF currently supports reading and writing primitive Parquet data types only.
 
@@ -182,7 +182,7 @@ The PXF `hdfs:parquet` profile supports encoding- and compression-related write
 
 | Write Option | Value Description |
 |-------|-------------------------------------|
-| COMPRESSION_CODEC | The compression codec alias. Supported compression codecs for writing Parquet data include: `snappy`, `gzip`, `lzo`, and `uncompressed` . If this option is not provided, PXF compresses the data using `snappy` compression. |
+| COMPRESSION_CODEC | The compression codec alias. Supported compression codecs for writing Parquet data include: `snappy`, `gzip`, `zstd`, and `uncompressed` . If this option is not provided, PXF compresses the data using `snappy` compression. |
 | ROWGROUP_SIZE | A Parquet file consists of one or more row groups, a logical partitioning of the data into rows. `ROWGROUP_SIZE` identifies the size (in bytes) of the row group. The default row group size is `8 * 1024 * 1024` bytes. |
 | PAGE_SIZE | A row group consists of column chunks that are divided up into pages. `PAGE_SIZE` is the size (in bytes) of such a page. The default page size is `1 * 1024 * 1024` bytes. |
 | ENABLE\_DICTIONARY | A boolean value that specifies whether or not to enable dictionary encoding. The default value is `true`; dictionary encoding is enabled when PXF writes Parquet files. |
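For reference, a minimal sketch of how the `zstd` codec exercised by the new tests and documented in the table above would be used through the `hdfs:parquet` profile. The table names, column list, and HDFS path are illustrative placeholders, not objects created by this patch series:

```sql
-- Hypothetical writable external table: PXF writes Parquet files compressed with zstd.
CREATE WRITABLE EXTERNAL TABLE pxf_parquet_zstd_write (id int, name text, amt numeric)
  LOCATION ('pxf://data/pxf_examples/parquet_zstd?PROFILE=hdfs:parquet&COMPRESSION_CODEC=zstd')
  FORMAT 'CUSTOM' (FORMATTER='pxfwritable_export');

INSERT INTO pxf_parquet_zstd_write VALUES (1, 'alpha', 1.23), (2, 'beta', 4.56);

-- Hypothetical readable external table over the same path to verify the round trip.
CREATE EXTERNAL TABLE pxf_parquet_zstd_read (id int, name text, amt numeric)
  LOCATION ('pxf://data/pxf_examples/parquet_zstd?PROFILE=hdfs:parquet')
  FORMAT 'CUSTOM' (FORMATTER='pxfwritable_import');

SELECT * FROM pxf_parquet_zstd_read ORDER BY id;
```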