diff --git a/automation/src/test/java/org/apache/cloudberry/pxf/automation/features/parquet/ParquetWriteTest.java b/automation/src/test/java/org/apache/cloudberry/pxf/automation/features/parquet/ParquetWriteTest.java
index 358d2233..18218eac 100644
--- a/automation/src/test/java/org/apache/cloudberry/pxf/automation/features/parquet/ParquetWriteTest.java
+++ b/automation/src/test/java/org/apache/cloudberry/pxf/automation/features/parquet/ParquetWriteTest.java
@@ -195,6 +195,21 @@ public void parquetWritePrimitivesGZipClassName() throws Exception {
         runWritePrimitivesScenario("pxf_parquet_write_primitives_gzip_classname", "pxf_parquet_read_primitives_gzip_classname", "parquet_write_primitives_gzip_classname", new String[]{"COMPRESSION_CODEC=org.apache.hadoop.io.compress.GzipCodec"});
     }
 
+    @Test(groups = {"features", "gpdb", "security", "hcfs"})
+    public void parquetWritePrimitivesSnappy() throws Exception {
+        runWritePrimitivesScenario("pxf_parquet_write_primitives_snappy", "pxf_parquet_read_primitives_snappy", "parquet_write_primitives_snappy", new String[]{"COMPRESSION_CODEC=snappy"});
+    }
+
+    @Test(groups = {"features", "gpdb", "security", "hcfs"})
+    public void parquetWritePrimitivesUncompressed() throws Exception {
+        runWritePrimitivesScenario("pxf_parquet_write_primitives_uncompressed", "pxf_parquet_read_primitives_uncompressed", "parquet_write_primitives_uncompressed", new String[]{"COMPRESSION_CODEC=uncompressed"});
+    }
+
+    @Test(groups = {"features", "gpdb", "security", "hcfs"})
+    public void parquetWritePrimitivesZStd() throws Exception {
+        runWritePrimitivesScenario("pxf_parquet_write_primitives_zstd", "pxf_parquet_read_primitives_zstd", "parquet_write_primitives_zstd", new String[]{"COMPRESSION_CODEC=zstd"});
+    }
+
     // Numeric precision not defined, test writing data precision in [1, 38]. All the data should be written correctly.
     @Test(groups = {"features", "gpdb", "security", "hcfs"})
     public void parquetWriteUndefinedPrecisionNumeric() throws Exception {
diff --git a/docs/content/hdfs_parquet.html.md.erb b/docs/content/hdfs_parquet.html.md.erb
index 26ee4817..9ad05b78 100644
--- a/docs/content/hdfs_parquet.html.md.erb
+++ b/docs/content/hdfs_parquet.html.md.erb
@@ -23,7 +23,7 @@ under the License.
 
 Use the PXF HDFS connector to read and write Parquet-format data. This section describes how to read and write HDFS files that are stored in Parquet format, including how to create, query, and insert into external tables that reference files in the HDFS data store.
 
-PXF supports reading or writing Parquet files compressed with these codecs: `snappy`, `gzip`, and `lzo`.
+PXF supports reading or writing Parquet files compressed with these codecs: `snappy`, `gzip`, and `zstd`.
 
 PXF currently supports reading and writing primitive Parquet data types only.
 
@@ -182,7 +182,7 @@ The PXF `hdfs:parquet` profile supports encoding- and compression-related write
 
 | Write Option | Value Description |
 |-------|-------------------------------------|
-| COMPRESSION_CODEC | The compression codec alias. Supported compression codecs for writing Parquet data include: `snappy`, `gzip`, `lzo`, and `uncompressed` . If this option is not provided, PXF compresses the data using `snappy` compression. |
+| COMPRESSION_CODEC | The compression codec alias. Supported compression codecs for writing Parquet data include: `snappy`, `gzip`, `zstd`, and `uncompressed`. If this option is not provided, PXF compresses the data using `snappy` compression. |
 | ROWGROUP_SIZE | A Parquet file consists of one or more row groups, a logical partitioning of the data into rows. `ROWGROUP_SIZE` identifies the size (in bytes) of the row group. The default row group size is `8 * 1024 * 1024` bytes. |
 | PAGE_SIZE | A row group consists of column chunks that are divided up into pages. `PAGE_SIZE` is the size (in bytes) of such a page. The default page size is `1 * 1024 * 1024` bytes. |
 | ENABLE\_DICTIONARY | A boolean value that specifies whether or not to enable dictionary encoding. The default value is `true`; dictionary encoding is enabled when PXF writes Parquet files. |
diff --git a/server/build.gradle b/server/build.gradle
index 499a0b72..a1b6275c 100644
--- a/server/build.gradle
+++ b/server/build.gradle
@@ -164,7 +164,7 @@ configure(javaProjects) {
             }
 
             // Parquet dependencies
-            dependency("org.apache.parquet:parquet-format:2.7.0")
+            dependency("org.apache.parquet:parquet-format:2.11.0")
             dependencySet(group:"org.apache.parquet", version:"${parquetVersion}") {
                 entry("parquet-column")
                 entry("parquet-common")
@@ -173,6 +173,7 @@
                 entry("parquet-hadoop")
                 entry("parquet-jackson")
                 entry("parquet-pig")
+                entry("parquet-format-structures")
             }
 
             // Thrift dependencies
diff --git a/server/gradle.properties b/server/gradle.properties
index eb6191df..42da880a 100644
--- a/server/gradle.properties
+++ b/server/gradle.properties
@@ -23,7 +23,7 @@ hiveVersion=2.3.8
 hiveStorageApiVersion=2.7.3
 hbaseVersion=1.3.2
 junitVersion=4.11
-parquetVersion=1.11.1
+parquetVersion=1.12.3
 awsJavaSdk=1.12.261
 springBootVersion=2.7.18
 org.gradle.daemon=true
diff --git a/server/pxf-hdfs/build.gradle b/server/pxf-hdfs/build.gradle
index 673e528e..9705fb6f 100644
--- a/server/pxf-hdfs/build.gradle
+++ b/server/pxf-hdfs/build.gradle
@@ -38,6 +38,7 @@ dependencies {
     implementation("org.apache.hadoop:hadoop-hdfs") { transitive = false }
     implementation("org.apache.hadoop:hadoop-hdfs-client") { transitive = false }
     implementation("org.apache.parquet:parquet-format") { transitive = false }
+    implementation("org.apache.parquet:parquet-format-structures") { transitive = false }
     implementation("org.apache.parquet:parquet-column") { transitive = false }
     implementation("org.apache.parquet:parquet-common") { transitive = false }
     implementation("org.apache.parquet:parquet-encoding") { transitive = false }
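A minimal sketch, outside the diff, of how the codec aliases exercised above are expected to resolve: parquet-hadoop maps a COMPRESSION_CODEC alias onto its CompressionCodecName enum, and the ZSTD entry of that enum is what the parquetVersion bump to 1.12.3 makes usable here. The class name CodecAliasCheck is hypothetical.

// Not part of the change set: a standalone sketch (CodecAliasCheck is a
// made-up name) checking that the aliases accepted by the new
// COMPRESSION_CODEC test values resolve in parquet-hadoop 1.12.x.
import org.apache.parquet.hadoop.metadata.CompressionCodecName;

public class CodecAliasCheck {
    public static void main(String[] args) {
        // fromConf() upper-cases the alias and looks it up in the enum;
        // a null alias falls back to UNCOMPRESSED.
        for (String alias : new String[]{"snappy", "gzip", "zstd", "uncompressed"}) {
            CompressionCodecName codec = CompressionCodecName.fromConf(alias);
            System.out.println(alias + " -> " + codec);
        }
    }
}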