Skip to content

Commit 96c80e3

Browse files
author
infvg
committed
Added iceberg write configs
1 parent 8b04f51 commit 96c80e3

4 files changed

Lines changed: 136 additions & 70 deletions

File tree

backends-velox/src/main/scala/org/apache/gluten/config/VeloxConfig.scala

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -751,4 +751,40 @@ object VeloxConfig extends ConfigRegistry {
751751
.doc("Maps table field names to file field names using names, not indices for Parquet files.")
752752
.booleanConf
753753
.createWithDefault(true)
754+
755+
val PARQUET_COMPRESSION_CODEC =
756+
buildConf("spark.gluten.sql.columnar.backend.velox.parquet.compression-codec")
757+
.doc("Compression codec to use for Parquet write operations.")
758+
.stringConf
759+
.createWithDefault("UNCOMPRESSED")
760+
761+
val PARQUET_COMPRESSION_LEVEL =
762+
buildConf("spark.gluten.sql.columnar.backend.velox.parquet.compression-level")
763+
.doc("Compression level for Parquet write operations.")
764+
.intConf
765+
.createOptional
766+
767+
// Key must stay identical to kWriteTargetFileSizeBytes in cpp/velox/config/VeloxConfig.h.
val VELOX_TARGET_FILE_SIZE =
768+
buildConf("spark.gluten.sql.columnar.backend.velox.target-file-size-bytes")
769+
.doc("Target file size in bytes for write operations.")
770+
.bytesConf(ByteUnit.BYTE)
771+
.createWithDefaultString("0")
772+
773+
val ICEBERG_WRITE_PARQUET_ROW_GROUP_SIZE_BYTES =
774+
buildConf("spark.gluten.sql.columnar.backend.velox.iceberg.write.parquet.row-group-size-bytes")
775+
.doc("Row group size in bytes for Iceberg Parquet write operations.")
776+
.bytesConf(ByteUnit.BYTE)
777+
.createWithDefaultString("128MB")
778+
779+
val ICEBERG_WRITE_PARQUET_PAGE_SIZE_BYTES =
780+
buildConf("spark.gluten.sql.columnar.backend.velox.iceberg.write.parquet.page-size-bytes")
781+
.doc("Page size in bytes for Iceberg Parquet write operations.")
782+
.bytesConf(ByteUnit.BYTE)
783+
.createWithDefaultString("1MB")
784+
785+
val ICEBERG_WRITE_PARQUET_PAGE_ROW_LIMIT =
786+
buildConf("spark.gluten.sql.columnar.backend.velox.iceberg.write.parquet.page-row-limit")
787+
.doc("Maximum number of rows per page for Iceberg Parquet write operations.")
788+
.intConf
789+
.createWithDefault(20000)
754790
}

cpp/velox/config/VeloxConfig.h

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -166,9 +166,24 @@ const std::string kMemoryPoolCapacityTransferAcrossTasks =
166166
const std::string kOrcUseColumnNames = "spark.gluten.sql.columnar.backend.velox.orcUseColumnNames";
167167
const std::string kParquetUseColumnNames = "spark.gluten.sql.columnar.backend.velox.parquetUseColumnNames";
168168

169-
// write fies
169+
// write files
170170
const std::string kMaxPartitions = "spark.gluten.sql.columnar.backend.velox.maxPartitionsPerWritersSession";
171171

172+
// Iceberg write configs
173+
const std::string kWriteTargetFileSizeBytes =
174+
"spark.gluten.sql.columnar.backend.velox.target-file-size-bytes";
175+
const std::string kWriteParquetCompressionCodec =
176+
"spark.gluten.sql.columnar.backend.velox.parquet.compression-codec";
177+
const std::string kWriteParquetCompressionLevel =
178+
"spark.gluten.sql.columnar.backend.velox.parquet.compression-level";
179+
180+
// Keys below must match the ICEBERG_WRITE_PARQUET_* entries registered in the Scala VeloxConfig.
const std::string kWriteParquetRowGroupSizeBytes =
181+
"spark.gluten.sql.columnar.backend.velox.iceberg.write.parquet.row-group-size-bytes";
182+
const std::string kWriteParquetPageSizeBytes =
183+
"spark.gluten.sql.columnar.backend.velox.iceberg.write.parquet.page-size-bytes";
184+
const std::string kWriteParquetPageRowLimit =
185+
"spark.gluten.sql.columnar.backend.velox.iceberg.write.parquet.page-row-limit";
186+
172187
const std::string kGlogVerboseLevel = "spark.gluten.sql.columnar.backend.velox.glogVerboseLevel";
173188
const uint32_t kGlogVerboseLevelDefault = 0;
174189
const uint32_t kGlogVerboseLevelMaximum = 99;

cpp/velox/utils/ConfigExtractor.cc

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -240,6 +240,15 @@ std::shared_ptr<facebook::velox::config::ConfigBase> createHiveConnectorSessionC
240240
configs[facebook::velox::connector::hive::HiveConfig::kOrcUseColumnNamesSession] =
241241
conf->get<bool>(kOrcUseColumnNames, true) ? "true" : "false";
242242

243+
configs[facebook::velox::connector::hive::HiveConfig::kMaxTargetFileSize] =
244+
conf->get<std::string>(kWriteTargetFileSizeBytes, "0");
245+
// The page-size session option takes the page-size setting, not the compression codec name.
// NOTE(review): kWriteParquetCompressionCodec is not forwarded here; wire it to the proper
// compression option once the correct Velox key is confirmed.
configs[facebook::velox::parquet::WriterOptions::kParquetSessionWritePageSize] =
246+
conf->get<std::string>(kWriteParquetPageSizeBytes, "1MB");
247+
// configs[facebook::velox::parquet::WriterOptions::kParquetHiveConnectorCompressionLevel] =
248+
// conf->get<std::string>(kWriteParquetCompressionLevel, "0");
249+
// configs[facebook::velox::parquet::DefaultFlushPolicy::kDefaultBytesInRowGroup] =
250+
// conf->get<std::string>(kWriteParquetRowGroupSizeBytes, "128MB");
251+
243252
overwriteVeloxConf(conf.get(), configs, kDynamicBackendConfPrefix);
244253
return std::make_shared<facebook::velox::config::ConfigBase>(std::move(configs));
245254
}

0 commit comments

Comments
 (0)