-
Notifications
You must be signed in to change notification settings - Fork 3
#649 Add ability to select how key is serialized in Kafka Avro source. #652
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -19,6 +19,7 @@ package za.co.absa.pramen.extras.source | |
| import com.typesafe.config.Config | ||
| import org.apache.spark.sql.SparkSession | ||
| import org.apache.spark.sql.functions.{col, struct} | ||
| import org.apache.spark.sql.types.StringType | ||
| import org.slf4j.LoggerFactory | ||
| import za.co.absa.abris.avro.functions.from_avro | ||
| import za.co.absa.abris.config.AbrisConfig | ||
|
|
@@ -48,9 +49,15 @@ import java.time.LocalDate | |
| * | ||
| * # [Optional] Set name for the struct field that contains Kafka record metadata | ||
| * custom.kafka.column = "kafka" | ||
| * | ||
| * # [Optional] Set name for the Kafka key column | ||
| * key.column.name = "kafka_key" | ||
| * | ||
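| * # [Optional] How the Kafka key is decoded when 'key.naming.strategy' is NOT defined. Can be one of: "none" (no key column is added), "binary" (the raw key bytes are kept), "string" (the key is cast to a string). | ||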
| * # When 'key.naming.strategy' IS defined in 'schema.registry', Avro deserialization is used automatically. | ||
| * # Default is "binary". | ||
| * #key.column.serializer = "none" | ||
| * | ||
| * kafka { | ||
| * bootstrap.servers = "mybroker1:9092,mybroker2:9092" | ||
| * | ||
|
|
@@ -66,6 +73,7 @@ import java.time.LocalDate | |
| * | ||
| * # Can be one of: topic.name, record.name, topic.record.name | ||
| * value.naming.strategy = "topic.name" | ||
| * #key.naming.strategy = "topic.name" | ||
| * | ||
| * # If you want to force the specific schema id. Otherwise, the latest schema id will be used. | ||
| * # key.schema.id = | ||
|
|
@@ -114,6 +122,9 @@ class KafkaAvroSource(sourceConfig: Config, | |
|
|
||
| private val kafkaColumnName = ConfigUtils.getOptionString(sourceConfig, CUSTOM_KAFKA_COLUMN_KEY).getOrElse("kafka") | ||
| private val keyColumnName = ConfigUtils.getOptionString(sourceConfig, KEY_COLUMN_KEY).getOrElse("kafka_key") | ||
| private val keyColumnSerializer = ConfigUtils.getOptionString(sourceConfig, KEY_COLUMN_SERIALIZER_KEY).getOrElse("binary").toLowerCase.trim | ||
| private val tempKafkaColumnName = "tmp_pramen_kafka" | ||
| private val tempKafkaKeyColumnName = "tmp_pramen_kafka_key" | ||
|
|
||
| override def hasInfoDateColumn(query: Query): Boolean = false | ||
|
|
||
|
|
@@ -206,32 +217,46 @@ class KafkaAvroSource(sourceConfig: Config, | |
| col("timestampType").as("timestamp_type") | ||
| )) | ||
|
|
||
| val hasKey = kafkaAvroConfig.keyNamingStrategy.isDefined || keyColumnSerializer != "none" | ||
|
|
||
| val df2 = kafkaAvroConfig.keyNamingStrategy match { | ||
| case Some(keyNamingStrategy) => | ||
| val abrisKeyConfig = keyNamingStrategy | ||
| .applyNamingStrategyToAbrisConfig(abrisValueBase, topic, isKey = true) | ||
| .usingSchemaRegistry(schemaRegistryClientConfig) | ||
| df1.withColumn("tmp_pramen_kafka_key", from_avro(col("key"), abrisKeyConfig)) | ||
| df1.withColumn(tempKafkaKeyColumnName, from_avro(col("key"), abrisKeyConfig)) | ||
| case None => | ||
| df1.withColumn("tmp_pramen_kafka_key", col("key")) | ||
| keyColumnSerializer match { | ||
| case "none" => df1 | ||
| case "binary" => df1.withColumn(tempKafkaKeyColumnName, col("key")) | ||
| case "string" => df1.withColumn(tempKafkaKeyColumnName, col("key").cast(StringType)) | ||
| case "avro" => throw new IllegalArgumentException("For the 'avro' serializer of Kafka topic key, 'schema.registry.key.naming.strategy' needs to be set.") | ||
| case x => throw new IllegalArgumentException(s"Unknown Kafka key serializer '$x'. Can be one of: none, binary, string, avro.") | ||
| } | ||
coderabbitai[bot] marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| } | ||
|
|
||
| val payloadFields = df2.select("data.*").schema.fieldNames.toSet | ||
| if (payloadFields.contains(kafkaColumnName)) { | ||
| log.warn(s"Payload field '$kafkaColumnName' conflicts with Kafka metadata struct name and will be replaced.") | ||
| log.warn(s"Payload field '$kafkaColumnName' conflicts with reserved Kafka metadata struct name and will be replaced.") | ||
| } | ||
| if (payloadFields.contains(keyColumnName)) { | ||
| log.warn(s"Payload field '$keyColumnName' conflicts with Kafka key column name and will be replaced.") | ||
| log.warn(s"Payload field '$keyColumnName' conflicts with reserved Kafka key column name and will be removed.") | ||
| } | ||
|
|
||
| // Put data fields to the root level of the schema, and if data struct already has kafka_key and kafka fields, | ||
| // drop them | ||
| val dfFinal = df2 | ||
| .select("tmp_pramen_kafka_key", "data.*", "tmp_pramen_kafka") | ||
| .drop(kafkaColumnName) | ||
| .drop(keyColumnName) | ||
| .withColumnRenamed("tmp_pramen_kafka", kafkaColumnName) | ||
| .withColumnRenamed("tmp_pramen_kafka_key", keyColumnName) | ||
| val dfFinal = if (hasKey) { | ||
| df2.select(tempKafkaKeyColumnName, "data.*", tempKafkaColumnName) | ||
| .drop(kafkaColumnName) | ||
| .drop(keyColumnName) | ||
| .withColumnRenamed(tempKafkaColumnName, kafkaColumnName) | ||
| .withColumnRenamed(tempKafkaKeyColumnName, keyColumnName) | ||
| } else { | ||
| df2.select("data.*", tempKafkaColumnName) | ||
| .drop(kafkaColumnName) | ||
| .drop(keyColumnName) | ||
| .withColumnRenamed(tempKafkaColumnName, kafkaColumnName) | ||
| } | ||
|
|
||
| SourceResult(dfFinal) | ||
| } | ||
|
|
@@ -243,6 +268,7 @@ object KafkaAvroSource extends ExternalChannelFactoryV2[KafkaAvroSource] { | |
| val TOPIC_NAME_KEY = "topic.name" | ||
| val CUSTOM_KAFKA_COLUMN_KEY = "custom.kafka.column" | ||
| val KEY_COLUMN_KEY = "key.column.name" | ||
| val KEY_COLUMN_SERIALIZER_KEY = "key.column.serializer" | ||
|
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
🧩 Analysis chain
Public constant looks good; ensure the config default is discoverable. Confirm that `reference.conf` documents a default for `key.column.serializer`.
🏁 Script executed:
#!/bin/bash
# Verify default presence and examples for key.column.serializer
fd -a reference.conf
rg -nC2 'key\.column\.serializer' -g 'pramen/**/reference.conf'
Length of output: 196
I need to inspect the reference.conf files directly to verify whether the configuration default is present.
Length of output: 1132
Let me verify more thoroughly by checking the full contents and looking for any Kafka configuration:
Length of output: 1120
Let me verify the file path and view the KafkaAvroSource code directly:
#!/bin/bash
# Find the correct KafkaAvroSource file
echo "=== Finding KafkaAvroSource.scala ==="
find /home/jailuser/git/pramen -name "KafkaAvroSource.scala" -type f
echo -e "\n=== View context around line 271 ==="
fd -e scala "KafkaAvroSource" /home/jailuser/git/pramen --exec cat {} \; | sed -n '265,280p'
🏁 Script executed:
#!/bin/bash
# Find KafkaAvroSource.scala and view context around line 271
find /home/jailuser/git/pramen -name "KafkaAvroSource.scala" -type f
echo -e "\n=== View lines 265-280 of KafkaAvroSource.scala ==="
find /home/jailuser/git/pramen -name "KafkaAvroSource.scala" -type f -exec sed -n '265,280p' {} +
echo -e "\n=== View surrounding context for KEY_COLUMN_SERIALIZER_KEY ==="
find /home/jailuser/git/pramen -name "KafkaAvroSource.scala" -type f -exec grep -nC5 "KEY_COLUMN_SERIALIZER" {} +
Length of output: 2062
Let me check if there are any Kafka configuration examples elsewhere in the codebase:
Length of output: 56619
Add Kafka configuration defaults to `reference.conf`. Verification confirms that `key.column.serializer` has no default entry in `reference.conf`.
🤖 Prompt for AI Agents |
||
|
|
||
| val KAFKA_TOKENS_TO_REDACT = Set("password", "jaas.config", "auth.user.info") | ||
|
|
||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.