diff --git a/streams/integration-tests/src/test/java/org/apache/kafka/streams/integration/SelfManagedOffsetLifecycleIntegrationTest.java b/streams/integration-tests/src/test/java/org/apache/kafka/streams/integration/SelfManagedOffsetLifecycleIntegrationTest.java new file mode 100644 index 0000000000000..065ad95632e04 --- /dev/null +++ b/streams/integration-tests/src/test/java/org/apache/kafka/streams/integration/SelfManagedOffsetLifecycleIntegrationTest.java @@ -0,0 +1,519 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.kafka.streams.integration; + +import org.apache.kafka.clients.consumer.ConsumerConfig; +import org.apache.kafka.clients.producer.ProducerConfig; +import org.apache.kafka.common.TopicPartition; +import org.apache.kafka.common.serialization.LongDeserializer; +import org.apache.kafka.common.serialization.Serdes; +import org.apache.kafka.common.serialization.StringDeserializer; +import org.apache.kafka.common.serialization.StringSerializer; +import org.apache.kafka.streams.KafkaStreams; +import org.apache.kafka.streams.KeyValue; +import org.apache.kafka.streams.StoreQueryParameters; +import org.apache.kafka.streams.StreamsBuilder; +import org.apache.kafka.streams.StreamsConfig; +import org.apache.kafka.streams.integration.utils.EmbeddedKafkaCluster; +import org.apache.kafka.streams.integration.utils.IntegrationTestUtils; +import org.apache.kafka.streams.kstream.KStream; +import org.apache.kafka.streams.kstream.Materialized; +import org.apache.kafka.streams.processor.StateRestoreListener; +import org.apache.kafka.streams.state.QueryableStoreTypes; +import org.apache.kafka.streams.state.ReadOnlyKeyValueStore; +import org.apache.kafka.streams.state.internals.RocksDBStoreTestingUtils; +import org.apache.kafka.test.TestUtils; + +import org.junit.jupiter.api.AfterAll; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Tag; +import org.junit.jupiter.api.TestInfo; +import org.junit.jupiter.api.Timeout; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.ValueSource; + +import java.io.File; +import java.io.IOException; +import java.time.Duration; +import java.util.Arrays; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Properties; +import java.util.concurrent.atomic.AtomicLong; + +import static org.apache.kafka.streams.utils.TestUtils.safeUniqueTestName; +import static org.junit.jupiter.api.Assertions.assertEquals; + +/** + * Integration tests for KIP-1035 column family offset normal lifecycle. + * + *

Validates that offsets stored in RocksDB column families are correctly persisted on + * clean shutdown and read back on restart. After each clean stop, tests directly inspect + * the CF to assert status=closed and offsets populated. On restart, a + * {@code StateRestoreListener} verifies no changelog restoration occurs, and cumulative + * count assertions confirm state continuity. All tests are parameterized for ALOS and EOS. + */ +@Tag("integration") +@Timeout(600) +public class SelfManagedOffsetLifecycleIntegrationTest { + + private static final int NUM_BROKERS = 1; + private static final int NUM_PARTITIONS = 3; + private static final String INPUT_TOPIC = "input-topic"; + private static final String OUTPUT_TOPIC = "output-topic"; + private static final String STORE_NAME = "counts-store"; + private static final long COMMIT_INTERVAL_MS = 100L; + private static final Duration STREAMS_CLOSE_TIMEOUT = Duration.ofSeconds(5); + + private static final EmbeddedKafkaCluster CLUSTER = new EmbeddedKafkaCluster(NUM_BROKERS); + + private int consumerGroupCounter = 0; + + private Properties streamsConfig; + private KafkaStreams streams; + private File stateDir; + + @BeforeAll + public static void startCluster() throws IOException, InterruptedException { + CLUSTER.start(); + } + + @AfterAll + public static void stopCluster() { + CLUSTER.stop(); + } + + @BeforeEach + public void setUp(final TestInfo testInfo) throws InterruptedException { + CLUSTER.deleteAllTopics(); + CLUSTER.createTopic(INPUT_TOPIC, NUM_PARTITIONS, 1); + CLUSTER.createTopic(OUTPUT_TOPIC, NUM_PARTITIONS, 1); + + final String safeTestName = safeUniqueTestName(testInfo); + + streamsConfig = new Properties(); + streamsConfig.put(StreamsConfig.APPLICATION_ID_CONFIG, "app-" + safeTestName); + streamsConfig.put(StreamsConfig.BOOTSTRAP_SERVERS_CONFIG, CLUSTER.bootstrapServers()); + streamsConfig.put(StreamsConfig.DEFAULT_KEY_SERDE_CLASS_CONFIG, Serdes.StringSerde.class); + streamsConfig.put(StreamsConfig.DEFAULT_VALUE_SERDE_CLASS_CONFIG, Serdes.StringSerde.class); + stateDir = TestUtils.tempDirectory(); + streamsConfig.put(StreamsConfig.STATE_DIR_CONFIG, stateDir.getPath()); + streamsConfig.put(StreamsConfig.STATESTORE_CACHE_MAX_BYTES_CONFIG, 0); + streamsConfig.put(StreamsConfig.COMMIT_INTERVAL_MS_CONFIG, COMMIT_INTERVAL_MS); + streamsConfig.put(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG, "earliest"); + streamsConfig.put(ConsumerConfig.SESSION_TIMEOUT_MS_CONFIG, 5000); + streamsConfig.put(ConsumerConfig.HEARTBEAT_INTERVAL_MS_CONFIG, 1000); + } + + @AfterEach + public void tearDown() { + if (streams != null) { + closeStreams(streams); + streams.cleanUp(); + } + } + + private StreamsBuilder buildCountTopology() { + final StreamsBuilder builder = new StreamsBuilder(); + final KStream stream = builder.stream(INPUT_TOPIC); + stream + .groupByKey() + .count(Materialized.as(STORE_NAME)) + .toStream() + .to(OUTPUT_TOPIC); + return builder; + } + + private void closeStreams(final KafkaStreams kafkaStreams) { + kafkaStreams.close(STREAMS_CLOSE_TIMEOUT); + } + + private KafkaStreams startStreams() throws Exception { + return startStreams(false); + } + + private KafkaStreams startStreams(final boolean cleanUp) throws Exception { + final StreamsBuilder builder = buildCountTopology(); + streams = new KafkaStreams(builder.build(), streamsConfig); + if (cleanUp) { + streams.cleanUp(); + } + streams.start(); + waitForRunning(streams); + return streams; + } + + private KafkaStreams startStreamsWithRestoreListener(final StateRestoreListener listener) throws Exception { + final StreamsBuilder builder = buildCountTopology(); + streams = new KafkaStreams(builder.build(), streamsConfig); + streams.setGlobalStateRestoreListener(listener); + streams.start(); + waitForRunning(streams); + return streams; + } + + private void waitForRunning(final KafkaStreams kafkaStreams) throws Exception { + TestUtils.waitForCondition( + () -> kafkaStreams.state().equals(KafkaStreams.State.RUNNING), + Duration.ofSeconds(60).toMillis(), + () -> "Expected RUNNING state but was " + kafkaStreams.state() + ); + } + + private Properties producerConfig() { + final Properties props = new Properties(); + props.put(ProducerConfig.BOOTSTRAP_SERVERS_CONFIG, CLUSTER.bootstrapServers()); + props.put(ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG, StringSerializer.class.getName()); + props.put(ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG, StringSerializer.class.getName()); + return props; + } + + private Properties readCommittedConsumerConfig() { + final Properties props = new Properties(); + props.put(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG, CLUSTER.bootstrapServers()); + props.put(ConsumerConfig.GROUP_ID_CONFIG, "verify-consumer-" + consumerGroupCounter++); + props.put(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG, "earliest"); + props.put(ConsumerConfig.ISOLATION_LEVEL_CONFIG, "read_committed"); + props.put(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, StringDeserializer.class.getName()); + props.put(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, LongDeserializer.class.getName()); + return props; + } + + private void produceRecords(final List> records) { + IntegrationTestUtils.produceKeyValuesSynchronously( + INPUT_TOPIC, + records, + producerConfig(), + CLUSTER.time + ); + } + + private List> waitForOutput(final int expectedCount) throws Exception { + return IntegrationTestUtils.waitUntilMinKeyValueRecordsReceived( + readCommittedConsumerConfig(), + OUTPUT_TOPIC, + expectedCount + ); + } + + /** + * Extracts the latest count for each key from the output records. + * Since the count store emits updates, the last value for each key is the current count. + */ + private Map latestCountsFromOutput(final List> output) { + final Map latest = new HashMap<>(); + for (final KeyValue record : output) { + latest.put(record.key, record.value); + } + return latest; + } + + /** + * Queries the state store via interactive queries and returns all key-value pairs. + */ + private Map queryStore(final KafkaStreams kafkaStreams) throws Exception { + final ReadOnlyKeyValueStore store = kafkaStreams.store( + StoreQueryParameters.fromNameAndType(STORE_NAME, QueryableStoreTypes.keyValueStore()) + ); + final Map result = new HashMap<>(); + try (var iter = store.all()) { + while (iter.hasNext()) { + final KeyValue kv = iter.next(); + result.put(kv.key, kv.value); + } + } + return result; + } + + // ----------------------------------------------------------- + // Column family inspection helpers + // ----------------------------------------------------------- + + private List findAllStoreDirs(final String storeName) { + final String appId = streamsConfig.getProperty(StreamsConfig.APPLICATION_ID_CONFIG); + return RocksDBStoreTestingUtils.findAllStoreDirs(stateDir, appId, storeName); + } + + /** + * Asserts that all store directories have the expected status (0L = closed, 1L = open). + */ + private void assertStoreStatus(final long expectedStatus) throws Exception { + for (final File storeDir : findAllStoreDirs(STORE_NAME)) { + final Long status = RocksDBStoreTestingUtils.readStoreStatus(storeDir); + assertEquals(expectedStatus, status, + "Store status in " + storeDir + " should be " + (expectedStatus == 0L ? "closed" : "open")); + } + } + + /** + * Asserts that all store directories have non-empty offsets in the CF. + */ + private void assertOffsetsPopulated() throws Exception { + for (final File storeDir : findAllStoreDirs(STORE_NAME)) { + final Map offsets = RocksDBStoreTestingUtils.readOffsets(storeDir); + if (!offsets.isEmpty()) { + return; // At least one store dir has offsets — with partitioning, not all may + } + } + throw new AssertionError("Expected at least one store directory to have populated offsets"); + } + + /** + * Start, produce, stop cleanly, restart (no cleanUp), produce more. + * Counts should be cumulative, proving offsets in the CF were persisted and read back. + */ + @ParameterizedTest + @ValueSource(strings = {StreamsConfig.AT_LEAST_ONCE, StreamsConfig.EXACTLY_ONCE_V2}) + public void shouldPreserveStateAcrossCleanRestart(final String processingGuarantee) throws Exception { + streamsConfig.put(StreamsConfig.PROCESSING_GUARANTEE_CONFIG, processingGuarantee); + + final List> batch1 = Arrays.asList( + new KeyValue<>("A", "v1"), + new KeyValue<>("B", "v1"), + new KeyValue<>("A", "v2") + ); + + startStreams(true); + produceRecords(batch1); + waitForOutput(batch1.size()); + + // Clean stop — verify CF state, then restart without cleanUp + closeStreams(streams); + streams = null; + + assertStoreStatus(0L); + assertOffsetsPopulated(); + + // Restart without cleanUp so local state is preserved + startStreams(false); + + final List> batch2 = Arrays.asList( + new KeyValue<>("A", "v3"), + new KeyValue<>("B", "v2") + ); + produceRecords(batch2); + + final List> allOutput = waitForOutput(batch1.size() + batch2.size()); + final Map counts = latestCountsFromOutput(allOutput); + + // A: 3 total (v1, v2, v3), B: 2 total (v1, v2) + assertEquals(3L, counts.get("A"), "A count should be cumulative across restart"); + assertEquals(2L, counts.get("B"), "B count should be cumulative across restart"); + } + + /** + * Multiple restart cycles: start, produce batch1, stop, restart, produce batch2, + * stop, restart, produce batch3. Final counts should equal totals across all 3 batches. + */ + @ParameterizedTest + @ValueSource(strings = {StreamsConfig.AT_LEAST_ONCE, StreamsConfig.EXACTLY_ONCE_V2}) + public void shouldPreserveStateAcrossMultipleRestartCycles(final String processingGuarantee) throws Exception { + streamsConfig.put(StreamsConfig.PROCESSING_GUARANTEE_CONFIG, processingGuarantee); + + // Cycle 1 + startStreams(true); + final List> batch1 = Arrays.asList( + new KeyValue<>("A", "v1"), + new KeyValue<>("B", "v1") + ); + produceRecords(batch1); + waitForOutput(batch1.size()); + closeStreams(streams); + streams = null; + + // Cycle 2 + startStreams(false); + final List> batch2 = Arrays.asList( + new KeyValue<>("A", "v2"), + new KeyValue<>("C", "v1") + ); + produceRecords(batch2); + waitForOutput(batch1.size() + batch2.size()); + closeStreams(streams); + streams = null; + + // Cycle 3 + startStreams(false); + final List> batch3 = Arrays.asList( + new KeyValue<>("A", "v3"), + new KeyValue<>("B", "v2"), + new KeyValue<>("C", "v2") + ); + produceRecords(batch3); + + final int totalRecords = batch1.size() + batch2.size() + batch3.size(); + final List> allOutput = waitForOutput(totalRecords); + final Map counts = latestCountsFromOutput(allOutput); + + // A: 3 (v1, v2, v3), B: 2 (v1, v2), C: 2 (v1, v2) + assertEquals(3L, counts.get("A"), "A count across 3 cycles"); + assertEquals(2L, counts.get("B"), "B count across 3 cycles"); + assertEquals(2L, counts.get("C"), "C count across 3 cycles"); + } + + /** + * After a clean shutdown, restarting should not require full changelog restoration. + * Uses a TrackingRestoreListener to verify no records are restored on restart. + */ + @ParameterizedTest + @ValueSource(strings = {StreamsConfig.AT_LEAST_ONCE, StreamsConfig.EXACTLY_ONCE_V2}) + public void shouldNotRestoreFromChangelogOnCleanRestart(final String processingGuarantee) throws Exception { + streamsConfig.put(StreamsConfig.PROCESSING_GUARANTEE_CONFIG, processingGuarantee); + + final List> batch1 = Arrays.asList( + new KeyValue<>("A", "v1"), + new KeyValue<>("B", "v1"), + new KeyValue<>("A", "v2") + ); + + startStreams(true); + produceRecords(batch1); + waitForOutput(batch1.size()); + + closeStreams(streams); + streams = null; + + // Verify CF state after clean shutdown: status=closed, offsets populated + assertStoreStatus(0L); + assertOffsetsPopulated(); + + // Restart with a restore listener — should see 0 records restored + final TrackingRestoreListener restoreListener = new TrackingRestoreListener(); + startStreamsWithRestoreListener(restoreListener); + + final List> batch2 = Arrays.asList( + new KeyValue<>("A", "v3"), + new KeyValue<>("C", "v1") + ); + produceRecords(batch2); + + final List> allOutput = waitForOutput(batch1.size() + batch2.size()); + final Map counts = latestCountsFromOutput(allOutput); + + assertEquals(3L, counts.get("A"), "A count should be cumulative"); + assertEquals(1L, counts.get("B"), "B count should be preserved"); + assertEquals(1L, counts.get("C"), "C count should reflect new record"); + assertEquals(0L, restoreListener.totalRestored.get(), + "No records should be restored from changelog after clean shutdown"); + } + + /** + * Edge case: start, reach RUNNING, stop cleanly without producing any records, + * then restart and produce. The store was initialized but never committed. + */ + @ParameterizedTest + @ValueSource(strings = {StreamsConfig.AT_LEAST_ONCE, StreamsConfig.EXACTLY_ONCE_V2}) + public void shouldHandleCleanRestartWithNoDataProcessed(final String processingGuarantee) throws Exception { + streamsConfig.put(StreamsConfig.PROCESSING_GUARANTEE_CONFIG, processingGuarantee); + + // Start and immediately stop — no records produced + startStreams(true); + closeStreams(streams); + streams = null; + + // After clean shutdown with no data, status should still be closed + assertStoreStatus(0L); + + // Restart — should not treat empty CF as corruption + startStreams(false); + + final List> records = Arrays.asList( + new KeyValue<>("A", "v1"), + new KeyValue<>("B", "v1") + ); + produceRecords(records); + + final List> output = waitForOutput(records.size()); + final Map counts = latestCountsFromOutput(output); + + assertEquals(1L, counts.get("A")); + assertEquals(1L, counts.get("B")); + } + + /** + * Validates that the in-store state (via interactive queries) is preserved across + * a clean restart, not just the output topic. + */ + @ParameterizedTest + @ValueSource(strings = {StreamsConfig.AT_LEAST_ONCE, StreamsConfig.EXACTLY_ONCE_V2}) + public void shouldVerifyStoreStateViaInteractiveQueriesAcrossRestart(final String processingGuarantee) throws Exception { + streamsConfig.put(StreamsConfig.PROCESSING_GUARANTEE_CONFIG, processingGuarantee); + + final List> records = Arrays.asList( + new KeyValue<>("A", "v1"), + new KeyValue<>("B", "v1"), + new KeyValue<>("A", "v2") + ); + + startStreams(true); + produceRecords(records); + waitForOutput(records.size()); + + // Query store before restart + final Map countsBefore = queryStore(streams); + assertEquals(2L, countsBefore.get("A")); + assertEquals(1L, countsBefore.get("B")); + + closeStreams(streams); + streams = null; + + // Verify CF persisted correctly + assertStoreStatus(0L); + assertOffsetsPopulated(); + + // Restart and query again — counts should match + startStreams(false); + final Map countsAfter = queryStore(streams); + + assertEquals(countsBefore, countsAfter, + "Store state via IQ should be identical after clean restart"); + } + + /** + * A StateRestoreListener that tracks the total number of records restored. + */ + static class TrackingRestoreListener implements StateRestoreListener { + private final AtomicLong totalRestored = new AtomicLong(0); + + @Override + public void onRestoreStart(final TopicPartition topicPartition, + final String storeName, + final long startingOffset, + final long endingOffset) { + // no-op + } + + @Override + public void onBatchRestored(final TopicPartition topicPartition, + final String storeName, + final long batchEndOffset, + final long numRestored) { + totalRestored.addAndGet(numRestored); + } + + @Override + public void onRestoreEnd(final TopicPartition topicPartition, + final String storeName, + final long totalRestored) { + // no-op + } + } +} diff --git a/streams/integration-tests/src/test/java/org/apache/kafka/streams/integration/SelfManagedOffsetRecoveryIntegrationTest.java b/streams/integration-tests/src/test/java/org/apache/kafka/streams/integration/SelfManagedOffsetRecoveryIntegrationTest.java new file mode 100644 index 0000000000000..d537c35e3ae3d --- /dev/null +++ b/streams/integration-tests/src/test/java/org/apache/kafka/streams/integration/SelfManagedOffsetRecoveryIntegrationTest.java @@ -0,0 +1,696 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.kafka.streams.integration; + +import org.apache.kafka.clients.consumer.ConsumerConfig; +import org.apache.kafka.clients.producer.ProducerConfig; +import org.apache.kafka.common.serialization.LongDeserializer; +import org.apache.kafka.common.serialization.Serdes; +import org.apache.kafka.common.serialization.StringDeserializer; +import org.apache.kafka.common.serialization.StringSerializer; +import org.apache.kafka.streams.KafkaStreams; +import org.apache.kafka.streams.KeyValue; +import org.apache.kafka.streams.StreamsBuilder; +import org.apache.kafka.streams.StreamsConfig; +import org.apache.kafka.streams.integration.utils.EmbeddedKafkaCluster; +import org.apache.kafka.streams.integration.utils.IntegrationTestUtils; +import org.apache.kafka.streams.kstream.KStream; +import org.apache.kafka.streams.kstream.Materialized; +import org.apache.kafka.streams.kstream.Produced; +import org.apache.kafka.streams.state.internals.RocksDBStoreTestingUtils; +import org.apache.kafka.test.TestUtils; + +import org.junit.jupiter.api.AfterAll; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Tag; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.TestInfo; +import org.junit.jupiter.api.Timeout; + +import java.io.File; +import java.io.IOException; +import java.time.Duration; +import java.util.Arrays; +import java.util.List; +import java.util.Properties; + +import static org.apache.kafka.streams.utils.TestUtils.safeUniqueTestName; + +/** + * Integration tests for KIP-1035 column family offset recovery. + * + * KIP-1035 moved offset storage from external .checkpoint files into RocksDB column families. + * These tests verify that Kafka Streams can recover from unclean shutdowns and corrupted + * column family state, which is critical for exactly-once semantics (EOS) correctness. + */ +@Tag("integration") +@Timeout(600) +public class SelfManagedOffsetRecoveryIntegrationTest { + + private static final int NUM_BROKERS = 1; + private static final int NUM_PARTITIONS = 3; + private static final String INPUT_TOPIC = "input-topic"; + private static final String OUTPUT_TOPIC = "output-topic"; + private static final String OUTPUT_TOPIC_2 = "output-topic-2"; + private static final String STORE_NAME = "counts-store"; + private static final String STORE_NAME_2 = "counts-store-2"; + private static final long COMMIT_INTERVAL_MS = 100L; + private static final Duration STREAMS_CLOSE_TIMEOUT = Duration.ofSeconds(5); + + private static final EmbeddedKafkaCluster CLUSTER = new EmbeddedKafkaCluster(NUM_BROKERS); + + private int consumerGroupCounter = 0; + + private Properties streamsConfig; + private KafkaStreams streams; + private File stateDir; + + @BeforeAll + public static void startCluster() throws IOException, InterruptedException { + CLUSTER.start(); + } + + @AfterAll + public static void stopCluster() { + CLUSTER.stop(); + } + + @BeforeEach + public void setUp(final TestInfo testInfo) throws InterruptedException { + CLUSTER.deleteAllTopics(); + CLUSTER.createTopic(INPUT_TOPIC, NUM_PARTITIONS, 1); + CLUSTER.createTopic(OUTPUT_TOPIC, NUM_PARTITIONS, 1); + CLUSTER.createTopic(OUTPUT_TOPIC_2, NUM_PARTITIONS, 1); + + stateDir = TestUtils.tempDirectory(); + final String safeTestName = safeUniqueTestName(testInfo); + + streamsConfig = new Properties(); + streamsConfig.put(StreamsConfig.APPLICATION_ID_CONFIG, "app-" + safeTestName); + streamsConfig.put(StreamsConfig.BOOTSTRAP_SERVERS_CONFIG, CLUSTER.bootstrapServers()); + streamsConfig.put(StreamsConfig.DEFAULT_KEY_SERDE_CLASS_CONFIG, Serdes.StringSerde.class); + streamsConfig.put(StreamsConfig.DEFAULT_VALUE_SERDE_CLASS_CONFIG, Serdes.StringSerde.class); + streamsConfig.put(StreamsConfig.STATE_DIR_CONFIG, stateDir.getPath()); + streamsConfig.put(StreamsConfig.STATESTORE_CACHE_MAX_BYTES_CONFIG, 0); + streamsConfig.put(StreamsConfig.COMMIT_INTERVAL_MS_CONFIG, COMMIT_INTERVAL_MS); + streamsConfig.put(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG, "earliest"); + streamsConfig.put(ConsumerConfig.SESSION_TIMEOUT_MS_CONFIG, 5000); + streamsConfig.put(ConsumerConfig.HEARTBEAT_INTERVAL_MS_CONFIG, 1000); + } + + @AfterEach + public void tearDown() { + if (streams != null) { + closeStreams(streams); + streams.cleanUp(); + } + } + + private StreamsBuilder buildCountTopology() { + final StreamsBuilder builder = new StreamsBuilder(); + final KStream stream = builder.stream(INPUT_TOPIC); + stream + .groupByKey() + .count(Materialized.as(STORE_NAME)) + .toStream() + .to(OUTPUT_TOPIC); + return builder; + } + + /** + * Builds a topology with two separate state stores: + * store 1: groupByKey -> count (counts per key) + * store 2: groupBy(value) -> count (counts per value) + */ + private StreamsBuilder buildDualStoreTopology() { + final StreamsBuilder builder = new StreamsBuilder(); + final KStream stream = builder.stream(INPUT_TOPIC); + + // Store 1: count by key + stream + .groupByKey() + .count(Materialized.as(STORE_NAME)) + .toStream() + .to(OUTPUT_TOPIC, Produced.with(Serdes.String(), Serdes.Long())); + + // Store 2: count by value + stream + .groupBy((key, value) -> value) + .count(Materialized.as(STORE_NAME_2)) + .toStream() + .to(OUTPUT_TOPIC_2, Produced.with(Serdes.String(), Serdes.Long())); + + return builder; + } + + /** + * Corrupts store status to open for ALL task directories that contain the given store. + */ + private void setAllStoreStatusesToOpen(final String storeName) throws Exception { + final String appId = streamsConfig.getProperty(StreamsConfig.APPLICATION_ID_CONFIG); + for (final File storeDir : RocksDBStoreTestingUtils.findAllStoreDirs(stateDir, appId, storeName)) { + RocksDBStoreTestingUtils.setStoreStatusToOpen(storeDir); + } + } + + /** + * Deletes offset entries from the offsets column family for ALL task directories. + */ + private void deleteAllOffsets(final String storeName) throws Exception { + final String appId = streamsConfig.getProperty(StreamsConfig.APPLICATION_ID_CONFIG); + for (final File storeDir : RocksDBStoreTestingUtils.findAllStoreDirs(stateDir, appId, storeName)) { + RocksDBStoreTestingUtils.deleteOffsets(storeDir); + } + } + + + private void closeStreams(final KafkaStreams kafkaStreams) { + kafkaStreams.close(STREAMS_CLOSE_TIMEOUT); + } + + private KafkaStreams startStreams() throws Exception { + final StreamsBuilder builder = buildCountTopology(); + streams = new KafkaStreams(builder.build(), streamsConfig); + streams.cleanUp(); + streams.start(); + waitForRunning(streams); + return streams; + } + + private void waitForRunning(final KafkaStreams kafkaStreams) throws Exception { + TestUtils.waitForCondition( + () -> kafkaStreams.state().equals(KafkaStreams.State.RUNNING), + Duration.ofSeconds(60).toMillis(), + () -> "Expected RUNNING state but was " + kafkaStreams.state() + ); + } + + private Properties producerConfig() { + final Properties props = new Properties(); + props.put(ProducerConfig.BOOTSTRAP_SERVERS_CONFIG, CLUSTER.bootstrapServers()); + props.put(ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG, StringSerializer.class.getName()); + props.put(ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG, StringSerializer.class.getName()); + return props; + } + + private Properties readCommittedConsumerConfig() { + final Properties props = new Properties(); + props.put(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG, CLUSTER.bootstrapServers()); + props.put(ConsumerConfig.GROUP_ID_CONFIG, "verify-consumer-" + consumerGroupCounter++); + props.put(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG, "earliest"); + props.put(ConsumerConfig.ISOLATION_LEVEL_CONFIG, "read_committed"); + props.put(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, StringDeserializer.class.getName()); + props.put(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, LongDeserializer.class.getName()); + return props; + } + + private void produceRecords(final List> records) { + IntegrationTestUtils.produceKeyValuesSynchronously( + INPUT_TOPIC, + records, + producerConfig(), + CLUSTER.time + ); + } + + private List> waitForOutput(final int expectedCount) throws Exception { + return IntegrationTestUtils.waitUntilMinKeyValueRecordsReceived( + readCommittedConsumerConfig(), + OUTPUT_TOPIC, + expectedCount + ); + } + + /** + * ALOS baseline: after an unclean shutdown (status=open), the store should recover + * because ALOS opens with ignoreInvalidState=true. + */ + @Test + public void shouldRecoverFromUncleanShutdownWithAlos() throws Exception { + // No EOS — default is at-least-once + + // Phase 1: start, produce, verify output + final List> initialRecords = Arrays.asList( + new KeyValue<>("A", "v1"), + new KeyValue<>("B", "v1"), + new KeyValue<>("A", "v2") + ); + + startStreams(); + produceRecords(initialRecords); + waitForOutput(initialRecords.size()); + + // Phase 2: clean shutdown, then corrupt store status + closeStreams(streams); + streams = null; + + setAllStoreStatusesToOpen(STORE_NAME); + + // Phase 3: restart — should recover despite status=open + final StreamsBuilder builder = buildCountTopology(); + streams = new KafkaStreams(builder.build(), streamsConfig); + streams.start(); + waitForRunning(streams); + + // Phase 4: produce more records, verify processing continues + final List> additionalRecords = Arrays.asList( + new KeyValue<>("A", "v3"), + new KeyValue<>("C", "v1") + ); + produceRecords(additionalRecords); + + // We expect output from both initial and additional records. + // After recovery, state may be rebuilt from changelog, so we just verify + // that processing continues and we get at least the additional records' output. + waitForOutput(initialRecords.size() + additionalRecords.size()); + } + + /** + * Primary regression test for KIP-1035: after an unclean shutdown with EOS enabled, + * the store status key is left as 1L (open). AbstractColumnFamilyAccessor.open() throws + * ProcessorStateException("Invalid state during store open") which should be caught and + * trigger task corruption recovery (wipe + restore from changelog). + * + * Without the fix, the ProcessorStateException propagates fatally and the application + * fails to start. + */ + @Test + public void shouldRecoverFromUncleanShutdownWithEos() throws Exception { + streamsConfig.put(StreamsConfig.PROCESSING_GUARANTEE_CONFIG, StreamsConfig.EXACTLY_ONCE_V2); + + // Phase 1: start with EOS, produce records, verify committed output + final List> initialRecords = Arrays.asList( + new KeyValue<>("A", "v1"), + new KeyValue<>("B", "v1"), + new KeyValue<>("A", "v2") + ); + + startStreams(); + produceRecords(initialRecords); + waitForOutput(initialRecords.size()); + + // Phase 2: clean shutdown, then corrupt store status to simulate unclean shutdown + closeStreams(streams); + streams = null; + + setAllStoreStatusesToOpen(STORE_NAME); + + // Phase 3: restart with EOS — should detect corruption, wipe, and restore from changelog + final StreamsBuilder builder = buildCountTopology(); + streams = new KafkaStreams(builder.build(), streamsConfig); + streams.start(); + waitForRunning(streams); + + // Phase 4: produce more records and verify processing continues correctly + final List> additionalRecords = Arrays.asList( + new KeyValue<>("A", "v3"), + new KeyValue<>("C", "v1") + ); + produceRecords(additionalRecords); + + // After recovery from corruption, state is rebuilt from changelog. + // New consumer group reads all committed output from the beginning. + waitForOutput(initialRecords.size() + additionalRecords.size()); + } + + /** + * Tests the TaskCorruptedException path: offsets are deleted from the column family + * but the store status is clean (closed). Under EOS, missing offsets should trigger + * task corruption detection, causing a wipe and restore from changelog. + */ + @Test + public void shouldRecoverFromMissingOffsetsInColumnFamilyWithEos() throws Exception { + streamsConfig.put(StreamsConfig.PROCESSING_GUARANTEE_CONFIG, StreamsConfig.EXACTLY_ONCE_V2); + + // Phase 1: start with EOS, produce records, verify committed output + final List> initialRecords = Arrays.asList( + new KeyValue<>("A", "v1"), + new KeyValue<>("B", "v1"), + new KeyValue<>("A", "v2") + ); + + startStreams(); + produceRecords(initialRecords); + waitForOutput(initialRecords.size()); + + // Phase 2: clean shutdown, then delete offset entries (keep status=closed) + closeStreams(streams); + streams = null; + + deleteAllOffsets(STORE_NAME); + + // Phase 3: restart — should detect missing offsets, mark task corrupted, wipe and restore + final StreamsBuilder builder = buildCountTopology(); + streams = new KafkaStreams(builder.build(), streamsConfig); + streams.start(); + waitForRunning(streams); + + // Phase 4: produce more records, verify data is re-bootstrapped from changelog + final List> additionalRecords = Arrays.asList( + new KeyValue<>("A", "v3"), + new KeyValue<>("C", "v1") + ); + produceRecords(additionalRecords); + + waitForOutput(initialRecords.size() + additionalRecords.size()); + } + + /** + * Combined worst case: status=open (unclean shutdown) AND no committed offsets. + * Under EOS, this should still trigger corruption recovery. + * + * Without the fix, the ProcessorStateException from status=open propagates fatally + * before the missing offsets are even checked. + */ + @Test + public void shouldRecoverFromUncleanShutdownAndMissingOffsetsWithEos() throws Exception { + streamsConfig.put(StreamsConfig.PROCESSING_GUARANTEE_CONFIG, StreamsConfig.EXACTLY_ONCE_V2); + + // Phase 1: start with EOS, produce records, verify committed output + final List> initialRecords = Arrays.asList( + new KeyValue<>("A", "v1"), + new KeyValue<>("B", "v1"), + new KeyValue<>("A", "v2") + ); + + startStreams(); + produceRecords(initialRecords); + waitForOutput(initialRecords.size()); + + // Phase 2: clean shutdown, then corrupt BOTH status and offsets + closeStreams(streams); + streams = null; + + setAllStoreStatusesToOpen(STORE_NAME); + deleteAllOffsets(STORE_NAME); + + // Phase 3: restart — should recover from both corruptions + final StreamsBuilder builder = buildCountTopology(); + streams = new KafkaStreams(builder.build(), streamsConfig); + streams.start(); + waitForRunning(streams); + + // Phase 4: produce more records, verify data is re-bootstrapped correctly + final List> additionalRecords = Arrays.asList( + new KeyValue<>("A", "v3"), + new KeyValue<>("C", "v1") + ); + produceRecords(additionalRecords); + + waitForOutput(initialRecords.size() + additionalRecords.size()); + } + + /** + * End-to-end EOS correctness after recovery: verifies that no duplicate output records + * are visible via READ_COMMITTED and that final aggregation values are correct. + * + * Without the fix, the application crashes on restart due to ProcessorStateException. + */ + @Test + public void shouldMaintainEosGuaranteesAcrossUncleanShutdownAndRecovery() throws Exception { + streamsConfig.put(StreamsConfig.PROCESSING_GUARANTEE_CONFIG, StreamsConfig.EXACTLY_ONCE_V2); + + // Phase 1: produce records with known keys, wait for committed output + final List> batch1 = Arrays.asList( + new KeyValue<>("X", "a"), + new KeyValue<>("Y", "b"), + new KeyValue<>("X", "c") + ); + + startStreams(); + produceRecords(batch1); + waitForOutput(batch1.size()); + + // Phase 2: clean shutdown, corrupt store status + closeStreams(streams); + streams = null; + + setAllStoreStatusesToOpen(STORE_NAME); + + // Phase 3: restart, produce more records with same keys + final StreamsBuilder builder = buildCountTopology(); + streams = new KafkaStreams(builder.build(), streamsConfig); + streams.start(); + waitForRunning(streams); + + final List> batch2 = Arrays.asList( + new KeyValue<>("X", "d"), + new KeyValue<>("Y", "e") + ); + produceRecords(batch2); + + // Phase 4: collect all committed output and verify correctness + final List> allOutput = waitForOutput(batch1.size() + batch2.size()); + + // Find the latest count for each key — these should reflect correct aggregation + // without double-counting. X had 3 records total (a, c, d) -> count=3, Y had 2 (b, e) -> count=2 + long latestX = 0; + long latestY = 0; + for (final KeyValue record : allOutput) { + if ("X".equals(record.key)) { + latestX = Math.max(latestX, record.value); + } else if ("Y".equals(record.key)) { + latestY = Math.max(latestY, record.value); + } + } + + // X: 3 records total -> count should be exactly 3 + // Y: 2 records total -> count should be exactly 2 + // If there were duplicates from recovery, counts would be higher + org.junit.jupiter.api.Assertions.assertEquals(3L, latestX, "X count should be 3 (no double-counting after recovery)"); + org.junit.jupiter.api.Assertions.assertEquals(2L, latestY, "Y count should be 2 (no double-counting after recovery)"); + } + + /** + * Tests that partial store corruption is handled correctly: only one of two stores + * is corrupted, and the application should still recover. + * + * Without the fix, corrupting even one store causes the application to crash. + */ + @Test + public void shouldRecoverMultipleStoresFromUncleanShutdown() throws Exception { + streamsConfig.put(StreamsConfig.PROCESSING_GUARANTEE_CONFIG, StreamsConfig.EXACTLY_ONCE_V2); + + // Phase 1: start with dual-store topology, produce records + final List> initialRecords = Arrays.asList( + new KeyValue<>("A", "v1"), + new KeyValue<>("B", "v2"), + new KeyValue<>("A", "v1") + ); + + final StreamsBuilder builder1 = buildDualStoreTopology(); + streams = new KafkaStreams(builder1.build(), streamsConfig); + streams.cleanUp(); + streams.start(); + waitForRunning(streams); + + produceRecords(initialRecords); + // Wait for output from the first store + waitForOutput(initialRecords.size()); + + // Phase 2: clean shutdown, corrupt ONLY store 1 (leave store 2 clean) + closeStreams(streams); + streams = null; + + setAllStoreStatusesToOpen(STORE_NAME); + // STORE_NAME_2 is left with clean status + + // Phase 3: restart — should recover the corrupted store, keep the clean one + final StreamsBuilder builder2 = buildDualStoreTopology(); + streams = new KafkaStreams(builder2.build(), streamsConfig); + streams.start(); + waitForRunning(streams); + + // Phase 4: produce more records, verify both stores produce correct output + final List> additionalRecords = Arrays.asList( + new KeyValue<>("C", "v3"), + new KeyValue<>("A", "v1") + ); + produceRecords(additionalRecords); + + waitForOutput(initialRecords.size() + additionalRecords.size()); + } + + /** + * Tests standby task recovery with corrupted column family state. + * After corrupting instance 1's store, it should recover from the standby/changelog + * and eventually take over as active when instance 2 is shut down. + * + * Without the fix, instance 1 fails to restart due to ProcessorStateException. + */ + @Test + public void shouldRecoverStandbyTaskFromUncleanShutdownWithEos() throws Exception { + streamsConfig.put(StreamsConfig.PROCESSING_GUARANTEE_CONFIG, StreamsConfig.EXACTLY_ONCE_V2); + streamsConfig.put(StreamsConfig.NUM_STANDBY_REPLICAS_CONFIG, 1); + streamsConfig.put(StreamsConfig.NUM_STREAM_THREADS_CONFIG, 1); + + // Use separate state dirs for each instance + final File stateDir1 = TestUtils.tempDirectory(); + final File stateDir2 = TestUtils.tempDirectory(); + + // Phase 1: start two instances + final Properties config1 = new Properties(); + config1.putAll(streamsConfig); + config1.put(StreamsConfig.STATE_DIR_CONFIG, stateDir1.getPath()); + + final Properties config2 = new Properties(); + config2.putAll(streamsConfig); + config2.put(StreamsConfig.STATE_DIR_CONFIG, stateDir2.getPath()); + + final StreamsBuilder builder1 = buildCountTopology(); + final StreamsBuilder builder2 = buildCountTopology(); + + final KafkaStreams streams1 = new KafkaStreams(builder1.build(), config1); + final KafkaStreams streams2 = new KafkaStreams(builder2.build(), config2); + streams1.cleanUp(); + streams2.cleanUp(); + streams1.start(); + streams2.start(); + + waitForRunning(streams1); + waitForRunning(streams2); + + // Phase 2: produce data, wait for processing + final List> initialRecords = Arrays.asList( + new KeyValue<>("A", "v1"), + new KeyValue<>("B", "v1"), + new KeyValue<>("A", "v2") + ); + produceRecords(initialRecords); + waitForOutput(initialRecords.size()); + + // Phase 3: shut down instance 1, corrupt its store status + closeStreams(streams1); + + // Corrupt all store dirs under instance 1's state directory + final String appId = streamsConfig.getProperty(StreamsConfig.APPLICATION_ID_CONFIG); + for (final File storeDir : RocksDBStoreTestingUtils.findAllStoreDirs(stateDir1, appId, STORE_NAME)) { + RocksDBStoreTestingUtils.setStoreStatusToOpen(storeDir); + } + + // Phase 4: restart instance 1 — should recover from standby or changelog + final StreamsBuilder builder1Restart = buildCountTopology(); + final KafkaStreams streams1Restart = new KafkaStreams(builder1Restart.build(), config1); + streams1Restart.start(); + waitForRunning(streams1Restart); + + // Phase 5: shut down instance 2, verify instance 1 takes over + closeStreams(streams2); + + // Produce more records and verify instance 1 processes them as active + final List> additionalRecords = Arrays.asList( + new KeyValue<>("A", "v3"), + new KeyValue<>("C", "v1") + ); + produceRecords(additionalRecords); + + waitForOutput(initialRecords.size() + additionalRecords.size()); + + // Clean up — set streams to instance 1 so tearDown handles it + streams = streams1Restart; + } + + /** + * Regression test for KAFKA-19712 (PR #21884): after completely deleting local state + * and restarting, standby tasks should not get TaskCorruptedException during rebalance. + * + * The bug: KIP-1035 removed the OFFSET_UNKNOWN sentinel, so stores closed with null + * offsets when offsets were never initialized. On the next rebalance, initializeStoreOffsets() + * found null committed offset + non-empty state dir under EOS, and threw TaskCorruptedException. + * + * The fix: re-introduced OFFSET_UNKNOWN (-4L) as a sentinel in commit(), and translates + * it back to null in initializeStoreOffsets(). + */ + @Test + public void shouldNotThrowTaskCorruptedOnStandbyAfterStateWipe() throws Exception { + streamsConfig.put(StreamsConfig.PROCESSING_GUARANTEE_CONFIG, StreamsConfig.EXACTLY_ONCE_V2); + streamsConfig.put(StreamsConfig.NUM_STANDBY_REPLICAS_CONFIG, 1); + streamsConfig.put(StreamsConfig.NUM_STREAM_THREADS_CONFIG, 1); + + final File stateDir1 = TestUtils.tempDirectory(); + final File stateDir2 = TestUtils.tempDirectory(); + + final Properties config1 = new Properties(); + config1.putAll(streamsConfig); + config1.put(StreamsConfig.STATE_DIR_CONFIG, stateDir1.getPath()); + + final Properties config2 = new Properties(); + config2.putAll(streamsConfig); + config2.put(StreamsConfig.STATE_DIR_CONFIG, stateDir2.getPath()); + + // Phase 1: start two instances, produce data, let both process and replicate + final StreamsBuilder builder1 = buildCountTopology(); + final StreamsBuilder builder2 = buildCountTopology(); + + final KafkaStreams streams1 = new KafkaStreams(builder1.build(), config1); + final KafkaStreams streams2 = new KafkaStreams(builder2.build(), config2); + streams1.cleanUp(); + streams2.cleanUp(); + streams1.start(); + streams2.start(); + + waitForRunning(streams1); + waitForRunning(streams2); + + final List> initialRecords = Arrays.asList( + new KeyValue<>("A", "v1"), + new KeyValue<>("B", "v1"), + new KeyValue<>("A", "v2") + ); + produceRecords(initialRecords); + waitForOutput(initialRecords.size()); + + // Phase 2: shut down instance 1, wipe its entire state, then restart. + // This simulates the LittleHorse scenario: complete state deletion followed by + // changelog restoration. The standby tasks on this instance will have stores + // that were never initialized with offsets. + closeStreams(streams1); + streams1.cleanUp(); + + final StreamsBuilder builder1Restart = buildCountTopology(); + final KafkaStreams streams1Restart = new KafkaStreams(builder1Restart.build(), config1); + streams1Restart.start(); + waitForRunning(streams1Restart); + + // Phase 3: trigger a rebalance by shutting down instance 2 and restarting it. + // Before the fix, the standby tasks on instance 1 would throw + // TaskCorruptedException during the rebalance when re-initializing store offsets. + closeStreams(streams2); + + final StreamsBuilder builder2Restart = buildCountTopology(); + final KafkaStreams streams2Restart = new KafkaStreams(builder2Restart.build(), config2); + streams2Restart.start(); + + // Both instances should reach RUNNING without TaskCorruptedException + waitForRunning(streams1Restart); + waitForRunning(streams2Restart); + + // Phase 4: verify processing still works after rebalance + final List> additionalRecords = Arrays.asList( + new KeyValue<>("A", "v3"), + new KeyValue<>("C", "v1") + ); + produceRecords(additionalRecords); + waitForOutput(initialRecords.size() + additionalRecords.size()); + + // Clean up + closeStreams(streams2Restart); + streams = streams1Restart; + } +} diff --git a/streams/src/test/java/org/apache/kafka/streams/state/internals/RocksDBStoreTestingUtils.java b/streams/src/test/java/org/apache/kafka/streams/state/internals/RocksDBStoreTestingUtils.java new file mode 100644 index 0000000000000..3bcdc99e3402c --- /dev/null +++ b/streams/src/test/java/org/apache/kafka/streams/state/internals/RocksDBStoreTestingUtils.java @@ -0,0 +1,214 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.kafka.streams.state.internals; + +import org.apache.kafka.common.serialization.Serdes; +import org.apache.kafka.common.serialization.StringSerializer; + +import org.rocksdb.ColumnFamilyDescriptor; +import org.rocksdb.ColumnFamilyHandle; +import org.rocksdb.ColumnFamilyOptions; +import org.rocksdb.DBOptions; +import org.rocksdb.Options; +import org.rocksdb.RocksDB; +import org.rocksdb.RocksDBException; + +import java.io.File; +import java.nio.charset.StandardCharsets; +import java.nio.file.Paths; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.stream.Collectors; + +/** + * Test utility for working with RocksDB including changing column family state to simulate + * store corruption scenarios (e.g., unclean shutdown). + */ +public final class RocksDBStoreTestingUtils { + + private static final StringSerializer STRING_SERIALIZER = new StringSerializer(); + private static final byte[] OFFSETS_COLUMN_FAMILY_NAME = "offsets".getBytes(StandardCharsets.UTF_8); + private static final byte[] STATUS_KEY = STRING_SERIALIZER.serialize(null, "status"); + private static final byte[] OPEN_STATE = Serdes.Long().serializer().serialize(null, 1L); + private static final byte[] POSITION_KEY = STRING_SERIALIZER.serialize(null, "position"); + + private RocksDBStoreTestingUtils() { + } + + /** + * Overwrites the store status key to 1L (open), simulating an unclean shutdown. + * + * @param dbDir the RocksDB store directory + */ + public static void setStoreStatusToOpen(final File dbDir) throws RocksDBException { + try (final DBOptions dbOptions = new DBOptions(); + final ColumnFamilyOptions cfOptions = new ColumnFamilyOptions()) { + + final List cfDescriptors = listCfDescriptors(dbDir, cfOptions); + final List cfHandles = new ArrayList<>(cfDescriptors.size()); + try (final RocksDB db = RocksDB.open(dbOptions, dbDir.getAbsolutePath(), cfDescriptors, cfHandles)) { + final ColumnFamilyHandle offsetsCf = findOffsetsCf(cfHandles, cfDescriptors); + db.put(offsetsCf, STATUS_KEY, OPEN_STATE); + } finally { + cfHandles.forEach(ColumnFamilyHandle::close); + } + } + } + + /** + * Deletes all offset entries from the offsets column family, keeping only the status key. + * + * @param dbDir the RocksDB store directory + */ + public static void deleteOffsets(final File dbDir) throws RocksDBException { + try (final DBOptions dbOptions = new DBOptions(); + final ColumnFamilyOptions cfOptions = new ColumnFamilyOptions()) { + + final List cfDescriptors = listCfDescriptors(dbDir, cfOptions); + final List cfHandles = new ArrayList<>(cfDescriptors.size()); + try (final RocksDB db = RocksDB.open(dbOptions, dbDir.getAbsolutePath(), cfDescriptors, cfHandles)) { + final ColumnFamilyHandle offsetsCf = findOffsetsCf(cfHandles, cfDescriptors); + + try (final org.rocksdb.RocksIterator iter = db.newIterator(offsetsCf)) { + iter.seekToFirst(); + while (iter.isValid()) { + final byte[] key = iter.key(); + if (!Arrays.equals(key, STATUS_KEY)) { + db.delete(offsetsCf, key); + } + iter.next(); + } + } + } finally { + cfHandles.forEach(ColumnFamilyHandle::close); + } + } + } + + /** + * Reads the store status from the offsets column family. + * + * @param dbDir the RocksDB store directory + * @return the status value (0L = closed, 1L = open), or null if no status key exists + */ + public static Long readStoreStatus(final File dbDir) throws RocksDBException { + try (final DBOptions dbOptions = new DBOptions(); + final ColumnFamilyOptions cfOptions = new ColumnFamilyOptions()) { + + final List cfDescriptors = listCfDescriptors(dbDir, cfOptions); + final List cfHandles = new ArrayList<>(cfDescriptors.size()); + try (final RocksDB db = RocksDB.open(dbOptions, dbDir.getAbsolutePath(), cfDescriptors, cfHandles)) { + final ColumnFamilyHandle offsetsCf = findOffsetsCf(cfHandles, cfDescriptors); + final byte[] valueBytes = db.get(offsetsCf, STATUS_KEY); + if (valueBytes != null) { + return Serdes.Long().deserializer().deserialize(null, valueBytes); + } + return null; + } finally { + cfHandles.forEach(ColumnFamilyHandle::close); + } + } + } + + /** + * Reads all offset entries from the offsets column family, excluding the status and position keys. + * Keys are TopicPartition.toString() values, values are committed offsets. + * + * @param dbDir the RocksDB store directory + * @return a map of partition string to committed offset + */ + public static Map readOffsets(final File dbDir) throws RocksDBException { + try (final DBOptions dbOptions = new DBOptions(); + final ColumnFamilyOptions cfOptions = new ColumnFamilyOptions()) { + + final List cfDescriptors = listCfDescriptors(dbDir, cfOptions); + final List cfHandles = new ArrayList<>(cfDescriptors.size()); + try (final RocksDB db = RocksDB.open(dbOptions, dbDir.getAbsolutePath(), cfDescriptors, cfHandles)) { + final ColumnFamilyHandle offsetsCf = findOffsetsCf(cfHandles, cfDescriptors); + final Map offsets = new HashMap<>(); + + try (final org.rocksdb.RocksIterator iter = db.newIterator(offsetsCf)) { + iter.seekToFirst(); + while (iter.isValid()) { + final byte[] key = iter.key(); + if (!Arrays.equals(key, STATUS_KEY) && !Arrays.equals(key, POSITION_KEY)) { + final String partition = new String(key, StandardCharsets.UTF_8); + final Long offset = Serdes.Long().deserializer().deserialize(null, iter.value()); + offsets.put(partition, offset); + } + iter.next(); + } + } + return offsets; + } finally { + cfHandles.forEach(ColumnFamilyHandle::close); + } + } + } + + /** + * Finds all RocksDB store directories for the given store name across all task directories. + * + * @param stateDir the root state directory + * @param appId the application ID + * @param storeName the store name + * @return list of store directories + */ + public static List findAllStoreDirs(final File stateDir, final String appId, final String storeName) { + final File appDir = new File(stateDir, appId); + final File[] taskDirs = appDir.listFiles(file -> + file.isDirectory() && !file.getName().startsWith(".")); + + if (taskDirs == null || taskDirs.length == 0) { + throw new IllegalStateException("No task directories found under " + appDir); + } + + final List storeDirs = new ArrayList<>(); + for (final File taskDir : taskDirs) { + final File storeDir2 = Paths.get(taskDir.getAbsolutePath(), "rocksdb", storeName).toFile(); + if (storeDir2.exists()) { + storeDirs.add(storeDir2); + } + } + + if (storeDirs.isEmpty()) { + throw new IllegalStateException("No store directories for '" + storeName + "' found under " + appDir); + } + return storeDirs; + } + + private static List listCfDescriptors(final File dbDir, + final ColumnFamilyOptions cfOptions) throws RocksDBException { + return RocksDB.listColumnFamilies(new Options(), dbDir.getAbsolutePath()) + .stream() + .map(name -> new ColumnFamilyDescriptor(name, cfOptions)) + .collect(Collectors.toList()); + } + + private static ColumnFamilyHandle findOffsetsCf(final List handles, + final List descriptors) { + for (int i = 0; i < descriptors.size(); i++) { + if (Arrays.equals(descriptors.get(i).getName(), OFFSETS_COLUMN_FAMILY_NAME)) { + return handles.get(i); + } + } + throw new IllegalStateException("Offsets column family not found in RocksDB store"); + } +}