diff --git a/example/session/pom.xml b/example/session/pom.xml index e707c5b25d1ce..331fbf0c46df8 100644 --- a/example/session/pom.xml +++ b/example/session/pom.xml @@ -40,4 +40,17 @@ ${project.version} + + + + + org.apache.maven.plugins + maven-compiler-plugin + + 11 + 11 + + + + diff --git a/example/session/src/main/java/org/apache/iotdb/ConsensusSubscriptionWalFileAnalyzer.java b/example/session/src/main/java/org/apache/iotdb/ConsensusSubscriptionWalFileAnalyzer.java new file mode 100644 index 0000000000000..420c1672cca46 --- /dev/null +++ b/example/session/src/main/java/org/apache/iotdb/ConsensusSubscriptionWalFileAnalyzer.java @@ -0,0 +1,530 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.iotdb; + +import org.apache.tsfile.file.metadata.enums.CompressionType; + +import java.io.File; +import java.io.IOException; +import java.io.RandomAccessFile; +import java.nio.ByteBuffer; +import java.nio.channels.FileChannel; +import java.nio.charset.StandardCharsets; +import java.util.Locale; + +/** + * Inspect a single WAL file and print size breakdowns for its major sections. + * + *

<p>Example: + * + * <pre>
+ *   java ... org.apache.iotdb.ConsensusSubscriptionWalFileAnalyzer D:\path\to\_12-25000-1.wal
+ * </pre>
+ */ +public class ConsensusSubscriptionWalFileAnalyzer { + + private static final String V1_MAGIC = "WAL"; + private static final String V2_MAGIC = "V2-WAL"; + private static final String V3_MAGIC = "V3-WAL"; + + private static final int SEGMENT_HEADER_BASE_BYTES = Byte.BYTES + Integer.BYTES; + private static final int COMPRESSED_SEGMENT_EXTRA_HEADER_BYTES = Integer.BYTES; + private static final int WAL_FILE_INFO_END_MARKER_BYTES = Byte.BYTES; + private static final int METADATA_SIZE_FIELD_BYTES = Integer.BYTES; + private static final int V3_EMPTY_METADATA_REMAINING_WITHOUT_MEMTABLE_COUNT = + Long.BYTES * 2 + Short.BYTES * 2 + Integer.BYTES; + + public static void main(final String[] args) throws Exception { + if (args.length == 0 || "--help".equals(args[0]) || "-h".equals(args[0])) { + printUsage(); + return; + } + + final File walFile = new File(args[0]); + if (!walFile.isFile()) { + throw new IllegalArgumentException("WAL file does not exist: " + walFile.getAbsolutePath()); + } + + final WalFileAnalysis analysis = analyze(walFile); + printAnalysis(analysis); + } + + private static void printUsage() { + System.out.println("Usage:"); + System.out.println( + " java ... 
org.apache.iotdb.ConsensusSubscriptionWalFileAnalyzer "); + } + + private static WalFileAnalysis analyze(final File walFile) throws IOException { + try (RandomAccessFile raf = new RandomAccessFile(walFile, "r"); + FileChannel channel = raf.getChannel()) { + final long totalBytes = channel.size(); + final String version = detectVersion(channel, totalBytes); + final int headMagicBytes = getHeadMagicBytes(version); + final int tailMagicBytes = getTailMagicBytes(version); + + final WalFileAnalysis analysis = new WalFileAnalysis(walFile, version, totalBytes); + analysis.headMagicBytes = Math.min(totalBytes, headMagicBytes); + + if (totalBytes <= headMagicBytes) { + analysis.note = "header-only WAL file (magic only, no body/footer)"; + return analysis; + } + + if (!hasTrailingMagic(channel, totalBytes, version)) { + analysis.note = "missing trailing magic/footer, file may be open or broken"; + return analysis; + } + + analysis.tailMagicBytes = tailMagicBytes; + analysis.metadataSizeFieldBytes = METADATA_SIZE_FIELD_BYTES; + + final long metadataSizeFieldPos = totalBytes - tailMagicBytes - METADATA_SIZE_FIELD_BYTES; + if (metadataSizeFieldPos < headMagicBytes) { + analysis.note = "invalid metadata size position"; + return analysis; + } + + final int metadataBytes = readInt(channel, metadataSizeFieldPos); + analysis.metadataBytes = metadataBytes; + analysis.footerStartOffset = metadataSizeFieldPos - metadataBytes; + if (analysis.footerStartOffset < headMagicBytes) { + analysis.note = "invalid footer start offset"; + return analysis; + } + + final long markerOffset = analysis.footerStartOffset - WAL_FILE_INFO_END_MARKER_BYTES; + if (markerOffset < headMagicBytes) { + analysis.note = "invalid end-marker offset"; + return analysis; + } + + analysis.endMarkerBytes = WAL_FILE_INFO_END_MARKER_BYTES; + analysis.segmentStartOffset = headMagicBytes; + analysis.segmentEndOffsetExclusive = markerOffset; + analysis.segmentRegionBytes = Math.max(0L, markerOffset - headMagicBytes); + + 
scanSegments(channel, analysis); + parseFooter(channel, analysis); + return analysis; + } + } + + private static void scanSegments(final FileChannel channel, final WalFileAnalysis analysis) + throws IOException { + long offset = analysis.segmentStartOffset; + while (offset < analysis.segmentEndOffsetExclusive) { + if (analysis.segmentEndOffsetExclusive - offset < SEGMENT_HEADER_BASE_BYTES) { + analysis.segmentParseWarning = + "remaining bytes are smaller than a segment header at offset " + offset; + return; + } + + final ByteBuffer headerBuffer = ByteBuffer.allocate(SEGMENT_HEADER_BASE_BYTES); + readFully(channel, headerBuffer, offset); + headerBuffer.flip(); + + final CompressionType compressionType = CompressionType.deserialize(headerBuffer.get()); + final int dataInDiskBytes = headerBuffer.getInt(); + int headerBytes = SEGMENT_HEADER_BASE_BYTES; + if (compressionType != CompressionType.UNCOMPRESSED) { + headerBytes += COMPRESSED_SEGMENT_EXTRA_HEADER_BYTES; + } + + final long nextOffset = offset + headerBytes + dataInDiskBytes; + if (nextOffset > analysis.segmentEndOffsetExclusive) { + analysis.segmentParseWarning = + String.format( + Locale.ROOT, + "segment at offset %d exceeds body boundary (%d > %d)", + offset, + nextOffset, + analysis.segmentEndOffsetExclusive); + return; + } + + analysis.segmentCount++; + analysis.segmentHeaderBytes += headerBytes; + analysis.segmentPayloadBytes += dataInDiskBytes; + if (compressionType != CompressionType.UNCOMPRESSED) { + analysis.compressedSegmentCount++; + } + offset = nextOffset; + } + + if (offset != analysis.segmentEndOffsetExclusive) { + analysis.segmentParseWarning = + String.format( + Locale.ROOT, + "segment parser stopped at %d but expected %d", + offset, + analysis.segmentEndOffsetExclusive); + } + } + + private static void parseFooter(final FileChannel channel, final WalFileAnalysis analysis) + throws IOException { + if (analysis.metadataBytes <= 0) { + return; + } + + final ByteBuffer metadataBuffer = 
ByteBuffer.allocate(analysis.metadataBytes); + readFully(channel, metadataBuffer, analysis.footerStartOffset); + metadataBuffer.flip(); + + if (metadataBuffer.remaining() < Long.BYTES + Integer.BYTES) { + analysis.footerWarning = "metadata buffer is too small"; + return; + } + + metadataBuffer.getLong(); + analysis.firstSearchIndexBytes = Long.BYTES; + final int entryCount = metadataBuffer.getInt(); + analysis.entryCount = entryCount; + analysis.entryCountBytes = Integer.BYTES; + + analysis.bufferSizeArrayBytes = (long) entryCount * Integer.BYTES; + for (int i = 0; i < entryCount; i++) { + metadataBuffer.getInt(); + } + + final boolean serializedEmptyV3WithoutMemTableCount = + V3_MAGIC.equals(analysis.version) + && entryCount == 0 + && metadataBuffer.remaining() == V3_EMPTY_METADATA_REMAINING_WITHOUT_MEMTABLE_COUNT; + + if (metadataBuffer.hasRemaining() && !serializedEmptyV3WithoutMemTableCount) { + analysis.memTableCountFieldBytes = Integer.BYTES; + analysis.memTableCount = metadataBuffer.getInt(); + analysis.memTableIdsBytes = (long) analysis.memTableCount * Long.BYTES; + for (int i = 0; i < analysis.memTableCount; i++) { + metadataBuffer.getLong(); + } + } + + if (V3_MAGIC.equals(analysis.version) && metadataBuffer.hasRemaining()) { + if (metadataBuffer.remaining() < Long.BYTES * 2) { + analysis.footerWarning = "V3 metadata is truncated before min/max timestamp range"; + return; + } + + analysis.minMaxDataTsBytes = Long.BYTES * 2L; + metadataBuffer.getLong(); + metadataBuffer.getLong(); + + final long requiredWriterMetadataBytes = + (long) entryCount * Long.BYTES * 2 + Short.BYTES * 2 + Integer.BYTES; + if (metadataBuffer.remaining() < requiredWriterMetadataBytes) { + analysis.footerWarning = "V3 metadata is truncated before writer progress arrays"; + return; + } + + analysis.physicalTimesBytes = (long) entryCount * Long.BYTES; + analysis.localSeqsBytes = (long) entryCount * Long.BYTES; + for (int i = 0; i < entryCount; i++) { + metadataBuffer.getLong(); + } + 
for (int i = 0; i < entryCount; i++) { + metadataBuffer.getLong(); + } + + analysis.defaultWriterIdentityBytes = Short.BYTES * 2L; + metadataBuffer.getShort(); + metadataBuffer.getShort(); + + analysis.overrideCountFieldBytes = Integer.BYTES; + analysis.overrideCount = metadataBuffer.getInt(); + + analysis.overrideIndexesBytes = (long) analysis.overrideCount * Integer.BYTES; + analysis.overrideNodeIdsBytes = (long) analysis.overrideCount * Short.BYTES; + analysis.overrideWriterEpochsBytes = (long) analysis.overrideCount * Short.BYTES; + + for (int i = 0; i < analysis.overrideCount; i++) { + metadataBuffer.getInt(); + } + for (int i = 0; i < analysis.overrideCount; i++) { + metadataBuffer.getShort(); + } + for (int i = 0; i < analysis.overrideCount; i++) { + metadataBuffer.getShort(); + } + } + + analysis.unknownMetadataBytes = metadataBuffer.remaining(); + } + + private static String detectVersion(final FileChannel channel, final long totalBytes) + throws IOException { + if (totalBytes >= V3_MAGIC.length() + && readString(channel, 0, V3_MAGIC.length()).equals(V3_MAGIC)) { + return V3_MAGIC; + } + if (totalBytes >= V2_MAGIC.length() + && readString(channel, 0, V2_MAGIC.length()).equals(V2_MAGIC)) { + return V2_MAGIC; + } + if (totalBytes >= V1_MAGIC.length() + && readString(channel, totalBytes - V1_MAGIC.length(), V1_MAGIC.length()) + .equals(V1_MAGIC)) { + return V1_MAGIC; + } + return "UNKNOWN"; + } + + private static int getHeadMagicBytes(final String version) { + if (V3_MAGIC.equals(version)) { + return V3_MAGIC.length(); + } + if (V2_MAGIC.equals(version)) { + return V2_MAGIC.length(); + } + return 0; + } + + private static int getTailMagicBytes(final String version) { + if (V3_MAGIC.equals(version)) { + return V3_MAGIC.length(); + } + if (V2_MAGIC.equals(version)) { + return V2_MAGIC.length(); + } + if (V1_MAGIC.equals(version)) { + return V1_MAGIC.length(); + } + return 0; + } + + private static boolean hasTrailingMagic( + final FileChannel channel, final 
long totalBytes, final String version) throws IOException { + final int tailMagicBytes = getTailMagicBytes(version); + if (tailMagicBytes <= 0 || totalBytes < tailMagicBytes) { + return false; + } + return readString(channel, totalBytes - tailMagicBytes, tailMagicBytes).equals(version); + } + + private static String readString(final FileChannel channel, final long offset, final int length) + throws IOException { + final ByteBuffer buffer = ByteBuffer.allocate(length); + readFully(channel, buffer, offset); + buffer.flip(); + return StandardCharsets.UTF_8.decode(buffer).toString(); + } + + private static int readInt(final FileChannel channel, final long offset) throws IOException { + final ByteBuffer buffer = ByteBuffer.allocate(Integer.BYTES); + readFully(channel, buffer, offset); + buffer.flip(); + return buffer.getInt(); + } + + private static void readFully( + final FileChannel channel, final ByteBuffer buffer, final long offset) throws IOException { + long position = offset; + while (buffer.hasRemaining()) { + final int bytesRead = channel.read(buffer, position); + if (bytesRead < 0) { + throw new IOException("Unexpected EOF while reading at offset " + position); + } + position += bytesRead; + } + } + + private static void printAnalysis(final WalFileAnalysis analysis) { + System.out.println("=== WAL File Layout Analysis ==="); + System.out.println("file: " + analysis.file.getAbsolutePath()); + System.out.println("version: " + analysis.version); + System.out.println("total: " + formatBytes(analysis.totalBytes)); + if (analysis.note != null) { + System.out.println("note: " + analysis.note); + } + System.out.println(); + + printSection("head magic", analysis.headMagicBytes, analysis.totalBytes); + printSection("segment headers", analysis.segmentHeaderBytes, analysis.totalBytes); + printSection("segment payload", analysis.segmentPayloadBytes, analysis.totalBytes); + printSection("wal end marker", analysis.endMarkerBytes, analysis.totalBytes); + printSection("footer 
metadata", analysis.metadataBytes, analysis.totalBytes); + printSection("metadata size field", analysis.metadataSizeFieldBytes, analysis.totalBytes); + printSection("tail magic", analysis.tailMagicBytes, analysis.totalBytes); + final long accountedBytes = + analysis.headMagicBytes + + analysis.segmentHeaderBytes + + analysis.segmentPayloadBytes + + analysis.endMarkerBytes + + analysis.metadataBytes + + analysis.metadataSizeFieldBytes + + analysis.tailMagicBytes; + if (analysis.totalBytes >= accountedBytes) { + printSection("unaccounted", analysis.totalBytes - accountedBytes, analysis.totalBytes); + } + + System.out.println(); + System.out.println( + String.format( + Locale.ROOT, + "segments: total=%d, compressed=%d", + analysis.segmentCount, + analysis.compressedSegmentCount)); + if (analysis.segmentParseWarning != null) { + System.out.println("segment warning: " + analysis.segmentParseWarning); + } + + if (analysis.metadataBytes <= 0) { + return; + } + + System.out.println(); + System.out.println("=== Footer Breakdown ==="); + printSection("v2-compatible base", analysis.getV2BaseMetadataBytes(), analysis.totalBytes); + if (V3_MAGIC.equals(analysis.version)) { + printSection("v3 extension total", analysis.getV3ExtensionBytes(), analysis.totalBytes); + System.out.println( + String.format( + Locale.ROOT, + "v3 extension share of footer: %s", + formatPercent(analysis.getV3ExtensionBytes(), analysis.metadataBytes))); + printSection(" min/max data ts", analysis.minMaxDataTsBytes, analysis.totalBytes); + printSection(" physicalTimes[]", analysis.physicalTimesBytes, analysis.totalBytes); + printSection(" localSeqs[]", analysis.localSeqsBytes, analysis.totalBytes); + printSection( + " default writer identity + override count", + analysis.defaultWriterIdentityBytes + analysis.overrideCountFieldBytes, + analysis.totalBytes); + printSection(" overrideIndexes[]", analysis.overrideIndexesBytes, analysis.totalBytes); + printSection(" overrideNodeIds[]", 
analysis.overrideNodeIdsBytes, analysis.totalBytes); + printSection( + " overrideWriterEpochs[]", analysis.overrideWriterEpochsBytes, analysis.totalBytes); + } + if (analysis.unknownMetadataBytes > 0) { + printSection("unknown metadata tail", analysis.unknownMetadataBytes, analysis.totalBytes); + } + System.out.println( + String.format( + Locale.ROOT, + "entries=%d, memTables=%d, overrides=%d", + analysis.entryCount, + analysis.memTableCount, + analysis.overrideCount)); + if (analysis.footerWarning != null) { + System.out.println("footer warning: " + analysis.footerWarning); + } + } + + private static void printSection(final String name, final long bytes, final long totalBytes) { + System.out.println( + String.format( + Locale.ROOT, + "%-42s %12s %8s", + name + ":", + formatBytes(bytes), + formatPercent(bytes, totalBytes))); + } + + private static String formatBytes(final long bytes) { + final long absBytes = Math.abs(bytes); + if (absBytes < 1024L) { + return bytes + " B"; + } + if (absBytes < 1024L * 1024L) { + return String.format(Locale.ROOT, "%.2f KiB", bytes / 1024.0d); + } + if (absBytes < 1024L * 1024L * 1024L) { + return String.format(Locale.ROOT, "%.2f MiB", bytes / 1024.0d / 1024.0d); + } + return String.format(Locale.ROOT, "%.2f GiB", bytes / 1024.0d / 1024.0d / 1024.0d); + } + + private static String formatPercent(final long bytes, final long totalBytes) { + if (totalBytes <= 0) { + return "N/A"; + } + return String.format(Locale.ROOT, "%.2f%%", bytes * 100.0d / totalBytes); + } + + private static final class WalFileAnalysis { + private final File file; + private final String version; + private final long totalBytes; + + private long headMagicBytes; + private long segmentHeaderBytes; + private long segmentPayloadBytes; + private long endMarkerBytes; + private int metadataBytes; + private long metadataSizeFieldBytes; + private long tailMagicBytes; + + private long footerStartOffset; + private long segmentStartOffset; + private long 
segmentEndOffsetExclusive; + private long segmentRegionBytes; + + private int segmentCount; + private int compressedSegmentCount; + + private int entryCount; + private int memTableCount; + private int overrideCount; + private long firstSearchIndexBytes; + private long entryCountBytes; + private long bufferSizeArrayBytes; + private long memTableCountFieldBytes; + private long memTableIdsBytes; + private long minMaxDataTsBytes; + private long physicalTimesBytes; + private long localSeqsBytes; + private long defaultWriterIdentityBytes; + private long overrideCountFieldBytes; + private long overrideIndexesBytes; + private long overrideNodeIdsBytes; + private long overrideWriterEpochsBytes; + private long unknownMetadataBytes; + + private String note; + private String segmentParseWarning; + private String footerWarning; + + private WalFileAnalysis(final File file, final String version, final long totalBytes) { + this.file = file; + this.version = version; + this.totalBytes = totalBytes; + } + + private long getV2BaseMetadataBytes() { + return firstSearchIndexBytes + + entryCountBytes + + bufferSizeArrayBytes + + memTableCountFieldBytes + + memTableIdsBytes; + } + + private long getV3ExtensionBytes() { + return minMaxDataTsBytes + + physicalTimesBytes + + localSeqsBytes + + defaultWriterIdentityBytes + + overrideCountFieldBytes + + overrideIndexesBytes + + overrideNodeIdsBytes + + overrideWriterEpochsBytes; + } + } +} diff --git a/example/subscription/src/main/java/org/apache/iotdb/ConsensusSubscriptionPerfTest.java b/example/subscription/src/main/java/org/apache/iotdb/ConsensusSubscriptionPerfTest.java new file mode 100644 index 0000000000000..a6fa862da011a --- /dev/null +++ b/example/subscription/src/main/java/org/apache/iotdb/ConsensusSubscriptionPerfTest.java @@ -0,0 +1,1526 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.iotdb; + +import org.apache.iotdb.rpc.subscription.config.TopicConfig; +import org.apache.iotdb.rpc.subscription.config.TopicConstant; +import org.apache.iotdb.rpc.subscription.payload.poll.TopicProgress; +import org.apache.iotdb.session.subscription.ISubscriptionTreeSession; +import org.apache.iotdb.session.subscription.SubscriptionTreeSessionBuilder; +import org.apache.iotdb.session.subscription.consumer.tree.SubscriptionTreePullConsumer; +import org.apache.iotdb.session.subscription.consumer.tree.SubscriptionTreePullConsumerBuilder; +import org.apache.iotdb.session.subscription.payload.PollResult; +import org.apache.iotdb.session.subscription.payload.SubscriptionMessage; +import org.apache.iotdb.session.subscription.payload.SubscriptionMessageType; + +import org.apache.tsfile.enums.TSDataType; +import org.apache.tsfile.utils.BitMap; +import org.apache.tsfile.write.record.Tablet; +import org.apache.tsfile.write.schema.IMeasurementSchema; + +import java.time.Instant; +import java.time.ZoneId; +import java.time.format.DateTimeFormatter; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.HashMap; +import java.util.Iterator; +import java.util.List; +import java.util.Locale; +import java.util.Map; 
+import java.util.NavigableMap; +import java.util.Objects; +import java.util.Properties; +import java.util.Random; +import java.util.TreeMap; +import java.util.concurrent.locks.LockSupport; + +/** + * Manual performance test for consensus subscription. + * + *

<p>Typical usage: + * + * <pre>
+ *   java ... org.apache.iotdb.ConsensusSubscriptionPerfTest
+ *   java ... org.apache.iotdb.ConsensusSubscriptionPerfTest --topic=topic_perf --group=cg_perf
+ *   java ... org.apache.iotdb.ConsensusSubscriptionPerfTest --path=root.db_bench.**
+ *   java ... org.apache.iotdb.ConsensusSubscriptionPerfTest --orderMode=per-writer
+ *   java ... org.apache.iotdb.ConsensusSubscriptionPerfTest --topic=topic_perf --createTopicOnly=true
+ * </pre>
+ * + * <p>
This tool is designed to be started before a benchmark writer (for example iot-benchmark). It + * creates a live topic by default and continuously prints subscription throughput statistics. + */ +public class ConsensusSubscriptionPerfTest { + + private static final DateTimeFormatter TIME_FORMATTER = + DateTimeFormatter.ofPattern("HH:mm:ss").withZone(ZoneId.systemDefault()); + private static final long RANDOM_SEEK_CHECKPOINT_INTERVAL_ROWS = 100_000L; + + public static void main(final String[] args) throws Exception { + final PerfConfig config = PerfConfig.parse(args); + + if (config.help) { + printUsage(); + return; + } + + System.out.println("=== Consensus Subscription Performance Test ==="); + System.out.println(config); + + if (config.autoCreateTopic) { + createTopicIfNeeded(config); + } + + if (config.createTopicOnly) { + System.out.println( + String.format( + Locale.ROOT, + "[%s] Topic is ready. Exiting due to createTopicOnly=true", + nowString())); + return; + } + + final PerfStats stats = new PerfStats(config.enableEquivalentRowTracking()); + final RandomSeekController randomSeekController = new RandomSeekController(config.randomSeek); + final ScheduledSeekController scheduledSeekController = + new ScheduledSeekController(config.seekCaptureRows > 0 && config.seekTriggerNanos > 0); + final ConsumerRestartController consumerRestartController = + new ConsumerRestartController(config.consumerStopNanos > 0); + final ConsumerPauseController consumerPauseController = + new ConsumerPauseController(config.consumerPauseEveryRows); + long startNanoTime; + long lastReportNanoTime; + final Snapshot[] lastSnapshot = new Snapshot[1]; + final ProcessingRateLimiter processingRateLimiter = + new ProcessingRateLimiter(config.targetPointsPerSec); + SubscriptionTreePullConsumer consumer = null; + PollResult lastPollResult = emptyPollResult(stats); + + try { + consumer = openAndSubscribeConsumer(config); + + System.out.println( + String.format( + Locale.ROOT, "[%s] Subscribed. 
Waiting for benchmark writes...", nowString())); + + if (config.waitBeforePollNanos > 0) { + System.out.println( + String.format( + Locale.ROOT, + "[%s] Delaying poll start for %.3f second(s)...", + nowString(), + config.waitBeforePollSec)); + LockSupport.parkNanos(config.waitBeforePollNanos); + } + + System.out.println(String.format(Locale.ROOT, "[%s] Starting poll loop.", nowString())); + + startNanoTime = System.nanoTime(); + lastReportNanoTime = startNanoTime; + lastSnapshot[0] = Snapshot.capture(stats); + + while (config.durationSec <= 0 + || nanosToSeconds(System.nanoTime() - startNanoTime) < config.durationSec) { + final long loopNowNanoTime = System.nanoTime(); + final long elapsedNanoTime = loopNowNanoTime - startNanoTime; + + if (shouldStopConsumer(config, consumerRestartController, elapsedNanoTime) + && Objects.nonNull(consumer)) { + consumerRestartController.stopPerformed = true; + consumerRestartController.stoppedNanoTime = System.nanoTime(); + System.out.println( + String.format( + Locale.ROOT, + "[%s] Consumer polling paused at elapsedSec=%.3f; polling will resume at %.3f second(s).", + nowString(), + elapsedNanoTime / 1_000_000_000.0d, + config.consumerResumeSec)); + } + + if (shouldPauseConsumerByRows(config, consumerPauseController, stats.totalRows) + && Objects.nonNull(consumer)) { + consumerPauseController.pausePerformedCount++; + consumerPauseController.paused = true; + consumerPauseController.stoppedNanoTime = System.nanoTime(); + consumerPauseController.nextPauseRows = stats.totalRows + config.consumerPauseEveryRows; + System.out.println( + String.format( + Locale.ROOT, + "[%s] Consumer paused after rows=%d; polling will resume in %.3f second(s).", + nowString(), + stats.totalRows, + config.consumerPauseDurationSec)); + } + + if (shouldResumeConsumer(config, consumerRestartController, elapsedNanoTime) + && Objects.nonNull(consumer)) { + final long resumedNanoTime = System.nanoTime(); + processingRateLimiter.pauseForDowntime( + resumedNanoTime 
- consumerRestartController.stoppedNanoTime); + consumerRestartController.resumePerformed = true; + System.out.println( + String.format( + Locale.ROOT, + "[%s] Consumer polling resumed at elapsedSec=%.3f after downtimeSec=%.3f.", + nowString(), + (resumedNanoTime - startNanoTime) / 1_000_000_000.0d, + (resumedNanoTime - consumerRestartController.stoppedNanoTime) + / 1_000_000_000.0d)); + } + + if (shouldResumeConsumerByRows(config, consumerPauseController) + && Objects.nonNull(consumer)) { + final long resumedNanoTime = System.nanoTime(); + processingRateLimiter.pauseForDowntime( + resumedNanoTime - consumerPauseController.stoppedNanoTime); + consumerPauseController.paused = false; + System.out.println( + String.format( + Locale.ROOT, + "[%s] Consumer resumed after row-based pause at rows=%d, downtimeSec=%.3f.", + nowString(), + stats.totalRows, + (resumedNanoTime - consumerPauseController.stoppedNanoTime) / 1_000_000_000.0d)); + } + + final boolean pollingPaused = + consumerRestartController.enabled + && consumerRestartController.stopPerformed + && !consumerRestartController.resumePerformed + || consumerPauseController.enabled && consumerPauseController.paused; + + final PollResult pollResult; + if (Objects.nonNull(consumer) && !pollingPaused) { + pollResult = consumer.pollWithInfo(config.pollTimeoutMs); + handlePollResult( + pollResult, + stats, + config.processDelayNanos, + processingRateLimiter, + config.ingestWallTimeSensor); + captureScheduledSeekCheckpoint(consumer, config, stats, scheduledSeekController); + captureRandomSeekCheckpoint(consumer, config, stats, randomSeekController); + maybePerformScheduledSeek( + consumer, config, stats, scheduledSeekController, System.nanoTime() - startNanoTime); + maybePerformRandomSeek(consumer, config, stats, randomSeekController); + } else { + LockSupport.parkNanos(Math.min(100_000_000L, config.pollTimeoutMs * 1_000_000L)); + pollResult = emptyPollResult(stats); + } + lastPollResult = pollResult; + + final long 
nowNanoTime = System.nanoTime(); + if (nowNanoTime - lastReportNanoTime >= config.reportIntervalSec * 1_000_000_000L) { + printReport( + "interval", + lastSnapshot[0], + Snapshot.capture(stats), + nowNanoTime - lastReportNanoTime, + pollResult); + lastSnapshot[0] = Snapshot.capture(stats); + lastReportNanoTime = nowNanoTime; + } + } + + printReport( + "final", + Snapshot.zero(), + Snapshot.capture(stats), + System.nanoTime() - startNanoTime, + lastPollResult); + } finally { + if (Objects.nonNull(consumer)) { + consumer.close(); + } + } + } + + private static void createTopicIfNeeded(final PerfConfig config) throws Exception { + try (final ISubscriptionTreeSession session = + new SubscriptionTreeSessionBuilder() + .host(config.host) + .port(config.port) + .username(config.username) + .password(config.password) + .build()) { + session.open(); + + final Properties topicConfig = new Properties(); + topicConfig.put(TopicConstant.MODE_KEY, TopicConstant.MODE_LIVE_VALUE); + topicConfig.put(TopicConstant.FORMAT_KEY, TopicConstant.FORMAT_RECORD_HANDLER_VALUE); + topicConfig.put(TopicConstant.PATH_KEY, config.path); + topicConfig.put(TopicConstant.ORDER_MODE_KEY, config.orderMode); + session.createTopicIfNotExists(config.topic, topicConfig); + } + } + + private static SubscriptionTreePullConsumer createConsumer(final PerfConfig config) { + return (SubscriptionTreePullConsumer) + new SubscriptionTreePullConsumerBuilder() + .host(config.host) + .port(config.port) + .username(config.username) + .password(config.password) + .consumerId(config.consumer) + .consumerGroupId(config.group) + .autoCommit(config.autoCommit) + .autoCommitIntervalMs(config.autoCommitIntervalMs) + .maxPollParallelism(1) + .build(); + } + + private static SubscriptionTreePullConsumer openAndSubscribeConsumer(final PerfConfig config) + throws Exception { + final SubscriptionTreePullConsumer consumer = createConsumer(config); + consumer.open(); + consumer.subscribe(config.topic); + return consumer; + } + + 
private static PollResult emptyPollResult(final PerfStats stats) { + return new PollResult(Collections.emptyList(), 0, stats.lastWatermark); + } + + private static boolean shouldStopConsumer( + final PerfConfig config, + final ConsumerRestartController controller, + final long elapsedNanoTime) { + return controller.enabled + && !controller.stopPerformed + && elapsedNanoTime >= config.consumerStopNanos; + } + + private static boolean shouldResumeConsumer( + final PerfConfig config, + final ConsumerRestartController controller, + final long elapsedNanoTime) { + return controller.enabled + && controller.stopPerformed + && !controller.resumePerformed + && elapsedNanoTime >= config.consumerResumeNanos; + } + + private static boolean shouldPauseConsumerByRows( + final PerfConfig config, final ConsumerPauseController controller, final long totalRows) { + return controller.enabled + && !controller.paused + && totalRows > 0 + && totalRows >= controller.nextPauseRows + && config.consumerPauseEveryRows > 0; + } + + private static boolean shouldResumeConsumerByRows( + final PerfConfig config, final ConsumerPauseController controller) { + return controller.enabled + && controller.paused + && controller.stoppedNanoTime > 0 + && System.nanoTime() - controller.stoppedNanoTime >= config.consumerPauseDurationNanos; + } + + private static void handlePollResult( + final PollResult pollResult, + final PerfStats stats, + final long processDelayNanos, + final ProcessingRateLimiter processingRateLimiter, + final String ingestWallTimeSensor) { + stats.totalPollCalls++; + stats.lastBufferedCount = pollResult.getBufferedCount(); + if (pollResult.getWatermark() >= 0) { + stats.lastWatermark = pollResult.getWatermark(); + } + + final List messages = pollResult.getMessages(); + if (messages.isEmpty()) { + stats.emptyPollCalls++; + return; + } + + for (final SubscriptionMessage message : messages) { + stats.totalMessages++; + + if (message.getMessageType() == 
SubscriptionMessageType.WATERMARK.getType()) { + stats.totalWatermarkMessages++; + if (message.getWatermarkTimestamp() >= 0) { + stats.lastWatermark = Math.max(stats.lastWatermark, message.getWatermarkTimestamp()); + } + continue; + } + + if (message.getMessageType() == SubscriptionMessageType.TS_FILE.getType()) { + stats.totalTsFileMessages++; + maybeApplyProcessingDelay(processDelayNanos, processingRateLimiter, 0); + continue; + } + + if (message.getMessageType() == SubscriptionMessageType.RECORD_HANDLER.getType()) { + final Iterator tabletIterator = message.getRecordTabletIterator(); + while (tabletIterator.hasNext()) { + final Tablet tablet = tabletIterator.next(); + stats.totalTablets++; + final int rowSize = tablet.getRowSize(); + stats.totalRows += rowSize; + stats.totalApproxBytes += tablet.ramBytesUsed(); + updateOrderingStats(stats, tablet, rowSize); + updateLatencyStats(stats, tablet, rowSize, ingestWallTimeSensor); + maybeApplyProcessingDelay( + processDelayNanos, processingRateLimiter, estimateTabletPoints(tablet, rowSize)); + } + } + } + } + + private static long estimateTabletPoints(final Tablet tablet, final int rowSize) { + if (rowSize <= 0) { + return 0L; + } + return (long) rowSize * tablet.getSchemas().size(); + } + + private static void updateOrderingStats( + final PerfStats stats, final Tablet tablet, final int rowSize) { + if (rowSize <= 0) { + return; + } + + final String deviceId = Objects.toString(tablet.getDeviceId(), ""); + long lastSeenTimestamp = stats.lastSeenTimestampByDevice.getOrDefault(deviceId, Long.MIN_VALUE); + + for (int rowIndex = 0; rowIndex < rowSize; rowIndex++) { + final long currentTimestamp = tablet.getTimestamp(rowIndex); + if (stats.equivalentRowTracker == null + || stats.equivalentRowTracker.record(deviceId, currentTimestamp)) { + stats.totalEquivalentRows++; + } + if (lastSeenTimestamp != Long.MIN_VALUE && currentTimestamp < lastSeenTimestamp) { + stats.totalOutOfOrderRows++; + final long regression = 
lastSeenTimestamp - currentTimestamp; + if (regression > stats.maxTimestampRegression) { + stats.maxTimestampRegression = regression; + } + } + if (currentTimestamp > lastSeenTimestamp) { + lastSeenTimestamp = currentTimestamp; + } + } + + stats.lastSeenTimestampByDevice.put(deviceId, lastSeenTimestamp); + } + + private static void updateLatencyStats( + final PerfStats stats, + final Tablet tablet, + final int rowSize, + final String ingestWallTimeSensor) { + if (rowSize <= 0 || ingestWallTimeSensor == null || ingestWallTimeSensor.isEmpty()) { + return; + } + + final int sensorIndex = findMeasurementIndex(tablet, ingestWallTimeSensor); + if (sensorIndex < 0) { + return; + } + + final List schemas = tablet.getSchemas(); + if (sensorIndex >= schemas.size() + || schemas.get(sensorIndex).getType() != TSDataType.INT64 + || sensorIndex >= tablet.getValues().length + || !(tablet.getValues()[sensorIndex] instanceof long[])) { + return; + } + + final long[] ingestWallTimes = (long[]) tablet.getValues()[sensorIndex]; + final BitMap[] bitMaps = tablet.getBitMaps(); + final BitMap bitMap = + bitMaps != null && sensorIndex < bitMaps.length ? 
bitMaps[sensorIndex] : null; + final long nowMs = System.currentTimeMillis(); + + for (int rowIndex = 0; rowIndex < rowSize; rowIndex++) { + if (bitMap != null && bitMap.isMarked(rowIndex)) { + continue; + } + + final long ingestWallTimeMs = ingestWallTimes[rowIndex]; + final long latencyMs = Math.max(0L, nowMs - ingestWallTimeMs); + stats.recordLatency(latencyMs); + } + } + + private static int findMeasurementIndex(final Tablet tablet, final String measurementName) { + final List schemas = tablet.getSchemas(); + for (int i = 0, size = schemas.size(); i < size; i++) { + if (measurementName.equals(schemas.get(i).getMeasurementName())) { + return i; + } + } + return -1; + } + + private static void maybeApplyProcessingDelay( + final long processDelayNanos, + final ProcessingRateLimiter processingRateLimiter, + final long processedPoints) { + if (processingRateLimiter.isEnabled()) { + processingRateLimiter.acquire(processedPoints); + return; + } + if (processDelayNanos > 0) { + LockSupport.parkNanos(processDelayNanos); + } + } + + private static void captureRandomSeekCheckpoint( + final SubscriptionTreePullConsumer consumer, + final PerfConfig config, + final PerfStats stats, + final RandomSeekController controller) + throws Exception { + if (!controller.enabled + || stats.totalRows <= 0 + || (controller.lastCapturedRows >= 0 + && stats.totalRows - controller.lastCapturedRows + < RANDOM_SEEK_CHECKPOINT_INTERVAL_ROWS)) { + return; + } + + TopicProgress progress = consumer.committedPositions(config.topic); + String source = "committed"; + if (isEmptyTopicProgress(progress)) { + progress = consumer.positions(config.topic); + source = "current"; + } + if (isEmptyTopicProgress(progress)) { + return; + } + + final TopicProgress safeProgress = new TopicProgress(progress.getRegionProgress()); + if (Objects.equals(controller.lastCapturedProgress, safeProgress)) { + controller.lastCapturedRows = stats.totalRows; + return; + } + + controller.checkpoints.add( + new 
SeekCheckpoint(stats.totalRows, stats.totalEquivalentRows, source, safeProgress)); + controller.lastCapturedRows = stats.totalRows; + controller.lastCapturedProgress = safeProgress; + stats.totalRandomSeekCheckpoints = controller.checkpoints.size(); + } + + private static void captureScheduledSeekCheckpoint( + final SubscriptionTreePullConsumer consumer, + final PerfConfig config, + final PerfStats stats, + final ScheduledSeekController controller) + throws Exception { + if (!controller.enabled || Objects.nonNull(controller.checkpoint)) { + return; + } + if (stats.totalRows < config.seekCaptureRows) { + return; + } + + final TopicProgress currentProgress = consumer.positions(config.topic); + if (isEmptyTopicProgress(currentProgress)) { + return; + } + final TopicProgress committedProgress = consumer.committedPositions(config.topic); + + controller.checkpoint = + new SeekCheckpoint( + stats.totalRows, + stats.totalEquivalentRows, + "current", + new TopicProgress(currentProgress.getRegionProgress())); + + System.out.println( + String.format( + Locale.ROOT, + "[%s] Scheduled seek checkpoint captured: checkpointRows=%d, checkpointEquivalentRows=%d, progressSource=current, triggerSec=%.3f", + nowString(), + controller.checkpoint.rawRows, + controller.checkpoint.equivalentRows, + config.seekTriggerSec)); + } + + private static void maybePerformRandomSeek( + final SubscriptionTreePullConsumer consumer, + final PerfConfig config, + final PerfStats stats, + final RandomSeekController controller) + throws Exception { + if (!controller.enabled + || controller.performed + || stats.totalRows < config.randomSeekMinRows + || controller.checkpoints.size() < 2) { + return; + } + + final int candidateCount = controller.checkpoints.size() - 1; + final SeekCheckpoint targetCheckpoint = + controller.checkpoints.get(controller.random.nextInt(candidateCount)); + + consumer.seekAfter(config.topic, targetCheckpoint.topicProgress); + + controller.performed = true; + 
stats.totalRandomSeeks++; + stats.lastRandomSeekSourceRows = targetCheckpoint.rawRows; + stats.lastRandomSeekEquivalentRows = targetCheckpoint.equivalentRows; + stats.lastRandomSeekObservedRows = stats.totalRows; + + System.out.println( + String.format( + Locale.ROOT, + "[%s] Random seekAfter triggered: checkpointRows=%d, checkpointEquivalentRows=%d, progressSource=%s, checkpointCount=%d", + nowString(), + targetCheckpoint.rawRows, + targetCheckpoint.equivalentRows, + targetCheckpoint.source, + controller.checkpoints.size())); + } + + private static void maybePerformScheduledSeek( + final SubscriptionTreePullConsumer consumer, + final PerfConfig config, + final PerfStats stats, + final ScheduledSeekController controller, + final long elapsedNanoTime) + throws Exception { + if (!controller.enabled + || controller.performed + || Objects.isNull(controller.checkpoint) + || elapsedNanoTime < config.seekTriggerNanos) { + return; + } + + consumer.seekAfter(config.topic, controller.checkpoint.topicProgress); + + controller.performed = true; + stats.totalRandomSeeks++; + + System.out.println( + String.format( + Locale.ROOT, + "[%s] Scheduled seekAfter triggered: checkpointRows=%d, checkpointEquivalentRows=%d, progressSource=%s, triggerSec=%.3f", + nowString(), + controller.checkpoint.rawRows, + controller.checkpoint.equivalentRows, + controller.checkpoint.source, + config.seekTriggerSec)); + } + + private static boolean isEmptyTopicProgress(final TopicProgress topicProgress) { + return Objects.isNull(topicProgress) || topicProgress.getRegionProgress().isEmpty(); + } + + private static void printReport( + final String label, + final Snapshot previous, + final Snapshot current, + final long elapsedNanoTime, + final PollResult pollResult) { + final double seconds = Math.max(1e-9, elapsedNanoTime / 1_000_000_000.0d); + + final long intervalMessages = current.totalMessages - previous.totalMessages; + final long intervalTablets = current.totalTablets - previous.totalTablets; + 
final long intervalRows = current.totalRows - previous.totalRows; + final long intervalEquivalentRows = current.totalEquivalentRows - previous.totalEquivalentRows; + final long intervalBytes = current.totalApproxBytes - previous.totalApproxBytes; + final long intervalWatermarks = + current.totalWatermarkMessages - previous.totalWatermarkMessages; + final long intervalOutOfOrderRows = current.totalOutOfOrderRows - previous.totalOutOfOrderRows; + final double intervalOutOfOrderRatio = + intervalRows <= 0 ? 0d : intervalOutOfOrderRows * 100.0d / intervalRows; + final double totalOutOfOrderRatio = + current.totalRows <= 0 ? 0d : current.totalOutOfOrderRows * 100.0d / current.totalRows; + final LatencySummary intervalLatency = LatencySummary.delta(previous, current); + final LatencySummary totalLatency = LatencySummary.total(current); + + System.out.println( + String.format( + Locale.ROOT, + "[%s] %-8s msgs=%d (%.1f/s), tablets=%d (%.1f/s), rows=%d (%.1f/s), eqRows=%d (%.1f/s), bytes=%s (%s/s), " + + "watermarks=%d, oooRows=%d (%.4f%%), totalOoo=%.4f%%, maxTsBack=%d, " + + "latRows=%d, latAvgMs=%s, latP95Ms=%s, latP99Ms=%s, latMaxMs=%s, totalLatAvgMs=%s, totalLatP95Ms=%s, totalLatP99Ms=%s, totalLatMaxMs=%s, " + + "totalRows=%d, equivalentRows=%d, replayRows=%d, seeks=%d, totalBytes=%s, polls=%d, emptyPolls=%d, buffered=%d, watermark=%s", + nowString(), + label, + intervalMessages, + intervalMessages / seconds, + intervalTablets, + intervalTablets / seconds, + intervalRows, + intervalRows / seconds, + intervalEquivalentRows, + intervalEquivalentRows / seconds, + formatBytes(intervalBytes), + formatBytes((long) (intervalBytes / seconds)), + intervalWatermarks, + intervalOutOfOrderRows, + intervalOutOfOrderRatio, + totalOutOfOrderRatio, + current.maxTimestampRegression, + intervalLatency.sampleCount, + intervalLatency.formatAverageMs(), + intervalLatency.p95MsLabel, + intervalLatency.p99MsLabel, + intervalLatency.maxMsLabel, + totalLatency.formatAverageMs(), + 
totalLatency.p95MsLabel, + totalLatency.p99MsLabel, + totalLatency.maxMsLabel, + current.totalRows, + current.totalEquivalentRows, + current.totalRows - current.totalEquivalentRows, + current.totalRandomSeeks, + formatBytes(current.totalApproxBytes), + current.totalPollCalls, + current.emptyPollCalls, + pollResult.getBufferedCount(), + formatWatermark(current.lastWatermark))); + } + + private static String formatWatermark(final long watermark) { + return watermark >= 0 ? Long.toString(watermark) : "N/A"; + } + + private static String formatBytes(final long bytes) { + final long absBytes = Math.abs(bytes); + if (absBytes < 1024) { + return bytes + " B"; + } + if (absBytes < 1024L * 1024) { + return String.format(Locale.ROOT, "%.2f KiB", bytes / 1024.0d); + } + if (absBytes < 1024L * 1024 * 1024) { + return String.format(Locale.ROOT, "%.2f MiB", bytes / 1024.0d / 1024.0d); + } + return String.format(Locale.ROOT, "%.2f GiB", bytes / 1024.0d / 1024.0d / 1024.0d); + } + + private static String nowString() { + return TIME_FORMATTER.format(Instant.now()); + } + + private static long nanosToSeconds(final long nanos) { + return nanos / 1_000_000_000L; + } + + private static void printUsage() { + System.out.println("Usage:"); + System.out.println( + " java ... 
org.apache.iotdb.ConsensusSubscriptionPerfTest [--key=value ...]"); + System.out.println(); + System.out.println("Available keys:"); + System.out.println(" host=127.0.0.1"); + System.out.println(" port=6667"); + System.out.println(" username=root"); + System.out.println(" password=root"); + System.out.println(" topic=topic_perf_"); + System.out.println(" group=cg_perf_"); + System.out.println(" consumer=consumer_perf_"); + System.out.println(" path=root.**"); + System.out.println(" orderMode=leader-only"); + System.out.println(" autoCreateTopic=true"); + System.out.println(" createTopicOnly=false"); + System.out.println(" autoCommit=true"); + System.out.println(" autoCommitIntervalMs=1000"); + System.out.println(" pollTimeoutMs=1000"); + System.out.println(" waitBeforePollSec=0"); + System.out.println(" reportIntervalSec=5"); + System.out.println(" durationSec=0 (0 means run until manually stopped)"); + System.out.println(" processDelayMs=0 (delay per non-watermark message, decimal allowed)"); + System.out.println(" targetPointsPerSec=0 (0 disables point-rate limiting)"); + System.out.println(" ingestWallTimeSensor=ingest_wall_time_ms"); + System.out.println(" randomSeek=false"); + System.out.println(" randomSeekMinRows=1000000"); + System.out.println(" seekCaptureRows=0 (0 disables scheduled checkpoint capture)"); + System.out.println(" seekTriggerSec=0 (0 disables scheduled seek)"); + System.out.println( + " consumerStopSec=0 (0 disables consumer polling pause/resume simulation)"); + System.out.println(" consumerResumeSec=0 (must be > consumerStopSec when enabled)"); + System.out.println(" consumerPauseEveryRows=0 (0 disables row-based recurring pauses)"); + System.out.println( + " consumerPauseDurationSec=0 (must be > 0 when consumerPauseEveryRows is enabled)"); + } + + private static final class PerfConfig { + private final boolean help; + private final String host; + private final int port; + private final String username; + private final String password; + 
private final String topic; + private final String group; + private final String consumer; + private final String path; + private final String orderMode; + private final String ingestWallTimeSensor; + private final boolean autoCreateTopic; + private final boolean createTopicOnly; + private final boolean autoCommit; + private final long autoCommitIntervalMs; + private final long pollTimeoutMs; + private final double waitBeforePollSec; + private final long waitBeforePollNanos; + private final long reportIntervalSec; + private final long durationSec; + private final double processDelayMs; + private final long processDelayNanos; + private final double targetPointsPerSec; + private final boolean randomSeek; + private final long randomSeekMinRows; + private final long seekCaptureRows; + private final double seekTriggerSec; + private final long seekTriggerNanos; + private final double consumerStopSec; + private final long consumerStopNanos; + private final double consumerResumeSec; + private final long consumerResumeNanos; + private final long consumerPauseEveryRows; + private final double consumerPauseDurationSec; + private final long consumerPauseDurationNanos; + + private PerfConfig( + final boolean help, + final String host, + final int port, + final String username, + final String password, + final String topic, + final String group, + final String consumer, + final String path, + final String orderMode, + final String ingestWallTimeSensor, + final boolean autoCreateTopic, + final boolean createTopicOnly, + final boolean autoCommit, + final long autoCommitIntervalMs, + final long pollTimeoutMs, + final double waitBeforePollSec, + final long waitBeforePollNanos, + final long reportIntervalSec, + final long durationSec, + final double processDelayMs, + final long processDelayNanos, + final double targetPointsPerSec, + final boolean randomSeek, + final long randomSeekMinRows, + final long seekCaptureRows, + final double seekTriggerSec, + final long seekTriggerNanos, + 
final double consumerStopSec, + final long consumerStopNanos, + final double consumerResumeSec, + final long consumerResumeNanos, + final long consumerPauseEveryRows, + final double consumerPauseDurationSec, + final long consumerPauseDurationNanos) { + this.help = help; + this.host = host; + this.port = port; + this.username = username; + this.password = password; + this.topic = topic; + this.group = group; + this.consumer = consumer; + this.path = path; + this.orderMode = orderMode; + this.ingestWallTimeSensor = ingestWallTimeSensor; + this.autoCreateTopic = autoCreateTopic; + this.createTopicOnly = createTopicOnly; + this.autoCommit = autoCommit; + this.autoCommitIntervalMs = autoCommitIntervalMs; + this.pollTimeoutMs = pollTimeoutMs; + this.waitBeforePollSec = waitBeforePollSec; + this.waitBeforePollNanos = waitBeforePollNanos; + this.reportIntervalSec = reportIntervalSec; + this.durationSec = durationSec; + this.processDelayMs = processDelayMs; + this.processDelayNanos = processDelayNanos; + this.targetPointsPerSec = targetPointsPerSec; + this.randomSeek = randomSeek; + this.randomSeekMinRows = randomSeekMinRows; + this.seekCaptureRows = seekCaptureRows; + this.seekTriggerSec = seekTriggerSec; + this.seekTriggerNanos = seekTriggerNanos; + this.consumerStopSec = consumerStopSec; + this.consumerStopNanos = consumerStopNanos; + this.consumerResumeSec = consumerResumeSec; + this.consumerResumeNanos = consumerResumeNanos; + this.consumerPauseEveryRows = consumerPauseEveryRows; + this.consumerPauseDurationSec = consumerPauseDurationSec; + this.consumerPauseDurationNanos = consumerPauseDurationNanos; + } + + private static PerfConfig parse(final String[] args) { + final long suffix = System.currentTimeMillis(); + String host = "127.0.0.1"; + int port = 6667; + String username = "root"; + String password = "root"; + String topic = "topic_perf_" + suffix; + String group = "cg_perf_" + suffix; + String consumer = "consumer_perf_" + suffix; + String path = "root.**"; + 
String orderMode = TopicConstant.ORDER_MODE_DEFAULT_VALUE; + orderMode = TopicConstant.ORDER_MODE_PER_WRITER_VALUE; + String ingestWallTimeSensor = "ingest_wall_time_ms"; + boolean autoCreateTopic = true; + boolean createTopicOnly = false; + boolean autoCommit = true; + long autoCommitIntervalMs = 1000L; + long pollTimeoutMs = 1000L; + double waitBeforePollSec = 0d; + long reportIntervalSec = 1L; + long durationSec = 0L; + double processDelayMs = 0d; + double targetPointsPerSec = 10_000_000d; + boolean randomSeek = false; + long randomSeekMinRows = 2_000_000L; + long seekCaptureRows = 0L; + double seekTriggerSec = 0d; + double consumerStopSec = 0d; + double consumerResumeSec = 0d; + long consumerPauseEveryRows = 0L; + double consumerPauseDurationSec = 0d; + boolean help = false; + + for (final String arg : args) { + if ("--help".equals(arg) || "-h".equals(arg)) { + help = true; + continue; + } + + final String normalized = arg.startsWith("--") ? arg.substring(2) : arg; + final int separator = normalized.indexOf('='); + if (separator <= 0) { + throw new IllegalArgumentException( + "Invalid argument: " + arg + ". 
Expected format --key=value"); + } + + final String key = normalized.substring(0, separator); + final String value = normalized.substring(separator + 1); + + switch (key) { + case "host": + host = value; + break; + case "port": + port = Integer.parseInt(value); + break; + case "username": + username = value; + break; + case "password": + password = value; + break; + case "topic": + topic = value; + break; + case "group": + group = value; + break; + case "consumer": + consumer = value; + break; + case "path": + path = value; + break; + case "orderMode": + case "order-mode": + orderMode = TopicConfig.normalizeOrderMode(value); + break; + case "ingestWallTimeSensor": + case "ingest-wall-time-sensor": + ingestWallTimeSensor = value; + break; + case "autoCreateTopic": + autoCreateTopic = Boolean.parseBoolean(value); + break; + case "createTopicOnly": + createTopicOnly = Boolean.parseBoolean(value); + break; + case "autoCommit": + autoCommit = Boolean.parseBoolean(value); + break; + case "autoCommitIntervalMs": + autoCommitIntervalMs = Long.parseLong(value); + break; + case "pollTimeoutMs": + pollTimeoutMs = Long.parseLong(value); + break; + case "waitBeforePollSec": + waitBeforePollSec = Double.parseDouble(value); + break; + case "reportIntervalSec": + reportIntervalSec = Long.parseLong(value); + break; + case "durationSec": + durationSec = Long.parseLong(value); + break; + case "processDelayMs": + processDelayMs = Double.parseDouble(value); + break; + case "targetPointsPerSec": + case "target-points-per-sec": + targetPointsPerSec = Double.parseDouble(value); + break; + case "randomSeek": + randomSeek = Boolean.parseBoolean(value); + break; + case "randomSeekMinRows": + randomSeekMinRows = Long.parseLong(value); + break; + case "seekCaptureRows": + seekCaptureRows = Long.parseLong(value); + break; + case "seekTriggerSec": + seekTriggerSec = Double.parseDouble(value); + break; + case "consumerStopSec": + case "consumer-stop-sec": + consumerStopSec = 
Double.parseDouble(value); + break; + case "consumerResumeSec": + case "consumer-resume-sec": + consumerResumeSec = Double.parseDouble(value); + break; + case "consumerPauseEveryRows": + case "consumer-pause-every-rows": + consumerPauseEveryRows = Long.parseLong(value); + break; + case "consumerPauseDurationSec": + case "consumer-pause-duration-sec": + consumerPauseDurationSec = Double.parseDouble(value); + break; + default: + throw new IllegalArgumentException("Unknown argument key: " + key); + } + } + + if (!TopicConfig.isValidOrderMode(orderMode)) { + throw new IllegalArgumentException("Unsupported orderMode: " + orderMode); + } + if (processDelayMs < 0) { + throw new IllegalArgumentException("processDelayMs must be >= 0"); + } + if (targetPointsPerSec < 0) { + throw new IllegalArgumentException("targetPointsPerSec must be >= 0"); + } + if (waitBeforePollSec < 0) { + throw new IllegalArgumentException("waitBeforePollSec must be >= 0"); + } + if (randomSeekMinRows < 0) { + throw new IllegalArgumentException("randomSeekMinRows must be >= 0"); + } + if (seekCaptureRows < 0) { + throw new IllegalArgumentException("seekCaptureRows must be >= 0"); + } + if (seekTriggerSec < 0) { + throw new IllegalArgumentException("seekTriggerSec must be >= 0"); + } + if (consumerStopSec < 0) { + throw new IllegalArgumentException("consumerStopSec must be >= 0"); + } + if (consumerResumeSec < 0) { + throw new IllegalArgumentException("consumerResumeSec must be >= 0"); + } + if (consumerPauseEveryRows < 0) { + throw new IllegalArgumentException("consumerPauseEveryRows must be >= 0"); + } + if (consumerPauseDurationSec < 0) { + throw new IllegalArgumentException("consumerPauseDurationSec must be >= 0"); + } + if ((seekCaptureRows > 0) != (seekTriggerSec > 0)) { + throw new IllegalArgumentException( + "seekCaptureRows and seekTriggerSec must both be set to positive values to enable scheduled seek"); + } + if ((consumerStopSec > 0) != (consumerResumeSec > 0)) { + throw new 
IllegalArgumentException( + "consumerStopSec and consumerResumeSec must both be set to positive values to enable consumer polling pause/resume simulation"); + } + if (consumerResumeSec > 0 && consumerResumeSec <= consumerStopSec) { + throw new IllegalArgumentException( + "consumerResumeSec must be greater than consumerStopSec"); + } + if ((consumerPauseEveryRows > 0) != (consumerPauseDurationSec > 0)) { + throw new IllegalArgumentException( + "consumerPauseEveryRows and consumerPauseDurationSec must both be set to positive values to enable row-based recurring pauses"); + } + if (consumerPauseEveryRows > 0 && consumerStopSec > 0) { + throw new IllegalArgumentException( + "consumerPauseEveryRows/consumerPauseDurationSec cannot be combined with consumerStopSec/consumerResumeSec"); + } + + final long waitBeforePollNanos = Math.round(waitBeforePollSec * 1_000_000_000.0d); + final long processDelayNanos = Math.round(processDelayMs * 1_000_000.0d); + final long seekTriggerNanos = Math.round(seekTriggerSec * 1_000_000_000.0d); + final long consumerStopNanos = Math.round(consumerStopSec * 1_000_000_000.0d); + final long consumerResumeNanos = Math.round(consumerResumeSec * 1_000_000_000.0d); + final long consumerPauseDurationNanos = + Math.round(consumerPauseDurationSec * 1_000_000_000.0d); + + return new PerfConfig( + help, + host, + port, + username, + password, + topic, + group, + consumer, + path, + orderMode, + ingestWallTimeSensor, + autoCreateTopic, + createTopicOnly, + autoCommit, + autoCommitIntervalMs, + pollTimeoutMs, + waitBeforePollSec, + waitBeforePollNanos, + reportIntervalSec, + durationSec, + processDelayMs, + processDelayNanos, + targetPointsPerSec, + randomSeek, + randomSeekMinRows, + seekCaptureRows, + seekTriggerSec, + seekTriggerNanos, + consumerStopSec, + consumerStopNanos, + consumerResumeSec, + consumerResumeNanos, + consumerPauseEveryRows, + consumerPauseDurationSec, + consumerPauseDurationNanos); + } + + @Override + public String toString() { + 
return String.format( + Locale.ROOT, + "Config{host=%s, port=%d, username=%s, topic=%s, group=%s, consumer=%s, path=%s, " + + "orderMode=%s, ingestWallTimeSensor=%s, autoCreateTopic=%s, createTopicOnly=%s, autoCommit=%s, autoCommitIntervalMs=%d, pollTimeoutMs=%d, " + + "waitBeforePollSec=%.3f, " + + "reportIntervalSec=%d, durationSec=%d, processDelayMs=%.3f, targetPointsPerSec=%.3f, randomSeek=%s, randomSeekMinRows=%d, seekCaptureRows=%d, seekTriggerSec=%.3f, consumerStopSec=%.3f, consumerResumeSec=%.3f, consumerPauseEveryRows=%d, consumerPauseDurationSec=%.3f}", + host, + port, + username, + topic, + group, + consumer, + path, + orderMode, + ingestWallTimeSensor, + autoCreateTopic, + createTopicOnly, + autoCommit, + autoCommitIntervalMs, + pollTimeoutMs, + waitBeforePollSec, + reportIntervalSec, + durationSec, + processDelayMs, + targetPointsPerSec, + randomSeek, + randomSeekMinRows, + seekCaptureRows, + seekTriggerSec, + consumerStopSec, + consumerResumeSec, + consumerPauseEveryRows, + consumerPauseDurationSec); + } + + private boolean enableEquivalentRowTracking() { + return randomSeek || (seekCaptureRows > 0 && seekTriggerSec > 0); + } + } + + private static final class ProcessingRateLimiter { + private final double targetPointsPerSec; + private long throttlingStartNanoTime = -1L; + private long totalProcessedPoints = 0L; + + private ProcessingRateLimiter(final double targetPointsPerSec) { + this.targetPointsPerSec = targetPointsPerSec; + } + + private boolean isEnabled() { + return targetPointsPerSec > 0d; + } + + private void acquire(final long processedPoints) { + if (!isEnabled() || processedPoints <= 0) { + return; + } + + final long nowNanoTime = System.nanoTime(); + if (throttlingStartNanoTime < 0) { + throttlingStartNanoTime = nowNanoTime; + } + + totalProcessedPoints += processedPoints; + final long targetElapsedNanos = + (long) Math.ceil((totalProcessedPoints * 1_000_000_000.0d) / targetPointsPerSec); + final long actualElapsedNanos = nowNanoTime - 
throttlingStartNanoTime; + final long remainingNanos = targetElapsedNanos - actualElapsedNanos; + if (remainingNanos > 0) { + LockSupport.parkNanos(remainingNanos); + } + } + + private void pauseForDowntime(final long pausedNanos) { + if (!isEnabled() || throttlingStartNanoTime < 0 || pausedNanos <= 0) { + return; + } + throttlingStartNanoTime += pausedNanos; + } + } + + private static final class PerfStats { + private long totalPollCalls; + private long emptyPollCalls; + private long totalMessages; + private long totalWatermarkMessages; + private long totalTsFileMessages; + private long totalTablets; + private long totalRows; + private long totalEquivalentRows; + private long totalApproxBytes; + private long totalOutOfOrderRows; + private long maxTimestampRegression; + private long totalLatencySamples; + private long totalLatencySumMs; + private final long[] latencyHistogramBuckets = new long[LatencyHistogram.BUCKET_COUNT]; + private int lastBufferedCount; + private long lastWatermark = -1L; + private final Map lastSeenTimestampByDevice = new HashMap<>(); + private final EquivalentRowTracker equivalentRowTracker; + private long totalRandomSeeks; + private long totalRandomSeekCheckpoints; + private long lastRandomSeekSourceRows = -1L; + private long lastRandomSeekEquivalentRows = -1L; + private long lastRandomSeekObservedRows = -1L; + + private PerfStats(final boolean enableEquivalentRowTracking) { + this.equivalentRowTracker = enableEquivalentRowTracking ? 
new EquivalentRowTracker() : null; + } + + private void recordLatency(final long latencyMs) { + totalLatencySamples++; + totalLatencySumMs += latencyMs; + latencyHistogramBuckets[LatencyHistogram.bucketIndex(latencyMs)]++; + } + } + + private static final class Snapshot { + private final long totalPollCalls; + private final long emptyPollCalls; + private final long totalMessages; + private final long totalWatermarkMessages; + private final long totalTablets; + private final long totalRows; + private final long totalEquivalentRows; + private final long totalApproxBytes; + private final long totalOutOfOrderRows; + private final long maxTimestampRegression; + private final long totalLatencySamples; + private final long totalLatencySumMs; + private final long[] latencyHistogramBuckets; + private final long lastWatermark; + private final long totalRandomSeeks; + + private Snapshot( + final long totalPollCalls, + final long emptyPollCalls, + final long totalMessages, + final long totalWatermarkMessages, + final long totalTablets, + final long totalRows, + final long totalEquivalentRows, + final long totalApproxBytes, + final long totalOutOfOrderRows, + final long maxTimestampRegression, + final long totalLatencySamples, + final long totalLatencySumMs, + final long[] latencyHistogramBuckets, + final long lastWatermark, + final long totalRandomSeeks) { + this.totalPollCalls = totalPollCalls; + this.emptyPollCalls = emptyPollCalls; + this.totalMessages = totalMessages; + this.totalWatermarkMessages = totalWatermarkMessages; + this.totalTablets = totalTablets; + this.totalRows = totalRows; + this.totalEquivalentRows = totalEquivalentRows; + this.totalApproxBytes = totalApproxBytes; + this.totalOutOfOrderRows = totalOutOfOrderRows; + this.maxTimestampRegression = maxTimestampRegression; + this.totalLatencySamples = totalLatencySamples; + this.totalLatencySumMs = totalLatencySumMs; + this.latencyHistogramBuckets = latencyHistogramBuckets; + this.lastWatermark = lastWatermark; 
+ this.totalRandomSeeks = totalRandomSeeks; + } + + private static Snapshot capture(final PerfStats stats) { + Objects.requireNonNull(stats, "stats"); + return new Snapshot( + stats.totalPollCalls, + stats.emptyPollCalls, + stats.totalMessages, + stats.totalWatermarkMessages, + stats.totalTablets, + stats.totalRows, + stats.totalEquivalentRows, + stats.totalApproxBytes, + stats.totalOutOfOrderRows, + stats.maxTimestampRegression, + stats.totalLatencySamples, + stats.totalLatencySumMs, + Arrays.copyOf(stats.latencyHistogramBuckets, stats.latencyHistogramBuckets.length), + stats.lastWatermark, + stats.totalRandomSeeks); + } + + private static Snapshot zero() { + return new Snapshot( + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, new long[LatencyHistogram.BUCKET_COUNT], -1L, 0); + } + } + + private static final class RandomSeekController { + private final boolean enabled; + private final Random random = new Random(); + private final List checkpoints = new ArrayList<>(); + private boolean performed; + private long lastCapturedRows = Long.MIN_VALUE; + private TopicProgress lastCapturedProgress; + + private RandomSeekController(final boolean enabled) { + this.enabled = enabled; + } + } + + private static final class ScheduledSeekController { + private final boolean enabled; + private boolean performed; + private SeekCheckpoint checkpoint; + + private ScheduledSeekController(final boolean enabled) { + this.enabled = enabled; + } + } + + private static final class ConsumerRestartController { + private final boolean enabled; + private boolean stopPerformed; + private boolean resumePerformed; + private long stoppedNanoTime = -1L; + + private ConsumerRestartController(final boolean enabled) { + this.enabled = enabled; + } + } + + private static final class ConsumerPauseController { + private final boolean enabled; + private long nextPauseRows; + private boolean paused; + private long stoppedNanoTime = -1L; + private long pausePerformedCount; + + private ConsumerPauseController(final 
long pauseEveryRows) { + this.enabled = pauseEveryRows > 0; + this.nextPauseRows = pauseEveryRows; + } + } + + private static final class SeekCheckpoint { + private final long rawRows; + private final long equivalentRows; + private final String source; + private final TopicProgress topicProgress; + + private SeekCheckpoint( + final long rawRows, + final long equivalentRows, + final String source, + final TopicProgress topicProgress) { + this.rawRows = rawRows; + this.equivalentRows = equivalentRows; + this.source = source; + this.topicProgress = topicProgress; + } + } + + private static final class EquivalentRowTracker { + private final Map> intervalsByDevice = new HashMap<>(); + + private boolean record(final String deviceId, final long timestamp) { + final NavigableMap intervals = + intervalsByDevice.computeIfAbsent(deviceId, ignored -> new TreeMap<>()); + final Map.Entry floor = intervals.floorEntry(timestamp); + if (Objects.nonNull(floor) && floor.getValue() >= timestamp) { + return false; + } + + long start = timestamp; + long end = timestamp; + + if (Objects.nonNull(floor) && floor.getValue() + 1 == timestamp) { + start = floor.getKey(); + intervals.remove(floor.getKey()); + } + + final Map.Entry ceiling = intervals.ceilingEntry(timestamp); + if (Objects.nonNull(ceiling) && ceiling.getKey() - 1 == timestamp) { + end = ceiling.getValue(); + intervals.remove(ceiling.getKey()); + } + + intervals.put(start, end); + return true; + } + } + + private static final class LatencyHistogram { + private static final int MAX_TRACKED_LATENCY_MS = 60_000; + private static final int BUCKET_COUNT = MAX_TRACKED_LATENCY_MS + 2; + + private static int bucketIndex(final long latencyMs) { + if (latencyMs <= 0) { + return 0; + } + if (latencyMs > MAX_TRACKED_LATENCY_MS) { + return MAX_TRACKED_LATENCY_MS + 1; + } + return (int) latencyMs; + } + + private static String bucketLabel(final int bucketIndex) { + if (bucketIndex > MAX_TRACKED_LATENCY_MS) { + return ">" + 
MAX_TRACKED_LATENCY_MS; + } + return Integer.toString(bucketIndex); + } + } + + private static final class LatencySummary { + private final long sampleCount; + private final long sumMs; + private final String p95MsLabel; + private final String p99MsLabel; + private final String maxMsLabel; + + private LatencySummary( + final long sampleCount, + final long sumMs, + final String p95MsLabel, + final String p99MsLabel, + final String maxMsLabel) { + this.sampleCount = sampleCount; + this.sumMs = sumMs; + this.p95MsLabel = p95MsLabel; + this.p99MsLabel = p99MsLabel; + this.maxMsLabel = maxMsLabel; + } + + private static LatencySummary delta(final Snapshot previous, final Snapshot current) { + final long sampleCount = current.totalLatencySamples - previous.totalLatencySamples; + final long sumMs = current.totalLatencySumMs - previous.totalLatencySumMs; + if (sampleCount <= 0) { + return empty(); + } + return summarize( + sampleCount, sumMs, current.latencyHistogramBuckets, previous.latencyHistogramBuckets); + } + + private static LatencySummary total(final Snapshot current) { + if (current.totalLatencySamples <= 0) { + return empty(); + } + return summarize( + current.totalLatencySamples, + current.totalLatencySumMs, + current.latencyHistogramBuckets, + null); + } + + private static LatencySummary summarize( + final long sampleCount, + final long sumMs, + final long[] currentBuckets, + final long[] previousBuckets) { + final long p95Threshold = Math.max(1L, (long) Math.ceil(sampleCount * 0.95d)); + final long p99Threshold = Math.max(1L, (long) Math.ceil(sampleCount * 0.99d)); + long cumulative = 0L; + String p95 = "N/A"; + String p99 = "N/A"; + String max = "N/A"; + + for (int bucketIndex = 0; bucketIndex < currentBuckets.length; bucketIndex++) { + final long bucketCount = + currentBuckets[bucketIndex] + - (previousBuckets == null ? 
0L : previousBuckets[bucketIndex]); + if (bucketCount <= 0) { + continue; + } + + cumulative += bucketCount; + if ("N/A".equals(p95) && cumulative >= p95Threshold) { + p95 = LatencyHistogram.bucketLabel(bucketIndex); + } + if ("N/A".equals(p99) && cumulative >= p99Threshold) { + p99 = LatencyHistogram.bucketLabel(bucketIndex); + } + } + + for (int bucketIndex = currentBuckets.length - 1; bucketIndex >= 0; bucketIndex--) { + final long bucketCount = + currentBuckets[bucketIndex] + - (previousBuckets == null ? 0L : previousBuckets[bucketIndex]); + if (bucketCount > 0) { + max = LatencyHistogram.bucketLabel(bucketIndex); + break; + } + } + + return new LatencySummary(sampleCount, sumMs, p95, p99, max); + } + + private static LatencySummary empty() { + return new LatencySummary(0L, 0L, "N/A", "N/A", "N/A"); + } + + private String formatAverageMs() { + if (sampleCount <= 0) { + return "N/A"; + } + return String.format(Locale.ROOT, "%.2f", sumMs / (double) sampleCount); + } + } +} diff --git a/example/subscription/src/main/java/org/apache/iotdb/ConsensusSubscriptionTableTest.java b/example/subscription/src/main/java/org/apache/iotdb/ConsensusSubscriptionTableTest.java new file mode 100644 index 0000000000000..a9cb090ccf8e0 --- /dev/null +++ b/example/subscription/src/main/java/org/apache/iotdb/ConsensusSubscriptionTableTest.java @@ -0,0 +1,2216 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.iotdb; + +import org.apache.iotdb.isession.ITableSession; +import org.apache.iotdb.rpc.subscription.config.TopicConstant; +import org.apache.iotdb.rpc.subscription.payload.poll.RegionProgress; +import org.apache.iotdb.rpc.subscription.payload.poll.SubscriptionCommitContext; +import org.apache.iotdb.rpc.subscription.payload.poll.TopicProgress; +import org.apache.iotdb.rpc.subscription.payload.poll.WriterId; +import org.apache.iotdb.rpc.subscription.payload.poll.WriterProgress; +import org.apache.iotdb.session.TableSessionBuilder; +import org.apache.iotdb.session.subscription.ISubscriptionTableSession; +import org.apache.iotdb.session.subscription.SubscriptionTableSessionBuilder; +import org.apache.iotdb.session.subscription.consumer.ISubscriptionTablePullConsumer; +import org.apache.iotdb.session.subscription.consumer.base.ColumnAlignProcessor; +import org.apache.iotdb.session.subscription.consumer.base.WatermarkProcessor; +import org.apache.iotdb.session.subscription.consumer.table.SubscriptionTablePullConsumer; +import org.apache.iotdb.session.subscription.consumer.table.SubscriptionTablePullConsumerBuilder; +import org.apache.iotdb.session.subscription.payload.SubscriptionMessage; +import org.apache.iotdb.session.subscription.payload.SubscriptionRecordHandler.SubscriptionResultSet; + +import org.apache.tsfile.enums.ColumnCategory; +import org.apache.tsfile.enums.TSDataType; +import org.apache.tsfile.write.record.Tablet; +import org.apache.tsfile.write.schema.IMeasurementSchema; +import 
org.apache.tsfile.write.schema.MeasurementSchema; + +import java.time.Duration; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Properties; +import java.util.Set; +import java.util.concurrent.CountDownLatch; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; +import java.util.concurrent.atomic.AtomicInteger; +import java.util.stream.Collectors; + +/** TODO: Move these manual tests into ITs */ +public class ConsensusSubscriptionTableTest { + + private static final String HOST = "127.0.0.1"; + private static final int PORT = 6667; + private static final String USER = "root"; + private static final String PASSWORD = "root"; + + private static int testCounter = 0; + private static int passed = 0; + private static int failed = 0; + private static final List failedTests = new ArrayList<>(); + + public static void main(String[] args) throws Exception { + System.out.println("=== Consensus-Based Subscription Table Model Test Suite ===\n"); + + String targetTest = args.length > 0 ? 
args[0] : null; + + if (targetTest == null || "testBasicFlow".equals(targetTest)) { + runTest("testBasicFlow", ConsensusSubscriptionTableTest::testBasicFlow); + } + if (targetTest == null || "testDataTypes".equals(targetTest)) { + runTest("testDataTypes", ConsensusSubscriptionTableTest::testDataTypes); + } + if (targetTest == null || "testFilteringAndTopicSelection".equals(targetTest)) { + runTest( + "testFilteringAndTopicSelection", + ConsensusSubscriptionTableTest::testFilteringAndTopicSelection); + } + if (targetTest == null || "testSubscribeBeforeRegion".equals(targetTest)) { + runTest( + "testSubscribeBeforeRegion", ConsensusSubscriptionTableTest::testSubscribeBeforeRegion); + } + if (targetTest == null || "testMultiEntityIsolation".equals(targetTest)) { + runTest("testMultiEntityIsolation", ConsensusSubscriptionTableTest::testMultiEntityIsolation); + } + if (targetTest == null || "testWalCatchUpAndGapRecovery".equals(targetTest)) { + runTest( + "testWalCatchUpAndGapRecovery", + ConsensusSubscriptionTableTest::testWalCatchUpAndGapRecovery); + } + if (targetTest == null || "testSeekAndPositionSemantics".equals(targetTest)) { + runTest( + "testSeekAndPositionSemantics", + ConsensusSubscriptionTableTest::testSeekAndPositionSemantics); + } + if (targetTest == null || "testConsumerRestartRecovery".equals(targetTest)) { + runTest( + "testConsumerRestartRecovery", + ConsensusSubscriptionTableTest::testConsumerRestartRecovery); + } + if (targetTest == null || "testAckNackAndPoisonSemantics".equals(targetTest)) { + runTest( + "testAckNackAndPoisonSemantics", + ConsensusSubscriptionTableTest::testAckNackAndPoisonSemantics); + } + if (targetTest == null || "testProcessorWatermarkAndMetadata".equals(targetTest)) { + runTest( + "testProcessorWatermarkAndMetadata", + ConsensusSubscriptionTableTest::testProcessorWatermarkAndMetadata); + } + + // Summary + System.out.println("\n=== Test Suite Summary ==="); + System.out.println("Passed: " + passed); + 
System.out.println("Failed: " + failed); + if (!failedTests.isEmpty()) { + System.out.println("Failed tests: " + failedTests); + } + System.out.println("=== Done ==="); + } + + // ============================ + // Test Infrastructure + // ============================ + + @FunctionalInterface + interface TestMethod { + void run() throws Exception; + } + + private static void runTest(String name, TestMethod test) { + System.out.println("\n" + "================================================================="); + System.out.println("Running: " + name); + System.out.println("================================================================="); + try { + test.run(); + passed++; + System.out.println(">>> PASSED: " + name); + } catch (AssertionError e) { + failed++; + failedTests.add(name); + System.out.println(">>> FAILED: " + name + " - " + e.getMessage()); + e.printStackTrace(System.out); + } catch (Exception e) { + failed++; + failedTests.add(name); + System.out.println(">>> ERROR: " + name + " - " + e.getMessage()); + e.printStackTrace(System.out); + } + } + + private static String nextDatabase() { + testCounter++; + return "csub_tbl_" + testCounter; + } + + private static String nextTopic() { + return "topic_tbl_" + testCounter; + } + + private static String nextConsumerGroup() { + return "cg_tbl_" + testCounter; + } + + private static String nextConsumerId() { + return "consumer_tbl_" + testCounter; + } + + private static ITableSession openTableSession() throws Exception { + return new TableSessionBuilder() + .nodeUrls(Collections.singletonList(HOST + ":" + PORT)) + .username(USER) + .password(PASSWORD) + .build(); + } + + private static void createDatabaseAndTable( + ITableSession session, String database, String tableName, String tableSchema) + throws Exception { + session.executeNonQueryStatement("CREATE DATABASE IF NOT EXISTS " + database); + session.executeNonQueryStatement("USE " + database); + session.executeNonQueryStatement(String.format("CREATE TABLE %s 
(%s)", tableName, tableSchema)); + } + + private static void deleteDatabase(String database) { + try (ITableSession session = openTableSession()) { + session.executeNonQueryStatement("DROP DATABASE IF EXISTS " + database); + } catch (Exception e) { + // ignore + } + } + + private static void dropTopicTable(String topicName) { + try (ISubscriptionTableSession subSession = + new SubscriptionTableSessionBuilder() + .host(HOST) + .port(PORT) + .username(USER) + .password(PASSWORD) + .build()) { + subSession.dropTopicIfExists(topicName); + } catch (Exception e) { + // ignore + } + } + + private static void createTopicTable(String topicName, String dbKey, String tableKey) + throws Exception { + try (ISubscriptionTableSession subSession = + new SubscriptionTableSessionBuilder() + .host(HOST) + .port(PORT) + .username(USER) + .password(PASSWORD) + .build()) { + try { + subSession.dropTopicIfExists(topicName); + } catch (Exception e) { + // ignore + } + + Properties topicConfig = new Properties(); + topicConfig.put(TopicConstant.MODE_KEY, TopicConstant.MODE_LIVE_VALUE); + topicConfig.put( + TopicConstant.FORMAT_KEY, TopicConstant.FORMAT_SESSION_DATA_SETS_HANDLER_VALUE); + topicConfig.put(TopicConstant.DATABASE_KEY, dbKey); + topicConfig.put(TopicConstant.TABLE_KEY, tableKey); + subSession.createTopic(topicName, topicConfig); + System.out.println( + " Created topic: " + topicName + " (database=" + dbKey + ", table=" + tableKey + ")"); + } + } + + private static ISubscriptionTablePullConsumer createConsumer( + String consumerId, String consumerGroupId) throws Exception { + ISubscriptionTablePullConsumer consumer = + new SubscriptionTablePullConsumerBuilder() + .host(HOST) + .port(PORT) + .consumerId(consumerId) + .consumerGroupId(consumerGroupId) + .autoCommit(false) + .build(); + consumer.open(); + return consumer; + } + + // ============================ + // Polling & Verification + // ============================ + + /** + * Poll and commit messages. 
/**
 * Polls and commits messages with the default settings.
 *
 * <p>Delegates to the five-argument overload with a poll timeout of 1000 ms and message
 * committing enabled. When polling stops (expected rows reached and stream quiet, stuck, no data,
 * or attempts exhausted) is decided entirely by that overload.
 *
 * @param consumer consumer to poll
 * @param expectedRows number of rows the caller expects to receive
 * @param maxPollAttempts upper bound on the number of poll calls
 * @return the accumulated poll statistics
 */
private static PollResult pollUntilComplete(
    ISubscriptionTablePullConsumer consumer, int expectedRows, int maxPollAttempts) {
  return pollUntilComplete(consumer, expectedRows, maxPollAttempts, 1000, true);
}

After reaching expectedRows, continues polling until 5 consecutive empty polls confirm + * quiescence. Any extra rows polled are included in the count (will break assertEquals). + * + * @param commitMessages if false, messages are NOT committed + */ + private static PollResult pollUntilComplete( + ISubscriptionTablePullConsumer consumer, + int expectedRows, + int maxPollAttempts, + long pollTimeoutMs, + boolean commitMessages) { + PollResult result = new PollResult(); + int consecutiveEmpty = 0; + + for (int attempt = 1; attempt <= maxPollAttempts; attempt++) { + List messages = consumer.poll(Duration.ofMillis(pollTimeoutMs)); + + if (messages.isEmpty()) { + consecutiveEmpty++; + // Normal completion: reached expected rows and verified quiescence + if (consecutiveEmpty >= 3 && result.totalRows >= expectedRows) { + System.out.println( + " Verified: " + + consecutiveEmpty + + " consecutive empty polls after " + + result.totalRows + + " rows (expected " + + expectedRows + + ")"); + break; + } + // Stuck: have data but cannot reach expected count + if (consecutiveEmpty >= 5 && result.totalRows > 0) { + System.out.println( + " Stuck: " + + consecutiveEmpty + + " consecutive empty polls at " + + result.totalRows + + " rows (expected " + + expectedRows + + ")"); + break; + } + // Never received anything + if (consecutiveEmpty >= 10 && result.totalRows == 0 && expectedRows > 0) { + System.out.println(" No data received after " + consecutiveEmpty + " polls"); + break; + } + try { + Thread.sleep(1000); + } catch (InterruptedException ignored) { + } + continue; + } + + consecutiveEmpty = 0; + + for (SubscriptionMessage message : messages) { + for (SubscriptionResultSet dataSet : getResultSets(message)) { + String tableName = dataSet.getTableName(); + String databaseName = dataSet.getDatabaseName(); + List columnNames = dataSet.getColumnNames(); + + while (dataSet.hasNext()) { + try { + org.apache.tsfile.read.common.RowRecord record = dataSet.nextRecord(); + 
result.totalRows++; + if (tableName != null) { + result.rowsPerTable.merge(tableName, 1, Integer::sum); + } + if (databaseName != null) { + result.rowsPerDatabase.merge(databaseName, 1, Integer::sum); + } + for (int i = 0; i < columnNames.size(); i++) { + result.seenColumns.add(columnNames.get(i)); + } + if (result.totalRows <= 5) { + System.out.println( + " Row: time=" + + record.getTimestamp() + + ", values=" + + record.getFields() + + ", table=" + + tableName + + ", database=" + + databaseName); + } + } catch (java.io.IOException e) { + throw new RuntimeException("Failed to iterate subscription result set", e); + } + } + } + if (commitMessages) { + consumer.commitSync(message); + } + } + + System.out.println( + " Poll attempt " + + attempt + + ": totalRows=" + + result.totalRows + + " / expected=" + + expectedRows); + + // Stop immediately if we exceeded the expected row count + if (expectedRows > 0 && result.totalRows > expectedRows) { + System.out.println( + " EXCEEDED: totalRows=" + result.totalRows + " > expectedRows=" + expectedRows); + break; + } + } + + return result; + } + + // ============================ + // Cleanup + // ============================ + + /** Clean up all test artifacts: unsubscribe, close consumer, drop topic, delete database. */ + private static void cleanup( + ISubscriptionTablePullConsumer consumer, String topicName, String database) { + if (consumer != null) { + try { + consumer.unsubscribe(topicName); + } catch (Exception e) { + // ignore + } + try { + consumer.close(); + } catch (Exception e) { + // ignore + } + } + dropTopicTable(topicName); + deleteDatabase(database); + } + + /** Clean up with multiple databases. */ + private static void cleanup( + ISubscriptionTablePullConsumer consumer, String topicName, String... 
databases) { + if (consumer != null) { + try { + consumer.unsubscribe(topicName); + } catch (Exception e) { + // ignore + } + try { + consumer.close(); + } catch (Exception e) { + // ignore + } + } + dropTopicTable(topicName); + for (String db : databases) { + deleteDatabase(db); + } + } + + // ============================ + // Result & Assertions + // ============================ + + static class PollResult { + int totalRows = 0; + Map rowsPerTable = new HashMap<>(); + Map rowsPerDatabase = new HashMap<>(); + Set seenColumns = new HashSet<>(); + + @Override + public String toString() { + return "PollResult{totalRows=" + + totalRows + + ", rowsPerTable=" + + rowsPerTable + + ", rowsPerDatabase=" + + rowsPerDatabase + + ", seenColumns=" + + seenColumns + + "}"; + } + } + + private static void assertEquals(String msg, int expected, int actual) { + if (expected != actual) { + throw new AssertionError(msg + ": expected=" + expected + ", actual=" + actual); + } + } + + private static void assertEquals(String msg, long expected, long actual) { + if (expected != actual) { + throw new AssertionError(msg + ": expected=" + expected + ", actual=" + actual); + } + } + + private static void assertEquals(String msg, String expected, String actual) { + if (expected == null ? 
actual != null : !expected.equals(actual)) { + throw new AssertionError(msg + ": expected=" + expected + ", actual=" + actual); + } + } + + private static void assertTrue(String msg, boolean condition) { + if (!condition) { + throw new AssertionError(msg); + } + } + + private static void assertAtLeast(String msg, int min, int actual) { + if (actual < min) { + throw new AssertionError(msg + ": expected at least " + min + ", actual=" + actual); + } + } + + private static int countWriterFrontiers(TopicProgress topicProgress) { + int writerCount = 0; + if (topicProgress == null || topicProgress.getRegionProgress() == null) { + return 0; + } + for (Map.Entry entry : topicProgress.getRegionProgress().entrySet()) { + if (entry.getValue() != null && entry.getValue().getWriterPositions() != null) { + writerCount += entry.getValue().getWriterPositions().size(); + } + } + return writerCount; + } + + private static int countRows(SubscriptionMessage message) { + int rows = 0; + for (SubscriptionResultSet dataSet : getResultSets(message)) { + while (dataSet.hasNext()) { + try { + dataSet.next(); + rows++; + } catch (java.io.IOException e) { + throw new RuntimeException("Failed to count rows from subscription result set", e); + } + } + } + return rows; + } + + // ====================================================================== + // High-signal 10-test suite wrappers + // ====================================================================== + + private static void testFilteringAndTopicSelection() throws Exception { + testPathFiltering(); + testPollWithInfoTopicFilter(); + } + + private static void testWalCatchUpAndGapRecovery() throws Exception { + testBurstWriteGapRecovery(); + } + + private static void testSeekAndPositionSemantics() throws Exception { + testSeek(); + } + + private static void testAckNackAndPoisonSemantics() throws Exception { + testCommitAfterUnsubscribe(); + testPoisonMessageDrop(); + } + + private static void testProcessorWatermarkAndMetadata() throws 
Exception { + testProcessorFramework(); + testWriterProgressFields(); + } + + // ====================================================================== + // Topic filter subcase for table model + // ====================================================================== + private static void testPollWithInfoTopicFilter() throws Exception { + String database = nextDatabase(); + String topicName1 = "topic_tbl_filter_" + testCounter + "_a"; + String topicName2 = "topic_tbl_filter_" + testCounter + "_b"; + String consumerGroupId = nextConsumerGroup(); + String consumerId = nextConsumerId(); + SubscriptionTablePullConsumer consumer = null; + + try { + try (ITableSession session = openTableSession()) { + createDatabaseAndTable(session, database, "t1", "tag1 STRING TAG, s1 INT64 FIELD"); + session.executeNonQueryStatement("USE " + database); + session.executeNonQueryStatement("CREATE TABLE t2 (tag1 STRING TAG, s1 INT64 FIELD)"); + } + Thread.sleep(2000); + + createTopicTable(topicName1, database, "t1"); + createTopicTable(topicName2, database, "t2"); + Thread.sleep(1000); + + consumer = (SubscriptionTablePullConsumer) createConsumer(consumerId, consumerGroupId); + consumer.subscribe(topicName1, topicName2); + Thread.sleep(3000); + + try (ITableSession session = openTableSession()) { + session.executeNonQueryStatement("USE " + database); + for (int i = 1; i <= 40; i++) { + if (i <= 30) { + session.executeNonQueryStatement( + String.format("INSERT INTO t1 (tag1, s1, time) VALUES ('d1', %d, %d)", i * 10, i)); + } + session.executeNonQueryStatement( + String.format("INSERT INTO t2 (tag1, s1, time) VALUES ('d2', %d, %d)", i * 20, i)); + } + } + Thread.sleep(3000); + + int t1Rows = 0; + Set topic1Only = new HashSet<>(Arrays.asList(topicName1)); + for (int attempt = 0; attempt < 40; attempt++) { + org.apache.iotdb.session.subscription.payload.PollResult pollResult = + consumer.pollWithInfo(topic1Only, 2000); + List msgs = pollResult.getMessages(); + if (msgs.isEmpty()) { + if 
(t1Rows > 0) { + break; + } + Thread.sleep(1000); + continue; + } + for (SubscriptionMessage msg : msgs) { + for (SubscriptionResultSet ds : getResultSets(msg)) { + while (ds.hasNext()) { + ds.next(); + t1Rows++; + assertEquals("Topic1-only poll should stay on t1", "t1", ds.getTableName()); + } + } + consumer.commitSync(msg); + } + } + assertEquals("Topic1 should deliver exactly 30 rows from t1", 30, t1Rows); + + int t2Rows = 0; + Set topic2Only = new HashSet<>(Arrays.asList(topicName2)); + for (int attempt = 0; attempt < 40; attempt++) { + org.apache.iotdb.session.subscription.payload.PollResult pollResult = + consumer.pollWithInfo(topic2Only, 2000); + List msgs = pollResult.getMessages(); + if (msgs.isEmpty()) { + if (t2Rows > 0) { + break; + } + Thread.sleep(1000); + continue; + } + for (SubscriptionMessage msg : msgs) { + for (SubscriptionResultSet ds : getResultSets(msg)) { + while (ds.hasNext()) { + ds.next(); + t2Rows++; + assertEquals("Topic2-only poll should stay on t2", "t2", ds.getTableName()); + } + } + consumer.commitSync(msg); + } + } + assertEquals("Topic2 should deliver exactly 40 rows from t2", 40, t2Rows); + } finally { + if (consumer != null) { + try { + consumer.unsubscribe(topicName1, topicName2); + } catch (Exception e) { + // ignore + } + try { + consumer.close(); + } catch (Exception e) { + // ignore + } + } + dropTopicTable(topicName1); + dropTopicTable(topicName2); + deleteDatabase(database); + } + } + + // ====================================================================== + // Test 8: Consumer Restart Recovery + // ====================================================================== + private static void testConsumerRestartRecovery() throws Exception { + String database = nextDatabase(); + String topicName = nextTopic(); + String consumerGroupId = nextConsumerGroup(); + String consumerId1 = nextConsumerId(); + String consumerId2 = consumerId1 + "_restart"; + SubscriptionTablePullConsumer consumer1 = null; + 
SubscriptionTablePullConsumer consumer2 = null; + + try { + try (ITableSession session = openTableSession()) { + createDatabaseAndTable(session, database, "t1", "tag1 STRING TAG, s1 INT64 FIELD"); + } + Thread.sleep(2000); + + createTopicTable(topicName, database, "t1"); + Thread.sleep(1000); + + consumer1 = (SubscriptionTablePullConsumer) createConsumer(consumerId1, consumerGroupId); + consumer1.subscribe(topicName); + Thread.sleep(3000); + + final int totalRows = 257; + try (ITableSession session = openTableSession()) { + session.executeNonQueryStatement("USE " + database); + for (int i = 1; i <= totalRows; i++) { + session.executeNonQueryStatement( + String.format("INSERT INTO t1 (tag1, s1, time) VALUES ('d1', %d, %d)", i * 10, i)); + } + session.executeNonQueryStatement("FLUSH"); + } + Thread.sleep(3000); + + int committedRows = 0; + for (int attempt = 1; attempt <= 30; attempt++) { + List messages = consumer1.poll(Duration.ofMillis(2000)); + if (messages.isEmpty()) { + Thread.sleep(1000); + continue; + } + SubscriptionMessage firstMessage = messages.get(0); + committedRows = countRows(firstMessage); + consumer1.commitSync(firstMessage); + break; + } + + assertAtLeast("First consumer should commit some rows before restart", 1, committedRows); + TopicProgress checkpoint = consumer1.committedPositions(topicName); + assertTrue( + "Committed checkpoint should not be empty", + checkpoint.getRegionProgress() != null && !checkpoint.getRegionProgress().isEmpty()); + int remainingRows = totalRows - committedRows; + assertAtLeast("Restart scenario should leave rows after the first commit", 1, remainingRows); + + consumer1.close(); + consumer1 = null; + + consumer2 = (SubscriptionTablePullConsumer) createConsumer(consumerId2, consumerGroupId); + consumer2.subscribe(topicName); + Thread.sleep(3000); + consumer2.seekAfter(topicName, checkpoint); + Thread.sleep(1000); + + PollResult resumed = pollUntilComplete(consumer2, remainingRows, 120); + assertEquals( + "Restarted 
consumer should resume from the committed checkpoint without replay", + remainingRows, + resumed.totalRows); + } finally { + cleanup(consumer1, topicName, database); + cleanup(consumer2, topicName, database); + } + } + + // ====================================================================== + // Test 1: Basic Flow (merged: BasicDataDelivery + MultiTables + Flush) + // ====================================================================== + /** + * Verifies: + * + *

/**
 * Basic end-to-end delivery scenario across three tables.
 *
 * <p>Steps performed:
 *
 * <ol>
 *   <li>Create tables {@code t1}, {@code t2}, {@code t3}, write rows to each and flush, all
 *       BEFORE any topic or subscription exists. These rows are expected not to be delivered.
 *   <li>Create a topic on the database with table pattern {@code ".*"} and subscribe a consumer.
 *   <li>Write 30 rows to each of the three tables (timestamps 100..129), then flush.
 *   <li>Poll until complete and assert that exactly 90 rows were received; if per-table counts
 *       were collected, assert that each of the three tables contributed at least one row.
 * </ol>
 *
 * <p>The consumer, topic and database are cleaned up in all cases.
 */
private static void testBasicFlow() throws Exception {
  String database = nextDatabase();
  String topicName = nextTopic();
  String consumerGroupId = nextConsumerGroup();
  String consumerId = nextConsumerId();
  ISubscriptionTablePullConsumer consumer = null;

  try {
    // Step 1: Write initial data to create DataRegion (should NOT be received)
    System.out.println(" Step 1: Writing initial data (should NOT be received)");
    try (ITableSession session = openTableSession()) {
      createDatabaseAndTable(session, database, "t1", "tag1 STRING TAG, s1 INT64 FIELD");
      session.executeNonQueryStatement("USE " + database);
      session.executeNonQueryStatement("CREATE TABLE t2 (tag1 STRING TAG, s1 INT64 FIELD)");
      session.executeNonQueryStatement("CREATE TABLE t3 (tag1 STRING TAG, s1 INT64 FIELD)");
      // 50 pre-subscription rows for t1 (timestamps 0..49), one each for t2 and t3.
      for (int i = 0; i < 50; i++) {
        session.executeNonQueryStatement(
            String.format("INSERT INTO t1 (tag1, s1, time) VALUES ('d1', %d, %d)", i * 10, i));
      }
      session.executeNonQueryStatement("INSERT INTO t2 (tag1, s1, time) VALUES ('d1', 0, 0)");
      session.executeNonQueryStatement("INSERT INTO t3 (tag1, s1, time) VALUES ('d1', 0, 0)");
      session.executeNonQueryStatement("flush");
    }
    Thread.sleep(2000);

    // Step 2: Create topic and subscribe
    System.out.println(" Step 2: Creating topic and subscribing");
    createTopicTable(topicName, database, ".*");
    Thread.sleep(1000);

    consumer = createConsumer(consumerId, consumerGroupId);
    consumer.subscribe(topicName);
    Thread.sleep(3000);

    // Step 3: Write to 3 tables (30 rows each = 90 total), then flush
    System.out.println(" Step 3: Writing 30 rows x 3 tables AFTER subscribe, then flush");
    try (ITableSession session = openTableSession()) {
      session.executeNonQueryStatement("USE " + database);
      // Timestamps 100..129 do not overlap with the pre-subscription rows written in step 1.
      for (int i = 100; i < 130; i++) {
        session.executeNonQueryStatement(
            String.format("INSERT INTO t1 (tag1, s1, time) VALUES ('d1', %d, %d)", i * 10, i));
        session.executeNonQueryStatement(
            String.format("INSERT INTO t2 (tag1, s1, time) VALUES ('d1', %d, %d)", i * 20, i));
        session.executeNonQueryStatement(
            String.format("INSERT INTO t3 (tag1, s1, time) VALUES ('d1', %d, %d)", i * 30, i));
      }
      System.out.println(" Flushing...");
      session.executeNonQueryStatement("flush");
    }
    Thread.sleep(2000);

    // Step 4: Poll and verify
    System.out.println(" Step 4: Polling...");
    PollResult result = pollUntilComplete(consumer, 90, 100);
    System.out.println(" Result: " + result);

    assertEquals("Expected exactly 90 rows (30 per table)", 90, result.totalRows);
    // Per-table checks only run when table names were reported with the rows.
    if (!result.rowsPerTable.isEmpty()) {
      System.out.println(" Rows per table: " + result.rowsPerTable);
      for (String tbl : new String[] {"t1", "t2", "t3"}) {
        Integer tblRows = result.rowsPerTable.get(tbl);
        assertAtLeast("Expected rows from " + tbl, 1, tblRows != null ? tblRows : 0);
      }
    }
  } finally {
    cleanup(consumer, topicName, database);
  }
}
s_bool, s_text, time) " + + "VALUES ('d1', 0, 0, 0.0, 0.0, false, 'init', 0)"); + session.executeNonQueryStatement("flush"); + } + Thread.sleep(2000); + + createTopicTable(topicName, database, ".*"); + Thread.sleep(1000); + + consumer = createConsumer(consumerId, consumerGroupId); + consumer.subscribe(topicName); + Thread.sleep(3000); + + int totalExpected = 0; + try (ITableSession session = openTableSession()) { + session.executeNonQueryStatement("USE " + database); + + // --- Part A: 6 data types x 20 rows, separate INSERTs --- + System.out.println(" Part A: 6 data types x 20 rows (separate INSERTs)"); + for (int i = 1; i <= 20; i++) { + session.executeNonQueryStatement( + String.format("INSERT INTO t1 (tag1, s_int32, time) VALUES ('d1', %d, %d)", i, i)); + session.executeNonQueryStatement( + String.format( + "INSERT INTO t1 (tag1, s_int64, time) VALUES ('d1', %d, %d)", + (long) i * 100000L, i)); + session.executeNonQueryStatement( + String.format( + "INSERT INTO t1 (tag1, s_float, time) VALUES ('d1', %f, %d)", i * 1.1f, i)); + session.executeNonQueryStatement( + String.format( + "INSERT INTO t1 (tag1, s_double, time) VALUES ('d1', %f, %d)", i * 2.2, i)); + session.executeNonQueryStatement( + String.format( + "INSERT INTO t1 (tag1, s_bool, time) VALUES ('d1', %s, %d)", + i % 2 == 0 ? "true" : "false", i)); + session.executeNonQueryStatement( + String.format( + "INSERT INTO t1 (tag1, s_text, time) VALUES ('d1', 'text_%d', %d)", i, i)); + } + totalExpected += 120; // 6 types x 20 rows + + // --- Part B: All-column rows (50 rows) --- + System.out.println(" Part B: 50 all-column rows"); + for (int i = 21; i <= 70; i++) { + session.executeNonQueryStatement( + String.format( + "INSERT INTO t1 (tag1, s_int32, s_int64, s_float, s_double, s_bool, s_text, time)" + + " VALUES ('d1', %d, %d, %f, %f, %s, 'text_%d', %d)", + i, (long) i * 100000L, i * 1.1f, i * 2.2, i % 2 == 0 ? 
"true" : "false", i, i)); + } + totalExpected += 50; + + // --- Part C: Cross-partition writes --- + System.out.println(" Part C: Cross-partition (SQL single, multi, Tablet)"); + long baseTs = 1_000_000_000L; + + // SQL single-row x2 + session.executeNonQueryStatement( + String.format( + "INSERT INTO t1 (tag1, s_int32, s_int64, s_float, s_double, s_bool, s_text, time) " + + "VALUES ('d1', 1, 100, 1.1, 1.11, true, 'xp_single_1', %d)", + baseTs)); + session.executeNonQueryStatement( + String.format( + "INSERT INTO t1 (tag1, s_int32, s_int64, s_float, s_double, s_bool, s_text, time) " + + "VALUES ('d1', 2, 200, 2.2, 2.22, false, 'xp_single_2', %d)", + baseTs + GAP)); + totalExpected += 2; + + // SQL multi-row x3 + session.executeNonQueryStatement( + String.format( + "INSERT INTO t1 (tag1, s_int32, s_int64, s_float, s_double, s_bool, s_text, time) " + + "VALUES ('d1', 3, 300, 3.3, 3.33, true, 'xp_multi_1', %d), " + + "('d1', 4, 400, 4.4, 4.44, false, 'xp_multi_2', %d), " + + "('d1', 5, 500, 5.5, 5.55, true, 'xp_multi_3', %d)", + baseTs + GAP * 2, baseTs + GAP * 3, baseTs + GAP * 4)); + totalExpected += 3; + + // Tablet x4 + List schemaList = new ArrayList<>(); + schemaList.add(new MeasurementSchema("tag1", TSDataType.STRING)); + schemaList.add(new MeasurementSchema("s_int32", TSDataType.INT32)); + schemaList.add(new MeasurementSchema("s_int64", TSDataType.INT64)); + schemaList.add(new MeasurementSchema("s_float", TSDataType.FLOAT)); + schemaList.add(new MeasurementSchema("s_double", TSDataType.DOUBLE)); + schemaList.add(new MeasurementSchema("s_bool", TSDataType.BOOLEAN)); + schemaList.add(new MeasurementSchema("s_text", TSDataType.STRING)); + + List categories = + java.util.Arrays.asList( + ColumnCategory.TAG, + ColumnCategory.FIELD, + ColumnCategory.FIELD, + ColumnCategory.FIELD, + ColumnCategory.FIELD, + ColumnCategory.FIELD, + ColumnCategory.FIELD); + + Tablet tablet = + new Tablet( + "t1", + IMeasurementSchema.getMeasurementNameList(schemaList), + 
IMeasurementSchema.getDataTypeList(schemaList), + categories, + 10); + for (int i = 0; i < 4; i++) { + int row = tablet.getRowSize(); + long ts = baseTs + GAP * (5 + i); + tablet.addTimestamp(row, ts); + tablet.addValue("tag1", row, "d1"); + tablet.addValue("s_int32", row, 6 + i); + tablet.addValue("s_int64", row, (long) (600 + i * 100)); + tablet.addValue("s_float", row, (6 + i) * 1.1f); + tablet.addValue("s_double", row, (6 + i) * 2.22); + tablet.addValue("s_bool", row, i % 2 == 0); + tablet.addValue("s_text", row, "xp_tablet_" + (i + 1)); + } + session.insert(tablet); + totalExpected += 4; + } + + System.out.println(" Total expected rows: " + totalExpected); + Thread.sleep(2000); + + PollResult result = pollUntilComplete(consumer, totalExpected, 200); + System.out.println(" Result: " + result); + + assertAtLeast( + "Expected at least " + totalExpected + " rows", totalExpected, result.totalRows); + assertAtLeast("Expected multiple column types in result", 2, result.seenColumns.size()); + } finally { + cleanup(consumer, topicName, database); + } + } + + // ====================================================================== + // Test 3: Path Filtering (merged: TableLevel + DatabaseLevel) + // ====================================================================== + /** + * Verifies: + * + *
   * <ul>
   *   <li>Table-level: topic on table=t1 does NOT deliver t2 data
   *   <li>Database-level: topic on db1 does NOT deliver db2 data
   * </ul>
+ */ + private static void testPathFiltering() throws Exception { + String database1 = nextDatabase(); + String database2 = database1 + "_other"; + String topicName = nextTopic(); + String consumerGroupId = nextConsumerGroup(); + String consumerId = nextConsumerId(); + ISubscriptionTablePullConsumer consumer = null; + + try { + try (ITableSession session = openTableSession()) { + // db1 with t1 and t2 + createDatabaseAndTable(session, database1, "t1", "tag1 STRING TAG, s1 INT64 FIELD"); + session.executeNonQueryStatement("USE " + database1); + session.executeNonQueryStatement("CREATE TABLE t2 (tag1 STRING TAG, s1 INT64 FIELD)"); + session.executeNonQueryStatement("INSERT INTO t1 (tag1, s1, time) VALUES ('d1', 0, 0)"); + session.executeNonQueryStatement("INSERT INTO t2 (tag1, s1, time) VALUES ('d1', 0, 0)"); + // db2 with t1 + createDatabaseAndTable(session, database2, "t1", "tag1 STRING TAG, s1 INT64 FIELD"); + session.executeNonQueryStatement("USE " + database2); + session.executeNonQueryStatement("INSERT INTO t1 (tag1, s1, time) VALUES ('d1', 0, 0)"); + session.executeNonQueryStatement("flush"); + } + Thread.sleep(2000); + + // Topic: only db1, only table t1 + createTopicTable(topicName, database1, "t1"); + Thread.sleep(1000); + + consumer = createConsumer(consumerId, consumerGroupId); + consumer.subscribe(topicName); + Thread.sleep(3000); + + System.out.println(" Writing to db1.t1, db1.t2, db2.t1 (topic filter: db1.t1 only)"); + try (ITableSession session = openTableSession()) { + session.executeNonQueryStatement("USE " + database1); + for (int i = 100; i < 150; i++) { + session.executeNonQueryStatement( + String.format("INSERT INTO t1 (tag1, s1, time) VALUES ('d1', %d, %d)", i * 10, i)); + session.executeNonQueryStatement( + String.format("INSERT INTO t2 (tag1, s1, time) VALUES ('d1', %d, %d)", i * 20, i)); + } + session.executeNonQueryStatement("USE " + database2); + for (int i = 100; i < 150; i++) { + session.executeNonQueryStatement( + String.format("INSERT 
INTO t1 (tag1, s1, time) VALUES ('d1', %d, %d)", i * 30, i)); + } + } + Thread.sleep(2000); + + System.out.println(" Polling (expecting only db1.t1 data = 50 rows)..."); + PollResult result = pollUntilComplete(consumer, 50, 60); + System.out.println(" Result: " + result); + + assertEquals("Expected exactly 50 rows from db1.t1 only", 50, result.totalRows); + if (!result.rowsPerTable.isEmpty()) { + Integer t2Rows = result.rowsPerTable.get("t2"); + assertTrue("Expected NO rows from t2, but got " + t2Rows, t2Rows == null || t2Rows == 0); + System.out.println(" Table filtering verified: t1 only"); + } + if (!result.rowsPerDatabase.isEmpty()) { + Integer db2Rows = result.rowsPerDatabase.get(database2); + assertTrue("Expected NO rows from " + database2, db2Rows == null || db2Rows == 0); + System.out.println(" Database filtering verified: " + database1 + " only"); + } + } finally { + cleanup(consumer, topicName, database1, database2); + } + } + + // ====================================================================== + // Test 4: Subscribe Before Region Creation (kept as-is) + // ====================================================================== + /** + * Subscribe BEFORE the database/region exists, then create database and write. Tests the + * IoTConsensus.onNewPeerCreated auto-binding path with table model. 
+ */ + private static void testSubscribeBeforeRegion() throws Exception { + String database = nextDatabase(); + String topicName = nextTopic(); + String consumerGroupId = nextConsumerGroup(); + String consumerId = nextConsumerId(); + ISubscriptionTablePullConsumer consumer = null; + + try { + System.out.println(" Step 1: Creating topic BEFORE database exists"); + createTopicTable(topicName, database, ".*"); + Thread.sleep(1000); + + System.out.println(" Step 2: Subscribing (no DataRegion exists yet)"); + consumer = createConsumer(consumerId, consumerGroupId); + consumer.subscribe(topicName); + Thread.sleep(3000); + + System.out.println(" Step 3: Creating database, table and writing data (100 rows)"); + try (ITableSession session = openTableSession()) { + createDatabaseAndTable(session, database, "t1", "tag1 STRING TAG, s1 INT64 FIELD"); + session.executeNonQueryStatement("USE " + database); + for (int i = 0; i < 100; i++) { + session.executeNonQueryStatement( + String.format("INSERT INTO t1 (tag1, s1, time) VALUES ('d1', %d, %d)", i * 10, i)); + } + } + Thread.sleep(5000); + + System.out.println(" Step 4: Polling..."); + PollResult result = pollUntilComplete(consumer, 100, 100); + System.out.println(" Result: " + result); + + if (result.totalRows >= 100) { + System.out.println(" Auto-binding works! All " + result.totalRows + " rows received."); + } else if (result.totalRows > 0) { + System.out.println( + " Partial: " + result.totalRows + "/100 rows. First writes may precede binding."); + } else { + System.out.println(" No data received. 
Check logs for auto-binding messages."); + } + assertAtLeast( + "Expected some rows from subscribe-before-region (auto-binding)", 1, result.totalRows); + } finally { + cleanup(consumer, topicName, database); + } + } + + // testRedelivery removed — will be re-added with proper timeout-based nack testing + + // ====================================================================== + // Test 6: Multi-Entity Isolation + // ====================================================================== + /** + * Verifies: + * + *
   * <ul>
   *   <li>Two consumer groups on the same topic each receive the full data stream independently
   * </ul>
+ */ + private static void testMultiEntityIsolation() throws Exception { + String database = nextDatabase(); + String topicName = "topic_tbl_multi_" + testCounter; + String consumerGroupId1 = "cg_tbl_multi_" + testCounter + "_a"; + String consumerId1 = "consumer_tbl_multi_" + testCounter + "_a"; + String consumerGroupId2 = "cg_tbl_multi_" + testCounter + "_b"; + String consumerId2 = "consumer_tbl_multi_" + testCounter + "_b"; + ISubscriptionTablePullConsumer consumer1 = null; + ISubscriptionTablePullConsumer consumer2 = null; + + try { + // Setup: database with a single table to isolate the multi-group semantics. + try (ITableSession session = openTableSession()) { + createDatabaseAndTable(session, database, "t1", "tag1 STRING TAG, s1 INT64 FIELD"); + session.executeNonQueryStatement("USE " + database); + session.executeNonQueryStatement("INSERT INTO t1 (tag1, s1, time) VALUES ('d1', 0, 0)"); + session.executeNonQueryStatement("flush"); + } + Thread.sleep(2000); + + createTopicTable(topicName, database, "t1"); + Thread.sleep(1000); + + consumer1 = createConsumer(consumerId1, consumerGroupId1); + consumer1.subscribe(topicName); + consumer2 = createConsumer(consumerId2, consumerGroupId2); + consumer2.subscribe(topicName); + Thread.sleep(3000); + + System.out.println(" Writing 70 rows to t1"); + try (ITableSession session = openTableSession()) { + session.executeNonQueryStatement("USE " + database); + for (int i = 1; i <= 70; i++) { + session.executeNonQueryStatement( + String.format("INSERT INTO t1 (tag1, s1, time) VALUES ('d1', %d, %d)", i * 10, i)); + } + } + Thread.sleep(2000); + + System.out.println(" Multi-group isolation"); + System.out.println(" Polling from group 1..."); + PollResult result1 = pollUntilComplete(consumer1, 70, 80); + System.out.println(" Group 1 result: " + result1); + + System.out.println(" Polling from group 2..."); + PollResult result2 = pollUntilComplete(consumer2, 70, 80); + System.out.println(" Group 2 result: " + result2); + + 
assertEquals("Group 1 should receive all 70 rows", 70, result1.totalRows); + assertEquals("Group 2 should receive all 70 rows", 70, result2.totalRows); + assertEquals("Expected 70 rows from t1", 70, result1.rowsPerTable.getOrDefault("t1", 0)); + assertEquals("Expected 70 rows from t1", 70, result2.rowsPerTable.getOrDefault("t1", 0)); + System.out.println( + " Multi-group isolation verified: group1=" + + result1.totalRows + + ", group2=" + + result2.totalRows); + } finally { + if (consumer1 != null) { + try { + consumer1.unsubscribe(topicName); + } catch (Exception e) { + /* ignore */ + } + try { + consumer1.close(); + } catch (Exception e) { + /* ignore */ + } + } + if (consumer2 != null) { + try { + consumer2.unsubscribe(topicName); + } catch (Exception e) { + /* ignore */ + } + try { + consumer2.close(); + } catch (Exception e) { + /* ignore */ + } + } + dropTopicTable(topicName); + deleteDatabase(database); + } + } + + // ====================================================================== + // Test 7: Burst Write Gap Recovery (NEW — tests C2 fix) + // ====================================================================== + /** + * Tests that burst writing beyond the pending queue capacity (4096) does not cause data loss. The + * pending queue overflow triggers gaps, which should be recovered from WAL. + * + *

   * <p>Mechanism: Each {@code IoTConsensusServerImpl.write()} call produces exactly one
   * {@code pendingEntries.offer()}. A single {@code session.insert(tablet)} with N rows in one time
   * partition = 1 write() call = 1 offer, so Tablet batches rarely overflow the queue. To actually
   * overflow, we need 4096+ individual write() calls arriving faster than the prefetch
   * thread can drain. We achieve this with multiple concurrent writer threads, each performing
   * individual SQL INSERTs, to maximize the aggregate write rate vs. drain rate.
   *
   * <p>Note: Gap occurrence is inherently timing-dependent (race between writers and the
   * prefetch drain loop). This test maximizes the probability by using concurrent threads, but
   * cannot guarantee gap occurrence on every run. Check server logs for "gap detected" / "Filling
   * from WAL" messages to confirm the gap path was exercised.
   *

Fix verified: C2 — gap entries are not skipped when WAL fill times out; they are deferred to + * the next prefetch iteration. + */ + private static void testBurstWriteGapRecovery() throws Exception { + String database = nextDatabase(); + String topicName = nextTopic(); + String consumerGroupId = nextConsumerGroup(); + String consumerId = nextConsumerId(); + ISubscriptionTablePullConsumer consumer = null; + + try { + try (ITableSession session = openTableSession()) { + createDatabaseAndTable(session, database, "t1", "tag1 STRING TAG, s1 INT64 FIELD"); + session.executeNonQueryStatement("USE " + database); + session.executeNonQueryStatement("INSERT INTO t1 (tag1, s1, time) VALUES ('d1', 0, 0)"); + session.executeNonQueryStatement("flush"); + } + Thread.sleep(2000); + + createTopicTable(topicName, database, ".*"); + Thread.sleep(1000); + + consumer = createConsumer(consumerId, consumerGroupId); + consumer.subscribe(topicName); + Thread.sleep(3000); + + // Use multiple concurrent writer threads with individual SQL INSERTs. + // Each INSERT → 1 IoTConsensusServerImpl.write() → 1 pendingEntries.offer(). + // With N threads writing concurrently, aggregate rate should exceed drain rate + // and overflow the 4096-capacity queue, creating gaps. 
+ final int writerThreads = 4; + final int rowsPerThread = 1500; // 4 * 1500 = 6000 total write() calls > 4096 + final int totalRows = writerThreads * rowsPerThread; + final AtomicInteger errorCount = new AtomicInteger(0); + final CountDownLatch startLatch = new CountDownLatch(1); + final CountDownLatch doneLatch = new CountDownLatch(writerThreads); + + System.out.println( + " Burst writing " + + totalRows + + " rows via " + + writerThreads + + " concurrent threads (" + + rowsPerThread + + " individual SQL INSERTs each)"); + System.out.println( + " (Each INSERT = 1 WAL entry = 1 pendingEntries.offer(); " + "queue capacity = 4096)"); + + ExecutorService executor = Executors.newFixedThreadPool(writerThreads); + for (int t = 0; t < writerThreads; t++) { + final int threadId = t; + final int startTs = threadId * rowsPerThread + 1; + executor.submit( + () -> { + try { + startLatch.await(); // all threads start at the same time + try (ITableSession session = openTableSession()) { + session.executeNonQueryStatement("USE " + database); + for (int i = 0; i < rowsPerThread; i++) { + int ts = startTs + i; + session.executeNonQueryStatement( + String.format( + "INSERT INTO t1 (tag1, s1, time) VALUES ('d1', %d, %d)", + (long) ts * 10, ts)); + } + } + } catch (Exception e) { + System.out.println(" Writer thread " + threadId + " error: " + e.getMessage()); + errorCount.incrementAndGet(); + } finally { + doneLatch.countDown(); + } + }); + } + + // Fire all threads simultaneously + startLatch.countDown(); + doneLatch.await(); + executor.shutdown(); + + if (errorCount.get() > 0) { + System.out.println(" WARNING: " + errorCount.get() + " writer threads encountered errors"); + } + + // Do NOT add artificial delay — let the consumer compete with ongoing WAL writes + System.out.println( + " Polling (expecting " + totalRows + " rows, may need WAL gap recovery)..."); + System.out.println( + " (Check server logs for 'gap detected' to confirm gap recovery was triggered)"); + PollResult 
result = pollUntilComplete(consumer, totalRows, 6000, 2000, true); + System.out.println(" Result: " + result); + + assertEquals( + "Expected exactly " + totalRows + " rows (no data loss despite pending queue overflow)", + totalRows, + result.totalRows); + } finally { + cleanup(consumer, topicName, database); + } + } + + // ====================================================================== + // Test 8: Commit After Unsubscribe (NEW — tests H7 fix) + // ====================================================================== + /** + * Tests that commit still works correctly after the consumer has unsubscribed (queue has been + * torn down). The commit routing should use metadata-based topic config check instead of runtime + * queue state. + * + *

Fix verified: H7 — commit routes via isConsensusBasedTopic() instead of hasQueue(). + */ + private static void testCommitAfterUnsubscribe() throws Exception { + String database = nextDatabase(); + String topicName = nextTopic(); + String consumerGroupId = nextConsumerGroup(); + String consumerId = nextConsumerId(); + ISubscriptionTablePullConsumer consumer = null; + + try { + try (ITableSession session = openTableSession()) { + createDatabaseAndTable(session, database, "t1", "tag1 STRING TAG, s1 INT64 FIELD"); + session.executeNonQueryStatement("USE " + database); + session.executeNonQueryStatement("INSERT INTO t1 (tag1, s1, time) VALUES ('d1', 0, 0)"); + session.executeNonQueryStatement("flush"); + } + Thread.sleep(2000); + + createTopicTable(topicName, database, ".*"); + Thread.sleep(1000); + + consumer = createConsumer(consumerId, consumerGroupId); + consumer.subscribe(topicName); + Thread.sleep(3000); + + // Write data + System.out.println(" Writing 50 rows"); + try (ITableSession session = openTableSession()) { + session.executeNonQueryStatement("USE " + database); + for (int i = 1; i <= 50; i++) { + session.executeNonQueryStatement( + String.format("INSERT INTO t1 (tag1, s1, time) VALUES ('d1', %d, %d)", i * 10, i)); + } + } + Thread.sleep(2000); + + // Poll WITHOUT commit + System.out.println(" Polling WITHOUT commit..."); + List uncommittedMessages = new ArrayList<>(); + int polledRows = 0; + for (int attempt = 0; attempt < 60 && polledRows < 50; attempt++) { + List msgs = consumer.poll(Duration.ofMillis(2000)); + if (msgs.isEmpty()) { + if (polledRows > 0) break; + Thread.sleep(500); + continue; + } + for (SubscriptionMessage msg : msgs) { + uncommittedMessages.add(msg); + for (SubscriptionResultSet ds : getResultSets(msg)) { + while (ds.hasNext()) { + ds.next(); + polledRows++; + } + } + } + } + System.out.println( + " Polled " + + polledRows + + " rows, holding " + + uncommittedMessages.size() + + " uncommitted messages"); + assertAtLeast("Should have 
polled some rows before unsubscribe", 1, polledRows); + + // Unsubscribe (tears down the consensus queue) + System.out.println(" Unsubscribing (queue teardown)..."); + consumer.unsubscribe(topicName); + Thread.sleep(2000); + + // Now commit the previously polled messages — should NOT throw + System.out.println( + " Committing " + uncommittedMessages.size() + " messages AFTER unsubscribe..."); + boolean commitSucceeded = true; + for (SubscriptionMessage msg : uncommittedMessages) { + try { + consumer.commitSync(msg); + } catch (Exception e) { + System.out.println(" Commit threw exception: " + e.getMessage()); + commitSucceeded = false; + } + } + + System.out.println(" Commit after unsubscribe completed. Success=" + commitSucceeded); + assertTrue("Commit after unsubscribe should succeed without exception", commitSucceeded); + System.out.println(" (Key: no exception crash, routing handled gracefully)"); + } finally { + if (consumer != null) { + try { + consumer.close(); + } catch (Exception e) { + /* ignore */ + } + } + dropTopicTable(topicName); + deleteDatabase(database); + } + } + + // ====================================================================== + // Test 8: Seek (seekToBeginning, seekToEnd, seek by timestamp) + // ====================================================================== + /** + * Verifies all three seek operations in a single flow: + * + *

   * <ul>
   *   <li>seekToBeginning — re-delivers previously committed data from earliest available position
   *   <li>seekToEnd — skips all existing data, only new writes are received
   *   <li>seek(timestamp) — positions at the approximate WAL entry matching the given timestamp
   * </ul>
+ */ + private static void testSeek() throws Exception { + String database = nextDatabase(); + String topicName = nextTopic(); + String consumerGroupId = nextConsumerGroup(); + String consumerId = nextConsumerId(); + SubscriptionTablePullConsumer consumer = null; + + try { + // Step 0: Create DataRegion + try (ITableSession session = openTableSession()) { + createDatabaseAndTable(session, database, "t1", "tag1 STRING TAG, s1 INT64 FIELD"); + } + Thread.sleep(2000); + + // Step 1: Create topic + consumer + subscribe + System.out.println(" Step 1: Create topic and subscribe"); + createTopicTable(topicName, database, "t1"); + Thread.sleep(1000); + + consumer = (SubscriptionTablePullConsumer) createConsumer(consumerId, consumerGroupId); + consumer.subscribe(topicName); + Thread.sleep(3000); + + // Step 2: Write 1000 rows with timestamps 1000..1999 and poll+commit all + System.out.println(" Step 2: Write 1000 rows (timestamps 1000..1999) and poll+commit"); + try (ITableSession session = openTableSession()) { + session.executeNonQueryStatement("USE " + database); + for (int i = 0; i < 1000; i++) { + long ts = 1000 + i; + session.executeNonQueryStatement( + String.format("INSERT INTO t1 (tag1, s1, time) VALUES ('d1', %d, %d)", ts * 10, ts)); + } + } + Thread.sleep(2000); + + PollResult firstPoll = pollUntilComplete(consumer, 1000, 120); + System.out.println(" First poll: " + firstPoll.totalRows + " rows"); + assertAtLeast("First poll should get rows", 1, firstPoll.totalRows); + + // ------------------------------------------------------------------ + // Step 3: seekToBeginning — should re-deliver data from the start + // ------------------------------------------------------------------ + System.out.println(" Step 3: seekToBeginning → expect re-delivery"); + consumer.seekToBeginning(topicName); + Thread.sleep(2000); + + // No initial INSERT in table test (Step 0 only creates DB+table), so expectedRows=1000 + PollResult beginningPoll = pollUntilComplete(consumer, 1000, 
120); + System.out.println(" After seekToBeginning: " + beginningPoll); + assertAtLeast( + "seekToBeginning should re-deliver rows (WAL retention permitting)", + 1, + beginningPoll.totalRows); + + // ------------------------------------------------------------------ + // Step 4: seekToEnd — should receive nothing until new writes + // ------------------------------------------------------------------ + System.out.println(" Step 4: seekToEnd → expect no old data"); + consumer.seekToEnd(topicName); + Thread.sleep(2000); + + PollResult endPoll = new PollResult(); + int consecutiveEmpty = 0; + for (int attempt = 0; attempt < 15; attempt++) { + List msgs = consumer.poll(Duration.ofMillis(1000)); + if (msgs.isEmpty()) { + consecutiveEmpty++; + if (consecutiveEmpty >= 5) break; + Thread.sleep(500); + continue; + } + consecutiveEmpty = 0; + for (SubscriptionMessage msg : msgs) { + for (SubscriptionResultSet ds : getResultSets(msg)) { + while (ds.hasNext()) { + ds.next(); + endPoll.totalRows++; + } + } + consumer.commitSync(msg); + } + } + System.out.println(" After seekToEnd (no new writes): " + endPoll.totalRows + " rows"); + // May occasionally be 1 due to prefetch thread race; tolerate small values + assertTrue("seekToEnd should yield at most 1 row (race tolerance)", endPoll.totalRows <= 1); + + // Write 200 new rows — they should be received + System.out.println(" Writing 200 new rows after seekToEnd"); + try (ITableSession session = openTableSession()) { + session.executeNonQueryStatement("USE " + database); + for (int i = 2000; i < 2200; i++) { + session.executeNonQueryStatement( + String.format("INSERT INTO t1 (tag1, s1, time) VALUES ('d1', %d, %d)", i * 10, i)); + } + } + Thread.sleep(2000); + + PollResult afterEndPoll = pollUntilComplete(consumer, 200, 120); + System.out.println(" After seekToEnd + new writes: " + afterEndPoll); + assertEquals( + "Should receive exactly 200 new rows after seekToEnd", 200, afterEndPoll.totalRows); + + // 
------------------------------------------------------------------ + // Step 5: seek(timestamp) — seek to timestamp 1500 + // ------------------------------------------------------------------ + System.out.println(" Step 5: seek(1500) → expect rows from near ts=1500"); + consumer.seekToBeginning(topicName); + Thread.sleep(2000); + + PollResult midpointPoll = new PollResult(); + TopicProgress midpointProgress = null; + consecutiveEmpty = 0; + for (int attempt = 0; attempt < 20 && midpointProgress == null; attempt++) { + List msgs = consumer.poll(Duration.ofMillis(1000)); + if (msgs.isEmpty()) { + consecutiveEmpty++; + if (consecutiveEmpty >= 5) break; + Thread.sleep(500); + continue; + } + consecutiveEmpty = 0; + for (SubscriptionMessage msg : msgs) { + for (SubscriptionResultSet ds : getResultSets(msg)) { + while (ds.hasNext()) { + ds.next(); + midpointPoll.totalRows++; + } + } + consumer.commitSync(msg); + if (midpointPoll.totalRows >= 500) { + midpointProgress = consumer.committedPositions(topicName); + break; + } + } + } + assertTrue("Should capture a midpoint TopicProgress", midpointProgress != null); + + consumer.seek(topicName, midpointProgress); + Thread.sleep(2000); + + // Sparse mapping (interval=100) positions near ts=1500. 
+ // Expect: ~500 rows from ts≥1500 in original data (1500..1999) + // + 200 rows from new writes (2000..2199) = ~700 minimum + PollResult afterSeek = pollUntilComplete(consumer, 1200, 120); + final int minimumTailRows = Math.max(1, 1200 - midpointPoll.totalRows); + System.out.println( + " After seek(topicProgress): " + + afterSeek.totalRows + + " rows from midpoint progress " + + midpointPoll.totalRows); + assertAtLeast( + "seek(topicProgress) should deliver the remaining tail rows", + minimumTailRows, + afterSeek.totalRows); + + // ------------------------------------------------------------------ + // Step 6: seek(future timestamp) — expect 0 rows + // ------------------------------------------------------------------ + System.out.println(" Step 6: seek(99999) → expect no data"); + TopicProgress tailProgress = consumer.committedPositions(topicName); + assertTrue("Tail TopicProgress should be available after replay", tailProgress != null); + consumer.seekAfter(topicName, tailProgress); + Thread.sleep(2000); + + PollResult futurePoll = new PollResult(); + consecutiveEmpty = 0; + for (int attempt = 0; attempt < 10; attempt++) { + List msgs = consumer.poll(Duration.ofMillis(1000)); + if (msgs.isEmpty()) { + consecutiveEmpty++; + if (consecutiveEmpty >= 5) break; + Thread.sleep(500); + continue; + } + consecutiveEmpty = 0; + for (SubscriptionMessage msg : msgs) { + for (SubscriptionResultSet ds : getResultSets(msg)) { + while (ds.hasNext()) { + ds.next(); + futurePoll.totalRows++; + } + } + consumer.commitSync(msg); + } + } + System.out.println( + " After seekAfter(tail topicProgress): " + futurePoll.totalRows + " rows"); + // seek(99999) should behave like seekToEnd — 0 rows normally, + // but may yield up to 1 row due to prefetch thread race (same as seekToEnd) + assertTrue( + "seekAfter(tail topicProgress) should yield at most 1 row (race tolerance)", + futurePoll.totalRows <= 1); + + // ------------------------------------------------------------------ + // Step 
7: seek(topicProgress) — seek by per-region writer progress + // ------------------------------------------------------------------ + System.out.println( + " Step 7: seekToBeginning first, then poll to collect per-region positions"); + consumer.seekToBeginning(topicName); + Thread.sleep(2000); + + List positionSnapshots = new ArrayList<>(); + List rowsPerMsg = new ArrayList<>(); + int totalRowsCollected = 0; + consecutiveEmpty = 0; + + for (int attempt = 0; attempt < 60; attempt++) { + List msgs = consumer.poll(Duration.ofMillis(2000)); + if (msgs.isEmpty()) { + consecutiveEmpty++; + if (consecutiveEmpty >= 5 && totalRowsCollected > 0) break; + Thread.sleep(500); + continue; + } + consecutiveEmpty = 0; + for (SubscriptionMessage msg : msgs) { + int msgRows = 0; + for (SubscriptionResultSet ds : getResultSets(msg)) { + while (ds.hasNext()) { + ds.next(); + msgRows++; + } + } + consumer.commitSync(msg); + rowsPerMsg.add(msgRows); + totalRowsCollected += msgRows; + positionSnapshots.add(consumer.committedPositions(topicName)); + } + } + System.out.println( + " Collected " + + totalRowsCollected + + " rows in " + + positionSnapshots.size() + + " messages"); + + if (positionSnapshots.size() >= 2) { + int midIdx = positionSnapshots.size() / 2; + TopicProgress seekPositions = positionSnapshots.get(midIdx); + int writerFrontierCount = countWriterFrontiers(seekPositions); + assertTrue( + "committed TopicProgress should contain at least one writer frontier", + writerFrontierCount > 0); + System.out.println( + " seekAfter(topicProgress.regionCount=" + + seekPositions.getRegionProgress().size() + + ", writerFrontierCount=" + + writerFrontierCount + + ") [msg " + + midIdx + + "/" + + positionSnapshots.size() + + "]"); + + int expectedFromMid = 0; + for (int i = midIdx; i < rowsPerMsg.size(); i++) { + expectedFromMid += rowsPerMsg.get(i); + } + + consumer.seekAfter(topicName, seekPositions); + Thread.sleep(2000); + + PollResult afterSeekEpoch = pollUntilComplete(consumer, 
expectedFromMid, 60); + System.out.println( + " After seekAfter(topicProgress): " + + afterSeekEpoch.totalRows + + " rows (expected ~" + + expectedFromMid + + ")"); + assertAtLeast( + "seekAfter(topicProgress) should deliver at least half the tail data", + expectedFromMid / 2, + afterSeekEpoch.totalRows); + } else { + System.out.println( + " SKIP seekAfter(topicProgress) sub-test: only " + + positionSnapshots.size() + + " messages"); + } + + System.out.println(" testSeek passed all sub-tests!"); + } finally { + cleanup(consumer, topicName, database); + } + } + + // ====================================================================== + // Test 9: Processor Framework (ColumnAlignProcessor + WatermarkProcessor + PollResult) + // ====================================================================== + /** + * Verifies: + * + *
 <ul> + *
   <li>ColumnAlignProcessor forward-fills null columns per table + *
   <li>pollWithInfo() returns PollResult with correct metadata + *
   <li>WatermarkProcessor buffers and emits based on watermark + *
   <li>Processor chaining works correctly + *
   <li>Idempotent double-commit does not throw + * </ul>
+ */ + private static void testProcessorFramework() throws Exception { + String database = nextDatabase(); + String topicName = nextTopic(); + String consumerGroupId = nextConsumerGroup(); + String consumerId = nextConsumerId(); + String tableName = "proc_test"; + SubscriptionTablePullConsumer consumer = null; + SubscriptionTablePullConsumer consumer2 = null; + + try { + // Step 1: Create table with 3 measurement columns + System.out.println(" Step 1: Creating table with 3 measurement columns"); + try (ITableSession session = openTableSession()) { + createDatabaseAndTable( + session, + database, + tableName, + "device_id STRING TAG, s1 INT32 FIELD, s2 INT32 FIELD, s3 INT32 FIELD"); + } + + // Step 2: Create topic and subscribe + System.out.println(" Step 2: Creating topic and subscribing"); + createTopicTable(topicName, database, tableName); + Thread.sleep(1000); + + // Build consumer with ColumnAlignProcessor — use concrete type for addProcessor access + consumer = + (SubscriptionTablePullConsumer) + new SubscriptionTablePullConsumerBuilder() + .host(HOST) + .port(PORT) + .consumerId(consumerId) + .consumerGroupId(consumerGroupId) + .autoCommit(false) + .build(); + consumer.addProcessor(new ColumnAlignProcessor()); + consumer.open(); + consumer.subscribe(topicName); + Thread.sleep(3000); + + // Step 3: Write a Tablet with 2 rows — row 2 has s2/s3 null (marked in BitMap). + // Using insertTablet ensures both rows share the same Tablet with all 3 columns, + // so ColumnAlignProcessor can forward-fill the nulls. 
+ System.out.println(" Step 3: Writing partial-column data via insertTablet"); + try (ITableSession session = openTableSession()) { + session.executeNonQueryStatement("USE " + database); + List schemas = + Arrays.asList( + new MeasurementSchema("device_id", TSDataType.STRING), + new MeasurementSchema("s1", TSDataType.INT32), + new MeasurementSchema("s2", TSDataType.INT32), + new MeasurementSchema("s3", TSDataType.INT32)); + List categories = + Arrays.asList( + ColumnCategory.TAG, + ColumnCategory.FIELD, + ColumnCategory.FIELD, + ColumnCategory.FIELD); + Tablet tablet = + new Tablet( + tableName, + IMeasurementSchema.getMeasurementNameList(schemas), + IMeasurementSchema.getDataTypeList(schemas), + categories, + 2); + + // Row 0 (time=100): all columns present + tablet.addTimestamp(0, 100); + tablet.addValue("device_id", 0, "dev1"); + tablet.addValue("s1", 0, 10); + tablet.addValue("s2", 0, 20); + tablet.addValue("s3", 0, 30); + + // Row 1 (time=200): only s1 — s2/s3 remain null (BitMap marked by addTimestamp) + tablet.addTimestamp(1, 200); + tablet.addValue("device_id", 1, "dev1"); + tablet.addValue("s1", 1, 11); + + session.insert(tablet); + session.executeNonQueryStatement("FLUSH"); + } + Thread.sleep(2000); + + // Step 4: Poll with pollWithInfo and verify ColumnAlign + PollResult + System.out.println(" Step 4: Polling with pollWithInfo"); + int totalRows = 0; + boolean foundForwardFill = false; + org.apache.iotdb.session.subscription.payload.PollResult lastPollResult = null; + List allMessages = new ArrayList<>(); + + for (int attempt = 0; attempt < 30; attempt++) { + org.apache.iotdb.session.subscription.payload.PollResult pollResult = + consumer.pollWithInfo(Duration.ofMillis(1000)); + lastPollResult = pollResult; + + assertTrue("PollResult should not be null", pollResult != null); + // With only ColumnAlignProcessor (non-buffering), bufferedCount should be 0 + assertEquals("ColumnAlignProcessor should not buffer", 0, pollResult.getBufferedCount()); + + List 
msgs = pollResult.getMessages(); + if (msgs.isEmpty()) { + if (totalRows >= 2) break; + Thread.sleep(1000); + continue; + } + + allMessages.addAll(msgs); + for (SubscriptionMessage msg : msgs) { + for (SubscriptionResultSet ds : getResultSets(msg)) { + List columnNames = ds.getColumnNames(); + while (ds.hasNext()) { + org.apache.tsfile.read.common.RowRecord row = ds.nextRecord(); + totalRows++; + List fields = row.getFields(); + System.out.println( + " Row: time=" + + row.getTimestamp() + + ", columns=" + + columnNames + + ", fields=" + + fields); + // Check forward-fill: at timestamp 200, s2 and s3 should be filled + if (row.getTimestamp() == 200) { + // Table results include "time" in columnNames but not in fields. + int s2ColumnIdx = columnNames.indexOf("s2"); + int s3ColumnIdx = columnNames.indexOf("s3"); + int fieldOffset = + !columnNames.isEmpty() && "time".equalsIgnoreCase(columnNames.get(0)) ? 1 : 0; + int s2FieldIdx = s2ColumnIdx - fieldOffset; + int s3FieldIdx = s3ColumnIdx - fieldOffset; + if (s2FieldIdx >= 0 + && s3FieldIdx >= 0 + && s2FieldIdx < fields.size() + && s3FieldIdx < fields.size() + && fields.get(s2FieldIdx) != null + && fields.get(s2FieldIdx).getDataType() != null + && fields.get(s3FieldIdx) != null + && fields.get(s3FieldIdx).getDataType() != null) { + foundForwardFill = true; + System.out.println(" >>> Forward-fill confirmed at timestamp 200"); + } + } + } + } + } + } + + assertEquals("Expected 2 rows total", 2, totalRows); + assertTrue( + "ColumnAlignProcessor should forward-fill nulls at timestamp 200", foundForwardFill); + System.out.println(" ColumnAlignProcessor: PASSED"); + + // Step 5: Idempotent double-commit + System.out.println(" Step 5: Testing idempotent double-commit"); + if (!allMessages.isEmpty()) { + SubscriptionMessage firstMsg = allMessages.get(0); + consumer.commitSync(firstMsg); + // Second commit of same message should not throw + consumer.commitSync(firstMsg); + System.out.println(" Double-commit succeeded 
(idempotent)"); + } + + // Step 6: Test with WatermarkProcessor chained + System.out.println(" Step 6: Verifying WatermarkProcessor buffering"); + // Close current consumer and create a new one with WatermarkProcessor + consumer.unsubscribe(topicName); + consumer.close(); + + String consumerId2 = consumerId + "_wm"; + consumer2 = + (SubscriptionTablePullConsumer) + new SubscriptionTablePullConsumerBuilder() + .host(HOST) + .port(PORT) + .consumerId(consumerId2) + .consumerGroupId(consumerGroupId + "_wm") + .autoCommit(false) + .build(); + // Chain: ColumnAlign → Watermark(5s out-of-order, 10s timeout) + consumer2.addProcessor(new ColumnAlignProcessor()); + consumer2.addProcessor(new WatermarkProcessor(5000, 10000)); + consumer2.open(); + consumer2.subscribe(topicName); + Thread.sleep(3000); + + // Write data that should be buffered by watermark + try (ITableSession session = openTableSession()) { + session.executeNonQueryStatement("USE " + database); + session.executeNonQueryStatement( + String.format( + "INSERT INTO %s(time, device_id, s1, s2, s3) VALUES (1000, 'dev1', 100, 200, 300)", + tableName)); + session.executeNonQueryStatement("FLUSH"); + } + Thread.sleep(2000); + + // First poll — data may be buffered by WatermarkProcessor + org.apache.iotdb.session.subscription.payload.PollResult wmResult = + consumer2.pollWithInfo(Duration.ofMillis(2000)); + System.out.println( + " WatermarkProcessor poll: messages=" + + wmResult.getMessages().size() + + ", buffered=" + + wmResult.getBufferedCount()); + // The watermark processor may buffer or emit depending on timing; + // we just verify the API works and returns valid metadata + assertTrue("PollResult bufferedCount should be >= 0", wmResult.getBufferedCount() >= 0); + + // consumer already closed above in Step 6 setup + consumer = null; + + System.out.println(" testProcessorFramework passed all sub-tests!"); + } finally { + cleanup(consumer, topicName, database); + cleanup(consumer2, topicName, database); + } + } + + 
// ====================================================================== + // Test 10: Poison Message Drop — messages nacked beyond threshold + // are force-acked (dropped) and don't block new data. + // ====================================================================== + /** + * Verifies: + * + *
 <ul> + *
   <li>A message nacked more than POISON_MESSAGE_NACK_THRESHOLD (10) times is dropped + *
   <li>After drop, new data can still be received + *
   <li>The consumer is not permanently blocked by a single unprocessable message + * </ul>
+ */ + private static void testPoisonMessageDrop() throws Exception { + String database = nextDatabase(); + String topicName = nextTopic(); + String consumerGroupId = nextConsumerGroup(); + String consumerId = nextConsumerId(); + SubscriptionTablePullConsumer consumer = null; + + try { + // Step 0: Create DataRegion + try (ITableSession session = openTableSession()) { + createDatabaseAndTable(session, database, "t1", "tag1 STRING TAG, s1 INT64 FIELD"); + } + Thread.sleep(2000); + + // Step 1: Create topic and subscribe + System.out.println(" Step 1: Creating topic and subscribing"); + createTopicTable(topicName, database, "t1"); + Thread.sleep(1000); + + consumer = (SubscriptionTablePullConsumer) createConsumer(consumerId, consumerGroupId); + consumer.subscribe(topicName); + Thread.sleep(3000); + + // Step 2: Write initial data that will become the "poison" message + System.out.println(" Step 2: Writing 10 rows (the initial batch)"); + try (ITableSession session = openTableSession()) { + session.executeNonQueryStatement("USE " + database); + for (int i = 1; i <= 10; i++) { + session.executeNonQueryStatement( + String.format("INSERT INTO t1 (tag1, s1, time) VALUES ('d1', %d, %d)", i * 10, i)); + } + } + Thread.sleep(2000); + + // Step 3: Poll without commit — repeatedly. Each poll-then-timeout cycle + // causes the server to nack the in-flight event and re-enqueue it. 
+ System.out.println( + " Step 3: Polling without commit for 15 rounds (threshold=10, need >10 nacks)"); + int totalPoisonPolled = 0; + for (int round = 1; round <= 15; round++) { + List msgs = consumer.poll(Duration.ofMillis(3000)); + int roundRows = 0; + for (SubscriptionMessage msg : msgs) { + for (SubscriptionResultSet ds : getResultSets(msg)) { + while (ds.hasNext()) { + ds.next(); + roundRows++; + totalPoisonPolled++; + } + } + // Deliberately NOT committing — this is the "nack" behavior + } + System.out.println( + " Round " + round + ": received " + roundRows + " rows (NOT committing)"); + if (msgs.isEmpty() && round > 11) { + System.out.println(" No messages — poison message may have been force-acked"); + break; + } + Thread.sleep(1000); + } + System.out.println(" Total rows polled across all rounds: " + totalPoisonPolled); + + // Step 4: Write NEW data and verify it can be received (consumer not blocked) + System.out.println(" Step 4: Writing 50 NEW rows and polling WITH commit"); + try (ITableSession session = openTableSession()) { + session.executeNonQueryStatement("USE " + database); + for (int i = 1000; i < 1050; i++) { + session.executeNonQueryStatement( + String.format("INSERT INTO t1 (tag1, s1, time) VALUES ('d1', %d, %d)", i * 10, i)); + } + } + Thread.sleep(2000); + + PollResult newResult = pollUntilComplete(consumer, 50, 60); + System.out.println(" New data poll result: " + newResult); + + assertAtLeast( + "Consumer must not be permanently blocked by poison message — new data should arrive", + 1, + newResult.totalRows); + System.out.println( + " testPoisonMessageDrop passed: consumer received " + + newResult.totalRows + + " new rows after poison message handling"); + } finally { + cleanup(consumer, topicName, database); + } + } + + // ====================================================================== + // Test 11: Serialization V2 Fields — regionId, epoch, dataNodeId + // are properly populated in polled messages' SubscriptionCommitContext. 
+ // ====================================================================== + /** + * Verifies: + * + *
 <ul> + *
   <li>SubscriptionCommitContext.getWriterId() is non-null for consensus messages + *
   <li>SubscriptionCommitContext.getWriterProgress() is non-null for consensus messages + *
   <li>SubscriptionCommitContext.getWriterId().getRegionId() stays aligned with the region + *
   <li>These writer-progress fields survive the serialize/deserialize round-trip through RPC + * </ul>
+ */ + private static void testWriterProgressFields() throws Exception { + String database = nextDatabase(); + String topicName = nextTopic(); + String consumerGroupId = nextConsumerGroup(); + String consumerId = nextConsumerId(); + SubscriptionTablePullConsumer consumer = null; + + try { + // Step 0: Create DataRegion + try (ITableSession session = openTableSession()) { + createDatabaseAndTable(session, database, "t1", "tag1 STRING TAG, s1 INT64 FIELD"); + } + Thread.sleep(2000); + + // Step 1: Create topic and subscribe + System.out.println(" Step 1: Creating topic and subscribing"); + createTopicTable(topicName, database, "t1"); + Thread.sleep(1000); + + consumer = (SubscriptionTablePullConsumer) createConsumer(consumerId, consumerGroupId); + consumer.subscribe(topicName); + Thread.sleep(3000); + + // Step 2: Write data + System.out.println(" Step 2: Writing 20 rows"); + try (ITableSession session = openTableSession()) { + session.executeNonQueryStatement("USE " + database); + for (int i = 1; i <= 20; i++) { + session.executeNonQueryStatement( + String.format("INSERT INTO t1 (tag1, s1, time) VALUES ('d1', %d, %d)", i * 10, i)); + } + } + Thread.sleep(2000); + + // Step 3: Poll and check writer-progress fields in SubscriptionCommitContext + System.out.println(" Step 3: Polling and verifying writer-progress fields in CommitContext"); + int totalRows = 0; + int messagesChecked = 0; + boolean foundWriterProgress = false; + + for (int attempt = 0; attempt < 30; attempt++) { + List msgs = consumer.poll(Duration.ofMillis(2000)); + if (msgs.isEmpty()) { + if (totalRows > 0) break; + Thread.sleep(1000); + continue; + } + + for (SubscriptionMessage msg : msgs) { + SubscriptionCommitContext ctx = msg.getCommitContext(); + messagesChecked++; + + // Check writer-progress fields and their compatibility projections + String regionId = ctx.getRegionId(); + int dataNodeId = ctx.getDataNodeId(); + WriterId writerId = ctx.getWriterId(); + WriterProgress writerProgress = 
ctx.getWriterProgress(); + long physicalTime = + writerProgress != null ? writerProgress.getPhysicalTime() : Long.MIN_VALUE; + + System.out.println( + " Message " + + messagesChecked + + ": regionId=" + + regionId + + ", physicalTime=" + + physicalTime + + ", writerId=" + + writerId + + ", writerProgress=" + + writerProgress + + ", dataNodeId=" + + dataNodeId + + ", topicName=" + + ctx.getTopicName() + + ", consumerGroupId=" + + ctx.getConsumerGroupId()); + + assertTrue( + "regionId should be non-null for consensus message", + regionId != null && !regionId.isEmpty()); + assertTrue("writerId should be non-null for consensus message", writerId != null); + assertTrue( + "writerProgress should be non-null for consensus message", writerProgress != null); + assertEquals("regionId should match writerId.regionId", writerId.getRegionId(), regionId); + assertEquals( + "physicalTime should mirror writerProgress.physicalTime", + writerProgress.getPhysicalTime(), + physicalTime); + foundWriterProgress = true; + + assertTrue("physicalTime should be >= 0, got " + physicalTime, physicalTime >= 0); + + assertTrue("dataNodeId should be > 0, got " + dataNodeId, dataNodeId > 0); + + for (SubscriptionResultSet ds : getResultSets(msg)) { + while (ds.hasNext()) { + ds.next(); + totalRows++; + } + } + consumer.commitSync(msg); + } + } + + System.out.println( + " Checked " + + messagesChecked + + " messages, " + + totalRows + + " rows. 
foundWriterProgress=" + + foundWriterProgress); + assertAtLeast("Should have received data rows", 1, totalRows); + assertTrue( + "Should have found writer-progress metadata in at least one message", + foundWriterProgress); + System.out.println(" testWriterProgressFields passed!"); + } finally { + cleanup(consumer, topicName, database); + } + } + + private static List getResultSets(final SubscriptionMessage message) { + return message.getResultSets().stream() + .map(resultSet -> (SubscriptionResultSet) resultSet) + .collect(Collectors.toList()); + } +} diff --git a/example/subscription/src/main/java/org/apache/iotdb/ConsensusSubscriptionTest.java b/example/subscription/src/main/java/org/apache/iotdb/ConsensusSubscriptionTest.java new file mode 100644 index 0000000000000..d2c4044ca908a --- /dev/null +++ b/example/subscription/src/main/java/org/apache/iotdb/ConsensusSubscriptionTest.java @@ -0,0 +1,2018 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.iotdb; + +import org.apache.iotdb.isession.ISession; +import org.apache.iotdb.rpc.subscription.config.TopicConstant; +import org.apache.iotdb.rpc.subscription.payload.poll.RegionProgress; +import org.apache.iotdb.rpc.subscription.payload.poll.TopicProgress; +import org.apache.iotdb.session.Session; +import org.apache.iotdb.session.subscription.SubscriptionTreeSession; +import org.apache.iotdb.session.subscription.consumer.tree.SubscriptionTreePullConsumer; +import org.apache.iotdb.session.subscription.payload.SubscriptionMessage; +import org.apache.iotdb.session.subscription.payload.SubscriptionRecordHandler.SubscriptionResultSet; + +import org.apache.tsfile.common.conf.TSFileConfig; +import org.apache.tsfile.enums.TSDataType; +import org.apache.tsfile.utils.Binary; +import org.apache.tsfile.write.record.Tablet; +import org.apache.tsfile.write.schema.IMeasurementSchema; +import org.apache.tsfile.write.schema.MeasurementSchema; + +import java.time.Duration; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Properties; +import java.util.Set; +import java.util.concurrent.CountDownLatch; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; +import java.util.concurrent.atomic.AtomicInteger; +import java.util.stream.Collectors; + +/** TODO: move these manual tests into ITs */ +public class ConsensusSubscriptionTest { + + private static final String HOST = "127.0.0.1"; + private static final int PORT = 6667; + private static final String USER = "root"; + private static final String PASSWORD = "root"; + + private static int testCounter = 0; + private static int passed = 0; + private static int failed = 0; + private static final List failedTests = new ArrayList<>(); + + public static void main(String[] args) throws Exception { + System.out.println("=== Consensus-Based Subscription Test Suite 
===\n"); + + String targetTest = args.length > 0 ? args[0] : null; + + if (targetTest == null || "testBasicFlow".equals(targetTest)) { + runTest("testBasicFlow", ConsensusSubscriptionTest::testBasicFlow); + } + if (targetTest == null || "testDataTypes".equals(targetTest)) { + runTest("testDataTypes", ConsensusSubscriptionTest::testDataTypes); + } + if (targetTest == null || "testFilteringAndTopicSelection".equals(targetTest)) { + runTest( + "testFilteringAndTopicSelection", + ConsensusSubscriptionTest::testFilteringAndTopicSelection); + } + if (targetTest == null || "testSubscribeBeforeRegion".equals(targetTest)) { + runTest("testSubscribeBeforeRegion", ConsensusSubscriptionTest::testSubscribeBeforeRegion); + } + if (targetTest == null || "testMultiEntityIsolation".equals(targetTest)) { + runTest("testMultiEntityIsolation", ConsensusSubscriptionTest::testMultiEntityIsolation); + } + if (targetTest == null || "testWalCatchUpAndGapRecovery".equals(targetTest)) { + runTest( + "testWalCatchUpAndGapRecovery", ConsensusSubscriptionTest::testWalCatchUpAndGapRecovery); + } + if (targetTest == null || "testSeekAndPositionSemantics".equals(targetTest)) { + runTest( + "testSeekAndPositionSemantics", ConsensusSubscriptionTest::testSeekAndPositionSemantics); + } + if (targetTest == null || "testConsumerRestartRecovery".equals(targetTest)) { + runTest( + "testConsumerRestartRecovery", ConsensusSubscriptionTest::testConsumerRestartRecovery); + } + if (targetTest == null || "testAckNackAndPoisonSemantics".equals(targetTest)) { + runTest( + "testAckNackAndPoisonSemantics", + ConsensusSubscriptionTest::testAckNackAndPoisonSemantics); + } + + // Summary + System.out.println("\n=== Test Suite Summary ==="); + System.out.println("Passed: " + passed); + System.out.println("Failed: " + failed); + if (!failedTests.isEmpty()) { + System.out.println("Failed tests: " + failedTests); + } + System.out.println("=== Done ==="); + } + + // ============================ + // Test Infrastructure + 
// ============================ + + @FunctionalInterface + interface TestMethod { + void run() throws Exception; + } + + private static void runTest(String name, TestMethod test) { + System.out.println("\n" + "================================================================="); + System.out.println("Running: " + name); + System.out.println("================================================================="); + try { + test.run(); + passed++; + System.out.println(">>> PASSED: " + name); + } catch (AssertionError e) { + failed++; + failedTests.add(name); + System.out.println(">>> FAILED: " + name + " - " + e.getMessage()); + e.printStackTrace(System.out); + } catch (Exception e) { + failed++; + failedTests.add(name); + System.out.println(">>> ERROR: " + name + " - " + e.getMessage()); + e.printStackTrace(System.out); + } + } + + private static String nextDatabase() { + testCounter++; + return "root.csub_test_" + testCounter; + } + + private static String nextTopic() { + return "topic_csub_" + testCounter; + } + + private static String nextConsumerGroup() { + return "cg_csub_" + testCounter; + } + + private static String nextConsumerId() { + return "consumer_csub_" + testCounter; + } + + private static ISession openSession() throws Exception { + ISession session = + new Session.Builder().host(HOST).port(PORT).username(USER).password(PASSWORD).build(); + session.open(); + return session; + } + + private static void createDatabase(ISession session, String database) throws Exception { + try { + session.executeNonQueryStatement("CREATE DATABASE " + database); + } catch (Exception e) { + // ignore if already exists + } + } + + private static void deleteDatabase(String database) { + try (ISession session = openSession()) { + session.executeNonQueryStatement("DELETE DATABASE " + database); + } catch (Exception e) { + // ignore + } + } + + private static void dropTopic(String topicName) { + try (SubscriptionTreeSession subSession = new SubscriptionTreeSession(HOST, PORT)) { + 
subSession.open(); + subSession.dropTopic(topicName); + } catch (Exception e) { + // ignore + } + } + + private static void createTopic(String topicName, String path) throws Exception { + try (SubscriptionTreeSession subSession = new SubscriptionTreeSession(HOST, PORT)) { + subSession.open(); + try { + subSession.dropTopic(topicName); + } catch (Exception e) { + // ignore + } + + Properties topicConfig = new Properties(); + topicConfig.put(TopicConstant.MODE_KEY, TopicConstant.MODE_LIVE_VALUE); + topicConfig.put(TopicConstant.FORMAT_KEY, TopicConstant.FORMAT_RECORD_HANDLER_VALUE); + topicConfig.put(TopicConstant.PATH_KEY, path); + topicConfig.put(TopicConstant.ORDER_MODE_KEY, TopicConstant.ORDER_MODE_PER_WRITER_VALUE); + subSession.createTopic(topicName, topicConfig); + System.out.println(" Created topic: " + topicName + " (path=" + path + ")"); + } + } + + private static SubscriptionTreePullConsumer createConsumer( + String consumerId, String consumerGroupId) throws Exception { + SubscriptionTreePullConsumer consumer = + new SubscriptionTreePullConsumer.Builder() + .host(HOST) + .port(PORT) + .consumerId(consumerId) + .consumerGroupId(consumerGroupId) + .autoCommit(false) + .buildPullConsumer(); + consumer.open(); + return consumer; + } + + // ============================ + // Polling & Verification + // ============================ + + /** + * Poll and commit messages. After reaching expectedRows, continues polling for 5 consecutive + * empty rounds to verify no extra data arrives. 
+ */ + private static PollResult pollUntilComplete( + SubscriptionTreePullConsumer consumer, int expectedRows, int maxPollAttempts) { + return pollUntilComplete(consumer, expectedRows, maxPollAttempts, 1000, true); + } + + private static PollResult pollUntilComplete( + SubscriptionTreePullConsumer consumer, + int expectedRows, + int maxPollAttempts, + long pollTimeoutMs, + boolean commitMessages) { + PollResult result = new PollResult(); + int consecutiveEmpty = 0; + + for (int attempt = 1; attempt <= maxPollAttempts; attempt++) { + List messages = consumer.poll(Duration.ofMillis(pollTimeoutMs)); + + if (messages.isEmpty()) { + consecutiveEmpty++; + // Normal completion: reached expected rows and verified quiescence + if (consecutiveEmpty >= 3 && result.totalRows >= expectedRows) { + System.out.println( + " Verified: " + + consecutiveEmpty + + " consecutive empty polls after " + + result.totalRows + + " rows (expected " + + expectedRows + + ")"); + break; + } + // Stuck: have data but cannot reach expected count + if (consecutiveEmpty >= 5 && result.totalRows > 0) { + System.out.println( + " Stuck: " + + consecutiveEmpty + + " consecutive empty polls at " + + result.totalRows + + " rows (expected " + + expectedRows + + ")"); + break; + } + // Never received anything + if (consecutiveEmpty >= 10 && result.totalRows == 0 && expectedRows > 0) { + System.out.println(" No data received after " + consecutiveEmpty + " polls"); + break; + } + try { + Thread.sleep(1000); + } catch (InterruptedException ignored) { + } + continue; + } + + consecutiveEmpty = 0; + + for (SubscriptionMessage message : messages) { + for (SubscriptionResultSet dataSet : getResultSets(message)) { + String device = null; + List columnNames = dataSet.getColumnNames(); + if (columnNames.size() > 1) { + String fullPath = columnNames.get(1); + int lastDot = fullPath.lastIndexOf('.'); + device = lastDot > 0 ? 
fullPath.substring(0, lastDot) : fullPath; + } + + while (dataSet.hasNext()) { + try { + org.apache.tsfile.read.common.RowRecord record = dataSet.nextRecord(); + result.totalRows++; + if (device != null) { + result.rowsPerDevice.merge(device, 1, Integer::sum); + } + for (int i = 1; i < columnNames.size(); i++) { + result.seenColumns.add(columnNames.get(i)); + } + if (result.totalRows <= 5) { + System.out.println( + " Row: time=" + + record.getTimestamp() + + ", values=" + + record.getFields() + + ", device=" + + device); + } + } catch (java.io.IOException e) { + throw new RuntimeException("Failed to iterate subscription result set", e); + } + } + } + if (commitMessages) { + consumer.commitSync(message); + } + } + + System.out.println( + " Poll attempt " + + attempt + + ": totalRows=" + + result.totalRows + + " / expected=" + + expectedRows); + + // Stop immediately if we exceeded the expected row count + if (expectedRows > 0 && result.totalRows > expectedRows) { + System.out.println( + " EXCEEDED: totalRows=" + result.totalRows + " > expectedRows=" + expectedRows); + break; + } + } + + return result; + } + + // ============================ + // Cleanup + // ============================ + + /** Clean up all test artifacts: unsubscribe, close consumer, drop topic, delete database. 
*/ + private static void cleanup( + SubscriptionTreePullConsumer consumer, String topicName, String database) { + if (consumer != null) { + try { + consumer.unsubscribe(topicName); + } catch (Exception e) { + // ignore + } + try { + consumer.close(); + } catch (Exception e) { + // ignore + } + } + dropTopic(topicName); + deleteDatabase(database); + } + + // ============================ + // Result & Assertions + // ============================ + + static class PollResult { + int totalRows = 0; + Map rowsPerDevice = new HashMap<>(); + Set seenColumns = new HashSet<>(); + + @Override + public String toString() { + return "PollResult{totalRows=" + + totalRows + + ", rowsPerDevice=" + + rowsPerDevice + + ", seenColumns=" + + seenColumns + + "}"; + } + } + + private static void assertEquals(String msg, int expected, int actual) { + if (expected != actual) { + throw new AssertionError(msg + ": expected=" + expected + ", actual=" + actual); + } + } + + private static void assertEquals(String msg, long expected, long actual) { + if (expected != actual) { + throw new AssertionError(msg + ": expected=" + expected + ", actual=" + actual); + } + } + + private static void assertEquals(String msg, String expected, String actual) { + if (expected == null ? 
actual != null : !expected.equals(actual)) { + throw new AssertionError(msg + ": expected=" + expected + ", actual=" + actual); + } + } + + private static void assertTrue(String msg, boolean condition) { + if (!condition) { + throw new AssertionError(msg); + } + } + + private static void assertAtLeast(String msg, int min, int actual) { + if (actual < min) { + throw new AssertionError(msg + ": expected at least " + min + ", actual=" + actual); + } + } + + private static void assertAtMost(String msg, int max, int actual) { + if (actual > max) { + throw new AssertionError(msg + ": expected at most " + max + ", actual=" + actual); + } + } + + private static int countWriterFrontiers(TopicProgress topicProgress) { + int writerCount = 0; + if (topicProgress == null || topicProgress.getRegionProgress() == null) { + return 0; + } + for (Map.Entry entry : topicProgress.getRegionProgress().entrySet()) { + if (entry.getValue() != null && entry.getValue().getWriterPositions() != null) { + writerCount += entry.getValue().getWriterPositions().size(); + } + } + return writerCount; + } + + private static int countRows(SubscriptionMessage message) { + int rows = 0; + for (SubscriptionResultSet dataSet : getResultSets(message)) { + while (dataSet.hasNext()) { + try { + dataSet.next(); + rows++; + } catch (java.io.IOException e) { + throw new RuntimeException("Failed to count rows from subscription result set", e); + } + } + } + return rows; + } + + private static final class CommittedSnapshot { + private final TopicProgress progress; + private final int rowsInMessage; + private final int cumulativeRows; + + private CommittedSnapshot(TopicProgress progress, int rowsInMessage, int cumulativeRows) { + this.progress = progress; + this.rowsInMessage = rowsInMessage; + this.cumulativeRows = cumulativeRows; + } + } + + private static final class PolledMessageBatch { + private final List messages; + private final int totalRows; + + private PolledMessageBatch(List messages, int totalRows) { + 
this.messages = messages; + this.totalRows = totalRows; + } + } + + private static void pause(long millis) { + try { + Thread.sleep(millis); + } catch (InterruptedException e) { + Thread.currentThread().interrupt(); + throw new RuntimeException("Interrupted while waiting for subscription test state", e); + } + } + + private static void bootstrapSeekTopic(String database, String topicName) throws Exception { + try (ISession session = openSession()) { + createDatabase(session, database); + session.executeNonQueryStatement( + String.format("INSERT INTO %s.d1(time, s1) VALUES (0, 0)", database)); + session.executeNonQueryStatement("flush"); + } + pause(2000); + + createTopic(topicName, database + ".**"); + pause(1000); + } + + private static SubscriptionTreePullConsumer createSubscribedConsumer( + String topicName, String consumerId, String consumerGroupId) throws Exception { + SubscriptionTreePullConsumer consumer = createConsumer(consumerId, consumerGroupId); + consumer.subscribe(topicName); + pause(3000); + return consumer; + } + + private static void writeSequentialRowsAndFlush( + String database, int startTimestampInclusive, int rowCount) throws Exception { + try (ISession session = openSession()) { + for (int i = 0; i < rowCount; i++) { + long ts = startTimestampInclusive + i; + session.executeNonQueryStatement( + String.format("INSERT INTO %s.d1(time, s1) VALUES (%d, %d)", database, ts, ts * 10)); + } + session.executeNonQueryStatement("flush"); + } + pause(2000); + } + + private static CommittedSnapshot pollUntilCommittedRows( + SubscriptionTreePullConsumer consumer, + String topicName, + int minimumRows, + int maxPollAttempts, + long pollTimeoutMs) { + int cumulativeRows = 0; + int consecutiveEmpty = 0; + + for (int attempt = 1; attempt <= maxPollAttempts; attempt++) { + List messages = consumer.poll(Duration.ofMillis(pollTimeoutMs)); + if (messages.isEmpty()) { + consecutiveEmpty++; + if (consecutiveEmpty >= 5 && cumulativeRows > 0) { + break; + } + 
pause(1000); + continue; + } + + consecutiveEmpty = 0; + for (SubscriptionMessage message : messages) { + int rowsInMessage = countRows(message); + consumer.commitSync(message); + cumulativeRows += rowsInMessage; + TopicProgress checkpoint = consumer.committedPositions(topicName); + System.out.println( + " Captured committed checkpoint after " + + cumulativeRows + + " rows (last message=" + + rowsInMessage + + ")"); + if (cumulativeRows >= minimumRows) { + return new CommittedSnapshot(checkpoint, rowsInMessage, cumulativeRows); + } + } + } + + throw new AssertionError( + "Unable to capture committed checkpoint after " + + minimumRows + + " rows; stopped at " + + cumulativeRows); + } + + private static List pollAndCaptureCommittedSnapshots( + SubscriptionTreePullConsumer consumer, + String topicName, + int maxPollAttempts, + long pollTimeoutMs) { + List snapshots = new ArrayList<>(); + int cumulativeRows = 0; + int consecutiveEmpty = 0; + + for (int attempt = 1; attempt <= maxPollAttempts; attempt++) { + List messages = consumer.poll(Duration.ofMillis(pollTimeoutMs)); + if (messages.isEmpty()) { + consecutiveEmpty++; + if (consecutiveEmpty >= 3 && cumulativeRows > 0) { + break; + } + if (consecutiveEmpty >= 8 && cumulativeRows == 0) { + break; + } + pause(1000); + continue; + } + + consecutiveEmpty = 0; + for (SubscriptionMessage message : messages) { + int rowsInMessage = countRows(message); + consumer.commitSync(message); + cumulativeRows += rowsInMessage; + snapshots.add( + new CommittedSnapshot( + consumer.committedPositions(topicName), rowsInMessage, cumulativeRows)); + } + } + + System.out.println( + " Drained " + + cumulativeRows + + " rows across " + + snapshots.size() + + " committed messages"); + return snapshots; + } + + private static PolledMessageBatch pollFirstNonEmptyBatchWithoutCommit( + SubscriptionTreePullConsumer consumer, int maxPollAttempts, long pollTimeoutMs) { + for (int attempt = 1; attempt <= maxPollAttempts; attempt++) { + List messages = 
consumer.poll(Duration.ofMillis(pollTimeoutMs)); + if (messages.isEmpty()) { + pause(1000); + continue; + } + + int totalRows = 0; + for (SubscriptionMessage message : messages) { + totalRows += countRows(message); + } + System.out.println( + " Polled stale batch without commit: " + + messages.size() + + " messages, " + + totalRows + + " rows"); + return new PolledMessageBatch(new ArrayList<>(messages), totalRows); + } + + return new PolledMessageBatch(new ArrayList<>(), 0); + } + + private static int totalRows(List snapshots) { + return snapshots.isEmpty() ? 0 : snapshots.get(snapshots.size() - 1).cumulativeRows; + } + + // ====================================================================== + // High-signal 10-test suite wrappers + // ====================================================================== + + private static void testFilteringAndTopicSelection() throws Exception { + testPathFiltering(); + testPollWithInfoTopicFilter(); + } + + private static void testWalCatchUpAndGapRecovery() throws Exception { + testBurstWriteGapRecovery(); + } + + private static void testSeekAndPositionSemantics() throws Exception { + testSeekNavigationSemantics(); + testSeekAfterCheckpointSemantics(); + testSeekAfterWithStaleAckFencing(); + } + + private static void testAckNackAndPoisonSemantics() throws Exception { + testCommitAfterUnsubscribe(); + testPoisonMessageDrop(); + } + + // ====================================================================== + // Test 8: Consumer Restart Recovery + // ====================================================================== + /** + * Verifies: + * + *
    + *
  • A committed per-region checkpoint captured by consumer1 can be reused after restart + *
  • A restarted consumer with the same group can seek to that checkpoint and continue + *
  • The tail after restart is replayed exactly once + *
+ */ + private static void testConsumerRestartRecovery() throws Exception { + String database = nextDatabase(); + String topicName = nextTopic(); + String consumerGroupId = nextConsumerGroup(); + String consumerId1 = nextConsumerId(); + String consumerId2 = consumerId1 + "_restart"; + SubscriptionTreePullConsumer consumer1 = null; + SubscriptionTreePullConsumer consumer2 = null; + + try { + try (ISession session = openSession()) { + createDatabase(session, database); + session.executeNonQueryStatement( + String.format("INSERT INTO %s.d1(time, s1) VALUES (0, 0)", database)); + session.executeNonQueryStatement("flush"); + } + Thread.sleep(2000); + + createTopic(topicName, database + ".**"); + Thread.sleep(1000); + + consumer1 = createConsumer(consumerId1, consumerGroupId); + consumer1.subscribe(topicName); + Thread.sleep(3000); + + final int totalRows = 257; + System.out.println(" Writing " + totalRows + " rows before restart"); + try (ISession session = openSession()) { + for (int i = 1; i <= totalRows; i++) { + session.executeNonQueryStatement( + String.format("INSERT INTO %s.d1(time, s1) VALUES (%d, %d)", database, i, i * 10)); + } + session.executeNonQueryStatement("flush"); + } + Thread.sleep(3000); + + SubscriptionMessage committedMessage = null; + int committedRows = 0; + for (int attempt = 1; attempt <= 30; attempt++) { + List messages = consumer1.poll(Duration.ofMillis(2000)); + if (messages.isEmpty()) { + Thread.sleep(1000); + continue; + } + committedMessage = messages.get(0); + committedRows = countRows(committedMessage); + consumer1.commitSync(committedMessage); + break; + } + + assertAtLeast("First consumer should commit some rows before restart", 1, committedRows); + TopicProgress checkpoint = consumer1.committedPositions(topicName); + assertTrue( + "Committed checkpoint should not be empty", + checkpoint.getRegionProgress() != null && !checkpoint.getRegionProgress().isEmpty()); + int remainingRows = totalRows - committedRows; + assertAtLeast("Restart 
scenario should leave rows after the first commit", 1, remainingRows); + System.out.println( + " Captured checkpoint after committing " + + committedRows + + " rows: " + + checkpoint + + ", remainingRows=" + + remainingRows); + + consumer1.close(); + consumer1 = null; + + consumer2 = createConsumer(consumerId2, consumerGroupId); + consumer2.subscribe(topicName); + Thread.sleep(3000); + consumer2.seekAfter(topicName, checkpoint); + Thread.sleep(1000); + + PollResult resumed = pollUntilComplete(consumer2, remainingRows, 120); + System.out.println(" Restart recovery result: " + resumed); + assertEquals( + "Restarted consumer should resume from the committed checkpoint without replay", + remainingRows, + resumed.totalRows); + } finally { + cleanup(consumer1, topicName, database); + cleanup(consumer2, topicName, database); + } + } + + // ====================================================================== + // Test 1: Basic Flow (merged: BasicDataDelivery + MultiDevices + Flush) + // ====================================================================== + /** + * Verifies: + * + *
    + *
  • Data written BEFORE subscribe is NOT received + *
  • Multiple devices (d1, d2, d3) written AFTER subscribe are all received + *
  • Flush does not cause data loss (WAL pinning keeps entries available) + *
  • Exact row count matches expectation + *
+ */ + private static void testBasicFlow() throws Exception { + String database = nextDatabase(); + String topicName = nextTopic(); + String consumerGroupId = nextConsumerGroup(); + String consumerId = nextConsumerId(); + SubscriptionTreePullConsumer consumer = null; + + try { + // Step 1: Write initial data to create DataRegion (should NOT be received) + System.out.println(" Step 1: Writing initial data (should NOT be received)"); + try (ISession session = openSession()) { + createDatabase(session, database); + for (int i = 0; i < 50; i++) { + session.executeNonQueryStatement( + String.format("INSERT INTO %s.d1(time, s1) VALUES (%d, %d)", database, i, i * 10)); + } + // Also write to d2, d3 for multi-device readiness + session.executeNonQueryStatement( + String.format("INSERT INTO %s.d2(time, s1) VALUES (0, 0)", database)); + session.executeNonQueryStatement( + String.format("INSERT INTO %s.d3(time, s1) VALUES (0, 0)", database)); + session.executeNonQueryStatement("flush"); + } + Thread.sleep(2000); + + // Step 2: Create topic and subscribe + System.out.println(" Step 2: Creating topic and subscribing"); + createTopic(topicName, database + ".**"); + Thread.sleep(1000); + + consumer = createConsumer(consumerId, consumerGroupId); + consumer.subscribe(topicName); + Thread.sleep(3000); + + // Step 3: Write to 3 devices (30 rows each = 90 total), then flush + System.out.println(" Step 3: Writing 30 rows x 3 devices AFTER subscribe, then flush"); + try (ISession session = openSession()) { + for (int i = 100; i < 130; i++) { + session.executeNonQueryStatement( + String.format("INSERT INTO %s.d1(time, s1) VALUES (%d, %d)", database, i, i * 10)); + session.executeNonQueryStatement( + String.format("INSERT INTO %s.d2(time, s1) VALUES (%d, %d)", database, i, i * 20)); + session.executeNonQueryStatement( + String.format("INSERT INTO %s.d3(time, s1) VALUES (%d, %d)", database, i, i * 30)); + } + System.out.println(" Flushing..."); + session.executeNonQueryStatement("flush"); 
+ } + Thread.sleep(2000); + + // Step 4: Poll and verify + System.out.println(" Step 4: Polling..."); + PollResult result = pollUntilComplete(consumer, 90, 100); + System.out.println(" Result: " + result); + + assertEquals("Expected exactly 90 rows (30 per device)", 90, result.totalRows); + if (!result.rowsPerDevice.isEmpty()) { + System.out.println(" Rows per device: " + result.rowsPerDevice); + for (String dev : new String[] {"d1", "d2", "d3"}) { + Integer devRows = result.rowsPerDevice.get(database + "." + dev); + assertAtLeast("Expected rows from " + dev, 1, devRows != null ? devRows : 0); + } + } + } finally { + cleanup(consumer, topicName, database); + } + } + + // ====================================================================== + // Test 2: Data Types (merged: MultipleDataTypes + Aligned + CrossPartition) + // ====================================================================== + /** + * Verifies: + * + *
    + *
  • Non-aligned: 6 data types (INT32, INT64, FLOAT, DOUBLE, BOOLEAN, TEXT) + *
  • Aligned: 6 data types, cross-partition timestamps (>1 week apart) + *
  • 6 write methods: SQL single/multi-row, insertAlignedRecord/Records/Tablet/Tablets + *
+ */ + private static void testDataTypes() throws Exception { + String database = nextDatabase(); + String topicName = nextTopic(); + String consumerGroupId = nextConsumerGroup(); + String consumerId = nextConsumerId(); + SubscriptionTreePullConsumer consumer = null; + final long GAP = 604_800_001L; // slightly over 1 week + + try { + try (ISession session = openSession()) { + createDatabase(session, database); + // Create aligned timeseries + session.executeNonQueryStatement( + String.format( + "CREATE ALIGNED TIMESERIES %s.d_aligned" + + "(s_int32 INT32, s_int64 INT64, s_float FLOAT," + + " s_double DOUBLE, s_bool BOOLEAN, s_text TEXT)", + database)); + // Init rows to force DataRegion creation + session.executeNonQueryStatement( + String.format("INSERT INTO %s.d1(time, s_int32) VALUES (0, 0)", database)); + session.executeNonQueryStatement( + String.format( + "INSERT INTO %s.d_aligned(time, s_int32, s_int64, s_float," + + " s_double, s_bool, s_text)" + + " VALUES (0, 0, 0, 0.0, 0.0, false, 'init')", + database)); + session.executeNonQueryStatement("flush"); + } + Thread.sleep(2000); + + createTopic(topicName, database + ".**"); + Thread.sleep(1000); + + consumer = createConsumer(consumerId, consumerGroupId); + consumer.subscribe(topicName); + Thread.sleep(3000); + + int totalExpected = 0; + final String device = database + ".d_aligned"; + List measurements = + Arrays.asList("s_int32", "s_int64", "s_float", "s_double", "s_bool", "s_text"); + List types = + Arrays.asList( + TSDataType.INT32, + TSDataType.INT64, + TSDataType.FLOAT, + TSDataType.DOUBLE, + TSDataType.BOOLEAN, + TSDataType.TEXT); + List schemas = new ArrayList<>(); + schemas.add(new MeasurementSchema("s_int32", TSDataType.INT32)); + schemas.add(new MeasurementSchema("s_int64", TSDataType.INT64)); + schemas.add(new MeasurementSchema("s_float", TSDataType.FLOAT)); + schemas.add(new MeasurementSchema("s_double", TSDataType.DOUBLE)); + schemas.add(new MeasurementSchema("s_bool", TSDataType.BOOLEAN)); + 
schemas.add(new MeasurementSchema("s_text", TSDataType.TEXT)); + + try (ISession session = openSession()) { + // --- Part A: Non-aligned, 6 types x 20 rows --- + System.out.println(" Part A: Non-aligned 6 data types x 20 rows"); + for (int i = 1; i <= 20; i++) { + session.executeNonQueryStatement( + String.format("INSERT INTO %s.d1(time, s_int32) VALUES (%d, %d)", database, i, i)); + session.executeNonQueryStatement( + String.format( + "INSERT INTO %s.d1(time, s_int64) VALUES (%d, %d)", + database, i, (long) i * 100000L)); + session.executeNonQueryStatement( + String.format( + "INSERT INTO %s.d1(time, s_float) VALUES (%d, %f)", database, i, i * 1.1f)); + session.executeNonQueryStatement( + String.format( + "INSERT INTO %s.d1(time, s_double) VALUES (%d, %f)", database, i, i * 2.2)); + session.executeNonQueryStatement( + String.format( + "INSERT INTO %s.d1(time, s_bool) VALUES (%d, %s)", + database, i, i % 2 == 0 ? "true" : "false")); + session.executeNonQueryStatement( + String.format( + "INSERT INTO %s.d1(time, s_text) VALUES (%d, 'text_%d')", database, i, i)); + } + totalExpected += 120; // 6 types x 20 rows + + // --- Part B: Aligned cross-partition, 6 write methods --- + System.out.println(" Part B: Aligned cross-partition, 6 write methods"); + + // Method 1: SQL single row + long t1 = 1; + session.executeNonQueryStatement( + String.format( + "INSERT INTO %s.d_aligned(time, s_int32, s_int64, s_float," + + " s_double, s_bool, s_text)" + + " VALUES (%d, 1, 100, 1.1, 1.11, true, 'sql_single')", + database, t1)); + totalExpected += 1; + + // Method 2: SQL multi-row (cross-partition) + long t2a = 1 + GAP; + long t2b = 1 + 2 * GAP; + session.executeNonQueryStatement( + String.format( + "INSERT INTO %s.d_aligned(time, s_int32, s_int64, s_float," + + " s_double, s_bool, s_text)" + + " VALUES (%d, 2, 200, 2.2, 2.22, false, 'sql_multi_a')," + + " (%d, 3, 300, 3.3, 3.33, true, 'sql_multi_b')", + database, t2a, t2b)); + totalExpected += 2; + + // Method 3: 
insertAlignedRecord + long t3 = 1 + 3 * GAP; + session.insertAlignedRecord( + device, + t3, + measurements, + types, + Arrays.asList(4, 400L, 4.4f, 4.44, true, "record_single")); + totalExpected += 1; + + // Method 4: insertAlignedRecordsOfOneDevice (cross-partition) + long t4a = 1 + 4 * GAP; + long t4b = 1 + 5 * GAP; + session.insertAlignedRecordsOfOneDevice( + device, + Arrays.asList(t4a, t4b), + Arrays.asList(measurements, measurements), + Arrays.asList(types, types), + Arrays.asList( + Arrays.asList(5, 500L, 5.5f, 5.55, false, "records_a"), + Arrays.asList(6, 600L, 6.6f, 6.66, true, "records_b"))); + totalExpected += 2; + + // Method 5: insertAlignedTablet (cross-partition) + long t5a = 1 + 6 * GAP; + long t5b = 1 + 7 * GAP; + Tablet tablet5 = new Tablet(device, schemas, 2); + addAlignedTabletRow(tablet5, 0, t5a, 7, 700L, 7.7f, 7.77, false, "tablet_a"); + addAlignedTabletRow(tablet5, 1, t5b, 8, 800L, 8.8f, 8.88, true, "tablet_b"); + session.insertAlignedTablet(tablet5); + totalExpected += 2; + + // Method 6: insertAlignedTablets (cross-partition) + long t6a = 1 + 8 * GAP; + long t6b = 1 + 9 * GAP; + Tablet tablet6 = new Tablet(device, schemas, 2); + addAlignedTabletRow(tablet6, 0, t6a, 9, 900L, 9.9f, 9.99, false, "tablets_a"); + addAlignedTabletRow(tablet6, 1, t6b, 10, 1000L, 10.1f, 10.10, true, "tablets_b"); + Map tabletMap = new HashMap<>(); + tabletMap.put(device, tablet6); + session.insertAlignedTablets(tabletMap); + totalExpected += 2; + } + + System.out.println(" Total expected rows: " + totalExpected); + Thread.sleep(2000); + + PollResult result = pollUntilComplete(consumer, totalExpected, 150); + System.out.println(" Result: " + result); + + assertAtLeast( + "Expected at least " + totalExpected + " rows", totalExpected, result.totalRows); + assertAtLeast("Expected multiple column types in result", 2, result.seenColumns.size()); + } finally { + cleanup(consumer, topicName, database); + } + } + + // 
====================================================================== + // Test 3: Path Filtering (merged: DeviceLevel + TimeseriesLevel) + // ====================================================================== + /** + * Verifies: + * + *
    + *
  • Device-level: topic on d1.** does NOT deliver d2 data + *
  • Timeseries-level: topic on d1.s1 — lenient check for s2 filtering + *
+ */ + private static void testPathFiltering() throws Exception { + String database = nextDatabase(); + String topicName = nextTopic(); + String consumerGroupId = nextConsumerGroup(); + String consumerId = nextConsumerId(); + SubscriptionTreePullConsumer consumer = null; + + try { + try (ISession session = openSession()) { + createDatabase(session, database); + session.executeNonQueryStatement( + String.format("INSERT INTO %s.d1(time, s1, s2) VALUES (0, 0, 0)", database)); + session.executeNonQueryStatement( + String.format("INSERT INTO %s.d2(time, s1) VALUES (0, 0)", database)); + session.executeNonQueryStatement("flush"); + } + Thread.sleep(2000); + + // Topic filters d1.s1 only (timeseries-level) + String filterPath = database + ".d1.s1"; + createTopic(topicName, filterPath); + Thread.sleep(1000); + + consumer = createConsumer(consumerId, consumerGroupId); + consumer.subscribe(topicName); + Thread.sleep(3000); + + System.out.println(" Writing to d1 (s1 + s2) and d2 (s1)"); + try (ISession session = openSession()) { + for (int i = 100; i < 150; i++) { + session.executeNonQueryStatement( + String.format( + "INSERT INTO %s.d1(time, s1, s2) VALUES (%d, %d, %d)", + database, i, i * 10, i * 20)); + session.executeNonQueryStatement( + String.format("INSERT INTO %s.d2(time, s1) VALUES (%d, %d)", database, i, i * 30)); + } + } + Thread.sleep(2000); + + System.out.println(" Polling (expecting d1 data only, ideally s1 only)..."); + PollResult result = pollUntilComplete(consumer, 50, 60); + System.out.println(" Result: " + result); + + // Device-level: d2 must NOT appear + if (!result.rowsPerDevice.isEmpty()) { + Integer d2Rows = result.rowsPerDevice.get(database + ".d2"); + assertTrue("Expected NO rows from d2, but got " + d2Rows, d2Rows == null || d2Rows == 0); + Integer d1Rows = result.rowsPerDevice.get(database + ".d1"); + assertAtLeast("Expected d1 rows", 1, d1Rows != null ? 
d1Rows : 0); + System.out.println(" Device filtering verified: d1=" + d1Rows + ", d2=" + d2Rows); + } + + // Timeseries-level: lenient check + boolean hasS2 = result.seenColumns.stream().anyMatch(c -> c.contains(".s2")); + if (hasS2) { + System.out.println( + " INFO: Both s1 and s2 received 鈥?converter uses device-level filtering only."); + assertAtLeast("Should have received d1 rows", 50, result.totalRows); + } else { + System.out.println(" Timeseries-level filtering verified: only s1 data received"); + assertEquals("Expected exactly 50 rows from d1.s1 only", 50, result.totalRows); + } + } finally { + cleanup(consumer, topicName, database); + } + } + + // ====================================================================== + // Test 4: Subscribe Before Region Creation (kept as-is) + // ====================================================================== + /** + * Subscribe BEFORE the database/region exists, then create database and write. Tests the + * IoTConsensus.onNewPeerCreated auto-binding path. 
+ */ + private static void testSubscribeBeforeRegion() throws Exception { + String database = nextDatabase(); + String topicName = nextTopic(); + String consumerGroupId = nextConsumerGroup(); + String consumerId = nextConsumerId(); + SubscriptionTreePullConsumer consumer = null; + + try { + System.out.println(" Step 1: Creating topic BEFORE database exists"); + createTopic(topicName, database + ".**"); + Thread.sleep(1000); + + System.out.println(" Step 2: Subscribing (no DataRegion exists yet)"); + consumer = createConsumer(consumerId, consumerGroupId); + consumer.subscribe(topicName); + Thread.sleep(3000); + + System.out.println(" Step 3: Creating database and writing data (100 rows)"); + try (ISession session = openSession()) { + createDatabase(session, database); + for (int i = 0; i < 100; i++) { + session.executeNonQueryStatement( + String.format("INSERT INTO %s.d1(time, s1) VALUES (%d, %d)", database, i, i * 10)); + } + } + Thread.sleep(5000); + + System.out.println(" Step 4: Polling..."); + PollResult result = pollUntilComplete(consumer, 100, 100); + System.out.println(" Result: " + result); + + if (result.totalRows >= 100) { + System.out.println(" Auto-binding works! All " + result.totalRows + " rows received."); + } else if (result.totalRows > 0) { + System.out.println( + " Partial: " + result.totalRows + "/100 rows. First writes may precede binding."); + } else { + System.out.println(" No data received. Check logs for auto-binding messages."); + } + assertAtLeast( + "Expected some rows from subscribe-before-region (auto-binding)", 1, result.totalRows); + } finally { + cleanup(consumer, topicName, database); + } + } + + // ====================================================================== + // Test 6: Multi-Entity Isolation + // ====================================================================== + /** + * Verifies: + * + *
    + *
  • Two consumer groups on the same topic each receive the full data stream independently + *
+ */ + private static void testMultiEntityIsolation() throws Exception { + String database = nextDatabase(); + String topicName = "topic_multi_" + testCounter; + String consumerGroupId1 = "cg_multi_" + testCounter + "_a"; + String consumerId1 = "consumer_multi_" + testCounter + "_a"; + String consumerGroupId2 = "cg_multi_" + testCounter + "_b"; + String consumerId2 = "consumer_multi_" + testCounter + "_b"; + SubscriptionTreePullConsumer consumer1 = null; + SubscriptionTreePullConsumer consumer2 = null; + + try { + // Setup: database with a single device path to isolate multi-group semantics. + try (ISession session = openSession()) { + createDatabase(session, database); + session.executeNonQueryStatement( + String.format("INSERT INTO %s.d1(time, s1) VALUES (0, 0)", database)); + session.executeNonQueryStatement("flush"); + } + Thread.sleep(2000); + + createTopic(topicName, database + ".d1.**"); + Thread.sleep(1000); + + consumer1 = createConsumer(consumerId1, consumerGroupId1); + consumer1.subscribe(topicName); + consumer2 = createConsumer(consumerId2, consumerGroupId2); + consumer2.subscribe(topicName); + Thread.sleep(3000); + + System.out.println(" Writing 70 rows to d1"); + try (ISession session = openSession()) { + for (int i = 1; i <= 70; i++) { + session.executeNonQueryStatement( + String.format("INSERT INTO %s.d1(time, s1) VALUES (%d, %d)", database, i, i * 10)); + } + } + Thread.sleep(2000); + + System.out.println(" Multi-group isolation"); + System.out.println(" Polling from group 1..."); + PollResult result1 = pollUntilComplete(consumer1, 70, 80); + System.out.println(" Group 1 result: " + result1); + + System.out.println(" Polling from group 2..."); + PollResult result2 = pollUntilComplete(consumer2, 70, 80); + System.out.println(" Group 2 result: " + result2); + + assertEquals("Group 1 should receive all 70 rows", 70, result1.totalRows); + assertEquals("Group 2 should receive all 70 rows", 70, result2.totalRows); + assertEquals( + "Expected 70 rows from 
d1", 70, result1.rowsPerDevice.getOrDefault(database + ".d1", 0)); + assertEquals( + "Expected 70 rows from d1", 70, result2.rowsPerDevice.getOrDefault(database + ".d1", 0)); + System.out.println( + " Multi-group isolation verified: group1=" + + result1.totalRows + + ", group2=" + + result2.totalRows); + } finally { + if (consumer1 != null) { + try { + consumer1.unsubscribe(topicName); + } catch (Exception e) { + /* ignore */ + } + try { + consumer1.close(); + } catch (Exception e) { + /* ignore */ + } + } + if (consumer2 != null) { + try { + consumer2.unsubscribe(topicName); + } catch (Exception e) { + /* ignore */ + } + try { + consumer2.close(); + } catch (Exception e) { + /* ignore */ + } + } + dropTopic(topicName); + deleteDatabase(database); + } + } + + // ====================================================================== + // Test 7: Burst Write Gap Recovery (NEW 鈥?tests C2 fix) + // ====================================================================== + /** + * Tests that burst writing beyond the pending queue capacity (4096) does not cause data loss. The + * pending queue overflow triggers gaps, which should be recovered from WAL. + * + *

Mechanism: Each {@code IoTConsensusServerImpl.write()} call produces exactly one + * {@code pendingEntries.offer()}. A single {@code session.insertTablet(tablet)} with N rows in + * one time partition = 1 write() call = 1 offer, so Tablet batches rarely overflow the queue. To + * actually overflow, we need 4096+ individual write() calls arriving faster than the + * prefetch thread can drain. We achieve this with multiple concurrent writer threads, each + * performing individual SQL INSERTs, to maximize the aggregate write rate vs. drain rate. + * + *

Note: Gap occurrence is inherently timing-dependent (race between writers and the + * prefetch drain loop). This test maximizes the probability by using concurrent threads, but + * cannot guarantee gap occurrence on every run. Check server logs for "gap detected" / "Filling + * from WAL" messages to confirm the gap path was exercised. + * + *

Fix verified: C2 鈥?gap entries are not skipped when WAL fill times out; they are deferred to + * the next prefetch iteration. + */ + private static void testBurstWriteGapRecovery() throws Exception { + String database = nextDatabase(); + String topicName = nextTopic(); + String consumerGroupId = nextConsumerGroup(); + String consumerId = nextConsumerId(); + SubscriptionTreePullConsumer consumer = null; + + try { + try (ISession session = openSession()) { + createDatabase(session, database); + session.executeNonQueryStatement( + String.format("INSERT INTO %s.d1(time, s1) VALUES (0, 0)", database)); + session.executeNonQueryStatement("flush"); + } + Thread.sleep(2000); + + createTopic(topicName, database + ".**"); + Thread.sleep(1000); + + consumer = createConsumer(consumerId, consumerGroupId); + consumer.subscribe(topicName); + Thread.sleep(3000); + + // Use multiple concurrent writer threads with individual SQL INSERTs. + // Each INSERT 鈫?1 IoTConsensusServerImpl.write() 鈫?1 pendingEntries.offer(). + // With N threads writing concurrently, aggregate rate should exceed drain rate + // and overflow the 4096-capacity queue, creating gaps. 
+ final int writerThreads = 4; + final int rowsPerThread = 1500; // 4 * 1500 = 6000 total write() calls > 4096 + final int totalRows = writerThreads * rowsPerThread; + final AtomicInteger errorCount = new AtomicInteger(0); + final CountDownLatch startLatch = new CountDownLatch(1); + final CountDownLatch doneLatch = new CountDownLatch(writerThreads); + + System.out.println( + " Burst writing " + + totalRows + + " rows via " + + writerThreads + + " concurrent threads (" + + rowsPerThread + + " individual SQL INSERTs each)"); + System.out.println( + " (Each INSERT = 1 WAL entry = 1 pendingEntries.offer(); " + "queue capacity = 4096)"); + + ExecutorService executor = Executors.newFixedThreadPool(writerThreads); + for (int t = 0; t < writerThreads; t++) { + final int threadId = t; + final int startTs = threadId * rowsPerThread + 1; + executor.submit( + () -> { + try { + startLatch.await(); // all threads start at the same time + try (ISession session = openSession()) { + for (int i = 0; i < rowsPerThread; i++) { + int ts = startTs + i; + session.executeNonQueryStatement( + String.format( + "INSERT INTO %s.d1(time, s1) VALUES (%d, %d)", + database, ts, (long) ts * 10)); + } + } + } catch (Exception e) { + System.out.println(" Writer thread " + threadId + " error: " + e.getMessage()); + errorCount.incrementAndGet(); + } finally { + doneLatch.countDown(); + } + }); + } + + // Fire all threads simultaneously + startLatch.countDown(); + doneLatch.await(); + executor.shutdown(); + + if (errorCount.get() > 0) { + System.out.println(" WARNING: " + errorCount.get() + " writer threads encountered errors"); + } + + // Do NOT add artificial delay 鈥?let the consumer compete with ongoing WAL writes + System.out.println( + " Polling (expecting " + totalRows + " rows, may need WAL gap recovery)..."); + System.out.println( + " (Check server logs for 'gap detected' to confirm gap recovery was triggered)"); + PollResult result = pollUntilComplete(consumer, totalRows, 6000, 2000, true); + 
System.out.println(" Result: " + result); + + assertEquals( + "Expected exactly " + totalRows + " rows (no data loss despite pending queue overflow)", + totalRows, + result.totalRows); + } finally { + cleanup(consumer, topicName, database); + } + } + + // ====================================================================== + // Test 8: Commit After Unsubscribe (NEW 鈥?tests H7 fix) + // ====================================================================== + /** + * Tests that commit still works correctly after the consumer has unsubscribed (queue has been + * torn down). The commit routing should use metadata-based topic config check instead of runtime + * queue state. + * + *

Fix verified: H7 鈥?commit routes via isConsensusBasedTopic() instead of hasQueue(). + */ + private static void testCommitAfterUnsubscribe() throws Exception { + String database = nextDatabase(); + String topicName = nextTopic(); + String consumerGroupId = nextConsumerGroup(); + String consumerId = nextConsumerId(); + SubscriptionTreePullConsumer consumer = null; + + try { + try (ISession session = openSession()) { + createDatabase(session, database); + session.executeNonQueryStatement( + String.format("INSERT INTO %s.d1(time, s1) VALUES (0, 0)", database)); + session.executeNonQueryStatement("flush"); + } + Thread.sleep(2000); + + createTopic(topicName, database + ".**"); + Thread.sleep(1000); + + consumer = createConsumer(consumerId, consumerGroupId); + consumer.subscribe(topicName); + Thread.sleep(3000); + + // Write data + System.out.println(" Writing 50 rows"); + try (ISession session = openSession()) { + for (int i = 1; i <= 50; i++) { + session.executeNonQueryStatement( + String.format("INSERT INTO %s.d1(time, s1) VALUES (%d, %d)", database, i, i * 10)); + } + } + Thread.sleep(2000); + + // Poll WITHOUT commit + System.out.println(" Polling WITHOUT commit..."); + List uncommittedMessages = new ArrayList<>(); + int polledRows = 0; + for (int attempt = 0; attempt < 60 && polledRows < 50; attempt++) { + List msgs = consumer.poll(Duration.ofMillis(2000)); + if (msgs.isEmpty()) { + if (polledRows > 0) break; + Thread.sleep(500); + continue; + } + for (SubscriptionMessage msg : msgs) { + uncommittedMessages.add(msg); + for (SubscriptionResultSet ds : getResultSets(msg)) { + while (ds.hasNext()) { + ds.next(); + polledRows++; + } + } + } + } + System.out.println( + " Polled " + + polledRows + + " rows, holding " + + uncommittedMessages.size() + + " uncommitted messages"); + assertAtLeast("Should have polled some rows before unsubscribe", 1, polledRows); + + // Unsubscribe (tears down the consensus queue) + System.out.println(" Unsubscribing (queue teardown)..."); + 
consumer.unsubscribe(topicName); + Thread.sleep(2000); + + // Now commit the previously polled messages 鈥?should NOT throw + System.out.println( + " Committing " + uncommittedMessages.size() + " messages AFTER unsubscribe..."); + boolean commitSucceeded = true; + for (SubscriptionMessage msg : uncommittedMessages) { + try { + consumer.commitSync(msg); + } catch (Exception e) { + System.out.println(" Commit threw exception: " + e.getMessage()); + commitSucceeded = false; + } + } + + // The commit may silently succeed or fail gracefully 鈥?the key is no crash + System.out.println(" Commit after unsubscribe completed. Success=" + commitSucceeded); + assertTrue("Commit after unsubscribe should succeed without exception", commitSucceeded); + System.out.println(" (Key: no exception crash, routing handled gracefully)"); + } finally { + if (consumer != null) { + try { + consumer.close(); + } catch (Exception e) { + /* ignore */ + } + } + dropTopic(topicName); + deleteDatabase(database); + } + } + + /** + * Verifies: + * + *

    <ul> + *
  <li>seekToBeginning replays historical rows from the beginning of the topic + *
  <li>seekToEnd suppresses old rows and only delivers future writes + *
  <li>seek(topicProgress) resumes from a committed checkpoint without replaying earlier rows + * </ul>
+ */ + private static void testSeekNavigationSemantics() throws Exception { + String database = nextDatabase(); + String topicName = nextTopic(); + String consumerGroupId = nextConsumerGroup(); + String consumerId = nextConsumerId(); + SubscriptionTreePullConsumer consumer = null; + + final int initialRows = 1800; + final int rowsAfterSeekToEnd = 240; + + try { + bootstrapSeekTopic(database, topicName); + consumer = createSubscribedConsumer(topicName, consumerId, consumerGroupId); + + System.out.println(" Step 1: Write initial live rows and drain them"); + writeSequentialRowsAndFlush(database, 1000, initialRows); + PollResult firstPoll = pollUntilComplete(consumer, initialRows, 120); + System.out.println(" First poll: " + firstPoll.totalRows + " rows"); + assertEquals( + "Initial live poll should deliver exactly the rows written after subscribe", + initialRows, + firstPoll.totalRows); + + System.out.println(" Step 2: seekToBeginning -> expect full replay"); + consumer.seekToBeginning(topicName); + pause(2000); + + PollResult beginningPoll = pollUntilComplete(consumer, initialRows, 120); + System.out.println(" After seekToBeginning: " + beginningPoll.totalRows + " rows"); + /* the two bounds below accept initialRows or initialRows + 1 replayed rows */ assertAtLeast( + "seekToBeginning should replay the rows written after subscribe", + initialRows, + beginningPoll.totalRows); + assertAtMost( + "seekToBeginning should replay at most one extra bootstrap row", + initialRows + 1, + beginningPoll.totalRows); + + System.out.println(" Step 3: seekToEnd -> expect no old data"); + consumer.seekToEnd(topicName); + pause(2000); + + /* expects 0 rows; the assertion below tolerates a single row */ PollResult endPoll = pollUntilComplete(consumer, 0, 15, 1000, true); + System.out.println(" After seekToEnd with no new writes: " + endPoll.totalRows + " rows"); + assertAtMost("seekToEnd should yield at most 1 race row", 1, endPoll.totalRows); + + System.out.println(" Step 4: Write new rows after seekToEnd"); + writeSequentialRowsAndFlush(database, 4000, rowsAfterSeekToEnd); + PollResult afterEndPoll = 
pollUntilComplete(consumer, rowsAfterSeekToEnd, 120); + System.out.println(" After seekToEnd + new writes: " + afterEndPoll.totalRows + " rows"); + assertEquals( + "seekToEnd should only deliver rows written after the seek", + rowsAfterSeekToEnd, + afterEndPoll.totalRows); + + System.out.println(" Step 5: seek(committed checkpoint) -> expect remaining tail only"); + /* rewind so the checkpoint and the tail row count are both measured on a fresh replay */ consumer.seekToBeginning(topicName); + pause(2000); + + CommittedSnapshot midpointCheckpoint = + pollUntilCommittedRows(consumer, topicName, initialRows / 2, 60, 1000); + List remainingTail = + pollAndCaptureCommittedSnapshots(consumer, topicName, 60, 1000); + int expectedRemainingRows = totalRows(remainingTail); + System.out.println( + " Midpoint checkpoint after " + + midpointCheckpoint.cumulativeRows + + " rows, expected remaining tail=" + + expectedRemainingRows); + assertAtLeast( + "seek(topicProgress) scenario should leave rows after the checkpoint", + 1, + expectedRemainingRows); + + /* NOTE(review): the messages say seek(topicProgress) but the call made is seekAfter; confirm the intended API */ consumer.seekAfter(topicName, midpointCheckpoint.progress); + pause(2000); + + PollResult afterSeek = pollUntilComplete(consumer, expectedRemainingRows, 120); + System.out.println(" After seek(topicProgress): " + afterSeek.totalRows + " rows"); + assertEquals( + "seek(topicProgress) should resume from the committed checkpoint", + expectedRemainingRows, + afterSeek.totalRows); + } finally { + cleanup(consumer, topicName, database); + } + } + + /** + * Verifies: + * + *
    <ul> + *
  <li>seekAfter(topicProgress) replays only rows strictly after the checkpoint + *
  <li>Repeating seekAfter with the same checkpoint is stable + *
  <li>seekAfter(tail) suppresses history but still allows future rows through + * </ul>
+ */ + private static void testSeekAfterCheckpointSemantics() throws Exception { + String database = nextDatabase(); + String topicName = nextTopic(); + String consumerGroupId = nextConsumerGroup(); + String consumerId = nextConsumerId(); + SubscriptionTreePullConsumer consumer = null; + + final int totalRows = 2000; + final int futureRows = 160; + + try { + bootstrapSeekTopic(database, topicName); + consumer = createSubscribedConsumer(topicName, consumerId, consumerGroupId); + + System.out.println(" Step 1: Write rows and capture a committed checkpoint"); + writeSequentialRowsAndFlush(database, 1000, totalRows); + CommittedSnapshot checkpoint = + pollUntilCommittedRows(consumer, topicName, totalRows / 3, 60, 1000); + List drainedTail = + pollAndCaptureCommittedSnapshots(consumer, topicName, 80, 1000); + /* totalRows(...) here is the helper method, not the local int of the same name */ int expectedTailRows = totalRows(drainedTail); + System.out.println( + " Checkpoint after " + + checkpoint.cumulativeRows + + " rows, tail after checkpoint=" + + expectedTailRows); + assertAtLeast( + "seekAfter(topicProgress) scenario should leave rows after the checkpoint", + 1, + expectedTailRows); + + int writerFrontierCount = countWriterFrontiers(checkpoint.progress); + assertAtLeast("Committed checkpoint should contain writer frontiers", 1, writerFrontierCount); + + System.out.println(" Step 2: seekAfter(midpoint checkpoint) -> expect exact tail replay"); + consumer.seekAfter(topicName, checkpoint.progress); + pause(2000); + + PollResult firstReplay = pollUntilComplete(consumer, expectedTailRows, 120); + System.out.println(" After first seekAfter(checkpoint): " + firstReplay.totalRows + " rows"); + assertEquals( + "seekAfter(topicProgress) should replay exactly the tail after the checkpoint", + expectedTailRows, + firstReplay.totalRows); + + System.out.println(" Step 3: repeat seekAfter(checkpoint) -> expect same exact replay"); + /* same checkpoint object as step 2, so the replayed row count must match */ consumer.seekAfter(topicName, checkpoint.progress); + pause(2000); + + PollResult repeatedReplay = pollUntilComplete(consumer, 
expectedTailRows, 120); + System.out.println( + " After repeated seekAfter(checkpoint): " + repeatedReplay.totalRows + " rows"); + assertEquals( + "Repeating seekAfter(topicProgress) should be stable", + expectedTailRows, + repeatedReplay.totalRows); + + System.out.println(" Step 4: seekAfter(tail) -> expect no historical rows"); + /* the "tail" is whatever committedPositions reports at this point */ TopicProgress tailProgress = consumer.committedPositions(topicName); + assertTrue("Tail checkpoint should be non-null", tailProgress != null); + consumer.seekAfter(topicName, tailProgress); + pause(2000); + + /* expects 0 rows; the assertion below tolerates a single row */ PollResult noHistory = pollUntilComplete(consumer, 0, 15, 1000, true); + System.out.println(" After seekAfter(tail): " + noHistory.totalRows + " rows"); + assertAtMost("seekAfter(tail) should yield at most 1 race row", 1, noHistory.totalRows); + + System.out.println(" Step 5: Write new rows after seekAfter(tail)"); + writeSequentialRowsAndFlush(database, 5000, futureRows); + PollResult futureOnly = pollUntilComplete(consumer, futureRows, 120); + System.out.println(" After seekAfter(tail) + new writes: " + futureOnly.totalRows + " rows"); + assertEquals( + "seekAfter(tail) should only deliver rows written after the seek", + futureRows, + futureOnly.totalRows); + } finally { + cleanup(consumer, topicName, database); + } + } + + /** + * Verifies: + * + *
    <ul> + *
  <li>seekAfter fences off stale in-flight commit contexts from before the seek + *
  <li>Committing old polled messages after the seek does not affect the new replay frontier + *
  <li>The full tail after the checkpoint is replayed exactly once + * </ul>
+ */ + private static void testSeekAfterWithStaleAckFencing() throws Exception { + String database = nextDatabase(); + String topicName = nextTopic(); + String consumerGroupId = nextConsumerGroup(); + String consumerId = nextConsumerId(); + SubscriptionTreePullConsumer consumer = null; + + final int totalRows = 2400; + + try { + bootstrapSeekTopic(database, topicName); + consumer = createSubscribedConsumer(topicName, consumerId, consumerGroupId); + + System.out.println(" Step 1: Write rows and commit part of them"); + writeSequentialRowsAndFlush(database, 1000, totalRows); + CommittedSnapshot committedCheckpoint = + pollUntilCommittedRows(consumer, topicName, totalRows / 3, 60, 1000); + assertTrue( + "Committed checkpoint should not be null", + committedCheckpoint.progress != null + && committedCheckpoint.progress.getRegionProgress() != null + && !committedCheckpoint.progress.getRegionProgress().isEmpty()); + + System.out.println(" Step 2: Poll a stale batch without committing it"); + PolledMessageBatch staleBatch = pollFirstNonEmptyBatchWithoutCommit(consumer, 30, 1000); + assertAtLeast( + "Stale-ack scenario should poll at least one row after the checkpoint", + 1, + staleBatch.totalRows); + + /* tail = every written row not covered by the committed checkpoint; the uncommitted stale batch is part of it */ int expectedTailRows = totalRows - committedCheckpoint.cumulativeRows; + System.out.println( + " Committed checkpoint after " + + committedCheckpoint.cumulativeRows + + " rows, stale batch=" + + staleBatch.totalRows + + ", expected replay tail=" + + expectedTailRows); + assertAtLeast( + "Stale-ack replay should include the stale batch rows", + staleBatch.totalRows, + expectedTailRows); + + System.out.println(" Step 3: seekAfter(checkpoint), then commit stale messages"); + consumer.seekAfter(topicName, committedCheckpoint.progress); + pause(2000); + + /* commit messages polled before the seek; the final assertion requires the replayed row count to be unaffected */ for (SubscriptionMessage staleMessage : staleBatch.messages) { + consumer.commitSync(staleMessage); + } + + PollResult replayAfterSeek = pollUntilComplete(consumer, expectedTailRows, 120); + System.out.println( + " After 
seekAfter(checkpoint) with stale commits: " + + replayAfterSeek.totalRows + + " rows"); + assertEquals( + "Stale commits from the old generation must not reduce the replayed tail", + expectedTailRows, + replayAfterSeek.totalRows); + } finally { + cleanup(consumer, topicName, database); + } + } + + // ====================================================================== + // Test 11: pollWithInfo(topicNames, timeoutMs) — topic-level filtering + // ====================================================================== + /** + * Verifies: + * + *
    <ul> + *
  <li>pollWithInfo(Set, long) only returns data matching the specified topics + *
  <li>Data from other subscribed topics is not returned in the filtered poll + *
  <li>After the first filtered poll, the other topic's data can still be retrieved via a second filtered poll for that topic + * </ul>
+ */ + private static void testPollWithInfoTopicFilter() throws Exception { + String database = nextDatabase(); + String topicName1 = "topic_pwf_" + testCounter + "_a"; + String topicName2 = "topic_pwf_" + testCounter + "_b"; + String consumerGroupId = nextConsumerGroup(); + String consumerId = nextConsumerId(); + SubscriptionTreePullConsumer consumer = null; + + try { + // Step 0: Create database with d1, d2 + try (ISession session = openSession()) { + createDatabase(session, database); + session.executeNonQueryStatement( + String.format("INSERT INTO %s.d1(time, s1) VALUES (0, 0)", database)); + session.executeNonQueryStatement( + String.format("INSERT INTO %s.d2(time, s1) VALUES (0, 0)", database)); + session.executeNonQueryStatement("flush"); + } + Thread.sleep(2000); + + // Step 1: Create two topics with distinct path filters + System.out.println(" Step 1: Creating two topics (d1 / d2)"); + createTopic(topicName1, database + ".d1.**"); + createTopic(topicName2, database + ".d2.**"); + Thread.sleep(1000); + + // Step 2: Subscribe to both topics + consumer = createConsumer(consumerId, consumerGroupId); + consumer.subscribe(topicName1, topicName2); + Thread.sleep(3000); + + // Step 3: Write 30 rows to d1, 40 rows to d2 + System.out.println(" Step 3: Writing 30 rows to d1, 40 rows to d2"); + try (ISession session = openSession()) { + for (int i = 1; i <= 40; i++) { + if (i <= 30) { + session.executeNonQueryStatement( + String.format("INSERT INTO %s.d1(time, s1) VALUES (%d, %d)", database, i, i * 10)); + } + session.executeNonQueryStatement( + String.format("INSERT INTO %s.d2(time, s1) VALUES (%d, %d)", database, i, i * 20)); + } + } + Thread.sleep(3000); + + // Step 4: pollWithInfo for topicName1 only + System.out.println(" Step 4: pollWithInfo for topic1 (d1) only"); + Set<String> topic1Only = new HashSet<>(Arrays.asList(topicName1)); + int d1Rows = 0; + for (int attempt = 0; attempt < 40; attempt++) { + org.apache.iotdb.session.subscription.payload.PollResult pollResult 
= + consumer.pollWithInfo(topic1Only, 2000); + List<SubscriptionMessage> msgs = pollResult.getMessages(); + if (msgs.isEmpty()) { + /* an empty poll after rows have arrived ends the loop early */ if (d1Rows > 0) break; + Thread.sleep(1000); + continue; + } + for (SubscriptionMessage msg : msgs) { + for (SubscriptionResultSet ds : getResultSets(msg)) { + List<String> cols = ds.getColumnNames(); + while (ds.hasNext()) { + ds.next(); + d1Rows++; + // Verify no d2 columns appear + for (String col : cols) { + assertTrue("Topic1 poll should not contain d2 data: " + col, !col.contains(".d2.")); + } + } + } + consumer.commitSync(msg); + } + } + System.out.println(" Topic1-only poll received: " + d1Rows + " rows"); + assertEquals("Topic1 should deliver exactly 30 rows from d1", 30, d1Rows); + + // Step 5: pollWithInfo for topicName2 only — should get d2 data + System.out.println(" Step 5: pollWithInfo for topic2 (d2) only"); + Set<String> topic2Only = new HashSet<>(Arrays.asList(topicName2)); + int d2Rows = 0; + for (int attempt = 0; attempt < 40; attempt++) { + org.apache.iotdb.session.subscription.payload.PollResult pollResult = + consumer.pollWithInfo(topic2Only, 2000); + List<SubscriptionMessage> msgs = pollResult.getMessages(); + if (msgs.isEmpty()) { + if (d2Rows > 0) break; + Thread.sleep(1000); + continue; + } + for (SubscriptionMessage msg : msgs) { + for (SubscriptionResultSet ds : getResultSets(msg)) { + List<String> cols = ds.getColumnNames(); + while (ds.hasNext()) { + ds.next(); + d2Rows++; + // Verify no d1 columns appear + for (String col : cols) { + assertTrue("Topic2 poll should not contain d1 data: " + col, !col.contains(".d1.")); + } + } + } + consumer.commitSync(msg); + } + } + System.out.println(" Topic2-only poll received: " + d2Rows + " rows"); + assertEquals("Topic2 should deliver exactly 40 rows from d2", 40, d2Rows); + + System.out.println(" testPollWithInfoTopicFilter passed!"); + } finally { + if (consumer != null) { + try { + consumer.unsubscribe(topicName1, topicName2); + } catch (Exception e) { + /* best-effort unsubscribe: close() and the drops below must still run */ + } + try { + consumer.close(); + } catch (Exception e) { + 
/* best-effort close: a failure here must not mask the test outcome */ + } + } + dropTopic(topicName1); + dropTopic(topicName2); + deleteDatabase(database); + } + } + + // ====================================================================== + // Test 12: Poison Message Drop — messages nacked beyond threshold + // are force-acked (dropped) and don't block new data. + // ====================================================================== + /** + * Verifies: + * + *
    <ul> + *
  <li>A message that is nacked (poll timeout without commit) more than + * POISON_MESSAGE_NACK_THRESHOLD (10) times is eventually dropped + *
  <li>After the poison message is dropped, new data can still be received + *
  <li>The consumer is not permanently blocked by a single unprocessable message + * </ul>
 + * + *
<p>
Note: "Nack" in this context means the server re-enqueues an in-flight event that was + * polled but never committed by the consumer. Each re-enqueue increments the event's nack + * counter. After 10 nacks, the event is marked as poisoned and force-acked (dropped) at the next + * re-enqueue attempt. + */ + private static void testPoisonMessageDrop() throws Exception { + String database = nextDatabase(); + String topicName = nextTopic(); + String consumerGroupId = nextConsumerGroup(); + String consumerId = nextConsumerId(); + SubscriptionTreePullConsumer consumer = null; + + try { + // Step 0: Create DataRegion + try (ISession session = openSession()) { + createDatabase(session, database); + session.executeNonQueryStatement( + String.format("INSERT INTO %s.d1(time, s1) VALUES (0, 0)", database)); + session.executeNonQueryStatement("flush"); + } + Thread.sleep(2000); + + // Step 1: Create topic and subscribe + System.out.println(" Step 1: Creating topic and subscribing"); + createTopic(topicName, database + ".**"); + Thread.sleep(1000); + + consumer = createConsumer(consumerId, consumerGroupId); + consumer.subscribe(topicName); + Thread.sleep(3000); + + // Step 2: Write initial data that will become the "poison" message + System.out.println(" Step 2: Writing 10 rows (the initial batch)"); + try (ISession session = openSession()) { + for (int i = 1; i <= 10; i++) { + session.executeNonQueryStatement( + String.format("INSERT INTO %s.d1(time, s1) VALUES (%d, %d)", database, i, i * 10)); + } + } + Thread.sleep(2000); + + // Step 3: Poll without commit — repeatedly. Each poll-then-timeout cycle + // causes the server to nack the in-flight event and re-enqueue it. + // After POISON_MESSAGE_NACK_THRESHOLD (10) nacks, the message should be dropped. 
+ System.out.println( + " Step 3: Polling without commit for 15 rounds (threshold=10, need >10 nacks)"); + int totalPoisonPolled = 0; + for (int round = 1; round <= 15; round++) { + List<SubscriptionMessage> msgs = consumer.poll(Duration.ofMillis(3000)); + int roundRows = 0; + for (SubscriptionMessage msg : msgs) { + for (SubscriptionResultSet ds : getResultSets(msg)) { + while (ds.hasNext()) { + ds.next(); + roundRows++; + totalPoisonPolled++; + } + } + // Deliberately NOT committing — this is the "nack" behavior + } + System.out.println( + " Round " + round + ": received " + roundRows + " rows (NOT committing)"); + if (msgs.isEmpty() && round > 11) { + // After threshold exceeded, the message may have been dropped + System.out.println(" No messages \u2014 poison message may have been force-acked"); + break; + } + Thread.sleep(1000); + } + System.out.println(" Total rows polled across all rounds: " + totalPoisonPolled); + + // Step 4: Write NEW data and verify it can be received (consumer not blocked) + System.out.println(" Step 4: Writing 50 NEW rows and polling WITH commit"); + try (ISession session = openSession()) { + for (int i = 1000; i < 1050; i++) { + session.executeNonQueryStatement( + String.format("INSERT INTO %s.d1(time, s1) VALUES (%d, %d)", database, i, i * 10)); + } + } + Thread.sleep(2000); + + PollResult newResult = pollUntilComplete(consumer, 50, 60); + System.out.println(" New data poll result: " + newResult); + + // The key assertion: new data must be receivable + // The exact count may be slightly more than 50 if the old poison data leaked through + // in an earlier round, but the queue must not be permanently blocked. 
+ assertAtLeast( + "Consumer must not be permanently blocked by poison message \u2014 new data should arrive", + 1, + newResult.totalRows); + System.out.println( + " testPoisonMessageDrop passed: consumer received " + + newResult.totalRows + + " new rows after poison message handling"); + } finally { + cleanup(consumer, topicName, database); + } + } + + private static List<SubscriptionResultSet> getResultSets(final SubscriptionMessage message) { + /* casts every result set of the message to SubscriptionResultSet and collects them into a new list */ return message.getResultSets().stream() + .map(resultSet -> (SubscriptionResultSet) resultSet) + .collect(Collectors.toList()); + } + + /** Helper: populate one row of an aligned Tablet with all 6 data types. */ + private static void addAlignedTabletRow( + Tablet tablet, + int rowIndex, + long timestamp, + int intVal, + long longVal, + float floatVal, + double doubleVal, + boolean boolVal, + String textVal) { + tablet.addTimestamp(rowIndex, timestamp); + tablet.addValue("s_int32", rowIndex, intVal); + tablet.addValue("s_int64", rowIndex, longVal); + tablet.addValue("s_float", rowIndex, floatVal); + tablet.addValue("s_double", rowIndex, doubleVal); + tablet.addValue("s_bool", rowIndex, boolVal); + tablet.addValue("s_text", rowIndex, new Binary(textVal, TSFileConfig.STRING_CHARSET)); + } +} diff --git a/iotdb-client/service-rpc/src/main/java/org/apache/iotdb/rpc/TSStatusCode.java b/iotdb-client/service-rpc/src/main/java/org/apache/iotdb/rpc/TSStatusCode.java index 53c23626b1335..761bae4bd98ef 100644 --- a/iotdb-client/service-rpc/src/main/java/org/apache/iotdb/rpc/TSStatusCode.java +++ b/iotdb-client/service-rpc/src/main/java/org/apache/iotdb/rpc/TSStatusCode.java @@ -313,6 +313,7 @@ public enum TSStatusCode { SHOW_SUBSCRIPTION_ERROR(1910), SUBSCRIPTION_PIPE_TIMEOUT_ERROR(1911), SUBSCRIPTION_NOT_ENABLED_ERROR(1912), + SUBSCRIPTION_SEEK_ERROR(1913), // Topic CREATE_TOPIC_ERROR(2000), diff --git a/iotdb-client/subscription/pom.xml b/iotdb-client/subscription/pom.xml index c41ef1e3bde89..bd1e71232b307 100644 --- a/iotdb-client/subscription/pom.xml +++ 
b/iotdb-client/subscription/pom.xml @@ -77,5 +77,10 @@ org.apache.thrift libthrift + + junit + junit + test + diff --git a/iotdb-client/subscription/src/main/java/org/apache/iotdb/rpc/subscription/config/TopicConfig.java b/iotdb-client/subscription/src/main/java/org/apache/iotdb/rpc/subscription/config/TopicConfig.java index ea588f1276325..b6c318d6e7628 100644 --- a/iotdb-client/subscription/src/main/java/org/apache/iotdb/rpc/subscription/config/TopicConfig.java +++ b/iotdb-client/subscription/src/main/java/org/apache/iotdb/rpc/subscription/config/TopicConfig.java @@ -35,6 +35,16 @@ public class TopicConfig extends PipeParameters { + /* accepted order-mode values, all lower-case; compared against normalizeOrderMode output */ private static final Set<String> ORDER_MODE_VALUE_SET; + + static { + final Set<String> orderModes = new HashSet<>(3); + orderModes.add(TopicConstant.ORDER_MODE_LEADER_ONLY_VALUE); + orderModes.add(TopicConstant.ORDER_MODE_MULTI_WRITER_VALUE); + orderModes.add(TopicConstant.ORDER_MODE_PER_WRITER_VALUE); + ORDER_MODE_VALUE_SET = Collections.unmodifiableSet(orderModes); + } + public TopicConfig() { super(Collections.emptyMap()); } @@ -97,6 +107,22 @@ public boolean isTableTopic() { attributes.getOrDefault(SQL_DIALECT_KEY, SQL_DIALECT_TREE_VALUE)); } + /* returns the configured order mode in normalized form, or the default when the attribute is absent */ public String getOrderMode() { + return normalizeOrderMode( + attributes.getOrDefault( + TopicConstant.ORDER_MODE_KEY, TopicConstant.ORDER_MODE_DEFAULT_VALUE)); + } + + /* true when the normalized value is one of the three known order modes; null counts as the default */ public static boolean isValidOrderMode(final String orderMode) { + return ORDER_MODE_VALUE_SET.contains(normalizeOrderMode(orderMode)); + } + + /* trims and lower-cases the value; null maps to the default order mode */ public static String normalizeOrderMode(final String orderMode) { + return orderMode == null + ? 
TopicConstant.ORDER_MODE_DEFAULT_VALUE + : orderMode.trim().toLowerCase(java.util.Locale.ROOT); /* Locale.ROOT keeps the result independent of the JVM default locale (e.g. Turkish dotless i) */ + } + /////////////////////////////// extractor attributes mapping /////////////////////////////// public Map getAttributeWithSqlDialect() { diff --git a/iotdb-client/subscription/src/main/java/org/apache/iotdb/rpc/subscription/config/TopicConstant.java b/iotdb-client/subscription/src/main/java/org/apache/iotdb/rpc/subscription/config/TopicConstant.java index bb84358648e59..09a63c939bc86 100644 --- a/iotdb-client/subscription/src/main/java/org/apache/iotdb/rpc/subscription/config/TopicConstant.java +++ b/iotdb-client/subscription/src/main/java/org/apache/iotdb/rpc/subscription/config/TopicConstant.java @@ -40,6 +40,12 @@ public class TopicConstant { public static final String MODE_SNAPSHOT_VALUE = "snapshot"; public static final String MODE_DEFAULT_VALUE = MODE_LIVE_VALUE; + public static final String ORDER_MODE_KEY = "order-mode"; + public static final String ORDER_MODE_LEADER_ONLY_VALUE = "leader-only"; + public static final String ORDER_MODE_MULTI_WRITER_VALUE = "multi-writer"; + public static final String ORDER_MODE_PER_WRITER_VALUE = "per-writer"; + public static final String ORDER_MODE_DEFAULT_VALUE = ORDER_MODE_LEADER_ONLY_VALUE; + public static final String FORMAT_KEY = "format"; public static final String FORMAT_RECORD_HANDLER_VALUE = "SubscriptionRecordHandler"; public static final String FORMAT_TS_FILE_VALUE = "SubscriptionTsFileHandler"; diff --git a/iotdb-client/subscription/src/main/java/org/apache/iotdb/rpc/subscription/payload/poll/RegionProgress.java b/iotdb-client/subscription/src/main/java/org/apache/iotdb/rpc/subscription/payload/poll/RegionProgress.java new file mode 100644 index 0000000000000..134f59dfe5dae --- /dev/null +++ b/iotdb-client/subscription/src/main/java/org/apache/iotdb/rpc/subscription/payload/poll/RegionProgress.java @@ -0,0 +1,85 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.iotdb.rpc.subscription.payload.poll; + +import org.apache.tsfile.utils.ReadWriteIOUtils; + +import java.io.DataOutputStream; +import java.io.IOException; +import java.nio.ByteBuffer; +import java.util.Collections; +import java.util.LinkedHashMap; +import java.util.Map; +import java.util.Objects; + +/** Unmodifiable, insertion-ordered map from each writer to its progress, with a size-prefixed binary form. */ public class RegionProgress { + + private final Map<WriterId, WriterProgress> writerPositions; + + /* copies the given map so later changes by the caller are not observed; null becomes an empty map */ public RegionProgress(final Map<WriterId, WriterProgress> writerPositions) { + this.writerPositions = + writerPositions == null + ? 
Collections.emptyMap() + : Collections.unmodifiableMap(new LinkedHashMap<>(writerPositions)); + } + + /* returns the unmodifiable view held by this object; never null */ public Map<WriterId, WriterProgress> getWriterPositions() { + return writerPositions; + } + + /* wire format: entry count, then key and value of each entry in iteration order */ public void serialize(final DataOutputStream stream) throws IOException { + ReadWriteIOUtils.write(writerPositions.size(), stream); + for (final Map.Entry<WriterId, WriterProgress> entry : writerPositions.entrySet()) { + entry.getKey().serialize(stream); + entry.getValue().serialize(stream); + } + } + + /* inverse of serialize: reads the entry count, then that many key/value pairs */ public static RegionProgress deserialize(final ByteBuffer buffer) { + final int size = ReadWriteIOUtils.readInt(buffer); + final Map<WriterId, WriterProgress> writerPositions = new LinkedHashMap<>(size); + for (int i = 0; i < size; i++) { + writerPositions.put(WriterId.deserialize(buffer), WriterProgress.deserialize(buffer)); + } + return new RegionProgress(writerPositions); + } + + @Override + public boolean equals(final Object obj) { + if (this == obj) { + return true; + } + if (!(obj instanceof RegionProgress)) { + return false; + } + final RegionProgress that = (RegionProgress) obj; + return Objects.equals(writerPositions, that.writerPositions); + } + + @Override + public int hashCode() { + return Objects.hash(writerPositions); + } + + @Override + public String toString() { + return "RegionProgress{" + "writerPositions=" + writerPositions + '}'; + } +} diff --git a/iotdb-client/subscription/src/main/java/org/apache/iotdb/rpc/subscription/payload/poll/SubscriptionCommitContext.java b/iotdb-client/subscription/src/main/java/org/apache/iotdb/rpc/subscription/payload/poll/SubscriptionCommitContext.java index e2bf809d32c20..3121843e62c92 100644 --- a/iotdb-client/subscription/src/main/java/org/apache/iotdb/rpc/subscription/payload/poll/SubscriptionCommitContext.java +++ b/iotdb-client/subscription/src/main/java/org/apache/iotdb/rpc/subscription/payload/poll/SubscriptionCommitContext.java @@ -30,6 +30,13 @@ public class SubscriptionCommitContext implements Comparable { + /** + * Version 1: original 5 fields (dataNodeId, rebootTimes, topicName, 
consumerGroupId, commitId). + * Version 2: added regionId + physicalTime (serialized in the legacy epoch slot). Version 3: + * added writerId + writerProgress. + */ + private static final byte SERIALIZATION_VERSION = 3; + private final int dataNodeId; private final int rebootTimes; @@ -40,6 +47,16 @@ public class SubscriptionCommitContext implements Comparable + Objects.nonNull(context.getWriterId()) ? context.getWriterId().getNodeId() : -1) + .thenComparingLong( + context -> + Objects.nonNull(context.getWriterId()) + ? context.getWriterId().getWriterEpoch() + : -1L) + .thenComparingLong(SubscriptionCommitContext::getPhysicalTime) + .thenComparingLong(SubscriptionCommitContext::getLocalSeq) .compare(this, that); } } diff --git a/iotdb-client/subscription/src/main/java/org/apache/iotdb/rpc/subscription/payload/poll/SubscriptionPollRequest.java b/iotdb-client/subscription/src/main/java/org/apache/iotdb/rpc/subscription/payload/poll/SubscriptionPollRequest.java index 3337887b185f5..d8c800f247b2d 100644 --- a/iotdb-client/subscription/src/main/java/org/apache/iotdb/rpc/subscription/payload/poll/SubscriptionPollRequest.java +++ b/iotdb-client/subscription/src/main/java/org/apache/iotdb/rpc/subscription/payload/poll/SubscriptionPollRequest.java @@ -27,10 +27,13 @@ import java.io.DataOutputStream; import java.io.IOException; import java.nio.ByteBuffer; +import java.util.Collections; +import java.util.HashMap; +import java.util.Map; public class SubscriptionPollRequest { - private static final Logger LOGGER = LoggerFactory.getLogger(SubscriptionPollResponse.class); + private static final Logger LOGGER = LoggerFactory.getLogger(SubscriptionPollRequest.class); private final transient short requestType; @@ -41,15 +44,31 @@ public class SubscriptionPollRequest { /** The maximum size, in bytes, for the response payload. */ private final transient long maxBytes; + /** + * Per-topic writer-based progress used by the new consensus subscription model. 
This preserves + * topic boundaries while allowing the consumer to provide a recovery hint on reconnect. + */ + private final transient Map progressByTopic; + public SubscriptionPollRequest( final short requestType, final SubscriptionPollPayload payload, final long timeoutMs, final long maxBytes) { + this(requestType, payload, timeoutMs, maxBytes, Collections.emptyMap()); + } + + public SubscriptionPollRequest( + final short requestType, + final SubscriptionPollPayload payload, + final long timeoutMs, + final long maxBytes, + final Map progressByTopic) { this.requestType = requestType; this.payload = payload; this.timeoutMs = timeoutMs; this.maxBytes = maxBytes; + this.progressByTopic = progressByTopic != null ? progressByTopic : Collections.emptyMap(); } public short getRequestType() { @@ -68,6 +87,10 @@ public long getMaxBytes() { return maxBytes; } + public Map getProgressByTopic() { + return progressByTopic; + } + //////////////////////////// serialization //////////////////////////// public static ByteBuffer serialize(final SubscriptionPollRequest request) throws IOException { @@ -83,6 +106,11 @@ private void serialize(final DataOutputStream stream) throws IOException { payload.serialize(stream); ReadWriteIOUtils.write(timeoutMs, stream); ReadWriteIOUtils.write(maxBytes, stream); + ReadWriteIOUtils.write(progressByTopic.size(), stream); + for (final Map.Entry entry : progressByTopic.entrySet()) { + ReadWriteIOUtils.write(entry.getKey(), stream); + entry.getValue().serialize(stream); + } } public static SubscriptionPollRequest deserialize(final ByteBuffer buffer) { @@ -109,7 +137,20 @@ public static SubscriptionPollRequest deserialize(final ByteBuffer buffer) { final long timeoutMs = ReadWriteIOUtils.readLong(buffer); final long maxBytes = ReadWriteIOUtils.readLong(buffer); - return new SubscriptionPollRequest(requestType, payload, timeoutMs, maxBytes); + + Map progressByTopic = Collections.emptyMap(); + if (buffer.hasRemaining()) { + final int mapSize = 
ReadWriteIOUtils.readInt(buffer); + if (mapSize > 0) { + progressByTopic = new HashMap<>(mapSize); + for (int i = 0; i < mapSize; i++) { + progressByTopic.put( + ReadWriteIOUtils.readString(buffer), TopicProgress.deserialize(buffer)); + } + } + } + + return new SubscriptionPollRequest(requestType, payload, timeoutMs, maxBytes, progressByTopic); } /////////////////////////////// object /////////////////////////////// @@ -117,13 +158,15 @@ public static SubscriptionPollRequest deserialize(final ByteBuffer buffer) { @Override public String toString() { return "SubscriptionPollRequest{requestType=" - + SubscriptionPollRequestType.valueOf(requestType).toString() + + SubscriptionPollRequestType.valueOf(requestType) + ", payload=" + payload + ", timeoutMs=" + timeoutMs + ", maxBytes=" + maxBytes + + ", progressByTopic.size=" + + progressByTopic.size() + "}"; } } diff --git a/iotdb-client/subscription/src/main/java/org/apache/iotdb/rpc/subscription/payload/poll/SubscriptionPollResponse.java b/iotdb-client/subscription/src/main/java/org/apache/iotdb/rpc/subscription/payload/poll/SubscriptionPollResponse.java index 06baa30acee9f..df1bb91a9f3e9 100644 --- a/iotdb-client/subscription/src/main/java/org/apache/iotdb/rpc/subscription/payload/poll/SubscriptionPollResponse.java +++ b/iotdb-client/subscription/src/main/java/org/apache/iotdb/rpc/subscription/payload/poll/SubscriptionPollResponse.java @@ -100,6 +100,9 @@ public static SubscriptionPollResponse deserialize(final ByteBuffer buffer) { case TERMINATION: payload = new TerminationPayload().deserialize(buffer); break; + case WATERMARK: + payload = new WatermarkPayload().deserialize(buffer); + break; default: LOGGER.warn("unexpected response type: {}, payload will be null", responseType); break; @@ -121,9 +124,10 @@ public String toString() { protected Map coreReportMessage() { final Map result = new HashMap<>(); - result.put("responseType", SubscriptionPollResponseType.valueOf(responseType).toString()); - 
result.put("payload", payload.toString()); - result.put("commitContext", commitContext.toString()); + final SubscriptionPollResponseType type = SubscriptionPollResponseType.valueOf(responseType); + result.put("responseType", type != null ? type.toString() : "UNKNOWN(" + responseType + ")"); + result.put("payload", payload != null ? payload.toString() : "null"); + result.put("commitContext", commitContext != null ? commitContext.toString() : "null"); return result; } } diff --git a/iotdb-client/subscription/src/main/java/org/apache/iotdb/rpc/subscription/payload/poll/SubscriptionPollResponseType.java b/iotdb-client/subscription/src/main/java/org/apache/iotdb/rpc/subscription/payload/poll/SubscriptionPollResponseType.java index b27791b36c538..4ca6cb09dd67c 100644 --- a/iotdb-client/subscription/src/main/java/org/apache/iotdb/rpc/subscription/payload/poll/SubscriptionPollResponseType.java +++ b/iotdb-client/subscription/src/main/java/org/apache/iotdb/rpc/subscription/payload/poll/SubscriptionPollResponseType.java @@ -33,6 +33,13 @@ public enum SubscriptionPollResponseType { FILE_SEAL((short) 4), TERMINATION((short) 5), + + /** + * Periodic timestamp-progress signal from the server-side {@code ConsensusPrefetchingQueue}. + * Carries the maximum data timestamp observed so far for a region, enabling client-side watermark + * computation even when a region is idle (no new data). + */ + WATERMARK((short) 7), ; private final short type; diff --git a/iotdb-client/subscription/src/main/java/org/apache/iotdb/rpc/subscription/payload/poll/TopicProgress.java b/iotdb-client/subscription/src/main/java/org/apache/iotdb/rpc/subscription/payload/poll/TopicProgress.java new file mode 100644 index 0000000000000..35dfd2e0ca33d --- /dev/null +++ b/iotdb-client/subscription/src/main/java/org/apache/iotdb/rpc/subscription/payload/poll/TopicProgress.java @@ -0,0 +1,94 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.iotdb.rpc.subscription.payload.poll; + +import org.apache.tsfile.utils.PublicBAOS; +import org.apache.tsfile.utils.ReadWriteIOUtils; + +import java.io.DataOutputStream; +import java.io.IOException; +import java.nio.ByteBuffer; +import java.util.Collections; +import java.util.LinkedHashMap; +import java.util.Map; +import java.util.Objects; + +public class TopicProgress { + + private final Map regionProgress; + + public TopicProgress(final Map regionProgress) { + this.regionProgress = + regionProgress == null + ? 
Collections.emptyMap() + : Collections.unmodifiableMap(new LinkedHashMap<>(regionProgress)); + } + + public Map getRegionProgress() { + return regionProgress; + } + + public static ByteBuffer serialize(final TopicProgress progress) throws IOException { + try (final PublicBAOS byteArrayOutputStream = new PublicBAOS(); + final DataOutputStream outputStream = new DataOutputStream(byteArrayOutputStream)) { + progress.serialize(outputStream); + return ByteBuffer.wrap(byteArrayOutputStream.getBuf(), 0, byteArrayOutputStream.size()); + } + } + + public void serialize(final DataOutputStream stream) throws IOException { + ReadWriteIOUtils.write(regionProgress.size(), stream); + for (final Map.Entry entry : regionProgress.entrySet()) { + ReadWriteIOUtils.write(entry.getKey(), stream); + entry.getValue().serialize(stream); + } + } + + public static TopicProgress deserialize(final ByteBuffer buffer) { + final int size = ReadWriteIOUtils.readInt(buffer); + final Map regionProgress = new LinkedHashMap<>(size); + for (int i = 0; i < size; i++) { + regionProgress.put(ReadWriteIOUtils.readString(buffer), RegionProgress.deserialize(buffer)); + } + return new TopicProgress(regionProgress); + } + + @Override + public boolean equals(final Object obj) { + if (this == obj) { + return true; + } + if (!(obj instanceof TopicProgress)) { + return false; + } + final TopicProgress that = (TopicProgress) obj; + return Objects.equals(regionProgress, that.regionProgress); + } + + @Override + public int hashCode() { + return Objects.hash(regionProgress); + } + + @Override + public String toString() { + return "TopicProgress{" + "regionProgress=" + regionProgress + '}'; + } +} diff --git a/iotdb-client/subscription/src/main/java/org/apache/iotdb/rpc/subscription/payload/poll/WatermarkPayload.java b/iotdb-client/subscription/src/main/java/org/apache/iotdb/rpc/subscription/payload/poll/WatermarkPayload.java new file mode 100644 index 0000000000000..32dab88967497 --- /dev/null +++ 
b/iotdb-client/subscription/src/main/java/org/apache/iotdb/rpc/subscription/payload/poll/WatermarkPayload.java @@ -0,0 +1,82 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.iotdb.rpc.subscription.payload.poll; + +import org.apache.tsfile.utils.ReadWriteIOUtils; + +import java.io.DataOutputStream; +import java.io.IOException; +import java.nio.ByteBuffer; + +/** + * Payload for {@link SubscriptionPollResponseType#WATERMARK}. + * + *

Periodically injected by the server-side {@code ConsensusPrefetchingQueue} to report timestamp + * progress for a region. Carries the maximum data timestamp observed so far, enabling client-side + * {@code WatermarkProcessor} to advance its watermark even when a region is idle (no new data). + * + *

The {@code dataNodeId} identifies which DataNode emitted this watermark, allowing the client + * to track per-node progress across leader transitions. + */ +public class WatermarkPayload implements SubscriptionPollPayload { + + /** Maximum data timestamp observed across all InsertNodes in this region's queue. */ + private transient long watermarkTimestamp; + + /** The DataNode ID that emitted this watermark. */ + private transient int dataNodeId; + + public WatermarkPayload() {} + + public WatermarkPayload(final long watermarkTimestamp, final int dataNodeId) { + this.watermarkTimestamp = watermarkTimestamp; + this.dataNodeId = dataNodeId; + } + + public long getWatermarkTimestamp() { + return watermarkTimestamp; + } + + public int getDataNodeId() { + return dataNodeId; + } + + @Override + public void serialize(final DataOutputStream stream) throws IOException { + ReadWriteIOUtils.write(watermarkTimestamp, stream); + ReadWriteIOUtils.write(dataNodeId, stream); + } + + @Override + public SubscriptionPollPayload deserialize(final ByteBuffer buffer) { + watermarkTimestamp = ReadWriteIOUtils.readLong(buffer); + dataNodeId = ReadWriteIOUtils.readInt(buffer); + return this; + } + + @Override + public String toString() { + return "WatermarkPayload{watermarkTimestamp=" + + watermarkTimestamp + + ", dataNodeId=" + + dataNodeId + + '}'; + } +} diff --git a/iotdb-client/subscription/src/main/java/org/apache/iotdb/rpc/subscription/payload/poll/WriterId.java b/iotdb-client/subscription/src/main/java/org/apache/iotdb/rpc/subscription/payload/poll/WriterId.java new file mode 100644 index 0000000000000..ce21e07fe008d --- /dev/null +++ b/iotdb-client/subscription/src/main/java/org/apache/iotdb/rpc/subscription/payload/poll/WriterId.java @@ -0,0 +1,97 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.iotdb.rpc.subscription.payload.poll; + +import org.apache.tsfile.utils.ReadWriteIOUtils; + +import java.io.DataOutputStream; +import java.io.IOException; +import java.nio.ByteBuffer; +import java.util.Objects; + +public class WriterId { + + private final String regionId; + private final int nodeId; + private final long writerEpoch; + + public WriterId(final String regionId, final int nodeId, final long writerEpoch) { + this.regionId = regionId; + this.nodeId = nodeId; + this.writerEpoch = writerEpoch; + } + + public String getRegionId() { + return regionId; + } + + public int getNodeId() { + return nodeId; + } + + public long getWriterEpoch() { + return writerEpoch; + } + + public void serialize(final DataOutputStream stream) throws IOException { + ReadWriteIOUtils.write(regionId, stream); + ReadWriteIOUtils.write(nodeId, stream); + ReadWriteIOUtils.write(writerEpoch, stream); + } + + public static WriterId deserialize(final ByteBuffer buffer) { + return new WriterId( + ReadWriteIOUtils.readString(buffer), + ReadWriteIOUtils.readInt(buffer), + ReadWriteIOUtils.readLong(buffer)); + } + + @Override + public boolean equals(final Object obj) { + if (this == obj) { + return true; + } + if (!(obj instanceof WriterId)) { + return false; + } + final WriterId that = (WriterId) obj; + return nodeId == that.nodeId + && writerEpoch == that.writerEpoch + && 
Objects.equals(regionId, that.regionId); + } + + @Override + public int hashCode() { + return Objects.hash(regionId, nodeId, writerEpoch); + } + + @Override + public String toString() { + return "WriterId{" + + "regionId='" + + regionId + + '\'' + + ", nodeId=" + + nodeId + + ", writerEpoch=" + + writerEpoch + + '}'; + } +} diff --git a/iotdb-client/subscription/src/main/java/org/apache/iotdb/rpc/subscription/payload/poll/WriterProgress.java b/iotdb-client/subscription/src/main/java/org/apache/iotdb/rpc/subscription/payload/poll/WriterProgress.java new file mode 100644 index 0000000000000..f38ea770e8ff6 --- /dev/null +++ b/iotdb-client/subscription/src/main/java/org/apache/iotdb/rpc/subscription/payload/poll/WriterProgress.java @@ -0,0 +1,77 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.iotdb.rpc.subscription.payload.poll; + +import org.apache.tsfile.utils.ReadWriteIOUtils; + +import java.io.DataOutputStream; +import java.io.IOException; +import java.nio.ByteBuffer; +import java.util.Objects; + +public class WriterProgress { + + private final long physicalTime; + private final long localSeq; + + public WriterProgress(final long physicalTime, final long localSeq) { + this.physicalTime = physicalTime; + this.localSeq = localSeq; + } + + public long getPhysicalTime() { + return physicalTime; + } + + public long getLocalSeq() { + return localSeq; + } + + public void serialize(final DataOutputStream stream) throws IOException { + ReadWriteIOUtils.write(physicalTime, stream); + ReadWriteIOUtils.write(localSeq, stream); + } + + public static WriterProgress deserialize(final ByteBuffer buffer) { + return new WriterProgress(ReadWriteIOUtils.readLong(buffer), ReadWriteIOUtils.readLong(buffer)); + } + + @Override + public boolean equals(final Object obj) { + if (this == obj) { + return true; + } + if (!(obj instanceof WriterProgress)) { + return false; + } + final WriterProgress that = (WriterProgress) obj; + return physicalTime == that.physicalTime && localSeq == that.localSeq; + } + + @Override + public int hashCode() { + return Objects.hash(physicalTime, localSeq); + } + + @Override + public String toString() { + return "WriterProgress{" + "physicalTime=" + physicalTime + ", localSeq=" + localSeq + '}'; + } +} diff --git a/iotdb-client/subscription/src/main/java/org/apache/iotdb/rpc/subscription/payload/request/PipeSubscribeRequestType.java b/iotdb-client/subscription/src/main/java/org/apache/iotdb/rpc/subscription/payload/request/PipeSubscribeRequestType.java index d649aa567ade4..9fcc1d86b0c75 100644 --- a/iotdb-client/subscription/src/main/java/org/apache/iotdb/rpc/subscription/payload/request/PipeSubscribeRequestType.java +++ 
b/iotdb-client/subscription/src/main/java/org/apache/iotdb/rpc/subscription/payload/request/PipeSubscribeRequestType.java @@ -31,6 +31,7 @@ public enum PipeSubscribeRequestType { CLOSE((short) 4), SUBSCRIBE((short) 5), UNSUBSCRIBE((short) 6), + SEEK((short) 7), ; private final short type; diff --git a/iotdb-client/subscription/src/main/java/org/apache/iotdb/rpc/subscription/payload/request/PipeSubscribeSeekReq.java b/iotdb-client/subscription/src/main/java/org/apache/iotdb/rpc/subscription/payload/request/PipeSubscribeSeekReq.java new file mode 100644 index 0000000000000..e2a78227a6dc5 --- /dev/null +++ b/iotdb-client/subscription/src/main/java/org/apache/iotdb/rpc/subscription/payload/request/PipeSubscribeSeekReq.java @@ -0,0 +1,165 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.iotdb.rpc.subscription.payload.request; + +import org.apache.iotdb.rpc.subscription.payload.poll.TopicProgress; +import org.apache.iotdb.service.rpc.thrift.TPipeSubscribeReq; + +import org.apache.tsfile.utils.PublicBAOS; +import org.apache.tsfile.utils.ReadWriteIOUtils; + +import java.io.DataOutputStream; +import java.io.IOException; +import java.nio.ByteBuffer; +import java.util.Collections; +import java.util.Objects; + +public class PipeSubscribeSeekReq extends TPipeSubscribeReq { + + /** Seek type constants. */ + public static final short SEEK_TO_BEGINNING = 1; + + public static final short SEEK_TO_END = 2; + public static final short SEEK_TO_TIMESTAMP = 3; + public static final short SEEK_TO_TOPIC_PROGRESS = 6; + public static final short SEEK_AFTER_TOPIC_PROGRESS = 7; + + private transient String topicName; + private transient short seekType; + private transient long timestamp; // only meaningful when seekType == SEEK_TO_TIMESTAMP + private transient TopicProgress topicProgress = new TopicProgress(Collections.emptyMap()); + + public String getTopicName() { + return topicName; + } + + public short getSeekType() { + return seekType; + } + + public long getTimestamp() { + return timestamp; + } + + public TopicProgress getTopicProgress() { + return topicProgress; + } + + /////////////////////////////// Thrift /////////////////////////////// + + /** + * Serialize the incoming parameters into {@code PipeSubscribeSeekReq}, called by the subscription + * client. 
+ */ + public static PipeSubscribeSeekReq toTPipeSubscribeReq( + final String topicName, final short seekType, final long timestamp) throws IOException { + return toTPipeSubscribeReq(topicName, seekType, timestamp, null); + } + + public static PipeSubscribeSeekReq toTPipeSubscribeReq( + final String topicName, final TopicProgress topicProgress) throws IOException { + return toTPipeSubscribeReq(topicName, SEEK_TO_TOPIC_PROGRESS, 0, topicProgress); + } + + public static PipeSubscribeSeekReq toTPipeSubscribeSeekAfterReq( + final String topicName, final TopicProgress topicProgress) throws IOException { + return toTPipeSubscribeReq(topicName, SEEK_AFTER_TOPIC_PROGRESS, 0, topicProgress); + } + + public static PipeSubscribeSeekReq toTPipeSubscribeReq( + final String topicName, + final short seekType, + final long timestamp, + final TopicProgress topicProgress) + throws IOException { + final PipeSubscribeSeekReq req = new PipeSubscribeSeekReq(); + + req.topicName = topicName; + req.seekType = seekType; + req.timestamp = timestamp; + req.topicProgress = + Objects.nonNull(topicProgress) ? topicProgress : new TopicProgress(Collections.emptyMap()); + + req.version = PipeSubscribeRequestVersion.VERSION_1.getVersion(); + req.type = PipeSubscribeRequestType.SEEK.getType(); + try (final PublicBAOS byteArrayOutputStream = new PublicBAOS(); + final DataOutputStream outputStream = new DataOutputStream(byteArrayOutputStream)) { + ReadWriteIOUtils.write(topicName, outputStream); + ReadWriteIOUtils.write(seekType, outputStream); + if (seekType == SEEK_TO_TIMESTAMP) { + ReadWriteIOUtils.write(timestamp, outputStream); + } else if (seekType == SEEK_TO_TOPIC_PROGRESS || seekType == SEEK_AFTER_TOPIC_PROGRESS) { + req.topicProgress.serialize(outputStream); + } + req.body = ByteBuffer.wrap(byteArrayOutputStream.getBuf(), 0, byteArrayOutputStream.size()); + } + + return req; + } + + /** + * Deserialize {@code TPipeSubscribeReq} to obtain parameters, called by the subscription server. 
+ */ + public static PipeSubscribeSeekReq fromTPipeSubscribeReq(final TPipeSubscribeReq seekReq) { + final PipeSubscribeSeekReq req = new PipeSubscribeSeekReq(); + + if (Objects.nonNull(seekReq.body) && seekReq.body.hasRemaining()) { + req.topicName = ReadWriteIOUtils.readString(seekReq.body); + req.seekType = ReadWriteIOUtils.readShort(seekReq.body); + if (req.seekType == SEEK_TO_TIMESTAMP) { + req.timestamp = ReadWriteIOUtils.readLong(seekReq.body); + } else if (req.seekType == SEEK_TO_TOPIC_PROGRESS + || req.seekType == SEEK_AFTER_TOPIC_PROGRESS) { + req.topicProgress = TopicProgress.deserialize(seekReq.body); + } + } + + req.version = seekReq.version; + req.type = seekReq.type; + req.body = seekReq.body; + + return req; + } + + /////////////////////////////// Object /////////////////////////////// + + @Override + public boolean equals(final Object obj) { + if (this == obj) { + return true; + } + if (obj == null || getClass() != obj.getClass()) { + return false; + } + final PipeSubscribeSeekReq that = (PipeSubscribeSeekReq) obj; + return Objects.equals(this.topicName, that.topicName) + && this.seekType == that.seekType + && this.timestamp == that.timestamp + && Objects.equals(this.topicProgress, that.topicProgress) + && this.version == that.version + && this.type == that.type + && Objects.equals(this.body, that.body); + } + + @Override + public int hashCode() { + return Objects.hash(topicName, seekType, timestamp, topicProgress, version, type, body); + } +} diff --git a/iotdb-client/subscription/src/main/java/org/apache/iotdb/rpc/subscription/payload/response/PipeSubscribeSeekResp.java b/iotdb-client/subscription/src/main/java/org/apache/iotdb/rpc/subscription/payload/response/PipeSubscribeSeekResp.java new file mode 100644 index 0000000000000..c6ea90d5bb069 --- /dev/null +++ b/iotdb-client/subscription/src/main/java/org/apache/iotdb/rpc/subscription/payload/response/PipeSubscribeSeekResp.java @@ -0,0 +1,80 @@ +/* + * Licensed to the Apache Software Foundation 
(ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.iotdb.rpc.subscription.payload.response; + +import org.apache.iotdb.common.rpc.thrift.TSStatus; +import org.apache.iotdb.service.rpc.thrift.TPipeSubscribeResp; + +import java.util.Objects; + +public class PipeSubscribeSeekResp extends TPipeSubscribeResp { + + /////////////////////////////// Thrift /////////////////////////////// + + /** + * Serialize the incoming parameters into {@code PipeSubscribeSeekResp}, called by the + * subscription server. + */ + public static PipeSubscribeSeekResp toTPipeSubscribeResp(final TSStatus status) { + final PipeSubscribeSeekResp resp = new PipeSubscribeSeekResp(); + + resp.status = status; + resp.version = PipeSubscribeResponseVersion.VERSION_1.getVersion(); + resp.type = PipeSubscribeResponseType.ACK.getType(); + + return resp; + } + + /** + * Deserialize {@code TPipeSubscribeResp} to obtain parameters, called by the subscription client. 
+ */ + public static PipeSubscribeSeekResp fromTPipeSubscribeResp(final TPipeSubscribeResp seekResp) { + final PipeSubscribeSeekResp resp = new PipeSubscribeSeekResp(); + + resp.status = seekResp.status; + resp.version = seekResp.version; + resp.type = seekResp.type; + resp.body = seekResp.body; + + return resp; + } + + /////////////////////////////// Object /////////////////////////////// + + @Override + public boolean equals(final Object obj) { + if (this == obj) { + return true; + } + if (obj == null || getClass() != obj.getClass()) { + return false; + } + final PipeSubscribeSeekResp that = (PipeSubscribeSeekResp) obj; + return Objects.equals(this.status, that.status) + && this.version == that.version + && this.type == that.type + && Objects.equals(this.body, that.body); + } + + @Override + public int hashCode() { + return Objects.hash(status, version, type, body); + } +} diff --git a/iotdb-client/subscription/src/main/java/org/apache/iotdb/session/subscription/consumer/ISubscriptionTablePullConsumer.java b/iotdb-client/subscription/src/main/java/org/apache/iotdb/session/subscription/consumer/ISubscriptionTablePullConsumer.java index 0168a1ba3846d..abc5e2de2ff92 100644 --- a/iotdb-client/subscription/src/main/java/org/apache/iotdb/session/subscription/consumer/ISubscriptionTablePullConsumer.java +++ b/iotdb-client/subscription/src/main/java/org/apache/iotdb/session/subscription/consumer/ISubscriptionTablePullConsumer.java @@ -20,6 +20,7 @@ package org.apache.iotdb.session.subscription.consumer; import org.apache.iotdb.rpc.subscription.exception.SubscriptionException; +import org.apache.iotdb.rpc.subscription.payload.poll.TopicProgress; import org.apache.iotdb.session.subscription.payload.SubscriptionMessage; import java.time.Duration; @@ -179,6 +180,19 @@ List poll(final Set topicNames, final Duration time void commitAsync( final Iterable messages, final AsyncCommitCallback callback); + void seekToBeginning(final String topicName) throws SubscriptionException; + 
+ void seekToEnd(final String topicName) throws SubscriptionException; + + TopicProgress positions(final String topicName) throws SubscriptionException; + + TopicProgress committedPositions(final String topicName) throws SubscriptionException; + + void seek(final String topicName, final TopicProgress topicProgress) throws SubscriptionException; + + void seekAfter(final String topicName, final TopicProgress topicProgress) + throws SubscriptionException; + /** * Retrieves the unique identifier of this consumer. If no consumer ID was provided at the time of * consumer construction, a random globally unique ID is automatically assigned after the consumer diff --git a/iotdb-client/subscription/src/main/java/org/apache/iotdb/session/subscription/consumer/ISubscriptionTreePullConsumer.java b/iotdb-client/subscription/src/main/java/org/apache/iotdb/session/subscription/consumer/ISubscriptionTreePullConsumer.java index 803b7c51224a4..fc9d55bfe218a 100644 --- a/iotdb-client/subscription/src/main/java/org/apache/iotdb/session/subscription/consumer/ISubscriptionTreePullConsumer.java +++ b/iotdb-client/subscription/src/main/java/org/apache/iotdb/session/subscription/consumer/ISubscriptionTreePullConsumer.java @@ -20,6 +20,7 @@ package org.apache.iotdb.session.subscription.consumer; import org.apache.iotdb.rpc.subscription.exception.SubscriptionException; +import org.apache.iotdb.rpc.subscription.payload.poll.TopicProgress; import org.apache.iotdb.session.subscription.payload.SubscriptionMessage; import java.time.Duration; @@ -179,6 +180,19 @@ List poll(final Set topicNames, final Duration time void commitAsync( final Iterable messages, final AsyncCommitCallback callback); + void seekToBeginning(final String topicName) throws SubscriptionException; + + void seekToEnd(final String topicName) throws SubscriptionException; + + TopicProgress positions(final String topicName) throws SubscriptionException; + + TopicProgress committedPositions(final String topicName) throws 
SubscriptionException; + + void seek(final String topicName, final TopicProgress topicProgress) throws SubscriptionException; + + void seekAfter(final String topicName, final TopicProgress topicProgress) + throws SubscriptionException; + /** * Retrieves the unique identifier of this consumer. If no consumer ID was provided at the time of * consumer construction, a random globally unique ID is automatically assigned after the consumer diff --git a/iotdb-client/subscription/src/main/java/org/apache/iotdb/session/subscription/consumer/base/AbstractSubscriptionConsumer.java b/iotdb-client/subscription/src/main/java/org/apache/iotdb/session/subscription/consumer/base/AbstractSubscriptionConsumer.java index c290af2b67b98..3008704f26a5d 100644 --- a/iotdb-client/subscription/src/main/java/org/apache/iotdb/session/subscription/consumer/base/AbstractSubscriptionConsumer.java +++ b/iotdb-client/subscription/src/main/java/org/apache/iotdb/session/subscription/consumer/base/AbstractSubscriptionConsumer.java @@ -34,11 +34,17 @@ import org.apache.iotdb.rpc.subscription.payload.poll.FileInitPayload; import org.apache.iotdb.rpc.subscription.payload.poll.FilePiecePayload; import org.apache.iotdb.rpc.subscription.payload.poll.FileSealPayload; +import org.apache.iotdb.rpc.subscription.payload.poll.RegionProgress; import org.apache.iotdb.rpc.subscription.payload.poll.SubscriptionCommitContext; import org.apache.iotdb.rpc.subscription.payload.poll.SubscriptionPollPayload; import org.apache.iotdb.rpc.subscription.payload.poll.SubscriptionPollResponse; import org.apache.iotdb.rpc.subscription.payload.poll.SubscriptionPollResponseType; import org.apache.iotdb.rpc.subscription.payload.poll.TabletsPayload; +import org.apache.iotdb.rpc.subscription.payload.poll.TopicProgress; +import org.apache.iotdb.rpc.subscription.payload.poll.WatermarkPayload; +import org.apache.iotdb.rpc.subscription.payload.poll.WriterId; +import org.apache.iotdb.rpc.subscription.payload.poll.WriterProgress; +import 
org.apache.iotdb.rpc.subscription.payload.request.PipeSubscribeSeekReq; import org.apache.iotdb.session.subscription.consumer.AsyncCommitCallback; import org.apache.iotdb.session.subscription.payload.SubscriptionMessage; import org.apache.iotdb.session.subscription.payload.SubscriptionMessageType; @@ -77,6 +83,7 @@ import java.util.concurrent.Callable; import java.util.concurrent.CancellationException; import java.util.concurrent.CompletableFuture; +import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.ExecutionException; import java.util.concurrent.Future; import java.util.concurrent.ScheduledFuture; @@ -88,6 +95,7 @@ import static org.apache.iotdb.rpc.subscription.payload.poll.SubscriptionPollResponseType.FILE_INIT; import static org.apache.iotdb.rpc.subscription.payload.poll.SubscriptionPollResponseType.TABLETS; import static org.apache.iotdb.rpc.subscription.payload.poll.SubscriptionPollResponseType.TERMINATION; +import static org.apache.iotdb.rpc.subscription.payload.poll.SubscriptionPollResponseType.WATERMARK; import static org.apache.iotdb.session.subscription.util.SetPartitioner.partition; abstract class AbstractSubscriptionConsumer implements AutoCloseable { @@ -121,6 +129,26 @@ abstract class AbstractSubscriptionConsumer implements AutoCloseable { private final int connectionTimeoutInMs; private final int maxPollParallelism; + /** + * The latest watermark timestamp received from the server. Updated when WATERMARK events are + * processed and stripped. Consumer users can query this to check timestamp progress. + */ + protected volatile long latestWatermarkTimestamp = Long.MIN_VALUE; + + /** Per-topic current positions used as the consumer-guided positioning hint in poll requests. */ + private final Map currentPositionsByTopic = new ConcurrentHashMap<>(); + + /** Per-topic committed positions used as durable recovery points for explicit seek/checkpoint. 
*/ + private final Map committedPositionsByTopic = new ConcurrentHashMap<>(); + + /** + * Ack contexts for consensus messages that were already processed locally but could not be + * committed because the original provider became unavailable. They are flushed after the same + * topic+region is observed again from a live provider. + */ + private final Map> pendingRedirectAcksByTopicRegion = + new ConcurrentHashMap<>(); + @SuppressWarnings("java:S3077") protected volatile Map subscribedTopics = new HashMap<>(); @@ -376,6 +404,106 @@ private void unsubscribe(Set topicNames, final boolean needParse) providers.acquireReadLock(); try { unsubscribeWithRedirection(topicNames); + topicNames.forEach(this::clearPendingRedirectAcks); + } finally { + providers.releaseReadLock(); + } + } + + /////////////////////////////// seek /////////////////////////////// + + /** + * Seeks to the earliest available WAL position. Actual position depends on WAL retention — old + * segments may have been reclaimed. + */ + public void seekToBeginning(final String topicName) throws SubscriptionException { + checkIfOpened(); + seekInternal(topicName, PipeSubscribeSeekReq.SEEK_TO_BEGINNING, 0); + clearCurrentPositions(topicName); + clearCommittedPositions(topicName); + clearPendingRedirectAcks(topicName); + } + + /** Seeks to the current WAL tail. Only newly written data will be consumed after this. */ + public void seekToEnd(final String topicName) throws SubscriptionException { + checkIfOpened(); + seekInternal(topicName, PipeSubscribeSeekReq.SEEK_TO_END, 0); + clearCurrentPositions(topicName); + clearCommittedPositions(topicName); + clearPendingRedirectAcks(topicName); + } + + /** + * Returns the latest observed per-region positions for the given topic. This is the consumer's + * current fetch position hint and is sent back to the server on subsequent poll requests. 
+ */ + public TopicProgress positions(final String topicName) throws SubscriptionException { + checkIfOpened(); + final TopicProgress progress = currentPositionsByTopic.get(topicName); + return Objects.nonNull(progress) + ? new TopicProgress(progress.getRegionProgress()) + : new TopicProgress(Collections.emptyMap()); + } + + /** + * Returns the latest committed per-region positions for the given topic. This is the recoverable + * checkpoint position that should be persisted by callers. + */ + public TopicProgress committedPositions(final String topicName) throws SubscriptionException { + checkIfOpened(); + final TopicProgress progress = committedPositionsByTopic.get(topicName); + return Objects.nonNull(progress) + ? new TopicProgress(progress.getRegionProgress()) + : new TopicProgress(Collections.emptyMap()); + } + + public void seek(final String topicName, final TopicProgress topicProgress) + throws SubscriptionException { + checkIfOpened(); + final TopicProgress safeProgress = + Objects.nonNull(topicProgress) ? topicProgress : new TopicProgress(Collections.emptyMap()); + seekInternalTopicProgress(topicName, safeProgress); + overlayCurrentPositions(topicName, safeProgress); + overlayCommittedPositions(topicName, safeProgress); + clearPendingRedirectAcks(topicName); + } + + public void seekAfter(final String topicName, final TopicProgress topicProgress) + throws SubscriptionException { + checkIfOpened(); + final TopicProgress safeProgress = + Objects.nonNull(topicProgress) ? 
topicProgress : new TopicProgress(Collections.emptyMap()); + seekAfterInternalTopicProgress(topicName, safeProgress); + overlayCurrentPositions(topicName, safeProgress); + overlayCommittedPositions(topicName, safeProgress); + clearPendingRedirectAcks(topicName); + } + + private void seekInternal(final String topicName, final short seekType, final long timestamp) + throws SubscriptionException { + providers.acquireReadLock(); + try { + seekWithRedirection(topicName, seekType, timestamp); + } finally { + providers.releaseReadLock(); + } + } + + private void seekInternalTopicProgress(final String topicName, final TopicProgress topicProgress) + throws SubscriptionException { + providers.acquireReadLock(); + try { + seekWithRedirectionTopicProgress(topicName, topicProgress); + } finally { + providers.releaseReadLock(); + } + } + + private void seekAfterInternalTopicProgress( + final String topicName, final TopicProgress topicProgress) throws SubscriptionException { + providers.acquireReadLock(); + try { + seekAfterWithRedirectionTopicProgress(topicName, topicProgress); } finally { providers.releaseReadLock(); } @@ -522,9 +650,44 @@ private Path getFilePath( unsubscribe(Collections.singleton(topicNameToUnsubscribe), false); return Optional.empty(); }); + put( + WATERMARK, + (resp, timer) -> { + final SubscriptionCommitContext commitContext = resp.getCommitContext(); + final WatermarkPayload payload = (WatermarkPayload) resp.getPayload(); + return Optional.of( + new SubscriptionMessage( + commitContext, payload.getWatermarkTimestamp())); + }); } }); + /** + * Returns the set of DataNode IDs for providers that are currently available. Used by subclasses + * to detect unavailable DataNodes and notify the epoch ordering processor. 
+ */ + protected Set getAvailableDataNodeIds() { + providers.acquireReadLock(); + try { + final Set ids = new HashSet<>(); + for (final AbstractSubscriptionProvider provider : providers.getAllAvailableProviders()) { + ids.add(provider.getDataNodeId()); + } + return ids; + } finally { + providers.releaseReadLock(); + } + } + + /** + * Returns the latest watermark timestamp received from the server. This tracks the maximum data + * timestamp observed across all polled regions. Returns {@code Long.MIN_VALUE} if no watermark + * has been received yet. + */ + public long getLatestWatermarkTimestamp() { + return latestWatermarkTimestamp; + } + protected List multiplePoll( /* @NotNull */ final Set topicNames, final long timeoutMs) { if (topicNames.isEmpty()) { @@ -685,6 +848,8 @@ private List singlePoll( // add all current messages to result messages messages.addAll(currentMessages); + advanceCurrentPositions(currentMessages); + flushPendingRedirectAcks(currentMessages); // TODO: maybe we can poll a few more times if (!messages.isEmpty()) { @@ -1079,7 +1244,7 @@ private List pollInternal( } // ignore SubscriptionConnectionException to improve poll auto retry try { - return provider.poll(topicNames, timeoutMs); + return provider.poll(topicNames, timeoutMs, buildCurrentProgressByTopic(topicNames)); } catch (final SubscriptionConnectionException ignored) { return Collections.emptyList(); } @@ -1155,7 +1320,59 @@ protected void ack(final Iterable messages) throws Subscrip for (final Entry> entry : dataNodeIdToSubscriptionCommitContexts.entrySet()) { commitInternal(entry.getKey(), entry.getValue(), false); + advanceCommittedPositions(entry.getValue()); + } + } + + protected Set ackWithPartialProgress( + final Iterable messages) throws SubscriptionException { + final Map> dataNodeIdToMessages = new HashMap<>(); + for (final SubscriptionMessage message : messages) { + dataNodeIdToMessages + .computeIfAbsent(message.getCommitContext().getDataNodeId(), ignored -> new 
ArrayList<>()) + .add(message); + } + + final Set removableMessages = new HashSet<>(); + for (final Entry> entry : dataNodeIdToMessages.entrySet()) { + final List commitContexts = + entry.getValue().stream() + .map(SubscriptionMessage::getCommitContext) + .collect(Collectors.toList()); + try { + commitInternal(entry.getKey(), commitContexts, false); + advanceCommittedPositions(commitContexts); + removableMessages.addAll(entry.getValue()); + } catch (final SubscriptionConnectionException e) { + int stagedCount = 0; + int retainedCount = 0; + for (final SubscriptionMessage message : entry.getValue()) { + if (isConsensusCommitContext(message.getCommitContext())) { + stagePendingRedirectAck(message.getCommitContext()); + removableMessages.add(message); + stagedCount++; + } else { + retainedCount++; + } + } + if (stagedCount > 0) { + LOGGER.warn( + "{} staged {} consensus ack(s) for redirect after provider {} became unavailable", + this, + stagedCount, + entry.getKey()); + } + if (retainedCount > 0) { + LOGGER.warn( + "{} keep {} non-consensus ack(s) pending after provider {} commit failure", + this, + retainedCount, + entry.getKey(), + e); + } + } } + return removableMessages; } protected void nack(final Iterable messages) throws SubscriptionException { @@ -1383,6 +1600,346 @@ private void unsubscribeWithRedirection(final Set topicNames) throw new SubscriptionRuntimeCriticalException(errorMessage); } + /** + * Sends seek request to ALL available providers. Unlike subscribe/unsubscribe, seek must reach + * every node because data regions for the topic may be distributed across different nodes. 
+ */ + private void seekWithRedirection( + final String topicName, final short seekType, final long timestamp) + throws SubscriptionException { + final List providers = this.providers.getAllAvailableProviders(); + if (providers.isEmpty()) { + throw new SubscriptionConnectionException( + String.format( + "Cluster has no available subscription providers when %s seek topic %s", + this, topicName)); + } + boolean anySuccess = false; + for (final AbstractSubscriptionProvider provider : providers) { + try { + provider.seek(topicName, seekType, timestamp); + anySuccess = true; + } catch (final Exception e) { + LOGGER.warn( + "{} failed to seek topic {} from subscription provider {}, continuing with other providers...", + this, + topicName, + provider, + e); + } + } + if (!anySuccess) { + final String errorMessage = + String.format( + "%s failed to seek topic %s from all available subscription providers %s", + this, topicName, providers); + LOGGER.warn(errorMessage); + throw new SubscriptionRuntimeCriticalException(errorMessage); + } + } + + private void seekWithRedirectionTopicProgress( + final String topicName, final TopicProgress topicProgress) throws SubscriptionException { + final List providers = this.providers.getAllAvailableProviders(); + if (providers.isEmpty()) { + throw new SubscriptionConnectionException( + String.format( + "Cluster has no available subscription providers when %s seek topic %s", + this, topicName)); + } + boolean anySuccess = false; + for (final AbstractSubscriptionProvider provider : providers) { + try { + provider.seekToTopicProgress(topicName, topicProgress); + anySuccess = true; + } catch (final Exception e) { + LOGGER.warn( + "{} failed to seek topic {} to topicProgress(regionCount={}) from provider {}, continuing...", + this, + topicName, + topicProgress.getRegionProgress().size(), + provider, + e); + } + } + if (!anySuccess) { + final String errorMessage = + String.format( + "%s failed to seek topic %s to topicProgress(regionCount=%d) 
from all providers %s", + this, topicName, topicProgress.getRegionProgress().size(), providers); + LOGGER.warn(errorMessage); + throw new SubscriptionRuntimeCriticalException(errorMessage); + } + } + + private void seekAfterWithRedirectionTopicProgress( + final String topicName, final TopicProgress topicProgress) throws SubscriptionException { + final List providers = this.providers.getAllAvailableProviders(); + if (providers.isEmpty()) { + throw new SubscriptionConnectionException( + String.format( + "Cluster has no available subscription providers when %s seekAfter topic %s", + this, topicName)); + } + boolean anySuccess = false; + for (final AbstractSubscriptionProvider provider : providers) { + try { + provider.seekAfterTopicProgress(topicName, topicProgress); + anySuccess = true; + } catch (final Exception e) { + LOGGER.warn( + "{} failed to seekAfter topic {} to topicProgress(regionCount={}) from provider {}, continuing...", + this, + topicName, + topicProgress.getRegionProgress().size(), + provider, + e); + } + } + if (!anySuccess) { + final String errorMessage = + String.format( + "%s failed to seekAfter topic %s to topicProgress(regionCount=%d) from all providers %s", + this, topicName, topicProgress.getRegionProgress().size(), providers); + LOGGER.warn(errorMessage); + throw new SubscriptionRuntimeCriticalException(errorMessage); + } + } + + private Map buildCurrentProgressByTopic(final Set topicNames) { + final Map result = new HashMap<>(); + for (final String topicName : topicNames) { + final TopicProgress topicProgress = currentPositionsByTopic.get(topicName); + if (Objects.isNull(topicProgress) || topicProgress.getRegionProgress().isEmpty()) { + continue; + } + result.put(topicName, new TopicProgress(topicProgress.getRegionProgress())); + } + return result; + } + + private void advanceCurrentPositions(final List messages) { + for (final SubscriptionMessage message : messages) { + final SubscriptionCommitContext commitContext = 
message.getCommitContext(); + if (Objects.isNull(commitContext) || Objects.isNull(commitContext.getTopicName())) { + continue; + } + mergeTopicProgress( + currentPositionsByTopic, + commitContext.getTopicName(), + extractWriterId(commitContext), + extractWriterProgress(commitContext)); + } + } + + private void advanceCommittedPositions( + final List subscriptionCommitContexts) { + for (final SubscriptionCommitContext commitContext : subscriptionCommitContexts) { + if (Objects.isNull(commitContext) || Objects.isNull(commitContext.getTopicName())) { + continue; + } + mergeTopicProgress( + committedPositionsByTopic, + commitContext.getTopicName(), + extractWriterId(commitContext), + extractWriterProgress(commitContext)); + } + } + + private boolean isConsensusCommitContext(final SubscriptionCommitContext commitContext) { + return Objects.nonNull(commitContext) + && Objects.nonNull(commitContext.getWriterId()) + && Objects.nonNull(commitContext.getWriterProgress()) + && Objects.nonNull(commitContext.getRegionId()) + && !commitContext.getRegionId().isEmpty(); + } + + private String buildTopicRegionKey(final SubscriptionCommitContext commitContext) { + return commitContext.getTopicName() + '\u0001' + commitContext.getRegionId(); + } + + private void stagePendingRedirectAck(final SubscriptionCommitContext commitContext) { + pendingRedirectAcksByTopicRegion + .computeIfAbsent( + buildTopicRegionKey(commitContext), ignored -> ConcurrentHashMap.newKeySet()) + .add(commitContext); + } + + private void flushPendingRedirectAcks(final List currentMessages) { + final Map redirectTargetByTopicRegion = new HashMap<>(); + for (final SubscriptionMessage message : currentMessages) { + final SubscriptionCommitContext commitContext = message.getCommitContext(); + if (!isConsensusCommitContext(commitContext)) { + continue; + } + redirectTargetByTopicRegion.put( + buildTopicRegionKey(commitContext), commitContext.getDataNodeId()); + } + + for (final Entry entry : 
redirectTargetByTopicRegion.entrySet()) { + final Set pendingContexts = + pendingRedirectAcksByTopicRegion.get(entry.getKey()); + if (Objects.isNull(pendingContexts) || pendingContexts.isEmpty()) { + continue; + } + + final List contextsToRedirect = new ArrayList<>(pendingContexts); + try { + commitInternal(entry.getValue(), contextsToRedirect, false); + advanceCommittedPositions(contextsToRedirect); + contextsToRedirect.forEach(pendingContexts::remove); + if (pendingContexts.isEmpty()) { + pendingRedirectAcksByTopicRegion.remove(entry.getKey(), pendingContexts); + } + } catch (final SubscriptionException e) { + LOGGER.warn( + "{} failed to redirect {} pending consensus ack(s) for {} via provider {}", + this, + contextsToRedirect.size(), + entry.getKey(), + entry.getValue(), + e); + } + } + } + + private boolean isNewerPosition( + final long newEpoch, final long newSyncIndex, final long oldEpoch, final long oldSyncIndex) { + return newEpoch > oldEpoch || (newEpoch == oldEpoch && newSyncIndex > oldSyncIndex); + } + + private void clearCurrentPositions(final String topicName) { + currentPositionsByTopic.remove(topicName); + } + + private void clearCommittedPositions(final String topicName) { + committedPositionsByTopic.remove(topicName); + } + + private void clearPendingRedirectAcks(final String topicName) { + final String prefix = topicName + '\u0001'; + pendingRedirectAcksByTopicRegion.keySet().removeIf(key -> key.startsWith(prefix)); + } + + private void setCurrentPositions(final String topicName, final TopicProgress topicProgress) { + if (Objects.isNull(topicProgress) || topicProgress.getRegionProgress().isEmpty()) { + currentPositionsByTopic.remove(topicName); + return; + } + currentPositionsByTopic.put(topicName, new TopicProgress(topicProgress.getRegionProgress())); + } + + private void setCommittedPositions(final String topicName, final TopicProgress topicProgress) { + if (Objects.isNull(topicProgress) || topicProgress.getRegionProgress().isEmpty()) { + 
committedPositionsByTopic.remove(topicName); + return; + } + committedPositionsByTopic.put(topicName, new TopicProgress(topicProgress.getRegionProgress())); + } + + private void overlayCurrentPositions(final String topicName, final TopicProgress topicProgress) { + overlayTopicProgress(currentPositionsByTopic, topicName, topicProgress); + } + + private void overlayCommittedPositions( + final String topicName, final TopicProgress topicProgress) { + overlayTopicProgress(committedPositionsByTopic, topicName, topicProgress); + } + + private void overlayTopicProgress( + final Map progressByTopic, + final String topicName, + final TopicProgress topicProgress) { + if (Objects.isNull(topicName) + || topicName.isEmpty() + || Objects.isNull(topicProgress) + || topicProgress.getRegionProgress().isEmpty()) { + return; + } + progressByTopic.compute( + topicName, + (ignored, oldTopicProgress) -> { + final Map mergedRegionProgress = + Objects.nonNull(oldTopicProgress) + ? new HashMap<>(oldTopicProgress.getRegionProgress()) + : new HashMap<>(); + topicProgress + .getRegionProgress() + .forEach( + (regionId, regionProgress) -> { + if (Objects.isNull(regionId) + || regionId.isEmpty() + || Objects.isNull(regionProgress) + || regionProgress.getWriterPositions().isEmpty()) { + return; + } + mergedRegionProgress.put( + regionId, + new RegionProgress(new HashMap<>(regionProgress.getWriterPositions()))); + }); + return mergedRegionProgress.isEmpty() ? 
null : new TopicProgress(mergedRegionProgress); + }); + } + + private WriterId extractWriterId(final SubscriptionCommitContext commitContext) { + if (Objects.nonNull(commitContext.getWriterId())) { + return commitContext.getWriterId(); + } + if (Objects.isNull(commitContext.getRegionId()) || commitContext.getRegionId().isEmpty()) { + return null; + } + return new WriterId(commitContext.getRegionId(), commitContext.getDataNodeId(), 0L); + } + + private WriterProgress extractWriterProgress(final SubscriptionCommitContext commitContext) { + if (Objects.nonNull(commitContext.getWriterProgress())) { + return commitContext.getWriterProgress(); + } + if (commitContext.getLocalSeq() < 0) { + return null; + } + return new WriterProgress(commitContext.getPhysicalTime(), commitContext.getLocalSeq()); + } + + private void mergeTopicProgress( + final Map progressByTopic, + final String topicName, + final WriterId writerId, + final WriterProgress writerProgress) { + if (Objects.isNull(writerId) + || Objects.isNull(writerProgress) + || Objects.isNull(topicName) + || topicName.isEmpty()) { + return; + } + progressByTopic.compute( + topicName, + (key, oldTopicProgress) -> { + final Map regionProgressById = + Objects.nonNull(oldTopicProgress) + ? new HashMap<>(oldTopicProgress.getRegionProgress()) + : new HashMap<>(); + final RegionProgress oldRegionProgress = regionProgressById.get(writerId.getRegionId()); + final Map writerPositions = + Objects.nonNull(oldRegionProgress) + ? new HashMap<>(oldRegionProgress.getWriterPositions()) + : new HashMap<>(); + writerPositions.merge( + writerId, + writerProgress, + (oldVal, newVal) -> + isNewerPosition( + newVal.getPhysicalTime(), + newVal.getLocalSeq(), + oldVal.getPhysicalTime(), + oldVal.getLocalSeq()) + ? 
newVal + : oldVal); + regionProgressById.put(writerId.getRegionId(), new RegionProgress(writerPositions)); + return new TopicProgress(regionProgressById); + }); + } + Map fetchAllEndPointsWithRedirection() throws SubscriptionException { final List providers = this.providers.getAllAvailableProviders(); if (providers.isEmpty()) { diff --git a/iotdb-client/subscription/src/main/java/org/apache/iotdb/session/subscription/consumer/base/AbstractSubscriptionProvider.java b/iotdb-client/subscription/src/main/java/org/apache/iotdb/session/subscription/consumer/base/AbstractSubscriptionProvider.java index 7f3582d195d6a..1faadf196a955 100644 --- a/iotdb-client/subscription/src/main/java/org/apache/iotdb/session/subscription/consumer/base/AbstractSubscriptionProvider.java +++ b/iotdb-client/subscription/src/main/java/org/apache/iotdb/session/subscription/consumer/base/AbstractSubscriptionProvider.java @@ -37,11 +37,13 @@ import org.apache.iotdb.rpc.subscription.payload.poll.SubscriptionPollRequest; import org.apache.iotdb.rpc.subscription.payload.poll.SubscriptionPollRequestType; import org.apache.iotdb.rpc.subscription.payload.poll.SubscriptionPollResponse; +import org.apache.iotdb.rpc.subscription.payload.poll.TopicProgress; import org.apache.iotdb.rpc.subscription.payload.request.PipeSubscribeCloseReq; import org.apache.iotdb.rpc.subscription.payload.request.PipeSubscribeCommitReq; import org.apache.iotdb.rpc.subscription.payload.request.PipeSubscribeHandshakeReq; import org.apache.iotdb.rpc.subscription.payload.request.PipeSubscribeHeartbeatReq; import org.apache.iotdb.rpc.subscription.payload.request.PipeSubscribePollReq; +import org.apache.iotdb.rpc.subscription.payload.request.PipeSubscribeSeekReq; import org.apache.iotdb.rpc.subscription.payload.request.PipeSubscribeSubscribeReq; import org.apache.iotdb.rpc.subscription.payload.request.PipeSubscribeUnsubscribeReq; import org.apache.iotdb.rpc.subscription.payload.response.PipeSubscribeHandshakeResp; @@ -59,6 +61,7 @@ 
import org.slf4j.LoggerFactory; import java.io.IOException; +import java.util.Collections; import java.util.HashMap; import java.util.List; import java.util.Map; @@ -332,14 +335,107 @@ Map unsubscribe(final Set topicNames) throws Subscr return unsubscribeResp.getTopics(); } + void seek(final String topicName, final short seekType, final long timestamp) + throws SubscriptionException { + final PipeSubscribeSeekReq req; + try { + req = PipeSubscribeSeekReq.toTPipeSubscribeReq(topicName, seekType, timestamp); + } catch (final IOException e) { + LOGGER.warn( + "IOException occurred when SubscriptionProvider {} serialize seek request for topic {}", + this, + topicName, + e); + throw new SubscriptionRuntimeNonCriticalException(e.getMessage(), e); + } + final TPipeSubscribeResp resp; + try { + resp = getSessionConnection().pipeSubscribe(req); + } catch (final TException | IoTDBConnectionException e) { + LOGGER.warn( + "TException/IoTDBConnectionException occurred when SubscriptionProvider {} seek with request for topic {}, set SubscriptionProvider unavailable", + this, + topicName, + e); + setUnavailable(); + throw new SubscriptionConnectionException(e.getMessage(), e); + } + verifyPipeSubscribeSuccess(resp.status); + } + + void seekToTopicProgress(final String topicName, final TopicProgress topicProgress) + throws SubscriptionException { + final PipeSubscribeSeekReq req; + try { + req = PipeSubscribeSeekReq.toTPipeSubscribeReq(topicName, topicProgress); + } catch (final IOException e) { + LOGGER.warn( + "IOException occurred when SubscriptionProvider {} serialize seek(topicProgress) for topic {}", + this, + topicName, + e); + throw new SubscriptionRuntimeNonCriticalException(e.getMessage(), e); + } + final TPipeSubscribeResp resp; + try { + resp = getSessionConnection().pipeSubscribe(req); + } catch (final TException | IoTDBConnectionException e) { + LOGGER.warn( + "TException/IoTDBConnectionException occurred when SubscriptionProvider {} seek(topicProgress) for topic 
{}, set SubscriptionProvider unavailable", + this, + topicName, + e); + setUnavailable(); + throw new SubscriptionConnectionException(e.getMessage(), e); + } + verifyPipeSubscribeSuccess(resp.status); + } + + void seekAfterTopicProgress(final String topicName, final TopicProgress topicProgress) + throws SubscriptionException { + final PipeSubscribeSeekReq req; + try { + req = PipeSubscribeSeekReq.toTPipeSubscribeSeekAfterReq(topicName, topicProgress); + } catch (final IOException e) { + LOGGER.warn( + "IOException occurred when SubscriptionProvider {} serialize seekAfter(topicProgress) for topic {}", + this, + topicName, + e); + throw new SubscriptionRuntimeNonCriticalException(e.getMessage(), e); + } + final TPipeSubscribeResp resp; + try { + resp = getSessionConnection().pipeSubscribe(req); + } catch (final TException | IoTDBConnectionException e) { + LOGGER.warn( + "TException/IoTDBConnectionException occurred when SubscriptionProvider {} seekAfter(topicProgress) for topic {}, set SubscriptionProvider unavailable", + this, + topicName, + e); + setUnavailable(); + throw new SubscriptionConnectionException(e.getMessage(), e); + } + verifyPipeSubscribeSuccess(resp.status); + } + List poll(final Set topicNames, final long timeoutMs) throws SubscriptionException { + return poll(topicNames, timeoutMs, Collections.emptyMap()); + } + + List poll( + final Set topicNames, + final long timeoutMs, + final Map progressByTopic) + throws SubscriptionException { return poll( new SubscriptionPollRequest( SubscriptionPollRequestType.POLL.getType(), new PollPayload(topicNames), timeoutMs, - session.getThriftMaxFrameSize())); + session.getThriftMaxFrameSize(), + progressByTopic)); } List pollFile( diff --git a/iotdb-client/subscription/src/main/java/org/apache/iotdb/session/subscription/consumer/base/AbstractSubscriptionPullConsumer.java b/iotdb-client/subscription/src/main/java/org/apache/iotdb/session/subscription/consumer/base/AbstractSubscriptionPullConsumer.java index 
0c7478fa64dfb..37aed9204b8d8 100644 --- a/iotdb-client/subscription/src/main/java/org/apache/iotdb/session/subscription/consumer/base/AbstractSubscriptionPullConsumer.java +++ b/iotdb-client/subscription/src/main/java/org/apache/iotdb/session/subscription/consumer/base/AbstractSubscriptionPullConsumer.java @@ -21,8 +21,11 @@ import org.apache.iotdb.rpc.subscription.config.ConsumerConstant; import org.apache.iotdb.rpc.subscription.exception.SubscriptionException; +import org.apache.iotdb.rpc.subscription.payload.poll.TopicProgress; import org.apache.iotdb.session.subscription.consumer.AsyncCommitCallback; +import org.apache.iotdb.session.subscription.payload.PollResult; import org.apache.iotdb.session.subscription.payload.SubscriptionMessage; +import org.apache.iotdb.session.subscription.payload.SubscriptionMessageType; import org.apache.iotdb.session.subscription.util.CollectionUtils; import org.apache.iotdb.session.subscription.util.IdentifierUtils; @@ -30,6 +33,7 @@ import org.slf4j.LoggerFactory; import java.time.Duration; +import java.util.ArrayList; import java.util.Collections; import java.util.List; import java.util.Map; @@ -64,6 +68,8 @@ public abstract class AbstractSubscriptionPullConsumer extends AbstractSubscript private final boolean autoCommit; private final long autoCommitIntervalMs; + private final List processors = new ArrayList<>(); + private SortedMap> uncommittedMessages; private final AtomicBoolean isClosed = new AtomicBoolean(true); @@ -134,6 +140,24 @@ public synchronized void close() { return; } + // flush all processors and commit any remaining buffered messages + if (!processors.isEmpty()) { + final List flushed = new ArrayList<>(); + for (final SubscriptionMessageProcessor processor : processors) { + final List out = processor.flush(); + if (out != null) { + flushed.addAll(out); + } + } + if (!flushed.isEmpty() && autoCommit) { + try { + commitSync(flushed); + } catch (final SubscriptionException e) { + LOGGER.warn("Failed to commit 
flushed processor messages on close", e); + } + } + } + if (autoCommit) { // commit all uncommitted messages commitAllUncommittedMessages(); @@ -185,7 +209,7 @@ protected List poll(final Set topicNames, final lon } final List messages = multiplePoll(parsedTopicNames, timeoutMs); - if (messages.isEmpty()) { + if (messages.isEmpty() && processors.isEmpty()) { LOGGER.info( "SubscriptionPullConsumer {} poll empty message from topics {} after {} millisecond(s)", this, @@ -194,6 +218,35 @@ protected List poll(final Set topicNames, final lon return messages; } + // Apply processor chain if configured + List processed = messages; + if (!processors.isEmpty()) { + for (final SubscriptionMessageProcessor processor : processors) { + processed = processor.process(processed); + } + } + + // Update watermark timestamp before stripping watermark events + for (final SubscriptionMessage m : processed) { + if (m.getMessageType() == SubscriptionMessageType.WATERMARK.getType()) { + final long ts = m.getWatermarkTimestamp(); + if (ts > latestWatermarkTimestamp) { + latestWatermarkTimestamp = ts; + } + } + } + + // Strip system messages — they are only for processors, not for users + processed.removeIf( + m -> { + final short type = m.getMessageType(); + return type == SubscriptionMessageType.WATERMARK.getType(); + }); + + if (processed.isEmpty()) { + return processed; + } + // add to uncommitted messages if (autoCommit) { final long currentTimestamp = System.currentTimeMillis(); @@ -203,10 +256,54 @@ protected List poll(final Set topicNames, final lon } uncommittedMessages .computeIfAbsent(index, o -> new ConcurrentSkipListSet<>()) - .addAll(messages); + .addAll(processed); + } + + return processed; + } + + /////////////////////////////// processor /////////////////////////////// + + /** + * Adds a message processor to the pipeline. Processors are applied in order on each poll() call. 
+ * + * @param processor the processor to add + */ + protected AbstractSubscriptionPullConsumer addProcessor( + final SubscriptionMessageProcessor processor) { + processors.add(processor); + return this; + } + + /** + * Polls with processor metadata. Returns a {@link PollResult} containing the messages, the total + * number of buffered messages across all processors, and the current watermark. + */ + protected PollResult pollWithInfo(final long timeoutMs) throws SubscriptionException { + final List messages = poll(timeoutMs); + int totalBuffered = 0; + long watermark = -1; + for (final SubscriptionMessageProcessor processor : processors) { + totalBuffered += processor.getBufferedCount(); + if (processor instanceof WatermarkProcessor) { + watermark = ((WatermarkProcessor) processor).getWatermark(); + } } + return new PollResult(messages, totalBuffered, watermark); + } - return messages; + protected PollResult pollWithInfo(final Set topicNames, final long timeoutMs) + throws SubscriptionException { + final List messages = poll(topicNames, timeoutMs); + int totalBuffered = 0; + long watermark = -1; + for (final SubscriptionMessageProcessor processor : processors) { + totalBuffered += processor.getBufferedCount(); + if (processor instanceof WatermarkProcessor) { + watermark = ((WatermarkProcessor) processor).getWatermark(); + } + } + return new PollResult(messages, totalBuffered, watermark); } /////////////////////////////// commit /////////////////////////////// @@ -238,6 +335,46 @@ protected void commitAsync( super.commitAsync(messages, callback); } + /////////////////////////////// seek /////////////////////////////// + + /** + * Clears uncommitted auto-commit messages after seek to prevent stale acks from committing events + * that belonged to the pre-seek position. 
+ */ + @Override + public void seekToBeginning(final String topicName) throws SubscriptionException { + super.seekToBeginning(topicName); + if (autoCommit) { + uncommittedMessages.clear(); + } + } + + @Override + public void seekToEnd(final String topicName) throws SubscriptionException { + super.seekToEnd(topicName); + if (autoCommit) { + uncommittedMessages.clear(); + } + } + + @Override + public void seek(final String topicName, final TopicProgress topicProgress) + throws SubscriptionException { + super.seek(topicName, topicProgress); + if (autoCommit) { + uncommittedMessages.clear(); + } + } + + @Override + public void seekAfter(final String topicName, final TopicProgress topicProgress) + throws SubscriptionException { + super.seekAfter(topicName, topicProgress); + if (autoCommit) { + uncommittedMessages.clear(); + } + } + /////////////////////////////// auto commit /////////////////////////////// private void submitAutoCommitWorker() { @@ -274,8 +411,19 @@ public void run() { for (final Map.Entry> entry : uncommittedMessages.headMap(index).entrySet()) { try { - ack(entry.getValue()); - uncommittedMessages.remove(entry.getKey()); + final Set removableMessages = + ackWithPartialProgress(entry.getValue()); + if (removableMessages.isEmpty()) { + continue; + } + if (removableMessages.size() == entry.getValue().size()) { + uncommittedMessages.remove(entry.getKey()); + continue; + } + entry.getValue().removeAll(removableMessages); + if (entry.getValue().isEmpty()) { + uncommittedMessages.remove(entry.getKey()); + } } catch (final Exception e) { LOGGER.warn("something unexpected happened when auto commit messages...", e); } @@ -286,8 +434,18 @@ public void run() { private void commitAllUncommittedMessages() { for (final Map.Entry> entry : uncommittedMessages.entrySet()) { try { - ack(entry.getValue()); - uncommittedMessages.remove(entry.getKey()); + final Set removableMessages = ackWithPartialProgress(entry.getValue()); + if (removableMessages.isEmpty()) { + continue; 
+ } + if (removableMessages.size() == entry.getValue().size()) { + uncommittedMessages.remove(entry.getKey()); + continue; + } + entry.getValue().removeAll(removableMessages); + if (entry.getValue().isEmpty()) { + uncommittedMessages.remove(entry.getKey()); + } } catch (final Exception e) { LOGGER.warn("something unexpected happened when commit messages during close", e); } diff --git a/iotdb-client/subscription/src/main/java/org/apache/iotdb/session/subscription/consumer/base/AbstractSubscriptionPushConsumer.java b/iotdb-client/subscription/src/main/java/org/apache/iotdb/session/subscription/consumer/base/AbstractSubscriptionPushConsumer.java index 3ff93db218b27..1ac9f08696ddb 100644 --- a/iotdb-client/subscription/src/main/java/org/apache/iotdb/session/subscription/consumer/base/AbstractSubscriptionPushConsumer.java +++ b/iotdb-client/subscription/src/main/java/org/apache/iotdb/session/subscription/consumer/base/AbstractSubscriptionPushConsumer.java @@ -26,6 +26,7 @@ import org.apache.iotdb.session.subscription.consumer.ConsumeResult; import org.apache.iotdb.session.subscription.consumer.tree.SubscriptionTreePushConsumer; import org.apache.iotdb.session.subscription.payload.SubscriptionMessage; +import org.apache.iotdb.session.subscription.payload.SubscriptionMessageType; import org.apache.iotdb.session.subscription.util.CollectionUtils; import org.slf4j.Logger; @@ -180,6 +181,21 @@ public void run() { try { final List messages = multiplePoll(subscribedTopics.keySet(), autoPollTimeoutMs); + // Update watermark timestamp before stripping watermark events + for (final SubscriptionMessage m : messages) { + if (m.getMessageType() == SubscriptionMessageType.WATERMARK.getType()) { + final long ts = m.getWatermarkTimestamp(); + if (ts > latestWatermarkTimestamp) { + latestWatermarkTimestamp = ts; + } + } + } + // Strip system messages — push consumer does not use processors + messages.removeIf( + m -> { + final short type = m.getMessageType(); + return type == 
SubscriptionMessageType.WATERMARK.getType(); + }); if (messages.isEmpty()) { LOGGER.info( "SubscriptionPushConsumer {} poll empty message from topics {} after {} millisecond(s)", diff --git a/iotdb-client/subscription/src/main/java/org/apache/iotdb/session/subscription/consumer/base/ColumnAlignProcessor.java b/iotdb-client/subscription/src/main/java/org/apache/iotdb/session/subscription/consumer/base/ColumnAlignProcessor.java new file mode 100644 index 0000000000000..13910a86c9abe --- /dev/null +++ b/iotdb-client/subscription/src/main/java/org/apache/iotdb/session/subscription/consumer/base/ColumnAlignProcessor.java @@ -0,0 +1,132 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.iotdb.session.subscription.consumer.base; + +import org.apache.iotdb.session.subscription.payload.SubscriptionMessage; +import org.apache.iotdb.session.subscription.payload.SubscriptionMessageType; + +import org.apache.tsfile.utils.BitMap; +import org.apache.tsfile.write.record.Tablet; + +import java.util.Collections; +import java.util.HashMap; +import java.util.Iterator; +import java.util.List; +import java.util.Map; + +/** + * A non-buffering processor that forward-fills null columns in each Tablet using the last known + * value for the same device/table. This is useful for CDC scenarios where a write only updates a + * subset of columns, leaving others null; the processor fills them with the most recent value. + * + *

State is maintained per device (identified by {@code Tablet.getDeviceId()} for tree-model or
+ * {@code Tablet.getTableName()} for table-model).
+ */
+public class ColumnAlignProcessor implements SubscriptionMessageProcessor {
+
+  // deviceKey -> (columnIndex -> lastValue)
+  // NOTE(review): the cache is keyed by column position, so it presumably assumes that every
+  // tablet of a device carries the same column layout -- confirm against the producers. A cached
+  // value whose type does not match the target column is skipped rather than written.
+  private final Map<String, Map<Integer, Object>> lastValues = new HashMap<>();
+
+  /**
+   * Forward-fills the tablets of every RECORD_HANDLER message in place; other message types are
+   * passed through untouched.
+   *
+   * @param messages the batch to process
+   * @return the same list instance that was passed in
+   */
+  @Override
+  public List<SubscriptionMessage> process(final List<SubscriptionMessage> messages) {
+    for (final SubscriptionMessage message : messages) {
+      if (message.getMessageType() != SubscriptionMessageType.RECORD_HANDLER.getType()) {
+        continue;
+      }
+      final Iterator<Tablet> tablets = message.getRecordTabletIterator();
+      while (tablets.hasNext()) {
+        fillTablet(tablets.next());
+      }
+    }
+    return messages;
+  }
+
+  /**
+   * This processor never buffers messages, so there is nothing to release.
+   *
+   * @return an empty list
+   */
+  @Override
+  public List<SubscriptionMessage> flush() {
+    return Collections.emptyList();
+  }
+
+  /**
+   * Walks the tablet row by row. A non-null cell refreshes the cached value of its column; a null
+   * cell is overwritten with the cached value, if there is a usable one, and its null bit is
+   * cleared.
+   *
+   * @param tablet the tablet to fill in place
+   */
+  private void fillTablet(final Tablet tablet) {
+    final String deviceKey = getDeviceKey(tablet);
+    final Map<Integer, Object> cache = lastValues.computeIfAbsent(deviceKey, k -> new HashMap<>());
+
+    final Object[] values = tablet.getValues();
+    final BitMap[] bitMaps = tablet.getBitMaps();
+    final int rowSize = tablet.getRowSize();
+    final int columnCount = values.length;
+
+    for (int row = 0; row < rowSize; row++) {
+      for (int col = 0; col < columnCount; col++) {
+        final boolean isNull =
+            bitMaps != null && bitMaps[col] != null && bitMaps[col].isMarked(row);
+        if (!isNull) {
+          // update cache with this non-null value
+          cache.put(col, getValueAt(values[col], row));
+          continue;
+        }
+        // try forward-fill from cache; clear the null bit only when a value was really written,
+        // otherwise the cell would be reported as non-null while holding an arbitrary value
+        final Object cached = cache.get(col);
+        if (cached != null && setValueAt(values[col], row, cached)) {
+          bitMaps[col].unmark(row);
+        }
+      }
+    }
+  }
+
+  /** Returns the cache key of a tablet: its device id, or its table name when the id is null. */
+  private static String getDeviceKey(final Tablet tablet) {
+    // tree model uses deviceId; table model uses tableName
+    final String deviceId = tablet.getDeviceId();
+    return deviceId != null ? deviceId : tablet.getTableName();
+  }
+
+  /**
+   * Reads one cell of a column array.
+   *
+   * @param columnArray a primitive or object array holding one column
+   * @param row the row index to read
+   * @return the (boxed) cell value, or {@code null} when the array type is not recognised
+   */
+  private static Object getValueAt(final Object columnArray, final int row) {
+    if (columnArray instanceof long[]) {
+      return ((long[]) columnArray)[row];
+    } else if (columnArray instanceof int[]) {
+      return ((int[]) columnArray)[row];
+    } else if (columnArray instanceof double[]) {
+      return ((double[]) columnArray)[row];
+    } else if (columnArray instanceof float[]) {
+      return ((float[]) columnArray)[row];
+    } else if (columnArray instanceof boolean[]) {
+      return ((boolean[]) columnArray)[row];
+    } else if (columnArray instanceof Object[]) {
+      return ((Object[]) columnArray)[row];
+    }
+    return null;
+  }
+
+  /**
+   * Writes one cell of a column array if the value is compatible with the array's element type.
+   * An incompatible value is ignored instead of raising {@code ClassCastException} or {@code
+   * ArrayStoreException}.
+   *
+   * @param columnArray a primitive or object array holding one column
+   * @param row the row index to write
+   * @param value the value to store
+   * @return {@code true} if the value was written, {@code false} if nothing was changed
+   */
+  private static boolean setValueAt(final Object columnArray, final int row, final Object value) {
+    if (columnArray instanceof long[] && value instanceof Long) {
+      ((long[]) columnArray)[row] = (Long) value;
+    } else if (columnArray instanceof int[] && value instanceof Integer) {
+      ((int[]) columnArray)[row] = (Integer) value;
+    } else if (columnArray instanceof double[] && value instanceof Double) {
+      ((double[]) columnArray)[row] = (Double) value;
+    } else if (columnArray instanceof float[] && value instanceof Float) {
+      ((float[]) columnArray)[row] = (Float) value;
+    } else if (columnArray instanceof boolean[] && value instanceof Boolean) {
+      ((boolean[]) columnArray)[row] = (Boolean) value;
+    } else if (columnArray instanceof Object[]
+        && columnArray.getClass().getComponentType().isInstance(value)) {
+      ((Object[]) columnArray)[row] = value;
+    } else {
+      return false;
+    }
+    return true;
+  }
+}
diff --git a/iotdb-client/subscription/src/main/java/org/apache/iotdb/session/subscription/consumer/base/SubscriptionMessageProcessor.java b/iotdb-client/subscription/src/main/java/org/apache/iotdb/session/subscription/consumer/base/SubscriptionMessageProcessor.java
new file mode 100644
index 0000000000000..ceee674cd6901
--- /dev/null
+++ b/iotdb-client/subscription/src/main/java/org/apache/iotdb/session/subscription/consumer/base/SubscriptionMessageProcessor.java
@@ -0,0 +1,55 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.iotdb.session.subscription.consumer.base; + +import org.apache.iotdb.session.subscription.payload.SubscriptionMessage; + +import java.util.List; + +/** + * A processor that transforms, filters, or enriches subscription messages in the pull consumer + * pipeline. Processors are chained and invoked on each poll() call. + * + *

Processors may buffer messages internally (e.g., for watermark-based ordering) and return them + * in later process() calls. Buffered messages should be released via {@link #flush()} when the + * consumer closes. + */ +public interface SubscriptionMessageProcessor { + + /** + * Process a batch of messages. May return fewer, more, or different messages than the input. + * + * @param messages the messages from the previous stage (or raw poll) + * @return messages to pass to the next stage (or to the user) + */ + List process(List messages); + + /** + * Flush all internally buffered messages. Called when the consumer is closing. + * + * @return any remaining buffered messages + */ + List flush(); + + /** Returns the number of messages currently buffered by this processor. */ + default int getBufferedCount() { + return 0; + } +} diff --git a/iotdb-client/subscription/src/main/java/org/apache/iotdb/session/subscription/consumer/base/WatermarkProcessor.java b/iotdb-client/subscription/src/main/java/org/apache/iotdb/session/subscription/consumer/base/WatermarkProcessor.java new file mode 100644 index 0000000000000..8c17896ce5de5 --- /dev/null +++ b/iotdb-client/subscription/src/main/java/org/apache/iotdb/session/subscription/consumer/base/WatermarkProcessor.java @@ -0,0 +1,258 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.iotdb.session.subscription.consumer.base; + +import org.apache.iotdb.session.subscription.payload.SubscriptionMessage; +import org.apache.iotdb.session.subscription.payload.SubscriptionMessageType; + +import org.apache.tsfile.write.record.Tablet; + +import java.util.ArrayList; +import java.util.Iterator; +import java.util.List; +import java.util.PriorityQueue; + +/** + * A buffering processor that reorders messages based on watermark semantics. Messages are buffered + * internally and emitted only when the watermark advances past their maximum timestamp. + * + *

Watermark = (minimum of latest timestamp per active source) - maxOutOfOrdernessMs + * + *

A source is considered "stale" if its latest timestamp has not increased for {@code + * staleSourceTimeoutMs}. Stale sources are excluded from the watermark calculation, preventing a + * single slow or idle source from anchoring the global watermark indefinitely. + * + *

Server-side WATERMARK events (carrying per-region timestamp progress) serve as heartbeats, + * confirming source liveness. They advance the per-source timestamp only when their timestamp is + * higher than the previously observed value. + * + *

A timeout mechanism ensures that buffered messages are eventually flushed even if no new data + * arrives, preventing unbounded buffering. + * + *

Note: This processor is primarily intended as a reference implementation. For + * production use with large-scale out-of-order data, consider using a downstream stream processing + * framework (Flink, Spark) for watermark handling. + */ +public class WatermarkProcessor implements SubscriptionMessageProcessor { + + private static final long DEFAULT_STALE_SOURCE_TIMEOUT_MS = 30_000L; + private static final long DEFAULT_MAX_BUFFER_BYTES = 64L * 1024 * 1024; // 64 MB + + private final long maxOutOfOrdernessMs; + private final long timeoutMs; + private final long staleSourceTimeoutMs; + private final long maxBufferBytes; + + // Buffer ordered by message max timestamp + private final PriorityQueue buffer = + new PriorityQueue<>((a, b) -> Long.compare(a.maxTimestamp, b.maxTimestamp)); + + // Track latest timestamp per source (deviceId/tableName) + private final java.util.Map latestPerSource = new java.util.HashMap<>(); + // Track wall-clock time when each source's timestamp last increased + private final java.util.Map lastAdvancedTimeMs = new java.util.HashMap<>(); + private long lastEmitTimeMs = System.currentTimeMillis(); + private long bufferedBytes = 0; + + // Current watermark value + private long watermark = Long.MIN_VALUE; + + /** + * Creates a WatermarkProcessor with default stale source timeout (30 seconds). + * + * @param maxOutOfOrdernessMs maximum expected out-of-orderness in milliseconds + * @param timeoutMs if no data arrives within this duration, force-flush all buffered messages + */ + public WatermarkProcessor(final long maxOutOfOrdernessMs, final long timeoutMs) { + this(maxOutOfOrdernessMs, timeoutMs, DEFAULT_STALE_SOURCE_TIMEOUT_MS, DEFAULT_MAX_BUFFER_BYTES); + } + + /** + * Creates a WatermarkProcessor. 
+   *
+   * @param maxOutOfOrdernessMs maximum expected out-of-orderness in milliseconds
+   * @param timeoutMs if no data arrives within this duration, force-flush all buffered messages
+   * @param staleSourceTimeoutMs if a source's timestamp has not increased for this duration, it is
+   *     excluded from watermark calculation. Use {@link Long#MAX_VALUE} to disable.
+   * @param maxBufferBytes maximum total estimated bytes of buffered messages. When exceeded, all
+   *     buffered messages are force-flushed regardless of watermark. Defaults to 64 MB.
+   */
+  public WatermarkProcessor(
+      final long maxOutOfOrdernessMs,
+      final long timeoutMs,
+      final long staleSourceTimeoutMs,
+      final long maxBufferBytes) {
+    this.maxOutOfOrdernessMs = maxOutOfOrdernessMs;
+    this.timeoutMs = timeoutMs;
+    this.staleSourceTimeoutMs = staleSourceTimeoutMs;
+    this.maxBufferBytes = maxBufferBytes;
+  }
+
+  /**
+   * Buffers the given batch and returns the messages that are ready to be released.
+   *
+   * WATERMARK messages are not buffered; they only advance the per-region timestamp. Every other
+   * message is queued under its maximum timestamp. The watermark is then recomputed from the
+   * sources that advanced within {@code staleSourceTimeoutMs}, and messages at or below it are
+   * released in timestamp order. If the buffer is still above {@code maxBufferBytes} afterwards,
+   * or nothing was released for {@code timeoutMs}, the rest of the buffer is released as well.
+   *
+   * @param messages the batch to buffer
+   * @return the released messages, possibly empty
+   */
+  @Override
+  public List<SubscriptionMessage> process(final List<SubscriptionMessage> messages) {
+    final long now = System.currentTimeMillis();
+
+    // Buffer incoming messages and update per-source timestamps
+    for (final SubscriptionMessage message : messages) {
+      // WATERMARK events carry server-side timestamp progress per region.
+      // They serve as heartbeats and advance per-source tracking only when the timestamp
+      // actually increases.
+      if (message.getMessageType() == SubscriptionMessageType.WATERMARK.getType()) {
+        final String regionKey =
+            "region-"
+                + message.getCommitContext().getDataNodeId()
+                + "-"
+                + message.getCommitContext().getRegionId();
+        advanceSourceTimestamp(regionKey, message.getWatermarkTimestamp(), now);
+        continue; // Do not buffer system events
+      }
+
+      final long maxTs = extractMaxTimestamp(message);
+      final long estimatedSize = message.estimateSize();
+      buffer.add(new TimestampedMessage(message, maxTs, estimatedSize));
+      bufferedBytes += estimatedSize;
+      updateSourceTimestamp(message, maxTs, now);
+    }
+
+    // Compute watermark = min(latest per active source) - maxOutOfOrderness
+    // Sources whose timestamp has not increased for staleSourceTimeoutMs are excluded.
+    if (!latestPerSource.isEmpty()) {
+      long minLatest = Long.MAX_VALUE;
+      for (final java.util.Map.Entry<String, Long> entry : latestPerSource.entrySet()) {
+        final Long lastAdv = lastAdvancedTimeMs.get(entry.getKey());
+        if (lastAdv != null && (now - lastAdv) <= staleSourceTimeoutMs) {
+          minLatest = Math.min(minLatest, entry.getValue());
+        }
+      }
+      if (minLatest != Long.MAX_VALUE) {
+        watermark = minLatest - maxOutOfOrdernessMs;
+      }
+      // If all sources are stale, watermark stays unchanged and timeout will handle it.
+    }
+
+    // Emit messages whose maxTimestamp <= watermark
+    final List<SubscriptionMessage> emitted = emit(watermark);
+
+    // Buffer overflow: force-flush the remainder if the buffer still exceeds the byte limit.
+    // The messages in `emitted` have already been removed from the buffer, so they must be
+    // returned together with the flushed remainder; returning only the remainder would drop them.
+    if (bufferedBytes > maxBufferBytes) {
+      final List<SubscriptionMessage> released = new ArrayList<>(emitted);
+      released.addAll(forceFlushAll());
+      return released;
+    }
+
+    // Timeout: if nothing was emitted and timeout exceeded, force-flush all
+    if (emitted.isEmpty() && (now - lastEmitTimeMs) >= timeoutMs && !buffer.isEmpty()) {
+      return forceFlushAll();
+    }
+
+    if (!emitted.isEmpty()) {
+      lastEmitTimeMs = now;
+    }
+    return emitted;
+  }
+
+  /**
+   * Releases every buffered message regardless of the watermark.
+   *
+   * @return all messages that were still buffered
+   */
+  @Override
+  public List<SubscriptionMessage> flush() {
+    return forceFlushAll();
+  }
+
+  /** Returns the number of messages currently held in the buffer. */
+  @Override
+  public int getBufferedCount() {
+    return buffer.size();
+  }
+
+  /** Returns the current watermark value.
*/ + public long getWatermark() { + return watermark; + } + + private List emit(final long watermarkValue) { + final List result = new ArrayList<>(); + while (!buffer.isEmpty() && buffer.peek().maxTimestamp <= watermarkValue) { + final TimestampedMessage tm = buffer.poll(); + bufferedBytes -= tm.estimatedSize; + result.add(tm.message); + } + return result; + } + + private List forceFlushAll() { + final List result = new ArrayList<>(buffer.size()); + while (!buffer.isEmpty()) { + result.add(buffer.poll().message); + } + bufferedBytes = 0; + lastEmitTimeMs = System.currentTimeMillis(); + return result; + } + + private static long extractMaxTimestamp(final SubscriptionMessage message) { + long maxTs = Long.MIN_VALUE; + if (message.getMessageType() == SubscriptionMessageType.RECORD_HANDLER.getType()) { + final Iterator it = message.getRecordTabletIterator(); + while (it.hasNext()) { + final Tablet tablet = it.next(); + final long[] timestamps = tablet.getTimestamps(); + final int rowSize = tablet.getRowSize(); + for (int i = 0; i < rowSize; i++) { + maxTs = Math.max(maxTs, timestamps[i]); + } + } + } + // For non-tablet messages or empty messages, use current wall clock + if (maxTs == Long.MIN_VALUE) { + maxTs = System.currentTimeMillis(); + } + return maxTs; + } + + private void updateSourceTimestamp( + final SubscriptionMessage message, final long maxTs, final long nowMs) { + // Use region-based key so data events and WATERMARK events share the same key namespace. + final String regionId = message.getCommitContext().getRegionId(); + final int dataNodeId = message.getCommitContext().getDataNodeId(); + final String key = "region-" + dataNodeId + "-" + regionId; + advanceSourceTimestamp(key, maxTs, nowMs); + } + + /** + * Updates the per-source timestamp tracking. Only records a new "last advanced" wall-clock time + * when the timestamp actually increases, so that stale sources (whose timestamps don't advance) + * are eventually excluded from watermark calculation. 
+ */ + private void advanceSourceTimestamp(final String key, final long newTs, final long nowMs) { + final Long oldTs = latestPerSource.get(key); + if (oldTs == null || newTs > oldTs) { + latestPerSource.put(key, newTs); + lastAdvancedTimeMs.put(key, nowMs); + } + } + + private static final class TimestampedMessage { + final SubscriptionMessage message; + final long maxTimestamp; + final long estimatedSize; + + TimestampedMessage( + final SubscriptionMessage message, final long maxTimestamp, final long estimatedSize) { + this.message = message; + this.maxTimestamp = maxTimestamp; + this.estimatedSize = estimatedSize; + } + } +} diff --git a/iotdb-client/subscription/src/main/java/org/apache/iotdb/session/subscription/consumer/table/SubscriptionTablePullConsumer.java b/iotdb-client/subscription/src/main/java/org/apache/iotdb/session/subscription/consumer/table/SubscriptionTablePullConsumer.java index 83dd39aebbf7d..e3fb90cda470a 100644 --- a/iotdb-client/subscription/src/main/java/org/apache/iotdb/session/subscription/consumer/table/SubscriptionTablePullConsumer.java +++ b/iotdb-client/subscription/src/main/java/org/apache/iotdb/session/subscription/consumer/table/SubscriptionTablePullConsumer.java @@ -25,6 +25,8 @@ import org.apache.iotdb.session.subscription.consumer.ISubscriptionTablePullConsumer; import org.apache.iotdb.session.subscription.consumer.base.AbstractSubscriptionProvider; import org.apache.iotdb.session.subscription.consumer.base.AbstractSubscriptionPullConsumer; +import org.apache.iotdb.session.subscription.consumer.base.SubscriptionMessageProcessor; +import org.apache.iotdb.session.subscription.payload.PollResult; import org.apache.iotdb.session.subscription.payload.SubscriptionMessage; import java.time.Duration; @@ -173,4 +175,24 @@ public String getConsumerGroupId() { public boolean allTopicMessagesHaveBeenConsumed() { return super.allTopicMessagesHaveBeenConsumed(); } + + /////////////////////////////// processor /////////////////////////////// 
+ + public SubscriptionTablePullConsumer addProcessor(final SubscriptionMessageProcessor processor) { + super.addProcessor(processor); + return this; + } + + public PollResult pollWithInfo(final long timeoutMs) throws SubscriptionException { + return super.pollWithInfo(timeoutMs); + } + + public PollResult pollWithInfo(final Duration timeout) throws SubscriptionException { + return super.pollWithInfo(timeout.toMillis()); + } + + public PollResult pollWithInfo(final Set topicNames, final long timeoutMs) + throws SubscriptionException { + return super.pollWithInfo(topicNames, timeoutMs); + } } diff --git a/iotdb-client/subscription/src/main/java/org/apache/iotdb/session/subscription/consumer/tree/SubscriptionTreePullConsumer.java b/iotdb-client/subscription/src/main/java/org/apache/iotdb/session/subscription/consumer/tree/SubscriptionTreePullConsumer.java index 23050893f660d..c4daab68839aa 100644 --- a/iotdb-client/subscription/src/main/java/org/apache/iotdb/session/subscription/consumer/tree/SubscriptionTreePullConsumer.java +++ b/iotdb-client/subscription/src/main/java/org/apache/iotdb/session/subscription/consumer/tree/SubscriptionTreePullConsumer.java @@ -27,6 +27,8 @@ import org.apache.iotdb.session.subscription.consumer.ISubscriptionTreePullConsumer; import org.apache.iotdb.session.subscription.consumer.base.AbstractSubscriptionProvider; import org.apache.iotdb.session.subscription.consumer.base.AbstractSubscriptionPullConsumer; +import org.apache.iotdb.session.subscription.consumer.base.SubscriptionMessageProcessor; +import org.apache.iotdb.session.subscription.payload.PollResult; import org.apache.iotdb.session.subscription.payload.SubscriptionMessage; import org.apache.iotdb.session.subscription.util.IdentifierUtils; @@ -220,6 +222,26 @@ public boolean allTopicMessagesHaveBeenConsumed() { return super.allTopicMessagesHaveBeenConsumed(); } + /////////////////////////////// processor /////////////////////////////// + + public SubscriptionTreePullConsumer 
addProcessor(final SubscriptionMessageProcessor processor) { + super.addProcessor(processor); + return this; + } + + public PollResult pollWithInfo(final long timeoutMs) throws SubscriptionException { + return super.pollWithInfo(timeoutMs); + } + + public PollResult pollWithInfo(final Duration timeout) throws SubscriptionException { + return super.pollWithInfo(timeout.toMillis()); + } + + public PollResult pollWithInfo(final Set topicNames, final long timeoutMs) + throws SubscriptionException { + return super.pollWithInfo(topicNames, timeoutMs); + } + /////////////////////////////// builder /////////////////////////////// @Deprecated // keep for forward compatibility diff --git a/iotdb-client/subscription/src/main/java/org/apache/iotdb/session/subscription/payload/PollResult.java b/iotdb-client/subscription/src/main/java/org/apache/iotdb/session/subscription/payload/PollResult.java new file mode 100644 index 0000000000000..be56548116e11 --- /dev/null +++ b/iotdb-client/subscription/src/main/java/org/apache/iotdb/session/subscription/payload/PollResult.java @@ -0,0 +1,67 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.iotdb.session.subscription.payload; + +import java.util.Collections; +import java.util.List; + +/** Result of a poll operation that includes processor metadata alongside the messages. */ +public class PollResult { + + private final List messages; + private final int bufferedCount; + private final long watermark; + + public PollResult( + final List messages, final int bufferedCount, final long watermark) { + this.messages = messages != null ? messages : Collections.emptyList(); + this.bufferedCount = bufferedCount; + this.watermark = watermark; + } + + /** Returns the processed messages ready for consumption. */ + public List getMessages() { + return messages; + } + + /** Returns the total number of messages currently buffered across all processors. */ + public int getBufferedCount() { + return bufferedCount; + } + + /** + * Returns the current watermark timestamp (-1 if no watermark processor is configured). Messages + * with timestamps at or before this value have all been emitted. + */ + public long getWatermark() { + return watermark; + } + + @Override + public String toString() { + return "PollResult{messages=" + + messages.size() + + ", bufferedCount=" + + bufferedCount + + ", watermark=" + + watermark + + "}"; + } +} diff --git a/iotdb-client/subscription/src/main/java/org/apache/iotdb/session/subscription/payload/SubscriptionMessage.java b/iotdb-client/subscription/src/main/java/org/apache/iotdb/session/subscription/payload/SubscriptionMessage.java index f41d34f0ab83a..9ba61f6bc2718 100644 --- a/iotdb-client/subscription/src/main/java/org/apache/iotdb/session/subscription/payload/SubscriptionMessage.java +++ b/iotdb-client/subscription/src/main/java/org/apache/iotdb/session/subscription/payload/SubscriptionMessage.java @@ -39,11 +39,15 @@ public class SubscriptionMessage implements Comparable { private final SubscriptionMessageHandler handler; + /** Watermark timestamp, valid only when messageType == WATERMARK. 
*/ + private final long watermarkTimestamp; + public SubscriptionMessage( final SubscriptionCommitContext commitContext, final Map> tablets) { this.commitContext = commitContext; this.messageType = SubscriptionMessageType.RECORD_HANDLER.getType(); this.handler = new SubscriptionRecordHandler(tablets); + this.watermarkTimestamp = Long.MIN_VALUE; } public SubscriptionMessage( @@ -53,6 +57,16 @@ public SubscriptionMessage( this.commitContext = commitContext; this.messageType = SubscriptionMessageType.TS_FILE.getType(); this.handler = new SubscriptionTsFileHandler(absolutePath, databaseName); + this.watermarkTimestamp = Long.MIN_VALUE; + } + + /** Watermark message carrying server-side timestamp progress for a region. */ + public SubscriptionMessage( + final SubscriptionCommitContext commitContext, final long watermarkTimestamp) { + this.commitContext = commitContext; + this.messageType = SubscriptionMessageType.WATERMARK.getType(); + this.handler = null; + this.watermarkTimestamp = watermarkTimestamp; } public SubscriptionCommitContext getCommitContext() { @@ -63,6 +77,34 @@ public short getMessageType() { return messageType; } + /** + * Returns the watermark timestamp carried by this message. Only valid when {@code + * getMessageType() == SubscriptionMessageType.WATERMARK.getType()}. + * + * @return the watermark timestamp, or {@code Long.MIN_VALUE} if not a watermark message + */ + public long getWatermarkTimestamp() { + return watermarkTimestamp; + } + + /** + * Estimates the heap memory occupied by this message in bytes. For tablet-based messages, this + * delegates to {@link Tablet#ramBytesUsed()} for accurate per-column estimation. 
+ * + * @return estimated byte size + */ + public long estimateSize() { + // Object header + references + primitives (rough constant) + long size = 64; + if (handler instanceof SubscriptionRecordHandler) { + final Iterator it = getRecordTabletIterator(); + while (it.hasNext()) { + size += it.next().ramBytesUsed(); + } + } + return size; + } + /////////////////////////////// override /////////////////////////////// @Override @@ -75,13 +117,14 @@ public boolean equals(final Object obj) { } final SubscriptionMessage that = (SubscriptionMessage) obj; return Objects.equals(this.commitContext, that.commitContext) + && this.watermarkTimestamp == that.watermarkTimestamp && Objects.equals(this.messageType, that.messageType) && Objects.equals(this.handler, that.handler); } @Override public int hashCode() { - return Objects.hash(commitContext, messageType, handler); + return Objects.hash(commitContext, messageType, handler, watermarkTimestamp); } @Override @@ -95,6 +138,8 @@ public String toString() { + commitContext + ", messageType=" + SubscriptionMessageType.valueOf(messageType).toString() + + ", watermarkTimestamp=" + + watermarkTimestamp + "}"; } diff --git a/iotdb-client/subscription/src/main/java/org/apache/iotdb/session/subscription/payload/SubscriptionMessageType.java b/iotdb-client/subscription/src/main/java/org/apache/iotdb/session/subscription/payload/SubscriptionMessageType.java index 34189c2fa9b42..0732c0590c181 100644 --- a/iotdb-client/subscription/src/main/java/org/apache/iotdb/session/subscription/payload/SubscriptionMessageType.java +++ b/iotdb-client/subscription/src/main/java/org/apache/iotdb/session/subscription/payload/SubscriptionMessageType.java @@ -26,6 +26,7 @@ public enum SubscriptionMessageType { RECORD_HANDLER((short) 0), TS_FILE((short) 1), + WATERMARK((short) 3), ; private final short type; diff --git a/iotdb-client/subscription/src/test/java/org/apache/iotdb/rpc/subscription/payload/poll/SubscriptionCommitContextTest.java 
b/iotdb-client/subscription/src/test/java/org/apache/iotdb/rpc/subscription/payload/poll/SubscriptionCommitContextTest.java new file mode 100644 index 0000000000000..c1c143447cc1d --- /dev/null +++ b/iotdb-client/subscription/src/test/java/org/apache/iotdb/rpc/subscription/payload/poll/SubscriptionCommitContextTest.java @@ -0,0 +1,125 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.iotdb.rpc.subscription.payload.poll; + +import org.apache.tsfile.utils.PublicBAOS; +import org.apache.tsfile.utils.ReadWriteIOUtils; +import org.junit.Test; + +import java.io.DataOutputStream; +import java.io.IOException; +import java.nio.ByteBuffer; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertTrue; + +public class SubscriptionCommitContextTest { + + @Test + public void testDeserializeV1Compatibility() throws IOException { + final ByteBuffer buffer = buildV1Buffer(1, 2, "topic", "group", 3L); + + final SubscriptionCommitContext context = SubscriptionCommitContext.deserialize(buffer); + + assertEquals(1, context.getDataNodeId()); + assertEquals(2, context.getRebootTimes()); + assertEquals("topic", context.getTopicName()); + assertEquals("group", context.getConsumerGroupId()); + assertEquals(3L, context.getCommitId()); + assertEquals(0L, context.getSeekGeneration()); + assertEquals("", context.getRegionId()); + assertEquals(0L, context.getPhysicalTime()); + assertFalse(context.hasWriterProgress()); + assertTrue(context.hasLegacyCommitId()); + assertTrue(context.isCommittable()); + } + + @Test + public void testDeserializeV2() throws IOException { + final SubscriptionCommitContext original = + new SubscriptionCommitContext(1, 2, "topic", "group", 3L, 4L, "region", 5L); + + final ByteBuffer buffer = SubscriptionCommitContext.serialize(original); + final SubscriptionCommitContext parsed = SubscriptionCommitContext.deserialize(buffer); + + assertEquals(original, parsed); + assertFalse(parsed.hasWriterProgress()); + assertTrue(parsed.hasLegacyCommitId()); + } + + @Test + public void testDeserializeV3() throws IOException { + final WriterId writerId = new WriterId("region", 7, 8L); + final WriterProgress writerProgress = new WriterProgress(9L, 10L); + final SubscriptionCommitContext original = + new SubscriptionCommitContext(1, 2, "topic", "group", 3L, 
writerId, writerProgress); + + final ByteBuffer buffer = SubscriptionCommitContext.serialize(original); + final SubscriptionCommitContext parsed = SubscriptionCommitContext.deserialize(buffer); + + assertEquals(original, parsed); + assertEquals(writerId, parsed.getWriterId()); + assertEquals(writerProgress, parsed.getWriterProgress()); + assertEquals("region", parsed.getRegionId()); + assertEquals(9L, parsed.getPhysicalTime()); + assertEquals(10L, parsed.getLocalSeq()); + assertTrue(parsed.hasWriterProgress()); + assertFalse(parsed.hasLegacyCommitId()); + assertTrue(parsed.isCommittable()); + } + + @Test(expected = IllegalArgumentException.class) + public void testDeserializeUnsupportedVersion() throws IOException { + final ByteBuffer buffer = buildV1BufferWithVersion((byte) 4, 1, 2, "topic", "group", 3L); + SubscriptionCommitContext.deserialize(buffer); + } + + private static ByteBuffer buildV1Buffer( + final int dataNodeId, + final int rebootTimes, + final String topicName, + final String consumerGroupId, + final long commitId) + throws IOException { + return buildV1BufferWithVersion( + (byte) 1, dataNodeId, rebootTimes, topicName, consumerGroupId, commitId); + } + + private static ByteBuffer buildV1BufferWithVersion( + final byte version, + final int dataNodeId, + final int rebootTimes, + final String topicName, + final String consumerGroupId, + final long commitId) + throws IOException { + try (final PublicBAOS byteArrayOutputStream = new PublicBAOS(); + final DataOutputStream outputStream = new DataOutputStream(byteArrayOutputStream)) { + ReadWriteIOUtils.write(version, outputStream); + ReadWriteIOUtils.write(dataNodeId, outputStream); + ReadWriteIOUtils.write(rebootTimes, outputStream); + ReadWriteIOUtils.write(topicName, outputStream); + ReadWriteIOUtils.write(consumerGroupId, outputStream); + ReadWriteIOUtils.write(commitId, outputStream); + return ByteBuffer.wrap(byteArrayOutputStream.getBuf(), 0, byteArrayOutputStream.size()); + } + } +} diff --git 
a/iotdb-client/subscription/src/test/java/org/apache/iotdb/rpc/subscription/payload/poll/SubscriptionPollRequestTest.java b/iotdb-client/subscription/src/test/java/org/apache/iotdb/rpc/subscription/payload/poll/SubscriptionPollRequestTest.java new file mode 100644 index 0000000000000..ecfea3d160bc4 --- /dev/null +++ b/iotdb-client/subscription/src/test/java/org/apache/iotdb/rpc/subscription/payload/poll/SubscriptionPollRequestTest.java @@ -0,0 +1,62 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.iotdb.rpc.subscription.payload.poll; + +import org.junit.Test; + +import java.io.IOException; +import java.nio.ByteBuffer; +import java.util.Collections; +import java.util.LinkedHashMap; +import java.util.Map; + +import static org.junit.Assert.assertEquals; + +public class SubscriptionPollRequestTest { + + @Test + public void testRoundTripWithProgressByTopic() throws IOException { + final Map writerPositions = new LinkedHashMap<>(); + writerPositions.put(new WriterId("1_100", 7, 2L), new WriterProgress(1001L, 11L)); + writerPositions.put(new WriterId("1_100", 8, 1L), new WriterProgress(999L, 9L)); + + final TopicProgress topicProgress = + new TopicProgress(Collections.singletonMap("1_100", new RegionProgress(writerPositions))); + final Map progressByTopic = new LinkedHashMap<>(); + progressByTopic.put("topicA", topicProgress); + + final SubscriptionPollRequest original = + new SubscriptionPollRequest( + SubscriptionPollRequestType.POLL.getType(), + new PollPayload(Collections.singleton("topicA")), + 1234L, + 4096L, + progressByTopic); + + final ByteBuffer serialized = SubscriptionPollRequest.serialize(original); + final SubscriptionPollRequest parsed = SubscriptionPollRequest.deserialize(serialized); + + assertEquals(original.getRequestType(), parsed.getRequestType()); + assertEquals(original.getTimeoutMs(), parsed.getTimeoutMs()); + assertEquals(original.getMaxBytes(), parsed.getMaxBytes()); + assertEquals(original.getPayload(), parsed.getPayload()); + assertEquals(progressByTopic, parsed.getProgressByTopic()); + } +} diff --git a/iotdb-client/subscription/src/test/java/org/apache/iotdb/rpc/subscription/payload/request/PipeSubscribeSeekReqTest.java b/iotdb-client/subscription/src/test/java/org/apache/iotdb/rpc/subscription/payload/request/PipeSubscribeSeekReqTest.java new file mode 100644 index 0000000000000..c2afb43110289 --- /dev/null +++ 
b/iotdb-client/subscription/src/test/java/org/apache/iotdb/rpc/subscription/payload/request/PipeSubscribeSeekReqTest.java @@ -0,0 +1,53 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.iotdb.rpc.subscription.payload.request; + +import org.apache.iotdb.rpc.subscription.payload.poll.RegionProgress; +import org.apache.iotdb.rpc.subscription.payload.poll.TopicProgress; +import org.apache.iotdb.rpc.subscription.payload.poll.WriterId; +import org.apache.iotdb.rpc.subscription.payload.poll.WriterProgress; + +import org.junit.Test; + +import java.io.IOException; +import java.util.Collections; +import java.util.LinkedHashMap; +import java.util.Map; + +import static org.junit.Assert.assertEquals; + +public class PipeSubscribeSeekReqTest { + + @Test + public void testTopicProgressSeekRoundTrip() throws IOException { + final Map writerPositions = new LinkedHashMap<>(); + writerPositions.put(new WriterId("1_100", 1, 2L), new WriterProgress(1000L, 10L)); + final TopicProgress original = + new TopicProgress(Collections.singletonMap("1_100", new RegionProgress(writerPositions))); + + final PipeSubscribeSeekReq req = + PipeSubscribeSeekReq.toTPipeSubscribeSeekAfterReq("topicA", original); + final 
PipeSubscribeSeekReq parsed = PipeSubscribeSeekReq.fromTPipeSubscribeReq(req); + + assertEquals(PipeSubscribeSeekReq.SEEK_AFTER_TOPIC_PROGRESS, parsed.getSeekType()); + assertEquals("topicA", parsed.getTopicName()); + assertEquals(original, parsed.getTopicProgress()); + } +} diff --git a/iotdb-client/subscription/src/test/java/org/apache/iotdb/session/subscription/consumer/base/AbstractSubscriptionConsumerProgressTest.java b/iotdb-client/subscription/src/test/java/org/apache/iotdb/session/subscription/consumer/base/AbstractSubscriptionConsumerProgressTest.java new file mode 100644 index 0000000000000..726aab5666866 --- /dev/null +++ b/iotdb-client/subscription/src/test/java/org/apache/iotdb/session/subscription/consumer/base/AbstractSubscriptionConsumerProgressTest.java @@ -0,0 +1,168 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.iotdb.session.subscription.consumer.base; + +import org.apache.iotdb.common.rpc.thrift.TEndPoint; +import org.apache.iotdb.rpc.subscription.payload.poll.RegionProgress; +import org.apache.iotdb.rpc.subscription.payload.poll.SubscriptionCommitContext; +import org.apache.iotdb.rpc.subscription.payload.poll.TopicProgress; +import org.apache.iotdb.rpc.subscription.payload.poll.WriterId; +import org.apache.iotdb.rpc.subscription.payload.poll.WriterProgress; +import org.apache.iotdb.session.subscription.payload.SubscriptionMessage; + +import org.junit.Test; + +import java.lang.reflect.Field; +import java.lang.reflect.Method; +import java.util.Arrays; +import java.util.Collections; +import java.util.List; +import java.util.Map; +import java.util.concurrent.atomic.AtomicBoolean; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNotNull; + +public class AbstractSubscriptionConsumerProgressTest { + + private static final String TOPIC = "topic_progress_test"; + private static final String GROUP = "group_progress_test"; + private static final String REGION = "1_100"; + + @Test + public void testAdvanceCurrentPositionsWithWriterProgress() throws Exception { + final TestSubscriptionConsumer consumer = newConsumer(); + final WriterId writerId = new WriterId(REGION, 1, 2L); + final WriterProgress writerProgress = new WriterProgress(100L, 10L); + final SubscriptionMessage message = + new SubscriptionMessage( + new SubscriptionCommitContext(1, 0, TOPIC, GROUP, 0L, writerId, writerProgress), + Collections.emptyMap()); + + invokeAdvanceCurrentPositions(consumer, Collections.singletonList(message)); + + final TopicProgress positions = consumer.positions(TOPIC); + assertEquals( + writerProgress, + positions.getRegionProgress().get(REGION).getWriterPositions().get(writerId)); + } + + @Test + public void testAdvanceCommittedPositionsFallsBackToLegacyFields() throws Exception { + final TestSubscriptionConsumer consumer = 
newConsumer(); + final SubscriptionCommitContext legacyContext = + new SubscriptionCommitContext(7, 0, TOPIC, GROUP, 11L, 0L, REGION, 101L); + + invokeAdvanceCommittedPositions(consumer, Collections.singletonList(legacyContext)); + + final TopicProgress committed = consumer.committedPositions(TOPIC); + final RegionProgress regionProgress = committed.getRegionProgress().get(REGION); + assertNotNull(regionProgress); + assertEquals(1, regionProgress.getWriterPositions().size()); + final Map.Entry onlyEntry = + regionProgress.getWriterPositions().entrySet().iterator().next(); + assertEquals(new WriterId(REGION, 7, 0L), onlyEntry.getKey()); + assertEquals(new WriterProgress(101L, 11L), onlyEntry.getValue()); + } + + @Test + public void testAdvanceCurrentPositionsMergesPerWriterAndKeepsNewest() throws Exception { + final TestSubscriptionConsumer consumer = newConsumer(); + + final WriterId writer1 = new WriterId(REGION, 1, 1L); + final WriterId writer2 = new WriterId(REGION, 2, 1L); + final SubscriptionMessage olderWriter1 = + new SubscriptionMessage( + new SubscriptionCommitContext( + 1, 0, TOPIC, GROUP, 0L, writer1, new WriterProgress(100L, 8L)), + Collections.emptyMap()); + final SubscriptionMessage newerWriter1 = + new SubscriptionMessage( + new SubscriptionCommitContext( + 1, 0, TOPIC, GROUP, 0L, writer1, new WriterProgress(100L, 10L)), + Collections.emptyMap()); + final SubscriptionMessage writer2Message = + new SubscriptionMessage( + new SubscriptionCommitContext( + 2, 0, TOPIC, GROUP, 0L, writer2, new WriterProgress(95L, 7L)), + Collections.emptyMap()); + + invokeAdvanceCurrentPositions( + consumer, Arrays.asList(olderWriter1, newerWriter1, writer2Message)); + + final RegionProgress regionProgress = consumer.positions(TOPIC).getRegionProgress().get(REGION); + assertEquals(2, regionProgress.getWriterPositions().size()); + assertEquals(new WriterProgress(100L, 10L), regionProgress.getWriterPositions().get(writer1)); + assertEquals(new WriterProgress(95L, 7L), 
regionProgress.getWriterPositions().get(writer2)); + } + + private static TestSubscriptionConsumer newConsumer() throws Exception { + final TestSubscriptionConsumer consumer = + new TestSubscriptionConsumer( + new AbstractSubscriptionConsumerBuilder() + .consumerId("progress_consumer") + .consumerGroupId(GROUP)); + final Field isClosedField = AbstractSubscriptionConsumer.class.getDeclaredField("isClosed"); + isClosedField.setAccessible(true); + ((AtomicBoolean) isClosedField.get(consumer)).set(false); + return consumer; + } + + @SuppressWarnings("unchecked") + private static void invokeAdvanceCurrentPositions( + final AbstractSubscriptionConsumer consumer, final List messages) + throws Exception { + final Method method = + AbstractSubscriptionConsumer.class.getDeclaredMethod("advanceCurrentPositions", List.class); + method.setAccessible(true); + method.invoke(consumer, messages); + } + + private static void invokeAdvanceCommittedPositions( + final AbstractSubscriptionConsumer consumer, + final List commitContexts) + throws Exception { + final Method method = + AbstractSubscriptionConsumer.class.getDeclaredMethod( + "advanceCommittedPositions", List.class); + method.setAccessible(true); + method.invoke(consumer, commitContexts); + } + + private static final class TestSubscriptionConsumer extends AbstractSubscriptionConsumer { + + private TestSubscriptionConsumer(final AbstractSubscriptionConsumerBuilder builder) { + super(builder); + } + + @Override + protected AbstractSubscriptionProvider constructSubscriptionProvider( + final TEndPoint endPoint, + final String username, + final String password, + final String consumerId, + final String consumerGroupId, + final int thriftMaxFrameSize, + final long heartbeatIntervalMs, + final int connectionTimeoutInMs) { + throw new UnsupportedOperationException("No provider needed for progress unit tests"); + } + } +} diff --git 
a/iotdb-client/subscription/src/test/java/org/apache/iotdb/session/subscription/consumer/base/AbstractSubscriptionConsumerSeekProgressTest.java b/iotdb-client/subscription/src/test/java/org/apache/iotdb/session/subscription/consumer/base/AbstractSubscriptionConsumerSeekProgressTest.java new file mode 100644 index 0000000000000..97dad18bf9aea --- /dev/null +++ b/iotdb-client/subscription/src/test/java/org/apache/iotdb/session/subscription/consumer/base/AbstractSubscriptionConsumerSeekProgressTest.java @@ -0,0 +1,224 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.iotdb.session.subscription.consumer.base; + +import org.apache.iotdb.common.rpc.thrift.TEndPoint; +import org.apache.iotdb.rpc.subscription.payload.poll.RegionProgress; +import org.apache.iotdb.rpc.subscription.payload.poll.TopicProgress; +import org.apache.iotdb.rpc.subscription.payload.poll.WriterId; +import org.apache.iotdb.rpc.subscription.payload.poll.WriterProgress; + +import org.junit.Test; + +import java.lang.reflect.Field; +import java.lang.reflect.Method; +import java.util.Collections; +import java.util.LinkedHashMap; +import java.util.Map; +import java.util.concurrent.atomic.AtomicBoolean; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNotNull; + +public class AbstractSubscriptionConsumerSeekProgressTest { + + private static final String TOPIC = "topic_seek_progress_test"; + private static final String REGION_A = "1_100"; + private static final String REGION_B = "1_101"; + + @Test + public void testOverlayTopicProgressPreservesMissingRegions() throws Exception { + final TestSubscriptionConsumer consumer = newConsumer(); + + invokeSetCurrentPositions( + consumer, + TOPIC, + buildTopicProgress( + REGION_A, + new WriterId(REGION_A, 1, 1L), + new WriterProgress(100L, 10L), + REGION_B, + new WriterId(REGION_B, 2, 1L), + new WriterProgress(200L, 20L))); + + invokeOverlayCurrentPositions( + consumer, + TOPIC, + new TopicProgress( + Collections.singletonMap( + REGION_A, + new RegionProgress( + Collections.singletonMap( + new WriterId(REGION_A, 1, 1L), new WriterProgress(50L, 5L)))))); + + final TopicProgress positions = consumer.positions(TOPIC); + assertNotNull(positions.getRegionProgress().get(REGION_A)); + assertNotNull(positions.getRegionProgress().get(REGION_B)); + assertEquals( + new WriterProgress(50L, 5L), + positions + .getRegionProgress() + .get(REGION_A) + .getWriterPositions() + .values() + .iterator() + .next()); + assertEquals( + new WriterProgress(200L, 20L), + positions + 
.getRegionProgress() + .get(REGION_B) + .getWriterPositions() + .values() + .iterator() + .next()); + } + + @Test + public void testOverlayTopicProgressAllowsSeekBackwardsForSpecifiedRegion() throws Exception { + final TestSubscriptionConsumer consumer = newConsumer(); + + invokeSetCommittedPositions( + consumer, + TOPIC, + new TopicProgress( + Collections.singletonMap( + REGION_A, + new RegionProgress( + Collections.singletonMap( + new WriterId(REGION_A, 1, 1L), new WriterProgress(100L, 10L)))))); + + invokeOverlayCommittedPositions( + consumer, + TOPIC, + new TopicProgress( + Collections.singletonMap( + REGION_A, + new RegionProgress( + Collections.singletonMap( + new WriterId(REGION_A, 1, 1L), new WriterProgress(80L, 4L)))))); + + assertEquals( + new WriterProgress(80L, 4L), + consumer + .committedPositions(TOPIC) + .getRegionProgress() + .get(REGION_A) + .getWriterPositions() + .values() + .iterator() + .next()); + } + + private static TestSubscriptionConsumer newConsumer() throws Exception { + final TestSubscriptionConsumer consumer = + new TestSubscriptionConsumer( + new AbstractSubscriptionConsumerBuilder() + .consumerId("seek_progress_consumer") + .consumerGroupId("seek_progress_group")); + final Field isClosedField = AbstractSubscriptionConsumer.class.getDeclaredField("isClosed"); + isClosedField.setAccessible(true); + ((AtomicBoolean) isClosedField.get(consumer)).set(false); + return consumer; + } + + private static void invokeSetCurrentPositions( + final AbstractSubscriptionConsumer consumer, + final String topicName, + final TopicProgress topicProgress) + throws Exception { + final Method method = + AbstractSubscriptionConsumer.class.getDeclaredMethod( + "setCurrentPositions", String.class, TopicProgress.class); + method.setAccessible(true); + method.invoke(consumer, topicName, topicProgress); + } + + private static void invokeSetCommittedPositions( + final AbstractSubscriptionConsumer consumer, + final String topicName, + final TopicProgress 
topicProgress) + throws Exception { + final Method method = + AbstractSubscriptionConsumer.class.getDeclaredMethod( + "setCommittedPositions", String.class, TopicProgress.class); + method.setAccessible(true); + method.invoke(consumer, topicName, topicProgress); + } + + private static void invokeOverlayCurrentPositions( + final AbstractSubscriptionConsumer consumer, + final String topicName, + final TopicProgress topicProgress) + throws Exception { + final Method method = + AbstractSubscriptionConsumer.class.getDeclaredMethod( + "overlayCurrentPositions", String.class, TopicProgress.class); + method.setAccessible(true); + method.invoke(consumer, topicName, topicProgress); + } + + private static void invokeOverlayCommittedPositions( + final AbstractSubscriptionConsumer consumer, + final String topicName, + final TopicProgress topicProgress) + throws Exception { + final Method method = + AbstractSubscriptionConsumer.class.getDeclaredMethod( + "overlayCommittedPositions", String.class, TopicProgress.class); + method.setAccessible(true); + method.invoke(consumer, topicName, topicProgress); + } + + private static TopicProgress buildTopicProgress( + final String regionA, + final WriterId writerA, + final WriterProgress writerProgressA, + final String regionB, + final WriterId writerB, + final WriterProgress writerProgressB) { + final Map regionProgress = new LinkedHashMap<>(); + regionProgress.put( + regionA, new RegionProgress(Collections.singletonMap(writerA, writerProgressA))); + regionProgress.put( + regionB, new RegionProgress(Collections.singletonMap(writerB, writerProgressB))); + return new TopicProgress(regionProgress); + } + + private static final class TestSubscriptionConsumer extends AbstractSubscriptionConsumer { + + private TestSubscriptionConsumer(final AbstractSubscriptionConsumerBuilder builder) { + super(builder); + } + + @Override + protected AbstractSubscriptionProvider constructSubscriptionProvider( + final TEndPoint endPoint, + final String username, 
+ final String password, + final String consumerId, + final String consumerGroupId, + final int thriftMaxFrameSize, + final long heartbeatIntervalMs, + final int connectionTimeoutInMs) { + throw new UnsupportedOperationException("No provider needed for seek progress unit tests"); + } + } +} diff --git a/iotdb-client/subscription/src/test/java/org/apache/iotdb/session/subscription/consumer/base/WatermarkProcessorTest.java b/iotdb-client/subscription/src/test/java/org/apache/iotdb/session/subscription/consumer/base/WatermarkProcessorTest.java new file mode 100644 index 0000000000000..613090650bd1a --- /dev/null +++ b/iotdb-client/subscription/src/test/java/org/apache/iotdb/session/subscription/consumer/base/WatermarkProcessorTest.java @@ -0,0 +1,161 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.iotdb.session.subscription.consumer.base; + +import org.apache.iotdb.rpc.subscription.payload.poll.SubscriptionCommitContext; +import org.apache.iotdb.session.subscription.payload.SubscriptionMessage; + +import org.junit.Assert; +import org.junit.Test; + +import java.util.Arrays; +import java.util.Collections; +import java.util.List; + +public class WatermarkProcessorTest { + + private static final String TOPIC = "topic1"; + private static final String GROUP = "group1"; + private static final String REGION_R1 = "R1"; + private static final String REGION_R2 = "R2"; + + private static SubscriptionMessage dataMsg(final String regionId, final int dataNodeId) { + final SubscriptionCommitContext ctx = + new SubscriptionCommitContext(dataNodeId, 0, TOPIC, GROUP, 0L, 0L, regionId, 0L); + return new SubscriptionMessage(ctx, Collections.emptyMap()); + } + + private static SubscriptionMessage watermarkMsg( + final String regionId, final int dataNodeId, final long watermarkTs) { + final SubscriptionCommitContext ctx = + new SubscriptionCommitContext(dataNodeId, 0, TOPIC, GROUP, 0L, 0L, regionId, 0L); + return new SubscriptionMessage(ctx, watermarkTs); + } + + @Test + public void testSingleRegionRelease() { + final WatermarkProcessor proc = new WatermarkProcessor(5, 60_000); + + final List result = + proc.process(Collections.singletonList(watermarkMsg(REGION_R1, 1, 1000))); + + Assert.assertTrue(result.isEmpty()); + Assert.assertEquals(995, proc.getWatermark()); + } + + @Test + public void testTwoRegionsMinWatermark() { + final WatermarkProcessor proc = new WatermarkProcessor(10, 60_000); + + proc.process(Arrays.asList(watermarkMsg(REGION_R1, 1, 2000), watermarkMsg(REGION_R2, 1, 500))); + + Assert.assertEquals(490, proc.getWatermark()); + } + + @Test + public void testWatermarkAdvancesIdleRegion() { + final WatermarkProcessor proc = new WatermarkProcessor(5, 60_000); + + proc.process(Arrays.asList(watermarkMsg(REGION_R1, 1, 2000), 
watermarkMsg(REGION_R2, 1, 500))); + Assert.assertEquals(495, proc.getWatermark()); + + proc.process(Collections.singletonList(watermarkMsg(REGION_R2, 1, 1500))); + Assert.assertEquals(1495, proc.getWatermark()); + + proc.process(Collections.singletonList(watermarkMsg(REGION_R2, 1, 3000))); + Assert.assertEquals(1995, proc.getWatermark()); + } + + @Test + public void testWatermarkEventsNotBuffered() { + final WatermarkProcessor proc = new WatermarkProcessor(5, 60_000); + + proc.process(Collections.singletonList(watermarkMsg(REGION_R1, 1, 1000))); + + Assert.assertEquals(0, proc.getBufferedCount()); + } + + @Test + public void testFlushReleasesAll() { + final WatermarkProcessor proc = new WatermarkProcessor(5, 60_000); + + proc.process(Arrays.asList(dataMsg(REGION_R1, 1), dataMsg(REGION_R1, 1))); + + proc.flush(); + Assert.assertEquals(0, proc.getBufferedCount()); + } + + @Test + public void testWatermarkNoRegression() { + final WatermarkProcessor proc = new WatermarkProcessor(10, 60_000); + + proc.process(Collections.singletonList(watermarkMsg(REGION_R1, 1, 2000))); + Assert.assertEquals(1990, proc.getWatermark()); + + proc.process(Collections.singletonList(watermarkMsg(REGION_R1, 1, 1500))); + Assert.assertEquals(1990, proc.getWatermark()); + } + + @Test + public void testMultipleWatermarksInSingleBatch() { + final WatermarkProcessor proc = new WatermarkProcessor(0, 60_000); + + proc.process( + Arrays.asList( + watermarkMsg(REGION_R1, 1, 100), + watermarkMsg(REGION_R2, 1, 200), + watermarkMsg(REGION_R1, 1, 300))); + + Assert.assertEquals(200, proc.getWatermark()); + } + + @Test + public void testEmptyInput() { + final WatermarkProcessor proc = new WatermarkProcessor(5, 60_000); + + final List result = proc.process(Collections.emptyList()); + Assert.assertTrue(result.isEmpty()); + Assert.assertEquals(Long.MIN_VALUE, proc.getWatermark()); + } + + @Test + public void testThreeRegionsSlowestDeterminesWatermark() { + final WatermarkProcessor proc = new 
WatermarkProcessor(10, 60_000); + + proc.process( + Arrays.asList( + watermarkMsg(REGION_R1, 1, 5000), + watermarkMsg(REGION_R2, 1, 3000), + watermarkMsg("R3", 2, 4000))); + + Assert.assertEquals(2990, proc.getWatermark()); + + proc.process(Collections.singletonList(watermarkMsg(REGION_R2, 1, 6000))); + Assert.assertEquals(3990, proc.getWatermark()); + } + + @Test + public void testZeroOutOfOrderness() { + final WatermarkProcessor proc = new WatermarkProcessor(0, 60_000); + + proc.process(Collections.singletonList(watermarkMsg(REGION_R1, 1, 1000))); + Assert.assertEquals(1000, proc.getWatermark()); + } +} diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/client/async/CnToDnAsyncRequestType.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/client/async/CnToDnAsyncRequestType.java index e5753bf1bd184..7f20f8cbfd03a 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/client/async/CnToDnAsyncRequestType.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/client/async/CnToDnAsyncRequestType.java @@ -79,6 +79,8 @@ public enum CnToDnAsyncRequestType { TOPIC_PUSH_MULTI_META, CONSUMER_GROUP_PUSH_ALL_META, CONSUMER_GROUP_PUSH_SINGLE_META, + PULL_COMMIT_PROGRESS, + SUBSCRIPTION_PUSH_RUNTIME, // TEMPLATE UPDATE_TEMPLATE, diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/client/async/CnToDnInternalServiceAsyncRequestManager.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/client/async/CnToDnInternalServiceAsyncRequestManager.java index cd69f8b2c846d..4faea49d2fb7f 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/client/async/CnToDnInternalServiceAsyncRequestManager.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/client/async/CnToDnInternalServiceAsyncRequestManager.java @@ -47,6 +47,7 @@ import 
org.apache.iotdb.confignode.client.async.handlers.rpc.TreeDeviceViewFieldDetectionHandler; import org.apache.iotdb.confignode.client.async.handlers.rpc.subscription.CheckSchemaRegionUsingTemplateRPCHandler; import org.apache.iotdb.confignode.client.async.handlers.rpc.subscription.ConsumerGroupPushMetaRPCHandler; +import org.apache.iotdb.confignode.client.async.handlers.rpc.subscription.PullCommitProgressRPCHandler; import org.apache.iotdb.confignode.client.async.handlers.rpc.subscription.TopicPushMetaRPCHandler; import org.apache.iotdb.mpp.rpc.thrift.TActiveTriggerInstanceReq; import org.apache.iotdb.mpp.rpc.thrift.TAlterEncodingCompressorReq; @@ -83,6 +84,7 @@ import org.apache.iotdb.mpp.rpc.thrift.TKillQueryInstanceReq; import org.apache.iotdb.mpp.rpc.thrift.TNotifyRegionMigrationReq; import org.apache.iotdb.mpp.rpc.thrift.TPipeHeartbeatReq; +import org.apache.iotdb.mpp.rpc.thrift.TPullCommitProgressReq; import org.apache.iotdb.mpp.rpc.thrift.TPushConsumerGroupMetaReq; import org.apache.iotdb.mpp.rpc.thrift.TPushMultiPipeMetaReq; import org.apache.iotdb.mpp.rpc.thrift.TPushMultiTopicMetaReq; @@ -90,6 +92,7 @@ import org.apache.iotdb.mpp.rpc.thrift.TPushSingleConsumerGroupMetaReq; import org.apache.iotdb.mpp.rpc.thrift.TPushSinglePipeMetaReq; import org.apache.iotdb.mpp.rpc.thrift.TPushSingleTopicMetaReq; +import org.apache.iotdb.mpp.rpc.thrift.TPushSubscriptionRuntimeReq; import org.apache.iotdb.mpp.rpc.thrift.TPushTopicMetaReq; import org.apache.iotdb.mpp.rpc.thrift.TRegionLeaderChangeReq; import org.apache.iotdb.mpp.rpc.thrift.TRegionRouteReq; @@ -224,6 +227,16 @@ protected void initActionMapBuilder() { (req, client, handler) -> client.pushSingleConsumerGroupMeta( (TPushSingleConsumerGroupMetaReq) req, (ConsumerGroupPushMetaRPCHandler) handler)); + actionMapBuilder.put( + CnToDnAsyncRequestType.PULL_COMMIT_PROGRESS, + (req, client, handler) -> + client.pullCommitProgress( + (TPullCommitProgressReq) req, (PullCommitProgressRPCHandler) handler)); + 
actionMapBuilder.put( + CnToDnAsyncRequestType.SUBSCRIPTION_PUSH_RUNTIME, + (req, client, handler) -> + client.pushSubscriptionRuntime( + (TPushSubscriptionRuntimeReq) req, (DataNodeTSStatusRPCHandler) handler)); actionMapBuilder.put( CnToDnAsyncRequestType.PIPE_HEARTBEAT, (req, client, handler) -> diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/client/async/handlers/rpc/DataNodeAsyncRequestRPCHandler.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/client/async/handlers/rpc/DataNodeAsyncRequestRPCHandler.java index b2e2ec3232781..084998aa04825 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/client/async/handlers/rpc/DataNodeAsyncRequestRPCHandler.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/client/async/handlers/rpc/DataNodeAsyncRequestRPCHandler.java @@ -29,12 +29,14 @@ import org.apache.iotdb.confignode.client.async.CnToDnAsyncRequestType; import org.apache.iotdb.confignode.client.async.handlers.rpc.subscription.CheckSchemaRegionUsingTemplateRPCHandler; import org.apache.iotdb.confignode.client.async.handlers.rpc.subscription.ConsumerGroupPushMetaRPCHandler; +import org.apache.iotdb.confignode.client.async.handlers.rpc.subscription.PullCommitProgressRPCHandler; import org.apache.iotdb.confignode.client.async.handlers.rpc.subscription.TopicPushMetaRPCHandler; import org.apache.iotdb.mpp.rpc.thrift.TCheckSchemaRegionUsingTemplateResp; import org.apache.iotdb.mpp.rpc.thrift.TCheckTimeSeriesExistenceResp; import org.apache.iotdb.mpp.rpc.thrift.TCountPathsUsingTemplateResp; import org.apache.iotdb.mpp.rpc.thrift.TDeviceViewResp; import org.apache.iotdb.mpp.rpc.thrift.TFetchSchemaBlackListResp; +import org.apache.iotdb.mpp.rpc.thrift.TPullCommitProgressResp; import org.apache.iotdb.mpp.rpc.thrift.TPushConsumerGroupMetaResp; import org.apache.iotdb.mpp.rpc.thrift.TPushPipeMetaResp; import org.apache.iotdb.mpp.rpc.thrift.TPushTopicMetaResp; @@ -169,6 +171,14 @@ 
public static DataNodeAsyncRequestRPCHandler buildHandler( dataNodeLocationMap, (Map) responseMap, countDownLatch); + case PULL_COMMIT_PROGRESS: + return new PullCommitProgressRPCHandler( + requestType, + requestId, + targetDataNode, + dataNodeLocationMap, + (Map) responseMap, + countDownLatch); case CHANGE_REGION_LEADER: return new TransferLeaderRPCHandler( requestType, diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/client/async/handlers/rpc/DataNodeTSStatusRPCHandler.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/client/async/handlers/rpc/DataNodeTSStatusRPCHandler.java index 7c93f363dd4b8..bd8042071480a 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/client/async/handlers/rpc/DataNodeTSStatusRPCHandler.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/client/async/handlers/rpc/DataNodeTSStatusRPCHandler.java @@ -48,22 +48,19 @@ public DataNodeTSStatusRPCHandler( @Override public void onComplete(TSStatus response) { - // Put response responseMap.put(requestId, response); if (response.getCode() == TSStatusCode.SUCCESS_STATUS.getStatusCode()) { - // Remove only if success nodeLocationMap.remove(requestId); LOGGER.info("Successfully {} on DataNode: {}", requestType, formattedTargetLocation); } else { - LOGGER.error( + logFailure( "Failed to {} on DataNode: {}, response: {}", requestType, formattedTargetLocation, response); } - // Always CountDown countDownLatch.countDown(); } @@ -76,14 +73,21 @@ public void onError(Exception e) { + formattedTargetLocation + ", exception: " + e.getMessage(); - LOGGER.error(errorMsg); + logFailure(errorMsg); responseMap.put( requestId, new TSStatus( RpcUtils.getStatus(TSStatusCode.EXECUTE_STATEMENT_ERROR.getStatusCode(), errorMsg))); - // Always CountDown countDownLatch.countDown(); } + + private void logFailure(final String format, final Object... 
args) { + if (requestType == CnToDnAsyncRequestType.SUBSCRIPTION_PUSH_RUNTIME) { + LOGGER.warn(format, args); + } else { + LOGGER.error(format, args); + } + } } diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/client/async/handlers/rpc/subscription/ConsumerGroupPushMetaRPCHandler.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/client/async/handlers/rpc/subscription/ConsumerGroupPushMetaRPCHandler.java index 2938d4f85b7cd..67ee9f372d747 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/client/async/handlers/rpc/subscription/ConsumerGroupPushMetaRPCHandler.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/client/async/handlers/rpc/subscription/ConsumerGroupPushMetaRPCHandler.java @@ -49,23 +49,19 @@ public ConsumerGroupPushMetaRPCHandler( @Override public void onComplete(TPushConsumerGroupMetaResp response) { - // Put response responseMap.put(requestId, response); if (response.getStatus().getCode() == TSStatusCode.SUCCESS_STATUS.getStatusCode()) { - LOGGER.info("Successfully {} on DataNode: {}", requestType, formattedTargetLocation); + LOGGER.debug("Successfully {} on DataNode: {}", requestType, formattedTargetLocation); } else { - LOGGER.error( + LOGGER.warn( "Failed to {} on DataNode: {}, response: {}", requestType, formattedTargetLocation, response); } - // Always remove to avoid retrying nodeLocationMap.remove(requestId); - - // Always CountDown countDownLatch.countDown(); } @@ -78,14 +74,13 @@ public void onError(Exception e) { + formattedTargetLocation + ", exception: " + e.getMessage(); - LOGGER.error(errorMsg, e); + LOGGER.warn(errorMsg); responseMap.put( requestId, new TPushConsumerGroupMetaResp( RpcUtils.getStatus(TSStatusCode.CONSUMER_PUSH_META_ERROR, errorMsg))); - // Always CountDown countDownLatch.countDown(); } } diff --git 
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.apache.iotdb.confignode.client.async.handlers.rpc.subscription;

import org.apache.iotdb.common.rpc.thrift.TDataNodeLocation;
import org.apache.iotdb.confignode.client.async.CnToDnAsyncRequestType;
import org.apache.iotdb.confignode.client.async.handlers.rpc.DataNodeAsyncRequestRPCHandler;
import org.apache.iotdb.mpp.rpc.thrift.TPullCommitProgressResp;
import org.apache.iotdb.rpc.RpcUtils;
import org.apache.iotdb.rpc.TSStatusCode;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.nio.ByteBuffer;
import java.util.Map;
import java.util.concurrent.CountDownLatch;

/**
 * Async RPC callback for a commit-progress pull sent from the ConfigNode to one DataNode.
 *
 * <p>The response (or a synthesized error response) is stored in {@code responseMap} under the
 * request id, and the shared latch is counted down exactly once per callback. On a successful
 * response every per-region payload is additionally sanity-checked and a warning is logged for
 * payloads that look malformed; the check is diagnostic only and never alters the response.
 */
public class PullCommitProgressRPCHandler
    extends DataNodeAsyncRequestRPCHandler<TPullCommitProgressResp> {

  private static final Logger LOGGER = LoggerFactory.getLogger(PullCommitProgressRPCHandler.class);

  /**
   * Largest leading int a payload may carry before it is reported as suspicious.
   * NOTE(review): presumably the leading int is an element count of the serialized region
   * progress - confirm against the wire format.
   */
  private static final int MAX_PLAUSIBLE_LEADING_INT = 1_000_000;

  /** Maximum number of leading payload bytes included (as hex) in a diagnostic summary. */
  private static final int SUMMARY_SAMPLE_BYTES = 16;

  public PullCommitProgressRPCHandler(
      CnToDnAsyncRequestType requestType,
      int requestId,
      TDataNodeLocation targetDataNode,
      Map<Integer, TDataNodeLocation> dataNodeLocationMap,
      Map<Integer, TPullCommitProgressResp> responseMap,
      CountDownLatch countDownLatch) {
    super(requestType, requestId, targetDataNode, dataNodeLocationMap, responseMap, countDownLatch);
  }

  /**
   * Records the response, logs the outcome, removes the request from {@code nodeLocationMap}
   * regardless of the status code, and counts down the latch.
   */
  @Override
  public void onComplete(TPullCommitProgressResp response) {
    responseMap.put(requestId, response);

    if (response.getStatus().getCode() == TSStatusCode.SUCCESS_STATUS.getStatusCode()) {
      logSuspiciousRegionProgressPayloads(response);
      LOGGER.info("Successfully {} on DataNode: {}", requestType, formattedTargetLocation);
    } else {
      LOGGER.error(
          "Failed to {} on DataNode: {}, response: {}",
          requestType,
          formattedTargetLocation,
          response);
    }

    nodeLocationMap.remove(requestId);
    countDownLatch.countDown();
  }

  /**
   * Stores a synthesized {@code EXECUTE_STATEMENT_ERROR} response describing the failure and
   * counts down the latch. {@code nodeLocationMap} is left untouched on this path.
   */
  @Override
  public void onError(Exception e) {
    String errorMsg =
        "Failed to "
            + requestType
            + " on DataNode: "
            + formattedTargetLocation
            + ", exception: "
            + e.getMessage();
    LOGGER.error(errorMsg, e);

    responseMap.put(
        requestId,
        new TPullCommitProgressResp(
            RpcUtils.getStatus(TSStatusCode.EXECUTE_STATEMENT_ERROR, errorMsg)));

    countDownLatch.countDown();
  }

  /** Logs a warning for every region-progress payload in the response that looks malformed. */
  private void logSuspiciousRegionProgressPayloads(final TPullCommitProgressResp response) {
    if (response == null || !response.isSetCommitRegionProgress()) {
      return;
    }
    for (final Map.Entry<String, ByteBuffer> entry :
        response.getCommitRegionProgress().entrySet()) {
      if (isSuspiciousRegionProgressPayload(entry.getValue())) {
        LOGGER.warn(
            "PULL_COMMIT_PROGRESS confignode recv suspicious payload from DataNode {}, key={}, summary={}",
            formattedTargetLocation,
            entry.getKey(),
            summarizeRegionProgressPayload(entry.getValue()));
      }
    }
  }

  /**
   * Returns {@code true} when the payload is {@code null}, has fewer than four readable bytes, or
   * starts with an int that is negative or above {@link #MAX_PLAUSIBLE_LEADING_INT}.
   */
  private boolean isSuspiciousRegionProgressPayload(final ByteBuffer buffer) {
    if (buffer == null) {
      return true;
    }
    // slice() yields an independent view over the readable bytes, so the caller's position and
    // limit are never disturbed by the inspection.
    final ByteBuffer view = buffer.slice();
    if (view.remaining() < Integer.BYTES) {
      return true;
    }
    final int leadingInt = view.getInt(0);
    return leadingInt < 0 || leadingInt > MAX_PLAUSIBLE_LEADING_INT;
  }

  /**
   * Builds a one-line diagnostic description of the payload: buffer geometry, the leading int
   * (decimal and hex) when at least four bytes are readable, and up to
   * {@link #SUMMARY_SAMPLE_BYTES} leading bytes in hex.
   */
  private String summarizeRegionProgressPayload(final ByteBuffer buffer) {
    if (buffer == null) {
      return "null";
    }
    final ByteBuffer view = buffer.slice();
    final int remaining = view.remaining();

    final String leadingIntSummary;
    if (remaining >= Integer.BYTES) {
      // Absolute read: leaves the view positioned at 0 for the byte sample below.
      final int leadingInt = view.getInt(0);
      leadingIntSummary = leadingInt + "(0x" + String.format("%08x", leadingInt) + ")";
    } else {
      leadingIntSummary = "n/a";
    }

    final byte[] sample = new byte[Math.min(SUMMARY_SAMPLE_BYTES, remaining)];
    view.get(sample, 0, sample.length);

    return "pos="
        + buffer.position()
        + ", limit="
        + buffer.limit()
        + ", capacity="
        + buffer.capacity()
        + ", remaining="
        + remaining
        + ", firstInt="
        + leadingIntSummary
        + ", firstBytes="
        + bytesToHex(sample);
  }

  /** Renders the bytes as lowercase, zero-padded hex without separators. */
  private String bytesToHex(final byte[] bytes) {
    if (bytes == null || bytes.length == 0) {
      return "";
    }
    final StringBuilder hex = new StringBuilder(bytes.length * 2);
    for (final byte b : bytes) {
      hex.append(Character.forDigit((b >> 4) & 0xF, 16));
      hex.append(Character.forDigit(b & 0xF, 16));
    }
    return hex.toString();
  }
}
b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/consensus/request/ConfigPhysicalPlan.java index 7fd7cd029119a..662e5d4d445cb 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/consensus/request/ConfigPhysicalPlan.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/consensus/request/ConfigPhysicalPlan.java @@ -87,6 +87,7 @@ import org.apache.iotdb.confignode.consensus.request.write.region.PollRegionMaintainTaskPlan; import org.apache.iotdb.confignode.consensus.request.write.region.PollSpecificRegionMaintainTaskPlan; import org.apache.iotdb.confignode.consensus.request.write.subscription.consumer.AlterConsumerGroupPlan; +import org.apache.iotdb.confignode.consensus.request.write.subscription.consumer.runtime.CommitProgressHandleMetaChangePlan; import org.apache.iotdb.confignode.consensus.request.write.subscription.consumer.runtime.ConsumerGroupHandleMetaChangePlan; import org.apache.iotdb.confignode.consensus.request.write.subscription.topic.AlterMultipleTopicsPlan; import org.apache.iotdb.confignode.consensus.request.write.subscription.topic.AlterTopicPlan; @@ -538,6 +539,9 @@ public static ConfigPhysicalPlan create(final ByteBuffer buffer) throws IOExcept case ConsumerGroupHandleMetaChange: plan = new ConsumerGroupHandleMetaChangePlan(); break; + case CommitProgressHandleMetaChange: + plan = new CommitProgressHandleMetaChangePlan(); + break; case PipeUnsetTemplate: plan = new PipeUnsetSchemaTemplatePlan(); break; diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/consensus/request/ConfigPhysicalPlanType.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/consensus/request/ConfigPhysicalPlanType.java index 371435c9175bb..872ef0596d3c6 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/consensus/request/ConfigPhysicalPlanType.java +++ 
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.apache.iotdb.confignode.consensus.request.write.subscription.consumer.runtime;

import org.apache.iotdb.commons.subscription.meta.consumer.CommitProgressKeeper;
import org.apache.iotdb.confignode.consensus.request.ConfigPhysicalPlan;
import org.apache.iotdb.confignode.consensus.request.ConfigPhysicalPlanType;

import java.io.DataOutputStream;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.charset.StandardCharsets;
import java.util.HashMap;
import java.util.Map;
import java.util.Objects;

/**
 * Consensus plan for handling per-region commit progress meta changes.
 *
 * <p>The plan carries a map from a progress key to an opaque serialized progress payload. On the
 * wire it is written as: plan type (short), entry count (int), then for each entry the UTF-8 key
 * and the payload bytes, each prefixed with its length (int).
 */
public class CommitProgressHandleMetaChangePlan extends ConfigPhysicalPlan {

  private Map<String, ByteBuffer> regionProgressMap = new HashMap<>();

  /** Creates an empty plan; used when the plan is about to be populated by deserialization. */
  public CommitProgressHandleMetaChangePlan() {
    super(ConfigPhysicalPlanType.CommitProgressHandleMetaChange);
  }

  /**
   * Creates a plan over the given map. The map is stored by reference, not copied.
   *
   * @param regionProgressMap progress key to serialized progress payload
   */
  public CommitProgressHandleMetaChangePlan(final Map<String, ByteBuffer> regionProgressMap) {
    super(ConfigPhysicalPlanType.CommitProgressHandleMetaChange);
    this.regionProgressMap = regionProgressMap;
  }

  public Map<String, ByteBuffer> getRegionProgressMap() {
    return regionProgressMap;
  }

  @Override
  protected void serializeImpl(final DataOutputStream stream) throws IOException {
    stream.writeShort(getType().getPlanType());
    stream.writeInt(regionProgressMap.size());
    for (final Map.Entry<String, ByteBuffer> entry : regionProgressMap.entrySet()) {
      final byte[] keyBytes = entry.getKey().getBytes(StandardCharsets.UTF_8);
      // Work on a read-only view rewound to 0 so the stored buffer's own position is untouched
      // and everything from index 0 up to the limit is written, wherever its position currently is.
      final ByteBuffer valueBuffer = entry.getValue().asReadOnlyBuffer();
      valueBuffer.rewind();
      final byte[] valueBytes = new byte[valueBuffer.remaining()];
      valueBuffer.get(valueBytes);
      stream.writeInt(keyBytes.length);
      stream.write(keyBytes);
      stream.writeInt(valueBytes.length);
      stream.write(valueBytes);
    }
  }

  @Override
  protected void deserializeImpl(final ByteBuffer buffer) throws IOException {
    // NOTE(review): assumes the keeper's reader matches the entry layout written by
    // serializeImpl (count, then length-prefixed key and value) - confirm in CommitProgressKeeper.
    regionProgressMap = CommitProgressKeeper.deserializeRegionProgressFromBuffer(buffer);
  }

  @Override
  public boolean equals(final Object obj) {
    if (this == obj) {
      return true;
    }
    if (obj == null || getClass() != obj.getClass()) {
      return false;
    }
    final CommitProgressHandleMetaChangePlan that = (CommitProgressHandleMetaChangePlan) obj;
    return Objects.equals(this.regionProgressMap, that.regionProgressMap);
  }

  @Override
  public int hashCode() {
    return Objects.hash(regionProgressMap);
  }
}
java.io.DataOutputStream; import java.io.File; import java.io.IOException; import java.net.URL; @@ -276,8 +283,10 @@ import java.util.HashMap; import java.util.HashSet; import java.util.Iterator; +import java.util.LinkedHashMap; import java.util.List; import java.util.Map; +import java.util.Objects; import java.util.Set; import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicBoolean; @@ -2508,6 +2517,83 @@ public TGetAllSubscriptionInfoResp getAllSubscriptionInfo() { : new TGetAllSubscriptionInfoResp(status, Collections.emptyList()); } + public TGetCommitProgressResp getCommitProgress(TGetCommitProgressReq req) { + TSStatus status = confirmLeader(); + if (status.getCode() != TSStatusCode.SUCCESS_STATUS.getStatusCode()) { + return new TGetCommitProgressResp(status); + } + final String key = + req.getConsumerGroupId() + + "##" + + req.getTopicName() + + "##" + + req.getRegionId() + + "##" + + req.getDataNodeId(); + final String keyPrefix = + req.getConsumerGroupId() + "##" + req.getTopicName() + "##" + req.getRegionId() + "##"; + final org.apache.iotdb.commons.subscription.meta.consumer.CommitProgressKeeper keeper = + subscriptionManager + .getSubscriptionCoordinator() + .getSubscriptionInfo() + .getCommitProgressKeeper(); + final Map mergedWriterPositions = new LinkedHashMap<>(); + + for (final Map.Entry entry : keeper.getAllRegionProgress().entrySet()) { + if (!entry.getKey().startsWith(keyPrefix)) { + continue; + } + final RegionProgress regionProgress = deserializeRegionProgress(entry.getValue()); + if (Objects.isNull(regionProgress)) { + continue; + } + for (final Map.Entry writerEntry : + regionProgress.getWriterPositions().entrySet()) { + mergedWriterPositions.merge( + writerEntry.getKey(), + writerEntry.getValue(), + (oldProgress, newProgress) -> + compareWriterProgress(newProgress, oldProgress) > 0 ? 
newProgress : oldProgress); + } + } + final TGetCommitProgressResp resp = + new TGetCommitProgressResp(new TSStatus(TSStatusCode.SUCCESS_STATUS.getStatusCode())); + if (!mergedWriterPositions.isEmpty()) { + resp.setCommittedRegionProgress( + serializeRegionProgress(new RegionProgress(mergedWriterPositions))); + } + return resp; + } + + private static RegionProgress deserializeRegionProgress(final ByteBuffer buffer) { + if (Objects.isNull(buffer)) { + return null; + } + final ByteBuffer duplicate = buffer.asReadOnlyBuffer(); + duplicate.rewind(); + return RegionProgress.deserialize(duplicate); + } + + private static ByteBuffer serializeRegionProgress(final RegionProgress regionProgress) { + try (final ByteArrayOutputStream baos = new ByteArrayOutputStream(); + final DataOutputStream dos = new DataOutputStream(baos)) { + regionProgress.serialize(dos); + dos.flush(); + return ByteBuffer.wrap(baos.toByteArray()).asReadOnlyBuffer(); + } catch (final IOException e) { + throw new RuntimeException("Failed to serialize region progress " + regionProgress, e); + } + } + + private static int compareWriterProgress( + final WriterProgress leftProgress, final WriterProgress rightProgress) { + int cmp = Long.compare(leftProgress.getPhysicalTime(), rightProgress.getPhysicalTime()); + if (cmp != 0) { + return cmp; + } + return Long.compare(leftProgress.getLocalSeq(), rightProgress.getLocalSeq()); + } + @Override public TPipeConfigTransferResp handleTransferConfigPlan(TPipeConfigTransferReq req) { TSStatus status = confirmLeader(); diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/manager/ProcedureManager.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/manager/ProcedureManager.java index 499c2fe30d3f0..cdc59261060f5 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/manager/ProcedureManager.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/manager/ProcedureManager.java @@ -114,7 
+114,9 @@ import org.apache.iotdb.confignode.procedure.impl.schema.table.view.SetViewPropertiesProcedure; import org.apache.iotdb.confignode.procedure.impl.subscription.consumer.CreateConsumerProcedure; import org.apache.iotdb.confignode.procedure.impl.subscription.consumer.DropConsumerProcedure; +import org.apache.iotdb.confignode.procedure.impl.subscription.consumer.runtime.CommitProgressSyncProcedure; import org.apache.iotdb.confignode.procedure.impl.subscription.consumer.runtime.ConsumerGroupMetaSyncProcedure; +import org.apache.iotdb.confignode.procedure.impl.subscription.runtime.SubscriptionHandleLeaderChangeProcedure; import org.apache.iotdb.confignode.procedure.impl.subscription.subscription.CreateSubscriptionProcedure; import org.apache.iotdb.confignode.procedure.impl.subscription.subscription.DropSubscriptionProcedure; import org.apache.iotdb.confignode.procedure.impl.subscription.topic.CreateTopicProcedure; @@ -1665,6 +1667,21 @@ public void pipeHandleLeaderChange( } } + public void subscriptionHandleLeaderChange( + Map> regionGroupToOldAndNewLeaderPairMap, + long runtimeVersion) { + try { + final long procedureId = + executor.submitProcedure( + new SubscriptionHandleLeaderChangeProcedure( + regionGroupToOldAndNewLeaderPairMap, runtimeVersion)); + LOGGER.info( + "SubscriptionHandleLeaderChangeProcedure was submitted, procedureId: {}.", procedureId); + } catch (Exception e) { + LOGGER.warn("SubscriptionHandleLeaderChangeProcedure was failed to submit.", e); + } + } + public void pipeHandleMetaChange( boolean needWriteConsensusOnConfigNodes, boolean needPushPipeMetaToDataNodes) { try { @@ -1814,6 +1831,23 @@ public TSStatus consumerGroupMetaSync() { } } + public TSStatus commitProgressSync() { + try { + CommitProgressSyncProcedure procedure = new CommitProgressSyncProcedure(); + executor.submitProcedure(procedure); + TSStatus status = waitingProcedureFinished(procedure); + if (status.getCode() == TSStatusCode.SUCCESS_STATUS.getStatusCode()) { + return 
status; + } else { + return new TSStatus(TSStatusCode.CONSUMER_PUSH_META_ERROR.getStatusCode()) + .setMessage(wrapTimeoutMessageForPipeProcedure(status.getMessage())); + } + } catch (Exception e) { + return new TSStatus(TSStatusCode.CONSUMER_PUSH_META_ERROR.getStatusCode()) + .setMessage(e.getMessage()); + } + } + public TSStatus createSubscription(TSubscribeReq req) { try { CreateSubscriptionProcedure procedure = new CreateSubscriptionProcedure(req); diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/manager/load/LoadManager.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/manager/load/LoadManager.java index 993bfc0e40066..55d9417f30a2b 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/manager/load/LoadManager.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/manager/load/LoadManager.java @@ -88,6 +88,8 @@ public LoadManager(IManager configManager) { this.topologyService = new TopologyService(configManager, loadCache::updateTopology); this.eventService = new EventService(loadCache); this.eventService.register(configManager.getPipeManager().getPipeRuntimeCoordinator()); + this.eventService.register( + configManager.getSubscriptionManager().getSubscriptionLeaderChangeHandler()); this.eventService.register(routeBalancer); this.eventService.register(topologyService); } diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/manager/subscription/SubscriptionManager.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/manager/subscription/SubscriptionManager.java index 1080b067fae82..ff06e20cf2dc7 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/manager/subscription/SubscriptionManager.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/manager/subscription/SubscriptionManager.java @@ -20,17 +20,32 @@ package org.apache.iotdb.confignode.manager.subscription; import 
org.apache.iotdb.confignode.manager.ConfigManager; +import org.apache.iotdb.confignode.manager.subscription.runtime.SubscriptionLeaderChangeHandler; +import org.apache.iotdb.confignode.manager.subscription.runtime.SubscriptionRuntimeCoordinator; import org.apache.iotdb.confignode.persistence.subscription.SubscriptionInfo; public class SubscriptionManager { private final SubscriptionCoordinator subscriptionCoordinator; + private final SubscriptionRuntimeCoordinator subscriptionRuntimeCoordinator; + private final SubscriptionLeaderChangeHandler subscriptionLeaderChangeHandler; public SubscriptionManager(ConfigManager configManager, SubscriptionInfo subscriptionInfo) { this.subscriptionCoordinator = new SubscriptionCoordinator(configManager, subscriptionInfo); + this.subscriptionRuntimeCoordinator = new SubscriptionRuntimeCoordinator(configManager); + this.subscriptionLeaderChangeHandler = + new SubscriptionLeaderChangeHandler(subscriptionRuntimeCoordinator); } public SubscriptionCoordinator getSubscriptionCoordinator() { return subscriptionCoordinator; } + + public SubscriptionRuntimeCoordinator getSubscriptionRuntimeCoordinator() { + return subscriptionRuntimeCoordinator; + } + + public SubscriptionLeaderChangeHandler getSubscriptionLeaderChangeHandler() { + return subscriptionLeaderChangeHandler; + } } diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/manager/subscription/SubscriptionMetaSyncer.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/manager/subscription/SubscriptionMetaSyncer.java index de49987e13fbe..4931a2948fc61 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/manager/subscription/SubscriptionMetaSyncer.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/manager/subscription/SubscriptionMetaSyncer.java @@ -106,6 +106,13 @@ private synchronized void sync() { return; } + // sync commit progress if syncing consumer group meta successfully + final TSStatus 
commitProgressSyncStatus = procedureManager.commitProgressSync(); + if (commitProgressSyncStatus.getCode() != TSStatusCode.SUCCESS_STATUS.getStatusCode()) { + LOGGER.warn("Failed to sync commit progress. Result status: {}.", commitProgressSyncStatus); + return; + } + LOGGER.info( "After this successful sync, if SubscriptionInfo is empty during this sync and has not been modified afterwards, all subsequent syncs will be skipped"); isLastSubscriptionSyncSuccessful = true; diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/manager/subscription/runtime/SubscriptionLeaderChangeHandler.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/manager/subscription/runtime/SubscriptionLeaderChangeHandler.java new file mode 100644 index 0000000000000..58cae4c8c2173 --- /dev/null +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/manager/subscription/runtime/SubscriptionLeaderChangeHandler.java @@ -0,0 +1,24 @@ +package org.apache.iotdb.confignode.manager.subscription.runtime; + +import org.apache.iotdb.confignode.manager.load.subscriber.ConsensusGroupStatisticsChangeEvent; +import org.apache.iotdb.confignode.manager.load.subscriber.IClusterStatusSubscriber; +import org.apache.iotdb.confignode.manager.load.subscriber.NodeStatisticsChangeEvent; + +public class SubscriptionLeaderChangeHandler implements IClusterStatusSubscriber { + + private final SubscriptionRuntimeCoordinator runtimeCoordinator; + + public SubscriptionLeaderChangeHandler(final SubscriptionRuntimeCoordinator runtimeCoordinator) { + this.runtimeCoordinator = runtimeCoordinator; + } + + @Override + public void onNodeStatisticsChanged(final NodeStatisticsChangeEvent event) { + runtimeCoordinator.handleNodeStatisticsChange(event); + } + + @Override + public void onConsensusGroupStatisticsChanged(final ConsensusGroupStatisticsChangeEvent event) { + runtimeCoordinator.handleLeaderChangeEvent(event); + } +} diff --git 
a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/manager/subscription/runtime/SubscriptionRuntimeCoordinator.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/manager/subscription/runtime/SubscriptionRuntimeCoordinator.java new file mode 100644 index 0000000000000..ec3799118fc0e --- /dev/null +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/manager/subscription/runtime/SubscriptionRuntimeCoordinator.java @@ -0,0 +1,149 @@ +package org.apache.iotdb.confignode.manager.subscription.runtime; + +import org.apache.iotdb.common.rpc.thrift.TConsensusGroupId; +import org.apache.iotdb.common.rpc.thrift.TConsensusGroupType; +import org.apache.iotdb.commons.cluster.NodeStatus; +import org.apache.iotdb.commons.subscription.meta.topic.TopicMeta; +import org.apache.iotdb.confignode.manager.ConfigManager; +import org.apache.iotdb.confignode.manager.load.cache.node.NodeStatistics; +import org.apache.iotdb.confignode.manager.load.subscriber.ConsensusGroupStatisticsChangeEvent; +import org.apache.iotdb.confignode.manager.load.subscriber.NodeStatisticsChangeEvent; +import org.apache.iotdb.rpc.subscription.config.TopicConstant; + +import org.apache.tsfile.utils.Pair; + +import java.util.HashMap; +import java.util.Map; +import java.util.concurrent.atomic.AtomicLong; + +public class SubscriptionRuntimeCoordinator { + + private final ConfigManager configManager; + private final Map> regionGroupToRuntimeLeaderPairMap = + new HashMap<>(); + private final AtomicLong runtimeVersionGenerator = new AtomicLong(System.currentTimeMillis()); + + public SubscriptionRuntimeCoordinator(final ConfigManager configManager) { + this.configManager = configManager; + } + + public synchronized void handleLeaderChangeEvent( + final ConsensusGroupStatisticsChangeEvent event) { + if (!hasAnyConsensusBasedTopic()) { + return; + } + + final Map> refreshMap = new HashMap<>(); + event + .getDifferentConsensusGroupStatisticsMap() + .forEach( + 
(regionGroupId, pair) -> { + if (regionGroupId.getType() != TConsensusGroupType.DataRegion) { + return; + } + final int oldLeaderNodeId = pair.left == null ? -1 : pair.left.getLeaderId(); + final int newLeaderNodeId = pair.right == null ? -1 : pair.right.getLeaderId(); + if (oldLeaderNodeId == newLeaderNodeId) { + return; + } + updateRuntimeLeaderPair(regionGroupId, oldLeaderNodeId, newLeaderNodeId, refreshMap); + }); + + submitRuntimeRefresh(refreshMap); + } + + public synchronized void handleNodeStatisticsChange(final NodeStatisticsChangeEvent event) { + if (!hasAnyConsensusBasedTopic()) { + return; + } + + final boolean shouldRefreshRuntime = + event.getDifferentNodeStatisticsMap().values().stream() + .anyMatch( + pair -> { + final NodeStatus oldStatus = getNodeStatus(pair.getLeft()); + final NodeStatus newStatus = getNodeStatus(pair.getRight()); + return oldStatus != newStatus + && (isRuntimeSensitiveStatus(oldStatus) + || isRuntimeSensitiveStatus(newStatus)); + }); + if (!shouldRefreshRuntime) { + return; + } + + seedRuntimeLeaderPairsFromCurrentLeaders(); + submitRuntimeRefresh(new HashMap<>(regionGroupToRuntimeLeaderPairMap)); + } + + public boolean hasAnyConsensusBasedTopic() { + for (final TopicMeta topicMeta : + configManager + .getSubscriptionManager() + .getSubscriptionCoordinator() + .getSubscriptionInfo() + .getAllTopicMeta()) { + final String topicMode = + topicMeta + .getConfig() + .getStringOrDefault(TopicConstant.MODE_KEY, TopicConstant.MODE_DEFAULT_VALUE); + final String topicFormat = + topicMeta + .getConfig() + .getStringOrDefault(TopicConstant.FORMAT_KEY, TopicConstant.FORMAT_DEFAULT_VALUE); + if (TopicConstant.MODE_LIVE_VALUE.equalsIgnoreCase(topicMode) + && !TopicConstant.FORMAT_TS_FILE_HANDLER_VALUE.equalsIgnoreCase(topicFormat)) { + return true; + } + } + return false; + } + + private void updateRuntimeLeaderPair( + final TConsensusGroupId regionGroupId, + final int oldLeaderNodeId, + final int newLeaderNodeId, + final Map> refreshMap) { + 
if (newLeaderNodeId < 0) { + regionGroupToRuntimeLeaderPairMap.remove(regionGroupId); + return; + } + final Pair runtimeLeaderPair = new Pair<>(oldLeaderNodeId, newLeaderNodeId); + regionGroupToRuntimeLeaderPairMap.put(regionGroupId, runtimeLeaderPair); + refreshMap.put(regionGroupId, runtimeLeaderPair); + } + + private void seedRuntimeLeaderPairsFromCurrentLeaders() { + configManager + .getLoadManager() + .getRegionLeaderMap() + .forEach( + (regionGroupId, leaderId) -> { + if (regionGroupId.getType() == TConsensusGroupType.DataRegion && leaderId >= 0) { + regionGroupToRuntimeLeaderPairMap.putIfAbsent( + regionGroupId, new Pair<>(-1, leaderId)); + } + }); + } + + private void submitRuntimeRefresh( + final Map> regionGroupToOldAndNewLeaderPairMap) { + if (regionGroupToOldAndNewLeaderPairMap.isEmpty()) { + return; + } + configManager + .getProcedureManager() + .subscriptionHandleLeaderChange( + regionGroupToOldAndNewLeaderPairMap, + runtimeVersionGenerator.updateAndGet( + currentRuntimeVersion -> + Math.max(currentRuntimeVersion + 1, System.currentTimeMillis()))); + } + + private static NodeStatus getNodeStatus(final NodeStatistics statistics) { + return statistics == null ? 
NodeStatus.Unknown : statistics.getStatus(); + } + + private static boolean isRuntimeSensitiveStatus(final NodeStatus status) { + return status == NodeStatus.Unknown || status == NodeStatus.Removing; + } +} diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/persistence/executor/ConfigPlanExecutor.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/persistence/executor/ConfigPlanExecutor.java index 8016690d17c9a..1d232ec87a364 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/persistence/executor/ConfigPlanExecutor.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/persistence/executor/ConfigPlanExecutor.java @@ -111,6 +111,7 @@ import org.apache.iotdb.confignode.consensus.request.write.region.OfferRegionMaintainTasksPlan; import org.apache.iotdb.confignode.consensus.request.write.region.PollSpecificRegionMaintainTaskPlan; import org.apache.iotdb.confignode.consensus.request.write.subscription.consumer.AlterConsumerGroupPlan; +import org.apache.iotdb.confignode.consensus.request.write.subscription.consumer.runtime.CommitProgressHandleMetaChangePlan; import org.apache.iotdb.confignode.consensus.request.write.subscription.consumer.runtime.ConsumerGroupHandleMetaChangePlan; import org.apache.iotdb.confignode.consensus.request.write.subscription.topic.AlterMultipleTopicsPlan; import org.apache.iotdb.confignode.consensus.request.write.subscription.topic.AlterTopicPlan; @@ -636,6 +637,9 @@ public TSStatus executeNonQueryPlan(ConfigPhysicalPlan physicalPlan) case ConsumerGroupHandleMetaChange: return subscriptionInfo.handleConsumerGroupMetaChanges( (ConsumerGroupHandleMetaChangePlan) physicalPlan); + case CommitProgressHandleMetaChange: + return subscriptionInfo.handleCommitProgressChanges( + (CommitProgressHandleMetaChangePlan) physicalPlan); case AlterConsumerGroup: return subscriptionInfo.alterConsumerGroup((AlterConsumerGroupPlan) physicalPlan); case TopicHandleMetaChange: 
diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/persistence/subscription/SubscriptionInfo.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/persistence/subscription/SubscriptionInfo.java index 0c262655156d3..2ce4805dc14f5 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/persistence/subscription/SubscriptionInfo.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/persistence/subscription/SubscriptionInfo.java @@ -21,12 +21,14 @@ import org.apache.iotdb.common.rpc.thrift.TSStatus; import org.apache.iotdb.commons.snapshot.SnapshotProcessor; +import org.apache.iotdb.commons.subscription.meta.consumer.CommitProgressKeeper; import org.apache.iotdb.commons.subscription.meta.consumer.ConsumerGroupMeta; import org.apache.iotdb.commons.subscription.meta.consumer.ConsumerGroupMetaKeeper; import org.apache.iotdb.commons.subscription.meta.subscription.SubscriptionMeta; import org.apache.iotdb.commons.subscription.meta.topic.TopicMeta; import org.apache.iotdb.commons.subscription.meta.topic.TopicMetaKeeper; import org.apache.iotdb.confignode.consensus.request.write.subscription.consumer.AlterConsumerGroupPlan; +import org.apache.iotdb.confignode.consensus.request.write.subscription.consumer.runtime.CommitProgressHandleMetaChangePlan; import org.apache.iotdb.confignode.consensus.request.write.subscription.consumer.runtime.ConsumerGroupHandleMetaChangePlan; import org.apache.iotdb.confignode.consensus.request.write.subscription.topic.AlterMultipleTopicsPlan; import org.apache.iotdb.confignode.consensus.request.write.subscription.topic.AlterTopicPlan; @@ -42,6 +44,8 @@ import org.apache.iotdb.confignode.rpc.thrift.TUnsubscribeReq; import org.apache.iotdb.consensus.common.DataSet; import org.apache.iotdb.rpc.TSStatusCode; +import org.apache.iotdb.rpc.subscription.config.TopicConfig; +import org.apache.iotdb.rpc.subscription.config.TopicConstant; import 
org.apache.iotdb.rpc.subscription.exception.SubscriptionException; import org.apache.thrift.annotation.Nullable; @@ -54,7 +58,9 @@ import java.io.FileOutputStream; import java.io.IOException; import java.util.ArrayList; +import java.util.Collections; import java.util.List; +import java.util.Map; import java.util.Objects; import java.util.Optional; import java.util.Set; @@ -72,6 +78,7 @@ public class SubscriptionInfo implements SnapshotProcessor { private final TopicMetaKeeper topicMetaKeeper; private final ConsumerGroupMetaKeeper consumerGroupMetaKeeper; + private final CommitProgressKeeper commitProgressKeeper; private final ReentrantReadWriteLock subscriptionInfoLock = new ReentrantReadWriteLock(true); @@ -81,6 +88,7 @@ public class SubscriptionInfo implements SnapshotProcessor { public SubscriptionInfo() { this.topicMetaKeeper = new TopicMetaKeeper(); this.consumerGroupMetaKeeper = new ConsumerGroupMetaKeeper(); + this.commitProgressKeeper = new CommitProgressKeeper(); this.subscriptionInfoVersion = new SubscriptionInfoVersion(); } @@ -158,6 +166,8 @@ public boolean validateBeforeCreatingTopic(TCreateTopicReq createTopicReq) private boolean checkBeforeCreateTopicInternal(TCreateTopicReq createTopicReq) throws SubscriptionException { + validateTopicConfig(new TopicConfig(safeTopicAttributes(createTopicReq.getTopicAttributes()))); + if (!isTopicExisted(createTopicReq.getTopicName())) { return true; } @@ -247,6 +257,8 @@ public void validateBeforeAlteringTopic(TopicMeta topicMeta) throws Subscription } private void checkBeforeAlteringTopicInternal(TopicMeta topicMeta) throws SubscriptionException { + validateTopicConfig(topicMeta.getConfig()); + if (isTopicExisted(topicMeta.getTopicName())) { return; } @@ -258,6 +270,28 @@ private void checkBeforeAlteringTopicInternal(TopicMeta topicMeta) throws Subscr throw new SubscriptionException(exceptionMessage); } + private Map safeTopicAttributes(@Nullable final Map attributes) { + return Objects.nonNull(attributes) ? 
attributes : Collections.emptyMap(); + } + + private void validateTopicConfig(final TopicConfig topicConfig) throws SubscriptionException { + final String orderMode = topicConfig.getOrderMode(); + if (TopicConfig.isValidOrderMode(orderMode)) { + return; + } + + final String exceptionMessage = + String.format( + "Failed to create or alter topic, unsupported %s=%s, expected one of [%s, %s, %s]", + TopicConstant.ORDER_MODE_KEY, + orderMode, + TopicConstant.ORDER_MODE_LEADER_ONLY_VALUE, + TopicConstant.ORDER_MODE_MULTI_WRITER_VALUE, + TopicConstant.ORDER_MODE_PER_WRITER_VALUE); + LOGGER.warn(exceptionMessage); + throw new SubscriptionException(exceptionMessage); + } + public boolean isTopicExisted(String topicName) { acquireReadLock(); try { @@ -566,6 +600,21 @@ public TSStatus handleConsumerGroupMetaChanges(ConsumerGroupHandleMetaChangePlan } } + public TSStatus handleCommitProgressChanges(CommitProgressHandleMetaChangePlan plan) { + acquireWriteLock(); + try { + LOGGER.info("Handling commit progress meta changes ..."); + commitProgressKeeper.replaceAll(plan.getRegionProgressMap()); + return new TSStatus(TSStatusCode.SUCCESS_STATUS.getStatusCode()); + } finally { + releaseWriteLock(); + } + } + + public CommitProgressKeeper getCommitProgressKeeper() { + return commitProgressKeeper; + } + ///////////////////////////////// Subscription ///////////////////////////////// public void validateBeforeSubscribe(TSubscribeReq subscribeReq) throws SubscriptionException { @@ -740,6 +789,7 @@ public boolean processTakeSnapshot(File snapshotDir) throws IOException { try (final FileOutputStream fileOutputStream = new FileOutputStream(snapshotFile)) { topicMetaKeeper.processTakeSnapshot(fileOutputStream); consumerGroupMetaKeeper.processTakeSnapshot(fileOutputStream); + commitProgressKeeper.processTakeSnapshot(fileOutputStream); fileOutputStream.getFD().sync(); } @@ -764,6 +814,7 @@ public void processLoadSnapshot(File snapshotDir) throws IOException { try (final FileInputStream 
fileInputStream = new FileInputStream(snapshotFile)) { topicMetaKeeper.processLoadSnapshot(fileInputStream); consumerGroupMetaKeeper.processLoadSnapshot(fileInputStream); + commitProgressKeeper.processLoadSnapshot(fileInputStream); } } finally { releaseWriteLock(); diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/env/ConfigNodeProcedureEnv.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/env/ConfigNodeProcedureEnv.java index 960d0a7977f51..d271d5ef33b9c 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/env/ConfigNodeProcedureEnv.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/env/ConfigNodeProcedureEnv.java @@ -70,6 +70,8 @@ import org.apache.iotdb.mpp.rpc.thrift.TInactiveTriggerInstanceReq; import org.apache.iotdb.mpp.rpc.thrift.TInvalidateCacheReq; import org.apache.iotdb.mpp.rpc.thrift.TNotifyRegionMigrationReq; +import org.apache.iotdb.mpp.rpc.thrift.TPullCommitProgressReq; +import org.apache.iotdb.mpp.rpc.thrift.TPullCommitProgressResp; import org.apache.iotdb.mpp.rpc.thrift.TPushConsumerGroupMetaReq; import org.apache.iotdb.mpp.rpc.thrift.TPushConsumerGroupMetaResp; import org.apache.iotdb.mpp.rpc.thrift.TPushMultiPipeMetaReq; @@ -79,12 +81,15 @@ import org.apache.iotdb.mpp.rpc.thrift.TPushSingleConsumerGroupMetaReq; import org.apache.iotdb.mpp.rpc.thrift.TPushSinglePipeMetaReq; import org.apache.iotdb.mpp.rpc.thrift.TPushSingleTopicMetaReq; +import org.apache.iotdb.mpp.rpc.thrift.TPushSubscriptionRuntimeReq; import org.apache.iotdb.mpp.rpc.thrift.TPushTopicMetaReq; import org.apache.iotdb.mpp.rpc.thrift.TPushTopicMetaResp; +import org.apache.iotdb.mpp.rpc.thrift.TSubscriptionRuntimeStateEntry; import org.apache.iotdb.rpc.TSStatusCode; import org.apache.thrift.TException; import org.apache.tsfile.utils.Binary; +import org.apache.tsfile.utils.Pair; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -93,8 
+98,10 @@ import java.util.ArrayList; import java.util.Arrays; import java.util.HashMap; +import java.util.LinkedHashSet; import java.util.List; import java.util.Map; +import java.util.Set; import java.util.concurrent.TimeUnit; import java.util.concurrent.locks.ReentrantLock; import java.util.stream.Collectors; @@ -848,6 +855,85 @@ public List dropSingleConsumerGroupOnDataNode(String consumerGroupName .collect(Collectors.toList()); } + public Map pullCommitProgressFromDataNodes() { + final Map dataNodeLocationMap = + configManager.getNodeManager().getRegisteredDataNodeLocations(); + final TPullCommitProgressReq request = new TPullCommitProgressReq(); + + final DataNodeAsyncRequestContext + clientHandler = + new DataNodeAsyncRequestContext<>( + CnToDnAsyncRequestType.PULL_COMMIT_PROGRESS, request, dataNodeLocationMap); + CnToDnInternalServiceAsyncRequestManager.getInstance() + .sendAsyncRequestToNodeWithRetryAndTimeoutInMs( + clientHandler, + PipeConfig.getInstance().getPipeMetaSyncerSyncIntervalMinutes() * 60 * 1000 * 2 / 3); + return clientHandler.getResponseMap(); + } + + public Map pushSubscriptionRuntimeStatesToDataNodes( + final Map> regionGroupToOldAndNewLeaderPairMap, + final long runtimeVersion) { + final Map dataNodeLocationMap = + configManager.getNodeManager().getRegisteredDataNodeLocations(); + final Map dataRegionReplicaSetMap = + getPartitionManager().getAllReplicaSetsMap(TConsensusGroupType.DataRegion); + final Set readableDataNodeIds = + getLoadManager().filterDataNodeThroughStatus(NodeStatus::isReadable).stream() + .collect(Collectors.toSet()); + final DataNodeAsyncRequestContext clientHandler = + new DataNodeAsyncRequestContext<>(CnToDnAsyncRequestType.SUBSCRIPTION_PUSH_RUNTIME); + + dataNodeLocationMap.forEach( + (dataNodeId, dataNodeLocation) -> { + final List runtimeStates = new ArrayList<>(); + regionGroupToOldAndNewLeaderPairMap.forEach( + (regionId, leaderPair) -> { + final int oldLeaderNodeId = leaderPair.getLeft(); + final int 
preferredWriterNodeId = leaderPair.getRight(); + final LinkedHashSet activeWriterNodeIds = new LinkedHashSet<>(); + final TRegionReplicaSet replicaSet = dataRegionReplicaSetMap.get(regionId); + if (replicaSet != null) { + replicaSet.getDataNodeLocations().stream() + .map(TDataNodeLocation::getDataNodeId) + .filter(readableDataNodeIds::contains) + .forEach(activeWriterNodeIds::add); + } + if (activeWriterNodeIds.isEmpty()) { + if (isRuntimeActiveWriterNode(preferredWriterNodeId)) { + activeWriterNodeIds.add(preferredWriterNodeId); + } + if (oldLeaderNodeId != preferredWriterNodeId + && isRuntimeActiveWriterNode(oldLeaderNodeId)) { + activeWriterNodeIds.add(oldLeaderNodeId); + } + } + runtimeStates.add( + new TSubscriptionRuntimeStateEntry( + regionId, + runtimeVersion, + preferredWriterNodeId, + preferredWriterNodeId == dataNodeId, + new ArrayList<>(activeWriterNodeIds))); + }); + clientHandler.putNodeLocation(dataNodeId, dataNodeLocation); + clientHandler.putRequest( + dataNodeId, new TPushSubscriptionRuntimeReq().setRuntimeStates(runtimeStates)); + }); + + CnToDnInternalServiceAsyncRequestManager.getInstance() + .sendAsyncRequestToNodeWithRetryAndTimeoutInMs( + clientHandler, + PipeConfig.getInstance().getPipeMetaSyncerSyncIntervalMinutes() * 60 * 1000 * 2 / 3); + return clientHandler.getResponseMap(); + } + + private boolean isRuntimeActiveWriterNode(final int dataNodeId) { + return dataNodeId >= 0 + && getLoadManager().getNodeStatus(dataNodeId) != NodeStatus.Unknown + && getLoadManager().getNodeStatus(dataNodeId) != NodeStatus.Removing; + } + public LockQueue getNodeLock() { return nodeLock; } diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/subscription/AbstractOperateSubscriptionProcedure.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/subscription/AbstractOperateSubscriptionProcedure.java index 07bbe2c014c42..927c306ae5587 100644 --- 
a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/subscription/AbstractOperateSubscriptionProcedure.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/subscription/AbstractOperateSubscriptionProcedure.java @@ -224,6 +224,7 @@ protected Flow executeFromState(ConfigNodeProcedureEnv env, OperateSubscriptionS getCycles() + 1, RETRY_THRESHOLD, e); + setNextState(getCurrentState()); // Wait 3s for next retry TimeUnit.MILLISECONDS.sleep(3000L); } else { @@ -239,6 +240,7 @@ protected Flow executeFromState(ConfigNodeProcedureEnv env, OperateSubscriptionS String.format( "ProcedureId %s: Fail to %s because %s", getProcId(), getOperation().name(), e.getMessage()))); + return Flow.NO_MORE_STATE; } } diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/subscription/SubscriptionOperation.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/subscription/SubscriptionOperation.java index 4428a7ee4d305..84b94ead22cbf 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/subscription/SubscriptionOperation.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/subscription/SubscriptionOperation.java @@ -28,8 +28,10 @@ public enum SubscriptionOperation { ALTER_CONSUMER_GROUP("alter consumer group"), CREATE_SUBSCRIPTION("create subscription"), DROP_SUBSCRIPTION("drop subscription"), + HANDLE_LEADER_CHANGE("handle leader change"), SYNC_CONSUMER_GROUP_META("sync consumer group meta"), SYNC_TOPIC_META("sync topic meta"), + SYNC_COMMIT_PROGRESS("sync commit progress"), ; private final String name; diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/subscription/consumer/runtime/CommitProgressSyncProcedure.java 
b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/subscription/consumer/runtime/CommitProgressSyncProcedure.java new file mode 100644 index 0000000000000..e9b3056e66211 --- /dev/null +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/subscription/consumer/runtime/CommitProgressSyncProcedure.java @@ -0,0 +1,316 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.iotdb.confignode.procedure.impl.subscription.consumer.runtime; + +import org.apache.iotdb.common.rpc.thrift.TSStatus; +import org.apache.iotdb.commons.pipe.config.PipeConfig; +import org.apache.iotdb.confignode.consensus.request.write.subscription.consumer.runtime.CommitProgressHandleMetaChangePlan; +import org.apache.iotdb.confignode.persistence.subscription.SubscriptionInfo; +import org.apache.iotdb.confignode.procedure.env.ConfigNodeProcedureEnv; +import org.apache.iotdb.confignode.procedure.impl.subscription.AbstractOperateSubscriptionProcedure; +import org.apache.iotdb.confignode.procedure.impl.subscription.SubscriptionOperation; +import org.apache.iotdb.confignode.procedure.state.ProcedureLockState; +import org.apache.iotdb.confignode.procedure.store.ProcedureType; +import org.apache.iotdb.consensus.exception.ConsensusException; +import org.apache.iotdb.mpp.rpc.thrift.TPullCommitProgressResp; +import org.apache.iotdb.rpc.TSStatusCode; +import org.apache.iotdb.rpc.subscription.exception.SubscriptionException; +import org.apache.iotdb.rpc.subscription.payload.poll.RegionProgress; +import org.apache.iotdb.rpc.subscription.payload.poll.WriterId; +import org.apache.iotdb.rpc.subscription.payload.poll.WriterProgress; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.ByteArrayOutputStream; +import java.io.DataOutputStream; +import java.io.IOException; +import java.nio.ByteBuffer; +import java.util.HashMap; +import java.util.LinkedHashMap; +import java.util.Map; +import java.util.Objects; +import java.util.concurrent.atomic.AtomicLong; +import java.util.concurrent.atomic.AtomicReference; + +/** + * Periodically pulls commit progress from all DataNodes and persists the merged result to + * ConfigNode consensus. 
+ */ +public class CommitProgressSyncProcedure extends AbstractOperateSubscriptionProcedure { + + private static final Logger LOGGER = LoggerFactory.getLogger(CommitProgressSyncProcedure.class); + + private static final long MIN_EXECUTION_INTERVAL_MS = + PipeConfig.getInstance().getPipeMetaSyncerSyncIntervalMinutes() * 60 * 1000 / 2; + private static final AtomicLong LAST_EXECUTION_TIME = new AtomicLong(0); + + public CommitProgressSyncProcedure() { + super(); + } + + @Override + protected AtomicReference acquireLockInternal( + ConfigNodeProcedureEnv configNodeProcedureEnv) { + return configNodeProcedureEnv + .getConfigManager() + .getSubscriptionManager() + .getSubscriptionCoordinator() + .tryLock(); + } + + @Override + protected ProcedureLockState acquireLock(ConfigNodeProcedureEnv configNodeProcedureEnv) { + if (System.currentTimeMillis() - LAST_EXECUTION_TIME.get() < MIN_EXECUTION_INTERVAL_MS) { + subscriptionInfo = null; + LOGGER.info( + "CommitProgressSyncProcedure: acquireLock, skip the procedure due to the last execution time {}", + LAST_EXECUTION_TIME.get()); + return ProcedureLockState.LOCK_ACQUIRED; + } + return super.acquireLock(configNodeProcedureEnv); + } + + @Override + protected SubscriptionOperation getOperation() { + return SubscriptionOperation.SYNC_COMMIT_PROGRESS; + } + + @Override + public boolean executeFromValidate(ConfigNodeProcedureEnv env) { + LOGGER.info("CommitProgressSyncProcedure: executeFromValidate"); + LAST_EXECUTION_TIME.set(System.currentTimeMillis()); + return true; + } + + @Override + public void executeFromOperateOnConfigNodes(ConfigNodeProcedureEnv env) + throws SubscriptionException { + LOGGER.info("CommitProgressSyncProcedure: executeFromOperateOnConfigNodes"); + + // 1. Pull commit progress from all DataNodes + final Map respMap = env.pullCommitProgressFromDataNodes(); + + // 2. 
Merge all DataNode responses with existing progress using Math::max + final Map mergedRegionProgress = + deserializeRegionProgressMap( + subscriptionInfo.get().getCommitProgressKeeper().getAllRegionProgress()); + + for (Map.Entry entry : respMap.entrySet()) { + final TPullCommitProgressResp resp = entry.getValue(); + if (resp.getStatus().getCode() != TSStatusCode.SUCCESS_STATUS.getStatusCode()) { + LOGGER.warn( + "Failed to pull commit progress from DataNode {}, status: {}", + entry.getKey(), + resp.getStatus()); + continue; + } + if (resp.isSetCommitRegionProgress()) { + for (final Map.Entry progressEntry : + resp.getCommitRegionProgress().entrySet()) { + final RegionProgress incomingProgress = + deserializeRegionProgress(progressEntry.getKey(), progressEntry.getValue()); + if (Objects.nonNull(incomingProgress)) { + mergedRegionProgress.merge( + progressEntry.getKey(), + incomingProgress, + CommitProgressSyncProcedure::mergeRegionProgress); + } + } + } + } + + // 3. Write the merged progress to consensus + TSStatus response; + try { + response = + env.getConfigManager() + .getConsensusManager() + .write( + new CommitProgressHandleMetaChangePlan( + serializeRegionProgressMap(mergedRegionProgress))); + } catch (ConsensusException e) { + LOGGER.warn("Failed in the write API executing the consensus layer due to: ", e); + response = new TSStatus(TSStatusCode.EXECUTE_STATEMENT_ERROR.getStatusCode()); + response.setMessage(e.getMessage()); + } + if (response.getCode() != TSStatusCode.SUCCESS_STATUS.getStatusCode()) { + throw new SubscriptionException(response.getMessage()); + } + } + + @Override + public void executeFromOperateOnDataNodes(ConfigNodeProcedureEnv env) { + LOGGER.info("CommitProgressSyncProcedure: executeFromOperateOnDataNodes (no-op)"); + // No need to push back to DataNodes + } + + @Override + public void rollbackFromValidate(ConfigNodeProcedureEnv env) { + LOGGER.info("CommitProgressSyncProcedure: rollbackFromValidate"); + } + + @Override + public void 
rollbackFromOperateOnConfigNodes(ConfigNodeProcedureEnv env) { + LOGGER.info("CommitProgressSyncProcedure: rollbackFromOperateOnConfigNodes"); + } + + @Override + public void rollbackFromOperateOnDataNodes(ConfigNodeProcedureEnv env) { + LOGGER.info("CommitProgressSyncProcedure: rollbackFromOperateOnDataNodes"); + } + + @Override + public void serialize(DataOutputStream stream) throws IOException { + stream.writeShort(ProcedureType.COMMIT_PROGRESS_SYNC_PROCEDURE.getTypeCode()); + super.serialize(stream); + } + + @Override + public boolean equals(Object o) { + return o instanceof CommitProgressSyncProcedure; + } + + @Override + public int hashCode() { + return 0; + } + + private static Map deserializeRegionProgressMap( + final Map serializedRegionProgressMap) { + final Map result = new HashMap<>(); + for (final Map.Entry entry : serializedRegionProgressMap.entrySet()) { + final RegionProgress regionProgress = + deserializeRegionProgress(entry.getKey(), entry.getValue()); + if (Objects.nonNull(regionProgress)) { + result.put(entry.getKey(), regionProgress); + } + } + return result; + } + + private static Map serializeRegionProgressMap( + final Map regionProgressMap) { + final Map result = new HashMap<>(); + for (final Map.Entry entry : regionProgressMap.entrySet()) { + final ByteBuffer serialized = serializeRegionProgress(entry.getValue()); + if (Objects.nonNull(serialized)) { + result.put(entry.getKey(), serialized); + } + } + return result; + } + + private static RegionProgress deserializeRegionProgress( + final String key, final ByteBuffer buffer) { + if (Objects.isNull(buffer)) { + return null; + } + final ByteBuffer duplicate = buffer.slice(); + try { + return RegionProgress.deserialize(duplicate); + } catch (final RuntimeException e) { + LOGGER.warn( + "CommitProgressSyncProcedure: failed to deserialize region progress, key={}, summary={}", + key, + summarizeRegionProgressPayload(buffer), + e); + throw e; + } + } + + private static String 
summarizeRegionProgressPayload(final ByteBuffer buffer) { + if (Objects.isNull(buffer)) { + return "null"; + } + final int position = buffer.position(); + final int limit = buffer.limit(); + final int capacity = buffer.capacity(); + final ByteBuffer duplicate = buffer.slice(); + final int remaining = duplicate.remaining(); + final String firstIntSummary; + if (remaining >= Integer.BYTES) { + final int firstInt = duplicate.getInt(); + firstIntSummary = firstInt + "(0x" + String.format("%08x", firstInt) + ")"; + duplicate.position(0); + } else { + firstIntSummary = "n/a"; + } + final int sampleLength = Math.min(16, remaining); + final byte[] sample = new byte[sampleLength]; + duplicate.get(sample, 0, sampleLength); + return "pos=" + + position + + ", limit=" + + limit + + ", capacity=" + + capacity + + ", remaining=" + + remaining + + ", firstInt=" + + firstIntSummary + + ", firstBytes=" + + bytesToHex(sample); + } + + private static String bytesToHex(final byte[] bytes) { + if (Objects.isNull(bytes) || bytes.length == 0) { + return ""; + } + final StringBuilder builder = new StringBuilder(bytes.length * 2); + for (final byte b : bytes) { + builder.append(String.format("%02x", b)); + } + return builder.toString(); + } + + private static ByteBuffer serializeRegionProgress(final RegionProgress regionProgress) { + try (final ByteArrayOutputStream baos = new ByteArrayOutputStream(); + final DataOutputStream dos = new DataOutputStream(baos)) { + regionProgress.serialize(dos); + dos.flush(); + return ByteBuffer.wrap(baos.toByteArray()).asReadOnlyBuffer(); + } catch (final IOException e) { + throw new RuntimeException("Failed to serialize region progress " + regionProgress, e); + } + } + + private static RegionProgress mergeRegionProgress( + final RegionProgress left, final RegionProgress right) { + final Map merged = new LinkedHashMap<>(left.getWriterPositions()); + for (final Map.Entry entry : right.getWriterPositions().entrySet()) { + merged.merge( + entry.getKey(), + 
entry.getValue(), + (oldProgress, newProgress) -> + compareWriterProgress(newProgress, oldProgress) > 0 ? newProgress : oldProgress); + } + return new RegionProgress(merged); + } + + private static int compareWriterProgress( + final WriterProgress leftProgress, final WriterProgress rightProgress) { + int cmp = Long.compare(leftProgress.getPhysicalTime(), rightProgress.getPhysicalTime()); + if (cmp != 0) { + return cmp; + } + return Long.compare(leftProgress.getLocalSeq(), rightProgress.getLocalSeq()); + } +} diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/subscription/runtime/SubscriptionHandleLeaderChangeProcedure.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/subscription/runtime/SubscriptionHandleLeaderChangeProcedure.java new file mode 100644 index 0000000000000..b935663dc952d --- /dev/null +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/subscription/runtime/SubscriptionHandleLeaderChangeProcedure.java @@ -0,0 +1,454 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.iotdb.confignode.procedure.impl.subscription.runtime; + +import org.apache.iotdb.common.rpc.thrift.TConsensusGroupId; +import org.apache.iotdb.common.rpc.thrift.TConsensusGroupType; +import org.apache.iotdb.common.rpc.thrift.TSStatus; +import org.apache.iotdb.commons.cluster.NodeStatus; +import org.apache.iotdb.commons.subscription.meta.topic.TopicMeta; +import org.apache.iotdb.confignode.consensus.request.write.subscription.consumer.runtime.CommitProgressHandleMetaChangePlan; +import org.apache.iotdb.confignode.procedure.env.ConfigNodeProcedureEnv; +import org.apache.iotdb.confignode.procedure.impl.subscription.AbstractOperateSubscriptionProcedure; +import org.apache.iotdb.confignode.procedure.impl.subscription.SubscriptionOperation; +import org.apache.iotdb.confignode.procedure.store.ProcedureType; +import org.apache.iotdb.consensus.exception.ConsensusException; +import org.apache.iotdb.mpp.rpc.thrift.TPullCommitProgressResp; +import org.apache.iotdb.mpp.rpc.thrift.TPushConsumerGroupMetaResp; +import org.apache.iotdb.mpp.rpc.thrift.TPushTopicMetaResp; +import org.apache.iotdb.rpc.TSStatusCode; +import org.apache.iotdb.rpc.subscription.config.TopicConstant; +import org.apache.iotdb.rpc.subscription.exception.SubscriptionException; +import org.apache.iotdb.rpc.subscription.payload.poll.RegionProgress; +import org.apache.iotdb.rpc.subscription.payload.poll.WriterId; +import org.apache.iotdb.rpc.subscription.payload.poll.WriterProgress; + +import org.apache.tsfile.utils.Pair; +import org.apache.tsfile.utils.ReadWriteIOUtils; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.ByteArrayOutputStream; +import java.io.DataOutputStream; +import java.io.IOException; +import java.nio.ByteBuffer; +import java.util.HashMap; +import java.util.LinkedHashMap; +import java.util.Map; +import java.util.Objects; +import java.util.Set; +import java.util.stream.Collectors; + +/** + * Handles subscription runtime leader changes. 
The first version focuses on pulling the latest + * commit progress during leader migration so the new runtime owner starts from a fresher frontier. + */ +public class SubscriptionHandleLeaderChangeProcedure extends AbstractOperateSubscriptionProcedure { + + private static final Logger LOGGER = + LoggerFactory.getLogger(SubscriptionHandleLeaderChangeProcedure.class); + + private Map> regionGroupToOldAndNewLeaderPairMap = + new HashMap<>(); + private long runtimeVersion; + + public SubscriptionHandleLeaderChangeProcedure() { + super(); + } + + public SubscriptionHandleLeaderChangeProcedure( + final Map> regionGroupToOldAndNewLeaderPairMap, + final long runtimeVersion) { + super(); + this.regionGroupToOldAndNewLeaderPairMap = regionGroupToOldAndNewLeaderPairMap; + this.runtimeVersion = runtimeVersion; + } + + @Override + protected SubscriptionOperation getOperation() { + return SubscriptionOperation.HANDLE_LEADER_CHANGE; + } + + @Override + public boolean executeFromValidate(final ConfigNodeProcedureEnv env) { + LOGGER.info("SubscriptionHandleLeaderChangeProcedure: executeFromValidate"); + if (regionGroupToOldAndNewLeaderPairMap.isEmpty()) { + return false; + } + for (final TopicMeta topicMeta : subscriptionInfo.get().getAllTopicMeta()) { + final String topicMode = + topicMeta + .getConfig() + .getStringOrDefault(TopicConstant.MODE_KEY, TopicConstant.MODE_DEFAULT_VALUE); + final String topicFormat = + topicMeta + .getConfig() + .getStringOrDefault(TopicConstant.FORMAT_KEY, TopicConstant.FORMAT_DEFAULT_VALUE); + if (TopicConstant.MODE_LIVE_VALUE.equalsIgnoreCase(topicMode) + && !TopicConstant.FORMAT_TS_FILE_HANDLER_VALUE.equalsIgnoreCase(topicFormat)) { + return true; + } + } + return false; + } + + @Override + public void executeFromOperateOnConfigNodes(final ConfigNodeProcedureEnv env) + throws SubscriptionException { + LOGGER.info("SubscriptionHandleLeaderChangeProcedure: executeFromOperateOnConfigNodes"); + + final Map respMap = 
env.pullCommitProgressFromDataNodes(); + final Map mergedRegionProgress = + deserializeRegionProgressMap( + subscriptionInfo.get().getCommitProgressKeeper().getAllRegionProgress()); + + for (final Map.Entry entry : respMap.entrySet()) { + final TPullCommitProgressResp resp = entry.getValue(); + if (resp.getStatus().getCode() != TSStatusCode.SUCCESS_STATUS.getStatusCode()) { + LOGGER.warn( + "SubscriptionHandleLeaderChangeProcedure: failed to pull commit progress from DataNode {}, status: {}", + entry.getKey(), + resp.getStatus()); + continue; + } + if (resp.isSetCommitRegionProgress()) { + for (final Map.Entry progressEntry : + resp.getCommitRegionProgress().entrySet()) { + final RegionProgress incomingProgress = + deserializeRegionProgress(progressEntry.getKey(), progressEntry.getValue()); + if (Objects.nonNull(incomingProgress)) { + mergedRegionProgress.merge( + progressEntry.getKey(), + incomingProgress, + SubscriptionHandleLeaderChangeProcedure::mergeRegionProgress); + } + } + } + } + + final TSStatus response; + try { + response = + env.getConfigManager() + .getConsensusManager() + .write( + new CommitProgressHandleMetaChangePlan( + serializeRegionProgressMap(mergedRegionProgress))); + } catch (final ConsensusException e) { + LOGGER.warn( + "SubscriptionHandleLeaderChangeProcedure: failed in the write API executing the consensus layer due to: ", + e); + throw new SubscriptionException(e.getMessage()); + } + + if (response.getCode() != TSStatusCode.SUCCESS_STATUS.getStatusCode()) { + throw new SubscriptionException(response.getMessage()); + } + } + + @Override + public void executeFromOperateOnDataNodes(final ConfigNodeProcedureEnv env) + throws SubscriptionException, IOException { + LOGGER.info("SubscriptionHandleLeaderChangeProcedure: executeFromOperateOnDataNodes"); + + final Map topicRespMap = pushTopicMetaToDataNodes(env); + topicRespMap.forEach( + (dataNodeId, resp) -> { + if (resp.getStatus().getCode() != TSStatusCode.SUCCESS_STATUS.getStatusCode()) { + 
LOGGER.warn( + "SubscriptionHandleLeaderChangeProcedure: ignored failed topic meta push to DataNode {}, status: {}", + dataNodeId, + resp.getStatus()); + } + }); + + final Map consumerGroupRespMap = + pushConsumerGroupMetaToDataNodes(env); + consumerGroupRespMap.forEach( + (dataNodeId, resp) -> { + if (resp.getStatus().getCode() != TSStatusCode.SUCCESS_STATUS.getStatusCode()) { + LOGGER.warn( + "SubscriptionHandleLeaderChangeProcedure: ignored failed consumer group meta push to DataNode {}, status: {}", + dataNodeId, + resp.getStatus()); + } + }); + + final Map> runtimeLeaderPairMap = + regionGroupToOldAndNewLeaderPairMap.entrySet().stream() + .filter(entry -> entry.getValue().getRight() >= 0) + .collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue)); + if (!runtimeLeaderPairMap.isEmpty()) { + final Set readableDataNodeIds = getReadableDataNodeIds(env); + final Map runtimeRespMap = + env.pushSubscriptionRuntimeStatesToDataNodes(runtimeLeaderPairMap, runtimeVersion); + final String runtimePushError = + collectRequiredRuntimePushFailures(readableDataNodeIds, runtimeRespMap); + if (!runtimePushError.isEmpty()) { + throw new SubscriptionException( + String.format( + "Failed to push subscription runtime state to readable DataNodes during leader change, details: %s", + runtimePushError)); + } + runtimeRespMap.forEach( + (dataNodeId, status) -> { + if (!readableDataNodeIds.contains(dataNodeId) + && status.getCode() != TSStatusCode.SUCCESS_STATUS.getStatusCode()) { + LOGGER.warn( + "SubscriptionHandleLeaderChangeProcedure: ignored failed subscription runtime push to unreadable DataNode {}, status: {}", + dataNodeId, + status); + } + }); + } + } + + @Override + public void rollbackFromValidate(final ConfigNodeProcedureEnv env) { + LOGGER.info("SubscriptionHandleLeaderChangeProcedure: rollbackFromValidate"); + } + + @Override + public void rollbackFromOperateOnConfigNodes(final ConfigNodeProcedureEnv env) { + LOGGER.info("SubscriptionHandleLeaderChangeProcedure: 
rollbackFromOperateOnConfigNodes"); + } + + @Override + public void rollbackFromOperateOnDataNodes(final ConfigNodeProcedureEnv env) { + LOGGER.info("SubscriptionHandleLeaderChangeProcedure: rollbackFromOperateOnDataNodes"); + } + + @Override + public void serialize(final DataOutputStream stream) throws IOException { + stream.writeShort(ProcedureType.SUBSCRIPTION_HANDLE_LEADER_CHANGE_PROCEDURE.getTypeCode()); + super.serialize(stream); + ReadWriteIOUtils.write(runtimeVersion, stream); + ReadWriteIOUtils.write(regionGroupToOldAndNewLeaderPairMap.size(), stream); + for (final Map.Entry> entry : + regionGroupToOldAndNewLeaderPairMap.entrySet()) { + ReadWriteIOUtils.write(entry.getKey().getId(), stream); + ReadWriteIOUtils.write(entry.getValue().getLeft(), stream); + ReadWriteIOUtils.write(entry.getValue().getRight(), stream); + } + } + + @Override + public void deserialize(final ByteBuffer byteBuffer) { + super.deserialize(byteBuffer); + runtimeVersion = ReadWriteIOUtils.readLong(byteBuffer); + final int size = ReadWriteIOUtils.readInt(byteBuffer); + for (int i = 0; i < size; ++i) { + final int dataRegionGroupId = ReadWriteIOUtils.readInt(byteBuffer); + final int oldLeaderId = ReadWriteIOUtils.readInt(byteBuffer); + final int newLeaderId = ReadWriteIOUtils.readInt(byteBuffer); + regionGroupToOldAndNewLeaderPairMap.put( + new TConsensusGroupId(TConsensusGroupType.DataRegion, dataRegionGroupId), + new Pair<>(oldLeaderId, newLeaderId)); + } + } + + @Override + public boolean equals(final Object o) { + if (this == o) { + return true; + } + if (o == null || getClass() != o.getClass()) { + return false; + } + final SubscriptionHandleLeaderChangeProcedure that = + (SubscriptionHandleLeaderChangeProcedure) o; + return getProcId() == that.getProcId() + && getCurrentState().equals(that.getCurrentState()) + && getCycles() == that.getCycles() + && runtimeVersion == that.runtimeVersion + && regionGroupToOldAndNewLeaderPairMap.equals(that.regionGroupToOldAndNewLeaderPairMap); + } + 
+ @Override + public int hashCode() { + return Objects.hash( + getProcId(), + getCurrentState(), + getCycles(), + runtimeVersion, + regionGroupToOldAndNewLeaderPairMap); + } + + private static Map deserializeRegionProgressMap( + final Map serializedRegionProgressMap) { + final Map result = new HashMap<>(); + for (final Map.Entry entry : serializedRegionProgressMap.entrySet()) { + final RegionProgress regionProgress = + deserializeRegionProgress(entry.getKey(), entry.getValue()); + if (Objects.nonNull(regionProgress)) { + result.put(entry.getKey(), regionProgress); + } + } + return result; + } + + private static Map serializeRegionProgressMap( + final Map regionProgressMap) { + final Map result = new HashMap<>(); + for (final Map.Entry entry : regionProgressMap.entrySet()) { + final ByteBuffer serialized = serializeRegionProgress(entry.getValue()); + if (Objects.nonNull(serialized)) { + result.put(entry.getKey(), serialized); + } + } + return result; + } + + private static RegionProgress deserializeRegionProgress( + final String key, final ByteBuffer buffer) { + if (Objects.isNull(buffer)) { + return null; + } + final ByteBuffer duplicate = buffer.slice(); + try { + return RegionProgress.deserialize(duplicate); + } catch (final RuntimeException e) { + LOGGER.warn( + "SubscriptionHandleLeaderChangeProcedure: failed to deserialize region progress, key={}, summary={}", + key, + summarizeRegionProgressPayload(buffer), + e); + throw e; + } + } + + private static String summarizeRegionProgressPayload(final ByteBuffer buffer) { + if (Objects.isNull(buffer)) { + return "null"; + } + final int position = buffer.position(); + final int limit = buffer.limit(); + final int capacity = buffer.capacity(); + final ByteBuffer duplicate = buffer.slice(); + final int remaining = duplicate.remaining(); + final String firstIntSummary; + if (remaining >= Integer.BYTES) { + final int firstInt = duplicate.getInt(); + firstIntSummary = firstInt + "(0x" + String.format("%08x", firstInt) + 
")"; + duplicate.position(0); + } else { + firstIntSummary = "n/a"; + } + final int sampleLength = Math.min(16, remaining); + final byte[] sample = new byte[sampleLength]; + duplicate.get(sample, 0, sampleLength); + return "pos=" + + position + + ", limit=" + + limit + + ", capacity=" + + capacity + + ", remaining=" + + remaining + + ", firstInt=" + + firstIntSummary + + ", firstBytes=" + + bytesToHex(sample); + } + + private static String bytesToHex(final byte[] bytes) { + if (Objects.isNull(bytes) || bytes.length == 0) { + return ""; + } + final StringBuilder builder = new StringBuilder(bytes.length * 2); + for (final byte b : bytes) { + builder.append(String.format("%02x", b)); + } + return builder.toString(); + } + + private static ByteBuffer serializeRegionProgress(final RegionProgress regionProgress) { + try (final ByteArrayOutputStream baos = new ByteArrayOutputStream(); + final DataOutputStream dos = new DataOutputStream(baos)) { + regionProgress.serialize(dos); + dos.flush(); + return ByteBuffer.wrap(baos.toByteArray()).asReadOnlyBuffer(); + } catch (final IOException e) { + throw new RuntimeException("Failed to serialize region progress " + regionProgress, e); + } + } + + private static RegionProgress mergeRegionProgress( + final RegionProgress left, final RegionProgress right) { + final Map merged = new LinkedHashMap<>(left.getWriterPositions()); + for (final Map.Entry entry : right.getWriterPositions().entrySet()) { + merged.merge( + entry.getKey(), + entry.getValue(), + (oldProgress, newProgress) -> + compareWriterProgress(newProgress, oldProgress) > 0 ? 
newProgress : oldProgress); + } + return new RegionProgress(merged); + } + + private static int compareWriterProgress( + final WriterProgress leftProgress, final WriterProgress rightProgress) { + int cmp = Long.compare(leftProgress.getPhysicalTime(), rightProgress.getPhysicalTime()); + if (cmp != 0) { + return cmp; + } + return Long.compare(leftProgress.getLocalSeq(), rightProgress.getLocalSeq()); + } + + private Set getReadableDataNodeIds(final ConfigNodeProcedureEnv env) + throws SubscriptionException { + final Set readableDataNodeIds = + env + .getConfigManager() + .getLoadManager() + .filterDataNodeThroughStatus(NodeStatus::isReadable) + .stream() + .collect(Collectors.toSet()); + if (readableDataNodeIds.isEmpty()) { + throw new SubscriptionException( + "No readable DataNode is available to accept subscription metadata/runtime updates during leader change"); + } + return readableDataNodeIds; + } + + private String collectRequiredRuntimePushFailures( + final Set readableDataNodeIds, final Map respMap) { + final StringBuilder errorMessageBuilder = new StringBuilder(); + for (final Integer dataNodeId : readableDataNodeIds) { + final TSStatus status = respMap.get(dataNodeId); + if (Objects.isNull(status)) { + errorMessageBuilder + .append("DataNode ") + .append(dataNodeId) + .append(": missing subscription runtime push response; "); + continue; + } + if (status.getCode() != TSStatusCode.SUCCESS_STATUS.getStatusCode()) { + errorMessageBuilder + .append("DataNode ") + .append(dataNodeId) + .append(": ") + .append(status) + .append("; "); + } + } + return errorMessageBuilder.toString(); + } +} diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/subscription/subscription/CreateSubscriptionProcedure.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/subscription/subscription/CreateSubscriptionProcedure.java index cb5edd8cd91a3..6b71d5b16f79a 100644 --- 
a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/subscription/subscription/CreateSubscriptionProcedure.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/subscription/subscription/CreateSubscriptionProcedure.java @@ -39,6 +39,7 @@ import org.apache.iotdb.confignode.rpc.thrift.TSubscribeReq; import org.apache.iotdb.consensus.exception.ConsensusException; import org.apache.iotdb.rpc.TSStatusCode; +import org.apache.iotdb.rpc.subscription.config.TopicConstant; import org.apache.iotdb.rpc.subscription.exception.SubscriptionException; import org.apache.tsfile.utils.ReadWriteIOUtils; @@ -52,6 +53,7 @@ import java.util.HashSet; import java.util.List; import java.util.Objects; +import java.util.Set; import java.util.stream.Collectors; public class CreateSubscriptionProcedure extends AbstractOperateSubscriptionAndPipeProcedure { @@ -66,6 +68,8 @@ public class CreateSubscriptionProcedure extends AbstractOperateSubscriptionAndP private AlterConsumerGroupProcedure alterConsumerGroupProcedure; private List createPipeProcedures = new ArrayList<>(); + private Set consensusTopicNames = new HashSet<>(); + // TODO: remove this variable later private final List alterTopicProcedures = new ArrayList<>(); // unused now @@ -103,15 +107,41 @@ protected boolean executeFromValidate(final ConfigNodeProcedureEnv env) alterConsumerGroupProcedure = new AlterConsumerGroupProcedure(updatedConsumerGroupMeta, subscriptionInfo); - // Construct CreatePipeProcedureV2s + // Construct CreatePipeProcedureV2s (for non-consensus topics) for (final String topicName : subscribeReq.getTopicNames()) { + final TopicMeta topicMeta = subscriptionInfo.get().deepCopyTopicMeta(topicName); + + // Check if this topic should use consensus subscription: mode is live, format is Tablet + final String topicMode = + topicMeta + .getConfig() + .getStringOrDefault(TopicConstant.MODE_KEY, TopicConstant.MODE_DEFAULT_VALUE); + final String topicFormat = + 
topicMeta + .getConfig() + .getStringOrDefault(TopicConstant.FORMAT_KEY, TopicConstant.FORMAT_DEFAULT_VALUE); + final boolean isConsensusBasedTopic = + TopicConstant.MODE_LIVE_VALUE.equalsIgnoreCase(topicMode) + && !TopicConstant.FORMAT_TS_FILE_HANDLER_VALUE.equalsIgnoreCase(topicFormat); + + if (isConsensusBasedTopic) { + // skip pipe creation + consensusTopicNames.add(topicName); + LOGGER.info( + "CreateSubscriptionProcedure: topic [{}] uses consensus-based subscription " + + "(mode={}, format={}), skipping pipe creation", + topicName, + topicMode, + topicFormat); + continue; + } + final String pipeName = PipeStaticMeta.generateSubscriptionPipeName(topicName, consumerGroupId); if (!subscriptionInfo.get().isTopicSubscribedByConsumerGroup(topicName, consumerGroupId) // even if there existed subscription meta, if there is no corresponding pipe meta, it // will try to create the pipe || !pipeTaskInfo.get().isPipeExisted(pipeName)) { - final TopicMeta topicMeta = subscriptionInfo.get().deepCopyTopicMeta(topicName); createPipeProcedures.add( new CreatePipeProcedureV2( new TCreatePipeReq() @@ -177,20 +207,29 @@ protected void executeFromOperateOnDataNodes(final ConfigNodeProcedureEnv env) // Push consumer group meta to data nodes alterConsumerGroupProcedure.executeFromOperateOnDataNodes(env); - // Push pipe meta to data nodes - final List pipeNames = - createPipeProcedures.stream() - .map(CreatePipeProcedureV2::getPipeName) - .collect(Collectors.toList()); - final String exceptionMessage = - AbstractOperatePipeProcedureV2.parsePushPipeMetaExceptionForPipe( - null, pushMultiPipeMetaToDataNodes(pipeNames, env)); - if (!exceptionMessage.isEmpty()) { - // throw exception instead of logging warn, do not rely on metadata synchronization - throw new SubscriptionException( - String.format( - "Failed to create pipes %s when creating subscription with request %s, details: %s, metadata will be synchronized later.", - pipeNames, subscribeReq, exceptionMessage)); + if 
(!consensusTopicNames.isEmpty()) { + LOGGER.info( + "CreateSubscriptionProcedure: consensus-based topics {} will be handled by DataNode " + + "via consumer group meta push (no pipe creation needed)", + consensusTopicNames); + } + + // Push pipe meta to data nodes (only for non-consensus pipe-based topics) + if (!createPipeProcedures.isEmpty()) { + final List pipeNames = + createPipeProcedures.stream() + .map(CreatePipeProcedureV2::getPipeName) + .collect(Collectors.toList()); + final String exceptionMessage = + AbstractOperatePipeProcedureV2.parsePushPipeMetaExceptionForPipe( + null, pushMultiPipeMetaToDataNodes(pipeNames, env)); + if (!exceptionMessage.isEmpty()) { + // throw exception instead of logging warn, do not rely on metadata synchronization + throw new SubscriptionException( + String.format( + "Failed to create pipes %s when creating subscription with request %s, details: %s, metadata will be synchronized later.", + pipeNames, subscribeReq, exceptionMessage)); + } } } @@ -297,6 +336,12 @@ public void serialize(final DataOutputStream stream) throws IOException { } else { ReadWriteIOUtils.write(false, stream); } + + // Serialize consensus topic names + ReadWriteIOUtils.write(consensusTopicNames.size(), stream); + for (final String consensusTopicName : consensusTopicNames) { + ReadWriteIOUtils.write(consensusTopicName, stream); + } } @Override @@ -348,6 +393,14 @@ public void deserialize(final ByteBuffer byteBuffer) { } } } + + // Deserialize consensus topic names + if (byteBuffer.hasRemaining()) { + size = ReadWriteIOUtils.readInt(byteBuffer); + for (int i = 0; i < size; ++i) { + consensusTopicNames.add(ReadWriteIOUtils.readString(byteBuffer)); + } + } } @Override @@ -364,7 +417,8 @@ public boolean equals(final Object o) { && getCycles() == that.getCycles() && Objects.equals(subscribeReq, that.subscribeReq) && Objects.equals(alterConsumerGroupProcedure, that.alterConsumerGroupProcedure) - && Objects.equals(createPipeProcedures, that.createPipeProcedures); + 
&& Objects.equals(createPipeProcedures, that.createPipeProcedures) + && Objects.equals(consensusTopicNames, that.consensusTopicNames); } @Override @@ -375,7 +429,8 @@ public int hashCode() { getCycles(), subscribeReq, alterConsumerGroupProcedure, - createPipeProcedures); + createPipeProcedures, + consensusTopicNames); } @TestOnly diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/subscription/subscription/DropSubscriptionProcedure.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/subscription/subscription/DropSubscriptionProcedure.java index 6741a6c1e2a84..99f8ed649d852 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/subscription/subscription/DropSubscriptionProcedure.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/subscription/subscription/DropSubscriptionProcedure.java @@ -22,6 +22,7 @@ import org.apache.iotdb.common.rpc.thrift.TSStatus; import org.apache.iotdb.commons.pipe.agent.task.meta.PipeStaticMeta; import org.apache.iotdb.commons.subscription.meta.consumer.ConsumerGroupMeta; +import org.apache.iotdb.commons.subscription.meta.topic.TopicMeta; import org.apache.iotdb.commons.utils.TestOnly; import org.apache.iotdb.confignode.consensus.request.ConfigPhysicalPlan; import org.apache.iotdb.confignode.consensus.request.write.pipe.task.DropPipePlanV2; @@ -36,6 +37,7 @@ import org.apache.iotdb.confignode.rpc.thrift.TUnsubscribeReq; import org.apache.iotdb.consensus.exception.ConsensusException; import org.apache.iotdb.rpc.TSStatusCode; +import org.apache.iotdb.rpc.subscription.config.TopicConstant; import org.apache.iotdb.rpc.subscription.exception.SubscriptionException; import org.apache.tsfile.utils.ReadWriteIOUtils; @@ -100,6 +102,31 @@ protected boolean executeFromValidate(final ConfigNodeProcedureEnv env) for (final String topic : unsubscribeReq.getTopicNames()) { if 
(topicsUnsubByGroup.contains(topic)) { + // Check if this topic uses consensus-based subscription (same detection as + // CreateSubscriptionProcedure). Consensus topics have no pipe to drop. + final TopicMeta topicMeta = subscriptionInfo.get().deepCopyTopicMeta(topic); + final String topicMode = + topicMeta + .getConfig() + .getStringOrDefault(TopicConstant.MODE_KEY, TopicConstant.MODE_DEFAULT_VALUE); + final String topicFormat = + topicMeta + .getConfig() + .getStringOrDefault(TopicConstant.FORMAT_KEY, TopicConstant.FORMAT_DEFAULT_VALUE); + final boolean isConsensusBasedTopic = + TopicConstant.MODE_LIVE_VALUE.equalsIgnoreCase(topicMode) + && !TopicConstant.FORMAT_TS_FILE_HANDLER_VALUE.equalsIgnoreCase(topicFormat); + + if (isConsensusBasedTopic) { + LOGGER.info( + "DropSubscriptionProcedure: topic [{}] is consensus-based (mode={}, format={}), " + + "skipping pipe removal", + topic, + topicMode, + topicFormat); + continue; + } + // Topic will be subscribed by no consumers in this group dropPipeProcedures.add( new DropPipeProcedureV2( diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/store/ProcedureFactory.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/store/ProcedureFactory.java index 140fffa852ccc..2af973e0c4425 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/store/ProcedureFactory.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/store/ProcedureFactory.java @@ -72,7 +72,9 @@ import org.apache.iotdb.confignode.procedure.impl.subscription.consumer.AlterConsumerGroupProcedure; import org.apache.iotdb.confignode.procedure.impl.subscription.consumer.CreateConsumerProcedure; import org.apache.iotdb.confignode.procedure.impl.subscription.consumer.DropConsumerProcedure; +import org.apache.iotdb.confignode.procedure.impl.subscription.consumer.runtime.CommitProgressSyncProcedure; import 
org.apache.iotdb.confignode.procedure.impl.subscription.consumer.runtime.ConsumerGroupMetaSyncProcedure; +import org.apache.iotdb.confignode.procedure.impl.subscription.runtime.SubscriptionHandleLeaderChangeProcedure; import org.apache.iotdb.confignode.procedure.impl.subscription.subscription.CreateSubscriptionProcedure; import org.apache.iotdb.confignode.procedure.impl.subscription.subscription.DropSubscriptionProcedure; import org.apache.iotdb.confignode.procedure.impl.subscription.topic.AlterTopicProcedure; @@ -396,6 +398,12 @@ public Procedure create(ByteBuffer buffer) throws IOException { case CONSUMER_GROUP_META_SYNC_PROCEDURE: procedure = new ConsumerGroupMetaSyncProcedure(); break; + case COMMIT_PROGRESS_SYNC_PROCEDURE: + procedure = new CommitProgressSyncProcedure(); + break; + case SUBSCRIPTION_HANDLE_LEADER_CHANGE_PROCEDURE: + procedure = new SubscriptionHandleLeaderChangeProcedure(); + break; case CREATE_MANY_DATABASES_PROCEDURE: procedure = new CreateManyDatabasesProcedure(); break; @@ -544,6 +552,10 @@ public static ProcedureType getProcedureType(final Procedure procedure) { return ProcedureType.ALTER_CONSUMER_GROUP_PROCEDURE; } else if (procedure instanceof ConsumerGroupMetaSyncProcedure) { return ProcedureType.CONSUMER_GROUP_META_SYNC_PROCEDURE; + } else if (procedure instanceof CommitProgressSyncProcedure) { + return ProcedureType.COMMIT_PROGRESS_SYNC_PROCEDURE; + } else if (procedure instanceof SubscriptionHandleLeaderChangeProcedure) { + return ProcedureType.SUBSCRIPTION_HANDLE_LEADER_CHANGE_PROCEDURE; } else if (procedure instanceof DeleteLogicalViewProcedure) { return ProcedureType.DELETE_LOGICAL_VIEW_PROCEDURE; } else if (procedure instanceof AlterLogicalViewProcedure) { diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/store/ProcedureType.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/store/ProcedureType.java index 839c8ace0984d..1cd6a46a4dcd1 100644 --- 
a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/store/ProcedureType.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/store/ProcedureType.java @@ -167,6 +167,8 @@ public enum ProcedureType { ALTER_CONSUMER_GROUP_PROCEDURE((short) 1507), TOPIC_META_SYNC_PROCEDURE((short) 1508), CONSUMER_GROUP_META_SYNC_PROCEDURE((short) 1509), + COMMIT_PROGRESS_SYNC_PROCEDURE((short) 1510), + SUBSCRIPTION_HANDLE_LEADER_CHANGE_PROCEDURE((short) 1511), /** Other */ @TestOnly diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/service/thrift/ConfigNodeRPCServiceProcessor.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/service/thrift/ConfigNodeRPCServiceProcessor.java index 4d01f3770c218..d4c4b141916d6 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/service/thrift/ConfigNodeRPCServiceProcessor.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/service/thrift/ConfigNodeRPCServiceProcessor.java @@ -159,6 +159,8 @@ import org.apache.iotdb.confignode.rpc.thrift.TGetAllTemplatesResp; import org.apache.iotdb.confignode.rpc.thrift.TGetAllTopicInfoResp; import org.apache.iotdb.confignode.rpc.thrift.TGetClusterIdResp; +import org.apache.iotdb.confignode.rpc.thrift.TGetCommitProgressReq; +import org.apache.iotdb.confignode.rpc.thrift.TGetCommitProgressResp; import org.apache.iotdb.confignode.rpc.thrift.TGetDataNodeLocationsResp; import org.apache.iotdb.confignode.rpc.thrift.TGetDatabaseReq; import org.apache.iotdb.confignode.rpc.thrift.TGetJarInListReq; @@ -1313,6 +1315,11 @@ public TGetAllSubscriptionInfoResp getAllSubscriptionInfo() { return configManager.getAllSubscriptionInfo(); } + @Override + public TGetCommitProgressResp getCommitProgress(TGetCommitProgressReq req) { + return configManager.getCommitProgress(req); + } + @Override public TGetRegionIdResp getRegionId(TGetRegionIdReq req) { return configManager.getRegionId(req); 
diff --git a/iotdb-core/confignode/src/test/java/org/apache/iotdb/confignode/procedure/impl/subscription/runtime/SubscriptionProgressMergeTest.java b/iotdb-core/confignode/src/test/java/org/apache/iotdb/confignode/procedure/impl/subscription/runtime/SubscriptionProgressMergeTest.java new file mode 100644 index 0000000000000..6b3b3253321c1 --- /dev/null +++ b/iotdb-core/confignode/src/test/java/org/apache/iotdb/confignode/procedure/impl/subscription/runtime/SubscriptionProgressMergeTest.java @@ -0,0 +1,110 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.iotdb.confignode.procedure.impl.subscription.runtime; + +import org.apache.iotdb.confignode.procedure.impl.subscription.consumer.runtime.CommitProgressSyncProcedure; +import org.apache.iotdb.rpc.subscription.payload.poll.RegionProgress; +import org.apache.iotdb.rpc.subscription.payload.poll.WriterId; +import org.apache.iotdb.rpc.subscription.payload.poll.WriterProgress; + +import org.junit.Test; + +import java.lang.reflect.Method; +import java.util.LinkedHashMap; +import java.util.Map; + +import static org.junit.Assert.assertEquals; + +public class SubscriptionProgressMergeTest { + + @Test + public void testCommitProgressSyncProcedureMergesPerWriterByMax() throws Exception { + final RegionProgress left = + createRegionProgress( + "1_1", + new WriterId("1_1", 7, 1L), + new WriterProgress(100L, 10L), + new WriterId("1_1", 8, 1L), + new WriterProgress(90L, 9L)); + final RegionProgress right = + createRegionProgress( + "1_1", + new WriterId("1_1", 7, 1L), + new WriterProgress(95L, 8L), + new WriterId("1_1", 8, 1L), + new WriterProgress(110L, 11L)); + + final RegionProgress merged = + invokeMergeRegionProgress(CommitProgressSyncProcedure.class, left, right); + + assertEquals( + new WriterProgress(100L, 10L), merged.getWriterPositions().get(new WriterId("1_1", 7, 1L))); + assertEquals( + new WriterProgress(110L, 11L), merged.getWriterPositions().get(new WriterId("1_1", 8, 1L))); + } + + @Test + public void testLeaderChangeProcedureMergesPerWriterByMax() throws Exception { + final RegionProgress left = + createRegionProgress( + "1_2", + new WriterId("1_2", 9, 3L), + new WriterProgress(200L, 20L), + new WriterId("1_2", 10, 3L), + new WriterProgress(150L, 15L)); + final RegionProgress right = + createRegionProgress( + "1_2", + new WriterId("1_2", 9, 3L), + new WriterProgress(220L, 18L), + new WriterId("1_2", 10, 3L), + new WriterProgress(140L, 14L)); + + final RegionProgress merged = + 
invokeMergeRegionProgress(SubscriptionHandleLeaderChangeProcedure.class, left, right); + + assertEquals( + new WriterProgress(220L, 18L), merged.getWriterPositions().get(new WriterId("1_2", 9, 3L))); + assertEquals( + new WriterProgress(150L, 15L), + merged.getWriterPositions().get(new WriterId("1_2", 10, 3L))); + } + + private static RegionProgress invokeMergeRegionProgress( + final Class clazz, final RegionProgress left, final RegionProgress right) + throws Exception { + final Method method = + clazz.getDeclaredMethod("mergeRegionProgress", RegionProgress.class, RegionProgress.class); + method.setAccessible(true); + return (RegionProgress) method.invoke(null, left, right); + } + + private static RegionProgress createRegionProgress( + final String regionId, + final WriterId firstWriterId, + final WriterProgress firstWriterProgress, + final WriterId secondWriterId, + final WriterProgress secondWriterProgress) { + final Map writerPositions = new LinkedHashMap<>(); + writerPositions.put(firstWriterId, firstWriterProgress); + writerPositions.put(secondWriterId, secondWriterProgress); + return new RegionProgress(writerPositions); + } +} diff --git a/iotdb-core/consensus/src/main/java/org/apache/iotdb/consensus/common/request/DeserializedBatchIndexedConsensusRequest.java b/iotdb-core/consensus/src/main/java/org/apache/iotdb/consensus/common/request/DeserializedBatchIndexedConsensusRequest.java index 9cdeaf60c3029..332d5c2d6ef16 100644 --- a/iotdb-core/consensus/src/main/java/org/apache/iotdb/consensus/common/request/DeserializedBatchIndexedConsensusRequest.java +++ b/iotdb-core/consensus/src/main/java/org/apache/iotdb/consensus/common/request/DeserializedBatchIndexedConsensusRequest.java @@ -28,13 +28,24 @@ public class DeserializedBatchIndexedConsensusRequest implements IConsensusRequest, Comparable { private final long startSyncIndex; private final long endSyncIndex; + private final int writerNodeId; + private final long writerEpoch; + private final long 
endPhysicalTime; private final List insertNodes; private long memorySize; public DeserializedBatchIndexedConsensusRequest( - long startSyncIndex, long endSyncIndex, int size) { + long startSyncIndex, + long endSyncIndex, + int size, + int writerNodeId, + long writerEpoch, + long endPhysicalTime) { this.startSyncIndex = startSyncIndex; this.endSyncIndex = endSyncIndex; + this.writerNodeId = writerNodeId; + this.writerEpoch = writerEpoch; + this.endPhysicalTime = endPhysicalTime; // use arraylist here because we know the number of requests this.insertNodes = new ArrayList<>(size); } @@ -47,6 +58,18 @@ public long getEndSyncIndex() { return endSyncIndex; } + public int getWriterNodeId() { + return writerNodeId; + } + + public long getWriterEpoch() { + return writerEpoch; + } + + public long getEndPhysicalTime() { + return endPhysicalTime; + } + public List getInsertNodes() { return insertNodes; } @@ -72,12 +95,16 @@ public boolean equals(Object o) { DeserializedBatchIndexedConsensusRequest request = (DeserializedBatchIndexedConsensusRequest) o; return startSyncIndex == request.startSyncIndex && endSyncIndex == request.endSyncIndex + && writerNodeId == request.writerNodeId + && writerEpoch == request.writerEpoch + && endPhysicalTime == request.endPhysicalTime && Objects.equals(insertNodes, request.insertNodes); } @Override public int hashCode() { - return Objects.hash(startSyncIndex, endSyncIndex, insertNodes); + return Objects.hash( + startSyncIndex, endSyncIndex, writerNodeId, writerEpoch, endPhysicalTime, insertNodes); } @Override diff --git a/iotdb-core/consensus/src/main/java/org/apache/iotdb/consensus/common/request/IndexedConsensusRequest.java b/iotdb-core/consensus/src/main/java/org/apache/iotdb/consensus/common/request/IndexedConsensusRequest.java index 2bf01d4ef868c..d78af2eba6373 100644 --- a/iotdb-core/consensus/src/main/java/org/apache/iotdb/consensus/common/request/IndexedConsensusRequest.java +++ 
b/iotdb-core/consensus/src/main/java/org/apache/iotdb/consensus/common/request/IndexedConsensusRequest.java @@ -32,6 +32,19 @@ public class IndexedConsensusRequest implements IConsensusRequest { private final long searchIndex; private final long syncIndex; + + /** routing epoch from ConfigNode broadcast for ordered consensus subscription */ + private long epoch = 0; + + /** Millisecond physical time used as the first ordering key in the new subscription progress. */ + private long physicalTime = 0; + + /** Writer node id used as the second ordering key across multiple writers. */ + private int nodeId = -1; + + /** Writer-local lifecycle id. */ + private long writerEpoch = 0; + private final List requests; private final List serializedRequests; private long memorySize = 0; @@ -86,6 +99,56 @@ public long getSyncIndex() { return syncIndex; } + /** + * Returns the writer-local sequence used by the new subscription progress model. + * + *

For locally generated requests this is the request searchIndex. For replicated requests this + * is the source leader's propagated localSeq carried in syncIndex. + */ + public long getProgressLocalSeq() { + return syncIndex >= 0 ? syncIndex : searchIndex; + } + + public long getEpoch() { + return epoch; + } + + public IndexedConsensusRequest setEpoch(long epoch) { + this.epoch = epoch; + return this; + } + + public long getPhysicalTime() { + return physicalTime; + } + + public IndexedConsensusRequest setPhysicalTime(long physicalTime) { + this.physicalTime = physicalTime; + return this; + } + + public int getNodeId() { + return nodeId; + } + + public IndexedConsensusRequest setNodeId(int nodeId) { + this.nodeId = nodeId; + return this; + } + + public long getWriterEpoch() { + return writerEpoch; + } + + public IndexedConsensusRequest setWriterEpoch(long writerEpoch) { + this.writerEpoch = writerEpoch; + return this; + } + + public long getLocalSeq() { + return searchIndex; + } + @Override public boolean equals(Object o) { if (this == o) { diff --git a/iotdb-core/consensus/src/main/java/org/apache/iotdb/consensus/config/IoTConsensusConfig.java b/iotdb-core/consensus/src/main/java/org/apache/iotdb/consensus/config/IoTConsensusConfig.java index 32c4664b60dfd..738a72c4bc4ec 100644 --- a/iotdb-core/consensus/src/main/java/org/apache/iotdb/consensus/config/IoTConsensusConfig.java +++ b/iotdb-core/consensus/src/main/java/org/apache/iotdb/consensus/config/IoTConsensusConfig.java @@ -323,6 +323,7 @@ public static class Replication { private final IMemoryBlock consensusMemoryBlock; private final double maxMemoryRatioForQueue; private final long regionMigrationSpeedLimitBytesPerSecond; + private final long subscriptionWalRetentionSizeInBytes; private Replication( int maxLogEntriesNumPerBatch, @@ -338,7 +339,8 @@ private Replication( long checkpointGap, IMemoryBlock consensusMemoryBlock, double maxMemoryRatioForQueue, - long regionMigrationSpeedLimitBytesPerSecond) { + long 
regionMigrationSpeedLimitBytesPerSecond, + long subscriptionWalRetentionSizeInBytes) { this.maxLogEntriesNumPerBatch = maxLogEntriesNumPerBatch; this.maxSizePerBatch = maxSizePerBatch; this.maxPendingBatchesNum = maxPendingBatchesNum; @@ -353,6 +355,7 @@ private Replication( this.consensusMemoryBlock = consensusMemoryBlock; this.maxMemoryRatioForQueue = maxMemoryRatioForQueue; this.regionMigrationSpeedLimitBytesPerSecond = regionMigrationSpeedLimitBytesPerSecond; + this.subscriptionWalRetentionSizeInBytes = subscriptionWalRetentionSizeInBytes; } public int getMaxLogEntriesNumPerBatch() { @@ -411,6 +414,10 @@ public long getRegionMigrationSpeedLimitBytesPerSecond() { return regionMigrationSpeedLimitBytesPerSecond; } + public long getSubscriptionWalRetentionSizeInBytes() { + return subscriptionWalRetentionSizeInBytes; + } + public static Replication.Builder newBuilder() { return new Replication.Builder(); } @@ -434,6 +441,7 @@ public static class Builder { "Consensus-Default", null, Runtime.getRuntime().maxMemory() / 10); private double maxMemoryRatioForQueue = 0.6; private long regionMigrationSpeedLimitBytesPerSecond = 32 * 1024 * 1024L; + private long subscriptionWalRetentionSizeInBytes = 0; public Replication.Builder setMaxLogEntriesNumPerBatch(int maxLogEntriesNumPerBatch) { this.maxLogEntriesNumPerBatch = maxLogEntriesNumPerBatch; @@ -508,6 +516,12 @@ public Builder setRegionMigrationSpeedLimitBytesPerSecond( return this; } + public Builder setSubscriptionWalRetentionSizeInBytes( + long subscriptionWalRetentionSizeInBytes) { + this.subscriptionWalRetentionSizeInBytes = subscriptionWalRetentionSizeInBytes; + return this; + } + public Replication build() { return new Replication( maxLogEntriesNumPerBatch, @@ -523,7 +537,8 @@ public Replication build() { checkpointGap, consensusMemoryBlock, maxMemoryRatioForQueue, - regionMigrationSpeedLimitBytesPerSecond); + regionMigrationSpeedLimitBytesPerSecond, + subscriptionWalRetentionSizeInBytes); } } } diff --git 
a/iotdb-core/consensus/src/main/java/org/apache/iotdb/consensus/iot/IoTConsensus.java b/iotdb-core/consensus/src/main/java/org/apache/iotdb/consensus/iot/IoTConsensus.java index 959191ca2d6d3..8cb168272b295 100644 --- a/iotdb-core/consensus/src/main/java/org/apache/iotdb/consensus/iot/IoTConsensus.java +++ b/iotdb-core/consensus/src/main/java/org/apache/iotdb/consensus/iot/IoTConsensus.java @@ -82,6 +82,7 @@ import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicBoolean; import java.util.function.BiConsumer; +import java.util.function.Consumer; import java.util.stream.Collectors; public class IoTConsensus implements IConsensus { @@ -98,6 +99,19 @@ public class IoTConsensus implements IConsensus { private final IoTConsensusRPCService service; private final RegisterManager registerManager = new RegisterManager(); private IoTConsensusConfig config; + + /** + * Optional callback invoked after a new local peer is created via {@link #createLocalPeer}. Used + * by the subscription system to auto-bind prefetching queues to new DataRegions. + */ + public static volatile BiConsumer onNewPeerCreated; + + /** + * Optional callback invoked before a local peer is deleted via {@link #deleteLocalPeer}. Used by + * the subscription system to unbind and clean up prefetching queues before the region is removed. 
+ */ + public static volatile Consumer onPeerRemoved; + private final IClientManager clientManager; private final IClientManager syncClientManager; private final ScheduledExecutorService backgroundTaskService; @@ -299,11 +313,33 @@ public void createLocalPeer(ConsensusGroupId groupId, List peers) if (exist.get()) { throw new ConsensusGroupAlreadyExistException(groupId); } + + // Notify subscription system about new peer creation for auto-binding + final BiConsumer callback = onNewPeerCreated; + if (callback != null) { + try { + callback.accept(groupId, stateMachineMap.get(groupId)); + } catch (final Exception e) { + logger.warn("onNewPeerCreated callback failed for group {}", groupId, e); + } + } } @Override public void deleteLocalPeer(ConsensusGroupId groupId) throws ConsensusException { KillPoint.setKillPoint(IoTConsensusDeleteLocalPeerKillPoints.BEFORE_DELETE); + + // Notify subscription system before stopping the peer, so that subscription queues can + // properly unregister from the still-alive serverImpl. 
+ final Consumer removeCallback = onPeerRemoved; + if (removeCallback != null) { + try { + removeCallback.accept(groupId); + } catch (final Exception e) { + logger.warn("onPeerRemoved callback failed for group {}", groupId, e); + } + } + AtomicBoolean exist = new AtomicBoolean(false); stateMachineMap.computeIfPresent( groupId, diff --git a/iotdb-core/consensus/src/main/java/org/apache/iotdb/consensus/iot/IoTConsensusServerImpl.java b/iotdb-core/consensus/src/main/java/org/apache/iotdb/consensus/iot/IoTConsensusServerImpl.java index 567261efffffa..6405d5c9e93d6 100644 --- a/iotdb-core/consensus/src/main/java/org/apache/iotdb/consensus/iot/IoTConsensusServerImpl.java +++ b/iotdb-core/consensus/src/main/java/org/apache/iotdb/consensus/iot/IoTConsensusServerImpl.java @@ -58,6 +58,8 @@ import org.apache.iotdb.consensus.iot.thrift.TRemoveSyncLogChannelRes; import org.apache.iotdb.consensus.iot.thrift.TSendSnapshotFragmentReq; import org.apache.iotdb.consensus.iot.thrift.TSendSnapshotFragmentRes; +import org.apache.iotdb.consensus.iot.thrift.TSyncSafeHlcReq; +import org.apache.iotdb.consensus.iot.thrift.TSyncSafeHlcRes; import org.apache.iotdb.consensus.iot.thrift.TTriggerSnapshotLoadReq; import org.apache.iotdb.consensus.iot.thrift.TTriggerSnapshotLoadRes; import org.apache.iotdb.consensus.iot.thrift.TWaitReleaseAllRegionRelatedResourceReq; @@ -86,10 +88,13 @@ import java.util.LinkedList; import java.util.List; import java.util.Objects; +import java.util.Optional; import java.util.PriorityQueue; import java.util.TreeSet; import java.util.UUID; +import java.util.concurrent.BlockingQueue; import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.CopyOnWriteArrayList; import java.util.concurrent.ScheduledExecutorService; import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicLong; @@ -103,6 +108,7 @@ public class IoTConsensusServerImpl { public static final String SNAPSHOT_DIR_NAME = "snapshot"; + private static final String 
WRITER_META_FILE_NAME = "writer.meta"; private static final Pattern SNAPSHOT_INDEX_PATTEN = Pattern.compile(".*[^\\d](?=(\\d+))"); private static final PerformanceOverviewMetrics PERFORMANCE_OVERVIEW_METRICS = PerformanceOverviewMetrics.getInstance(); @@ -128,6 +134,31 @@ public class IoTConsensusServerImpl { IoTConsensusRateLimiter.getInstance(); private IndexedConsensusRequest lastConsensusRequest; + // Subscription queues receive IndexedConsensusRequest in real-time from write(), + // similar to LogDispatcher, enabling in-memory data delivery without waiting for WAL flush. + private final List> subscriptionQueues = + new CopyOnWriteArrayList<>(); + private static final long SUBSCRIPTION_QUEUE_FULL_LOG_INTERVAL_MS = TimeUnit.SECONDS.toMillis(10); + private final AtomicLong subscriptionQueueFullDroppedEntries = new AtomicLong(); + private final AtomicLong lastSubscriptionQueueFullLogTimeMs = new AtomicLong(); + + /** Current routing epoch for ordered consensus subscription. Set by external routing changes. */ + private volatile long currentRoutingEpoch = 0; + + /** Lifecycle identifier of the local writer for this region replica. */ + private volatile long currentWriterEpoch = 1; + + /** + * Maximum physical time known to this replica. Local writes assign from it; remote replication + * can also raise it so future local writes do not regress behind observed remote events. 
+ */ + private final AtomicLong lastAssignedPhysicalTime = new AtomicLong(0); + + private final WriterSafeFrontierTracker writerSafeFrontierTracker = + new WriterSafeFrontierTracker(); + + private final Path writerMetaPath; + public IoTConsensusServerImpl( String storageDir, Peer thisNode, @@ -150,6 +181,8 @@ public IoTConsensusServerImpl( this.consensusReqReader = (ConsensusReqReader) stateMachine.read(new GetConsensusReqReaderPlan()); this.searchIndex = new AtomicLong(consensusReqReader.getCurrentSearchIndex()); + this.writerMetaPath = Paths.get(storageDir, WRITER_META_FILE_NAME); + initializeWriterMeta(); this.ioTConsensusServerMetrics = new IoTConsensusServerMetrics(this); this.logDispatcher = new LogDispatcher(this, clientManager); } @@ -209,6 +242,7 @@ public TSStatus write(IConsensusRequest request) { writeToStateMachineStartTime - getStateMachineLockTime); IndexedConsensusRequest indexedConsensusRequest = buildIndexedConsensusRequestForLocalRequest(request); + indexedConsensusRequest.setEpoch(currentRoutingEpoch); lastConsensusRequest = indexedConsensusRequest; if (indexedConsensusRequest.getSearchIndex() % 100000 == 0) { logger.info( @@ -228,6 +262,11 @@ public TSStatus write(IConsensusRequest request) { ioTConsensusServerMetrics.recordWriteStateMachineTime( writeToStateMachineEndTime - writeToStateMachineStartTime); if (result.getCode() == TSStatusCode.SUCCESS_STATUS.getStatusCode()) { + writerSafeFrontierTracker.recordAppliedProgress( + thisNode.getNodeId(), + currentWriterEpoch, + indexedConsensusRequest.getPhysicalTime(), + indexedConsensusRequest.getLocalSeq()); // The index is used when constructing batch in LogDispatcher. If its value // increases but the corresponding request does not exist or is not put into // the queue, the dispatcher will try to find the request in WAL. This behavior @@ -236,17 +275,75 @@ public TSStatus write(IConsensusRequest request) { // in one transaction. 
synchronized (searchIndex) { logDispatcher.offer(indexedConsensusRequest); + // Deliver to subscription queues for real-time in-memory consumption. + // Offer AFTER stateMachine.write() so that InsertNode has inferred types + // and properly typed values (same timing as LogDispatcher). + final int sqCount = subscriptionQueues.size(); + if (sqCount > 0) { + logger.debug( + "write() offering to {} subscription queue(s), " + + "group={}, searchIndex={}, requestType={}", + sqCount, + consensusGroupId, + indexedConsensusRequest.getSearchIndex(), + indexedConsensusRequest.getRequests().isEmpty() + ? "EMPTY" + : indexedConsensusRequest.getRequests().get(0).getClass().getSimpleName()); + for (final BlockingQueue sq : subscriptionQueues) { + final boolean offered = sq.offer(indexedConsensusRequest); + logger.debug( + "offer result={}, queueSize={}, queueRemaining={}", + offered, + sq.size(), + sq.remainingCapacity()); + if (!offered) { + final long droppedCount = subscriptionQueueFullDroppedEntries.incrementAndGet(); + final long now = System.currentTimeMillis(); + final long lastLogTime = lastSubscriptionQueueFullLogTimeMs.get(); + if (now - lastLogTime >= SUBSCRIPTION_QUEUE_FULL_LOG_INTERVAL_MS + && lastSubscriptionQueueFullLogTimeMs.compareAndSet(lastLogTime, now)) { + logger.warn( + "Subscription queue full, dropped {} entry(s) in the last {} ms, latest " + + "searchIndex={}, queueSize={}, queueRemaining={}", + subscriptionQueueFullDroppedEntries.getAndSet(0), + SUBSCRIPTION_QUEUE_FULL_LOG_INTERVAL_MS, + indexedConsensusRequest.getSearchIndex(), + sq.size(), + sq.remainingCapacity()); + } else { + logger.debug( + "Subscription queue full, dropped entry searchIndex={}, droppedCount={}", + indexedConsensusRequest.getSearchIndex(), + droppedCount); + } + } + } + } else { + // Log periodically when no subscription queues are registered + if (indexedConsensusRequest.getSearchIndex() % 50 == 0) { + logger.debug( + "write() no subscription queues registered, " + + "group={}, 
searchIndex={}, this={}", + consensusGroupId, + indexedConsensusRequest.getSearchIndex(), + System.identityHashCode(this)); + } + } searchIndex.incrementAndGet(); } + persistWriterMetaOnSuccess(indexedConsensusRequest); // statistic the time of offering request into queue ioTConsensusServerMetrics.recordOfferRequestToQueueTime( System.nanoTime() - writeToStateMachineEndTime); } else { logger.debug( - "{}: write operation failed. searchIndex: {}. Code: {}", + "write operation FAILED. group={}, searchIndex={}, code={}, " + + "subscriptionQueues={}, this={}", thisNode.getGroupId(), indexedConsensusRequest.getSearchIndex(), - result.getCode()); + result.getCode(), + subscriptionQueues.size(), + System.identityHashCode(this)); } // statistic the time of total write process ioTConsensusServerMetrics.recordConsensusWriteTime( @@ -435,7 +532,7 @@ public interface ThrowableFunction { public void inactivatePeer(Peer peer, boolean forDeletionPurpose) throws ConsensusGroupModifyPeerException { ConsensusGroupModifyPeerException lastException = null; - // In region migration, if the target node restarts before the “addRegionPeer” phase within 1 + // In region migration, if the target node restarts before the “addRegionPeer” phase within 1 // minutes, // the client in the ClientManager will become invalid. 
// This PR adds 1 retry at this point to ensure that region migration can still proceed @@ -659,6 +756,38 @@ private boolean isSuccess(TSStatus status) { return status.getCode() == TSStatusCode.SUCCESS_STATUS.getStatusCode(); } + public TSStatus syncSafeHlcToPeer( + final Peer targetPeer, + final int writerNodeId, + final long writerEpoch, + final long safePhysicalTime, + final long barrierLocalSeq) { + try (SyncIoTConsensusServiceClient client = + syncClientManager.borrowClient(targetPeer.getEndpoint())) { + final TSyncSafeHlcRes res = + client.syncSafeHlc( + new TSyncSafeHlcReq() + .setConsensusGroupId(thisNode.getGroupId().convertToTConsensusGroupId()) + .setWriterNodeId(writerNodeId) + .setWriterEpoch(writerEpoch) + .setSafePhysicalTime(safePhysicalTime) + .setBarrierLocalSeq(barrierLocalSeq)); + return res.getStatus(); + } catch (Exception e) { + logger.debug( + "Failed to sync safeHLC to peer {} for group {}, writer=({}, {}), safePt={}, barrier={}", + targetPeer, + consensusGroupId, + writerNodeId, + writerEpoch, + safePhysicalTime, + barrierLocalSeq, + e); + return new TSStatus(TSStatusCode.INTERNAL_SERVER_ERROR.getStatusCode()) + .setMessage(e.getMessage()); + } + } + /** build SyncLog channel with safeIndex as the default initial sync index. 
*/ public void buildSyncLogChannel(Peer targetPeer, boolean startNow) { buildSyncLogChannel(targetPeer, getMinSyncIndex(), startNow); @@ -720,13 +849,152 @@ public IndexedConsensusRequest buildIndexedConsensusRequestForLocalRequest( new IoTProgressIndex(thisNode.getNodeId(), searchIndex.get() + 1); ((ComparableConsensusRequest) request).setProgressIndex(iotProgressIndex); } - return new IndexedConsensusRequest(searchIndex.get() + 1, Collections.singletonList(request)); + return new IndexedConsensusRequest(searchIndex.get() + 1, Collections.singletonList(request)) + .setPhysicalTime(assignPhysicalTimeInMs()) + .setNodeId(thisNode.getNodeId()) + .setWriterEpoch(currentWriterEpoch); } public IndexedConsensusRequest buildIndexedConsensusRequestForRemoteRequest( - long syncIndex, List requests) { - return new IndexedConsensusRequest( - ConsensusReqReader.DEFAULT_SEARCH_INDEX, syncIndex, requests); + long syncIndex, + long epoch, + long physicalTime, + int nodeId, + long writerEpoch, + List requests) { + observePhysicalTimeLowerBound(physicalTime); + IndexedConsensusRequest req = + new IndexedConsensusRequest(ConsensusReqReader.DEFAULT_SEARCH_INDEX, syncIndex, requests); + req.setEpoch(epoch); + req.setPhysicalTime(physicalTime); + req.setNodeId(nodeId); + req.setWriterEpoch(writerEpoch); + return req; + } + + public WriterSafeFrontierTracker.SafeHlc createIdleSafeHlcForCurrentWriter() { + final long safePhysicalTime = assignPhysicalTimeInMs(); + final long barrierLocalSeq = searchIndex.get(); + writerSafeFrontierTracker.recordAppliedProgress( + thisNode.getNodeId(), currentWriterEpoch, safePhysicalTime, barrierLocalSeq); + return new WriterSafeFrontierTracker.SafeHlc(safePhysicalTime, barrierLocalSeq); + } + + public void observeRemoteSafeHlc( + final int writerNodeId, + final long writerEpoch, + final long safePhysicalTime, + final long barrierLocalSeq) { + observePhysicalTimeLowerBound(safePhysicalTime); + writerSafeFrontierTracker.observePendingSafeHlc( + 
writerNodeId, writerEpoch, safePhysicalTime, barrierLocalSeq); + } + + public void recordRemoteAppliedWriterProgress( + final int writerNodeId, + final long writerEpoch, + final long physicalTime, + final long appliedLocalSeq) { + writerSafeFrontierTracker.recordAppliedProgress( + writerNodeId, writerEpoch, physicalTime, appliedLocalSeq); + } + + public long getEffectiveSafePhysicalTime(final int writerNodeId, final long writerEpoch) { + return writerSafeFrontierTracker.getEffectiveSafePt(writerNodeId, writerEpoch); + } + + public WriterSafeFrontierTracker getWriterSafeFrontierTracker() { + return writerSafeFrontierTracker; + } + + public boolean hasSubscriptionConsumers() { + return !subscriptionQueues.isEmpty(); + } + + private long assignPhysicalTimeInMs() { + while (true) { + final long previous = lastAssignedPhysicalTime.get(); + final long candidate = Math.max(System.currentTimeMillis(), previous); + if (lastAssignedPhysicalTime.compareAndSet(previous, candidate)) { + return candidate; + } + } + } + + private void observePhysicalTimeLowerBound(final long observedPhysicalTime) { + if (observedPhysicalTime <= 0) { + return; + } + while (true) { + final long previous = lastAssignedPhysicalTime.get(); + final long candidate = Math.max(previous, observedPhysicalTime); + if (candidate == previous || lastAssignedPhysicalTime.compareAndSet(previous, candidate)) { + return; + } + } + } + + private void initializeWriterMeta() { + final long recoveredSearchIndex = searchIndex.get(); + try { + final Optional writerMetaOptional = WriterMeta.load(writerMetaPath); + if (writerMetaOptional.isPresent()) { + final WriterMeta writerMeta = writerMetaOptional.get(); + if (recoveredSearchIndex >= writerMeta.getLastAllocatedLocalSeq()) { + currentWriterEpoch = writerMeta.getWriterEpoch(); + logger.info( + "Recovered writer meta for group {} from {}, writerEpoch={}, recoveredLocalSeq={}, " + + "persistedLocalSeq={}", + consensusGroupId, + writerMetaPath, + currentWriterEpoch, + 
recoveredSearchIndex, + writerMeta.getLastAllocatedLocalSeq()); + } else { + currentWriterEpoch = writerMeta.getWriterEpoch() + 1; + logger.warn( + "Recovered searchIndex {} is behind persisted writer localSeq {} for group {}. " + + "Starting a new writerEpoch {}.", + recoveredSearchIndex, + writerMeta.getLastAllocatedLocalSeq(), + consensusGroupId, + currentWriterEpoch); + } + lastAssignedPhysicalTime.set( + Math.max(writerMeta.getLastAssignedPhysicalTimeMs(), System.currentTimeMillis())); + return; + } + } catch (IOException e) { + logger.warn( + "Failed to load writer meta for group {} from {}. Starting with writerEpoch=1.", + consensusGroupId, + writerMetaPath, + e); + } + currentWriterEpoch = 1; + lastAssignedPhysicalTime.set(System.currentTimeMillis()); + logger.info( + "Initialized fresh writer meta for group {}, writerEpoch={}, recoveredLocalSeq={}", + consensusGroupId, + currentWriterEpoch, + recoveredSearchIndex); + } + + private void persistWriterMetaOnSuccess(final IndexedConsensusRequest indexedConsensusRequest) { + try { + new WriterMeta( + currentWriterEpoch, + indexedConsensusRequest.getLocalSeq(), + indexedConsensusRequest.getPhysicalTime()) + .persist(writerMetaPath); + } catch (IOException e) { + logger.warn( + "Failed to persist writer meta for group {} at localSeq={}, pt={}", + consensusGroupId, + indexedConsensusRequest.getLocalSeq(), + indexedConsensusRequest.getPhysicalTime(), + e); + } } /** @@ -757,6 +1025,45 @@ public long getSearchIndex() { return searchIndex.get(); } + public long getCurrentWriterEpoch() { + return currentWriterEpoch; + } + + public ConsensusReqReader getConsensusReqReader() { + return consensusReqReader; + } + + /** + * Registers a subscription pending queue for real-time in-memory data delivery. When {@link + * #write(IConsensusRequest)} succeeds, the IndexedConsensusRequest is offered to all registered + * subscription queues, enabling subscription consumers to receive data without waiting for WAL + * flush. 
+ * + * @param queue the blocking queue to receive IndexedConsensusRequest entries + */ + public void registerSubscriptionQueue(final BlockingQueue queue) { + subscriptionQueues.add(queue); + // Immediately re-evaluate the safe delete index with new subscription awareness + checkAndUpdateSafeDeletedSearchIndex(); + logger.info( + "Registered subscription queue for group {}, " + + "total subscription queues: {}, currentSearchIndex={}, this={}", + consensusGroupId, + subscriptionQueues.size(), + searchIndex.get(), + System.identityHashCode(this)); + } + + public void unregisterSubscriptionQueue(final BlockingQueue queue) { + subscriptionQueues.remove(queue); + // Re-evaluate: with fewer subscribers, more WAL may be deletable + checkAndUpdateSafeDeletedSearchIndex(); + logger.info( + "Unregistered subscription queue for group {}, remaining subscription queues: {}", + consensusGroupId, + subscriptionQueues.size()); + } + public long getSyncLag() { long minSyncIndex = getMinSyncIndex(); return getSearchIndex() - minSyncIndex; @@ -872,17 +1179,59 @@ void checkAndUpdateIndex() { } /** - * If there is only one replica, set it to Long.MAX_VALUE. If there are multiple replicas, get the - * latest SafelyDeletedSearchIndex again. This enables wal to be deleted in a timely manner. + * Computes and updates the safe-to-delete WAL search index based on replication progress and + * subscription WAL retention policy. When no subscriptions exist, WAL is cleaned normally. + * + *

Subscription retention uses this region's own WAL disk usage (not global) and supports + * graduated cleanup: when WAL exceeds the retention limit, only enough oldest WAL files are + * released to bring the size back within the limit, rather than releasing all WAL at once. */ - void checkAndUpdateSafeDeletedSearchIndex() { + public void checkAndUpdateSafeDeletedSearchIndex() { if (configuration.isEmpty()) { logger.error( "Configuration is empty, which is unexpected. Safe deleted search index won't be updated this time."); - } else if (configuration.size() == 1) { + return; + } + + final boolean hasSubscriptions = !subscriptionQueues.isEmpty(); + final long retentionSizeLimit = + config.getReplication().getSubscriptionWalRetentionSizeInBytes(); + + if (configuration.size() == 1 && !hasSubscriptions) { + // Single replica, no subscription consumers => delete all WAL freely consensusReqReader.setSafelyDeletedSearchIndex(Long.MAX_VALUE); + consensusReqReader.setSubscriptionRetainedMinVersionId(Long.MAX_VALUE); } else { - consensusReqReader.setSafelyDeletedSearchIndex(getMinFlushedSyncIndex()); + final long replicationIndex = + configuration.size() > 1 ? getMinFlushedSyncIndex() : Long.MAX_VALUE; + + // Subscription WAL retention: if subscriptions exist and retention is configured, + // use this region's own WAL size to decide how much to retain. + long subscriptionRetentionBound = Long.MAX_VALUE; + long subscriptionRetainedMinVersionId = Long.MAX_VALUE; + if (hasSubscriptions && retentionSizeLimit > 0) { + final long regionWalSize = consensusReqReader.getRegionDiskUsage(); + if (regionWalSize <= retentionSizeLimit) { + // Region WAL size is within retention limit — preserve all WAL for subscribers. 
+ // Use Long.MIN_VALUE + 1 instead of DEFAULT_SAFELY_DELETED_SEARCH_INDEX (Long.MIN_VALUE) + // because WAL's DeleteOutdatedFileTask treats Long.MIN_VALUE as a special case that + // allows all files to be deleted (no consensus constraint), which is opposite to our + // intent here. Long.MIN_VALUE + 1 avoids the special case and is still less than any + // real searchIndex (>= 0), so no WAL files will pass the searchIndex filter. + subscriptionRetentionBound = Long.MIN_VALUE + 1; + // Retain all WAL files for subscription + subscriptionRetainedMinVersionId = 0; + } else { + // Region WAL exceeds retention limit — free just enough to bring it back within limit + final long excess = regionWalSize - retentionSizeLimit; + subscriptionRetentionBound = consensusReqReader.getSearchIndexToFreeAtLeast(excess); + subscriptionRetainedMinVersionId = consensusReqReader.getVersionIdToFreeAtLeast(excess); + } + } + + consensusReqReader.setSafelyDeletedSearchIndex( + Math.min(replicationIndex, subscriptionRetentionBound)); + consensusReqReader.setSubscriptionRetainedMinVersionId(subscriptionRetainedMinVersionId); } } @@ -1019,6 +1368,14 @@ private TSStatus cacheAndInsertLatestNode(DeserializedBatchIndexedConsensusReque insertNode.markAsGeneratedByRemoteConsensusLeader(); subStatus.add(stateMachine.write(insertNode)); } + if (subStatus.stream() + .allMatch(status -> status.getCode() == TSStatusCode.SUCCESS_STATUS.getStatusCode())) { + recordRemoteAppliedWriterProgress( + request.getWriterNodeId(), + request.getWriterEpoch(), + request.getEndPhysicalTime(), + request.getEndSyncIndex()); + } long applyTime = System.nanoTime(); ioTConsensusServerMetrics.recordApplyCost(applyTime - sortTime); queueSortCondition.signalAll(); diff --git a/iotdb-core/consensus/src/main/java/org/apache/iotdb/consensus/iot/WriterMeta.java b/iotdb-core/consensus/src/main/java/org/apache/iotdb/consensus/iot/WriterMeta.java new file mode 100644 index 0000000000000..c3d30c8594c06 --- /dev/null +++ 
b/iotdb-core/consensus/src/main/java/org/apache/iotdb/consensus/iot/WriterMeta.java @@ -0,0 +1,102 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.iotdb.consensus.iot; + +import java.io.DataInputStream; +import java.io.DataOutputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.nio.file.StandardCopyOption; +import java.nio.file.StandardOpenOption; +import java.util.Optional; + +final class WriterMeta { + + private static final int FORMAT_VERSION = 1; + + private final long writerEpoch; + private final long lastAllocatedLocalSeq; + private final long lastAssignedPhysicalTimeMs; + + WriterMeta(long writerEpoch, long lastAllocatedLocalSeq, long lastAssignedPhysicalTimeMs) { + this.writerEpoch = writerEpoch; + this.lastAllocatedLocalSeq = lastAllocatedLocalSeq; + this.lastAssignedPhysicalTimeMs = lastAssignedPhysicalTimeMs; + } + + long getWriterEpoch() { + return writerEpoch; + } + + long getLastAllocatedLocalSeq() { + return lastAllocatedLocalSeq; + } + + long getLastAssignedPhysicalTimeMs() { + return lastAssignedPhysicalTimeMs; + } + + 
static Optional load(Path path) throws IOException { + if (!Files.exists(path)) { + return Optional.empty(); + } + try (InputStream inputStream = Files.newInputStream(path, StandardOpenOption.READ); + DataInputStream dataInputStream = new DataInputStream(inputStream)) { + final int version = dataInputStream.readInt(); + if (version != FORMAT_VERSION) { + throw new IOException( + String.format( + "Unsupported writer meta version %d in %s", version, path.toAbsolutePath())); + } + return Optional.of( + new WriterMeta( + dataInputStream.readLong(), dataInputStream.readLong(), dataInputStream.readLong())); + } + } + + void persist(Path path) throws IOException { + final Path parent = path.getParent(); + if (parent != null && !Files.exists(parent)) { + Files.createDirectories(parent); + } + final Path tempPath = + parent == null + ? Paths.get(path + ".tmp") + : parent.resolve(path.getFileName().toString() + ".tmp"); + try (OutputStream outputStream = + Files.newOutputStream( + tempPath, + StandardOpenOption.CREATE, + StandardOpenOption.TRUNCATE_EXISTING, + StandardOpenOption.WRITE); + DataOutputStream dataOutputStream = new DataOutputStream(outputStream)) { + dataOutputStream.writeInt(FORMAT_VERSION); + dataOutputStream.writeLong(writerEpoch); + dataOutputStream.writeLong(lastAllocatedLocalSeq); + dataOutputStream.writeLong(lastAssignedPhysicalTimeMs); + dataOutputStream.flush(); + } + Files.move(tempPath, path, StandardCopyOption.REPLACE_EXISTING, StandardCopyOption.ATOMIC_MOVE); + } +} diff --git a/iotdb-core/consensus/src/main/java/org/apache/iotdb/consensus/iot/WriterSafeFrontierTracker.java b/iotdb-core/consensus/src/main/java/org/apache/iotdb/consensus/iot/WriterSafeFrontierTracker.java new file mode 100644 index 0000000000000..f48c258c4dd3a --- /dev/null +++ b/iotdb-core/consensus/src/main/java/org/apache/iotdb/consensus/iot/WriterSafeFrontierTracker.java @@ -0,0 +1,205 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor 
license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.iotdb.consensus.iot; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.Collections; +import java.util.HashMap; +import java.util.Map; +import java.util.Objects; + +/** + * Tracks per-writer safe frontier on the receiving side. + * + *

Each writer keeps at most one pending safeHLC because generated safeHLC for the same writer is + * expected to be totally ordered by both safePt and barrierLocalSeq. + */ +public class WriterSafeFrontierTracker { + + private static final Logger LOGGER = LoggerFactory.getLogger(WriterSafeFrontierTracker.class); + + private final Map states = new HashMap<>(); + + public synchronized void recordAppliedProgress( + final int writerNodeId, + final long writerEpoch, + final long physicalTime, + final long appliedLocalSeq) { + final WriterIdentity writerIdentity = new WriterIdentity(writerNodeId, writerEpoch); + final WriterFrontierState state = + states.computeIfAbsent(writerIdentity, ignored -> new WriterFrontierState()); + state.appliedLocalSeq = Math.max(state.appliedLocalSeq, appliedLocalSeq); + if (physicalTime > 0) { + state.effectiveSafePt = Math.max(state.effectiveSafePt, physicalTime); + } + promotePendingIfReady(state); + } + + public synchronized void observePendingSafeHlc( + final int writerNodeId, + final long writerEpoch, + final long safePhysicalTime, + final long barrierLocalSeq) { + if (safePhysicalTime <= 0) { + return; + } + final WriterIdentity writerIdentity = new WriterIdentity(writerNodeId, writerEpoch); + final WriterFrontierState state = + states.computeIfAbsent(writerIdentity, ignored -> new WriterFrontierState()); + final SafeHlc candidate = new SafeHlc(safePhysicalTime, barrierLocalSeq); + if (state.appliedLocalSeq >= barrierLocalSeq) { + state.effectiveSafePt = Math.max(state.effectiveSafePt, safePhysicalTime); + state.pendingSafeHlc = null; + return; + } + if (state.pendingSafeHlc == null) { + state.pendingSafeHlc = candidate; + return; + } + final SafeHlc pending = state.pendingSafeHlc; + if (dominates(candidate, pending)) { + state.pendingSafeHlc = candidate; + return; + } + if (dominates(pending, candidate)) { + return; + } + LOGGER.warn( + "Observed incomparable safeHLC for writer {}. 
keep pending={}, ignore candidate={}", + writerIdentity, + pending, + candidate); + } + + public synchronized long getEffectiveSafePt(final int writerNodeId, final long writerEpoch) { + final WriterFrontierState state = states.get(new WriterIdentity(writerNodeId, writerEpoch)); + return Objects.nonNull(state) ? state.effectiveSafePt : 0L; + } + + public synchronized SafeHlc getPendingSafeHlc(final int writerNodeId, final long writerEpoch) { + final WriterFrontierState state = states.get(new WriterIdentity(writerNodeId, writerEpoch)); + return Objects.nonNull(state) ? state.pendingSafeHlc : null; + } + + public synchronized Map snapshotEffectiveSafePts() { + final Map snapshot = new HashMap<>(); + for (final Map.Entry entry : states.entrySet()) { + snapshot.put(entry.getKey(), entry.getValue().effectiveSafePt); + } + return Collections.unmodifiableMap(snapshot); + } + + private void promotePendingIfReady(final WriterFrontierState state) { + if (state.pendingSafeHlc == null) { + return; + } + if (state.appliedLocalSeq >= state.pendingSafeHlc.getBarrierLocalSeq()) { + state.effectiveSafePt = + Math.max(state.effectiveSafePt, state.pendingSafeHlc.getSafePhysicalTime()); + state.pendingSafeHlc = null; + } + } + + private static boolean dominates(final SafeHlc left, final SafeHlc right) { + return left.safePhysicalTime >= right.safePhysicalTime + && left.barrierLocalSeq >= right.barrierLocalSeq; + } + + public static final class WriterIdentity { + private final int writerNodeId; + private final long writerEpoch; + + public WriterIdentity(final int writerNodeId, final long writerEpoch) { + this.writerNodeId = writerNodeId; + this.writerEpoch = writerEpoch; + } + + public int getWriterNodeId() { + return writerNodeId; + } + + public long getWriterEpoch() { + return writerEpoch; + } + + @Override + public boolean equals(final Object obj) { + if (this == obj) { + return true; + } + if (!(obj instanceof WriterIdentity)) { + return false; + } + final WriterIdentity that = 
(WriterIdentity) obj; + return writerNodeId == that.writerNodeId && writerEpoch == that.writerEpoch; + } + + @Override + public int hashCode() { + return Objects.hash(writerNodeId, writerEpoch); + } + + @Override + public String toString() { + return "WriterIdentity{" + + "writerNodeId=" + + writerNodeId + + ", writerEpoch=" + + writerEpoch + + '}'; + } + } + + public static final class SafeHlc { + private final long safePhysicalTime; + private final long barrierLocalSeq; + + public SafeHlc(final long safePhysicalTime, final long barrierLocalSeq) { + this.safePhysicalTime = safePhysicalTime; + this.barrierLocalSeq = barrierLocalSeq; + } + + public long getSafePhysicalTime() { + return safePhysicalTime; + } + + public long getBarrierLocalSeq() { + return barrierLocalSeq; + } + + @Override + public String toString() { + return "SafeHlc{" + + "safePhysicalTime=" + + safePhysicalTime + + ", barrierLocalSeq=" + + barrierLocalSeq + + '}'; + } + } + + private static final class WriterFrontierState { + private long appliedLocalSeq = 0L; + private long effectiveSafePt = 0L; + private SafeHlc pendingSafeHlc; + } +} diff --git a/iotdb-core/consensus/src/main/java/org/apache/iotdb/consensus/iot/client/DispatchLogHandler.java b/iotdb-core/consensus/src/main/java/org/apache/iotdb/consensus/iot/client/DispatchLogHandler.java index bb0326d7473e7..bd3650fc9c231 100644 --- a/iotdb-core/consensus/src/main/java/org/apache/iotdb/consensus/iot/client/DispatchLogHandler.java +++ b/iotdb-core/consensus/src/main/java/org/apache/iotdb/consensus/iot/client/DispatchLogHandler.java @@ -71,12 +71,16 @@ public void onComplete(TSyncLogEntriesRes response) { .collect(Collectors.toList()); String messages = String.join(", ", retryStatusMessages); - logger.warn( - "Can not send {} to peer {} for {} times because {}", - batch, - thread.getPeer(), - ++retryCount, - messages); + if (++retryCount == 1) { + logger.warn("Can not send {} to peer {} because {}", batch, thread.getPeer(), messages); + } else 
{ + logger.debug( + "Can not send {} to peer {} for {} times because {}", + batch, + thread.getPeer(), + retryCount, + messages); + } sleepCorrespondingTimeAndRetryAsynchronous(); } else { if (logger.isDebugEnabled()) { @@ -105,14 +109,19 @@ public void onComplete(TSyncLogEntriesRes response) { public void onError(Exception exception) { ++retryCount; Throwable rootCause = ExceptionUtils.getRootCause(exception); - logger.warn( - "Can not send {} to peer for {} times {} because {}", - batch, - thread.getPeer(), - retryCount, - rootCause.toString()); + final Throwable actualCause = rootCause == null ? exception : rootCause; + if (retryCount == 1) { + logger.warn("Can not send {} to peer {} because {}", batch, thread.getPeer(), actualCause); + } else { + logger.debug( + "Can not send {} to peer for {} times {} because {}", + batch, + thread.getPeer(), + retryCount, + actualCause.toString()); + } // skip TApplicationException caused by follower - if (rootCause instanceof TApplicationException) { + if (actualCause instanceof TApplicationException) { completeBatch(batch); logger.warn("Skip retrying this Batch {} because of TApplicationException.", batch); logDispatcherThreadMetrics.recordSyncLogTimePerRequest(System.nanoTime() - createTime); diff --git a/iotdb-core/consensus/src/main/java/org/apache/iotdb/consensus/iot/log/ConsensusReqReader.java b/iotdb-core/consensus/src/main/java/org/apache/iotdb/consensus/iot/log/ConsensusReqReader.java index 6959b56b674d3..0f03ac8799d1e 100644 --- a/iotdb-core/consensus/src/main/java/org/apache/iotdb/consensus/iot/log/ConsensusReqReader.java +++ b/iotdb-core/consensus/src/main/java/org/apache/iotdb/consensus/iot/log/ConsensusReqReader.java @@ -90,4 +90,48 @@ interface ReqIterator { /** Get total size of wal files. */ long getTotalSize(); + + /** + * Get disk usage of this specific WAL node (region-local), as opposed to {@link #getTotalSize()} + * which returns the global WAL disk usage across all WAL nodes. 
+ */ + default long getRegionDiskUsage() { + return getTotalSize(); + } + + /** + * Calculate the search index boundary that, if used as safelyDeletedSearchIndex, would free at + * least {@code bytesToFree} bytes of WAL files from the oldest files of this WAL node. + * + * @param bytesToFree the minimum number of bytes to free + * @return the startSearchIndex of the WAL file just after the freed range, or {@link + * #DEFAULT_SAFELY_DELETED_SEARCH_INDEX} if no files need to be freed + */ + default long getSearchIndexToFreeAtLeast(long bytesToFree) { + // Default implementation: if any freeing is needed, allow deleting everything. + return bytesToFree > 0 ? Long.MAX_VALUE : DEFAULT_SAFELY_DELETED_SEARCH_INDEX; + } + + /** + * Set the minimum WAL file versionId that must be retained for subscription consumers. Files with + * versionId >= this value will not be deleted, regardless of their WALFileStatus. This protects + * Follower WAL files (CONTAINS_NONE_SEARCH_INDEX) from being deleted while subscriptions need + * them. + * + * @param minVersionId the minimum versionId to retain; Long.MAX_VALUE means no retention + */ + default void setSubscriptionRetainedMinVersionId(long minVersionId) { + // no-op by default + } + + /** + * Calculate the minimum WAL file versionId to retain such that freeing all files with versionId + * below that value would release at least {@code bytesToFree} bytes. + * + * @param bytesToFree the minimum number of bytes to free + * @return the versionId boundary; files with versionId < this can be freed + */ + default long getVersionIdToFreeAtLeast(long bytesToFree) { + return bytesToFree > 0 ? 
Long.MAX_VALUE : 0; + } } diff --git a/iotdb-core/consensus/src/main/java/org/apache/iotdb/consensus/iot/logdispatcher/LogDispatcher.java b/iotdb-core/consensus/src/main/java/org/apache/iotdb/consensus/iot/logdispatcher/LogDispatcher.java index 374691bf38bf1..e50b9e8b22a56 100644 --- a/iotdb-core/consensus/src/main/java/org/apache/iotdb/consensus/iot/logdispatcher/LogDispatcher.java +++ b/iotdb-core/consensus/src/main/java/org/apache/iotdb/consensus/iot/logdispatcher/LogDispatcher.java @@ -20,14 +20,17 @@ package org.apache.iotdb.consensus.iot.logdispatcher; import org.apache.iotdb.common.rpc.thrift.TEndPoint; +import org.apache.iotdb.common.rpc.thrift.TSStatus; import org.apache.iotdb.commons.client.IClientManager; import org.apache.iotdb.commons.concurrent.IoTDBThreadPoolFactory; import org.apache.iotdb.commons.concurrent.ThreadName; import org.apache.iotdb.commons.service.metric.MetricService; +import org.apache.iotdb.commons.subscription.config.SubscriptionConfig; import org.apache.iotdb.consensus.common.Peer; import org.apache.iotdb.consensus.common.request.IndexedConsensusRequest; import org.apache.iotdb.consensus.config.IoTConsensusConfig; import org.apache.iotdb.consensus.iot.IoTConsensusServerImpl; +import org.apache.iotdb.consensus.iot.WriterSafeFrontierTracker; import org.apache.iotdb.consensus.iot.client.AsyncIoTConsensusServiceClient; import org.apache.iotdb.consensus.iot.client.DispatchLogHandler; import org.apache.iotdb.consensus.iot.log.ConsensusReqReader; @@ -167,15 +170,16 @@ public synchronized OptionalLong getMinFlushedSyncIndex() { return threads.stream().mapToLong(LogDispatcherThread::getLastFlushedSyncIndex).min(); } - public void checkAndFlushIndex() { + public synchronized void checkAndFlushIndex() { if (!threads.isEmpty()) { threads.forEach( thread -> { IndexController controller = thread.getController(); controller.update(controller.getCurrentIndex(), true); }); - // do not set SafelyDeletedSearchIndex as it is Long.MAX_VALUE when replica 
is 1 - reader.setSafelyDeletedSearchIndex(impl.getMinFlushedSyncIndex()); + // Use subscription-aware safe-delete to avoid deleting WAL entries + // still needed by subscription consumers. + impl.checkAndUpdateSafeDeletedSearchIndex(); } } @@ -213,7 +217,7 @@ public long getLogEntriesFromQueue() { public class LogDispatcherThread implements Runnable { - private static final long PENDING_REQUEST_TAKING_TIME_OUT_IN_SEC = 10; + private static final long PENDING_REQUEST_TAKING_TIME_OUT_IN_MS = 10_000L; private static final long START_INDEX = 1; private final IoTConsensusConfig config; private final Peer peer; @@ -236,6 +240,7 @@ public class LogDispatcherThread implements Runnable { private final LogDispatcherThreadMetrics logDispatcherThreadMetrics; private final CountDownLatch runFinished = new CountDownLatch(1); + private volatile long lastIdleSafeHlcSentTimeMs = 0L; public LogDispatcherThread(Peer peer, IoTConsensusConfig config, long initialSyncIndex) { this.peer = peer; @@ -354,9 +359,10 @@ public void run() { while (!Thread.interrupted() && !stopped) { long startTime = System.nanoTime(); while ((batch = getBatch()).isEmpty()) { + maybeSendIdleSafeHlc(); // we may block here if there is no requests in the queue IndexedConsensusRequest request = - pendingEntries.poll(PENDING_REQUEST_TAKING_TIME_OUT_IN_SEC, TimeUnit.SECONDS); + pendingEntries.poll(calculateIdlePollTimeoutInMs(), TimeUnit.MILLISECONDS); if (request != null) { bufferedEntries.add(request); // If write pressure is low, we simply sleep a little to reduce the number of RPC @@ -364,6 +370,8 @@ public void run() { && bufferedEntries.isEmpty()) { Thread.sleep(config.getReplication().getMaxWaitingTimeForAccumulatingBatchInMs()); } + } else { + maybeSendIdleSafeHlc(); } // Immediately check for interrupts after poll and sleep if (Thread.interrupted() || stopped) { @@ -397,8 +405,9 @@ public void updateSafelyDeletedSearchIndex() { // indicating that insert nodes whose search index are before this value can be 
deleted // safely. // - // Use minFlushedSyncIndex here to reserve the WAL which are not flushed and support kill -9. - reader.setSafelyDeletedSearchIndex(impl.getMinFlushedSyncIndex()); + // Use subscription-aware safe-delete to avoid deleting WAL entries + // still needed by subscription consumers. + impl.checkAndUpdateSafeDeletedSearchIndex(); // notify if (impl.unblockWrite()) { impl.signal(); @@ -406,6 +415,7 @@ public void updateSafelyDeletedSearchIndex() { } public Batch getBatch() { + long startIndex = syncStatus.getNextSendingIndex(); long maxIndex; synchronized (impl.getIndexObject()) { @@ -504,6 +514,56 @@ public Batch getBatch() { return batches; } + private void maybeSendIdleSafeHlc() { + if (!shouldSendIdleSafeHlc()) { + return; + } + final long now = System.currentTimeMillis(); + if (now - lastIdleSafeHlcSentTimeMs + < SubscriptionConfig.getInstance().getSubscriptionConsensusIdleSafeHlcIntervalMs()) { + return; + } + final WriterSafeFrontierTracker.SafeHlc safeHlc = impl.createIdleSafeHlcForCurrentWriter(); + final TSStatus status = + impl.syncSafeHlcToPeer( + peer, + impl.getThisNode().getNodeId(), + impl.getCurrentWriterEpoch(), + safeHlc.getSafePhysicalTime(), + safeHlc.getBarrierLocalSeq()); + if (status.getCode() == org.apache.iotdb.rpc.TSStatusCode.SUCCESS_STATUS.getStatusCode()) { + lastIdleSafeHlcSentTimeMs = now; + } else { + logger.debug( + "{}: Failed to send idle safeHLC to {}. 
status={}", + impl.getThisNode().getGroupId(), + peer, + status); + } + } + + private long calculateIdlePollTimeoutInMs() { + if (!shouldSendIdleSafeHlc()) { + return PENDING_REQUEST_TAKING_TIME_OUT_IN_MS; + } + final long elapsedSinceLastIdleSafeHlcMs = + System.currentTimeMillis() - lastIdleSafeHlcSentTimeMs; + final long untilNextIdleSafeHlcMs = + Math.max( + 1L, + SubscriptionConfig.getInstance().getSubscriptionConsensusIdleSafeHlcIntervalMs() + - elapsedSinceLastIdleSafeHlcMs); + return Math.min(PENDING_REQUEST_TAKING_TIME_OUT_IN_MS, untilNextIdleSafeHlcMs); + } + + private boolean shouldSendIdleSafeHlc() { + return impl.hasSubscriptionConsumers() + && pendingEntries.isEmpty() + && bufferedEntries.isEmpty() + && !syncStatus.hasPendingBatches() + && syncStatus.getNextSendingIndex() > impl.getSearchIndex(); + } + public void sendBatchAsync(Batch batch, DispatchLogHandler handler) { try { AsyncIoTConsensusServiceClient client = clientManager.borrowClient(peer.getEndpoint()); @@ -565,9 +625,13 @@ private boolean constructBatchFromWAL(long currentIndex, long maxIndex, Batch lo targetIndex = data.getSearchIndex() + 1; data.buildSerializedRequests(); // construct request from wal - logBatches.addTLogEntry( + TLogEntry logEntry = new TLogEntry( - data.getSerializedRequests(), data.getSearchIndex(), true, data.getMemorySize())); + data.getSerializedRequests(), data.getSearchIndex(), true, data.getMemorySize()); + logEntry.setEpoch(data.getEpoch()); + logEntry.setPhysicalTime(data.getPhysicalTime()); + logEntry.setWriterEpoch(writerEpochToShort(data.getWriterEpoch())); + logBatches.addTLogEntry(logEntry); } // In the case of corrupt Data, we return true so that we can send a batch as soon as // possible, avoiding potential duplication @@ -576,12 +640,16 @@ private boolean constructBatchFromWAL(long currentIndex, long maxIndex, Batch lo private void constructBatchIndexedFromConsensusRequest( IndexedConsensusRequest request, Batch logBatches) { - logBatches.addTLogEntry( 
+ TLogEntry logEntry = new TLogEntry( request.getSerializedRequests(), request.getSearchIndex(), false, - request.getMemorySize())); + request.getMemorySize()); + logEntry.setEpoch(request.getEpoch()); + logEntry.setPhysicalTime(request.getPhysicalTime()); + logEntry.setWriterEpoch(writerEpochToShort(request.getWriterEpoch())); + logBatches.addTLogEntry(logEntry); } } @@ -592,4 +660,11 @@ public static AtomicLong getReceiverMemSizeSum() { public static AtomicLong getSenderMemSizeSum() { return senderMemSizeSum; } + + private static short writerEpochToShort(long writerEpoch) { + if (writerEpoch < Short.MIN_VALUE || writerEpoch > Short.MAX_VALUE) { + throw new IllegalArgumentException("writerEpoch exceeds short range: " + writerEpoch); + } + return (short) writerEpoch; + } } diff --git a/iotdb-core/consensus/src/main/java/org/apache/iotdb/consensus/iot/logdispatcher/SyncStatus.java b/iotdb-core/consensus/src/main/java/org/apache/iotdb/consensus/iot/logdispatcher/SyncStatus.java index accc9f7667d21..35304b82406c1 100644 --- a/iotdb-core/consensus/src/main/java/org/apache/iotdb/consensus/iot/logdispatcher/SyncStatus.java +++ b/iotdb-core/consensus/src/main/java/org/apache/iotdb/consensus/iot/logdispatcher/SyncStatus.java @@ -109,4 +109,8 @@ public synchronized long getNextSendingIndex() { public synchronized List getPendingBatches() { return pendingBatches; } + + public synchronized boolean hasPendingBatches() { + return !pendingBatches.isEmpty(); + } } diff --git a/iotdb-core/consensus/src/main/java/org/apache/iotdb/consensus/iot/service/IoTConsensusRPCServiceProcessor.java b/iotdb-core/consensus/src/main/java/org/apache/iotdb/consensus/iot/service/IoTConsensusRPCServiceProcessor.java index 71c14aebaa139..2075d8d871cba 100644 --- a/iotdb-core/consensus/src/main/java/org/apache/iotdb/consensus/iot/service/IoTConsensusRPCServiceProcessor.java +++ b/iotdb-core/consensus/src/main/java/org/apache/iotdb/consensus/iot/service/IoTConsensusRPCServiceProcessor.java @@ -48,6 
+48,8 @@ import org.apache.iotdb.consensus.iot.thrift.TSendSnapshotFragmentRes; import org.apache.iotdb.consensus.iot.thrift.TSyncLogEntriesReq; import org.apache.iotdb.consensus.iot.thrift.TSyncLogEntriesRes; +import org.apache.iotdb.consensus.iot.thrift.TSyncSafeHlcReq; +import org.apache.iotdb.consensus.iot.thrift.TSyncSafeHlcRes; import org.apache.iotdb.consensus.iot.thrift.TTriggerSnapshotLoadReq; import org.apache.iotdb.consensus.iot.thrift.TTriggerSnapshotLoadRes; import org.apache.iotdb.consensus.iot.thrift.TWaitReleaseAllRegionRelatedResourceReq; @@ -107,11 +109,19 @@ public TSyncLogEntriesRes syncLogEntries(TSyncLogEntriesReq req) { } BatchIndexedConsensusRequest logEntriesInThisBatch = new BatchIndexedConsensusRequest(req.peerId); + final int sourceNodeId = req.peerId; // We use synchronized to ensure atomicity of executing multiple logs for (TLogEntry entry : req.getLogEntries()) { + long epoch = entry.isSetEpoch() ? entry.getEpoch() : 0L; + long physicalTime = entry.isSetPhysicalTime() ? entry.getPhysicalTime() : 0L; + long writerEpoch = entry.isSetWriterEpoch() ? 
entry.getWriterEpoch() : 0L; logEntriesInThisBatch.add( impl.buildIndexedConsensusRequestForRemoteRequest( entry.getSearchIndex(), + epoch, + physicalTime, + sourceNodeId, + writerEpoch, entry.getData().stream() .map( entry.isFromWAL() @@ -133,6 +143,28 @@ public TSyncLogEntriesRes syncLogEntries(TSyncLogEntriesReq req) { .setReceiverMemSize(deserializedRequest.getMemorySize()); } + @Override + public TSyncSafeHlcRes syncSafeHlc(final TSyncSafeHlcReq req) { + final ConsensusGroupId groupId = + ConsensusGroupId.Factory.createFromTConsensusGroupId(req.getConsensusGroupId()); + final IoTConsensusServerImpl impl = consensus.getImpl(groupId); + if (impl == null) { + final String message = + String.format("unexpected consensusGroupId %s for TSyncSafeHlcReq", groupId); + LOGGER.error(message); + final TSStatus status = new TSStatus(TSStatusCode.INTERNAL_SERVER_ERROR.getStatusCode()); + status.setMessage(message); + return new TSyncSafeHlcRes().setStatus(status); + } + impl.observeRemoteSafeHlc( + req.getWriterNodeId(), + req.getWriterEpoch(), + req.getSafePhysicalTime(), + req.getBarrierLocalSeq()); + return new TSyncSafeHlcRes() + .setStatus(new TSStatus(TSStatusCode.SUCCESS_STATUS.getStatusCode())); + } + @Override public TInactivatePeerRes inactivatePeer(TInactivatePeerReq req) throws TException { if (req.isForDeletionPurpose()) { diff --git a/iotdb-core/consensus/src/test/java/org/apache/iotdb/consensus/iot/WriterSafeFrontierTrackerTest.java b/iotdb-core/consensus/src/test/java/org/apache/iotdb/consensus/iot/WriterSafeFrontierTrackerTest.java new file mode 100644 index 0000000000000..a368750cc7916 --- /dev/null +++ b/iotdb-core/consensus/src/test/java/org/apache/iotdb/consensus/iot/WriterSafeFrontierTrackerTest.java @@ -0,0 +1,62 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.iotdb.consensus.iot; + +import org.junit.Test; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNull; + +public class WriterSafeFrontierTrackerTest { + + @Test + public void testPendingSafeHlcPromotesWhenBarrierIsApplied() { + final WriterSafeFrontierTracker tracker = new WriterSafeFrontierTracker(); + + tracker.recordAppliedProgress(7, 2L, 100L, 10L); + assertEquals(100L, tracker.getEffectiveSafePt(7, 2L)); + + tracker.observePendingSafeHlc(7, 2L, 130L, 20L); + assertEquals(100L, tracker.getEffectiveSafePt(7, 2L)); + assertEquals(130L, tracker.getPendingSafeHlc(7, 2L).getSafePhysicalTime()); + + tracker.recordAppliedProgress(7, 2L, 125L, 19L); + assertEquals(125L, tracker.getEffectiveSafePt(7, 2L)); + + tracker.recordAppliedProgress(7, 2L, 126L, 20L); + assertEquals(130L, tracker.getEffectiveSafePt(7, 2L)); + assertNull(tracker.getPendingSafeHlc(7, 2L)); + } + + @Test + public void testSameWriterKeepsOnlyNewestPendingSafeHlc() { + final WriterSafeFrontierTracker tracker = new WriterSafeFrontierTracker(); + + tracker.observePendingSafeHlc(9, 3L, 200L, 30L); + tracker.observePendingSafeHlc(9, 3L, 220L, 35L); + + assertEquals(220L, tracker.getPendingSafeHlc(9, 3L).getSafePhysicalTime()); + assertEquals(35L, tracker.getPendingSafeHlc(9, 3L).getBarrierLocalSeq()); + + tracker.observePendingSafeHlc(9, 3L, 210L, 32L); + 
assertEquals(220L, tracker.getPendingSafeHlc(9, 3L).getSafePhysicalTime()); + assertEquals(35L, tracker.getPendingSafeHlc(9, 3L).getBarrierLocalSeq()); + } +} diff --git a/iotdb-core/consensus/src/test/java/org/apache/iotdb/consensus/iot/util/FakeConsensusReqReader.java b/iotdb-core/consensus/src/test/java/org/apache/iotdb/consensus/iot/util/FakeConsensusReqReader.java index 733df885e48fe..99d035b596bc1 100644 --- a/iotdb-core/consensus/src/test/java/org/apache/iotdb/consensus/iot/util/FakeConsensusReqReader.java +++ b/iotdb-core/consensus/src/test/java/org/apache/iotdb/consensus/iot/util/FakeConsensusReqReader.java @@ -57,6 +57,16 @@ public long getTotalSize() { return 0; } + @Override + public long getRegionDiskUsage() { + return 0; + } + + @Override + public long getSearchIndexToFreeAtLeast(long bytesToFree) { + return bytesToFree > 0 ? Long.MAX_VALUE : DEFAULT_SAFELY_DELETED_SEARCH_INDEX; + } + private class FakeConsensusReqIterator implements ConsensusReqReader.ReqIterator { private long nextSearchIndex; diff --git a/iotdb-core/consensus/src/test/java/org/apache/iotdb/consensus/iot/util/TestStateMachine.java b/iotdb-core/consensus/src/test/java/org/apache/iotdb/consensus/iot/util/TestStateMachine.java index a515010a3497a..9aa27d79ff645 100644 --- a/iotdb-core/consensus/src/test/java/org/apache/iotdb/consensus/iot/util/TestStateMachine.java +++ b/iotdb-core/consensus/src/test/java/org/apache/iotdb/consensus/iot/util/TestStateMachine.java @@ -97,11 +97,18 @@ public TSStatus write(IConsensusRequest request) { public IConsensusRequest deserializeRequest(IConsensusRequest request) { if (request instanceof BatchIndexedConsensusRequest) { BatchIndexedConsensusRequest consensusRequest = (BatchIndexedConsensusRequest) request; + final IndexedConsensusRequest lastIndexedRequest = + consensusRequest.getRequests().isEmpty() + ? 
null + : consensusRequest.getRequests().get(consensusRequest.getRequests().size() - 1); DeserializedBatchIndexedConsensusRequest result = new DeserializedBatchIndexedConsensusRequest( consensusRequest.getStartSyncIndex(), consensusRequest.getEndSyncIndex(), - consensusRequest.getRequests().size()); + consensusRequest.getRequests().size(), + consensusRequest.getSourcePeerId(), + lastIndexedRequest != null ? lastIndexedRequest.getWriterEpoch() : 0L, + lastIndexedRequest != null ? lastIndexedRequest.getPhysicalTime() : 0L); for (IndexedConsensusRequest r : consensusRequest.getRequests()) { result.add(r); } diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/consensus/DataRegionConsensusImpl.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/consensus/DataRegionConsensusImpl.java index 8b3eb5ffd2fe4..c141c52867cfd 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/consensus/DataRegionConsensusImpl.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/consensus/DataRegionConsensusImpl.java @@ -158,6 +158,8 @@ private static ConsensusConfig buildConsensusConfig() { .setMaxMemoryRatioForQueue(CONF.getMaxMemoryRatioForQueue()) .setRegionMigrationSpeedLimitBytesPerSecond( CONF.getRegionMigrationSpeedLimitBytesPerSecond()) + .setSubscriptionWalRetentionSizeInBytes( + COMMON_CONF.getSubscriptionConsensusWalRetentionSizeInBytes()) .build()) .build()) .setIoTConsensusV2Config( diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/consensus/statemachine/dataregion/DataExecutionVisitor.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/consensus/statemachine/dataregion/DataExecutionVisitor.java index e0184b8595d23..f97c0194425f6 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/consensus/statemachine/dataregion/DataExecutionVisitor.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/consensus/statemachine/dataregion/DataExecutionVisitor.java @@ -75,7 +75,7 @@ public TSStatus 
visitRelationalInsertRows(RelationalInsertRowsNode node, DataReg public TSStatus visitInsertRow(InsertRowNode node, DataRegion dataRegion) { try { dataRegion.insert(node); - dataRegion.insertSeparatorToWAL(); + dataRegion.insertSeparatorToWAL(node); return StatusUtils.OK; } catch (OutOfTTLException e) { LOGGER.warn("Error in executing plan node: {}, caused by {}", node, e.getMessage()); @@ -99,7 +99,7 @@ public TSStatus visitRelationalInsertTablet( public TSStatus visitInsertTablet(final InsertTabletNode node, final DataRegion dataRegion) { try { dataRegion.insertTablet(node); - dataRegion.insertSeparatorToWAL(); + dataRegion.insertSeparatorToWAL(node); return StatusUtils.OK; } catch (final OutOfTTLException e) { LOGGER.debug("Error in executing plan node: {}, caused by {}", node, e.getMessage()); @@ -136,7 +136,7 @@ public TSStatus visitInsertTablet(final InsertTabletNode node, final DataRegion public TSStatus visitInsertRows(InsertRowsNode node, DataRegion dataRegion) { try { dataRegion.insert(node); - dataRegion.insertSeparatorToWAL(); + dataRegion.insertSeparatorToWAL(node); return StatusUtils.OK; } catch (WriteProcessRejectException e) { LOGGER.warn("Reject in executing plan node: {}, caused by {}", node, e.getMessage()); @@ -173,7 +173,7 @@ public TSStatus visitInsertRows(InsertRowsNode node, DataRegion dataRegion) { public TSStatus visitInsertMultiTablets(InsertMultiTabletsNode node, DataRegion dataRegion) { try { dataRegion.insertTablets(node); - dataRegion.insertSeparatorToWAL(); + dataRegion.insertSeparatorToWAL(node); return StatusUtils.OK; } catch (WriteProcessRejectException e) { LOGGER.warn("Reject in executing plan node: {}, caused by {}", node, e.getMessage()); @@ -208,7 +208,7 @@ public TSStatus visitInsertRowsOfOneDevice( InsertRowsOfOneDeviceNode node, DataRegion dataRegion) { try { dataRegion.insert(node); - dataRegion.insertSeparatorToWAL(); + dataRegion.insertSeparatorToWAL(node); return StatusUtils.OK; } catch (WriteProcessRejectException e) 
{ LOGGER.warn("Reject in executing plan node: {}, caused by {}", node, e.getMessage()); @@ -264,7 +264,7 @@ public TSStatus visitDeleteData(DeleteDataNode node, DataRegion dataRegion) { dataRegion.deleteByDevice(path, node); } } - dataRegion.insertSeparatorToWAL(); + dataRegion.insertSeparatorToWAL(node); return StatusUtils.OK; } catch (IOException | IllegalPathException e) { LOGGER.error("Error in executing plan node: {}", node, e); @@ -279,7 +279,7 @@ public TSStatus visitDeleteData( final RelationalDeleteDataNode node, final DataRegion dataRegion) { try { dataRegion.deleteByTable(node); - dataRegion.insertSeparatorToWAL(); + dataRegion.insertSeparatorToWAL(node); return StatusUtils.OK; } catch (final IOException e) { LOGGER.error("Error in executing plan node: {}", node, e); diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/consensus/statemachine/dataregion/DataRegionStateMachine.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/consensus/statemachine/dataregion/DataRegionStateMachine.java index 5fa375406b896..edafc3d597b5f 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/consensus/statemachine/dataregion/DataRegionStateMachine.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/consensus/statemachine/dataregion/DataRegionStateMachine.java @@ -154,6 +154,10 @@ protected PlanNode grabPlanNode(IndexedConsensusRequest indexedRequest) { PlanNode planNode = getPlanNode(req); if (planNode instanceof SearchNode) { ((SearchNode) planNode).setSearchIndex(indexedRequest.getSearchIndex()); + ((SearchNode) planNode).setPhysicalTime(indexedRequest.getPhysicalTime()); + ((SearchNode) planNode).setNodeId(indexedRequest.getNodeId()); + ((SearchNode) planNode).setWriterEpoch(indexedRequest.getWriterEpoch()); + ((SearchNode) planNode).setSyncIndex(indexedRequest.getSyncIndex()); searchNodes.add((SearchNode) planNode); } else { logger.warn("Unexpected PlanNode type {}, which is not SearchNode", planNode.getClass()); diff 
--git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/consensus/statemachine/dataregion/IoTConsensusDataRegionStateMachine.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/consensus/statemachine/dataregion/IoTConsensusDataRegionStateMachine.java index 240c1b1caa0fe..a835335aa81b2 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/consensus/statemachine/dataregion/IoTConsensusDataRegionStateMachine.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/consensus/statemachine/dataregion/IoTConsensusDataRegionStateMachine.java @@ -82,11 +82,18 @@ public IConsensusRequest deserializeRequest(IConsensusRequest request) { result = grabPlanNode(indexedRequest); } else if (request instanceof BatchIndexedConsensusRequest) { BatchIndexedConsensusRequest batchRequest = (BatchIndexedConsensusRequest) request; + final IndexedConsensusRequest lastIndexedRequest = + batchRequest.getRequests().isEmpty() + ? null + : batchRequest.getRequests().get(batchRequest.getRequests().size() - 1); DeserializedBatchIndexedConsensusRequest deserializedRequest = new DeserializedBatchIndexedConsensusRequest( batchRequest.getStartSyncIndex(), batchRequest.getEndSyncIndex(), - batchRequest.getRequests().size()); + batchRequest.getRequests().size(), + batchRequest.getSourcePeerId(), + lastIndexedRequest != null ? lastIndexedRequest.getWriterEpoch() : 0L, + lastIndexedRequest != null ? 
lastIndexedRequest.getPhysicalTime() : 0L); for (IndexedConsensusRequest indexedRequest : batchRequest.getRequests()) { final PlanNode planNode = grabPlanNode(indexedRequest); if (planNode instanceof ComparableConsensusRequest) { diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/pipe/agent/task/execution/PipeSubtaskExecutorManager.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/pipe/agent/task/execution/PipeSubtaskExecutorManager.java index 45f86a4706c0e..f83c23871f516 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/pipe/agent/task/execution/PipeSubtaskExecutorManager.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/pipe/agent/task/execution/PipeSubtaskExecutorManager.java @@ -33,7 +33,7 @@ public class PipeSubtaskExecutorManager { private final PipeProcessorSubtaskExecutor processorExecutor; private final Supplier connectorExecutorSupplier; - private final SubscriptionSubtaskExecutor subscriptionExecutor; + private volatile SubscriptionSubtaskExecutor subscriptionExecutor; public PipeProcessorSubtaskExecutor getProcessorExecutor() { return processorExecutor; @@ -49,6 +49,7 @@ public IoTConsensusV2SubtaskExecutor getConsensusExecutor() { } public SubscriptionSubtaskExecutor getSubscriptionExecutor() { + ensureSubscriptionExecutors(); return subscriptionExecutor; } @@ -57,15 +58,28 @@ public SubscriptionSubtaskExecutor getSubscriptionExecutor() { private PipeSubtaskExecutorManager() { processorExecutor = new PipeProcessorSubtaskExecutor(); connectorExecutorSupplier = PipeSinkSubtaskExecutor::new; - subscriptionExecutor = - SubscriptionConfig.getInstance().getSubscriptionEnabled() - ? new SubscriptionSubtaskExecutor() - : null; + ensureSubscriptionExecutors(); // IoTV2 uses global singleton executor pool. 
IoTV2GlobalComponentContainer.getInstance() .setConsensusExecutor(new IoTConsensusV2SubtaskExecutor()); } + public synchronized void ensureSubscriptionExecutors() { + if (!SubscriptionConfig.getInstance().getSubscriptionEnabled()) { + return; + } + if (subscriptionExecutor == null || subscriptionExecutor.isShutdown()) { + subscriptionExecutor = new SubscriptionSubtaskExecutor(); + } + } + + public synchronized void shutdownSubscriptionExecutors() { + if (subscriptionExecutor != null) { + subscriptionExecutor.shutdown(); + subscriptionExecutor = null; + } + } + private static class PipeTaskExecutorHolder { private static PipeSubtaskExecutorManager instance = null; } diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/protocol/client/ConfigNodeClient.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/protocol/client/ConfigNodeClient.java index e2c04caedfb20..e0dce94b1dda7 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/protocol/client/ConfigNodeClient.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/protocol/client/ConfigNodeClient.java @@ -117,6 +117,8 @@ import org.apache.iotdb.confignode.rpc.thrift.TGetAllTemplatesResp; import org.apache.iotdb.confignode.rpc.thrift.TGetAllTopicInfoResp; import org.apache.iotdb.confignode.rpc.thrift.TGetClusterIdResp; +import org.apache.iotdb.confignode.rpc.thrift.TGetCommitProgressReq; +import org.apache.iotdb.confignode.rpc.thrift.TGetCommitProgressResp; import org.apache.iotdb.confignode.rpc.thrift.TGetDataNodeLocationsResp; import org.apache.iotdb.confignode.rpc.thrift.TGetDatabaseReq; import org.apache.iotdb.confignode.rpc.thrift.TGetJarInListReq; @@ -1265,6 +1267,12 @@ public TGetAllSubscriptionInfoResp getAllSubscriptionInfo() throws TException { () -> client.getAllSubscriptionInfo(), resp -> !updateConfigNodeLeader(resp.status)); } + @Override + public TGetCommitProgressResp getCommitProgress(TGetCommitProgressReq req) throws TException { + return 
executeRemoteCallWithRetry( + () -> client.getCommitProgress(req), resp -> !updateConfigNodeLeader(resp.status)); + } + @Override public TPipeConfigTransferResp handleTransferConfigPlan(TPipeConfigTransferReq req) throws TException { diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/protocol/thrift/impl/DataNodeInternalRPCServiceImpl.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/protocol/thrift/impl/DataNodeInternalRPCServiceImpl.java index f5cdb47876a3d..8b99d7055a4c2 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/protocol/thrift/impl/DataNodeInternalRPCServiceImpl.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/protocol/thrift/impl/DataNodeInternalRPCServiceImpl.java @@ -211,6 +211,8 @@ import org.apache.iotdb.db.storageengine.rescon.quotas.DataNodeSpaceQuotaManager; import org.apache.iotdb.db.storageengine.rescon.quotas.DataNodeThrottleQuotaManager; import org.apache.iotdb.db.subscription.agent.SubscriptionAgent; +import org.apache.iotdb.db.subscription.broker.consensus.ConsensusRegionRuntimeState; +import org.apache.iotdb.db.subscription.broker.consensus.ConsensusSubscriptionSetupHandler; import org.apache.iotdb.db.trigger.executor.TriggerExecutor; import org.apache.iotdb.db.trigger.executor.TriggerFireResult; import org.apache.iotdb.db.trigger.service.TriggerManagementService; @@ -283,6 +285,8 @@ import org.apache.iotdb.mpp.rpc.thrift.TMaintainPeerReq; import org.apache.iotdb.mpp.rpc.thrift.TNotifyRegionMigrationReq; import org.apache.iotdb.mpp.rpc.thrift.TPipeHeartbeatReq; +import org.apache.iotdb.mpp.rpc.thrift.TPullCommitProgressReq; +import org.apache.iotdb.mpp.rpc.thrift.TPullCommitProgressResp; import org.apache.iotdb.mpp.rpc.thrift.TPushConsumerGroupMetaReq; import org.apache.iotdb.mpp.rpc.thrift.TPushConsumerGroupMetaResp; import org.apache.iotdb.mpp.rpc.thrift.TPushConsumerGroupMetaRespExceptionMessage; @@ -294,6 +298,7 @@ import 
org.apache.iotdb.mpp.rpc.thrift.TPushSingleConsumerGroupMetaReq; import org.apache.iotdb.mpp.rpc.thrift.TPushSinglePipeMetaReq; import org.apache.iotdb.mpp.rpc.thrift.TPushSingleTopicMetaReq; +import org.apache.iotdb.mpp.rpc.thrift.TPushSubscriptionRuntimeReq; import org.apache.iotdb.mpp.rpc.thrift.TPushTopicMetaReq; import org.apache.iotdb.mpp.rpc.thrift.TPushTopicMetaResp; import org.apache.iotdb.mpp.rpc.thrift.TPushTopicMetaRespExceptionMessage; @@ -312,6 +317,8 @@ import org.apache.iotdb.mpp.rpc.thrift.TSendFragmentInstanceReq; import org.apache.iotdb.mpp.rpc.thrift.TSendFragmentInstanceResp; import org.apache.iotdb.mpp.rpc.thrift.TSendSinglePlanNodeResp; +import org.apache.iotdb.mpp.rpc.thrift.TSubscriptionRuntimeStateEntry; +import org.apache.iotdb.mpp.rpc.thrift.TSyncSubscriptionProgressReq; import org.apache.iotdb.mpp.rpc.thrift.TTableDeviceDeletionWithPatternAndFilterReq; import org.apache.iotdb.mpp.rpc.thrift.TTableDeviceDeletionWithPatternOrModReq; import org.apache.iotdb.mpp.rpc.thrift.TTableDeviceInvalidateCacheReq; @@ -357,6 +364,7 @@ import java.util.Collections; import java.util.HashMap; import java.util.HashSet; +import java.util.LinkedHashSet; import java.util.List; import java.util.Map; import java.util.Objects; @@ -1551,6 +1559,132 @@ public TPushConsumerGroupMetaResp pushSingleConsumerGroupMeta( } } + @Override + public TPullCommitProgressResp pullCommitProgress(TPullCommitProgressReq req) { + try { + final int dataNodeId = IoTDBDescriptor.getInstance().getConfig().getDataNodeId(); + final Map regionProgress = + SubscriptionAgent.broker().collectAllRegionCommitProgress(dataNodeId); + logSuspiciousRegionProgressPayloads(regionProgress); + return new TPullCommitProgressResp(new TSStatus(TSStatusCode.SUCCESS_STATUS.getStatusCode())) + .setCommitRegionProgress(regionProgress); + } catch (Exception e) { + LOGGER.warn("Error occurred when pulling commit progress", e); + return new TPullCommitProgressResp( + new 
TSStatus(TSStatusCode.EXECUTE_STATEMENT_ERROR.getStatusCode())); + } + } + + @Override + public TSStatus syncSubscriptionProgress(TSyncSubscriptionProgressReq req) { + try { + SubscriptionAgent.broker() + .receiveSubscriptionProgress( + req.getConsumerGroupId(), + req.getTopicName(), + req.getRegionId(), + req.getEpoch(), + req.getSyncIndex(), + req.isSetWriterNodeId() ? req.getWriterNodeId() : -1, + req.isSetWriterEpoch() ? req.getWriterEpoch() : 0L); + return new TSStatus(TSStatusCode.SUCCESS_STATUS.getStatusCode()); + } catch (Exception e) { + LOGGER.warn("Error occurred when receiving subscription progress broadcast", e); + return new TSStatus(TSStatusCode.EXECUTE_STATEMENT_ERROR.getStatusCode()); + } + } + + private static void logSuspiciousRegionProgressPayloads( + final Map regionProgress) { + if (Objects.isNull(regionProgress) || regionProgress.isEmpty()) { + return; + } + for (final Map.Entry entry : regionProgress.entrySet()) { + if (isSuspiciousRegionProgressPayload(entry.getValue())) { + LOGGER.warn( + "PULL_COMMIT_PROGRESS datanode send suspicious payload, key={}, summary={}", + entry.getKey(), + summarizeRegionProgressPayload(entry.getValue())); + } + } + } + + private static boolean isSuspiciousRegionProgressPayload(final ByteBuffer buffer) { + if (Objects.isNull(buffer)) { + return true; + } + final ByteBuffer duplicate = buffer.slice(); + if (duplicate.remaining() < Integer.BYTES) { + return true; + } + final int firstInt = duplicate.getInt(); + return firstInt < 0 || firstInt > 1_000_000; + } + + private static String summarizeRegionProgressPayload(final ByteBuffer buffer) { + if (Objects.isNull(buffer)) { + return "null"; + } + final int position = buffer.position(); + final int limit = buffer.limit(); + final int capacity = buffer.capacity(); + final ByteBuffer duplicate = buffer.slice(); + final int remaining = duplicate.remaining(); + final String firstIntSummary; + if (remaining >= Integer.BYTES) { + final int firstInt = duplicate.getInt(); + 
firstIntSummary = firstInt + "(0x" + String.format("%08x", firstInt) + ")"; + duplicate.position(0); + } else { + firstIntSummary = "n/a"; + } + final int sampleLength = Math.min(16, remaining); + final byte[] sample = new byte[sampleLength]; + duplicate.get(sample, 0, sampleLength); + return "pos=" + + position + + ", limit=" + + limit + + ", capacity=" + + capacity + + ", remaining=" + + remaining + + ", firstInt=" + + firstIntSummary + + ", firstBytes=" + + bytesToHex(sample); + } + + private static String bytesToHex(final byte[] bytes) { + if (Objects.isNull(bytes) || bytes.length == 0) { + return ""; + } + final StringBuilder builder = new StringBuilder(bytes.length * 2); + for (final byte b : bytes) { + builder.append(String.format("%02x", b)); + } + return builder.toString(); + } + + @Override + public TSStatus pushSubscriptionRuntime(TPushSubscriptionRuntimeReq req) { + try { + for (final TSubscriptionRuntimeStateEntry runtimeStateEntry : req.getRuntimeStates()) { + ConsensusSubscriptionSetupHandler.applyRuntimeState( + runtimeStateEntry.getRegionId(), + new ConsensusRegionRuntimeState( + runtimeStateEntry.getRuntimeVersion(), + runtimeStateEntry.getPreferredWriterNodeId(), + runtimeStateEntry.isActive(), + new LinkedHashSet<>(runtimeStateEntry.getActiveWriterNodeIds()))); + } + return new TSStatus(TSStatusCode.SUCCESS_STATUS.getStatusCode()); + } catch (Exception e) { + LOGGER.warn("Error occurred when pushing subscription runtime state", e); + return new TSStatus(TSStatusCode.EXECUTE_STATEMENT_ERROR.getStatusCode()); + } + } + @Override public TPipeHeartbeatResp pipeHeartbeat(TPipeHeartbeatReq req) throws TException { final TPipeHeartbeatResp resp = new TPipeHeartbeatResp(new ArrayList<>()); @@ -2239,6 +2373,13 @@ public TDataNodeHeartbeatResp getDataNodeHeartBeat(TDataNodeHeartbeatReq req) th public TSStatus updateRegionCache(TRegionRouteReq req) { boolean result = ClusterPartitionFetcher.getInstance().updateRegionCache(req); if (result) { + // Notify 
consensus subscription queues of any preferred-writer changes + try { + ConsensusSubscriptionSetupHandler.onRegionRouteChanged( + req.getRegionRouteMap(), req.getTimestamp()); + } catch (final Exception e) { + LOGGER.warn("Failed to process epoch ordering on region route change", e); + } return RpcUtils.getStatus(TSStatusCode.SUCCESS_STATUS); } else { return RpcUtils.getStatus(TSStatusCode.PARTITION_CACHE_UPDATE_ERROR); diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/queryengine/plan/planner/plan/node/pipe/PipeEnrichedDeleteDataNode.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/queryengine/plan/planner/plan/node/pipe/PipeEnrichedDeleteDataNode.java index 11d70e0daa755..eb668a206a1b3 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/queryengine/plan/planner/plan/node/pipe/PipeEnrichedDeleteDataNode.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/queryengine/plan/planner/plan/node/pipe/PipeEnrichedDeleteDataNode.java @@ -96,6 +96,72 @@ public void setProgressIndex(ProgressIndex progressIndex) { deleteDataNode.setProgressIndex(progressIndex); } + @Override + public SearchNode setSearchIndex(final long searchIndex) { + deleteDataNode.setSearchIndex(searchIndex); + return this; + } + + @Override + public long getSearchIndex() { + return deleteDataNode.getSearchIndex(); + } + + @Override + public long getRoutingEpoch() { + return deleteDataNode.getRoutingEpoch(); + } + + @Override + public SearchNode setRoutingEpoch(final long routingEpoch) { + deleteDataNode.setRoutingEpoch(routingEpoch); + return this; + } + + @Override + public long getPhysicalTime() { + return deleteDataNode.getPhysicalTime(); + } + + @Override + public SearchNode setPhysicalTime(final long physicalTime) { + deleteDataNode.setPhysicalTime(physicalTime); + return this; + } + + @Override + public int getNodeId() { + return deleteDataNode.getNodeId(); + } + + @Override + public SearchNode setNodeId(final int nodeId) { + 
deleteDataNode.setNodeId(nodeId); + return this; + } + + @Override + public long getWriterEpoch() { + return deleteDataNode.getWriterEpoch(); + } + + @Override + public SearchNode setWriterEpoch(final long writerEpoch) { + deleteDataNode.setWriterEpoch(writerEpoch); + return this; + } + + @Override + public long getSyncIndex() { + return deleteDataNode.getSyncIndex(); + } + + @Override + public SearchNode setSyncIndex(final long syncIndex) { + deleteDataNode.setSyncIndex(syncIndex); + return this; + } + @Override public List getChildren() { return deleteDataNode.getChildren(); diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/queryengine/plan/planner/plan/node/pipe/PipeEnrichedInsertNode.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/queryengine/plan/planner/plan/node/pipe/PipeEnrichedInsertNode.java index 2e517700217b7..f8c7ee9a17415 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/queryengine/plan/planner/plan/node/pipe/PipeEnrichedInsertNode.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/queryengine/plan/planner/plan/node/pipe/PipeEnrichedInsertNode.java @@ -233,6 +233,61 @@ public SearchNode setSearchIndex(final long searchIndex) { return this; } + @Override + public long getRoutingEpoch() { + return insertNode.getRoutingEpoch(); + } + + @Override + public SearchNode setRoutingEpoch(final long routingEpoch) { + insertNode.setRoutingEpoch(routingEpoch); + return this; + } + + @Override + public long getPhysicalTime() { + return insertNode.getPhysicalTime(); + } + + @Override + public SearchNode setPhysicalTime(final long physicalTime) { + insertNode.setPhysicalTime(physicalTime); + return this; + } + + @Override + public int getNodeId() { + return insertNode.getNodeId(); + } + + @Override + public SearchNode setNodeId(final int nodeId) { + insertNode.setNodeId(nodeId); + return this; + } + + @Override + public long getWriterEpoch() { + return insertNode.getWriterEpoch(); + } + + @Override + 
public SearchNode setWriterEpoch(final long writerEpoch) { + insertNode.setWriterEpoch(writerEpoch); + return this; + } + + @Override + public long getSyncIndex() { + return insertNode.getSyncIndex(); + } + + @Override + public SearchNode setSyncIndex(final long syncIndex) { + insertNode.setSyncIndex(syncIndex); + return this; + } + @Override protected void serializeAttributes(final ByteBuffer byteBuffer) { PlanNodeType.PIPE_ENRICHED_INSERT_DATA.serialize(byteBuffer); diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/queryengine/plan/planner/plan/node/write/DeleteDataNode.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/queryengine/plan/planner/plan/node/write/DeleteDataNode.java index cfba72d66db62..7c0bc25dfaa55 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/queryengine/plan/planner/plan/node/write/DeleteDataNode.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/queryengine/plan/planner/plan/node/write/DeleteDataNode.java @@ -398,6 +398,10 @@ public SearchNode merge(List searchNodes) { pathList, firstOne.getDeleteStartTime(), firstOne.getDeleteEndTime()) - .setSearchIndex(firstOne.searchIndex); + .setSearchIndex(firstOne.searchIndex) + .setPhysicalTime(firstOne.getPhysicalTime()) + .setNodeId(firstOne.getNodeId()) + .setWriterEpoch(firstOne.getWriterEpoch()) + .setSyncIndex(firstOne.getSyncIndex()); } } diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/queryengine/plan/planner/plan/node/write/InsertMultiTabletsNode.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/queryengine/plan/planner/plan/node/write/InsertMultiTabletsNode.java index b41d178b396c6..bf842e862b447 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/queryengine/plan/planner/plan/node/write/InsertMultiTabletsNode.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/queryengine/plan/planner/plan/node/write/InsertMultiTabletsNode.java @@ -142,6 +142,34 @@ public SearchNode 
setSearchIndex(long index) { return this; } + @Override + public SearchNode setPhysicalTime(long physicalTime) { + this.physicalTime = physicalTime; + insertTabletNodeList.forEach(plan -> plan.setPhysicalTime(physicalTime)); + return this; + } + + @Override + public SearchNode setNodeId(int nodeId) { + this.nodeId = nodeId; + insertTabletNodeList.forEach(plan -> plan.setNodeId(nodeId)); + return this; + } + + @Override + public SearchNode setWriterEpoch(long writerEpoch) { + this.writerEpoch = writerEpoch; + insertTabletNodeList.forEach(plan -> plan.setWriterEpoch(writerEpoch)); + return this; + } + + @Override + public SearchNode setSyncIndex(long syncIndex) { + this.syncIndex = syncIndex; + insertTabletNodeList.forEach(plan -> plan.setSyncIndex(syncIndex)); + return this; + } + @Override public List splitByPartition(IAnalysis analysis) { Map splitMap = new HashMap<>(); @@ -156,6 +184,10 @@ public List splitByPartition(IAnalysis analysis) { } else { tmpNode = new InsertMultiTabletsNode(this.getPlanNodeId()); tmpNode.setDataRegionReplicaSet(dataRegionReplicaSet); + tmpNode.setPhysicalTime(getPhysicalTime()); + tmpNode.setNodeId(getNodeId()); + tmpNode.setWriterEpoch(getWriterEpoch()); + tmpNode.setSyncIndex(getSyncIndex()); tmpNode.addInsertTabletNode((InsertTabletNode) subNode, i); splitMap.put(dataRegionReplicaSet, tmpNode); } diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/queryengine/plan/planner/plan/node/write/InsertNode.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/queryengine/plan/planner/plan/node/write/InsertNode.java index 88a6faa004745..9aac99f485cef 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/queryengine/plan/planner/plan/node/write/InsertNode.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/queryengine/plan/planner/plan/node/write/InsertNode.java @@ -110,6 +110,10 @@ public final SearchNode merge(List searchNodes) { .collect(Collectors.toList()); InsertNode result = 
mergeInsertNode(insertNodes); result.setSearchIndex(insertNodes.get(0).getSearchIndex()); + result.setPhysicalTime(insertNodes.get(0).getPhysicalTime()); + result.setNodeId(insertNodes.get(0).getNodeId()); + result.setWriterEpoch(insertNodes.get(0).getWriterEpoch()); + result.setSyncIndex(insertNodes.get(0).getSyncIndex()); result.setTargetPath(insertNodes.get(0).getTargetPath()); return result; } diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/queryengine/plan/planner/plan/node/write/InsertRowsNode.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/queryengine/plan/planner/plan/node/write/InsertRowsNode.java index 7392b7612705e..7a22085285cc5 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/queryengine/plan/planner/plan/node/write/InsertRowsNode.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/queryengine/plan/planner/plan/node/write/InsertRowsNode.java @@ -136,6 +136,34 @@ public SearchNode setSearchIndex(long index) { return this; } + @Override + public SearchNode setPhysicalTime(long physicalTime) { + this.physicalTime = physicalTime; + insertRowNodeList.forEach(plan -> plan.setPhysicalTime(physicalTime)); + return this; + } + + @Override + public SearchNode setNodeId(int nodeId) { + this.nodeId = nodeId; + insertRowNodeList.forEach(plan -> plan.setNodeId(nodeId)); + return this; + } + + @Override + public SearchNode setWriterEpoch(long writerEpoch) { + this.writerEpoch = writerEpoch; + insertRowNodeList.forEach(plan -> plan.setWriterEpoch(writerEpoch)); + return this; + } + + @Override + public SearchNode setSyncIndex(long syncIndex) { + this.syncIndex = syncIndex; + insertRowNodeList.forEach(plan -> plan.setSyncIndex(syncIndex)); + return this; + } + public Map getResults() { return results; } @@ -287,6 +315,10 @@ public List splitByPartition(IAnalysis analysis) { } else { tmpNode = new InsertRowsNode(this.getPlanNodeId()); tmpNode.setDataRegionReplicaSet(dataRegionReplicaSet); + 
tmpNode.setPhysicalTime(getPhysicalTime()); + tmpNode.setNodeId(getNodeId()); + tmpNode.setWriterEpoch(getWriterEpoch()); + tmpNode.setSyncIndex(getSyncIndex()); tmpNode.addOneInsertRowNode(insertRowNode, i); splitMap.put(dataRegionReplicaSet, tmpNode); } diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/queryengine/plan/planner/plan/node/write/InsertRowsOfOneDeviceNode.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/queryengine/plan/planner/plan/node/write/InsertRowsOfOneDeviceNode.java index f1e28d32b104d..d3b9329bf756b 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/queryengine/plan/planner/plan/node/write/InsertRowsOfOneDeviceNode.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/queryengine/plan/planner/plan/node/write/InsertRowsOfOneDeviceNode.java @@ -106,6 +106,34 @@ public SearchNode setSearchIndex(long index) { return this; } + @Override + public SearchNode setPhysicalTime(long physicalTime) { + this.physicalTime = physicalTime; + insertRowNodeList.forEach(plan -> plan.setPhysicalTime(physicalTime)); + return this; + } + + @Override + public SearchNode setNodeId(int nodeId) { + this.nodeId = nodeId; + insertRowNodeList.forEach(plan -> plan.setNodeId(nodeId)); + return this; + } + + @Override + public SearchNode setWriterEpoch(long writerEpoch) { + this.writerEpoch = writerEpoch; + insertRowNodeList.forEach(plan -> plan.setWriterEpoch(writerEpoch)); + return this; + } + + @Override + public SearchNode setSyncIndex(long syncIndex) { + this.syncIndex = syncIndex; + insertRowNodeList.forEach(plan -> plan.setSyncIndex(syncIndex)); + return this; + } + public TSStatus[] getFailingStatus() { return StatusUtils.getFailingStatus(results, insertRowNodeList.size()); } diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/queryengine/plan/planner/plan/node/write/RelationalDeleteDataNode.java 
b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/queryengine/plan/planner/plan/node/write/RelationalDeleteDataNode.java index 632d7c9ee1e0a..78117076ba5de 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/queryengine/plan/planner/plan/node/write/RelationalDeleteDataNode.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/queryengine/plan/planner/plan/node/write/RelationalDeleteDataNode.java @@ -330,6 +330,10 @@ public SearchNode merge(List searchNodes) { .flatMap(Collection::stream) .collect(Collectors.toList()); return new RelationalDeleteDataNode(this.getPlanNodeId(), allTableDeletionEntries, databaseName) - .setSearchIndex(getSearchIndex()); + .setSearchIndex(getSearchIndex()) + .setPhysicalTime(getPhysicalTime()) + .setNodeId(getNodeId()) + .setWriterEpoch(getWriterEpoch()) + .setSyncIndex(getSyncIndex()); } } diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/queryengine/plan/planner/plan/node/write/RelationalInsertRowsNode.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/queryengine/plan/planner/plan/node/write/RelationalInsertRowsNode.java index 594ccf50471f9..c8bcf04808ff7 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/queryengine/plan/planner/plan/node/write/RelationalInsertRowsNode.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/queryengine/plan/planner/plan/node/write/RelationalInsertRowsNode.java @@ -184,6 +184,10 @@ public List splitByPartition(IAnalysis analysis) { } else { tmpNode = new RelationalInsertRowsNode(this.getPlanNodeId()); tmpNode.setDataRegionReplicaSet(dataRegionReplicaSet); + tmpNode.setPhysicalTime(getPhysicalTime()); + tmpNode.setNodeId(getNodeId()); + tmpNode.setWriterEpoch(getWriterEpoch()); + tmpNode.setSyncIndex(getSyncIndex()); tmpNode.addOneInsertRowNode(insertRowNode, i); splitMap.put(dataRegionReplicaSet, tmpNode); } diff --git 
a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/queryengine/plan/planner/plan/node/write/SearchNode.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/queryengine/plan/planner/plan/node/write/SearchNode.java index d506d1414e15e..7c0a9fec2bfe5 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/queryengine/plan/planner/plan/node/write/SearchNode.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/queryengine/plan/planner/plan/node/write/SearchNode.java @@ -23,11 +23,17 @@ import org.apache.iotdb.consensus.iot.log.ConsensusReqReader; import org.apache.iotdb.db.queryengine.plan.planner.plan.node.PlanNodeId; import org.apache.iotdb.db.queryengine.plan.planner.plan.node.WritePlanNode; +import org.apache.iotdb.db.storageengine.dataregion.wal.buffer.IWALByteBufferView; +import java.io.DataInputStream; +import java.io.IOException; +import java.nio.ByteBuffer; import java.util.List; public abstract class SearchNode extends WritePlanNode implements ComparableConsensusRequest { + protected static final int WAL_POSITION_SERIALIZED_SIZE = Long.BYTES; + /** this insert node doesn't need to participate in iot consensus */ public static final long NO_CONSENSUS_INDEX = ConsensusReqReader.DEFAULT_SEARCH_INDEX; @@ -37,6 +43,25 @@ public abstract class SearchNode extends WritePlanNode implements ComparableCons */ protected long searchIndex = NO_CONSENSUS_INDEX; + /** routing epoch from ConfigNode broadcast, used for ordered consensus subscription */ + protected long routingEpoch = 0; + + /** Millisecond physical time used as the first ordering key in the new subscription progress. */ + protected long physicalTime = 0; + + /** Writer node id used as the second ordering key across multiple writers. */ + protected int nodeId = -1; + + /** Writer-local lifecycle id. */ + protected long writerEpoch = 0; + + /** + * syncIndex carries the source Leader's searchIndex for replicated (Follower) writes. 
On Leader + * nodes this stays at NO_CONSENSUS_INDEX (-1). Only stored in WALMetaData V3, never changes the + * WAL entry's own searchIndex. + */ + protected long syncIndex = NO_CONSENSUS_INDEX; + protected SearchNode(PlanNodeId id) { super(id); } @@ -51,5 +76,71 @@ public SearchNode setSearchIndex(long searchIndex) { return this; } + public long getRoutingEpoch() { + return routingEpoch; + } + + public SearchNode setRoutingEpoch(long routingEpoch) { + this.routingEpoch = routingEpoch; + return this; + } + + public long getPhysicalTime() { + return physicalTime; + } + + public SearchNode setPhysicalTime(long physicalTime) { + this.physicalTime = physicalTime; + return this; + } + + public int getNodeId() { + return nodeId; + } + + public SearchNode setNodeId(int nodeId) { + this.nodeId = nodeId; + return this; + } + + public long getWriterEpoch() { + return writerEpoch; + } + + public SearchNode setWriterEpoch(long writerEpoch) { + this.writerEpoch = writerEpoch; + return this; + } + + public long getSyncIndex() { + return syncIndex; + } + + public SearchNode setSyncIndex(long syncIndex) { + this.syncIndex = syncIndex; + return this; + } + + public long getLocalSeq() { + return searchIndex; + } + + public SearchNode setLocalSeq(long localSeq) { + this.searchIndex = localSeq; + return this; + } + + protected final void serializeWalPosition(IWALByteBufferView buffer) { + buffer.putLong(searchIndex); + } + + protected final void deserializeWalPosition(DataInputStream stream) throws IOException { + this.searchIndex = stream.readLong(); + } + + protected final void deserializeWalPosition(ByteBuffer buffer) { + this.searchIndex = buffer.getLong(); + } + public abstract SearchNode merge(List searchNodes); } diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/DataRegion.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/DataRegion.java index eb37cb1d5d21b..0b31f1d274d17 100644 --- 
a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/DataRegion.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/DataRegion.java @@ -87,6 +87,7 @@ import org.apache.iotdb.db.queryengine.plan.planner.plan.node.write.InsertRowsOfOneDeviceNode; import org.apache.iotdb.db.queryengine.plan.planner.plan.node.write.InsertTabletNode; import org.apache.iotdb.db.queryengine.plan.planner.plan.node.write.RelationalDeleteDataNode; +import org.apache.iotdb.db.queryengine.plan.planner.plan.node.write.SearchNode; import org.apache.iotdb.db.queryengine.plan.relational.metadata.TableMetadataImpl; import org.apache.iotdb.db.queryengine.plan.relational.metadata.fetcher.cache.LastCacheLoadStrategy; import org.apache.iotdb.db.queryengine.plan.relational.metadata.fetcher.cache.TableDeviceSchemaCache; @@ -1729,6 +1730,10 @@ private List insertToTsFileProcessors( if (v == null) { v = insertRowsNode.emptyClone(); v.setSearchIndex(insertRowNode.getSearchIndex()); + v.setPhysicalTime(insertRowsNode.getPhysicalTime()); + v.setNodeId(insertRowsNode.getNodeId()); + v.setWriterEpoch(insertRowsNode.getWriterEpoch()); + v.setSyncIndex(insertRowsNode.getSyncIndex()); v.setAligned(insertRowNode.isAligned()); if (insertRowNode.isGeneratedByPipe()) { v.markAsGeneratedByPipe(); @@ -2852,8 +2857,7 @@ public void deleteByDevice(final MeasurementPath pattern, final DeleteDataNode n } TreeDeviceSchemaCacheManager.getInstance().invalidateLastCache(pattern); // write log to impacted working TsFileProcessors - List walListeners = - logDeletionInWAL(startTime, endTime, searchIndex, pattern); + List walListeners = logDeletionInWAL(node, pattern); for (WALFlushListener walFlushListener : walListeners) { if (walFlushListener.waitForResult() == WALFlushListener.Status.FAILURE) { @@ -3018,8 +3022,7 @@ public void deleteDataDirectly(MeasurementPath pathToDelete, DeleteDataNode node } 
TreeDeviceSchemaCacheManager.getInstance().invalidateDatabaseLastCache(getDatabaseName()); // write log to impacted working TsFileProcessors - List walListeners = - logDeletionInWAL(startTime, endTime, searchIndex, pathToDelete); + List walListeners = logDeletionInWAL(node, pathToDelete); for (WALFlushListener walFlushListener : walListeners) { if (walFlushListener.waitForResult() == WALFlushListener.Status.FAILURE) { @@ -3095,22 +3098,37 @@ private List logDeletionInWAL(RelationalDeleteDataNode deleteD } private List logDeletionInWAL( - long startTime, long endTime, long searchIndex, MeasurementPath path) { + DeleteDataNode templateDeleteDataNode, MeasurementPath path) { if (config.getWalMode() == WALMode.DISABLE) { return Collections.emptyList(); } List walFlushListeners = new ArrayList<>(); DeleteDataNode deleteDataNode = - new DeleteDataNode(new PlanNodeId(""), Collections.singletonList(path), startTime, endTime); - deleteDataNode.setSearchIndex(searchIndex); + new DeleteDataNode( + new PlanNodeId(""), + Collections.singletonList(path), + templateDeleteDataNode.getDeleteStartTime(), + templateDeleteDataNode.getDeleteEndTime()); + deleteDataNode + .setSearchIndex(templateDeleteDataNode.getSearchIndex()) + .setPhysicalTime(templateDeleteDataNode.getPhysicalTime()) + .setNodeId(templateDeleteDataNode.getNodeId()) + .setWriterEpoch(templateDeleteDataNode.getWriterEpoch()) + .setSyncIndex(templateDeleteDataNode.getSyncIndex()); for (Map.Entry entry : workSequenceTsFileProcessors.entrySet()) { - if (TimePartitionUtils.satisfyPartitionId(startTime, endTime, entry.getKey())) { + if (TimePartitionUtils.satisfyPartitionId( + templateDeleteDataNode.getDeleteStartTime(), + templateDeleteDataNode.getDeleteEndTime(), + entry.getKey())) { WALFlushListener walFlushListener = entry.getValue().logDeleteDataNodeInWAL(deleteDataNode); walFlushListeners.add(walFlushListener); } } for (Map.Entry entry : workUnsequenceTsFileProcessors.entrySet()) { - if 
(TimePartitionUtils.satisfyPartitionId(startTime, endTime, entry.getKey())) { + if (TimePartitionUtils.satisfyPartitionId( + templateDeleteDataNode.getDeleteStartTime(), + templateDeleteDataNode.getDeleteEndTime(), + entry.getKey())) { WALFlushListener walFlushListener = entry.getValue().logDeleteDataNodeInWAL(deleteDataNode); walFlushListeners.add(walFlushListener); } @@ -3187,17 +3205,27 @@ private void deleteObjectFiles(List matchedObjectDirs, List for details. */ public void insertSeparatorToWAL() { + insertSeparatorToWAL(null); + } + + public void insertSeparatorToWAL(final SearchNode sourceNode) { writeLock("insertSeparatorToWAL"); try { if (deleted) { return; } + final ContinuousSameSearchIndexSeparatorNode separatorNode = + new ContinuousSameSearchIndexSeparatorNode(); + if (Objects.nonNull(sourceNode)) { + separatorNode + .setRoutingEpoch(sourceNode.getRoutingEpoch()) + .setPhysicalTime(sourceNode.getPhysicalTime()) + .setNodeId(sourceNode.getNodeId()) + .setWriterEpoch(sourceNode.getWriterEpoch()) + .setSyncIndex(sourceNode.getSyncIndex()); + } getWALNode() - .ifPresent( - walNode -> - walNode.log( - TsFileProcessor.MEMTABLE_NOT_EXIST, - new ContinuousSameSearchIndexSeparatorNode())); + .ifPresent(walNode -> walNode.log(TsFileProcessor.MEMTABLE_NOT_EXIST, separatorNode)); } finally { writeUnlock(); } @@ -4497,6 +4525,10 @@ public void insert(InsertRowsOfOneDeviceNode insertRowsOfOneDeviceNode) if (v == null) { v = new InsertRowsNode(insertRowsOfOneDeviceNode.getPlanNodeId()); v.setSearchIndex(insertRowNode.getSearchIndex()); + v.setPhysicalTime(insertRowsOfOneDeviceNode.getPhysicalTime()); + v.setNodeId(insertRowsOfOneDeviceNode.getNodeId()); + v.setWriterEpoch(insertRowsOfOneDeviceNode.getWriterEpoch()); + v.setSyncIndex(insertRowsOfOneDeviceNode.getSyncIndex()); v.setAligned(insertRowNode.isAligned()); if (insertRowNode.isGeneratedByPipe()) { v.markAsGeneratedByPipe(); diff --git 
a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/wal/buffer/WALBuffer.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/wal/buffer/WALBuffer.java index a7d79f92b5753..1eed5f2a5f16a 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/wal/buffer/WALBuffer.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/wal/buffer/WALBuffer.java @@ -25,6 +25,7 @@ import org.apache.iotdb.commons.utils.TestOnly; import org.apache.iotdb.db.conf.IoTDBConfig; import org.apache.iotdb.db.conf.IoTDBDescriptor; +import org.apache.iotdb.db.queryengine.plan.planner.plan.node.write.ContinuousSameSearchIndexSeparatorNode; import org.apache.iotdb.db.queryengine.plan.planner.plan.node.write.DeleteDataNode; import org.apache.iotdb.db.queryengine.plan.planner.plan.node.write.InsertNode; import org.apache.iotdb.db.queryengine.plan.planner.plan.node.write.ObjectNode; @@ -35,6 +36,7 @@ import org.apache.iotdb.db.storageengine.dataregion.wal.exception.BrokenWALFileException; import org.apache.iotdb.db.storageengine.dataregion.wal.exception.WALNodeClosedException; import org.apache.iotdb.db.storageengine.dataregion.wal.io.WALMetaData; +import org.apache.iotdb.db.storageengine.dataregion.wal.io.WALWriter; import org.apache.iotdb.db.storageengine.dataregion.wal.utils.MemoryControlledWALEntryQueue; import org.apache.iotdb.db.storageengine.dataregion.wal.utils.WALFileStatus; import org.apache.iotdb.db.storageengine.dataregion.wal.utils.WALFileUtils; @@ -326,26 +328,64 @@ private void handleInfoEntry(WALEntry walEntry) { walEntry.getWalFlushListener().fail(e); return; } - // parse search index + // parse search index and writer-progress metadata long searchIndex = DEFAULT_SEARCH_INDEX; - if (walEntry.getType().needSearch()) { + long syncIndex = DEFAULT_SEARCH_INDEX; + long physicalTime = 0; + int nodeId = -1; + long writerEpoch = 0; + if (walEntry.getType() == 
WALEntryType.CONTINUOUS_SAME_SEARCH_INDEX_SEPARATOR_NODE) { + final ContinuousSameSearchIndexSeparatorNode separatorNode = + (ContinuousSameSearchIndexSeparatorNode) walEntry.getValue(); + syncIndex = separatorNode.getSyncIndex(); + physicalTime = separatorNode.getPhysicalTime(); + nodeId = separatorNode.getNodeId(); + writerEpoch = separatorNode.getWriterEpoch(); + } else if (walEntry.getType().needSearch()) { if (walEntry.getType() == WALEntryType.DELETE_DATA_NODE) { searchIndex = ((DeleteDataNode) walEntry.getValue()).getSearchIndex(); + syncIndex = ((DeleteDataNode) walEntry.getValue()).getSyncIndex(); + physicalTime = ((DeleteDataNode) walEntry.getValue()).getPhysicalTime(); + nodeId = ((DeleteDataNode) walEntry.getValue()).getNodeId(); + writerEpoch = ((DeleteDataNode) walEntry.getValue()).getWriterEpoch(); } else if (walEntry.getType() == WALEntryType.RELATIONAL_DELETE_DATA_NODE) { searchIndex = ((RelationalDeleteDataNode) walEntry.getValue()).getSearchIndex(); + syncIndex = ((RelationalDeleteDataNode) walEntry.getValue()).getSyncIndex(); + physicalTime = ((RelationalDeleteDataNode) walEntry.getValue()).getPhysicalTime(); + nodeId = ((RelationalDeleteDataNode) walEntry.getValue()).getNodeId(); + writerEpoch = ((RelationalDeleteDataNode) walEntry.getValue()).getWriterEpoch(); } else if (walEntry.getType() == WALEntryType.OBJECT_FILE_NODE) { searchIndex = ((ObjectNode) walEntry.getValue()).getSearchIndex(); + syncIndex = ((ObjectNode) walEntry.getValue()).getSyncIndex(); + physicalTime = ((ObjectNode) walEntry.getValue()).getPhysicalTime(); + nodeId = ((ObjectNode) walEntry.getValue()).getNodeId(); + writerEpoch = ((ObjectNode) walEntry.getValue()).getWriterEpoch(); } else { searchIndex = ((InsertNode) walEntry.getValue()).getSearchIndex(); + syncIndex = ((InsertNode) walEntry.getValue()).getSyncIndex(); + physicalTime = ((InsertNode) walEntry.getValue()).getPhysicalTime(); + nodeId = ((InsertNode) walEntry.getValue()).getNodeId(); + writerEpoch = 
((InsertNode) walEntry.getValue()).getWriterEpoch(); } if (searchIndex != DEFAULT_SEARCH_INDEX) { currentSearchIndex = searchIndex; currentFileStatus = WALFileStatus.CONTAINS_SEARCH_INDEX; } } + // For Leader writes: syncIndex stays -1, use searchIndex as the ordering key + // For Follower writes: searchIndex is -1, syncIndex carries source's searchIndex + long effectiveSyncIndex = (syncIndex >= 0) ? syncIndex : searchIndex; + long effectiveLocalSeq = (syncIndex >= 0) ? syncIndex : searchIndex; // update related info totalSize += size; - info.metaData.add(size, searchIndex, walEntry.getMemTableId()); + info.metaData.add( + size, + searchIndex, + walEntry.getMemTableId(), + physicalTime, + nodeId, + writerEpoch, + effectiveLocalSeq); info.memTableId2WalDiskUsage.compute( walEntry.getMemTableId(), (k, v) -> v == null ? size : v + size); info.fsyncListeners.add(walEntry.getWalFlushListener()); @@ -748,6 +788,11 @@ public boolean isAllWALEntriesConsumed() { } } + public WALMetaData getCurrentWALMetaDataSnapshot() { + final WALWriter writer = currentWALFileWriter; + return writer == null ? 
new WALMetaData() : writer.snapshotMetaData(); + } + public CheckpointManager getCheckpointManager() { return checkpointManager; } diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/wal/io/LogWriter.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/wal/io/LogWriter.java index 95721f846ccca..8ad62c8a395a0 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/wal/io/LogWriter.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/wal/io/LogWriter.java @@ -69,7 +69,8 @@ protected LogWriter(File logFile, WALFileVersion version) throws IOException { this.logFile = logFile; this.logStream = new FileOutputStream(logFile, true); this.logChannel = this.logStream.getChannel(); - if ((!logFile.exists() || logFile.length() == 0) && version == WALFileVersion.V2) { + if ((!logFile.exists() || logFile.length() == 0) + && (version == WALFileVersion.V2 || version == WALFileVersion.V3)) { this.logChannel.write(ByteBuffer.wrap(version.getVersionBytes())); } } diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/wal/io/ProgressWALReader.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/wal/io/ProgressWALReader.java new file mode 100644 index 0000000000000..7b2d8485efbed --- /dev/null +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/wal/io/ProgressWALReader.java @@ -0,0 +1,81 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.iotdb.db.storageengine.dataregion.wal.io; + +import java.io.Closeable; +import java.io.File; +import java.io.IOException; +import java.nio.ByteBuffer; + +/** + * Reader dedicated to the new writer-based subscription progress model. + * + *

It keeps the original WAL entry body untouched and exposes per-entry writer metadata from WAL + * footer arrays alongside the current entry buffer. + */ +public class ProgressWALReader implements Closeable { + + private final WALByteBufReader delegate; + + public ProgressWALReader(File logFile) throws IOException { + this.delegate = new WALByteBufReader(logFile); + } + + public ProgressWALReader(File logFile, WALMetaData metaDataSnapshot) throws IOException { + this.delegate = new WALByteBufReader(logFile, metaDataSnapshot); + } + + public boolean hasNext() { + return delegate.hasNext(); + } + + public ByteBuffer next() throws IOException { + return delegate.next(); + } + + public WALMetaData getMetaData() { + return delegate.getMetaData(); + } + + public long getCurrentEntryPhysicalTime() { + return delegate.getCurrentEntryPhysicalTime(); + } + + public int getCurrentEntryNodeId() { + return delegate.getCurrentEntryNodeId(); + } + + public long getCurrentEntryWriterEpoch() { + return delegate.getCurrentEntryWriterEpoch(); + } + + public long getCurrentEntryLocalSeq() { + return delegate.getCurrentEntryLocalSeq(); + } + + public int getCurrentEntryIndex() { + return delegate.getCurrentEntryIndex(); + } + + @Override + public void close() throws IOException { + delegate.close(); + } +} diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/wal/io/WALByteBufReader.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/wal/io/WALByteBufReader.java index 2f257da9adc4a..4b5b198c18e12 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/wal/io/WALByteBufReader.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/wal/io/WALByteBufReader.java @@ -27,6 +27,7 @@ import java.io.IOException; import java.nio.ByteBuffer; import java.util.Iterator; +import java.util.List; /** * This reader returns {@link WALEntry} as {@link ByteBuffer}, 
the usage of WALByteBufReader is like @@ -36,6 +37,8 @@ public class WALByteBufReader implements Closeable { private WALMetaData metaData; private DataInputStream logStream; private Iterator sizeIterator; + // V3: track current entry index to provide per-entry epoch/syncIndex + private int currentEntryIndex = -1; public WALByteBufReader(File logFile) throws IOException { WALInputStream walInputStream = new WALInputStream(logFile); @@ -49,6 +52,18 @@ public WALByteBufReader(File logFile) throws IOException { } } + public WALByteBufReader(File logFile, WALMetaData metaDataSnapshot) throws IOException { + WALInputStream walInputStream = new WALInputStream(logFile); + try { + this.logStream = new DataInputStream(walInputStream); + this.metaData = metaDataSnapshot == null ? new WALMetaData() : metaDataSnapshot; + this.sizeIterator = this.metaData.getBuffersSize().iterator(); + } catch (Exception e) { + walInputStream.close(); + throw e; + } + } + /** Like {@link Iterator#hasNext()}. */ public boolean hasNext() { return sizeIterator.hasNext(); @@ -60,6 +75,7 @@ public boolean hasNext() { * @throws IOException when failing to read from channel. */ public ByteBuffer next() throws IOException { + currentEntryIndex++; int size = sizeIterator.next(); // TODO: Reuse this buffer ByteBuffer buffer = ByteBuffer.allocate(size); @@ -84,4 +100,51 @@ public void close() throws IOException { public long getFirstSearchIndex() { return metaData.getFirstSearchIndex(); } + + /** Returns a compatibility epoch view of the current entry, mirrored from physicalTime. */ + public long getCurrentEntryEpoch() { + return getCurrentEntryPhysicalTime(); + } + + /** Returns a compatibility syncIndex view of the current entry, mirrored from localSeq. 
*/ + public long getCurrentEntrySyncIndex() { + return getCurrentEntryLocalSeq(); + } + + public long getCurrentEntryPhysicalTime() { + List physicalTimes = metaData.getPhysicalTimes(); + if (currentEntryIndex >= 0 && currentEntryIndex < physicalTimes.size()) { + return physicalTimes.get(currentEntryIndex); + } + return 0L; + } + + public int getCurrentEntryNodeId() { + List nodeIds = metaData.getNodeIds(); + if (currentEntryIndex >= 0 && currentEntryIndex < nodeIds.size()) { + return nodeIds.get(currentEntryIndex); + } + return -1; + } + + public long getCurrentEntryWriterEpoch() { + List writerEpochs = metaData.getWriterEpochs(); + if (currentEntryIndex >= 0 && currentEntryIndex < writerEpochs.size()) { + return writerEpochs.get(currentEntryIndex); + } + return 0L; + } + + public long getCurrentEntryLocalSeq() { + List localSeqs = metaData.getLocalSeqs(); + if (currentEntryIndex >= 0 && currentEntryIndex < localSeqs.size()) { + return localSeqs.get(currentEntryIndex); + } + return metaData.getFirstSearchIndex() + currentEntryIndex; + } + + /** Returns the current entry index (0-based). 
*/ + public int getCurrentEntryIndex() { + return currentEntryIndex; + } } diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/wal/io/WALFileVersion.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/wal/io/WALFileVersion.java index e3d374551b115..fc09c34b6508e 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/wal/io/WALFileVersion.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/wal/io/WALFileVersion.java @@ -26,7 +26,8 @@ public enum WALFileVersion { V1("WAL"), - V2("V2-WAL"); + V2("V2-WAL"), + V3("V3-WAL"); private final String versionString; private byte[] versionBytes; @@ -56,7 +57,7 @@ public static WALFileVersion getVersion(FileChannel channel) throws IOException long originalPosition = channel.position(); try { // head magic string starts to exist since V2 - WALFileVersion[] versions = {V2}; + WALFileVersion[] versions = {V3, V2}; for (WALFileVersion version : versions) { channel.position(0); if (channel.size() < version.versionBytes.length) { diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/wal/io/WALInputStream.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/wal/io/WALInputStream.java index 0a7dbb5463c1a..906002b5922fd 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/wal/io/WALInputStream.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/wal/io/WALInputStream.java @@ -82,7 +82,7 @@ private void getEndOffset() throws IOException { } ByteBuffer metadataSizeBuf = ByteBuffer.allocate(Integer.BYTES); long position; - if (version == WALFileVersion.V2) { + if (version == WALFileVersion.V2 || version == WALFileVersion.V3) { // New Version ByteBuffer magicStringBuffer = ByteBuffer.allocate(version.getVersionBytes().length); 
channel.read(magicStringBuffer, channel.size() - version.getVersionBytes().length); @@ -122,7 +122,7 @@ private void getEndOffset() throws IOException { int metadataSize = metadataSizeBuf.getInt(); endOffset = channel.size() - version.getVersionBytes().length - Integer.BYTES - metadataSize; } finally { - if (version == WALFileVersion.V2) { + if (version == WALFileVersion.V2 || version == WALFileVersion.V3) { // Set the position back to the end of head magic string channel.position(version.getVersionBytes().length); } else { @@ -191,7 +191,7 @@ private void loadNextSegment() throws IOException { } long startTime = System.nanoTime(); long startPosition = channel.position(); - if (version == WALFileVersion.V2) { + if (version == WALFileVersion.V2 || version == WALFileVersion.V3) { loadNextSegmentV2(); } else if (version == WALFileVersion.V1) { loadNextSegmentV1(); @@ -295,7 +295,7 @@ private void tryLoadSegment() throws IOException { * @throws IOException If the file is broken or the given position is invalid */ public void skipToGivenLogicalPosition(long pos) throws IOException { - if (version == WALFileVersion.V2) { + if (version == WALFileVersion.V2 || version == WALFileVersion.V3) { channel.position(version.getVersionBytes().length); long posRemain = pos; SegmentInfo segmentInfo; diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/wal/io/WALMetaData.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/wal/io/WALMetaData.java index ba9211656ef03..4608325ea837f 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/wal/io/WALMetaData.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/wal/io/WALMetaData.java @@ -32,13 +32,18 @@ import java.nio.channels.FileChannel; import java.nio.charset.StandardCharsets; import java.util.ArrayList; +import java.util.HashMap; import java.util.HashSet; import java.util.List; +import 
java.util.Map; import java.util.Set; /** * Metadata exists at the end of each wal file, including each entry's size, search index of first * entry and the number of entries. + * + *

V3 extension stores per-entry writer progress metadata, plus file-level timestamp range, to + * support consensus subscription recovery. */ public class WALMetaData implements SerializedSize { @@ -54,6 +59,20 @@ public class WALMetaData implements SerializedSize { private final Set memTablesId; private long truncateOffSet = 0; + // V3 fields: file-level data timestamp range for timestamp-based seek + private long minDataTs = Long.MAX_VALUE; + private long maxDataTs = Long.MIN_VALUE; + // V3 extension for writer-based subscription progress. + private final List physicalTimes; + private final List nodeIds; + private final List writerEpochs; + private final List localSeqs; + + private static final short DEFAULT_NODE_ID = (short) -1; + private static final short DEFAULT_WRITER_EPOCH = 0; + private static final int V3_EMPTY_METADATA_REMAINING_WITHOUT_MEMTABLE_COUNT = + Long.BYTES * 2 + Short.BYTES * 2 + Integer.BYTES; + public WALMetaData() { this(ConsensusReqReader.DEFAULT_SEARCH_INDEX, new ArrayList<>(), new HashSet<>()); } @@ -62,14 +81,67 @@ public WALMetaData(long firstSearchIndex, List buffersSize, Set m this.firstSearchIndex = firstSearchIndex; this.buffersSize = buffersSize; this.memTablesId = memTablesId; + this.physicalTimes = new ArrayList<>(); + this.nodeIds = new ArrayList<>(); + this.writerEpochs = new ArrayList<>(); + this.localSeqs = new ArrayList<>(); } + /** V2-compatible add without explicit writer progress metadata. */ public void add(int size, long searchIndex, long memTableId) { + add(size, searchIndex, memTableId, 0L, DEFAULT_NODE_ID, DEFAULT_WRITER_EPOCH, searchIndex); + } + + /** + * Compatibility add using the old (epoch, syncIndex) signature. The values are now interpreted as + * (physicalTime, localSeq). + */ + public void add(int size, long searchIndex, long memTableId, long epoch, long syncIndex) { + add( + size, + searchIndex, + memTableId, + epoch, + DEFAULT_NODE_ID, + DEFAULT_WRITER_EPOCH, + syncIndex >= 0 ? 
syncIndex : searchIndex); + } + + public void add( + int size, + long searchIndex, + long memTableId, + long physicalTime, + int nodeId, + long writerEpoch, + long localSeq) { if (buffersSize.isEmpty()) { firstSearchIndex = searchIndex; } buffersSize.add(size); memTablesId.add(memTableId); + physicalTimes.add(physicalTime); + nodeIds.add(toShortExact(nodeId, "nodeId")); + writerEpochs.add(toShortExact(writerEpoch, "writerEpoch")); + localSeqs.add(localSeq); + } + + private static short toShortExact(long value, String fieldName) { + if (value < Short.MIN_VALUE || value > Short.MAX_VALUE) { + throw new IllegalArgumentException( + String.format("%s %s exceeds short range", fieldName, value)); + } + return (short) value; + } + + /** Update file-level timestamp range with a data point's timestamp. */ + public void updateTimestampRange(long dataTs) { + if (dataTs < minDataTs) { + minDataTs = dataTs; + } + if (dataTs > maxDataTs) { + maxDataTs = dataTs; + } } public void addAll(WALMetaData metaData) { @@ -78,16 +150,47 @@ public void addAll(WALMetaData metaData) { } buffersSize.addAll(metaData.getBuffersSize()); memTablesId.addAll(metaData.getMemTablesId()); + physicalTimes.addAll(metaData.getPhysicalTimes()); + nodeIds.addAll(metaData.getNodeIds()); + writerEpochs.addAll(metaData.getWriterEpochs()); + localSeqs.addAll(metaData.getLocalSeqs()); + if (metaData.minDataTs < this.minDataTs) { + this.minDataTs = metaData.minDataTs; + } + if (metaData.maxDataTs > this.maxDataTs) { + this.maxDataTs = metaData.maxDataTs; + } } @Override public int serializedSize() { - return FIXED_SERIALIZED_SIZE - + buffersSize.size() * Integer.BYTES - + (memTablesId.isEmpty() ? 0 : Integer.BYTES + memTablesId.size() * Long.BYTES); + return serializedSize(WALFileVersion.V2); + } + + public int serializedSize(WALFileVersion version) { + int size = + FIXED_SERIALIZED_SIZE + + buffersSize.size() * Integer.BYTES + + (memTablesId.isEmpty() ? 
0 : Integer.BYTES + memTablesId.size() * Long.BYTES); + if (version == WALFileVersion.V3) { + // minDataTs(long) + maxDataTs(long) + size += Long.BYTES * 2; + // physicalTimes(long[]) + localSeqs(long[]) + size += buffersSize.size() * Long.BYTES * 2; + // defaultNodeId(short) + defaultWriterEpoch(short) + overrideCount(int) + // + override ordinals(int[]) + override nodeIds(short[]) + override writerEpochs(short[]) + final int overrideCount = getWriterOverrideCount(); + size += Short.BYTES * 2 + Integer.BYTES; + size += overrideCount * (Integer.BYTES + Short.BYTES + Short.BYTES); + } + return size; } public void serialize(ByteBuffer buffer) { + serialize(buffer, WALFileVersion.V2); + } + + public void serialize(ByteBuffer buffer, WALFileVersion version) { buffer.putLong(firstSearchIndex); buffer.putInt(buffersSize.size()); for (int size : buffersSize) { @@ -99,9 +202,49 @@ public void serialize(ByteBuffer buffer) { buffer.putLong(memTableId); } } + if (version == WALFileVersion.V3) { + buffer.putLong(minDataTs); + buffer.putLong(maxDataTs); + for (long physicalTime : physicalTimes) { + buffer.putLong(physicalTime); + } + for (long localSeq : localSeqs) { + buffer.putLong(localSeq); + } + final short defaultNodeId = computeDefaultNodeId(); + final short defaultWriterEpoch = computeDefaultWriterEpoch(); + final List overrideIndexes = new ArrayList<>(); + final List overrideNodeIds = new ArrayList<>(); + final List overrideWriterEpochs = new ArrayList<>(); + for (int i = 0; i < buffersSize.size(); i++) { + final short nodeId = nodeIds.get(i); + final short writerEpoch = writerEpochs.get(i); + if (nodeId != defaultNodeId || writerEpoch != defaultWriterEpoch) { + overrideIndexes.add(i); + overrideNodeIds.add(nodeId); + overrideWriterEpochs.add(writerEpoch); + } + } + buffer.putShort(defaultNodeId); + buffer.putShort(defaultWriterEpoch); + buffer.putInt(overrideIndexes.size()); + for (int overrideIndex : overrideIndexes) { + buffer.putInt(overrideIndex); + } + for (short 
nodeId : overrideNodeIds) { + buffer.putShort(nodeId); + } + for (short writerEpoch : overrideWriterEpochs) { + buffer.putShort(writerEpoch); + } + } } public static WALMetaData deserialize(ByteBuffer buffer) { + return deserialize(buffer, WALFileVersion.V2); + } + + public static WALMetaData deserialize(ByteBuffer buffer, WALFileVersion version) { long firstSearchIndex = buffer.getLong(); int entriesNum = buffer.getInt(); List buffersSize = new ArrayList<>(entriesNum); @@ -109,13 +252,56 @@ public static WALMetaData deserialize(ByteBuffer buffer) { buffersSize.add(buffer.getInt()); } Set memTablesId = new HashSet<>(); - if (buffer.hasRemaining()) { + final boolean serializedEmptyV3WithoutMemTableCount = + version == WALFileVersion.V3 + && entriesNum == 0 + && buffer.remaining() == V3_EMPTY_METADATA_REMAINING_WITHOUT_MEMTABLE_COUNT; + if (buffer.hasRemaining() && !serializedEmptyV3WithoutMemTableCount) { int memTablesIdNum = buffer.getInt(); for (int i = 0; i < memTablesIdNum; ++i) { memTablesId.add(buffer.getLong()); } } - return new WALMetaData(firstSearchIndex, buffersSize, memTablesId); + WALMetaData result = new WALMetaData(firstSearchIndex, buffersSize, memTablesId); + // V3 extension: file-level timestamp range + per-entry writer progress metadata + if (version == WALFileVersion.V3 && buffer.hasRemaining()) { + result.minDataTs = buffer.getLong(); + result.maxDataTs = buffer.getLong(); + if (buffer.remaining() >= entriesNum * Long.BYTES * 2 + Short.BYTES * 2 + Integer.BYTES) { + for (int i = 0; i < entriesNum; i++) { + result.physicalTimes.add(buffer.getLong()); + } + for (int i = 0; i < entriesNum; i++) { + result.localSeqs.add(buffer.getLong()); + } + final short defaultNodeId = buffer.getShort(); + final short defaultWriterEpoch = buffer.getShort(); + final int overrideCount = buffer.getInt(); + final int[] overrideIndexes = new int[overrideCount]; + final short[] overrideNodeIds = new short[overrideCount]; + final short[] overrideWriterEpochs = new 
short[overrideCount]; + for (int i = 0; i < overrideCount; i++) { + overrideIndexes[i] = buffer.getInt(); + } + for (int i = 0; i < overrideCount; i++) { + overrideNodeIds[i] = buffer.getShort(); + } + for (int i = 0; i < overrideCount; i++) { + overrideWriterEpochs[i] = buffer.getShort(); + } + for (int i = 0; i < entriesNum; i++) { + result.nodeIds.add(defaultNodeId); + result.writerEpochs.add(defaultWriterEpoch); + } + for (int i = 0; i < overrideCount; i++) { + result.nodeIds.set(overrideIndexes[i], overrideNodeIds[i]); + result.writerEpochs.set(overrideIndexes[i], overrideWriterEpochs[i]); + } + } else { + result.rebuildWriterMetadataWithDefaults(); + } + } + return result; } public List getBuffersSize() { @@ -130,6 +316,106 @@ public long getFirstSearchIndex() { return firstSearchIndex; } + public List getPhysicalTimes() { + return physicalTimes; + } + + public List getNodeIds() { + return nodeIds; + } + + public List getWriterEpochs() { + return writerEpochs; + } + + public List getLocalSeqs() { + return localSeqs; + } + + private short computeDefaultNodeId() { + return unpackNodeId(computeDefaultWriterIdentity()); + } + + private short computeDefaultWriterEpoch() { + return unpackWriterEpoch(computeDefaultWriterIdentity()); + } + + private int getWriterOverrideCount() { + final short defaultNodeId = computeDefaultNodeId(); + final short defaultWriterEpoch = computeDefaultWriterEpoch(); + int count = 0; + for (int i = 0; i < buffersSize.size(); i++) { + if (nodeIds.get(i) != defaultNodeId || writerEpochs.get(i) != defaultWriterEpoch) { + count++; + } + } + return count; + } + + private int computeDefaultWriterIdentity() { + if (nodeIds.isEmpty()) { + return packWriterIdentity(DEFAULT_NODE_ID, DEFAULT_WRITER_EPOCH); + } + final Map counts = new HashMap<>(); + int bestIdentity = packWriterIdentity(nodeIds.get(0), writerEpochs.get(0)); + int bestCount = 0; + for (int i = 0; i < nodeIds.size(); i++) { + final int identity = packWriterIdentity(nodeIds.get(i), 
writerEpochs.get(i)); + final int count = counts.merge(identity, 1, Integer::sum); + if (count > bestCount) { + bestCount = count; + bestIdentity = identity; + } + } + return bestIdentity; + } + + private static int packWriterIdentity(short nodeId, short writerEpoch) { + return ((nodeId & 0xFFFF) << 16) | (writerEpoch & 0xFFFF); + } + + private static short unpackNodeId(int identity) { + return (short) (identity >>> 16); + } + + private static short unpackWriterEpoch(int identity) { + return (short) identity; + } + + public WALMetaData copy() { + WALMetaData copy = + new WALMetaData(firstSearchIndex, new ArrayList<>(buffersSize), new HashSet<>(memTablesId)); + copy.truncateOffSet = truncateOffSet; + copy.physicalTimes.addAll(physicalTimes); + copy.nodeIds.addAll(nodeIds); + copy.writerEpochs.addAll(writerEpochs); + copy.localSeqs.addAll(localSeqs); + copy.minDataTs = minDataTs; + copy.maxDataTs = maxDataTs; + return copy; + } + + public long getMinDataTs() { + return minDataTs; + } + + public long getMaxDataTs() { + return maxDataTs; + } + + private void rebuildWriterMetadataWithDefaults() { + physicalTimes.clear(); + nodeIds.clear(); + writerEpochs.clear(); + localSeqs.clear(); + for (int i = 0; i < buffersSize.size(); i++) { + physicalTimes.add(0L); + nodeIds.add(DEFAULT_NODE_ID); + writerEpochs.add(DEFAULT_WRITER_EPOCH); + localSeqs.add(firstSearchIndex + i); + } + } + public static WALMetaData readFromWALFile(File logFile, FileChannel channel) throws IOException { if (channel.size() < WALFileVersion.V2.getVersionBytes().length || !isValidMagicString(channel)) { @@ -150,7 +436,7 @@ public static WALMetaData readFromWALFile(File logFile, FileChannel channel) thr ByteBuffer metadataBuf = ByteBuffer.allocate(metadataSize); channel.read(metadataBuf, position - metadataSize); metadataBuf.flip(); - metaData = WALMetaData.deserialize(metadataBuf); + metaData = WALMetaData.deserialize(metadataBuf, version); // versions before V1.3, should recover memTable ids from 
entries if (metaData.memTablesId.isEmpty()) { int offset = Byte.BYTES; @@ -174,11 +460,16 @@ public static WALMetaData readFromWALFile(File logFile, FileChannel channel) thr } private static boolean isValidMagicString(FileChannel channel) throws IOException { - ByteBuffer magicStringBytes = ByteBuffer.allocate(WALFileVersion.V2.getVersionBytes().length); - channel.read(magicStringBytes, channel.size() - WALFileVersion.V2.getVersionBytes().length); + // V3 magic string is the longest; read enough bytes to check all versions + int maxMagicLen = + Math.max( + WALFileVersion.V3.getVersionBytes().length, WALFileVersion.V2.getVersionBytes().length); + ByteBuffer magicStringBytes = ByteBuffer.allocate(maxMagicLen); + channel.read(magicStringBytes, channel.size() - maxMagicLen); magicStringBytes.flip(); String magicString = new String(magicStringBytes.array(), StandardCharsets.UTF_8); - return magicString.equals(WALFileVersion.V2.getVersionString()) + return magicString.contains(WALFileVersion.V3.getVersionString()) + || magicString.contains(WALFileVersion.V2.getVersionString()) || magicString.contains(WALFileVersion.V1.getVersionString()); } diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/wal/io/WALWriter.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/wal/io/WALWriter.java index 6f13040bec8b4..10d164f3851cd 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/wal/io/WALWriter.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/wal/io/WALWriter.java @@ -34,11 +34,11 @@ public class WALWriter extends LogWriter { private WALFileStatus walFileStatus = WALFileStatus.CONTAINS_NONE_SEARCH_INDEX; // wal files' metadata protected final WALMetaData metaData = new WALMetaData(); - // By default is V2 - private WALFileVersion version = WALFileVersion.V2; + // By default is V3 for writer-progress metadata support. 
+ private WALFileVersion version = WALFileVersion.V3; public WALWriter(File logFile) throws IOException { - this(logFile, WALFileVersion.V2); + this(logFile, WALFileVersion.V3); } public WALWriter(File logFile, WALFileVersion version) throws IOException { @@ -58,12 +58,16 @@ public double write(ByteBuffer buffer, WALMetaData metaData) throws IOException return write(buffer); } - public void updateMetaData(WALMetaData metaData) { + public synchronized void updateMetaData(WALMetaData metaData) { this.metaData.addAll(metaData); } - private void endFile() throws IOException { - if (logFile.length() == WALFileVersion.V2.getVersionBytes().length) { + public synchronized WALMetaData snapshotMetaData() { + return metaData.copy(); + } + + private synchronized void endFile() throws IOException { + if (logFile.length() == version.getVersionBytes().length) { super.close(); return; } @@ -72,12 +76,12 @@ private void endFile() throws IOException { // mark info part ends endMarker.serialize(markerBuffer); write(markerBuffer, false); - int metaDataSize = metaData.serializedSize(); + int metaDataSize = metaData.serializedSize(version); ByteBuffer buffer = ByteBuffer.allocate(metaDataSize + Integer.BYTES + version.getVersionBytes().length); - // flush meta data - metaData.serialize(buffer); + // flush meta data with version-aware serialization + metaData.serialize(buffer, version); buffer.putInt(metaDataSize); // add magic string buffer.put(version.getVersionBytes()); diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/wal/node/WALFakeNode.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/wal/node/WALFakeNode.java index e35d5e79fc019..9779f824d645c 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/wal/node/WALFakeNode.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/wal/node/WALFakeNode.java @@ -143,6 +143,26 @@ public long 
getTotalSize() { return 0; } + @Override + public long getRegionDiskUsage() { + return 0; + } + + @Override + public long getSearchIndexToFreeAtLeast(long bytesToFree) { + return bytesToFree > 0 ? Long.MAX_VALUE : DEFAULT_SAFELY_DELETED_SEARCH_INDEX; + } + + @Override + public void setSubscriptionRetainedMinVersionId(long minVersionId) { + // do nothing + } + + @Override + public long getVersionIdToFreeAtLeast(long bytesToFree) { + return bytesToFree > 0 ? Long.MAX_VALUE : 0; + } + public static WALFakeNode getFailureInstance(Exception e) { return new WALFakeNode( Status.FAILURE, new WALException("Cannot write wal into a fake node. ", e)); diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/wal/node/WALNode.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/wal/node/WALNode.java index 07dd4d78f6605..38909893bca29 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/wal/node/WALNode.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/wal/node/WALNode.java @@ -52,6 +52,7 @@ import org.apache.iotdb.db.storageengine.dataregion.wal.checkpoint.CheckpointType; import org.apache.iotdb.db.storageengine.dataregion.wal.checkpoint.MemTableInfo; import org.apache.iotdb.db.storageengine.dataregion.wal.io.WALByteBufReader; +import org.apache.iotdb.db.storageengine.dataregion.wal.io.WALMetaData; import org.apache.iotdb.db.storageengine.dataregion.wal.utils.WALFileStatus; import org.apache.iotdb.db.storageengine.dataregion.wal.utils.WALFileUtils; import org.apache.iotdb.db.storageengine.dataregion.wal.utils.listener.AbstractResultListener; @@ -82,6 +83,7 @@ import java.util.concurrent.TimeUnit; import java.util.concurrent.TimeoutException; import java.util.concurrent.atomic.AtomicBoolean; +import java.util.concurrent.atomic.AtomicLong; import java.util.concurrent.atomic.AtomicReference; import java.util.stream.Collectors; @@ -112,6 
+114,8 @@ public class WALNode implements IWALNode { private final Map memTableSnapshotCount = new ConcurrentHashMap<>(); // insert nodes whose search index are before this value can be deleted safely private volatile long safelyDeletedSearchIndex = DEFAULT_SAFELY_DELETED_SEARCH_INDEX; + // WAL files with versionId >= this value are retained for subscription consumers + private volatile long subscriptionRetainedMinVersionId = Long.MAX_VALUE; private volatile boolean deleted = false; @@ -572,6 +576,7 @@ public boolean isContainsActiveOrPinnedMemTable(Long versionId) { private boolean canDeleteFile(long fileArrIdx, WALFileStatus walFileStatus, long versionId) { return (fileArrIdx < fileIndexAfterFilterSafelyDeleteIndex || walFileStatus == WALFileStatus.CONTAINS_NONE_SEARCH_INDEX) + && versionId < subscriptionRetainedMinVersionId && !isContainsActiveOrPinnedMemTable(versionId); } } @@ -584,6 +589,11 @@ public void setSafelyDeletedSearchIndex(long safelyDeletedSearchIndex) { this.safelyDeletedSearchIndex = safelyDeletedSearchIndex; } + @Override + public void setSubscriptionRetainedMinVersionId(long minVersionId) { + this.subscriptionRetainedMinVersionId = minVersionId; + } + /** This iterator is not concurrency-safe, cannot read the current-writing wal file. 
*/ @Override public ReqIterator getReqIterator(long startIndex) { @@ -654,6 +664,11 @@ public boolean hasNext() { AtomicReference> tmpNodes = new AtomicReference<>(new ArrayList<>()); AtomicBoolean notFirstFile = new AtomicBoolean(false); AtomicBoolean hasCollectedSufficientData = new AtomicBoolean(false); + // V3: track writer progress metadata for current entry group + AtomicLong currentEntrySyncIndex = new AtomicLong(-1); + AtomicLong currentEntryPhysicalTime = new AtomicLong(0); + AtomicLong currentEntryWriterEpoch = new AtomicLong(0); + AtomicLong currentEntryNodeId = new AtomicLong(-1); long memorySize = 0; @@ -662,7 +677,15 @@ public boolean hasNext() { Runnable tryToCollectInsertNodeAndBumpIndex = () -> { if (!tmpNodes.get().isEmpty()) { - insertNodes.add(new IndexedConsensusRequest(nextSearchIndex, tmpNodes.get())); + long syncIdx = currentEntrySyncIndex.get(); + IndexedConsensusRequest req = + (syncIdx >= 0) + ? new IndexedConsensusRequest(nextSearchIndex, syncIdx, tmpNodes.get()) + : new IndexedConsensusRequest(nextSearchIndex, tmpNodes.get()); + req.setPhysicalTime(currentEntryPhysicalTime.get()) + .setNodeId((int) currentEntryNodeId.get()) + .setWriterEpoch(currentEntryWriterEpoch.get()); + insertNodes.add(req); tmpNodes.set(new ArrayList<>()); nextSearchIndex++; if (notFirstFile.get()) { @@ -695,6 +718,10 @@ public boolean hasNext() { } else if (currentWalEntryIndex < nextSearchIndex) { // WAL entry is outdated, do nothing, continue to see next WAL entry } else if (currentWalEntryIndex == nextSearchIndex) { + currentEntrySyncIndex.set(walByteBufReader.getCurrentEntrySyncIndex()); + currentEntryPhysicalTime.set(walByteBufReader.getCurrentEntryPhysicalTime()); + currentEntryWriterEpoch.set(walByteBufReader.getCurrentEntryWriterEpoch()); + currentEntryNodeId.set(walByteBufReader.getCurrentEntryNodeId()); if (type == WALEntryType.OBJECT_FILE_NODE) { WALEntry walEntry = WALEntry.deserialize( @@ -723,6 +750,10 @@ public boolean hasNext() { 
currentWalEntryIndex); nextSearchIndex = currentWalEntryIndex; } + currentEntrySyncIndex.set(walByteBufReader.getCurrentEntrySyncIndex()); + currentEntryPhysicalTime.set(walByteBufReader.getCurrentEntryPhysicalTime()); + currentEntryWriterEpoch.set(walByteBufReader.getCurrentEntryWriterEpoch()); + currentEntryNodeId.set(walByteBufReader.getCurrentEntryNodeId()); if (type == WALEntryType.OBJECT_FILE_NODE) { WALEntry walEntry = WALEntry.deserialize( @@ -898,11 +929,71 @@ public long getCurrentWALFileVersion() { return buffer.getCurrentWALFileVersion(); } + public WALMetaData getCurrentWALMetaDataSnapshot() { + return buffer.getCurrentWALMetaDataSnapshot(); + } + @Override public long getTotalSize() { return WALManager.getInstance().getTotalDiskUsage(); } + @Override + public long getRegionDiskUsage() { + return buffer.getDiskUsage(); + } + + @Override + public long getSearchIndexToFreeAtLeast(long bytesToFree) { + if (bytesToFree <= 0) { + return DEFAULT_SAFELY_DELETED_SEARCH_INDEX; + } + File[] walFiles = WALFileUtils.listAllWALFiles(logDirectory); + if (walFiles == null || walFiles.length <= 1) { + // No files or only the current-writing file — cannot free anything + return DEFAULT_SAFELY_DELETED_SEARCH_INDEX; + } + WALFileUtils.ascSortByVersionId(walFiles); + // Exclude the last file (currently being written) + long accumulated = 0; + for (int i = 0; i < walFiles.length - 1; i++) { + accumulated += walFiles[i].length(); + if (accumulated >= bytesToFree) { + // The next file's startSearchIndex is the boundary: everything before it can be deleted + if (i + 1 < walFiles.length) { + return WALFileUtils.parseStartSearchIndex(walFiles[i + 1].getName()); + } + break; + } + } + // Could not free enough even by deleting all non-current files — allow deleting all + return Long.MAX_VALUE; + } + + @Override + public long getVersionIdToFreeAtLeast(long bytesToFree) { + if (bytesToFree <= 0) { + return 0; + } + File[] walFiles = WALFileUtils.listAllWALFiles(logDirectory); + if 
(walFiles == null || walFiles.length <= 1) { + return 0; + } + WALFileUtils.ascSortByVersionId(walFiles); + long accumulated = 0; + for (int i = 0; i < walFiles.length - 1; i++) { + accumulated += walFiles[i].length(); + if (accumulated >= bytesToFree) { + // Return the versionId of the next file — files before it can be freed + if (i + 1 < walFiles.length) { + return WALFileUtils.parseVersionId(walFiles[i + 1].getName()); + } + break; + } + } + return Long.MAX_VALUE; + } + // endregion @Override diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/wal/utils/WALFileUtils.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/wal/utils/WALFileUtils.java index 117f06c764440..6ebc48caa5409 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/wal/utils/WALFileUtils.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/wal/utils/WALFileUtils.java @@ -19,8 +19,18 @@ package org.apache.iotdb.db.storageengine.dataregion.wal.utils; +import org.apache.iotdb.db.queryengine.plan.planner.plan.node.PlanNodeType; +import org.apache.iotdb.db.storageengine.dataregion.wal.buffer.WALEntryType; +import org.apache.iotdb.db.storageengine.dataregion.wal.buffer.WALInfoEntry; +import org.apache.iotdb.db.storageengine.dataregion.wal.io.ProgressWALReader; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + import java.io.File; import java.io.FileNotFoundException; +import java.io.IOException; +import java.nio.ByteBuffer; import java.nio.file.Path; import java.util.Arrays; import java.util.Comparator; @@ -35,6 +45,11 @@ import static org.apache.iotdb.commons.conf.IoTDBConstant.WAL_VERSION_ID; public class WALFileUtils { + + private static final Logger logger = LoggerFactory.getLogger(WALFileUtils.class); + private static final int SEARCH_INDEX_OFFSET = + WALInfoEntry.FIXED_SERIALIZED_SIZE + PlanNodeType.BYTES; + /** * versionId is a 
self-incremented id number, helping to maintain the order of wal files. * startSearchIndex is the valid search index of last flushed wal entry. statusCode is the. For @@ -182,4 +197,306 @@ public static String getTsFileRelativePath(String absolutePath) { Path path = new File(absolutePath).toPath(); return path.subpath(path.getNameCount() - 5, path.getNameCount()).toString(); } + + /** + * Find the earliest local searchIndex strictly after the given compatibility frontier. This + * fallback path is only used when the caller has a coarse (physicalTime, localSeq) pair but no + * writer identity. + */ + public static long findSearchIndexAfterCompatibleProgress( + final File logDir, final long physicalTime, final long localSeq) { + final long[] bestSearchIndex = new long[] {-1L}; + final long[] bestPhysicalTime = new long[] {Long.MAX_VALUE}; + final long[] bestLocalSeq = new long[] {Long.MAX_VALUE}; + final int[] bestNodeId = new int[] {Integer.MAX_VALUE}; + + forEachSealedSearchableRequest( + logDir, + request -> { + if (compareCompatibleProgress( + request.physicalTime, request.nodeId, request.localSeq, physicalTime, localSeq) + <= 0) { + return true; + } + if (bestSearchIndex[0] < 0L + || compareWriterProgress( + request.physicalTime, + request.nodeId, + request.localSeq, + bestPhysicalTime[0], + bestNodeId[0], + bestLocalSeq[0]) + < 0) { + bestSearchIndex[0] = request.searchIndex; + bestPhysicalTime[0] = request.physicalTime; + bestLocalSeq[0] = request.localSeq; + bestNodeId[0] = request.nodeId; + } + return true; + }); + return bestSearchIndex[0]; + } + + /** + * Locate the first local searchIndex whose writer progress is equal to or strictly greater than + * the given writer-local frontier. This is currently used by single-writer recovery paths, so it + * matches only entries from the supplied (nodeId, writerEpoch) pair. 
+ * + * @return [targetSearchIndex, exactMatchFlag], or null if no matching/later entry exists + */ + public static long[] locateByWriterProgress( + final File logDir, + final int nodeId, + final long writerEpoch, + final long physicalTime, + final long localSeq) { + final long[] exactSearchIndex = new long[] {-1L}; + final long[] firstAfterSearchIndex = new long[] {-1L}; + final long[] firstAfterPhysicalTime = new long[] {Long.MAX_VALUE}; + final long[] firstAfterLocalSeq = new long[] {Long.MAX_VALUE}; + + forEachSealedSearchableRequest( + logDir, + request -> { + if (request.nodeId != nodeId || request.writerEpoch != writerEpoch) { + return true; + } + final int cmp = + compareWriterProgress( + request.physicalTime, + request.nodeId, + request.localSeq, + physicalTime, + nodeId, + localSeq); + if (cmp == 0) { + exactSearchIndex[0] = request.searchIndex; + return false; + } + if (cmp > 0 + && (firstAfterSearchIndex[0] < 0L + || compareWriterProgress( + request.physicalTime, + request.nodeId, + request.localSeq, + firstAfterPhysicalTime[0], + nodeId, + firstAfterLocalSeq[0]) + < 0)) { + firstAfterSearchIndex[0] = request.searchIndex; + firstAfterPhysicalTime[0] = request.physicalTime; + firstAfterLocalSeq[0] = request.localSeq; + } + return true; + }); + + if (exactSearchIndex[0] >= 0L) { + return new long[] {exactSearchIndex[0], 1L}; + } + if (firstAfterSearchIndex[0] >= 0L) { + return new long[] {firstAfterSearchIndex[0], 0L}; + } + return null; + } + + public static long findSearchIndexByWriterProgress( + final File logDir, + final int nodeId, + final long writerEpoch, + final long physicalTime, + final long localSeq) { + final long[] located = + locateByWriterProgress(logDir, nodeId, writerEpoch, physicalTime, localSeq); + return located != null && located[1] == 1L ? 
located[0] : -1L; + } + + public static long findSearchIndexAfterWriterProgress( + final File logDir, + final int nodeId, + final long writerEpoch, + final long physicalTime, + final long localSeq) { + final long[] bestSearchIndex = new long[] {-1L}; + final long[] bestPhysicalTime = new long[] {Long.MAX_VALUE}; + final long[] bestLocalSeq = new long[] {Long.MAX_VALUE}; + forEachSealedSearchableRequest( + logDir, + request -> { + if (request.nodeId != nodeId || request.writerEpoch != writerEpoch) { + return true; + } + if (compareWriterProgress( + request.physicalTime, + request.nodeId, + request.localSeq, + physicalTime, + nodeId, + localSeq) + <= 0) { + return true; + } + if (bestSearchIndex[0] < 0L + || compareWriterProgress( + request.physicalTime, + request.nodeId, + request.localSeq, + bestPhysicalTime[0], + nodeId, + bestLocalSeq[0]) + < 0) { + bestSearchIndex[0] = request.searchIndex; + bestPhysicalTime[0] = request.physicalTime; + bestLocalSeq[0] = request.localSeq; + } + return true; + }); + return bestSearchIndex[0]; + } + + private interface SearchableRequestVisitor { + boolean onRequest(SearchableRequestMeta request); + } + + private static final class SearchableRequestMeta { + private final long searchIndex; + private final long physicalTime; + private final int nodeId; + private final long writerEpoch; + private final long localSeq; + + private SearchableRequestMeta( + final long searchIndex, + final long physicalTime, + final int nodeId, + final long writerEpoch, + final long localSeq) { + this.searchIndex = searchIndex; + this.physicalTime = physicalTime; + this.nodeId = nodeId; + this.writerEpoch = writerEpoch; + this.localSeq = localSeq; + } + } + + private static void forEachSealedSearchableRequest( + final File logDir, final SearchableRequestVisitor visitor) { + final File[] walFiles = listSealedWALFiles(logDir); + if (walFiles == null || walFiles.length == 0) { + return; + } + + for (final File walFile : walFiles) { + try (final 
ProgressWALReader reader = new ProgressWALReader(walFile)) { + long pendingSearchIndex = Long.MIN_VALUE; + long pendingPhysicalTime = 0L; + int pendingNodeId = -1; + long pendingWriterEpoch = 0L; + long pendingLocalSeq = Long.MIN_VALUE; + boolean hasPending = false; + + while (reader.hasNext()) { + final ByteBuffer buffer = reader.next(); + final WALEntryType type = WALEntryType.valueOf(buffer.get()); + buffer.clear(); + if (!type.needSearch()) { + continue; + } + + final long currentLocalSeq = reader.getCurrentEntryLocalSeq(); + final long currentPhysicalTime = reader.getCurrentEntryPhysicalTime(); + final int currentNodeId = reader.getCurrentEntryNodeId(); + final long currentWriterEpoch = reader.getCurrentEntryWriterEpoch(); + + buffer.position(SEARCH_INDEX_OFFSET); + final long bodySearchIndex = buffer.getLong(); + buffer.clear(); + final long currentSearchIndex = bodySearchIndex >= 0 ? bodySearchIndex : currentLocalSeq; + + if (hasPending + && pendingLocalSeq == currentLocalSeq + && pendingNodeId == currentNodeId + && pendingWriterEpoch == currentWriterEpoch) { + if (pendingSearchIndex < 0 && currentSearchIndex >= 0) { + pendingSearchIndex = currentSearchIndex; + } + continue; + } + + if (hasPending + && !visitor.onRequest( + new SearchableRequestMeta( + pendingSearchIndex >= 0 ? pendingSearchIndex : pendingLocalSeq, + pendingPhysicalTime, + pendingNodeId, + pendingWriterEpoch, + pendingLocalSeq))) { + return; + } + + hasPending = true; + pendingSearchIndex = currentSearchIndex; + pendingPhysicalTime = currentPhysicalTime; + pendingNodeId = currentNodeId; + pendingWriterEpoch = currentWriterEpoch; + pendingLocalSeq = currentLocalSeq; + } + + if (hasPending + && !visitor.onRequest( + new SearchableRequestMeta( + pendingSearchIndex >= 0 ? 
pendingSearchIndex : pendingLocalSeq, + pendingPhysicalTime, + pendingNodeId, + pendingWriterEpoch, + pendingLocalSeq))) { + return; + } + } catch (final IOException e) { + logger.warn("Failed to scan WAL file {} for searchable request metadata", walFile, e); + } + } + } + + private static int compareCompatibleProgress( + final long leftPhysicalTime, + final int leftNodeId, + final long leftLocalSeq, + final long rightPhysicalTime, + final long rightLocalSeq) { + if (leftPhysicalTime != rightPhysicalTime) { + return Long.compare(leftPhysicalTime, rightPhysicalTime); + } + if (leftLocalSeq != rightLocalSeq) { + return Long.compare(leftLocalSeq, rightLocalSeq); + } + return 0; + } + + private static int compareWriterProgress( + final long leftPhysicalTime, + final int leftNodeId, + final long leftLocalSeq, + final long rightPhysicalTime, + final int rightNodeId, + final long rightLocalSeq) { + if (leftPhysicalTime != rightPhysicalTime) { + return Long.compare(leftPhysicalTime, rightPhysicalTime); + } + if (leftNodeId != rightNodeId) { + return Integer.compare(leftNodeId, rightNodeId); + } + return Long.compare(leftLocalSeq, rightLocalSeq); + } + + private static File[] listSealedWALFiles(final File logDir) { + final File[] walFiles = listAllWALFiles(logDir); + if (walFiles == null || walFiles.length == 0) { + return walFiles; + } + ascSortByVersionId(walFiles); + if (walFiles.length == 1) { + return new File[0]; + } + return Arrays.copyOf(walFiles, walFiles.length - 1); + } } diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/agent/SubscriptionBrokerAgent.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/agent/SubscriptionBrokerAgent.java index 00007f921b260..2a260e6b0c8c0 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/agent/SubscriptionBrokerAgent.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/agent/SubscriptionBrokerAgent.java @@ -19,17 +19,31 @@ 
package org.apache.iotdb.db.subscription.agent; +import org.apache.iotdb.commons.consensus.ConsensusGroupId; +import org.apache.iotdb.consensus.iot.IoTConsensusServerImpl; +import org.apache.iotdb.db.subscription.broker.ConsensusSubscriptionBroker; import org.apache.iotdb.db.subscription.broker.SubscriptionBroker; +import org.apache.iotdb.db.subscription.broker.consensus.ConsensusLogToTabletConverter; +import org.apache.iotdb.db.subscription.broker.consensus.ConsensusRegionRuntimeState; +import org.apache.iotdb.db.subscription.broker.consensus.ConsensusSubscriptionCommitManager; +import org.apache.iotdb.db.subscription.broker.consensus.ConsensusSubscriptionSetupHandler; import org.apache.iotdb.db.subscription.event.SubscriptionEvent; import org.apache.iotdb.db.subscription.resource.SubscriptionDataNodeResourceManager; +import org.apache.iotdb.db.subscription.task.execution.ConsensusSubscriptionPrefetchExecutorManager; import org.apache.iotdb.db.subscription.task.subtask.SubscriptionSinkSubtask; import org.apache.iotdb.rpc.subscription.config.ConsumerConfig; import org.apache.iotdb.rpc.subscription.exception.SubscriptionException; +import org.apache.iotdb.rpc.subscription.payload.poll.RegionProgress; import org.apache.iotdb.rpc.subscription.payload.poll.SubscriptionCommitContext; +import org.apache.iotdb.rpc.subscription.payload.poll.TopicProgress; +import org.apache.iotdb.rpc.subscription.payload.request.PipeSubscribeSeekReq; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import java.io.IOException; +import java.nio.ByteBuffer; +import java.util.ArrayList; import java.util.Collections; import java.util.List; import java.util.Map; @@ -43,7 +57,12 @@ public class SubscriptionBrokerAgent { private static final Logger LOGGER = LoggerFactory.getLogger(SubscriptionBrokerAgent.class); - private final Map consumerGroupIdToSubscriptionBroker = + /** Pipe-based subscription brokers, one per consumer group. 
*/ + private final Map consumerGroupIdToPipeBroker = + new ConcurrentHashMap<>(); + + /** Consensus-based subscription brokers, one per consumer group. */ + private final Map consumerGroupIdToConsensusBroker = new ConcurrentHashMap<>(); private final Cache prefetchingQueueCount = @@ -53,18 +72,64 @@ public class SubscriptionBrokerAgent { public List poll( final ConsumerConfig consumerConfig, final Set topicNames, final long maxBytes) { + return poll(consumerConfig, topicNames, maxBytes, Collections.emptyMap()); + } + + public List poll( + final ConsumerConfig consumerConfig, + final Set topicNames, + final long maxBytes, + final Map progressByTopic) { final String consumerGroupId = consumerConfig.getConsumerGroupId(); - final SubscriptionBroker broker = consumerGroupIdToSubscriptionBroker.get(consumerGroupId); - if (Objects.isNull(broker)) { + final String consumerId = consumerConfig.getConsumerId(); + final List allEvents = new ArrayList<>(); + long remainingBytes = maxBytes; + + // Poll from pipe-based broker + final SubscriptionBroker pipeBroker = consumerGroupIdToPipeBroker.get(consumerGroupId); + if (Objects.nonNull(pipeBroker)) { + final List pipeEvents = + pipeBroker.poll(consumerId, topicNames, remainingBytes); + allEvents.addAll(pipeEvents); + for (final SubscriptionEvent event : pipeEvents) { + try { + remainingBytes -= event.getCurrentResponseSize(); + } catch (final IOException ignored) { + // best effort + } + } + } + + // Poll from consensus-based broker + if (remainingBytes > 0) { + final ConsensusSubscriptionBroker consensusBroker = + consumerGroupIdToConsensusBroker.get(consumerGroupId); + if (Objects.nonNull(consensusBroker)) { + LOGGER.debug( + "SubscriptionBrokerAgent: polling consensus broker for consumer group [{}], " + + "topicNames={}, remainingBytes={}", + consumerGroupId, + topicNames, + remainingBytes); + allEvents.addAll( + consensusBroker.poll(consumerId, topicNames, remainingBytes, progressByTopic)); + } else { + LOGGER.debug( + 
"SubscriptionBrokerAgent: no consensus broker for consumer group [{}]", + consumerGroupId); + } + } + + if (allEvents.isEmpty() + && Objects.isNull(pipeBroker) + && Objects.isNull(consumerGroupIdToConsensusBroker.get(consumerGroupId))) { final String errorMessage = - String.format( - "Subscription: broker bound to consumer group [%s] does not exist", consumerGroupId); + String.format("Subscription: no broker bound to consumer group [%s]", consumerGroupId); LOGGER.warn(errorMessage); throw new SubscriptionException(errorMessage); } - // TODO: currently we fetch messages from all topics - final String consumerId = consumerConfig.getConsumerId(); - return broker.poll(consumerId, topicNames, maxBytes); + + return allEvents; } public List pollTsFile( @@ -72,16 +137,18 @@ public List pollTsFile( final SubscriptionCommitContext commitContext, final long writingOffset) { final String consumerGroupId = consumerConfig.getConsumerGroupId(); - final SubscriptionBroker broker = consumerGroupIdToSubscriptionBroker.get(consumerGroupId); - if (Objects.isNull(broker)) { + // TsFile polling can only be called by pipe-based subscriptions + final SubscriptionBroker pipeBroker = consumerGroupIdToPipeBroker.get(consumerGroupId); + if (Objects.isNull(pipeBroker)) { final String errorMessage = String.format( - "Subscription: broker bound to consumer group [%s] does not exist", consumerGroupId); + "Subscription: pipe broker bound to consumer group [%s] does not exist", + consumerGroupId); LOGGER.warn(errorMessage); throw new SubscriptionException(errorMessage); } final String consumerId = consumerConfig.getConsumerId(); - return broker.pollTsFile(consumerId, commitContext, writingOffset); + return pipeBroker.pollTsFile(consumerId, commitContext, writingOffset); } public List pollTablets( @@ -89,16 +156,26 @@ public List pollTablets( final SubscriptionCommitContext commitContext, final int offset) { final String consumerGroupId = consumerConfig.getConsumerGroupId(); - final 
SubscriptionBroker broker = consumerGroupIdToSubscriptionBroker.get(consumerGroupId); - if (Objects.isNull(broker)) { + final String consumerId = consumerConfig.getConsumerId(); + final String topicName = commitContext.getTopicName(); + + // Try consensus-based broker first + final ConsensusSubscriptionBroker consensusBroker = + consumerGroupIdToConsensusBroker.get(consumerGroupId); + if (Objects.nonNull(consensusBroker) && consensusBroker.hasQueue(topicName)) { + return consensusBroker.pollTablets(consumerId, commitContext, offset); + } + + // Fall back to pipe-based broker + final SubscriptionBroker pipeBroker = consumerGroupIdToPipeBroker.get(consumerGroupId); + if (Objects.isNull(pipeBroker)) { final String errorMessage = String.format( "Subscription: broker bound to consumer group [%s] does not exist", consumerGroupId); LOGGER.warn(errorMessage); throw new SubscriptionException(errorMessage); } - final String consumerId = consumerConfig.getConsumerId(); - return broker.pollTablets(consumerId, commitContext, offset); + return pipeBroker.pollTablets(consumerId, commitContext, offset); } /** @@ -109,46 +186,190 @@ public List commit( final List commitContexts, final boolean nack) { final String consumerGroupId = consumerConfig.getConsumerGroupId(); - final SubscriptionBroker broker = consumerGroupIdToSubscriptionBroker.get(consumerGroupId); - if (Objects.isNull(broker)) { + final String consumerId = consumerConfig.getConsumerId(); + final List allSuccessful = new ArrayList<>(); + + final SubscriptionBroker pipeBroker = consumerGroupIdToPipeBroker.get(consumerGroupId); + final ConsensusSubscriptionBroker consensusBroker = + consumerGroupIdToConsensusBroker.get(consumerGroupId); + + if (Objects.isNull(pipeBroker) && Objects.isNull(consensusBroker)) { + final String errorMessage = + String.format("Subscription: no broker bound to consumer group [%s]", consumerGroupId); + LOGGER.warn(errorMessage); + throw new SubscriptionException(errorMessage); + } + + // Partition 
commit contexts by which broker owns the topic. + final List pipeContexts = new ArrayList<>(); + final List consensusContexts = new ArrayList<>(); + for (final SubscriptionCommitContext ctx : commitContexts) { + final String topicName = ctx.getTopicName(); + if (Objects.nonNull(consensusBroker) + && ConsensusSubscriptionSetupHandler.isConsensusBasedTopic(topicName)) { + consensusContexts.add(ctx); + } else { + pipeContexts.add(ctx); + } + } + + if (Objects.nonNull(pipeBroker) && !pipeContexts.isEmpty()) { + allSuccessful.addAll(pipeBroker.commit(consumerId, pipeContexts, nack)); + } + if (Objects.nonNull(consensusBroker) && !consensusContexts.isEmpty()) { + allSuccessful.addAll(consensusBroker.commit(consumerId, consensusContexts, nack)); + } + + return allSuccessful; + } + + public void seek( + final ConsumerConfig consumerConfig, final String topicName, final short seekType) { + final String consumerGroupId = consumerConfig.getConsumerGroupId(); + + final ConsensusSubscriptionBroker consensusBroker = + consumerGroupIdToConsensusBroker.get(consumerGroupId); + if (Objects.nonNull(consensusBroker) && consensusBroker.hasQueue(topicName)) { + ensureConsensusSeekRuntimeAvailable(consumerGroupId, topicName, "seek"); + if (seekType != PipeSubscribeSeekReq.SEEK_TO_BEGINNING + && seekType != PipeSubscribeSeekReq.SEEK_TO_END) { + final String errorMessage = + String.format( + "Subscription: consensus seek only supports beginning/end or topic progress, " + + "consumerGroup=%s, topic=%s, seekType=%s", + consumerGroupId, topicName, seekType); + LOGGER.warn(errorMessage); + throw new SubscriptionException(errorMessage); + } + consensusBroker.seek(topicName, seekType); + return; + } + + final String errorMessage = + String.format( + "Subscription: seek is only supported for consensus-based subscriptions, " + + "consumerGroup=%s, topic=%s", + consumerGroupId, topicName); + LOGGER.warn(errorMessage); + throw new SubscriptionException(errorMessage); + } + + public void 
seekToTopicProgress( + final ConsumerConfig consumerConfig, + final String topicName, + final TopicProgress topicProgress) { + final String consumerGroupId = consumerConfig.getConsumerGroupId(); + + final ConsensusSubscriptionBroker consensusBroker = + consumerGroupIdToConsensusBroker.get(consumerGroupId); + if (Objects.nonNull(consensusBroker) && consensusBroker.hasQueue(topicName)) { + ensureConsensusSeekRuntimeAvailable(consumerGroupId, topicName, "seek(topicProgress)"); + consensusBroker.seek(topicName, topicProgress); + return; + } + + final String errorMessage = + String.format( + "Subscription: seek(topicProgress) is only supported for consensus-based subscriptions, " + + "consumerGroup=%s, topic=%s", + consumerGroupId, topicName); + LOGGER.warn(errorMessage); + throw new SubscriptionException(errorMessage); + } + + public void seekAfterTopicProgress( + final ConsumerConfig consumerConfig, + final String topicName, + final TopicProgress topicProgress) { + final String consumerGroupId = consumerConfig.getConsumerGroupId(); + + final ConsensusSubscriptionBroker consensusBroker = + consumerGroupIdToConsensusBroker.get(consumerGroupId); + if (Objects.nonNull(consensusBroker) && consensusBroker.hasQueue(topicName)) { + ensureConsensusSeekRuntimeAvailable(consumerGroupId, topicName, "seekAfter(topicProgress)"); + consensusBroker.seekAfter(topicName, topicProgress); + return; + } + + final String errorMessage = + String.format( + "Subscription: seekAfter(topicProgress) is only supported for consensus-based subscriptions, " + + "consumerGroup=%s, topic=%s", + consumerGroupId, topicName); + LOGGER.warn(errorMessage); + throw new SubscriptionException(errorMessage); + } + + private void ensureConsensusSeekRuntimeAvailable( + final String consumerGroupId, final String topicName, final String operation) { + if (!ConsensusSubscriptionPrefetchExecutorManager.getInstance().isStarted() + || SubscriptionAgent.runtime().isShutdown()) { final String errorMessage = 
String.format( - "Subscription: broker bound to consumer group [%s] does not exist", consumerGroupId); + "Subscription: consensus %s is unavailable because subscription runtime is stopped, " + + "consumerGroup=%s, topic=%s", + operation, consumerGroupId, topicName); LOGGER.warn(errorMessage); throw new SubscriptionException(errorMessage); } - final String consumerId = consumerConfig.getConsumerId(); - return broker.commit(consumerId, commitContexts, nack); } public boolean isCommitContextOutdated(final SubscriptionCommitContext commitContext) { final String consumerGroupId = commitContext.getConsumerGroupId(); - final SubscriptionBroker broker = consumerGroupIdToSubscriptionBroker.get(consumerGroupId); - if (Objects.isNull(broker)) { + final String topicName = commitContext.getTopicName(); + + // Try consensus broker first + final ConsensusSubscriptionBroker consensusBroker = + consumerGroupIdToConsensusBroker.get(consumerGroupId); + if (Objects.nonNull(consensusBroker) && consensusBroker.hasQueue(topicName)) { + return consensusBroker.isCommitContextOutdated(commitContext); + } + + // Fall back to pipe broker + final SubscriptionBroker pipeBroker = consumerGroupIdToPipeBroker.get(consumerGroupId); + if (Objects.isNull(pipeBroker)) { return true; } - return broker.isCommitContextOutdated(commitContext); + return pipeBroker.isCommitContextOutdated(commitContext); } public List fetchTopicNamesToUnsubscribe( final ConsumerConfig consumerConfig, final Set topicNames) { final String consumerGroupId = consumerConfig.getConsumerGroupId(); - final SubscriptionBroker broker = consumerGroupIdToSubscriptionBroker.get(consumerGroupId); - if (Objects.isNull(broker)) { + + // Consensus-based subscription topics are unbounded streams, so they do not trigger + // auto-unsubscribe. 
+ final ConsensusSubscriptionBroker consensusBroker = + consumerGroupIdToConsensusBroker.get(consumerGroupId); + final Set pipeOnlyTopicNames; + if (Objects.nonNull(consensusBroker)) { + pipeOnlyTopicNames = new java.util.HashSet<>(topicNames); + pipeOnlyTopicNames.removeIf(consensusBroker::hasQueue); + } else { + pipeOnlyTopicNames = topicNames; + } + + if (pipeOnlyTopicNames.isEmpty()) { + return Collections.emptyList(); + } + + final SubscriptionBroker pipeBroker = consumerGroupIdToPipeBroker.get(consumerGroupId); + if (Objects.isNull(pipeBroker)) { return Collections.emptyList(); } - return broker.fetchTopicNamesToUnsubscribe(topicNames); + return pipeBroker.fetchTopicNamesToUnsubscribe(pipeOnlyTopicNames); } /////////////////////////////// broker /////////////////////////////// public boolean isBrokerExist(final String consumerGroupId) { - return consumerGroupIdToSubscriptionBroker.containsKey(consumerGroupId); + return consumerGroupIdToPipeBroker.containsKey(consumerGroupId) + || consumerGroupIdToConsensusBroker.containsKey(consumerGroupId); } public void createBrokerIfNotExist(final String consumerGroupId) { - consumerGroupIdToSubscriptionBroker.computeIfAbsent(consumerGroupId, SubscriptionBroker::new); - LOGGER.info("Subscription: create broker bound to consumer group [{}]", consumerGroupId); + consumerGroupIdToPipeBroker.computeIfAbsent(consumerGroupId, SubscriptionBroker::new); + LOGGER.info("Subscription: create pipe broker bound to consumer group [{}]", consumerGroupId); } /** @@ -156,26 +377,46 @@ public void createBrokerIfNotExist(final String consumerGroupId) { */ public boolean dropBroker(final String consumerGroupId) { final AtomicBoolean dropped = new AtomicBoolean(false); - consumerGroupIdToSubscriptionBroker.compute( + + // Drop pipe broker + consumerGroupIdToPipeBroker.compute( consumerGroupId, (id, broker) -> { if (Objects.isNull(broker)) { + dropped.set(true); + return null; + } + if (!broker.isEmpty()) { LOGGER.warn( - "Subscription: broker 
bound to consumer group [{}] does not exist", + "Subscription: pipe broker bound to consumer group [{}] is not empty when dropping", consumerGroupId); - dropped.set(true); + return broker; + } + dropped.set(true); + LOGGER.info( + "Subscription: drop pipe broker bound to consumer group [{}]", consumerGroupId); + return null; + }); + + // Drop consensus broker + consumerGroupIdToConsensusBroker.compute( + consumerGroupId, + (id, broker) -> { + if (Objects.isNull(broker)) { return null; } if (!broker.isEmpty()) { LOGGER.warn( - "Subscription: broker bound to consumer group [{}] is not empty when dropping", + "Subscription: consensus broker bound to consumer group [{}] is not empty when dropping", consumerGroupId); return broker; } dropped.set(true); - LOGGER.info("Subscription: drop broker bound to consumer group [{}]", consumerGroupId); - return null; // remove this entry + LOGGER.info( + "Subscription: drop consensus broker bound to consumer group [{}]", consumerGroupId); + return null; }); + return dropped.get(); } @@ -183,15 +424,14 @@ public boolean dropBroker(final String consumerGroupId) { public void bindPrefetchingQueue(final SubscriptionSinkSubtask subtask) { final String consumerGroupId = subtask.getConsumerGroupId(); - consumerGroupIdToSubscriptionBroker + consumerGroupIdToPipeBroker .compute( consumerGroupId, (id, broker) -> { if (Objects.isNull(broker)) { LOGGER.info( - "Subscription: broker bound to consumer group [{}] does not exist, create new for binding prefetching queue", + "Subscription: pipe broker bound to consumer group [{}] does not exist, create new for binding prefetching queue", consumerGroupId); - // TODO: consider more robust metadata semantics return new SubscriptionBroker(consumerGroupId); } return broker; @@ -200,41 +440,181 @@ public void bindPrefetchingQueue(final SubscriptionSinkSubtask subtask) { prefetchingQueueCount.invalidate(); } - public void updateCompletedTopicNames(final String consumerGroupId, final String topicName) { - 
final SubscriptionBroker broker = consumerGroupIdToSubscriptionBroker.get(consumerGroupId); + public void bindConsensusPrefetchingQueue( + final String consumerGroupId, + final String topicName, + final String orderMode, + final ConsensusGroupId consensusGroupId, + final IoTConsensusServerImpl serverImpl, + final ConsensusLogToTabletConverter converter, + final ConsensusSubscriptionCommitManager commitManager, + final RegionProgress fallbackCommittedRegionProgress, + final long tailStartSearchIndex, + final long initialRuntimeVersion, + final boolean initialActive) { + consumerGroupIdToConsensusBroker + .compute( + consumerGroupId, + (id, broker) -> { + if (Objects.isNull(broker)) { + LOGGER.info( + "Subscription: consensus broker bound to consumer group [{}] does not exist, create new for binding consensus prefetching queue", + consumerGroupId); + return new ConsensusSubscriptionBroker(consumerGroupId); + } + return broker; + }) + .bindConsensusPrefetchingQueue( + topicName, + orderMode, + consensusGroupId, + serverImpl, + converter, + commitManager, + fallbackCommittedRegionProgress, + tailStartSearchIndex, + initialRuntimeVersion, + initialActive); + prefetchingQueueCount.invalidate(); + } + + public void refreshConsensusQueueOrderMode(final String topicName, final String orderMode) { + LOGGER.info( + "SubscriptionBrokerAgent: refreshing consensus queue order-mode for topic [{}] to [{}]", + topicName, + orderMode); + for (final ConsensusSubscriptionBroker broker : consumerGroupIdToConsensusBroker.values()) { + broker.refreshConsensusQueueOrderMode(topicName, orderMode); + } + } + + public void unbindConsensusPrefetchingQueue( + final String consumerGroupId, final String topicName) { + final ConsensusSubscriptionBroker broker = + consumerGroupIdToConsensusBroker.get(consumerGroupId); if (Objects.isNull(broker)) { LOGGER.warn( - "Subscription: broker bound to consumer group [{}] does not exist", consumerGroupId); + "Subscription: consensus broker bound to consumer 
group [{}] does not exist", + consumerGroupId); return; } - broker.updateCompletedTopicNames(topicName); + broker.unbindConsensusPrefetchingQueue(topicName); + prefetchingQueueCount.invalidate(); + } + + public void unbindByRegion(final ConsensusGroupId regionId) { + int totalClosed = 0; + for (final ConsensusSubscriptionBroker broker : consumerGroupIdToConsensusBroker.values()) { + totalClosed += broker.unbindByRegion(regionId); + } + if (totalClosed > 0) { + prefetchingQueueCount.invalidate(); + LOGGER.info( + "Subscription: unbound {} consensus prefetching queue(s) for removed region [{}]", + totalClosed, + regionId); + } + } + + /** + * Activates or deactivates all consensus prefetching queues bound to {@code regionId} across all + * consumer groups. Called on leader migration to ensure only the preferred writer serves + * subscription data. + */ + public void setActiveForRegion(final ConsensusGroupId regionId, final boolean active) { + LOGGER.info( + "SubscriptionBrokerAgent: setActiveForRegion regionId={}, active={}", regionId, active); + for (final ConsensusSubscriptionBroker broker : consumerGroupIdToConsensusBroker.values()) { + broker.setActiveForRegion(regionId, active); + } + } + + public void setActiveWritersForRegion( + final ConsensusGroupId regionId, final Set activeWriterNodeIds) { + LOGGER.info( + "SubscriptionBrokerAgent: setActiveWritersForRegion regionId={}, activeWriterNodeIds={}", + regionId, + activeWriterNodeIds); + for (final ConsensusSubscriptionBroker broker : consumerGroupIdToConsensusBroker.values()) { + broker.setActiveWritersForRegion(regionId, activeWriterNodeIds); + } + } + + public void applyRuntimeStateForRegion( + final ConsensusGroupId regionId, final ConsensusRegionRuntimeState runtimeState) { + LOGGER.info( + "SubscriptionBrokerAgent: applyRuntimeStateForRegion regionId={}, runtimeState={}", + regionId, + runtimeState); + for (final ConsensusSubscriptionBroker broker : consumerGroupIdToConsensusBroker.values()) { + 
broker.applyRuntimeStateForRegion(regionId, runtimeState); + } + } + + public void abortConsensusPendingSeeksForRuntimeStop() { + for (final ConsensusSubscriptionBroker broker : consumerGroupIdToConsensusBroker.values()) { + broker.abortPendingSeeksForRuntimeStop(); + } + } + + public void updateCompletedTopicNames(final String consumerGroupId, final String topicName) { + final SubscriptionBroker pipeBroker = consumerGroupIdToPipeBroker.get(consumerGroupId); + if (Objects.isNull(pipeBroker)) { + LOGGER.warn( + "Subscription: pipe broker bound to consumer group [{}] does not exist", consumerGroupId); + return; + } + pipeBroker.updateCompletedTopicNames(topicName); } public void unbindPrefetchingQueue(final String consumerGroupId, final String topicName) { - final SubscriptionBroker broker = consumerGroupIdToSubscriptionBroker.get(consumerGroupId); - if (Objects.isNull(broker)) { + // Try consensus broker first + final ConsensusSubscriptionBroker consensusBroker = + consumerGroupIdToConsensusBroker.get(consumerGroupId); + if (Objects.nonNull(consensusBroker) && consensusBroker.hasQueue(topicName)) { + consensusBroker.removeQueue(topicName); + prefetchingQueueCount.invalidate(); + return; + } + // Fall back to pipe broker + final SubscriptionBroker pipeBroker = consumerGroupIdToPipeBroker.get(consumerGroupId); + if (Objects.isNull(pipeBroker)) { LOGGER.warn( "Subscription: broker bound to consumer group [{}] does not exist", consumerGroupId); return; } - broker.unbindPrefetchingQueue(topicName); + pipeBroker.unbindPrefetchingQueue(topicName); prefetchingQueueCount.invalidate(); } public void removePrefetchingQueue(final String consumerGroupId, final String topicName) { - final SubscriptionBroker broker = consumerGroupIdToSubscriptionBroker.get(consumerGroupId); - if (Objects.isNull(broker)) { + // Try consensus broker + final ConsensusSubscriptionBroker consensusBroker = + consumerGroupIdToConsensusBroker.get(consumerGroupId); + if (Objects.nonNull(consensusBroker) && 
consensusBroker.hasQueue(topicName)) { + consensusBroker.removeQueue(topicName); + prefetchingQueueCount.invalidate(); + return; + } + // Fall back to pipe broker + final SubscriptionBroker pipeBroker = consumerGroupIdToPipeBroker.get(consumerGroupId); + if (Objects.isNull(pipeBroker)) { LOGGER.warn( "Subscription: broker bound to consumer group [{}] does not exist", consumerGroupId); return; } - broker.removePrefetchingQueue(topicName); + pipeBroker.removePrefetchingQueue(topicName); prefetchingQueueCount.invalidate(); } public boolean executePrefetch(final String consumerGroupId, final String topicName) { - final SubscriptionBroker broker = consumerGroupIdToSubscriptionBroker.get(consumerGroupId); - if (Objects.isNull(broker)) { + if (ConsensusSubscriptionSetupHandler.isConsensusBasedTopic(topicName)) { + return false; + } + + // Fall back to pipe broker + final SubscriptionBroker pipeBroker = consumerGroupIdToPipeBroker.get(consumerGroupId); + if (Objects.isNull(pipeBroker)) { SubscriptionDataNodeResourceManager.log() .schedule(SubscriptionBrokerAgent.class, consumerGroupId, topicName) .ifPresent( @@ -244,27 +624,81 @@ public boolean executePrefetch(final String consumerGroupId, final String topicN consumerGroupId)); return false; } - return broker.executePrefetch(topicName); + return pipeBroker.executePrefetch(topicName); } public int getPipeEventCount(final String consumerGroupId, final String topicName) { - final SubscriptionBroker broker = consumerGroupIdToSubscriptionBroker.get(consumerGroupId); - if (Objects.isNull(broker)) { + // Try consensus broker first + final ConsensusSubscriptionBroker consensusBroker = + consumerGroupIdToConsensusBroker.get(consumerGroupId); + if (Objects.nonNull(consensusBroker) && consensusBroker.hasQueue(topicName)) { + return consensusBroker.getEventCount(topicName); + } + // Fall back to pipe broker + final SubscriptionBroker pipeBroker = consumerGroupIdToPipeBroker.get(consumerGroupId); + if (Objects.isNull(pipeBroker)) { 
LOGGER.warn( "Subscription: broker bound to consumer group [{}] does not exist", consumerGroupId); return 0; } - return broker.getPipeEventCount(topicName); + return pipeBroker.getPipeEventCount(topicName); } public int getPrefetchingQueueCount() { return prefetchingQueueCount.get(); } + public Map getConsensusLagSummary() { + final Map result = new ConcurrentHashMap<>(); + for (final Map.Entry entry : + consumerGroupIdToConsensusBroker.entrySet()) { + final String groupId = entry.getKey(); + for (final Map.Entry lag : entry.getValue().getLagSummary().entrySet()) { + result.put(groupId + "/" + lag.getKey(), lag.getValue()); + } + } + return result; + } + private int getPrefetchingQueueCountInternal() { - return consumerGroupIdToSubscriptionBroker.values().stream() - .map(SubscriptionBroker::getPrefetchingQueueCount) - .reduce(0, Integer::sum); + int count = + consumerGroupIdToPipeBroker.values().stream() + .map(SubscriptionBroker::getPrefetchingQueueCount) + .reduce(0, Integer::sum); + count += + consumerGroupIdToConsensusBroker.values().stream() + .map(ConsensusSubscriptionBroker::getQueueCount) + .reduce(0, Integer::sum); + return count; + } + + /////////////////////////////// Commit Progress /////////////////////////////// + + public Map collectAllRegionCommitProgress(final int dataNodeId) { + return ConsensusSubscriptionCommitManager.getInstance().collectAllRegionProgress(dataNodeId); + } + + /** + * Receives a committed progress broadcast from another DataNode (Leader → Follower). Delegates to + * CommitManager to update local progress state. 
+ */ + public void receiveSubscriptionProgress( + final String consumerGroupId, + final String topicName, + final String regionId, + final long physicalTime, + final long localSeq, + final int writerNodeId, + final long writerEpoch) { + ConsensusSubscriptionCommitManager.getInstance() + .receiveProgressBroadcast( + consumerGroupId, + topicName, + regionId, + physicalTime, + localSeq, + writerNodeId, + writerEpoch); } /////////////////////////////// Cache /////////////////////////////// @@ -272,8 +706,9 @@ private int getPrefetchingQueueCountInternal() { /** * A simple generic cache that computes and stores a value on demand. * - *

Note that since the get() and invalidate() methods are not modified with synchronized, the - * value obtained may not be entirely accurate. + *

Both {@code value} and {@code valid} are volatile to ensure visibility across threads. The + * {@code get()} method uses a local snapshot of {@code valid} to avoid double-read reordering. + * Concurrent recomputation by multiple threads is benign (idempotent supplier). * * @param the type of the cached value */ @@ -304,8 +739,10 @@ private void invalidate() { */ private T get() { if (!valid) { - value = supplier.get(); + final T computed = supplier.get(); + value = computed; valid = true; + return computed; } return value; } diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/agent/SubscriptionConsumerAgent.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/agent/SubscriptionConsumerAgent.java index 4ee6b191a2478..8dba7812c9028 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/agent/SubscriptionConsumerAgent.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/agent/SubscriptionConsumerAgent.java @@ -21,6 +21,7 @@ import org.apache.iotdb.commons.subscription.meta.consumer.ConsumerGroupMeta; import org.apache.iotdb.commons.subscription.meta.consumer.ConsumerGroupMetaKeeper; +import org.apache.iotdb.db.subscription.broker.consensus.ConsensusSubscriptionSetupHandler; import org.apache.iotdb.mpp.rpc.thrift.TPushConsumerGroupMetaRespExceptionMessage; import org.apache.iotdb.rpc.subscription.exception.SubscriptionException; @@ -131,11 +132,34 @@ private void handleSingleConsumerGroupMetaChangesInternal( for (final String topicName : topicsUnsubByGroup) { SubscriptionAgent.broker().removePrefetchingQueue(consumerGroupId, topicName); } + // Tear down consensus-based subscriptions for unsubscribed topics + if (!topicsUnsubByGroup.isEmpty()) { + ConsensusSubscriptionSetupHandler.teardownConsensusSubscriptions( + consumerGroupId, topicsUnsubByGroup); + } + + // Detect newly subscribed topics (present in new meta but not in old meta) + final Set newlySubscribedTopics = + 
ConsumerGroupMeta.getTopicsNewlySubByGroup(metaInAgent, metaFromCoordinator); + + LOGGER.info( + "Subscription: consumer group [{}] meta change detected, " + + "topicsUnsubByGroup={}, newlySubscribedTopics={}", + consumerGroupId, + topicsUnsubByGroup, + newlySubscribedTopics); // TODO: Currently we fully replace the entire ConsumerGroupMeta without carefully checking the // changes in its fields. consumerGroupMetaKeeper.removeConsumerGroupMeta(consumerGroupId); consumerGroupMetaKeeper.addConsumerGroupMeta(consumerGroupId, metaFromCoordinator); + + // Set up consensus-based subscription for newly subscribed live-mode topics. + // This must happen after the meta is updated so that the broker can find the topic config. + if (!newlySubscribedTopics.isEmpty()) { + ConsensusSubscriptionSetupHandler.handleNewSubscriptions( + consumerGroupId, newlySubscribedTopics); + } } public TPushConsumerGroupMetaRespExceptionMessage handleConsumerGroupMetaChanges( @@ -221,4 +245,24 @@ public Set getTopicNamesSubscribedByConsumer( releaseReadLock(); } } + + /** + * Get all active subscriptions: consumerGroupId → set of subscribed topic names. Used by + * consensus subscription auto-binding when a new DataRegion is created. 
+ */ + public java.util.Map> getAllSubscriptions() { + acquireReadLock(); + try { + final java.util.Map> result = new java.util.HashMap<>(); + for (final ConsumerGroupMeta meta : consumerGroupMetaKeeper.getAllConsumerGroupMeta()) { + final Set topics = meta.getSubscribedTopicNames(); + if (!topics.isEmpty()) { + result.put(meta.getConsumerGroupId(), new java.util.HashSet<>(topics)); + } + } + return result; + } finally { + releaseReadLock(); + } + } } diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/agent/SubscriptionRuntimeAgent.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/agent/SubscriptionRuntimeAgent.java index aec165684635a..e942453f7bd6c 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/agent/SubscriptionRuntimeAgent.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/agent/SubscriptionRuntimeAgent.java @@ -23,6 +23,7 @@ import org.apache.iotdb.commons.service.IService; import org.apache.iotdb.commons.service.ServiceType; import org.apache.iotdb.commons.subscription.config.SubscriptionConfig; +import org.apache.iotdb.db.subscription.task.execution.ConsensusSubscriptionPrefetchExecutorManager; import java.util.concurrent.atomic.AtomicBoolean; @@ -67,6 +68,7 @@ public void start() throws StartupException { } SubscriptionConfig.getInstance().printAllConfigs(); + ConsensusSubscriptionPrefetchExecutorManager.getInstance().start(); SubscriptionAgentLauncher.launchSubscriptionTopicAgent(); SubscriptionAgentLauncher.launchSubscriptionConsumerAgent(); @@ -80,8 +82,9 @@ public void stop() { return; } isShutdown.set(true); - - // let PipeDataNodeRuntimeAgent to drop all related pipe tasks + SubscriptionAgent.broker().abortConsensusPendingSeeksForRuntimeStop(); + ConsensusSubscriptionPrefetchExecutorManager.getInstance().stop(); + SubscriptionAgent.broker().abortConsensusPendingSeeksForRuntimeStop(); } public boolean isShutdown() { diff --git 
a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/agent/SubscriptionTopicAgent.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/agent/SubscriptionTopicAgent.java index 37cdaa72690be..0f724da7afff0 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/agent/SubscriptionTopicAgent.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/agent/SubscriptionTopicAgent.java @@ -88,6 +88,8 @@ private void handleSingleTopicMetaChangesInternal(final TopicMeta metaFromCoordi final String topicName = metaFromCoordinator.getTopicName(); topicMetaKeeper.removeTopicMeta(topicName); topicMetaKeeper.addTopicMeta(topicName, metaFromCoordinator); + SubscriptionAgent.broker() + .refreshConsensusQueueOrderMode(topicName, metaFromCoordinator.getConfig().getOrderMode()); } public TPushTopicMetaRespExceptionMessage handleTopicMetaChanges( @@ -174,6 +176,15 @@ public String getTopicMode(final String topicName) { } } + public String getTopicOrderMode(final String topicName) { + acquireReadLock(); + try { + return topicMetaKeeper.getTopicMeta(topicName).getConfig().getOrderMode(); + } finally { + releaseReadLock(); + } + } + public Map getTopicConfigs(final Set topicNames) { acquireReadLock(); try { diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/ConsensusSubscriptionBroker.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/ConsensusSubscriptionBroker.java new file mode 100644 index 0000000000000..e0768a31f3ad2 --- /dev/null +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/ConsensusSubscriptionBroker.java @@ -0,0 +1,798 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.iotdb.db.subscription.broker; + +import org.apache.iotdb.commons.consensus.ConsensusGroupId; +import org.apache.iotdb.commons.subscription.config.SubscriptionConfig; +import org.apache.iotdb.consensus.iot.IoTConsensusServerImpl; +import org.apache.iotdb.db.subscription.broker.consensus.ConsensusLogToTabletConverter; +import org.apache.iotdb.db.subscription.broker.consensus.ConsensusPrefetchingQueue; +import org.apache.iotdb.db.subscription.broker.consensus.ConsensusRegionRuntimeState; +import org.apache.iotdb.db.subscription.broker.consensus.ConsensusSubscriptionCommitManager; +import org.apache.iotdb.db.subscription.event.SubscriptionEvent; +import org.apache.iotdb.rpc.subscription.payload.poll.RegionProgress; +import org.apache.iotdb.rpc.subscription.payload.poll.SubscriptionCommitContext; +import org.apache.iotdb.rpc.subscription.payload.poll.TopicProgress; +import org.apache.iotdb.rpc.subscription.payload.request.PipeSubscribeSeekReq; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collections; +import java.util.Comparator; +import java.util.LinkedHashSet; +import java.util.List; +import java.util.Map; +import java.util.Objects; +import java.util.Set; +import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.CopyOnWriteArrayList; +import 
java.util.concurrent.atomic.AtomicInteger; +import java.util.stream.Collectors; + +/** + * Consensus-based subscription broker that reads data directly from IoTConsensus WAL. Each instance + * manages consensus prefetching queues for a single consumer group. + */ +public class ConsensusSubscriptionBroker implements ISubscriptionBroker { + + private static final Logger LOGGER = LoggerFactory.getLogger(ConsensusSubscriptionBroker.class); + + private final String brokerId; // consumer group id + + /** Maps topic name to a list of ConsensusPrefetchingQueues, one per data region. */ + private final Map> topicNameToConsensusPrefetchingQueues; + + /** Round-robin counter for fair polling among region queues already assigned to this consumer. */ + private final AtomicInteger pollRoundRobinIndex = new AtomicInteger(0); + + private final Map> topicConsumerLastPollMs = + new ConcurrentHashMap<>(); + + private final Map topicOwnershipSnapshots = + new ConcurrentHashMap<>(); + + public ConsensusSubscriptionBroker(final String brokerId) { + this.brokerId = brokerId; + this.topicNameToConsensusPrefetchingQueues = new ConcurrentHashMap<>(); + } + + @Override + public boolean isEmpty() { + return topicNameToConsensusPrefetchingQueues.isEmpty(); + } + + @Override + public boolean hasQueue(final String topicName) { + final List queues = + topicNameToConsensusPrefetchingQueues.get(topicName); + return Objects.nonNull(queues) + && !queues.isEmpty() + && queues.stream().anyMatch(q -> !q.isClosed()); + } + + //////////////////////////// poll //////////////////////////// + + @Override + public List poll( + final String consumerId, final Set topicNames, final long maxBytes) { + return poll(consumerId, topicNames, maxBytes, Collections.emptyMap()); + } + + public List poll( + final String consumerId, + final Set topicNames, + final long maxBytes, + final Map progressByTopic) { + LOGGER.debug( + "ConsensusSubscriptionBroker [{}]: poll called, consumerId={}, topicNames={}, " + + 
"queueCount={}, maxBytes={}", + brokerId, + consumerId, + topicNames, + topicNameToConsensusPrefetchingQueues.size(), + maxBytes); + + final List eventsToPoll = new ArrayList<>(); + final List eventsToNack = new ArrayList<>(); + long totalSize = 0; + + for (final String topicName : topicNames) { + final List queues = + topicNameToConsensusPrefetchingQueues.get(topicName); + if (Objects.isNull(queues) || queues.isEmpty()) { + continue; + } + + final TopicOwnershipSnapshot ownershipSnapshot = + refreshAndGetTopicOwnership(topicName, queues, consumerId); + final List assignedQueues = + getAssignedQueues(queues, consumerId, ownershipSnapshot); + if (assignedQueues.isEmpty()) { + continue; + } + + final List pollQueues = + buildPollOrderForAssignedQueues(assignedQueues, topicName); + final int eventsBeforeTopicPoll = eventsToPoll.size(); + + for (final ConsensusPrefetchingQueue consensusQueue : pollQueues) { + if (consensusQueue.isClosed()) { + continue; + } + + final String regionIdStr = consensusQueue.getConsensusGroupId().toString(); + final TopicProgress topicProgress = progressByTopic.get(topicName); + final RegionProgress regionProgress = + Objects.nonNull(topicProgress) + ? 
topicProgress.getRegionProgress().get(regionIdStr) + : null; + + final SubscriptionEvent event = consensusQueue.poll(consumerId, regionProgress); + if (Objects.isNull(event)) { + continue; + } + + final long currentSize; + try { + currentSize = event.getCurrentResponseSize(); + } catch (final IOException e) { + eventsToNack.add(event); + continue; + } + + eventsToPoll.add(event); + totalSize += currentSize; + + if (totalSize >= maxBytes) { + break; + } + } + if (totalSize >= maxBytes) { + break; + } + } + + // Nack any events that had errors + if (!eventsToNack.isEmpty()) { + commit( + consumerId, + eventsToNack.stream() + .map(SubscriptionEvent::getCommitContext) + .collect(Collectors.toList()), + true); + } + + LOGGER.debug( + "ConsensusSubscriptionBroker [{}]: poll result, consumerId={}, eventsPolled={}, eventsNacked={}", + brokerId, + consumerId, + eventsToPoll.size(), + eventsToNack.size()); + + return eventsToPoll; + } + + @Override + public List pollTablets( + final String consumerId, final SubscriptionCommitContext commitContext, final int offset) { + final String topicName = commitContext.getTopicName(); + final List queues = + topicNameToConsensusPrefetchingQueues.get(topicName); + if (Objects.isNull(queues) || queues.isEmpty()) { + return Collections.emptyList(); + } + + final ConsensusPrefetchingQueue assignedQueue = + getAssignedQueueForConsumer( + queues, topicName, consumerId, commitContext.getRegionId(), "pollTablets"); + if (Objects.isNull(assignedQueue)) { + return Collections.emptyList(); + } + + final SubscriptionEvent event = assignedQueue.pollTablets(consumerId, commitContext, offset); + if (Objects.nonNull(event)) { + return Collections.singletonList(event); + } + return Collections.emptyList(); + } + + //////////////////////////// commit //////////////////////////// + + @Override + public List commit( + final String consumerId, + final List commitContexts, + final boolean nack) { + final List successfulCommitContexts = new ArrayList<>(); + 
for (final SubscriptionCommitContext commitContext : commitContexts) { + final String topicName = commitContext.getTopicName(); + final List queues = + topicNameToConsensusPrefetchingQueues.get(topicName); + if (Objects.isNull(queues) || queues.isEmpty()) { + LOGGER.warn( + "ConsensusSubscriptionBroker [{}]: no queues for topic [{}] to commit", + brokerId, + topicName); + continue; + } + + final ConsensusPrefetchingQueue assignedQueue = + getAssignedQueueForConsumer( + queues, topicName, consumerId, commitContext.getRegionId(), nack ? "nack" : "ack"); + boolean handled = false; + if (Objects.nonNull(assignedQueue)) { + final boolean success; + if (!nack) { + success = assignedQueue.ackSilent(consumerId, commitContext); + } else { + success = assignedQueue.nackSilent(consumerId, commitContext); + } + if (success) { + successfulCommitContexts.add(commitContext); + handled = true; + } + } + if (!handled) { + LOGGER.warn( + "ConsensusSubscriptionBroker [{}]: commit context {} not found in any of {} region queue(s) for topic [{}]", + brokerId, + commitContext, + queues.size(), + topicName); + } + } + return successfulCommitContexts; + } + + @Override + public boolean isCommitContextOutdated(final SubscriptionCommitContext commitContext) { + final String topicName = commitContext.getTopicName(); + final List queues = + topicNameToConsensusPrefetchingQueues.get(topicName); + if (Objects.isNull(queues) || queues.isEmpty()) { + return true; + } + // Route directly to the correct region queue using regionId + final String regionId = commitContext.getRegionId(); + for (final ConsensusPrefetchingQueue q : queues) { + if (!regionId.isEmpty() && !regionId.equals(q.getConsensusGroupId().toString())) { + continue; + } + return q.isCommitContextOutdated(commitContext); + } + return true; + } + + //////////////////////////// seek //////////////////////////// + + public void seek(final String topicName, final short seekType) { + final List queues = + 
topicNameToConsensusPrefetchingQueues.get(topicName); + if (Objects.isNull(queues) || queues.isEmpty()) { + LOGGER.warn( + "ConsensusSubscriptionBroker [{}]: no queues for topic [{}] to seek", + brokerId, + topicName); + return; + } + + for (final ConsensusPrefetchingQueue queue : queues) { + if (queue.isClosed()) { + continue; + } + switch (seekType) { + case PipeSubscribeSeekReq.SEEK_TO_BEGINNING: + queue.seekToBeginning(); + break; + case PipeSubscribeSeekReq.SEEK_TO_END: + queue.seekToEnd(); + break; + default: + LOGGER.warn( + "ConsensusSubscriptionBroker [{}]: unsupported seekType {} for topic [{}]", + brokerId, + seekType, + topicName); + break; + } + } + } + + public void seek(final String topicName, final TopicProgress topicProgress) { + final TopicProgress safeProgress = + topicProgress != null ? topicProgress : new TopicProgress(Collections.emptyMap()); + final List queues = + topicNameToConsensusPrefetchingQueues.get(topicName); + if (Objects.isNull(queues) || queues.isEmpty()) { + LOGGER.warn( + "ConsensusSubscriptionBroker [{}]: no queues for topic [{}] to seek(topicProgress)", + brokerId, + topicName); + return; + } + for (final ConsensusPrefetchingQueue queue : queues) { + if (queue.isClosed()) { + continue; + } + final RegionProgress regionProgress = + safeProgress.getRegionProgress().get(queue.getConsensusGroupId().toString()); + seekQueueToRegionProgress(queue, regionProgress, false); + } + } + + public void seekAfter(final String topicName, final TopicProgress topicProgress) { + final TopicProgress safeProgress = + topicProgress != null ? 
topicProgress : new TopicProgress(Collections.emptyMap()); + final List queues = + topicNameToConsensusPrefetchingQueues.get(topicName); + if (Objects.isNull(queues) || queues.isEmpty()) { + LOGGER.warn( + "ConsensusSubscriptionBroker [{}]: no queues for topic [{}] to seekAfter(topicProgress)", + brokerId, + topicName); + return; + } + for (final ConsensusPrefetchingQueue queue : queues) { + if (queue.isClosed()) { + continue; + } + final RegionProgress regionProgress = + safeProgress.getRegionProgress().get(queue.getConsensusGroupId().toString()); + seekQueueToRegionProgress(queue, regionProgress, true); + } + } + + private void seekQueueToRegionProgress( + final ConsensusPrefetchingQueue queue, + final RegionProgress regionProgress, + final boolean seekAfter) { + if (Objects.isNull(regionProgress) || regionProgress.getWriterPositions().isEmpty()) { + return; + } + if (seekAfter) { + queue.seekAfterRegionProgress(regionProgress); + } else { + queue.seekToRegionProgress(regionProgress); + } + } + + //////////////////////////// prefetching //////////////////////////// + + @Override + public boolean executePrefetch(final String topicName) { + // Consensus prefetch is fully driven by queue-local wakeup sources and the dedicated delayed + // scheduler. This interface remains only to satisfy the shared broker contract used by + // pipe-based subscription. + return false; + } + + @Override + public int getEventCount(final String topicName) { + final List queues = + topicNameToConsensusPrefetchingQueues.get(topicName); + if (Objects.isNull(queues)) { + return 0; + } + return queues.stream().mapToInt(ConsensusPrefetchingQueue::getPrefetchedEventCount).sum(); + } + + @Override + public int getQueueCount() { + return topicNameToConsensusPrefetchingQueues.size(); + } + + /** + * Returns per-region lag information for all topics managed by this broker. The result maps + * "topicName/regionId" to the lag (number of WAL entries behind). 
+ */ + public Map getLagSummary() { + final Map lagMap = new ConcurrentHashMap<>(); + for (final Map.Entry> entry : + topicNameToConsensusPrefetchingQueues.entrySet()) { + for (final ConsensusPrefetchingQueue queue : entry.getValue()) { + if (!queue.isClosed()) { + lagMap.put(entry.getKey() + "/" + queue.getConsensusGroupId().toString(), queue.getLag()); + } + } + } + return lagMap; + } + + private TopicOwnershipSnapshot refreshAndGetTopicOwnership( + final String topicName, + final List queues, + final String consumerId) { + final ConcurrentHashMap consumerTimestamps = + topicConsumerLastPollMs.computeIfAbsent(topicName, ignored -> new ConcurrentHashMap<>()); + consumerTimestamps.put(consumerId, System.currentTimeMillis()); + evictInactiveConsumers(consumerTimestamps); + final List sortedConsumers = new ArrayList<>(consumerTimestamps.keySet()); + Collections.sort(sortedConsumers); + + final List activeRegionIds = + queues.stream() + .filter(q -> !q.isClosed()) + .map(q -> q.getConsensusGroupId().toString()) + .sorted() + .collect(Collectors.toList()); + + final TopicOwnershipSnapshot existingSnapshot = topicOwnershipSnapshots.get(topicName); + if (Objects.nonNull(existingSnapshot) + && existingSnapshot.hasSameConsumers(sortedConsumers) + && existingSnapshot.hasSameRegions(activeRegionIds)) { + return existingSnapshot; + } + + final TopicOwnershipSnapshot refreshedSnapshot = + TopicOwnershipSnapshot.create(sortedConsumers, activeRegionIds); + topicOwnershipSnapshots.put(topicName, refreshedSnapshot); + LOGGER.debug( + "ConsensusSubscriptionBroker [{}]: refreshed ownership for topic [{}], consumers={}, regions={}, generation={}", + brokerId, + topicName, + sortedConsumers, + activeRegionIds, + refreshedSnapshot.getGeneration()); + return refreshedSnapshot; + } + + private List getAssignedQueues( + final List queues, + final String consumerId, + final TopicOwnershipSnapshot ownershipSnapshot) { + if (Objects.isNull(ownershipSnapshot) || ownershipSnapshot.isEmpty()) { 
+ return Collections.emptyList(); + } + final List assignedQueues = new ArrayList<>(); + for (final ConsensusPrefetchingQueue queue : queues) { + if (queue.isClosed()) { + continue; + } + if (consumerId.equals( + ownershipSnapshot.getOwnerConsumerId(queue.getConsensusGroupId().toString()))) { + assignedQueues.add(queue); + } + } + return assignedQueues; + } + + private List buildPollOrderForAssignedQueues( + final List assignedQueues, final String topicName) { + if (assignedQueues.size() <= 1) { + return assignedQueues; + } + final List pollQueues = new ArrayList<>(assignedQueues); + if (SubscriptionConfig.getInstance().isSubscriptionConsensusLagBasedPriority()) { + pollQueues.sort( + Comparator.comparingLong(ConsensusPrefetchingQueue::getLag) + .reversed() + .thenComparing(q -> q.getConsensusGroupId().toString())); + return pollQueues; + } + + final int startOffset = Math.floorMod(pollRoundRobinIndex.getAndIncrement(), pollQueues.size()); + final List orderedQueues = new ArrayList<>(pollQueues.size()); + for (int i = 0; i < pollQueues.size(); i++) { + orderedQueues.add(pollQueues.get((startOffset + i) % pollQueues.size())); + } + LOGGER.debug( + "ConsensusSubscriptionBroker [{}]: stable ownership poll order for topic [{}], assignedQueueCount={}", + brokerId, + topicName, + orderedQueues.size()); + return orderedQueues; + } + + private ConsensusPrefetchingQueue getAssignedQueueForConsumer( + final List queues, + final String topicName, + final String consumerId, + final String regionId, + final String action) { + final TopicOwnershipSnapshot ownershipSnapshot = + refreshAndGetTopicOwnership(topicName, queues, consumerId); + for (final ConsensusPrefetchingQueue queue : queues) { + if (queue.isClosed()) { + continue; + } + if (!regionId.isEmpty() && !regionId.equals(queue.getConsensusGroupId().toString())) { + continue; + } + if (consumerId.equals( + ownershipSnapshot.getOwnerConsumerId(queue.getConsensusGroupId().toString()))) { + return queue; + } + LOGGER.debug( + 
"ConsensusSubscriptionBroker [{}]: consumer [{}] skipped {} on topic [{}], region [{}] is currently owned by [{}]", + brokerId, + consumerId, + action, + topicName, + queue.getConsensusGroupId(), + ownershipSnapshot.getOwnerConsumerId(queue.getConsensusGroupId().toString())); + return null; + } + return null; + } + + /** Evicts consumers that have not polled within the configured eviction timeout. */ + private void evictInactiveConsumers(final ConcurrentHashMap consumerTimestamps) { + final long now = System.currentTimeMillis(); + final long timeout = + SubscriptionConfig.getInstance().getSubscriptionConsensusConsumerEvictionTimeoutMs(); + consumerTimestamps.entrySet().removeIf(entry -> (now - entry.getValue()) > timeout); + } + + //////////////////////////// queue management //////////////////////////// + + public void bindConsensusPrefetchingQueue( + final String topicName, + final String orderMode, + final ConsensusGroupId consensusGroupId, + final IoTConsensusServerImpl serverImpl, + final ConsensusLogToTabletConverter converter, + final ConsensusSubscriptionCommitManager commitManager, + final RegionProgress fallbackCommittedRegionProgress, + final long tailStartSearchIndex, + final long initialRuntimeVersion, + final boolean initialActive) { + // Get or create the list of queues for this topic + final List queues = + topicNameToConsensusPrefetchingQueues.computeIfAbsent( + topicName, k -> new CopyOnWriteArrayList<>()); + + // Check for duplicate region binding + for (final ConsensusPrefetchingQueue existing : queues) { + if (consensusGroupId.equals(existing.getConsensusGroupId()) && !existing.isClosed()) { + LOGGER.info( + "Subscription: consensus prefetching queue for topic [{}], region [{}] " + + "in consumer group [{}] already exists, skipping", + topicName, + consensusGroupId, + brokerId); + return; + } + } + + // Create the per-region consensus queue for this topic. 
+ final ConsensusPrefetchingQueue consensusQueue = + new ConsensusPrefetchingQueue( + brokerId, + topicName, + orderMode, + consensusGroupId, + serverImpl, + converter, + commitManager, + fallbackCommittedRegionProgress, + tailStartSearchIndex, + initialRuntimeVersion, + initialActive); + queues.add(consensusQueue); + LOGGER.info( + "Subscription: create consensus prefetching queue bound to topic [{}] for consumer group [{}], " + + "consensusGroupId={}, fallbackCommittedRegionProgress={}, " + + "tailStartSearchIndex={}, initialRuntimeVersion={}, initialActive={}, totalRegionQueues={}", + topicName, + brokerId, + consensusGroupId, + fallbackCommittedRegionProgress, + tailStartSearchIndex, + initialRuntimeVersion, + initialActive, + queues.size()); + } + + public void refreshConsensusQueueOrderMode(final String topicName, final String orderMode) { + final List queues = + topicNameToConsensusPrefetchingQueues.get(topicName); + if (Objects.isNull(queues) || queues.isEmpty()) { + return; + } + + for (final ConsensusPrefetchingQueue queue : queues) { + queue.setOrderMode(orderMode); + } + } + + public void unbindConsensusPrefetchingQueue(final String topicName) { + final List queues = + topicNameToConsensusPrefetchingQueues.get(topicName); + if (Objects.isNull(queues) || queues.isEmpty()) { + LOGGER.warn( + "Subscription: consensus prefetching queues bound to topic [{}] for consumer group [{}] do not exist", + topicName, + brokerId); + return; + } + + for (final ConsensusPrefetchingQueue q : queues) { + q.close(); + } + topicNameToConsensusPrefetchingQueues.remove(topicName); + topicConsumerLastPollMs.remove(topicName); + topicOwnershipSnapshots.remove(topicName); + LOGGER.info( + "Subscription: drop all {} consensus prefetching queue(s) bound to topic [{}] for consumer group [{}]", + queues.size(), + topicName, + brokerId); + } + + public int unbindByRegion(final ConsensusGroupId regionId) { + int closedCount = 0; + for (final Map.Entry> entry : + 
topicNameToConsensusPrefetchingQueues.entrySet()) { + final List queues = entry.getValue(); + final int beforeSize = queues.size(); + queues.removeIf( + q -> { + if (!regionId.equals(q.getConsensusGroupId())) { + return false; + } + q.close(); + LOGGER.info( + "Subscription: closed consensus prefetching queue for topic [{}] region [{}] " + + "in consumer group [{}] due to region removal", + entry.getKey(), + regionId, + brokerId); + return true; + }); + closedCount += beforeSize - queues.size(); + if (queues.isEmpty()) { + topicNameToConsensusPrefetchingQueues.remove(entry.getKey(), queues); + topicConsumerLastPollMs.remove(entry.getKey()); + topicOwnershipSnapshots.remove(entry.getKey()); + } else { + topicOwnershipSnapshots.remove(entry.getKey()); + } + } + return closedCount; + } + + /** + * Activates or deactivates all queues bound to {@code regionId}. Called on leader migration: + * {@code false} on old leader, {@code true} on new leader. Inactive queues skip prefetching and + * return null on poll, ensuring only the preferred writer serves subscription data. 
+ */ + public void setActiveForRegion(final ConsensusGroupId regionId, final boolean active) { + for (final List queues : + topicNameToConsensusPrefetchingQueues.values()) { + for (final ConsensusPrefetchingQueue q : queues) { + if (regionId.equals(q.getConsensusGroupId())) { + q.setActive(active); + } + } + } + } + + public void setActiveWritersForRegion( + final ConsensusGroupId regionId, final Set activeWriterNodeIds) { + final Set normalizedActiveWriterNodeIds = + Collections.unmodifiableSet(new LinkedHashSet<>(activeWriterNodeIds)); + for (final List queues : + topicNameToConsensusPrefetchingQueues.values()) { + for (final ConsensusPrefetchingQueue q : queues) { + if (regionId.equals(q.getConsensusGroupId())) { + q.setActiveWriterNodeIds(normalizedActiveWriterNodeIds); + } + } + } + } + + public void applyRuntimeStateForRegion( + final ConsensusGroupId regionId, final ConsensusRegionRuntimeState runtimeState) { + for (final List queues : + topicNameToConsensusPrefetchingQueues.values()) { + for (final ConsensusPrefetchingQueue q : queues) { + if (regionId.equals(q.getConsensusGroupId())) { + q.applyRuntimeState(runtimeState); + } + } + } + } + + public void abortPendingSeeksForRuntimeStop() { + for (final List queues : + topicNameToConsensusPrefetchingQueues.values()) { + for (final ConsensusPrefetchingQueue q : queues) { + if (!q.isClosed()) { + q.abortPendingSeekForRuntimeStop(); + } + } + } + } + + @Override + public void removeQueue(final String topicName) { + final List queues = + topicNameToConsensusPrefetchingQueues.get(topicName); + if (Objects.nonNull(queues) && !queues.isEmpty()) { + LOGGER.info( + "Subscription: consensus prefetching queue(s) bound to topic [{}] for consumer group [{}] still exist, unbind before closing", + topicName, + brokerId); + unbindConsensusPrefetchingQueue(topicName); + } + } + + private static final class TopicOwnershipSnapshot { + + private final List activeConsumers; + private final List activeRegionIds; + private final 
Map ownerByRegionId; + private final int generation; + + private TopicOwnershipSnapshot( + final List activeConsumers, + final List activeRegionIds, + final Map ownerByRegionId, + final int generation) { + this.activeConsumers = activeConsumers; + this.activeRegionIds = activeRegionIds; + this.ownerByRegionId = ownerByRegionId; + this.generation = generation; + } + + private static TopicOwnershipSnapshot create( + final List activeConsumers, final List activeRegionIds) { + if (activeConsumers.isEmpty() || activeRegionIds.isEmpty()) { + return new TopicOwnershipSnapshot( + Collections.emptyList(), Collections.emptyList(), Collections.emptyMap(), 0); + } + + final Map ownerByRegionId = new ConcurrentHashMap<>(); + final int consumerCount = activeConsumers.size(); + for (final String regionId : activeRegionIds) { + final int ownerIdx = Math.floorMod(regionId.hashCode(), consumerCount); + ownerByRegionId.put(regionId, activeConsumers.get(ownerIdx)); + } + return new TopicOwnershipSnapshot( + Collections.unmodifiableList(new ArrayList<>(activeConsumers)), + Collections.unmodifiableList(new ArrayList<>(activeRegionIds)), + Collections.unmodifiableMap(ownerByRegionId), + ownerByRegionId.hashCode()); + } + + private boolean isEmpty() { + return activeConsumers.isEmpty() || activeRegionIds.isEmpty(); + } + + private boolean hasSameConsumers(final List consumers) { + return activeConsumers.equals(consumers); + } + + private boolean hasSameRegions(final List regionIds) { + return activeRegionIds.equals(regionIds); + } + + private String getOwnerConsumerId(final String regionId) { + return ownerByRegionId.get(regionId); + } + + private int getGeneration() { + return generation; + } + } +} diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/ISubscriptionBroker.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/ISubscriptionBroker.java new file mode 100644 index 0000000000000..aaa88a5f84777 --- /dev/null +++ 
b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/ISubscriptionBroker.java @@ -0,0 +1,51 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.iotdb.db.subscription.broker; + +import org.apache.iotdb.db.subscription.event.SubscriptionEvent; +import org.apache.iotdb.rpc.subscription.payload.poll.SubscriptionCommitContext; + +import java.util.List; +import java.util.Set; + +public interface ISubscriptionBroker { + + List poll(String consumerId, Set topicNames, long maxBytes); + + List pollTablets( + String consumerId, SubscriptionCommitContext commitContext, int offset); + + List commit( + String consumerId, List commitContexts, boolean nack); + + boolean isCommitContextOutdated(SubscriptionCommitContext commitContext); + + boolean executePrefetch(String topicName); + + int getEventCount(String topicName); + + int getQueueCount(); + + void removeQueue(String topicName); + + boolean isEmpty(); + + boolean hasQueue(String topicName); +} diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/SubscriptionBroker.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/SubscriptionBroker.java index 
cc03f7261419b..8f9d05324e905 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/SubscriptionBroker.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/SubscriptionBroker.java @@ -56,7 +56,7 @@ import static org.apache.iotdb.rpc.subscription.payload.poll.SubscriptionCommitContext.INVALID_COMMIT_ID; -public class SubscriptionBroker { +public class SubscriptionBroker implements ISubscriptionBroker { private static final Logger LOGGER = LoggerFactory.getLogger(SubscriptionBroker.class); @@ -83,14 +83,23 @@ public SubscriptionBroker(final String brokerId) { .build(consumerId -> new SubscriptionStates()); } + @Override public boolean isEmpty() { return topicNameToPrefetchingQueue.isEmpty() && completedTopicNames.isEmpty() && topicNameToCommitIdGenerator.isEmpty(); } + @Override + public boolean hasQueue(final String topicName) { + final SubscriptionPrefetchingQueue prefetchingQueue = + topicNameToPrefetchingQueue.get(topicName); + return Objects.nonNull(prefetchingQueue) && !prefetchingQueue.isClosed(); + } + //////////////////////////// provided for SubscriptionBrokerAgent //////////////////////////// + @Override public List poll( final String consumerId, final Set topicNames, final long maxBytes) { final List eventsToPoll = new ArrayList<>(); @@ -112,9 +121,10 @@ public List poll( // Iterate over each sorted topic name and poll the corresponding events int remainingTopicSize = sortedTopicNames.size(); for (final String topicName : sortedTopicNames) { + remainingTopicSize -= 1; + // Check pipe-based queue final SubscriptionPrefetchingQueue prefetchingQueue = topicNameToPrefetchingQueue.get(topicName); - remainingTopicSize -= 1; // Recheck if (Objects.isNull(prefetchingQueue) || prefetchingQueue.isClosed()) { @@ -182,6 +192,7 @@ private Set prepareCandidateTopicNames( final List eventsToPoll /* output parameter */) { final Set candidateTopicNames = new HashSet<>(); for (final String topicName : 
topicNames) { + // Check pipe-based queue final SubscriptionPrefetchingQueue prefetchingQueue = topicNameToPrefetchingQueue.get(topicName); // If there is no prefetching queue for the topic, check if it's completed @@ -271,6 +282,7 @@ public List pollTsFile( return Collections.emptyList(); } + @Override public List pollTablets( final String consumerId, final SubscriptionCommitContext commitContext, final int offset) { final String topicName = commitContext.getTopicName(); @@ -312,6 +324,7 @@ public List pollTablets( /** * @return list of successful commit contexts */ + @Override public List commit( final String consumerId, final List commitContexts, @@ -348,6 +361,7 @@ public List commit( return successfulCommitContexts; } + @Override public boolean isCommitContextOutdated(final SubscriptionCommitContext commitContext) { final String topicName = commitContext.getTopicName(); final SubscriptionPrefetchingQueue prefetchingQueue = @@ -457,6 +471,11 @@ public void unbindPrefetchingQueue(final String topicName) { brokerId); } + @Override + public void removeQueue(final String topicName) { + removePrefetchingQueue(topicName); + } + public void removePrefetchingQueue(final String topicName) { final SubscriptionPrefetchingQueue prefetchingQueue = topicNameToPrefetchingQueue.get(topicName); @@ -473,6 +492,7 @@ public void removePrefetchingQueue(final String topicName) { topicNameToCommitIdGenerator.remove(topicName); } + @Override public boolean executePrefetch(final String topicName) { final SubscriptionPrefetchingQueue prefetchingQueue = topicNameToPrefetchingQueue.get(topicName); @@ -505,6 +525,11 @@ public boolean executePrefetch(final String topicName) { : prefetchingQueue.executePrefetchV2(); } + @Override + public int getEventCount(final String topicName) { + return getPipeEventCount(topicName); + } + public int getPipeEventCount(final String topicName) { final SubscriptionPrefetchingQueue prefetchingQueue = topicNameToPrefetchingQueue.get(topicName); @@ -525,6 
+550,11 @@ public int getPipeEventCount(final String topicName) { return prefetchingQueue.getPipeEventCount(); } + @Override + public int getQueueCount() { + return getPrefetchingQueueCount(); + } + public int getPrefetchingQueueCount() { return topicNameToPrefetchingQueue.size(); } diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/SubscriptionPrefetchingQueue.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/SubscriptionPrefetchingQueue.java index b8bdc4e802ff5..b325d0938c499 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/SubscriptionPrefetchingQueue.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/SubscriptionPrefetchingQueue.java @@ -849,6 +849,18 @@ public boolean nackInternal( ev.nack(); // now pollable nacked.set(true); + if (ev.isPoisoned()) { + LOGGER.error( + "Subscription: poison message detected (nackCount={}), force-acking event {} in prefetching queue: {}", + ev.getNackCount(), + ev, + this); + ev.ack(); + ev.recordCommittedTimestamp(); + ev.cleanUp(false); + return null; // remove from inFlightEvents + } + // no need to update inFlightEvents and prefetchingQueue return ev; }); @@ -1017,11 +1029,33 @@ private static RemappingFunction COMBINER( (ev) -> { if (ev.eagerlyPollable()) { ev.nack(); // now pollable (the nack operation here is actually unnecessary) + if (ev.isPoisoned()) { + LOGGER.error( + "Subscription: poison message detected (nackCount={}), force-acking eagerly pollable event {} in prefetching queue: {}", + ev.getNackCount(), + ev, + this); + ev.ack(); + ev.recordCommittedTimestamp(); + ev.cleanUp(false); + return null; + } prefetchEvent(ev); // no need to log warn for eagerly pollable event return null; // remove this entry } else if (ev.pollable()) { ev.nack(); // now pollable + if (ev.isPoisoned()) { + LOGGER.error( + "Subscription: poison message detected (nackCount={}), force-acking pollable 
event {} in prefetching queue: {}", + ev.getNackCount(), + ev, + this); + ev.ack(); + ev.recordCommittedTimestamp(); + ev.cleanUp(false); + return null; + } prefetchEvent(ev); LOGGER.warn( "Subscription: SubscriptionPrefetchingQueue {} recycle event {} from in flight events, nack and enqueue it to prefetching queue", diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/consensus/ConsensusLogToTabletConverter.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/consensus/ConsensusLogToTabletConverter.java new file mode 100644 index 0000000000000..9d3f2b283c556 --- /dev/null +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/consensus/ConsensusLogToTabletConverter.java @@ -0,0 +1,542 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.iotdb.db.subscription.broker.consensus; + +import org.apache.iotdb.commons.pipe.datastructure.pattern.TablePattern; +import org.apache.iotdb.commons.pipe.datastructure.pattern.TreePattern; +import org.apache.iotdb.db.queryengine.plan.planner.plan.node.PlanNodeType; +import org.apache.iotdb.db.queryengine.plan.planner.plan.node.write.InsertMultiTabletsNode; +import org.apache.iotdb.db.queryengine.plan.planner.plan.node.write.InsertNode; +import org.apache.iotdb.db.queryengine.plan.planner.plan.node.write.InsertRowNode; +import org.apache.iotdb.db.queryengine.plan.planner.plan.node.write.InsertRowsNode; +import org.apache.iotdb.db.queryengine.plan.planner.plan.node.write.InsertRowsOfOneDeviceNode; +import org.apache.iotdb.db.queryengine.plan.planner.plan.node.write.InsertTabletNode; +import org.apache.iotdb.db.queryengine.plan.planner.plan.node.write.RelationalInsertRowNode; +import org.apache.iotdb.db.queryengine.plan.planner.plan.node.write.RelationalInsertRowsNode; +import org.apache.iotdb.db.queryengine.plan.planner.plan.node.write.RelationalInsertTabletNode; + +import org.apache.tsfile.enums.TSDataType; +import org.apache.tsfile.file.metadata.IDeviceID; +import org.apache.tsfile.utils.Binary; +import org.apache.tsfile.utils.BitMap; +import org.apache.tsfile.write.record.Tablet; +import org.apache.tsfile.write.schema.IMeasurementSchema; +import org.apache.tsfile.write.schema.MeasurementSchema; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.List; +import java.util.Objects; + +/** Converts IoTConsensus WAL log entries (InsertNode) to Tablet format for subscription. 
*/ +public class ConsensusLogToTabletConverter { + + private static final Logger LOGGER = LoggerFactory.getLogger(ConsensusLogToTabletConverter.class); + + private final TreePattern treePattern; + private final TablePattern tablePattern; + + /** + * The actual database name of the DataRegion this converter processes (table-model format without + * "root." prefix). Null for tree-model topics. + */ + private final String databaseName; + + public ConsensusLogToTabletConverter( + final TreePattern treePattern, final TablePattern tablePattern, final String databaseName) { + this.treePattern = treePattern; + this.tablePattern = tablePattern; + this.databaseName = databaseName; + } + + public String getDatabaseName() { + return databaseName; + } + + static String safeDeviceIdForLog(final InsertNode node) { + try { + final Object deviceId = node.getDeviceID(); + return deviceId != null ? deviceId.toString() : "null"; + } catch (final Exception e) { + return "N/A(" + node.getType() + ")"; + } + } + + public List convert(final InsertNode insertNode) { + if (Objects.isNull(insertNode)) { + return Collections.emptyList(); + } + + final PlanNodeType nodeType = insertNode.getType(); + if (nodeType == null) { + LOGGER.warn("InsertNode type is null, skipping conversion"); + return Collections.emptyList(); + } + + LOGGER.debug( + "ConsensusLogToTabletConverter: converting InsertNode type={}, deviceId={}", + nodeType, + safeDeviceIdForLog(insertNode)); + + switch (nodeType) { + case INSERT_ROW: + return convertInsertRowNode((InsertRowNode) insertNode); + case INSERT_TABLET: + return convertInsertTabletNode((InsertTabletNode) insertNode); + case INSERT_ROWS: + return convertInsertRowsNode((InsertRowsNode) insertNode); + case INSERT_ROWS_OF_ONE_DEVICE: + return convertInsertRowsOfOneDeviceNode((InsertRowsOfOneDeviceNode) insertNode); + case INSERT_MULTI_TABLET: + return convertInsertMultiTabletsNode((InsertMultiTabletsNode) insertNode); + case RELATIONAL_INSERT_ROW: + return 
convertRelationalInsertRowNode((RelationalInsertRowNode) insertNode); + case RELATIONAL_INSERT_TABLET: + return convertRelationalInsertTabletNode((RelationalInsertTabletNode) insertNode); + case RELATIONAL_INSERT_ROWS: + return convertRelationalInsertRowsNode((RelationalInsertRowsNode) insertNode); + default: + LOGGER.debug("Unsupported InsertNode type for subscription: {}", nodeType); + return Collections.emptyList(); + } + } + + // ======================== Tree Model Conversion ======================== + + private List convertInsertRowNode(final InsertRowNode node) { + final IDeviceID deviceId = node.getDeviceID(); + + // Device-level path filtering + if (treePattern != null && !treePattern.mayOverlapWithDevice(deviceId)) { + return Collections.emptyList(); + } + + final long time = node.getTime(); + + // Determine which columns match the pattern + final String[] measurements = node.getMeasurements(); + final TSDataType[] dataTypes = node.getDataTypes(); + final Object[] values = node.getValues(); + final List matchedColumnIndices = getMatchedTreeColumnIndices(deviceId, measurements); + + if (matchedColumnIndices.isEmpty()) { + return Collections.emptyList(); + } + + // Build Tablet with matched columns + final int columnCount = matchedColumnIndices.size(); + final List schemas = new ArrayList<>(columnCount); + for (final int colIdx : matchedColumnIndices) { + schemas.add(new MeasurementSchema(measurements[colIdx], dataTypes[colIdx])); + } + + final Tablet tablet = new Tablet(deviceId.toString(), schemas, 1 /* maxRowNumber */); + tablet.addTimestamp(0, time); + + for (int i = 0; i < columnCount; i++) { + final int originalColIdx = matchedColumnIndices.get(i); + final Object value = values[originalColIdx]; + if (value == null) { + if (tablet.getBitMaps() == null) { + tablet.initBitMaps(); + } + tablet.getBitMaps()[i].mark(0); + } else { + addValueToTablet(tablet, 0, i, dataTypes[originalColIdx], value); + } + } + tablet.setRowSize(1); + + return 
Collections.singletonList(tablet); + } + + private List convertInsertTabletNode(final InsertTabletNode node) { + final IDeviceID deviceId = node.getDeviceID(); + + // Device-level path filtering + if (treePattern != null && !treePattern.mayOverlapWithDevice(deviceId)) { + return Collections.emptyList(); + } + + final String[] measurements = node.getMeasurements(); + final TSDataType[] dataTypes = node.getDataTypes(); + final long[] times = node.getTimes(); + final Object[] columns = node.getColumns(); + final BitMap[] bitMaps = node.getBitMaps(); + final int rowCount = node.getRowCount(); + + // Column filtering + final List matchedColumnIndices = getMatchedTreeColumnIndices(deviceId, measurements); + if (matchedColumnIndices.isEmpty()) { + return Collections.emptyList(); + } + + final int columnCount = matchedColumnIndices.size(); + final boolean allColumnsMatch = (columnCount == measurements.length); + + // Build schemas (always needed) + final List schemas = new ArrayList<>(columnCount); + for (final int colIdx : matchedColumnIndices) { + schemas.add(new MeasurementSchema(measurements[colIdx], dataTypes[colIdx])); + } + + // Build column arrays and bitmaps using bulk copy + final long[] newTimes = Arrays.copyOf(times, rowCount); + final Object[] newColumns = new Object[columnCount]; + final BitMap[] newBitMaps = new BitMap[columnCount]; + + for (int i = 0; i < columnCount; i++) { + final int originalColIdx = allColumnsMatch ? 
i : matchedColumnIndices.get(i); + newColumns[i] = copyColumnArray(dataTypes[originalColIdx], columns[originalColIdx], rowCount); + if (bitMaps != null && bitMaps[originalColIdx] != null) { + newBitMaps[i] = new BitMap(rowCount); + BitMap.copyOfRange(bitMaps[originalColIdx], 0, newBitMaps[i], 0, rowCount); + } + } + + final Tablet tablet = + new Tablet(deviceId.toString(), schemas, newTimes, newColumns, newBitMaps, rowCount); + + return Collections.singletonList(tablet); + } + + private List convertInsertRowsNode(final InsertRowsNode node) { + final List tablets = new ArrayList<>(); + for (final InsertRowNode rowNode : node.getInsertRowNodeList()) { + // Handle merge bug: RelationalInsertRowNode.mergeInsertNode() is not overridden, + // so merged relational nodes arrive as InsertRowsNode (tree) with RelationalInsertRowNode + // children. Dispatch correctly by checking the actual child type. + if (rowNode instanceof RelationalInsertRowNode) { + tablets.addAll(convertRelationalInsertRowNode((RelationalInsertRowNode) rowNode)); + } else { + tablets.addAll(convertInsertRowNode(rowNode)); + } + } + return tablets; + } + + private List convertInsertRowsOfOneDeviceNode(final InsertRowsOfOneDeviceNode node) { + final List tablets = new ArrayList<>(); + for (final InsertRowNode rowNode : node.getInsertRowNodeList()) { + tablets.addAll(convertInsertRowNode(rowNode)); + } + return tablets; + } + + private List convertInsertMultiTabletsNode(final InsertMultiTabletsNode node) { + final List tablets = new ArrayList<>(); + for (final InsertTabletNode tabletNode : node.getInsertTabletNodeList()) { + tablets.addAll(convertInsertTabletNode(tabletNode)); + } + return tablets; + } + + // ======================== Table Model Conversion ======================== + + private List convertRelationalInsertRowNode(final RelationalInsertRowNode node) { + final String tableName = node.getTableName(); + + // Table-level pattern filtering + if (tablePattern != null) { + if (databaseName != null 
&& !tablePattern.matchesDatabase(databaseName)) { + return Collections.emptyList(); + } + if (tableName != null && !tablePattern.matchesTable(tableName)) { + return Collections.emptyList(); + } + } + + final long time = node.getTime(); + final String[] measurements = node.getMeasurements(); + final TSDataType[] dataTypes = node.getDataTypes(); + final Object[] values = node.getValues(); + + final int columnCount = measurements.length; + final List schemas = new ArrayList<>(columnCount); + for (int i = 0; i < columnCount; i++) { + schemas.add(new MeasurementSchema(measurements[i], dataTypes[i])); + } + + final Tablet tablet = new Tablet(tableName != null ? tableName : "", schemas, 1); + tablet.addTimestamp(0, time); + + for (int i = 0; i < columnCount; i++) { + final Object value = values[i]; + if (value == null) { + if (tablet.getBitMaps() == null) { + tablet.initBitMaps(); + } + tablet.getBitMaps()[i].mark(0); + } else { + addValueToTablet(tablet, 0, i, dataTypes[i], value); + } + } + tablet.setRowSize(1); + + return Collections.singletonList(tablet); + } + + private List convertRelationalInsertTabletNode(final RelationalInsertTabletNode node) { + final String tableName = node.getTableName(); + + // Table-level pattern filtering + if (tablePattern != null) { + if (databaseName != null && !tablePattern.matchesDatabase(databaseName)) { + return Collections.emptyList(); + } + if (tableName != null && !tablePattern.matchesTable(tableName)) { + return Collections.emptyList(); + } + } + + final String[] measurements = node.getMeasurements(); + final TSDataType[] dataTypes = node.getDataTypes(); + final long[] times = node.getTimes(); + final Object[] columns = node.getColumns(); + final BitMap[] bitMaps = node.getBitMaps(); + final int rowCount = node.getRowCount(); + + final int columnCount = measurements.length; + final List schemas = new ArrayList<>(columnCount); + for (int i = 0; i < columnCount; i++) { + schemas.add(new MeasurementSchema(measurements[i], 
dataTypes[i])); + } + + // Build column arrays and bitmaps using bulk copy + final long[] newTimes = Arrays.copyOf(times, rowCount); + final Object[] newColumns = new Object[columnCount]; + final BitMap[] newBitMaps = new BitMap[columnCount]; + + for (int colIdx = 0; colIdx < columnCount; colIdx++) { + newColumns[colIdx] = copyColumnArray(dataTypes[colIdx], columns[colIdx], rowCount); + if (bitMaps != null && bitMaps[colIdx] != null) { + newBitMaps[colIdx] = new BitMap(rowCount); + BitMap.copyOfRange(bitMaps[colIdx], 0, newBitMaps[colIdx], 0, rowCount); + } + } + + final Tablet tablet = + new Tablet( + tableName != null ? tableName : "", + schemas, + newTimes, + newColumns, + newBitMaps, + rowCount); + + return Collections.singletonList(tablet); + } + + private List convertRelationalInsertRowsNode(final RelationalInsertRowsNode node) { + final List tablets = new ArrayList<>(); + for (final InsertRowNode rowNode : node.getInsertRowNodeList()) { + tablets.addAll(convertRelationalInsertRowNode((RelationalInsertRowNode) rowNode)); + } + return tablets; + } + + // ======================== Helper Methods ======================== + + /** + * Returns indices of columns that match the tree pattern. If no tree pattern is specified, all + * column indices are returned. 
+ */ + private List getMatchedTreeColumnIndices( + final IDeviceID deviceId, final String[] measurements) { + if (treePattern == null || treePattern.isRoot() || treePattern.coversDevice(deviceId)) { + // All columns match + final List allIndices = new ArrayList<>(measurements.length); + for (int i = 0; i < measurements.length; i++) { + if (measurements[i] != null) { + allIndices.add(i); + } + } + return allIndices; + } + + final List matchedIndices = new ArrayList<>(); + for (int i = 0; i < measurements.length; i++) { + if (measurements[i] != null && treePattern.matchesMeasurement(deviceId, measurements[i])) { + matchedIndices.add(i); + } + } + return matchedIndices; + } + + /** + * Bulk-copies a typed column array using System.arraycopy. Returns a new array of the same type + * containing the first {@code rowCount} elements. + */ + private Object copyColumnArray( + final TSDataType dataType, final Object sourceColumn, final int rowCount) { + switch (dataType) { + case BOOLEAN: + { + final boolean[] src = (boolean[]) sourceColumn; + final boolean[] dst = new boolean[rowCount]; + System.arraycopy(src, 0, dst, 0, rowCount); + return dst; + } + case INT32: + case DATE: + { + final int[] src = (int[]) sourceColumn; + final int[] dst = new int[rowCount]; + System.arraycopy(src, 0, dst, 0, rowCount); + return dst; + } + case INT64: + case TIMESTAMP: + { + final long[] src = (long[]) sourceColumn; + final long[] dst = new long[rowCount]; + System.arraycopy(src, 0, dst, 0, rowCount); + return dst; + } + case FLOAT: + { + final float[] src = (float[]) sourceColumn; + final float[] dst = new float[rowCount]; + System.arraycopy(src, 0, dst, 0, rowCount); + return dst; + } + case DOUBLE: + { + final double[] src = (double[]) sourceColumn; + final double[] dst = new double[rowCount]; + System.arraycopy(src, 0, dst, 0, rowCount); + return dst; + } + case TEXT: + case BLOB: + case STRING: + { + final Binary[] src = (Binary[]) sourceColumn; + final Binary[] dst = new 
Binary[rowCount]; + System.arraycopy(src, 0, dst, 0, rowCount); + return dst; + } + default: + LOGGER.warn("Unsupported data type for bulk copy: {}", dataType); + return sourceColumn; + } + } + + /** + * Adds a single value to the tablet at the specified position. + * + *

IMPORTANT: In tsfile-2.2.1, Tablet.addTimestamp() calls initBitMapsWithApiUsage() which + * creates bitMaps and marks ALL positions as null via markAll(). Since we write values directly + * to the underlying typed arrays (bypassing the Tablet.addValue() API which would call + * updateBitMap to unmark), we must explicitly unmark the bitmap position to indicate the value is + * NOT null. + */ + private void addValueToTablet( + final Tablet tablet, + final int rowIndex, + final int columnIndex, + final TSDataType dataType, + final Object value) { + switch (dataType) { + case BOOLEAN: + ((boolean[]) tablet.getValues()[columnIndex])[rowIndex] = (boolean) value; + break; + case INT32: + case DATE: + ((int[]) tablet.getValues()[columnIndex])[rowIndex] = (int) value; + break; + case INT64: + case TIMESTAMP: + ((long[]) tablet.getValues()[columnIndex])[rowIndex] = (long) value; + break; + case FLOAT: + ((float[]) tablet.getValues()[columnIndex])[rowIndex] = (float) value; + break; + case DOUBLE: + ((double[]) tablet.getValues()[columnIndex])[rowIndex] = (double) value; + break; + case TEXT: + case BLOB: + case STRING: + ((Binary[]) tablet.getValues()[columnIndex])[rowIndex] = (Binary) value; + break; + default: + LOGGER.warn("Unsupported data type: {}", dataType); + return; + } + // Unmark the bitmap position to indicate this value is NOT null. + // addTimestamp() triggers initBitMapsWithApiUsage() which marks all positions as null. + final BitMap[] bitMaps = tablet.getBitMaps(); + if (bitMaps != null && bitMaps[columnIndex] != null) { + bitMaps[columnIndex].unmark(rowIndex); + } + } + + /** Copies a single column value from the source column array to the tablet. 
*/ + private void copyColumnValue( + final Tablet tablet, + final int targetRowIndex, + final int targetColumnIndex, + final TSDataType dataType, + final Object sourceColumn, + final int sourceRowIndex) { + switch (dataType) { + case BOOLEAN: + ((boolean[]) tablet.getValues()[targetColumnIndex])[targetRowIndex] = + ((boolean[]) sourceColumn)[sourceRowIndex]; + break; + case INT32: + case DATE: + ((int[]) tablet.getValues()[targetColumnIndex])[targetRowIndex] = + ((int[]) sourceColumn)[sourceRowIndex]; + break; + case INT64: + case TIMESTAMP: + ((long[]) tablet.getValues()[targetColumnIndex])[targetRowIndex] = + ((long[]) sourceColumn)[sourceRowIndex]; + break; + case FLOAT: + ((float[]) tablet.getValues()[targetColumnIndex])[targetRowIndex] = + ((float[]) sourceColumn)[sourceRowIndex]; + break; + case DOUBLE: + ((double[]) tablet.getValues()[targetColumnIndex])[targetRowIndex] = + ((double[]) sourceColumn)[sourceRowIndex]; + break; + case TEXT: + case BLOB: + case STRING: + ((Binary[]) tablet.getValues()[targetColumnIndex])[targetRowIndex] = + ((Binary[]) sourceColumn)[sourceRowIndex]; + break; + default: + LOGGER.warn("Unsupported data type for copy: {}", dataType); + return; + } + // Unmark the bitmap position to indicate this value is NOT null. 
+ final BitMap[] bitMaps = tablet.getBitMaps(); + if (bitMaps != null && bitMaps[targetColumnIndex] != null) { + bitMaps[targetColumnIndex].unmark(targetRowIndex); + } + } +} diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/consensus/ConsensusPrefetchingQueue.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/consensus/ConsensusPrefetchingQueue.java new file mode 100644 index 0000000000000..62794cf0fdbe2 --- /dev/null +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/consensus/ConsensusPrefetchingQueue.java @@ -0,0 +1,3498 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.iotdb.db.subscription.broker.consensus; + +import org.apache.iotdb.commons.consensus.ConsensusGroupId; +import org.apache.iotdb.commons.subscription.config.SubscriptionConfig; +import org.apache.iotdb.consensus.common.request.IConsensusRequest; +import org.apache.iotdb.consensus.common.request.IndexedConsensusRequest; +import org.apache.iotdb.consensus.common.request.IoTConsensusRequest; +import org.apache.iotdb.consensus.iot.IoTConsensusServerImpl; +import org.apache.iotdb.consensus.iot.WriterSafeFrontierTracker; +import org.apache.iotdb.consensus.iot.log.ConsensusReqReader; +import org.apache.iotdb.db.conf.IoTDBDescriptor; +import org.apache.iotdb.db.pipe.agent.PipeDataNodeAgent; +import org.apache.iotdb.db.pipe.resource.memory.PipeMemoryWeightUtil; +import org.apache.iotdb.db.queryengine.plan.planner.plan.node.PlanNode; +import org.apache.iotdb.db.queryengine.plan.planner.plan.node.PlanNodeType; +import org.apache.iotdb.db.queryengine.plan.planner.plan.node.write.InsertMultiTabletsNode; +import org.apache.iotdb.db.queryengine.plan.planner.plan.node.write.InsertNode; +import org.apache.iotdb.db.queryengine.plan.planner.plan.node.write.InsertRowNode; +import org.apache.iotdb.db.queryengine.plan.planner.plan.node.write.InsertRowsNode; +import org.apache.iotdb.db.queryengine.plan.planner.plan.node.write.InsertRowsOfOneDeviceNode; +import org.apache.iotdb.db.queryengine.plan.planner.plan.node.write.InsertTabletNode; +import org.apache.iotdb.db.queryengine.plan.planner.plan.node.write.SearchNode; +import org.apache.iotdb.db.storageengine.dataregion.wal.buffer.WALEntry; +import org.apache.iotdb.db.storageengine.dataregion.wal.io.ProgressWALReader; +import org.apache.iotdb.db.storageengine.dataregion.wal.io.WALMetaData; +import org.apache.iotdb.db.storageengine.dataregion.wal.node.WALNode; +import org.apache.iotdb.db.storageengine.dataregion.wal.utils.WALFileUtils; +import org.apache.iotdb.db.subscription.event.SubscriptionEvent; +import 
org.apache.iotdb.db.subscription.metric.ConsensusSubscriptionPrefetchingQueueMetrics; +import org.apache.iotdb.db.subscription.task.execution.ConsensusSubscriptionPrefetchExecutor; +import org.apache.iotdb.db.subscription.task.execution.ConsensusSubscriptionPrefetchExecutorManager; +import org.apache.iotdb.db.subscription.task.subtask.ConsensusPrefetchSubtask; +import org.apache.iotdb.rpc.subscription.config.TopicConfig; +import org.apache.iotdb.rpc.subscription.config.TopicConstant; +import org.apache.iotdb.rpc.subscription.payload.poll.ErrorPayload; +import org.apache.iotdb.rpc.subscription.payload.poll.RegionProgress; +import org.apache.iotdb.rpc.subscription.payload.poll.SubscriptionCommitContext; +import org.apache.iotdb.rpc.subscription.payload.poll.SubscriptionPollResponseType; +import org.apache.iotdb.rpc.subscription.payload.poll.TabletsPayload; +import org.apache.iotdb.rpc.subscription.payload.poll.WatermarkPayload; +import org.apache.iotdb.rpc.subscription.payload.poll.WriterId; +import org.apache.iotdb.rpc.subscription.payload.poll.WriterProgress; + +import org.apache.tsfile.utils.Pair; +import org.apache.tsfile.write.record.Tablet; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.File; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; +import java.util.LinkedHashMap; +import java.util.LinkedHashSet; +import java.util.List; +import java.util.Map; +import java.util.NavigableMap; +import java.util.Objects; +import java.util.PriorityQueue; +import java.util.Set; +import java.util.TreeMap; +import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.LinkedBlockingDeque; +import java.util.concurrent.PriorityBlockingQueue; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicBoolean; +import java.util.concurrent.atomic.AtomicLong; +import java.util.concurrent.locks.ReentrantReadWriteLock; +import 
java.util.function.BiConsumer; +import java.util.function.Function; +import java.util.function.Predicate; +import java.util.function.Supplier; + +import static org.apache.iotdb.rpc.subscription.payload.poll.SubscriptionCommitContext.INVALID_COMMIT_ID; + +public class ConsensusPrefetchingQueue { + + private static final Logger LOGGER = LoggerFactory.getLogger(ConsensusPrefetchingQueue.class); + + private final String brokerId; // consumer group id + private final String topicName; + private final ConsensusGroupId consensusGroupId; + + private final IoTConsensusServerImpl serverImpl; + + private final ConsensusReqReader consensusReqReader; + + private final WakeableIndexedConsensusQueue pendingEntries; + + private static final int PENDING_QUEUE_CAPACITY = 4096; + + private final ConsensusLogToTabletConverter converter; + + private final ConsensusSubscriptionCommitManager commitManager; + + private final AtomicLong seekGeneration; + + /** Internal WAL reader cursor used only for local replay positioning and deduplication. */ + private final AtomicLong nextExpectedSearchIndex; + + private final PriorityBlockingQueue prefetchingQueue; + + private final Map, SubscriptionEvent> inFlightEvents; + + private static final int MAX_PREFETCHING_QUEUE_SIZE = + SubscriptionConfig.getInstance().getSubscriptionConsensusPrefetchingQueueCapacity(); + + private final AtomicLong walGapSkippedEntries = new AtomicLong(0); + + /** Guards queue state transitions that touch replay positioning, seek state, and lane buffers. 
*/ + private final ReentrantReadWriteLock lock = new ReentrantReadWriteLock(true); + + private volatile boolean isClosed = false; + + private volatile boolean closeRequested = false; + + private volatile boolean isActive = true; + + private volatile Set activeWriterNodeIds = Collections.emptySet(); + + private volatile Set runtimeActiveWriterNodeIds = Collections.emptySet(); + + private volatile int preferredWriterNodeId = -1; + + private volatile int previousPreferredWriterNodeId = -1; + + // ======================== Routing Runtime Version ======================== + + private volatile long runtimeVersion = 0; + + private final AtomicLong runtimeVersionChangeCount = new AtomicLong(0); + + // ======================== Unified WAL / Release State ======================== + + private volatile ProgressWALIterator subscriptionWALIterator; + + /** + * Seek requests must not close/reset the WAL iterator from RPC threads because the prefetch + * worker may be reading it concurrently. Instead, seek only records the latest desired reset and + * the queue's next prefetch round applies it after observing the new seek generation. + */ + private volatile long pendingSubscriptionWalResetSearchIndex = Long.MIN_VALUE; + + private volatile long pendingSubscriptionWalResetGeneration = Long.MIN_VALUE; + + // ======================== Watermark ======================== + + /** Maximum data timestamp observed across all InsertNodes processed by this queue. */ + private volatile long maxObservedTimestamp = Long.MIN_VALUE; + + /** Wall-clock time (ms) of last watermark injection. 0 means never injected. */ + private volatile long lastWatermarkEmitTimeMs = 0; + + /** Number of entries accepted from realtime pending queue. */ + private final AtomicLong pendingPathAcceptedEntries = new AtomicLong(0); + + /** Number of entries accepted from WAL-backed paths (historical or catch-up). 
*/ + private final AtomicLong walPathAcceptedEntries = new AtomicLong(0); + + private final Object prefetchBindingLock = new Object(); + + private volatile ConsensusPrefetchSubtask prefetchSubtask; + + private volatile ConsensusSubscriptionPrefetchExecutor prefetchExecutor; + + /** + * Whether the prefetch runtime has been initialized. Starts as false (dormant). Set to true on + * the first poll with a region progress hint or when a seek installs a pending reset. This keeps + * queue creation cheap: realtime entries can be buffered immediately while WAL replay state is + * only built once the queue is actually activated. + */ + private volatile boolean prefetchInitialized = false; + + private volatile PendingSeekRequest pendingSeekRequest; + + private final DeliveryBatchState lingerBatch = new DeliveryBatchState(); + + private volatile long observedSeekGeneration; + + private volatile long lastStatsLogTimeMs = System.currentTimeMillis(); + + private volatile long lastPendingAcceptedEntries = 0L; + + private volatile long lastWalAcceptedEntries = 0L; + + private volatile boolean pendingWalGapRetryRequested = false; + + private volatile long walGapWaitStartTimeMs = 0L; + + private volatile long lastWalGapWaitLogTimeMs = 0L; + + /** Fallback committed region progress from local persisted state. */ + private final RegionProgress fallbackCommittedRegionProgress; + + /** Recovery-time per-writer frontiers used to skip already committed entries after restart. */ + private final Map recoveryWriterProgressByWriter = + new ConcurrentHashMap<>(); + + /** + * Source-level dedup frontier for follower-origin entries that do not carry a local searchIndex. + * The same request may first arrive through pendingEntries and later become visible from WAL; + * once a follower-origin localSeq has already been materialized into queue state, the WAL path + * must not materialize it again. 
+ */ + private final Map materializedFollowerProgressByWriter = + new ConcurrentHashMap<>(); + + /** + * Lane state keyed by writer identity. Release gating reasons in terms of writer lanes and safe + * frontiers instead of a region-level committed frontier. + */ + private final Map writerLanes = new ConcurrentHashMap<>(); + + /** + * Realtime lane buffers used by both pending replay and WAL catch-up so queue materialization + * converges on the same per-writer lane representation before batch delivery. + */ + private final Map> realtimeEntriesByLane = + new ConcurrentHashMap<>(); + + /** + * Local tail position used only when initialization starts without any persisted region progress. + */ + private final long fallbackTailSearchIndex; + + /** Local sequence used to represent the position immediately before a writer's first record. */ + private static final long BEFORE_FIRST_LOCAL_SEQ = -1L; + + /** Writer-progress metadata for the current pending/WAL batch being assembled. */ + private volatile long batchPhysicalTime = 0L; + + private volatile int batchWriterNodeId = -1; + private volatile long batchWriterEpoch = 0L; + private volatile String orderMode = TopicConstant.ORDER_MODE_DEFAULT_VALUE; + + protected enum ReplayLocateStatus { + FOUND, + AT_END, + LOCATE_MISS + } + + protected static final class ReplayLocateDecision { + private final ReplayLocateStatus status; + private final long startSearchIndex; + private final RegionProgress recoveryRegionProgress; + private final String detail; + + private ReplayLocateDecision( + final ReplayLocateStatus status, + final long startSearchIndex, + final RegionProgress recoveryRegionProgress, + final String detail) { + this.status = status; + this.startSearchIndex = startSearchIndex; + this.recoveryRegionProgress = recoveryRegionProgress; + this.detail = detail; + } + + static ReplayLocateDecision found( + final long startSearchIndex, + final RegionProgress recoveryRegionProgress, + final String detail) { + return new 
ReplayLocateDecision( + ReplayLocateStatus.FOUND, startSearchIndex, recoveryRegionProgress, detail); + } + + static ReplayLocateDecision atEnd( + final long startSearchIndex, + final RegionProgress recoveryRegionProgress, + final String detail) { + return new ReplayLocateDecision( + ReplayLocateStatus.AT_END, startSearchIndex, recoveryRegionProgress, detail); + } + + static ReplayLocateDecision locateMiss( + final RegionProgress recoveryRegionProgress, final String detail) { + return new ReplayLocateDecision( + ReplayLocateStatus.LOCATE_MISS, Long.MIN_VALUE, recoveryRegionProgress, detail); + } + + protected ReplayLocateStatus getStatus() { + return status; + } + + protected long getStartSearchIndex() { + return startSearchIndex; + } + + protected RegionProgress getRecoveryRegionProgress() { + return recoveryRegionProgress; + } + + protected String getDetail() { + return detail; + } + } + + private static final class WakeableIndexedConsensusQueue + extends LinkedBlockingDeque { + + private final Runnable wakeupHook; + + private WakeableIndexedConsensusQueue(final int capacity, final Runnable wakeupHook) { + super(capacity); + this.wakeupHook = wakeupHook; + } + + @Override + public boolean offer(final IndexedConsensusRequest request) { + final boolean offered = super.offer(request); + if (offered) { + wakeupHook.run(); + } + return offered; + } + + @Override + public void put(final IndexedConsensusRequest request) throws InterruptedException { + super.put(request); + wakeupHook.run(); + } + } + + private static final class PendingSeekRequest { + + private final long targetSearchIndex; + private final RegionProgress committedRegionProgress; + private final String seekReason; + private final boolean previousPrefetchInitialized; + private final long previousSeekGeneration; + private final long targetSeekGeneration; + + private boolean completed = false; + private RuntimeException failure; + + private PendingSeekRequest( + final long targetSearchIndex, + final 
RegionProgress committedRegionProgress, + final String seekReason, + final boolean previousPrefetchInitialized, + final long previousSeekGeneration, + final long targetSeekGeneration) { + this.targetSearchIndex = targetSearchIndex; + this.committedRegionProgress = committedRegionProgress; + this.seekReason = seekReason; + this.previousPrefetchInitialized = previousPrefetchInitialized; + this.previousSeekGeneration = previousSeekGeneration; + this.targetSeekGeneration = targetSeekGeneration; + } + + private synchronized void complete() { + completed = true; + notifyAll(); + } + + private synchronized void fail(final RuntimeException failure) { + this.failure = failure; + completed = true; + notifyAll(); + } + + private synchronized void awaitCompletion() { + while (!completed) { + try { + wait(50L); + } catch (final InterruptedException e) { + Thread.currentThread().interrupt(); + throw new RuntimeException("Interrupted while waiting for seek application", e); + } + } + if (failure != null) { + throw failure; + } + } + } + + public ConsensusPrefetchingQueue( + final String brokerId, + final String topicName, + final String orderMode, + final ConsensusGroupId consensusGroupId, + final IoTConsensusServerImpl serverImpl, + final ConsensusLogToTabletConverter converter, + final ConsensusSubscriptionCommitManager commitManager, + final RegionProgress fallbackCommittedRegionProgress, + final long tailStartSearchIndex, + final long initialRuntimeVersion, + final boolean initialActive) { + this.brokerId = brokerId; + this.topicName = topicName; + this.consensusGroupId = consensusGroupId; + this.serverImpl = serverImpl; + this.consensusReqReader = serverImpl.getConsensusReqReader(); + this.converter = converter; + this.commitManager = commitManager; + this.fallbackCommittedRegionProgress = fallbackCommittedRegionProgress; + this.fallbackTailSearchIndex = tailStartSearchIndex; + this.runtimeVersion = initialRuntimeVersion; + this.isActive = initialActive; + this.orderMode = 
TopicConfig.normalizeOrderMode(orderMode); + + this.seekGeneration = new AtomicLong(0); + this.nextExpectedSearchIndex = new AtomicLong(tailStartSearchIndex); + + this.prefetchingQueue = new PriorityBlockingQueue<>(); + this.inFlightEvents = new ConcurrentHashMap<>(); + this.observedSeekGeneration = seekGeneration.get(); + + // Register pending queue early so we don't miss real-time writes + this.pendingEntries = + new WakeableIndexedConsensusQueue(PENDING_QUEUE_CAPACITY, this::requestPrefetch); + serverImpl.registerSubscriptionQueue(pendingEntries); + + LOGGER.info( + "ConsensusPrefetchingQueue created (dormant): brokerId={}, topicName={}, " + + "orderMode={}, consensusGroupId={}, fallbackCommittedRegionProgress={}, " + + "fallbackTailSearchIndex={}, initialRuntimeVersion={}, initialActive={}", + brokerId, + topicName, + this.orderMode, + consensusGroupId, + fallbackCommittedRegionProgress, + tailStartSearchIndex, + initialRuntimeVersion, + initialActive); + + // Register metrics + ConsensusSubscriptionPrefetchingQueueMetrics.getInstance().register(this); + } + + // ======================== Lock Operations ======================== + + private void acquireReadLock() { + lock.readLock().lock(); + } + + private void releaseReadLock() { + lock.readLock().unlock(); + } + + private void acquireWriteLock() { + lock.writeLock().lock(); + } + + private void releaseWriteLock() { + lock.writeLock().unlock(); + } + + private void requestPrefetch() { + if (closeRequested || isClosed) { + return; + } + final ConsensusPrefetchSubtask subtask = ensurePrefetchSubtaskBound(); + if (Objects.nonNull(subtask)) { + subtask.requestWakeupNow(); + } + } + + private ConsensusPrefetchSubtask ensurePrefetchSubtaskBound() { + if (closeRequested || isClosed) { + return null; + } + + final ConsensusSubscriptionPrefetchExecutor currentExecutor = + ConsensusSubscriptionPrefetchExecutorManager.getInstance().getExecutor(); + if (Objects.isNull(currentExecutor)) { + return null; + } + + final 
ConsensusPrefetchSubtask currentSubtask = prefetchSubtask; + if (Objects.nonNull(currentSubtask) + && prefetchExecutor == currentExecutor + && !currentSubtask.isClosed()) { + return currentSubtask; + } + + synchronized (prefetchBindingLock) { + if (closeRequested || isClosed) { + return null; + } + + if (Objects.nonNull(prefetchSubtask) + && prefetchExecutor == currentExecutor + && !prefetchSubtask.isClosed()) { + return prefetchSubtask; + } + + final ConsensusPrefetchSubtask staleSubtask = prefetchSubtask; + final ConsensusSubscriptionPrefetchExecutor staleExecutor = prefetchExecutor; + if (Objects.nonNull(staleSubtask) + && Objects.nonNull(staleExecutor) + && (staleExecutor != currentExecutor || staleSubtask.isClosed()) + && !staleExecutor.isShutdown()) { + staleExecutor.deregister(staleSubtask.getTaskId()); + } + + final ConsensusPrefetchSubtask newSubtask = new ConsensusPrefetchSubtask(this); + if (!currentExecutor.register(newSubtask)) { + return null; + } + prefetchExecutor = currentExecutor; + prefetchSubtask = newSubtask; + return newSubtask; + } + } + + private Pair + detachPrefetchSubtask() { + synchronized (prefetchBindingLock) { + final Pair detached = + new Pair<>(prefetchExecutor, prefetchSubtask); + prefetchExecutor = null; + prefetchSubtask = null; + return detached; + } + } + + private boolean shouldRecoverPrefetchBindingAfterEmptyPoll() { + if (!prefetchInitialized || isClosed || closeRequested || pendingSeekRequest != null) { + return false; + } + + final ConsensusSubscriptionPrefetchExecutor currentExecutor = + ConsensusSubscriptionPrefetchExecutorManager.getInstance().getExecutor(); + if (Objects.isNull(currentExecutor)) { + return false; + } + + final ConsensusPrefetchSubtask currentSubtask = prefetchSubtask; + final boolean bindingMissing = + Objects.isNull(currentSubtask) + || currentSubtask.isClosed() + || Objects.isNull(prefetchExecutor) + || prefetchExecutor.isShutdown() + || prefetchExecutor != currentExecutor; + if (!bindingMissing) { + 
return false; + } + + return hasImmediatePrefetchableWork() + || hasHistoricalWalLag() + || !lingerBatch.isEmpty() + || !inFlightEvents.isEmpty() + || computeWatermarkDelayMs() > 0L; + } + + // ======================== Poll ======================== + + public SubscriptionEvent poll(final String consumerId) { + return poll(consumerId, null); + } + + public SubscriptionEvent poll(final String consumerId, final RegionProgress regionProgress) { + acquireReadLock(); + try { + if (isClosed || closeRequested || !isActive) { + return null; + } + if (!prefetchInitialized) { + initPrefetch(regionProgress); + } + if (pendingSeekRequest != null) { + return null; + } + final SubscriptionEvent event = pollInternal(consumerId); + if (Objects.nonNull(event) && prefetchingQueue.size() < MAX_PREFETCHING_QUEUE_SIZE) { + requestPrefetch(); + } else if (Objects.isNull(event) && shouldRecoverPrefetchBindingAfterEmptyPoll()) { + requestPrefetch(); + } + return event; + } finally { + releaseReadLock(); + } + } + + private synchronized void initPrefetch(final RegionProgress regionProgress) { + if (prefetchInitialized) { + return; // double-check under synchronization + } + + final RegionProgress committedRegionProgress = resolveCommittedRegionProgressForInit(); + final boolean useConsumerHint = + shouldUseConsumerRegionProgressHint(regionProgress, committedRegionProgress); + final RegionProgress recoveryRegionProgress = + useConsumerHint + ? mergeRecoveryRegionProgress(committedRegionProgress, regionProgress) + : committedRegionProgress; + final String progressSource = + useConsumerHint + ? Objects.nonNull(committedRegionProgress) + && !committedRegionProgress.getWriterPositions().isEmpty() + ? 
"merged committed region progress with consumer topic progress hint" + : "consumer topic progress hint" + : "committed region progress fallback"; + final ReplayLocateDecision resolvedStart = + resolveInitReplayStartDecision(recoveryRegionProgress, progressSource); + + clearRecoveryWriterProgress(); + final RegionProgress effectiveRecoveryRegionProgress = + resolvedStart.getRecoveryRegionProgress(); + if (Objects.nonNull(effectiveRecoveryRegionProgress) + && !effectiveRecoveryRegionProgress.getWriterPositions().isEmpty()) { + installRecoveryWriterProgress(effectiveRecoveryRegionProgress); + } + + this.nextExpectedSearchIndex.set(resolvedStart.getStartSearchIndex()); + if (consensusReqReader instanceof WALNode) { + this.subscriptionWALIterator = + new ProgressWALIterator( + (WALNode) consensusReqReader, resolvedStart.getStartSearchIndex()); + } + this.prefetchInitialized = true; + this.observedSeekGeneration = seekGeneration.get(); + this.lingerBatch.reset(); + resetBatchWriterProgress(); + + LOGGER.info( + "ConsensusPrefetchingQueue {}: prefetch initialized, startSearchIndex={}, progressSource={}, recoveryWriterCount={}", + this, + resolvedStart.getStartSearchIndex(), + resolvedStart.getDetail(), + recoveryWriterProgressByWriter.size()); + + requestPrefetch(); + } + + private ReplayLocateDecision resolveInitReplayStartDecision( + final RegionProgress recoveryRegionProgress, final String progressSource) { + if (Objects.isNull(recoveryRegionProgress) + || recoveryRegionProgress.getWriterPositions().isEmpty()) { + return ReplayLocateDecision.found( + fallbackTailSearchIndex, + new RegionProgress(Collections.emptyMap()), + progressSource + " (tail start without progress)"); + } + if (!(consensusReqReader instanceof WALNode)) { + throw new IllegalStateException( + String.format( + "ConsensusPrefetchingQueue %s: cannot recover from non-empty region progress without WAL access: %s", + this, recoveryRegionProgress)); + } + + final ReplayLocateDecision replayTarget = + 
locateReplayStartForRegionProgress(recoveryRegionProgress, true); + switch (replayTarget.getStatus()) { + case FOUND: + case AT_END: + return new ReplayLocateDecision( + replayTarget.getStatus(), + replayTarget.getStartSearchIndex(), + replayTarget.getRecoveryRegionProgress(), + progressSource + " (" + replayTarget.getDetail() + ")"); + case LOCATE_MISS: + default: + throw new IllegalStateException( + String.format( + "ConsensusPrefetchingQueue %s: cannot initialize replay start from region progress %s: %s", + this, recoveryRegionProgress, replayTarget.getDetail())); + } + } + + private boolean shouldUseConsumerRegionProgressHint( + final RegionProgress regionProgress, final RegionProgress committedRegionProgress) { + if (Objects.isNull(regionProgress) || regionProgress.getWriterPositions().isEmpty()) { + return false; + } + if (Objects.isNull(committedRegionProgress) + || committedRegionProgress.getWriterPositions().isEmpty()) { + return true; + } + for (final Map.Entry entry : + regionProgress.getWriterPositions().entrySet()) { + if (Objects.isNull(entry.getKey()) || Objects.isNull(entry.getValue())) { + continue; + } + final WriterProgress committedWriterProgress = + committedRegionProgress.getWriterPositions().get(entry.getKey()); + if (Objects.isNull(committedWriterProgress) + || compareWriterProgress(entry.getValue(), committedWriterProgress) > 0) { + return true; + } + } + return false; + } + + private RegionProgress mergeRecoveryRegionProgress( + final RegionProgress committedRegionProgress, final RegionProgress consumerRegionProgress) { + if (Objects.isNull(committedRegionProgress) + || committedRegionProgress.getWriterPositions().isEmpty()) { + return consumerRegionProgress; + } + if (Objects.isNull(consumerRegionProgress) + || consumerRegionProgress.getWriterPositions().isEmpty()) { + return committedRegionProgress; + } + + final Map mergedWriterProgress = new LinkedHashMap<>(); + committedRegionProgress + .getWriterPositions() + .forEach( + (writerId, 
writerProgress) -> { + if (Objects.nonNull(writerId) && Objects.nonNull(writerProgress)) { + mergedWriterProgress.put(writerId, writerProgress); + } + }); + consumerRegionProgress + .getWriterPositions() + .forEach( + (writerId, writerProgress) -> { + if (Objects.isNull(writerId) || Objects.isNull(writerProgress)) { + return; + } + mergedWriterProgress.merge( + writerId, + writerProgress, + (committedWriterProgress, consumerWriterProgress) -> + compareWriterProgress(consumerWriterProgress, committedWriterProgress) > 0 + ? consumerWriterProgress + : committedWriterProgress); + }); + return new RegionProgress(mergedWriterProgress); + } + + protected RegionProgress resolveCommittedRegionProgressForInit() { + commitManager.getOrCreateState(brokerId, topicName, consensusGroupId); + final RegionProgress latestCommittedRegionProgress = + commitManager.getCommittedRegionProgress(brokerId, topicName, consensusGroupId); + if (Objects.nonNull(latestCommittedRegionProgress) + && !latestCommittedRegionProgress.getWriterPositions().isEmpty()) { + return latestCommittedRegionProgress; + } + return Objects.nonNull(fallbackCommittedRegionProgress) + && !fallbackCommittedRegionProgress.getWriterPositions().isEmpty() + ? 
fallbackCommittedRegionProgress + : null; + } + + private void installRecoveryWriterProgress(final RegionProgress regionProgress) { + recoveryWriterProgressByWriter.clear(); + recoveryWriterProgressByWriter.putAll(regionProgress.getWriterPositions()); + regionProgress + .getWriterPositions() + .keySet() + .forEach(writerId -> trackWriterLane(writerId.getNodeId(), writerId.getWriterEpoch())); + } + + private void clearRecoveryWriterProgress() { + recoveryWriterProgressByWriter.clear(); + } + + private boolean shouldSkipForRecoveryProgress(final IndexedConsensusRequest request) { + if (recoveryWriterProgressByWriter.isEmpty()) { + return false; + } + return isRequestCoveredByRegionProgress(request, recoveryWriterProgressByWriter, true); + } + + private boolean hasComparableWriterProgress(final IndexedConsensusRequest request) { + return request.getNodeId() >= 0 + && request.getWriterEpoch() >= 0 + && request.getPhysicalTime() > 0 + && request.getProgressLocalSeq() >= 0; + } + + private WriterId toWriterId(final IndexedConsensusRequest request) { + return new WriterId(consensusGroupId.toString(), request.getNodeId(), request.getWriterEpoch()); + } + + private WriterProgress toWriterProgress(final IndexedConsensusRequest request) { + return new WriterProgress(request.getPhysicalTime(), request.getProgressLocalSeq()); + } + + private boolean isRequestCoveredByRegionProgress( + final IndexedConsensusRequest request, + final Map regionProgressByWriter, + final boolean seekAfter) { + if (!hasComparableWriterProgress(request)) { + return false; + } + final WriterProgress committedProgress = regionProgressByWriter.get(toWriterId(request)); + if (Objects.isNull(committedProgress)) { + return false; + } + final int cmp = compareWriterProgress(toWriterProgress(request), committedProgress); + return seekAfter ? 
cmp <= 0 : cmp < 0; + } + + private WriterProgress decrementWriterProgress(final WriterProgress writerProgress) { + return new WriterProgress( + writerProgress.getPhysicalTime(), + writerProgress.getLocalSeq() > 0L + ? writerProgress.getLocalSeq() - 1L + : BEFORE_FIRST_LOCAL_SEQ); + } + + protected ReplayLocateDecision scanReplayStartForRequests( + final Iterable requests, + final RegionProgress regionProgress, + final boolean seekAfter) { + final Map requestedWriterProgress = new LinkedHashMap<>(); + if (Objects.nonNull(regionProgress)) { + regionProgress + .getWriterPositions() + .forEach( + (writerId, writerProgress) -> { + if (Objects.nonNull(writerId) && Objects.nonNull(writerProgress)) { + requestedWriterProgress.put(writerId, writerProgress); + } + }); + } + final Map effectiveRecoveryWriterProgress = + new LinkedHashMap<>(requestedWriterProgress); + final Set exactVisibleWriterIds = new LinkedHashSet<>(); + Long firstUncoveredReplayableSearchIndex = null; + boolean sawBlockingNonReplayableUncovered = false; + + for (final IndexedConsensusRequest request : requests) { + if (!hasComparableWriterProgress(request)) { + continue; + } + + final WriterId writerId = toWriterId(request); + final WriterProgress requestProgress = toWriterProgress(request); + final WriterProgress storedWriterProgress = requestedWriterProgress.get(writerId); + if (!seekAfter + && Objects.nonNull(storedWriterProgress) + && compareWriterProgress(requestProgress, storedWriterProgress) == 0) { + exactVisibleWriterIds.add(writerId); + } + + if (isRequestCoveredByRegionProgress(request, requestedWriterProgress, seekAfter)) { + continue; + } + + if (request.getSearchIndex() >= 0) { + if (Objects.isNull(firstUncoveredReplayableSearchIndex)) { + firstUncoveredReplayableSearchIndex = request.getSearchIndex(); + } + } else if (Objects.isNull(firstUncoveredReplayableSearchIndex)) { + sawBlockingNonReplayableUncovered = true; + } + } + + if (!seekAfter && !exactVisibleWriterIds.isEmpty()) { + for 
(final WriterId writerId : exactVisibleWriterIds) { + final WriterProgress writerProgress = requestedWriterProgress.get(writerId); + if (Objects.nonNull(writerProgress)) { + effectiveRecoveryWriterProgress.put(writerId, decrementWriterProgress(writerProgress)); + } + } + } + final RegionProgress effectiveRecoveryRegionProgress = + new RegionProgress(effectiveRecoveryWriterProgress); + + if (sawBlockingNonReplayableUncovered) { + return ReplayLocateDecision.locateMiss( + effectiveRecoveryRegionProgress, + "uncovered non-replayable WAL records appear before the first local replayable record"); + } + if (Objects.nonNull(firstUncoveredReplayableSearchIndex)) { + return ReplayLocateDecision.found( + firstUncoveredReplayableSearchIndex, + effectiveRecoveryRegionProgress, + "resolved first uncovered replayable WAL record"); + } + return ReplayLocateDecision.atEnd( + consensusReqReader.getCurrentSearchIndex(), + computeTailRegionProgress(), + "all locally replayable WAL records are already covered"); + } + + protected ReplayLocateDecision locateReplayStartForRegionProgress( + final RegionProgress regionProgress, final boolean seekAfter) { + if (!(consensusReqReader instanceof WALNode)) { + return ReplayLocateDecision.locateMiss( + regionProgress, "WAL access is unavailable for region-level replay lookup"); + } + + final WALNode walNode = (WALNode) consensusReqReader; + final List replayRequests = new ArrayList<>(); + try (final ProgressWALIterator iterator = new ProgressWALIterator(walNode, Long.MIN_VALUE)) { + while (iterator.hasNext()) { + replayRequests.add(iterator.next()); + } + if (iterator.hasIncompleteScan()) { + return ReplayLocateDecision.locateMiss( + regionProgress, + "replay lookup did not complete: " + iterator.getIncompleteScanDetail()); + } + return scanReplayStartForRequests(replayRequests, regionProgress, seekAfter); + } catch (final IOException e) { + return ReplayLocateDecision.locateMiss( + regionProgress, "failed to close replay lookup iterator: " + 
e.getMessage()); + } + } + + private boolean shouldTrackFollowerProgressForDedup(final IndexedConsensusRequest request) { + return request.getSearchIndex() < 0 + && request.getNodeId() >= 0 + && request.getWriterEpoch() >= 0 + && request.getProgressLocalSeq() >= 0; + } + + private boolean shouldSkipForMaterializedFollowerProgress(final IndexedConsensusRequest request) { + if (!shouldTrackFollowerProgressForDedup(request)) { + return false; + } + final Long materializedLocalSeq = + materializedFollowerProgressByWriter.get( + new WriterLaneId(request.getNodeId(), request.getWriterEpoch())); + return Objects.nonNull(materializedLocalSeq) + && request.getProgressLocalSeq() <= materializedLocalSeq; + } + + private void markMaterializedFollowerProgress(final IndexedConsensusRequest request) { + if (!shouldTrackFollowerProgressForDedup(request)) { + return; + } + materializedFollowerProgressByWriter.merge( + new WriterLaneId(request.getNodeId(), request.getWriterEpoch()), + request.getProgressLocalSeq(), + Math::max); + } + + private int compareWriterProgress( + final WriterProgress leftProgress, final WriterProgress rightProgress) { + int cmp = Long.compare(leftProgress.getPhysicalTime(), rightProgress.getPhysicalTime()); + if (cmp != 0) { + return cmp; + } + return Long.compare(leftProgress.getLocalSeq(), rightProgress.getLocalSeq()); + } + + private WriterLaneState trackWriterLane(final int writerNodeId, final long writerEpoch) { + return writerLanes.computeIfAbsent( + new WriterLaneId(writerNodeId, writerEpoch), ignored -> new WriterLaneState()); + } + + private void refreshWriterLaneSafeFrontiers() { + final Map safePts = + serverImpl.getWriterSafeFrontierTracker().snapshotEffectiveSafePts(); + for (final Map.Entry entry : + safePts.entrySet()) { + final WriterLaneState laneState = + trackWriterLane(entry.getKey().getWriterNodeId(), entry.getKey().getWriterEpoch()); + laneState.effectiveSafePt = Math.max(laneState.effectiveSafePt, entry.getValue()); + } + } + + 
private PriorityQueue buildLaneFrontiers( + final Map laneEntriesByLane, final Function headSupplier) { + refreshWriterLaneSafeFrontiers(); + final PriorityQueue frontiers = new PriorityQueue<>(); + final boolean useActiveWriterBarriers = shouldUseActiveWriterBarriers(); + final Set laneIds = ConcurrentHashMap.newKeySet(); + final Set seenActiveWriterNodeIds = ConcurrentHashMap.newKeySet(); + laneIds.addAll(writerLanes.keySet()); + laneIds.addAll(laneEntriesByLane.keySet()); + for (final WriterLaneId laneId : laneIds) { + final WriterLaneState laneState = writerLanes.get(laneId); + if (Objects.nonNull(laneState) && laneState.closed) { + continue; + } + final T head = headSupplier.apply(laneId); + if (Objects.nonNull(head)) { + if (isLaneRuntimeActive(laneId)) { + seenActiveWriterNodeIds.add(laneId.writerNodeId); + } + frontiers.add(LaneFrontier.forHead(laneId, head)); + continue; + } + if (Objects.nonNull(laneState) + && laneState.effectiveSafePt > 0 + && useActiveWriterBarriers + && isLaneRuntimeActive(laneId)) { + seenActiveWriterNodeIds.add(laneId.writerNodeId); + frontiers.add(LaneFrontier.forBarrier(laneId, laneState.effectiveSafePt)); + } + } + if (useActiveWriterBarriers) { + for (final Integer activeWriterNodeId : activeWriterNodeIds) { + if (!seenActiveWriterNodeIds.contains(activeWriterNodeId)) { + frontiers.add( + LaneFrontier.forBarrier(new WriterLaneId(activeWriterNodeId, 0L), Long.MIN_VALUE)); + break; + } + } + } + return frontiers; + } + + private boolean shouldUseActiveWriterBarriers() { + return !TopicConstant.ORDER_MODE_PER_WRITER_VALUE.equals(orderMode); + } + + private void bufferRealtimeEntry(final PreparedEntry entry) { + final WriterLaneId laneId = new WriterLaneId(entry.writerNodeId, entry.writerEpoch); + realtimeEntriesByLane + .computeIfAbsent(laneId, ignored -> new TreeMap<>()) + .put(entry.localSeq, entry); + } + + private PreparedEntry peekRealtimeEntry(final WriterLaneId laneId) { + final NavigableMap laneEntries = 
realtimeEntriesByLane.get(laneId); + if (Objects.isNull(laneEntries) || laneEntries.isEmpty()) { + return null; + } + final Map.Entry firstEntry = laneEntries.firstEntry(); + return Objects.nonNull(firstEntry) ? firstEntry.getValue() : null; + } + + private void removeRealtimeEntry(final WriterLaneId laneId, final long localSeq) { + final NavigableMap laneEntries = realtimeEntriesByLane.get(laneId); + if (Objects.isNull(laneEntries)) { + return; + } + laneEntries.remove(localSeq); + if (laneEntries.isEmpty()) { + realtimeEntriesByLane.remove(laneId); + } + } + + private PriorityQueue buildRealtimeLaneFrontiers() { + return buildLaneFrontiers(realtimeEntriesByLane, this::peekRealtimeEntry); + } + + private SubscriptionEvent pollInternal(final String consumerId) { + final long size = prefetchingQueue.size(); + if (size == 0) { + LOGGER.debug( + "ConsensusPrefetchingQueue {}: prefetching queue is empty for consumerId={}, " + + "pendingEntriesSize={}, nextExpected={}, isClosed={}, prefetchInitialized={}, subtaskScheduled={}", + this, + consumerId, + pendingEntries.size(), + nextExpectedSearchIndex.get(), + isClosed, + prefetchInitialized, + Objects.nonNull(prefetchSubtask) && prefetchSubtask.isScheduledOrRunning()); + return null; + } + + LOGGER.debug( + "ConsensusPrefetchingQueue {}: polling, queue size={}, consumerId={}", + this, + size, + consumerId); + long count = 0; + + SubscriptionEvent event; + try { + while (count++ < size + && Objects.nonNull( + event = + prefetchingQueue.poll( + SubscriptionConfig.getInstance().getSubscriptionPollMaxBlockingTimeMs(), + TimeUnit.MILLISECONDS))) { + // Metadata events (currently WATERMARK) are fire-and-forget: + // skip inFlightEvents tracking so they are not recycled and re-delivered indefinitely. 
+ if (event.getCurrentResponse().getResponseType() + == SubscriptionPollResponseType.WATERMARK.getType()) { + return event; + } + + if (event.isCommitted()) { + LOGGER.warn( + "ConsensusPrefetchingQueue {} poll committed event {} (broken invariant), remove it", + this, + event); + continue; + } + + if (!event.pollable()) { + LOGGER.warn( + "ConsensusPrefetchingQueue {} poll non-pollable event {} (broken invariant), nack it", + this, + event); + event.nack(); + continue; + } + + // Mark as polled before updating inFlightEvents + event.recordLastPolledTimestamp(); + inFlightEvents.put(new Pair<>(consumerId, event.getCommitContext()), event); + event.recordLastPolledConsumerId(consumerId); + return event; + } + } catch (final InterruptedException e) { + Thread.currentThread().interrupt(); + LOGGER.warn("ConsensusPrefetchingQueue {} interrupted while polling", this, e); + } + + return null; + } + + public SubscriptionEvent pollTablets( + final String consumerId, final SubscriptionCommitContext commitContext, final int offset) { + acquireReadLock(); + try { + if (isClosed || closeRequested || pendingSeekRequest != null) { + return null; + } + final SubscriptionEvent event = inFlightEvents.get(new Pair<>(consumerId, commitContext)); + if (Objects.isNull(event)) { + if (isCommitContextOutdated(commitContext)) { + return generateOutdatedErrorResponse(); + } + return generateErrorResponse( + String.format( + "ConsensusPrefetchingQueue %s: no in-flight event for consumer %s, commit context %s", + this, consumerId, commitContext)); + } + return event; + } finally { + releaseReadLock(); + } + } + + // ======================== Prefetch Round Drive ======================== + + private static final long WAL_GAP_RETRY_SLEEP_MS = 10L; + private static final long WAL_GAP_WAIT_LOG_INTERVAL_MS = 5_000L; + + private static final long PREFETCH_STATS_LOG_INTERVAL_MS = 5_000L; + + public PrefetchRoundResult drivePrefetchOnce() { + if (applyPendingSeekRequestIfNecessary()) { + return 
closeRequested ? PrefetchRoundResult.dormant() : PrefetchRoundResult.rescheduleNow(); + } + + acquireReadLock(); + try { + if (isClosed || closeRequested || !prefetchInitialized) { + return PrefetchRoundResult.dormant(); + } + + logPeriodicStatsIfNecessary(); + + final long currentSeekGeneration = seekGeneration.get(); + if (currentSeekGeneration != observedSeekGeneration) { + resetRoundStateForSeek(currentSeekGeneration); + } + + applyPendingSubscriptionWalReset(observedSeekGeneration); + recycleInFlightEvents(); + + if (!isActive || prefetchingQueue.size() >= MAX_PREFETCHING_QUEUE_SIZE) { + return computeIdleRoundResult(); + } + + final SubscriptionConfig config = SubscriptionConfig.getInstance(); + final int maxWalEntries = config.getSubscriptionConsensusBatchMaxWalEntries(); + final int batchMaxDelayMs = config.getSubscriptionConsensusBatchMaxDelayInMs(); + final int maxTablets = config.getSubscriptionConsensusBatchMaxTabletCount(); + final long maxBatchBytes = config.getSubscriptionConsensusBatchMaxSizeInBytes(); + + final List batch = drainPendingEntries(maxWalEntries); + if (!batch.isEmpty()) { + LOGGER.debug( + "ConsensusPrefetchingQueue {}: drained {} entries from pendingEntries, " + + "first searchIndex={}, last searchIndex={}, nextExpected={}, " + + "prefetchingQueueSize={}", + this, + batch.size(), + batch.get(0).getSearchIndex(), + batch.get(batch.size() - 1).getSearchIndex(), + nextExpectedSearchIndex.get(), + prefetchingQueue.size()); + + final boolean batchAccepted = + accumulateFromPending( + batch, lingerBatch, observedSeekGeneration, maxTablets, maxBatchBytes); + if (!batchAccepted) { + if (pendingWalGapRetryRequested) { + // Once a drained batch hits an unresolved WAL gap, the affected suffix falls back to + // the WAL path on later rounds instead of being requeued into the bounded pending path. 
+ return PrefetchRoundResult.rescheduleAfter(WAL_GAP_RETRY_SLEEP_MS); + } + resetRoundStateForSeek(seekGeneration.get()); + return PrefetchRoundResult.rescheduleNow(); + } + } + + if (batch.isEmpty() && lingerBatch.isEmpty()) { + tryCatchUpFromWAL(observedSeekGeneration); + } + + if (!drainBufferedRealtimeLanes( + lingerBatch, observedSeekGeneration, maxTablets, maxBatchBytes)) { + resetRoundStateForSeek(seekGeneration.get()); + return PrefetchRoundResult.rescheduleNow(); + } + + if (!lingerBatch.isEmpty() && lingerBatch.firstTabletTimeMs > 0L) { + final long lingerElapsedMs = System.currentTimeMillis() - lingerBatch.firstTabletTimeMs; + if (lingerElapsedMs >= batchMaxDelayMs) { + if (seekGeneration.get() != observedSeekGeneration) { + resetRoundStateForSeek(seekGeneration.get()); + return PrefetchRoundResult.rescheduleNow(); + } + LOGGER.debug( + "ConsensusPrefetchingQueue {}: time-based flush, {} tablets lingered for {}ms " + + "(threshold={}ms)", + this, + lingerBatch.tablets.size(), + lingerElapsedMs, + batchMaxDelayMs); + flushBatch(lingerBatch, observedSeekGeneration); + } + } + + maybeInjectWatermark(); + return computeIdleRoundResult(); + } catch (final Throwable fatal) { + LOGGER.error( + "ConsensusPrefetchingQueue {}: prefetch round failed " + "(type={}, message={})", + this, + fatal.getClass().getName(), + fatal.getMessage(), + fatal); + if (fatal instanceof VirtualMachineError) { + markClosed(); + return PrefetchRoundResult.dormant(); + } + return PrefetchRoundResult.rescheduleAfter(100L); + } finally { + releaseReadLock(); + } + } + + private void logPeriodicStatsIfNecessary() { + final long nowMs = System.currentTimeMillis(); + if (nowMs - lastStatsLogTimeMs < PREFETCH_STATS_LOG_INTERVAL_MS) { + return; + } + + final long currentPendingAcceptedEntries = pendingPathAcceptedEntries.get(); + final long currentWalAcceptedEntries = walPathAcceptedEntries.get(); + LOGGER.info( + "ConsensusPrefetchingQueue {}: periodic stats, lag={}, pendingDelta={}, 
walDelta={}, " + + "pendingTotal={}, walTotal={}, pendingQueueSize={}, prefetchingQueueSize={}, " + + "inFlightEventsSize={}, realtimeLaneCount={}, walHasNext={}, isActive={}, subtaskScheduled={}", + this, + getLag(), + currentPendingAcceptedEntries - lastPendingAcceptedEntries, + currentWalAcceptedEntries - lastWalAcceptedEntries, + currentPendingAcceptedEntries, + currentWalAcceptedEntries, + pendingEntries.size(), + prefetchingQueue.size(), + inFlightEvents.size(), + realtimeEntriesByLane.size(), + hasReadableWalEntries(), + isActive, + Objects.nonNull(prefetchSubtask) && prefetchSubtask.isScheduledOrRunning()); + lastStatsLogTimeMs = nowMs; + lastPendingAcceptedEntries = currentPendingAcceptedEntries; + lastWalAcceptedEntries = currentWalAcceptedEntries; + } + + private void resetRoundStateForSeek(final long newSeekGeneration) { + restorePendingSubscriptionWalCursor(newSeekGeneration); + lingerBatch.reset(); + resetBatchWriterProgress(); + observedSeekGeneration = newSeekGeneration; + } + + private List drainPendingEntries(final int maxWalEntries) { + final List batch = new ArrayList<>(); + IndexedConsensusRequest next; + while (batch.size() < maxWalEntries && (next = pendingEntries.poll()) != null) { + batch.add(next); + } + return batch; + } + + private PrefetchRoundResult computeIdleRoundResult() { + if (isClosed || !prefetchInitialized || !isActive) { + return PrefetchRoundResult.dormant(); + } + if (prefetchingQueue.size() >= MAX_PREFETCHING_QUEUE_SIZE) { + return PrefetchRoundResult.dormant(); + } + if (hasImmediatePrefetchableWork()) { + return PrefetchRoundResult.rescheduleNow(); + } + long delayMs = Long.MAX_VALUE; + if (hasHistoricalWalLag()) { + delayMs = Math.min(delayMs, WAL_GAP_RETRY_SLEEP_MS); + } + if (!lingerBatch.isEmpty() && lingerBatch.firstTabletTimeMs > 0L) { + final long lingerDelayMs = + SubscriptionConfig.getInstance().getSubscriptionConsensusBatchMaxDelayInMs() + - (System.currentTimeMillis() - lingerBatch.firstTabletTimeMs); + delayMs 
/*
 * NOTE(review): the paragraph below used to sit here as the Javadoc of hasLocalSearchIndex(...),
 * but it describes the pending-accumulation path (accumulateFromPending(...)). It is kept as a
 * plain comment so that it no longer documents the wrong method.
 *
 * Accumulates tablets from pending entries into the linger buffer. When pending replay outruns
 * the local WAL reader, this method backfills the local-index gap from WAL before continuing.
 * Returns false if the batch became stale because seek generation changed while flushing.
 */

/**
 * Tells whether the request carries a local search index.
 *
 * @param request the request to inspect
 * @return {@code true} if {@code request.getSearchIndex()} is non-negative
 */
private static boolean hasLocalSearchIndex(final IndexedConsensusRequest request) {
  return request.getSearchIndex() >= 0;
}
" + + "Filling {} entries from WAL.", + this, + expected, + searchIndex, + searchIndex - expected); + if (!fillGapFromWAL( + expected, + searchIndex, + lingerBatch, + expectedSeekGeneration, + maxTablets, + maxBatchBytes)) { + return false; + } + } + + if (isBeforeLocalCursor(request)) { + skippedCount++; + continue; + } + + if (shouldSkipForRecoveryProgress(request)) { + skippedCount++; + advanceLocalCursorIfPresent(request); + continue; + } + if (shouldSkipForMaterializedFollowerProgress(request)) { + skippedCount++; + advanceLocalCursorIfPresent(request); + continue; + } + + if (!appendRealtimeRequest( + request, lingerBatch, expectedSeekGeneration, maxTablets, maxBatchBytes, true)) { + return false; + } + markMaterializedFollowerProgress(request); + processedCount++; + advanceLocalCursorIfPresent(request); + } + + LOGGER.debug( + "ConsensusPrefetchingQueue {}: accumulate complete, batchSize={}, processed={}, " + + "skipped={}, lingerTablets={}, nextExpected={}", + this, + batch.size(), + processedCount, + skippedCount, + lingerBatch.tablets.size(), + nextExpectedSearchIndex.get()); + + return true; + } + + /** + * Fills a gap in the pending queue by reading entries from WAL so the internal local replay + * cursor stays contiguous even when pending delivery jumps ahead of the WAL iterator. + * + *

Temporary WAL visibility lag is treated as a normal back-pressure condition: once a drained + * pending batch encounters an unresolved local-index gap, the queue backs off and lets the + * affected suffix fall back to the WAL path on later rounds. This keeps replay contiguous without + * requeueing the drained batch back into the bounded pending queue. + * + * @return false if gap fill had to stop because the current batch became stale or the queue was + * interrupted/closed + */ + private boolean fillGapFromWAL( + final long fromIndex, + final long toIndex, + final DeliveryBatchState batchState, + final long expectedSeekGeneration, + final int maxTablets, + final long maxBatchBytes) { + pendingWalGapRetryRequested = false; + resetSubscriptionWALPosition(fromIndex); + if (seekGeneration.get() != expectedSeekGeneration || isClosed) { + return false; + } + if (!pumpFromSubscriptionWAL( + batchState, expectedSeekGeneration, Integer.MAX_VALUE, maxTablets, maxBatchBytes)) { + return false; + } + + final long nextExpected = nextExpectedSearchIndex.get(); + if (nextExpected >= toIndex) { + walGapWaitStartTimeMs = 0L; + lastWalGapWaitLogTimeMs = 0L; + return true; + } + + final long nowMs = System.currentTimeMillis(); + if (walGapWaitStartTimeMs == 0L) { + walGapWaitStartTimeMs = nowMs; + } + if (lastWalGapWaitLogTimeMs == 0L + || nowMs - lastWalGapWaitLogTimeMs >= WAL_GAP_WAIT_LOG_INTERVAL_MS) { + LOGGER.info( + "ConsensusPrefetchingQueue {}: waiting {}ms for WAL gap [{}, {}) to become visible, " + + "currentNextExpected={}, currentWalIndex={}, seekGeneration={}", + this, + nowMs - walGapWaitStartTimeMs, + nextExpected, + toIndex, + nextExpected, + consensusReqReader.getCurrentSearchIndex(), + expectedSeekGeneration); + lastWalGapWaitLogTimeMs = nowMs; + } + onWalGapRetryScheduled(); + pendingWalGapRetryRequested = true; + return false; + } + + /** + * Try catch-up from WAL when the pending queue was empty. 
This handles cold-start or scenarios + * where the subscription started after data was already written. + */ + private void tryCatchUpFromWAL(final long expectedSeekGeneration) { + final SubscriptionConfig config = SubscriptionConfig.getInstance(); + final int maxTablets = config.getSubscriptionConsensusBatchMaxTabletCount(); + final long maxBatchBytes = config.getSubscriptionConsensusBatchMaxSizeInBytes(); + final int maxWalEntries = config.getSubscriptionConsensusBatchMaxWalEntries(); + + final DeliveryBatchState batchState = new DeliveryBatchState(); + resetSubscriptionWALPosition(nextExpectedSearchIndex.get()); + final boolean accepted = + pumpFromSubscriptionWAL( + batchState, expectedSeekGeneration, maxWalEntries, maxTablets, maxBatchBytes); + if (!accepted) { + return; + } + + if (!batchState.isEmpty()) { + flushBatch(batchState, expectedSeekGeneration); + } + } + + private boolean pumpFromSubscriptionWAL( + final DeliveryBatchState batchState, + final long expectedSeekGeneration, + final int maxWalEntries, + final int maxTablets, + final long maxBatchBytes) { + if (Objects.isNull(subscriptionWALIterator)) { + return true; + } + + subscriptionWALIterator.refresh(); + ensureSubscriptionWalReadable(); + + int entriesRead = 0; + while (entriesRead < maxWalEntries + && subscriptionWALIterator.hasNext() + && prefetchingQueue.size() < MAX_PREFETCHING_QUEUE_SIZE) { + try { + final IndexedConsensusRequest walEntry = subscriptionWALIterator.next(); + entriesRead++; + + if (isBeforeLocalCursor(walEntry)) { + continue; + } + if (shouldSkipForRecoveryProgress(walEntry)) { + advanceLocalCursorIfPresent(walEntry); + continue; + } + if (shouldSkipForMaterializedFollowerProgress(walEntry)) { + advanceLocalCursorIfPresent(walEntry); + continue; + } + + if (!appendRealtimeRequest( + walEntry, batchState, expectedSeekGeneration, maxTablets, maxBatchBytes, false)) { + return false; + } + markMaterializedFollowerProgress(walEntry); + advanceLocalCursorIfPresent(walEntry); + } 
catch (final Exception e) { + LOGGER.warn("ConsensusPrefetchingQueue {}: error reading subscription WAL", this, e); + break; + } + } + + if (entriesRead > 0) { + LOGGER.debug( + "ConsensusPrefetchingQueue {}: subscription WAL read {} entries, nextExpectedSearchIndex={}", + this, + entriesRead, + nextExpectedSearchIndex.get()); + } + return true; + } + + private void ensureSubscriptionWalReadable() { + if (Objects.isNull(subscriptionWALIterator) + || subscriptionWALIterator.hasNext() + || !(consensusReqReader instanceof WALNode)) { + return; + } + + final long currentWalIndex = consensusReqReader.getCurrentSearchIndex(); + if (nextExpectedSearchIndex.get() > currentWalIndex) { + return; + } + + LOGGER.debug( + "ConsensusPrefetchingQueue {}: subscription WAL exhausted at {} while current WAL is {}. " + + "Rolling WAL file to expose current-file entries.", + this, + nextExpectedSearchIndex.get(), + currentWalIndex); + ((WALNode) consensusReqReader).rollWALFile(); + resetSubscriptionWALPosition(nextExpectedSearchIndex.get()); + if (Objects.nonNull(subscriptionWALIterator)) { + subscriptionWALIterator.refresh(); + } + } + + private void resetSubscriptionWALPosition(final long startSearchIndex) { + closeSubscriptionWALIterator(); + subscriptionWALIterator = createSubscriptionWALIterator(startSearchIndex); + } + + protected ProgressWALIterator createSubscriptionWALIterator(final long startSearchIndex) { + if (consensusReqReader instanceof WALNode) { + return new ProgressWALIterator((WALNode) consensusReqReader, startSearchIndex); + } + return null; + } + + protected void onWalGapRetryScheduled() {} + + private boolean hasReadableWalEntries() { + return Objects.nonNull(subscriptionWALIterator) && subscriptionWALIterator.hasNext(); + } + + private void requestSubscriptionWalReset( + final long targetSearchIndex, final long seekGenerationValue) { + pendingSubscriptionWalResetSearchIndex = targetSearchIndex; + pendingSubscriptionWalResetGeneration = seekGenerationValue; + } + 
+ private void applyPendingSubscriptionWalReset(final long observedSeekGeneration) { + if (pendingSubscriptionWalResetGeneration != observedSeekGeneration + || pendingSubscriptionWalResetSearchIndex == Long.MIN_VALUE) { + return; + } + resetSubscriptionWALPosition(pendingSubscriptionWalResetSearchIndex); + pendingSubscriptionWalResetSearchIndex = Long.MIN_VALUE; + pendingSubscriptionWalResetGeneration = Long.MIN_VALUE; + } + + private void restorePendingSubscriptionWalCursor(final long observedSeekGeneration) { + if (pendingSubscriptionWalResetGeneration != observedSeekGeneration + || pendingSubscriptionWalResetSearchIndex == Long.MIN_VALUE) { + return; + } + // A seek can land in the middle of a prefetch iteration. Restore the local cursor to the + // pending seek target before resuming under the new generation so stale in-flight work does + // not permanently advance the historical replay frontier. + nextExpectedSearchIndex.set(pendingSubscriptionWalResetSearchIndex); + } + + private void closeSubscriptionWALIterator() { + if (Objects.isNull(subscriptionWALIterator)) { + return; + } + try { + subscriptionWALIterator.close(); + } catch (final IOException e) { + LOGGER.warn("ConsensusPrefetchingQueue {}: error closing subscription WAL iterator", this, e); + } finally { + subscriptionWALIterator = null; + } + } + + /** + * Deserializes the IConsensusRequest entries within an IndexedConsensusRequest to produce an + * InsertNode. WAL entries are typically stored as IoTConsensusRequest (serialized ByteBuffers), + * and a single logical write may be split across multiple fragments (SearchNode). This method + * handles both cases. + * + *

/**
 * Deserializes the IConsensusRequest entries within an IndexedConsensusRequest to produce an
 * InsertNode. WAL entries are typically stored as IoTConsensusRequest (serialized ByteBuffers),
 * and a single logical write may be split across multiple fragments (SearchNode). This method
 * handles both cases.
 *
 * <p>The deserialization follows the same pattern as {@code
 * DataRegionStateMachine.grabPlanNode()}.
 *
 * <p>A request that fails to deserialize is logged and skipped; the remaining requests are still
 * processed.
 *
 * @param indexedRequest the indexed request whose inner requests are decoded
 * @return the merged {@code InsertNode} when at least one {@code SearchNode} was decoded and the
 *     merge result is an {@code InsertNode}; {@code null} otherwise
 */
private InsertNode deserializeToInsertNode(final IndexedConsensusRequest indexedRequest) {
  // NOTE(review): the element type of this list is not visible in this view; presumably
  // SearchNode, given the add/merge calls below -- confirm.
  final List searchNodes = new ArrayList<>();
  // Last decoded node that is not a SearchNode; only used for the debug line at the end.
  PlanNode nonSearchNode = null;

  for (final IConsensusRequest req : indexedRequest.getRequests()) {
    PlanNode planNode;
    try {
      if (req instanceof IoTConsensusRequest) {
        // WAL entries read from file are wrapped as IoTConsensusRequest (ByteBuffer)
        planNode = WALEntry.deserializeForConsensus(req.serializeToByteBuffer());
      } else if (req instanceof InsertNode) {
        // In-memory entries (not yet flushed to WAL file) may already be PlanNode
        planNode = (PlanNode) req;
      } else {
        // ByteBufferConsensusRequest or unknown
        planNode = PlanNodeType.deserialize(req.serializeToByteBuffer());
      }
    } catch (final Exception e) {
      // Best effort: one undecodable request must not abort the whole indexed request.
      LOGGER.warn(
          "ConsensusPrefetchingQueue {}: failed to deserialize IConsensusRequest "
              + "(type={}) in searchIndex={}: {}",
          this,
          req.getClass().getSimpleName(),
          indexedRequest.getSearchIndex(),
          e.getMessage(),
          e);
      continue;
    }

    if (planNode instanceof SearchNode) {
      final SearchNode searchNode = (SearchNode) planNode;
      // Stamp the fragment with the metadata of the enclosing indexed request. Apart from the
      // search index, a value is copied only when it passes the >= 0 / > 0 check below.
      searchNode.setSearchIndex(indexedRequest.getSearchIndex());
      if (indexedRequest.getSyncIndex() >= 0) {
        searchNode.setSyncIndex(indexedRequest.getSyncIndex());
      }
      if (indexedRequest.getPhysicalTime() > 0) {
        searchNode.setPhysicalTime(indexedRequest.getPhysicalTime());
      }
      if (indexedRequest.getNodeId() >= 0) {
        searchNode.setNodeId(indexedRequest.getNodeId());
      }
      if (indexedRequest.getWriterEpoch() > 0) {
        searchNode.setWriterEpoch(indexedRequest.getWriterEpoch());
      }
      searchNodes.add(searchNode);
    } else {
      nonSearchNode = planNode;
    }
  }

  // Merge split SearchNode fragments (same pattern as DataRegionStateMachine.grabPlanNode)
  if (!searchNodes.isEmpty()) {
    final PlanNode merged = searchNodes.get(0).merge(searchNodes);
    if (merged instanceof InsertNode) {
      final InsertNode mergedInsert = (InsertNode) merged;
      LOGGER.debug(
          "ConsensusPrefetchingQueue {}: deserialized merged InsertNode for searchIndex={}, "
              + "type={}, deviceId={}, searchNodeCount={}",
          this,
          indexedRequest.getSearchIndex(),
          mergedInsert.getType(),
          ConsensusLogToTabletConverter.safeDeviceIdForLog(mergedInsert),
          searchNodes.size());

      return mergedInsert;
    }
  }

  // Nothing usable was produced; mention the non-search node (if any) for diagnostics.
  if (nonSearchNode != null) {
    LOGGER.debug(
        "ConsensusPrefetchingQueue {}: searchIndex={} contains non-InsertNode PlanNode: {}",
        this,
        indexedRequest.getSearchIndex(),
        nonSearchNode.getClass().getSimpleName());
  }

  return null;
}
/**
 * Estimates the size of a single tablet in bytes.
 *
 * <p>Thin delegate to {@code PipeMemoryWeightUtil.calculateTabletSizeInBytes}; the per-batch
 * estimate sums this value over all tablets of an entry.
 *
 * @param tablet the tablet to measure
 * @return the size reported by {@code PipeMemoryWeightUtil}
 */
private static long estimateTabletSize(final Tablet tablet) {
  return PipeMemoryWeightUtil.calculateTabletSizeInBytes(tablet);
}
Collections.singletonMap(converter.getDatabaseName(), tablets), -tablets.size()); + + final SubscriptionEvent event = + new SubscriptionEvent( + SubscriptionPollResponseType.TABLETS.getType(), payload, commitContext); + + prefetchingQueue.add(event); + + LOGGER.debug( + "ConsensusPrefetchingQueue {}: ENQUEUED event with {} tablets, " + + "searchIndex range [{}, {}], prefetchQueueSize={}", + this, + tablets.size(), + startSearchIndex, + endSearchIndex, + prefetchingQueue.size()); + + // After enqueuing the data event, control metadata is handled separately from user data. + return true; + } + + private SubscriptionCommitContext buildWriterCommitContext(final long localSeq) { + final int effectiveNodeId = + batchWriterNodeId >= 0 + ? batchWriterNodeId + : IoTDBDescriptor.getInstance().getConfig().getDataNodeId(); + final WriterId writerId = + new WriterId(consensusGroupId.toString(), effectiveNodeId, batchWriterEpoch); + final WriterProgress writerProgress = new WriterProgress(batchPhysicalTime, localSeq); + return new SubscriptionCommitContext( + IoTDBDescriptor.getInstance().getConfig().getDataNodeId(), + PipeDataNodeAgent.runtime().getRebootTimes(), + topicName, + brokerId, + seekGeneration.get(), + writerId, + writerProgress); + } + + private void updateBatchWriterProgress( + final long physicalTime, final int writerNodeId, final long writerEpoch) { + if (physicalTime > 0) { + this.batchPhysicalTime = physicalTime; + } + if (writerNodeId >= 0) { + this.batchWriterNodeId = writerNodeId; + } + if (writerEpoch > 0) { + this.batchWriterEpoch = writerEpoch; + } + } + + private void resetBatchWriterProgress() { + this.batchPhysicalTime = 0L; + this.batchWriterNodeId = -1; + this.batchWriterEpoch = 0L; + } + + private long estimateTabletsBytes(final List tablets) { + long estimatedBytes = 0L; + for (final Tablet tablet : tablets) { + estimatedBytes += estimateTabletSize(tablet); + } + return estimatedBytes; + } + + private boolean appendPreparedEntryViaRealtimeLane( + 
final DeliveryBatchState batchState, + final PreparedEntry preparedEntry, + final long expectedSeekGeneration, + final int maxTablets, + final long maxBatchBytes) { + bufferRealtimeEntry(preparedEntry); + return drainRealtimeLanes(batchState, expectedSeekGeneration, maxTablets, maxBatchBytes); + } + + private int getRealtimeBufferedEntryCount() { + int count = 0; + for (final NavigableMap laneEntries : realtimeEntriesByLane.values()) { + count += laneEntries.size(); + } + return count; + } + + private boolean drainBufferedRealtimeLanes( + final DeliveryBatchState batchState, + final long expectedSeekGeneration, + final int maxTablets, + final long maxBatchBytes) { + while (!realtimeEntriesByLane.isEmpty()) { + final int bufferedBefore = getRealtimeBufferedEntryCount(); + if (!drainRealtimeLanes(batchState, expectedSeekGeneration, maxTablets, maxBatchBytes)) { + return false; + } + + final int bufferedAfter = getRealtimeBufferedEntryCount(); + if (bufferedAfter == 0 || prefetchingQueue.size() >= MAX_PREFETCHING_QUEUE_SIZE) { + return true; + } + + if (batchState.isEmpty()) { + return true; + } + + if (!flushBatch(batchState, expectedSeekGeneration)) { + return false; + } + } + return true; + } + + private boolean canAppendLaneEntry( + final DeliveryBatchState batchState, + final LaneBufferedEntry entry, + final long entryEstimatedBytes, + final int maxEntries, + final int maxTablets, + final long maxBatchBytes) { + final boolean wouldExceedEntryLimit = + maxEntries != Integer.MAX_VALUE && batchState.entryCount >= maxEntries; + final boolean wouldExceedTabletLimit = + !batchState.isEmpty() && batchState.tablets.size() + entry.getTablets().size() > maxTablets; + final boolean wouldExceedByteLimit = + !batchState.isEmpty() && batchState.estimatedBytes + entryEstimatedBytes > maxBatchBytes; + // Keep all consensus subscription modes on a single-writer commit/delivery shape so + // SubscriptionCommitContext and RegionProgress remain per-writer. 
/**
 * Moves releasable realtime-lane heads into {@code batchState} by delegating to {@code
 * drainLaneEntries} with the realtime-lane accessors.
 *
 * <p>Note that {@code expectedSeekGeneration} is accepted for signature symmetry with the other
 * drain helpers but is not read in this method.
 *
 * @param batchState batch that receives the drained entries
 * @param expectedSeekGeneration seek generation this round started with (unused here)
 * @param maxTablets tablet limit per batch
 * @param maxBatchBytes estimated-size limit per batch
 * @return whatever {@code drainLaneEntries} returns
 */
private boolean drainRealtimeLanes(
    final DeliveryBatchState batchState,
    final long expectedSeekGeneration,
    final int maxTablets,
    final long maxBatchBytes) {
  return drainLaneEntries(
      batchState,
      this::buildRealtimeLaneFrontiers,
      this::peekRealtimeEntry,
      // Release predicate: every realtime head is eligible.
      entry -> true,
      (laneId, entry) -> removeRealtimeEntry(laneId, entry.localSeq),
      // Integer.MAX_VALUE disables the per-batch entry-count limit (see canAppendLaneEntry).
      Integer.MAX_VALUE,
      maxTablets,
      maxBatchBytes,
      // trackLingerTime
      true);
}
batchState.writerEpoch); + if (!createAndEnqueueEvent( + new ArrayList<>(batchState.tablets), + batchState.startSearchIndex, + batchState.endSearchIndex, + batchState.lastLocalSeq, + expectedSeekGeneration)) { + return false; + } + resetBatchWriterProgress(); + batchState.reset(); + return true; + } + + // ======================== Commit (Ack/Nack) ======================== + + private boolean canAcceptCommitContext( + final SubscriptionCommitContext commitContext, final String action, final boolean silent) { + if (isClosed || closeRequested || pendingSeekRequest != null) { + return false; + } + if (Objects.isNull(commitContext) || !commitContext.hasWriterProgress()) { + if (silent) { + LOGGER.debug( + "ConsensusPrefetchingQueue {}: reject {} without writer progress, commitContext={}", + this, + action, + commitContext); + } else { + LOGGER.warn( + "ConsensusPrefetchingQueue {}: reject {} without writer progress, commitContext={}", + this, + action, + commitContext); + } + return false; + } + if (!isActive) { + if (silent) { + LOGGER.debug( + "ConsensusPrefetchingQueue {}: reject {} for inactive queue, commitContext={}, runtimeVersion={}", + this, + action, + commitContext, + runtimeVersion); + } else { + LOGGER.warn( + "ConsensusPrefetchingQueue {}: reject {} for inactive queue, commitContext={}, runtimeVersion={}", + this, + action, + commitContext, + runtimeVersion); + } + return false; + } + return true; + } + + public boolean ack(final String consumerId, final SubscriptionCommitContext commitContext) { + acquireReadLock(); + try { + return canAcceptCommitContext(commitContext, "ack", false) + && ackInternal(consumerId, commitContext); + } finally { + releaseReadLock(); + } + } + + private boolean ackInternal( + final String consumerId, final SubscriptionCommitContext commitContext) { + final WriterId commitWriterId = extractCommitWriterId(commitContext); + final WriterProgress commitWriterProgress = extractCommitWriterProgress(commitContext); + final 
AtomicBoolean acked = new AtomicBoolean(false); + inFlightEvents.compute( + new Pair<>(consumerId, commitContext), + (key, ev) -> { + if (Objects.isNull(ev)) { + final boolean directCommitted = + commitManager.commitWithoutOutstanding( + brokerId, topicName, consensusGroupId, commitWriterId, commitWriterProgress); + acked.set(directCommitted); + if (!acked.get()) { + LOGGER.warn( + "ConsensusPrefetchingQueue {}: commit context {} does not exist for ack", + this, + commitContext); + } + return null; + } + + if (ev.isCommitted()) { + LOGGER.warn( + "ConsensusPrefetchingQueue {}: event {} already committed", this, commitContext); + ev.cleanUp(false); + return null; + } + + final boolean committed = + commitManager.commit( + brokerId, topicName, consensusGroupId, commitWriterId, commitWriterProgress); + if (!committed) { + LOGGER.warn( + "ConsensusPrefetchingQueue {}: failed to advance commit frontier for {}", + this, + commitContext); + return ev; + } + + ev.ack(); + ev.recordCommittedTimestamp(); + acked.set(true); + ev.cleanUp(false); + return null; + }); + + return acked.get(); + } + + public boolean nack(final String consumerId, final SubscriptionCommitContext commitContext) { + acquireReadLock(); + try { + return canAcceptCommitContext(commitContext, "nack", false) + && nackInternal(consumerId, commitContext); + } finally { + releaseReadLock(); + } + } + + /** + * Silent version of ack: returns false without logging if the commit context is not found. Used + * in multi-region iteration where only one queue owns the event. 
+ */ + public boolean ackSilent(final String consumerId, final SubscriptionCommitContext commitContext) { + acquireReadLock(); + try { + if (!canAcceptCommitContext(commitContext, "ack", true)) { + return false; + } + final WriterId commitWriterId = extractCommitWriterId(commitContext); + final WriterProgress commitWriterProgress = extractCommitWriterProgress(commitContext); + final AtomicBoolean acked = new AtomicBoolean(false); + inFlightEvents.compute( + new Pair<>(consumerId, commitContext), + (key, ev) -> { + if (Objects.isNull(ev)) { + final boolean directCommitted = + commitManager.commitWithoutOutstanding( + brokerId, topicName, consensusGroupId, commitWriterId, commitWriterProgress); + acked.set(directCommitted); + return null; + } + if (ev.isCommitted()) { + ev.cleanUp(false); + return null; + } + final boolean committed = + commitManager.commit( + brokerId, topicName, consensusGroupId, commitWriterId, commitWriterProgress); + if (!committed) { + return ev; + } + ev.ack(); + ev.recordCommittedTimestamp(); + acked.set(true); + ev.cleanUp(false); + return null; + }); + return acked.get(); + } finally { + releaseReadLock(); + } + } + + private WriterId extractCommitWriterId(final SubscriptionCommitContext commitContext) { + final WriterId writerId = commitContext.getWriterId(); + return Objects.nonNull(writerId) ? writerId : new WriterId(consensusGroupId.toString(), -1, 0L); + } + + private WriterProgress extractCommitWriterProgress( + final SubscriptionCommitContext commitContext) { + return commitContext.getWriterProgress(); + } + + /** + * Silent version of nack: returns false without logging if the commit context is not found. Used + * in multi-region iteration where only one queue owns the event. 
+ */ + public boolean nackSilent( + final String consumerId, final SubscriptionCommitContext commitContext) { + acquireReadLock(); + try { + if (!canAcceptCommitContext(commitContext, "nack", true)) { + return false; + } + final AtomicBoolean nacked = new AtomicBoolean(false); + inFlightEvents.compute( + new Pair<>(consumerId, commitContext), + (key, ev) -> { + if (Objects.isNull(ev)) { + return null; + } + ev.nack(); + nacked.set(true); + if (ev.isPoisoned()) { + LOGGER.error( + "ConsensusPrefetchingQueue {}: poison message detected (nackCount={}), " + + "force-acking event {} to prevent infinite re-delivery", + this, + ev.getNackCount(), + ev); + ev.ack(); + ev.recordCommittedTimestamp(); + ev.cleanUp(false); + return null; + } + prefetchingQueue.add(ev); + return null; + }); + return nacked.get(); + } finally { + releaseReadLock(); + } + } + + private boolean nackInternal( + final String consumerId, final SubscriptionCommitContext commitContext) { + final AtomicBoolean nacked = new AtomicBoolean(false); + inFlightEvents.compute( + new Pair<>(consumerId, commitContext), + (key, ev) -> { + if (Objects.isNull(ev)) { + LOGGER.warn( + "ConsensusPrefetchingQueue {}: commit context {} does not exist for nack", + this, + commitContext); + return null; + } + + ev.nack(); + nacked.set(true); + if (ev.isPoisoned()) { + LOGGER.error( + "ConsensusPrefetchingQueue {}: poison message detected (nackCount={}), " + + "force-acking event {} to prevent infinite re-delivery", + this, + ev.getNackCount(), + ev); + ev.ack(); + ev.recordCommittedTimestamp(); + ev.cleanUp(false); + return null; + } + prefetchingQueue.add(ev); + return null; + }); + + return nacked.get(); + } + + // ======================== Recycle ======================== + + /** Recycles in-flight events that are pollable (timed out) back to the prefetching queue. 
+   */
+  private void recycleInFlightEvents() {
+    // Iterate over a snapshot of the keys; each entry is then re-examined inside compute().
+    // The key type arguments match the Pair<>(consumerId, commitContext) keys used by ack/nack.
+    for (final Pair<String, SubscriptionCommitContext> key :
+        new ArrayList<>(inFlightEvents.keySet())) {
+      inFlightEvents.compute(
+          key,
+          (k, ev) -> {
+            if (Objects.isNull(ev)) {
+              return null;
+            }
+            if (ev.isCommitted()) {
+              // Already committed: release it and drop the mapping.
+              ev.cleanUp(false);
+              return null;
+            }
+            if (!ev.pollable()) {
+              // Not yet eligible for re-delivery: keep it in flight.
+              return ev;
+            }
+            ev.nack();
+            if (ev.isPoisoned()) {
+              LOGGER.error(
+                  "ConsensusPrefetchingQueue {}: poison message detected during recycle "
+                      + "(nackCount={}), force-acking event {}",
+                  this,
+                  ev.getNackCount(),
+                  ev);
+              ev.ack();
+              ev.recordCommittedTimestamp();
+              ev.cleanUp(false);
+              return null;
+            }
+            prefetchingQueue.add(ev);
+            LOGGER.debug(
+                "ConsensusPrefetchingQueue {}: recycled timed-out event {} back to prefetching queue",
+                this,
+                ev);
+            return null;
+          });
+    }
+  }
+
+  // ======================== Cleanup ========================
+
+  /**
+   * Under the write lock, force-cleans every queued and in-flight event and resets the queue's
+   * lane, recovery-progress, pending-entry, batch, WAL-gap and WAL-reset state, then closes the
+   * subscription WAL iterator.
+   */
+  public void cleanUp() {
+    acquireWriteLock();
+    try {
+      prefetchingQueue.forEach(event -> event.cleanUp(true));
+      prefetchingQueue.clear();
+
+      inFlightEvents.values().forEach(event -> event.cleanUp(true));
+      inFlightEvents.clear();
+
+      realtimeEntriesByLane.clear();
+      writerLanes.clear();
+      clearRecoveryWriterProgress();
+      materializedFollowerProgressByWriter.clear();
+      pendingEntries.clear();
+      lingerBatch.reset();
+      resetBatchWriterProgress();
+      pendingWalGapRetryRequested = false;
+      walGapWaitStartTimeMs = 0L;
+      lastWalGapWaitLogTimeMs = 0L;
+      pendingSubscriptionWalResetSearchIndex = Long.MIN_VALUE;
+      pendingSubscriptionWalResetGeneration = Long.MIN_VALUE;
+      closeSubscriptionWALIterator();
+
+    } finally {
+      releaseWriteLock();
+    }
+  }
+
+  // ======================== Seek ========================
+
+  /**
+   * Seeks to the earliest available WAL position. The actual position depends on WAL retention: if
+   * old files have been reclaimed, the earliest available position may be later than 0.
+   */
+  public void seekToBeginning() {
+    seekToResolvedPosition(0L, new RegionProgress(Collections.emptyMap()), "beginning");
+  }
+
+  /**
+   * Seeks to the current WAL write position. After this, only newly written data will be consumed.
+   */
+  public void seekToEnd() {
+    seekToResolvedPosition(
+        consensusReqReader.getCurrentSearchIndex(), computeTailRegionProgress(), "end");
+  }
+
+  /**
+   * Seeks to the replay start located for {@code regionProgress} (locate flag {@code false}).
+   *
+   * <p>When the reader is not a {@link WALNode}, logs a warning and falls back to {@link
+   * #seekToBeginning()}. Otherwise the WAL file is rolled before the replay start is located.
+   *
+   * @throws IllegalStateException if the locate status is neither FOUND nor AT_END
+   */
+  public void seekToRegionProgress(final RegionProgress regionProgress) {
+    if (!(consensusReqReader instanceof WALNode)) {
+      LOGGER.warn(
+          "ConsensusPrefetchingQueue {}: seekToRegionProgress not supported (no WAL directory)",
+          this);
+      seekToBeginning();
+      return;
+    }
+    final WALNode walNode = (WALNode) consensusReqReader;
+    walNode.rollWALFile();
+
+    final ReplayLocateDecision replayTarget =
+        locateReplayStartForRegionProgress(regionProgress, false);
+    switch (replayTarget.getStatus()) {
+      case FOUND:
+      case AT_END:
+        LOGGER.info(
+            "ConsensusPrefetchingQueue {}: seekToRegionProgress writerCount={} -> {} searchIndex={}",
+            this,
+            regionProgress.getWriterPositions().size(),
+            replayTarget.getStatus(),
+            replayTarget.getStartSearchIndex());
+        seekToResolvedPosition(
+            replayTarget.getStartSearchIndex(),
+            replayTarget.getRecoveryRegionProgress(),
+            "regionProgress");
+        return;
+      case LOCATE_MISS:
+      default:
+        throw new IllegalStateException(
+            String.format(
+                "ConsensusPrefetchingQueue %s: cannot seekToRegionProgress %s: %s",
+                this, regionProgress, replayTarget.getDetail()));
+    }
+  }
+
+  /**
+   * Same flow as {@link #seekToRegionProgress}, but locates the replay start with the locate flag
+   * set to {@code true} and falls back to {@link #seekToEnd()} when the reader is not a {@link
+   * WALNode}.
+   *
+   * @throws IllegalStateException if the locate status is neither FOUND nor AT_END
+   */
+  public void seekAfterRegionProgress(final RegionProgress regionProgress) {
+    if (!(consensusReqReader instanceof WALNode)) {
+      LOGGER.warn(
+          "ConsensusPrefetchingQueue {}: seekAfterRegionProgress not supported (no WAL directory)",
+          this);
+      seekToEnd();
+      return;
+    }
+    final WALNode walNode = (WALNode) consensusReqReader;
+    walNode.rollWALFile();
+
+    final ReplayLocateDecision replayTarget =
+        locateReplayStartForRegionProgress(regionProgress, true);
+    switch (replayTarget.getStatus()) {
+      case FOUND:
+      case AT_END:
+        LOGGER.info(
+            "ConsensusPrefetchingQueue {}: seekAfterRegionProgress writerCount={} -> {} searchIndex={}",
+            this,
+            regionProgress.getWriterPositions().size(),
+            replayTarget.getStatus(),
+            replayTarget.getStartSearchIndex());
+        seekToResolvedPosition(
+            replayTarget.getStartSearchIndex(),
+            replayTarget.getRecoveryRegionProgress(),
+            "regionProgressAfter");
+        return;
+      case LOCATE_MISS:
+      default:
+        throw new IllegalStateException(
+            String.format(
+                "ConsensusPrefetchingQueue %s: cannot seekAfterRegionProgress %s: %s",
+                this, regionProgress, replayTarget.getDetail()));
+    }
+  }
+
+  /**
+   * Registers a seek request and hands it to the prefetch subtask.
+   *
+   * <p>Under the write lock this returns immediately when the queue is closed or closing;
+   * otherwise it increments {@code seekGeneration}, records the request in {@code
+   * pendingSeekRequest} (remembering the previous generation and {@code prefetchInitialized}
+   * value so they can be restored) and sets {@code prefetchInitialized} to true. After releasing
+   * the lock it wakes the bound subtask, or fails the request when no subtask can be bound, and
+   * then calls {@code awaitCompletion()} on the request.
+   *
+   * <p>NOTE(review): the method is {@code synchronized} and presumably blocks in {@code
+   * awaitCompletion()} while holding this monitor - confirm the prefetch worker never needs it.
+   */
+  private synchronized void seekToResolvedPosition(
+      final long targetSearchIndex,
+      final RegionProgress committedRegionProgress,
+      final String seekReason) {
+    final PendingSeekRequest request;
+
+    acquireWriteLock();
+    try {
+      if (isClosed || closeRequested) {
+        return;
+      }
+      // Fence old commit contexts immediately. The grouped reset itself is applied later by the
+      // prefetch worker so WAL state and queue state still move under the queue's serial context.
+      final boolean previousPrefetchInitialized = prefetchInitialized;
+      final long previousSeekGeneration = seekGeneration.get();
+      final long targetSeekGeneration = seekGeneration.incrementAndGet();
+      request =
+          new PendingSeekRequest(
+              targetSearchIndex,
+              committedRegionProgress,
+              seekReason,
+              previousPrefetchInitialized,
+              previousSeekGeneration,
+              targetSeekGeneration);
+      pendingSeekRequest = request;
+      prefetchInitialized = true;
+    } finally {
+      releaseWriteLock();
+    }
+
+    final ConsensusPrefetchSubtask subtask = ensurePrefetchSubtaskBound();
+    if (Objects.isNull(subtask)) {
+      failPendingSeekBeforeScheduling(request);
+      request.awaitCompletion();
+      return;
+    }
+
+    subtask.requestWakeupNow();
+    request.awaitCompletion();
+  }
+
+  /**
+   * Applies the pending seek request, if any, under the write lock.
+   *
+   * @return {@code false} when no request was pending on entry; otherwise {@code true}, except
+   *     that when the pending request was swapped before the lock was taken the result is whether
+   *     some request is pending now
+   */
+  private boolean applyPendingSeekRequestIfNecessary() {
+    final PendingSeekRequest request = pendingSeekRequest;
+    if (Objects.isNull(request)) {
+      return false;
+    }
+
+    acquireWriteLock();
+    try {
+      // The field was read without the lock above; re-check that it still holds this request.
+      if (pendingSeekRequest != request) {
+        return pendingSeekRequest != null;
+      }
+      pendingSeekRequest = null;
+      if (isClosed || closeRequested) {
+        request.fail(
+            new IllegalStateException(
+                String.format(
+                    "ConsensusPrefetchingQueue %s is closing while applying seek", this)));
+        return true;
+      }
+      applySeekResetUnderWriteLock(request);
+      request.complete();
+      return true;
+    } catch (final RuntimeException e) {
+      // Fail the request with the same exception before rethrowing it.
+      request.fail(e);
+      throw e;
+    } finally {
+      releaseWriteLock();
+    }
+  }
+
+  /**
+   * Drops the pending seek request, if any, restores {@code prefetchInitialized} and (when it
+   * still equals the request's target) {@code seekGeneration} to their pre-seek values, and fails
+   * the request with an {@link IllegalStateException} after the write lock has been released.
+   */
+  public void abortPendingSeekForRuntimeStop() {
+    final PendingSeekRequest requestToFail;
+
+    acquireWriteLock();
+    try {
+      requestToFail = pendingSeekRequest;
+      if (Objects.isNull(requestToFail)) {
+        return;
+      }
+      pendingSeekRequest = null;
+      prefetchInitialized = requestToFail.previousPrefetchInitialized;
+      // Roll the generation back only if no later seek has moved it on.
+      if (seekGeneration.get() == requestToFail.targetSeekGeneration) {
+        seekGeneration.set(requestToFail.previousSeekGeneration);
+      }
+      LOGGER.info(
+          "ConsensusPrefetchingQueue {}: aborted pending seek({}) during runtime stop, restored prefetchInitialized {} -> {}, seekGeneration {} -> {}",
+          this,
+          requestToFail.seekReason,
+          true,
+          requestToFail.previousPrefetchInitialized,
+          requestToFail.targetSeekGeneration,
+          requestToFail.previousSeekGeneration);
+    } finally {
+      releaseWriteLock();
+    }
+
+    requestToFail.fail(
+        new IllegalStateException(
+            String.format(
+                "ConsensusPrefetchingQueue %s runtime stopped before seek(%s) was applied",
+                this, requestToFail.seekReason)));
+  }
+
+  /**
+   * Fails {@code request} because it could not be handed to a prefetch subtask. Does nothing if
+   * {@code request} is no longer the pending request; otherwise restores the pre-seek {@code
+   * prefetchInitialized} / {@code seekGeneration} values and fails the request with a message
+   * that depends on whether the queue is closing.
+   */
+  private void failPendingSeekBeforeScheduling(final PendingSeekRequest request) {
+    final boolean closing;
+
+    acquireWriteLock();
+    try {
+      if (pendingSeekRequest != request) {
+        return;
+      }
+      closing = isClosed || closeRequested;
+      pendingSeekRequest = null;
+      prefetchInitialized = request.previousPrefetchInitialized;
+      if (seekGeneration.get() == request.targetSeekGeneration) {
+        seekGeneration.set(request.previousSeekGeneration);
+      }
+      LOGGER.info(
+          "ConsensusPrefetchingQueue {}: failed to schedule seek({}) because {}, restored prefetchInitialized {} -> {}, seekGeneration {} -> {}",
+          this,
+          request.seekReason,
+          closing ? "the queue is closing" : "prefetch runtime is unavailable",
+          true,
+          request.previousPrefetchInitialized,
+          request.targetSeekGeneration,
+          request.previousSeekGeneration);
+    } finally {
+      releaseWriteLock();
+    }
+
+    request.fail(
+        new IllegalStateException(
+            String.format(
+                closing
+                    ? "ConsensusPrefetchingQueue %s is closing before seek(%s) can be scheduled"
+                    : "ConsensusPrefetchingQueue %s cannot schedule seek(%s) because prefetch runtime is unavailable",
+                this,
+                request.seekReason)));
+  }
+
+  /**
+   * Resets queue, lane, WAL and commit state to the position described by {@code request}. As the
+   * name says, the caller is expected to hold the write lock; this method does not acquire it.
+   */
+  private void applySeekResetUnderWriteLock(final PendingSeekRequest request) {
+    // 1. Clean up all queued and in-flight events
+    prefetchingQueue.forEach(event -> event.cleanUp(true));
+    prefetchingQueue.clear();
+    inFlightEvents.values().forEach(event -> event.cleanUp(true));
+    inFlightEvents.clear();
+
+    // 2. Discard stale pending entries from in-memory queue
+    pendingEntries.clear();
+
+    // 3. Reset per-writer release state and source-level dedup frontiers.
+    realtimeEntriesByLane.clear();
+    writerLanes.clear();
+    clearRecoveryWriterProgress();
+    materializedFollowerProgressByWriter.clear();
+    if (Objects.nonNull(request.committedRegionProgress)
+        && !request.committedRegionProgress.getWriterPositions().isEmpty()) {
+      installRecoveryWriterProgress(request.committedRegionProgress);
+    }
+
+    // 4. Reset WAL read position
+    nextExpectedSearchIndex.set(request.targetSearchIndex);
+    requestSubscriptionWalReset(request.targetSearchIndex, seekGeneration.get());
+    lingerBatch.reset();
+    resetBatchWriterProgress();
+    observedSeekGeneration = seekGeneration.get();
+    pendingWalGapRetryRequested = false;
+    walGapWaitStartTimeMs = 0L;
+    lastWalGapWaitLogTimeMs = 0L;
+
+    // 5. Reset commit state to the writer progress immediately before the first re-delivered
+    // entry so seek/rebind resumes from the intended frontier.
+    commitManager.resetState(
+        brokerId, topicName, consensusGroupId, request.committedRegionProgress);
+
+    LOGGER.info(
+        "ConsensusPrefetchingQueue {}: seek({}) applied to searchIndex={}, writerCount={}, seekGeneration={}",
+        this,
+        request.seekReason,
+        request.targetSearchIndex,
+        Objects.nonNull(request.committedRegionProgress)
+            ? request.committedRegionProgress.getWriterPositions().size()
+            : 0,
+        seekGeneration.get());
+  }
+
+  /**
+   * Builds the per-writer progress used as the {@link #seekToEnd()} frontier by merging the
+   * metadata of every WAL file in the node's log directory; the file whose version equals the
+   * current WAL file version is represented by the node's current metadata snapshot instead of
+   * being read from disk. Returns an empty progress when the reader is not a {@link WALNode}.
+   * A file whose metadata cannot be read is logged and skipped.
+   */
+  private RegionProgress computeTailRegionProgress() {
+    if (!(consensusReqReader instanceof WALNode)) {
+      return new RegionProgress(Collections.emptyMap());
+    }
+
+    final WALNode walNode = (WALNode) consensusReqReader;
+    // NOTE(review): the type arguments of this Map look lost in extraction; keys are WriterId and
+    // values WriterProgress as used in mergeTailProgress - restore before compiling.
+    final Map tailProgressByWriter = new LinkedHashMap<>();
+    final File[] walFiles = WALFileUtils.listAllWALFiles(walNode.getLogDirectory());
+    if (Objects.isNull(walFiles) || walFiles.length == 0) {
+      mergeTailProgress(tailProgressByWriter, walNode.getCurrentWALMetaDataSnapshot());
+      return new RegionProgress(tailProgressByWriter);
+    }
+
+    WALFileUtils.ascSortByVersionId(walFiles);
+    final long liveVersionId = walNode.getCurrentWALFileVersion();
+    final WALMetaData liveSnapshot = walNode.getCurrentWALMetaDataSnapshot();
+    for (final File walFile : walFiles) {
+      final long versionId = WALFileUtils.parseVersionId(walFile.getName());
+      if (versionId == liveVersionId) {
+        mergeTailProgress(tailProgressByWriter, liveSnapshot);
+        continue;
+      }
+      try (final ProgressWALReader reader = new ProgressWALReader(walFile)) {
+        mergeTailProgress(tailProgressByWriter, reader.getMetaData());
+      } catch (final IOException e) {
+        LOGGER.warn(
+            "ConsensusPrefetchingQueue {}: failed to read WAL metadata from {} while computing seekToEnd frontier",
+            this,
+            walFile,
+            e);
+      }
+    }
+    return new RegionProgress(tailProgressByWriter);
+  }
+
+  /**
+   * Folds one WAL metadata object into {@code tailProgressByWriter}, keeping for each writer the
+   * greatest progress according to {@code compareWriterProgress}. A null {@code metadata} is
+   * ignored, as are entries with a negative node id, physical time or local sequence.
+   */
+  private void mergeTailProgress(
+      final Map tailProgressByWriter, final WALMetaData metadata) {
+    if (Objects.isNull(metadata)) {
+      return;
+    }
+    // NOTE(review): the element types of these raw Lists look lost in extraction; the values are
+    // assigned to int/long locals below - restore the type arguments before compiling.
+    final List physicalTimes = metadata.getPhysicalTimes();
+    final List nodeIds = metadata.getNodeIds();
+    final List writerEpochs = metadata.getWriterEpochs();
+    final List localSeqs = metadata.getLocalSeqs();
+    // Only indexes present in all four parallel lists are considered.
+    final int size =
+        Math.min(
+            Math.min(physicalTimes.size(), nodeIds.size()),
+            Math.min(writerEpochs.size(), localSeqs.size()));
+    for (int i = 0; i < size; i++) {
+      final int writerNodeId = nodeIds.get(i);
+      final long writerEpoch = writerEpochs.get(i);
+      final long physicalTime = physicalTimes.get(i);
+      final long localSeq = localSeqs.get(i);
+      if (writerNodeId < 0 || physicalTime < 0L || localSeq < 0L) {
+        continue;
+      }
+
+      final WriterId writerId =
+          new WriterId(consensusGroupId.toString(), writerNodeId, writerEpoch);
+      final WriterProgress candidateProgress = new WriterProgress(physicalTime, localSeq);
+      final WriterProgress currentProgress = tailProgressByWriter.get(writerId);
+      if (Objects.isNull(currentProgress)
+          || compareWriterProgress(candidateProgress, currentProgress) > 0) {
+        tailProgressByWriter.put(writerId, candidateProgress);
+      }
+    }
+  }
+
+  /**
+   * Extracts the maximum timestamp from an InsertNode. For row nodes this is the single timestamp;
+   * for tablet nodes, {@code times} is sorted so the last element is the max. For composite nodes,
+   * iterates over children.
+   *
+   * @return the maximum timestamp, or {@code Long.MIN_VALUE} if extraction fails
+   */
+  private long extractMaxTime(final InsertNode insertNode) {
+    try {
+      if (insertNode instanceof InsertRowNode) {
+        return ((InsertRowNode) insertNode).getTime();
+      }
+      if (insertNode instanceof InsertTabletNode) {
+        final InsertTabletNode tabletNode = (InsertTabletNode) insertNode;
+        final int rowCount = tabletNode.getRowCount();
+        return rowCount > 0 ? tabletNode.getTimes()[rowCount - 1] : Long.MIN_VALUE;
+      }
+      if (insertNode instanceof InsertMultiTabletsNode) {
+        long max = Long.MIN_VALUE;
+        for (final InsertTabletNode child :
+            ((InsertMultiTabletsNode) insertNode).getInsertTabletNodeList()) {
+          final int rowCount = child.getRowCount();
+          if (rowCount > 0) {
+            max = Math.max(max, child.getTimes()[rowCount - 1]);
+          }
+        }
+        return max;
+      }
+      if (insertNode instanceof InsertRowsNode) {
+        long max = Long.MIN_VALUE;
+        for (final InsertRowNode row : ((InsertRowsNode) insertNode).getInsertRowNodeList()) {
+          max = Math.max(max, row.getTime());
+        }
+        return max;
+      }
+      if (insertNode instanceof InsertRowsOfOneDeviceNode) {
+        long max = Long.MIN_VALUE;
+        for (final InsertRowNode row :
+            ((InsertRowsOfOneDeviceNode) insertNode).getInsertRowNodeList()) {
+          max = Math.max(max, row.getTime());
+        }
+        return max;
+      }
+      // Fallback: use getMinTime() which at least gets a timestamp
+      return insertNode.getMinTime();
+    } catch (final Exception e) {
+      // NOTE(review): the exception is discarded without logging; callers only see MIN_VALUE.
+      return Long.MIN_VALUE;
+    }
+  }
+
+  /**
+   * Checks whether it is time to inject a watermark event and does so if the configured interval
+   * has elapsed. Called from prefetch rounds after processing data and during idle scheduling.
+   */
+  private void maybeInjectWatermark() {
+    if (maxObservedTimestamp == Long.MIN_VALUE) {
+      return; // No data observed yet, nothing to report
+    }
+    final long intervalMs =
+        SubscriptionConfig.getInstance().getSubscriptionConsensusWatermarkIntervalMs();
+    if (intervalMs <= 0) {
+      return; // Watermark disabled
+    }
+    final long now = System.currentTimeMillis();
+    if (now - lastWatermarkEmitTimeMs >= intervalMs) {
+      injectWatermark(maxObservedTimestamp);
+      lastWatermarkEmitTimeMs = now;
+    }
+  }
+
+  /**
+   * Injects a {@link SubscriptionPollResponseType#WATERMARK} event into the prefetching queue. The
+   * committed mapping is deliberately NOT recorded because watermark events are metadata, not user
+   * data.
+   *
+   * @param watermarkTimestamp the maximum data timestamp observed so far
+   */
+  private void injectWatermark(final long watermarkTimestamp) {
+    final int dataNodeId = IoTDBDescriptor.getInstance().getConfig().getDataNodeId();
+    // The watermark carries a context built with INVALID_COMMIT_ID plus the current seek
+    // generation (see createNonCommittableSeekContext).
+    final SubscriptionCommitContext watermarkCtx = createNonCommittableSeekContext(dataNodeId);
+    final SubscriptionEvent watermarkEvent =
+        new SubscriptionEvent(
+            SubscriptionPollResponseType.WATERMARK.getType(),
+            new WatermarkPayload(watermarkTimestamp, dataNodeId),
+            watermarkCtx);
+    prefetchingQueue.add(watermarkEvent);
+
+    LOGGER.debug(
+        "ConsensusPrefetchingQueue {}: injected WATERMARK, watermarkTimestamp={}",
+        this,
+        watermarkTimestamp);
+  }
+
+  /** Returns the maximum observed data timestamp for metrics. */
+  public long getMaxObservedTimestamp() {
+    return maxObservedTimestamp;
+  }
+
+  /** Increments the counter of entries accepted through the pending-entries path. */
+  private void markAcceptedFromPending() {
+    pendingPathAcceptedEntries.incrementAndGet();
+  }
+
+  /** Increments the counter of entries accepted through the WAL path. */
+  private void markAcceptedFromWal() {
+    walPathAcceptedEntries.incrementAndGet();
+  }
+
+  /**
+   * Closes the queue. Returns immediately if the queue is already closed or a close is in
+   * progress. Otherwise, in order: marks the close as requested and takes over any pending seek
+   * request; detaches the prefetch subtask; fails the taken-over seek request; cancels the
+   * subtask's pending execution and waits for it to become idle; flushes the lingering batch
+   * (only when no seek is pending or unapplied) and marks the queue closed; deregisters metrics
+   * and the subtask; unregisters from the consensus server; cleans up; and persists commit
+   * progress. {@code closeRequested} is reset to false at the very end.
+   */
+  public void close() {
+    final PendingSeekRequest seekRequestToFail;
+    // NOTE(review): the type arguments of this Pair look lost in extraction; "left" and "right"
+    // are dereferenced below, so they must be restored before this compiles.
+    final Pair prefetchBinding;
+
+    acquireWriteLock();
+    try {
+      if (isClosed || closeRequested) {
+        return;
+      }
+      closeRequested = true;
+      seekRequestToFail = pendingSeekRequest;
+      pendingSeekRequest = null;
+    } finally {
+      releaseWriteLock();
+    }
+
+    prefetchBinding = detachPrefetchSubtask();
+
+    // Fail the taken-over seek outside the write lock.
+    if (Objects.nonNull(seekRequestToFail)) {
+      seekRequestToFail.fail(
+          new IllegalStateException(
+              String.format("ConsensusPrefetchingQueue %s is closing before seek applies", this)));
+    }
+
+    if (Objects.nonNull(prefetchBinding.right)) {
+      prefetchBinding.right.cancelPendingExecution();
+      prefetchBinding.right.awaitIdle();
+    }
+
+    try {
+      acquireWriteLock();
+      try {
+        // Flush only when the batch belongs to the current seek generation and no seek is pending.
+        if (!isClosed
+            && pendingSeekRequest == null
+            && seekGeneration.get() == observedSeekGeneration) {
+          flushLingeringBatchOnCloseUnderWriteLock();
+        }
+        markClosed();
+      } finally {
+        releaseWriteLock();
+      }
+
+      // Deregister metrics after the queue is fully closed.
+      ConsensusSubscriptionPrefetchingQueueMetrics.getInstance()
+          .deregister(getPrefetchingQueueId());
+
+      if (Objects.nonNull(prefetchBinding.left) && Objects.nonNull(prefetchBinding.right)) {
+        if (!prefetchBinding.left.isShutdown()) {
+          prefetchBinding.left.deregister(prefetchBinding.right.getTaskId());
+        } else {
+          prefetchBinding.right.close();
+        }
+      }
+
+      try {
+        // Unregister from IoTConsensusServerImpl (stop receiving in-memory data).
+        serverImpl.unregisterSubscriptionQueue(pendingEntries);
+      } catch (final Exception e) {
+        LOGGER.warn("ConsensusPrefetchingQueue {}: error during unregister", this, e);
+      } finally {
+        // cleanUp() and persistAll() run even when the unregister call throws.
+        try {
+          cleanUp();
+        } finally {
+          // Persist progress before closing
+          commitManager.persistAll();
+        }
+      }
+    } finally {
+      closeRequested = false;
+    }
+  }
+
+  /**
+   * Flushes a non-empty lingering batch while closing; when the flush reports failure the batch
+   * is discarded and the batch writer progress is reset. As the name says, the caller is expected
+   * to hold the write lock.
+   */
+  private void flushLingeringBatchOnCloseUnderWriteLock() {
+    if (lingerBatch.isEmpty()) {
+      return;
+    }
+    LOGGER.info(
+        "ConsensusPrefetchingQueue {}: flushing {} lingering tablets during close",
+        this,
+        lingerBatch.tablets.size());
+    if (!flushBatch(lingerBatch, observedSeekGeneration)) {
+      LOGGER.warn(
+          "ConsensusPrefetchingQueue {}: failed to flush lingering batch during close, discarding it",
+          this);
+      lingerBatch.reset();
+      resetBatchWriterProgress();
+    }
+  }
+
+  /** Builds an ERROR event carrying {@code errorMessage} and a non-committable context. */
+  private SubscriptionEvent generateErrorResponse(final String errorMessage) {
+    return new SubscriptionEvent(
+        SubscriptionPollResponseType.ERROR.getType(),
+        new ErrorPayload(errorMessage, false),
+        createNonCommittableContext(IoTDBDescriptor.getInstance().getConfig().getDataNodeId()));
+  }
+
+  /** Builds an ERROR event carrying the shared outdated-error payload. */
+  private SubscriptionEvent generateOutdatedErrorResponse() {
+    return new SubscriptionEvent(
+        SubscriptionPollResponseType.ERROR.getType(),
+        ErrorPayload.OUTDATED_ERROR_PAYLOAD,
+        createNonCommittableContext(IoTDBDescriptor.getInstance().getConfig().getDataNodeId()));
+  }
+
+  /**
+   * Shared subscription events still use {@link SubscriptionCommitContext#INVALID_COMMIT_ID} to
+   * mark metadata and error
+   * payloads as non-committable. Consensus correctness never treats this
+   * sentinel as a replay or commit frontier.
+   */
+  private SubscriptionCommitContext createNonCommittableContext(final int dataNodeId) {
+    return new SubscriptionCommitContext(
+        dataNodeId,
+        PipeDataNodeAgent.runtime().getRebootTimes(),
+        this.topicName,
+        this.brokerId,
+        INVALID_COMMIT_ID);
+  }
+
+  /**
+   * Variant of {@link #createNonCommittableContext(int)} that additionally stamps the context with
+   * the current seek generation, the consensus group id and the runtime version.
+   */
+  private SubscriptionCommitContext createNonCommittableSeekContext(final int dataNodeId) {
+    return new SubscriptionCommitContext(
+        dataNodeId,
+        PipeDataNodeAgent.runtime().getRebootTimes(),
+        this.topicName,
+        this.brokerId,
+        INVALID_COMMIT_ID,
+        this.seekGeneration.get(),
+        this.consensusGroupId.toString(),
+        this.runtimeVersion);
+  }
+
+  /**
+   * Tells whether a commit context predates the current runtime: either the node's reboot count
+   * is greater than the one recorded in the context, or the context's seek generation differs from
+   * the queue's current one.
+   */
+  public boolean isCommitContextOutdated(final SubscriptionCommitContext commitContext) {
+    if (PipeDataNodeAgent.runtime().getRebootTimes() > commitContext.getRebootTimes()) {
+      return true;
+    }
+    return seekGeneration.get() != commitContext.getSeekGeneration();
+  }
+
+  // ======================== Status ========================
+
+  /** Returns whether this queue has been marked closed. */
+  public boolean isClosed() {
+    return this.isClosed;
+  }
+
+  /** Marks this queue closed; nothing else is released here. */
+  public void markClosed() {
+    this.isClosed = true;
+  }
+
+  // ======================== Routing Runtime Version Control ========================
+
+  /** Returns the current value of the WAL-gap skipped-entries counter. */
+  public long getWalGapSkippedEntries() {
+    return this.walGapSkippedEntries.get();
+  }
+
+  /** Returns the current value of the runtime-version change counter. */
+  public long getEpochChangeCount() {
+    return this.runtimeVersionChangeCount.get();
+  }
+
+  // ======================== Leader Activation ========================
+
+  /**
+   * Activates or deactivates this queue. Only the preferred-writer (leader) node's queue should be
+   * active. Inactive queues skip prefetching and return null on poll.
+   */
+  public void setActive(final boolean active) {
+    this.isActive = active;
+    LOGGER.info(
+        "ConsensusPrefetchingQueue {}: isActive set to {} (region={})",
+        this,
+        active,
+        consensusGroupId);
+    if (active) {
+      requestPrefetch();
+    }
+  }
+
+  /** Returns the value last stored by {@link #setActive(boolean)}. */
+  public boolean isActive() {
+    return isActive;
+  }
+
+  /**
+   * Replaces the runtime-reported active writer node ids with an unmodifiable copy of the given
+   * set, recomputes the effective set for the current order mode and requests a prefetch.
+   *
+   * @param activeWriterNodeIds writer node ids reported by the runtime; must not be null
+   * @throws NullPointerException if {@code activeWriterNodeIds} is null
+   */
+  public void setActiveWriterNodeIds(final Set<Integer> activeWriterNodeIds) {
+    this.runtimeActiveWriterNodeIds =
+        Collections.unmodifiableSet(
+            new LinkedHashSet<>(Objects.requireNonNull(activeWriterNodeIds)));
+    refreshEffectiveActiveWriterNodeIds();
+    LOGGER.info(
+        "ConsensusPrefetchingQueue {}: runtimeActiveWriterNodeIds={}, effectiveActiveWriterNodeIds={} "
+            + "(region={}, orderMode={}, preferredWriterNodeId={})",
+        this,
+        this.runtimeActiveWriterNodeIds,
+        this.activeWriterNodeIds,
+        consensusGroupId,
+        orderMode,
+        preferredWriterNodeId);
+    requestPrefetch();
+  }
+
+  /**
+   * Recomputes {@code activeWriterNodeIds} from the order mode, the preferred writer and the
+   * runtime-reported writers:
+   *
+   * <ul>
+   *   <li>multi-writer: all runtime-reported writers, or just the preferred writer when none are
+   *       reported;
+   *   <li>per-writer: the preferred writer only;
+   *   <li>leader-only and any other value: the preferred writer, plus the previous preferred
+   *       writer while the runtime still reports it.
+   * </ul>
+   *
+   * A negative preferred writer id is never added.
+   */
+  private void refreshEffectiveActiveWriterNodeIds() {
+    final LinkedHashSet<Integer> effectiveWriterNodeIds = new LinkedHashSet<>();
+    switch (orderMode) {
+      case TopicConstant.ORDER_MODE_MULTI_WRITER_VALUE:
+        effectiveWriterNodeIds.addAll(runtimeActiveWriterNodeIds);
+        if (effectiveWriterNodeIds.isEmpty() && preferredWriterNodeId >= 0) {
+          effectiveWriterNodeIds.add(preferredWriterNodeId);
+        }
+        break;
+      case TopicConstant.ORDER_MODE_PER_WRITER_VALUE:
+        if (preferredWriterNodeId >= 0) {
+          effectiveWriterNodeIds.add(preferredWriterNodeId);
+        }
+        break;
+      case TopicConstant.ORDER_MODE_LEADER_ONLY_VALUE:
+      default:
+        if (preferredWriterNodeId >= 0) {
+          effectiveWriterNodeIds.add(preferredWriterNodeId);
+        }
+        if (previousPreferredWriterNodeId >= 0
+            && previousPreferredWriterNodeId != preferredWriterNodeId
+            && runtimeActiveWriterNodeIds.contains(previousPreferredWriterNodeId)) {
+          effectiveWriterNodeIds.add(previousPreferredWriterNodeId);
+        }
+        break;
+    }
+    this.activeWriterNodeIds = Collections.unmodifiableSet(effectiveWriterNodeIds);
+  }
+
+  /**
+   * Sets the preferred writer node id. When the id changes, the old one is remembered as the
+   * previous preferred writer; when the same id is set again, the remembered previous writer is
+   * cleared to {@code -1}. The effective writer set is then recomputed and a prefetch requested.
+   */
+  public void setPreferredWriterNodeId(final int preferredWriterNodeId) {
+    if (this.preferredWriterNodeId != preferredWriterNodeId) {
+      previousPreferredWriterNodeId = this.preferredWriterNodeId;
+    } else {
+      previousPreferredWriterNodeId = -1;
+    }
+    this.preferredWriterNodeId = preferredWriterNodeId;
+    refreshEffectiveActiveWriterNodeIds();
+    LOGGER.info(
+        "ConsensusPrefetchingQueue {}: preferredWriterNodeId set to {}, effectiveActiveWriterNodeIds={} "
+            + "(region={}, orderMode={})",
+        this,
+        this.preferredWriterNodeId,
+        this.activeWriterNodeIds,
+        consensusGroupId,
+        orderMode);
+    requestPrefetch();
+  }
+
+  /** Returns the effective (unmodifiable) set of active writer node ids. */
+  public Set<Integer> getActiveWriterNodeIds() {
+    return activeWriterNodeIds;
+  }
+
+  /**
+   * Sets the order mode after normalizing it. Does nothing when the normalized value equals the
+   * current one; otherwise recomputes the effective writer set and requests a prefetch.
+   */
+  public void setOrderMode(final String orderMode) {
+    final String normalizedOrderMode = TopicConfig.normalizeOrderMode(orderMode);
+    if (Objects.equals(this.orderMode, normalizedOrderMode)) {
+      return;
+    }
+    this.orderMode = normalizedOrderMode;
+    refreshEffectiveActiveWriterNodeIds();
+    LOGGER.info(
+        "ConsensusPrefetchingQueue {}: orderMode set to {}, effectiveActiveWriterNodeIds={} "
+            + "(region={}, preferredWriterNodeId={}, runtimeActiveWriterNodeIds={})",
+        this,
+        this.orderMode,
+        this.activeWriterNodeIds,
+        consensusGroupId,
+        preferredWriterNodeId,
+        runtimeActiveWriterNodeIds);
+    requestPrefetch();
+  }
+
+  /** Returns the current (normalized) order mode. */
+  public String getOrderMode() {
+    return orderMode;
+  }
+
+  /**
+   * Tells whether a lane's writer is currently active. An empty effective writer set is treated
+   * as "every writer is active".
+   */
+  private boolean isLaneRuntimeActive(final WriterLaneId laneId) {
+    // Read the field once so both checks see the same set instance.
+    final Set<Integer> writerNodeIds = activeWriterNodeIds;
+    return writerNodeIds.isEmpty() || writerNodeIds.contains(laneId.writerNodeId);
+  }
+
+  /**
+   * Applies a runtime state: records its runtime version, bumps the version-change counter, then
+   * applies the preferred writer, the active writer set and the active flag, in that order.
+   *
+   * @throws NullPointerException if {@code runtimeState} is null
+   */
+  public void applyRuntimeState(final ConsensusRegionRuntimeState runtimeState) {
+    Objects.requireNonNull(runtimeState, "runtimeState");
+    this.runtimeVersion = runtimeState.getRuntimeVersion();
+    runtimeVersionChangeCount.incrementAndGet();
+    LOGGER.info(
+        "ConsensusPrefetchingQueue {}: applied runtimeVersion {}",
+        this,
+        runtimeState.getRuntimeVersion());
+    setPreferredWriterNodeId(runtimeState.getPreferredWriterNodeId());
+    setActiveWriterNodeIds(runtimeState.getActiveWriterNodeIds());
+    // "active" decides whether this replica should serve subscription traffic on the current node.
+    // In multi-writer mode, activeWriterNodeIds may intentionally include follower replicas for
+    // ordering/watermark coordination, so it must not be reused as the local service-activation
+    // signal.
+    setActive(runtimeState.isActive());
+    LOGGER.info(
+        "ConsensusPrefetchingQueue {}: applied runtimeState={}, preferredWriterNodeId={}",
+        this,
+        runtimeState,
+        runtimeState.getPreferredWriterNodeId());
+    if (runtimeState.isActive()) {
+      requestPrefetch();
+    }
+  }
+
+  /** Returns the queue id, built as {@code brokerId + "_" + topicName}. */
+  public String getPrefetchingQueueId() {
+    return brokerId + "_" + topicName;
+  }
+
+  /** Returns the number of events currently in flight. */
+  public long getSubscriptionUncommittedEventCount() {
+    return inFlightEvents.size();
+  }
+
+  /** Exposes the current seek generation for runtime tests and metrics. */
+  public long getCurrentSeekGeneration() {
+    return seekGeneration.get();
+  }
+
+  /** Returns the number of events currently in the prefetching queue. */
+  public int getPrefetchedEventCount() {
+    return prefetchingQueue.size();
+  }
+
+  /** Returns the next expected WAL search index. */
+  public long getCurrentReadSearchIndex() {
+    return nextExpectedSearchIndex.get();
+  }
+
+  /** Returns the number of entries accepted through the pending-entries path. */
+  public long getPendingPathAcceptedEntries() {
+    return pendingPathAcceptedEntries.get();
+  }
+
+  /** Returns the number of entries accepted through the WAL path. */
+  public long getWalPathAcceptedEntries() {
+    return walPathAcceptedEntries.get();
+  }
+
+  public String getBrokerId() {
+    return brokerId;
+  }
+
+  public String getTopicName() {
+    return topicName;
+  }
+
+  public ConsensusGroupId getConsensusGroupId() {
+    return consensusGroupId;
+  }
+
+  /**
+   * Returns an approximate backlog for this queue.
+   *
+   *

+   * <p>The metric intentionally avoids collapsing per-writer committed progress into a single
+   * scalar local sequence. Instead it counts queued/in-flight work and adds one extra unit when the
+   * local WAL reader still has unread entries beyond its current replay cursor.
+   */
+  public long getLag() {
+    long lag =
+        prefetchingQueue.size()
+            + inFlightEvents.size()
+            + pendingEntries.size()
+            + getRealtimeBufferedEntryCount();
+    if (nextExpectedSearchIndex.get() < consensusReqReader.getCurrentSearchIndex()) {
+      lag++;
+    }
+    return lag;
+  }
+
+  // ======================== Stringify ========================
+
+  /**
+   * Returns a snapshot of the queue's core counters and settings as string key/value pairs. Each
+   * value is read independently, so the snapshot is not taken atomically.
+   */
+  public Map<String, String> coreReportMessage() {
+    final Map<String, String> result = new HashMap<>();
+    result.put("brokerId", brokerId);
+    result.put("topicName", topicName);
+    result.put("consensusGroupId", consensusGroupId.toString());
+    result.put("currentReadSearchIndex", String.valueOf(nextExpectedSearchIndex.get()));
+    result.put("prefetchingQueueSize", String.valueOf(prefetchingQueue.size()));
+    result.put("inFlightEventsSize", String.valueOf(inFlightEvents.size()));
+    result.put("pendingEntriesSize", String.valueOf(pendingEntries.size()));
+    result.put("pendingPathAcceptedEntries", String.valueOf(getPendingPathAcceptedEntries()));
+    result.put("walPathAcceptedEntries", String.valueOf(getWalPathAcceptedEntries()));
+    result.put("seekGeneration", String.valueOf(seekGeneration.get()));
+    result.put("walGapSkippedEntries", String.valueOf(walGapSkippedEntries.get()));
+    result.put("bufferedRealtimeEntryCount", String.valueOf(getRealtimeBufferedEntryCount()));
+    result.put("lag", String.valueOf(getLag()));
+    result.put("isClosed", String.valueOf(isClosed));
+    result.put("isActive", String.valueOf(isActive));
+    result.put("orderMode", orderMode);
+    result.put("preferredWriterNodeId", String.valueOf(preferredWriterNodeId));
+    result.put("activeWriterCount", String.valueOf(activeWriterNodeIds.size()));
+    result.put("runtimeActiveWriterCount", String.valueOf(runtimeActiveWriterNodeIds.size()));
+    result.put("recoveryWriterCount", String.valueOf(recoveryWriterProgressByWriter.size()));
+    result.put("writerLaneCount", String.valueOf(writerLanes.size()));
+    result.put("realtimeLaneCount", String.valueOf(realtimeEntriesByLane.size()));
+    return result;
+  }
+
+  @Override
+  public String toString() {
+    return "ConsensusPrefetchingQueue" + coreReportMessage();
+  }
+
+  // ======================== Inner Classes ========================
+
+  /** Read-only view of one buffered entry: its tablets plus its position and writer identity. */
+  private interface LaneBufferedEntry {
+    // NOTE(review): the element type of this List looks lost in extraction - restore it.
+    List getTablets();
+
+    long getSearchIndex();
+
+    long getPhysicalTime();
+
+    int getWriterNodeId();
+
+    long getWriterEpoch();
+
+    long getLocalSeq();
+
+    OrderingKey getOrderingKey();
+  }
+
+  /** Mutable accumulator for the tablets and bookkeeping of one delivery batch. */
+  private static final class DeliveryBatchState {
+
+    // NOTE(review): the element type of this List looks lost in extraction - restore it.
+    private final List tablets = new ArrayList<>();
+    private long startSearchIndex;
+    private long endSearchIndex;
+    private long estimatedBytes;
+    private long firstTabletTimeMs;
+    private long physicalTime;
+    private long lastLocalSeq;
+    private int writerNodeId;
+    private long writerEpoch;
+    private int entryCount;
+
+    private DeliveryBatchState() {
+      reset();
+    }
+
+    private boolean isEmpty() {
+      return tablets.isEmpty();
+    }
+
+    /**
+     * Appends one entry to the batch.
+     *
+     * @param entry the entry whose tablets are added
+     * @param entryEstimatedBytes size estimate added to {@code estimatedBytes}
+     * @param trackLingerTime when true and the batch is empty, records the current time as the
+     *     time of the first tablet
+     */
+    private void append(
+        final LaneBufferedEntry entry,
+        final long entryEstimatedBytes,
+        final boolean trackLingerTime) {
+      // Writer identity is assigned unconditionally below, so the empty-batch case only needs to
+      // record the linger start time.
+      if (tablets.isEmpty() && trackLingerTime) {
+        firstTabletTimeMs = System.currentTimeMillis();
+      }
+      // Entries with a negative search index leave the batch's search-index range untouched.
+      if (entry.getSearchIndex() >= 0) {
+        if (startSearchIndex < 0) {
+          startSearchIndex = entry.getSearchIndex();
+        }
+        endSearchIndex = entry.getSearchIndex();
+      }
+      tablets.addAll(entry.getTablets());
+      estimatedBytes += entryEstimatedBytes;
+      physicalTime = entry.getPhysicalTime();
+      lastLocalSeq = entry.getLocalSeq();
+      writerNodeId = entry.getWriterNodeId();
+      writerEpoch = entry.getWriterEpoch();
+      entryCount++;
+    }
+
+    /** Empties the batch and restores every field to its initial sentinel value. */
+    private void reset() {
+      tablets.clear();
+      startSearchIndex = -1L;
+      endSearchIndex = -1L;
+      estimatedBytes = 0L;
+      firstTabletTimeMs = 0L;
+      physicalTime = 0L;
+      lastLocalSeq = -1L;
+      writerNodeId = -1;
+      writerEpoch = 0L;
+      entryCount = 0;
+    }
+  }
+
+  /** Identity of a writer lane: (writerNodeId, writerEpoch). Usable as a hash key. */
+  private static final class WriterLaneId {
+    private final int writerNodeId;
+    private final long writerEpoch;
+
+    private WriterLaneId(final int writerNodeId, final long writerEpoch) {
+      this.writerNodeId = writerNodeId;
+      this.writerEpoch = writerEpoch;
+    }
+
+    @Override
+    public boolean equals(final Object obj) {
+      if (this == obj) {
+        return true;
+      }
+      if (!(obj instanceof WriterLaneId)) {
+        return false;
+      }
+      final WriterLaneId that = (WriterLaneId) obj;
+      return writerNodeId == that.writerNodeId && writerEpoch == that.writerEpoch;
+    }
+
+    @Override
+    public int hashCode() {
+      return Objects.hash(writerNodeId, writerEpoch);
+    }
+  }
+
+  /** Mutable per-lane state: the lane's effective safe point and whether it is closed. */
+  private static final class WriterLaneState {
+    private long effectiveSafePt = 0L;
+    private boolean closed = false;
+  }
+
+  /** Immutable {@link LaneBufferedEntry} holding the values passed to its constructor. */
+  private static final class PreparedEntry implements LaneBufferedEntry {
+    // NOTE(review): the element type of this List looks lost in extraction - restore it.
+    private final List tablets;
+    private final long searchIndex;
+    private final long physicalTime;
+    private final int writerNodeId;
+    private final long writerEpoch;
+    private final long localSeq;
+
+    private PreparedEntry(
+        final List tablets,
+        final long searchIndex,
+        final long physicalTime,
+        final int writerNodeId,
+        final long writerEpoch,
+        final long localSeq) {
+      this.tablets = tablets;
+      this.searchIndex = searchIndex;
+      this.physicalTime = physicalTime;
+      this.writerNodeId = writerNodeId;
+      this.writerEpoch = writerEpoch;
+      this.localSeq = localSeq;
+    }
+
+    @Override
+    public List getTablets() {
+      return tablets;
+    }
+
+    @Override
+    public long getSearchIndex() {
+      return searchIndex;
+    }
+
+    @Override
+    public long getPhysicalTime() {
+      return physicalTime;
+    }
+
+    @Override
+    public int getWriterNodeId() {
+      return writerNodeId;
+    }
+
+    @Override
+    public long getWriterEpoch() {
+      return writerEpoch;
+    }
+
+    @Override
+    public long getLocalSeq() {
+      return localSeq;
+    }
+
+    @Override
+    public OrderingKey getOrderingKey() {
+      return new OrderingKey(physicalTime, writerNodeId, writerEpoch, localSeq);
+    }
+  }
+
+  /**
+   * The frontier of one lane: either the ordering key of its head entry or a barrier at the
+   * lane's effective safe point. Ordered by ordering key, then barriers before heads, then lane
+   * node id and lane epoch.
+   *
+   * <p>{@code equals} is not overridden, so this ordering is not consistent with {@code equals}.
+   */
+  private static final class LaneFrontier implements Comparable<LaneFrontier> {
+    private final WriterLaneId laneId;
+    private final OrderingKey orderingKey;
+    private final boolean isBarrier;
+
+    private LaneFrontier(
+        final WriterLaneId laneId, final OrderingKey orderingKey, final boolean isBarrier) {
+      this.laneId = laneId;
+      this.orderingKey = orderingKey;
+      this.isBarrier = isBarrier;
+    }
+
+    private static LaneFrontier forHead(final WriterLaneId laneId, final LaneBufferedEntry entry) {
+      return new LaneFrontier(laneId, entry.getOrderingKey(), false);
+    }
+
+    private static LaneFrontier forBarrier(final WriterLaneId laneId, final long effectiveSafePt) {
+      // Minimum values in the non-time components make the barrier key sort before every entry
+      // key that has the same physical time.
+      return new LaneFrontier(
+          laneId,
+          new OrderingKey(effectiveSafePt, Integer.MIN_VALUE, Long.MIN_VALUE, Long.MIN_VALUE),
+          true);
+    }
+
+    @Override
+    public int compareTo(final LaneFrontier other) {
+      int cmp = orderingKey.compareTo(other.orderingKey);
+      if (cmp != 0) {
+        return cmp;
+      }
+      if (isBarrier != other.isBarrier) {
+        return isBarrier ? -1 : 1;
+      }
+      cmp = Integer.compare(laneId.writerNodeId, other.laneId.writerNodeId);
+      if (cmp != 0) {
+        return cmp;
+      }
+      return Long.compare(laneId.writerEpoch, other.laneId.writerEpoch);
+    }
+  }
+
+  /** Composite ordering key (physicalTime, nodeId, writerEpoch, localSeq) for lane ordering.
*/ + static final class OrderingKey implements Comparable { + final long physicalTime; + final int nodeId; + final long writerEpoch; + final long localSeq; + + OrderingKey( + final long physicalTime, final int nodeId, final long writerEpoch, final long localSeq) { + this.physicalTime = physicalTime; + this.nodeId = nodeId; + this.writerEpoch = writerEpoch; + this.localSeq = localSeq; + } + + @Override + public int compareTo(final OrderingKey o) { + int cmp = Long.compare(physicalTime, o.physicalTime); + if (cmp != 0) { + return cmp; + } + cmp = Integer.compare(nodeId, o.nodeId); + if (cmp != 0) { + return cmp; + } + cmp = Long.compare(writerEpoch, o.writerEpoch); + if (cmp != 0) { + return cmp; + } + return Long.compare(localSeq, o.localSeq); + } + + @Override + public boolean equals(final Object o) { + if (this == o) { + return true; + } + if (!(o instanceof OrderingKey)) { + return false; + } + final OrderingKey that = (OrderingKey) o; + return physicalTime == that.physicalTime + && nodeId == that.nodeId + && writerEpoch == that.writerEpoch + && localSeq == that.localSeq; + } + + @Override + public int hashCode() { + return Objects.hash(physicalTime, nodeId, writerEpoch, localSeq); + } + + @Override + public String toString() { + return "(" + physicalTime + "," + nodeId + "," + writerEpoch + "," + localSeq + ")"; + } + } +} diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/consensus/ConsensusRegionRuntimeState.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/consensus/ConsensusRegionRuntimeState.java new file mode 100644 index 0000000000000..92e030ce93b8f --- /dev/null +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/consensus/ConsensusRegionRuntimeState.java @@ -0,0 +1,86 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.iotdb.db.subscription.broker.consensus; + +import java.util.Collections; +import java.util.LinkedHashSet; +import java.util.Objects; +import java.util.Set; + +/** Runtime control state for consensus subscription delivery on a single region replica. */ +public class ConsensusRegionRuntimeState { + + private final long runtimeVersion; + private final int preferredWriterNodeId; + private final boolean active; + private final Set activeWriterNodeIds; + + public ConsensusRegionRuntimeState( + final long runtimeVersion, + final int preferredWriterNodeId, + final boolean active, + final Set activeWriterNodeIds) { + this.runtimeVersion = runtimeVersion; + this.preferredWriterNodeId = preferredWriterNodeId; + this.active = active; + this.activeWriterNodeIds = + Collections.unmodifiableSet( + new LinkedHashSet<>(Objects.requireNonNull(activeWriterNodeIds))); + } + + public long getRuntimeVersion() { + return runtimeVersion; + } + + public int getPreferredWriterNodeId() { + return preferredWriterNodeId; + } + + public boolean isActive() { + return active; + } + + public Set getActiveWriterNodeIds() { + return activeWriterNodeIds; + } + + public static ConsensusRegionRuntimeState leaderOnly( + final long runtimeVersion, final int preferredWriterNodeId, final 
boolean active) { + return new ConsensusRegionRuntimeState( + runtimeVersion, + preferredWriterNodeId, + active, + Collections.singleton(preferredWriterNodeId)); + } + + @Override + public String toString() { + return "ConsensusRegionRuntimeState{" + + "runtimeVersion=" + + runtimeVersion + + ", preferredWriterNodeId=" + + preferredWriterNodeId + + ", active=" + + active + + ", activeWriterNodeIds=" + + activeWriterNodeIds + + '}'; + } +} diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/consensus/ConsensusSubscriptionCommitManager.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/consensus/ConsensusSubscriptionCommitManager.java new file mode 100644 index 0000000000000..593619a93f6ed --- /dev/null +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/consensus/ConsensusSubscriptionCommitManager.java @@ -0,0 +1,1297 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.iotdb.db.subscription.broker.consensus; + +import org.apache.iotdb.common.rpc.thrift.TDataNodeLocation; +import org.apache.iotdb.common.rpc.thrift.TEndPoint; +import org.apache.iotdb.common.rpc.thrift.TRegionReplicaSet; +import org.apache.iotdb.commons.client.ClientPoolFactory; +import org.apache.iotdb.commons.client.IClientManager; +import org.apache.iotdb.commons.client.exception.ClientManagerException; +import org.apache.iotdb.commons.client.sync.SyncDataNodeInternalServiceClient; +import org.apache.iotdb.commons.consensus.ConfigRegionId; +import org.apache.iotdb.commons.consensus.ConsensusGroupId; +import org.apache.iotdb.commons.subscription.config.SubscriptionConfig; +import org.apache.iotdb.confignode.rpc.thrift.TGetCommitProgressReq; +import org.apache.iotdb.confignode.rpc.thrift.TGetCommitProgressResp; +import org.apache.iotdb.db.conf.IoTDBDescriptor; +import org.apache.iotdb.db.protocol.client.ConfigNodeClient; +import org.apache.iotdb.db.protocol.client.ConfigNodeClientManager; +import org.apache.iotdb.db.protocol.client.ConfigNodeInfo; +import org.apache.iotdb.db.queryengine.plan.analyze.ClusterPartitionFetcher; +import org.apache.iotdb.mpp.rpc.thrift.TSyncSubscriptionProgressReq; +import org.apache.iotdb.rpc.TSStatusCode; +import org.apache.iotdb.rpc.subscription.payload.poll.RegionProgress; +import org.apache.iotdb.rpc.subscription.payload.poll.WriterId; +import org.apache.iotdb.rpc.subscription.payload.poll.WriterProgress; + +import org.apache.thrift.TException; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.ByteArrayOutputStream; +import java.io.DataOutputStream; +import java.io.File; +import java.io.FileInputStream; +import java.io.FileOutputStream; +import java.io.IOException; +import java.nio.ByteBuffer; +import java.util.Collections; +import java.util.Iterator; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Map; +import java.util.Objects; +import java.util.Set; 
+import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; + +/** + * Manages commit state for consensus-based subscriptions. + * + *

This manager tracks which events have been committed by consumers, identifying each event by + * the writer that produced it (WriterId) and that writer's progress (WriterProgress: physical time + * and local sequence), with no intermediate commitId mapping. It maintains the progress for each + * (consumerGroup, topic, region) triple and supports persistence and recovery. + * + *

Progress is tracked per-region because searchIndex is region-local — each DataRegion + * has its own independent WAL with its own searchIndex namespace. Using a single state per topic + * would cause TreeSet deduplication bugs when different regions emit the same searchIndex value. + * + *

Key responsibilities: + * + *

    + *
  • Track outstanding (dispatched but not-yet-committed) events per writer, by WriterId and WriterProgress + *
  • Handle commit/ack from consumers + *
  • Persist and recover progress state + *
+ */ +public class ConsensusSubscriptionCommitManager { + + private static final Logger LOGGER = + LoggerFactory.getLogger(ConsensusSubscriptionCommitManager.class); + + private static final String PROGRESS_FILE_PREFIX = "consensus_subscription_progress_"; + private static final String PROGRESS_FILE_SUFFIX = ".dat"; + + private static final IClientManager CONFIG_NODE_CLIENT_MANAGER = + ConfigNodeClientManager.getInstance(); + + /** Client manager for DataNode-to-DataNode RPC (progress broadcast). */ + private static final IClientManager + SYNC_DN_CLIENT_MANAGER = + new IClientManager.Factory() + .createClientManager( + new ClientPoolFactory.SyncDataNodeInternalServiceClientPoolFactory()); + + /** Minimum interval (ms) between broadcasts for the same (consumerGroup, topic, region). */ + private static final long MIN_BROADCAST_INTERVAL_MS = 5000; + + /** Rate-limiting: last broadcast timestamp per key. */ + private final Map lastBroadcastTime = new ConcurrentHashMap<>(); + + /** Single-threaded executor for fire-and-forget broadcasts. */ + private final ExecutorService broadcastExecutor = + Executors.newSingleThreadExecutor( + r -> { + final Thread t = new Thread(r, "SubscriptionProgressBroadcast"); + t.setDaemon(true); + return t; + }); + + /** Key: "consumerGroupId##topicName##regionId" -> progress tracking state */ + private final Map commitStates = + new ConcurrentHashMap<>(); + + private final String persistDir; + + private ConsensusSubscriptionCommitManager() { + this.persistDir = + IoTDBDescriptor.getInstance().getConfig().getSystemDir() + + File.separator + + "subscription" + + File.separator + + "consensus_progress"; + final File dir = new File(persistDir); + if (!dir.exists()) { + dir.mkdirs(); + } + } + + /** + * Gets or creates the commit state for a specific (consumerGroup, topic, region) triple. 
+ * + * @param consumerGroupId the consumer group ID + * @param topicName the topic name + * @param regionId the consensus group / data region ID + * @return the commit state + */ + public ConsensusSubscriptionCommitState getOrCreateState( + final String consumerGroupId, final String topicName, final ConsensusGroupId regionId) { + final String key = generateKey(consumerGroupId, topicName, regionId); + final String regionIdString = regionId.toString(); + return commitStates.computeIfAbsent( + key, + k -> { + // Try to recover from persisted local state + final ConsensusSubscriptionCommitState recovered = tryRecover(key, regionIdString); + if (recovered != null) { + return recovered; + } + final ConsensusSubscriptionCommitState recoveredFromConfigNode = + queryCommitProgressStateFromConfigNode(consumerGroupId, topicName, regionId); + if (Objects.nonNull(recoveredFromConfigNode)) { + return recoveredFromConfigNode; + } + return new ConsensusSubscriptionCommitState( + regionIdString, new SubscriptionConsensusProgress()); + }); + } + + public boolean hasPersistedState( + final String consumerGroupId, final String topicName, final ConsensusGroupId regionId) { + return getProgressFile(generateKey(consumerGroupId, topicName, regionId)).exists(); + } + + public void recordMapping( + final String consumerGroupId, + final String topicName, + final ConsensusGroupId regionId, + final WriterId writerId, + final WriterProgress writerProgress) { + final ConsensusSubscriptionCommitState state = + getOrCreateState(consumerGroupId, topicName, regionId); + state.recordMapping(writerId, writerProgress); + } + + public boolean commit( + final String consumerGroupId, + final String topicName, + final ConsensusGroupId regionId, + final WriterId writerId, + final WriterProgress writerProgress) { + final String key = generateKey(consumerGroupId, topicName, regionId); + final ConsensusSubscriptionCommitState state = commitStates.get(key); + if (state == null) { + LOGGER.warn( + 
"ConsensusSubscriptionCommitManager: Cannot commit for unknown state, " + + "consumerGroupId={}, topicName={}, regionId={}, writerId={}, writerProgress={}", + consumerGroupId, + topicName, + regionId, + writerId, + writerProgress); + return false; + } + final CommitOperationResult result = state.commitAndGetResult(writerId, writerProgress); + if (result.isHandled()) { + // Periodically persist progress + persistProgressIfNeeded(key, state); + if (result.hasAdvancedWriter()) { + maybeBroadcast( + key, + consumerGroupId, + topicName, + regionId, + result.getAdvancedWriterProgress(), + result.getAdvancedWriterId()); + } + } + return result.isHandled(); + } + + public boolean commitWithoutOutstanding( + final String consumerGroupId, + final String topicName, + final ConsensusGroupId regionId, + final WriterId writerId, + final WriterProgress writerProgress) { + final String key = generateKey(consumerGroupId, topicName, regionId); + final ConsensusSubscriptionCommitState state = commitStates.get(key); + if (state == null) { + LOGGER.warn( + "ConsensusSubscriptionCommitManager: Cannot direct-commit for unknown state, " + + "consumerGroupId={}, topicName={}, regionId={}, writerId={}, writerProgress={}", + consumerGroupId, + topicName, + regionId, + writerId, + writerProgress); + return false; + } + final CommitOperationResult result = + state.commitWithoutOutstandingAndGetResult(writerId, writerProgress); + if (result.isHandled()) { + persistProgressIfNeeded(key, state); + if (result.hasAdvancedWriter()) { + maybeBroadcast( + key, + consumerGroupId, + topicName, + regionId, + result.getAdvancedWriterProgress(), + result.getAdvancedWriterId()); + } + } + return result.isHandled(); + } + + public long getCommittedPhysicalTime( + final String consumerGroupId, final String topicName, final ConsensusGroupId regionId) { + final String key = generateKey(consumerGroupId, topicName, regionId); + final ConsensusSubscriptionCommitState state = commitStates.get(key); + return state 
!= null ? state.getCommittedPhysicalTime() : 0L; + } + + public long getCommittedLocalSeq( + final String consumerGroupId, final String topicName, final ConsensusGroupId regionId) { + final String key = generateKey(consumerGroupId, topicName, regionId); + final ConsensusSubscriptionCommitState state = commitStates.get(key); + return state != null ? state.getCommittedLocalSeq() : -1L; + } + + public int getCommittedWriterNodeId( + final String consumerGroupId, final String topicName, final ConsensusGroupId regionId) { + final String key = generateKey(consumerGroupId, topicName, regionId); + final ConsensusSubscriptionCommitState state = commitStates.get(key); + return state != null ? state.getCommittedWriterNodeId() : -1; + } + + public long getCommittedWriterEpoch( + final String consumerGroupId, final String topicName, final ConsensusGroupId regionId) { + final String key = generateKey(consumerGroupId, topicName, regionId); + final ConsensusSubscriptionCommitState state = commitStates.get(key); + return state != null ? state.getCommittedWriterEpoch() : 0L; + } + + public WriterId getCommittedWriterId( + final String consumerGroupId, final String topicName, final ConsensusGroupId regionId) { + final String key = generateKey(consumerGroupId, topicName, regionId); + final ConsensusSubscriptionCommitState state = commitStates.get(key); + return state != null ? state.getCommittedWriterId() : null; + } + + public WriterProgress getCommittedWriterProgress( + final String consumerGroupId, final String topicName, final ConsensusGroupId regionId) { + final String key = generateKey(consumerGroupId, topicName, regionId); + final ConsensusSubscriptionCommitState state = commitStates.get(key); + return state != null ? 
state.getCommittedWriterProgress() : null; + } + + public RegionProgress getCommittedRegionProgress( + final String consumerGroupId, final String topicName, final ConsensusGroupId regionId) { + final String key = generateKey(consumerGroupId, topicName, regionId); + final ConsensusSubscriptionCommitState state = commitStates.get(key); + if (state == null) { + return new RegionProgress(Collections.emptyMap()); + } + return state.getCommittedRegionProgress(); + } + + /** + * Removes state for a specific (consumerGroup, topic, region) triple. + * + * @param consumerGroupId the consumer group ID + * @param topicName the topic name + * @param regionId the consensus group / data region ID + */ + public void removeState( + final String consumerGroupId, final String topicName, final ConsensusGroupId regionId) { + final String key = generateKey(consumerGroupId, topicName, regionId); + commitStates.remove(key); + // Clean up persisted file + final File file = getProgressFile(key); + if (file.exists()) { + file.delete(); + } + } + + /** + * Removes all states for a given (consumerGroup, topic) pair across all regions. Used during + * subscription teardown when the individual regionIds may not be readily available. 
+ * + * @param consumerGroupId the consumer group ID + * @param topicName the topic name + */ + public void removeAllStatesForTopic(final String consumerGroupId, final String topicName) { + final String prefix = consumerGroupId + KEY_SEPARATOR + topicName + KEY_SEPARATOR; + final Iterator> it = + commitStates.entrySet().iterator(); + while (it.hasNext()) { + final Map.Entry entry = it.next(); + if (entry.getKey().startsWith(prefix)) { + it.remove(); + final File file = getProgressFile(entry.getKey()); + if (file.exists()) { + file.delete(); + } + } + } + } + + public void resetState( + final String consumerGroupId, + final String topicName, + final ConsensusGroupId regionId, + final RegionProgress regionProgress) { + final String key = generateKey(consumerGroupId, topicName, regionId); + final ConsensusSubscriptionCommitState state = commitStates.get(key); + if (state == null) { + LOGGER.warn( + "ConsensusSubscriptionCommitManager: Cannot reset unknown state, " + + "consumerGroupId={}, topicName={}, regionId={}", + consumerGroupId, + topicName, + regionId); + return; + } + state.resetForSeek(regionProgress); + persistProgress(key, state); + } + + /** Persists all states. Should be called during graceful shutdown. 
*/ + public void persistAll() { + for (final Map.Entry entry : + commitStates.entrySet()) { + persistProgress(entry.getKey(), entry.getValue()); + } + } + + public Map collectAllRegionProgress(final int dataNodeId) { + final Map result = new ConcurrentHashMap<>(); + final String suffix = KEY_SEPARATOR + dataNodeId; + for (final Map.Entry entry : + commitStates.entrySet()) { + final RegionProgress regionProgress = entry.getValue().getCommittedRegionProgress(); + final ByteBuffer serialized = serializeRegionProgress(regionProgress); + if (Objects.nonNull(serialized)) { + result.put(entry.getKey() + suffix, serialized); + } + } + return result; + } + + // ======================== Progress Broadcast (Leader → Follower) ======================== + + /** + * Broadcasts committed progress to followers if enough time has elapsed since the last broadcast + * for this key. The broadcast is async and fire-and-forget. + */ + private void maybeBroadcast( + final String key, + final String consumerGroupId, + final String topicName, + final ConsensusGroupId regionId, + final WriterProgress committedWriterProgress, + final WriterId committedWriterId) { + if (Objects.isNull(committedWriterId) || Objects.isNull(committedWriterProgress)) { + return; + } + final String broadcastKey = buildBroadcastKey(key, committedWriterId); + final long now = System.currentTimeMillis(); + final Long last = lastBroadcastTime.get(broadcastKey); + if (last != null && now - last < MIN_BROADCAST_INTERVAL_MS) { + return; + } + lastBroadcastTime.put(broadcastKey, now); + broadcastExecutor.submit( + () -> + doBroadcast( + consumerGroupId, topicName, regionId, committedWriterProgress, committedWriterId)); + } + + /** + * Sends committed progress to all follower replicas of the given region. Uses the partition cache + * to discover replica endpoints and skips the local DataNode. 
+ */ + private void doBroadcast( + final String consumerGroupId, + final String topicName, + final ConsensusGroupId regionId, + final WriterProgress writerProgress, + final WriterId writerId) { + final int localDataNodeId = IoTDBDescriptor.getInstance().getConfig().getDataNodeId(); + try { + final List replicaSets = + ClusterPartitionFetcher.getInstance() + .getRegionReplicaSet( + Collections.singletonList(regionId.convertToTConsensusGroupId())); + if (replicaSets.isEmpty()) { + return; + } + final String regionIdStr = regionId.toString(); + final TSyncSubscriptionProgressReq req = + new TSyncSubscriptionProgressReq( + consumerGroupId, + topicName, + regionIdStr, + Objects.nonNull(writerProgress) ? writerProgress.getPhysicalTime() : 0L, + Objects.nonNull(writerProgress) ? writerProgress.getLocalSeq() : -1L); + if (Objects.nonNull(writerId) && writerId.getNodeId() >= 0) { + req.setWriterNodeId(writerId.getNodeId()); + } + if (Objects.nonNull(writerId) && writerId.getWriterEpoch() > 0) { + req.setWriterEpoch(writerId.getWriterEpoch()); + } + + for (final TDataNodeLocation location : replicaSets.get(0).getDataNodeLocations()) { + if (location.getDataNodeId() == localDataNodeId) { + continue; // skip self + } + final TEndPoint endpoint = location.getInternalEndPoint(); + try (final SyncDataNodeInternalServiceClient client = + SYNC_DN_CLIENT_MANAGER.borrowClient(endpoint)) { + client.syncSubscriptionProgress(req); + } catch (final ClientManagerException | TException e) { + LOGGER.debug( + "Failed to broadcast subscription progress to DataNode {} at {}: {}", + location.getDataNodeId(), + endpoint, + e.getMessage()); + } + } + } catch (final Exception e) { + LOGGER.debug( + "Failed to broadcast subscription progress for region {}: {}", regionId, e.getMessage()); + } + } + + /** + * Receives a committed progress broadcast from another DataNode (Leader). Updates local state if + * the broadcast progress is ahead of the current local progress. 
+   */
+  public void receiveProgressBroadcast(
+      final String consumerGroupId,
+      final String topicName,
+      final String regionIdStr,
+      final long physicalTime,
+      final long localSeq,
+      final int writerNodeId,
+      final long writerEpoch) {
+    receiveProgressBroadcast(
+        consumerGroupId,
+        topicName,
+        regionIdStr,
+        buildWriterId(regionIdStr, writerNodeId, writerEpoch),
+        new WriterProgress(physicalTime, localSeq));
+  }
+
+  /**
+   * Applies a committed-progress broadcast to the state keyed by (consumerGroupId, topicName,
+   * regionIdStr).
+   *
+   * <p>A broadcast with a null writer id or null writer progress is logged and ignored. When a
+   * state already exists it is updated through {@code updateFromBroadcast} and persisted according
+   * to the configured persist interval. Otherwise a new state seeded with the broadcast progress
+   * is installed and persisted immediately.
+   *
+   * @param consumerGroupId the consumer group ID
+   * @param topicName the topic name
+   * @param regionIdStr string form of the region ID, used as the last key component
+   * @param writerId identity of the writer whose progress is being broadcast
+   * @param writerProgress the committed progress of that writer
+   */
+  public void receiveProgressBroadcast(
+      final String consumerGroupId,
+      final String topicName,
+      final String regionIdStr,
+      final WriterId writerId,
+      final WriterProgress writerProgress) {
+    if (Objects.isNull(writerId) || Objects.isNull(writerProgress)) {
+      LOGGER.warn(
+          "ConsensusSubscriptionCommitManager: ignore broadcast without writer identity, "
+              + "consumerGroupId={}, topicName={}, regionId={}, writerId={}, writerProgress={}",
+          consumerGroupId,
+          topicName,
+          regionIdStr,
+          writerId,
+          writerProgress);
+      return;
+    }
+    final String key = consumerGroupId + KEY_SEPARATOR + topicName + KEY_SEPARATOR + regionIdStr;
+    final ConsensusSubscriptionCommitState state = commitStates.get(key);
+    if (state != null) {
+      // Update only if broadcast is ahead
+      state.updateFromBroadcast(writerId, writerProgress);
+      persistProgressIfNeeded(key, state);
+    } else {
+      // Create a new state from the broadcast progress
+      final ConsensusSubscriptionCommitState newState =
+          new ConsensusSubscriptionCommitState(
+              regionIdStr,
+              new SubscriptionConsensusProgress(
+                  new RegionProgress(Collections.singletonMap(writerId, writerProgress)), 0L));
+      newState.updateFromBroadcast(writerId, writerProgress);
+      // Another thread may have installed a state between the get() above and this call.
+      // Use the value putIfAbsent returns instead of re-reading the map: a second get() could
+      // observe null after a concurrent removal, and the winner would never see this broadcast.
+      final ConsensusSubscriptionCommitState raced = commitStates.putIfAbsent(key, newState);
+      if (raced != null) {
+        raced.updateFromBroadcast(writerId, writerProgress);
+        persistProgress(key, raced);
+      } else {
+        persistProgress(key, newState);
+      }
+    }
+    LOGGER.debug(
+        "Received subscription progress broadcast: consumerGroupId={}, topicName={}, "
+            + "regionId={}, physicalTime={}, localSeq={}",
+        consumerGroupId,
+        topicName,
+        regionIdStr,
+        writerProgress != null ?
writerProgress.getPhysicalTime() : 0L, + writerProgress != null ? writerProgress.getLocalSeq() : -1L); + } + + // ======================== Helper Methods ======================== + + // Use a separator that cannot appear in consumerGroupId, topicName, or regionId + // to prevent key collisions (e.g., "a_b" + "c" vs "a" + "b_c"). + private static final String KEY_SEPARATOR = "##"; + + private String generateKey( + final String consumerGroupId, final String topicName, final ConsensusGroupId regionId) { + return consumerGroupId + KEY_SEPARATOR + topicName + KEY_SEPARATOR + regionId.toString(); + } + + private File getProgressFile(final String key) { + return new File(persistDir, PROGRESS_FILE_PREFIX + key + PROGRESS_FILE_SUFFIX); + } + + private ConsensusSubscriptionCommitState tryRecover(final String key, final String regionIdStr) { + final File file = getProgressFile(key); + if (!file.exists()) { + return null; + } + try (final FileInputStream fis = new FileInputStream(file)) { + final byte[] bytes = new byte[(int) file.length()]; + fis.read(bytes); + final ByteBuffer buffer = ByteBuffer.wrap(bytes); + return ConsensusSubscriptionCommitState.deserialize(regionIdStr, buffer); + } catch (final IOException e) { + LOGGER.warn("Failed to recover consensus subscription progress from {}", file, e); + return null; + } + } + + private static WriterId buildWriterId( + final String regionIdStr, final int writerNodeId, final long writerEpoch) { + return writerNodeId >= 0 ? new WriterId(regionIdStr, writerNodeId, writerEpoch) : null; + } + + static String buildBroadcastKey(final String key, final WriterId writerId) { + return key + + KEY_SEPARATOR + + (Objects.nonNull(writerId) ? writerId.getNodeId() : -1) + + KEY_SEPARATOR + + (Objects.nonNull(writerId) ? 
writerId.getWriterEpoch() : 0L); + } + + private ConsensusSubscriptionCommitState queryCommitProgressStateFromConfigNode( + final String consumerGroupId, final String topicName, final ConsensusGroupId regionId) { + try (final ConfigNodeClient configNodeClient = + CONFIG_NODE_CLIENT_MANAGER.borrowClient(ConfigNodeInfo.CONFIG_REGION_ID)) { + final TGetCommitProgressReq req = + new TGetCommitProgressReq( + consumerGroupId, + topicName, + regionId.getId(), + IoTDBDescriptor.getInstance().getConfig().getDataNodeId()); + final TGetCommitProgressResp resp = configNodeClient.getCommitProgress(req); + if (resp.status.getCode() != TSStatusCode.SUCCESS_STATUS.getStatusCode()) { + return null; + } + if (resp.isSetCommittedRegionProgress()) { + final RegionProgress committedRegionProgress = + deserializeRegionProgress( + ByteBuffer.wrap(resp.getCommittedRegionProgress()).asReadOnlyBuffer()); + if (Objects.nonNull(committedRegionProgress) + && !committedRegionProgress.getWriterPositions().isEmpty()) { + LOGGER.info( + "ConsensusSubscriptionCommitManager: recovered committedRegionProgress={} from " + + "ConfigNode for consumerGroupId={}, topicName={}, regionId={}", + committedRegionProgress, + consumerGroupId, + topicName, + regionId); + final ConsensusSubscriptionCommitState recoveredState = + new ConsensusSubscriptionCommitState( + regionId.toString(), new SubscriptionConsensusProgress()); + recoveredState.resetForSeek(committedRegionProgress); + return recoveredState; + } + } + } catch (final ClientManagerException | TException e) { + LOGGER.warn( + "ConsensusSubscriptionCommitManager: failed to query commit progress from ConfigNode " + + "for consumerGroupId={}, topicName={}, regionId={}, starting from 0", + consumerGroupId, + topicName, + regionId, + e); + } + return null; + } + + private static ByteBuffer serializeRegionProgress(final RegionProgress regionProgress) { + if (Objects.isNull(regionProgress)) { + return null; + } + try (final ByteArrayOutputStream baos = new 
ByteArrayOutputStream(); + final DataOutputStream dos = new DataOutputStream(baos)) { + regionProgress.serialize(dos); + dos.flush(); + return ByteBuffer.wrap(baos.toByteArray()); + } catch (final IOException e) { + LOGGER.warn("Failed to serialize committed region progress {}", regionProgress, e); + return null; + } + } + + private static RegionProgress deserializeRegionProgress(final ByteBuffer buffer) { + if (Objects.isNull(buffer)) { + return null; + } + final ByteBuffer duplicate = buffer.asReadOnlyBuffer(); + duplicate.rewind(); + return RegionProgress.deserialize(duplicate); + } + + private void persistProgressIfNeeded( + final String key, final ConsensusSubscriptionCommitState state) { + final int interval = + SubscriptionConfig.getInstance().getSubscriptionConsensusCommitPersistInterval(); + if (interval > 0 && state.getProgress().getCommitIndex() % interval == 0) { + persistProgress(key, state); + } + } + + private void persistProgress(final String key, final ConsensusSubscriptionCommitState state) { + final File file = getProgressFile(key); + try (final FileOutputStream fos = new FileOutputStream(file); + final DataOutputStream dos = new DataOutputStream(fos)) { + state.serialize(dos); + dos.flush(); + if (SubscriptionConfig.getInstance().isSubscriptionConsensusCommitFsyncEnabled()) { + fos.getFD().sync(); + } + } catch (final IOException e) { + LOGGER.warn("Failed to persist consensus subscription progress to {}", file, e); + } + } + + // ======================== Inner State Class ======================== + + /** + * Tracks commit state for a single (consumerGroup, topic, region) triple using (physicalTime, + * localSeq) pairs for cross-leader-migration consistency. Outstanding and committed positions are + * tracked as ProgressKey objects rather than raw searchIndex values. 
+ */ + public static class ConsensusSubscriptionCommitState { + + private final String regionId; + + private final SubscriptionConsensusProgress progress; + + /** LRU set of recently committed keys for idempotent re-commit detection. */ + private static final int RECENTLY_COMMITTED_CAPACITY = 1024; + + private final Set recentlyCommittedKeys = + Collections.newSetFromMap( + new LinkedHashMap() { + @Override + protected boolean removeEldestEntry(final Map.Entry eldest) { + return size() > RECENTLY_COMMITTED_CAPACITY; + } + }); + + /** Real committed checkpoint per writer. */ + private final Map committedWriterPositions = new LinkedHashMap<>(); + + /** Tracks dispatched but not-yet-committed events by writer-local slot. */ + private final Map outstandingKeys = new ConcurrentHashMap<>(); + + /** Tracks committed dispatched entries that cannot yet advance the frontier because of gaps. */ + private final Map committedPendingKeys = new LinkedHashMap<>(); + + public ConsensusSubscriptionCommitState( + final String regionId, final SubscriptionConsensusProgress progress) { + this.regionId = regionId; + this.progress = progress; + committedWriterPositions.putAll(progress.getCommittedRegionProgress().getWriterPositions()); + syncPersistedProgress(); + } + + public SubscriptionConsensusProgress getProgress() { + return progress; + } + + public long getCommittedPhysicalTime() { + return getDerivedCommittedFrontierKey().physicalTime; + } + + public long getCommittedLocalSeq() { + return getDerivedCommittedFrontierKey().localSeq; + } + + public int getCommittedWriterNodeId() { + final WriterId committedWriterId = getCommittedWriterId(); + return Objects.nonNull(committedWriterId) ? committedWriterId.getNodeId() : -1; + } + + public long getCommittedWriterEpoch() { + final WriterId committedWriterId = getCommittedWriterId(); + return Objects.nonNull(committedWriterId) ? 
committedWriterId.getWriterEpoch() : 0L; + } + + public WriterId getCommittedWriterId() { + return getDerivedCommittedFrontierKey().toWriterId(regionId); + } + + public WriterProgress getCommittedWriterProgress() { + return getDerivedCommittedFrontierKey().toWriterProgress(); + } + + public RegionProgress getCommittedRegionProgress() { + synchronized (this) { + return new RegionProgress(new LinkedHashMap<>(committedWriterPositions)); + } + } + + /** Threshold for warning about outstanding (uncommitted) entries accumulation. */ + private static final int OUTSTANDING_SIZE_WARN_THRESHOLD = 10000; + + public void recordMapping(final WriterId writerId, final WriterProgress writerProgress) { + if (Objects.isNull(writerId) || Objects.isNull(writerProgress)) { + LOGGER.warn( + "ConsensusSubscriptionCommitState: ignore mapping without writer identity, " + + "writerId={}, writerProgress={}", + writerId, + writerProgress); + return; + } + final ProgressKey key = new ProgressKey(writerId, writerProgress); + final ProgressSlot slot = ProgressSlot.from(key); + synchronized (this) { + final ProgressKey previous = outstandingKeys.put(slot, key); + if (Objects.nonNull(previous) && !previous.equals(key)) { + LOGGER.warn( + "ConsensusSubscriptionCommitState: duplicate outstanding mapping for slot={}, " + + "previous={}, current={}", + slot, + previous, + key); + } + final int size = outstandingKeys.size(); + if (size > OUTSTANDING_SIZE_WARN_THRESHOLD && size % OUTSTANDING_SIZE_WARN_THRESHOLD == 1) { + LOGGER.warn( + "ConsensusSubscriptionCommitState: outstanding size ({}) exceeds threshold ({}), " + + "consumers may not be committing. committed=({},{}), writer=({}, {})", + size, + OUTSTANDING_SIZE_WARN_THRESHOLD, + getCommittedPhysicalTime(), + getCommittedLocalSeq(), + getCommittedWriterNodeId(), + getCommittedWriterEpoch()); + } + } + } + + /** + * Commits the specified event and advances the committed position contiguously. 
+ * + * @param writerProgress the writer progress of the event to commit + * @return true if successfully committed + */ + public boolean commit(final WriterId writerId, final WriterProgress writerProgress) { + return commitAndGetResult(writerId, writerProgress).isHandled(); + } + + CommitOperationResult commitAndGetResult( + final WriterId writerId, final WriterProgress writerProgress) { + if (Objects.isNull(writerId) || Objects.isNull(writerProgress)) { + LOGGER.warn( + "ConsensusSubscriptionCommitState: missing writer identity for commit, " + + "writerId={}, writerProgress={}", + writerId, + writerProgress); + return CommitOperationResult.unhandled(); + } + final ProgressKey key = new ProgressKey(writerId, writerProgress); + + synchronized (this) { + final ProgressKey recordedKey = outstandingKeys.remove(ProgressSlot.from(key)); + if (recordedKey == null) { + if (recentlyCommittedKeys.contains(key)) { + LOGGER.debug( + "ConsensusSubscriptionCommitState: idempotent re-commit for ({},{},{},{})", + key.physicalTime, + key.localSeq, + key.writerNodeId, + key.writerEpoch); + progress.incrementCommitIndex(); + return CommitOperationResult.handledWithoutAdvance(); + } + LOGGER.warn( + "ConsensusSubscriptionCommitState: unknown key ({},{},{},{}) for commit", + key.physicalTime, + key.localSeq, + key.writerNodeId, + key.writerEpoch); + return CommitOperationResult.unhandled(); + } + final ProgressKey effectiveKey = recordedKey.resolveMissingFields(writerId, writerProgress); + final WriterId effectiveWriterId = effectiveKey.toWriterId(regionId); + final WriterProgress before = getCommittedWriterProgressForWriter(effectiveWriterId); + recentlyCommittedKeys.add(effectiveKey); + stageCommittedAndAdvance(effectiveKey); + progress.incrementCommitIndex(); + syncPersistedProgress(); + return buildCommitOperationResult( + effectiveWriterId, before, getCommittedWriterProgressForWriter(effectiveWriterId)); + } + } + + public boolean commitWithoutOutstanding( + final WriterId 
writerId, final WriterProgress writerProgress) { + return commitWithoutOutstandingAndGetResult(writerId, writerProgress).isHandled(); + } + + CommitOperationResult commitWithoutOutstandingAndGetResult( + final WriterId writerId, final WriterProgress writerProgress) { + if (Objects.isNull(writerId) || Objects.isNull(writerProgress)) { + LOGGER.warn( + "ConsensusSubscriptionCommitState: missing writer identity for direct commit, " + + "writerId={}, writerProgress={}", + writerId, + writerProgress); + return CommitOperationResult.unhandled(); + } + final ProgressKey incomingKey = new ProgressKey(writerId, writerProgress); + + synchronized (this) { + if (recentlyCommittedKeys.contains(incomingKey)) { + LOGGER.debug( + "ConsensusSubscriptionCommitState: idempotent direct commit for ({},{},{},{})", + incomingKey.physicalTime, + incomingKey.localSeq, + incomingKey.writerNodeId, + incomingKey.writerEpoch); + progress.incrementCommitIndex(); + return CommitOperationResult.handledWithoutAdvance(); + } + + final ProgressKey outstandingKey = outstandingKeys.remove(ProgressSlot.from(incomingKey)); + if (Objects.isNull(outstandingKey)) { + LOGGER.warn( + "ConsensusSubscriptionCommitState: reject direct commit without outstanding mapping " + + "for ({},{},{},{})", + incomingKey.physicalTime, + incomingKey.localSeq, + incomingKey.writerNodeId, + incomingKey.writerEpoch); + return CommitOperationResult.unhandled(); + } + final ProgressKey effectiveKey = + outstandingKey.resolveMissingFields(writerId, writerProgress); + final WriterId effectiveWriterId = effectiveKey.toWriterId(regionId); + final WriterProgress before = getCommittedWriterProgressForWriter(effectiveWriterId); + recentlyCommittedKeys.add(effectiveKey); + stageCommittedAndAdvance(effectiveKey); + progress.incrementCommitIndex(); + syncPersistedProgress(); + return buildCommitOperationResult( + effectiveWriterId, before, getCommittedWriterProgressForWriter(effectiveWriterId)); + } + } + + public void resetForSeek(final 
RegionProgress regionProgress) { + synchronized (this) { + outstandingKeys.clear(); + committedPendingKeys.clear(); + recentlyCommittedKeys.clear(); + committedWriterPositions.clear(); + if (Objects.nonNull(regionProgress)) { + for (final Map.Entry entry : + regionProgress.getWriterPositions().entrySet()) { + if (Objects.nonNull(entry.getKey()) && Objects.nonNull(entry.getValue())) { + committedWriterPositions.put(entry.getKey(), entry.getValue()); + } + } + } + syncPersistedProgress(); + } + } + + /** + * Updates committed progress from a Leader broadcast. Only advances if the broadcast position + * is ahead of the current local position. + */ + public void updateFromBroadcast(final WriterId writerId, final WriterProgress writerProgress) { + if (Objects.isNull(writerId) || Objects.isNull(writerProgress)) { + return; + } + synchronized (this) { + final ProgressKey incoming = new ProgressKey(writerId, writerProgress); + final WriterId incomingWriterId = incoming.toWriterId(regionId); + final WriterProgress currentWriterProgress = + getCommittedWriterProgressForWriter(incomingWriterId); + final ProgressKey current = new ProgressKey(incomingWriterId, currentWriterProgress); + if (incoming.compareTo(current) > 0) { + committedWriterPositions.put(incomingWriterId, incoming.toWriterProgress()); + syncPersistedProgress(); + } + } + } + + private void advanceCommitted(final ProgressKey key) { + final WriterId writerId = key.toWriterId(regionId); + if (Objects.isNull(writerId)) { + return; + } + committedWriterPositions.put(writerId, key.toWriterProgress()); + } + + private WriterProgress getCommittedWriterProgressForWriter(final WriterId writerId) { + return Objects.nonNull(writerId) + ? 
committedWriterPositions.getOrDefault(writerId, new WriterProgress(0L, -1L)) + : new WriterProgress(0L, -1L); + } + + private void stageCommittedAndAdvance(final ProgressKey key) { + committedPendingKeys.put(ProgressSlot.from(key), key); + final WriterId writerId = key.toWriterId(regionId); + if (Objects.isNull(writerId)) { + committedPendingKeys.remove(ProgressSlot.from(key)); + return; + } + ProgressKey current = + new ProgressKey(writerId, getCommittedWriterProgressForWriter(writerId)); + while (true) { + final ProgressKey nextCommitted = findNextCommittedKey(writerId, current); + if (Objects.isNull(nextCommitted)) { + return; + } + final ProgressKey nextOutstanding = findNextOutstandingKey(writerId, current); + if (Objects.nonNull(nextOutstanding) && nextOutstanding.compareTo(nextCommitted) < 0) { + return; + } + committedPendingKeys.remove(ProgressSlot.from(nextCommitted)); + advanceCommitted(nextCommitted); + current = nextCommitted; + } + } + + private void advanceCommittedIfAhead(final ProgressKey key) { + final WriterId writerId = key.toWriterId(regionId); + if (Objects.isNull(writerId)) { + return; + } + final WriterProgress currentWriterProgress = getCommittedWriterProgressForWriter(writerId); + final ProgressKey currentKey = new ProgressKey(writerId, currentWriterProgress); + if (key.compareTo(currentKey) > 0) { + advanceCommitted(key); + } + } + + private ProgressKey findNextCommittedKey(final WriterId writerId, final ProgressKey current) { + ProgressKey next = null; + for (final ProgressKey candidate : committedPendingKeys.values()) { + if (!sameWriter(writerId, candidate)) { + continue; + } + if (candidate.compareTo(current) <= 0) { + continue; + } + if (Objects.isNull(next) || candidate.compareTo(next) < 0) { + next = candidate; + } + } + return next; + } + + private ProgressKey findNextOutstandingKey(final WriterId writerId, final ProgressKey current) { + ProgressKey next = null; + for (final ProgressKey candidate : outstandingKeys.values()) { + if 
(!sameWriter(writerId, candidate)) { + continue; + } + if (candidate.compareTo(current) <= 0) { + continue; + } + if (Objects.isNull(next) || candidate.compareTo(next) < 0) { + next = candidate; + } + } + return next; + } + + private boolean sameWriter(final WriterId writerId, final ProgressKey key) { + return Objects.nonNull(writerId) + && writerId.getNodeId() == key.writerNodeId + && writerId.getWriterEpoch() == key.writerEpoch; + } + + private CommitOperationResult buildCommitOperationResult( + final WriterId writerId, final WriterProgress before, final WriterProgress after) { + if (Objects.isNull(writerId)) { + return CommitOperationResult.handledWithoutAdvance(); + } + final ProgressKey beforeKey = new ProgressKey(writerId, before); + final ProgressKey afterKey = new ProgressKey(writerId, after); + return afterKey.compareTo(beforeKey) > 0 + ? CommitOperationResult.handledWithAdvance(writerId, after) + : CommitOperationResult.handledWithoutAdvance(); + } + + private ProgressKey getDerivedCommittedFrontierKey() { + ProgressKey maxKey = null; + synchronized (this) { + for (final Map.Entry entry : + committedWriterPositions.entrySet()) { + final ProgressKey candidate = new ProgressKey(entry.getKey(), entry.getValue()); + if (Objects.isNull(maxKey) || candidate.compareTo(maxKey) > 0) { + maxKey = candidate; + } + } + } + return Objects.nonNull(maxKey) ? 
maxKey : new ProgressKey(0L, -1L, -1, 0L); + } + + private void syncPersistedProgress() { + progress.setCommittedRegionProgress( + new RegionProgress(new LinkedHashMap<>(committedWriterPositions))); + } + + public void serialize(final DataOutputStream stream) throws IOException { + synchronized (this) { + syncPersistedProgress(); + progress.serialize(stream); + } + } + + public static ConsensusSubscriptionCommitState deserialize( + final String regionId, final ByteBuffer buffer) { + final SubscriptionConsensusProgress progress = + SubscriptionConsensusProgress.deserialize(buffer); + return new ConsensusSubscriptionCommitState(regionId, progress); + } + } + + private static final class CommitOperationResult { + + private static final CommitOperationResult UNHANDLED = + new CommitOperationResult(false, null, null); + + private static final CommitOperationResult HANDLED_WITHOUT_ADVANCE = + new CommitOperationResult(true, null, null); + + private final boolean handled; + + private final WriterId advancedWriterId; + + private final WriterProgress advancedWriterProgress; + + private CommitOperationResult( + final boolean handled, + final WriterId advancedWriterId, + final WriterProgress advancedWriterProgress) { + this.handled = handled; + this.advancedWriterId = advancedWriterId; + this.advancedWriterProgress = advancedWriterProgress; + } + + private static CommitOperationResult unhandled() { + return UNHANDLED; + } + + private static CommitOperationResult handledWithoutAdvance() { + return HANDLED_WITHOUT_ADVANCE; + } + + private static CommitOperationResult handledWithAdvance( + final WriterId advancedWriterId, final WriterProgress advancedWriterProgress) { + return new CommitOperationResult(true, advancedWriterId, advancedWriterProgress); + } + + private boolean isHandled() { + return handled; + } + + private boolean hasAdvancedWriter() { + return Objects.nonNull(advancedWriterId) && Objects.nonNull(advancedWriterProgress); + } + + private WriterId 
getAdvancedWriterId() { + return advancedWriterId; + } + + private WriterProgress getAdvancedWriterProgress() { + return advancedWriterProgress; + } + } + + static final class ProgressSlot { + final int writerNodeId; + final long writerEpoch; + final long localSeq; + + private ProgressSlot(final int writerNodeId, final long writerEpoch, final long localSeq) { + this.writerNodeId = writerNodeId; + this.writerEpoch = writerEpoch; + this.localSeq = localSeq; + } + + static ProgressSlot of(final int writerNodeId, final long writerEpoch, final long localSeq) { + return new ProgressSlot(writerNodeId, writerEpoch, localSeq); + } + + static ProgressSlot from(final ProgressKey key) { + return new ProgressSlot(key.writerNodeId, key.writerEpoch, key.localSeq); + } + + @Override + public boolean equals(final Object o) { + if (this == o) { + return true; + } + if (!(o instanceof ProgressSlot)) { + return false; + } + final ProgressSlot that = (ProgressSlot) o; + return writerNodeId == that.writerNodeId + && writerEpoch == that.writerEpoch + && localSeq == that.localSeq; + } + + @Override + public int hashCode() { + return Objects.hash(writerNodeId, writerEpoch, localSeq); + } + + @Override + public String toString() { + return "(" + writerNodeId + "," + writerEpoch + "," + localSeq + ")"; + } + } + + // ======================== ProgressKey ======================== + + /** + * Comparable key for tracking commit progress: (physicalTime, localSeq). Physical time takes + * priority; within the same physical time, writer identity and local sequence determine order. + */ + static final class ProgressKey implements Comparable { + final long physicalTime; + final long localSeq; + final int writerNodeId; + final long writerEpoch; + + ProgressKey(final long physicalTime, final long localSeq) { + this(physicalTime, localSeq, -1, 0L); + } + + ProgressKey(final WriterId writerId, final WriterProgress writerProgress) { + this( + Objects.nonNull(writerProgress) ? 
writerProgress.getPhysicalTime() : 0L, + Objects.nonNull(writerProgress) ? writerProgress.getLocalSeq() : -1L, + Objects.nonNull(writerId) ? writerId.getNodeId() : -1, + Objects.nonNull(writerId) ? writerId.getWriterEpoch() : 0L); + } + + ProgressKey( + final long physicalTime, + final long localSeq, + final int writerNodeId, + final long writerEpoch) { + this.physicalTime = physicalTime; + this.localSeq = localSeq; + this.writerNodeId = writerNodeId; + this.writerEpoch = writerEpoch; + } + + ProgressKey resolveMissingFields(final WriterId writerId, final WriterProgress writerProgress) { + final long effectivePhysicalTime = + this.physicalTime > 0 + ? this.physicalTime + : Objects.nonNull(writerProgress) + ? writerProgress.getPhysicalTime() + : this.physicalTime; + final long effectiveLocalSeq = + this.localSeq >= 0 + ? this.localSeq + : Objects.nonNull(writerProgress) ? writerProgress.getLocalSeq() : this.localSeq; + final int effectiveWriterNodeId = + this.writerNodeId >= 0 + ? this.writerNodeId + : Objects.nonNull(writerId) ? writerId.getNodeId() : this.writerNodeId; + final long effectiveWriterEpoch = + this.writerEpoch > 0 + ? this.writerEpoch + : Objects.nonNull(writerId) ? writerId.getWriterEpoch() : this.writerEpoch; + if (effectivePhysicalTime == this.physicalTime + && effectiveLocalSeq == this.localSeq + && effectiveWriterNodeId == this.writerNodeId + && effectiveWriterEpoch == this.writerEpoch) { + return this; + } + return new ProgressKey( + effectivePhysicalTime, effectiveLocalSeq, effectiveWriterNodeId, effectiveWriterEpoch); + } + + WriterId toWriterId(final String regionId) { + return writerNodeId >= 0 ? 
new WriterId(regionId, writerNodeId, writerEpoch) : null; + } + + WriterProgress toWriterProgress() { + return new WriterProgress(physicalTime, localSeq); + } + + @Override + public int compareTo(final ProgressKey o) { + int cmp = Long.compare(physicalTime, o.physicalTime); + if (cmp != 0) { + return cmp; + } + cmp = Integer.compare(writerNodeId, o.writerNodeId); + if (cmp != 0) { + return cmp; + } + cmp = Long.compare(writerEpoch, o.writerEpoch); + if (cmp != 0) { + return cmp; + } + return Long.compare(localSeq, o.localSeq); + } + + @Override + public boolean equals(final Object o) { + if (this == o) { + return true; + } + if (!(o instanceof ProgressKey)) { + return false; + } + final ProgressKey that = (ProgressKey) o; + return physicalTime == that.physicalTime + && localSeq == that.localSeq + && writerNodeId == that.writerNodeId + && writerEpoch == that.writerEpoch; + } + + @Override + public int hashCode() { + return Objects.hash(physicalTime, localSeq, writerNodeId, writerEpoch); + } + + @Override + public String toString() { + return "(" + physicalTime + "," + writerNodeId + "," + writerEpoch + "," + localSeq + ")"; + } + } + + // ======================== Singleton ======================== + + private static class Holder { + private static final ConsensusSubscriptionCommitManager INSTANCE = + new ConsensusSubscriptionCommitManager(); + } + + public static ConsensusSubscriptionCommitManager getInstance() { + return Holder.INSTANCE; + } +} diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/consensus/ConsensusSubscriptionSetupHandler.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/consensus/ConsensusSubscriptionSetupHandler.java new file mode 100644 index 0000000000000..2adbb6d3e7b47 --- /dev/null +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/consensus/ConsensusSubscriptionSetupHandler.java @@ -0,0 +1,647 @@ +/* + * Licensed to the Apache Software Foundation 
(ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.iotdb.db.subscription.broker.consensus; + +import org.apache.iotdb.common.rpc.thrift.TConsensusGroupId; +import org.apache.iotdb.common.rpc.thrift.TDataNodeLocation; +import org.apache.iotdb.common.rpc.thrift.TRegionReplicaSet; +import org.apache.iotdb.commons.consensus.ConsensusGroupId; +import org.apache.iotdb.commons.consensus.DataRegionId; +import org.apache.iotdb.commons.pipe.datastructure.pattern.IoTDBTreePattern; +import org.apache.iotdb.commons.pipe.datastructure.pattern.PrefixTreePattern; +import org.apache.iotdb.commons.pipe.datastructure.pattern.TablePattern; +import org.apache.iotdb.commons.pipe.datastructure.pattern.TreePattern; +import org.apache.iotdb.consensus.IConsensus; +import org.apache.iotdb.consensus.iot.IoTConsensus; +import org.apache.iotdb.consensus.iot.IoTConsensusServerImpl; +import org.apache.iotdb.db.conf.IoTDBConfig; +import org.apache.iotdb.db.conf.IoTDBDescriptor; +import org.apache.iotdb.db.consensus.DataRegionConsensusImpl; +import org.apache.iotdb.db.storageengine.StorageEngine; +import org.apache.iotdb.db.storageengine.dataregion.DataRegion; +import org.apache.iotdb.db.subscription.agent.SubscriptionAgent; +import 
org.apache.iotdb.rpc.subscription.config.TopicConfig; +import org.apache.iotdb.rpc.subscription.config.TopicConstant; +import org.apache.iotdb.rpc.subscription.payload.poll.RegionProgress; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.Collections; +import java.util.LinkedHashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.concurrent.ConcurrentHashMap; + +/** + * Handles the setup and teardown of consensus-based subscription queues on DataNode. When a + * real-time subscription is detected, this handler finds the local IoTConsensus data regions, + * creates the appropriate converter, and binds prefetching queues to the subscription broker. + */ +public class ConsensusSubscriptionSetupHandler { + + private static final Logger LOGGER = + LoggerFactory.getLogger(ConsensusSubscriptionSetupHandler.class); + + private static final IoTDBConfig IOTDB_CONFIG = IoTDBDescriptor.getInstance().getConfig(); + + /** Last-known preferred writer node ID per region, used to detect routing changes. */ + private static final ConcurrentHashMap lastKnownPreferredWriter = + new ConcurrentHashMap<>(); + + /** + * Per-region routing runtime version. Uses the routing-broadcast timestamp from ConfigNode so all + * DataNodes derive the same ordering version for the same routing change without local + * persistence. + */ + private static final ConcurrentHashMap regionRuntimeVersion = + new ConcurrentHashMap<>(); + + /** Per-region active writer node IDs for subscription runtime control. 
*/ + private static final ConcurrentHashMap> + regionActiveWriterNodeIds = new ConcurrentHashMap<>(); + + static RegionProgress resolveFallbackCommittedRegionProgress( + final ConsensusSubscriptionCommitManager commitManager, + final String consumerGroupId, + final String topicName, + final ConsensusGroupId groupId) { + commitManager.getOrCreateState(consumerGroupId, topicName, groupId); + final RegionProgress committedRegionProgress = + commitManager.getCommittedRegionProgress(consumerGroupId, topicName, groupId); + return committedRegionProgress != null + && !committedRegionProgress.getWriterPositions().isEmpty() + ? committedRegionProgress + : null; + } + + private ConsensusSubscriptionSetupHandler() { + // utility class + } + + /** + * Ensures that the IoTConsensus new-peer and peer-removed callbacks are set, so that when a new + * DataRegion is created, all active consensus subscriptions are automatically bound to the new + * region, and when a DataRegion is removed, all subscription queues are properly cleaned up. + */ + public static void ensureNewRegionListenerRegistered() { + if (IoTConsensus.onNewPeerCreated == null) { + IoTConsensus.onNewPeerCreated = ConsensusSubscriptionSetupHandler::onNewRegionCreated; + LOGGER.info( + "Set IoTConsensus.onNewPeerCreated callback for consensus subscription auto-binding"); + } + if (IoTConsensus.onPeerRemoved == null) { + IoTConsensus.onPeerRemoved = ConsensusSubscriptionSetupHandler::onRegionRemoved; + LOGGER.info("Set IoTConsensus.onPeerRemoved callback for consensus subscription cleanup"); + } + } + + /** + * Callback invoked when a new DataRegion (IoTConsensusServerImpl) is created locally. Queries + * existing subscription metadata to find all active consensus subscriptions and binds prefetching + * queues to the new region. 
+ */ + private static void onNewRegionCreated( + final ConsensusGroupId groupId, final IoTConsensusServerImpl serverImpl) { + if (!(groupId instanceof DataRegionId)) { + return; + } + + // Query existing metadata keepers for all active subscriptions + final Map> allSubscriptions = + SubscriptionAgent.consumer().getAllSubscriptions(); + if (allSubscriptions.isEmpty()) { + return; + } + + final ConsensusSubscriptionCommitManager commitManager = + ConsensusSubscriptionCommitManager.getInstance(); + + LOGGER.info( + "New DataRegion {} created, checking {} consumer group(s) for auto-binding, " + + "currentSearchIndex={}", + groupId, + allSubscriptions.size(), + serverImpl.getSearchIndex()); + + for (final Map.Entry> groupEntry : allSubscriptions.entrySet()) { + final String consumerGroupId = groupEntry.getKey(); + for (final String topicName : groupEntry.getValue()) { + if (!isConsensusBasedTopic(topicName)) { + continue; + } + try { + final Map topicConfigs = + SubscriptionAgent.topic().getTopicConfigs(java.util.Collections.singleton(topicName)); + final TopicConfig topicConfig = topicConfigs.get(topicName); + if (topicConfig == null) { + continue; + } + + // Resolve the new DataRegion's actual database name + final DataRegion dataRegion = + StorageEngine.getInstance().getDataRegion((DataRegionId) groupId); + if (dataRegion == null) { + continue; + } + final String dbRaw = dataRegion.getDatabaseName(); + final String dbTableModel = dbRaw.startsWith("root.") ? dbRaw.substring(5) : dbRaw; + + // For table topics, skip if this region's database doesn't match the topic filter + if (topicConfig.isTableTopic()) { + final String topicDb = + topicConfig.getStringOrDefault( + TopicConstant.DATABASE_KEY, TopicConstant.DATABASE_DEFAULT_VALUE); + if (topicDb != null + && !topicDb.isEmpty() + && !TopicConstant.DATABASE_DEFAULT_VALUE.equals(topicDb) + && !topicDb.equalsIgnoreCase(dbTableModel)) { + continue; + } + } + + final String actualDbName = topicConfig.isTableTopic() ? 
dbTableModel : null; + final ConsensusLogToTabletConverter converter = buildConverter(topicConfig, actualDbName); + + // Recover from persisted per-writer region progress when available. The queue will + // resolve a replay start from that progress on first poll via the region-level locator. + final RegionProgress committedRegionProgress = + resolveFallbackCommittedRegionProgress( + commitManager, consumerGroupId, topicName, groupId); + final boolean hasLocalPersistedState = + commitManager.hasPersistedState(consumerGroupId, topicName, groupId); + final long tailStartSearchIndex = serverImpl.getSearchIndex() + 1; + final long initialRuntimeVersion = + regionRuntimeVersion.getOrDefault(groupId.convertToTConsensusGroupId(), 0L); + final boolean initialActive = + lastKnownPreferredWriter.getOrDefault(groupId.convertToTConsensusGroupId(), -1) + == IOTDB_CONFIG.getDataNodeId(); + final Set initialActiveWriterNodeIds = + regionActiveWriterNodeIds.getOrDefault( + groupId.convertToTConsensusGroupId(), + initialActive + ? 
Collections.singleton(IOTDB_CONFIG.getDataNodeId()) + : Collections.emptySet()); + final ConsensusRegionRuntimeState initialRuntimeState = + new ConsensusRegionRuntimeState( + initialRuntimeVersion, + lastKnownPreferredWriter.getOrDefault(groupId.convertToTConsensusGroupId(), -1), + initialActive, + initialActiveWriterNodeIds); + + LOGGER.info( + "Auto-binding consensus queue for topic [{}] in group [{}] to new region {} " + + "(database={}, tailStartSearchIndex={}, hasLocalPersistedState={}, " + + "committedRegionProgress={}, initialRuntimeVersion={}, initialActive={})", + topicName, + consumerGroupId, + groupId, + dbTableModel, + tailStartSearchIndex, + hasLocalPersistedState, + committedRegionProgress, + initialRuntimeVersion, + initialActive); + + SubscriptionAgent.broker() + .bindConsensusPrefetchingQueue( + consumerGroupId, + topicName, + topicConfig.getOrderMode(), + groupId, + serverImpl, + converter, + commitManager, + committedRegionProgress, + tailStartSearchIndex, + initialRuntimeVersion, + initialActive); + SubscriptionAgent.broker().applyRuntimeStateForRegion(groupId, initialRuntimeState); + } catch (final Exception e) { + LOGGER.error( + "Failed to auto-bind topic [{}] in group [{}] to new region {}", + topicName, + consumerGroupId, + groupId, + e); + } + } + } + } + + /** + * Callback invoked before a DataRegion (IoTConsensusServerImpl) is deleted locally. Unbinds and + * cleans up all subscription prefetching queues associated with the removed region across all + * consumer groups. 
+ */ + private static void onRegionRemoved(final ConsensusGroupId groupId) { + if (!(groupId instanceof DataRegionId)) { + return; + } + lastKnownPreferredWriter.remove(groupId.convertToTConsensusGroupId()); + regionRuntimeVersion.remove(groupId.convertToTConsensusGroupId()); + regionActiveWriterNodeIds.remove(groupId.convertToTConsensusGroupId()); + LOGGER.info( + "DataRegion {} being removed, unbinding all consensus subscription queues", groupId); + try { + SubscriptionAgent.broker().unbindByRegion(groupId); + } catch (final Exception e) { + LOGGER.error( + "Failed to unbind consensus subscription queues for removed region {}", groupId, e); + } + } + + public static boolean isConsensusBasedTopic(final String topicName) { + try { + final String topicMode = SubscriptionAgent.topic().getTopicMode(topicName); + final String topicFormat = SubscriptionAgent.topic().getTopicFormat(topicName); + final boolean result = + TopicConstant.MODE_LIVE_VALUE.equalsIgnoreCase(topicMode) + && !TopicConstant.FORMAT_TS_FILE_VALUE.equalsIgnoreCase(topicFormat); + LOGGER.debug( + "isConsensusBasedTopic check for topic [{}]: mode={}, format={}, result={}", + topicName, + topicMode, + topicFormat, + result); + return result; + } catch (final Exception e) { + LOGGER.warn( + "Failed to check if topic [{}] is consensus-based, defaulting to false", topicName, e); + return false; + } + } + + public static void setupConsensusSubscriptions( + final String consumerGroupId, final Set topicNames) { + final IConsensus dataRegionConsensus = DataRegionConsensusImpl.getInstance(); + if (!(dataRegionConsensus instanceof IoTConsensus)) { + LOGGER.warn( + "Data region consensus is not IoTConsensus (actual: {}), " + + "cannot set up consensus-based subscription for consumer group [{}]", + dataRegionConsensus.getClass().getSimpleName(), + consumerGroupId); + return; + } + + // Ensure the new-region listener is registered (idempotent) + ensureNewRegionListenerRegistered(); + + final IoTConsensus 
ioTConsensus = (IoTConsensus) dataRegionConsensus; + final ConsensusSubscriptionCommitManager commitManager = + ConsensusSubscriptionCommitManager.getInstance(); + + LOGGER.info( + "Setting up consensus subscriptions for consumer group [{}], topics={}, " + + "total consensus groups={}", + consumerGroupId, + topicNames, + ioTConsensus.getAllConsensusGroupIds().size()); + + for (final String topicName : topicNames) { + if (!isConsensusBasedTopic(topicName)) { + continue; + } + + try { + setupConsensusQueueForTopic(consumerGroupId, topicName, ioTConsensus, commitManager); + } catch (final Exception e) { + LOGGER.error( + "Failed to set up consensus subscription for topic [{}] in consumer group [{}]", + topicName, + consumerGroupId, + e); + } + } + } + + /** + * Set up consensus queue for a single topic. Discovers all local data region consensus groups and + * binds a ConsensusReqReader-based prefetching queue to every matching region. + * + *

For table-model topics, only regions whose database matches the topic's {@code DATABASE_KEY} + * filter are bound. For tree-model topics, all local data regions are bound. Additionally, the + * {@link #onNewRegionCreated} callback ensures that regions created after this method runs are + * also automatically bound. + */ + private static void setupConsensusQueueForTopic( + final String consumerGroupId, + final String topicName, + final IoTConsensus ioTConsensus, + final ConsensusSubscriptionCommitManager commitManager) { + final int myNodeId = IOTDB_CONFIG.getDataNodeId(); + + // Get topic config for building the converter + final Map topicConfigs = + SubscriptionAgent.topic().getTopicConfigs(java.util.Collections.singleton(topicName)); + final TopicConfig topicConfig = topicConfigs.get(topicName); + if (topicConfig == null) { + LOGGER.warn( + "Topic config not found for topic [{}], cannot set up consensus queue", topicName); + return; + } + + // Build the converter based on topic config (path pattern, time range, tree/table model) + LOGGER.info( + "Setting up consensus queue for topic [{}]: isTableTopic={}, orderMode={}, config={}", + topicName, + topicConfig.isTableTopic(), + topicConfig.getOrderMode(), + topicConfig.getAttribute()); + + // For table topics, extract the database filter from topic config + final String topicDatabaseFilter = + topicConfig.isTableTopic() + ? 
topicConfig.getStringOrDefault( + TopicConstant.DATABASE_KEY, TopicConstant.DATABASE_DEFAULT_VALUE) + : null; + + final List allGroupIds = ioTConsensus.getAllConsensusGroupIds(); + LOGGER.info( + "Discovered {} consensus group(s) for topic [{}] in consumer group [{}]: {}", + allGroupIds.size(), + topicName, + consumerGroupId, + allGroupIds); + boolean bound = false; + + for (final ConsensusGroupId groupId : allGroupIds) { + if (!(groupId instanceof DataRegionId)) { + continue; + } + + final IoTConsensusServerImpl serverImpl = ioTConsensus.getImpl(groupId); + if (serverImpl == null) { + continue; + } + + // Resolve the DataRegion's actual database name + final DataRegion dataRegion = + StorageEngine.getInstance().getDataRegion((DataRegionId) groupId); + if (dataRegion == null) { + continue; + } + final String dbRaw = dataRegion.getDatabaseName(); + final String dbTableModel = dbRaw.startsWith("root.") ? dbRaw.substring(5) : dbRaw; + + if (topicDatabaseFilter != null + && !topicDatabaseFilter.isEmpty() + && !TopicConstant.DATABASE_DEFAULT_VALUE.equals(topicDatabaseFilter) + && !topicDatabaseFilter.equalsIgnoreCase(dbTableModel)) { + LOGGER.info( + "Skipping region {} (database={}) for table topic [{}] (DATABASE_KEY={})", + groupId, + dbTableModel, + topicName, + topicDatabaseFilter); + continue; + } + + final String actualDbName = topicConfig.isTableTopic() ? dbTableModel : null; + final ConsensusLogToTabletConverter converter = buildConverter(topicConfig, actualDbName); + + // Recover from persisted per-writer region progress when available. The queue will resolve a + // replay start from that progress on first poll via the region-level locator. 
+ final RegionProgress committedRegionProgress = + resolveFallbackCommittedRegionProgress( + commitManager, consumerGroupId, topicName, groupId); + final boolean hasLocalPersistedState = + commitManager.hasPersistedState(consumerGroupId, topicName, groupId); + final long tailStartSearchIndex = serverImpl.getSearchIndex() + 1; + final long initialRuntimeVersion = + regionRuntimeVersion.getOrDefault(groupId.convertToTConsensusGroupId(), 0L); + final boolean initialActive = + lastKnownPreferredWriter.getOrDefault(groupId.convertToTConsensusGroupId(), -1) + == myNodeId; + final Set initialActiveWriterNodeIds = + regionActiveWriterNodeIds.getOrDefault( + groupId.convertToTConsensusGroupId(), + initialActive + ? Collections.singleton(IOTDB_CONFIG.getDataNodeId()) + : Collections.emptySet()); + final ConsensusRegionRuntimeState initialRuntimeState = + new ConsensusRegionRuntimeState( + initialRuntimeVersion, + lastKnownPreferredWriter.getOrDefault(groupId.convertToTConsensusGroupId(), -1), + initialActive, + initialActiveWriterNodeIds); + + LOGGER.info( + "Binding consensus prefetching queue for topic [{}] in consumer group [{}] " + + "to data region consensus group [{}] (database={}, tailStartSearchIndex={}, " + + "hasLocalPersistedState={}, committedRegionProgress={}, " + + "initialRuntimeVersion={}, initialActive={})", + topicName, + consumerGroupId, + groupId, + dbTableModel, + tailStartSearchIndex, + hasLocalPersistedState, + committedRegionProgress, + initialRuntimeVersion, + initialActive); + + SubscriptionAgent.broker() + .bindConsensusPrefetchingQueue( + consumerGroupId, + topicName, + topicConfig.getOrderMode(), + groupId, + serverImpl, + converter, + commitManager, + committedRegionProgress, + tailStartSearchIndex, + initialRuntimeVersion, + initialActive); + + SubscriptionAgent.broker().applyRuntimeStateForRegion(groupId, initialRuntimeState); + + bound = true; + } + + if (!bound) { + LOGGER.warn( + "No local IoTConsensus data region found for topic [{}] in 
consumer group [{}]. " + + "Consensus subscription will be set up when a matching data region becomes available.", + topicName, + consumerGroupId); + } + } + + private static ConsensusLogToTabletConverter buildConverter( + final TopicConfig topicConfig, final String actualDatabaseName) { + // Determine tree or table model + final boolean isTableTopic = topicConfig.isTableTopic(); + + TreePattern treePattern = null; + TablePattern tablePattern = null; + + if (isTableTopic) { + // Table model: database + table name pattern + final String database = + topicConfig.getStringOrDefault( + TopicConstant.DATABASE_KEY, TopicConstant.DATABASE_DEFAULT_VALUE); + final String table = + topicConfig.getStringOrDefault( + TopicConstant.TABLE_KEY, TopicConstant.TABLE_DEFAULT_VALUE); + tablePattern = new TablePattern(true, database, table); + } else { + // Tree model: path or pattern + if (topicConfig.getAttribute().containsKey(TopicConstant.PATTERN_KEY)) { + final String pattern = topicConfig.getAttribute().get(TopicConstant.PATTERN_KEY); + treePattern = new PrefixTreePattern(pattern); + } else { + final String path = + topicConfig.getStringOrDefault( + TopicConstant.PATH_KEY, TopicConstant.PATH_DEFAULT_VALUE); + treePattern = new IoTDBTreePattern(path); + } + } + + return new ConsensusLogToTabletConverter(treePattern, tablePattern, actualDatabaseName); + } + + public static void teardownConsensusSubscriptions( + final String consumerGroupId, final Set topicNames) { + for (final String topicName : topicNames) { + try { + SubscriptionAgent.broker().unbindConsensusPrefetchingQueue(consumerGroupId, topicName); + + // Clean up commit state for all regions of this topic + ConsensusSubscriptionCommitManager.getInstance() + .removeAllStatesForTopic(consumerGroupId, topicName); + + LOGGER.info( + "Tore down consensus subscription for topic [{}] in consumer group [{}]", + topicName, + consumerGroupId); + } catch (final Exception e) { + LOGGER.warn( + "Failed to tear down consensus 
subscription for topic [{}] in consumer group [{}]", + topicName, + consumerGroupId, + e); + } + } + } + + public static void handleNewSubscriptions( + final String consumerGroupId, final Set newTopicNames) { + if (newTopicNames == null || newTopicNames.isEmpty()) { + return; + } + + LOGGER.info( + "Checking new subscriptions in consumer group [{}] for consensus-based topics: {}", + consumerGroupId, + newTopicNames); + + setupConsensusSubscriptions(consumerGroupId, newTopicNames); + } + + public static void applyRuntimeState( + final TConsensusGroupId groupId, final ConsensusRegionRuntimeState runtimeState) { + final int newPreferredNodeId = runtimeState.getPreferredWriterNodeId(); + final Integer oldPreferredBoxed = lastKnownPreferredWriter.put(groupId, newPreferredNodeId); + final int oldPreferredNodeId = (oldPreferredBoxed != null) ? oldPreferredBoxed : -1; + final ConsensusGroupId regionId = ConsensusGroupId.Factory.createFromTConsensusGroupId(groupId); + final long oldRuntimeVersion = regionRuntimeVersion.getOrDefault(groupId, 0L); + if (runtimeState.getRuntimeVersion() < oldRuntimeVersion) { + LOGGER.info( + "ConsensusSubscriptionSetupHandler: ignore stale runtime state for region {}, incomingRuntimeVersion={}, currentRuntimeVersion={}, runtimeState={}", + regionId, + runtimeState.getRuntimeVersion(), + oldRuntimeVersion, + runtimeState); + return; + } + regionRuntimeVersion.put(groupId, runtimeState.getRuntimeVersion()); + regionActiveWriterNodeIds.put(groupId, runtimeState.getActiveWriterNodeIds()); + LOGGER.info( + "ConsensusSubscriptionSetupHandler: applying runtime state for region {}, preferred writer {} -> {}, runtimeVersion {} -> {}, runtimeState={}", + regionId, + oldPreferredNodeId, + newPreferredNodeId, + oldRuntimeVersion, + runtimeState.getRuntimeVersion(), + runtimeState); + SubscriptionAgent.broker().applyRuntimeStateForRegion(regionId, runtimeState); + } + + public static void onRegionRouteChanged( + final Map newMap, final long 
routingTimestamp) { + final int myNodeId = IOTDB_CONFIG.getDataNodeId(); + + for (final Map.Entry newEntry : newMap.entrySet()) { + final TConsensusGroupId groupId = newEntry.getKey(); + final TRegionReplicaSet newReplicaSet = newEntry.getValue(); + + final int newPreferredNodeId = getPreferredNodeId(newReplicaSet); + final Integer oldPreferredBoxed = lastKnownPreferredWriter.put(groupId, newPreferredNodeId); + final int oldPreferredNodeId = (oldPreferredBoxed != null) ? oldPreferredBoxed : -1; + + if (oldPreferredNodeId == newPreferredNodeId) { + continue; + } + + final ConsensusGroupId regionId = + ConsensusGroupId.Factory.createFromTConsensusGroupId(groupId); + final long oldRuntimeVersion = regionRuntimeVersion.getOrDefault(groupId, 0L); + final long newRuntimeVersion = Math.max(routingTimestamp, oldRuntimeVersion); + regionRuntimeVersion.put(groupId, newRuntimeVersion); + + final LinkedHashSet activeWriterNodeIds = + new LinkedHashSet<>( + regionActiveWriterNodeIds.getOrDefault(groupId, Collections.emptySet())); + activeWriterNodeIds.add(newPreferredNodeId); + final Set runtimeActiveWriterNodeIds = + Collections.unmodifiableSet(activeWriterNodeIds); + regionActiveWriterNodeIds.put(groupId, runtimeActiveWriterNodeIds); + + final ConsensusRegionRuntimeState runtimeState = + new ConsensusRegionRuntimeState( + newRuntimeVersion, + newPreferredNodeId, + newPreferredNodeId == myNodeId, + runtimeActiveWriterNodeIds); + + LOGGER.info( + "ConsensusSubscriptionSetupHandler: region {} preferred writer changed {} -> {}, runtimeVersion {} -> {}, runtimeState={} (route hint)", + regionId, + oldPreferredNodeId, + newPreferredNodeId, + oldRuntimeVersion, + newRuntimeVersion, + runtimeState); + + SubscriptionAgent.broker().applyRuntimeStateForRegion(regionId, runtimeState); + } + } + + private static int getPreferredNodeId(final TRegionReplicaSet replicaSet) { + final List locations = replicaSet.getDataNodeLocations(); + if (locations == null || locations.isEmpty()) { + return 
-1; + } + return locations.get(0).getDataNodeId(); + } +} diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/consensus/PrefetchRoundResult.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/consensus/PrefetchRoundResult.java new file mode 100644 index 0000000000000..71066b4875e06 --- /dev/null +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/consensus/PrefetchRoundResult.java @@ -0,0 +1,62 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.iotdb.db.subscription.broker.consensus; + +public final class PrefetchRoundResult { + + public enum Type { + RESCHEDULE_NOW, + RESCHEDULE_LATER, + DORMANT + } + + private static final PrefetchRoundResult RESCHEDULE_NOW = + new PrefetchRoundResult(Type.RESCHEDULE_NOW, 0L); + + private static final PrefetchRoundResult DORMANT = new PrefetchRoundResult(Type.DORMANT, 0L); + + private final Type type; + private final long delayMs; + + private PrefetchRoundResult(final Type type, final long delayMs) { + this.type = type; + this.delayMs = delayMs; + } + + public static PrefetchRoundResult rescheduleNow() { + return RESCHEDULE_NOW; + } + + public static PrefetchRoundResult rescheduleAfter(final long delayMs) { + return new PrefetchRoundResult(Type.RESCHEDULE_LATER, Math.max(1L, delayMs)); + } + + public static PrefetchRoundResult dormant() { + return DORMANT; + } + + public Type getType() { + return type; + } + + public long getDelayMs() { + return delayMs; + } +} diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/consensus/ProgressWALIterator.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/consensus/ProgressWALIterator.java new file mode 100644 index 0000000000000..c6a83f52df15b --- /dev/null +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/consensus/ProgressWALIterator.java @@ -0,0 +1,522 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.iotdb.db.subscription.broker.consensus; + +import org.apache.iotdb.consensus.common.request.IConsensusRequest; +import org.apache.iotdb.consensus.common.request.IndexedConsensusRequest; +import org.apache.iotdb.consensus.common.request.IoTConsensusRequest; +import org.apache.iotdb.db.queryengine.plan.planner.plan.node.PlanNodeType; +import org.apache.iotdb.db.storageengine.dataregion.wal.buffer.WALEntryType; +import org.apache.iotdb.db.storageengine.dataregion.wal.buffer.WALInfoEntry; +import org.apache.iotdb.db.storageengine.dataregion.wal.io.ProgressWALReader; +import org.apache.iotdb.db.storageengine.dataregion.wal.io.WALFileVersion; +import org.apache.iotdb.db.storageengine.dataregion.wal.io.WALMetaData; +import org.apache.iotdb.db.storageengine.dataregion.wal.node.WALNode; +import org.apache.iotdb.db.storageengine.dataregion.wal.utils.WALFileUtils; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.Closeable; +import java.io.EOFException; +import java.io.File; +import java.io.IOException; +import java.nio.ByteBuffer; +import java.util.ArrayList; +import java.util.HashSet; +import java.util.List; +import java.util.NoSuchElementException; +import java.util.Set; + +/** + * Writer-based WAL iterator for the new subscription progress model. + * + *

This iterator reads writer-local ordering metadata from WAL footer arrays instead of relying + * on the entry body to carry complete subscription ordering information. + */ +public class ProgressWALIterator implements Closeable { + + private static final Logger LOGGER = LoggerFactory.getLogger(ProgressWALIterator.class); + + private static final int SEARCH_INDEX_OFFSET = + WALInfoEntry.FIXED_SERIALIZED_SIZE + PlanNodeType.BYTES; + private static final long HEADER_ONLY_WAL_FILE_BYTES = + Math.max( + WALFileVersion.V2.getVersionBytes().length, WALFileVersion.V3.getVersionBytes().length); + + private final File logDirectory; + private final long startSearchIndex; + private final WALNode liveWalNode; + private File[] walFiles; + private int currentFileIndex = -1; + private ProgressWALReader currentReader; + private long currentReaderVersionId = -1L; + private boolean currentReaderUsesLiveSnapshot = false; + private int consumedEntryCountInCurrentFile = 0; + private final Set skippedBrokenWalVersionIds = new HashSet<>(); + private IOException lastError; + private boolean incompleteScan = false; + private String incompleteScanDetail; + + private long pendingSearchIndex = Long.MIN_VALUE; + private long pendingLocalSeq = Long.MIN_VALUE; + private long pendingPhysicalTime; + private int pendingNodeId; + private long pendingWriterEpoch; + private final List pendingRequests = new ArrayList<>(); + + private IndexedConsensusRequest nextReady; + + public ProgressWALIterator(final File logDirectory) { + this(logDirectory, Long.MIN_VALUE); + } + + public ProgressWALIterator(final File logDirectory, final long startSearchIndex) { + this(logDirectory, startSearchIndex, null); + } + + public ProgressWALIterator(final WALNode liveWalNode) { + this(liveWalNode, Long.MIN_VALUE); + } + + public ProgressWALIterator(final WALNode liveWalNode, final long startSearchIndex) { + this(liveWalNode.getLogDirectory(), startSearchIndex, liveWalNode); + } + + private ProgressWALIterator( + final 
File logDirectory, final long startSearchIndex, final WALNode liveWalNode) { + this.logDirectory = logDirectory; + this.startSearchIndex = startSearchIndex; + this.liveWalNode = liveWalNode; + refreshFileList(); + } + + private void refreshFileList() { + final File[] discoveredWalFiles = WALFileUtils.listAllWALFiles(logDirectory); + if (discoveredWalFiles == null) { + walFiles = new File[0]; + return; + } + WALFileUtils.ascSortByVersionId(discoveredWalFiles); + final List filteredWalFiles = new ArrayList<>(discoveredWalFiles.length); + for (int i = 0; i < discoveredWalFiles.length; i++) { + final File walFile = discoveredWalFiles[i]; + final boolean isLastWalFile = i == discoveredWalFiles.length - 1; + if (!isLastWalFile && shouldSkipWalFile(walFile)) { + continue; + } + filteredWalFiles.add(walFile); + } + walFiles = filteredWalFiles.toArray(new File[0]); + } + + private boolean shouldSkipWalFile(final File walFile) { + final long versionId = WALFileUtils.parseVersionId(walFile.getName()); + return skippedBrokenWalVersionIds.contains(versionId) || isHeaderOnlyWalFile(walFile); + } + + private boolean isHeaderOnlyWalFile(final File walFile) { + return walFile.length() <= HEADER_ONLY_WAL_FILE_BYTES; + } + + public void refresh() { + final long currentVersionId = + (currentFileIndex >= 0 && currentFileIndex < walFiles.length) + ? 
WALFileUtils.parseVersionId(walFiles[currentFileIndex].getName()) + : -1; + + refreshFileList(); + + if (currentVersionId >= 0) { + currentFileIndex = -1; + for (int i = 0; i < walFiles.length; i++) { + if (WALFileUtils.parseVersionId(walFiles[i].getName()) >= currentVersionId) { + currentFileIndex = i; + break; + } + } + if (currentFileIndex < 0) { + currentFileIndex = walFiles.length; + } + } + } + + public boolean hasNext() { + if (nextReady != null) { + return true; + } + try { + nextReady = advance(); + if (nextReady != null) { + lastError = null; + } + } catch (IOException e) { + lastError = e; + LOGGER.warn("ProgressWALIterator: error reading WAL", e); + return false; + } + return nextReady != null; + } + + public IndexedConsensusRequest next() { + if (!hasNext()) { + throw new NoSuchElementException(); + } + final IndexedConsensusRequest result = nextReady; + nextReady = null; + return result; + } + + public boolean hasReadError() { + return lastError != null; + } + + public IOException getLastError() { + return lastError; + } + + public boolean hasSkippedBrokenWalFiles() { + return !skippedBrokenWalVersionIds.isEmpty(); + } + + public boolean hasIncompleteScan() { + return incompleteScan || hasReadError() || hasSkippedBrokenWalFiles(); + } + + public String getIncompleteScanDetail() { + if (incompleteScanDetail != null) { + return incompleteScanDetail; + } + if (lastError != null) { + return lastError.getMessage(); + } + if (!skippedBrokenWalVersionIds.isEmpty()) { + return "encountered broken retained WAL files during replay scan"; + } + return "replay scan did not complete"; + } + + @Override + public void close() throws IOException { + closeCurrentReader(); + nextReady = null; + pendingRequests.clear(); + pendingSearchIndex = Long.MIN_VALUE; + pendingLocalSeq = Long.MIN_VALUE; + lastError = null; + incompleteScan = false; + incompleteScanDetail = null; + resetCurrentFileTracking(); + } + + private IndexedConsensusRequest advance() throws IOException { + 
while (true) { + if (currentReader != null && currentReader.hasNext()) { + try { + final ByteBuffer buffer = currentReader.next(); + consumedEntryCountInCurrentFile = currentReader.getCurrentEntryIndex() + 1; + final WALEntryType type = WALEntryType.valueOf(buffer.get()); + buffer.clear(); + if (!type.needSearch()) { + continue; + } + + final long localSeq = currentReader.getCurrentEntryLocalSeq(); + final long physicalTime = currentReader.getCurrentEntryPhysicalTime(); + final int nodeId = currentReader.getCurrentEntryNodeId(); + final long writerEpoch = currentReader.getCurrentEntryWriterEpoch(); + + buffer.position(SEARCH_INDEX_OFFSET); + final long bodySearchIndex = buffer.getLong(); + buffer.clear(); + + if (isSamePendingRequest(localSeq, nodeId, writerEpoch)) { + if (pendingSearchIndex < 0 && bodySearchIndex >= 0) { + pendingSearchIndex = bodySearchIndex; + } + pendingRequests.add(new IoTConsensusRequest(buffer)); + continue; + } + + final IndexedConsensusRequest flushed = flushPending(); + startPending(bodySearchIndex, localSeq, physicalTime, nodeId, writerEpoch, buffer); + if (flushed != null && !shouldSkip(flushed)) { + return flushed; + } + continue; + } catch (final EOFException eofException) { + if (!currentReaderUsesLiveSnapshot) { + throw eofException; + } + // Live snapshot metadata may get ahead of the bytes currently visible in the file. Treat + // EOF as "this snapshot is exhausted for now" instead of terminating the iterator. 
+ final IndexedConsensusRequest flushed = flushPending(); + if (flushed != null && !shouldSkip(flushed)) { + closeCurrentReader(); + return flushed; + } + if (reopenLiveSnapshotReader()) { + continue; + } + return null; + } + } + + if (currentReaderUsesLiveSnapshot) { + final IndexedConsensusRequest flushed = flushPending(); + if (flushed != null && !shouldSkip(flushed)) { + return flushed; + } + if (reopenLiveSnapshotReader()) { + continue; + } + return null; + } + + if (currentReader != null) { + closeCurrentReader(); + final IndexedConsensusRequest flushed = flushPending(); + resetCurrentFileTracking(); + if (flushed != null && !shouldSkip(flushed)) { + return flushed; + } + continue; + } + + if (!openNextReader()) { + final IndexedConsensusRequest flushed = flushPending(); + if (flushed != null && !shouldSkip(flushed)) { + return flushed; + } + return null; + } + } + } + + private boolean openNextReader() throws IOException { + while (++currentFileIndex < walFiles.length) { + if (openReaderAtIndex(currentFileIndex, 0)) { + return true; + } + } + return false; + } + + private boolean reopenLiveSnapshotReader() throws IOException { + if (liveWalNode == null || currentReaderVersionId < 0) { + return false; + } + + closeCurrentReader(); + refresh(); + + final long currentLiveVersionId = liveWalNode.getCurrentWALFileVersion(); + if (currentLiveVersionId == currentReaderVersionId) { + final WALMetaData snapshot = liveWalNode.getCurrentWALMetaDataSnapshot(); + if (snapshot.getBuffersSize().size() <= consumedEntryCountInCurrentFile) { + return false; + } + final int fileIndex = findFileIndexByVersion(currentReaderVersionId); + if (fileIndex < 0) { + return false; + } + return openReaderAtIndex(fileIndex, consumedEntryCountInCurrentFile); + } + + final int previousFileIndex = findFileIndexByVersion(currentReaderVersionId); + if (previousFileIndex < 0) { + return openFirstReaderAfterVersion(currentReaderVersionId); + } + if (openReaderAtIndex(previousFileIndex, 
consumedEntryCountInCurrentFile)) { + return true; + } + return openFirstReaderAfterVersion(currentReaderVersionId); + } + + private boolean openReaderAtIndex(final int fileIndex, final int skipEntries) throws IOException { + return openReaderAtIndex(fileIndex, skipEntries, true); + } + + private boolean openReaderAtIndex( + final int fileIndex, final int skipEntries, final boolean allowNearLiveRetry) + throws IOException { + final File walFile = walFiles[fileIndex]; + final long versionId = WALFileUtils.parseVersionId(walFile.getName()); + final boolean useLiveSnapshot = + liveWalNode != null && versionId == liveWalNode.getCurrentWALFileVersion(); + + try { + final ProgressWALReader reader = + useLiveSnapshot + ? new ProgressWALReader(walFile, liveWalNode.getCurrentWALMetaDataSnapshot()) + : new ProgressWALReader(walFile); + if (!skipEntries(reader, skipEntries)) { + reader.close(); + markIncompleteScan( + String.format( + "failed to reopen WAL file %s at entry offset %s: iterator could not skip to the requested position", + walFile.getName(), skipEntries), + null); + resetCurrentFileTracking(); + return false; + } + currentReader = reader; + currentFileIndex = fileIndex; + currentReaderVersionId = versionId; + currentReaderUsesLiveSnapshot = useLiveSnapshot; + consumedEntryCountInCurrentFile = skipEntries; + return true; + } catch (final IOException e) { + if (isNearLiveWalVersion(versionId)) { + LOGGER.debug( + "ProgressWALIterator: failed to open near-live WAL file {}, retrying without blacklisting", + walFile.getName(), + e); + if (allowNearLiveRetry) { + refresh(); + final int refreshedIndex = findFileIndexByVersion(versionId); + if (refreshedIndex >= 0) { + if (openReaderAtIndex(refreshedIndex, skipEntries, false)) { + return true; + } + } + } + markIncompleteScan( + String.format( + "failed to open near-live WAL file %s while replay scan was still in progress", + walFile.getName()), + e); + return false; + } + skippedBrokenWalVersionIds.add(versionId); + 
LOGGER.warn( + "ProgressWALIterator: failed to open WAL file {}, skipping", walFile.getName(), e); + return false; + } + } + + private boolean skipEntries(final ProgressWALReader reader, final int skipEntries) + throws IOException { + int skipped = 0; + while (skipped < skipEntries) { + if (!reader.hasNext()) { + return false; + } + reader.next(); + skipped++; + } + return true; + } + + private int findFileIndexByVersion(final long versionId) { + for (int i = 0; i < walFiles.length; i++) { + if (WALFileUtils.parseVersionId(walFiles[i].getName()) == versionId) { + return i; + } + } + return -1; + } + + private boolean openFirstReaderAfterVersion(final long versionId) throws IOException { + for (int i = 0; i < walFiles.length; i++) { + if (WALFileUtils.parseVersionId(walFiles[i].getName()) > versionId + && openReaderAtIndex(i, 0)) { + return true; + } + } + resetCurrentFileTracking(); + return false; + } + + private boolean isNearLiveWalVersion(final long versionId) { + if (liveWalNode == null) { + return false; + } + return versionId >= Math.max(0L, liveWalNode.getCurrentWALFileVersion() - 1L); + } + + private boolean isSamePendingRequest( + final long localSeq, final int nodeId, final long writerEpoch) { + return !pendingRequests.isEmpty() + && pendingLocalSeq == localSeq + && pendingNodeId == nodeId + && pendingWriterEpoch == writerEpoch; + } + + private void startPending( + final long searchIndex, + final long localSeq, + final long physicalTime, + final int nodeId, + final long writerEpoch, + final ByteBuffer buffer) { + pendingSearchIndex = searchIndex; + pendingLocalSeq = localSeq; + pendingPhysicalTime = physicalTime; + pendingNodeId = nodeId; + pendingWriterEpoch = writerEpoch; + pendingRequests.clear(); + pendingRequests.add(new IoTConsensusRequest(buffer)); + } + + private IndexedConsensusRequest flushPending() { + if (pendingRequests.isEmpty()) { + return null; + } + final IndexedConsensusRequest result = + new IndexedConsensusRequest( + 
pendingSearchIndex, pendingLocalSeq, new ArrayList<>(pendingRequests)); + result + .setPhysicalTime(pendingPhysicalTime) + .setNodeId(pendingNodeId) + .setWriterEpoch(pendingWriterEpoch); + pendingRequests.clear(); + pendingSearchIndex = Long.MIN_VALUE; + pendingLocalSeq = Long.MIN_VALUE; + return result; + } + + private boolean shouldSkip(final IndexedConsensusRequest request) { + return request.getSearchIndex() >= 0 && request.getSearchIndex() < startSearchIndex; + } + + private void closeCurrentReader() throws IOException { + if (currentReader != null) { + currentReader.close(); + currentReader = null; + } + } + + private void resetCurrentFileTracking() { + currentReaderVersionId = -1L; + currentReaderUsesLiveSnapshot = false; + consumedEntryCountInCurrentFile = 0; + } + + private void markIncompleteScan(final String detail, final IOException cause) { + incompleteScan = true; + if (incompleteScanDetail == null) { + incompleteScanDetail = detail; + } + if (lastError == null && cause != null) { + lastError = cause; + } + } +} diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/consensus/SubscriptionConsensusProgress.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/consensus/SubscriptionConsensusProgress.java new file mode 100644 index 0000000000000..c39f17b86b1db --- /dev/null +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/broker/consensus/SubscriptionConsensusProgress.java @@ -0,0 +1,193 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.iotdb.db.subscription.broker.consensus; + +import org.apache.iotdb.rpc.subscription.payload.poll.RegionProgress; +import org.apache.iotdb.rpc.subscription.payload.poll.WriterId; +import org.apache.iotdb.rpc.subscription.payload.poll.WriterProgress; + +import org.apache.tsfile.utils.ReadWriteIOUtils; + +import java.io.DataOutputStream; +import java.io.IOException; +import java.nio.ByteBuffer; +import java.util.Collections; +import java.util.LinkedHashMap; +import java.util.Map; +import java.util.Objects; +import java.util.concurrent.atomic.AtomicLong; + +/** + * Persisted commit metadata for a single (consumerGroup, topic, region) combination. + * + *

This object stores the committed per-writer region frontier plus the persistence throttling + * counter. + */ +public class SubscriptionConsensusProgress { + + private volatile RegionProgress committedRegionProgress; + + private final AtomicLong commitIndex; + + public SubscriptionConsensusProgress() { + this(new RegionProgress(Collections.emptyMap()), 0L); + } + + public SubscriptionConsensusProgress( + final RegionProgress committedRegionProgress, final long commitIndex) { + this.committedRegionProgress = normalize(committedRegionProgress); + this.commitIndex = new AtomicLong(commitIndex); + } + + public RegionProgress getCommittedRegionProgress() { + return committedRegionProgress; + } + + public void setCommittedRegionProgress(final RegionProgress committedRegionProgress) { + this.committedRegionProgress = normalize(committedRegionProgress); + } + + public WriterId getCommittedWriterId() { + return getDerivedCommittedWriterState().writerId; + } + + public WriterProgress getCommittedWriterProgress() { + return getDerivedCommittedWriterState().writerProgress; + } + + public long getCommitIndex() { + return commitIndex.get(); + } + + public void incrementCommitIndex() { + commitIndex.incrementAndGet(); + } + + public void serialize(final DataOutputStream stream) throws IOException { + committedRegionProgress.serialize(stream); + ReadWriteIOUtils.write(commitIndex.get(), stream); + } + + public static SubscriptionConsensusProgress deserialize(final ByteBuffer buffer) { + final RegionProgress committedRegionProgress = RegionProgress.deserialize(buffer); + final long commitIndex = ReadWriteIOUtils.readLong(buffer); + return new SubscriptionConsensusProgress(committedRegionProgress, commitIndex); + } + + @Override + public boolean equals(final Object o) { + if (this == o) { + return true; + } + if (o == null || getClass() != o.getClass()) { + return false; + } + final SubscriptionConsensusProgress that = (SubscriptionConsensusProgress) o; + return commitIndex.get() == 
that.commitIndex.get() + && Objects.equals(committedRegionProgress, that.committedRegionProgress); + } + + @Override + public int hashCode() { + return Objects.hash(committedRegionProgress, commitIndex.get()); + } + + @Override + public String toString() { + return "SubscriptionConsensusProgress{" + + "committedRegionProgress=" + + committedRegionProgress + + ", commitIndex=" + + commitIndex.get() + + '}'; + } + + private static RegionProgress normalize(final RegionProgress committedRegionProgress) { + if (Objects.isNull(committedRegionProgress) + || committedRegionProgress.getWriterPositions().isEmpty()) { + return new RegionProgress(Collections.emptyMap()); + } + final Map normalized = new LinkedHashMap<>(); + for (final Map.Entry entry : + committedRegionProgress.getWriterPositions().entrySet()) { + if (Objects.nonNull(entry.getKey()) && Objects.nonNull(entry.getValue())) { + normalized.put(entry.getKey(), entry.getValue()); + } + } + return new RegionProgress(normalized); + } + + private DerivedCommittedWriterState getDerivedCommittedWriterState() { + WriterId bestWriterId = null; + WriterProgress bestWriterProgress = null; + for (final Map.Entry entry : + committedRegionProgress.getWriterPositions().entrySet()) { + if (Objects.isNull(bestWriterProgress) + || compareWriterProgress(entry.getValue(), bestWriterProgress) > 0 + || (compareWriterProgress(entry.getValue(), bestWriterProgress) == 0 + && compareWriterId(entry.getKey(), bestWriterId) > 0)) { + bestWriterId = entry.getKey(); + bestWriterProgress = entry.getValue(); + } + } + return new DerivedCommittedWriterState( + bestWriterId, + Objects.nonNull(bestWriterProgress) ? 
bestWriterProgress : new WriterProgress(0L, -1L)); + } + + private static int compareWriterProgress( + final WriterProgress leftProgress, final WriterProgress rightProgress) { + int cmp = Long.compare(leftProgress.getPhysicalTime(), rightProgress.getPhysicalTime()); + if (cmp != 0) { + return cmp; + } + return Long.compare(leftProgress.getLocalSeq(), rightProgress.getLocalSeq()); + } + + private static int compareWriterId(final WriterId leftWriterId, final WriterId rightWriterId) { + if (Objects.isNull(leftWriterId) && Objects.isNull(rightWriterId)) { + return 0; + } + if (Objects.isNull(leftWriterId)) { + return -1; + } + if (Objects.isNull(rightWriterId)) { + return 1; + } + int cmp = Integer.compare(leftWriterId.getNodeId(), rightWriterId.getNodeId()); + if (cmp != 0) { + return cmp; + } + return Long.compare(leftWriterId.getWriterEpoch(), rightWriterId.getWriterEpoch()); + } + + private static final class DerivedCommittedWriterState { + + private final WriterId writerId; + + private final WriterProgress writerProgress; + + private DerivedCommittedWriterState( + final WriterId writerId, final WriterProgress writerProgress) { + this.writerId = writerId; + this.writerProgress = writerProgress; + } + } +} diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/event/SubscriptionEvent.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/event/SubscriptionEvent.java index dfadee5908fa5..eba81238316ed 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/event/SubscriptionEvent.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/event/SubscriptionEvent.java @@ -47,7 +47,6 @@ import java.util.concurrent.atomic.AtomicLong; import static com.google.common.base.MoreObjects.toStringHelper; -import static org.apache.iotdb.rpc.subscription.payload.poll.SubscriptionCommitContext.INVALID_COMMIT_ID; public class SubscriptionEvent implements Comparable { @@ -71,6 +70,9 @@ public 
class SubscriptionEvent implements Comparable { private volatile SubscriptionCommitContext rootCommitContext; private static final long NACK_COUNT_REPORT_THRESHOLD = 3; + + private static final long POISON_MESSAGE_NACK_THRESHOLD = 10; + private final AtomicLong nackCount = new AtomicLong(); /** @@ -159,16 +161,15 @@ public void recordCommittedTimestamp() { } public boolean isCommitted() { - if (commitContext.getCommitId() == INVALID_COMMIT_ID) { - // event with invalid commit id is committed + if (!commitContext.isCommittable()) { + // fire-and-forget events are treated as already committed return true; } return committedTimestamp.get() != INVALID_TIMESTAMP; } public boolean isCommittable() { - if (commitContext.getCommitId() == INVALID_COMMIT_ID) { - // event with invalid commit id is uncommittable + if (!commitContext.isCommittable()) { return false; } return response.isCommittable(); @@ -248,6 +249,15 @@ public void nack() { } } + /** Returns the current nack count for this event. */ + public long getNackCount() { + return nackCount.get(); + } + + public boolean isPoisoned() { + return nackCount.get() >= POISON_MESSAGE_NACK_THRESHOLD; + } + public void recordLastPolledConsumerId(final String consumerId) { lastPolledConsumerId = consumerId; } diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/metric/ConsensusSubscriptionPrefetchingQueueMetrics.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/metric/ConsensusSubscriptionPrefetchingQueueMetrics.java new file mode 100644 index 0000000000000..ecf79360237b7 --- /dev/null +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/metric/ConsensusSubscriptionPrefetchingQueueMetrics.java @@ -0,0 +1,246 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.iotdb.db.subscription.metric; + +import org.apache.iotdb.commons.service.metric.enums.Metric; +import org.apache.iotdb.commons.service.metric.enums.Tag; +import org.apache.iotdb.db.subscription.broker.consensus.ConsensusPrefetchingQueue; +import org.apache.iotdb.metrics.AbstractMetricService; +import org.apache.iotdb.metrics.metricsets.IMetricSet; +import org.apache.iotdb.metrics.type.Rate; +import org.apache.iotdb.metrics.utils.MetricLevel; +import org.apache.iotdb.metrics.utils.MetricType; + +import com.google.common.collect.ImmutableSet; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.Map; +import java.util.Objects; +import java.util.concurrent.ConcurrentHashMap; + +public class ConsensusSubscriptionPrefetchingQueueMetrics implements IMetricSet { + + private static final Logger LOGGER = + LoggerFactory.getLogger(ConsensusSubscriptionPrefetchingQueueMetrics.class); + + private volatile AbstractMetricService metricService; + + private final Map queueMap = new ConcurrentHashMap<>(); + + private final Map rateMap = new ConcurrentHashMap<>(); + + @Override + public void bindTo(final AbstractMetricService metricService) { + this.metricService = metricService; + final ImmutableSet ids = ImmutableSet.copyOf(queueMap.keySet()); + for (final String id : ids) { + createMetrics(id); + } + } + + @Override + public void unbindFrom(final 
AbstractMetricService metricService) { + final ImmutableSet ids = ImmutableSet.copyOf(queueMap.keySet()); + for (final String id : ids) { + deregister(id); + } + if (!queueMap.isEmpty()) { + LOGGER.warn( + "Failed to unbind from consensus subscription prefetching queue metrics, queue map not empty"); + } + } + + //////////////////////////// register & deregister //////////////////////////// + + public void register(final ConsensusPrefetchingQueue queue) { + final String id = queue.getPrefetchingQueueId(); + queueMap.putIfAbsent(id, queue); + if (Objects.nonNull(metricService)) { + createMetrics(id); + } + } + + private void createMetrics(final String id) { + createAutoGauge(id); + createRate(id); + } + + private void createAutoGauge(final String id) { + final ConsensusPrefetchingQueue queue = queueMap.get(id); + if (Objects.isNull(queue)) { + return; + } + metricService.createAutoGauge( + Metric.SUBSCRIPTION_UNCOMMITTED_EVENT_COUNT.toString(), + MetricLevel.IMPORTANT, + queue, + ConsensusPrefetchingQueue::getSubscriptionUncommittedEventCount, + Tag.NAME.toString(), + queue.getPrefetchingQueueId()); + // Keep the legacy metric name for dashboard compatibility, but expose seek generation here. 
+ metricService.createAutoGauge( + Metric.SUBSCRIPTION_CURRENT_COMMIT_ID.toString(), + MetricLevel.IMPORTANT, + queue, + ConsensusPrefetchingQueue::getCurrentSeekGeneration, + Tag.NAME.toString(), + queue.getPrefetchingQueueId()); + metricService.createAutoGauge( + Metric.SUBSCRIPTION_CONSENSUS_LAG.toString(), + MetricLevel.IMPORTANT, + queue, + ConsensusPrefetchingQueue::getLag, + Tag.NAME.toString(), + queue.getPrefetchingQueueId()); + metricService.createAutoGauge( + Metric.SUBSCRIPTION_CONSENSUS_WAL_GAP.toString(), + MetricLevel.IMPORTANT, + queue, + ConsensusPrefetchingQueue::getWalGapSkippedEntries, + Tag.NAME.toString(), + queue.getPrefetchingQueueId()); + metricService.createAutoGauge( + Metric.SUBSCRIPTION_CONSENSUS_ROUTING_EPOCH_CHANGE.toString(), + MetricLevel.IMPORTANT, + queue, + ConsensusPrefetchingQueue::getEpochChangeCount, + Tag.NAME.toString(), + queue.getPrefetchingQueueId()); + metricService.createAutoGauge( + Metric.SUBSCRIPTION_CONSENSUS_WATERMARK.toString(), + MetricLevel.IMPORTANT, + queue, + ConsensusPrefetchingQueue::getMaxObservedTimestamp, + Tag.NAME.toString(), + queue.getPrefetchingQueueId()); + } + + private void createRate(final String id) { + final ConsensusPrefetchingQueue queue = queueMap.get(id); + if (Objects.isNull(queue)) { + return; + } + rateMap.put( + id, + metricService.getOrCreateRate( + Metric.SUBSCRIPTION_EVENT_TRANSFER.toString(), + MetricLevel.IMPORTANT, + Tag.NAME.toString(), + queue.getPrefetchingQueueId())); + } + + public void deregister(final String id) { + if (!queueMap.containsKey(id)) { + LOGGER.warn( + "Failed to deregister consensus subscription prefetching queue metrics, " + + "ConsensusPrefetchingQueue({}) does not exist", + id); + return; + } + if (Objects.nonNull(metricService)) { + removeMetrics(id); + } + queueMap.remove(id); + } + + private void removeMetrics(final String id) { + removeAutoGauge(id); + removeRate(id); + } + + private void removeAutoGauge(final String id) { + final 
ConsensusPrefetchingQueue queue = queueMap.get(id); + if (Objects.isNull(queue)) { + return; + } + metricService.remove( + MetricType.AUTO_GAUGE, + Metric.SUBSCRIPTION_UNCOMMITTED_EVENT_COUNT.toString(), + Tag.NAME.toString(), + queue.getPrefetchingQueueId()); + metricService.remove( + MetricType.AUTO_GAUGE, + Metric.SUBSCRIPTION_CURRENT_COMMIT_ID.toString(), + Tag.NAME.toString(), + queue.getPrefetchingQueueId()); + metricService.remove( + MetricType.AUTO_GAUGE, + Metric.SUBSCRIPTION_CONSENSUS_LAG.toString(), + Tag.NAME.toString(), + queue.getPrefetchingQueueId()); + metricService.remove( + MetricType.AUTO_GAUGE, + Metric.SUBSCRIPTION_CONSENSUS_WAL_GAP.toString(), + Tag.NAME.toString(), + queue.getPrefetchingQueueId()); + metricService.remove( + MetricType.AUTO_GAUGE, + Metric.SUBSCRIPTION_CONSENSUS_ROUTING_EPOCH_CHANGE.toString(), + Tag.NAME.toString(), + queue.getPrefetchingQueueId()); + metricService.remove( + MetricType.AUTO_GAUGE, + Metric.SUBSCRIPTION_CONSENSUS_WATERMARK.toString(), + Tag.NAME.toString(), + queue.getPrefetchingQueueId()); + } + + private void removeRate(final String id) { + final ConsensusPrefetchingQueue queue = queueMap.get(id); + if (Objects.isNull(queue)) { + return; + } + metricService.remove( + MetricType.RATE, + Metric.SUBSCRIPTION_EVENT_TRANSFER.toString(), + Tag.NAME.toString(), + queue.getPrefetchingQueueId()); + } + + public void mark(final String id, final long size) { + if (Objects.isNull(metricService)) { + return; + } + final Rate rate = rateMap.get(id); + if (rate == null) { + LOGGER.warn( + "Failed to mark transfer event rate, ConsensusPrefetchingQueue({}) does not exist", id); + return; + } + rate.mark(size); + } + + //////////////////////////// singleton //////////////////////////// + + private static class Holder { + + private static final ConsensusSubscriptionPrefetchingQueueMetrics INSTANCE = + new ConsensusSubscriptionPrefetchingQueueMetrics(); + + private Holder() {} + } + + public static 
ConsensusSubscriptionPrefetchingQueueMetrics getInstance() { + return Holder.INSTANCE; + } + + private ConsensusSubscriptionPrefetchingQueueMetrics() {} +} diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/metric/SubscriptionMetrics.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/metric/SubscriptionMetrics.java index 48a6dc50e6d43..29de59ddf3266 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/metric/SubscriptionMetrics.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/metric/SubscriptionMetrics.java @@ -29,11 +29,13 @@ public class SubscriptionMetrics implements IMetricSet { @Override public void bindTo(final AbstractMetricService metricService) { SubscriptionPrefetchingQueueMetrics.getInstance().bindTo(metricService); + ConsensusSubscriptionPrefetchingQueueMetrics.getInstance().bindTo(metricService); } @Override public void unbindFrom(final AbstractMetricService metricService) { SubscriptionPrefetchingQueueMetrics.getInstance().unbindFrom(metricService); + ConsensusSubscriptionPrefetchingQueueMetrics.getInstance().unbindFrom(metricService); } //////////////////////////// singleton //////////////////////////// diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/receiver/SubscriptionReceiverV1.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/receiver/SubscriptionReceiverV1.java index bfcbbaf850f7a..d81129ae79a96 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/receiver/SubscriptionReceiverV1.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/receiver/SubscriptionReceiverV1.java @@ -38,7 +38,9 @@ import org.apache.iotdb.db.protocol.client.ConfigNodeInfo; import org.apache.iotdb.db.subscription.agent.SubscriptionAgent; import org.apache.iotdb.db.subscription.broker.SubscriptionPrefetchingQueue; +import 
org.apache.iotdb.db.subscription.broker.consensus.ConsensusSubscriptionSetupHandler; import org.apache.iotdb.db.subscription.event.SubscriptionEvent; +import org.apache.iotdb.db.subscription.metric.ConsensusSubscriptionPrefetchingQueueMetrics; import org.apache.iotdb.db.subscription.metric.SubscriptionPrefetchingQueueMetrics; import org.apache.iotdb.rpc.RpcUtils; import org.apache.iotdb.rpc.TSStatusCode; @@ -54,6 +56,8 @@ import org.apache.iotdb.rpc.subscription.payload.poll.SubscriptionPollRequest; import org.apache.iotdb.rpc.subscription.payload.poll.SubscriptionPollRequestType; import org.apache.iotdb.rpc.subscription.payload.poll.SubscriptionPollResponse; +import org.apache.iotdb.rpc.subscription.payload.poll.SubscriptionPollResponseType; +import org.apache.iotdb.rpc.subscription.payload.poll.TopicProgress; import org.apache.iotdb.rpc.subscription.payload.request.PipeSubscribeCloseReq; import org.apache.iotdb.rpc.subscription.payload.request.PipeSubscribeCommitReq; import org.apache.iotdb.rpc.subscription.payload.request.PipeSubscribeHandshakeReq; @@ -61,6 +65,7 @@ import org.apache.iotdb.rpc.subscription.payload.request.PipeSubscribePollReq; import org.apache.iotdb.rpc.subscription.payload.request.PipeSubscribeRequestType; import org.apache.iotdb.rpc.subscription.payload.request.PipeSubscribeRequestVersion; +import org.apache.iotdb.rpc.subscription.payload.request.PipeSubscribeSeekReq; import org.apache.iotdb.rpc.subscription.payload.request.PipeSubscribeSubscribeReq; import org.apache.iotdb.rpc.subscription.payload.request.PipeSubscribeUnsubscribeReq; import org.apache.iotdb.rpc.subscription.payload.response.PipeSubscribeCloseResp; @@ -70,6 +75,7 @@ import org.apache.iotdb.rpc.subscription.payload.response.PipeSubscribePollResp; import org.apache.iotdb.rpc.subscription.payload.response.PipeSubscribeResponseType; import org.apache.iotdb.rpc.subscription.payload.response.PipeSubscribeResponseVersion; +import 
org.apache.iotdb.rpc.subscription.payload.response.PipeSubscribeSeekResp; import org.apache.iotdb.rpc.subscription.payload.response.PipeSubscribeSubscribeResp; import org.apache.iotdb.rpc.subscription.payload.response.PipeSubscribeUnsubscribeResp; import org.apache.iotdb.service.rpc.thrift.TPipeSubscribeReq; @@ -85,6 +91,7 @@ import java.util.Collections; import java.util.HashMap; import java.util.HashSet; +import java.util.LinkedHashSet; import java.util.List; import java.util.Map; import java.util.Objects; @@ -145,6 +152,8 @@ public final TPipeSubscribeResp handle(final TPipeSubscribeReq req) { return handlePipeSubscribeCommit(PipeSubscribeCommitReq.fromTPipeSubscribeReq(req)); case CLOSE: return handlePipeSubscribeClose(PipeSubscribeCloseReq.fromTPipeSubscribeReq(req)); + case SEEK: + return handlePipeSubscribeSeek(PipeSubscribeSeekReq.fromTPipeSubscribeReq(req)); default: break; } @@ -498,7 +507,10 @@ private TPipeSubscribeResp handlePipeSubscribePollInternal(final PipeSubscribePo case POLL: events = handlePipeSubscribePollRequest( - consumerConfig, (PollPayload) request.getPayload(), maxBytes); + consumerConfig, + (PollPayload) request.getPayload(), + maxBytes, + request.getProgressByTopic()); break; case POLL_FILE: events = @@ -562,17 +574,33 @@ private TPipeSubscribeResp handlePipeSubscribePollInternal(final PipeSubscribePo } totalSize.getAndAdd(size); - SubscriptionPrefetchingQueueMetrics.getInstance() - .mark( - SubscriptionPrefetchingQueue.generatePrefetchingQueueId( - commitContext.getConsumerGroupId(), commitContext.getTopicName()), - size); + final String queueId = + SubscriptionPrefetchingQueue.generatePrefetchingQueueId( + commitContext.getConsumerGroupId(), commitContext.getTopicName()); + if (ConsensusSubscriptionSetupHandler.isConsensusBasedTopic( + commitContext.getTopicName())) { + ConsensusSubscriptionPrefetchingQueueMetrics.getInstance() + .mark(queueId, size); + } else { + SubscriptionPrefetchingQueueMetrics.getInstance().mark(queueId, size); 
+ } event.invalidateCurrentResponseByteBuffer(); - LOGGER.info( - "Subscription: consumer {} poll {} successfully with request: {}", - consumerConfig, - response, - req.getRequest()); + if (response.getResponseType() + == SubscriptionPollResponseType.WATERMARK.getType() + || response.getResponseType() + == SubscriptionPollResponseType.TABLETS.getType()) { + LOGGER.debug( + "Subscription: consumer {} poll {} successfully with request: {}", + consumerConfig, + response, + req.getRequest()); + } else { + LOGGER.info( + "Subscription: consumer {} poll {} successfully with request: {}", + consumerConfig, + response, + req.getRequest()); + } return byteBuffer; } catch (final Exception e) { final boolean isOutdated = @@ -610,7 +638,10 @@ private TPipeSubscribeResp handlePipeSubscribePollInternal(final PipeSubscribePo } private List handlePipeSubscribePollRequest( - final ConsumerConfig consumerConfig, final PollPayload messagePayload, final long maxBytes) { + final ConsumerConfig consumerConfig, + final PollPayload messagePayload, + final long maxBytes, + final Map progressByTopic) { final Set subscribedTopicNames = SubscriptionAgent.consumer() .getTopicNamesSubscribedByConsumer( @@ -622,7 +653,7 @@ private List handlePipeSubscribePollRequest( // filter unsubscribed topics topicNames.removeIf((topicName) -> !subscribedTopicNames.contains(topicName)); - return SubscriptionAgent.broker().poll(consumerConfig, topicNames, maxBytes); + return SubscriptionAgent.broker().poll(consumerConfig, topicNames, maxBytes, progressByTopic); } private List handlePipeSubscribePollTsFileRequest( @@ -669,22 +700,90 @@ private TPipeSubscribeResp handlePipeSubscribeCommitInternal(final PipeSubscribe if (Objects.equals(successfulCommitContexts.size(), commitContexts.size())) { LOGGER.info( - "Subscription: consumer {} commit (nack: {}) successfully, commit contexts: {}", + "Subscription: consumer {} commit (nack: {}) successfully, summary: {}", consumerConfig, nack, - commitContexts); + 
summarizeCommitContexts(commitContexts)); + if (LOGGER.isDebugEnabled()) { + LOGGER.debug( + "Subscription: consumer {} commit (nack: {}) full commit contexts: {}", + consumerConfig, + nack, + commitContexts); + } } else { LOGGER.warn( - "Subscription: consumer {} commit (nack: {}) partially successful, commit contexts: {}, successful commit contexts: {}", + "Subscription: consumer {} commit (nack: {}) partially successful, requested summary: {}, successful summary: {}", consumerConfig, nack, - commitContexts, - successfulCommitContexts); + summarizeCommitContexts(commitContexts), + summarizeCommitContexts(successfulCommitContexts)); + if (LOGGER.isDebugEnabled()) { + LOGGER.debug( + "Subscription: consumer {} commit (nack: {}) full requested commit contexts: {}, full successful commit contexts: {}", + consumerConfig, + nack, + commitContexts, + successfulCommitContexts); + } } return PipeSubscribeCommitResp.toTPipeSubscribeResp(RpcUtils.SUCCESS_STATUS); } + private static String summarizeCommitContexts( + final List commitContexts) { + if (Objects.isNull(commitContexts) || commitContexts.isEmpty()) { + return "count=0"; + } + + long minLocalSeq = Long.MAX_VALUE; + long maxLocalSeq = Long.MIN_VALUE; + long minPhysicalTime = Long.MAX_VALUE; + long maxPhysicalTime = Long.MIN_VALUE; + final Set regionIds = new LinkedHashSet<>(); + final Set topicNames = new LinkedHashSet<>(); + + for (final SubscriptionCommitContext commitContext : commitContexts) { + if (Objects.isNull(commitContext)) { + continue; + } + topicNames.add(commitContext.getTopicName()); + regionIds.add(commitContext.getRegionId()); + + final long localSeq = commitContext.getLocalSeq(); + minLocalSeq = Math.min(minLocalSeq, localSeq); + maxLocalSeq = Math.max(maxLocalSeq, localSeq); + + final long physicalTime = commitContext.getPhysicalTime(); + minPhysicalTime = Math.min(minPhysicalTime, physicalTime); + maxPhysicalTime = Math.max(maxPhysicalTime, physicalTime); + } + + return String.format( + 
"count=%d, topics=%s, regions=%s, localSeqRange=%s, physicalTimeRange=%s", + commitContexts.size(), + summarizeStringSet(topicNames, 2), + summarizeStringSet(regionIds, 4), + minLocalSeq == Long.MAX_VALUE ? "N/A" : "[" + minLocalSeq + ", " + maxLocalSeq + "]", + minPhysicalTime == Long.MAX_VALUE + ? "N/A" + : "[" + minPhysicalTime + ", " + maxPhysicalTime + "]"); + } + + private static String summarizeStringSet(final Set values, final int maxDisplayCount) { + if (Objects.isNull(values) || values.isEmpty()) { + return "[]"; + } + + final List displayValues = + values.stream().limit(maxDisplayCount).collect(Collectors.toList()); + if (values.size() <= maxDisplayCount) { + return displayValues.toString(); + } + return displayValues + "...(" + values.size() + " total)"; + } + private TPipeSubscribeResp handlePipeSubscribeClose(final PipeSubscribeCloseReq req) { try { return handlePipeSubscribeCloseInternal(req); @@ -715,6 +814,71 @@ private TPipeSubscribeResp handlePipeSubscribeCloseInternal(final PipeSubscribeC return PipeSubscribeCloseResp.toTPipeSubscribeResp(RpcUtils.SUCCESS_STATUS); } + private TPipeSubscribeResp handlePipeSubscribeSeek(final PipeSubscribeSeekReq req) { + try { + return handlePipeSubscribeSeekInternal(req); + } catch (final Exception e) { + LOGGER.warn("Exception occurred when seeking with request {}", req, e); + final String exceptionMessage = + String.format( + "Subscription: something unexpected happened when seeking with request %s: %s", + req, e); + return PipeSubscribeSeekResp.toTPipeSubscribeResp( + RpcUtils.getStatus(TSStatusCode.SUBSCRIPTION_SEEK_ERROR, exceptionMessage)); + } + } + + private TPipeSubscribeResp handlePipeSubscribeSeekInternal(final PipeSubscribeSeekReq req) { + // check consumer config thread local + final ConsumerConfig consumerConfig = consumerConfigThreadLocal.get(); + if (Objects.isNull(consumerConfig)) { + LOGGER.warn( + "Subscription: missing consumer config when handling PipeSubscribeSeekReq: {}", req); + return 
SUBSCRIPTION_MISSING_CUSTOMER_RESP; + } + + final String topicName = req.getTopicName(); + final short seekType = req.getSeekType(); + + if (seekType == PipeSubscribeSeekReq.SEEK_TO_TOPIC_PROGRESS) { + SubscriptionAgent.broker() + .seekToTopicProgress(consumerConfig, topicName, req.getTopicProgress()); + LOGGER.info( + "Subscription: consumer {} seek topic {} to topicProgress(regionCount={})", + consumerConfig, + topicName, + req.getTopicProgress().getRegionProgress().size()); + } else if (seekType == PipeSubscribeSeekReq.SEEK_AFTER_TOPIC_PROGRESS) { + SubscriptionAgent.broker() + .seekAfterTopicProgress(consumerConfig, topicName, req.getTopicProgress()); + LOGGER.info( + "Subscription: consumer {} seekAfter topic {} to topicProgress(regionCount={})", + consumerConfig, + topicName, + req.getTopicProgress().getRegionProgress().size()); + } else if (seekType == PipeSubscribeSeekReq.SEEK_TO_BEGINNING + || seekType == PipeSubscribeSeekReq.SEEK_TO_END) { + SubscriptionAgent.broker().seek(consumerConfig, topicName, seekType); + LOGGER.info( + "Subscription: consumer {} seek topic {} with seekType={}", + consumerConfig, + topicName, + seekType); + } else { + final String errorMessage = + String.format( + "Subscription: unsupported seekType %s for topic %s. 
" + + "Consensus subscription only supports seekToBeginning, seekToEnd, " + + "seek(topicProgress), and seekAfter(topicProgress).", + seekType, topicName); + LOGGER.warn(errorMessage); + return PipeSubscribeSeekResp.toTPipeSubscribeResp( + RpcUtils.getStatus(TSStatusCode.SUBSCRIPTION_SEEK_ERROR, errorMessage)); + } + + return PipeSubscribeSeekResp.toTPipeSubscribeResp(RpcUtils.SUCCESS_STATUS); + } + private void closeConsumer(final ConsumerConfig consumerConfig) { // unsubscribe all subscribed topics final Set topicNames = diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/task/execution/ConsensusSubscriptionPrefetchExecutor.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/task/execution/ConsensusSubscriptionPrefetchExecutor.java new file mode 100644 index 0000000000000..660de3770cd7d --- /dev/null +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/task/execution/ConsensusSubscriptionPrefetchExecutor.java @@ -0,0 +1,160 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.iotdb.db.subscription.task.execution; + +import org.apache.iotdb.commons.concurrent.IoTDBThreadPoolFactory; +import org.apache.iotdb.commons.concurrent.ThreadName; +import org.apache.iotdb.commons.subscription.config.SubscriptionConfig; +import org.apache.iotdb.db.subscription.task.subtask.ConsensusPrefetchSubtask; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.Map; +import java.util.concurrent.BlockingQueue; +import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.LinkedBlockingQueue; +import java.util.concurrent.ScheduledExecutorService; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicBoolean; +import java.util.concurrent.atomic.AtomicInteger; + +public class ConsensusSubscriptionPrefetchExecutor { + + private static final Logger LOGGER = + LoggerFactory.getLogger(ConsensusSubscriptionPrefetchExecutor.class); + + private static final AtomicInteger ID_GENERATOR = new AtomicInteger(0); + + private final String workerThreadName; + private final String schedulerThreadName; + private final int workerThreadNum; + + private final BlockingQueue readyQueue = new LinkedBlockingQueue<>(); + private final Map taskIdToSubtask = new ConcurrentHashMap<>(); + private final AtomicBoolean shutdown = new AtomicBoolean(false); + + private final ExecutorService workerPool; + private final ScheduledExecutorService delayedScheduler; + + public ConsensusSubscriptionPrefetchExecutor() { + final int executorId = ID_GENERATOR.getAndIncrement(); + this.workerThreadNum = + Math.max( + 1, + SubscriptionConfig.getInstance() + .getSubscriptionConsensusPrefetchExecutorMaxThreadNum()); + this.workerThreadName = + ThreadName.SUBSCRIPTION_CONSENSUS_PREFETCH_EXECUTOR_POOL.getName() + "-" + executorId; + this.schedulerThreadName = + ThreadName.SUBSCRIPTION_CONSENSUS_PREFETCH_SCHEDULER.getName() + "-" + executorId; + this.workerPool = 
IoTDBThreadPoolFactory.newFixedThreadPool(workerThreadNum, workerThreadName); + this.delayedScheduler = + IoTDBThreadPoolFactory.newSingleThreadScheduledExecutor(schedulerThreadName); + + for (int i = 0; i < workerThreadNum; i++) { + workerPool.submit(this::workerLoop); + } + } + + public synchronized boolean register(final ConsensusPrefetchSubtask subtask) { + if (shutdown.get()) { + LOGGER.warn( + "Consensus prefetch executor is shutdown, skip registering {}", subtask.getTaskId()); + return false; + } + if (taskIdToSubtask.putIfAbsent(subtask.getTaskId(), subtask) != null) { + LOGGER.warn("Consensus prefetch subtask {} is already registered", subtask.getTaskId()); + return false; + } + subtask.bindExecutor(this); + return true; + } + + public synchronized void deregister(final String taskId) { + final ConsensusPrefetchSubtask subtask = taskIdToSubtask.remove(taskId); + if (subtask == null) { + return; + } + readyQueue.remove(subtask); + subtask.cancelPendingExecution(); + subtask.close(); + } + + public void enqueue(final ConsensusPrefetchSubtask subtask) { + if (shutdown.get() || subtask.isClosed()) { + return; + } + readyQueue.offer(subtask); + } + + public void schedule( + final ConsensusPrefetchSubtask subtask, final long delayMs, final long delayedToken) { + if (shutdown.get() || subtask.isClosed()) { + return; + } + delayedScheduler.schedule( + () -> { + if (!shutdown.get()) { + subtask.fireScheduledWakeup(delayedToken); + } + }, + delayMs, + TimeUnit.MILLISECONDS); + } + + public synchronized void shutdown() { + if (!shutdown.compareAndSet(false, true)) { + return; + } + + for (final ConsensusPrefetchSubtask subtask : taskIdToSubtask.values()) { + readyQueue.remove(subtask); + subtask.cancelPendingExecution(); + subtask.close(); + } + taskIdToSubtask.clear(); + readyQueue.clear(); + + delayedScheduler.shutdownNow(); + workerPool.shutdownNow(); + } + + public boolean isShutdown() { + return shutdown.get(); + } + + private void workerLoop() { + try { + while 
(!shutdown.get() && !Thread.currentThread().isInterrupted()) { + final ConsensusPrefetchSubtask subtask = readyQueue.take(); + if (subtask.isClosed()) { + continue; + } + subtask.runOneRound(); + } + } catch (final InterruptedException e) { + Thread.currentThread().interrupt(); + } catch (final Throwable t) { + LOGGER.error("Consensus prefetch worker loop exits abnormally", t); + } + } +} diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/task/execution/ConsensusSubscriptionPrefetchExecutorManager.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/task/execution/ConsensusSubscriptionPrefetchExecutorManager.java new file mode 100644 index 0000000000000..9362a38a58b7e --- /dev/null +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/task/execution/ConsensusSubscriptionPrefetchExecutorManager.java @@ -0,0 +1,74 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.iotdb.db.subscription.task.execution; + +import org.apache.iotdb.commons.subscription.config.SubscriptionConfig; + +public class ConsensusSubscriptionPrefetchExecutorManager { + + private volatile ConsensusSubscriptionPrefetchExecutor executor; + private volatile boolean started = false; + + private ConsensusSubscriptionPrefetchExecutorManager() { + // singleton + } + + public synchronized void start() { + if (!SubscriptionConfig.getInstance().getSubscriptionEnabled()) { + started = false; + return; + } + started = true; + if (executor == null || executor.isShutdown()) { + executor = new ConsensusSubscriptionPrefetchExecutor(); + } + } + + public synchronized ConsensusSubscriptionPrefetchExecutor getExecutor() { + if (!started || !SubscriptionConfig.getInstance().getSubscriptionEnabled()) { + return null; + } + if (executor == null || executor.isShutdown()) { + executor = new ConsensusSubscriptionPrefetchExecutor(); + } + return executor; + } + + public synchronized void stop() { + started = false; + if (executor != null) { + executor.shutdown(); + executor = null; + } + } + + public boolean isStarted() { + return started; + } + + private static class Holder { + private static final ConsensusSubscriptionPrefetchExecutorManager INSTANCE = + new ConsensusSubscriptionPrefetchExecutorManager(); + } + + public static ConsensusSubscriptionPrefetchExecutorManager getInstance() { + return Holder.INSTANCE; + } +} diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/task/subtask/ConsensusPrefetchSubtask.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/task/subtask/ConsensusPrefetchSubtask.java new file mode 100644 index 0000000000000..79997bb7405a1 --- /dev/null +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/task/subtask/ConsensusPrefetchSubtask.java @@ -0,0 +1,243 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license 
agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.iotdb.db.subscription.task.subtask; + +import org.apache.iotdb.db.subscription.broker.consensus.ConsensusPrefetchingQueue; +import org.apache.iotdb.db.subscription.broker.consensus.PrefetchRoundResult; +import org.apache.iotdb.db.subscription.task.execution.ConsensusSubscriptionPrefetchExecutor; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +public class ConsensusPrefetchSubtask { + + private static final Logger LOGGER = LoggerFactory.getLogger(ConsensusPrefetchSubtask.class); + + private final String taskId; + private final ConsensusPrefetchingQueue queue; + private final Object monitor = new Object(); + + private ConsensusSubscriptionPrefetchExecutor executor; + + private boolean scheduledOrRunning = false; + private boolean running = false; + private boolean wakeupPending = false; + private boolean closed = false; + private long delayedWakeToken = 0L; + + public ConsensusPrefetchSubtask(final ConsensusPrefetchingQueue queue) { + this.queue = queue; + this.taskId = queue.getPrefetchingQueueId() + "_" + queue.getConsensusGroupId(); + } + + public String getTaskId() { + return taskId; + } + + public void bindExecutor(final ConsensusSubscriptionPrefetchExecutor executor) { + this.executor = executor; + } + + public void 
requestWakeupNow() { + final ConsensusSubscriptionPrefetchExecutor currentExecutor = executor; + if (currentExecutor == null) { + return; + } + + boolean shouldEnqueue = false; + synchronized (monitor) { + if (closed) { + return; + } + delayedWakeToken++; + if (scheduledOrRunning) { + wakeupPending = true; + return; + } + scheduledOrRunning = true; + shouldEnqueue = true; + } + + if (shouldEnqueue) { + currentExecutor.enqueue(this); + } + } + + public void scheduleWakeupAfter(final long delayMs) { + final ConsensusSubscriptionPrefetchExecutor currentExecutor = executor; + if (currentExecutor == null) { + return; + } + + long delayedToken; + synchronized (monitor) { + if (closed || scheduledOrRunning || wakeupPending) { + return; + } + delayedToken = ++delayedWakeToken; + } + currentExecutor.schedule(this, delayMs, delayedToken); + } + + public void runOneRound() { + PrefetchRoundResult result = PrefetchRoundResult.dormant(); + + synchronized (monitor) { + if (closed) { + scheduledOrRunning = false; + monitor.notifyAll(); + return; + } + running = true; + } + + try { + result = queue.drivePrefetchOnce(); + } catch (final Throwable t) { + LOGGER.error( + "ConsensusPrefetchSubtask {}: unexpected error while driving queue {}", taskId, queue, t); + result = PrefetchRoundResult.rescheduleAfter(100L); + } + + boolean shouldEnqueue = false; + Long delayedWakeMs = null; + long delayedToken = 0L; + synchronized (monitor) { + running = false; + if (closed) { + scheduledOrRunning = false; + monitor.notifyAll(); + return; + } + + if (wakeupPending) { + wakeupPending = false; + shouldEnqueue = true; + } else { + switch (result.getType()) { + case RESCHEDULE_NOW: + shouldEnqueue = true; + break; + case RESCHEDULE_LATER: + delayedToken = ++delayedWakeToken; + delayedWakeMs = result.getDelayMs(); + scheduledOrRunning = false; + break; + case DORMANT: + default: + scheduledOrRunning = false; + break; + } + } + + if (shouldEnqueue) { + scheduledOrRunning = true; + } + 
monitor.notifyAll(); + } + + final ConsensusSubscriptionPrefetchExecutor currentExecutor = executor; + if (currentExecutor == null) { + return; + } + if (shouldEnqueue) { + currentExecutor.enqueue(this); + } else if (delayedWakeMs != null) { + currentExecutor.schedule(this, delayedWakeMs, delayedToken); + } + } + + public void fireScheduledWakeup(final long delayedToken) { + final ConsensusSubscriptionPrefetchExecutor currentExecutor = executor; + if (currentExecutor == null) { + return; + } + + boolean shouldEnqueue = false; + synchronized (monitor) { + if (closed || delayedWakeToken != delayedToken || scheduledOrRunning) { + return; + } + scheduledOrRunning = true; + shouldEnqueue = true; + } + + if (shouldEnqueue) { + currentExecutor.enqueue(this); + } + } + + public void cancelPendingExecution() { + synchronized (monitor) { + delayedWakeToken++; + wakeupPending = false; + if (scheduledOrRunning && !running) { + scheduledOrRunning = false; + } + monitor.notifyAll(); + } + } + + public void awaitIdle() { + synchronized (monitor) { + while (running || scheduledOrRunning) { + try { + monitor.wait(50L); + } catch (final InterruptedException e) { + Thread.currentThread().interrupt(); + return; + } + } + } + } + + public void close() { + synchronized (monitor) { + closed = true; + delayedWakeToken++; + wakeupPending = false; + if (!running) { + scheduledOrRunning = false; + monitor.notifyAll(); + return; + } + while (scheduledOrRunning) { + try { + monitor.wait(50L); + } catch (final InterruptedException e) { + Thread.currentThread().interrupt(); + return; + } + } + } + } + + public boolean isClosed() { + synchronized (monitor) { + return closed; + } + } + + public boolean isScheduledOrRunning() { + synchronized (monitor) { + return scheduledOrRunning; + } + } +} diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/task/subtask/SubscriptionSinkSubtask.java 
b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/task/subtask/SubscriptionSinkSubtask.java index 2ca332263b52b..7b67f79e62291 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/task/subtask/SubscriptionSinkSubtask.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/task/subtask/SubscriptionSinkSubtask.java @@ -22,6 +22,7 @@ import org.apache.iotdb.commons.pipe.agent.task.connection.UnboundedBlockingPendingQueue; import org.apache.iotdb.db.pipe.agent.task.subtask.sink.PipeSinkSubtask; import org.apache.iotdb.db.subscription.agent.SubscriptionAgent; +import org.apache.iotdb.db.subscription.broker.consensus.ConsensusSubscriptionSetupHandler; import org.apache.iotdb.pipe.api.PipeConnector; import org.apache.iotdb.pipe.api.event.Event; @@ -77,11 +78,27 @@ protected void registerCallbackHookAfterSubmit(final ListenableFuture f Futures.addCallback(future, this, subtaskCallbackListeningExecutor); } + @Override + public synchronized void onSuccess(final Boolean hasAtLeastOneEventProcessed) { + isSubmitted = false; + if (isConsensusDrivenTopic()) { + return; + } + super.onSuccess(hasAtLeastOneEventProcessed); + } + @Override public synchronized void onFailure(final Throwable throwable) { isSubmitted = false; - // just resubmit + if (isConsensusDrivenTopic()) { + LOGGER.warn( + "SubscriptionSinkSubtask for consensus topic [{}] failed unexpectedly, skip auto-resubmit", + topicName, + throwable); + return; + } + submitSelf(); } @@ -91,6 +108,14 @@ protected boolean executeOnce() { return false; } + if (isConsensusDrivenTopic()) { + return false; + } + return SubscriptionAgent.broker().executePrefetch(consumerGroupId, topicName); } + + private boolean isConsensusDrivenTopic() { + return ConsensusSubscriptionSetupHandler.isConsensusBasedTopic(topicName); + } } diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/task/subtask/SubscriptionSinkSubtaskLifeCycle.java 
b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/task/subtask/SubscriptionSinkSubtaskLifeCycle.java index 98163697374da..95dcba88b8f5a 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/task/subtask/SubscriptionSinkSubtaskLifeCycle.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/subscription/task/subtask/SubscriptionSinkSubtaskLifeCycle.java @@ -24,6 +24,7 @@ import org.apache.iotdb.db.pipe.agent.task.subtask.sink.PipeSinkSubtask; import org.apache.iotdb.db.pipe.agent.task.subtask.sink.PipeSinkSubtaskLifeCycle; import org.apache.iotdb.db.subscription.agent.SubscriptionAgent; +import org.apache.iotdb.db.subscription.broker.consensus.ConsensusSubscriptionSetupHandler; import org.apache.iotdb.pipe.api.event.Event; import org.slf4j.Logger; @@ -48,8 +49,10 @@ public synchronized void register() { } if (registeredTaskCount == 0) { - // bind prefetching queue - SubscriptionAgent.broker().bindPrefetchingQueue((SubscriptionSinkSubtask) subtask); + if (!ConsensusSubscriptionSetupHandler.isConsensusBasedTopic( + ((SubscriptionSinkSubtask) subtask).getTopicName())) { + SubscriptionAgent.broker().bindPrefetchingQueue((SubscriptionSinkSubtask) subtask); + } executor.register(subtask); runningTaskCount = 0; } @@ -97,6 +100,8 @@ public synchronized void close() { // when dropping the subscription. 
final String consumerGroupId = ((SubscriptionSinkSubtask) subtask).getConsumerGroupId(); final String topicName = ((SubscriptionSinkSubtask) subtask).getTopicName(); - SubscriptionAgent.broker().unbindPrefetchingQueue(consumerGroupId, topicName); + if (!ConsensusSubscriptionSetupHandler.isConsensusBasedTopic(topicName)) { + SubscriptionAgent.broker().unbindPrefetchingQueue(consumerGroupId, topicName); + } } } diff --git a/iotdb-core/datanode/src/test/java/org/apache/iotdb/db/storageengine/dataregion/wal/io/ProgressWALReaderTest.java b/iotdb-core/datanode/src/test/java/org/apache/iotdb/db/storageengine/dataregion/wal/io/ProgressWALReaderTest.java new file mode 100644 index 0000000000000..39eeba65b9306 --- /dev/null +++ b/iotdb-core/datanode/src/test/java/org/apache/iotdb/db/storageengine/dataregion/wal/io/ProgressWALReaderTest.java @@ -0,0 +1,116 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.iotdb.db.storageengine.dataregion.wal.io; + +import org.junit.Test; + +import java.io.File; +import java.nio.ByteBuffer; +import java.nio.file.Files; +import java.nio.file.Path; + +import static org.junit.Assert.assertArrayEquals; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; + +public class ProgressWALReaderTest { + + @Test + public void testReadWriterProgressMetadataFromV3Wal() throws Exception { + Path dir = Files.createTempDirectory("progress-wal-reader"); + File walFile = dir.resolve("test.wal").toFile(); + + try { + try (WALWriter writer = new WALWriter(walFile, WALFileVersion.V3)) { + writer.write( + entryBuffer((byte) 1, (byte) 2, (byte) 3), + singleEntryMeta(3, 10L, 1L, 1000L, 10L, 10000L, 1, 2L, 10L)); + writer.write( + entryBuffer((byte) 4, (byte) 5), + singleEntryMeta(2, 11L, 1L, 1000L, 11L, 10010L, 1, 2L, 11L)); + writer.write( + entryBuffer((byte) 6, (byte) 7, (byte) 8, (byte) 9), + singleEntryMeta(4, 12L, 2L, 2000L, 1L, 20000L, 4, 1L, 1L)); + } + + try (ProgressWALReader reader = new ProgressWALReader(walFile)) { + assertTrue(reader.hasNext()); + assertArrayEquals(new byte[] {1, 2, 3}, reader.next().array()); + assertEquals(0, reader.getCurrentEntryIndex()); + assertEquals(10000L, reader.getCurrentEntryPhysicalTime()); + assertEquals(1, reader.getCurrentEntryNodeId()); + assertEquals(2L, reader.getCurrentEntryWriterEpoch()); + assertEquals(10L, reader.getCurrentEntryLocalSeq()); + + assertTrue(reader.hasNext()); + assertArrayEquals(new byte[] {4, 5}, reader.next().array()); + assertEquals(1, reader.getCurrentEntryIndex()); + assertEquals(10010L, reader.getCurrentEntryPhysicalTime()); + assertEquals(1, reader.getCurrentEntryNodeId()); + assertEquals(2L, reader.getCurrentEntryWriterEpoch()); + assertEquals(11L, reader.getCurrentEntryLocalSeq()); + + assertTrue(reader.hasNext()); + assertArrayEquals(new byte[] {6, 7, 8, 9}, reader.next().array()); + assertEquals(2, 
reader.getCurrentEntryIndex()); + assertEquals(20000L, reader.getCurrentEntryPhysicalTime()); + assertEquals(4, reader.getCurrentEntryNodeId()); + assertEquals(1L, reader.getCurrentEntryWriterEpoch()); + assertEquals(1L, reader.getCurrentEntryLocalSeq()); + } + } finally { + Files.deleteIfExists(walFile.toPath()); + Files.deleteIfExists(dir); + } + } + + private static ByteBuffer entryBuffer(byte... bytes) { + ByteBuffer buffer = ByteBuffer.allocate(bytes.length); + buffer.put(bytes); + return buffer; + } + + private static WALMetaData singleEntryMeta( + int size, + long searchIndex, + long memTableId, + long epoch, + long syncIndex, + long physicalTime, + int nodeId, + long writerEpoch, + long localSeq) { + return singleEntryMeta( + size, searchIndex, memTableId, physicalTime, nodeId, writerEpoch, localSeq); + } + + private static WALMetaData singleEntryMeta( + int size, + long searchIndex, + long memTableId, + long physicalTime, + int nodeId, + long writerEpoch, + long localSeq) { + WALMetaData metaData = new WALMetaData(); + metaData.add(size, searchIndex, memTableId, physicalTime, nodeId, writerEpoch, localSeq); + return metaData; + } +} diff --git a/iotdb-core/datanode/src/test/java/org/apache/iotdb/db/storageengine/dataregion/wal/io/WALMetaDataV3CompatibilityTest.java b/iotdb-core/datanode/src/test/java/org/apache/iotdb/db/storageengine/dataregion/wal/io/WALMetaDataV3CompatibilityTest.java new file mode 100644 index 0000000000000..0a4a7c1c0f74c --- /dev/null +++ b/iotdb-core/datanode/src/test/java/org/apache/iotdb/db/storageengine/dataregion/wal/io/WALMetaDataV3CompatibilityTest.java @@ -0,0 +1,198 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iotdb.db.storageengine.dataregion.wal.io; + +import org.junit.Test; + +import java.nio.ByteBuffer; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; + +/** Tests for WALMetaData V3 serialization/deserialization roundtrip and V2->V3 compatibility. */ +public class WALMetaDataV3CompatibilityTest { + + @Test + public void testV3RoundTrip() { + WALMetaData original = new WALMetaData(); + + original.add(100, 10, 1, 10000L, 1, 2L, 10L); + original.add(200, 11, 1, 10010L, 1, 2L, 11L); + original.add(150, 12, 1, 10020L, 1, 2L, 12L); + original.add(300, 13, 2, 20000L, 4, 1L, 1L); + original.add(250, 14, 2, 20010L, 1, 2L, 14L); + + original.updateTimestampRange(1600000000000L); + original.updateTimestampRange(1600000001000L); + + int size = original.serializedSize(WALFileVersion.V3); + ByteBuffer buffer = ByteBuffer.allocate(size); + original.serialize(buffer, WALFileVersion.V3); + buffer.flip(); + + WALMetaData deserialized = WALMetaData.deserialize(buffer, WALFileVersion.V3); + + assertEquals(10, deserialized.getFirstSearchIndex()); + assertEquals(5, deserialized.getBuffersSize().size()); + assertEquals(Integer.valueOf(100), deserialized.getBuffersSize().get(0)); + assertEquals(Integer.valueOf(200), deserialized.getBuffersSize().get(1)); + assertEquals(Integer.valueOf(150), deserialized.getBuffersSize().get(2)); + 
assertEquals(Integer.valueOf(300), deserialized.getBuffersSize().get(3)); + assertEquals(Integer.valueOf(250), deserialized.getBuffersSize().get(4)); + + assertTrue(deserialized.getMemTablesId().contains(1L)); + assertTrue(deserialized.getMemTablesId().contains(2L)); + + assertEquals(1600000000000L, deserialized.getMinDataTs()); + assertEquals(1600000001000L, deserialized.getMaxDataTs()); + + assertEquals(5, deserialized.getPhysicalTimes().size()); + assertEquals(Long.valueOf(10000L), deserialized.getPhysicalTimes().get(0)); + assertEquals(Long.valueOf(10010L), deserialized.getPhysicalTimes().get(1)); + assertEquals(Long.valueOf(10020L), deserialized.getPhysicalTimes().get(2)); + assertEquals(Long.valueOf(20000L), deserialized.getPhysicalTimes().get(3)); + assertEquals(Long.valueOf(20010L), deserialized.getPhysicalTimes().get(4)); + + assertEquals(5, deserialized.getNodeIds().size()); + assertEquals(Short.valueOf((short) 1), deserialized.getNodeIds().get(0)); + assertEquals(Short.valueOf((short) 1), deserialized.getNodeIds().get(1)); + assertEquals(Short.valueOf((short) 1), deserialized.getNodeIds().get(2)); + assertEquals(Short.valueOf((short) 4), deserialized.getNodeIds().get(3)); + assertEquals(Short.valueOf((short) 1), deserialized.getNodeIds().get(4)); + + assertEquals(5, deserialized.getWriterEpochs().size()); + assertEquals(Short.valueOf((short) 2), deserialized.getWriterEpochs().get(0)); + assertEquals(Short.valueOf((short) 2), deserialized.getWriterEpochs().get(1)); + assertEquals(Short.valueOf((short) 2), deserialized.getWriterEpochs().get(2)); + assertEquals(Short.valueOf((short) 1), deserialized.getWriterEpochs().get(3)); + assertEquals(Short.valueOf((short) 2), deserialized.getWriterEpochs().get(4)); + + assertEquals(5, deserialized.getLocalSeqs().size()); + assertEquals(Long.valueOf(10L), deserialized.getLocalSeqs().get(0)); + assertEquals(Long.valueOf(11L), deserialized.getLocalSeqs().get(1)); + assertEquals(Long.valueOf(12L), 
deserialized.getLocalSeqs().get(2)); + assertEquals(Long.valueOf(1L), deserialized.getLocalSeqs().get(3)); + assertEquals(Long.valueOf(14L), deserialized.getLocalSeqs().get(4)); + } + + @Test + public void testV2DeserializationHasEmptyV3Fields() { + WALMetaData original = new WALMetaData(); + original.add(100, 10, 1, 1000L, 10); + original.add(200, 11, 1, 2000L, 11); + + int size = original.serializedSize(WALFileVersion.V2); + ByteBuffer buffer = ByteBuffer.allocate(size); + original.serialize(buffer, WALFileVersion.V2); + buffer.flip(); + + WALMetaData deserialized = WALMetaData.deserialize(buffer, WALFileVersion.V2); + + assertEquals(10, deserialized.getFirstSearchIndex()); + assertEquals(2, deserialized.getBuffersSize().size()); + assertTrue(deserialized.getPhysicalTimes().isEmpty()); + assertTrue(deserialized.getNodeIds().isEmpty()); + assertTrue(deserialized.getWriterEpochs().isEmpty()); + assertTrue(deserialized.getLocalSeqs().isEmpty()); + assertEquals(Long.MAX_VALUE, deserialized.getMinDataTs()); + assertEquals(Long.MIN_VALUE, deserialized.getMaxDataTs()); + } + + @Test + public void testV2SerializedSizeSmallerThanV3() { + WALMetaData meta = new WALMetaData(); + meta.add(100, 10, 1, 10L, 1, 2L, 10L); + meta.add(200, 11, 1, 11L, 1, 2L, 11L); + meta.add(300, 12, 1, 12L, 3, 1L, 12L); + + int v2Size = meta.serializedSize(WALFileVersion.V2); + int v3Size = meta.serializedSize(WALFileVersion.V3); + + int entryCount = 3; + int overrideCount = 1; + int expectedDiff = + entryCount * Long.BYTES * 2 + + Long.BYTES * 2 + + Short.BYTES * 2 + + Integer.BYTES + + overrideCount * (Integer.BYTES + Short.BYTES + Short.BYTES); + assertEquals(expectedDiff, v3Size - v2Size); + } + + @Test + public void testV3AddAllMerge() { + WALMetaData meta1 = new WALMetaData(); + meta1.add(100, 10, 1, 100L, 1, 2L, 10L); + meta1.add(200, 11, 1, 110L, 1, 2L, 11L); + meta1.updateTimestampRange(100L); + + WALMetaData meta2 = new WALMetaData(); + meta2.add(300, 12, 2, 200L, 4, 1L, 1L); + 
meta2.updateTimestampRange(200L); + + meta1.addAll(meta2); + + assertEquals(3, meta1.getBuffersSize().size()); + assertEquals(Long.valueOf(100L), meta1.getPhysicalTimes().get(0)); + assertEquals(Long.valueOf(110L), meta1.getPhysicalTimes().get(1)); + assertEquals(Long.valueOf(200L), meta1.getPhysicalTimes().get(2)); + assertEquals(Short.valueOf((short) 1), meta1.getNodeIds().get(0)); + assertEquals(Short.valueOf((short) 1), meta1.getNodeIds().get(1)); + assertEquals(Short.valueOf((short) 4), meta1.getNodeIds().get(2)); + assertEquals(Short.valueOf((short) 2), meta1.getWriterEpochs().get(0)); + assertEquals(Short.valueOf((short) 2), meta1.getWriterEpochs().get(1)); + assertEquals(Short.valueOf((short) 1), meta1.getWriterEpochs().get(2)); + assertEquals(Long.valueOf(10L), meta1.getLocalSeqs().get(0)); + assertEquals(Long.valueOf(11L), meta1.getLocalSeqs().get(1)); + assertEquals(Long.valueOf(1L), meta1.getLocalSeqs().get(2)); + assertEquals(100L, meta1.getMinDataTs()); + assertEquals(200L, meta1.getMaxDataTs()); + } + + @Test + public void testV3EmptyMetadata() { + WALMetaData empty = new WALMetaData(); + + int size = empty.serializedSize(WALFileVersion.V3); + ByteBuffer buffer = ByteBuffer.allocate(size); + empty.serialize(buffer, WALFileVersion.V3); + buffer.flip(); + + WALMetaData deserialized = WALMetaData.deserialize(buffer, WALFileVersion.V3); + + assertEquals(0, deserialized.getBuffersSize().size()); + assertTrue(deserialized.getPhysicalTimes().isEmpty()); + assertTrue(deserialized.getNodeIds().isEmpty()); + assertTrue(deserialized.getWriterEpochs().isEmpty()); + assertTrue(deserialized.getLocalSeqs().isEmpty()); + assertEquals(Long.MAX_VALUE, deserialized.getMinDataTs()); + assertEquals(Long.MIN_VALUE, deserialized.getMaxDataTs()); + } + + @Test + public void testV2CompatibleAddDefaultsWriterProgress() { + WALMetaData meta = new WALMetaData(); + meta.add(100, 10, 1); + + assertEquals(Long.valueOf(0L), meta.getPhysicalTimes().get(0)); + 
assertEquals(Short.valueOf((short) -1), meta.getNodeIds().get(0)); + assertEquals(Short.valueOf((short) 0), meta.getWriterEpochs().get(0)); + assertEquals(Long.valueOf(10L), meta.getLocalSeqs().get(0)); + } +} diff --git a/iotdb-core/datanode/src/test/java/org/apache/iotdb/db/storageengine/dataregion/wal/node/ConsensusReqReaderTest.java b/iotdb-core/datanode/src/test/java/org/apache/iotdb/db/storageengine/dataregion/wal/node/ConsensusReqReaderTest.java index 688e5df205c4e..600a003d5522d 100644 --- a/iotdb-core/datanode/src/test/java/org/apache/iotdb/db/storageengine/dataregion/wal/node/ConsensusReqReaderTest.java +++ b/iotdb-core/datanode/src/test/java/org/apache/iotdb/db/storageengine/dataregion/wal/node/ConsensusReqReaderTest.java @@ -250,6 +250,24 @@ public void scenario01TestGetReqIterator02() throws Exception { checkThread.shutdown(); } + @Test + public void testReqIteratorCarriesWriterMetadata() throws Exception { + final InsertRowNode insertRowNode = getInsertRowNode(devicePath); + insertRowNode.setSearchIndex(1).setPhysicalTime(123456789L).setNodeId(7).setWriterEpoch(3L); + walNode.log(0, insertRowNode); + walNode.rollWALFile(); + walNode.rollWALFile(); + + final ConsensusReqReader.ReqIterator iterator = walNode.getReqIterator(1); + Assert.assertTrue(iterator.hasNext()); + final IndexedConsensusRequest request = iterator.next(); + + Assert.assertEquals(1L, request.getSearchIndex()); + Assert.assertEquals(123456789L, request.getPhysicalTime()); + Assert.assertEquals(7, request.getNodeId()); + Assert.assertEquals(3L, request.getWriterEpoch()); + } + @Test public void scenario01TestGetReqIterator03() throws Exception { simulateFileScenario01(); diff --git a/iotdb-core/datanode/src/test/java/org/apache/iotdb/db/storageengine/dataregion/wal/utils/WALFileUtilsTest.java b/iotdb-core/datanode/src/test/java/org/apache/iotdb/db/storageengine/dataregion/wal/utils/WALFileUtilsTest.java index a7d8fa5662f7a..5c339f0e32d95 100644 --- 
a/iotdb-core/datanode/src/test/java/org/apache/iotdb/db/storageengine/dataregion/wal/utils/WALFileUtilsTest.java +++ b/iotdb-core/datanode/src/test/java/org/apache/iotdb/db/storageengine/dataregion/wal/utils/WALFileUtilsTest.java @@ -18,10 +18,20 @@ */ package org.apache.iotdb.db.storageengine.dataregion.wal.utils; +import org.apache.iotdb.db.queryengine.plan.planner.plan.node.PlanNodeType; +import org.apache.iotdb.db.storageengine.dataregion.wal.buffer.WALEntryType; +import org.apache.iotdb.db.storageengine.dataregion.wal.buffer.WALInfoEntry; +import org.apache.iotdb.db.storageengine.dataregion.wal.io.WALFileVersion; +import org.apache.iotdb.db.storageengine.dataregion.wal.io.WALMetaData; +import org.apache.iotdb.db.storageengine.dataregion.wal.io.WALWriter; + import org.junit.Assert; import org.junit.Test; import java.io.File; +import java.nio.ByteBuffer; +import java.nio.file.Files; +import java.nio.file.Path; public class WALFileUtilsTest { @Test @@ -238,4 +248,125 @@ public void binarySearchFileBySearchIndex13() { i = WALFileUtils.binarySearchFileBySearchIndex(files, 0); Assert.assertEquals(-1, i); } + + @Test + public void testLocateByWriterProgress() throws Exception { + final Path dir = Files.createTempDirectory("wal-writer-progress-utils"); + final File wal0 = + dir.resolve(WALFileUtils.getLogFileName(0, 0, WALFileStatus.CONTAINS_SEARCH_INDEX)) + .toFile(); + final File wal1 = + dir.resolve(WALFileUtils.getLogFileName(1, 12, WALFileStatus.CONTAINS_SEARCH_INDEX)) + .toFile(); + final File wal2 = + dir.resolve(WALFileUtils.getLogFileName(2, 13, WALFileStatus.CONTAINS_SEARCH_INDEX)) + .toFile(); + + try { + try (final WALWriter writer = new WALWriter(wal0, WALFileVersion.V3)) { + writer.write(entryBuffer(10L), singleEntryMeta(19, 10L, 1L, 0L, 10L, 10000L, 1, 2L, 110L)); + writer.write(entryBuffer(11L), singleEntryMeta(19, 11L, 1L, 0L, 11L, 10010L, 1, 2L, 111L)); + } + try (final WALWriter writer = new WALWriter(wal1, WALFileVersion.V3)) { + 
writer.write(entryBuffer(13L), singleEntryMeta(19, 13L, 1L, 0L, 13L, 10020L, 1, 2L, 113L)); + } + // Leave wal2 as the active file placeholder; helper methods only scan sealed files. + try (final WALWriter writer = new WALWriter(wal2, WALFileVersion.V3)) { + writer.write(entryBuffer(20L), singleEntryMeta(19, 20L, 1L, 0L, 20L, 20000L, 4, 1L, 120L)); + } + + Assert.assertArrayEquals( + new long[] {11L, 1L}, + WALFileUtils.locateByWriterProgress(dir.toFile(), 1, 2L, 10010L, 111L)); + Assert.assertArrayEquals( + new long[] {10L, 0L}, + WALFileUtils.locateByWriterProgress(dir.toFile(), 1, 2L, 9999L, 109L)); + Assert.assertEquals( + 13L, WALFileUtils.findSearchIndexAfterWriterProgress(dir.toFile(), 1, 2L, 10010L, 111L)); + Assert.assertEquals( + -1L, WALFileUtils.findSearchIndexAfterWriterProgress(dir.toFile(), 4, 1L, 20000L, 120L)); + } finally { + Files.deleteIfExists(wal0.toPath()); + Files.deleteIfExists(wal1.toPath()); + Files.deleteIfExists(wal2.toPath()); + Files.deleteIfExists(dir); + } + } + + @Test + public void testFindSearchIndexAfterCompatibleProgress() throws Exception { + final Path dir = Files.createTempDirectory("wal-compatible-progress-utils"); + final File wal0 = + dir.resolve(WALFileUtils.getLogFileName(0, 0, WALFileStatus.CONTAINS_SEARCH_INDEX)) + .toFile(); + final File wal1 = + dir.resolve(WALFileUtils.getLogFileName(1, 12, WALFileStatus.CONTAINS_SEARCH_INDEX)) + .toFile(); + final File wal2 = + dir.resolve(WALFileUtils.getLogFileName(2, 20, WALFileStatus.CONTAINS_SEARCH_INDEX)) + .toFile(); + + try { + try (final WALWriter writer = new WALWriter(wal0, WALFileVersion.V3)) { + writer.write(entryBuffer(10L), singleEntryMeta(19, 10L, 1L, 10000L, 1, 2L, 110L)); + writer.write(entryBuffer(11L), singleEntryMeta(19, 11L, 1L, 10010L, 1, 2L, 111L)); + } + try (final WALWriter writer = new WALWriter(wal1, WALFileVersion.V3)) { + writer.write(entryBuffer(13L), singleEntryMeta(19, 13L, 1L, 10010L, 4, 1L, 113L)); + writer.write(entryBuffer(14L), 
singleEntryMeta(19, 14L, 1L, 10020L, 1, 2L, 114L)); + } + try (final WALWriter writer = new WALWriter(wal2, WALFileVersion.V3)) { + writer.write(entryBuffer(20L), singleEntryMeta(19, 20L, 1L, 20000L, 4, 1L, 120L)); + } + + Assert.assertEquals( + 14L, WALFileUtils.findSearchIndexAfterCompatibleProgress(dir.toFile(), 10010L, 111L)); + Assert.assertEquals( + 10L, WALFileUtils.findSearchIndexAfterCompatibleProgress(dir.toFile(), 9999L, 109L)); + Assert.assertEquals( + -1L, WALFileUtils.findSearchIndexAfterCompatibleProgress(dir.toFile(), 20000L, 120L)); + } finally { + Files.deleteIfExists(wal0.toPath()); + Files.deleteIfExists(wal1.toPath()); + Files.deleteIfExists(wal2.toPath()); + Files.deleteIfExists(dir); + } + } + + private static ByteBuffer entryBuffer(final long bodySearchIndex) { + final ByteBuffer buffer = + ByteBuffer.allocate(WALInfoEntry.FIXED_SERIALIZED_SIZE + PlanNodeType.BYTES + Long.BYTES); + buffer.put(WALEntryType.INSERT_ROW_NODE.getCode()); + buffer.putLong(1L); + buffer.putShort(PlanNodeType.INSERT_ROW.getNodeType()); + buffer.putLong(bodySearchIndex); + return buffer; + } + + private static WALMetaData singleEntryMeta( + final int size, + final long searchIndex, + final long memTableId, + final long epoch, + final long syncIndex, + final long physicalTime, + final int nodeId, + final long writerEpoch, + final long localSeq) { + return singleEntryMeta( + size, searchIndex, memTableId, physicalTime, nodeId, writerEpoch, localSeq); + } + + private static WALMetaData singleEntryMeta( + final int size, + final long searchIndex, + final long memTableId, + final long physicalTime, + final int nodeId, + final long writerEpoch, + final long localSeq) { + final WALMetaData metaData = new WALMetaData(); + metaData.add(size, searchIndex, memTableId, physicalTime, nodeId, writerEpoch, localSeq); + return metaData; + } } diff --git a/iotdb-core/datanode/src/test/java/org/apache/iotdb/db/subscription/agent/SubscriptionBrokerAgentSeekRuntimeTest.java 
b/iotdb-core/datanode/src/test/java/org/apache/iotdb/db/subscription/agent/SubscriptionBrokerAgentSeekRuntimeTest.java new file mode 100644 index 0000000000000..7f75778aa3284 --- /dev/null +++ b/iotdb-core/datanode/src/test/java/org/apache/iotdb/db/subscription/agent/SubscriptionBrokerAgentSeekRuntimeTest.java @@ -0,0 +1,102 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.iotdb.db.subscription.agent; + +import org.apache.iotdb.db.subscription.broker.ConsensusSubscriptionBroker; +import org.apache.iotdb.db.subscription.task.execution.ConsensusSubscriptionPrefetchExecutorManager; +import org.apache.iotdb.rpc.subscription.config.ConsumerConfig; +import org.apache.iotdb.rpc.subscription.config.ConsumerConstant; +import org.apache.iotdb.rpc.subscription.exception.SubscriptionException; +import org.apache.iotdb.rpc.subscription.payload.poll.TopicProgress; +import org.apache.iotdb.rpc.subscription.payload.request.PipeSubscribeSeekReq; + +import org.junit.Test; + +import java.lang.reflect.Field; +import java.util.Collections; +import java.util.HashMap; +import java.util.Map; + +import static org.junit.Assert.assertTrue; +import static org.junit.Assert.fail; +import static org.mockito.ArgumentMatchers.any; +import static org.mockito.ArgumentMatchers.anyShort; +import static org.mockito.ArgumentMatchers.eq; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.never; +import static org.mockito.Mockito.verify; +import static org.mockito.Mockito.when; + +public class SubscriptionBrokerAgentSeekRuntimeTest { + + private static final String CONSUMER_GROUP_ID = "cg_seek_runtime_test"; + private static final String TOPIC = "topic_seek_runtime_test"; + + @Test + public void testConsensusSeekApisRejectWhenRuntimeUnavailable() throws Exception { + ConsensusSubscriptionPrefetchExecutorManager.getInstance().stop(); + + final SubscriptionBrokerAgent agent = new SubscriptionBrokerAgent(); + final ConsensusSubscriptionBroker consensusBroker = mock(ConsensusSubscriptionBroker.class); + when(consensusBroker.hasQueue(TOPIC)).thenReturn(true); + injectConsensusBroker(agent, consensusBroker); + + assertRuntimeUnavailable( + () -> agent.seek(createConsumerConfig(), TOPIC, PipeSubscribeSeekReq.SEEK_TO_BEGINNING)); + assertRuntimeUnavailable( + () -> + agent.seekToTopicProgress( + createConsumerConfig(), TOPIC, new 
TopicProgress(Collections.emptyMap()))); + assertRuntimeUnavailable( + () -> + agent.seekAfterTopicProgress( + createConsumerConfig(), TOPIC, new TopicProgress(Collections.emptyMap()))); + + verify(consensusBroker, never()).seek(eq(TOPIC), anyShort()); + verify(consensusBroker, never()).seek(eq(TOPIC), any(TopicProgress.class)); + verify(consensusBroker, never()).seekAfter(eq(TOPIC), any(TopicProgress.class)); + } + + private static void assertRuntimeUnavailable(final Runnable action) { + try { + action.run(); + fail("expected consensus seek to fail when runtime is unavailable"); + } catch (final SubscriptionException e) { + assertTrue(e.getMessage().contains("runtime is stopped")); + } + } + + private static ConsumerConfig createConsumerConfig() { + final Map attributes = new HashMap<>(); + attributes.put(ConsumerConstant.CONSUMER_ID_KEY, "consumer-seek-runtime"); + attributes.put(ConsumerConstant.CONSUMER_GROUP_ID_KEY, CONSUMER_GROUP_ID); + return new ConsumerConfig(attributes); + } + + @SuppressWarnings("unchecked") + private static void injectConsensusBroker( + final SubscriptionBrokerAgent agent, final ConsensusSubscriptionBroker broker) + throws Exception { + final Field field = + SubscriptionBrokerAgent.class.getDeclaredField("consumerGroupIdToConsensusBroker"); + field.setAccessible(true); + ((Map) field.get(agent)).put(CONSUMER_GROUP_ID, broker); + } +} diff --git a/iotdb-core/datanode/src/test/java/org/apache/iotdb/db/subscription/broker/ConsensusSubscriptionBrokerSeekTest.java b/iotdb-core/datanode/src/test/java/org/apache/iotdb/db/subscription/broker/ConsensusSubscriptionBrokerSeekTest.java new file mode 100644 index 0000000000000..e2e6ad4a9ee8a --- /dev/null +++ b/iotdb-core/datanode/src/test/java/org/apache/iotdb/db/subscription/broker/ConsensusSubscriptionBrokerSeekTest.java @@ -0,0 +1,153 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.apache.iotdb.db.subscription.broker;

import org.apache.iotdb.commons.consensus.DataRegionId;
import org.apache.iotdb.db.subscription.broker.consensus.ConsensusPrefetchingQueue;
import org.apache.iotdb.rpc.subscription.payload.poll.RegionProgress;
import org.apache.iotdb.rpc.subscription.payload.poll.TopicProgress;
import org.apache.iotdb.rpc.subscription.payload.poll.WriterId;
import org.apache.iotdb.rpc.subscription.payload.poll.WriterProgress;
import org.apache.iotdb.rpc.subscription.payload.request.PipeSubscribeSeekReq;

import org.junit.Test;

import java.lang.reflect.Field;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.Map;

import static org.mockito.ArgumentMatchers.any;
import static org.mockito.Mockito.mock;
import static org.mockito.Mockito.never;
import static org.mockito.Mockito.verify;
import static org.mockito.Mockito.when;

/**
 * Checks how {@link ConsensusSubscriptionBroker} dispatches the different seek requests to the
 * {@link ConsensusPrefetchingQueue} instances registered for a topic. Each test injects two mocked
 * queues (data regions 1 and 2) and verifies which queue methods were, or were not, invoked.
 */
public class ConsensusSubscriptionBrokerSeekTest {

  private static final String TOPIC = "topic_seek_test";

  @Test
  public void testSeekBeginningRoutesToAllQueues() throws Exception {
    final ConsensusPrefetchingQueue queue1 = mockQueue(1);
    final ConsensusPrefetchingQueue queue2 = mockQueue(2);
    final ConsensusSubscriptionBroker broker = newBrokerWithQueues(queue1, queue2);

    broker.seek(TOPIC, PipeSubscribeSeekReq.SEEK_TO_BEGINNING);

    verify(queue1).seekToBeginning();
    verify(queue2).seekToBeginning();
    verify(queue1, never()).seekToEnd();
    verify(queue2, never()).seekToEnd();
  }

  @Test
  public void testSeekEndRoutesToAllQueues() throws Exception {
    final ConsensusPrefetchingQueue queue1 = mockQueue(1);
    final ConsensusPrefetchingQueue queue2 = mockQueue(2);
    final ConsensusSubscriptionBroker broker = newBrokerWithQueues(queue1, queue2);

    broker.seek(TOPIC, PipeSubscribeSeekReq.SEEK_TO_END);

    verify(queue1).seekToEnd();
    verify(queue2).seekToEnd();
    verify(queue1, never()).seekToBeginning();
    verify(queue2, never()).seekToBeginning();
  }

  @Test
  public void testSeekAfterTopicProgressLeavesMissingRegionsUntouched() throws Exception {
    final ConsensusPrefetchingQueue queue1 = mockQueue(1);
    final ConsensusPrefetchingQueue queue2 = mockQueue(2);
    final ConsensusSubscriptionBroker broker = newBrokerWithQueues(queue1, queue2);

    // Only region 1 appears in the topic progress; region 2 must be left alone.
    final String region1 = new DataRegionId(1).toString();
    final RegionProgress regionProgress =
        new RegionProgress(
            Collections.singletonMap(new WriterId(region1, 1, 1L), new WriterProgress(100L, 10L)));

    broker.seekAfter(TOPIC, new TopicProgress(Collections.singletonMap(region1, regionProgress)));

    verify(queue1).seekAfterRegionProgress(regionProgress);
    verify(queue2, never()).seekAfterRegionProgress(any());
    verify(queue2, never()).seekToRegionProgress(any());
    verify(queue2, never()).seekToEnd();
  }

  @Test
  public void testSeekTopicProgressLeavesMissingRegionsUntouched() throws Exception {
    final ConsensusPrefetchingQueue queue1 = mockQueue(1);
    final ConsensusPrefetchingQueue queue2 = mockQueue(2);
    final ConsensusSubscriptionBroker broker = newBrokerWithQueues(queue1, queue2);

    // Only region 2 appears in the topic progress; region 1 must be left alone.
    final String region2 = new DataRegionId(2).toString();
    final RegionProgress regionProgress =
        new RegionProgress(
            Collections.singletonMap(new WriterId(region2, 2, 1L), new WriterProgress(200L, 20L)));

    broker.seek(TOPIC, new TopicProgress(Collections.singletonMap(region2, regionProgress)));

    verify(queue2).seekToRegionProgress(regionProgress);
    verify(queue1, never()).seekToRegionProgress(any());
    verify(queue1, never()).seekAfterRegionProgress(any());
    verify(queue1, never()).seekToEnd();
  }

  @Test
  public void testUnsupportedSeekTypeDoesNotTouchQueues() throws Exception {
    final ConsensusPrefetchingQueue queue1 = mockQueue(1);
    final ConsensusPrefetchingQueue queue2 = mockQueue(2);
    final ConsensusSubscriptionBroker broker = newBrokerWithQueues(queue1, queue2);

    broker.seek(TOPIC, PipeSubscribeSeekReq.SEEK_TO_TIMESTAMP);

    // No seek variant may reach either queue for this seek type.
    for (final ConsensusPrefetchingQueue queue : Arrays.asList(queue1, queue2)) {
      verify(queue, never()).seekToBeginning();
      verify(queue, never()).seekToEnd();
      verify(queue, never()).seekToRegionProgress(any());
      verify(queue, never()).seekAfterRegionProgress(any());
    }
  }

  /**
   * Creates a broker named {@code "broker"} and registers {@code queues} for {@link #TOPIC}.
   *
   * @param queues the mocked queues to register, in order
   * @return the broker with the queues injected
   * @throws Exception if the reflective injection fails
   */
  private static ConsensusSubscriptionBroker newBrokerWithQueues(
      final ConsensusPrefetchingQueue... queues) throws Exception {
    final ConsensusSubscriptionBroker broker = new ConsensusSubscriptionBroker("broker");
    injectQueues(broker, Arrays.asList(queues));
    return broker;
  }

  /**
   * Creates a mocked queue that reports itself as open and bound to the given data region.
   *
   * @param regionId numeric id used to build the queue's {@link DataRegionId}
   * @return the stubbed mock
   */
  private static ConsensusPrefetchingQueue mockQueue(final int regionId) {
    final ConsensusPrefetchingQueue queue = mock(ConsensusPrefetchingQueue.class);
    when(queue.isClosed()).thenReturn(false);
    when(queue.getConsensusGroupId()).thenReturn(new DataRegionId(regionId));
    return queue;
  }

  /**
   * Stores {@code queues} under {@link #TOPIC} in the broker's private {@code
   * topicNameToConsensusPrefetchingQueues} map through reflection.
   *
   * @param broker the broker whose queue map is modified
   * @param queues the queues to register for the topic
   * @throws Exception if the field cannot be found or accessed
   */
  private static void injectQueues(
      final ConsensusSubscriptionBroker broker, final List<ConsensusPrefetchingQueue> queues)
      throws Exception {
    final Field field =
        ConsensusSubscriptionBroker.class.getDeclaredField("topicNameToConsensusPrefetchingQueues");
    field.setAccessible(true);
    // Reflection only yields Object, so the cast is unchecked; the element type is presumed
    // from the field name — TODO confirm against the field's declaration.
    @SuppressWarnings("unchecked")
    final Map<String, List<ConsensusPrefetchingQueue>> topicToQueues =
        (Map<String, List<ConsensusPrefetchingQueue>>) field.get(broker);
    topicToQueues.put(TOPIC, queues);
  }
}
diff --git a/iotdb-core/datanode/src/test/java/org/apache/iotdb/db/subscription/broker/consensus/ConsensusPrefetchingQueueRuntimeStateTest.java b/iotdb-core/datanode/src/test/java/org/apache/iotdb/db/subscription/broker/consensus/ConsensusPrefetchingQueueRuntimeStateTest.java new file mode 100644 index 0000000000000..aa130ce579500 --- /dev/null +++ b/iotdb-core/datanode/src/test/java/org/apache/iotdb/db/subscription/broker/consensus/ConsensusPrefetchingQueueRuntimeStateTest.java @@ -0,0 +1,957 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.iotdb.db.subscription.broker.consensus; + +import org.apache.iotdb.commons.consensus.DataRegionId; +import org.apache.iotdb.consensus.common.request.IndexedConsensusRequest; +import org.apache.iotdb.consensus.iot.IoTConsensusServerImpl; +import org.apache.iotdb.consensus.iot.WriterSafeFrontierTracker; +import org.apache.iotdb.consensus.iot.log.ConsensusReqReader; +import org.apache.iotdb.db.conf.IoTDBDescriptor; +import org.apache.iotdb.db.storageengine.dataregion.wal.node.WALNode; +import org.apache.iotdb.db.subscription.task.execution.ConsensusSubscriptionPrefetchExecutorManager; +import org.apache.iotdb.rpc.subscription.config.TopicConstant; +import org.apache.iotdb.rpc.subscription.payload.poll.RegionProgress; +import org.apache.iotdb.rpc.subscription.payload.poll.WriterId; +import org.apache.iotdb.rpc.subscription.payload.poll.WriterProgress; + +import org.junit.After; +import org.junit.Test; + +import java.io.File; +import java.io.IOException; +import java.lang.reflect.Constructor; +import java.lang.reflect.Field; +import java.lang.reflect.InvocationTargetException; +import java.lang.reflect.Method; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.ArrayDeque; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.Deque; +import java.util.LinkedHashMap; +import java.util.LinkedHashSet; +import java.util.List; +import java.util.Map; +import java.util.NavigableMap; +import java.util.NoSuchElementException; +import java.util.PriorityQueue; +import java.util.TreeMap; +import java.util.function.LongFunction; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertNull; +import static org.junit.Assert.assertSame; +import static org.junit.Assert.assertTrue; +import static org.junit.Assert.fail; +import static org.mockito.ArgumentMatchers.any; +import static 
org.mockito.ArgumentMatchers.anyString; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.when; + +public class ConsensusPrefetchingQueueRuntimeStateTest { + + private final int previousDataNodeId = IoTDBDescriptor.getInstance().getConfig().getDataNodeId(); + + @After + public void tearDown() { + IoTDBDescriptor.getInstance().getConfig().setDataNodeId(previousDataNodeId); + } + + @Test + public void testFollowerQueueRemainsDormantWhenWriterSetIncludesLocalNode() { + IoTDBDescriptor.getInstance().getConfig().setDataNodeId(2); + + final ConsensusPrefetchingQueue queue = createQueue(false); + try { + queue.applyRuntimeState( + new ConsensusRegionRuntimeState(1L, 1, false, new LinkedHashSet<>(Arrays.asList(2, 1)))); + + assertFalse(queue.isActive()); + assertNull(queue.poll("consumer", (RegionProgress) null)); + } finally { + queue.close(); + } + } + + @Test + public void testFormerLeaderIsDeactivatedAfterLeaderTransfer() { + IoTDBDescriptor.getInstance().getConfig().setDataNodeId(1); + + final ConsensusPrefetchingQueue queue = createQueue(true); + try { + queue.applyRuntimeState( + new ConsensusRegionRuntimeState(2L, 2, false, new LinkedHashSet<>(Arrays.asList(2, 1)))); + + assertFalse(queue.isActive()); + assertNull(queue.poll("consumer", (RegionProgress) null)); + } finally { + queue.close(); + } + } + + @Test + public void testResolveCommittedRegionProgressForInitUsesLatestCommitState() { + final ConsensusSubscriptionCommitManager commitManager = + mock(ConsensusSubscriptionCommitManager.class); + final RegionProgress latestCommittedRegionProgress = + new RegionProgress( + Collections.singletonMap( + new WriterId("DataRegion[11]", 2, 5L), new WriterProgress(10L, 3L))); + when(commitManager.getCommittedRegionProgress( + anyString(), anyString(), any(DataRegionId.class))) + .thenReturn(latestCommittedRegionProgress); + + final TestConsensusPrefetchingQueue queue = + createTestQueue(mock(ConsensusReqReader.class), commitManager, null); + try 
{ + assertSame( + latestCommittedRegionProgress, queue.resolveCommittedRegionProgressForInitForTest()); + } finally { + queue.close(); + } + } + + @Test + public void testResolveCommittedRegionProgressForInitFallsBackToConstructorSnapshot() { + final ConsensusSubscriptionCommitManager commitManager = + mock(ConsensusSubscriptionCommitManager.class); + when(commitManager.getCommittedRegionProgress( + anyString(), anyString(), any(DataRegionId.class))) + .thenReturn(new RegionProgress(Collections.emptyMap())); + final RegionProgress fallbackCommittedRegionProgress = + new RegionProgress( + Collections.singletonMap( + new WriterId("DataRegion[11]", 1, 1L), new WriterProgress(20L, 7L))); + + final TestConsensusPrefetchingQueue queue = + createTestQueue( + mock(ConsensusReqReader.class), commitManager, fallbackCommittedRegionProgress); + try { + assertSame( + fallbackCommittedRegionProgress, queue.resolveCommittedRegionProgressForInitForTest()); + } finally { + queue.close(); + } + } + + @Test + public void testInitPrefetchResolvesReplayStartFromCommittedRegionProgress() throws Exception { + final ConsensusSubscriptionCommitManager commitManager = + mock(ConsensusSubscriptionCommitManager.class); + final RegionProgress committedRegionProgress = + new RegionProgress( + Collections.singletonMap( + new WriterId("DataRegion[11]", 2, 5L), new WriterProgress(10L, 3L))); + when(commitManager.getCommittedRegionProgress( + anyString(), anyString(), any(DataRegionId.class))) + .thenReturn(committedRegionProgress); + + final WALNode walNode = mock(WALNode.class); + when(walNode.getLogDirectory()).thenReturn(new File(".")); + final TestConsensusPrefetchingQueue queue = createTestQueue(walNode, commitManager, null); + queue.setLocateDecision( + ConsensusPrefetchingQueue.ReplayLocateDecision.found( + 37L, committedRegionProgress, "test locate")); + try { + queue.initPrefetchForTest(null); + + assertEquals(37L, queue.getCurrentReadSearchIndex()); + assertSame(committedRegionProgress, 
queue.getLastLocatedRegionProgress()); + assertTrue(queue.wasLastSeekAfter()); + assertEquals( + committedRegionProgress.getWriterPositions(), queue.getRecoveryProgressForTest()); + } finally { + queue.close(); + } + } + + @Test + public void testInitPrefetchUsesConsumerHintWhenAheadOfCommittedProgress() throws Exception { + final ConsensusSubscriptionCommitManager commitManager = + mock(ConsensusSubscriptionCommitManager.class); + final WriterId writerId = new WriterId("DataRegion[11]", 2, 5L); + final RegionProgress committedRegionProgress = + new RegionProgress(Collections.singletonMap(writerId, new WriterProgress(10L, 3L))); + final RegionProgress consumerHint = + new RegionProgress(Collections.singletonMap(writerId, new WriterProgress(10L, 4L))); + when(commitManager.getCommittedRegionProgress( + anyString(), anyString(), any(DataRegionId.class))) + .thenReturn(committedRegionProgress); + + final WALNode walNode = mock(WALNode.class); + when(walNode.getLogDirectory()).thenReturn(new File(".")); + final TestConsensusPrefetchingQueue queue = createTestQueue(walNode, commitManager, null); + queue.setLocateDecision( + ConsensusPrefetchingQueue.ReplayLocateDecision.found(55L, consumerHint, "test locate")); + try { + queue.initPrefetchForTest(consumerHint); + + assertEquals(55L, queue.getCurrentReadSearchIndex()); + assertEquals( + consumerHint.getWriterPositions(), + queue.getLastLocatedRegionProgress().getWriterPositions()); + assertTrue(queue.wasLastSeekAfter()); + assertEquals(consumerHint.getWriterPositions(), queue.getRecoveryProgressForTest()); + } finally { + queue.close(); + } + } + + @Test + public void testInitPrefetchMergesCommittedProgressWithPartialConsumerHint() throws Exception { + final ConsensusSubscriptionCommitManager commitManager = + mock(ConsensusSubscriptionCommitManager.class); + final WriterId writerA = new WriterId("DataRegion[11]", 2, 5L); + final WriterId writerB = new WriterId("DataRegion[11]", 3, 6L); + final java.util.LinkedHashMap 
committedWriterProgress = + new java.util.LinkedHashMap<>(); + committedWriterProgress.put(writerA, new WriterProgress(10L, 100L)); + committedWriterProgress.put(writerB, new WriterProgress(20L, 100L)); + final RegionProgress committedRegionProgress = new RegionProgress(committedWriterProgress); + final RegionProgress consumerHint = + new RegionProgress(Collections.singletonMap(writerA, new WriterProgress(10L, 101L))); + when(commitManager.getCommittedRegionProgress( + anyString(), anyString(), any(DataRegionId.class))) + .thenReturn(committedRegionProgress); + + final WALNode walNode = mock(WALNode.class); + when(walNode.getLogDirectory()).thenReturn(new File(".")); + final TestConsensusPrefetchingQueue queue = createTestQueue(walNode, commitManager, null); + final Map expectedRecoveryProgress = new LinkedHashMap<>(); + expectedRecoveryProgress.put(writerA, new WriterProgress(10L, 101L)); + expectedRecoveryProgress.put(writerB, new WriterProgress(20L, 100L)); + queue.setLocateDecision( + ConsensusPrefetchingQueue.ReplayLocateDecision.found( + 88L, new RegionProgress(expectedRecoveryProgress), "test locate")); + try { + queue.initPrefetchForTest(consumerHint); + + assertEquals(88L, queue.getCurrentReadSearchIndex()); + final Map recoveryProgress = queue.getRecoveryProgressForTest(); + assertEquals(2, recoveryProgress.size()); + assertEquals(new WriterProgress(10L, 101L), recoveryProgress.get(writerA)); + assertEquals(new WriterProgress(20L, 100L), recoveryProgress.get(writerB)); + assertEquals(recoveryProgress, queue.getLastLocatedRegionProgress().getWriterPositions()); + } finally { + queue.close(); + } + } + + @Test + public void testInitPrefetchThrowsWhenNonEmptyProgressCannotBeLocated() throws Exception { + final ConsensusSubscriptionCommitManager commitManager = + mock(ConsensusSubscriptionCommitManager.class); + final RegionProgress committedRegionProgress = + new RegionProgress( + Collections.singletonMap( + new WriterId("DataRegion[11]", 2, 5L), new 
WriterProgress(10L, 3L))); + when(commitManager.getCommittedRegionProgress( + anyString(), anyString(), any(DataRegionId.class))) + .thenReturn(committedRegionProgress); + + final WALNode walNode = mock(WALNode.class); + when(walNode.getLogDirectory()).thenReturn(new File(".")); + final TestConsensusPrefetchingQueue queue = createTestQueue(walNode, commitManager, null); + queue.setLocateDecision( + ConsensusPrefetchingQueue.ReplayLocateDecision.locateMiss( + committedRegionProgress, "test locate miss")); + try { + try { + queue.initPrefetchForTest(null); + fail("expected initPrefetch to reject non-empty progress locate miss"); + } catch (final InvocationTargetException e) { + assertTrue(e.getCause() instanceof IllegalStateException); + } + } finally { + queue.close(); + } + } + + @Test + public void testAbortPendingSeekBeforeFirstActivationRestoresInitState() throws Exception { + final ConsensusSubscriptionCommitManager commitManager = + mock(ConsensusSubscriptionCommitManager.class); + final RegionProgress committedRegionProgress = + new RegionProgress( + Collections.singletonMap( + new WriterId("DataRegion[11]", 2, 5L), new WriterProgress(10L, 3L))); + when(commitManager.getCommittedRegionProgress( + anyString(), anyString(), any(DataRegionId.class))) + .thenReturn(committedRegionProgress); + + final WALNode walNode = mock(WALNode.class); + when(walNode.getLogDirectory()).thenReturn(new File(".")); + final TestConsensusPrefetchingQueue queue = createTestQueue(walNode, commitManager, null); + queue.setLocateDecision( + ConsensusPrefetchingQueue.ReplayLocateDecision.found( + 37L, committedRegionProgress, "test locate")); + try { + queue.installPendingSeekForAbortForTest( + 99L, committedRegionProgress, "runtimeStopAbort", false, 0L, 1L); + + queue.abortPendingSeekForRuntimeStop(); + queue.initPrefetchForTest(null); + + assertEquals(0L, queue.getCurrentSeekGeneration()); + assertEquals(37L, queue.getCurrentReadSearchIndex()); + assertSame(committedRegionProgress, 
queue.getLastLocatedRegionProgress()); + assertFalse(queue.hasPendingSeekForTest()); + } finally { + queue.close(); + } + } + + @Test + public void testSeekFailsWhenPrefetchRuntimeUnavailableInsteadOfInlineApply() throws Exception { + ConsensusSubscriptionPrefetchExecutorManager.getInstance().stop(); + + final TestConsensusPrefetchingQueue queue = + createTestQueue( + mock(ConsensusReqReader.class), mock(ConsensusSubscriptionCommitManager.class), null); + try { + try { + queue.seekToBeginning(); + fail("expected seekToBeginning to fail when prefetch runtime is unavailable"); + } catch (final IllegalStateException e) { + assertTrue(e.getMessage().contains("prefetch runtime is unavailable")); + } + + assertEquals(0L, queue.getCurrentSeekGeneration()); + assertEquals(1L, queue.getCurrentReadSearchIndex()); + assertFalse(queue.hasPendingSeekForTest()); + assertFalse(queue.isPrefetchInitializedForTest()); + } finally { + queue.close(); + } + } + + @Test + public void testScanReplayStartTreatsMissingWriterAsUncovered() throws Exception { + final TestConsensusPrefetchingQueue queue = + createTestQueue( + mock(ConsensusReqReader.class), mock(ConsensusSubscriptionCommitManager.class), null); + try { + final WriterId writerA = new WriterId("DataRegion[11]", 2, 5L); + final WriterId writerB = new WriterId("DataRegion[11]", 3, 6L); + final RegionProgress recoveryProgress = + new RegionProgress(Collections.singletonMap(writerA, new WriterProgress(10L, 3L))); + + final List requests = new ArrayList<>(); + requests.add(newIndexedConsensusRequest(30L, 10L, 2, 5L, 3L)); + requests.add(newIndexedConsensusRequest(31L, 11L, 3, 6L, 1L)); + + final ConsensusPrefetchingQueue.ReplayLocateDecision decision = + queue.scanReplayStartForRequestsForTest(requests, recoveryProgress, true); + + assertEquals(ConsensusPrefetchingQueue.ReplayLocateStatus.FOUND, decision.getStatus()); + assertEquals(31L, decision.getStartSearchIndex()); + assertEquals( + recoveryProgress.getWriterPositions(), + 
decision.getRecoveryRegionProgress().getWriterPositions()); + assertTrue(decision.getRecoveryRegionProgress().getWriterPositions().containsKey(writerA)); + assertFalse(decision.getRecoveryRegionProgress().getWriterPositions().containsKey(writerB)); + } finally { + queue.close(); + } + } + + @Test + public void testScanReplayStartReturnsLocateMissForBlockingNonReplayableUncoveredRequest() + throws Exception { + final TestConsensusPrefetchingQueue queue = + createTestQueue( + mock(ConsensusReqReader.class), mock(ConsensusSubscriptionCommitManager.class), null); + try { + final WriterId writerA = new WriterId("DataRegion[11]", 2, 5L); + final RegionProgress recoveryProgress = + new RegionProgress(Collections.singletonMap(writerA, new WriterProgress(10L, 3L))); + + final List requests = new ArrayList<>(); + requests.add(newIndexedConsensusRequest(-1L, 11L, 3, 6L, 1L)); + requests.add(newIndexedConsensusRequest(40L, 12L, 4, 7L, 1L)); + + final ConsensusPrefetchingQueue.ReplayLocateDecision decision = + queue.scanReplayStartForRequestsForTest(requests, recoveryProgress, true); + + assertEquals(ConsensusPrefetchingQueue.ReplayLocateStatus.LOCATE_MISS, decision.getStatus()); + } finally { + queue.close(); + } + } + + @Test + public void testScanReplayStartForSeekToDecrementsExactVisibleWriterFrontiers() throws Exception { + final TestConsensusPrefetchingQueue queue = + createTestQueue( + mock(ConsensusReqReader.class), mock(ConsensusSubscriptionCommitManager.class), null); + try { + final WriterId writerA = new WriterId("DataRegion[11]", 2, 5L); + final WriterId writerB = new WriterId("DataRegion[11]", 3, 6L); + final Map writerProgress = new LinkedHashMap<>(); + writerProgress.put(writerA, new WriterProgress(10L, 3L)); + writerProgress.put(writerB, new WriterProgress(20L, 8L)); + final RegionProgress recoveryProgress = new RegionProgress(writerProgress); + + final List requests = new ArrayList<>(); + requests.add(newIndexedConsensusRequest(30L, 10L, 2, 5L, 3L)); + 
requests.add(newIndexedConsensusRequest(31L, 20L, 3, 6L, 8L)); + + final ConsensusPrefetchingQueue.ReplayLocateDecision decision = + queue.scanReplayStartForRequestsForTest(requests, recoveryProgress, false); + + assertEquals(ConsensusPrefetchingQueue.ReplayLocateStatus.FOUND, decision.getStatus()); + assertEquals(30L, decision.getStartSearchIndex()); + assertEquals( + new WriterProgress(10L, 2L), + decision.getRecoveryRegionProgress().getWriterPositions().get(writerA)); + assertEquals( + new WriterProgress(20L, 7L), + decision.getRecoveryRegionProgress().getWriterPositions().get(writerB)); + } finally { + queue.close(); + } + } + + @Test + public void + testPerWriterRealtimeFrontierDoesNotInjectSyntheticBarrierForMissingPreferredWriterLane() + throws Exception { + final TestConsensusPrefetchingQueue queue = + createTestQueue( + mock(ConsensusReqReader.class), mock(ConsensusSubscriptionCommitManager.class), null); + try { + queue.setOrderMode(TopicConstant.ORDER_MODE_PER_WRITER_VALUE); + queue.setPreferredWriterNodeId(1); + queue.setActiveWriterNodeIds(new LinkedHashSet<>(Arrays.asList(1, 3))); + + addRealtimeEntry(queue, 3, 1L, 100L, 1L, 10L); + + final Object frontier = buildRealtimeLaneFrontiers(queue).peek(); + assertFalse(isBarrier(frontier)); + assertEquals(3, getFrontierWriterNodeId(frontier)); + } finally { + queue.close(); + } + } + + @Test + public void + testMultiWriterRealtimeFrontierStillInjectsSyntheticBarrierForMissingPreferredWriterLane() + throws Exception { + final TestConsensusPrefetchingQueue queue = + createTestQueue( + mock(ConsensusReqReader.class), mock(ConsensusSubscriptionCommitManager.class), null); + try { + queue.setOrderMode(TopicConstant.ORDER_MODE_MULTI_WRITER_VALUE); + queue.setPreferredWriterNodeId(1); + queue.setActiveWriterNodeIds(new LinkedHashSet<>(Arrays.asList(1, 3))); + + addRealtimeEntry(queue, 3, 1L, 100L, 1L, 10L); + + final Object frontier = buildRealtimeLaneFrontiers(queue).peek(); + assertTrue(isBarrier(frontier)); + 
assertEquals(1, getFrontierWriterNodeId(frontier)); + } finally { + queue.close(); + } + } + + @Test + public void testAccumulateFromPendingWaitsForTransientWalGapWithoutSkippingBatch() + throws Exception { + final ConsensusReqReader reqReader = mock(ConsensusReqReader.class); + when(reqReader.getCurrentSearchIndex()).thenReturn(8L); + + final TestConsensusPrefetchingQueue queue = + createTestQueue(reqReader, mock(ConsensusSubscriptionCommitManager.class), null); + queue.setWalIteratorFactory( + startSearchIndex -> + new FakeProgressWALIterator( + Arrays.asList( + Collections.emptyList(), + Arrays.asList( + newIndexedConsensusRequest(5L, 5L, 1, 1L, 5L), + newIndexedConsensusRequest(6L, 6L, 1, 1L, 6L), + newIndexedConsensusRequest(7L, 7L, 1, 1L, 7L))))); + try { + queue.setNextExpectedSearchIndexForTest(5L); + + final boolean accepted = + queue.accumulateFromPendingForTest( + Collections.singletonList(newIndexedConsensusRequest(8L, 8L, 1, 1L, 8L)), + queue.newDeliveryBatchStateForTest(), + queue.getCurrentSeekGeneration(), + Integer.MAX_VALUE, + Long.MAX_VALUE); + + assertTrue(accepted); + assertEquals(9L, queue.getCurrentReadSearchIndex()); + assertEquals(0L, queue.getWalGapSkippedEntries()); + assertEquals(1, queue.getWalGapRetryCount()); + } finally { + queue.close(); + } + } + + @Test + public void testAccumulateFromPendingReturnsFalseWhenSeekChangesDuringWalGapWait() + throws Exception { + final ConsensusReqReader reqReader = mock(ConsensusReqReader.class); + when(reqReader.getCurrentSearchIndex()).thenReturn(8L); + + final TestConsensusPrefetchingQueue queue = + createTestQueue(reqReader, mock(ConsensusSubscriptionCommitManager.class), null); + queue.setWalIteratorFactory( + startSearchIndex -> + new FakeProgressWALIterator(Collections.singletonList(Collections.emptyList()))); + queue.setWalGapRetryHook(queue::incrementSeekGenerationForTest); + try { + queue.setNextExpectedSearchIndexForTest(5L); + + final boolean accepted = + 
queue.accumulateFromPendingForTest( + Collections.singletonList(newIndexedConsensusRequest(8L, 8L, 1, 1L, 8L)), + queue.newDeliveryBatchStateForTest(), + queue.getCurrentSeekGeneration(), + Integer.MAX_VALUE, + Long.MAX_VALUE); + + assertFalse(accepted); + assertEquals(5L, queue.getCurrentReadSearchIndex()); + assertEquals(0L, queue.getWalGapSkippedEntries()); + assertEquals(1, queue.getWalGapRetryCount()); + } finally { + queue.close(); + } + } + + private static ConsensusPrefetchingQueue createQueue(final boolean initialActive) { + final IoTConsensusServerImpl server = mock(IoTConsensusServerImpl.class); + final ConsensusReqReader reqReader = mock(ConsensusReqReader.class); + final WriterSafeFrontierTracker writerSafeFrontierTracker = + mock(WriterSafeFrontierTracker.class); + when(server.getConsensusReqReader()).thenReturn(reqReader); + when(server.getWriterSafeFrontierTracker()).thenReturn(writerSafeFrontierTracker); + when(writerSafeFrontierTracker.snapshotEffectiveSafePts()).thenReturn(Collections.emptyMap()); + when(reqReader.getCurrentSearchIndex()).thenReturn(0L); + + return new ConsensusPrefetchingQueue( + "cg", + "topic", + TopicConstant.ORDER_MODE_MULTI_WRITER_VALUE, + new DataRegionId(11), + server, + mock(ConsensusLogToTabletConverter.class), + mock(ConsensusSubscriptionCommitManager.class), + null, + 1L, + 0L, + initialActive); + } + + private static TestConsensusPrefetchingQueue createTestQueue( + final ConsensusReqReader reqReader, + final ConsensusSubscriptionCommitManager commitManager, + final RegionProgress fallbackCommittedRegionProgress) { + final IoTConsensusServerImpl server = mock(IoTConsensusServerImpl.class); + final WriterSafeFrontierTracker writerSafeFrontierTracker = + mock(WriterSafeFrontierTracker.class); + when(server.getConsensusReqReader()).thenReturn(reqReader); + when(server.getWriterSafeFrontierTracker()).thenReturn(writerSafeFrontierTracker); + 
when(writerSafeFrontierTracker.snapshotEffectiveSafePts()).thenReturn(Collections.emptyMap()); + when(reqReader.getCurrentSearchIndex()).thenReturn(0L); + + return new TestConsensusPrefetchingQueue( + server, + reqReader, + mock(ConsensusLogToTabletConverter.class), + commitManager, + fallbackCommittedRegionProgress); + } + + @SuppressWarnings("unchecked") + private static void addRealtimeEntry( + final ConsensusPrefetchingQueue queue, + final int writerNodeId, + final long writerEpoch, + final long physicalTime, + final long localSeq, + final long searchIndex) + throws Exception { + final Object laneId = newWriterLaneId(writerNodeId, writerEpoch); + final Object preparedEntry = + newPreparedEntry(searchIndex, physicalTime, writerNodeId, writerEpoch, localSeq); + + final Field realtimeEntriesByLaneField = + ConsensusPrefetchingQueue.class.getDeclaredField("realtimeEntriesByLane"); + realtimeEntriesByLaneField.setAccessible(true); + final Map> realtimeEntriesByLane = + (Map>) realtimeEntriesByLaneField.get(queue); + + final NavigableMap laneEntries = new TreeMap<>(); + laneEntries.put(localSeq, preparedEntry); + realtimeEntriesByLane.put(laneId, laneEntries); + } + + private static Object newWriterLaneId(final int writerNodeId, final long writerEpoch) + throws Exception { + final Class writerLaneIdClass = + Class.forName( + "org.apache.iotdb.db.subscription.broker.consensus.ConsensusPrefetchingQueue$WriterLaneId"); + final Constructor constructor = + writerLaneIdClass.getDeclaredConstructor(int.class, long.class); + constructor.setAccessible(true); + return constructor.newInstance(writerNodeId, writerEpoch); + } + + private static Object newPreparedEntry( + final long searchIndex, + final long physicalTime, + final int writerNodeId, + final long writerEpoch, + final long localSeq) + throws Exception { + final Class preparedEntryClass = + Class.forName( + "org.apache.iotdb.db.subscription.broker.consensus.ConsensusPrefetchingQueue$PreparedEntry"); + final Constructor 
constructor = + preparedEntryClass.getDeclaredConstructor( + java.util.List.class, long.class, long.class, int.class, long.class, long.class); + constructor.setAccessible(true); + return constructor.newInstance( + Collections.emptyList(), searchIndex, physicalTime, writerNodeId, writerEpoch, localSeq); + } + + @SuppressWarnings("unchecked") + private static PriorityQueue buildRealtimeLaneFrontiers( + final ConsensusPrefetchingQueue queue) throws Exception { + final Method method = + ConsensusPrefetchingQueue.class.getDeclaredMethod("buildRealtimeLaneFrontiers"); + method.setAccessible(true); + return (PriorityQueue) method.invoke(queue); + } + + private static boolean isBarrier(final Object frontier) throws Exception { + final Field field = frontier.getClass().getDeclaredField("isBarrier"); + field.setAccessible(true); + return field.getBoolean(frontier); + } + + private static int getFrontierWriterNodeId(final Object frontier) throws Exception { + final Field laneIdField = frontier.getClass().getDeclaredField("laneId"); + laneIdField.setAccessible(true); + final Object laneId = laneIdField.get(frontier); + final Field writerNodeIdField = laneId.getClass().getDeclaredField("writerNodeId"); + writerNodeIdField.setAccessible(true); + return writerNodeIdField.getInt(laneId); + } + + private static IndexedConsensusRequest newIndexedConsensusRequest( + final long searchIndex, + final long physicalTime, + final int nodeId, + final long writerEpoch, + final long localSeq) { + return new IndexedConsensusRequest(searchIndex, localSeq, Collections.emptyList()) + .setPhysicalTime(physicalTime) + .setNodeId(nodeId) + .setWriterEpoch(writerEpoch); + } + + private static final class TestConsensusPrefetchingQueue extends ConsensusPrefetchingQueue { + + private ReplayLocateDecision locateDecision = + ReplayLocateDecision.atEnd( + 0L, new RegionProgress(Collections.emptyMap()), "default test locate"); + private RegionProgress lastLocatedRegionProgress; + private boolean 
lastSeekAfter; + private LongFunction walIteratorFactory; + private Runnable walGapRetryHook = () -> {}; + private int walGapRetryCount = 0; + + private TestConsensusPrefetchingQueue( + final IoTConsensusServerImpl server, + final ConsensusReqReader reqReader, + final ConsensusLogToTabletConverter converter, + final ConsensusSubscriptionCommitManager commitManager, + final RegionProgress fallbackCommittedRegionProgress) { + super( + "cg", + "topic", + TopicConstant.ORDER_MODE_MULTI_WRITER_VALUE, + new DataRegionId(11), + server, + converter, + commitManager, + fallbackCommittedRegionProgress, + 1L, + 0L, + true); + if (reqReader instanceof WALNode) { + when(((WALNode) reqReader).getLogDirectory()).thenReturn(new File(".")); + } + } + + @Override + protected ReplayLocateDecision locateReplayStartForRegionProgress( + final RegionProgress regionProgress, final boolean seekAfter) { + this.lastLocatedRegionProgress = regionProgress; + this.lastSeekAfter = seekAfter; + return locateDecision; + } + + @Override + protected ProgressWALIterator createSubscriptionWALIterator(final long startSearchIndex) { + if (walIteratorFactory != null) { + return walIteratorFactory.apply(startSearchIndex); + } + return super.createSubscriptionWALIterator(startSearchIndex); + } + + @Override + protected void onWalGapRetryScheduled() { + walGapRetryCount++; + walGapRetryHook.run(); + } + + private void setLocateDecision(final ReplayLocateDecision locateDecision) { + this.locateDecision = locateDecision; + } + + private void setWalIteratorFactory(final LongFunction walIteratorFactory) { + this.walIteratorFactory = walIteratorFactory; + } + + private void setWalGapRetryHook(final Runnable walGapRetryHook) { + this.walGapRetryHook = walGapRetryHook; + } + + private int getWalGapRetryCount() { + return walGapRetryCount; + } + + private RegionProgress getLastLocatedRegionProgress() { + return lastLocatedRegionProgress; + } + + private boolean wasLastSeekAfter() { + return lastSeekAfter; + } + + 
private ReplayLocateDecision scanReplayStartForRequestsForTest( + final Iterable requests, + final RegionProgress regionProgress, + final boolean seekAfter) { + return scanReplayStartForRequests(requests, regionProgress, seekAfter); + } + + private void initPrefetchForTest(final RegionProgress regionProgress) throws Exception { + final Method method = + ConsensusPrefetchingQueue.class.getDeclaredMethod("initPrefetch", RegionProgress.class); + method.setAccessible(true); + method.invoke(this, regionProgress); + } + + @SuppressWarnings("unchecked") + private Map getRecoveryProgressForTest() throws Exception { + final Field field = + ConsensusPrefetchingQueue.class.getDeclaredField("recoveryWriterProgressByWriter"); + field.setAccessible(true); + return (Map) field.get(this); + } + + private RegionProgress resolveCommittedRegionProgressForInitForTest() { + return resolveCommittedRegionProgressForInit(); + } + + private Object newDeliveryBatchStateForTest() throws Exception { + final Class batchStateClass = + Class.forName( + "org.apache.iotdb.db.subscription.broker.consensus." + + "ConsensusPrefetchingQueue$DeliveryBatchState"); + final Constructor constructor = batchStateClass.getDeclaredConstructor(); + constructor.setAccessible(true); + return constructor.newInstance(); + } + + private boolean accumulateFromPendingForTest( + final List batch, + final Object lingerBatch, + final long expectedSeekGeneration, + final int maxTablets, + final long maxBatchBytes) + throws Exception { + final Class batchStateClass = + Class.forName( + "org.apache.iotdb.db.subscription.broker.consensus." 
+ + "ConsensusPrefetchingQueue$DeliveryBatchState"); + final Method method = + ConsensusPrefetchingQueue.class.getDeclaredMethod( + "accumulateFromPending", + List.class, + batchStateClass, + long.class, + int.class, + long.class); + method.setAccessible(true); + return (boolean) + method.invoke( + this, batch, lingerBatch, expectedSeekGeneration, maxTablets, maxBatchBytes); + } + + private void setNextExpectedSearchIndexForTest(final long nextExpectedSearchIndex) + throws Exception { + final Field field = + ConsensusPrefetchingQueue.class.getDeclaredField("nextExpectedSearchIndex"); + field.setAccessible(true); + ((java.util.concurrent.atomic.AtomicLong) field.get(this)).set(nextExpectedSearchIndex); + } + + private void incrementSeekGenerationForTest() { + try { + final Field field = ConsensusPrefetchingQueue.class.getDeclaredField("seekGeneration"); + field.setAccessible(true); + ((java.util.concurrent.atomic.AtomicLong) field.get(this)).incrementAndGet(); + } catch (final Exception e) { + throw new RuntimeException(e); + } + } + + private void installPendingSeekForAbortForTest( + final long targetSearchIndex, + final RegionProgress committedRegionProgress, + final String seekReason, + final boolean previousPrefetchInitialized, + final long previousSeekGeneration, + final long targetSeekGeneration) + throws Exception { + final Class pendingSeekRequestClass = + Class.forName( + "org.apache.iotdb.db.subscription.broker.consensus." 
+ + "ConsensusPrefetchingQueue$PendingSeekRequest"); + final Constructor constructor = + pendingSeekRequestClass.getDeclaredConstructor( + long.class, + RegionProgress.class, + String.class, + boolean.class, + long.class, + long.class); + constructor.setAccessible(true); + final Object pendingSeekRequest = + constructor.newInstance( + targetSearchIndex, + committedRegionProgress, + seekReason, + previousPrefetchInitialized, + previousSeekGeneration, + targetSeekGeneration); + + final Field pendingSeekRequestField = + ConsensusPrefetchingQueue.class.getDeclaredField("pendingSeekRequest"); + pendingSeekRequestField.setAccessible(true); + pendingSeekRequestField.set(this, pendingSeekRequest); + + final Field prefetchInitializedField = + ConsensusPrefetchingQueue.class.getDeclaredField("prefetchInitialized"); + prefetchInitializedField.setAccessible(true); + prefetchInitializedField.setBoolean(this, true); + + final Field seekGenerationField = + ConsensusPrefetchingQueue.class.getDeclaredField("seekGeneration"); + seekGenerationField.setAccessible(true); + ((java.util.concurrent.atomic.AtomicLong) seekGenerationField.get(this)) + .set(targetSeekGeneration); + } + + private boolean hasPendingSeekForTest() throws Exception { + final Field pendingSeekRequestField = + ConsensusPrefetchingQueue.class.getDeclaredField("pendingSeekRequest"); + pendingSeekRequestField.setAccessible(true); + return pendingSeekRequestField.get(this) != null; + } + + private boolean isPrefetchInitializedForTest() throws Exception { + final Field prefetchInitializedField = + ConsensusPrefetchingQueue.class.getDeclaredField("prefetchInitialized"); + prefetchInitializedField.setAccessible(true); + return prefetchInitializedField.getBoolean(this); + } + } + + private static final class FakeProgressWALIterator extends ProgressWALIterator { + + private final Path tempDir; + private final List> refreshSnapshots; + private final Deque ready = new ArrayDeque<>(); + private int refreshCount = 0; + + 
private FakeProgressWALIterator(final List> refreshSnapshots) { + this(createTempDir(), refreshSnapshots); + } + + private FakeProgressWALIterator( + final Path tempDir, final List> refreshSnapshots) { + super(tempDir.toFile(), Long.MIN_VALUE); + this.tempDir = tempDir; + this.refreshSnapshots = refreshSnapshots; + } + + @Override + public void refresh() { + ready.clear(); + if (refreshCount < refreshSnapshots.size()) { + ready.addAll(refreshSnapshots.get(refreshCount)); + } + refreshCount++; + } + + @Override + public boolean hasNext() { + return !ready.isEmpty(); + } + + @Override + public IndexedConsensusRequest next() { + if (ready.isEmpty()) { + throw new NoSuchElementException(); + } + return ready.removeFirst(); + } + + @Override + public void close() throws IOException { + ready.clear(); + Files.deleteIfExists(tempDir); + } + + private static Path createTempDir() { + try { + return Files.createTempDirectory("consensus-prefetch-gap-fill"); + } catch (final IOException e) { + throw new RuntimeException(e); + } + } + } +} diff --git a/iotdb-core/datanode/src/test/java/org/apache/iotdb/db/subscription/broker/consensus/ConsensusSubscriptionCommitStateTest.java b/iotdb-core/datanode/src/test/java/org/apache/iotdb/db/subscription/broker/consensus/ConsensusSubscriptionCommitStateTest.java new file mode 100644 index 0000000000000..751074893a6a5 --- /dev/null +++ b/iotdb-core/datanode/src/test/java/org/apache/iotdb/db/subscription/broker/consensus/ConsensusSubscriptionCommitStateTest.java @@ -0,0 +1,153 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.iotdb.db.subscription.broker.consensus; + +import org.apache.iotdb.rpc.subscription.payload.poll.RegionProgress; +import org.apache.iotdb.rpc.subscription.payload.poll.WriterId; +import org.apache.iotdb.rpc.subscription.payload.poll.WriterProgress; + +import org.junit.Test; + +import java.io.ByteArrayOutputStream; +import java.io.DataOutputStream; +import java.nio.ByteBuffer; +import java.util.LinkedHashMap; +import java.util.Map; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertNotEquals; +import static org.junit.Assert.assertTrue; + +public class ConsensusSubscriptionCommitStateTest { + + @Test + public void testCommitAdvancesContiguousWriterProgress() { + final WriterId writerId = new WriterId("1_1", 7, 2L); + final Map initialCommitted = new LinkedHashMap<>(); + initialCommitted.put(writerId, new WriterProgress(100L, 0L)); + final ConsensusSubscriptionCommitManager.ConsensusSubscriptionCommitState state = + new ConsensusSubscriptionCommitManager.ConsensusSubscriptionCommitState( + "1_1", new SubscriptionConsensusProgress(new RegionProgress(initialCommitted), 0L)); + + state.recordMapping(writerId, new WriterProgress(101L, 1L)); + state.recordMapping(writerId, new WriterProgress(102L, 2L)); + state.recordMapping(writerId, new WriterProgress(103L, 3L)); + + assertTrue(state.commit(writerId, new WriterProgress(102L, 2L))); + assertEquals(100L, state.getCommittedPhysicalTime()); + assertEquals(0L, state.getCommittedLocalSeq()); + 
assertEquals( + new WriterProgress(100L, 0L), + state.getCommittedRegionProgress().getWriterPositions().get(writerId)); + + assertTrue(state.commit(writerId, new WriterProgress(101L, 1L))); + assertEquals(102L, state.getCommittedPhysicalTime()); + assertEquals(2L, state.getCommittedLocalSeq()); + assertEquals(7, state.getCommittedWriterNodeId()); + assertEquals(2L, state.getCommittedWriterEpoch()); + assertEquals(writerId, state.getCommittedWriterId()); + assertEquals( + new WriterProgress(102L, 2L), + state.getCommittedRegionProgress().getWriterPositions().get(writerId)); + + assertTrue(state.commit(writerId, new WriterProgress(103L, 3L))); + assertEquals(103L, state.getCommittedPhysicalTime()); + assertEquals(3L, state.getCommittedLocalSeq()); + assertEquals(7, state.getCommittedWriterNodeId()); + assertEquals(2L, state.getCommittedWriterEpoch()); + } + + @Test + public void testSerializeDeserializeWriterProgress() throws Exception { + final ConsensusSubscriptionCommitManager.ConsensusSubscriptionCommitState state = + new ConsensusSubscriptionCommitManager.ConsensusSubscriptionCommitState( + "2_5", new SubscriptionConsensusProgress()); + final Map seekProgress = new LinkedHashMap<>(); + final WriterId writerA = new WriterId("2_5", 4, 9L); + final WriterId writerB = new WriterId("2_5", 5, 3L); + seekProgress.put(writerA, new WriterProgress(222L, 11L)); + seekProgress.put(writerB, new WriterProgress(230L, 4L)); + state.resetForSeek(new RegionProgress(seekProgress)); + + final ByteArrayOutputStream baos = new ByteArrayOutputStream(); + try (DataOutputStream dos = new DataOutputStream(baos)) { + state.serialize(dos); + } + + final ConsensusSubscriptionCommitManager.ConsensusSubscriptionCommitState restored = + ConsensusSubscriptionCommitManager.ConsensusSubscriptionCommitState.deserialize( + "2_5", ByteBuffer.wrap(baos.toByteArray())); + + assertEquals(new RegionProgress(seekProgress), restored.getCommittedRegionProgress()); + assertEquals(230L, 
restored.getCommittedPhysicalTime()); + assertEquals(4L, restored.getCommittedLocalSeq()); + assertEquals(5, restored.getCommittedWriterNodeId()); + assertEquals(3L, restored.getCommittedWriterEpoch()); + assertEquals(writerB, restored.getCommittedWriterId()); + assertEquals(new WriterProgress(230L, 4L), restored.getCommittedWriterProgress()); + } + + @Test + public void testDirectCommitWithoutOutstandingRequiresOutstandingMapping() { + final WriterId writerId = new WriterId("3_1", 9, 4L); + final Map initialCommitted = new LinkedHashMap<>(); + initialCommitted.put(writerId, new WriterProgress(100L, 0L)); + final ConsensusSubscriptionCommitManager.ConsensusSubscriptionCommitState state = + new ConsensusSubscriptionCommitManager.ConsensusSubscriptionCommitState( + "3_1", new SubscriptionConsensusProgress(new RegionProgress(initialCommitted), 0L)); + + assertFalse(state.commitWithoutOutstanding(writerId, new WriterProgress(103L, 3L))); + assertEquals(100L, state.getCommittedPhysicalTime()); + assertEquals(0L, state.getCommittedLocalSeq()); + } + + @Test + public void testDirectCommitWithoutOutstandingRespectsOutstandingGap() { + final ConsensusSubscriptionCommitManager.ConsensusSubscriptionCommitState state = + new ConsensusSubscriptionCommitManager.ConsensusSubscriptionCommitState( + "3_2", new SubscriptionConsensusProgress()); + + final WriterId writerId = new WriterId("3_2", 8, 1L); + state.recordMapping(writerId, new WriterProgress(101L, 1L)); + state.recordMapping(writerId, new WriterProgress(102L, 2L)); + state.recordMapping(writerId, new WriterProgress(103L, 3L)); + + assertTrue(state.commitWithoutOutstanding(writerId, new WriterProgress(103L, 3L))); + assertEquals(new WriterProgress(0L, -1L), state.getCommittedWriterProgress()); + + assertTrue(state.commitWithoutOutstanding(writerId, new WriterProgress(101L, 1L))); + assertEquals(new WriterProgress(101L, 1L), state.getCommittedWriterProgress()); + + assertTrue(state.commitWithoutOutstanding(writerId, new 
WriterProgress(102L, 2L))); + assertEquals(new WriterProgress(103L, 3L), state.getCommittedWriterProgress()); + } + + @Test + public void testBroadcastThrottleKeyIsPerWriter() { + final String baseKey = "cg##topic##1_1"; + final WriterId writerA = new WriterId("1_1", 7, 1L); + final WriterId writerB = new WriterId("1_1", 8, 1L); + + assertNotEquals( + ConsensusSubscriptionCommitManager.buildBroadcastKey(baseKey, writerA), + ConsensusSubscriptionCommitManager.buildBroadcastKey(baseKey, writerB)); + } +} diff --git a/iotdb-core/datanode/src/test/java/org/apache/iotdb/db/subscription/broker/consensus/ConsensusSubscriptionSetupHandlerTest.java b/iotdb-core/datanode/src/test/java/org/apache/iotdb/db/subscription/broker/consensus/ConsensusSubscriptionSetupHandlerTest.java new file mode 100644 index 0000000000000..a15fcf7f6a380 --- /dev/null +++ b/iotdb-core/datanode/src/test/java/org/apache/iotdb/db/subscription/broker/consensus/ConsensusSubscriptionSetupHandlerTest.java @@ -0,0 +1,75 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.iotdb.db.subscription.broker.consensus; + +import org.apache.iotdb.commons.consensus.DataRegionId; +import org.apache.iotdb.rpc.subscription.payload.poll.RegionProgress; +import org.apache.iotdb.rpc.subscription.payload.poll.WriterId; +import org.apache.iotdb.rpc.subscription.payload.poll.WriterProgress; + +import org.junit.Test; + +import java.util.Collections; +import java.util.LinkedHashMap; +import java.util.Map; + +import static org.junit.Assert.assertNull; +import static org.junit.Assert.assertSame; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.verify; +import static org.mockito.Mockito.when; + +public class ConsensusSubscriptionSetupHandlerTest { + + @Test + public void testResolveFallbackCommittedRegionProgressUsesRecoveredState() { + final ConsensusSubscriptionCommitManager commitManager = + mock(ConsensusSubscriptionCommitManager.class); + final DataRegionId regionId = new DataRegionId(11); + final Map writerPositions = new LinkedHashMap<>(); + writerPositions.put(new WriterId(regionId.toString(), 3, 7L), new WriterProgress(100L, 9L)); + final RegionProgress committedRegionProgress = new RegionProgress(writerPositions); + when(commitManager.getCommittedRegionProgress("cg", "topic", regionId)) + .thenReturn(committedRegionProgress); + + final RegionProgress resolved = + ConsensusSubscriptionSetupHandler.resolveFallbackCommittedRegionProgress( + commitManager, "cg", "topic", regionId); + + assertSame(committedRegionProgress, resolved); + verify(commitManager).getOrCreateState("cg", "topic", regionId); + } + + @Test + public void testResolveFallbackCommittedRegionProgressReturnsNullForEmptyState() { + final ConsensusSubscriptionCommitManager commitManager = + mock(ConsensusSubscriptionCommitManager.class); + final DataRegionId regionId = new DataRegionId(12); + when(commitManager.getCommittedRegionProgress("cg", "topic", regionId)) + .thenReturn(new RegionProgress(Collections.emptyMap())); + + final 
RegionProgress resolved = + ConsensusSubscriptionSetupHandler.resolveFallbackCommittedRegionProgress( + commitManager, "cg", "topic", regionId); + + assertNull(resolved); + verify(commitManager).getOrCreateState("cg", "topic", regionId); + } +} diff --git a/iotdb-core/datanode/src/test/java/org/apache/iotdb/db/subscription/broker/consensus/ProgressWALIteratorTest.java b/iotdb-core/datanode/src/test/java/org/apache/iotdb/db/subscription/broker/consensus/ProgressWALIteratorTest.java new file mode 100644 index 0000000000000..1045724c51249 --- /dev/null +++ b/iotdb-core/datanode/src/test/java/org/apache/iotdb/db/subscription/broker/consensus/ProgressWALIteratorTest.java @@ -0,0 +1,298 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.iotdb.db.subscription.broker.consensus; + +import org.apache.iotdb.consensus.common.request.IndexedConsensusRequest; +import org.apache.iotdb.db.queryengine.plan.planner.plan.node.PlanNodeType; +import org.apache.iotdb.db.storageengine.dataregion.wal.buffer.WALEntryType; +import org.apache.iotdb.db.storageengine.dataregion.wal.buffer.WALInfoEntry; +import org.apache.iotdb.db.storageengine.dataregion.wal.io.WALFileVersion; +import org.apache.iotdb.db.storageengine.dataregion.wal.io.WALMetaData; +import org.apache.iotdb.db.storageengine.dataregion.wal.io.WALWriter; +import org.apache.iotdb.db.storageengine.dataregion.wal.node.WALNode; +import org.apache.iotdb.db.storageengine.dataregion.wal.utils.WALFileStatus; +import org.apache.iotdb.db.storageengine.dataregion.wal.utils.WALFileUtils; + +import org.junit.Test; + +import java.io.File; +import java.nio.ByteBuffer; +import java.nio.file.Files; +import java.nio.file.Path; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertTrue; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.when; + +public class ProgressWALIteratorTest { + + @Test + public void testIteratorGroupsByLocalSeqAndCarriesWriterMetadata() throws Exception { + final Path dir = Files.createTempDirectory("progress-wal-iterator"); + final File firstWal = + dir.resolve(WALFileUtils.getLogFileName(0, 0, WALFileStatus.CONTAINS_SEARCH_INDEX)) + .toFile(); + final File lastWal = + dir.resolve(WALFileUtils.getLogFileName(1, 12, WALFileStatus.CONTAINS_SEARCH_INDEX)) + .toFile(); + + try { + try (WALWriter writer = new WALWriter(firstWal, WALFileVersion.V3)) { + writer.write( + searchableEntry(5L), singleEntryMeta(19, 5L, 1L, 100L, 5L, 1000L, 7, 3L, 105L)); + writer.write( + searchableEntry(5L), singleEntryMeta(19, 5L, 1L, 100L, 5L, 1000L, 7, 3L, 105L)); + writer.write( + searchableEntry(12L), singleEntryMeta(19, 12L, 1L, 101L, 12L, 
2000L, 7, 4L, 112L)); + } + try (WALWriter ignored = new WALWriter(lastWal, WALFileVersion.V3)) { + // Create a sealed successor so the first WAL becomes historical and readable. + } + + try (ProgressWALIterator iterator = new ProgressWALIterator(dir.toFile(), 6L)) { + assertTrue(iterator.hasNext()); + final IndexedConsensusRequest request = iterator.next(); + assertEquals(12L, request.getSearchIndex()); + assertEquals(112L, request.getProgressLocalSeq()); + assertEquals(2000L, request.getPhysicalTime()); + assertEquals(7, request.getNodeId()); + assertEquals(4L, request.getWriterEpoch()); + assertEquals(1, request.getRequests().size()); + assertFalse(iterator.hasNext()); + } + } finally { + Files.deleteIfExists(firstWal.toPath()); + Files.deleteIfExists(lastWal.toPath()); + Files.deleteIfExists(dir); + } + } + + @Test + public void testIteratorMergesFragmentsWithSameLocalSeq() throws Exception { + final Path dir = Files.createTempDirectory("progress-wal-iterator-merge"); + final File firstWal = + dir.resolve(WALFileUtils.getLogFileName(0, 0, WALFileStatus.CONTAINS_SEARCH_INDEX)) + .toFile(); + final File lastWal = + dir.resolve(WALFileUtils.getLogFileName(1, 9, WALFileStatus.CONTAINS_SEARCH_INDEX)) + .toFile(); + + try { + try (WALWriter writer = new WALWriter(firstWal, WALFileVersion.V3)) { + writer.write(searchableEntry(9L), singleEntryMeta(19, 9L, 1L, 88L, 9L, 900L, 5, 2L, 1009L)); + writer.write(searchableEntry(9L), singleEntryMeta(19, 9L, 1L, 88L, 9L, 900L, 5, 2L, 1009L)); + } + try (WALWriter ignored = new WALWriter(lastWal, WALFileVersion.V3)) { + // Create a sealed successor so the first WAL becomes historical and readable. 
+ } + + try (ProgressWALIterator iterator = new ProgressWALIterator(dir.toFile(), Long.MIN_VALUE)) { + assertTrue(iterator.hasNext()); + final IndexedConsensusRequest request = iterator.next(); + assertEquals(9L, request.getSearchIndex()); + assertEquals(1009L, request.getProgressLocalSeq()); + assertEquals(900L, request.getPhysicalTime()); + assertEquals(5, request.getNodeId()); + assertEquals(2L, request.getWriterEpoch()); + assertEquals(2, request.getRequests().size()); + assertFalse(iterator.hasNext()); + } + } finally { + Files.deleteIfExists(firstWal.toPath()); + Files.deleteIfExists(lastWal.toPath()); + Files.deleteIfExists(dir); + } + } + + @Test + public void testIteratorKeepsDifferentWritersWithSameLocalSeqSeparated() throws Exception { + final Path dir = Files.createTempDirectory("progress-wal-iterator-writers"); + final File firstWal = + dir.resolve(WALFileUtils.getLogFileName(0, 0, WALFileStatus.CONTAINS_SEARCH_INDEX)) + .toFile(); + final File lastWal = + dir.resolve(WALFileUtils.getLogFileName(1, 16, WALFileStatus.CONTAINS_SEARCH_INDEX)) + .toFile(); + + try { + try (WALWriter writer = new WALWriter(firstWal, WALFileVersion.V3)) { + writer.write(searchableEntry(15L), singleEntryMeta(19, 15L, 1L, 1L, 15L, 1500L, 7, 1L, 1L)); + writer.write(searchableEntry(16L), singleEntryMeta(19, 16L, 1L, 2L, 16L, 1501L, 8, 1L, 1L)); + } + try (WALWriter ignored = new WALWriter(lastWal, WALFileVersion.V3)) { + // Create a sealed successor so the first WAL becomes historical and readable. 
+ } + + try (ProgressWALIterator iterator = new ProgressWALIterator(dir.toFile(), Long.MIN_VALUE)) { + assertTrue(iterator.hasNext()); + final IndexedConsensusRequest first = iterator.next(); + assertEquals(15L, first.getSearchIndex()); + assertEquals(1L, first.getProgressLocalSeq()); + assertEquals(7, first.getNodeId()); + + assertTrue(iterator.hasNext()); + final IndexedConsensusRequest second = iterator.next(); + assertEquals(16L, second.getSearchIndex()); + assertEquals(1L, second.getProgressLocalSeq()); + assertEquals(8, second.getNodeId()); + + assertFalse(iterator.hasNext()); + } + } finally { + Files.deleteIfExists(firstWal.toPath()); + Files.deleteIfExists(lastWal.toPath()); + Files.deleteIfExists(dir); + } + } + + @Test + public void testIteratorDoesNotSkipNextWalFileAfterExhaustingCurrentOne() throws Exception { + final Path dir = Files.createTempDirectory("progress-wal-iterator-sequential-files"); + final File firstWal = + dir.resolve(WALFileUtils.getLogFileName(0, 0, WALFileStatus.CONTAINS_SEARCH_INDEX)) + .toFile(); + final File secondWal = + dir.resolve(WALFileUtils.getLogFileName(1, 1, WALFileStatus.CONTAINS_SEARCH_INDEX)) + .toFile(); + final File thirdWal = + dir.resolve(WALFileUtils.getLogFileName(2, 2, WALFileStatus.CONTAINS_SEARCH_INDEX)) + .toFile(); + + try { + try (WALWriter writer = new WALWriter(firstWal, WALFileVersion.V3)) { + writer.write(searchableEntry(1L), singleEntryMeta(19, 1L, 1L, 1L, 1L, 100L, 7, 1L, 1L)); + } + try (WALWriter writer = new WALWriter(secondWal, WALFileVersion.V3)) { + writer.write(searchableEntry(2L), singleEntryMeta(19, 2L, 1L, 2L, 2L, 200L, 7, 1L, 2L)); + } + try (WALWriter writer = new WALWriter(thirdWal, WALFileVersion.V3)) { + writer.write(searchableEntry(3L), singleEntryMeta(19, 3L, 1L, 3L, 3L, 300L, 7, 1L, 3L)); + } + + try (ProgressWALIterator iterator = new ProgressWALIterator(dir.toFile(), Long.MIN_VALUE)) { + assertTrue(iterator.hasNext()); + assertEquals(1L, iterator.next().getSearchIndex()); + + 
assertTrue(iterator.hasNext()); + assertEquals(2L, iterator.next().getSearchIndex()); + + assertTrue(iterator.hasNext()); + assertEquals(3L, iterator.next().getSearchIndex()); + + assertFalse(iterator.hasNext()); + } + } finally { + Files.deleteIfExists(firstWal.toPath()); + Files.deleteIfExists(secondWal.toPath()); + Files.deleteIfExists(thirdWal.toPath()); + Files.deleteIfExists(dir); + } + } + + @Test + public void testFollowerEntryDoesNotSynthesizeSearchIndexFromProgressLocalSeq() throws Exception { + final Path dir = Files.createTempDirectory("progress-wal-iterator-follower"); + final File firstWal = + dir.resolve(WALFileUtils.getLogFileName(0, 0, WALFileStatus.CONTAINS_SEARCH_INDEX)) + .toFile(); + final File lastWal = + dir.resolve(WALFileUtils.getLogFileName(1, 0, WALFileStatus.CONTAINS_SEARCH_INDEX)) + .toFile(); + + try { + try (WALWriter writer = new WALWriter(firstWal, WALFileVersion.V3)) { + writer.write( + searchableEntry(-1L), singleEntryMeta(19, -1L, 1L, 77L, -1L, 900L, 5, 2L, 1009L)); + } + try (WALWriter ignored = new WALWriter(lastWal, WALFileVersion.V3)) { + // Create a readable successor for the first WAL file. 
+ } + + try (ProgressWALIterator iterator = new ProgressWALIterator(dir.toFile(), Long.MIN_VALUE)) { + assertTrue(iterator.hasNext()); + final IndexedConsensusRequest request = iterator.next(); + assertEquals(-1L, request.getSearchIndex()); + assertEquals(1009L, request.getProgressLocalSeq()); + assertEquals(900L, request.getPhysicalTime()); + assertEquals(5, request.getNodeId()); + assertEquals(2L, request.getWriterEpoch()); + assertFalse(iterator.hasNext()); + } + } finally { + Files.deleteIfExists(firstWal.toPath()); + Files.deleteIfExists(lastWal.toPath()); + Files.deleteIfExists(dir); + } + } + + @Test + public void testIteratorMarksIncompleteScanWhenNearLiveWalCannotBeOpened() throws Exception { + final Path dir = Files.createTempDirectory("progress-wal-iterator-incomplete-scan"); + final File brokenLiveWal = + dir.resolve(WALFileUtils.getLogFileName(7, 0, WALFileStatus.CONTAINS_SEARCH_INDEX)) + .toFile(); + + try { + assertTrue(brokenLiveWal.mkdir()); + + final WALNode walNode = mock(WALNode.class); + when(walNode.getLogDirectory()).thenReturn(dir.toFile()); + when(walNode.getCurrentWALFileVersion()).thenReturn(7L); + when(walNode.getCurrentWALMetaDataSnapshot()).thenReturn(new WALMetaData()); + + try (ProgressWALIterator iterator = new ProgressWALIterator(walNode, Long.MIN_VALUE)) { + assertFalse(iterator.hasNext()); + assertTrue(iterator.hasIncompleteScan()); + assertTrue(iterator.hasReadError()); + assertTrue(iterator.getIncompleteScanDetail().contains("near-live WAL file")); + } + } finally { + Files.deleteIfExists(brokenLiveWal.toPath()); + Files.deleteIfExists(dir); + } + } + + private static ByteBuffer searchableEntry(final long bodySearchIndex) { + final ByteBuffer buffer = + ByteBuffer.allocate(WALInfoEntry.FIXED_SERIALIZED_SIZE + PlanNodeType.BYTES + Long.BYTES); + buffer.put(WALEntryType.INSERT_ROW_NODE.getCode()); + buffer.putLong(1L); + buffer.putShort(PlanNodeType.INSERT_ROW.getNodeType()); + buffer.putLong(bodySearchIndex); + return buffer; + 
} + + private static WALMetaData singleEntryMeta( + final int size, + final long searchIndex, + final long memTableId, + final long epoch, + final long syncIndex, + final long physicalTime, + final int nodeId, + final long writerEpoch, + final long localSeq) { + final WALMetaData metaData = new WALMetaData(); + metaData.add(size, searchIndex, memTableId, physicalTime, nodeId, writerEpoch, localSeq); + return metaData; + } +} diff --git a/iotdb-core/node-commons/src/assembly/resources/conf/iotdb-system.properties b/iotdb-core/node-commons/src/assembly/resources/conf/iotdb-system.properties index 4015a4b2f3e92..021eaac902401 100644 --- a/iotdb-core/node-commons/src/assembly/resources/conf/iotdb-system.properties +++ b/iotdb-core/node-commons/src/assembly/resources/conf/iotdb-system.properties @@ -54,6 +54,12 @@ dn_data_region_consensus_port=10760 schema_replication_factor=1 data_replication_factor=1 +#################### +### Subscription Consensus Configuration +#################### + +# subscription_consensus_idle_safe_hlc_interval_ms=10000 + #################### ### Directory Configuration #################### @@ -70,4 +76,3 @@ cn_metric_prometheus_reporter_port=9091 # dn_metric_reporter_list= dn_metric_prometheus_reporter_port=9092 - diff --git a/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/concurrent/ThreadName.java b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/concurrent/ThreadName.java index 81f2aa7156cf7..e7fcdca7c1b2a 100644 --- a/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/concurrent/ThreadName.java +++ b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/concurrent/ThreadName.java @@ -156,6 +156,8 @@ public enum ThreadName { PIPE_TERMINATE_EXECUTION_POOL("Pipe-Terminate-Execution-Pool"), LOAD_DATATYPE_CONVERT_POOL("Load-Datatype-Convert-Pool"), SUBSCRIPTION_EXECUTOR_POOL("Subscription-Executor-Pool"), + 
// Defaults for the consensus-based subscription settings added by this change. Units are implied
// only by the field-name suffixes (InMs / Ms / InBytes) — TODO confirm against the readers of
// these values.

// Batching limits: delay, size (8 * MB, where MB is a constant declared elsewhere in this class),
// tablet count and WAL entry count.
private int subscriptionConsensusBatchMaxDelayInMs = 50;
private long subscriptionConsensusBatchMaxSizeInBytes = 8 * MB;
private int subscriptionConsensusBatchMaxTabletCount = 64;
private int subscriptionConsensusBatchMaxWalEntries = 128;

private long subscriptionConsensusWalRetentionSizeInBytes = 512 * MB;

// Commit-progress persistence: interval (unit not visible here) and whether to fsync.
private int subscriptionConsensusCommitPersistInterval = 100;
private boolean subscriptionConsensusCommitFsyncEnabled = false;

private long subscriptionConsensusConsumerEvictionTimeoutMs = 60_000;

private boolean subscriptionConsensusLagBasedPriority = true;

private int subscriptionConsensusPrefetchingQueueCapacity = 256;

private boolean subscriptionConsensusWatermarkEnabled = false;

private long subscriptionConsensusWatermarkIntervalMs = 1000;

// NOTE(review): the sample entry added to iotdb-system.properties in the same change reads
// "# subscription_consensus_idle_safe_hlc_interval_ms=10000", which does not match this default
// of 1_000 — confirm which value is intended and align the other.
private long subscriptionConsensusIdleSafeHlcIntervalMs = 1_000;
this.subscriptionConsensusBatchMaxSizeInBytes = subscriptionConsensusBatchMaxSizeInBytes; + } + + public int getSubscriptionConsensusBatchMaxTabletCount() { + return subscriptionConsensusBatchMaxTabletCount; + } + + public int getSubscriptionConsensusCommitPersistInterval() { + return subscriptionConsensusCommitPersistInterval; + } + + public void setSubscriptionConsensusCommitPersistInterval( + final int subscriptionConsensusCommitPersistInterval) { + this.subscriptionConsensusCommitPersistInterval = subscriptionConsensusCommitPersistInterval; + } + + public boolean isSubscriptionConsensusCommitFsyncEnabled() { + return subscriptionConsensusCommitFsyncEnabled; + } + + public void setSubscriptionConsensusCommitFsyncEnabled( + final boolean subscriptionConsensusCommitFsyncEnabled) { + this.subscriptionConsensusCommitFsyncEnabled = subscriptionConsensusCommitFsyncEnabled; + } + + public long getSubscriptionConsensusConsumerEvictionTimeoutMs() { + return subscriptionConsensusConsumerEvictionTimeoutMs; + } + + public void setSubscriptionConsensusConsumerEvictionTimeoutMs( + final long subscriptionConsensusConsumerEvictionTimeoutMs) { + this.subscriptionConsensusConsumerEvictionTimeoutMs = + subscriptionConsensusConsumerEvictionTimeoutMs; + } + + public boolean isSubscriptionConsensusLagBasedPriority() { + return subscriptionConsensusLagBasedPriority; + } + + public void setSubscriptionConsensusLagBasedPriority( + final boolean subscriptionConsensusLagBasedPriority) { + this.subscriptionConsensusLagBasedPriority = subscriptionConsensusLagBasedPriority; + } + + public int getSubscriptionConsensusPrefetchingQueueCapacity() { + return subscriptionConsensusPrefetchingQueueCapacity; + } + + public void setSubscriptionConsensusPrefetchingQueueCapacity( + final int subscriptionConsensusPrefetchingQueueCapacity) { + this.subscriptionConsensusPrefetchingQueueCapacity = + subscriptionConsensusPrefetchingQueueCapacity; + } + + public boolean 
isSubscriptionConsensusWatermarkEnabled() { + return subscriptionConsensusWatermarkEnabled; + } + + public void setSubscriptionConsensusWatermarkEnabled( + final boolean subscriptionConsensusWatermarkEnabled) { + this.subscriptionConsensusWatermarkEnabled = subscriptionConsensusWatermarkEnabled; + } + + public long getSubscriptionConsensusWatermarkIntervalMs() { + return subscriptionConsensusWatermarkIntervalMs; + } + + public void setSubscriptionConsensusWatermarkIntervalMs( + final long subscriptionConsensusWatermarkIntervalMs) { + this.subscriptionConsensusWatermarkIntervalMs = subscriptionConsensusWatermarkIntervalMs; + } + + public long getSubscriptionConsensusIdleSafeHlcIntervalMs() { + return subscriptionConsensusIdleSafeHlcIntervalMs; + } + + public void setSubscriptionConsensusIdleSafeHlcIntervalMs( + final long subscriptionConsensusIdleSafeHlcIntervalMs) { + this.subscriptionConsensusIdleSafeHlcIntervalMs = subscriptionConsensusIdleSafeHlcIntervalMs; + } + + public void setSubscriptionConsensusBatchMaxTabletCount( + final int subscriptionConsensusBatchMaxTabletCount) { + this.subscriptionConsensusBatchMaxTabletCount = subscriptionConsensusBatchMaxTabletCount; + } + + public int getSubscriptionConsensusBatchMaxWalEntries() { + return subscriptionConsensusBatchMaxWalEntries; + } + + public void setSubscriptionConsensusBatchMaxWalEntries( + final int subscriptionConsensusBatchMaxWalEntries) { + this.subscriptionConsensusBatchMaxWalEntries = subscriptionConsensusBatchMaxWalEntries; + } + + public long getSubscriptionConsensusWalRetentionSizeInBytes() { + return subscriptionConsensusWalRetentionSizeInBytes; + } + + public void setSubscriptionConsensusWalRetentionSizeInBytes( + final long subscriptionConsensusWalRetentionSizeInBytes) { + this.subscriptionConsensusWalRetentionSizeInBytes = + subscriptionConsensusWalRetentionSizeInBytes; + } + public void setSubscriptionMetaSyncerSyncIntervalMinutes( long subscriptionMetaSyncerSyncIntervalMinutes) { 
this.subscriptionMetaSyncerSyncIntervalMinutes = subscriptionMetaSyncerSyncIntervalMinutes; diff --git a/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/conf/CommonDescriptor.java b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/conf/CommonDescriptor.java index d392a60bbbd76..8d71f9b9ac7f8 100644 --- a/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/conf/CommonDescriptor.java +++ b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/conf/CommonDescriptor.java @@ -293,6 +293,11 @@ private void loadSubscriptionProps(TrimProperties properties) { properties.getProperty( "subscription_subtask_executor_max_thread_num", Integer.toString(config.getSubscriptionSubtaskExecutorMaxThreadNum())))); + config.setSubscriptionConsensusPrefetchExecutorMaxThreadNum( + Integer.parseInt( + properties.getProperty( + "subscription_consensus_prefetch_executor_max_thread_num", + Integer.toString(config.getSubscriptionConsensusPrefetchExecutorMaxThreadNum())))); config.setSubscriptionPrefetchTabletBatchMaxDelayInMs( Integer.parseInt( @@ -417,6 +422,67 @@ private void loadSubscriptionProps(TrimProperties properties) { properties.getProperty( "subscription_meta_syncer_sync_interval_minutes", String.valueOf(config.getSubscriptionMetaSyncerSyncIntervalMinutes())))); + + config.setSubscriptionConsensusBatchMaxDelayInMs( + Integer.parseInt( + properties.getProperty( + "subscription_consensus_batch_max_delay_in_ms", + String.valueOf(config.getSubscriptionConsensusBatchMaxDelayInMs())))); + config.setSubscriptionConsensusBatchMaxSizeInBytes( + Long.parseLong( + properties.getProperty( + "subscription_consensus_batch_max_size_in_bytes", + String.valueOf(config.getSubscriptionConsensusBatchMaxSizeInBytes())))); + config.setSubscriptionConsensusBatchMaxTabletCount( + Integer.parseInt( + properties.getProperty( + "subscription_consensus_batch_max_tablet_count", + String.valueOf(config.getSubscriptionConsensusBatchMaxTabletCount())))); + 
config.setSubscriptionConsensusBatchMaxWalEntries( + Integer.parseInt( + properties.getProperty( + "subscription_consensus_batch_max_wal_entries", + String.valueOf(config.getSubscriptionConsensusBatchMaxWalEntries())))); + config.setSubscriptionConsensusCommitPersistInterval( + Integer.parseInt( + properties.getProperty( + "subscription_consensus_commit_persist_interval", + String.valueOf(config.getSubscriptionConsensusCommitPersistInterval())))); + config.setSubscriptionConsensusCommitFsyncEnabled( + Boolean.parseBoolean( + properties.getProperty( + "subscription_consensus_commit_fsync_enabled", + String.valueOf(config.isSubscriptionConsensusCommitFsyncEnabled())))); + config.setSubscriptionConsensusConsumerEvictionTimeoutMs( + Long.parseLong( + properties.getProperty( + "subscription_consensus_consumer_eviction_timeout_ms", + String.valueOf(config.getSubscriptionConsensusConsumerEvictionTimeoutMs())))); + config.setSubscriptionConsensusLagBasedPriority( + Boolean.parseBoolean( + properties.getProperty( + "subscription_consensus_lag_based_priority", + String.valueOf(config.isSubscriptionConsensusLagBasedPriority())))); + config.setSubscriptionConsensusPrefetchingQueueCapacity( + Integer.parseInt( + properties.getProperty( + "subscription_consensus_prefetching_queue_capacity", + String.valueOf(config.getSubscriptionConsensusPrefetchingQueueCapacity())))); + config.setSubscriptionConsensusWatermarkEnabled( + Boolean.parseBoolean( + properties.getProperty( + "subscription_consensus_watermark_enabled", + String.valueOf(config.isSubscriptionConsensusWatermarkEnabled())))); + config.setSubscriptionConsensusWatermarkIntervalMs( + Long.parseLong( + properties.getProperty( + "subscription_consensus_watermark_interval_ms", + String.valueOf(config.getSubscriptionConsensusWatermarkIntervalMs())))); + config.setSubscriptionConsensusIdleSafeHlcIntervalMs( + Long.parseLong( + properties.getProperty( + "subscription_consensus_idle_safe_hlc_interval_ms", + 
String.valueOf(config.getSubscriptionConsensusIdleSafeHlcIntervalMs())))); } public void loadRetryProperties(TrimProperties properties) throws IOException { diff --git a/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/service/metric/enums/Metric.java b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/service/metric/enums/Metric.java index ec16c181e618b..cbeef1f2c7c7e 100644 --- a/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/service/metric/enums/Metric.java +++ b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/service/metric/enums/Metric.java @@ -206,6 +206,10 @@ public enum Metric { SUBSCRIPTION_UNCOMMITTED_EVENT_COUNT("subscription_uncommitted_event_count"), SUBSCRIPTION_CURRENT_COMMIT_ID("subscription_current_commit_id"), SUBSCRIPTION_EVENT_TRANSFER("subscription_event_transfer"), + SUBSCRIPTION_CONSENSUS_LAG("subscription_consensus_lag"), + SUBSCRIPTION_CONSENSUS_WAL_GAP("subscription_consensus_wal_gap"), + SUBSCRIPTION_CONSENSUS_ROUTING_EPOCH_CHANGE("subscription_consensus_routing_epoch_change"), + SUBSCRIPTION_CONSENSUS_WATERMARK("subscription_consensus_watermark"), // load related ACTIVE_LOADING_FILES_NUMBER("active_loading_files_number"), ACTIVE_LOADING_FILES_SIZE("active_loading_files_size"), diff --git a/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/subscription/config/SubscriptionConfig.java b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/subscription/config/SubscriptionConfig.java index c7e7fea8d12f8..e09fa99615dba 100644 --- a/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/subscription/config/SubscriptionConfig.java +++ b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/subscription/config/SubscriptionConfig.java @@ -30,7 +30,7 @@ public class SubscriptionConfig { private static final CommonConfig COMMON_CONFIG = CommonDescriptor.getInstance().getConfig(); public boolean getSubscriptionEnabled() { - return false; + return true; 
// TODO: make it configurable after subscription is stable } public float getSubscriptionCacheMemoryUsagePercentage() { @@ -41,6 +41,10 @@ public int getSubscriptionSubtaskExecutorMaxThreadNum() { return COMMON_CONFIG.getSubscriptionSubtaskExecutorMaxThreadNum(); } + public int getSubscriptionConsensusPrefetchExecutorMaxThreadNum() { + return COMMON_CONFIG.getSubscriptionConsensusPrefetchExecutorMaxThreadNum(); + } + public int getSubscriptionPrefetchTabletBatchMaxDelayInMs() { return COMMON_CONFIG.getSubscriptionPrefetchTabletBatchMaxDelayInMs(); } @@ -137,6 +141,54 @@ public long getSubscriptionMetaSyncerSyncIntervalMinutes() { return COMMON_CONFIG.getSubscriptionMetaSyncerSyncIntervalMinutes(); } + // Consensus subscription batching parameters + public int getSubscriptionConsensusBatchMaxDelayInMs() { + return COMMON_CONFIG.getSubscriptionConsensusBatchMaxDelayInMs(); + } + + public long getSubscriptionConsensusBatchMaxSizeInBytes() { + return COMMON_CONFIG.getSubscriptionConsensusBatchMaxSizeInBytes(); + } + + public int getSubscriptionConsensusBatchMaxTabletCount() { + return COMMON_CONFIG.getSubscriptionConsensusBatchMaxTabletCount(); + } + + public int getSubscriptionConsensusBatchMaxWalEntries() { + return COMMON_CONFIG.getSubscriptionConsensusBatchMaxWalEntries(); + } + + public int getSubscriptionConsensusCommitPersistInterval() { + return COMMON_CONFIG.getSubscriptionConsensusCommitPersistInterval(); + } + + public boolean isSubscriptionConsensusCommitFsyncEnabled() { + return COMMON_CONFIG.isSubscriptionConsensusCommitFsyncEnabled(); + } + + public long getSubscriptionConsensusConsumerEvictionTimeoutMs() { + return COMMON_CONFIG.getSubscriptionConsensusConsumerEvictionTimeoutMs(); + } + + public boolean isSubscriptionConsensusLagBasedPriority() { + return COMMON_CONFIG.isSubscriptionConsensusLagBasedPriority(); + } + + public int getSubscriptionConsensusPrefetchingQueueCapacity() { + return 
COMMON_CONFIG.getSubscriptionConsensusPrefetchingQueueCapacity(); + } + + public long getSubscriptionConsensusWatermarkIntervalMs() { + if (!COMMON_CONFIG.isSubscriptionConsensusWatermarkEnabled()) { + return -1; + } + return COMMON_CONFIG.getSubscriptionConsensusWatermarkIntervalMs(); + } + + public long getSubscriptionConsensusIdleSafeHlcIntervalMs() { + return COMMON_CONFIG.getSubscriptionConsensusIdleSafeHlcIntervalMs(); + } + /////////////////////////////// Utils /////////////////////////////// private static final Logger LOGGER = LoggerFactory.getLogger(SubscriptionConfig.class); @@ -147,6 +199,9 @@ public void printAllConfigs() { LOGGER.info( "SubscriptionSubtaskExecutorMaxThreadNum: {}", getSubscriptionSubtaskExecutorMaxThreadNum()); + LOGGER.info( + "SubscriptionConsensusPrefetchExecutorMaxThreadNum: {}", + getSubscriptionConsensusPrefetchExecutorMaxThreadNum()); LOGGER.info( "SubscriptionPrefetchTabletBatchMaxDelayInMs: {}", @@ -207,6 +262,21 @@ public void printAllConfigs() { LOGGER.info( "SubscriptionMetaSyncerSyncIntervalMinutes: {}", getSubscriptionMetaSyncerSyncIntervalMinutes()); + + LOGGER.info( + "SubscriptionConsensusBatchMaxDelayInMs: {}", getSubscriptionConsensusBatchMaxDelayInMs()); + LOGGER.info( + "SubscriptionConsensusBatchMaxSizeInBytes: {}", + getSubscriptionConsensusBatchMaxSizeInBytes()); + LOGGER.info( + "SubscriptionConsensusBatchMaxTabletCount: {}", + getSubscriptionConsensusBatchMaxTabletCount()); + LOGGER.info( + "SubscriptionConsensusBatchMaxWalEntries: {}", + getSubscriptionConsensusBatchMaxWalEntries()); + LOGGER.info( + "SubscriptionConsensusIdleSafeHlcIntervalMs: {}", + getSubscriptionConsensusIdleSafeHlcIntervalMs()); } /////////////////////////////// Singleton /////////////////////////////// diff --git a/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/subscription/meta/consumer/CommitProgressKeeper.java 
b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/subscription/meta/consumer/CommitProgressKeeper.java new file mode 100644 index 0000000000000..9e5f6e03779bd --- /dev/null +++ b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/subscription/meta/consumer/CommitProgressKeeper.java @@ -0,0 +1,192 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
/**
 * In-memory store of committed subscription progress, held as one opaque serialized blob per key.
 *
 * <p>Keys are produced by {@link #generateKey(String, String, String, int)}. Every buffer that
 * enters or leaves this class is defensively copied into a fresh read-only buffer, so callers can
 * neither mutate the stored bytes nor disturb them by moving the position of a buffer they hold.
 *
 * <p>The backing map is a {@link ConcurrentHashMap}; individual operations are thread-safe, but
 * {@link #replaceAll(Map)} and {@link #processLoadSnapshot(FileInputStream)} are not atomic as a
 * whole (they clear first and then repopulate).
 *
 * <p>Wire format shared by the snapshot and stream serializers (all integers big-endian):
 * {@code int entryCount, then per entry: int keyLength, UTF-8 key bytes, int valueLength, value
 * bytes}.
 */
public class CommitProgressKeeper {

  private static final String KEY_SEPARATOR = "##";

  private final Map<String, ByteBuffer> regionProgressMap = new ConcurrentHashMap<>();

  public CommitProgressKeeper() {}

  /**
   * Builds the map key {@code consumerGroupId##topicName##regionId##dataNodeId}.
   *
   * @return the four components joined with {@code "##"}
   */
  public static String generateKey(
      final String consumerGroupId,
      final String topicName,
      final String regionId,
      final int dataNodeId) {
    return consumerGroupId
        + KEY_SEPARATOR
        + topicName
        + KEY_SEPARATOR
        + regionId
        + KEY_SEPARATOR
        + dataNodeId;
  }

  /**
   * Stores a private copy of the given progress under {@code key}; a {@code null} progress is
   * ignored and leaves any existing value untouched.
   */
  public void updateRegionProgress(final String key, final ByteBuffer committedRegionProgress) {
    if (Objects.isNull(committedRegionProgress)) {
      return;
    }
    regionProgressMap.put(key, copyBuffer(committedRegionProgress));
  }

  /**
   * @return a fresh read-only copy of the progress stored under {@code key}, or {@code null} when
   *     nothing is stored
   */
  public ByteBuffer getRegionProgress(final String key) {
    final ByteBuffer buffer = regionProgressMap.get(key);
    return Objects.nonNull(buffer) ? copyBuffer(buffer) : null;
  }

  /**
   * @return a new mutable map holding a read-only copy of every stored progress; changes to the
   *     returned map do not affect this keeper
   */
  public Map<String, ByteBuffer> getAllRegionProgress() {
    final Map<String, ByteBuffer> result = new HashMap<>(regionProgressMap.size());
    regionProgressMap.forEach((key, value) -> result.put(key, copyBuffer(value)));
    return result;
  }

  /**
   * Discards the current content and loads copies of the non-null values of the given map. A
   * {@code null} map simply clears the keeper.
   */
  public void replaceAll(final Map<String, ByteBuffer> newRegionProgressMap) {
    regionProgressMap.clear();
    if (Objects.nonNull(newRegionProgressMap)) {
      for (final Map.Entry<String, ByteBuffer> entry : newRegionProgressMap.entrySet()) {
        if (Objects.nonNull(entry.getValue())) {
          regionProgressMap.put(entry.getKey(), copyBuffer(entry.getValue()));
        }
      }
    }
  }

  public boolean isEmpty() {
    return regionProgressMap.isEmpty();
  }

  /**
   * Appends all entries to the snapshot stream using the class wire format.
   *
   * @throws IOException if writing to the stream fails
   */
  public void processTakeSnapshot(final FileOutputStream fileOutputStream) throws IOException {
    // Freeze the entries first: the live map may change while we write, and the entry count in
    // the header must match the number of records that follow or the snapshot cannot be loaded.
    final Map<String, ByteBuffer> frozen = getAllRegionProgress();
    fileOutputStream.write(ByteBuffer.allocate(Integer.BYTES).putInt(frozen.size()).array());
    for (final Map.Entry<String, ByteBuffer> entry : frozen.entrySet()) {
      final byte[] keyBytes = entry.getKey().getBytes(java.nio.charset.StandardCharsets.UTF_8);
      final byte[] progressBytes = toByteArray(entry.getValue());
      final ByteBuffer record =
          ByteBuffer.allocate(
              Integer.BYTES + keyBytes.length + Integer.BYTES + progressBytes.length);
      record.putInt(keyBytes.length).put(keyBytes);
      record.putInt(progressBytes.length).put(progressBytes);
      fileOutputStream.write(record.array());
    }
  }

  /**
   * Replaces the content of this keeper with the entries read from the snapshot stream.
   *
   * <p>A stream that is already at end-of-file is treated as an empty snapshot. The keeper is
   * cleared up front and only repopulated once the whole section has been parsed, so a failed load
   * leaves it empty rather than partially filled.
   *
   * @throws IOException if the stream ends in the middle of the section or contains a negative
   *     count or length
   */
  public void processLoadSnapshot(final FileInputStream fileInputStream) throws IOException {
    regionProgressMap.clear();

    final byte[] sizeBytes = new byte[Integer.BYTES];
    final int headerBytesRead = readUpTo(fileInputStream, sizeBytes);
    if (headerBytesRead == 0) {
      // Nothing was written for this section: empty snapshot.
      return;
    }
    if (headerBytesRead != sizeBytes.length) {
      throw new IOException("Unexpected EOF reading region progress count");
    }
    final int regionSize = ByteBuffer.wrap(sizeBytes).getInt();
    if (regionSize < 0) {
      throw new IOException("Corrupted region progress snapshot: negative count " + regionSize);
    }

    final Map<String, ByteBuffer> loaded = new HashMap<>();
    for (int i = 0; i < regionSize; i++) {
      final int keyLen = readLength(fileInputStream, "key length");
      final byte[] keyBytes = readPayload(fileInputStream, keyLen, "key");
      final int valueLen = readLength(fileInputStream, "value length");
      final byte[] valueBytes = readPayload(fileInputStream, valueLen, "value");
      loaded.put(
          new String(keyBytes, java.nio.charset.StandardCharsets.UTF_8),
          ByteBuffer.wrap(valueBytes).asReadOnlyBuffer());
    }
    regionProgressMap.putAll(loaded);
  }

  /**
   * Writes all entries to the given stream using the class wire format.
   *
   * @throws IOException if writing to the stream fails
   */
  public void serializeToStream(final java.io.DataOutputStream stream) throws IOException {
    // Same reasoning as in processTakeSnapshot: count and records must come from one frozen view.
    final Map<String, ByteBuffer> frozen = getAllRegionProgress();
    stream.writeInt(frozen.size());
    for (final Map.Entry<String, ByteBuffer> entry : frozen.entrySet()) {
      final byte[] keyBytes = entry.getKey().getBytes(java.nio.charset.StandardCharsets.UTF_8);
      final byte[] progressBytes = toByteArray(entry.getValue());
      stream.writeInt(keyBytes.length);
      stream.write(keyBytes);
      stream.writeInt(progressBytes.length);
      stream.write(progressBytes);
    }
  }

  /**
   * Decodes entries in the class wire format starting at the buffer's current position, advancing
   * the position past what was consumed.
   *
   * @param buffer the source buffer; a buffer with nothing remaining yields an empty map
   * @return a new mutable map of key to read-only progress buffer
   * @throws java.nio.BufferUnderflowException if the buffer ends before all announced data is read
   */
  public static Map<String, ByteBuffer> deserializeRegionProgressFromBuffer(
      final ByteBuffer buffer) {
    if (!buffer.hasRemaining()) {
      return new HashMap<>();
    }
    final int size = buffer.getInt();
    final Map<String, ByteBuffer> result = new HashMap<>(size);
    for (int i = 0; i < size; i++) {
      final int keyLen = buffer.getInt();
      final byte[] keyBytes = new byte[keyLen];
      buffer.get(keyBytes);
      final String key = new String(keyBytes, java.nio.charset.StandardCharsets.UTF_8);
      final int valueLen = buffer.getInt();
      final byte[] valueBytes = new byte[valueLen];
      buffer.get(valueBytes);
      result.put(key, ByteBuffer.wrap(valueBytes).asReadOnlyBuffer());
    }
    return result;
  }

  /**
   * Reads into {@code target} until it is full or the stream reaches end-of-file. A single {@code
   * read} call may legally return fewer bytes than requested, hence the loop.
   *
   * @return the number of bytes actually read, between 0 and {@code target.length}
   */
  private static int readUpTo(final FileInputStream in, final byte[] target) throws IOException {
    int total = 0;
    while (total < target.length) {
      final int read = in.read(target, total, target.length - total);
      if (read < 0) {
        break;
      }
      total += read;
    }
    return total;
  }

  /** Reads one big-endian non-negative int; {@code what} names the field in error messages. */
  private static int readLength(final FileInputStream in, final String what) throws IOException {
    final byte[] raw = new byte[Integer.BYTES];
    if (readUpTo(in, raw) != raw.length) {
      throw new IOException("Unexpected EOF reading region progress " + what);
    }
    final int length = ByteBuffer.wrap(raw).getInt();
    if (length < 0) {
      throw new IOException(
          "Corrupted region progress snapshot: negative " + what + " " + length);
    }
    return length;
  }

  /** Reads exactly {@code length} bytes; {@code what} names the field in error messages. */
  private static byte[] readPayload(final FileInputStream in, final int length, final String what)
      throws IOException {
    final byte[] payload = new byte[length];
    if (readUpTo(in, payload) != length) {
      throw new IOException("Unexpected EOF reading region progress " + what);
    }
    return payload;
  }

  /**
   * Copies the bytes between index 0 and the buffer's limit, regardless of its current position
   * and without moving it (the work is done on a separate view).
   */
  private static byte[] toByteArray(final ByteBuffer buffer) {
    final ByteBuffer view = buffer.asReadOnlyBuffer();
    view.rewind();
    final byte[] bytes = new byte[view.remaining()];
    view.get(bytes);
    return bytes;
  }

  /** Returns an independent read-only buffer over a copy of the given buffer's bytes. */
  private static ByteBuffer copyBuffer(final ByteBuffer buffer) {
    return ByteBuffer.wrap(toByteArray(buffer)).asReadOnlyBuffer();
  }

  @Override
  public boolean equals(final Object o) {
    if (this == o) {
      return true;
    }
    if (o == null || getClass() != o.getClass()) {
      return false;
    }
    final CommitProgressKeeper that = (CommitProgressKeeper) o;
    return Objects.equals(this.regionProgressMap, that.regionProgressMap);
  }

  @Override
  public int hashCode() {
    return Objects.hash(regionProgressMap);
  }
}
public void checkAuthorityBeforeJoinConsumerGroup(final ConsumerMeta consumerMeta) @@ -174,6 +194,11 @@ public ConsumerMeta getConsumerMeta(final String consumerId) { ////////////////////////// subscription ////////////////////////// + /** Get all topic names subscribed by this consumer group. */ + public Set getSubscribedTopicNames() { + return Collections.unmodifiableSet(topicNameToSubscribedConsumerIdSet.keySet()); + } + /** * Get the consumers subscribing the given topic in this group. * diff --git a/iotdb-core/node-commons/src/test/java/org/apache/iotdb/commons/subscription/meta/consumer/CommitProgressKeeperTest.java b/iotdb-core/node-commons/src/test/java/org/apache/iotdb/commons/subscription/meta/consumer/CommitProgressKeeperTest.java new file mode 100644 index 0000000000000..2cdec776683f1 --- /dev/null +++ b/iotdb-core/node-commons/src/test/java/org/apache/iotdb/commons/subscription/meta/consumer/CommitProgressKeeperTest.java @@ -0,0 +1,147 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.iotdb.commons.subscription.meta.consumer; + +import org.apache.iotdb.rpc.subscription.payload.poll.RegionProgress; +import org.apache.iotdb.rpc.subscription.payload.poll.WriterId; +import org.apache.iotdb.rpc.subscription.payload.poll.WriterProgress; + +import org.junit.Test; + +import java.io.ByteArrayOutputStream; +import java.io.DataOutputStream; +import java.io.FileInputStream; +import java.io.FileOutputStream; +import java.nio.ByteBuffer; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.LinkedHashMap; +import java.util.Map; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; + +public class CommitProgressKeeperTest { + + @Test + public void testUpdateAndReplaceAllUseDefensiveCopies() throws Exception { + final CommitProgressKeeper keeper = new CommitProgressKeeper(); + final String key = CommitProgressKeeper.generateKey("cg", "topic", "1_1", 3); + final RegionProgress regionProgress = createRegionProgress("1_1", 7, 2L, 100L, 10L); + + final ByteBuffer source = serialize(regionProgress); + keeper.updateRegionProgress(key, source); + source.position(source.limit()); + + final ByteBuffer firstRead = keeper.getRegionProgress(key); + assertTrue(firstRead.isReadOnly()); + firstRead.get(); + assertEquals(regionProgress, RegionProgress.deserialize(keeper.getRegionProgress(key))); + + final Map replacement = new LinkedHashMap<>(); + final RegionProgress replacementProgress = createRegionProgress("1_1", 8, 3L, 120L, 12L); + final ByteBuffer replacementBuffer = serialize(replacementProgress); + replacement.put(key, replacementBuffer); + + keeper.replaceAll(replacement); + replacementBuffer.position(replacementBuffer.limit()); + + assertEquals(replacementProgress, RegionProgress.deserialize(keeper.getRegionProgress(key))); + } + + @Test + public void testSnapshotRoundTripPreservesRegionProgress() throws Exception { + final CommitProgressKeeper keeper = new CommitProgressKeeper(); 
+ final String firstKey = CommitProgressKeeper.generateKey("cg", "topicA", "1_1", 3); + final String secondKey = CommitProgressKeeper.generateKey("cg", "topicB", "1_2", 5); + final RegionProgress firstProgress = + createRegionProgress( + "1_1", + new WriterId("1_1", 7, 2L), + new WriterProgress(100L, 10L), + new WriterId("1_1", 8, 2L), + new WriterProgress(110L, 11L)); + final RegionProgress secondProgress = createRegionProgress("1_2", 9, 4L, 200L, 20L); + + keeper.updateRegionProgress(firstKey, serialize(firstProgress)); + keeper.updateRegionProgress(secondKey, serialize(secondProgress)); + + final Path snapshot = Files.createTempFile("commit-progress-keeper", ".snapshot"); + try { + try (FileOutputStream fos = new FileOutputStream(snapshot.toFile())) { + keeper.processTakeSnapshot(fos); + } + + final CommitProgressKeeper restored = new CommitProgressKeeper(); + try (FileInputStream fis = new FileInputStream(snapshot.toFile())) { + restored.processLoadSnapshot(fis); + } + + assertEquals(firstProgress, RegionProgress.deserialize(restored.getRegionProgress(firstKey))); + assertEquals( + secondProgress, RegionProgress.deserialize(restored.getRegionProgress(secondKey))); + assertEquals(2, restored.getAllRegionProgress().size()); + } finally { + Files.deleteIfExists(snapshot); + } + } + + private static RegionProgress createRegionProgress( + final String regionId, + final int nodeId, + final long writerEpoch, + final long physicalTime, + final long localSeq) { + return createRegionProgress( + regionId, + new WriterId(regionId, nodeId, writerEpoch), + new WriterProgress(physicalTime, localSeq)); + } + + private static RegionProgress createRegionProgress( + final String regionId, + final WriterId firstWriterId, + final WriterProgress firstWriterProgress) { + return createRegionProgress(regionId, firstWriterId, firstWriterProgress, null, null); + } + + private static RegionProgress createRegionProgress( + final String regionId, + final WriterId firstWriterId, + final 
WriterProgress firstWriterProgress, + final WriterId secondWriterId, + final WriterProgress secondWriterProgress) { + final Map writerPositions = new LinkedHashMap<>(); + writerPositions.put(firstWriterId, firstWriterProgress); + if (secondWriterId != null && secondWriterProgress != null) { + writerPositions.put(secondWriterId, secondWriterProgress); + } + return new RegionProgress(writerPositions); + } + + private static ByteBuffer serialize(final RegionProgress regionProgress) throws Exception { + try (ByteArrayOutputStream baos = new ByteArrayOutputStream(); + DataOutputStream dos = new DataOutputStream(baos)) { + regionProgress.serialize(dos); + dos.flush(); + return ByteBuffer.wrap(baos.toByteArray()).asReadOnlyBuffer(); + } + } +} diff --git a/iotdb-protocol/thrift-confignode/src/main/thrift/confignode.thrift b/iotdb-protocol/thrift-confignode/src/main/thrift/confignode.thrift index 92312ee81a307..b17ccd6b1d974 100644 --- a/iotdb-protocol/thrift-confignode/src/main/thrift/confignode.thrift +++ b/iotdb-protocol/thrift-confignode/src/main/thrift/confignode.thrift @@ -1061,6 +1061,18 @@ struct TGetAllSubscriptionInfoResp { 2: required list allSubscriptionInfo } +struct TGetCommitProgressReq { + 1: required string consumerGroupId + 2: required string topicName + 3: required i32 regionId + 4: required i32 dataNodeId +} + +struct TGetCommitProgressResp { + 1: required common.TSStatus status + 2: optional binary committedRegionProgress +} + // ==================================================== // CQ // ==================================================== @@ -1956,6 +1968,9 @@ service IConfigNodeRPCService { /** Get all subscription information. 
It is used for DataNode registration and restart */ TGetAllSubscriptionInfoResp getAllSubscriptionInfo() + /** Get committed search index from ConfigNode for recovery */ + TGetCommitProgressResp getCommitProgress(TGetCommitProgressReq req) + // ====================================================== // TestTools // ====================================================== @@ -2055,4 +2070,3 @@ service IConfigNodeRPCService { common.TSStatus createTableView(TCreateTableViewReq req) } - diff --git a/iotdb-protocol/thrift-consensus/src/main/thrift/iotconsensus.thrift b/iotdb-protocol/thrift-consensus/src/main/thrift/iotconsensus.thrift index 829443f955282..6ab5eee193c4e 100644 --- a/iotdb-protocol/thrift-consensus/src/main/thrift/iotconsensus.thrift +++ b/iotdb-protocol/thrift-consensus/src/main/thrift/iotconsensus.thrift @@ -27,6 +27,9 @@ struct TLogEntry { 2: required i64 searchIndex 3: required bool fromWAL 4: required i64 memorySize + 5: optional i64 epoch + 6: optional i64 physicalTime + 7: optional i16 writerEpoch } struct TSyncLogEntriesReq { @@ -41,6 +44,18 @@ struct TSyncLogEntriesRes { 2: optional i64 receiverMemSize } +struct TSyncSafeHlcReq { + 1: required common.TConsensusGroupId consensusGroupId + 2: required i32 writerNodeId + 3: required i64 writerEpoch + 4: required i64 safePhysicalTime + 5: required i64 barrierLocalSeq +} + +struct TSyncSafeHlcRes { + 1: required common.TSStatus status +} + struct TInactivatePeerReq { 1: required common.TConsensusGroupId consensusGroupId 2: optional bool forDeletionPurpose @@ -129,6 +144,7 @@ struct TCleanupTransferredSnapshotRes { service IoTConsensusIService { TSyncLogEntriesRes syncLogEntries(TSyncLogEntriesReq req) + TSyncSafeHlcRes syncSafeHlc(TSyncSafeHlcReq req) TInactivatePeerRes inactivatePeer(TInactivatePeerReq req) TActivatePeerRes activatePeer(TActivatePeerReq req) TBuildSyncLogChannelRes buildSyncLogChannel(TBuildSyncLogChannelReq req) @@ -138,4 +154,4 @@ service IoTConsensusIService { 
TSendSnapshotFragmentRes sendSnapshotFragment(TSendSnapshotFragmentReq req) TTriggerSnapshotLoadRes triggerSnapshotLoad(TTriggerSnapshotLoadReq req) TCleanupTransferredSnapshotRes cleanupTransferredSnapshot(TCleanupTransferredSnapshotReq req) -} \ No newline at end of file +} diff --git a/iotdb-protocol/thrift-datanode/src/main/thrift/datanode.thrift b/iotdb-protocol/thrift-datanode/src/main/thrift/datanode.thrift index 8c3e12217e019..17e542bb740cd 100644 --- a/iotdb-protocol/thrift-datanode/src/main/thrift/datanode.thrift +++ b/iotdb-protocol/thrift-datanode/src/main/thrift/datanode.thrift @@ -597,6 +597,34 @@ struct TPushConsumerGroupMetaRespExceptionMessage { 3: required i64 timeStamp } +struct TPullCommitProgressReq { +} + +struct TPullCommitProgressResp { + 1: required common.TSStatus status + 2: optional map commitRegionProgress +} + +struct TSyncSubscriptionProgressReq { + 1: required string consumerGroupId + 2: required string topicName + 3: required string regionId + 4: required i64 epoch + 5: required i64 syncIndex + 6: optional i32 writerNodeId + 7: optional i64 writerEpoch +} +struct TSubscriptionRuntimeStateEntry { + 1: required common.TConsensusGroupId regionId + 2: required i64 runtimeVersion + 3: required i32 preferredWriterNodeId + 4: required bool active + 5: required list activeWriterNodeIds +} +struct TPushSubscriptionRuntimeReq { + 1: required list runtimeStates +} + struct TConstructViewSchemaBlackListReq { 1: required list schemaRegionIdList 2: required binary pathPatternTree @@ -1211,6 +1239,20 @@ service IDataNodeRPCService { */ TPushConsumerGroupMetaResp pushSingleConsumerGroupMeta(TPushSingleConsumerGroupMetaReq req) + /** + * Pull commit progress from DataNode for subscription consensus persistence + */ + TPullCommitProgressResp pullCommitProgress(TPullCommitProgressReq req) + + /** + * Sync subscription committed progress from Leader to Follower (fire-and-forget) + */ + common.TSStatus 
syncSubscriptionProgress(TSyncSubscriptionProgressReq req) + /** + * Push subscription runtime state to DataNodes. + */ + common.TSStatus pushSubscriptionRuntime(TPushSubscriptionRuntimeReq req) + /** * ConfigNode will ask DataNode for pipe meta in every few seconds **/ @@ -1350,4 +1392,5 @@ service MPPDataExchangeService { /** Empty rpc, only for connection test */ common.TSStatus testConnectionEmptyRPC() -} \ No newline at end of file +} +