Skip to content

Commit 8fa3cf7

Browse files
committed
[server] Coordinator Server Supports High-Available
1 parent fbdb7fa commit 8fa3cf7

File tree

48 files changed

+1641
-172
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

48 files changed

+1641
-172
lines changed

fluss-common/src/main/java/org/apache/fluss/config/ConfigOptions.java

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -341,6 +341,12 @@ public class ConfigOptions {
341341
+ " (“50100,50101”), ranges (“50100-50200”) or a combination of both."
342342
+ "This option is deprecated. Please use bind.listeners instead, which provides a more flexible configuration for multiple ports");
343343

344+
/**
 * The id for the coordinator server, read from the {@code coordinator.id} key as an integer.
 *
 * <p>No default value is declared, so the option is unset unless it is configured explicitly.
 * NOTE(review): presumably every coordinator server must be given a distinct id; confirm
 * against the code that registers coordinators.
 */
public static final ConfigOption<Integer> COORDINATOR_ID =
345+
key("coordinator.id")
346+
.intType()
347+
.noDefaultValue()
348+
.withDescription("The id for the coordinator server.");
349+
344350
/**
345351
* @deprecated This option is deprecated. Please use {@link ConfigOptions#SERVER_IO_POOL_SIZE}
346352
* instead.
Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one
3+
* or more contributor license agreements. See the NOTICE file
4+
* distributed with this work for additional information
5+
* regarding copyright ownership. The ASF licenses this file
6+
* to you under the Apache License, Version 2.0 (the
7+
* "License"); you may not use this file except in compliance
8+
* with the License. You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing, software
13+
* distributed under the License is distributed on an "AS IS" BASIS,
14+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15+
* See the License for the specific language governing permissions and
16+
* limitations under the License.
17+
*/
18+
19+
package org.apache.fluss.exception;
20+
21+
/**
 * Exception thrown when the Coordinator leader epoch is fenced.
 *
 * <p>This is an unchecked exception. Use the two-argument constructor when the fencing was
 * detected through another failure so that the original stack trace is not lost.
 */
public class CoordinatorEpochFencedException extends RuntimeException {

    private static final long serialVersionUID = 1L;

    /**
     * Creates the exception with a detail message and no cause.
     *
     * @param message description of why the epoch was considered fenced
     */
    public CoordinatorEpochFencedException(String message) {
        super(message);
    }

    /**
     * Creates the exception with a detail message and the underlying cause.
     *
     * @param message description of why the epoch was considered fenced
     * @param cause the failure that revealed the fencing, may be {@code null}
     */
    public CoordinatorEpochFencedException(String message, Throwable cause) {
        super(message, cause);
    }
}

fluss-common/src/main/java/org/apache/fluss/metrics/MetricNames.java

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@ public class MetricNames {
3636
// metrics for coordinator server
3737
// --------------------------------------------------------------------------------------------
3838
public static final String ACTIVE_COORDINATOR_COUNT = "activeCoordinatorCount";
39+
public static final String ALIVE_COORDINATOR_COUNT = "aliveCoordinatorCount";
3940
public static final String ACTIVE_TABLET_SERVER_COUNT = "activeTabletServerCount";
4041
public static final String OFFLINE_BUCKET_COUNT = "offlineBucketCount";
4142
public static final String TABLE_COUNT = "tableCount";

fluss-server/src/main/java/org/apache/fluss/server/coordinator/CoordinatorContext.java

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,7 @@ public class CoordinatorContext {
5555
private static final Logger LOG = LoggerFactory.getLogger(CoordinatorContext.class);
5656

5757
public static final int INITIAL_COORDINATOR_EPOCH = 0;
58+
public static final int INITIAL_COORDINATOR_EPOCH_ZK_VERSION = 0;
5859

5960
// for simplicity, we just use retry times; we may consider making it a configurable value
6061
// and combining retry times with a retry delay
@@ -67,6 +68,7 @@ public class CoordinatorContext {
6768
// a success deletion.
6869
private final Map<TableBucketReplica, Integer> failDeleteNumbers = new HashMap<>();
6970

71+
private final Set<Integer> liveCoordinatorServers = new HashSet<>();
7072
private final Map<Integer, ServerInfo> liveTabletServers = new HashMap<>();
7173
private final Set<Integer> shuttingDownTabletServers = new HashSet<>();
7274

@@ -108,13 +110,40 @@ public class CoordinatorContext {
108110

109111
private ServerInfo coordinatorServerInfo = null;
110112
private int coordinatorEpoch = INITIAL_COORDINATOR_EPOCH;
113+
private int coordinatorEpochZkVersion = INITIAL_COORDINATOR_EPOCH_ZK_VERSION;
111114

112115
/** Creates a context whose state is exactly what the field initializers set up. */
public CoordinatorContext() {}
113116

114117
public int getCoordinatorEpoch() {
115118
return coordinatorEpoch;
116119
}
117120

121+
public int getCoordinatorEpochZkVersion() {
122+
return coordinatorEpochZkVersion;
123+
}
124+
125+
public void setCoordinatorEpochAndZkVersion(int newEpoch, int newZkVersion) {
126+
this.coordinatorEpoch = newEpoch;
127+
this.coordinatorEpochZkVersion = newZkVersion;
128+
}
129+
130+
public Set<Integer> getLiveCoordinatorServers() {
131+
return liveCoordinatorServers;
132+
}
133+
134+
public void setLiveCoordinators(Set<Integer> servers) {
135+
liveCoordinatorServers.clear();
136+
liveCoordinatorServers.addAll(servers);
137+
}
138+
139+
public void addLiveCoordinator(int serverId) {
140+
this.liveCoordinatorServers.add(serverId);
141+
}
142+
143+
public void removeLiveCoordinator(int serverId) {
144+
this.liveCoordinatorServers.remove(serverId);
145+
}
146+
118147
public Map<Integer, ServerInfo> getLiveTabletServers() {
119148
return liveTabletServers;
120149
}

fluss-server/src/main/java/org/apache/fluss/server/coordinator/CoordinatorEventProcessor.java

Lines changed: 55 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -71,13 +71,15 @@
7171
import org.apache.fluss.server.coordinator.event.CoordinatorEventManager;
7272
import org.apache.fluss.server.coordinator.event.CreatePartitionEvent;
7373
import org.apache.fluss.server.coordinator.event.CreateTableEvent;
74+
import org.apache.fluss.server.coordinator.event.DeadCoordinatorEvent;
7475
import org.apache.fluss.server.coordinator.event.DeadTabletServerEvent;
7576
import org.apache.fluss.server.coordinator.event.DeleteReplicaResponseReceivedEvent;
7677
import org.apache.fluss.server.coordinator.event.DropPartitionEvent;
7778
import org.apache.fluss.server.coordinator.event.DropTableEvent;
7879
import org.apache.fluss.server.coordinator.event.EventProcessor;
7980
import org.apache.fluss.server.coordinator.event.FencedCoordinatorEvent;
8081
import org.apache.fluss.server.coordinator.event.ListRebalanceProgressEvent;
82+
import org.apache.fluss.server.coordinator.event.NewCoordinatorEvent;
8183
import org.apache.fluss.server.coordinator.event.NewTabletServerEvent;
8284
import org.apache.fluss.server.coordinator.event.NotifyKvSnapshotOffsetEvent;
8385
import org.apache.fluss.server.coordinator.event.NotifyLakeTableOffsetEvent;
@@ -86,6 +88,7 @@
8688
import org.apache.fluss.server.coordinator.event.RemoveServerTagEvent;
8789
import org.apache.fluss.server.coordinator.event.SchemaChangeEvent;
8890
import org.apache.fluss.server.coordinator.event.TableRegistrationChangeEvent;
91+
import org.apache.fluss.server.coordinator.event.watcher.CoordinatorChangeWatcher;
8992
import org.apache.fluss.server.coordinator.event.watcher.TableChangeWatcher;
9093
import org.apache.fluss.server.coordinator.event.watcher.TabletServerChangeWatcher;
9194
import org.apache.fluss.server.coordinator.lease.KvSnapshotLeaseManager;
@@ -128,6 +131,7 @@
128131

129132
import java.time.Duration;
130133
import java.util.ArrayList;
134+
import java.util.Arrays;
131135
import java.util.Collections;
132136
import java.util.HashMap;
133137
import java.util.HashSet;
@@ -172,6 +176,7 @@ public class CoordinatorEventProcessor implements EventProcessor {
172176
private final LakeTableTieringManager lakeTableTieringManager;
173177
private final TableChangeWatcher tableChangeWatcher;
174178
private final CoordinatorChannelManager coordinatorChannelManager;
179+
private final CoordinatorChangeWatcher coordinatorChangeWatcher;
175180
private final TabletServerChangeWatcher tabletServerChangeWatcher;
176181
private final CoordinatorMetadataCache serverMetadataCache;
177182
private final CoordinatorRequestBatch coordinatorRequestBatch;
@@ -224,6 +229,8 @@ public CoordinatorEventProcessor(
224229
tableBucketStateMachine,
225230
new RemoteStorageCleaner(conf, ioExecutor),
226231
ioExecutor);
232+
this.coordinatorChangeWatcher =
233+
new CoordinatorChangeWatcher(zooKeeperClient, coordinatorEventManager);
227234
this.tableChangeWatcher = new TableChangeWatcher(zooKeeperClient, coordinatorEventManager);
228235
this.tabletServerChangeWatcher =
229236
new TabletServerChangeWatcher(zooKeeperClient, coordinatorEventManager);
@@ -263,6 +270,7 @@ public CoordinatorContext getCoordinatorContext() {
263270
public void startup() {
264271
coordinatorContext.setCoordinatorServerInfo(getCoordinatorServerInfo());
265272
// start watchers first so that we won't miss any node changes in zk.
273+
coordinatorChangeWatcher.start();
266274
tabletServerChangeWatcher.start();
267275
tableChangeWatcher.start();
268276
LOG.info("Initializing coordinator context.");
@@ -306,14 +314,11 @@ public void shutdown() {
306314
private ServerInfo getCoordinatorServerInfo() {
307315
try {
308316
return zooKeeperClient
309-
.getCoordinatorAddress()
317+
.getCoordinatorLeaderAddress()
310318
.map(
311319
coordinatorAddress ->
312-
// TODO we set id to 0 as that CoordinatorServer don't support
313-
// HA, if we support HA, we need to set id to the config
314-
// CoordinatorServer id to avoid node drift.
315320
new ServerInfo(
316-
0,
321+
coordinatorAddress.getId(),
317322
null, // For coordinatorServer, no rack info
318323
coordinatorAddress.getEndpoints(),
319324
ServerType.COORDINATOR))
@@ -334,6 +339,12 @@ public int getCoordinatorEpoch() {
334339

335340
private void initCoordinatorContext() throws Exception {
336341
long start = System.currentTimeMillis();
342+
// get all coordinator servers
343+
int[] currentCoordinatorServers = zooKeeperClient.getCoordinatorServerList();
344+
coordinatorContext.setLiveCoordinators(
345+
Arrays.stream(currentCoordinatorServers).boxed().collect(Collectors.toSet()));
346+
LOG.info("Load coordinator servers success when initializing coordinator context.");
347+
337348
// get all tablet servers
338349
int[] currentServers = zooKeeperClient.getSortedTabletServerList();
339350
List<ServerInfo> tabletServerInfos = new ArrayList<>();
@@ -548,6 +559,7 @@ private void onShutdown() {
548559
tableManager.shutdown();
549560

550561
// then stop watchers
562+
coordinatorChangeWatcher.stop();
551563
tableChangeWatcher.stop();
552564
tabletServerChangeWatcher.stop();
553565
}
@@ -572,6 +584,10 @@ public void process(CoordinatorEvent event) {
572584
(NotifyLeaderAndIsrResponseReceivedEvent) event);
573585
} else if (event instanceof DeleteReplicaResponseReceivedEvent) {
574586
processDeleteReplicaResponseReceived((DeleteReplicaResponseReceivedEvent) event);
587+
} else if (event instanceof NewCoordinatorEvent) {
588+
processNewCoordinator((NewCoordinatorEvent) event);
589+
} else if (event instanceof DeadCoordinatorEvent) {
590+
processDeadCoordinator((DeadCoordinatorEvent) event);
575591
} else if (event instanceof NewTabletServerEvent) {
576592
processNewTabletServer((NewTabletServerEvent) event);
577593
} else if (event instanceof DeadTabletServerEvent) {
@@ -983,6 +999,28 @@ private void onReplicaBecomeOffline(Set<TableBucketReplica> offlineReplicas) {
983999
replicaStateMachine.handleStateChanges(offlineReplicas, OfflineReplica);
9841000
}
9851001

1002+
private void processNewCoordinator(NewCoordinatorEvent newCoordinatorEvent) {
1003+
int coordinatorServerId = newCoordinatorEvent.getServerId();
1004+
if (coordinatorContext.getLiveCoordinatorServers().contains(coordinatorServerId)) {
1005+
return;
1006+
}
1007+
1008+
// process new coordinator server
1009+
LOG.info("New coordinator server callback for coordinator server {}", coordinatorServerId);
1010+
1011+
coordinatorContext.addLiveCoordinator(coordinatorServerId);
1012+
}
1013+
1014+
private void processDeadCoordinator(DeadCoordinatorEvent deadCoordinatorEvent) {
1015+
int coordinatorServerId = deadCoordinatorEvent.getServerId();
1016+
if (!coordinatorContext.getLiveCoordinatorServers().contains(coordinatorServerId)) {
1017+
return;
1018+
}
1019+
// process dead coordinator server
1020+
LOG.info("Coordinator server failure callback for {}.", coordinatorServerId);
1021+
coordinatorContext.removeLiveCoordinator(coordinatorServerId);
1022+
}
1023+
9861024
private void processNewTabletServer(NewTabletServerEvent newTabletServerEvent) {
9871025
// NOTE: we won't need to detect bounced tablet servers like Kafka as we won't
9881026
// miss the event of tablet server un-register and register again since we can
@@ -1567,7 +1605,10 @@ private void updateReplicaAssignmentForBucket(
15671605
tableAssignment.forEach(
15681606
(bucket, replicas) ->
15691607
newTableAssignment.put(bucket, new BucketAssignment(replicas)));
1570-
zooKeeperClient.updateTableAssignment(tableId, new TableAssignment(newTableAssignment));
1608+
zooKeeperClient.updateTableAssignment(
1609+
tableId,
1610+
new TableAssignment(newTableAssignment),
1611+
coordinatorContext.getCoordinatorEpochZkVersion());
15711612
} else {
15721613
Map<Integer, List<Integer>> partitionAssignment =
15731614
coordinatorContext.getPartitionAssignment(
@@ -1624,7 +1665,8 @@ private List<AdjustIsrResultForBucket> tryProcessAdjustIsr(
16241665
}
16251666

16261667
try {
1627-
zooKeeperClient.batchUpdateLeaderAndIsr(newLeaderAndIsrList);
1668+
zooKeeperClient.batchUpdateLeaderAndIsr(
1669+
newLeaderAndIsrList, coordinatorContext.getCoordinatorEpochZkVersion());
16281670
newLeaderAndIsrList.forEach(
16291671
(tableBucket, newLeaderAndIsr) ->
16301672
result.add(new AdjustIsrResultForBucket(tableBucket, newLeaderAndIsr)));
@@ -1635,7 +1677,10 @@ private List<AdjustIsrResultForBucket> tryProcessAdjustIsr(
16351677
TableBucket tableBucket = entry.getKey();
16361678
LeaderAndIsr newLeaderAndIsr = entry.getValue();
16371679
try {
1638-
zooKeeperClient.updateLeaderAndIsr(tableBucket, newLeaderAndIsr);
1680+
zooKeeperClient.updateLeaderAndIsr(
1681+
tableBucket,
1682+
newLeaderAndIsr,
1683+
coordinatorContext.getCoordinatorEpochZkVersion());
16391684
} catch (Exception e) {
16401685
LOG.error("Error when register leader and isr.", e);
16411686
result.add(
@@ -2161,7 +2206,8 @@ private void updateBucketEpochAndSendRequest(TableBucket tableBucket, List<Integ
21612206
LeaderAndIsr newLeaderAndIsr = leaderAndIsr.newLeaderAndIsr(leaderAndIsr.isr());
21622207

21632208
coordinatorContext.putBucketLeaderAndIsr(tableBucket, newLeaderAndIsr);
2164-
zooKeeperClient.updateLeaderAndIsr(tableBucket, newLeaderAndIsr);
2209+
zooKeeperClient.updateLeaderAndIsr(
2210+
tableBucket, newLeaderAndIsr, coordinatorContext.getCoordinatorEpochZkVersion());
21652211

21662212
coordinatorRequestBatch.newBatch();
21672213
coordinatorRequestBatch.addNotifyLeaderRequestForTabletServers(

0 commit comments

Comments
 (0)