Skip to content

Commit f776aa5

Browse files
authored
[IOTDB-4361] Add a precheck in removing datanode (apache#7264)
1 parent 8f9e28b commit f776aa5

3 files changed

Lines changed: 58 additions & 6 deletions

File tree

confignode/src/main/java/org/apache/iotdb/confignode/procedure/env/DataNodeRemoveHandler.java

Lines changed: 56 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -29,13 +29,17 @@
2929
import org.apache.iotdb.confignode.client.DataNodeRequestType;
3030
import org.apache.iotdb.confignode.client.async.datanode.AsyncDataNodeClientPool;
3131
import org.apache.iotdb.confignode.client.sync.datanode.SyncDataNodeClientPool;
32+
import org.apache.iotdb.confignode.conf.ConfigNodeConfig;
33+
import org.apache.iotdb.confignode.conf.ConfigNodeDescriptor;
3234
import org.apache.iotdb.confignode.consensus.request.write.RemoveDataNodePlan;
3335
import org.apache.iotdb.confignode.consensus.request.write.UpdateRegionLocationPlan;
3436
import org.apache.iotdb.confignode.consensus.response.DataNodeToStatusResp;
3537
import org.apache.iotdb.confignode.manager.ConfigManager;
38+
import org.apache.iotdb.confignode.manager.load.heartbeat.BaseNodeCache;
3639
import org.apache.iotdb.confignode.persistence.NodeInfo;
3740
import org.apache.iotdb.confignode.procedure.exception.ProcedureException;
3841
import org.apache.iotdb.confignode.procedure.scheduler.LockQueue;
42+
import org.apache.iotdb.consensus.ConsensusFactory;
3943
import org.apache.iotdb.mpp.rpc.thrift.TCreatePeerReq;
4044
import org.apache.iotdb.mpp.rpc.thrift.TDisableDataNodeReq;
4145
import org.apache.iotdb.mpp.rpc.thrift.TMaintainPeerReq;
@@ -53,6 +57,8 @@
5357
public class DataNodeRemoveHandler {
5458
private static final Logger LOGGER = LoggerFactory.getLogger(DataNodeRemoveHandler.class);
5559

60+
private static final ConfigNodeConfig CONF = ConfigNodeDescriptor.getInstance().getConf();
61+
5662
private final ConfigManager configManager;
5763

5864
/** region migrate lock */
@@ -386,14 +392,20 @@ public TSStatus stopDataNode(TDataNodeLocation dataNode) throws ProcedureExcepti
386392
public DataNodeToStatusResp checkRemoveDataNodeRequest(RemoveDataNodePlan removeDataNodePlan) {
387393
DataNodeToStatusResp dataSet = new DataNodeToStatusResp();
388394
dataSet.setStatus(new TSStatus(TSStatusCode.SUCCESS_STATUS.getStatusCode()));
389-
TSStatus status = checkRegionReplication(removeDataNodePlan);
390-
if (status.getCode() != TSStatusCode.SUCCESS_STATUS.getStatusCode()) {
395+
396+
TSStatus status = checkClusterProtocol();
397+
if (isFailed(status)) {
398+
dataSet.setStatus(status);
399+
return dataSet;
400+
}
401+
status = checkRegionReplication(removeDataNodePlan);
402+
if (isFailed(status)) {
391403
dataSet.setStatus(status);
392404
return dataSet;
393405
}
394406

395407
status = checkDataNodeExist(removeDataNodePlan);
396-
if (status.getCode() != TSStatusCode.SUCCESS_STATUS.getStatusCode()) {
408+
if (isFailed(status)) {
397409
dataSet.setStatus(status);
398410
return dataSet;
399411
}
@@ -433,8 +445,31 @@ private TSStatus checkDataNodeExist(RemoveDataNodePlan removeDataNodePlan) {
433445
*/
434446
private TSStatus checkRegionReplication(RemoveDataNodePlan removeDataNodePlan) {
435447
TSStatus status = new TSStatus(TSStatusCode.SUCCESS_STATUS.getStatusCode());
436-
int removedDataNodeSize = removeDataNodePlan.getDataNodeLocations().size();
448+
List<TDataNodeLocation> removedDataNodes = removeDataNodePlan.getDataNodeLocations();
437449
int allDataNodeSize = configManager.getNodeManager().getRegisteredDataNodeCount();
450+
451+
// When the schema or data replication factor is 1, a data node that is not in the Running
452+
// state cannot be removed.
453+
if (CONF.getSchemaReplicationFactor() == 1 || CONF.getDataReplicationFactor() == 1) {
454+
for (TDataNodeLocation dataNodeLocation : removedDataNodes) {
455+
// check whether removed data node is in running state
456+
BaseNodeCache nodeCache =
457+
configManager.getNodeManager().getNodeCacheMap().get(dataNodeLocation.getDataNodeId());
458+
if (!nodeCache.getNodeStatus().getStatus().equals("Running")) {
459+
removedDataNodes.remove(dataNodeLocation);
460+
LOGGER.error(
461+
"Failed to remove data node {} because it is not in running and the configuration of cluster is one replication",
462+
dataNodeLocation);
463+
}
464+
if (removedDataNodes.size() == 0) {
465+
status.setCode(TSStatusCode.LACK_REPLICATION.getStatusCode());
466+
status.setMessage("Failed to remove all requested data nodes");
467+
return status;
468+
}
469+
}
470+
}
471+
472+
int removedDataNodeSize = removeDataNodePlan.getDataNodeLocations().size();
438473
if (allDataNodeSize - removedDataNodeSize < NodeInfo.getMinimumDataNode()) {
439474
status.setCode(TSStatusCode.LACK_REPLICATION.getStatusCode());
440475
status.setMessage(
@@ -492,4 +527,21 @@ private Optional<TDataNodeLocation> filterDataNodeWithOtherRegionReplica(
492527
// TODO replace findAny() by select the low load node.
493528
return regionReplicaNodes.stream().filter(e -> !e.equals(filterLocation)).findAny();
494529
}
530+
531+
/**
532+
* Checks the cluster's consensus protocol: standalone consensus does not support removing a data node.
533+
*
534+
* @return a SUCCESS_STATUS status if neither the data-region nor the schema-region protocol is
535+
* StandAloneConsensus, a REMOVE_DATANODE_FAILED status otherwise
536+
*/
537+
private TSStatus checkClusterProtocol() {
538+
TSStatus status = new TSStatus(TSStatusCode.SUCCESS_STATUS.getStatusCode());
539+
if (CONF.getDataRegionConsensusProtocolClass().equals(ConsensusFactory.StandAloneConsensus)
540+
|| CONF.getSchemaRegionConsensusProtocolClass()
541+
.equals(ConsensusFactory.StandAloneConsensus)) {
542+
status.setCode(TSStatusCode.REMOVE_DATANODE_FAILED.getStatusCode());
543+
status.setMessage("standalone protocol is not supported to remove data node");
544+
}
545+
return status;
546+
}
495547
}

integration-test/src/main/java/org/apache/iotdb/it/env/ConfigFactory.java

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,6 @@ public static BaseConfig getConfig() {
3838
break;
3939
case "LocalStandaloneOnMpp":
4040
case "Cluster1":
41-
case "Cluster2":
4241
config = new MppConfig();
4342
break;
4443
case "Remote":

service-rpc/src/main/java/org/apache/iotdb/rpc/TSStatusCode.java

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -152,7 +152,8 @@ public enum TSStatusCode {
152152
REGION_MIGRATE_FAILED(915),
153153
LACK_REPLICATION(916),
154154
DATANODE_STOP_ERROR(917),
155-
REGION_LEADER_CHANGE_FAILED(918);
155+
REGION_LEADER_CHANGE_FAILED(918),
156+
REMOVE_DATANODE_FAILED(919);
156157

157158
private int statusCode;
158159

0 commit comments

Comments
 (0)