Skip to content

Commit b829d26

Browse files
authored
HDDS-14425. Implement Ratis follower read exception handling (#9811)
1 parent acac563 commit b829d26

7 files changed

Lines changed: 163 additions & 22 deletions

File tree

hadoop-ozone/common/src/main/java/org/apache/hadoop/ozone/om/ha/HadoopRpcOMFollowerReadFailoverProxyProvider.java

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,8 @@
1919

2020
import static org.apache.hadoop.ozone.om.ha.OMFailoverProxyProviderBase.getLeaderNotReadyException;
2121
import static org.apache.hadoop.ozone.om.ha.OMFailoverProxyProviderBase.getNotLeaderException;
22+
import static org.apache.hadoop.ozone.om.ha.OMFailoverProxyProviderBase.getReadException;
23+
import static org.apache.hadoop.ozone.om.ha.OMFailoverProxyProviderBase.getReadIndexException;
2224

2325
import com.google.common.annotations.VisibleForTesting;
2426
import com.google.protobuf.RpcController;
@@ -42,6 +44,8 @@
4244
import org.apache.hadoop.ozone.om.protocolPB.OzoneManagerProtocolPB;
4345
import org.apache.hadoop.ozone.protocol.proto.OzoneManagerProtocolProtos.OMRequest;
4446
import org.apache.hadoop.ozone.protocol.proto.OzoneManagerProtocolProtos.ReadConsistencyHint;
47+
import org.apache.ratis.protocol.exceptions.ReadException;
48+
import org.apache.ratis.protocol.exceptions.ReadIndexException;
4549
import org.apache.ratis.util.Preconditions;
4650
import org.slf4j.Logger;
4751
import org.slf4j.LoggerFactory;
@@ -302,6 +306,18 @@ public Object invoke(Object proxy, final Method method, final Object[] args)
302306
// If we break here instead, we will retry the same leader again without waiting
303307
throw e;
304308
}
309+
310+
ReadIndexException readIndexException = getReadIndexException(e);
311+
if (readIndexException != null) {
312+
// This should trigger failover in the following shouldFailover
313+
LOG.debug("Encountered ReadIndexException from {}. ", current.proxyInfo);
314+
}
315+
316+
ReadException readException = getReadException(e);
317+
if (readException != null) {
318+
// This should trigger failover in the following shouldFailover
319+
LOG.debug("Encountered ReadException from {}. ", current.proxyInfo);
320+
}
305321
}
306322

307323
if (!failoverProxy.shouldFailover(e)) {

hadoop-ozone/common/src/main/java/org/apache/hadoop/ozone/om/ha/OMFailoverProxyProviderBase.java

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,8 @@
4545
import org.apache.hadoop.security.AccessControlException;
4646
import org.apache.hadoop.security.UserGroupInformation;
4747
import org.apache.hadoop.security.token.SecretManager;
48+
import org.apache.ratis.protocol.exceptions.ReadException;
49+
import org.apache.ratis.protocol.exceptions.ReadIndexException;
4850
import org.apache.ratis.protocol.exceptions.StateMachineException;
4951
import org.slf4j.Logger;
5052
import org.slf4j.LoggerFactory;
@@ -434,6 +436,44 @@ public static OMNotLeaderException getNotLeaderException(
434436
return null;
435437
}
436438

439+
/**
440+
* Unwrap the exception and return the wrapped ReadIndexException if any.
441+
*
442+
* @param exception exception to unwrap.
443+
* @return the unwrapped ReadIndexException or null if the wrapped
444+
* exception is not ReadIndexException.
445+
*/
446+
public static ReadIndexException getReadIndexException(Exception exception) {
447+
Throwable cause = exception.getCause();
448+
if (cause instanceof RemoteException) {
449+
IOException ioException =
450+
((RemoteException) cause).unwrapRemoteException();
451+
if (ioException instanceof ReadIndexException) {
452+
return (ReadIndexException) ioException;
453+
}
454+
}
455+
return null;
456+
}
457+
458+
/**
459+
* Unwrap the exception and return the wrapped ReadException if any.
460+
*
461+
* @param exception exception to unwrap.
462+
* @return the unwrapped ReadException or null if the wrapped
463+
* exception is not ReadException.
464+
*/
465+
public static ReadException getReadException(Exception exception) {
466+
Throwable cause = exception.getCause();
467+
if (cause instanceof RemoteException) {
468+
IOException ioException =
469+
((RemoteException) cause).unwrapRemoteException();
470+
if (ioException instanceof ReadException) {
471+
return (ReadException) ioException;
472+
}
473+
}
474+
return null;
475+
}
476+
437477
protected ConfigurationSource getConf() {
438478
return conf;
439479
}

hadoop-ozone/common/src/test/java/org/apache/hadoop/ozone/om/ha/TestHadoopRpcOMFollowerReadFailoverProxyProvider.java

Lines changed: 45 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,8 @@
6868
import org.apache.hadoop.security.UserGroupInformation;
6969
import org.apache.hadoop.util.Time;
7070
import org.apache.ratis.protocol.ClientId;
71+
import org.apache.ratis.protocol.exceptions.ReadException;
72+
import org.apache.ratis.protocol.exceptions.ReadIndexException;
7173
import org.junit.jupiter.api.Test;
7274
import org.mockito.invocation.InvocationOnMock;
7375
import org.mockito.stubbing.Answer;
@@ -337,6 +339,22 @@ void testNullRequest() throws Exception {
337339
assertInstanceOf(RpcNoSuchProtocolException.class, exception.getCause());
338340
}
339341

342+
@Test
343+
void testReadIndexException() throws Exception {
344+
setupProxyProvider(3);
345+
omNodeAnswers[0].isThrowReadIndexException = true;
346+
doRead();
347+
assertHandledBy(1);
348+
}
349+
350+
@Test
351+
void testReadException() throws Exception {
352+
setupProxyProvider(3);
353+
omNodeAnswers[0].isThrowReadException = true;
354+
doRead();
355+
assertHandledBy(1);
356+
}
357+
340358
private void setupProxyProvider(int omNodeCount) throws Exception {
341359
setupProxyProvider(omNodeCount, new OzoneConfiguration());
342360
}
@@ -489,6 +507,8 @@ private static class OMAnswer {
489507
private volatile boolean isLeader = false;
490508
private volatile boolean isLeaderReady = true;
491509
private volatile boolean isFollowerReadSupported = true;
510+
private volatile boolean isThrowReadIndexException = false;
511+
private volatile boolean isThrowReadException = false;
492512

493513
private OMProtocolAnswer clientAnswer = new OMProtocolAnswer();
494514

@@ -524,13 +544,31 @@ public OMResponse answer(InvocationOnMock invocationOnMock) throws Throwable {
524544
}
525545
break;
526546
case GetKeyInfo:
527-
if (!isLeader && !isFollowerReadSupported) {
528-
throw new ServiceException(
529-
new RemoteException(
530-
OMNotLeaderException.class.getCanonicalName(),
531-
"OM follower read is not supported"
532-
)
533-
);
547+
if (!isLeader) {
548+
if (!isFollowerReadSupported) {
549+
throw new ServiceException(
550+
new RemoteException(
551+
OMNotLeaderException.class.getCanonicalName(),
552+
"OM follower read is not supported"
553+
)
554+
);
555+
}
556+
if (isThrowReadIndexException) {
557+
throw new ServiceException(
558+
new RemoteException(
559+
ReadIndexException.class.getCanonicalName(),
560+
"ReadIndex exception"
561+
)
562+
);
563+
}
564+
if (isThrowReadException) {
565+
throw new ServiceException(
566+
new RemoteException(
567+
ReadException.class.getCanonicalName(),
568+
"ReadException"
569+
)
570+
);
571+
}
534572
}
535573
if (isLeader && !isLeaderReady) {
536574
throw new ServiceException(

hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/om/TestOzoneManagerHAFollowerRead.java

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,7 @@
6565
import org.apache.hadoop.ozone.client.io.OzoneOutputStream;
6666
import org.apache.hadoop.ozone.om.ratis.OzoneManagerRatisServerConfig;
6767
import org.apache.hadoop.ozone.security.acl.OzoneObj;
68+
import org.apache.ratis.protocol.exceptions.RaftException;
6869
import org.junit.jupiter.api.AfterAll;
6970
import org.junit.jupiter.api.BeforeAll;
7071

@@ -434,6 +435,31 @@ protected void createKeyTest(boolean checkSuccess) throws Exception {
434435
}
435436
}
436437

438+
protected void listVolumes(boolean checkSuccess)
439+
throws Exception {
440+
try {
441+
getObjectStore().getClientProxy().listVolumes(null, null, 100);
442+
} catch (IOException e) {
443+
if (!checkSuccess) {
444+
// If the last OM to be tried by the RetryProxy is down, we would get
445+
// ConnectException. Otherwise, we would get a RemoteException from the
446+
// last running OM as it would fail to get a quorum.
447+
if (e instanceof RemoteException) {
448+
// Linearizable read will fail with ReadIndexException if the follower does not recognize any leader
449+
// or leader is uncontactable. It will throw ReadException if the read submitted to Ratis encounters
450+
// timeout.
451+
assertThat(((RemoteException) e).unwrapRemoteException()).isInstanceOf(RaftException.class);
452+
} else if (e instanceof ConnectException) {
453+
assertThat(e).hasMessageContaining("Connection refused");
454+
} else {
455+
assertThat(e).hasMessageContaining("Could not determine or connect to OM Leader");
456+
}
457+
} else {
458+
throw e;
459+
}
460+
}
461+
}
462+
437463
protected void waitForLeaderToBeReady()
438464
throws InterruptedException, TimeoutException {
439465
// Wait for Leader Election timeout

hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/om/TestOzoneManagerHAFollowerReadWithStoppedNodes.java

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -111,8 +111,12 @@ void twoOMDown() throws Exception {
111111
getCluster().stopOzoneManager(2);
112112
Thread.sleep(NODE_FAILURE_TIMEOUT * 4);
113113

114+
// Write requests will fail with OMNotLeaderException
114115
createVolumeTest(false);
115116
createKeyTest(false);
117+
118+
// Read requests will fail with either ReadIndexException or ReadException
119+
listVolumes(false);
116120
}
117121

118122
@Test

hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/shell/TestOzoneShellHAWithFollowerRead.java

Lines changed: 20 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,6 @@
2626
import org.apache.hadoop.ozone.om.OzoneManager;
2727
import org.apache.hadoop.ozone.om.ratis.OzoneManagerRatisServerConfig;
2828
import org.apache.ratis.server.RaftServerConfigKeys;
29-
import org.junit.jupiter.api.Assertions;
3029
import org.junit.jupiter.api.BeforeAll;
3130
import org.junit.jupiter.api.Test;
3231

@@ -58,21 +57,27 @@ public void init() throws Exception {
5857

5958
@Test
6059
public void testAllowLeaderSkipLinearizableRead() throws Exception {
61-
super.testListAllKeysInternal("skipvol1");
62-
long lastMetrics = getCluster().getOMLeader().getMetrics().getNumLeaderSkipLinearizableRead();
63-
Assertions.assertTrue(lastMetrics > 0);
64-
6560
OzoneConfiguration oldConf = getCluster().getConf();
66-
OzoneConfiguration newConf = new OzoneConfiguration(oldConf);
67-
newConf.setBoolean("ozone.om.allow.leader.skip.linearizable.read", false);
68-
getCluster().getOMLeader().setConfiguration(newConf);
69-
70-
super.testListAllKeysInternal("skipvol2");
71-
72-
long curMetrics = getCluster().getOMLeader().getMetrics().getNumLeaderSkipLinearizableRead();
73-
assertEquals(lastMetrics, curMetrics);
74-
75-
getCluster().getOMLeader().setConfiguration(oldConf);
61+
try {
62+
String[] args = new String[]{"volume", "list"};
63+
OzoneShell ozoneShell = new OzoneShell();
64+
ozoneShell.getOzoneConf().setBoolean("ozone.client.follower.read.enabled", true);
65+
for (int i = 0; i < 100; i++) {
66+
execute(ozoneShell, args);
67+
}
68+
long lastMetrics = getCluster().getOMLeader().getMetrics().getNumLeaderSkipLinearizableRead();
69+
assertThat(lastMetrics).isGreaterThan(0);
70+
OzoneConfiguration newConf = new OzoneConfiguration(oldConf);
71+
newConf.setBoolean("ozone.om.allow.leader.skip.linearizable.read", false);
72+
getCluster().getOMLeader().setConfiguration(newConf);
73+
for (int i = 0; i < 100; i++) {
74+
execute(ozoneShell, args);
75+
}
76+
long curMetrics = getCluster().getOMLeader().getMetrics().getNumLeaderSkipLinearizableRead();
77+
assertEquals(lastMetrics, curMetrics);
78+
} finally {
79+
getCluster().getOMLeader().setConfiguration(oldConf);
80+
}
7681
}
7782

7883
@Test

hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/ratis/OzoneManagerRatisServer.java

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -90,6 +90,8 @@
9090
import org.apache.ratis.protocol.exceptions.LeaderNotReadyException;
9191
import org.apache.ratis.protocol.exceptions.LeaderSteppingDownException;
9292
import org.apache.ratis.protocol.exceptions.NotLeaderException;
93+
import org.apache.ratis.protocol.exceptions.ReadException;
94+
import org.apache.ratis.protocol.exceptions.ReadIndexException;
9395
import org.apache.ratis.protocol.exceptions.StateMachineException;
9496
import org.apache.ratis.rpc.RpcType;
9597
import org.apache.ratis.rpc.SupportedRpcType;
@@ -608,6 +610,16 @@ private OMResponse createOmResponseImpl(OMRequest omRequest,
608610
throw new ServiceException(new OMNotLeaderException(leaderSteppingDownException.getMessage()));
609611
}
610612

613+
ReadIndexException readIndexException = reply.getReadIndexException();
614+
if (readIndexException != null) {
615+
throw new ServiceException(readIndexException);
616+
}
617+
618+
ReadException readException = reply.getReadException();
619+
if (readException != null) {
620+
throw new ServiceException(readException);
621+
}
622+
611623
StateMachineException stateMachineException =
612624
reply.getStateMachineException();
613625
if (stateMachineException != null) {

0 commit comments

Comments
 (0)