diff --git a/hadoop-hdds/common/src/main/resources/ozone-default.xml b/hadoop-hdds/common/src/main/resources/ozone-default.xml
index 17bdf7f48d1d..d0132d5dd642 100644
--- a/hadoop-hdds/common/src/main/resources/ozone-default.xml
+++ b/hadoop-hdds/common/src/main/resources/ozone-default.xml
@@ -2379,6 +2379,16 @@
request OM snapshot from OM Leader.
+
+ ozone.om.bootstrap.min.space
+ 5GB
+ OZONE, OM, HA, MANAGEMENT
+
+ Minimum free space required on the volume that holds ozone.om.ratis.snapshot.dir
+ before an OM follower downloads a ratis/bootstrap checkpoint from the leader.
+ Use storage size syntax (e.g. 10GB). Set to 0 to disable this check.
+
+
ozone.om.fs.snapshot.max.limit
diff --git a/hadoop-ozone/common/src/main/java/org/apache/hadoop/ozone/om/OMConfigKeys.java b/hadoop-ozone/common/src/main/java/org/apache/hadoop/ozone/om/OMConfigKeys.java
index 4e731f13851d..d14b259432e2 100644
--- a/hadoop-ozone/common/src/main/java/org/apache/hadoop/ozone/om/OMConfigKeys.java
+++ b/hadoop-ozone/common/src/main/java/org/apache/hadoop/ozone/om/OMConfigKeys.java
@@ -288,6 +288,10 @@ public final class OMConfigKeys {
OZONE_OM_SNAPSHOT_PROVIDER_REQUEST_TIMEOUT_DEFAULT =
TimeDuration.valueOf(300000, TimeUnit.MILLISECONDS);
+ public static final String OZONE_OM_BOOTSTRAP_MIN_SPACE_KEY =
+ "ozone.om.bootstrap.min.space";
+ public static final String OZONE_OM_BOOTSTRAP_MIN_SPACE_DEFAULT = "5GB"; // read via getStorageSize, in bytes
+
public static final String OZONE_OM_FS_SNAPSHOT_MAX_LIMIT =
"ozone.om.fs.snapshot.max.limit";
public static final int OZONE_OM_FS_SNAPSHOT_MAX_LIMIT_DEFAULT = 10000;
diff --git a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OzoneManager.java b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OzoneManager.java
index 0f354e21c0f5..6c6a97391287 100644
--- a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OzoneManager.java
+++ b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OzoneManager.java
@@ -56,6 +56,7 @@
import static org.apache.hadoop.ozone.om.OMConfigKeys.OZONE_DIR_DELETING_SERVICE_INTERVAL;
import static org.apache.hadoop.ozone.om.OMConfigKeys.OZONE_KEY_DELETING_LIMIT_PER_TASK;
import static org.apache.hadoop.ozone.om.OMConfigKeys.OZONE_OM_ADDRESS_KEY;
+import static org.apache.hadoop.ozone.om.OMConfigKeys.OZONE_OM_BOOTSTRAP_MIN_SPACE_KEY;
import static org.apache.hadoop.ozone.om.OMConfigKeys.OZONE_OM_EDEKCACHELOADER_INITIAL_DELAY_MS_DEFAULT;
import static org.apache.hadoop.ozone.om.OMConfigKeys.OZONE_OM_EDEKCACHELOADER_INITIAL_DELAY_MS_KEY;
import static org.apache.hadoop.ozone.om.OMConfigKeys.OZONE_OM_EDEKCACHELOADER_INTERVAL_MS_DEFAULT;
@@ -4051,6 +4052,13 @@ public synchronized TermIndex installSnapshotFromLeader(String leaderId) throws
omDBCheckpoint = omRatisSnapshotProvider.
downloadDBSnapshotFromLeader(leaderId);
} catch (IOException ex) {
+ if (OmRatisSnapshotProvider.isDiskFullOrQuotaIOException(ex)) {
+ LOG.error(
+ "Failed to download snapshot from leader {}: local disk appears full or over quota "
+ + "on the OM ratis snapshot volume (see previous ERROR for path/usable space). "
+ + "Free disk or raise {} before bootstrap can succeed.",
+ leaderId, OZONE_OM_BOOTSTRAP_MIN_SPACE_KEY);
+ }
LOG.error("Failed to download snapshot from Leader {}.", leaderId, ex);
cleanupCheckpoint(omDBCheckpoint);
return null;
diff --git a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/ratis_snapshot/OmRatisSnapshotProvider.java b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/ratis_snapshot/OmRatisSnapshotProvider.java
index a67c2e9e53b6..a205a2b4b60a 100644
--- a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/ratis_snapshot/OmRatisSnapshotProvider.java
+++ b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/ratis_snapshot/OmRatisSnapshotProvider.java
@@ -24,6 +24,8 @@
import static org.apache.hadoop.ozone.OzoneConsts.OZONE_DB_CHECKPOINT_REQUEST_TO_EXCLUDE_SST;
import static org.apache.hadoop.ozone.om.OMConfigKeys.OZONE_OM_DB_CHECKPOINT_USE_INODE_BASED_DEFAULT;
import static org.apache.hadoop.ozone.om.OMConfigKeys.OZONE_OM_DB_CHECKPOINT_USE_INODE_BASED_KEY;
+import static org.apache.hadoop.ozone.om.OMConfigKeys.OZONE_OM_BOOTSTRAP_MIN_SPACE_DEFAULT;
+import static org.apache.hadoop.ozone.om.OMConfigKeys.OZONE_OM_BOOTSTRAP_MIN_SPACE_KEY;
import static org.apache.hadoop.ozone.om.OMConfigKeys.OZONE_OM_HTTP_AUTH_TYPE;
import static org.apache.hadoop.ozone.om.OMConfigKeys.OZONE_OM_SNAPSHOT_PROVIDER_CONNECTION_TIMEOUT_DEFAULT;
import static org.apache.hadoop.ozone.om.OMConfigKeys.OZONE_OM_SNAPSHOT_PROVIDER_CONNECTION_TIMEOUT_KEY;
@@ -37,14 +39,17 @@
import java.io.OutputStream;
import java.net.HttpURLConnection;
import java.net.URL;
+import java.nio.file.FileSystemException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.List;
+import java.util.Locale;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.TimeUnit;
import org.apache.commons.io.FileUtils;
import org.apache.hadoop.hdds.conf.MutableConfigurationSource;
+import org.apache.hadoop.hdds.conf.StorageUnit;
import org.apache.hadoop.hdds.server.http.HttpConfig;
import org.apache.hadoop.hdds.utils.HAUtils;
import org.apache.hadoop.hdds.utils.LegacyHadoopConfigurationSource;
@@ -54,6 +59,7 @@
import org.apache.hadoop.hdfs.web.URLConnectionFactory;
import org.apache.hadoop.ozone.om.helpers.OMNodeDetails;
import org.apache.hadoop.security.SecurityUtil;
+import org.apache.hadoop.util.StringUtils;
import org.apache.hadoop.util.Time;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -81,11 +87,80 @@ public class OmRatisSnapshotProvider extends RDBSnapshotProvider {
private static final Logger LOG =
LoggerFactory.getLogger(OmRatisSnapshotProvider.class);
+  /**
+   * Tells whether {@code ioe}, or any throwable in its cause chain, looks like
+   * a local out-of-space or quota-exceeded failure.
+   */
+  public static boolean isDiskFullOrQuotaIOException(IOException ioe) {
+    Throwable current = ioe;
+    while (current != null) {
+      // FileSystemException carries a separate reason string; match it loosely.
+      final String reason = current instanceof FileSystemException
+          ? ((FileSystemException) current).getReason() : null;
+      if (reason != null) {
+        final String lowerReason = reason.toLowerCase(Locale.ROOT);
+        if (lowerReason.contains("no space") || lowerReason.contains("space left")
+            || lowerReason.contains("quota") || lowerReason.contains("enospc")) {
+          return true;
+        }
+      }
+      // Every throwable's message is matched against the longer, stricter phrases.
+      final String message = current.getMessage();
+      if (message != null) {
+        final String lowerMessage = message.toLowerCase(Locale.ROOT);
+        if (lowerMessage.contains("no space left on device") || lowerMessage.contains("enospc")
+            || lowerMessage.contains("disk quota exceeded") || lowerMessage.contains("quota exceeded")) {
+          return true;
+        }
+      }
+      current = current.getCause();
+    }
+    return false;
+  }
+
+  /** Describes usable space on the store of {@code pathOnVolume} (or of its parent); "unknown" on failure. */
+  private static String formatSnapshotVolumeUsableSpace(File pathOnVolume) {
+    try {
+      final Path probe = pathOnVolume.isDirectory() ? pathOnVolume.toPath() : pathOnVolume.toPath().getParent();
+      if (probe != null) {
+        final long freeBytes = Files.getFileStore(probe).getUsableSpace();
+        return String.format("%s (%d bytes)", StringUtils.byteDesc(freeBytes), freeBytes);
+      }
+      return "unknown";
+    } catch (Exception e) {
+      return "unknown (" + e.getMessage() + ")";
+    }
+  }
+
+  /**
+   * Emits an ERROR entry when {@code ioe} looks like a local disk-full or quota
+   * failure, so operators can tell it apart from network or leader-side errors.
+   * Does nothing for any other kind of failure.
+   */
+  private static void logDiskFullOrQuotaDuringDownload(
+      IOException ioe, File targetFile, String leaderNodeId, URL checkpointUrl) {
+    if (!isDiskFullOrQuotaIOException(ioe)) {
+      return;
+    }
+    // Raising the minimum cannot free space; it only makes the pre-download check fail fast.
+    LOG.error(
+        "OM ratis snapshot download from leader {} failed: disk full or filesystem quota while "
+            + "writing checkpoint file {} (checkpoint URL {}). Usable space on this volume: {}. "
+            + "Free disk on this OM node; raising {} makes the pre-download check catch this "
+            + "before any data is transferred.",
+        leaderNodeId,
+        targetFile.getAbsolutePath(), checkpointUrl,
+        formatSnapshotVolumeUsableSpace(targetFile), OZONE_OM_BOOTSTRAP_MIN_SPACE_KEY,
+        ioe);
+  }
+
private final Map peerNodesMap;
private final HttpConfig.Policy httpPolicy;
private final boolean spnegoEnabled;
private final URLConnectionFactory connectionFactory;
private final boolean useV2CheckpointApi;
+ /** Minimum usable bytes on snapshot volume before download; 0 = disabled. */
+ private final long bootstrapMinSpaceBytes;
public OmRatisSnapshotProvider(File snapshotDir,
Map peerNodesMap, HttpConfig.Policy httpPolicy,
@@ -96,6 +171,7 @@ public OmRatisSnapshotProvider(File snapshotDir,
this.spnegoEnabled = spnegoEnabled;
this.connectionFactory = connectionFactory;
this.useV2CheckpointApi = OZONE_OM_DB_CHECKPOINT_USE_INODE_BASED_DEFAULT;
+ this.bootstrapMinSpaceBytes = 0L;
}
public OmRatisSnapshotProvider(MutableConfigurationSource conf,
@@ -106,6 +182,10 @@ public OmRatisSnapshotProvider(MutableConfigurationSource conf,
peerNodesMap.putAll(peerNodeDetails);
this.useV2CheckpointApi = conf.getBoolean(OZONE_OM_DB_CHECKPOINT_USE_INODE_BASED_KEY,
OZONE_OM_DB_CHECKPOINT_USE_INODE_BASED_DEFAULT);
+ this.bootstrapMinSpaceBytes = (long) conf.getStorageSize(
+ OZONE_OM_BOOTSTRAP_MIN_SPACE_KEY,
+ OZONE_OM_BOOTSTRAP_MIN_SPACE_DEFAULT,
+ StorageUnit.BYTES);
this.httpPolicy = HttpConfig.getHttpPolicy(conf);
this.spnegoEnabled = conf.get(OZONE_OM_HTTP_AUTH_TYPE, "simple")
@@ -144,9 +224,57 @@ public void removeDecommissionedPeerNode(String decommNodeId) {
peerNodesMap.remove(decommNodeId);
}
+  /**
+   * Verifies that the filesystem holding {@link #getSnapshotDir()} has at least
+   * the configured minimum usable space for an OM bootstrap / install snapshot
+   * download. The check is skipped when the configured minimum is not positive.
+   *
+   * @throws IOException if the snapshot directory does not exist, or if usable
+   * space is below the configured minimum
+   */
+  void ensureBootstrapDiskSpace() throws IOException {
+    if (bootstrapMinSpaceBytes <= 0) {
+      LOG.debug("{} is 0 or negative; skipping bootstrap disk space check.",
+          OZONE_OM_BOOTSTRAP_MIN_SPACE_KEY);
+      return;
+    }
+
+    final File dir = getSnapshotDir();
+    final String dirPath = dir.getAbsolutePath();
+    final String required = StringUtils.byteDesc(bootstrapMinSpaceBytes);
+    if (!dir.exists()) {
+      throw new IOException(String.format(
+          "OM ratis snapshot directory %s does not exist; cannot verify "
+              + "%s (required %s)",
+          dirPath, OZONE_OM_BOOTSTRAP_MIN_SPACE_KEY, required));
+    }
+
+    // Usable space is the figure the file store reports as available to this JVM.
+    final long available = Files.getFileStore(dir.toPath()).getUsableSpace();
+    final String availableDesc = StringUtils.byteDesc(available);
+    if (available >= bootstrapMinSpaceBytes) {
+      LOG.info(
+          "Bootstrap disk space check passed for OM ratis snapshot dir {}: usable {} >= "
+              + "minimum {} ({})",
+          dirPath, availableDesc, required, OZONE_OM_BOOTSTRAP_MIN_SPACE_KEY);
+      return;
+    }
+
+    // The same text is both logged here and carried by the thrown exception.
+    final String failure = String.format(
+        "OM bootstrap / install snapshot aborted: volume containing ratis snapshot dir "
+            + "%s has usable space %s (%d bytes) but %s requires at least %s (%d bytes). "
+            + "Free disk on this OM host and increase %s if your checkpoints are larger.",
+        dirPath, availableDesc, available, OZONE_OM_BOOTSTRAP_MIN_SPACE_KEY,
+        required, bootstrapMinSpaceBytes, OZONE_OM_BOOTSTRAP_MIN_SPACE_KEY);
+    LOG.error(failure);
+    throw new IOException(failure);
+  }
+
@Override
public void downloadSnapshot(String leaderNodeID, File targetFile)
throws IOException {
+ ensureBootstrapDiskSpace();
OMNodeDetails leader = peerNodesMap.get(leaderNodeID);
URL omCheckpointUrl = leader.getOMDBCheckpointEndpointUrl(
useV2CheckpointApi, httpPolicy.isHttpEnabled(), true);
@@ -156,29 +284,32 @@ public void downloadSnapshot(String leaderNodeID, File targetFile)
HttpURLConnection connection = (HttpURLConnection)
connectionFactory.openConnection(omCheckpointUrl, spnegoEnabled);
- connection.setRequestMethod("POST");
- String contentTypeValue = "multipart/form-data; boundary=" +
- MULTIPART_FORM_DATA_BOUNDARY;
- connection.setRequestProperty("Content-Type", contentTypeValue);
- connection.setDoOutput(true);
-
- List existingFiles = useV2CheckpointApi ? HAUtils.getExistingFiles(getCandidateDir())
- : HAUtils.getExistingSstFilesRelativeToDbDir(getCandidateDir());
- writeFormData(connection, existingFiles);
-
- connection.connect();
- int errorCode = connection.getResponseCode();
- if ((errorCode != HTTP_OK) && (errorCode != HTTP_CREATED)) {
- throw new IOException("Unexpected exception when trying to reach " +
- "OM to download latest checkpoint. Checkpoint URL: " +
- omCheckpointUrl + ". ErrorCode: " + errorCode);
- }
+ try {
+ connection.setRequestMethod("POST");
+ String contentTypeValue = "multipart/form-data; boundary=" +
+ MULTIPART_FORM_DATA_BOUNDARY;
+ connection.setRequestProperty("Content-Type", contentTypeValue);
+ connection.setDoOutput(true);
+
+ List existingFiles = useV2CheckpointApi ? HAUtils.getExistingFiles(getCandidateDir())
+ : HAUtils.getExistingSstFilesRelativeToDbDir(getCandidateDir());
+ writeFormData(connection, existingFiles);
- try (InputStream inputStream = connection.getInputStream()) {
- downloadFileWithProgress(inputStream, targetFile);
+ connection.connect();
+ int errorCode = connection.getResponseCode();
+ if ((errorCode != HTTP_OK) && (errorCode != HTTP_CREATED)) {
+ throw new IOException("Unexpected exception when trying to reach " +
+ "OM to download latest checkpoint. Checkpoint URL: " +
+ omCheckpointUrl + ". ErrorCode: " + errorCode);
+ }
+
+ try (InputStream inputStream = connection.getInputStream()) {
+ downloadFileWithProgress(inputStream, targetFile);
+ }
} catch (IOException ex) {
+ logDiskFullOrQuotaDuringDownload(ex, targetFile, leaderNodeID, omCheckpointUrl);
boolean deleted = FileUtils.deleteQuietly(targetFile);
- if (!deleted) {
+ if (!deleted && targetFile.exists()) {
LOG.error("OM snapshot which failed to download {} cannot be deleted",
targetFile);
}
diff --git a/hadoop-ozone/ozone-manager/src/test/java/org/apache/hadoop/ozone/om/ratis_snapshot/TestOmRatisSnapshotProvider.java b/hadoop-ozone/ozone-manager/src/test/java/org/apache/hadoop/ozone/om/ratis_snapshot/TestOmRatisSnapshotProvider.java
index 2fb0f56ae890..9bc2f80f59f9 100644
--- a/hadoop-ozone/ozone-manager/src/test/java/org/apache/hadoop/ozone/om/ratis_snapshot/TestOmRatisSnapshotProvider.java
+++ b/hadoop-ozone/ozone-manager/src/test/java/org/apache/hadoop/ozone/om/ratis_snapshot/TestOmRatisSnapshotProvider.java
@@ -19,7 +19,11 @@
import static java.net.HttpURLConnection.HTTP_OK;
import static org.apache.hadoop.ozone.OzoneConsts.MULTIPART_FORM_DATA_BOUNDARY;
+import static org.junit.jupiter.api.Assertions.assertDoesNotThrow;
import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertFalse;
+import static org.junit.jupiter.api.Assertions.assertThrows;
+import static org.junit.jupiter.api.Assertions.assertTrue;
import static org.mockito.Mockito.any;
import static org.mockito.Mockito.anyBoolean;
import static org.mockito.Mockito.mock;
@@ -33,13 +37,16 @@
import java.net.HttpURLConnection;
import java.net.URL;
import java.nio.charset.StandardCharsets;
+import java.nio.file.FileSystemException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
+import org.apache.hadoop.hdds.conf.OzoneConfiguration;
import org.apache.hadoop.hdds.server.http.HttpConfig;
import org.apache.hadoop.hdfs.web.URLConnectionFactory;
import org.apache.hadoop.ozone.OzoneConsts;
+import org.apache.hadoop.ozone.om.OMConfigKeys;
import org.apache.hadoop.ozone.om.helpers.OMNodeDetails;
import org.apache.hadoop.security.authentication.client.AuthenticationException;
import org.junit.jupiter.api.BeforeEach;
@@ -80,6 +87,46 @@ public void setup(@TempDir File snapshotDir,
false, connectionFactory);
}
+  @Test
+  public void testIsDiskFullOrQuotaIOExceptionDetectsNoSpaceMessage() {
+    IOException noSpace = new IOException("No space left on device");
+    assertTrue(OmRatisSnapshotProvider.isDiskFullOrQuotaIOException(noSpace));
+  }
+
+  @Test
+  public void testIsDiskFullOrQuotaIOExceptionDetectsFileSystemExceptionReason() {
+    // The outer message is unrelated; only the nested cause carries the reason.
+    FileSystemException cause = new FileSystemException("p", null, "No space left on device");
+    assertTrue(OmRatisSnapshotProvider.isDiskFullOrQuotaIOException(new IOException("write failed", cause)));
+  }
+
+  @Test
+  public void testIsDiskFullOrQuotaIOExceptionReturnsFalseForOtherErrors() {
+    IOException networkError = new IOException("Connection reset");
+    assertFalse(OmRatisSnapshotProvider.isDiskFullOrQuotaIOException(networkError));
+  }
+
+  @Test
+  public void testBootstrapDiskSpaceCheckSkippedWhenZero(@TempDir File snapshotDir) {
+    // A zero-sized minimum makes ensureBootstrapDiskSpace return without checking.
+    OzoneConfiguration zeroMinConf = new OzoneConfiguration();
+    zeroMinConf.set(OMConfigKeys.OZONE_OM_BOOTSTRAP_MIN_SPACE_KEY, "0GB");
+    OmRatisSnapshotProvider underTest = new OmRatisSnapshotProvider(zeroMinConf, snapshotDir, new HashMap<>());
+    assertDoesNotThrow(() -> underTest.ensureBootstrapDiskSpace());
+  }
+
+  @Test
+  public void testBootstrapDiskSpaceCheckFailsWhenBelowMinimum(@TempDir File snapshotDir) {
+    OzoneConfiguration conf = new OzoneConfiguration();
+    // Deliberately enormous minimum so the temp dir's volume cannot satisfy it.
+    conf.set(OMConfigKeys.OZONE_OM_BOOTSTRAP_MIN_SPACE_KEY, "1024EB");
+    OmRatisSnapshotProvider provider =
+        new OmRatisSnapshotProvider(conf, snapshotDir, new HashMap<>());
+    IOException ex = assertThrows(IOException.class, provider::ensureBootstrapDiskSpace);
+    assertTrue(ex.getMessage().contains(OMConfigKeys.OZONE_OM_BOOTSTRAP_MIN_SPACE_KEY),
+        "Exception message should name the config key: " + ex.getMessage());
+  }
+
@Test
public void testDownloadSnapshot() throws IOException,
AuthenticationException {