diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/IORateLimiter.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/IORateLimiter.java new file mode 100644 index 0000000000000..0e33f3afe910f --- /dev/null +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/IORateLimiter.java @@ -0,0 +1,85 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.fs; + +import java.time.Duration; + +import org.apache.hadoop.classification.InterfaceAudience; +import org.apache.hadoop.classification.InterfaceStability; + +/** + * An optional interface for classes that provide rate limiters. + * For a filesystem source, the operation name SHOULD be one of + * those listed in + * {@link org.apache.hadoop.fs.statistics.StoreStatisticNames} + * if the operation is listed there. + *
+ * This interface is intended to be exported by FileSystems so that + * applications wishing to perform bulk operations may request access + * to a rate limiter which is shared across all threads interacting + * with the store. + * That is: the rate limiting is global to the specific instance of the + * object implementing this interface. + *
+ * It is not expected to be shared with other instances of the same + * class, or across processes. + *
+ * This means it is primarily of benefit when limiting bulk operations + * which can overload an (object) store from a small pool of threads. + * Examples of this can include: + *
+ * The implementation may assign different costs to the different + * operations. + *
+ * If there is not enough space, the permits will be acquired, + * but the subsequent call will block until the capacity has been + * refilled. + *
+ * The path parameter is used to support stores where there may be different throttling + * under different paths. + * @param operation operation being performed. Must not be null, may be "", + * should be from {@link org.apache.hadoop.fs.statistics.StoreStatisticNames} + * where there is a matching operation. + * @param src path under which the operations will be initiated. + * @param dest destination path for rename operations + * @param requestedCapacity capacity to acquire. + * Must be greater than or equal to 0. + * @return time spent waiting for output. + */ + Duration acquireIOCapacity(String operation, Path src, Path dest, int requestedCapacity); + +} diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/impl/IORateLimiterSupport.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/impl/IORateLimiterSupport.java new file mode 100644 index 0000000000000..b9ed4c115ef4b --- /dev/null +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/impl/IORateLimiterSupport.java @@ -0,0 +1,58 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.fs.impl; + +import org.apache.hadoop.fs.IORateLimiter; +import org.apache.hadoop.util.RateLimiting; +import org.apache.hadoop.util.RateLimitingFactory; + +import static java.util.Objects.requireNonNull; + +/** + * Implementation support for the IO rate limiter. + */ +public final class IORateLimiterSupport { + + private IORateLimiterSupport() { + } + + /** + * Get a rate limiter source which has no rate limiting. + * @return a rate limiter source which has no rate limiting. + */ + public static IORateLimiter unlimited() { + return (operation, src, dest, requestedCapacity) -> { + requireNonNull(operation, "operation"); + return RateLimitingFactory.unlimitedRate().acquire(requestedCapacity); + }; + } + + /** + * Create a rate limiter with a fixed capacity. + * @param capacityPerSecond capacity per second. + * @return a rate limiter. 
+ */ + public static IORateLimiter create(int capacityPerSecond) { + final RateLimiting limiting = RateLimitingFactory.create(capacityPerSecond); + return (operation, src, dest, requestedCapacity) -> { + requireNonNull(operation, "operation"); + return limiting.acquire(requestedCapacity); + }; + } +} diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/statistics/StoreStatisticNames.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/statistics/StoreStatisticNames.java index 19ee9d1414ecf..1970dbb68d0fd 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/statistics/StoreStatisticNames.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/statistics/StoreStatisticNames.java @@ -63,6 +63,9 @@ public final class StoreStatisticNames { /** {@value}. */ public static final String OP_DELETE = "op_delete"; + /** {@value}. */ + public static final String OP_DELETE_DIR = "op_delete_dir"; + /** {@value}. */ public static final String OP_EXISTS = "op_exists"; diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util/RateLimiting.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util/RateLimiting.java index ae119c0e630f4..367e236dac8b7 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util/RateLimiting.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util/RateLimiting.java @@ -28,8 +28,10 @@ * Can be used to throttle use of object stores where excess load * will trigger cluster-wide throttling, backoff etc. and so collapse * performance. + *
* The time waited is returned as a Duration type. - * The google rate limiter implements this by allowing a caller to ask for + *
+ * The google rate limiter implements rate limiting by allowing a caller to ask for * more capacity than is available. This will be granted * but the subsequent request will be blocked if the bucket of * capacity hasn't let refilled to the point where there is @@ -44,8 +46,11 @@ public interface RateLimiting { * If there is not enough space, the permits will be acquired, * but the subsequent call will block until the capacity has been * refilled. + *
+ * If the capacity is zero, no delay will take place.
* @param requestedCapacity capacity to acquire.
- * @return time spent waiting for output.
+ * Must be greater than or equal to 0.
+ * @return time spent waiting to acquire the capacity.
*/
Duration acquire(int requestedCapacity);
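For orientation, a minimal sketch of how the two new pieces compose: `RateLimitingFactory`/`RateLimiting` provide the underlying shared token bucket, and `IORateLimiterSupport.create()` (added above) exposes it through the `IORateLimiter.acquireIOCapacity()` signature. The operation names, paths and capacities below are illustrative only.

```java
import java.time.Duration;

import org.apache.hadoop.fs.IORateLimiter;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.impl.IORateLimiterSupport;
import org.apache.hadoop.fs.statistics.StoreStatisticNames;

public class RateLimiterSketch {
  public static void main(String[] args) {
    // one limiter shared by every thread using this instance: 100 permits/second.
    IORateLimiter limiter = IORateLimiterSupport.create(100);

    // asking for more than a second's worth of capacity is granted immediately,
    // but leaves the bucket in debt...
    Duration first = limiter.acquireIOCapacity(
        StoreStatisticNames.OP_DELETE_DIR, new Path("/out"), null, 150);

    // ...so the next acquisition blocks until the bucket has refilled.
    Duration second = limiter.acquireIOCapacity(
        StoreStatisticNames.OP_DELETE, new Path("/out/part-0000"), null, 1);

    System.out.println("first wait: " + first + ", second wait: " + second);
  }
}
```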
diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util/RateLimitingFactory.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util/RateLimitingFactory.java
index 621415456e125..fb5c45f0e0305 100644
--- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util/RateLimitingFactory.java
+++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util/RateLimitingFactory.java
@@ -24,6 +24,8 @@
import org.apache.hadoop.classification.InterfaceStability;
import org.apache.hadoop.thirdparty.com.google.common.util.concurrent.RateLimiter;
+import static org.apache.hadoop.util.Preconditions.checkArgument;
+
/**
* Factory for Rate Limiting.
* This should be only place in the code where the guava RateLimiter is imported.
@@ -50,6 +52,7 @@ private static class NoRateLimiting implements RateLimiting {
@Override
public Duration acquire(int requestedCapacity) {
+ checkArgument(requestedCapacity >= 0, "requestedCapacity must be >= 0");
return INSTANTLY;
}
}
@@ -70,6 +73,11 @@ private RestrictedRateLimiting(int capacityPerSecond) {
@Override
public Duration acquire(int requestedCapacity) {
+ checkArgument(requestedCapacity >= 0, "requestedCapacity must be >= 0");
+ if (requestedCapacity == 0) {
+ // the guava rate limiter rejects a zero permit request; return immediately.
+ return INSTANTLY;
+ }
final double delayMillis = limiter.acquire(requestedCapacity);
return delayMillis == 0
? INSTANTLY
diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util/functional/FutureIO.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util/functional/FutureIO.java
index c3fda19d8d73b..0dff801c92351 100644
--- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util/functional/FutureIO.java
+++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util/functional/FutureIO.java
@@ -21,6 +21,8 @@
import java.io.IOException;
import java.io.InterruptedIOException;
import java.io.UncheckedIOException;
+import java.time.Duration;
+import java.util.Collection;
import java.util.Map;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.CompletionException;
@@ -29,6 +31,9 @@
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.classification.InterfaceStability;
import org.apache.hadoop.conf.Configuration;
@@ -55,6 +60,9 @@
@InterfaceStability.Unstable
public final class FutureIO {
+ private static final Logger LOG =
+ LoggerFactory.getLogger(FutureIO.class);
+
private FutureIO() {
}
@@ -275,4 +283,44 @@ public static
+ * If it returns without an error: there is nothing at
+ * the end of the path.
+ * @param path path
+ * @return outcome
+ * @throws IOException failure.
+ */
+ public boolean deleteFile(Path path)
+ throws IOException {
+ return delete(path, true);
+ }
+
+ /**
+ * Acquire the delete capacity then call {@code FileSystem#delete(Path, true)}
+ * or equivalent.
+ *
+ * If it returns without an error: there is nothing at
+ * the end of the path.
+ * @param path path
+ * @param capacity IO capacity to ask for.
+ * @return outcome
+ * @throws IOException failure.
+ */
+ public abstract boolean rmdir(Path path, int capacity)
+ throws IOException;
+
/**
* Forward to {@link FileSystem#mkdirs(Path)}.
* Usual "what does 'false' mean" ambiguity.
@@ -288,4 +319,22 @@ public Duration getWaitTime() {
}
}
+ /**
+ * Get the rate limiter source.
+ * Shall never be null; may be unlimited.
+ * @return the rate limiter source.
+ */
+ public IORateLimiter rateLimiterSource() {
+ return IORateLimiterSupport.unlimited();
+ }
+
+ /**
+ * Delegate to {@link #rateLimiterSource()}.
+ * {@inheritDoc}
+ */
+ @Override
+ public Duration acquireIOCapacity(final String operation, final Path src, final Path dest, final int requestedCapacity) {
+ return rateLimiterSource().acquireIOCapacity(operation, new Path("/"), null, requestedCapacity);
+ }
+
}
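Taken together, the intent here appears to be that the abstract store-operations class is itself an `IORateLimiter`: by default `rateLimiterSource()` returns the unlimited limiter, so every `acquireIOCapacity()` call completes instantly, while a subclass that wants real throttling only needs to override `rateLimiterSource()`; the per-operation `acquireIOCapacity()` calls added below in `ManifestStoreOperationsThroughFileSystem` then draw from whatever limiter that source supplies.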
diff --git a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/impl/ManifestStoreOperationsThroughFileSystem.java b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/impl/ManifestStoreOperationsThroughFileSystem.java
index 9a0b972bc735b..ec5023c484042 100644
--- a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/impl/ManifestStoreOperationsThroughFileSystem.java
+++ b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/impl/ManifestStoreOperationsThroughFileSystem.java
@@ -25,12 +25,21 @@
import org.apache.hadoop.fs.CommonPathCapabilities;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.IORateLimiter;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.RemoteIterator;
import org.apache.hadoop.mapreduce.lib.output.committer.manifest.files.AbstractManifestData;
import org.apache.hadoop.mapreduce.lib.output.committer.manifest.files.TaskManifest;
import org.apache.hadoop.util.JsonSerialization;
+import static org.apache.hadoop.fs.statistics.StoreStatisticNames.*;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.impl.InternalConstants.DELETE_DIR_CAPACITY;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.impl.InternalConstants.DELETE_FILE_CAPACITY;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.impl.InternalConstants.GET_FILE_STATUS_CAPACITY;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.impl.InternalConstants.LIST_CAPACITY;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.impl.InternalConstants.MKDIRS_CAPACITY;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.impl.InternalConstants.RENAME_CAPACITY;
+
/**
* Implementation of manifest store operations through the filesystem API.
* This class is subclassed in the ABFS module, which does add the resilient
@@ -87,6 +96,7 @@ public void bindToFileSystem(FileSystem filesystem, Path path) throws IOExceptio
@Override
public FileStatus getFileStatus(Path path) throws IOException {
+ acquireIOCapacity(OP_GET_FILE_STATUS, new Path("/"), null, GET_FILE_STATUS_CAPACITY);
return fileSystem.getFileStatus(path);
}
@@ -99,30 +109,47 @@ public FileStatus getFileStatus(Path path) throws IOException {
@SuppressWarnings("deprecation")
@Override
public boolean isFile(Path path) throws IOException {
+ acquireIOCapacity(OP_IS_FILE, new Path("/"), null, GET_FILE_STATUS_CAPACITY);
return fileSystem.isFile(path);
}
+ /**
+ * Delete a path.
+ * The capacity to acquire is based on the recursive flag.
+ * {@inheritDoc}
+ */
@Override
public boolean delete(Path path, boolean recursive)
throws IOException {
+ acquireIOCapacity(OP_DELETE,
+ new Path("/"), null, recursive ? DELETE_FILE_CAPACITY : DELETE_DIR_CAPACITY);
return fileSystem.delete(path, recursive);
}
+ @Override
+ public boolean rmdir(final Path path, final int capacity) throws IOException {
+ acquireIOCapacity(OP_DELETE_DIR, new Path("/"), null, capacity);
+ return fileSystem.delete(path, true);
+ }
+
@Override
public boolean mkdirs(Path path)
throws IOException {
+ acquireIOCapacity(OP_MKDIRS, new Path("/"), null, MKDIRS_CAPACITY);
return fileSystem.mkdirs(path);
}
@Override
public boolean renameFile(Path source, Path dest)
throws IOException {
+ acquireIOCapacity(OP_RENAME, new Path("/"), null, RENAME_CAPACITY);
return fileSystem.rename(source, dest);
}
@Override
public RemoteIterator
+ * If it returns without an error: there is nothing at
+ * the end of the path.
+ * @param path path
+ * @param statistic statistic to update
+ * @return outcome.
+ * @throws IOException IO Failure.
+ */
+ protected boolean deleteFile(
+ final Path path,
+ final String statistic)
+ throws IOException {
+ return trackDuration(getIOStatistics(), statistic, () ->
+ operations.deleteFile(path));
}
/**
@@ -690,6 +710,8 @@ protected boolean storeSupportsResilientCommit() {
* Maybe delete the destination.
* This routine is optimized for the data not existing, as HEAD seems to cost less
* than a DELETE; assuming most calls don't have data, this is faster.
+ * If the destination exists and is a directory, {@link #deleteDir(Path, String)} is invoked,
+ * which requests more IO capacity than a file delete.
* @param deleteDest should an attempt to delete the dest be made?
* @param dest destination path
* @throws IOException IO failure, including permissions.
@@ -697,11 +719,14 @@ protected boolean storeSupportsResilientCommit() {
private void maybeDeleteDest(final boolean deleteDest, final Path dest) throws IOException {
if (deleteDest && getFileStatusOrNull(dest) != null) {
-
- boolean deleted = delete(dest, true);
- // log the outcome in case of emergency diagnostics traces
- // being needed.
- LOG.debug("{}: delete('{}') returned {}'", getName(), dest, deleted);
+ final FileStatus st = getFileStatusOrNull(dest);
+ if (st != null) {
+ if (st.isDirectory()) {
+ deleteDir(dest, OP_DELETE_DIR);
+ } else {
+ deleteFile(dest, OP_DELETE);
+ }
+ }
}
}
@@ -915,26 +940,36 @@ protected final TaskPool.Submitter getIOProcessors(int size) {
}
/**
- * Delete a directory, possibly suppressing exceptions.
+ * Delete a directory.
* @param dir directory.
- * @param suppressExceptions should exceptions be suppressed?
+ * @param statistic statistic to use
+ * @return true if the path is no longer present.
* @throws IOException exceptions raised in delete if not suppressed.
- * @return any exception caught and suppressed
*/
- protected IOException deleteDir(
+ protected boolean deleteDir(
final Path dir,
- final Boolean suppressExceptions)
+ final String statistic)
+ throws IOException {
+ return trackDuration(getIOStatistics(), statistic, () ->
+ operations.rmdir(dir, stageConfig.getDeleteDirCapacity()));
+ }
+
+ /**
+ * Delete a directory, suppressing exceptions.
+ * @param dir directory.
+ * @param statistic statistic to use
+ * @return any exception caught.
+ */
+ protected IOException deleteDirSuppressingExceptions(
+ final Path dir,
+ final String statistic)
throws IOException {
try {
- delete(dir, true);
+ deleteDir(dir, statistic);
return null;
} catch (IOException ex) {
LOG.info("Error deleting {}: {}", dir, ex.toString());
- if (!suppressExceptions) {
- throw ex;
- } else {
- return ex;
- }
+ return ex;
}
}
diff --git a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/stages/CleanupJobStage.java b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/stages/CleanupJobStage.java
index 77b80aaf67fd6..b36027c99055d 100644
--- a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/stages/CleanupJobStage.java
+++ b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/stages/CleanupJobStage.java
@@ -35,6 +35,7 @@
import static java.util.Objects.requireNonNull;
import static org.apache.hadoop.fs.statistics.IOStatisticsSupport.retrieveIOStatistics;
+import static org.apache.hadoop.fs.statistics.StoreStatisticNames.OP_DELETE_DIR;
import static org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter.FILEOUTPUTCOMMITTER_CLEANUP_FAILURES_IGNORED;
import static org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter.FILEOUTPUTCOMMITTER_CLEANUP_FAILURES_IGNORED_DEFAULT;
import static org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter.FILEOUTPUTCOMMITTER_CLEANUP_SKIPPED;
@@ -49,7 +50,7 @@
* Returns: the outcome of the overall operation
* The result is detailed purely for the benefit of tests, which need
* to make assertions about error handling and fallbacks.
- *
+ *
* There's a few known issues with the azure and GCS stores which
* this stage tries to address.
* - Google GCS directory deletion is O(entries), so is slower for big jobs.
@@ -57,19 +58,21 @@
* when not the store owner triggers a scan down the tree to verify the
* caller has the permission to delete each subdir.
* If this scan takes over 90s, the operation can time out.
- *
+ * - Azure storage requires IO capacity based on the number of subdirectories.
+ *
* The main solution for both of these is that task attempts are
* deleted in parallel, in different threads.
* This will speed up GCS cleanup and reduce the risk of
* abfs related timeouts.
+ *
* Exceptions during cleanup can be suppressed,
* so that these do not cause the job to fail.
- *
+ *
* Also, some users want to be able to run multiple independent jobs
* targeting the same output directory simultaneously.
* If one job deletes the directory `__temporary` all the others
* will fail.
- *
+ *
* This can be addressed by disabling cleanup entirely.
*
*/
@@ -219,7 +222,7 @@ protected Result executeStage(
* Delete a single TA dir in a parallel task.
* Updates the audit context.
* Exceptions are swallowed so that attempts are still made
- * to delete the others, but the first exception
+ * to delete the others; the last exception
* caught is saved in a field which can be retrieved
* via {@link #getLastDeleteException()}.
*
@@ -246,7 +249,7 @@ private IOException deleteOneDir(final Path dir)
throws IOException {
deleteDirCount.incrementAndGet();
- IOException ex = deleteDir(dir, true);
+ final IOException ex = deleteDirSuppressingExceptions(dir, OP_DELETE_DIR);
if (ex != null) {
deleteFailure(ex);
}
@@ -258,8 +261,9 @@ private IOException deleteOneDir(final Path dir)
* @param ex exception
*/
private synchronized void deleteFailure(IOException ex) {
- // excaption: add the count
+ // exception: add the count
deleteFailureCount.incrementAndGet();
+ // and save the exception, overwriting any predecessor.
lastDeleteException = ex;
}
@@ -343,8 +347,7 @@ public String toString() {
public static final Arguments DISABLED = new Arguments(OP_STAGE_JOB_CLEANUP,
false,
false,
- false
- );
+ false);
/**
* Build an options argument from a configuration, using the
diff --git a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/stages/CreateOutputDirectoriesStage.java b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/stages/CreateOutputDirectoriesStage.java
index 1618cf591a590..d3c66ee1d70a0 100644
--- a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/stages/CreateOutputDirectoriesStage.java
+++ b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/stages/CreateOutputDirectoriesStage.java
@@ -237,7 +237,6 @@ private void deleteDirWithFile(Path dir) throws IOException {
addToDirectoryMap(dir, DirMapState.fileNowDeleted);
}
-
/**
* Create a directory is required, updating the directory map
* and, if the operation took place, the list of created dirs.
@@ -323,7 +322,7 @@ private DirMapState maybeCreateOneDirectory(DirEntry dirEntry) throws IOExceptio
// is bad: delete a file
LOG.info("{}: Deleting file where a directory should go: {}",
getName(), st);
- delete(path, false, OP_DELETE_FILE_UNDER_DESTINATION);
+ deleteFile(path, OP_DELETE_FILE_UNDER_DESTINATION);
} else {
// is good.
LOG.warn("{}: Even though mkdirs({}) failed, there is now a directory there",
diff --git a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/stages/SaveTaskManifestStage.java b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/stages/SaveTaskManifestStage.java
index fdaf0184cda20..4a17468c068e4 100644
--- a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/stages/SaveTaskManifestStage.java
+++ b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/stages/SaveTaskManifestStage.java
@@ -27,6 +27,7 @@
import org.apache.hadoop.mapreduce.lib.output.committer.manifest.files.TaskManifest;
import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterStatisticNames.OP_STAGE_TASK_SAVE_MANIFEST;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.impl.InternalConstants.TASK_COMMIT_RETRY_COUNT;
import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.impl.ManifestCommitterSupport.manifestPathForTask;
import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.impl.ManifestCommitterSupport.manifestTempPathForTaskAttempt;
@@ -38,12 +39,32 @@
* Uses both the task ID and task attempt ID to determine the temp filename;
* Before the rename of (temp, final-path), any file at the final path
* is deleted.
+ *
* This is so that when this stage is invoked in a task commit, its output
* overwrites any of the first commit.
* When it succeeds, therefore, unless there is any subsequent commit of
* another task, the task manifest at the final path is from this
* operation.
- *
+ *
+ * If the save and rename fails, there are a limited number of retries, with no sleep
+ * interval.
+ * This is to briefly try to recover from any transient rename() failure, including a
+ * race condition with any other task commit.
+ *
* Returns the path where the manifest was saved.
*/
public class SaveTaskManifestStage extends
@@ -73,8 +94,21 @@ protected Path executeStage(final TaskManifest manifest)
getRequiredTaskId());
Path manifestTempFile = manifestTempPathForTaskAttempt(manifestDir,
getRequiredTaskAttemptId());
- LOG.info("{}: Saving manifest file to {}", getName(), manifestFile);
- save(manifest, manifestTempFile, manifestFile);
+ int limit = TASK_COMMIT_RETRY_COUNT;
+ boolean success = false;
+ do {
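+ // Attempt the save/rename; on an IOException, log it and retry,
+ // rethrowing once the retry count is exhausted.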
+ try {
+ LOG.info("{}: Saving manifest file to {}", getName(), manifestFile);
+ save(manifest, manifestTempFile, manifestFile);
+ success = true;
+ } catch (IOException e) {
+ LOG.warn("Failed to save manifest to {} via temp file {} and rename()",
+ manifestFile, manifestTempFile, e);
+ if (--limit < 0) {
+ throw e;
+ }
+ }
+ } while (!success);
return manifestFile;
}
diff --git a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/stages/StageConfig.java b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/stages/StageConfig.java
index b716d2f4b7f0c..3d77854905158 100644
--- a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/stages/StageConfig.java
+++ b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/committer/manifest/stages/StageConfig.java
@@ -32,6 +32,7 @@
import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterConstants.DEFAULT_WRITER_QUEUE_CAPACITY;
import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterConstants.SUCCESS_MARKER;
import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterConstants.SUCCESS_MARKER_FILE_LIMIT;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.impl.InternalConstants.DELETE_DIR_CAPACITY;
/**
* Stage Config.
@@ -172,6 +173,11 @@ public class StageConfig {
*/
private int successMarkerFileLimit = SUCCESS_MARKER_FILE_LIMIT;
+ /**
+ * Capacity for directory delete operations.
+ */
+ private int deleteDirCapacity = DELETE_DIR_CAPACITY;
+
public StageConfig() {
}
@@ -604,6 +610,24 @@ public int getSuccessMarkerFileLimit() {
return successMarkerFileLimit;
}
+ /**
+ * Get the capacity for directory delete operations.
+ * @return the capacity
+ */
+ public int getDeleteDirCapacity() {
+ return deleteDirCapacity;
+ }
+
+ /**
+ * Set builder value.
+ * @param value new value
+ * @return the builder
+ */
+ public StageConfig withDeleteDirCapacity(final int value) {
+ deleteDirCapacity = value;
+ return this;
+ }
+
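As with the other builder-style setters in this class, the new method returns `this`, so callers can chain it, e.g. `new StageConfig().withDeleteDirCapacity(100)`; the value 100 is purely illustrative, and the default comes from `InternalConstants.DELETE_DIR_CAPACITY`.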
/**
* Enter the stage; calls back to
* {@link #enterStageEventHandler} if non-null.
diff --git a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/site/markdown/manifest_committer.md b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/site/markdown/manifest_committer.md
index da199a48d14c0..c63552f4c20a5 100644
--- a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/site/markdown/manifest_committer.md
+++ b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/site/markdown/manifest_committer.md
@@ -17,7 +17,7 @@
This document describes how to use the _Manifest Committer_.
-The _Manifest_ committer is a committer for work which provides
+The _Manifest Committer_ is a committer for work which provides
performance on ABFS for "real world" queries,
and performance and correctness on GCS.
It also works with other filesystems, including HDFS.
@@ -523,14 +523,14 @@ And optional settings for debugging/performance analysis
```
spark.hadoop.mapreduce.outputcommitter.factory.scheme.abfs org.apache.hadoop.fs.azurebfs.commit.AzureManifestCommitterFactory
-spark.hadoop.fs.azure.io.rate.limit 10000
+spark.hadoop.fs.azure.io.rate.limit 1000
spark.sql.parquet.output.committer.class org.apache.spark.internal.io.cloud.BindingParquetOutputCommitter
spark.sql.sources.commitProtocolClass org.apache.spark.internal.io.cloud.PathOutputCommitProtocol
spark.hadoop.mapreduce.manifest.committer.summary.report.directory (optional: URI of a directory for job summaries)
```
-## Experimental: ABFS Rename Rate Limiting `fs.azure.io.rate.limit`
+## ABFS Rename Rate Limiting `fs.azure.io.rate.limit`
To avoid triggering store throttling and backoff delays, as well as other
throttling-related failure conditions, file renames during job commit
@@ -538,19 +538,23 @@ are throttled through a "rate limiter" which limits the number of
rename operations per second a single instance of the ABFS FileSystem client
may issue.
-| Option | Meaning |
-|--------|---------|
-| `fs.azure.io.rate.limit` | Rate limit in operations/second for IO operations. |
+| Option | Meaning |
+|----------------------------------------------------|----------------------------------------------------|
+| `fs.azure.io.rate.limit` | Rate limit in operations/second for IO operations. |
+| `mapreduce.manifest.committer.delete.dir.capacity` | Write capacity to request for directory deletion. |
+### Option `fs.azure.io.rate.limit`
+
+This is the number of IOPS to allocate for reading and writing during task and job commit.
Set the option to `0` to remove all rate limiting.
-The default value of this is set to 10000, which is the default IO capacity for
-an ADLS storage account.
+The default value of this is set to 1000.
+
```xml
+ * Some operations (list and rename) are considered more expensive and so whatever capacity
+ * is asked for is multiplied.
+ * {@inheritDoc}
+ */
+ @Override
+ public Duration acquireIOCapacity(final String operation, final Path src, final Path dest, final int requestedCapacity) {
+
+ double multiplier;
+ int lowCost = 1;
+ int mediumCost = 10;
+ switch (operation) {
+ case OP_LIST_FILES:
+ case OP_LIST_STATUS:
+ case OP_LIST_LOCATED_STATUS:
+ case OP_RENAME:
+ multiplier = mediumCost;
+ break;
+ default:
+ multiplier = lowCost;
+ }
+ final int capacity = (int) (requestedCapacity * multiplier);
+ LOG.debug("Acquiring IO capacity {} for operation: {}; multiplier: {}; final capacity: {}",
+ requestedCapacity, operation, multiplier, capacity);
+ return rateLimiting.acquire(capacity);
+ }
}
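For a sense of scale: with the new default rate limit of 1,000 permits/second (see `RATE_LIMIT_DEFAULT` later in this patch), a list or rename request is amplified tenfold, so a single `acquireIOCapacity()` call asking for 100 units of capacity consumes the entire per-second budget and a second such call in the same second will block for roughly a second. The per-operation capacities themselves (`RENAME_CAPACITY` and friends) come from `InternalConstants` and are not shown in this diff.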
diff --git a/hadoop-tools/hadoop-azure/src/main/java/org/apache/hadoop/fs/azurebfs/commit/AbfsManifestStoreOperations.java b/hadoop-tools/hadoop-azure/src/main/java/org/apache/hadoop/fs/azurebfs/commit/AbfsManifestStoreOperations.java
index 6bfab3a8515a9..a5a7819617af1 100644
--- a/hadoop-tools/hadoop-azure/src/main/java/org/apache/hadoop/fs/azurebfs/commit/AbfsManifestStoreOperations.java
+++ b/hadoop-tools/hadoop-azure/src/main/java/org/apache/hadoop/fs/azurebfs/commit/AbfsManifestStoreOperations.java
@@ -92,7 +92,7 @@ public void bindToFileSystem(FileSystem filesystem, Path path) throws IOExceptio
etagsPreserved = true;
LOG.debug("Bonded to filesystem with resilient commits under path {}", path);
} catch (UnsupportedOperationException e) {
- LOG.debug("No resilient commit support under path {}", path);
+ LOG.warn("No resilient commit support under path {}", path);
}
}
diff --git a/hadoop-tools/hadoop-azure/src/main/java/org/apache/hadoop/fs/azurebfs/commit/AzureManifestCommitterFactory.java b/hadoop-tools/hadoop-azure/src/main/java/org/apache/hadoop/fs/azurebfs/commit/AzureManifestCommitterFactory.java
index b760fa7a4ac53..01fd9ec50e42a 100644
--- a/hadoop-tools/hadoop-azure/src/main/java/org/apache/hadoop/fs/azurebfs/commit/AzureManifestCommitterFactory.java
+++ b/hadoop-tools/hadoop-azure/src/main/java/org/apache/hadoop/fs/azurebfs/commit/AzureManifestCommitterFactory.java
@@ -20,10 +20,14 @@
import java.io.IOException;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.classification.InterfaceStability;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.azurebfs.AzureBlobFileSystemStore;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitter;
import org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterFactory;
@@ -41,10 +45,12 @@
@InterfaceStability.Evolving
public class AzureManifestCommitterFactory extends ManifestCommitterFactory {
+ private static final Logger LOG = LoggerFactory.getLogger(AzureManifestCommitterFactory.class);
+
/**
* Classname, which can be declared in job configurations.
*/
- public static final String NAME = ManifestCommitterFactory.class.getName();
+ public static final String NAME = AzureManifestCommitterFactory.class.getName();
@Override
public ManifestCommitter createOutputCommitter(final Path outputPath,
diff --git a/hadoop-tools/hadoop-azure/src/main/java/org/apache/hadoop/fs/azurebfs/constants/FileSystemConfigurations.java b/hadoop-tools/hadoop-azure/src/main/java/org/apache/hadoop/fs/azurebfs/constants/FileSystemConfigurations.java
index dd4d7edc6beda..485aa031d702a 100644
--- a/hadoop-tools/hadoop-azure/src/main/java/org/apache/hadoop/fs/azurebfs/constants/FileSystemConfigurations.java
+++ b/hadoop-tools/hadoop-azure/src/main/java/org/apache/hadoop/fs/azurebfs/constants/FileSystemConfigurations.java
@@ -158,7 +158,7 @@ public final class FileSystemConfigurations {
/**
* IO rate limit. Value: {@value}
*/
- public static final int RATE_LIMIT_DEFAULT = 10_000;
+ public static final int RATE_LIMIT_DEFAULT = 1_000;
private FileSystemConfigurations() {}
}
diff --git a/hadoop-tools/hadoop-azure/src/test/java/org/apache/hadoop/fs/azurebfs/commit/ITestAbfsManifestStoreOperations.java b/hadoop-tools/hadoop-azure/src/test/java/org/apache/hadoop/fs/azurebfs/commit/ITestAbfsManifestStoreOperations.java
index 922782da29c5f..3d9e2e783403e 100644
--- a/hadoop-tools/hadoop-azure/src/test/java/org/apache/hadoop/fs/azurebfs/commit/ITestAbfsManifestStoreOperations.java
+++ b/hadoop-tools/hadoop-azure/src/test/java/org/apache/hadoop/fs/azurebfs/commit/ITestAbfsManifestStoreOperations.java
@@ -19,6 +19,8 @@
package org.apache.hadoop.fs.azurebfs.commit;
import java.nio.charset.StandardCharsets;
+import java.time.Duration;
+import java.time.temporal.ChronoUnit;
import org.assertj.core.api.Assertions;
import org.junit.Test;
@@ -28,6 +30,7 @@
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.IORateLimiter;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.azurebfs.contract.ABFSContractTestBinding;
import org.apache.hadoop.fs.azurebfs.contract.AbfsFileSystemContract;
@@ -39,6 +42,9 @@
import static org.apache.hadoop.fs.CommonPathCapabilities.ETAGS_PRESERVED_IN_RENAME;
import static org.apache.hadoop.fs.azurebfs.commit.AbfsCommitTestHelper.prepareTestConfiguration;
+import static org.apache.hadoop.fs.azurebfs.constants.ConfigurationKeys.FS_AZURE_ABFS_IO_RATE_LIMIT;
+import static org.apache.hadoop.fs.azurebfs.constants.FileSystemConfigurations.RATE_LIMIT_DEFAULT;
+import static org.apache.hadoop.fs.statistics.StoreStatisticNames.OP_LIST_FILES;
import static org.junit.Assume.assumeTrue;
/**
@@ -172,4 +178,35 @@ public void testEtagConsistencyAcrossRename() throws Throwable {
.isEqualTo(srcTag);
}
+ /**
+ * Verify IORateLimiter passes through to the ABFS store by
+ * making multiple requests greater than the rate limit, and
+ * verifying that the second request is delayed.
+ * The rate asked for is a fraction of the configured limit;
+ * it relies on the ABFS store to apply a multiplier to it.
+ */
+ @Test
+ public void testCapacityLimiting() throws Throwable {
+ describe("Verifying Rate Limiting");
+ final Configuration conf = getConfiguration();
+ // get the capacity per second, either the default or any override.
+ final int size = conf.getInt(FS_AZURE_ABFS_IO_RATE_LIMIT, RATE_LIMIT_DEFAULT);
+ final int capacity = (int) (size/4.0);
+ final IORateLimiter limiter = createManifestStoreOperations();
+
+ // this operation is amplified; if a different name is used then
+ // the second assertion fails.
+ final String operation = OP_LIST_FILES;
+ // first one has no delay
+ Assertions.assertThat(limiter.acquireIOCapacity(operation, new Path("/"), null, capacity))
+ .describedAs("Duration of acquiring %d capacity", capacity)
+ .isEqualTo(Duration.ZERO);
+
+ // second one is delayed
+ final Duration duration = limiter.acquireIOCapacity(operation, new Path("/"), null, capacity);
+ describe("Duration of second capacity request of %d: %s", capacity, duration);
+ Assertions.assertThat(duration)
+ .describedAs("Duration of acquiring %d capacity", capacity)
+ .isGreaterThan(Duration.of(1, ChronoUnit.SECONDS));
+ }
}
diff --git a/hadoop-tools/hadoop-azure/src/test/java/org/apache/hadoop/fs/azurebfs/commit/ITestRenameRecovery.java b/hadoop-tools/hadoop-azure/src/test/java/org/apache/hadoop/fs/azurebfs/commit/ITestRenameRecovery.java
new file mode 100644
index 0000000000000..7806afbef89f5
--- /dev/null
+++ b/hadoop-tools/hadoop-azure/src/test/java/org/apache/hadoop/fs/azurebfs/commit/ITestRenameRecovery.java
@@ -0,0 +1,279 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.fs.azurebfs.commit;
+
+import java.io.IOException;
+import java.time.Duration;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.concurrent.Future;
+import java.util.concurrent.atomic.AtomicBoolean;
+import java.util.concurrent.atomic.AtomicInteger;
+import java.util.concurrent.atomic.AtomicLong;
+import java.util.concurrent.atomic.AtomicReference;
+
+import org.assertj.core.api.Assertions;
+import org.junit.Test;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.azurebfs.AbstractAbfsIntegrationTest;
+import org.apache.hadoop.fs.azurebfs.AzureBlobFileSystem;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.files.FileEntry;
+import org.apache.hadoop.mapreduce.lib.output.committer.manifest.impl.ManifestStoreOperations;
+import org.apache.hadoop.util.functional.CloseableTaskPoolSubmitter;
+
+import static org.apache.hadoop.fs.CommonPathCapabilities.ETAGS_AVAILABLE;
+import static org.apache.hadoop.fs.CommonPathCapabilities.ETAGS_PRESERVED_IN_RENAME;
+import static org.apache.hadoop.fs.azure.integration.AzureTestUtils.assumeScaleTestsEnabled;
+import static org.apache.hadoop.fs.azurebfs.constants.ConfigurationKeys.FS_AZURE_ABFS_IO_RATE_LIMIT;
+import static org.apache.hadoop.fs.statistics.IOStatisticsLogging.ioStatisticsToPrettyString;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterConfig.createCloseableTaskSubmitter;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.ManifestCommitterConstants.OPT_DELETE_DIR_CAPACITY;
+import static org.apache.hadoop.mapreduce.lib.output.committer.manifest.impl.ManifestCommitterSupport.getEtag;
+import static org.apache.hadoop.util.functional.FutureIO.awaitAllFutures;
+import static org.assertj.core.api.Assumptions.assumeThat;
+
+/**
+ * HADOOP-19093: Scale test to attempt to generate rename failures.
+ *
+ * This test is intended to overload the storage account.
+ * So far it has been unable to trigger this problem, even on a very throttled
+ * account.
+ * The test suite asserts that the number of failures is zero;
+ * if that assertion ever fails it means that a rename operation did fail
+ * and that recovery was successful.
+ */
+public class ITestRenameRecovery extends AbstractAbfsIntegrationTest {
+
+ private static final Logger LOG = LoggerFactory.getLogger(
+ ITestRenameRecovery.class);
+
+ /**
+ * Time to sleep between checks for tasks to complete.
+ */
+ public static final Duration SLEEP_INTERVAL = Duration.ofMillis(1000);
+
+ /**
+ * Number of threads to use.
+ */
+ private static final int THREAD_COUNT = 100;
+
+ /**
+ * Number of renames to attempt per thread: {@value}.
+ */
+ public static final int RENAMES = 100;
+
+ /**
+ * Thread number; used for paths and messages.
+ */
+ private final AtomicInteger threadNumber = new AtomicInteger(0);
+
+ /**
+ * Flag to indicate that the test should exit: threads must check this.
+ */
+ private final AtomicBoolean shouldExit = new AtomicBoolean(false);
+
+ /**
+ * Any failure.
+ */
+ private final AtomicReference
+ *
+ * This means that multiple task attempts may report success, but only one will have its actual
+ * manifest saved.
+ * The mapreduce and spark committers only schedule a second task commit attempt if the first
+ * task attempt's commit operation fails or fails to report success in the allocated time.
+ * The overwrite with retry loop is an attempt to ensure that the second attempt will report
+ * success, if a partitioned cluster means that the original TA commit is still in progress.
+ *