Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
299a72b
Backend: Add RetryableBackend to retry IO
riley-dixon Mar 11, 2026
c77c747
Test: Add new RetryableBackend unit tests
riley-dixon Mar 13, 2026
4c8aa69
Rework default implementation of `is_retryable`
riley-dixon Mar 13, 2026
722d723
Backend: Use a common method across all Backends for IO implementation.
riley-dixon Mar 16, 2026
8d2cf82
Backend: Rename RetryableBackend
riley-dixon Mar 16, 2026
b3a3330
Backend: Move update_stats_X to Backend
riley-dixon Mar 16, 2026
62db64a
Fastpath: Create integration test for fallback mechanism
riley-dixon Mar 18, 2026
c8a987d
Fastpath: Add integration test for tracking what exception gets thrown.
riley-dixon Mar 18, 2026
71e168f
Backend: Remove comment about io() returning a negative integer
riley-dixon Mar 19, 2026
3ddd9b0
Address copilot suggestions
riley-dixon Mar 18, 2026
89a9a9d
Update Changelog
riley-dixon Mar 20, 2026
1fe38c8
Merge branch 'develop' of github.com:ROCm/hipFile into rildixon/io-re…
riley-dixon Mar 24, 2026
10a624a
Stats: Revert Stats changes.
riley-dixon Mar 24, 2026
98be51e
Test: Fix new integration tests from merge commit.
riley-dixon Mar 25, 2026
db3549e
Backend Test: Change one unit test name
riley-dixon Mar 25, 2026
64e3ea5
Fallback: Change _io_impl() signature to have consistent names
riley-dixon Mar 25, 2026
fd1bb79
Fallback: Remove Fallback::io() to use base implementation.
riley-dixon Mar 25, 2026
bb13421
BackendWithFallback: Add registration checks for invalid fallbacks.
riley-dixon Mar 25, 2026
6f8ba67
Backend: Rework is_fallback_eligible() to only consider its instance
riley-dixon Mar 25, 2026
1ed46a5
Fastpath: Add fallback unit tests.
riley-dixon Mar 25, 2026
eb69578
Fastpath: Shift one integration test to a unit test
riley-dixon Mar 26, 2026
747b724
Fastpath: Rename fallback integration test suite & update exceptions …
riley-dixon Mar 26, 2026
a6412b5
Formatting
riley-dixon Mar 26, 2026
3cc8443
Address Copilot recommendations
riley-dixon Mar 26, 2026
bc68811
Merge branch 'develop' of github.com:ROCm/hipFile into rildixon/io-re…
riley-dixon Mar 26, 2026
b1dc5b7
Remove integration test
riley-dixon Mar 26, 2026
761e2b6
Test: Remove use of "Rethrow"
riley-dixon Mar 27, 2026
d52ac09
Merge branch 'develop' of github.com:ROCm/hipFile into rildixon/io-re…
riley-dixon Mar 27, 2026
7ab7add
formatting
riley-dixon Mar 27, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
* The CMake namespace was changed from `roc::` to `hip::`
* `AIS_BUILD_EXAMPLES` has been renamed to `AIS_INSTALL_EXAMPLES`
* `AIS_USE_SANITIZERS` now also enables the following sanitizers: integer, float-divide-by-zero, local-bounds, vptr, nullability (in addition to address, leak, and undefined). Sanitizers should also now emit usable stack trace info.
* The AIS optimized IO path will automatically fallback to the POSIX IO path if a failure occurs and the compatibility mode has not been disabled.
* Added check in the Fastpath/AIS backend to ensure the HIP Runtime is initialized. This avoids causing a segfault in the HIP Runtime.
* The default CMake build type was changed from `Debug` to `RelWithDebInfo`

Expand Down
1 change: 1 addition & 0 deletions src/amd_detail/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
set(HIPFILE_SOURCES
"${HIPFILE_SRC_COMMON_PATH}/hipfile-common.cpp"
async.cpp
backend.cpp
backend/asyncop-fallback.cpp
backend/memcpy-kernel.hip
backend/fallback.cpp
Expand Down
65 changes: 65 additions & 0 deletions src/amd_detail/backend.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
/* Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
*
* SPDX-License-Identifier: MIT
*/

#include "backend.h"
#include "buffer.h"
#include "file.h"
#include "io.h"

#include <cstddef>
#include <exception>
#include <memory>
#include <stdexcept>
#include <sys/types.h>
#include <system_error>
#include <unistd.h>

using namespace hipFile;

ssize_t
Backend::io(IoType type, std::shared_ptr<IFile> file, std::shared_ptr<IBuffer> buffer, size_t size,
hoff_t file_offset, hoff_t buffer_offset)
{
return _io_impl(type, file, buffer, size, file_offset, buffer_offset);
}

ssize_t
BackendWithFallback::io(IoType type, std::shared_ptr<IFile> file, std::shared_ptr<IBuffer> buffer,
size_t size, hoff_t file_offset, hoff_t buffer_offset)
{
ssize_t nbytes{0};
try {
nbytes = _io_impl(type, file, buffer, size, file_offset, buffer_offset);
if (nbytes < 0) {
// Typically we should not reach this point. But in case we do, throw
// an exception to use the fallback backend.
throw std::system_error(-static_cast<int>(nbytes), std::generic_category());
}
}
catch (...) {
std::exception_ptr e_ptr = std::current_exception();
if (fallback_backend && is_fallback_eligible(e_ptr, nbytes) &&
fallback_backend->score(file, buffer, size, file_offset, buffer_offset) >= 0) {
nbytes = fallback_backend->io(type, file, buffer, size, file_offset, buffer_offset);
}
else {
throw;
}
return nbytes;
}
return nbytes;
}

void
BackendWithFallback::register_fallback_backend(std::shared_ptr<Backend> backend)
{
Copy link

Copilot AI Mar 19, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

BackendWithFallback::register_fallback_backend() will happily accept this (or a backend that eventually retries back to this). If _io_impl() throws, that can lead to unbounded recursion (io() -> fallback io() -> ...), resulting in stack overflow. Consider defensively rejecting self-registration (and optionally detecting obvious cycles) to make the API safer to use.

Suggested change
{
{
// Prevent self-registration, which can cause unbounded recursion on fallback.
if (backend.get() == this) {
return;
}
// Optionally prevent a simple two-node cycle: this -> backend -> this.
if (backend) {
if (auto backend_with_fallback = std::dynamic_pointer_cast<BackendWithFallback>(backend)) {
if (backend_with_fallback->fallback_backend.get() == this) {
return;
}
}
}

Copilot uses AI. Check for mistakes.
Copy link
Copy Markdown
Collaborator

@kurtmcmillan kurtmcmillan Mar 24, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I /mostly/ agree with copilot. register_fallback_backend() should throw an exception if self-registration is attempted

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Added a check for self-registration & nullptr. I don't think we need something more comprehensive right now.

if (!backend) {
throw std::invalid_argument("Cannot register a nullptr as a fallback backend.");
}
else if (backend.get() == this) {
throw std::invalid_argument("Cannot register a backend as it's own fallback.");
}
fallback_backend = backend;
}
49 changes: 47 additions & 2 deletions src/amd_detail/backend.h
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
#include "sys.h"

#include <cstddef>
#include <exception>
#include <memory>
#include <sys/types.h>
#include <unistd.h>
Expand Down Expand Up @@ -58,11 +59,55 @@ struct Backend {
/// @param file_offset Offset from the start of the file
/// @param buffer_offset Offset from the start of the buffer
///
/// @return Number of bytes transferred, negative on error
/// @return Number of bytes transferred
///
/// @throws Hip::RuntimeError Sys::RuntimeError
virtual ssize_t io(IoType type, std::shared_ptr<IFile> file, std::shared_ptr<IBuffer> buffer, size_t size,
hoff_t file_offset, hoff_t buffer_offset) = 0;
hoff_t file_offset, hoff_t buffer_offset);

protected:
/// @brief Perform a read or write operation
///
/// @note Provides a common target across all Backends that provides the
/// implementation for running IO.
/// @param type IO type (read/write)
/// @param file File to read from or write to
/// @param buffer Buffer to write to or read from
/// @param size Number of bytes to transfer
/// @param file_offset Offset from the start of the file
/// @param buffer_offset Offset from the start of the buffer
///
/// @return Number of bytes transferred
///
/// @throws Hip::RuntimeError Sys::RuntimeError
virtual ssize_t _io_impl(IoType type, std::shared_ptr<IFile> file, std::shared_ptr<IBuffer> buffer,
size_t size, hoff_t file_offset, hoff_t buffer_offset) = 0;
};

// BackendWithFallback allows for an IO to be retried automatically with a
// different Backend in the event of an error.
struct BackendWithFallback : public Backend {
ssize_t io(IoType type, std::shared_ptr<IFile> file, std::shared_ptr<IBuffer> buffer, size_t size,
hoff_t file_offset, hoff_t buffer_offset) override final;

/// @brief Register a Backend to retry a failed IO operation.
///
/// @param backend Backend to retry a failed IO operation.
///
/// @throws std::invalid_argument If a nullptr or self reference is passed.
void register_fallback_backend(std::shared_ptr<Backend> backend);

protected:
/// @brief Check if a failed IO operation can be re-issued to the fallback Backend.
///
/// @param e_ptr exception_ptr to the thrown exception from the failed IO
/// @param nbytes Return value from `_io_impl`, or 0 if an exception was thrown.
///
/// @return True if this BackendWithFallback can retry the IO, else False.
virtual bool is_fallback_eligible(std::exception_ptr e_ptr, ssize_t nbytes) const = 0;

private:
std::shared_ptr<Backend> fallback_backend;
};

}
24 changes: 16 additions & 8 deletions src/amd_detail/backend/fallback.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -54,14 +54,21 @@ Fallback::score(std::shared_ptr<IFile> file, std::shared_ptr<IBuffer> buffer, si

ssize_t
Fallback::io(IoType type, std::shared_ptr<IFile> file, std::shared_ptr<IBuffer> buffer, size_t size,
hoff_t file_offset, hoff_t buffer_offset)
hoff_t file_offset, hoff_t buffer_offset, size_t chunk_size)
{
return io(type, file, buffer, size, file_offset, buffer_offset, DefaultChunkSize);
return _io_impl(type, file, buffer, size, file_offset, buffer_offset, chunk_size);
}

ssize_t
Fallback::io(IoType io_type, shared_ptr<IFile> file, shared_ptr<IBuffer> buffer, size_t size,
hoff_t file_offset, hoff_t buffer_offset, size_t chunk_size)
Fallback::_io_impl(IoType type, std::shared_ptr<IFile> file, std::shared_ptr<IBuffer> buffer, size_t size,
hoff_t file_offset, hoff_t buffer_offset)
{
return _io_impl(type, file, buffer, size, file_offset, buffer_offset, DefaultChunkSize);
}

ssize_t
Fallback::_io_impl(IoType type, std::shared_ptr<IFile> file, std::shared_ptr<IBuffer> buffer, size_t size,
hoff_t file_offset, hoff_t buffer_offset, size_t chunk_size)
{
if (!Context<Configuration>::get()->fallback()) {
throw BackendDisabled();
Expand All @@ -87,7 +94,7 @@ Fallback::io(IoType io_type, shared_ptr<IFile> file, shared_ptr<IBuffer> buffer,
reinterpret_cast<uintptr_t>(buffer->getBuffer()) + static_cast<size_t>(buffer_offset) +
static_cast<size_t>(total_io_bytes));
try {
switch (io_type) {
switch (type) {
case IoType::Read:
io_bytes =
Context<Sys>::get()->pread(file->getBufferedFd(), bounce_buffer.get(), count, offset);
Expand Down Expand Up @@ -121,16 +128,17 @@ Fallback::io(IoType io_type, shared_ptr<IFile> file, shared_ptr<IBuffer> buffer,
}
} while (static_cast<size_t>(total_io_bytes) < size);

switch (io_type) {
case IoType::Read:
switch (type) {
case (IoType::Read):
statsAddFallbackPathRead(static_cast<size_t>(total_io_bytes));
break;
case IoType::Write:
case (IoType::Write):
statsAddFallbackPathWrite(static_cast<size_t>(total_io_bytes));
break;
default:
break;
}

return total_io_bytes;
}

Expand Down
12 changes: 7 additions & 5 deletions src/amd_detail/backend/fallback.h
Original file line number Diff line number Diff line change
Expand Up @@ -26,24 +26,26 @@ enum class IoType;
namespace hipFile {

struct Fallback : public Backend {
using Backend::io;
virtual ~Fallback() override = default;

int score(std::shared_ptr<IFile> file, std::shared_ptr<IBuffer> buffer, size_t size, hoff_t file_offset,
hoff_t buffer_offset) const override;

ssize_t io(IoType type, std::shared_ptr<IFile> file, std::shared_ptr<IBuffer> buffer, size_t size,
hoff_t file_offset, hoff_t buffer_offset) override;

void async_io(IoType type, std::shared_ptr<IFile> file, std::shared_ptr<IBuffer> buffer, size_t *size_p,
hoff_t *file_offset_p, hoff_t *buffer_offset_p, ssize_t *bytes_transferred_p,
std::shared_ptr<IStream> stream);

// Once we can import gtest.h and make test suites or test friends everything
// below here should be made protected.
// protected:

ssize_t io(IoType type, std::shared_ptr<IFile> file, std::shared_ptr<IBuffer> buffer, size_t size,
hoff_t file_offset, hoff_t buffer_offset, size_t chunk_size);

protected:
ssize_t _io_impl(IoType type, std::shared_ptr<IFile> file, std::shared_ptr<IBuffer> buffer, size_t size,
hoff_t file_offset, hoff_t buffer_offset) override;
ssize_t _io_impl(IoType type, std::shared_ptr<IFile> file, std::shared_ptr<IBuffer> buffer, size_t size,
hoff_t file_offset, hoff_t buffer_offset, size_t chunk_size);
};

}
Expand Down
45 changes: 33 additions & 12 deletions src/amd_detail/backend/fastpath.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,11 +13,16 @@
#include "io.h"
#include "stats.h"

#include <cerrno>
#include <cstddef>
#include <cstdint>
#include <exception>
#include <fcntl.h>
#include <hip/hip_runtime_api.h>
#include <linux/stat.h>
#include <memory>
#include <stdexcept>
#include <system_error>

using namespace hipFile;
using namespace std;
Expand Down Expand Up @@ -158,8 +163,8 @@ Fastpath::score(shared_ptr<IFile> file, shared_ptr<IBuffer> buffer, size_t size,
}

ssize_t
Fastpath::io(IoType type, shared_ptr<IFile> file, shared_ptr<IBuffer> buffer, size_t size, hoff_t file_offset,
hoff_t buffer_offset)
Fastpath::_io_impl(IoType type, shared_ptr<IFile> file, shared_ptr<IBuffer> buffer, size_t size,
hoff_t file_offset, hoff_t buffer_offset)
{
if (!Context<Configuration>::get()->fastpath()) {
throw BackendDisabled();
Expand Down Expand Up @@ -193,22 +198,38 @@ Fastpath::io(IoType type, shared_ptr<IFile> file, shared_ptr<IBuffer> buffer, si
switch (type) {
case IoType::Read:
nbytes = Context<Hip>::get()->hipAmdFileRead(handle, devptr, size, file_offset);
break;
case IoType::Write:
nbytes = Context<Hip>::get()->hipAmdFileWrite(handle, devptr, size, file_offset);
break;
default:
throw std::runtime_error("Invalid IoType");
}
switch (type) {
case IoType::Read:
statsAddFastPathRead(nbytes);
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think it would be better if statsAddFastPathRead(nbytes) would be moved after Context<Hip>::get()->hipAmdFileRead(handle, devptr, size, file_offset);.

If hipAmdFileRead doesn't return a positive value it throws so stats would never be incremented.

I don't think a update_read_stats method should be added to Backend nor RetryableBackend

Same goes for the write stats.

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I wanted to keep updating stats outside of _io_impl so that we don't retry the IO if updating the stats fails. Would you be okay with that effect, or alternatively adding a try-catch block wrapping the stats call?

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Updating stats shouldn't fail/throw. Those functions are just bumping a counter. stats should be fixed.

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

i'll update all the stats calls to noexcept.

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Stats changes have been reverted with the assumption that stats won't throw.

break;
case IoType::Write:
nbytes = Context<Hip>::get()->hipAmdFileWrite(handle, devptr, size, file_offset);
statsAddFastPathWrite(nbytes);
break;
default:
break;
throw std::runtime_error("Invalid IoType");
}
return static_cast<ssize_t>(nbytes);
}

bool
Fastpath::is_fallback_eligible(std::exception_ptr e_ptr, ssize_t nbytes) const
{
(void)nbytes;
try {
std::rethrow_exception(e_ptr);
}
catch (const std::system_error &sys_err) {
switch (sys_err.code().value()) {
case ENODEV:
return true;
case EREMOTEIO:
return true;
default:
// System error not eligible for fallback.
return false;
}
}
catch (...) {
// Thrown exception not eligible for fallback.
return false;
}
}
12 changes: 9 additions & 3 deletions src/amd_detail/backend/fastpath.h
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,13 @@
#pragma once

#include "backend.h"
#include "buffer.h"
#include "file.h"
#include "hipfile.h"

#include <cstddef>
#include <memory>
#include <stdexcept>
#include <sys/types.h>

namespace hipFile {
Expand All @@ -23,14 +27,16 @@ enum class IoType;

namespace hipFile {

struct Fastpath : public Backend {
struct Fastpath : public BackendWithFallback {
virtual ~Fastpath() override = default;

int score(std::shared_ptr<IFile> file, std::shared_ptr<IBuffer> buffer, size_t size, hoff_t file_offset,
hoff_t buffer_offset) const override;

ssize_t io(IoType type, std::shared_ptr<IFile> file, std::shared_ptr<IBuffer> buffer, size_t size,
hoff_t file_offset, hoff_t buffer_offset) override;
protected:
ssize_t _io_impl(IoType type, std::shared_ptr<IFile> file, std::shared_ptr<IBuffer> buffer, size_t size,
hoff_t file_offset, hoff_t buffer_offset) override;
bool is_fallback_eligible(std::exception_ptr e_ptr, ssize_t nbytes) const override;
};

}
7 changes: 5 additions & 2 deletions src/amd_detail/state.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -275,8 +275,11 @@ std::vector<std::shared_ptr<Backend>>
DriverState::getBackends() const
{
static bool once = [&]() {
backends.emplace_back(new Fastpath{});
backends.emplace_back(new Fallback{});
std::shared_ptr<Fallback> fallback_backend = std::make_shared<Fallback>();
std::shared_ptr<Fastpath> fastpath_backend = std::make_shared<Fastpath>();
fastpath_backend->register_fallback_backend(fallback_backend);
backends.push_back(fallback_backend);
backends.push_back(fastpath_backend);
return true;
}();
(void)once;
Expand Down
1 change: 1 addition & 0 deletions test/amd_detail/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ set(SHARED_SOURCE_FILES

set(TEST_SOURCE_FILES
async.cpp
backend.cpp
batch/batch.cpp
configuration.cpp
context.cpp
Expand Down
Loading
Loading