Skip to content

Commit f3e30a4

Browse files
committed
Avoid copying output from GPU to CPU
1 parent 33ec615 commit f3e30a4

File tree

2 files changed

+76
-13
lines changed

2 files changed

+76
-13
lines changed

backends/cuda/runtime/cuda_backend.cpp

Lines changed: 61 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -35,17 +35,24 @@ using executorch::runtime::ArrayRef;
3535
using executorch::runtime::Backend;
3636
using executorch::runtime::BackendExecutionContext;
3737
using executorch::runtime::BackendInitContext;
38+
using executorch::runtime::BackendOption;
39+
using executorch::runtime::BackendOptionContext;
3840
using executorch::runtime::CompileSpec;
3941
using executorch::runtime::DelegateHandle;
4042
using executorch::runtime::Error;
4143
using executorch::runtime::EValue;
4244
using executorch::runtime::FreeableBuffer;
45+
using executorch::runtime::kMaxOptionValueLength;
4346
using executorch::runtime::MemoryAllocator;
4447
using executorch::runtime::NamedDataMap;
4548
using executorch::runtime::Result;
4649
using executorch::runtime::Span;
4750
using executorch::runtime::etensor::Tensor;
4851

52+
namespace {
53+
constexpr char kSkipCopyOutputToCpuForMethodOption[] = "skip_copy_output_to_cpu_for_method";
54+
}
55+
4956
class ET_EXPERIMENTAL CudaBackend final
5057
: public ::executorch::runtime::BackendInterface {
5158
private:
@@ -91,6 +98,36 @@ class ET_EXPERIMENTAL CudaBackend final
9198
return 1;
9299
}
93100

101+
// Applies backend options pushed via the runtime's set_option() API.
//
// Recognized key:
//   kSkipCopyOutputToCpuForMethodOption ("skip_copy_output_to_cpu_for_method")
//     bool: when true, execute() skips copying GPU outputs back into the
//     CPU output tensors and hands the caller GPU-resident tensors instead.
//
// @param context          Unused backend-option context.
// @param backend_options  Span of key/value options; unknown keys are ignored.
// @return Error::InvalidArgument if the recognized key carries a non-bool
//         value, Error::Ok otherwise.
//
// NOTE(review): the option name expresses "skip the copy" while the member
// copy_gpu_outputs_to_cpu_ expresses "perform the copy", so the stored value
// must be the negation of the option value (the previous code stored it
// unnegated, inverting the semantics). Also, the caller in
// extension/asr/runner/runner.cpp sets this option to a method-name string,
// which this bool-only handler rejects — confirm the intended value type.
Error set_option(
    ET_UNUSED BackendOptionContext& context,
    const executorch::runtime::Span<BackendOption>& backend_options)
    override {
  for (const auto& option : backend_options) {
    if (std::strcmp(option.key, kSkipCopyOutputToCpuForMethodOption) == 0) {
      if (const auto* skip = std::get_if<bool>(&option.value)) {
        // "skip copy" == true  ==>  "copy to CPU" == false.
        copy_gpu_outputs_to_cpu_.store(!*skip, std::memory_order_relaxed);
      } else {
        ET_LOG(
            Error,
            "Option %s must be a bool.",
            kSkipCopyOutputToCpuForMethodOption);
        return Error::InvalidArgument;
      }
    }
  }
  return Error::Ok;
}
118+
119+
// Reports current backend options via the runtime's get_option() API.
//
// For each entry whose key is kSkipCopyOutputToCpuForMethodOption, writes the
// current skip-copy flag (the negation of copy_gpu_outputs_to_cpu_, which
// stores "perform the copy") into option.value so that a get after a set
// round-trips the same boolean. Unknown keys are left untouched.
//
// @param context          Unused backend-option context.
// @param backend_options  In/out span of options to fill in.
// @return Error::Ok always.
//
// NOTE(review): the previous code matched against kCopyGpuOutputsToCpuOption,
// a name declared nowhere in this file — the constant and set_option both use
// kSkipCopyOutputToCpuForMethodOption — and returned the un-negated copy flag.
Error get_option(
    ET_UNUSED BackendOptionContext& context,
    executorch::runtime::Span<BackendOption>& backend_options) override {
  for (auto& option : backend_options) {
    if (std::strcmp(option.key, kSkipCopyOutputToCpuForMethodOption) == 0) {
      // Expose "skip copy" semantics: true means outputs stay on the GPU.
      option.value =
          !copy_gpu_outputs_to_cpu_.load(std::memory_order_relaxed);
    }
  }
  return Error::Ok;
}
130+
94131
// Once per loaded binary blob
95132
Result<DelegateHandle*> init(
96133
BackendInitContext& context,
@@ -303,18 +340,27 @@ class ET_EXPERIMENTAL CudaBackend final
303340
"AOTInductorModelContainerRun failed with error code %d",
304341
error);
305342

306-
// Copy GPU output results back to CPU output tensors
307-
for (int i = 0; i < n_outputs; i++) {
308-
auto cpu_output_tensor = &(args[i + n_inputs]->toTensor());
309-
// For DYNAMIC_BOUND tensors we try to resize
310-
ET_CHECK_OK_OR_RETURN_ERROR(
311-
resize_tensor(*cpu_output_tensor, gpu_outputs[i]->sizes()),
312-
"Error resizing tensor at output index %d",
313-
i);
314-
ET_CHECK_OK_OR_RETURN_ERROR(
315-
aoti_torch_copy_(cpu_output_tensor, gpu_outputs[i], 0),
316-
"Failed to copy GPU output %d back to CPU",
317-
i);
343+
const bool copy_outputs =
344+
copy_gpu_outputs_to_cpu_.load(std::memory_order_relaxed);
345+
346+
if (copy_outputs) {
347+
// Copy GPU output results back to CPU output tensors
348+
for (int i = 0; i < n_outputs; i++) {
349+
auto cpu_output_tensor = &(args[i + n_inputs]->toTensor());
350+
// For DYNAMIC_BOUND tensors we try to resize
351+
ET_CHECK_OK_OR_RETURN_ERROR(
352+
resize_tensor(*cpu_output_tensor, gpu_outputs[i]->sizes()),
353+
"Error resizing tensor at output index %d",
354+
i);
355+
ET_CHECK_OK_OR_RETURN_ERROR(
356+
aoti_torch_copy_(cpu_output_tensor, gpu_outputs[i], 0),
357+
"Failed to copy GPU output %d back to CPU",
358+
i);
359+
}
360+
} else {
361+
for (int i = 0; i < n_outputs; i++) {
362+
args[i + n_inputs]->toTensor() = *gpu_outputs[i];
363+
}
318364
}
319365

320366
return Error::Ok;
@@ -365,6 +411,9 @@ class ET_EXPERIMENTAL CudaBackend final
365411
delete handle;
366412
clear_all_tensors();
367413
}
414+
415+
private:
416+
std::atomic<bool> copy_gpu_outputs_to_cpu_{true};
368417
};
369418

370419
} // namespace executorch::backends::cuda

extension/asr/runner/runner.cpp

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -107,7 +107,21 @@ Error AsrRunner::load() {
107107

108108
ET_CHECK_OK_OR_RETURN_ERROR(module_->load_method(kDecoderMethodName));
109109
decoder_method_loaded_ = true;
110-
110+
#ifdef CUDA_AVAILABLE
111+
executorch::runtime::BackendOptions<1> backend_options;
112+
// For decoder still copy output from GPU to CPU for sampling.
113+
// TODO: change this to use a CUDA kernel to sample and then skip copying decoder output
114+
ET_CHECK_OK_OR_RETURN_ERROR(
115+
backend_options.set_option("skip_copy_output_to_cpu_for_method", kEncoderMethodName));
116+
const auto opt_err =
117+
executorch::runtime::set_option("CudaBackend", backend_options.view());
118+
if (opt_err != ::executorch::runtime::Error::Ok) {
119+
ET_LOG(
120+
Warning,
121+
"Failed to set CUDA backend options: %d",
122+
static_cast<int>(opt_err));
123+
}
124+
#endif
111125
ET_CHECK_OK_OR_RETURN_ERROR(load_tokenizer());
112126
auto eos_ids = get_eos_ids(tokenizer_.get(), module_.get());
113127
if (!eos_ids.empty()) {

0 commit comments

Comments
 (0)