@@ -35,17 +35,24 @@ using executorch::runtime::ArrayRef;
3535using executorch::runtime::Backend;
3636using executorch::runtime::BackendExecutionContext;
3737using executorch::runtime::BackendInitContext;
38+ using executorch::runtime::BackendOption;
39+ using executorch::runtime::BackendOptionContext;
3840using executorch::runtime::CompileSpec;
3941using executorch::runtime::DelegateHandle;
4042using executorch::runtime::Error;
4143using executorch::runtime::EValue;
4244using executorch::runtime::FreeableBuffer;
45+ using executorch::runtime::kMaxOptionValueLength ;
4346using executorch::runtime::MemoryAllocator;
4447using executorch::runtime::NamedDataMap;
4548using executorch::runtime::Result;
4649using executorch::runtime::Span;
4750using executorch::runtime::etensor::Tensor;
4851
namespace {
// Key of the backend option recognized by CudaBackend::set_option(). The key
// is matched against caller-supplied keys with strcmp, so the literal must be
// exact: no leading or trailing whitespace.
constexpr char kSkipCopyOutputToCpuForMethodOption[] =
    "skip_copy_output_to_cpu_for_method";
} // namespace
55+
4956class ET_EXPERIMENTAL CudaBackend final
5057 : public ::executorch::runtime::BackendInterface {
5158 private:
@@ -91,6 +98,36 @@ class ET_EXPERIMENTAL CudaBackend final
9198 return 1 ;
9299 }
93100
101+ Error set_option (
102+ ET_UNUSED BackendOptionContext& context,
103+ const executorch::runtime::Span<BackendOption>& backend_options)
104+ override {
105+ for (const auto & option : backend_options) {
106+ if (std::strcmp (option.key , kSkipCopyOutputToCpuForMethodOption ) == 0 ) {
107+ if (auto * val = std::get_if<bool >(&option.value )) {
108+ copy_gpu_outputs_to_cpu_.store (*val, std::memory_order_relaxed);
109+ } else {
110+ ET_LOG (
111+ Error, " Option %s must be a bool." , kSkipCopyOutputToCpuForMethodOption );
112+ return Error::InvalidArgument;
113+ }
114+ }
115+ }
116+ return Error::Ok;
117+ }
118+
119+ Error get_option (
120+ ET_UNUSED BackendOptionContext& context,
121+ executorch::runtime::Span<BackendOption>& backend_options) override {
122+ for (auto & option : backend_options) {
123+ if (std::strcmp (option.key , kCopyGpuOutputsToCpuOption ) == 0 ) {
124+ option.value = static_cast <bool >(
125+ copy_gpu_outputs_to_cpu_.load (std::memory_order_relaxed));
126+ }
127+ }
128+ return Error::Ok;
129+ }
130+
94131 // Once per loaded binary blob
95132 Result<DelegateHandle*> init (
96133 BackendInitContext& context,
@@ -303,18 +340,27 @@ class ET_EXPERIMENTAL CudaBackend final
303340 " AOTInductorModelContainerRun failed with error code %d" ,
304341 error);
305342
306- // Copy GPU output results back to CPU output tensors
307- for (int i = 0 ; i < n_outputs; i++) {
308- auto cpu_output_tensor = &(args[i + n_inputs]->toTensor ());
309- // For DYNAMIC_BOUND tensors we try to resize
310- ET_CHECK_OK_OR_RETURN_ERROR (
311- resize_tensor (*cpu_output_tensor, gpu_outputs[i]->sizes ()),
312- " Error resizing tensor at output index %d" ,
313- i);
314- ET_CHECK_OK_OR_RETURN_ERROR (
315- aoti_torch_copy_ (cpu_output_tensor, gpu_outputs[i], 0 ),
316- " Failed to copy GPU output %d back to CPU" ,
317- i);
343+ const bool copy_outputs =
344+ copy_gpu_outputs_to_cpu_.load (std::memory_order_relaxed);
345+
346+ if (copy_outputs) {
347+ // Copy GPU output results back to CPU output tensors
348+ for (int i = 0 ; i < n_outputs; i++) {
349+ auto cpu_output_tensor = &(args[i + n_inputs]->toTensor ());
350+ // For DYNAMIC_BOUND tensors we try to resize
351+ ET_CHECK_OK_OR_RETURN_ERROR (
352+ resize_tensor (*cpu_output_tensor, gpu_outputs[i]->sizes ()),
353+ " Error resizing tensor at output index %d" ,
354+ i);
355+ ET_CHECK_OK_OR_RETURN_ERROR (
356+ aoti_torch_copy_ (cpu_output_tensor, gpu_outputs[i], 0 ),
357+ " Failed to copy GPU output %d back to CPU" ,
358+ i);
359+ }
360+ } else {
361+ for (int i = 0 ; i < n_outputs; i++) {
362+ args[i + n_inputs]->toTensor () = *gpu_outputs[i];
363+ }
318364 }
319365
320366 return Error::Ok;
@@ -365,6 +411,9 @@ class ET_EXPERIMENTAL CudaBackend final
365411 delete handle;
366412 clear_all_tensors ();
367413 }
414+
415+ private:
416+ std::atomic<bool > copy_gpu_outputs_to_cpu_{true };
368417};
369418
370419} // namespace executorch::backends::cuda
0 commit comments