#include <executorch/runtime/core/exec_aten/util/tensor_util.h>

#include <array>
#include <cstdio>
#include <cstring>
#include <filesystem>
#include <fstream>
#include <mutex>
#include <string>
#include <variant>
#include <vector>
2022
@@ -35,20 +37,54 @@ using executorch::runtime::ArrayRef;
3537using executorch::runtime::Backend;
3638using executorch::runtime::BackendExecutionContext;
3739using executorch::runtime::BackendInitContext;
40+ using executorch::runtime::BackendOption;
41+ using executorch::runtime::BackendOptionContext;
3842using executorch::runtime::CompileSpec;
3943using executorch::runtime::DelegateHandle;
4044using executorch::runtime::Error;
4145using executorch::runtime::EValue;
4246using executorch::runtime::FreeableBuffer;
47+ using executorch::runtime::kMaxOptionValueLength ;
4348using executorch::runtime::MemoryAllocator;
4449using executorch::runtime::NamedDataMap;
4550using executorch::runtime::Result;
4651using executorch::runtime::Span;
4752using executorch::runtime::etensor::Tensor;
4853
namespace {
// Backend-option key. Its value names the single method whose GPU outputs
// should be handed back directly instead of being copied to CPU tensors.
constexpr char kSkipCopyOutputToCpuForMethod[] =
    "skip_copy_output_to_cpu_for_method";
} // anonymous namespace
58+
4959class ET_EXPERIMENTAL CudaBackend final
5060 : public ::executorch::runtime::BackendInterface {
5161 private:
62+ void set_skip_copy_method (
63+ const std::array<char , kMaxOptionValueLength >& raw) {
64+ std::lock_guard<std::mutex> guard (skip_copy_method_mutex_);
65+ skip_copy_method_ = std::string (raw.data ());
66+ }
67+
68+ std::array<char , kMaxOptionValueLength > get_skip_copy_method_as_option ()
69+ const {
70+ std::array<char , kMaxOptionValueLength > out{};
71+ std::string value;
72+ {
73+ std::lock_guard<std::mutex> guard (skip_copy_method_mutex_);
74+ value = skip_copy_method_;
75+ }
76+ std::snprintf (out.data (), out.size (), " %s" , value.c_str ());
77+ return out;
78+ }
79+
80+ bool should_skip_copy_for_method (const std::string& method_name) const {
81+ if (method_name.empty ()) {
82+ return false ;
83+ }
84+ std::lock_guard<std::mutex> guard (skip_copy_method_mutex_);
85+ return method_name == skip_copy_method_;
86+ }
87+
5288 Error load_function_pointers_into_handle (
5389 void * so_handle,
5490 AOTIDelegateHandle* handle) const {
@@ -91,6 +127,38 @@ class ET_EXPERIMENTAL CudaBackend final
91127 return 1 ;
92128 }
93129
130+ Error set_option (
131+ ET_UNUSED BackendOptionContext& context,
132+ const executorch::runtime::Span<BackendOption>& backend_options)
133+ override {
134+ for (const auto & option : backend_options) {
135+ if (std::strcmp (option.key , kSkipCopyOutputToCpuForMethod ) == 0 ) {
136+ if (auto * val = std::get_if<std::array<char , kMaxOptionValueLength >>(
137+ &option.value )) {
138+ set_skip_copy_method (*val);
139+ } else {
140+ ET_LOG (
141+ Error,
142+ " Option %s must be a method name string." ,
143+ kSkipCopyOutputToCpuForMethod );
144+ return Error::InvalidArgument;
145+ }
146+ }
147+ }
148+ return Error::Ok;
149+ }
150+
151+ Error get_option (
152+ ET_UNUSED BackendOptionContext& context,
153+ executorch::runtime::Span<BackendOption>& backend_options) override {
154+ for (auto & option : backend_options) {
155+ if (std::strcmp (option.key , kSkipCopyOutputToCpuForMethod ) == 0 ) {
156+ option.value = get_skip_copy_method_as_option ();
157+ }
158+ }
159+ return Error::Ok;
160+ }
161+
94162 // Once per loaded binary blob
95163 Result<DelegateHandle*> init (
96164 BackendInitContext& context,
@@ -159,6 +227,7 @@ class ET_EXPERIMENTAL CudaBackend final
159227 AOTIDelegateHandle* handle = new AOTIDelegateHandle ();
160228 handle->so_handle = lib_handle;
161229 handle->so_path = so_path.string ();
230+ handle->method_name = method_name;
162231
163232 // Load function pointers specific to this handle's shared library
164233 ET_CHECK_OK_OR_RETURN_ERROR (
@@ -224,7 +293,7 @@ class ET_EXPERIMENTAL CudaBackend final
224293
225294 // Process input tensors: ExecuTorch provides CPU tensors, create GPU
226295 // copies
227- for (int i = 0 ; i < n_inputs; i++) {
296+ for (size_t i = 0 ; i < n_inputs; i++) {
228297 // Get tensor dimensions and properties from ExecuTorch CPU tensor
229298 auto cpu_tensor = &(args[i]->toTensor ());
230299 auto sizes = cpu_tensor->sizes ();
@@ -260,7 +329,7 @@ class ET_EXPERIMENTAL CudaBackend final
260329 }
261330 // Process output tensors: create GPU counterparts for ExecuTorch CPU
262331 // tensors
263- for (int i = 0 ; i < n_outputs; i++) {
332+ for (size_t i = 0 ; i < n_outputs; i++) {
264333 // Get output tensor dimensions from ExecuTorch CPU tensor
265334 auto cpu_output_tensor = &(args[i + n_inputs]->toTensor ());
266335 auto sizes = cpu_output_tensor->sizes ();
@@ -303,18 +372,26 @@ class ET_EXPERIMENTAL CudaBackend final
303372 " AOTInductorModelContainerRun failed with error code %d" ,
304373 error);
305374
306- // Copy GPU output results back to CPU output tensors
307- for (int i = 0 ; i < n_outputs; i++) {
308- auto cpu_output_tensor = &(args[i + n_inputs]->toTensor ());
309- // For DYNAMIC_BOUND tensors we try to resize
310- ET_CHECK_OK_OR_RETURN_ERROR (
311- resize_tensor (*cpu_output_tensor, gpu_outputs[i]->sizes ()),
312- " Error resizing tensor at output index %d" ,
313- i);
314- ET_CHECK_OK_OR_RETURN_ERROR (
315- aoti_torch_copy_ (cpu_output_tensor, gpu_outputs[i], 0 ),
316- " Failed to copy GPU output %d back to CPU" ,
317- i);
375+ const bool copy_outputs = !should_skip_copy_for_method (handle->method_name );
376+
377+ if (copy_outputs) {
378+ // Copy GPU output results back to CPU output tensors
379+ for (size_t i = 0 ; i < n_outputs; i++) {
380+ auto cpu_output_tensor = &(args[i + n_inputs]->toTensor ());
381+ // For DYNAMIC_BOUND tensors we try to resize
382+ ET_CHECK_OK_OR_RETURN_ERROR (
383+ resize_tensor (*cpu_output_tensor, gpu_outputs[i]->sizes ()),
384+ " Error resizing tensor at output index %d" ,
385+ i);
386+ ET_CHECK_OK_OR_RETURN_ERROR (
387+ aoti_torch_copy_ (cpu_output_tensor, gpu_outputs[i], 0 ),
388+ " Failed to copy GPU output %d back to CPU" ,
389+ i);
390+ }
391+ } else {
392+ for (size_t i = 0 ; i < n_outputs; i++) {
393+ args[i + n_inputs]->toTensor () = *gpu_outputs[i];
394+ }
318395 }
319396
320397 return Error::Ok;
@@ -365,6 +442,10 @@ class ET_EXPERIMENTAL CudaBackend final
365442 delete handle;
366443 clear_all_tensors ();
367444 }
445+
446+ private:
447+ mutable std::mutex skip_copy_method_mutex_;
448+ std::string skip_copy_method_;
368449};
369450
370451} // namespace executorch::backends::cuda
0 commit comments