@@ -13,8 +13,10 @@
 #include <executorch/runtime/core/exec_aten/util/tensor_util.h>
 #include <cstdio>
 
+#include <array>
 #include <filesystem>
 #include <fstream>
+#include <mutex>
 #include <string>
 #include <vector>
 
@@ -35,20 +37,55 @@ using executorch::runtime::ArrayRef;
 using executorch::runtime::Backend;
 using executorch::runtime::BackendExecutionContext;
 using executorch::runtime::BackendInitContext;
+using executorch::runtime::BackendOption;
+using executorch::runtime::BackendOptionContext;
 using executorch::runtime::CompileSpec;
 using executorch::runtime::DelegateHandle;
 using executorch::runtime::Error;
 using executorch::runtime::EValue;
 using executorch::runtime::FreeableBuffer;
+using executorch::runtime::kMaxOptionValueLength;
 using executorch::runtime::MemoryAllocator;
 using executorch::runtime::NamedDataMap;
 using executorch::runtime::Result;
 using executorch::runtime::Span;
 using executorch::runtime::etensor::Tensor;
 
+namespace {
+constexpr char kSkipCopyOutputToCpuForMethod[] =
+    "skip_copy_output_to_cpu_for_method";
+}
+
 class ET_EXPERIMENTAL CudaBackend final
     : public ::executorch::runtime::BackendInterface {
  private:
+
+  void set_skip_copy_method(
+      const std::array<char, kMaxOptionValueLength>& raw) {
+    std::lock_guard<std::mutex> guard(skip_copy_method_mutex_);
+    skip_copy_method_ = std::string(raw.data());
+  }
+
+  std::array<char, kMaxOptionValueLength> get_skip_copy_method_as_option()
+      const {
+    std::array<char, kMaxOptionValueLength> out{};
+    std::string value;
+    {
+      std::lock_guard<std::mutex> guard(skip_copy_method_mutex_);
+      value = skip_copy_method_;
+    }
+    std::snprintf(out.data(), out.size(), "%s", value.c_str());
+    return out;
+  }
+
+  bool should_skip_copy_for_method(const std::string& method_name) const {
+    if (method_name.empty()) {
+      return false;
+    }
+    std::lock_guard<std::mutex> guard(skip_copy_method_mutex_);
+    return method_name == skip_copy_method_;
+  }
+
   Error load_function_pointers_into_handle(
       void* so_handle,
       AOTIDelegateHandle* handle) const {
@@ -91,6 +128,38 @@ class ET_EXPERIMENTAL CudaBackend final
     return 1;
   }
 
+  Error set_option(
+      ET_UNUSED BackendOptionContext& context,
+      const executorch::runtime::Span<BackendOption>& backend_options)
+      override {
+    for (const auto& option : backend_options) {
+      if (std::strcmp(option.key, kSkipCopyOutputToCpuForMethod) == 0) {
+        if (auto* val = std::get_if<std::array<char, kMaxOptionValueLength>>(
+                &option.value)) {
+          set_skip_copy_method(*val);
+        } else {
+          ET_LOG(
+              Error,
+              "Option %s must be a method name string.",
+              kSkipCopyOutputToCpuForMethod);
+          return Error::InvalidArgument;
+        }
+      }
+    }
+    return Error::Ok;
+  }
+
+  Error get_option(
+      ET_UNUSED BackendOptionContext& context,
+      executorch::runtime::Span<BackendOption>& backend_options) override {
+    for (auto& option : backend_options) {
+      if (std::strcmp(option.key, kSkipCopyOutputToCpuForMethod) == 0) {
+        option.value = get_skip_copy_method_as_option();
+      }
+    }
+    return Error::Ok;
+  }
+
   // Once per loaded binary blob
   Result<DelegateHandle*> init(
       BackendInitContext& context,
@@ -159,6 +228,7 @@ class ET_EXPERIMENTAL CudaBackend final
     AOTIDelegateHandle* handle = new AOTIDelegateHandle();
     handle->so_handle = lib_handle;
     handle->so_path = so_path.string();
+    handle->method_name = method_name;
 
     // Load function pointers specific to this handle's shared library
     ET_CHECK_OK_OR_RETURN_ERROR(
@@ -222,9 +292,33 @@ class ET_EXPERIMENTAL CudaBackend final
     std::vector<AOTITensorHandle> gpu_outputs(
         n_outputs); // GPU tensors for kernel output
 
+    // RAII helper to ensure GPU tensors are cleaned up on all exit paths.
+    // Prevents memory leaks when errors occur during execute().
+    // TODO(larryliu0820): revisit this after SlimTensor migration, to see
+    // if this is still needed.
+    struct TensorCleanup {
+      std::vector<AOTITensorHandle>& inputs;
+      std::vector<AOTITensorHandle>& outputs;
+
+      ~TensorCleanup() {
+        // Clean up input tensors
+        for (auto* handle : inputs) {
+          if (handle != nullptr) {
+            aoti_torch_delete_tensor_object(handle);
+          }
+        }
+        // Clean up output tensors
+        for (auto* handle : outputs) {
+          if (handle != nullptr) {
+            aoti_torch_delete_tensor_object(handle);
+          }
+        }
+      }
+    };
+    TensorCleanup cleanup{gpu_inputs, gpu_outputs};
     // Process input tensors: ExecuTorch provides CPU tensors, create GPU
     // copies
-    for (int i = 0; i < n_inputs; i++) {
+    for (size_t i = 0; i < n_inputs; i++) {
       // Get tensor dimensions and properties from ExecuTorch CPU tensor
       auto cpu_tensor = &(args[i]->toTensor());
       auto sizes = cpu_tensor->sizes();
@@ -260,7 +354,7 @@ class ET_EXPERIMENTAL CudaBackend final
     }
     // Process output tensors: create GPU counterparts for ExecuTorch CPU
     // tensors
-    for (int i = 0; i < n_outputs; i++) {
+    for (size_t i = 0; i < n_outputs; i++) {
       // Get output tensor dimensions from ExecuTorch CPU tensor
       auto cpu_output_tensor = &(args[i + n_inputs]->toTensor());
       auto sizes = cpu_output_tensor->sizes();
@@ -303,18 +397,26 @@ class ET_EXPERIMENTAL CudaBackend final
303397 " AOTInductorModelContainerRun failed with error code %d" ,
304398 error);
305399
306- // Copy GPU output results back to CPU output tensors
307- for (int i = 0 ; i < n_outputs; i++) {
308- auto cpu_output_tensor = &(args[i + n_inputs]->toTensor ());
309- // For DYNAMIC_BOUND tensors we try to resize
310- ET_CHECK_OK_OR_RETURN_ERROR (
311- resize_tensor (*cpu_output_tensor, gpu_outputs[i]->sizes ()),
312- " Error resizing tensor at output index %d" ,
313- i);
314- ET_CHECK_OK_OR_RETURN_ERROR (
315- aoti_torch_copy_ (cpu_output_tensor, gpu_outputs[i], 0 ),
316- " Failed to copy GPU output %d back to CPU" ,
317- i);
400+ const bool copy_outputs = !should_skip_copy_for_method (handle->method_name );
401+
402+ if (copy_outputs) {
403+ // Copy GPU output results back to CPU output tensors
404+ for (int i = 0 ; i < n_outputs; i++) {
405+ auto cpu_output_tensor = &(args[i + n_inputs]->toTensor ());
406+ // For DYNAMIC_BOUND tensors we try to resize
407+ ET_CHECK_OK_OR_RETURN_ERROR (
408+ resize_tensor (*cpu_output_tensor, gpu_outputs[i]->sizes ()),
409+ " Error resizing tensor at output index %d" ,
410+ i);
411+ ET_CHECK_OK_OR_RETURN_ERROR (
412+ aoti_torch_copy_ (cpu_output_tensor, gpu_outputs[i], 0 ),
413+ " Failed to copy GPU output %d back to CPU" ,
414+ i);
415+ }
416+ } else {
417+ for (int i = 0 ; i < n_outputs; i++) {
418+ args[i + n_inputs]->toTensor () = *gpu_outputs[i];
419+ }
318420 }
319421
320422 return Error::Ok;
@@ -365,6 +467,10 @@ class ET_EXPERIMENTAL CudaBackend final
     delete handle;
     clear_all_tensors();
   }
+
+ private:
+  mutable std::mutex skip_copy_method_mutex_;
+  std::string skip_copy_method_;
 };
 
 } // namespace executorch::backends::cuda
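
Reviewer note: below is a minimal, hedged caller-side sketch of how a runner could exercise the new skip_copy_output_to_cpu_for_method option. It assumes the executorch::runtime::BackendOptions<N> helper (with a const char* overload of set_option() and a view() accessor), a free executorch::runtime::set_option(backend_name, span) entry point, and "CudaBackend" as the backend's registered name; none of those identifiers appear in this diff, so treat them as assumptions rather than the confirmed API.

// Hedged sketch only: every runtime identifier below is assumed, not
// confirmed by this diff (see note above).
#include <executorch/runtime/backend/interface.h> // assumed header locations
#include <executorch/runtime/backend/options.h>

using executorch::runtime::BackendOptions;
using executorch::runtime::Error;

Error keep_decode_outputs_on_gpu() {
  // One slot is enough: this backend only understands a single key.
  BackendOptions<1> opts;
  // "decode" is a hypothetical method name; pass whichever method should
  // skip the device-to-host copy in CudaBackend::execute().
  opts.set_option("skip_copy_output_to_cpu_for_method", "decode");
  // Route the option to the CUDA backend by its (assumed) registered name.
  return executorch::runtime::set_option("CudaBackend", opts.view());
}

If the option round-trips, a later get_option() on the same key should report "decode" back, and execute() for that method will leave its outputs as GPU tensors instead of copying them into the CPU output buffers.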