Commit 0ac5213

Update staging buffer allocation flags by transfer direction
Differential Revision: D89086669
Pull Request resolved: #16268

1 parent: b7dc758

9 files changed: +81 / -25 lines

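The change threads a vkapi::CopyDirection argument through every staging-buffer constructor so the allocator can request the matching VMA host-access flag for the allocation. A minimal sketch of the resulting call pattern (the `ctx` and `numel` variables are placeholders for illustration, not taken from the diff):

```cpp
// Sketch only: `ctx` (a Context*) and `numel` are assumed to be in scope.
// Upload path: the host writes the buffer, the GPU reads it.
api::StagingBuffer upload(
    ctx, vkapi::kFloat, numel, vkapi::CopyDirection::HOST_TO_DEVICE);

// Readback path: the GPU writes the buffer, the host reads it back.
api::StagingBuffer readback(
    ctx, vkapi::kFloat, numel, vkapi::CopyDirection::DEVICE_TO_HOST);
```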

backends/vulkan/runtime/api/containers/StagingBuffer.h

Lines changed: 4 additions & 2 deletions
```diff
@@ -31,11 +31,13 @@ class StagingBuffer final {
   StagingBuffer(
       Context* context_p,
       const vkapi::ScalarType dtype,
-      const size_t numel)
+      const size_t numel,
+      const vkapi::CopyDirection direction)
       : context_p_(context_p),
         dtype_(dtype),
         vulkan_buffer_(context_p_->adapter_ptr()->vma().create_staging_buffer(
-            element_size(dtype_) * numel)),
+            element_size(dtype_) * numel,
+            direction)),
         mapped_data_(nullptr) {}
 
   StagingBuffer(const StagingBuffer&) = delete;
```

backends/vulkan/runtime/graph/ComputeGraph.cpp

Lines changed: 7 additions & 4 deletions
```diff
@@ -543,10 +543,11 @@ ValueRef ComputeGraph::add_tensorref(
 
 ValueRef ComputeGraph::add_staging(
     const vkapi::ScalarType dtype,
-    const size_t numel) {
+    const size_t numel,
+    const vkapi::CopyDirection direction) {
   ValueRef idx(static_cast<int>(values_.size()));
   check_no_active_value_ptrs();
-  values_.emplace_back(api::StagingBuffer(context(), dtype, numel));
+  values_.emplace_back(api::StagingBuffer(context(), dtype, numel, direction));
   return idx;
 }
 
@@ -593,7 +594,8 @@ ValueRef ComputeGraph::set_input_tensor(
   // For texture storage, the buffer size needs to account for the zero
   // padding applied by unused texel elements.
   size_t buf_numel = get_tensor(idx)->staging_buffer_numel();
-  ValueRef staging_idx = add_staging(staging_dtype, buf_numel);
+  ValueRef staging_idx = add_staging(
+      staging_dtype, buf_numel, vkapi::CopyDirection::HOST_TO_DEVICE);
   add_staging_to_tensor_node(*this, staging_idx, idx);
   inputs_.push_back({idx, staging_idx});
   return staging_idx;
@@ -617,7 +619,8 @@ ValueRef ComputeGraph::set_output_tensor(
   // For texture storage, the buffer size needs to account for the zero
   // padding applied by unused texel elements.
   size_t buf_numel = get_tensor(idx)->staging_buffer_numel();
-  ValueRef staging_idx = add_staging(staging_dtype, buf_numel);
+  ValueRef staging_idx = add_staging(
+      staging_dtype, buf_numel, vkapi::CopyDirection::DEVICE_TO_HOST);
   // We only run this when the tensor is non-empty. When the underlying
   // tensor is empty (e.g. padded_numel == 0), we do not allocate a VkImage to
   // tensor, we will not be able to bind the node for execution.
```

backends/vulkan/runtime/graph/ComputeGraph.h

Lines changed: 4 additions & 1 deletion
```diff
@@ -762,7 +762,10 @@ class ComputeGraph final {
    * use memory that is visible to both the CPU and GPU, and therefore is used
    * as a intermediary when transferring data between the CPU and GPU.
    */
-  ValueRef add_staging(const vkapi::ScalarType dtype, const size_t numel);
+  ValueRef add_staging(
+      const vkapi::ScalarType dtype,
+      const size_t numel,
+      const vkapi::CopyDirection direction);
 
   ValueRef add_none();
```

backends/vulkan/runtime/graph/ops/PrepackNode.cpp

Lines changed: 9 additions & 2 deletions
```diff
@@ -53,14 +53,21 @@ api::StagingBuffer PrepackNode::create_staging_buffer(ComputeGraph* graph) {
     const std::vector<int64_t> packed_sizes = graph->sizes_of(packed_);
     size_t numel = utils::multiply_integers(packed_sizes);
     api::StagingBuffer staging(
-        graph->context(), graph->dtype_of(packed_), numel);
+        graph->context(),
+        graph->dtype_of(packed_),
+        numel,
+        vkapi::CopyDirection::HOST_TO_DEVICE);
     staging.set_staging_zeros();
     return staging;
   }
 
   TensorRefPtr tref = graph->get_tref(tref_);
   size_t numel = utils::multiply_integers(tref->sizes);
-  api::StagingBuffer staging(graph->context(), tref->dtype, numel);
+  api::StagingBuffer staging(
+      graph->context(),
+      tref->dtype,
+      numel,
+      vkapi::CopyDirection::HOST_TO_DEVICE);
   graph->update_staging_nbytes_in_cmd(staging.buffer().mem_size_as_size_t());
   size_t nbytes = numel * vkapi::element_size(tref->dtype);
   staging.copy_from(tref->data, nbytes);
```

backends/vulkan/runtime/vk_api/memory/Allocator.cpp

Lines changed: 11 additions & 5 deletions
```diff
@@ -141,19 +141,25 @@ VulkanImage Allocator::create_image(
       allocate_memory);
 }
 
-VulkanBuffer Allocator::create_staging_buffer(const VkDeviceSize size) {
+VulkanBuffer Allocator::create_staging_buffer(
+    const VkDeviceSize size,
+    const CopyDirection direction) {
   const VkBufferUsageFlags buffer_usage = VK_BUFFER_USAGE_STORAGE_BUFFER_BIT;
 
   VmaAllocationCreateInfo alloc_create_info = {};
-  alloc_create_info.flags = DEFAULT_ALLOCATION_STRATEGY;
+  alloc_create_info.flags =
+      DEFAULT_ALLOCATION_STRATEGY | VMA_ALLOCATION_CREATE_MAPPED_BIT;
   alloc_create_info.usage = VMA_MEMORY_USAGE_AUTO_PREFER_DEVICE;
 
   // Staging buffers are accessed by both the CPU and GPU, so set the
   // appropriate flags to indicate that the host device will be accessing
   // the data from this buffer.
-  alloc_create_info.flags |=
-      VMA_ALLOCATION_CREATE_HOST_ACCESS_SEQUENTIAL_WRITE_BIT |
-      VMA_ALLOCATION_CREATE_MAPPED_BIT;
+  if (direction == CopyDirection::HOST_TO_DEVICE) {
+    alloc_create_info.flags |=
+        VMA_ALLOCATION_CREATE_HOST_ACCESS_SEQUENTIAL_WRITE_BIT;
+  } else {
+    alloc_create_info.flags |= VMA_ALLOCATION_CREATE_HOST_ACCESS_RANDOM_BIT;
+  }
   alloc_create_info.usage = VMA_MEMORY_USAGE_AUTO_PREFER_HOST;
   alloc_create_info.requiredFlags = VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT;
   alloc_create_info.preferredFlags =
```
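The branch above is the core of the change: with VMA's AUTO usage modes, a host-visible allocation must declare how the host will touch it. VMA_ALLOCATION_CREATE_HOST_ACCESS_SEQUENTIAL_WRITE_BIT suits upload buffers the CPU only writes (often write-combined memory, which is slow to read back), while VMA_ALLOCATION_CREATE_HOST_ACCESS_RANDOM_BIT asks for memory the CPU can also read efficiently, which is what a readback buffer needs. A standalone sketch of the same selection logic, independent of the ExecuTorch classes:

```cpp
#include <vk_mem_alloc.h>

// Illustrative only; mirrors the branch in Allocator::create_staging_buffer
// above, but is not the ExecuTorch implementation.
VmaAllocationCreateInfo staging_alloc_info(bool host_to_device) {
  VmaAllocationCreateInfo info = {};
  info.usage = VMA_MEMORY_USAGE_AUTO_PREFER_HOST;
  info.requiredFlags = VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT;
  info.flags = VMA_ALLOCATION_CREATE_MAPPED_BIT;
  if (host_to_device) {
    // The CPU fills the buffer once, front to back; write-combined memory
    // is acceptable.
    info.flags |= VMA_ALLOCATION_CREATE_HOST_ACCESS_SEQUENTIAL_WRITE_BIT;
  } else {
    // The CPU reads results back, so request memory that tolerates random
    // (and therefore cached) host access.
    info.flags |= VMA_ALLOCATION_CREATE_HOST_ACCESS_RANDOM_BIT;
  }
  return info;
}
```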

backends/vulkan/runtime/vk_api/memory/Allocator.h

Lines changed: 12 additions & 1 deletion
```diff
@@ -23,6 +23,17 @@
 namespace vkcompute {
 namespace vkapi {
 
+/**
+ * Indicates the direction of a copy to or from a staging buffer.
+ *
+ * HOST_TO_DEVICE: Data is written by the host and read by the device.
+ * DEVICE_TO_HOST: Data is written by the device and read by the host.
+ */
+enum class CopyDirection : uint8_t {
+  HOST_TO_DEVICE = 0u,
+  DEVICE_TO_HOST = 1u,
+};
+
 constexpr VmaAllocationCreateFlags DEFAULT_ALLOCATION_STRATEGY =
     VMA_ALLOCATION_CREATE_STRATEGY_MIN_MEMORY_BIT;
 
@@ -66,7 +77,7 @@ class Allocator final {
       const bool allow_transfer = false,
       const bool allocate_memory = true);
 
-  VulkanBuffer create_staging_buffer(const VkDeviceSize);
+  VulkanBuffer create_staging_buffer(const VkDeviceSize, const CopyDirection);
 
   VulkanBuffer create_storage_buffer(
       const VkDeviceSize,
```
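A hypothetical pair of call sites for the updated create_staging_buffer signature (the `allocator` object and `nbytes` size are assumed for illustration, not taken from the diff):

```cpp
// Hypothetical usage of the new overload declared above.
VulkanBuffer upload_buf =
    allocator.create_staging_buffer(nbytes, CopyDirection::HOST_TO_DEVICE);
VulkanBuffer readback_buf =
    allocator.create_staging_buffer(nbytes, CopyDirection::DEVICE_TO_HOST);
```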

backends/vulkan/test/utils/test_utils.cpp

Lines changed: 9 additions & 2 deletions
```diff
@@ -399,7 +399,11 @@ void record_matmul_texture3d(
   _(int8_t, QInt8)
 
 void fill_vtensor(api::vTensor& vten, std::vector<float>& data) {
-  api::StagingBuffer staging_buffer(api::context(), vten.dtype(), data.size());
+  api::StagingBuffer staging_buffer(
+      api::context(),
+      vten.dtype(),
+      data.size(),
+      vkapi::CopyDirection::HOST_TO_DEVICE);
 
 #define CASE(ctype, name) \
   case vkapi::ScalarType::name: { \
@@ -486,7 +490,10 @@ void fill_vtensor(
 
 void extract_vtensor(api::vTensor& vten, std::vector<float>& data) {
   api::StagingBuffer staging_buffer(
-      api::context(), vten.dtype(), vten.staging_buffer_numel());
+      api::context(),
+      vten.dtype(),
+      vten.staging_buffer_numel(),
+      vkapi::CopyDirection::DEVICE_TO_HOST);
 
   if (vten.storage_type() == utils::StorageType::BUFFER) {
     record_buffer_to_nchw_op(api::context(), vten, staging_buffer.buffer());
```

backends/vulkan/test/utils/test_utils.h

Lines changed: 4 additions & 2 deletions
```diff
@@ -42,15 +42,17 @@
   vkcompute::api::StagingBuffer staging_buffer_##tensor( \
       vkcompute::api::context(),                         \
       vkapi::kFloat,                                     \
-      tensor.staging_buffer_numel());                    \
+      tensor.staging_buffer_numel(),                     \
+      vkapi::CopyDirection::HOST_TO_DEVICE);             \
   record_nchw_to_image_op(                               \
       vkcompute::api::context(), staging_buffer_##tensor.buffer(), tensor);
 
 #define DEFINE_STAGING_BUFFER_AND_RECORD_FROM_GPU_FOR(tensor) \
   vkcompute::api::StagingBuffer staging_buffer_##tensor(      \
       vkcompute::api::context(),                              \
       vkapi::kFloat,                                          \
-      tensor.staging_buffer_numel());                         \
+      tensor.staging_buffer_numel(),                          \
+      vkapi::CopyDirection::DEVICE_TO_HOST);                  \
   record_image_to_nchw_op(                                    \
       vkcompute::api::context(), tensor, staging_buffer_##tensor.buffer());
```

backends/vulkan/test/vulkan_compute_api_test.cpp

Lines changed: 21 additions & 6 deletions
```diff
@@ -530,7 +530,8 @@ TEST_F(VulkanComputeAPITest, spec_var_classes_test) {
 
 TEST_F(VulkanComputeAPITest, spec_var_shader_test) {
   size_t len = 16;
-  StagingBuffer buffer(context(), vkapi::kFloat, len);
+  StagingBuffer buffer(
+      context(), vkapi::kFloat, len, vkapi::CopyDirection::DEVICE_TO_HOST);
 
   float scale = 3.0f;
   float offset = 1.5f;
@@ -602,7 +603,10 @@ TEST_F(VulkanComputeAPITest, update_params_between_submit) {
   }
 
   StagingBuffer staging_buffer(
-      context(), vkapi::kFloat, a.staging_buffer_numel());
+      context(),
+      vkapi::kFloat,
+      a.staging_buffer_numel(),
+      vkapi::CopyDirection::DEVICE_TO_HOST);
   record_image_to_nchw_op(context(), a, staging_buffer.buffer());
 
   submit_to_gpu();
@@ -622,7 +626,8 @@ TEST_F(VulkanComputeAPITest, update_params_between_submit) {
 
 template <typename T, vkapi::ScalarType dtype>
 void test_storage_buffer_type(const size_t len) {
-  StagingBuffer buffer(context(), dtype, len);
+  StagingBuffer buffer(
+      context(), dtype, len, vkapi::CopyDirection::DEVICE_TO_HOST);
 
   std::string kernel_name("idx_fill_buffer");
   switch (dtype) {
@@ -2013,7 +2018,11 @@ void run_from_gpu_test(
         vten.sizes_ubo());
   }
 
-  StagingBuffer staging_buffer(context(), dtype, vten.staging_buffer_numel());
+  StagingBuffer staging_buffer(
+      context(),
+      dtype,
+      vten.staging_buffer_numel(),
+      vkapi::CopyDirection::DEVICE_TO_HOST);
 
   if (dtype == vkapi::kChar &&
       !context()->adapter_ptr()->has_full_int8_buffers_support()) {
@@ -2049,7 +2058,10 @@ void round_trip_test(
 
   // Create and fill input staging buffer
   StagingBuffer staging_buffer_in(
-      context(), dtype, vten.staging_buffer_numel());
+      context(),
+      dtype,
+      vten.staging_buffer_numel(),
+      vkapi::CopyDirection::HOST_TO_DEVICE);
 
   std::vector<T> data_in(staging_buffer_in.numel());
   for (int i = 0; i < staging_buffer_in.numel(); i++) {
@@ -2059,7 +2071,10 @@ void round_trip_test(
 
   // Output staging buffer
   StagingBuffer staging_buffer_out(
-      context(), dtype, vten.staging_buffer_numel());
+      context(),
+      dtype,
+      vten.staging_buffer_numel(),
+      vkapi::CopyDirection::DEVICE_TO_HOST);
 
   record_nchw_to_image_op(context(), staging_buffer_in.buffer(), vten);
```
