diff --git a/sycl/doc/extensions/experimental/sycl_ext_oneapi_root_group.asciidoc b/sycl/doc/extensions/experimental/sycl_ext_oneapi_root_group.asciidoc index 3aca4548e52e..7bfcbf8155bf 100644 --- a/sycl/doc/extensions/experimental/sycl_ext_oneapi_root_group.asciidoc +++ b/sycl/doc/extensions/experimental/sycl_ext_oneapi_root_group.asciidoc @@ -100,28 +100,55 @@ using device queries, as shown in the example below. [source,c++] ---- -auto bundle = sycl::get_kernel_bundle(q.get_context()); -auto kernel = bundle.get_kernel(); -auto maxWGs = kernel.ext_oneapi_get_info(q); -auto range = sycl::nd_range<1>{maxWGs * 32, 32}; -auto props = sycl::ext::oneapi::experimental::properties{sycl::ext::oneapi::experimental::use_root_sync}; -q.parallel_for(range, props, [=](sycl::nd_item<1> it) { - - // Get a handle to the root-group +#include +namespace syclex = sycl::ext::oneapi::experimental; + +struct KernelName { + KernelName(size_t *data) : data{data} {} + + void operator()(sycl::nd_item<1> it) const { + // Get a handle to the root-group. auto root = it.ext_oneapi_get_root_group(); - // Write to some global memory location - data[root.get_local_id()] = root.get_local_id(); + // Write to some global memory location. + data[root.get_local_linear_id()] = root.get_local_linear_id(); - // Synchronize all work-items executing the kernel, making all writes visible + // Synchronize all work-items executing the kernel, making all writes visible. sycl::group_barrier(root); + } -}); ----- + auto get(syclex::properties_tag) const { + // Kernels that use root-group synchronization must be decorated with the + // "use_root_sync" kernel property. + return syclex::properties{syclex::use_root_sync}; + } + + size_t *data; +}; + +int main() { + sycl::queue q; + + // When a kernel uses root-group synchronization, the total number of + // work-groups is limited. This limit can vary depending on the kernel, + // so get a "kernel" object representing the kernel when plan to launch. + auto bundle = sycl::get_kernel_bundle( + q.get_context()); + auto kernel = bundle.get_kernel(); + + // Get the maximum number of work-groups for this kernel. The limit also + // depends on the size of each work-group (which is 32 in this example), and + // on the amount of work-group local memory (which is 0 in this example). + auto maxWGs = kernel.ext_oneapi_get_info< + syclex::info::kernel_queue_specific::max_num_work_groups>(q, 32, 0); + + // Construct an nd-range which launches the maximum number of work-groups. + auto ndr = sycl::nd_range<1>{maxWGs * 32, 32}; -NOTE: SYCL 2020 requires lambdas to be named in order to locate the associated -`sycl::kernel` object used to query information descriptors. Reducing the -verbosity of the queries shown above is left to a future extension. + size_t *data = sycl::malloc_device(maxWGs * 32, q); + q.parallel_for(ndr, KernelName{data}).wait(); +} +---- == Specification @@ -151,20 +178,14 @@ supports. [source,c++] ---- -namespace sycl { -namespace ext { -namespace oneapi { -namespace experimental { +namespace sycl::ext::oneapi::experimental { struct use_root_sync_key { using value_t = property_value; }; inline constexpr use_root_sync_key::value_t use_root_sync; -} // namespace experimental -} // namespace oneapi -} // namespace ext -} // namespace sycl +} // namespace sycl::ext::oneapi::experimental ---- |=== @@ -188,10 +209,7 @@ additional functions. [source,c++] ---- -namespace sycl { -namespace ext { -namespace oneapi { -namespace experimental { +namespace sycl::ext::oneapi::experimental { enum class execution_scope { work_item, @@ -257,10 +275,7 @@ public: }; -} // namespace experimental -} // namespace oneapi -} // namespace ext -} // namespace sycl +} // namespace sycl::ext::oneapi::experimental ---- [source,c++] @@ -421,13 +436,13 @@ NOTE: Support for passing the `root_group` to other group functions and algorithms may be added in a future version of this extension. These group functions and algorithms act as synchronization points, and can -only be used in kernels launched with the `use_root_sync` property. -Attempting to call these functions in kernels that were not launched with the +only be used in kernels decorated with the `use_root_sync` property. +Attempting to call these functions in kernels that were not decorated with the `use_root_sync` property results in undefined behavior. NOTE: Implementations are encouraged to throw a synchronous error with the `errc::invalid` error code if they are able to detect that a developer has -attempted to synchronize a `root_group` from an incompatible kernel launch. +attempted to synchronize a `root_group` from an incompatible kernel. === Accessing the `root_group` instance @@ -506,12 +521,12 @@ implementation of this extension. It is not part of the specification of the extension's API. An implementation of this extension using Level Zero could launch kernels -associated with the `use_root_sync` property via +decorated with the `use_root_sync` property via `zeCommandListAppendLaunchCooperativeKernel`, and could query launch configuration requirements using `zeKernelSuggestMaxCooperativeGroupCount`. Similarly, an implementation of this extension using CUDA could launch kernels -associated with the `use_root_sync` property via +decorated with the `use_root_sync` property via `cudaLaunchCooperativeKernel`, and could query launch configuration requirements using a combination of `cudaOccupancyMaxActiveBlocksPerMultiprocessor` and @@ -544,4 +559,3 @@ template argument for `sycl::nd_item<>`. Adding a runtime query would require callstack. It's unclear if this functionality is necessary or just nice to have -- resolution of this issue depends on user and implementation experience. -- -