From 6a33df4d7f697167bada0a224a6e5b88512230c3 Mon Sep 17 00:00:00 2001 From: Aldrin M Date: Thu, 7 Jul 2022 18:54:53 -0700 Subject: [PATCH 1/9] 174: Adding recipe for custom compute functions This recipe shows the major portions of a custom, or new, compute function: - defining a compute kernel - creating a function instance - associating the kernel with the function - registering the function in a registry - calling the function --- cpp/code/compute_fn.cc | 270 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 270 insertions(+) create mode 100644 cpp/code/compute_fn.cc diff --git a/cpp/code/compute_fn.cc b/cpp/code/compute_fn.cc new file mode 100644 index 00000000..d8577457 --- /dev/null +++ b/cpp/code/compute_fn.cc @@ -0,0 +1,270 @@ +// ------------------------------ +// Dependencies + +// standard dependencies +#include +#include +#include + +// arrow dependencies +#include +#include +#include + +#include "common.h" + + +// >> aliases for types in standard library +using std::shared_ptr; +using std::vector; + +// arrow util types +using arrow::Result; +using arrow::Status; +using arrow::Datum; + +// arrow data types and helpers +using arrow::UInt32Builder; +using arrow::Int32Builder; + +using arrow::Array; +using arrow::ArraySpan; + + +// aliases for types used in `NamedScalarFn` +// |> kernel parameters +using arrow::compute::KernelContext; +using arrow::compute::ExecSpan; +using arrow::compute::ExecResult; + +// |> other context types +using arrow::compute::ExecContext; +using arrow::compute::LightContext; + +// |> common types for compute functions +using arrow::compute::FunctionRegistry; +using arrow::compute::FunctionDoc; +using arrow::compute::InputType; +using arrow::compute::OutputType; +using arrow::compute::Arity; + +// |> the "kind" of function we want +using arrow::compute::ScalarFunction; + +// |> structs and classes for hashing +using arrow::util::MiniBatch; +using arrow::util::TempVectorStack; + +using arrow::compute::KeyColumnArray; 
+using arrow::compute::Hashing32; + +// |> functions used for hashing +using arrow::compute::ColumnArrayFromArrayData; + + +// ------------------------------ +// Structs and Classes + +// >> Documentation for a compute function +/** + * Create a const instance of `FunctionDoc` that contains 3 attributes: + * 1. Short description + * 2. Long description (limited to 78 characters) + * 3. Name of input arguments + */ +const FunctionDoc named_scalar_fn_doc { + "Unary function that calculates a hash for each row of the input" + ,"This function uses an xxHash-like algorithm which produces 32-bit hashes." + ,{ "input_array" } +}; + + +// >> Kernel implementations for a compute function +/** + * Create implementations that will be associated with our compute function. When a + * compute function is invoked, the compute API framework will delegate execution to an + * associated kernel that matches: (1) input argument types/shapes and (2) output argument + * types/shapes. + * + * Kernel implementations may be functions or may be methods (functions within a class or + * struct). + */ +struct NamedScalarFn { + + /** + * A kernel implementation that expects a single array as input, and outputs an array of + * uint32 values. We write this implementation knowing what function we want to + * associate it with ("NamedScalarFn"), but that association is made later (see + * `RegisterScalarFnKernels()` below). 
+ */ + static Status + Exec(KernelContext *ctx, const ExecSpan &input_arg, ExecResult *out) { + StartRecipe("DefineAComputeKernel"); + + if (input_arg.num_values() != 1 or not input_arg[0].is_array()) { + return Status::Invalid("Unsupported argument types or shape"); + } + + // >> Initialize stack-based memory allocator with an allocator and memory size + TempVectorStack stack_memallocator; + auto input_dtype_width = input_arg[0].type()->bit_width(); + if (input_dtype_width > 0) { + ARROW_RETURN_NOT_OK( + stack_memallocator.Init( + ctx->exec_context()->memory_pool() + ,input_dtype_width * max_batchsize + ) + ); + } + + // >> Prepare input data structure for propagation to hash function + // NOTE: "start row index" and "row count" can potentially be options in the future + ArraySpan hash_input = input_arg[0].array; + int64_t hash_startrow = 0; + int64_t hash_rowcount = hash_input.length; + ARROW_ASSIGN_OR_RAISE( + KeyColumnArray input_keycol + ,ColumnArrayFromArrayData(hash_input.ToArrayData(), hash_startrow, hash_rowcount) + ); + + // >> Call hashing function + vector hash_results; + hash_results.resize(hash_input.length); + + LightContext hash_ctx; + hash_ctx.hardware_flags = ctx->exec_context()->cpu_info()->hardware_flags(); + hash_ctx.stack = &stack_memallocator; + + Hashing32::HashMultiColumn({ input_keycol }, &hash_ctx, hash_results.data()); + + // >> Prepare results of hash function for kernel output argument + UInt32Builder builder; + builder.Reserve(hash_results.size()); + builder.AppendValues(hash_results); + + ARROW_ASSIGN_OR_RAISE(auto result_array, builder.Finish()); + out->value = result_array->data(); + + EndRecipe("DefineAComputeKernel"); + return Status::OK(); + } + + + static constexpr uint32_t max_batchsize = MiniBatch::kMiniBatchLength; +}; + + +// ------------------------------ +// Functions + + +// >> Function registration and kernel association +/** + * A convenience function that shows how we construct an instance of `ScalarFunction` that + * 
will be registered in a function registry. The instance is constructed with: (1) a + * unique name ("named_scalar_fn"), (2) an "arity" (`Arity::Unary()`), and (3) an instance + * of `FunctionDoc`. + * + * The function name is used to invoke it from a function registry after it has been + * registered. The "arity" is the cardinality of the function's parameters--1 parameter is + * a unary function, 2 parameters is a binary function, etc. Finally, it is helpful to + * associate the function with documentation, which uses the `FunctionDoc` struct. + */ +shared_ptr +RegisterScalarFnKernels() { + StartRecipe("AddKernelsToFunction"); + // Instantiate a function to be registered + auto fn_named_scalar = std::make_shared( + "named_scalar_fn" + ,Arity::Unary() + ,std::move(named_scalar_fn_doc) + ); + + // Associate a kernel implementation with the function using + // `ScalarFunction::AddKernel()` + DCHECK_OK( + fn_named_scalar->AddKernel( + { InputType(arrow::int32()) } + ,OutputType(arrow::uint32()) + ,NamedScalarFn::Exec + ) + ); + + EndRecipe("AddKernelsToFunction"); + return fn_named_scalar; +} + + +/** + * A convenience function that shows how we register a custom function with a + * `FunctionRegistry`. To keep this simple and general, this function takes a pointer to a + * FunctionRegistry as an input argument, then invokes `FunctionRegistry::AddFunction()`. + */ +void +RegisterNamedScalarFn(FunctionRegistry *registry) { + auto scalar_fn = RegisterScalarFnKernels(); + DCHECK_OK(registry->AddFunction(std::move(scalar_fn))); +} + + +// >> Convenience functions +/** + * An optional convenience function to easily invoke our compute function. This executes + * our compute function by invoking `CallFunction` with the name that we used to register + * the function ("named_scalar_fn" in this case). 
+ */ +ARROW_EXPORT +Result +NamedScalarFn(const Datum &input_arg, ExecContext *ctx) { + auto func_name = "named_scalar_fn"; + return CallFunction(func_name, { input_arg }, ctx); +} + + +Result> +BuildIntArray() { + vector col_vals { 0, 1, 1, 2, 3, 5, 8, 13, 21, 34 }; + + Int32Builder builder; + ARROW_RETURN_NOT_OK(builder.Reserve(col_vals.size())); + ARROW_RETURN_NOT_OK(builder.AppendValues(col_vals)); + return builder.Finish(); +} + + +class ComputeFunctionTest : public ::testing::Test {}; + +TEST(ComputeFunctionTest, TestRegisterAndCallFunction) { + // >> Construct some test data + auto build_result = BuildIntArray(); + if (not build_result.ok()) { + std::cerr << build_result.status().message() << std::endl; + return 1; + } + + // >> Peek at the data + auto col_vals = *build_result; + std::cout << col_vals->ToString() << std::endl; + + // >> Invoke compute function + StartRecipe("RegisterAndCallComputeFunction"); + // |> First, register + auto fn_registry = arrow::compute::GetFunctionRegistry(); + RegisterNamedScalarFn(fn_registry); + + + // |> Then, invoke + Datum col_as_datum { col_vals }; + auto fn_result = NamedScalarFn(col_as_datum); + if (not fn_result.ok()) { + std::cerr << fn_result.status().message() << std::endl; + return 2; + } + + auto result_data = fn_result->make_array(); + std::cout << "Success:" << std::endl; + std::cout << "\t" << result_data->ToString() << std::endl; + + EndRecipe("RegisterAndCallComputeFunction"); + return 0; +} From a0bde0a2a00ecdbb6cfece083819c0260b21f29b Mon Sep 17 00:00:00 2001 From: Aldrin M Date: Wed, 27 Jul 2022 16:10:43 -0700 Subject: [PATCH 2/9] [174]: resolved requests and updated comments Mostly minor changes. 
Only major change is replacing the use of hashing32 from key_hash.c with ScalarHelper from hashing.h --- cpp/code/compute_fn.cc | 153 ++++++++++++++++------------------------- 1 file changed, 58 insertions(+), 95 deletions(-) diff --git a/cpp/code/compute_fn.cc b/cpp/code/compute_fn.cc index d8577457..b057f1e9 100644 --- a/cpp/code/compute_fn.cc +++ b/cpp/code/compute_fn.cc @@ -9,7 +9,7 @@ // arrow dependencies #include #include -#include +#include #include "common.h" @@ -18,48 +18,34 @@ using std::shared_ptr; using std::vector; -// arrow util types +// >> commonly used arrow types +// |> general programming support using arrow::Result; using arrow::Status; using arrow::Datum; -// arrow data types and helpers -using arrow::UInt32Builder; -using arrow::Int32Builder; - +// |> arrow data types and helpers +using arrow::Int64Builder; using arrow::Array; using arrow::ArraySpan; -// aliases for types used in `NamedScalarFn` +// >> aliases for types used to define a custom function (e.g. `NamedScalarFn`) // |> kernel parameters using arrow::compute::KernelContext; using arrow::compute::ExecSpan; using arrow::compute::ExecResult; -// |> other context types -using arrow::compute::ExecContext; -using arrow::compute::LightContext; - -// |> common types for compute functions -using arrow::compute::FunctionRegistry; +// |> for defining compute functions and their compute kernels using arrow::compute::FunctionDoc; using arrow::compute::InputType; using arrow::compute::OutputType; using arrow::compute::Arity; - -// |> the "kind" of function we want using arrow::compute::ScalarFunction; -// |> structs and classes for hashing -using arrow::util::MiniBatch; -using arrow::util::TempVectorStack; - -using arrow::compute::KeyColumnArray; -using arrow::compute::Hashing32; - -// |> functions used for hashing -using arrow::compute::ColumnArrayFromArrayData; +// |> for adding to a function registry or using `CallFunction` +using arrow::compute::FunctionRegistry; +using 
arrow::compute::ExecContext; // ------------------------------ @@ -69,12 +55,13 @@ using arrow::compute::ColumnArrayFromArrayData; /** * Create a const instance of `FunctionDoc` that contains 3 attributes: * 1. Short description - * 2. Long description (limited to 78 characters) + * 2. Long description (can be multiple lines, each limited to 78 characters in width) * 3. Name of input arguments */ const FunctionDoc named_scalar_fn_doc { - "Unary function that calculates a hash for each row of the input" - ,"This function uses an xxHash-like algorithm which produces 32-bit hashes." + "Unary function that calculates a hash for each element of the input" + ,("This function uses the xxHash algorithm.\n" + "The result contains a 64-bit hash value for each input element.") ,{ "input_array" } }; @@ -93,7 +80,7 @@ struct NamedScalarFn { /** * A kernel implementation that expects a single array as input, and outputs an array of - * uint32 values. We write this implementation knowing what function we want to + * int64 values. We write this implementation knowing what function we want to * associate it with ("NamedScalarFn"), but that association is made later (see * `RegisterScalarFnKernels()` below). 
*/ @@ -101,56 +88,35 @@ struct NamedScalarFn { Exec(KernelContext *ctx, const ExecSpan &input_arg, ExecResult *out) { StartRecipe("DefineAComputeKernel"); - if (input_arg.num_values() != 1 or not input_arg[0].is_array()) { + // Validate inputs + if (input_arg.num_values() != 1 or !input_arg[0].is_array()) { return Status::Invalid("Unsupported argument types or shape"); } - // >> Initialize stack-based memory allocator with an allocator and memory size - TempVectorStack stack_memallocator; - auto input_dtype_width = input_arg[0].type()->bit_width(); - if (input_dtype_width > 0) { - ARROW_RETURN_NOT_OK( - stack_memallocator.Init( - ctx->exec_context()->memory_pool() - ,input_dtype_width * max_batchsize - ) + // The input ArraySpan manages data as 3 buffers; the data buffer has index `1` + constexpr int bufndx_data = 1; + const int64_t *hash_inputs = input_arg[0].array.GetValues(bufndx_data); + const auto input_len = input_arg[0].array.length; + + // Allocate an Arrow buffer for output + ARROW_ASSIGN_OR_RAISE(std::unique_ptr hash_buffer, + AllocateBuffer(input_len * sizeof(int64_t))); + + // Call hashing function, using both prime multipliers from xxHash + int64_t *hash_results = reinterpret_cast(hash_buffer->mutable_data()); + for (int val_ndx = 0; val_ndx < input_len; ++val_ndx) { + hash_results[val_ndx] = ( + ScalarHelper::ComputeHash(hash_inputs[val_ndx]) + + ScalarHelper::ComputeHash(hash_inputs[val_ndx]) ); } - // >> Prepare input data structure for propagation to hash function - // NOTE: "start row index" and "row count" can potentially be options in the future - ArraySpan hash_input = input_arg[0].array; - int64_t hash_startrow = 0; - int64_t hash_rowcount = hash_input.length; - ARROW_ASSIGN_OR_RAISE( - KeyColumnArray input_keycol - ,ColumnArrayFromArrayData(hash_input.ToArrayData(), hash_startrow, hash_rowcount) - ); - - // >> Call hashing function - vector hash_results; - hash_results.resize(hash_input.length); - - LightContext hash_ctx; - 
hash_ctx.hardware_flags = ctx->exec_context()->cpu_info()->hardware_flags(); - hash_ctx.stack = &stack_memallocator; - - Hashing32::HashMultiColumn({ input_keycol }, &hash_ctx, hash_results.data()); - - // >> Prepare results of hash function for kernel output argument - UInt32Builder builder; - builder.Reserve(hash_results.size()); - builder.AppendValues(hash_results); - - ARROW_ASSIGN_OR_RAISE(auto result_array, builder.Finish()); - out->value = result_array->data(); + // Use ArrayData (not ArraySpan) for ownership of result buffer + out->value = ArrayData{int64(), input_len, {nullptr, std::move(hash_buffer)}}; EndRecipe("DefineAComputeKernel"); return Status::OK(); } - - - static constexpr uint32_t max_batchsize = MiniBatch::kMiniBatchLength; }; @@ -172,7 +138,7 @@ struct NamedScalarFn { */ shared_ptr RegisterScalarFnKernels() { - StartRecipe("AddKernelsToFunction"); + StartRecipe("AddKernelToFunction"); // Instantiate a function to be registered auto fn_named_scalar = std::make_shared( "named_scalar_fn" @@ -180,17 +146,16 @@ RegisterScalarFnKernels() { ,std::move(named_scalar_fn_doc) ); - // Associate a kernel implementation with the function using - // `ScalarFunction::AddKernel()` + // Associate a function and kernel using `ScalarFunction::AddKernel()` DCHECK_OK( fn_named_scalar->AddKernel( - { InputType(arrow::int32()) } - ,OutputType(arrow::uint32()) + { InputType(arrow::int64()) } + ,OutputType(arrow::int64()) ,NamedScalarFn::Exec ) ); + EndRecipe("AddKernelToFunction"); - EndRecipe("AddKernelsToFunction"); return fn_named_scalar; } @@ -209,9 +174,9 @@ RegisterNamedScalarFn(FunctionRegistry *registry) { // >> Convenience functions /** - * An optional convenience function to easily invoke our compute function. This executes - * our compute function by invoking `CallFunction` with the name that we used to register - * the function ("named_scalar_fn" in this case). + * An optional, convenient invocation function to easily call our compute function. 
This + * executes our compute function by invoking `CallFunction` with the name that we used to + * register the function ("named_scalar_fn" in this case). */ ARROW_EXPORT Result @@ -223,9 +188,9 @@ NamedScalarFn(const Datum &input_arg, ExecContext *ctx) { Result> BuildIntArray() { - vector col_vals { 0, 1, 1, 2, 3, 5, 8, 13, 21, 34 }; + vector col_vals { 0, 1, 1, 2, 3, 5, 8, 13, 21, 34 }; - Int32Builder builder; + Int64Builder builder; ARROW_RETURN_NOT_OK(builder.Reserve(col_vals.size())); ARROW_RETURN_NOT_OK(builder.AppendValues(col_vals)); return builder.Finish(); @@ -235,27 +200,22 @@ BuildIntArray() { class ComputeFunctionTest : public ::testing::Test {}; TEST(ComputeFunctionTest, TestRegisterAndCallFunction) { - // >> Construct some test data + // >> Register the function first + StartRecipe("RegisterComputeFunction"); + auto fn_registry = arrow::compute::GetFunctionRegistry(); + RegisterNamedScalarFn(fn_registry); + EndRecipe("RegisterComputeFunction"); + + // >> Then we can call the function + StartRecipe("InvokeComputeFunction"); auto build_result = BuildIntArray(); if (not build_result.ok()) { std::cerr << build_result.status().message() << std::endl; return 1; } - // >> Peek at the data - auto col_vals = *build_result; - std::cout << col_vals->ToString() << std::endl; - - // >> Invoke compute function - StartRecipe("RegisterAndCallComputeFunction"); - // |> First, register - auto fn_registry = arrow::compute::GetFunctionRegistry(); - RegisterNamedScalarFn(fn_registry); - - - // |> Then, invoke - Datum col_as_datum { col_vals }; - auto fn_result = NamedScalarFn(col_as_datum); + Datum col_data { *build_result }; + auto fn_result = NamedScalarFn(col_data); if (not fn_result.ok()) { std::cerr << fn_result.status().message() << std::endl; return 2; @@ -264,7 +224,10 @@ TEST(ComputeFunctionTest, TestRegisterAndCallFunction) { auto result_data = fn_result->make_array(); std::cout << "Success:" << std::endl; std::cout << "\t" << result_data->ToString() << 
std::endl; + EndRecipe("InvokeComputeFunction"); + + // If we want to peek at the input data + std::cout << col_data.make_array()->ToString() << std::endl; - EndRecipe("RegisterAndCallComputeFunction"); return 0; } From 411d471fcecd41ee6d93c295940108c6bc76e5df Mon Sep 17 00:00:00 2001 From: Aldrin M Date: Wed, 27 Jul 2022 17:12:31 -0700 Subject: [PATCH 3/9] [174]: adding initial compute documentation compute.rst is documentation to describe compute functions. For now it describes how to define and register a compute function. Still a work in progress. Updates to compute_fn.cc are to reflect the description provided in compute.rst --- cpp/code/compute_fn.cc | 17 ++++++--- cpp/source/compute.rst | 84 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 95 insertions(+), 6 deletions(-) create mode 100644 cpp/source/compute.rst diff --git a/cpp/code/compute_fn.cc b/cpp/code/compute_fn.cc index b057f1e9..a40cbd8e 100644 --- a/cpp/code/compute_fn.cc +++ b/cpp/code/compute_fn.cc @@ -167,8 +167,11 @@ RegisterScalarFnKernels() { */ void RegisterNamedScalarFn(FunctionRegistry *registry) { + StartRecipe("AddFunctionToRegistry"); + // scalar_fn has type: shared_ptr auto scalar_fn = RegisterScalarFnKernels(); DCHECK_OK(registry->AddFunction(std::move(scalar_fn))); + EndRecipe("AddFunctionToRegistry"); } @@ -181,8 +184,12 @@ RegisterNamedScalarFn(FunctionRegistry *registry) { ARROW_EXPORT Result NamedScalarFn(const Datum &input_arg, ExecContext *ctx) { - auto func_name = "named_scalar_fn"; - return CallFunction(func_name, { input_arg }, ctx); + StartRecipe("InvokeByCallFunction"); + auto func_name = "named_scalar_fn"; + auto result_datum = CallFunction(func_name, { input_arg }, ctx); + EndRecipe("InvokeByCallFunction"); + + return result_datum; } @@ -201,13 +208,11 @@ class ComputeFunctionTest : public ::testing::Test {}; TEST(ComputeFunctionTest, TestRegisterAndCallFunction) { // >> Register the function first - StartRecipe("RegisterComputeFunction"); auto fn_registry = 
arrow::compute::GetFunctionRegistry(); RegisterNamedScalarFn(fn_registry); - EndRecipe("RegisterComputeFunction"); // >> Then we can call the function - StartRecipe("InvokeComputeFunction"); + StartRecipe("InvokeByConvenienceFunction"); auto build_result = BuildIntArray(); if (not build_result.ok()) { std::cerr << build_result.status().message() << std::endl; @@ -224,7 +229,7 @@ TEST(ComputeFunctionTest, TestRegisterAndCallFunction) { auto result_data = fn_result->make_array(); std::cout << "Success:" << std::endl; std::cout << "\t" << result_data->ToString() << std::endl; - EndRecipe("InvokeComputeFunction"); + EndRecipe("InvokeByConvenienceFunction"); // If we want to peek at the input data std::cout << col_data.make_array()->ToString() << std::endl; diff --git a/cpp/source/compute.rst b/cpp/source/compute.rst new file mode 100644 index 00000000..71b038fe --- /dev/null +++ b/cpp/source/compute.rst @@ -0,0 +1,84 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +==================================== +Defining and Using Compute Functions +==================================== + +This section contains (or will contain) a number of recipes illustrating how to +define new "compute functions" or how to use existing ones. 
Arrow contains a "Compute +API," which primarily consists of a "registry" of functions that can be invoked. +Currently, Arrow populates a default registry with a variety of useful functions. The +recipes provided in this section show some approaches to define a compute function as well +as how to invoke a compute function by name, given a registry. + + +.. contents:: + +Invoke a Compute Function +========================= + +When invoking a compute function, the function must exist in a function registry. In this +recipe, we use `CallFunction()` to invoke the function with name "named_scalar_fn". + +.. recipe:: ../code/compute_fn.cc InvokeByCallFunction + :caption: Use CallFunction() to invoke a compute function by name + :dedent: 2 + +.. note:: + This method allows us to specify arguments as a vector and a custom ExecContext. + +If CallFunction is not provided an ExecContext (it is null), then the default builtin +FunctionRegistry will be used to call the function from. + +If we have defined a convenience function that wraps `CallFunction()`, then we can call +that function instead. Various compute functions provided by Arrow have these convenience +functions defined, such as `Add` or `Subtract`. + +.. recipe:: ../code/compute_fn.cc InvokeByConvenienceFunction + :caption: Use a convenience invocation function to call a compute function + :dedent: 2 + + +Adding a Custom Compute Function +================================ + +To make a custom compute function available, there are 3 primary steps: +1. Define kernels for the function (these implement the actual logic) +2. Associate the kernels with a function object +3. Add the function object to a function registry + + +Define Function Kernels +----------------------- + +A kernel function is a single function that implements the desired logic for the compute +function. 
The body of the kernel function may use other functions, but the kernel function +itself is a singular instance that will be associated with the desired compute function. + +The signature of a kernel function is relatively standardized: it returns a `Status` and +takes a context, some arguments, and a pointer to an output result. The context wraps an +`ExecContext` and other metadata about the environment in which the kernel function should +be executed. The input arguments are contained within an `ExecSpan` (newly added in place +of `ExecBatch`), which holds non-owning references to argument data. Finally, the +`ExecResult` pointed to should be set to an appropriate `ArraySpan` or `ArrayData` +instance, depending on ownership semantics of the kernel's output. + +.. recipe:: ../code/compute_fn.cc DefineAComputeKernel + :caption: Define an example compute kernel that uses ScalarHelper from hashing.h to hash + input values + :dedent: 2 From 797e621d65325118b651b45d25d6332a4f3df923 Mon Sep 17 00:00:00 2001 From: Aldrin M Date: Thu, 28 Jul 2022 08:31:53 -0700 Subject: [PATCH 4/9] [174]: small update to documentation --- cpp/source/compute.rst | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/cpp/source/compute.rst b/cpp/source/compute.rst index 71b038fe..3c883947 100644 --- a/cpp/source/compute.rst +++ b/cpp/source/compute.rst @@ -42,7 +42,7 @@ recipe, we use `CallFunction()` to invoke the function with name "named_scalar_f .. note:: This method allows us to specify arguments as a vector and a custom ExecContext. -If CallFunction is not provided an ExecContext (it is null), then the default builtin +If an `ExecContext` is not passed to `CallFunction` (it is null), then the default FunctionRegistry will be used to call the function from. 
If we have defined a convenience function that wraps `CallFunction()`, then we can call @@ -66,11 +66,16 @@ To make a custom compute function available, there are 3 primary steps: Define Function Kernels ----------------------- -A kernel function is a single function that implements the desired logic for the compute -function. The body of the kernel function may use other functions, but the kernel function +A kernel is a particular function that implements desired logic for a compute function. +There are at least a couple of types of function kernels, such as initialization kernels +and execution kernels. An initialization kernel prepares the initial state of a compute +function, while an execution kernel executes the main processing logic of the compute +function. The body of a function kernel may use other functions, but the kernel function itself is a singular instance that will be associated with the desired compute function. +While compute functions can be associated with an initialization and execution kernel +pair, this recipe only shows the definition of an execution kernel. -The signature of a kernel function is relatively standardized: it returns a `Status` and +The signature of an execution kernel is relatively standardized: it returns a `Status` and takes a context, some arguments, and a pointer to an output result. The context wraps an `ExecContext` and other metadata about the environment in which the kernel function should be executed. The input arguments are contained within an `ExecSpan` (newly added in place @@ -82,3 +87,4 @@ instance, depending on ownership semantics of the kernel's output. 
:caption: Define an example compute kernel that uses ScalarHelper from hashing.h to hash input values :dedent: 2 + From 146376a67fc5d9bee2808b565a9cd92c2f1ab2ff Mon Sep 17 00:00:00 2001 From: Aldrin M Date: Thu, 28 Jul 2022 09:10:36 -0700 Subject: [PATCH 5/9] [174]: completed draft of documentation --- cpp/source/compute.rst | 51 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 51 insertions(+) diff --git a/cpp/source/compute.rst b/cpp/source/compute.rst index 3c883947..28491fd9 100644 --- a/cpp/source/compute.rst +++ b/cpp/source/compute.rst @@ -88,3 +88,54 @@ instance, depending on ownership semantics of the kernel's output. input values :dedent: 2 +This recipe shows basic validation of `input_arg` which contains a vector of input +arguments. Then, the input `Array` is accessed from `input_arg` and a `Buffer` is +allocated to hold output results. After the main loop is completed, the allocated `Buffer` +is wrapped in an `ArrayData` instance and referenced by `out`. + + +Associate Kernels with a Function +--------------------------------- + +The process of adding kernels to a compute function is easy: (1) create an appropriate +`Function` instance--`ScalarFunction` in this case--and (2) call the `AddKernel` function. +The more difficult part of this process is repeating for the desired data types and +knowing how the signatures work. + +.. recipe:: ../code/compute_fn.cc AddKernelToFunction + :caption: Instantiate a ScalarFunction and add our execution kernel to it + :dedent: 2 + +A `ScalarFunction` represents a "scalar" or "element-wise" compute function (see +documentation on the Compute API). The signature used in this recipe passes: +1. A function name (to be used when calling it) +2. An "Arity" meaning how many input arguments it takes (like cardinality) +3. A `FunctionDoc` instance (to associate some documentation programmatically) + +Then, `AddKernel` expects: +1. A vector of data types for each input argument +2. 
An output data type for the result +3. The function to be used as the execution kernel +4. The function to be used as the initialization kernel (optional) + +Note that the constructor for `ScalarFunction` is more interested in how many arguments to +expect, and some information about the compute function itself; whereas, the function to +add a kernel specifies data types and the functions to call at runtime. + + +Add Function to Registry +------------------------ + +Finally, adding the function to a registry is wonderfully straightforward. + +.. recipe:: ../code/compute_fn.cc AddFunctionToRegistry + :caption: Use convenience function to get a ScalarFunction with associated kernels, then + add it to the given FunctionRegistry + :dedent: 2 + +In this recipe, we simply wrap the logic in a convenience function that: (1) creates a +`ScalarFunction`, (2) adds our execution kernel to the compute function, and (3) returns +the compute function. Then, we add the compute function to some registry. This recipe +takes the `FunctionRegistry` as an argument so that it is easy to call from the same place +that the Arrow codebase registers other provided functions. Otherwise, we can add our +compute function to the default registry, or a custom registry. From 28fada60bc694849cbdd9c58d6d61033723b0155 Mon Sep 17 00:00:00 2001 From: Aldrin M Date: Thu, 28 Jul 2022 09:13:36 -0700 Subject: [PATCH 6/9] [174]: changing list formatting --- cpp/source/compute.rst | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/cpp/source/compute.rst b/cpp/source/compute.rst index 28491fd9..9386c387 100644 --- a/cpp/source/compute.rst +++ b/cpp/source/compute.rst @@ -58,8 +58,11 @@ Adding a Custom Compute Function ================================ To make a custom compute function available, there are 3 primary steps: + 1. Define kernels for the function (these implement the actual logic) + 2. Associate the kernels with a function object + 3. 
Add the function object to a function registry @@ -108,14 +111,21 @@ knowing how the signatures work. A `ScalarFunction` represents a "scalar" or "element-wise" compute function (see documentation on the Compute API). The signature used in this recipe passes: + 1. A function name (to be used when calling it) + 2. An "Arity" meaning how many input arguments it takes (like cardinality) + 3. A `FunctionDoc` instance (to associate some documentation programmatically) Then, `AddKernel` expects: + 1. A vector of data types for each input argument + 2. An output data type for the result + 3. The function to be used as the execution kernel + 4. The function to be used as the initialization kernel (optional) Note that the constructor for `ScalarFunction` is more interested in how many arguments to From 6c894dd31eea8d3ad121a09f158f4575d001c115 Mon Sep 17 00:00:00 2001 From: Aldrin Montana Date: Thu, 28 Jul 2022 14:15:06 -0700 Subject: [PATCH 7/9] 174: updated docs to be a bit more terse Also added namespace and :class: and :func: annotations. I think it doesn't matter in the cookbook, but it links nicely when in the arrow docs themselves. I'm kind of curious how it renders in the cookbook --- cpp/source/compute.rst | 95 ++++++++++++++++++++---------------------- 1 file changed, 46 insertions(+), 49 deletions(-) diff --git a/cpp/source/compute.rst b/cpp/source/compute.rst index 9386c387..548912c4 100644 --- a/cpp/source/compute.rst +++ b/cpp/source/compute.rst @@ -19,12 +19,11 @@ Defining and Using Compute Functions ==================================== -This section contains (or will contain) a number of recipes illustrating how to -define new "compute functions" or how to use existing ones. Arrow contains a "Compute -API," which primarily consists of a "registry" of functions that can be invoked. -Currently, Arrow populates a default registry with a variety of useful functions. 
The -recipes provided in this section show some approaches to define a compute function as well -as how to invoke a compute function by name, given a registry. +Arrow contains a "Compute API," which primarily consists of a "registry" of functions that +can be invoked. Currently, Arrow populates a default registry with a variety of +functions, which we call "compute functions". This section contains (or will contain) a +number of recipes illustrating how to define compute functions or how to use existing +ones. .. contents:: @@ -32,8 +31,12 @@ as how to invoke a compute function by name, given a registry. Invoke a Compute Function ========================= -When invoking a compute function, the function must exist in a function registry. In this -recipe, we use `CallFunction()` to invoke the function with name "named_scalar_fn". +When invoking a compute function, the function must exist in a function registry. Here, we +use :func:`arrow::compute::CallFunction` to invoke the function with name +"named_scalar_fn". :func:`arrow::compute::CallFunction` uses the function registry +referenced from the :class:`ExecContext` argument. If an :class:`ExecContext` is not +specified, the default :class:`ExecContext` is used (which references a default +:class:`FunctionRegistry`). .. recipe:: ../code/compute_fn.cc InvokeByCallFunction :caption: Use CallFunction() to invoke a compute function by name @@ -42,12 +45,9 @@ recipe, we use `CallFunction()` to invoke the function with name "named_scalar_f .. note:: This method allows us to specify arguments as a vector and a custom ExecContext. -If an `ExecContext` is not passed to `CallFunction` (it is null), then the default -FunctionRegistry will be used to call the function from. - -If we have defined a convenience function that wraps `CallFunction()`, then we can call -that function instead. Various compute functions provided by Arrow have these convenience -functions defined, such as `Add` or `Subtract`. 
+Sometimes, a convenience function (such as :func:`arrow::compute::Add` or +:func:`arrow::compute::Subtract`) is defined. These functions are usually implemented as +wrappers around :func:`arrow::compute::CallFunction`. .. recipe:: ../code/compute_fn.cc InvokeByConvenienceFunction :caption: Use a convenience invocation function to call a compute function @@ -69,22 +69,15 @@ To make a custom compute function available, there are 3 primary steps: Define Function Kernels ----------------------- -A kernel is a particular function that implements desired logic for a compute function. -There are at least a couple of types of function kernels, such as initialization kernels -and execution kernels. An initialization kernel prepares the initial state of a compute -function, while an execution kernel executes the main processing logic of the compute -function. The body of a function kernel may use other functions, but the kernel function -itself is a singular instance that will be associated with the desired compute function. -While compute functions can be associated with an initialization and execution kernel -pair, this recipe only shows the definition of an execution kernel. - -The signature of an execution kernel is relatively standardized: it returns a `Status` and -takes a context, some arguments, and a pointer to an output result. The context wraps an -`ExecContext` and other metadata about the environment in which the kernel function should -be executed. The input arguments are contained within an `ExecSpan` (newly added in place -of `ExecBatch`), which holds non-owning references to argument data. Finally, the -`ExecResult` pointed to should be set to an appropriate `ArraySpan` or `ArrayData` -instance, depending on ownership semantics of the kernel's output. +The signature of an execution kernel is relatively standardized: it returns a +:class:`arrow::Status` and takes a context, some arguments, and a pointer to an output +result. 
The context wraps an :class:`arrow::compute::ExecContext` and other metadata about +the environment in which the kernel function should be executed. The input arguments are +contained within an :class:`arrow::compute::ExecSpan` (newly added in place of +:class:`arrow::compute::ExecBatch`), which holds non-owning references to argument data. +Finally, the :class:`arrow::compute::ExecResult` pointed to should be set to an +appropriate :class:`arrow::ArraySpan` or :class:`arrow::ArrayData` instance, depending on +ownership semantics of the kernel's output. .. recipe:: ../code/compute_fn.cc DefineAComputeKernel :caption: Define an example compute kernel that uses ScalarHelper from hashing.h to hash @@ -92,33 +85,35 @@ instance, depending on ownership semantics of the kernel's output. :dedent: 2 This recipe shows basic validation of `input_arg` which contains a vector of input -arguments. Then, the input `Array` is accessed from `input_arg` and a `Buffer` is -allocated to hold output results. After the main loop is completed, the allocated `Buffer` -is wrapped in an `ArrayData` instance and referenced by `out`. +arguments. Then, the input :class:`arrow::Array` is accessed from `input_arg` and a +:class:`arrow::Buffer` is allocated to hold output results. After the main loop is +completed, the allocated :class:`arrow::Buffer` is wrapped in an :class:`arrow::ArrayData` +instance and referenced by `out`. Associate Kernels with a Function --------------------------------- -The process of adding kernels to a compute function is easy: (1) create an appropriate -`Function` instance--`ScalarFunction` in this case--and (2) call the `AddKernel` function. -The more difficult part of this process is repeating for the desired data types and -knowing how the signatures work. 
+Kernels are added to a compute function in 2 steps: (1) create an appropriate function +object--:class:`arrow::compute::ScalarFunction` in this case--and (2) call the +:func:`arrow::compute::ScalarFunction::AddKernel` function. The AddKernel function is +repeated for each desired input data type. .. recipe:: ../code/compute_fn.cc AddKernelToFunction :caption: Instantiate a ScalarFunction and add our execution kernel to it :dedent: 2 -A `ScalarFunction` represents a "scalar" or "element-wise" compute function (see -documentation on the Compute API). The signature used in this recipe passes: +A :class:`arrow::compute::ScalarFunction` represents a "scalar" or "element-wise" compute +function (see documentation on the Compute API). The signature used in this recipe passes: 1. A function name (to be used when calling it) 2. An "Arity" meaning how many input arguments it takes (like cardinality) -3. A `FunctionDoc` instance (to associate some documentation programmatically) +3. A :class:`arrow::compute::FunctionDoc` instance (to associate some documentation + programmatically) -Then, `AddKernel` expects: +Then, :func:`arrow::compute::ScalarFunction::AddKernel` expects: 1. A vector of data types for each input argument @@ -128,9 +123,10 @@ Then, `AddKernel` expects: 4. The function to be used as the initialization kernel (optional) -Note that the constructor for `ScalarFunction` is more interested in how many arguments to -expect, and some information about the compute function itself; whereas, the function to -add a kernel specifies data types and the functions to call at runtime. +Note that the constructor for :class:`arrow::compute::ScalarFunction` is more interested +in how many arguments to expect, and some information about the compute function itself; +whereas, the function to add a kernel specifies data types and the functions to call at +runtime. 
Add Function to Registry @@ -144,8 +140,9 @@ Finally, adding the function to a registry is wonderfully straightforward. :dedent: 2 In this recipe, we simply wrap the logic in a convenience function that: (1) creates a -`ScalarFunction`, (2) adds our execution kernel to the compute function, and (3) returns -the compute function. Then, we add the compute function to some registry. This recipe -takes the `FunctionRegistry` as an argument so that it is easy to call from the same place -that the Arrow codebase registers other provided functions. Otherwise, we can add our -compute function to the default registry, or a custom registry. +:class:`arrow::compute::ScalarFunction`, (2) adds our execution kernel to the compute +function, and (3) returns the compute function. Then, we add the compute function to some +registry. This recipe takes the :class:`arrow::compute::FunctionRegistry` as an argument +so that it is easy to call from the same place that the Arrow codebase registers other +provided functions. Otherwise, we can add our compute function to the default registry, +or a custom registry. From 08f01882a0ce9bb3d178fcadf38e825d7912f13e Mon Sep 17 00:00:00 2001 From: Aldrin Montana Date: Thu, 28 Jul 2022 14:18:39 -0700 Subject: [PATCH 8/9] GH-174 moved some recipe labels Trying to see if moving these recipe labels captures C++ code blocks outside of function bodies --- cpp/code/compute_fn.cc | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cpp/code/compute_fn.cc b/cpp/code/compute_fn.cc index a40cbd8e..450635e5 100644 --- a/cpp/code/compute_fn.cc +++ b/cpp/code/compute_fn.cc @@ -160,6 +160,7 @@ RegisterScalarFnKernels() { } +// StartRecipe("AddFunctionToRegistry"); /** * A convenience function that shows how we register a custom function with a * `FunctionRegistry`. 
To keep this simple and general, this function takes a pointer to a @@ -167,15 +168,15 @@ RegisterScalarFnKernels() { */ void RegisterNamedScalarFn(FunctionRegistry *registry) { - StartRecipe("AddFunctionToRegistry"); // scalar_fn has type: shared_ptr<ScalarFunction> auto scalar_fn = RegisterScalarFnKernels(); DCHECK_OK(registry->AddFunction(std::move(scalar_fn))); - EndRecipe("AddFunctionToRegistry"); } +// EndRecipe("AddFunctionToRegistry"); // >> Convenience functions +// StartRecipe("InvokeByCallFunction"); /** * An optional, convenient invocation function to easily call our compute function. This * executes our compute function by invoking `CallFunction` with the name that we used to @@ -184,13 +185,12 @@ RegisterNamedScalarFn(FunctionRegistry *registry) { ARROW_EXPORT Result<Datum> NamedScalarFn(const Datum &input_arg, ExecContext *ctx) { - StartRecipe("InvokeByCallFunction"); auto func_name = "named_scalar_fn"; auto result_datum = CallFunction(func_name, { input_arg }, ctx); - EndRecipe("InvokeByCallFunction"); return result_datum; } +// EndRecipe("InvokeByCallFunction"); Result> From af50801dec1a6292918e9234ba4eedb444b2dafe Mon Sep 17 00:00:00 2001 From: Aldrin Montana Date: Thu, 28 Jul 2022 14:36:20 -0700 Subject: [PATCH 9/9] GH-174: changed some recipes to literalincludes I am trying the literalinclude directive first before trying to use recipe directives in comments --- cpp/code/compute_fn.cc | 9 ++++----- cpp/source/compute.rst | 39 ++++++++++++++++++++++----------------- 2 files changed, 26 insertions(+), 22 deletions(-) diff --git a/cpp/code/compute_fn.cc b/cpp/code/compute_fn.cc index 450635e5..cd5c32a3 100644 --- a/cpp/code/compute_fn.cc +++ b/cpp/code/compute_fn.cc @@ -67,6 +67,7 @@ const FunctionDoc named_scalar_fn_doc { // >> Kernel implementations for a compute function +// StartRecipe("DefineAComputeKernel"); /** * Create implementations that will be associated with our compute function.
When a * compute function is invoked, the compute API framework will delegate execution to an @@ -86,7 +87,6 @@ struct NamedScalarFn { */ static Status Exec(KernelContext *ctx, const ExecSpan &input_arg, ExecResult *out) { - StartRecipe("DefineAComputeKernel"); // Validate inputs if (input_arg.num_values() != 1 or !input_arg[0].is_array()) { @@ -113,11 +113,10 @@ struct NamedScalarFn { // Use ArrayData (not ArraySpan) for ownership of result buffer out->value = ArrayData{int64(), input_len, {nullptr, std::move(hash_buffer)}}; - - EndRecipe("DefineAComputeKernel"); return Status::OK(); } }; +// EndRecipe("DefineAComputeKernel"); // ------------------------------ @@ -125,6 +124,7 @@ struct NamedScalarFn { // >> Function registration and kernel association +// StartRecipe("AddKernelToFunction"); /** * A convenience function that shows how we construct an instance of `ScalarFunction` that * will be registered in a function registry. The instance is constructed with: (1) a @@ -138,7 +138,6 @@ struct NamedScalarFn { */ shared_ptr<ScalarFunction> RegisterScalarFnKernels() { - StartRecipe("AddKernelToFunction"); // Instantiate a function to be registered auto fn_named_scalar = std::make_shared<ScalarFunction>( "named_scalar_fn" @@ -154,10 +153,10 @@ RegisterScalarFnKernels() { ,NamedScalarFn::Exec ) ); - EndRecipe("AddKernelToFunction"); return fn_named_scalar; } +// EndRecipe("AddKernelToFunction"); // StartRecipe("AddFunctionToRegistry"); diff --git a/cpp/source/compute.rst b/cpp/source/compute.rst index 548912c4..6822d060 100644 --- a/cpp/source/compute.rst +++ b/cpp/source/compute.rst @@ -38,12 +38,11 @@ referenced from the :class:`ExecContext` argument. If an :class:`ExecContext` is specified, the default :class:`ExecContext` is used (which references a default :class:`FunctionRegistry`). -.. recipe:: ../code/compute_fn.cc InvokeByCallFunction - :caption: Use CallFunction() to invoke a compute function by name - :dedent: 2 - -..
note:: - This method allows us to specify arguments as a vector and a custom ExecContext. +.. literalinclude:: ../code/compute_fn.cc + :language: cpp + :lines: 179-191 + :caption: Use CallFunction() to invoke a compute function by name + :dedent: 2 Sometimes, a convenience function (such as :func:`arrow::compute::Add` or :func:`arrow::compute::Subtract`) is defined. These functions are usually implemented as @@ -79,10 +78,12 @@ Finally, the :class:`arrow::compute::ExecResult` pointed to should be set to an appropriate :class:`arrow::ArraySpan` or :class:`arrow::ArrayData` instance, depending on ownership semantics of the kernel's output. -.. recipe:: ../code/compute_fn.cc DefineAComputeKernel - :caption: Define an example compute kernel that uses ScalarHelper from hashing.h to hash - input values - :dedent: 2 +.. literalinclude:: ../code/compute_fn.cc + :language: cpp + :lines: 71-118 + :caption: Define an example compute kernel that uses ScalarHelper from hashing.h to + hash input values + :dedent: 2 This recipe shows basic validation of `input_arg` which contains a vector of input arguments. Then, the input :class:`arrow::Array` is accessed from `input_arg` and a @@ -99,9 +100,11 @@ object--:class:`arrow::compute::ScalarFunction` in this case--and (2) call the :func:`arrow::compute::ScalarFunction::AddKernel` function. The AddKernel function is repeated for each desired input data type. -.. recipe:: ../code/compute_fn.cc AddKernelToFunction - :caption: Instantiate a ScalarFunction and add our execution kernel to it - :dedent: 2 +.. literalinclude:: ../code/compute_fn.cc + :language: cpp + :lines: 128-158 + :caption: Instantiate a ScalarFunction and add our execution kernel to it + :dedent: 2 A :class:`arrow::compute::ScalarFunction` represents a "scalar" or "element-wise" compute function (see documentation on the Compute API). 
The signature used in this recipe passes: @@ -134,10 +137,12 @@ Add Function to Registry Finally, adding the function to a registry is wonderfully straightforward. -.. recipe:: ../code/compute_fn.cc AddFunctionToRegistry - :caption: Use convenience function to get a ScalarFunction with associated kernels, then - add it to the given FunctionRegistry - :dedent: 2 +.. literalinclude:: ../code/compute_fn.cc + :language: cpp + :lines: 163-173 + :caption: Use convenience function to get a ScalarFunction with associated kernels, + then add it to the given FunctionRegistry + :dedent: 2 In this recipe, we simply wrap the logic in a convenience function that: (1) creates a :class:`arrow::compute::ScalarFunction`, (2) adds our execution kernel to the compute