From e7f6efd59b1138ba65638016c1f5b5611d2d9c47 Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Wed, 25 Jan 2023 22:00:34 -0800 Subject: [PATCH 01/78] towards using vectorize --- cunumeric/__init__.py | 1 + cunumeric/vectorize.py | 314 ++++++++++++++++++++++++++++ tests/integration/test_vectorize.py | 39 ++++ 3 files changed, 354 insertions(+) create mode 100644 cunumeric/vectorize.py create mode 100644 tests/integration/test_vectorize.py diff --git a/cunumeric/__init__.py b/cunumeric/__init__.py index 7c9e122aaa..1d0cce7c20 100644 --- a/cunumeric/__init__.py +++ b/cunumeric/__init__.py @@ -34,6 +34,7 @@ from cunumeric.logic import * from cunumeric.window import bartlett, blackman, hamming, hanning, kaiser from cunumeric.coverage import clone_module +from cunumeric.vectorize import vectorize clone_module(_np, globals()) diff --git a/cunumeric/vectorize.py b/cunumeric/vectorize.py new file mode 100644 index 0000000000..e68c8d55f5 --- /dev/null +++ b/cunumeric/vectorize.py @@ -0,0 +1,314 @@ +# Copyright 2023 NVIDIA Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import inspect +import re +from typing import Any, Callable, Dict, List, Optional, Union + +# numba doesn't seem to include type hints +import numba.cuda # type: ignore +import numba.types # type: ignore +import numpy as np +import six + +from cunumeric.runtime import runtime + +from .array import convert_to_cunumeric_ndarray + +_EXTERNAL_REFERENCE_PREFIX = "__extern_ref__" +_MASK_VAR = "__mask__" +_SIZE_VAR = "__size__" +_LOOP_VAR = "__i__" +_ARGS_VAR = "__args__" + + +class vectorize: + """ + vectorize(pyfunc, otypes=None, doc=None, excluded=None, cache=False, + signature=None) + Generalized function class. + Define a vectorized function which takes a nested sequence of objects or + numpy arrays as inputs and returns a single numpy array or a tuple of numpy + arrays. The vectorized function evaluates `pyfunc` over successive tuples + of the input arrays like the python map function, except it uses the + broadcasting rules of numpy. + The data type of the output of `vectorized` is determined by calling + the function with the first element of the input. This can be avoided + by specifying the `otypes` argument. + + Parameters + ---------- + pyfunc : callable + A python function or method. + otypes : str or list of dtypes, optional + The output data type. It must be specified as either a string of + typecode characters or a list of data type specifiers. There should + be one data type specifier for each output. + doc : str, optional + The docstring for the function. If None, the docstring will be the + ``pyfunc.__doc__``. + excluded : set, optional + Set of strings or integers representing the positional or keyword + arguments for which the function will not be vectorized. These will be + passed directly to `pyfunc` unmodified. + cache : bool, optional + If `True`, then cache the first function call that determines + the number of outputs if `otypes` is not provided. 
+ signature : string, optional + Generalized universal function signature, e.g., ``(m,n),(n)->(m)`` for + vectorized matrix-vector multiplication. If provided, ``pyfunc`` will + be called with (and expected to return) arrays with shapes given by the + size of corresponding core dimensions. By default, ``pyfunc`` is + assumed to take scalars as input and output. + + Returns + ------- + vectorized : callable + Vectorized function. + + See Also + -------- + numpy.vectorize + + Availability + -------- + Multiple GPUs, Multiple CPUs + """ + + def __init__( + self, + pyfunc: Callable[[Any], Any], + otypes: Optional[Union[str, list[Any]]] = None, + doc: Optional[str] = None, + excluded: Optional[set[Any]] = None, + cache: Optional[bool] = False, + signature: Optional[str] = None, + ) -> None: + self._pyfunc = pyfunc + self._numba_func: Optional[Callable[[Any], Any]] = None + self._device_func: Optional[Callable[[Any], Any]] = None + self._otypes = None + self._result = None + self._args: List[Any] = [] + self._kwargs: List[Any] = [] + + if doc is None: + self.__doc__ = pyfunc.__doc__ + else: + self.__doc__ = doc + + if otypes is not None: + raise NotImplementedError("Otypes variables are not supported yet") + + if excluded is not None: + raise NotImplementedError( + "excluded variables are not supported yet" + ) + if cache: + raise NotImplementedError("cache variable is not supported yet") + + if signature is not None: + raise NotImplementedError( + "signature variable is not supported yet" + ) + + # FIXME check return of the user function + # return annotation (we supprt only void) + + # if inspect.signature(self._pyfunc).return_annotation() + # != inspect._empty: + # raise NotImplementedError( + # "user defined functions can't have a return" + # ) + + def _get_func_body(self, func: Callable[[Any], Any]) -> list[str]: + """Using the magic method __doc__, we KNOW the size of the docstring. 
+ We then, just substract this from the total length of the function + """ + lines_to_skip = 0 + if func.__doc__ is not None and len(func.__doc__.split("\n")) > 0: + lines_to_skip = len(func.__doc__.split("\n")) + + lines = inspect.getsourcelines(func)[0] + + return_lines = [] + for i in range(lines_to_skip + 1, len(lines)): + return_lines.append(lines[i].rstrip()) + return return_lines + + def _build_gpu_function(self) -> Callable[[Any], Any]: + + funcid = "vectorized_{}".format(self._pyfunc.__name__) + + # Preamble + lines = ["from numba import cuda"] + + # Signature + argnames = list(k for k in inspect.signature(self._pyfunc).parameters) + args = argnames + [_SIZE_VAR] + lines.append("def {}({}):".format(funcid, ",".join(args))) + + # Initialize the index variable and return immediately + # when it exceeds the data size + lines.append(" {} = cuda.grid(1)".format(_LOOP_VAR)) + lines.append(" if {} >= {}:".format(_LOOP_VAR, _SIZE_VAR)) + lines.append(" return") + + # Kernel body + def _lift_to_array_access(m: Any) -> str: + name = m.group(0) + if name in argnames: + return "{}[{}]".format(name, _LOOP_VAR) + else: + return "{}".format(name) + + # kernel body + lines_old = self._get_func_body(self._pyfunc) + for line in lines_old: + l_new = re.sub(r"[_a-z]\w*", _lift_to_array_access, line) + lines.append(l_new) + + # Evaluate the string to get the Python function + body = "\n".join(lines) + glbs: Dict[str, Any] = {} + six.exec_(body, glbs) + return glbs[funcid] + + def _build_cpu_function(self) -> Callable[[Any], Any]: + + funcid = "vectorized_{}".format(self._pyfunc.__name__) + + # Preamble + lines = ["from numba import carray, types"] + + # Signature + lines.append("def {}({}, {}):".format(funcid, _ARGS_VAR, _SIZE_VAR)) + + # Unpack kernel arguments + def _emit_assignment( + var: Any, idx: int, sz: Any, ty: np.dtype[Any] + ) -> None: + lines.append( + " {} = carray({}[{}], {}, types.{})".format( + var, _ARGS_VAR, idx, sz, ty + ) + ) + + # get names of arguments + argnames = list(k for k in inspect.signature(self._pyfunc).parameters) + arg_idx = 0 + for a in self._args: + ty = a.dtype + _emit_assignment(argnames[arg_idx], arg_idx, _SIZE_VAR, ty) + arg_idx += 1 + + # Main loop + lines.append(" for {} in range({}):".format(_LOOP_VAR, _SIZE_VAR)) + + lines_old = self._get_func_body(self._pyfunc) + + def _lift_to_array_access(m: Any) -> str: + name = m.group(0) + if name in argnames: + return "{}[{}]".format(name, _LOOP_VAR) + else: + return "{}[0]".format(name) + + # lines_new = [] + for line in lines_old: + l_new = re.sub(r"[_a-z]\w*", _lift_to_array_access, line) + lines.append(" " + l_new) + + # Evaluate the string to get the Python function + body = "\n".join(lines) + glbs: Dict[str, Any] = {} + six.exec_(body, glbs) + return glbs[funcid] + + def _get_numba_types(self, need_pointer: bool = True) -> list[Any]: + types = [] + for arg in self._args: + ty = arg.dtype + ty = str(ty) if ty != bool else "int8" + ty = getattr(numba.types, ty) + ty = numba.types.CPointer(ty) + types.append(ty) + return types + + def _compile_func_gpu(self) -> Callable[[Any], Any]: + types = self._get_numba_types() + arg_types = types + [numba.types.uint64] + sig = (*arg_types,) + + cuda_arch = numba.cuda.get_current_device().compute_capability + return numba.cuda.compile_ptx(self._numba_func, sig, cc=cuda_arch) + + def _compile_func_cpu(self) -> Any: + sig = numba.types.void( + numba.types.CPointer(numba.types.voidptr), numba.types.uint64 + ) + + return numba.cfunc(sig)(self._numba_func) + + # def 
_execute_gpu(self): + # task = self.context.create_auto_task(CuNumericOpCode.LOAD_PTX) + # task..add_future( + # self._runtime.create_future_from_string(self._device_func) + # ) + # kernel_fun = task.execute() + + # task = self.context.create_auto_task(CuNumericOpCode.EVAL_UDF) + # This will be ignored + # task.add_scalar_arg(0, ty.uint64) + # task.add_future_map(kernel_fun) + # task.execute() + + # def _execute_cpu(self): + + # task = self.context.create_auto_task(CuNumericOpCode.EVAL_UDF) + # task.add_scalar_arg(self._device_func.address, ty.uint64) + + def __call__(self, *args: Any, **kwargs: Any) -> None: + """ + Return arrays with the results of `pyfunc` broadcast (vectorized) over + `args` and `kwargs` not in `excluded`. + """ + self._args = list( + convert_to_cunumeric_ndarray(arg) if arg is not None else arg + for (idx, arg) in enumerate(args) + ) + for arg in self._args: + if arg is None: + raise ValueError( + "None is not supported in user function " + "passed to cunumeric.vectorize" + ) + + self._kwargs = list(kwargs) + if len(self._kwargs) > 1: + raise NotImplementedError( + "kwargs are not supported in user functions" + ) + + if runtime.num_gpus > 0: + self._numba_func = self._build_gpu_function() + self._device_func = self._compile_func_gpu() + # self._execute_gpu() + else: + self._numba_func = self._build_cpu_function() + self._device_func = self._compile_func_cpu() + # self._execute_cpu() + + return self._result diff --git a/tests/integration/test_vectorize.py b/tests/integration/test_vectorize.py new file mode 100644 index 0000000000..73a09b9d1b --- /dev/null +++ b/tests/integration/test_vectorize.py @@ -0,0 +1,39 @@ +# Copyright 2021-2022 NVIDIA Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + + +# import numpy as np +import pytest + +import cunumeric as num + + +def my_func(a, b): + a = a * 2 + b + a = a * 3 + + +def test_vectorize(): + func = num.vectorize(my_func) + a = 1 + b = 2 + func(a, b) + print(a) + + +if __name__ == "__main__": + import sys + + sys.exit(pytest.main(sys.argv)) From 02b5ffc266b36f7932b013c7d4928b597f0f04dd Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Wed, 1 Feb 2023 21:11:17 -0800 Subject: [PATCH 02/78] making simple test work on CPUs --- cunumeric/config.py | 2 + cunumeric/vectorize.py | 93 +++++++++++++------ cunumeric_cpp.cmake | 3 + src/cunumeric/cunumeric_c.h | 1 + src/cunumeric/vectorize/eval_udf.cc | 40 ++++++++ src/cunumeric/vectorize/eval_udf.cu | 34 +++++++ src/cunumeric/vectorize/eval_udf.h | 42 +++++++++ src/cunumeric/vectorize/eval_udf_omp.cc | 35 +++++++ src/cunumeric/vectorize/eval_udf_template.inl | 64 +++++++++++++ tests/integration/test_vectorize.py | 7 +- 10 files changed, 290 insertions(+), 31 deletions(-) create mode 100644 src/cunumeric/vectorize/eval_udf.cc create mode 100644 src/cunumeric/vectorize/eval_udf.cu create mode 100644 src/cunumeric/vectorize/eval_udf.h create mode 100644 src/cunumeric/vectorize/eval_udf_omp.cc create mode 100644 src/cunumeric/vectorize/eval_udf_template.inl diff --git a/cunumeric/config.py b/cunumeric/config.py index cad52e77f4..14cb1b6434 100644 --- a/cunumeric/config.py +++ b/cunumeric/config.py @@ -146,6 +146,7 @@ class _CunumericSharedLib: CUNUMERIC_DIAG: int CUNUMERIC_DOT: int CUNUMERIC_EYE: int + CUNUMERIC_EVAL_UDF: int CUNUMERIC_FFT: int CUNUMERIC_FFT_C2C: int CUNUMERIC_FFT_C2R: int @@ -348,6 +349,7 @@ class CuNumericOpCode(IntEnum): DIAG = _cunumeric.CUNUMERIC_DIAG DOT = _cunumeric.CUNUMERIC_DOT EYE = _cunumeric.CUNUMERIC_EYE + EVAL_UDF = _cunumeric.CUNUMERIC_EVAL_UDF FFT = _cunumeric.CUNUMERIC_FFT FILL = _cunumeric.CUNUMERIC_FILL FLIP = _cunumeric.CUNUMERIC_FLIP diff --git a/cunumeric/vectorize.py b/cunumeric/vectorize.py index e68c8d55f5..f95531b217 100644 --- a/cunumeric/vectorize.py +++ b/cunumeric/vectorize.py @@ -17,15 +17,18 @@ import re from typing import Any, Callable, Dict, List, Optional, Union -# numba doesn't seem to include type hints -import numba.cuda # type: ignore -import numba.types # type: ignore +import legate.core.types as ty +import numba.cuda +import numba.types + +# import numba import numpy as np import six from cunumeric.runtime import runtime from .array import convert_to_cunumeric_ndarray +from .config import CuNumericOpCode _EXTERNAL_REFERENCE_PREFIX = "__extern_ref__" _MASK_VAR = "__mask__" @@ -98,11 +101,15 @@ def __init__( ) -> None: self._pyfunc = pyfunc self._numba_func: Optional[Callable[[Any], Any]] = None - self._device_func: Optional[Callable[[Any], Any]] = None + self._cpu_func: numba.types.CPointer = numba.types.CPointer(int) + self._gpu_func: tuple[Any] = (0,) self._otypes = None self._result = None self._args: List[Any] = [] self._kwargs: List[Any] = [] + self._context = runtime.legate_context + + print("IRINA DEBUG initialization") if doc is None: self.__doc__ = pyfunc.__doc__ @@ -247,7 +254,7 @@ def _get_numba_types(self, need_pointer: bool = True) -> list[Any]: types.append(ty) return types - def _compile_func_gpu(self) -> Callable[[Any], Any]: + def _compile_func_gpu(self) -> tuple[Any]: types = self._get_numba_types() arg_types = types + [numba.types.uint64] sig = (*arg_types,) @@ -255,30 +262,41 @@ def _compile_func_gpu(self) -> Callable[[Any], Any]: cuda_arch = numba.cuda.get_current_device().compute_capability return 
numba.cuda.compile_ptx(self._numba_func, sig, cc=cuda_arch) - def _compile_func_cpu(self) -> Any: + def _compile_func_cpu(self) -> numba.types.CPointer: sig = numba.types.void( numba.types.CPointer(numba.types.voidptr), numba.types.uint64 ) return numba.cfunc(sig)(self._numba_func) - # def _execute_gpu(self): - # task = self.context.create_auto_task(CuNumericOpCode.LOAD_PTX) - # task..add_future( - # self._runtime.create_future_from_string(self._device_func) - # ) - # kernel_fun = task.execute() - - # task = self.context.create_auto_task(CuNumericOpCode.EVAL_UDF) - # This will be ignored - # task.add_scalar_arg(0, ty.uint64) - # task.add_future_map(kernel_fun) - # task.execute() - - # def _execute_cpu(self): - - # task = self.context.create_auto_task(CuNumericOpCode.EVAL_UDF) - # task.add_scalar_arg(self._device_func.address, ty.uint64) + def _execute_gpu(self) -> None: + print("IRINA DEBUG executing GPU function") + # task = self._context.create_auto_task(CuNumericOpCode.LOAD_PTX) + # task.add_future( + # self._runtime.create_future_from_string(self._device_func) + # ) + # task.execute() + + # task = self._context.create_auto_task(CuNumericOpCode.EVAL_UDF) + # This will be ignored + # task.add_scalar_arg(0, ty.uint64) + # task.add_future_map(kernel_fun) + # task.execute() + + def _execute_cpu(self) -> None: + task = self._context.create_auto_task(CuNumericOpCode.EVAL_UDF) + task.add_scalar_arg(self._cpu_func.address, ty.uint64) + idx = 0 + a0 = self._args[0]._thunk + a0 = runtime.to_deferred_array(a0) + for a in self._args: + a_tmp = runtime.to_deferred_array(a._thunk) + task.add_input(a_tmp.base) + task.add_output(a_tmp.base) + if idx != 0: + task.add_alignment(a0.base, a_tmp.base) + idx += 1 + task.execute() def __call__(self, *args: Any, **kwargs: Any) -> None: """ @@ -296,6 +314,25 @@ def __call__(self, *args: Any, **kwargs: Any) -> None: "passed to cunumeric.vectorize" ) + # #FIXME: comment out when brodcast PR is merged + # #bring all argumants to the same shape and type: + # if len(self._args)>0: + # ty = self._args[0].dtype + # #FIXME: should we bring them all to the same type? 
+ # for a in self._args: + # if a.dtype != ty: + # return TypeError("all arguments of " + # "user defined function " + # "should have the same type") + + # shapes = tuple(a.shape for a in self._args) + # shape = broadcast_shapes(shapes) + # new_args = tuple() + # for a in self._args: + # a_new = a.broadcast_to(shape) + # new_args +=(a_new,) + # self._args = new_args + self._kwargs = list(kwargs) if len(self._kwargs) > 1: raise NotImplementedError( @@ -304,11 +341,9 @@ def __call__(self, *args: Any, **kwargs: Any) -> None: if runtime.num_gpus > 0: self._numba_func = self._build_gpu_function() - self._device_func = self._compile_func_gpu() - # self._execute_gpu() + self._gpu_func = self._compile_func_gpu() + self._execute_gpu() else: self._numba_func = self._build_cpu_function() - self._device_func = self._compile_func_cpu() - # self._execute_cpu() - - return self._result + self._cpu_func = self._compile_func_cpu() + self._execute_cpu() diff --git a/cunumeric_cpp.cmake b/cunumeric_cpp.cmake index 7034bb600a..bdfa4c163a 100644 --- a/cunumeric_cpp.cmake +++ b/cunumeric_cpp.cmake @@ -160,6 +160,7 @@ list(APPEND cunumeric_SOURCES src/cunumeric/mapper.cc src/cunumeric/cephes/chbevl.cc src/cunumeric/cephes/i0.cc + src/cunumeric/vectorize/eval_udf.cc ) if(Legion_USE_OpenMP) @@ -206,6 +207,7 @@ if(Legion_USE_OpenMP) src/cunumeric/stat/bincount_omp.cc src/cunumeric/convolution/convolve_omp.cc src/cunumeric/transform/flip_omp.cc + src/cunumeric/vectorize/eval_udf_omp.cc ) endif() @@ -257,6 +259,7 @@ if(Legion_USE_CUDA) src/cunumeric/transform/flip.cu src/cunumeric/cudalibs.cu src/cunumeric/cunumeric.cu + src/cunumeric/vectorize/eval_udf.cu ) endif() diff --git a/src/cunumeric/cunumeric_c.h b/src/cunumeric/cunumeric_c.h index 724db00134..42c0bc955e 100644 --- a/src/cunumeric/cunumeric_c.h +++ b/src/cunumeric/cunumeric_c.h @@ -42,6 +42,7 @@ enum CuNumericOpCode { CUNUMERIC_DIAG, CUNUMERIC_DOT, CUNUMERIC_EYE, + CUNUMERIC_EVAL_UDF, CUNUMERIC_FFT, CUNUMERIC_FILL, CUNUMERIC_FLIP, diff --git a/src/cunumeric/vectorize/eval_udf.cc b/src/cunumeric/vectorize/eval_udf.cc new file mode 100644 index 0000000000..ed0c5dea05 --- /dev/null +++ b/src/cunumeric/vectorize/eval_udf.cc @@ -0,0 +1,40 @@ +/* Copyright 20223 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + */ + +#include "cunumeric/vectorize/eval_udf.h" +#include "cunumeric/vectorize/eval_udf_template.inl" + +namespace cunumeric { + +using namespace Legion; +using namespace legate; + +template +struct EvalUdfImplBody { + using VAL = legate_type_of; +}; + +/*static*/ void EvalUdfTask::cpu_variant(TaskContext& context) +{ + eval_udf_template(context); +} + +namespace // unnamed +{ +static void __attribute__((constructor)) register_tasks(void) { EvalUdfTask::register_variants(); } +} // namespace + +} // namespace cunumeric diff --git a/src/cunumeric/vectorize/eval_udf.cu b/src/cunumeric/vectorize/eval_udf.cu new file mode 100644 index 0000000000..74ac3ab31c --- /dev/null +++ b/src/cunumeric/vectorize/eval_udf.cu @@ -0,0 +1,34 @@ +/* Copyright 2023 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +#include "cunumeric/vectorize/eval_udf.h" +#include "cunumeric/vectorize/eval_udf_template.inl" +#include "cunumeric/cuda_help.h" + +namespace cunumeric { + +using namespace Legion; + +template +struct EvalUdfImplBody { + using VAL = legate_type_of; +}; + +/*static*/ void EvalUdfTask::gpu_variant(TaskContext& context) +{ + eval_udf_template(context); +} +} // namespace cunumeric diff --git a/src/cunumeric/vectorize/eval_udf.h b/src/cunumeric/vectorize/eval_udf.h new file mode 100644 index 0000000000..8981dc3b0d --- /dev/null +++ b/src/cunumeric/vectorize/eval_udf.h @@ -0,0 +1,42 @@ +/* Copyright 2023 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +#pragma once + +#include "cunumeric/cunumeric.h" + +namespace cunumeric { + +struct EvalUdfArgs { + uint64_t func_ptr; + std::vector& args; +}; + +class EvalUdfTask : public CuNumericTask { + public: + static const int TASK_ID = CUNUMERIC_EVAL_UDF; + + public: + static void cpu_variant(legate::TaskContext& context); +#ifdef LEGATE_USE_OPENMP + static void omp_variant(legate::TaskContext& context); +#endif +#ifdef LEGATE_USE_CUDA + static void gpu_variant(legate::TaskContext& context); +#endif +}; + +} // namespace cunumeric diff --git a/src/cunumeric/vectorize/eval_udf_omp.cc b/src/cunumeric/vectorize/eval_udf_omp.cc new file mode 100644 index 0000000000..eb946d8c76 --- /dev/null +++ b/src/cunumeric/vectorize/eval_udf_omp.cc @@ -0,0 +1,35 @@ +/* Copyright 2023 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +#include "cunumeric/vectorize/eval_udf.h" +#include "cunumeric/vectorize/eval_udf_template.inl" + +namespace cunumeric { + +using namespace Legion; +using namespace legate; + +template +struct EvalUdfImplBody { + using VAL = legate_type_of; +}; + +/*static*/ void EvalUdfTask::omp_variant(TaskContext& context) +{ + eval_udf_template(context); +} + +} // namespace cunumeric diff --git a/src/cunumeric/vectorize/eval_udf_template.inl b/src/cunumeric/vectorize/eval_udf_template.inl new file mode 100644 index 0000000000..a71903b02b --- /dev/null +++ b/src/cunumeric/vectorize/eval_udf_template.inl @@ -0,0 +1,64 @@ +/* Copyright 2023 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +#pragma once + +// Useful for IDEs +#include "cunumeric/vectorize/eval_udf.h" +#include "cunumeric/pitches.h" + +namespace cunumeric { + +using namespace Legion; +using namespace legate; + +template +struct EvalUdfImplBody; + +template +struct EvalUdfImpl { + template + void operator()(EvalUdfArgs& args) const + { + using UDF = void(void**, size_t); + auto udf = reinterpret_cast(args.func_ptr); + std::vector udf_args; + using VAL = legate_type_of; + auto rect = args.args[0].shape(); + + std::cout << "IRINA DEBUG size = " << args.args.size() << " , rect = " << rect << std::endl; + + if (rect.empty()) return; + + for (size_t i = 0; i < args.args.size(); i++) { + auto out = args.args[i].write_accessor(rect); + udf_args.push_back(reinterpret_cast(out.ptr(rect))); + } + + udf(udf_args.data(), rect.volume()); + } +}; + +template +static void eval_udf_template(TaskContext& context) +{ + std::cout << "IRINA DEBUG inside eval_udf_template" << std::endl; + EvalUdfArgs args{context.scalars()[0].value(), context.outputs()}; + size_t dim = args.args[0].dim() == 0 ? 
1 : args.args[0].dim(); + double_dispatch(dim, args.args[0].code(), EvalUdfImpl{}, args); +} + +} // namespace cunumeric diff --git a/tests/integration/test_vectorize.py b/tests/integration/test_vectorize.py index 73a09b9d1b..cbc1e692aa 100644 --- a/tests/integration/test_vectorize.py +++ b/tests/integration/test_vectorize.py @@ -27,9 +27,12 @@ def my_func(a, b): def test_vectorize(): func = num.vectorize(my_func) - a = 1 - b = 2 + a = num.arange(5) + b = num.zeros((5,)) + # b = 2 func(a, b) + # assert(a==12) + print("IRINA DEBUG:") print(a) From 8b8e6012e3d329a0708a80ffd2440a2c7edb26b8 Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Thu, 2 Feb 2023 20:11:09 -0800 Subject: [PATCH 03/78] making simple GPU function work --- cunumeric/vectorize.py | 33 ++--- src/cunumeric/vectorize/eval_udf.cc | 28 +++- src/cunumeric/vectorize/eval_udf.cu | 120 +++++++++++++++++- src/cunumeric/vectorize/eval_udf.h | 3 +- src/cunumeric/vectorize/eval_udf_omp.cc | 7 +- src/cunumeric/vectorize/eval_udf_template.inl | 12 +- 6 files changed, 165 insertions(+), 38 deletions(-) diff --git a/cunumeric/vectorize.py b/cunumeric/vectorize.py index f95531b217..549dc79d80 100644 --- a/cunumeric/vectorize.py +++ b/cunumeric/vectorize.py @@ -18,8 +18,8 @@ from typing import Any, Callable, Dict, List, Optional, Union import legate.core.types as ty -import numba.cuda -import numba.types +import numba.cuda # type: ignore +import numba.types # type: ignore # import numba import numpy as np @@ -109,7 +109,6 @@ def __init__( self._kwargs: List[Any] = [] self._context = runtime.legate_context - print("IRINA DEBUG initialization") if doc is None: self.__doc__ = pyfunc.__doc__ @@ -155,7 +154,7 @@ def _get_func_body(self, func: Callable[[Any], Any]) -> list[str]: return_lines.append(lines[i].rstrip()) return return_lines - def _build_gpu_function(self) -> Callable[[Any], Any]: + def _build_gpu_function(self) -> Any: funcid = "vectorized_{}".format(self._pyfunc.__name__) @@ -270,18 +269,20 @@ def _compile_func_cpu(self) -> numba.types.CPointer: return numba.cfunc(sig)(self._numba_func) def _execute_gpu(self) -> None: - print("IRINA DEBUG executing GPU function") - # task = self._context.create_auto_task(CuNumericOpCode.LOAD_PTX) - # task.add_future( - # self._runtime.create_future_from_string(self._device_func) - # ) - # task.execute() - - # task = self._context.create_auto_task(CuNumericOpCode.EVAL_UDF) - # This will be ignored - # task.add_scalar_arg(0, ty.uint64) - # task.add_future_map(kernel_fun) - # task.execute() + print("IRINA DEBUG executing GPU function", type(self._gpu_func[0])) + task = self._context.create_auto_task(CuNumericOpCode.EVAL_UDF) + task.add_scalar_arg(self._gpu_func[0], ty.string) + idx = 0 + a0 = self._args[0]._thunk + a0 = runtime.to_deferred_array(a0) + for a in self._args: + a_tmp = runtime.to_deferred_array(a._thunk) + task.add_input(a_tmp.base) + task.add_output(a_tmp.base) + if idx != 0: + task.add_alignment(a0.base, a_tmp.base) + idx += 1 + task.execute() def _execute_cpu(self) -> None: task = self._context.create_auto_task(CuNumericOpCode.EVAL_UDF) diff --git a/src/cunumeric/vectorize/eval_udf.cc b/src/cunumeric/vectorize/eval_udf.cc index ed0c5dea05..1975da6aa2 100644 --- a/src/cunumeric/vectorize/eval_udf.cc +++ b/src/cunumeric/vectorize/eval_udf.cc @@ -15,21 +15,39 @@ */ #include "cunumeric/vectorize/eval_udf.h" -#include "cunumeric/vectorize/eval_udf_template.inl" namespace cunumeric { using namespace Legion; using namespace legate; -template -struct EvalUdfImplBody { - using VAL = 
legate_type_of; +struct EvalUdfCPU { + template + void operator()(EvalUdfArgs& args) const + { + std::cout <<"IRINA DEBUG in CPU task 2"<(args.cpu_func_ptr); + std::vector udf_args; + using VAL = legate_type_of; + auto rect = args.args[0].shape(); + + if (rect.empty()) return; + for (size_t i = 0; i < args.args.size(); i++) { + auto out = args.args[i].write_accessor(rect); + udf_args.push_back(reinterpret_cast(out.ptr(rect))); + } + + udf(udf_args.data(), rect.volume()); + } }; /*static*/ void EvalUdfTask::cpu_variant(TaskContext& context) { - eval_udf_template(context); + std::cout <<"IRINA DEBUG in CPU task"<(), context.outputs()}; + size_t dim = args.args[0].dim() == 0 ? 1 : args.args[0].dim(); + double_dispatch(dim, args.args[0].code(), EvalUdfCPU{}, args); } namespace // unnamed diff --git a/src/cunumeric/vectorize/eval_udf.cu b/src/cunumeric/vectorize/eval_udf.cu index 74ac3ab31c..deb19f0643 100644 --- a/src/cunumeric/vectorize/eval_udf.cu +++ b/src/cunumeric/vectorize/eval_udf.cu @@ -15,20 +15,130 @@ */ #include "cunumeric/vectorize/eval_udf.h" -#include "cunumeric/vectorize/eval_udf_template.inl" #include "cunumeric/cuda_help.h" +#include +#include namespace cunumeric { using namespace Legion; +using namespace legate; -template -struct EvalUdfImplBody { - using VAL = legate_type_of; +struct EvalUdfGPU { + template + void operator()(EvalUdfArgs& args) const + { + using VAL = legate_type_of; + auto rect = args.args[0].shape(); + if (rect.empty()) return; + + const unsigned num_options = 4; + const size_t log_buffer_size = 16384; + std::vector log_info_buffer(log_buffer_size); + std::vector log_error_buffer(log_buffer_size); + CUjit_option jit_options[] = { + CU_JIT_INFO_LOG_BUFFER, + CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES, + CU_JIT_ERROR_LOG_BUFFER, + CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES, + }; + void *option_vals[] = { + static_cast(log_info_buffer.data()), + reinterpret_cast(log_buffer_size), + static_cast(log_error_buffer.data()), + reinterpret_cast(log_buffer_size), + }; + + CUmodule module; + CUresult result = cuModuleLoadDataEx(&module, args.ptx.data(), num_options, jit_options, option_vals); + if (result != CUDA_SUCCESS) { + if (result == CUDA_ERROR_OPERATING_SYSTEM) { + fprintf(stderr, + "ERROR: Device side asserts are not supported by the " + "CUDA driver for MAC OSX, see NVBugs 1628896.\n"); + exit(-1); + } else if (result == CUDA_ERROR_NO_BINARY_FOR_GPU) { + fprintf(stderr, "ERROR: The binary was compiled for the wrong GPU architecture.\n"); + exit(-1); + } else { + fprintf(stderr, "Failed to load CUDA module! 
Error log: %s\n", log_error_buffer.data()); +#if CUDA_VERSION >= 6050 + const char *name, *str; + assert(cuGetErrorName(result, &name) == CUDA_SUCCESS); + assert(cuGetErrorString(result, &str) == CUDA_SUCCESS); + fprintf(stderr, "CU: cuModuleLoadDataEx = %d (%s): %s\n", result, name, str); +#else + fprintf(stderr, "CU: cuModuleLoadDataEx = %d\n", result); +#endif + exit(-1); + } + } + + std::cmatch line_match; + bool match = std::regex_search(args.ptx.data(), line_match, std::regex(".visible .entry [_a-zA-Z0-9$]+")); +#ifdef DEBUG_PANDAS + assert(match); +#endif + const auto &matched_line = line_match.begin()->str(); + auto fun_name = matched_line.substr(matched_line.rfind(" ") + 1, matched_line.size()); + + CUfunction func; + result = cuModuleGetFunction(&func, module, fun_name.c_str()); + assert(result == CUDA_SUCCESS); + + //ececuting user function: + size_t buffer_size = (args.args.size() ) * sizeof(void *); + buffer_size += sizeof(size_t); + + std::vector arg_buffer(buffer_size); + char *raw_arg_buffer = arg_buffer.data(); + + auto p = raw_arg_buffer; + + for (auto &arg : args.args) { + auto out = arg.write_accessor(rect); + *reinterpret_cast(p) = out.ptr(rect); + p += sizeof(void *); + } + auto size = rect.volume(); + memcpy(p, &size, sizeof(size_t)); + + void *config[] = { + CU_LAUNCH_PARAM_BUFFER_POINTER, + static_cast(raw_arg_buffer), + CU_LAUNCH_PARAM_BUFFER_SIZE, + &buffer_size, + CU_LAUNCH_PARAM_END, + }; + + const uint32_t gridDimX = (size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; + const uint32_t gridDimY = 1; + const uint32_t gridDimZ = 1; + + const uint32_t blockDimX = THREADS_PER_BLOCK; + const uint32_t blockDimY = 1; + const uint32_t blockDimZ = 1; + + auto stream = get_cached_stream(); + + CUresult status = cuLaunchKernel( + func, gridDimX, gridDimY, gridDimZ, blockDimX, blockDimY, blockDimZ, 0, stream, NULL, config); + if (status != CUDA_SUCCESS) { + fprintf(stderr, "Failed to launch a CUDA kernel\n"); + exit(-1); + } + + CHECK_CUDA_STREAM(stream); + + } }; /*static*/ void EvalUdfTask::gpu_variant(TaskContext& context) { - eval_udf_template(context); + //std::cout <<"IRINA DEBUG size of the scalars = "<()<()}; + size_t dim = args.args[0].dim() == 0 ? 
1 : args.args[0].dim(); + double_dispatch(dim, args.args[0].code(), EvalUdfGPU{}, args); + } } // namespace cunumeric diff --git a/src/cunumeric/vectorize/eval_udf.h b/src/cunumeric/vectorize/eval_udf.h index 8981dc3b0d..6a53dd9f2f 100644 --- a/src/cunumeric/vectorize/eval_udf.h +++ b/src/cunumeric/vectorize/eval_udf.h @@ -21,8 +21,9 @@ namespace cunumeric { struct EvalUdfArgs { - uint64_t func_ptr; + uint64_t cpu_func_ptr; std::vector& args; + std::string ptx = ""; }; class EvalUdfTask : public CuNumericTask { diff --git a/src/cunumeric/vectorize/eval_udf_omp.cc b/src/cunumeric/vectorize/eval_udf_omp.cc index eb946d8c76..33e3c6e2a8 100644 --- a/src/cunumeric/vectorize/eval_udf_omp.cc +++ b/src/cunumeric/vectorize/eval_udf_omp.cc @@ -15,21 +15,16 @@ */ #include "cunumeric/vectorize/eval_udf.h" -#include "cunumeric/vectorize/eval_udf_template.inl" namespace cunumeric { using namespace Legion; using namespace legate; -template -struct EvalUdfImplBody { - using VAL = legate_type_of; -}; /*static*/ void EvalUdfTask::omp_variant(TaskContext& context) { - eval_udf_template(context); + EvalUdfTask::cpu_variant(context); } } // namespace cunumeric diff --git a/src/cunumeric/vectorize/eval_udf_template.inl b/src/cunumeric/vectorize/eval_udf_template.inl index a71903b02b..60e2582c54 100644 --- a/src/cunumeric/vectorize/eval_udf_template.inl +++ b/src/cunumeric/vectorize/eval_udf_template.inl @@ -39,10 +39,8 @@ struct EvalUdfImpl { using VAL = legate_type_of; auto rect = args.args[0].shape(); - std::cout << "IRINA DEBUG size = " << args.args.size() << " , rect = " << rect << std::endl; - if (rect.empty()) return; - + EvalUdfImplBody(); for (size_t i = 0; i < args.args.size(); i++) { auto out = args.args[i].write_accessor(rect); udf_args.push_back(reinterpret_cast(out.ptr(rect))); @@ -55,8 +53,12 @@ struct EvalUdfImpl { template static void eval_udf_template(TaskContext& context) { - std::cout << "IRINA DEBUG inside eval_udf_template" << std::endl; - EvalUdfArgs args{context.scalars()[0].value(), context.outputs()}; + is_gpus = context.scalars()[0].value(); + if (is_gpus) + std::cout <<"IRINA DEBUG size of the scalars = "<(), context.outputs()}; + else + EvalUdfArgs args{context.scalars()[1].value(),'', context.outputs()}; size_t dim = args.args[0].dim() == 0 ? 
1 : args.args[0].dim(); double_dispatch(dim, args.args[0].code(), EvalUdfImpl{}, args); } From d3300167d62cb990c7351c694b658b274b53d24b Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Fri, 3 Feb 2023 08:50:47 -0800 Subject: [PATCH 04/78] changing isort version in pre-commmit due to some errors in the older verions --- .pre-commit-config.yaml | 2 +- src/cunumeric/vectorize/eval_udf.cc | 4 +- src/cunumeric/vectorize/eval_udf.cu | 181 +++++++++--------- src/cunumeric/vectorize/eval_udf_omp.cc | 1 - src/cunumeric/vectorize/eval_udf_template.inl | 9 +- 5 files changed, 98 insertions(+), 99 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index bc47df8a70..1572d79d3f 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -7,7 +7,7 @@ repos: pass_filenames: false args: ['cunumeric'] - repo: https://github.com/PyCQA/isort - rev: 5.11.4 + rev: 5.12.0 hooks: - id: isort - repo: https://github.com/psf/black diff --git a/src/cunumeric/vectorize/eval_udf.cc b/src/cunumeric/vectorize/eval_udf.cc index 1975da6aa2..fb71330581 100644 --- a/src/cunumeric/vectorize/eval_udf.cc +++ b/src/cunumeric/vectorize/eval_udf.cc @@ -25,7 +25,7 @@ struct EvalUdfCPU { template void operator()(EvalUdfArgs& args) const { - std::cout <<"IRINA DEBUG in CPU task 2"<(args.cpu_func_ptr); std::vector udf_args; @@ -44,7 +44,7 @@ struct EvalUdfCPU { /*static*/ void EvalUdfTask::cpu_variant(TaskContext& context) { - std::cout <<"IRINA DEBUG in CPU task"<(), context.outputs()}; size_t dim = args.args[0].dim() == 0 ? 1 : args.args[0].dim(); double_dispatch(dim, args.args[0].code(), EvalUdfCPU{}, args); diff --git a/src/cunumeric/vectorize/eval_udf.cu b/src/cunumeric/vectorize/eval_udf.cu index deb19f0643..d5b2c1f43a 100644 --- a/src/cunumeric/vectorize/eval_udf.cu +++ b/src/cunumeric/vectorize/eval_udf.cu @@ -32,113 +32,114 @@ struct EvalUdfGPU { auto rect = args.args[0].shape(); if (rect.empty()) return; - const unsigned num_options = 4; - const size_t log_buffer_size = 16384; - std::vector log_info_buffer(log_buffer_size); - std::vector log_error_buffer(log_buffer_size); - CUjit_option jit_options[] = { - CU_JIT_INFO_LOG_BUFFER, - CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES, - CU_JIT_ERROR_LOG_BUFFER, - CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES, - }; - void *option_vals[] = { - static_cast(log_info_buffer.data()), - reinterpret_cast(log_buffer_size), - static_cast(log_error_buffer.data()), - reinterpret_cast(log_buffer_size), - }; - - CUmodule module; - CUresult result = cuModuleLoadDataEx(&module, args.ptx.data(), num_options, jit_options, option_vals); - if (result != CUDA_SUCCESS) { - if (result == CUDA_ERROR_OPERATING_SYSTEM) { - fprintf(stderr, - "ERROR: Device side asserts are not supported by the " - "CUDA driver for MAC OSX, see NVBugs 1628896.\n"); - exit(-1); - } else if (result == CUDA_ERROR_NO_BINARY_FOR_GPU) { - fprintf(stderr, "ERROR: The binary was compiled for the wrong GPU architecture.\n"); - exit(-1); - } else { - fprintf(stderr, "Failed to load CUDA module! 
Error log: %s\n", log_error_buffer.data()); + const unsigned num_options = 4; + const size_t log_buffer_size = 16384; + std::vector log_info_buffer(log_buffer_size); + std::vector log_error_buffer(log_buffer_size); + CUjit_option jit_options[] = { + CU_JIT_INFO_LOG_BUFFER, + CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES, + CU_JIT_ERROR_LOG_BUFFER, + CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES, + }; + void* option_vals[] = { + static_cast(log_info_buffer.data()), + reinterpret_cast(log_buffer_size), + static_cast(log_error_buffer.data()), + reinterpret_cast(log_buffer_size), + }; + + CUmodule module; + CUresult result = + cuModuleLoadDataEx(&module, args.ptx.data(), num_options, jit_options, option_vals); + if (result != CUDA_SUCCESS) { + if (result == CUDA_ERROR_OPERATING_SYSTEM) { + fprintf(stderr, + "ERROR: Device side asserts are not supported by the " + "CUDA driver for MAC OSX, see NVBugs 1628896.\n"); + exit(-1); + } else if (result == CUDA_ERROR_NO_BINARY_FOR_GPU) { + fprintf(stderr, "ERROR: The binary was compiled for the wrong GPU architecture.\n"); + exit(-1); + } else { + fprintf(stderr, "Failed to load CUDA module! Error log: %s\n", log_error_buffer.data()); #if CUDA_VERSION >= 6050 - const char *name, *str; - assert(cuGetErrorName(result, &name) == CUDA_SUCCESS); - assert(cuGetErrorString(result, &str) == CUDA_SUCCESS); - fprintf(stderr, "CU: cuModuleLoadDataEx = %d (%s): %s\n", result, name, str); + const char *name, *str; + assert(cuGetErrorName(result, &name) == CUDA_SUCCESS); + assert(cuGetErrorString(result, &str) == CUDA_SUCCESS); + fprintf(stderr, "CU: cuModuleLoadDataEx = %d (%s): %s\n", result, name, str); #else - fprintf(stderr, "CU: cuModuleLoadDataEx = %d\n", result); + fprintf(stderr, "CU: cuModuleLoadDataEx = %d\n", result); #endif - exit(-1); + exit(-1); + } } - } - std::cmatch line_match; - bool match = std::regex_search(args.ptx.data(), line_match, std::regex(".visible .entry [_a-zA-Z0-9$]+")); + std::cmatch line_match; + bool match = + std::regex_search(args.ptx.data(), line_match, std::regex(".visible .entry [_a-zA-Z0-9$]+")); #ifdef DEBUG_PANDAS - assert(match); + assert(match); #endif - const auto &matched_line = line_match.begin()->str(); - auto fun_name = matched_line.substr(matched_line.rfind(" ") + 1, matched_line.size()); + const auto& matched_line = line_match.begin()->str(); + auto fun_name = matched_line.substr(matched_line.rfind(" ") + 1, matched_line.size()); - CUfunction func; - result = cuModuleGetFunction(&func, module, fun_name.c_str()); - assert(result == CUDA_SUCCESS); + CUfunction func; + result = cuModuleGetFunction(&func, module, fun_name.c_str()); + assert(result == CUDA_SUCCESS); - //ececuting user function: - size_t buffer_size = (args.args.size() ) * sizeof(void *); - buffer_size += sizeof(size_t); + // ececuting user function: + size_t buffer_size = (args.args.size()) * sizeof(void*); + buffer_size += sizeof(size_t); - std::vector arg_buffer(buffer_size); - char *raw_arg_buffer = arg_buffer.data(); + std::vector arg_buffer(buffer_size); + char* raw_arg_buffer = arg_buffer.data(); - auto p = raw_arg_buffer; + auto p = raw_arg_buffer; - for (auto &arg : args.args) { - auto out = arg.write_accessor(rect); - *reinterpret_cast(p) = out.ptr(rect); - p += sizeof(void *); - } - auto size = rect.volume(); - memcpy(p, &size, sizeof(size_t)); - - void *config[] = { - CU_LAUNCH_PARAM_BUFFER_POINTER, - static_cast(raw_arg_buffer), - CU_LAUNCH_PARAM_BUFFER_SIZE, - &buffer_size, - CU_LAUNCH_PARAM_END, - }; - - const uint32_t gridDimX = (size + THREADS_PER_BLOCK - 1) / 
THREADS_PER_BLOCK; - const uint32_t gridDimY = 1; - const uint32_t gridDimZ = 1; - - const uint32_t blockDimX = THREADS_PER_BLOCK; - const uint32_t blockDimY = 1; - const uint32_t blockDimZ = 1; - - auto stream = get_cached_stream(); - - CUresult status = cuLaunchKernel( - func, gridDimX, gridDimY, gridDimZ, blockDimX, blockDimY, blockDimZ, 0, stream, NULL, config); - if (status != CUDA_SUCCESS) { - fprintf(stderr, "Failed to launch a CUDA kernel\n"); - exit(-1); - } - - CHECK_CUDA_STREAM(stream); + for (auto& arg : args.args) { + auto out = arg.write_accessor(rect); + *reinterpret_cast(p) = out.ptr(rect); + p += sizeof(void*); + } + auto size = rect.volume(); + memcpy(p, &size, sizeof(size_t)); + + void* config[] = { + CU_LAUNCH_PARAM_BUFFER_POINTER, + static_cast(raw_arg_buffer), + CU_LAUNCH_PARAM_BUFFER_SIZE, + &buffer_size, + CU_LAUNCH_PARAM_END, + }; + + const uint32_t gridDimX = (size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; + const uint32_t gridDimY = 1; + const uint32_t gridDimZ = 1; + + const uint32_t blockDimX = THREADS_PER_BLOCK; + const uint32_t blockDimY = 1; + const uint32_t blockDimZ = 1; + + auto stream = get_cached_stream(); + + CUresult status = cuLaunchKernel( + func, gridDimX, gridDimY, gridDimZ, blockDimX, blockDimY, blockDimZ, 0, stream, NULL, config); + if (status != CUDA_SUCCESS) { + fprintf(stderr, "Failed to launch a CUDA kernel\n"); + exit(-1); + } + CHECK_CUDA_STREAM(stream); } }; /*static*/ void EvalUdfTask::gpu_variant(TaskContext& context) { - //std::cout <<"IRINA DEBUG size of the scalars = "<()<()}; - size_t dim = args.args[0].dim() == 0 ? 1 : args.args[0].dim(); - double_dispatch(dim, args.args[0].code(), EvalUdfGPU{}, args); - + // std::cout <<"IRINA DEBUG size of the scalars = + // "<()<()}; + size_t dim = args.args[0].dim() == 0 ? 1 : args.args[0].dim(); + double_dispatch(dim, args.args[0].code(), EvalUdfGPU{}, args); } } // namespace cunumeric diff --git a/src/cunumeric/vectorize/eval_udf_omp.cc b/src/cunumeric/vectorize/eval_udf_omp.cc index 33e3c6e2a8..c6e2991733 100644 --- a/src/cunumeric/vectorize/eval_udf_omp.cc +++ b/src/cunumeric/vectorize/eval_udf_omp.cc @@ -21,7 +21,6 @@ namespace cunumeric { using namespace Legion; using namespace legate; - /*static*/ void EvalUdfTask::omp_variant(TaskContext& context) { EvalUdfTask::cpu_variant(context); diff --git a/src/cunumeric/vectorize/eval_udf_template.inl b/src/cunumeric/vectorize/eval_udf_template.inl index 60e2582c54..775651b12c 100644 --- a/src/cunumeric/vectorize/eval_udf_template.inl +++ b/src/cunumeric/vectorize/eval_udf_template.inl @@ -40,7 +40,7 @@ struct EvalUdfImpl { auto rect = args.args[0].shape(); if (rect.empty()) return; - EvalUdfImplBody(); + EvalUdfImplBody(); for (size_t i = 0; i < args.args.size(); i++) { auto out = args.args[i].write_accessor(rect); udf_args.push_back(reinterpret_cast(out.ptr(rect))); @@ -55,10 +55,9 @@ static void eval_udf_template(TaskContext& context) { is_gpus = context.scalars()[0].value(); if (is_gpus) - std::cout <<"IRINA DEBUG size of the scalars = "<(), context.outputs()}; - else - EvalUdfArgs args{context.scalars()[1].value(),'', context.outputs()}; + std::cout << "IRINA DEBUG size of the scalars = " << context.scalars().size() << std::endl; + EvalUdfArgs args{0, context.scalars()[1].value(), context.outputs()}; + else EvalUdfArgs args{context.scalars()[1].value(),'', context.outputs()}; size_t dim = args.args[0].dim() == 0 ? 
1 : args.args[0].dim(); double_dispatch(dim, args.args[0].code(), EvalUdfImpl{}, args); } From 6199a4160b86bca55578ac4418ad08a5b2fe063b Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Fri, 3 Feb 2023 21:27:05 -0800 Subject: [PATCH 05/78] adding type stubs for numba --- cunumeric/vectorize.py | 33 ++++++++++++++++---------------- pyproject.toml | 1 + typings/numba/__init__.pyi | 15 +++++++++++++++ typings/numba/cuda/__init__.pyi | 5 +++++ typings/numba/cuda/compiler.pyi | 12 ++++++++++++ typings/numba/types/CPointer.pyi | 5 +++++ typings/numba/types/__init__.pyi | 15 +++++++++++++++ 7 files changed, 69 insertions(+), 17 deletions(-) create mode 100644 typings/numba/__init__.pyi create mode 100644 typings/numba/cuda/__init__.pyi create mode 100644 typings/numba/cuda/compiler.pyi create mode 100644 typings/numba/types/CPointer.pyi create mode 100644 typings/numba/types/__init__.pyi diff --git a/cunumeric/vectorize.py b/cunumeric/vectorize.py index 549dc79d80..394986f16d 100644 --- a/cunumeric/vectorize.py +++ b/cunumeric/vectorize.py @@ -18,10 +18,7 @@ from typing import Any, Callable, Dict, List, Optional, Union import legate.core.types as ty -import numba.cuda # type: ignore -import numba.types # type: ignore - -# import numba +import numba import numpy as np import six @@ -30,6 +27,10 @@ from .array import convert_to_cunumeric_ndarray from .config import CuNumericOpCode +# import numba.cuda +# import numba.types + + _EXTERNAL_REFERENCE_PREFIX = "__extern_ref__" _MASK_VAR = "__mask__" _SIZE_VAR = "__size__" @@ -100,16 +101,15 @@ def __init__( signature: Optional[str] = None, ) -> None: self._pyfunc = pyfunc - self._numba_func: Optional[Callable[[Any], Any]] = None - self._cpu_func: numba.types.CPointer = numba.types.CPointer(int) - self._gpu_func: tuple[Any] = (0,) + self._numba_func: Callable[[Any], Any] + self._cpu_func: numba.core.ccallback.CFunc + self._gpu_func: tuple[Any] self._otypes = None self._result = None self._args: List[Any] = [] self._kwargs: List[Any] = [] self._context = runtime.legate_context - if doc is None: self.__doc__ = pyfunc.__doc__ else: @@ -248,28 +248,27 @@ def _get_numba_types(self, need_pointer: bool = True) -> list[Any]: for arg in self._args: ty = arg.dtype ty = str(ty) if ty != bool else "int8" - ty = getattr(numba.types, ty) - ty = numba.types.CPointer(ty) + ty = getattr(numba.core.types, ty) + ty = numba.core.types.CPointer(ty) types.append(ty) return types def _compile_func_gpu(self) -> tuple[Any]: types = self._get_numba_types() - arg_types = types + [numba.types.uint64] + arg_types = types + [numba.core.types.uint64] sig = (*arg_types,) cuda_arch = numba.cuda.get_current_device().compute_capability return numba.cuda.compile_ptx(self._numba_func, sig, cc=cuda_arch) - def _compile_func_cpu(self) -> numba.types.CPointer: - sig = numba.types.void( - numba.types.CPointer(numba.types.voidptr), numba.types.uint64 - ) + def _compile_func_cpu(self) -> numba.core.ccallback.CFunc: + sig = numba.core.types.void( + numba.types.CPointer(numba.types.voidptr), numba.core.types.uint64 + ) # type: ignore return numba.cfunc(sig)(self._numba_func) def _execute_gpu(self) -> None: - print("IRINA DEBUG executing GPU function", type(self._gpu_func[0])) task = self._context.create_auto_task(CuNumericOpCode.EVAL_UDF) task.add_scalar_arg(self._gpu_func[0], ty.string) idx = 0 @@ -286,7 +285,7 @@ def _execute_gpu(self) -> None: def _execute_cpu(self) -> None: task = self._context.create_auto_task(CuNumericOpCode.EVAL_UDF) - task.add_scalar_arg(self._cpu_func.address, ty.uint64) 
+ task.add_scalar_arg(self._cpu_func.address, ty.uint64) # type : ignore idx = 0 a0 = self._args[0]._thunk a0 = runtime.to_deferred_array(a0) diff --git a/pyproject.toml b/pyproject.toml index 73ebc13c82..5c8e1f83aa 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -45,6 +45,7 @@ exclude = ''' [tool.mypy] python_version = "3.10" +mypy_path = "typings/" pretty = true show_error_codes = true diff --git a/typings/numba/__init__.pyi b/typings/numba/__init__.pyi new file mode 100644 index 0000000000..571faf5a05 --- /dev/null +++ b/typings/numba/__init__.pyi @@ -0,0 +1,15 @@ +from typing import Any, Callable + +# Re-export types itself +import numba.core.types as types +import numba.cuda # import compile_ptx + +# import types +from numba.core import types +from numba.core.ccallback import CFunc + +# Re-export all type names +from numba.core.types import CPointer, uint64 + +def cfunc(sig: Any) -> Any: + def wrapper(func: Callable[[Any], Any]) -> tuple[Any]: ... diff --git a/typings/numba/cuda/__init__.pyi b/typings/numba/cuda/__init__.pyi new file mode 100644 index 0000000000..d66e40c5f4 --- /dev/null +++ b/typings/numba/cuda/__init__.pyi @@ -0,0 +1,5 @@ +from typing import Any + +from numba.cuda.compiler import compile_ptx as compile_ptx + +def get_current_device() -> Any: ... diff --git a/typings/numba/cuda/compiler.pyi b/typings/numba/cuda/compiler.pyi new file mode 100644 index 0000000000..56e02dd3e2 --- /dev/null +++ b/typings/numba/cuda/compiler.pyi @@ -0,0 +1,12 @@ +from typing import Any, Callable, Optional + +def compile_ptx( + pyfunc: Callable[[Any], Any], + args: Any, + debug: bool = False, + lineinfo: bool = False, + device: bool = False, + fastmath: bool = False, + cc: Optional[Any] = None, + opt: bool = True, +) -> tuple[Any]: ... diff --git a/typings/numba/types/CPointer.pyi b/typings/numba/types/CPointer.pyi new file mode 100644 index 0000000000..249a23f191 --- /dev/null +++ b/typings/numba/types/CPointer.pyi @@ -0,0 +1,5 @@ +# import numpy as np +from numba.core.types.abstract import Type + +class CPointer(Type): + def __init__(self, dtype: Type) -> None: ... diff --git a/typings/numba/types/__init__.pyi b/typings/numba/types/__init__.pyi new file mode 100644 index 0000000000..ffbfbd5a94 --- /dev/null +++ b/typings/numba/types/__init__.pyi @@ -0,0 +1,15 @@ + configuration locations on your computer. + +class Type(): ... + +class Number(): ... + +class Integer(Number): + def __init__(self, name: str) ->None: ... + +class CPointer (Type): + def __init__ (self, dtype : Type) -> None : ... 
+
+uint32 = Integer('uint32')
+uint64 = Integer('uint64')
+void = none

From 23b80c512775055dbf66f975c26d55c59e1b2af2 Mon Sep 17 00:00:00 2001
From: Irina Demeshko
Date: Tue, 7 Feb 2023 11:04:46 -0800
Subject: [PATCH 06/78] clean-up

---
 cunumeric/vectorize.py                        |  5 +++++
 src/cunumeric/vectorize/eval_udf.cc           |  5 ++---
 src/cunumeric/vectorize/eval_udf.cu           | 13 +++++++++----
 src/cunumeric/vectorize/eval_udf_template.inl |  1 -
 tests/integration/test_vectorize.py           | 19 +++++++++++--------
 5 files changed, 27 insertions(+), 16 deletions(-)

diff --git a/cunumeric/vectorize.py b/cunumeric/vectorize.py
index 394986f16d..855c2f62a2 100644
--- a/cunumeric/vectorize.py
+++ b/cunumeric/vectorize.py
@@ -19,6 +19,7 @@
 
 import legate.core.types as ty
 import numba
+import numba.core.ccallback
 import numpy as np
 import six
 
@@ -115,16 +116,20 @@ def __init__(
         else:
             self.__doc__ = doc
 
+        #FIXME
         if otypes is not None:
             raise NotImplementedError("Otypes variables are not supported yet")
 
+        #FIXME
         if excluded is not None:
             raise NotImplementedError(
                 "excluded variables are not supported yet"
             )
+        #FIXME
         if cache:
             raise NotImplementedError("cache variable is not supported yet")
 
+        #FIXME
         if signature is not None:
             raise NotImplementedError(
                 "signature variable is not supported yet"
             )
diff --git a/src/cunumeric/vectorize/eval_udf.cc b/src/cunumeric/vectorize/eval_udf.cc
index fb71330581..205c24e782 100644
--- a/src/cunumeric/vectorize/eval_udf.cc
+++ b/src/cunumeric/vectorize/eval_udf.cc
@@ -25,7 +25,8 @@ struct EvalUdfCPU {
   template 
   void operator()(EvalUdfArgs& args) const
   {
-    std::cout << "IRINA DEBUG in CPU task 2" << std::endl;
+    // In the case of CPU, we pack arguments in a vector and pass them to the
+    // function (through the function pointer generated by numba)
     using UDF = void(void**, size_t);
     auto udf = reinterpret_cast(args.cpu_func_ptr);
     std::vector udf_args;
@@ -37,14 +38,12 @@ struct EvalUdfCPU {
       auto out = args.args[i].write_accessor(rect);
       udf_args.push_back(reinterpret_cast(out.ptr(rect)));
     }
-
     udf(udf_args.data(), rect.volume());
   }
 };
 
 /*static*/ void EvalUdfTask::cpu_variant(TaskContext& context)
 {
-  std::cout << "IRINA DEBUG in CPU task" << std::endl;
   EvalUdfArgs args{context.scalars()[0].value(), context.outputs()};
   size_t dim = args.args[0].dim() == 0 ? 1 : args.args[0].dim();
   double_dispatch(dim, args.args[0].code(), EvalUdfCPU{}, args);
 }
diff --git a/src/cunumeric/vectorize/eval_udf.cu b/src/cunumeric/vectorize/eval_udf.cu
index d5b2c1f43a..61789bcdb5 100644
--- a/src/cunumeric/vectorize/eval_udf.cu
+++ b/src/cunumeric/vectorize/eval_udf.cu
@@ -32,6 +32,8 @@ struct EvalUdfGPU {
     auto rect = args.args[0].shape();
     if (rect.empty()) return;
 
+
+    // 1: we need to create a function from the ptx generated by numba
     const unsigned num_options = 4;
     const size_t log_buffer_size = 16384;
     std::vector log_info_buffer(log_buffer_size);
@@ -86,9 +88,13 @@ struct EvalUdfGPU {
 
     CUfunction func;
     result = cuModuleGetFunction(&func, module, fun_name.c_str());
+#ifdef DEBUG_CUNUMERIC
     assert(result == CUDA_SUCCESS);
+#endif
+
+    // 2: after the function is generated, we can execute it:
 
-    // ececuting user function:
+    // Filling up the buffer with arguments
     size_t buffer_size = (args.args.size()) * sizeof(void*);
     buffer_size += sizeof(size_t);
 
@@ -123,6 +129,7 @@ struct EvalUdfGPU {
 
     auto stream = get_cached_stream();
 
+    // executing the function
     CUresult status = cuLaunchKernel(
       func, gridDimX, gridDimY, gridDimZ, blockDimX, blockDimY, blockDimZ, 0, stream, NULL, config);
     if (status != CUDA_SUCCESS) {
diff --git a/src/cunumeric/vectorize/eval_udf_template.inl b/src/cunumeric/vectorize/eval_udf_template.inl
index 775651b12c..6825b90ae7 100644
--- a/src/cunumeric/vectorize/eval_udf_template.inl
+++ b/src/cunumeric/vectorize/eval_udf_template.inl
@@ -55,7 +55,6 @@ static void eval_udf_template(TaskContext& context)
 {
   is_gpus = context.scalars()[0].value();
   if (is_gpus)
-    std::cout << "IRINA DEBUG size of the scalars = " << context.scalars().size() << std::endl;
     EvalUdfArgs args{0, context.scalars()[1].value(), context.outputs()};
   else EvalUdfArgs args{context.scalars()[1].value(),'', context.outputs()};
   size_t dim = args.args[0].dim() == 0 ? 1 : args.args[0].dim();
diff --git a/tests/integration/test_vectorize.py b/tests/integration/test_vectorize.py
index cbc1e692aa..a6fd3d298d 100644
--- a/tests/integration/test_vectorize.py
+++ b/tests/integration/test_vectorize.py
@@ -1,4 +1,4 @@
-# Copyright 2021-2022 NVIDIA Corporation
+# Copyright 2023 NVIDIA Corporation
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -18,23 +18,26 @@ import pytest import cunumeric as num +import numpy as np def my_func(a, b): a = a * 2 + b - a = a * 3 def test_vectorize(): func = num.vectorize(my_func) a = num.arange(5) - b = num.zeros((5,)) - # b = 2 + b = num.ones((5,)) func(a, b) - # assert(a==12) - print("IRINA DEBUG:") - print(a) - + assert(np.array_equal(a, [1,3,5,7,9])) + +#FIXME uncomment once broadcast routines are merged +# a= num.arange(5) +# b=2 +# func(a,b) +# assert(np.array_equal(a, [2,4,6,8,10])) + if __name__ == "__main__": import sys From 3e07363a0209f7c0de1d1f8c31009d27a1884620 Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Wed, 8 Feb 2023 09:34:20 -0800 Subject: [PATCH 07/78] forcing partition by the first dimention for vectorize --- cunumeric/vectorize.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/cunumeric/vectorize.py b/cunumeric/vectorize.py index 855c2f62a2..ca9cae749c 100644 --- a/cunumeric/vectorize.py +++ b/cunumeric/vectorize.py @@ -286,6 +286,9 @@ def _execute_gpu(self) -> None: if idx != 0: task.add_alignment(a0.base, a_tmp.base) idx += 1 + task.add_broadcast( + a_tmp.base, axes=tuple(range(1, len(a_tmp.base.shape))) + ) task.execute() def _execute_cpu(self) -> None: @@ -301,6 +304,9 @@ def _execute_cpu(self) -> None: if idx != 0: task.add_alignment(a0.base, a_tmp.base) idx += 1 + task.add_broadcast( + a_tmp.base, axes=tuple(range(1, len(a_tmp.base.shape))) + ) task.execute() def __call__(self, *args: Any, **kwargs: Any) -> None: From 67b5675ee1d3078e315a8379461285c6befcefb8 Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Thu, 9 Feb 2023 13:34:45 -0800 Subject: [PATCH 08/78] adding caching to vectorize --- cunumeric/vectorize.py | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/cunumeric/vectorize.py b/cunumeric/vectorize.py index ca9cae749c..9ec52b7813 100644 --- a/cunumeric/vectorize.py +++ b/cunumeric/vectorize.py @@ -28,6 +28,7 @@ from .array import convert_to_cunumeric_ndarray from .config import CuNumericOpCode +from legate.timing import time # import numba.cuda # import numba.types @@ -110,6 +111,8 @@ def __init__( self._args: List[Any] = [] self._kwargs: List[Any] = [] self._context = runtime.legate_context + self._created: bool = False + self._cache: bool = cache if doc is None: self.__doc__ = pyfunc.__doc__ @@ -125,9 +128,6 @@ def __init__( raise NotImplementedError( "excluded variables are not supported yet" ) - #FIXME - if cache: - raise NotImplementedError("cache variable is not supported yet") #FIXME if signature is not None: @@ -351,10 +351,16 @@ def __call__(self, *args: Any, **kwargs: Any) -> None: ) if runtime.num_gpus > 0: - self._numba_func = self._build_gpu_function() - self._gpu_func = self._compile_func_gpu() + if not self._created: + self._numba_func = self._build_gpu_function() + self._gpu_func = self._compile_func_gpu() + if self._cache: + self._created = True self._execute_gpu() else: - self._numba_func = self._build_cpu_function() - self._cpu_func = self._compile_func_cpu() + if not self._created: + self._numba_func = self._build_cpu_function() + self._cpu_func = self._compile_func_cpu() + if self._cache: + self._created = True self._execute_cpu() From 5b634fb74991240c8a6b05af5a74883e8a00616d Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Fri, 10 Feb 2023 18:41:03 -0800 Subject: [PATCH 09/78] adding support for otypes --- cunumeric/vectorize.py | 42 +++++++++++++------ src/cunumeric/vectorize/eval_udf.cc | 28 +++++++++---- src/cunumeric/vectorize/eval_udf.cu | 32 ++++++++------ 
src/cunumeric/vectorize/eval_udf.h | 4 +- src/cunumeric/vectorize/eval_udf_template.inl | 5 ++- 5 files changed, 74 insertions(+), 37 deletions(-) diff --git a/cunumeric/vectorize.py b/cunumeric/vectorize.py index 9ec52b7813..4ecf496656 100644 --- a/cunumeric/vectorize.py +++ b/cunumeric/vectorize.py @@ -19,7 +19,7 @@ import legate.core.types as ty import numba -import numba.core.ccallback +import numba.core.ccallback import numpy as np import six @@ -28,7 +28,9 @@ from .array import convert_to_cunumeric_ndarray from .config import CuNumericOpCode -from legate.timing import time +# from legate.timing import time + + # import numba.cuda # import numba.types @@ -99,37 +101,47 @@ def __init__( otypes: Optional[Union[str, list[Any]]] = None, doc: Optional[str] = None, excluded: Optional[set[Any]] = None, - cache: Optional[bool] = False, + cache: bool = False, signature: Optional[str] = None, ) -> None: self._pyfunc = pyfunc self._numba_func: Callable[[Any], Any] self._cpu_func: numba.core.ccallback.CFunc self._gpu_func: tuple[Any] - self._otypes = None + self._otypes: Optional[tuple[Any]] = None self._result = None self._args: List[Any] = [] self._kwargs: List[Any] = [] self._context = runtime.legate_context self._created: bool = False self._cache: bool = cache + self._num_outputs = 1 # there is at least 1 output if doc is None: self.__doc__ = pyfunc.__doc__ else: self.__doc__ = doc - #FIXME if otypes is not None: - raise NotImplementedError("Otypes variables are not supported yet") - - #FIXME + self._num_outputs = len(otypes) + if len(otypes) == 0: + raise ValueError( + "There should be at least 1 type specified in otypes" + ) + ty = otypes[0] + for t in otypes: + if t != ty: + raise NotImplementedError( + "cuNumeric doesn't support variable types in otypes" + ) + + # FIXME if excluded is not None: raise NotImplementedError( "excluded variables are not supported yet" ) - #FIXME + # FIXME if signature is not None: raise NotImplementedError( "signature variable is not supported yet" @@ -276,13 +288,15 @@ def _compile_func_cpu(self) -> numba.core.ccallback.CFunc: def _execute_gpu(self) -> None: task = self._context.create_auto_task(CuNumericOpCode.EVAL_UDF) task.add_scalar_arg(self._gpu_func[0], ty.string) + task.add_scalar_arg(self._num_outputs, ty.uint32) idx = 0 a0 = self._args[0]._thunk a0 = runtime.to_deferred_array(a0) - for a in self._args: + for count, a in enumerate(self._args): a_tmp = runtime.to_deferred_array(a._thunk) task.add_input(a_tmp.base) - task.add_output(a_tmp.base) + if count < self._num_outputs: + task.add_output(a_tmp.base) if idx != 0: task.add_alignment(a0.base, a_tmp.base) idx += 1 @@ -294,13 +308,15 @@ def _execute_gpu(self) -> None: def _execute_cpu(self) -> None: task = self._context.create_auto_task(CuNumericOpCode.EVAL_UDF) task.add_scalar_arg(self._cpu_func.address, ty.uint64) # type : ignore + task.add_scalar_arg(self._num_outputs, ty.uint32) idx = 0 a0 = self._args[0]._thunk a0 = runtime.to_deferred_array(a0) - for a in self._args: + for count, a in enumerate(self._args): a_tmp = runtime.to_deferred_array(a._thunk) task.add_input(a_tmp.base) - task.add_output(a_tmp.base) + if count < self._num_outputs: + task.add_output(a_tmp.base) if idx != 0: task.add_alignment(a0.base, a_tmp.base) idx += 1 diff --git a/src/cunumeric/vectorize/eval_udf.cc b/src/cunumeric/vectorize/eval_udf.cc index 205c24e782..78f0a045b7 100644 --- a/src/cunumeric/vectorize/eval_udf.cc +++ b/src/cunumeric/vectorize/eval_udf.cc @@ -25,18 +25,23 @@ struct EvalUdfCPU { template void 
operator()(EvalUdfArgs& args) const { - //In the case of CPU, we pack arguments in a vector and pass them to the - //function (through the function pointer geenrated by numba) + // In the case of CPU, we pack arguments in a vector and pass them to the + // function (through the function pointer geenrated by numba) using UDF = void(void**, size_t); auto udf = reinterpret_cast(args.cpu_func_ptr); std::vector udf_args; using VAL = legate_type_of; - auto rect = args.args[0].shape(); + auto rect = args.inputs[0].shape(); if (rect.empty()) return; - for (size_t i = 0; i < args.args.size(); i++) { - auto out = args.args[i].write_accessor(rect); - udf_args.push_back(reinterpret_cast(out.ptr(rect))); + for (size_t i = 0; i < args.inputs.size(); i++) { + if (i < args.num_outputs) { + auto out = args.outputs[i].write_accessor(rect); + udf_args.push_back(reinterpret_cast(out.ptr(rect))); + } else { + auto out = args.inputs[i].read_accessor(rect); + udf_args.push_back(reinterpret_cast(const_cast(out.ptr(rect)))); + } } udf(udf_args.data(), rect.volume()); } @@ -44,9 +49,14 @@ struct EvalUdfCPU { /*static*/ void EvalUdfTask::cpu_variant(TaskContext& context) { - EvalUdfArgs args{context.scalars()[0].value(), context.outputs()}; - size_t dim = args.args[0].dim() == 0 ? 1 : args.args[0].dim(); - double_dispatch(dim, args.args[0].code(), EvalUdfCPU{}, args); + std::string tmp("tmp"); + EvalUdfArgs args{context.scalars()[0].value(), + context.inputs(), + context.outputs(), + tmp, + context.scalars()[1].value()}; + size_t dim = args.inputs[0].dim() == 0 ? 1 : args.inputs[0].dim(); + double_dispatch(dim, args.inputs[0].code(), EvalUdfCPU{}, args); } namespace // unnamed diff --git a/src/cunumeric/vectorize/eval_udf.cu b/src/cunumeric/vectorize/eval_udf.cu index 61789bcdb5..d0886387a7 100644 --- a/src/cunumeric/vectorize/eval_udf.cu +++ b/src/cunumeric/vectorize/eval_udf.cu @@ -29,10 +29,9 @@ struct EvalUdfGPU { void operator()(EvalUdfArgs& args) const { using VAL = legate_type_of; - auto rect = args.args[0].shape(); + auto rect = args.inputs[0].shape(); if (rect.empty()) return; - // 1: we need to vreate a function from the ptx generated y numba const unsigned num_options = 4; const size_t log_buffer_size = 16384; @@ -92,10 +91,10 @@ struct EvalUdfGPU { assert(result == CUDA_SUCCESS); #endif - //2: after fucntion is generated, we can execute it: + // 2: after fucntion is generated, we can execute it: - //Filling up the bugger with arguments - size_t buffer_size = (args.args.size()) * sizeof(void*); + // Filling up the bugger with arguments + size_t buffer_size = (args.inputs.size()) * sizeof(void*); buffer_size += sizeof(size_t); std::vector arg_buffer(buffer_size); @@ -103,9 +102,14 @@ struct EvalUdfGPU { auto p = raw_arg_buffer; - for (auto& arg : args.args) { - auto out = arg.write_accessor(rect); - *reinterpret_cast(p) = out.ptr(rect); + for (size_t i = 0; i < args.inputs.size(); i++) { + if (i < args.num_outputs) { + auto out = args.outputs[i].write_accessor(rect); + *reinterpret_cast(p) = out.ptr(rect); + } else { + auto in = args.inputs[i].read_accessor(rect); + *reinterpret_cast(p) = in.ptr(rect); + } p += sizeof(void*); } auto size = rect.volume(); @@ -129,7 +133,7 @@ struct EvalUdfGPU { auto stream = get_cached_stream(); - //executing the function + // executing the function CUresult status = cuLaunchKernel( func, gridDimX, gridDimY, gridDimZ, blockDimX, blockDimY, blockDimZ, 0, stream, NULL, config); if (status != CUDA_SUCCESS) { @@ -143,8 +147,12 @@ struct EvalUdfGPU { /*static*/ void 
EvalUdfTask::gpu_variant(TaskContext& context) { - EvalUdfArgs args{0, context.outputs(), context.scalars()[0].value()}; - size_t dim = args.args[0].dim() == 0 ? 1 : args.args[0].dim(); - double_dispatch(dim, args.args[0].code(), EvalUdfGPU{}, args); + EvalUdfArgs args{0, + context.inputs(), + context.outputs(), + context.scalars()[0].value(), + context.scalars()[1].value()}; + size_t dim = args.inputs[0].dim() == 0 ? 1 : args.inputs[0].dim(); + double_dispatch(dim, args.inputs[0].code(), EvalUdfGPU{}, args); } } // namespace cunumeric diff --git a/src/cunumeric/vectorize/eval_udf.h b/src/cunumeric/vectorize/eval_udf.h index 6a53dd9f2f..0e5b479686 100644 --- a/src/cunumeric/vectorize/eval_udf.h +++ b/src/cunumeric/vectorize/eval_udf.h @@ -22,8 +22,10 @@ namespace cunumeric { struct EvalUdfArgs { uint64_t cpu_func_ptr; - std::vector& args; + std::vector& inputs; + std::vector& outputs; std::string ptx = ""; + uint32_t num_outputs; }; class EvalUdfTask : public CuNumericTask { diff --git a/src/cunumeric/vectorize/eval_udf_template.inl b/src/cunumeric/vectorize/eval_udf_template.inl index 6825b90ae7..969999490d 100644 --- a/src/cunumeric/vectorize/eval_udf_template.inl +++ b/src/cunumeric/vectorize/eval_udf_template.inl @@ -55,8 +55,9 @@ static void eval_udf_template(TaskContext& context) { is_gpus = context.scalars()[0].value(); if (is_gpus) - EvalUdfArgs args{0, context.scalars()[1].value(), context.outputs()}; - else EvalUdfArgs args{context.scalars()[1].value(),'', context.outputs()}; + EvalUdfArgs args{0, context.scalars()[1].value(), context.outputs()}; + else + EvalUdfArgs args{context.scalars()[1].value(),'', context.outputs()}; size_t dim = args.args[0].dim() == 0 ? 1 : args.args[0].dim(); double_dispatch(dim, args.args[0].code(), EvalUdfImpl{}, args); } From 844a29f04d37f012a3f756c5c303b9c108ee0e7a Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Fri, 10 Feb 2023 19:22:44 -0800 Subject: [PATCH 10/78] requiring exact instance creation for EVAL_UDF task --- cunumeric/vectorize.py | 12 ++++++------ src/cunumeric/mapper.cc | 16 ++++++++++++++++ 2 files changed, 22 insertions(+), 6 deletions(-) diff --git a/cunumeric/vectorize.py b/cunumeric/vectorize.py index 4ecf496656..cca4191b8e 100644 --- a/cunumeric/vectorize.py +++ b/cunumeric/vectorize.py @@ -300,9 +300,9 @@ def _execute_gpu(self) -> None: if idx != 0: task.add_alignment(a0.base, a_tmp.base) idx += 1 - task.add_broadcast( - a_tmp.base, axes=tuple(range(1, len(a_tmp.base.shape))) - ) + # task.add_broadcast( + # a_tmp.base, axes=tuple(range(1, len(a_tmp.base.shape))) + # ) task.execute() def _execute_cpu(self) -> None: @@ -320,9 +320,9 @@ def _execute_cpu(self) -> None: if idx != 0: task.add_alignment(a0.base, a_tmp.base) idx += 1 - task.add_broadcast( - a_tmp.base, axes=tuple(range(1, len(a_tmp.base.shape))) - ) + # task.add_broadcast( + # a_tmp.base, axes=tuple(range(1, len(a_tmp.base.shape))) + # ) task.execute() def __call__(self, *args: Any, **kwargs: Any) -> None: diff --git a/src/cunumeric/mapper.cc b/src/cunumeric/mapper.cc index 51797acfe7..c879c4f476 100644 --- a/src/cunumeric/mapper.cc +++ b/src/cunumeric/mapper.cc @@ -222,6 +222,22 @@ std::vector CuNumericMapper::store_mappings( } return std::move(mappings); } + case CUNUMERIC_EVAL_UDF: { + std::vector mappings; + auto& inputs = task.inputs(); + auto& outputs = task.outputs(); + for (auto& input : inputs) { + mappings.push_back(StoreMapping::default_mapping(input, options.front())); + // mappings.back().policy.ordering.c_order(); + mappings.back().policy.exact = 
true; + } + for (auto& output : outputs) { + mappings.push_back(StoreMapping::default_mapping(output, options.front())); + // mappings.back().policy.ordering.c_order(); + mappings.back().policy.exact = true; + } + return std::move(mappings); + } default: { return {}; } From 728549825515bd693392426749e9d9b48801b8d4 Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Fri, 10 Feb 2023 20:07:26 -0800 Subject: [PATCH 11/78] fixing logic for generating CPU functions --- cunumeric/vectorize.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/cunumeric/vectorize.py b/cunumeric/vectorize.py index cca4191b8e..2494ea64d9 100644 --- a/cunumeric/vectorize.py +++ b/cunumeric/vectorize.py @@ -246,6 +246,20 @@ def _lift_to_array_access(m: Any) -> str: name = m.group(0) if name in argnames: return "{}[{}]".format(name, _LOOP_VAR) + elif name == "if": + return "if " + elif name == "return": + return "return " + elif name == "or": + return "or " + elif name == "and": + return "and " + elif name == "not": + return "not " + elif name == "min": + return "min" + elif name == "max": + return "max" else: return "{}[0]".format(name) @@ -254,6 +268,9 @@ def _lift_to_array_access(m: Any) -> str: l_new = re.sub(r"[_a-z]\w*", _lift_to_array_access, line) lines.append(" " + l_new) + print("IRINA DEBUG CPU function") + print(lines) + # Evaluate the string to get the Python function body = "\n".join(lines) glbs: Dict[str, Any] = {} From f1192e6ec5288d0e8a110c7e6c43d1c1616fe65c Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Tue, 14 Feb 2023 09:18:20 -0800 Subject: [PATCH 12/78] some clean-up --- cunumeric/vectorize.py | 122 ++++++++++++++++++++--------------------- 1 file changed, 60 insertions(+), 62 deletions(-) diff --git a/cunumeric/vectorize.py b/cunumeric/vectorize.py index 2494ea64d9..9d914e1b28 100644 --- a/cunumeric/vectorize.py +++ b/cunumeric/vectorize.py @@ -171,6 +171,27 @@ def _get_func_body(self, func: Callable[[Any], Any]) -> list[str]: return_lines.append(lines[i].rstrip()) return return_lines + def _replace_name(self, name: str,argnames:list[str], _LOOP_VAR:str) -> str: + if name in argnames: + return "{}[{}]".format(name, _LOOP_VAR) + elif name == "if": + return "if " + elif name == "return": + return "return " + elif name == "or": + return "or " + elif name == "and": + return "and " + elif name == "not": + return "not " + elif name == "min": + return "min" + elif name == "max": + return "max" + else: + return "{}".format(name) + + def _build_gpu_function(self) -> Any: funcid = "vectorized_{}".format(self._pyfunc.__name__) @@ -191,11 +212,7 @@ def _build_gpu_function(self) -> Any: # Kernel body def _lift_to_array_access(m: Any) -> str: - name = m.group(0) - if name in argnames: - return "{}[{}]".format(name, _LOOP_VAR) - else: - return "{}".format(name) + return self._replace_name(m.group(0), argnames, _LOOP_VAR) # kernel body lines_old = self._get_func_body(self._pyfunc) @@ -242,34 +259,17 @@ def _emit_assignment( lines_old = self._get_func_body(self._pyfunc) + # Kernel body def _lift_to_array_access(m: Any) -> str: - name = m.group(0) - if name in argnames: - return "{}[{}]".format(name, _LOOP_VAR) - elif name == "if": - return "if " - elif name == "return": - return "return " - elif name == "or": - return "or " - elif name == "and": - return "and " - elif name == "not": - return "not " - elif name == "min": - return "min" - elif name == "max": - return "max" - else: - return "{}[0]".format(name) + return self._replace_name(m.group(0), argnames, _LOOP_VAR) # lines_new = [] for 
line in lines_old: l_new = re.sub(r"[_a-z]\w*", _lift_to_array_access, line) lines.append(" " + l_new) - print("IRINA DEBUG CPU function") - print(lines) + #print("IRINA DEBUG CPU function") + #print(lines) # Evaluate the string to get the Python function body = "\n".join(lines) @@ -306,40 +306,32 @@ def _execute_gpu(self) -> None: task = self._context.create_auto_task(CuNumericOpCode.EVAL_UDF) task.add_scalar_arg(self._gpu_func[0], ty.string) task.add_scalar_arg(self._num_outputs, ty.uint32) - idx = 0 a0 = self._args[0]._thunk a0 = runtime.to_deferred_array(a0) for count, a in enumerate(self._args): a_tmp = runtime.to_deferred_array(a._thunk) - task.add_input(a_tmp.base) + a_tmp=a_tmp.base + task.add_input(a_tmp) if count < self._num_outputs: - task.add_output(a_tmp.base) - if idx != 0: - task.add_alignment(a0.base, a_tmp.base) - idx += 1 - # task.add_broadcast( - # a_tmp.base, axes=tuple(range(1, len(a_tmp.base.shape))) - # ) + task.add_output(a_tmp) + if count != 0: + task.add_alignment(a0.base, a_tmp) task.execute() def _execute_cpu(self) -> None: task = self._context.create_auto_task(CuNumericOpCode.EVAL_UDF) task.add_scalar_arg(self._cpu_func.address, ty.uint64) # type : ignore task.add_scalar_arg(self._num_outputs, ty.uint32) - idx = 0 a0 = self._args[0]._thunk a0 = runtime.to_deferred_array(a0) for count, a in enumerate(self._args): a_tmp = runtime.to_deferred_array(a._thunk) - task.add_input(a_tmp.base) + a_tmp=a_tmp.base + task.add_input(a_tmp) if count < self._num_outputs: - task.add_output(a_tmp.base) - if idx != 0: - task.add_alignment(a0.base, a_tmp.base) - idx += 1 - # task.add_broadcast( - # a_tmp.base, axes=tuple(range(1, len(a_tmp.base.shape))) - # ) + task.add_output(a_tmp) + if count != 0: + task.add_alignment(a0.base, a_tmp) task.execute() def __call__(self, *args: Any, **kwargs: Any) -> None: @@ -358,24 +350,30 @@ def __call__(self, *args: Any, **kwargs: Any) -> None: "passed to cunumeric.vectorize" ) - # #FIXME: comment out when brodcast PR is merged - # #bring all argumants to the same shape and type: - # if len(self._args)>0: - # ty = self._args[0].dtype - # #FIXME: should we bring them all to the same type? 
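The substitution step used by _build_cpu_function and _build_gpu_function is easiest to see in isolation: every identifier in the user function body is matched by a regex and, when it names an array argument, rewritten into an indexed access. A self-contained illustration of that rewrite, with the loop variable named as in the generated code:

    import re

    _LOOP_VAR = "__i__"
    argnames = ["a", "b"]

    def _lift_to_array_access(m):
        name = m.group(0)
        return f"{name}[{_LOOP_VAR}]" if name in argnames else name

    body_line = "a = a * 2 + b"
    print(re.sub(r"[_a-z]\w*", _lift_to_array_access, body_line))
    # prints: a[__i__] = a[__i__] * 2 + b[__i__]
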
- # for a in self._args: - # if a.dtype != ty: - # return TypeError("all arguments of " - # "user defined function " - # "should have the same type") - - # shapes = tuple(a.shape for a in self._args) - # shape = broadcast_shapes(shapes) - # new_args = tuple() - # for a in self._args: - # a_new = a.broadcast_to(shape) - # new_args +=(a_new,) - # self._args = new_args + #all output arrays should have the same type + if len(self._args)>0: + ty = self._args[0].dtype + shape = self._args[0].shape + for i in range (1, self._num_outputs): + if ty!=self._args[i].dtype: + raise TypeError("cuNumeric doesnt support " + "different types for output data in " + "user function passed to vectorize") + if shape != self._args[i].shape: + raise TypeError("cuNumeric doesnt support " + "different shapes for output data in " + "user function passed to vectorize") + for i in range (self._num_outputs, len(self._args)): + if ty!=self._args[i].dtype: + runtime.warn( + "converting input array to output types in user func ", + category=RuntimeWarning, + ) + self._args[i] = self._args[i].astype(ty) + if shape !=self._args[i].shape: + raise TypeError("cuNumeric doesnt support " + "different shapes for arrays in " + "user function passed to vectorize") self._kwargs = list(kwargs) if len(self._kwargs) > 1: From e8b544f2e2a71e409049ffc6101c3dcd296c7cdb Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Tue, 14 Feb 2023 17:56:26 -0800 Subject: [PATCH 13/78] adding logic for scalar arguments --- cunumeric/utils.py | 25 +++++++ cunumeric/vectorize.py | 102 +++++++++++++++++----------- src/cunumeric/vectorize/eval_udf.cc | 45 ++++++++---- src/cunumeric/vectorize/eval_udf.cu | 52 ++++++++++---- src/cunumeric/vectorize/eval_udf.h | 2 + tests/integration/test_vectorize.py | 9 ++- 6 files changed, 163 insertions(+), 72 deletions(-) diff --git a/cunumeric/utils.py b/cunumeric/utils.py index 25f0f19f15..cef1fbb539 100644 --- a/cunumeric/utils.py +++ b/cunumeric/utils.py @@ -45,6 +45,25 @@ np.complex128: ty.complex128, } +CUNUMERIC_TYPE_MAP = { + "bool": ty.bool_, + "int8": ty.int8, + "int16": ty.int16, + "int32": ty.int32, + "int": ty.int64, # np.int is int + "int64": ty.int64, + "uint8": ty.uint8, + "uint16": ty.uint16, + "uint32": ty.uint32, + "uint64": ty.uint64, # np.uint is np.uint64 + "float16": ty.float16, + "float32": ty.float32, + "float": ty.float64, + "float64": ty.float64, + "complex64": ty.complex64, + "complex128": ty.complex128, +} + def is_advanced_indexing(key: Any) -> bool: if key is Ellipsis or key is None: # np.newdim case @@ -98,6 +117,12 @@ def is_supported_dtype(dtype: Any) -> bool: raise TypeError("expected a NumPy dtype") return dtype.type in SUPPORTED_DTYPES +def convert_to_cunumeric_dtype(dtype: str) ->Any: + if dtype in CUNUMERIC_TYPE_MAP: + return CUNUMERIC_TYPE_MAP[dtype] + else: + raise TypeError("dtype is not supported") + def calculate_volume(shape: NdShape) -> int: if len(shape) == 0: diff --git a/cunumeric/vectorize.py b/cunumeric/vectorize.py index 9d914e1b28..5d74aac1e7 100644 --- a/cunumeric/vectorize.py +++ b/cunumeric/vectorize.py @@ -27,6 +27,7 @@ from .array import convert_to_cunumeric_ndarray from .config import CuNumericOpCode +from .utils import convert_to_cunumeric_dtype # from legate.timing import time @@ -111,6 +112,10 @@ def __init__( self._otypes: Optional[tuple[Any]] = None self._result = None self._args: List[Any] = [] + self._scalar_args: List[Any]=[] + self._scalar_idxs:List[int]=[] + self._scalar_names:List[str]=[] + self._argnames:List[str]=[] self._kwargs: List[Any] = [] 
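The convert_to_cunumeric_dtype helper added in utils.py keys its lookup on the name of the Python or NumPy type, which is how a scalar argument later gets a legate type when it is attached to the task. A rough sketch of that lookup, with an abridged table and plain strings standing in for the legate type objects:

    # Plain strings stand in for the legate.core type objects that the real
    # CUNUMERIC_TYPE_MAP stores; the table here is deliberately abridged.
    TYPE_MAP = {
        "bool": "bool_",
        "int": "int64",
        "int32": "int32",
        "float": "float64",
        "float32": "float32",
    }

    def convert_to_dtype_name(value):
        name = type(value).__name__
        if name not in TYPE_MAP:
            raise TypeError(f"dtype is not supported: {name}")
        return TYPE_MAP[name]

    print(convert_to_dtype_name(2))     # int64  (Python int)
    print(convert_to_dtype_name(2.0))   # float64
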
self._context = runtime.legate_context self._created: bool = False @@ -171,8 +176,8 @@ def _get_func_body(self, func: Callable[[Any], Any]) -> list[str]: return_lines.append(lines[i].rstrip()) return return_lines - def _replace_name(self, name: str,argnames:list[str], _LOOP_VAR:str) -> str: - if name in argnames: + def _replace_name(self, name: str, _LOOP_VAR:str, is_gpu:bool=False) -> str: + if name in self._argnames and not(name in self._scalar_names) : return "{}[{}]".format(name, _LOOP_VAR) elif name == "if": return "if " @@ -188,8 +193,10 @@ def _replace_name(self, name: str,argnames:list[str], _LOOP_VAR:str) -> str: return "min" elif name == "max": return "max" - else: + elif is_gpu: return "{}".format(name) + else: + return "{}[0]".format(name) def _build_gpu_function(self) -> Any: @@ -200,8 +207,8 @@ def _build_gpu_function(self) -> Any: lines = ["from numba import cuda"] # Signature - argnames = list(k for k in inspect.signature(self._pyfunc).parameters) - args = argnames + [_SIZE_VAR] + args = self._argnames + [_SIZE_VAR] + lines.append("def {}({}):".format(funcid, ",".join(args))) # Initialize the index variable and return immediately @@ -212,7 +219,7 @@ def _build_gpu_function(self) -> Any: # Kernel body def _lift_to_array_access(m: Any) -> str: - return self._replace_name(m.group(0), argnames, _LOOP_VAR) + return self._replace_name(m.group(0), _LOOP_VAR, True) # kernel body lines_old = self._get_func_body(self._pyfunc) @@ -247,12 +254,16 @@ def _emit_assignment( ) # get names of arguments - argnames = list(k for k in inspect.signature(self._pyfunc).parameters) arg_idx = 0 for a in self._args: ty = a.dtype - _emit_assignment(argnames[arg_idx], arg_idx, _SIZE_VAR, ty) + _emit_assignment(self._argnames[arg_idx], arg_idx, _SIZE_VAR, ty) arg_idx += 1 + for a in self._scalar_args: + scalar_type = np.dtype(type(a).__name__) + _emit_assignment(self._argnames[arg_idx], arg_idx, _SIZE_VAR, scalar_type) + arg_idx += 1 + # Main loop lines.append(" for {} in range({}):".format(_LOOP_VAR, _SIZE_VAR)) @@ -261,16 +272,13 @@ def _emit_assignment( # Kernel body def _lift_to_array_access(m: Any) -> str: - return self._replace_name(m.group(0), argnames, _LOOP_VAR) + return self._replace_name(m.group(0), _LOOP_VAR) # lines_new = [] for line in lines_old: l_new = re.sub(r"[_a-z]\w*", _lift_to_array_access, line) lines.append(" " + l_new) - #print("IRINA DEBUG CPU function") - #print(lines) - # Evaluate the string to get the Python function body = "\n".join(lines) glbs: Dict[str, Any] = {} @@ -285,6 +293,11 @@ def _get_numba_types(self, need_pointer: bool = True) -> list[Any]: ty = getattr(numba.core.types, ty) ty = numba.core.types.CPointer(ty) types.append(ty) + for arg in self._scalar_args: + ty = np.dtype(type(arg).__name__) + ty = str(ty) if ty != bool else "int8" + ty = getattr(numba.core.types, ty) + types.append(ty) return types def _compile_func_gpu(self) -> tuple[Any]: @@ -302,26 +315,18 @@ def _compile_func_cpu(self) -> numba.core.ccallback.CFunc: return numba.cfunc(sig)(self._numba_func) - def _execute_gpu(self) -> None: + def _execute(self, is_gpu:bool) -> None: task = self._context.create_auto_task(CuNumericOpCode.EVAL_UDF) - task.add_scalar_arg(self._gpu_func[0], ty.string) - task.add_scalar_arg(self._num_outputs, ty.uint32) - a0 = self._args[0]._thunk - a0 = runtime.to_deferred_array(a0) - for count, a in enumerate(self._args): - a_tmp = runtime.to_deferred_array(a._thunk) - a_tmp=a_tmp.base - task.add_input(a_tmp) - if count < self._num_outputs: - task.add_output(a_tmp) - if count 
!= 0: - task.add_alignment(a0.base, a_tmp) - task.execute() + if is_gpu: + task.add_scalar_arg(self._gpu_func[0], ty.string) + task.add_scalar_arg(self._num_outputs, ty.uint32) + else: + task.add_scalar_arg(self._cpu_func.address, ty.uint64) # type : ignore + task.add_scalar_arg(self._num_outputs, ty.uint32) + for a in self._scalar_args: + dtype = convert_to_cunumeric_dtype(type(a).__name__) + task.add_scalar_arg(a,dtype) - def _execute_cpu(self) -> None: - task = self._context.create_auto_task(CuNumericOpCode.EVAL_UDF) - task.add_scalar_arg(self._cpu_func.address, ty.uint64) # type : ignore - task.add_scalar_arg(self._num_outputs, ty.uint32) a0 = self._args[0]._thunk a0 = runtime.to_deferred_array(a0) for count, a in enumerate(self._args): @@ -339,16 +344,31 @@ def __call__(self, *args: Any, **kwargs: Any) -> None: Return arrays with the results of `pyfunc` broadcast (vectorized) over `args` and `kwargs` not in `excluded`. """ - self._args = list( - convert_to_cunumeric_ndarray(arg) if arg is not None else arg - for (idx, arg) in enumerate(args) - ) - for arg in self._args: + #self._args = list( + # convert_to_cunumeric_ndarray(arg) if (arg is not None and np.ndim(Arg)>0) + # for (idx, arg) in enumerate(args) + #) + for i,arg in enumerate(args): if arg is None: raise ValueError( "None is not supported in user function " "passed to cunumeric.vectorize" ) + elif np.ndim(arg)==0: + self._scalar_args.append(arg) + self._scalar_idxs.append(i) + else: + self._args.append(convert_to_cunumeric_ndarray(arg)) + + #first fill arrays to argnames, then scalars: + for i,k in enumerate(inspect.signature(self._pyfunc).parameters): + if not(i in self._scalar_idxs): + self._argnames.append(k) + + for i,k in enumerate(inspect.signature(self._pyfunc).parameters): + if i in self._scalar_idxs: + self._scalar_names.append(k) + self._argnames.append(k) #all output arrays should have the same type if len(self._args)>0: @@ -370,7 +390,7 @@ def __call__(self, *args: Any, **kwargs: Any) -> None: category=RuntimeWarning, ) self._args[i] = self._args[i].astype(ty) - if shape !=self._args[i].shape: + if shape !=self._args[i].shape and np.ndim(self._args[i])>0: raise TypeError("cuNumeric doesnt support " "different shapes for arrays in " "user function passed to vectorize") @@ -387,11 +407,17 @@ def __call__(self, *args: Any, **kwargs: Any) -> None: self._gpu_func = self._compile_func_gpu() if self._cache: self._created = True - self._execute_gpu() + self._execute(True) else: if not self._created: self._numba_func = self._build_cpu_function() self._cpu_func = self._compile_func_cpu() if self._cache: self._created = True - self._execute_cpu() + self._execute(False) + + self._args.clear() + self._scalar_args.clear() + self._scalar_idxs.clear() + self._argnames.clear() + self._scalar_names.clear() diff --git a/src/cunumeric/vectorize/eval_udf.cc b/src/cunumeric/vectorize/eval_udf.cc index 78f0a045b7..47f84f0d97 100644 --- a/src/cunumeric/vectorize/eval_udf.cc +++ b/src/cunumeric/vectorize/eval_udf.cc @@ -30,33 +30,50 @@ struct EvalUdfCPU { using UDF = void(void**, size_t); auto udf = reinterpret_cast(args.cpu_func_ptr); std::vector udf_args; - using VAL = legate_type_of; - auto rect = args.inputs[0].shape(); + size_t volume = 1; + if (args.inputs.size()>0){ + using VAL = legate_type_of; + auto rect = args.inputs[0].shape(); - if (rect.empty()) return; - for (size_t i = 0; i < args.inputs.size(); i++) { - if (i < args.num_outputs) { - auto out = args.outputs[i].write_accessor(rect); - 
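On the Python side of this patch, positional arguments are split by np.ndim into array arguments and scalar arguments, and the original positions are recorded so the parameter names can still be matched to the right kind of argument. A minimal sketch of that bookkeeping, written against plain NumPy for illustration:

    import inspect
    import numpy as np

    def split_args(pyfunc, *args):
        arrays, scalars, scalar_idxs = [], [], []
        for i, arg in enumerate(args):
            if np.ndim(arg) == 0:            # Python or NumPy scalar
                scalars.append(arg)
                scalar_idxs.append(i)
            else:
                arrays.append(np.asarray(arg))
        names = list(inspect.signature(pyfunc).parameters)
        # Array parameter names first, then scalar parameter names, matching
        # the order in which buffers and scalars are handed to the task.
        argnames = [n for i, n in enumerate(names) if i not in scalar_idxs]
        argnames += [n for i, n in enumerate(names) if i in scalar_idxs]
        return arrays, scalars, argnames

    def my_func(a, b):
        a = a * 2 + b

    arrays, scalars, argnames = split_args(my_func, np.arange(5), 2)
    print(len(arrays), scalars, argnames)   # 1 [2] ['a', 'b']
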
udf_args.push_back(reinterpret_cast(out.ptr(rect))); - } else { - auto out = args.inputs[i].read_accessor(rect); - udf_args.push_back(reinterpret_cast(const_cast(out.ptr(rect)))); + if (rect.empty()) return; + for (size_t i = 0; i < args.inputs.size(); i++) { + if (i < args.num_outputs) { + auto out = args.outputs[i].write_accessor(rect); + udf_args.push_back(reinterpret_cast(out.ptr(rect))); + } else { + auto out = args.inputs[i].read_accessor(rect); + udf_args.push_back(reinterpret_cast(const_cast(out.ptr(rect)))); + } } - } - udf(udf_args.data(), rect.volume()); + volume = rect.volume(); + }//if + for (auto s: args.scalars) + udf_args.push_back(const_cast(s.ptr())); + udf(udf_args.data(), volume); } }; /*static*/ void EvalUdfTask::cpu_variant(TaskContext& context) { std::string tmp("tmp"); + std::vectorscalars; + for (size_t i=2; i(), context.inputs(), context.outputs(), + scalars, tmp, context.scalars()[1].value()}; - size_t dim = args.inputs[0].dim() == 0 ? 1 : args.inputs[0].dim(); - double_dispatch(dim, args.inputs[0].code(), EvalUdfCPU{}, args); + size_t dim=1; + if (args.inputs.size()>0){ + dim = args.inputs[0].dim() == 0 ? 1 : args.inputs[0].dim(); + double_dispatch(dim, args.inputs[0].code(), EvalUdfCPU{}, args); + } + else{ + //FIXME + double_dispatch(dim, args.inputs[0].code() , EvalUdfCPU{}, args); + } } namespace // unnamed diff --git a/src/cunumeric/vectorize/eval_udf.cu b/src/cunumeric/vectorize/eval_udf.cu index d0886387a7..28d899b0e1 100644 --- a/src/cunumeric/vectorize/eval_udf.cu +++ b/src/cunumeric/vectorize/eval_udf.cu @@ -29,8 +29,8 @@ struct EvalUdfGPU { void operator()(EvalUdfArgs& args) const { using VAL = legate_type_of; - auto rect = args.inputs[0].shape(); - if (rect.empty()) return; + //auto rect = args.inputs[0].shape(); + //if (rect.empty()) return; // 1: we need to vreate a function from the ptx generated y numba const unsigned num_options = 4; @@ -93,8 +93,8 @@ struct EvalUdfGPU { // 2: after fucntion is generated, we can execute it: - // Filling up the bugger with arguments - size_t buffer_size = (args.inputs.size()) * sizeof(void*); + // Filling up the buffer with arguments + size_t buffer_size = (args.inputs.size()+args.scalars.size()) * sizeof(void*); buffer_size += sizeof(size_t); std::vector arg_buffer(buffer_size); @@ -102,17 +102,27 @@ struct EvalUdfGPU { auto p = raw_arg_buffer; - for (size_t i = 0; i < args.inputs.size(); i++) { - if (i < args.num_outputs) { - auto out = args.outputs[i].write_accessor(rect); - *reinterpret_cast(p) = out.ptr(rect); - } else { - auto in = args.inputs[i].read_accessor(rect); - *reinterpret_cast(p) = in.ptr(rect); + size_t size =1; + if (args.inputs.size()>0){ + auto rect = args.inputs[0].shape(); + size = rect.volume(); + for (size_t i = 0; i < args.inputs.size(); i++) { + if (i < args.num_outputs) { + auto out = args.outputs[i].write_accessor(rect); + *reinterpret_cast(p) = out.ptr(rect); + } else { + auto in = args.inputs[i].read_accessor(rect); + *reinterpret_cast(p) = in.ptr(rect); + } + p += sizeof(void*); } - p += sizeof(void*); } - auto size = rect.volume(); + for (auto scalar: args.scalars){ + memcpy(p, scalar.ptr(), scalar.size()); + p += scalar.size(); + // *reinterpret_cast(p) =s; + //p += sizeof(void*); + } memcpy(p, &size, sizeof(size_t)); void* config[] = { @@ -147,12 +157,24 @@ struct EvalUdfGPU { /*static*/ void EvalUdfTask::gpu_variant(TaskContext& context) { + std::vectorscalars; + for (size_t i=2; i(), context.scalars()[1].value()}; - size_t dim = args.inputs[0].dim() == 0 ? 
1 : args.inputs[0].dim(); - double_dispatch(dim, args.inputs[0].code(), EvalUdfGPU{}, args); + size_t dim=1; + if (args.inputs.size()>0){ + dim = args.inputs[0].dim() == 0 ? 1 : args.inputs[0].dim(); + double_dispatch(dim, args.inputs[0].code(), EvalUdfGPU{}, args); + } + else{ + double_dispatch(dim, args.inputs[0].code(), EvalUdfGPU{}, args); + //double_dispatch(dim, 0 , EvalUdfGPU{}, args); + } } } // namespace cunumeric diff --git a/src/cunumeric/vectorize/eval_udf.h b/src/cunumeric/vectorize/eval_udf.h index 0e5b479686..243c439230 100644 --- a/src/cunumeric/vectorize/eval_udf.h +++ b/src/cunumeric/vectorize/eval_udf.h @@ -17,6 +17,7 @@ #pragma once #include "cunumeric/cunumeric.h" +#include "core/data/scalar.h" namespace cunumeric { @@ -24,6 +25,7 @@ struct EvalUdfArgs { uint64_t cpu_func_ptr; std::vector& inputs; std::vector& outputs; + std::vectorscalars; std::string ptx = ""; uint32_t num_outputs; }; diff --git a/tests/integration/test_vectorize.py b/tests/integration/test_vectorize.py index a6fd3d298d..a453198f23 100644 --- a/tests/integration/test_vectorize.py +++ b/tests/integration/test_vectorize.py @@ -32,11 +32,10 @@ def test_vectorize(): func(a, b) assert(np.array_equal(a, [1,3,5,7,9])) -#FIXME uncomment once broadcast routines are merged -# a= num.arange(5) -# b=2 -# func(a,b) -# assert(np.array_equal(a, [2,4,6,8,10])) + a= num.arange(5) + b=2 + func(a,b) + assert(np.array_equal(a, [2,4,6,8,10])) if __name__ == "__main__": From 37a7281b2fbe4cf2409951e5cdf52462de35a230 Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Thu, 16 Feb 2023 12:51:19 -0800 Subject: [PATCH 14/78] refactoring cectorize kernel to compute point --- cunumeric/vectorize.py | 18 ++++++++++--- src/cunumeric/mapper.cc | 2 ++ src/cunumeric/pitches.h | 26 ++++++++++++++++++- src/cunumeric/vectorize/eval_udf.cu | 22 +++++++++++++--- src/cunumeric/vectorize/eval_udf_template.inl | 6 ++++- tests/integration/test_vectorize.py | 6 ++++- 6 files changed, 70 insertions(+), 10 deletions(-) diff --git a/cunumeric/vectorize.py b/cunumeric/vectorize.py index 5d74aac1e7..8fae97608b 100644 --- a/cunumeric/vectorize.py +++ b/cunumeric/vectorize.py @@ -41,6 +41,10 @@ _SIZE_VAR = "__size__" _LOOP_VAR = "__i__" _ARGS_VAR = "__args__" +_DIM_VAR = "__dim__" +_POINT_VAR = "__point__" +_LO_POINT_VAR = "__lo_point__" +_PITCHES_VAR = "__pitches__" class vectorize: @@ -207,7 +211,7 @@ def _build_gpu_function(self) -> Any: lines = ["from numba import cuda"] # Signature - args = self._argnames + [_SIZE_VAR] + args = self._argnames + [_SIZE_VAR]+[_DIM_VAR]+[_PITCHES_VAR]+[_LO_POINT_VAR] lines.append("def {}({}):".format(funcid, ",".join(args))) @@ -216,16 +220,24 @@ def _build_gpu_function(self) -> Any: lines.append(" {} = cuda.grid(1)".format(_LOOP_VAR)) lines.append(" if {} >= {}:".format(_LOOP_VAR, _SIZE_VAR)) lines.append(" return") + lines.append(" {}={}".format(_POINT_VAR, _LO_POINT_VAR)) + lines.append(" for p in range({}-1):".format(_DIM_VAR)) + + lines.append(" {}[p]+={}/int({}[p])".format(_POINT_VAR,_LOOP_VAR, _PITCHES_VAR)) + lines.append(" {}={}%int({})".format(_LOOP_VAR,_LOOP_VAR,_PITCHES_VAR)) + lines.append(" {}[{}-1]+={}".format(_POINT_VAR, _DIM_VAR, _LOOP_VAR)) # Kernel body def _lift_to_array_access(m: Any) -> str: - return self._replace_name(m.group(0), _LOOP_VAR, True) + return self._replace_name(m.group(0), _POINT_VAR, True) # kernel body lines_old = self._get_func_body(self._pyfunc) for line in lines_old: l_new = re.sub(r"[_a-z]\w*", _lift_to_array_access, line) lines.append(l_new) + + print("IRINA DEBUG GPU 
function",lines) # Evaluate the string to get the Python function body = "\n".join(lines) @@ -302,7 +314,7 @@ def _get_numba_types(self, need_pointer: bool = True) -> list[Any]: def _compile_func_gpu(self) -> tuple[Any]: types = self._get_numba_types() - arg_types = types + [numba.core.types.uint64] + arg_types = types + [numba.core.types.uint64] + [numba.core.types.uint64]+[numba.core.types.CPointer(numba.core.types.uint64)]+ [numba.core.types.CPointer(numba.core.types.uint64)] sig = (*arg_types,) cuda_arch = numba.cuda.get_current_device().compute_capability diff --git a/src/cunumeric/mapper.cc b/src/cunumeric/mapper.cc index c879c4f476..5959179ee5 100644 --- a/src/cunumeric/mapper.cc +++ b/src/cunumeric/mapper.cc @@ -222,6 +222,7 @@ std::vector CuNumericMapper::store_mappings( } return std::move(mappings); } +#if 0 case CUNUMERIC_EVAL_UDF: { std::vector mappings; auto& inputs = task.inputs(); @@ -238,6 +239,7 @@ std::vector CuNumericMapper::store_mappings( } return std::move(mappings); } +#endif default: { return {}; } diff --git a/src/cunumeric/pitches.h b/src/cunumeric/pitches.h index af12c09e43..ae88ee9138 100644 --- a/src/cunumeric/pitches.h +++ b/src/cunumeric/pitches.h @@ -53,6 +53,12 @@ class Pitches { point[DIM] += index; return point; } + + __CUDA_HD__ + inline const size_t* data(void) + { + return &pitches[0]; + } private: size_t pitches[DIM]; @@ -90,6 +96,13 @@ class Pitches { return point; } + __CUDA_HD__ + inline const size_t* data(void) + { + return &pitches[0]; + } + + private: size_t pitches[DIM]; }; @@ -103,8 +116,10 @@ class Pitches<0, C_ORDER> { { if (rect.lo[0] > rect.hi[0]) return 0; - else + else{ + pitches[0]=rect.hi[0] - rect.lo[0] + 1; return (rect.hi[0] - rect.lo[0] + 1); + } } __CUDA_HD__ inline Legion::Point<1> unflatten(size_t index, const Legion::Point<1>& lo) const @@ -113,6 +128,15 @@ class Pitches<0, C_ORDER> { point[0] += index; return point; } + __CUDA_HD__ + inline const size_t* data(void) + { + return &pitches[0]; + } + + private: + size_t pitches[1]; + }; } // namespace cunumeric diff --git a/src/cunumeric/vectorize/eval_udf.cu b/src/cunumeric/vectorize/eval_udf.cu index 28d899b0e1..3fc233384d 100644 --- a/src/cunumeric/vectorize/eval_udf.cu +++ b/src/cunumeric/vectorize/eval_udf.cu @@ -16,6 +16,7 @@ #include "cunumeric/vectorize/eval_udf.h" #include "cunumeric/cuda_help.h" +#include "cunumeric/pitches.h" #include #include @@ -29,8 +30,7 @@ struct EvalUdfGPU { void operator()(EvalUdfArgs& args) const { using VAL = legate_type_of; - //auto rect = args.inputs[0].shape(); - //if (rect.empty()) return; + Rect rect; // 1: we need to vreate a function from the ptx generated y numba const unsigned num_options = 4; @@ -95,7 +95,10 @@ struct EvalUdfGPU { // Filling up the buffer with arguments size_t buffer_size = (args.inputs.size()+args.scalars.size()) * sizeof(void*); - buffer_size += sizeof(size_t); + buffer_size +=sizeof(size_t);//size + buffer_size += sizeof(size_t);//dim + buffer_size += sizeof(void*);//pitches + buffer_size += sizeof(void*);//lo_point std::vector arg_buffer(buffer_size); char* raw_arg_buffer = arg_buffer.data(); @@ -104,7 +107,7 @@ struct EvalUdfGPU { size_t size =1; if (args.inputs.size()>0){ - auto rect = args.inputs[0].shape(); + rect = args.inputs[0].shape(); size = rect.volume(); for (size_t i = 0; i < args.inputs.size(); i++) { if (i < args.num_outputs) { @@ -124,6 +127,17 @@ struct EvalUdfGPU { //p += sizeof(void*); } memcpy(p, &size, sizeof(size_t)); + size_t dim=DIM; + p += sizeof(size_t); + memcpy(p, &dim, sizeof(size_t)); 
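The pitches, lo point, and strides packed into the argument buffer above exist so the generated kernel can address non-contiguous sub-arrays: a flat thread index is unflattened into per-dimension coordinates using the pitches of the launch rectangle, and those coordinates are folded into a memory offset using the strides reported by the accessor. A small Python model of that index math, with the lo point taken to be the origin for brevity:

    def flat_to_offset(flat, pitches, strides):
        """Map a flat index to an element offset through per-dim strides."""
        dim = len(strides)
        offset = 0
        for p in range(dim - 1):
            coord = flat // pitches[p]     # coordinate in dimension p
            flat -= coord * pitches[p]     # remainder for the inner dims
            offset += coord * strides[p]
        return offset + flat * strides[dim - 1]

    # A (3, 4) view whose rows sit 10 elements apart in memory, e.g. a column
    # block of a wider array: pitches = [4], strides = [10, 1].  Flat index 6
    # is row 1, column 2, which lives at offset 1 * 10 + 2 = 12.
    print(flat_to_offset(6, [4], [10, 1]))   # 12
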
+ p += sizeof(size_t); + Pitches pitches; + size_t volume = pitches.flatten(rect); + *reinterpret_cast(p) =pitches.data(); + p += sizeof(void*); + *reinterpret_cast(p) =&rect.lo[0]; +// p += sizeof(void*); + void* config[] = { CU_LAUNCH_PARAM_BUFFER_POINTER, diff --git a/src/cunumeric/vectorize/eval_udf_template.inl b/src/cunumeric/vectorize/eval_udf_template.inl index 969999490d..c597e44a14 100644 --- a/src/cunumeric/vectorize/eval_udf_template.inl +++ b/src/cunumeric/vectorize/eval_udf_template.inl @@ -38,12 +38,16 @@ struct EvalUdfImpl { std::vector udf_args; using VAL = legate_type_of; auto rect = args.args[0].shape(); + + size_t strides[DIM]; if (rect.empty()) return; EvalUdfImplBody(); for (size_t i = 0; i < args.args.size(); i++) { auto out = args.args[i].write_accessor(rect); - udf_args.push_back(reinterpret_cast(out.ptr(rect))); + udf_args.push_back(reinterpret_cast(out.ptr(rect, strides))); + for (size_t i=0; i Date: Thu, 16 Feb 2023 23:00:55 -0800 Subject: [PATCH 15/78] making GPU kernel work with sparse arrays --- cunumeric/vectorize.py | 29 ++++++++++++++++------------ src/cunumeric/vectorize/eval_udf.cu | 30 +++++++++++++++++++++++------ 2 files changed, 41 insertions(+), 18 deletions(-) diff --git a/cunumeric/vectorize.py b/cunumeric/vectorize.py index 8fae97608b..81ecc4776a 100644 --- a/cunumeric/vectorize.py +++ b/cunumeric/vectorize.py @@ -42,7 +42,7 @@ _LOOP_VAR = "__i__" _ARGS_VAR = "__args__" _DIM_VAR = "__dim__" -_POINT_VAR = "__point__" +_STRIDES_VAR = "__strides__" _LO_POINT_VAR = "__lo_point__" _PITCHES_VAR = "__pitches__" @@ -211,25 +211,30 @@ def _build_gpu_function(self) -> Any: lines = ["from numba import cuda"] # Signature - args = self._argnames + [_SIZE_VAR]+[_DIM_VAR]+[_PITCHES_VAR]+[_LO_POINT_VAR] + args = self._argnames + [_SIZE_VAR]+[_DIM_VAR]+[_PITCHES_VAR]+[_LO_POINT_VAR] +[_STRIDES_VAR] lines.append("def {}({}):".format(funcid, ",".join(args))) # Initialize the index variable and return immediately # when it exceeds the data size - lines.append(" {} = cuda.grid(1)".format(_LOOP_VAR)) - lines.append(" if {} >= {}:".format(_LOOP_VAR, _SIZE_VAR)) + lines.append(" local_i = cuda.grid(1)") + lines.append(" if local_i >= {}:".format(_SIZE_VAR)) lines.append(" return") - lines.append(" {}={}".format(_POINT_VAR, _LO_POINT_VAR)) + lines.append(" {}:int = 0".format(_LOOP_VAR)) lines.append(" for p in range({}-1):".format(_DIM_VAR)) - - lines.append(" {}[p]+={}/int({}[p])".format(_POINT_VAR,_LOOP_VAR, _PITCHES_VAR)) - lines.append(" {}={}%int({})".format(_LOOP_VAR,_LOOP_VAR,_PITCHES_VAR)) - lines.append(" {}[{}-1]+={}".format(_POINT_VAR, _DIM_VAR, _LOOP_VAR)) + lines.append(" x={}[p]+int(local_i/{}[p])".format(_LO_POINT_VAR,_PITCHES_VAR)) + lines.append(" local_i = local_i-{}[p]*int(local_i/{}[p])".format(_PITCHES_VAR,_PITCHES_VAR)) + lines.append(" {}+=int(x*{}[p])".format(_LOOP_VAR, _STRIDES_VAR)) + #lines.append(" print(x, {}, {}[p])".format(_LOOP_VAR, _STRIDES_VAR)) + + #lines.append(" print(local_i, local_i,local_i,local_i,local_i,local_i, {}[0])".format( _STRIDES_VAR)) + lines.append(" {}+=int(local_i*{}[{}-1])".format(_LOOP_VAR, _STRIDES_VAR, _DIM_VAR)) + #lines.append(" print( local_i, local_i, local_i, {})".format(_LOOP_VAR)) + #lines.append(" {} =local_i".format(_LOOP_VAR)) # Kernel body def _lift_to_array_access(m: Any) -> str: - return self._replace_name(m.group(0), _POINT_VAR, True) + return self._replace_name(m.group(0), _LOOP_VAR, True) # kernel body lines_old = self._get_func_body(self._pyfunc) @@ -237,7 +242,7 @@ def _lift_to_array_access(m: 
Any) -> str: l_new = re.sub(r"[_a-z]\w*", _lift_to_array_access, line) lines.append(l_new) - print("IRINA DEBUG GPU function",lines) + #print("IRINA DEBUG GPU function",lines) # Evaluate the string to get the Python function body = "\n".join(lines) @@ -314,7 +319,7 @@ def _get_numba_types(self, need_pointer: bool = True) -> list[Any]: def _compile_func_gpu(self) -> tuple[Any]: types = self._get_numba_types() - arg_types = types + [numba.core.types.uint64] + [numba.core.types.uint64]+[numba.core.types.CPointer(numba.core.types.uint64)]+ [numba.core.types.CPointer(numba.core.types.uint64)] + arg_types = types + [numba.core.types.uint64] + [numba.core.types.uint64]+[numba.core.types.CPointer(numba.core.types.uint64)]+ [numba.core.types.CPointer(numba.core.types.uint64)]+[numba.core.types.CPointer(numba.core.types.uint64)] sig = (*arg_types,) cuda_arch = numba.cuda.get_current_device().compute_capability diff --git a/src/cunumeric/vectorize/eval_udf.cu b/src/cunumeric/vectorize/eval_udf.cu index 3fc233384d..ea6d878254 100644 --- a/src/cunumeric/vectorize/eval_udf.cu +++ b/src/cunumeric/vectorize/eval_udf.cu @@ -99,12 +99,13 @@ struct EvalUdfGPU { buffer_size += sizeof(size_t);//dim buffer_size += sizeof(void*);//pitches buffer_size += sizeof(void*);//lo_point + buffer_size += sizeof(void*);//strides std::vector arg_buffer(buffer_size); char* raw_arg_buffer = arg_buffer.data(); auto p = raw_arg_buffer; - + size_t strides[DIM]; size_t size =1; if (args.inputs.size()>0){ rect = args.inputs[0].shape(); @@ -112,10 +113,10 @@ struct EvalUdfGPU { for (size_t i = 0; i < args.inputs.size(); i++) { if (i < args.num_outputs) { auto out = args.outputs[i].write_accessor(rect); - *reinterpret_cast(p) = out.ptr(rect); + *reinterpret_cast(p) = out.ptr(rect, strides); } else { auto in = args.inputs[i].read_accessor(rect); - *reinterpret_cast(p) = in.ptr(rect); + *reinterpret_cast(p) = in.ptr(rect, strides); } p += sizeof(void*); } @@ -133,10 +134,27 @@ struct EvalUdfGPU { p += sizeof(size_t); Pitches pitches; size_t volume = pitches.flatten(rect); - *reinterpret_cast(p) =pitches.data(); + //create buffers for pitches, lower point and strides since + //we need to pass pointer to device memory + auto device_pitches = create_buffer(Point<1>(DIM-1), Memory::Kind::Z_COPY_MEM); + auto device_lo = create_buffer(Point<1>(DIM), Memory::Kind::Z_COPY_MEM); + auto device_strides = create_buffer(Point<1>(DIM), Memory::Kind::Z_COPY_MEM); + //std::cout<<"IRINA DEBUG"<(i)]=pitches.data()[i]; + //std::cout<<" pitches ="<(i)]=rect.lo[i]; + device_strides[Point<1>(i)] = strides[i]; + //std::cout<<" device_lo = " < str: @@ -281,7 +276,6 @@ def _emit_assignment( _emit_assignment(self._argnames[arg_idx], arg_idx, _SIZE_VAR, scalar_type) arg_idx += 1 - # Main loop lines.append(" for {} in range({}):".format(_LOOP_VAR, _SIZE_VAR)) @@ -361,31 +355,40 @@ def __call__(self, *args: Any, **kwargs: Any) -> None: Return arrays with the results of `pyfunc` broadcast (vectorized) over `args` and `kwargs` not in `excluded`. 
""" - #self._args = list( - # convert_to_cunumeric_ndarray(arg) if (arg is not None and np.ndim(Arg)>0) - # for (idx, arg) in enumerate(args) - #) - for i,arg in enumerate(args): - if arg is None: - raise ValueError( - "None is not supported in user function " - "passed to cunumeric.vectorize" + if not self._created: + self._scalar_args.clear() + self._scalar_idxs.clear() + self._args.clear() + self._argnames.clear() + self._scalar_names.clear() + + for i,arg in enumerate(args): + if arg is None: + raise ValueError( + "None is not supported in user function " + "passed to cunumeric.vectorize" + ) + elif np.ndim(arg)==0: + self._scalar_args.append(arg) + self._scalar_idxs.append(i) + else: + self._args.append(convert_to_cunumeric_ndarray(arg)) + + #first fill arrays to argnames, then scalars: + for i,k in enumerate(inspect.signature(self._pyfunc).parameters): + if not(i in self._scalar_idxs): + self._argnames.append(k) + + for i,k in enumerate(inspect.signature(self._pyfunc).parameters): + if i in self._scalar_idxs: + self._scalar_names.append(k) + self._argnames.append(k) + + self._kwargs = list(kwargs) + if len(self._kwargs) > 1: + raise NotImplementedError( + "kwargs are not supported in user functions" ) - elif np.ndim(arg)==0: - self._scalar_args.append(arg) - self._scalar_idxs.append(i) - else: - self._args.append(convert_to_cunumeric_ndarray(arg)) - - #first fill arrays to argnames, then scalars: - for i,k in enumerate(inspect.signature(self._pyfunc).parameters): - if not(i in self._scalar_idxs): - self._argnames.append(k) - - for i,k in enumerate(inspect.signature(self._pyfunc).parameters): - if i in self._scalar_idxs: - self._scalar_names.append(k) - self._argnames.append(k) #all output arrays should have the same type if len(self._args)>0: @@ -412,12 +415,6 @@ def __call__(self, *args: Any, **kwargs: Any) -> None: "different shapes for arrays in " "user function passed to vectorize") - self._kwargs = list(kwargs) - if len(self._kwargs) > 1: - raise NotImplementedError( - "kwargs are not supported in user functions" - ) - if runtime.num_gpus > 0: if not self._created: self._numba_func = self._build_gpu_function() @@ -433,8 +430,3 @@ def __call__(self, *args: Any, **kwargs: Any) -> None: self._created = True self._execute(False) - self._args.clear() - self._scalar_args.clear() - self._scalar_idxs.clear() - self._argnames.clear() - self._scalar_names.clear() diff --git a/tests/integration/test_vectorize.py b/tests/integration/test_vectorize.py index fb19a85e69..903a8a8081 100644 --- a/tests/integration/test_vectorize.py +++ b/tests/integration/test_vectorize.py @@ -24,6 +24,10 @@ def my_func(a, b): a = a * 2 + b +def my_func_np(a, b): + a = a * 2 + b + return a + def test_vectorize(): func = num.vectorize(my_func) @@ -40,7 +44,38 @@ def test_vectorize(): a=num.array([[1,2,3],[4,5,6],[7,8,9]]) b=num.array([[10,11,12],[13,14,15],[16,17,18]]) func(a[:2],b[:2]) - print(a) + + a=np.arange(100).reshape((25,4)) + a_num= num.array(a) + + b=a*10 + b_num=a_num*10 + func_np = np.vectorize(my_func_np) + func_num=num.vectorize(my_func) + + a=func_np(a,b) + func_num(a_num, b_num) + assert np.array_equal(a, a_num) + + a[:,2]=func_np(a[:, 2], b[:,2]) + func_num(a_num[:,2],b_num[:,2]) + assert np.array_equal(a, a_num) + + a[5:10,2]=func_np(a[5:10, 2], b[1:6,2]) + func_num(a_num[5:10,2],b_num[1:6,2]) + assert np.array_equal(a, a_num) + + a[15:20]=func_np(a[15:20], b[15:20]) + func_num(a_num[15:20],b_num[15:20]) + assert np.array_equal(a, a_num) + + a=np.arange(1000).reshape((25,10,4)) + a_num= 
num.array(a) + + a[:, 2, :] = func_np(a[:, 2, :],2) + func_num(a_num[:, 2, :],2) + assert np.array_equal(a, a_num) + if __name__ == "__main__": import sys From c76b2fe8288d3a3f1618cbb02e42b419046ec486 Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Wed, 22 Feb 2023 09:07:39 -0800 Subject: [PATCH 17/78] hashing CUDA kernel for user function --- cunumeric/vectorize.py | 1 + src/cunumeric/vectorize/eval_udf.cu | 54 +++++++++++++++++++++++++++-- 2 files changed, 53 insertions(+), 2 deletions(-) diff --git a/cunumeric/vectorize.py b/cunumeric/vectorize.py index 59cc83dc91..7c3b26044d 100644 --- a/cunumeric/vectorize.py +++ b/cunumeric/vectorize.py @@ -417,6 +417,7 @@ def __call__(self, *args: Any, **kwargs: Any) -> None: if runtime.num_gpus > 0: if not self._created: + #print("IRINA DEBUG ptx is not created yet") self._numba_func = self._build_gpu_function() self._gpu_func = self._compile_func_gpu() if self._cache: diff --git a/src/cunumeric/vectorize/eval_udf.cu b/src/cunumeric/vectorize/eval_udf.cu index ea6d878254..c416852dcc 100644 --- a/src/cunumeric/vectorize/eval_udf.cu +++ b/src/cunumeric/vectorize/eval_udf.cu @@ -25,6 +25,45 @@ namespace cunumeric { using namespace Legion; using namespace legate; +class JITKernelStorage +{ + +private: + JITKernelStorage(){} + std::map jit_functions_; + +public: + JITKernelStorage( JITKernelStorage const&) = delete; + + void operator=(JITKernelStorage const&) = delete; + + static JITKernelStorage& get_instance(void){ + static JITKernelStorage instance; + return instance; + } + + bool registered_jit_funtion(size_t hash){ + return jit_functions_.find(hash)!=jit_functions_.end(); + }; + + CUfunction return_saved_jit_function(size_t hash){ + if ( + jit_functions_.find(hash)!=jit_functions_.end()) + return jit_functions_[hash]; + else + assert(false);//should never come here + } + + void add_jit_function(size_t hash, CUfunction func){ + if ( + jit_functions_.find(hash)!=jit_functions_.end()) + assert(false);// should never come here + else + jit_functions_.insert({hash, func}); + } +};//class JITKernelStorage + + struct EvalUdfGPU { template void operator()(EvalUdfArgs& args) const @@ -32,6 +71,17 @@ struct EvalUdfGPU { using VAL = legate_type_of; Rect rect; + JITKernelStorage& jit_storage =JITKernelStorage::get_instance(); + + std::hash hasher; + CUfunction func; + size_t ptx_hash = hasher(args.ptx); + //std::cout <<"IRINA DEBUG hash = "<str(); auto fun_name = matched_line.substr(matched_line.rfind(" ") + 1, matched_line.size()); - CUfunction func; result = cuModuleGetFunction(&func, module, fun_name.c_str()); #ifdef DEBUG_CUNUMERIC assert(result == CUDA_SUCCESS); #endif - + jit_storage.add_jit_function(ptx_hash, func); + } // 2: after fucntion is generated, we can execute it: // Filling up the buffer with arguments From c8e88472966fac2bcba72eae5c7a1e56e998f830 Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Wed, 22 Feb 2023 11:58:19 -0800 Subject: [PATCH 18/78] removing PTX from task arguments in the case the function was hashed --- cunumeric/vectorize.py | 27 +++++++++++++++++++++++++-- src/cunumeric/vectorize/eval_udf.cu | 17 +++++++++-------- src/cunumeric/vectorize/eval_udf.h | 1 + 3 files changed, 35 insertions(+), 10 deletions(-) diff --git a/cunumeric/vectorize.py b/cunumeric/vectorize.py index 7c3b26044d..e8bfa86400 100644 --- a/cunumeric/vectorize.py +++ b/cunumeric/vectorize.py @@ -13,6 +13,8 @@ # limitations under the License. 
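The JITKernelStorage singleton introduced in patch 17 memoizes the loaded CUfunction: the PTX text produced by numba is hashed once on the Python side and the task keeps a map from that hash to the already-built function, so repeated calls with cache=True skip module loading entirely. The behaviour amounts to a memo table; a minimal sketch, with load_module standing in for the cuModuleLoadDataEx plus cuModuleGetFunction sequence:

    _kernel_cache = {}

    def get_kernel(ptx, load_module):
        """Return the kernel for this PTX, building it at most once."""
        key = hash(ptx)                 # same idea as hash(self._gpu_func[0])
        if key not in _kernel_cache:
            _kernel_cache[key] = load_module(ptx)   # expensive module load
        return _kernel_cache[key]

    loads = []
    get_kernel("ptx-text", lambda p: loads.append(p) or "CUfunction")
    get_kernel("ptx-text", lambda p: loads.append(p) or "CUfunction")
    print(len(loads))   # 1 -- the second call reused the cached kernel
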
# +import cProfile, pstats + import inspect import re from typing import Any, Callable, Dict, List, Optional, Union @@ -329,8 +331,15 @@ def _compile_func_cpu(self) -> numba.core.ccallback.CFunc: def _execute(self, is_gpu:bool) -> None: task = self._context.create_auto_task(CuNumericOpCode.EVAL_UDF) if is_gpu: - task.add_scalar_arg(self._gpu_func[0], ty.string) + ptx_hash = hash(self._gpu_func[0]) + if self._created: + #use hashed ptx and CUfunction on the C++ side + str_tmp ="" + task.add_scalar_arg(str_tmp, ty.string) + else: + task.add_scalar_arg(self._gpu_func[0], ty.string) task.add_scalar_arg(self._num_outputs, ty.uint32) + task.add_scalar_arg(ptx_hash, ty.int64) else: task.add_scalar_arg(self._cpu_func.address, ty.uint64) # type : ignore task.add_scalar_arg(self._num_outputs, ty.uint32) @@ -355,6 +364,8 @@ def __call__(self, *args: Any, **kwargs: Any) -> None: Return arrays with the results of `pyfunc` broadcast (vectorized) over `args` and `kwargs` not in `excluded`. """ + #profiler = cProfile.Profile() + #profiler.enable() if not self._created: self._scalar_args.clear() self._scalar_idxs.clear() @@ -420,9 +431,15 @@ def __call__(self, *args: Any, **kwargs: Any) -> None: #print("IRINA DEBUG ptx is not created yet") self._numba_func = self._build_gpu_function() self._gpu_func = self._compile_func_gpu() + #profiler = cProfile.Profile() + #profiler.enable() + self._execute(True) + if not self._created: if self._cache: self._created = True - self._execute(True) + #profiler.disable() + #stats = pstats.Stats(profiler).sort_stats('cumtime') + #stats.print_stats() else: if not self._created: self._numba_func = self._build_cpu_function() @@ -431,3 +448,9 @@ def __call__(self, *args: Any, **kwargs: Any) -> None: self._created = True self._execute(False) + + #profiler.disable() + #stats = pstats.Stats(profiler).sort_stats('cumtime') + #stats.print_stats() + + diff --git a/src/cunumeric/vectorize/eval_udf.cu b/src/cunumeric/vectorize/eval_udf.cu index c416852dcc..63e6322bd9 100644 --- a/src/cunumeric/vectorize/eval_udf.cu +++ b/src/cunumeric/vectorize/eval_udf.cu @@ -73,15 +73,15 @@ struct EvalUdfGPU { JITKernelStorage& jit_storage =JITKernelStorage::get_instance(); - std::hash hasher; + //std::hash hasher; CUfunction func; - size_t ptx_hash = hasher(args.ptx); + //size_t ptx_hash = hasher(args.ptx); //std::cout <<"IRINA DEBUG hash = "<1);// in this case PTX string shouldn't be empty // 1: we need to vreate a function from the ptx generated y numba const unsigned num_options = 4; const size_t log_buffer_size = 16384; @@ -139,7 +139,7 @@ struct EvalUdfGPU { #ifdef DEBUG_CUNUMERIC assert(result == CUDA_SUCCESS); #endif - jit_storage.add_jit_function(ptx_hash, func); + jit_storage.add_jit_function(args.hash, func); } // 2: after fucntion is generated, we can execute it: @@ -240,7 +240,7 @@ struct EvalUdfGPU { /*static*/ void EvalUdfTask::gpu_variant(TaskContext& context) { std::vectorscalars; - for (size_t i=2; i(), - context.scalars()[1].value()}; + context.scalars()[1].value(), + context.scalars()[2].value()}; size_t dim=1; if (args.inputs.size()>0){ dim = args.inputs[0].dim() == 0 ? 
1 : args.inputs[0].dim(); diff --git a/src/cunumeric/vectorize/eval_udf.h b/src/cunumeric/vectorize/eval_udf.h index 243c439230..55a27bc200 100644 --- a/src/cunumeric/vectorize/eval_udf.h +++ b/src/cunumeric/vectorize/eval_udf.h @@ -28,6 +28,7 @@ struct EvalUdfArgs { std::vectorscalars; std::string ptx = ""; uint32_t num_outputs; + int64_t hash=0; }; class EvalUdfTask : public CuNumericTask { From 392b4ee8038d5363221f98180f302dd4d67c5539 Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Wed, 22 Feb 2023 13:09:54 -0800 Subject: [PATCH 19/78] fixing Torchswe test --- src/cunumeric/vectorize/eval_udf.cu | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/cunumeric/vectorize/eval_udf.cu b/src/cunumeric/vectorize/eval_udf.cu index 63e6322bd9..11c7ae8c45 100644 --- a/src/cunumeric/vectorize/eval_udf.cu +++ b/src/cunumeric/vectorize/eval_udf.cu @@ -30,7 +30,7 @@ class JITKernelStorage private: JITKernelStorage(){} - std::map jit_functions_; + std::map jit_functions_; public: JITKernelStorage( JITKernelStorage const&) = delete; @@ -42,11 +42,11 @@ public: return instance; } - bool registered_jit_funtion(size_t hash){ + bool registered_jit_funtion(int64_t hash){ return jit_functions_.find(hash)!=jit_functions_.end(); }; - CUfunction return_saved_jit_function(size_t hash){ + CUfunction return_saved_jit_function(int64_t hash){ if ( jit_functions_.find(hash)!=jit_functions_.end()) return jit_functions_[hash]; @@ -54,10 +54,10 @@ public: assert(false);//should never come here } - void add_jit_function(size_t hash, CUfunction func){ + void add_jit_function(int64_t hash, CUfunction func){ if ( jit_functions_.find(hash)!=jit_functions_.end()) - assert(false);// should never come here + //assert(false);// should never come here else jit_functions_.insert({hash, func}); } From 4278be6387da0268164f60c7a2b63417a17f4c68 Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Wed, 22 Feb 2023 13:35:58 -0800 Subject: [PATCH 20/78] fixing Torchswe test --- src/cunumeric/vectorize/eval_udf.cu | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/cunumeric/vectorize/eval_udf.cu b/src/cunumeric/vectorize/eval_udf.cu index 11c7ae8c45..c70ee27ffc 100644 --- a/src/cunumeric/vectorize/eval_udf.cu +++ b/src/cunumeric/vectorize/eval_udf.cu @@ -56,9 +56,7 @@ public: void add_jit_function(int64_t hash, CUfunction func){ if ( - jit_functions_.find(hash)!=jit_functions_.end()) - //assert(false);// should never come here - else + jit_functions_.find(hash)==jit_functions_.end()) jit_functions_.insert({hash, func}); } };//class JITKernelStorage From ff0782b62174de5259bd91c40ff07a5611310182 Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Wed, 22 Feb 2023 18:26:00 -0800 Subject: [PATCH 21/78] fixing Torchswe test --- src/cunumeric/vectorize/eval_udf.cu | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/cunumeric/vectorize/eval_udf.cu b/src/cunumeric/vectorize/eval_udf.cu index c70ee27ffc..4ebae74c40 100644 --- a/src/cunumeric/vectorize/eval_udf.cu +++ b/src/cunumeric/vectorize/eval_udf.cu @@ -55,8 +55,6 @@ public: } void add_jit_function(int64_t hash, CUfunction func){ - if ( - jit_functions_.find(hash)==jit_functions_.end()) jit_functions_.insert({hash, func}); } };//class JITKernelStorage From c772522242a7fcd55b88e1a1a4f59ec45fa2786d Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Wed, 22 Feb 2023 19:40:21 -0800 Subject: [PATCH 22/78] adding debug output --- cunumeric/vectorize.py | 6 +++--- src/cunumeric/vectorize/eval_udf.cu | 3 ++- 2 files changed, 5 insertions(+), 4 
deletions(-) diff --git a/cunumeric/vectorize.py b/cunumeric/vectorize.py index e8bfa86400..3192c540b1 100644 --- a/cunumeric/vectorize.py +++ b/cunumeric/vectorize.py @@ -332,6 +332,7 @@ def _execute(self, is_gpu:bool) -> None: task = self._context.create_auto_task(CuNumericOpCode.EVAL_UDF) if is_gpu: ptx_hash = hash(self._gpu_func[0]) + print("IRINA DEBUG hash =", ptx_hash) if self._created: #use hashed ptx and CUfunction on the C++ side str_tmp ="" @@ -434,9 +435,8 @@ def __call__(self, *args: Any, **kwargs: Any) -> None: #profiler = cProfile.Profile() #profiler.enable() self._execute(True) - if not self._created: - if self._cache: - self._created = True + if not self._created and self._cache: + self._created = True #profiler.disable() #stats = pstats.Stats(profiler).sort_stats('cumtime') #stats.print_stats() diff --git a/src/cunumeric/vectorize/eval_udf.cu b/src/cunumeric/vectorize/eval_udf.cu index 4ebae74c40..e006cac2c2 100644 --- a/src/cunumeric/vectorize/eval_udf.cu +++ b/src/cunumeric/vectorize/eval_udf.cu @@ -72,11 +72,12 @@ struct EvalUdfGPU { //std::hash hasher; CUfunction func; //size_t ptx_hash = hasher(args.ptx); - //std::cout <<"IRINA DEBUG hash = "< Date: Wed, 22 Feb 2023 22:34:25 -0800 Subject: [PATCH 24/78] removing debug output --- cunumeric/vectorize.py | 2 +- src/cunumeric/vectorize/eval_udf.cu | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/cunumeric/vectorize.py b/cunumeric/vectorize.py index 3192c540b1..b082a7a1cb 100644 --- a/cunumeric/vectorize.py +++ b/cunumeric/vectorize.py @@ -332,7 +332,7 @@ def _execute(self, is_gpu:bool) -> None: task = self._context.create_auto_task(CuNumericOpCode.EVAL_UDF) if is_gpu: ptx_hash = hash(self._gpu_func[0]) - print("IRINA DEBUG hash =", ptx_hash) + #print("IRINA DEBUG hash =", ptx_hash) if self._created: #use hashed ptx and CUfunction on the C++ side str_tmp ="" diff --git a/src/cunumeric/vectorize/eval_udf.cu b/src/cunumeric/vectorize/eval_udf.cu index de8ff0cbf0..a2859981d2 100644 --- a/src/cunumeric/vectorize/eval_udf.cu +++ b/src/cunumeric/vectorize/eval_udf.cu @@ -73,12 +73,12 @@ struct EvalUdfGPU { CUfunction func; std::pair key(args.hash, args.point); //size_t ptx_hash = hasher(args.ptx); - std::cout <<"IRINA DEBUG within cuda task hash = "<(); + uint32_t num_scalars = context.scalars()[1].value(); std::vectorscalars; - for (size_t i=2; i(), + + EvalUdfArgs args{context.scalars()[2+num_scalars].value(), context.inputs(), context.outputs(), scalars, - tmp, - context.scalars()[1].value(), + num_outputs, context.get_task_index()}; size_t dim=1; if (args.inputs.size()>0){ diff --git a/src/cunumeric/vectorize/eval_udf.cu b/src/cunumeric/vectorize/eval_udf.cu index a2859981d2..9c6c310d07 100644 --- a/src/cunumeric/vectorize/eval_udf.cu +++ b/src/cunumeric/vectorize/eval_udf.cu @@ -237,24 +237,33 @@ struct EvalUdfGPU { /*static*/ void EvalUdfTask::gpu_variant(TaskContext& context) { + + uint32_t num_outputs = context.scalars()[0].value(); + uint32_t num_scalars = context.scalars()[1].value(); std::vectorscalars; - for (size_t i=3; i(); + bool is_created = context.scalars()[3+num_scalars].value(); + EvalUdfArgs args{0, context.inputs(), context.outputs(), scalars, - context.scalars()[0].value(), - context.scalars()[1].value(), + num_outputs, context.get_task_index(), - context.scalars()[2].value()}; + ptx_hash}; + if (!is_created) + args.ptx = context.scalars()[4+num_scalars].value(); size_t dim=1; if (args.inputs.size()>0){ dim = args.inputs[0].dim() == 0 ? 
1 : args.inputs[0].dim(); double_dispatch(dim, args.inputs[0].code(), EvalUdfGPU{}, args); } else{ + //FIXME double_dispatch(dim, args.inputs[0].code(), EvalUdfGPU{}, args); //double_dispatch(dim, 0 , EvalUdfGPU{}, args); } diff --git a/src/cunumeric/vectorize/eval_udf.h b/src/cunumeric/vectorize/eval_udf.h index e8141ce754..aac5aade92 100644 --- a/src/cunumeric/vectorize/eval_udf.h +++ b/src/cunumeric/vectorize/eval_udf.h @@ -26,10 +26,11 @@ struct EvalUdfArgs { std::vector& inputs; std::vector& outputs; std::vectorscalars; - std::string ptx = ""; uint32_t num_outputs; Legion::DomainPoint point; int64_t hash=0; + std::string ptx = ""; + }; class EvalUdfTask : public CuNumericTask { From 798289ff795e9d8f59e952b3c514a356959c6e19 Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Fri, 24 Feb 2023 11:27:04 -0800 Subject: [PATCH 26/78] split Cuda kerenel generation with kernel execution --- cunumeric/config.py | 2 + cunumeric/vectorize.py | 19 +++- cunumeric_cpp.cmake | 2 + src/cunumeric/cuda_help.h | 37 +++++++ src/cunumeric/cunumeric_c.h | 1 + src/cunumeric/vectorize/create_cu_kernel.cc | 38 ++++++++ src/cunumeric/vectorize/create_cu_kernel.cu | 97 ++++++++++++++++++ src/cunumeric/vectorize/create_cu_kernel.h | 38 ++++++++ src/cunumeric/vectorize/eval_udf.cu | 103 +------------------- 9 files changed, 236 insertions(+), 101 deletions(-) create mode 100644 src/cunumeric/vectorize/create_cu_kernel.cc create mode 100644 src/cunumeric/vectorize/create_cu_kernel.cu create mode 100644 src/cunumeric/vectorize/create_cu_kernel.h diff --git a/cunumeric/config.py b/cunumeric/config.py index 14cb1b6434..158dd68016 100644 --- a/cunumeric/config.py +++ b/cunumeric/config.py @@ -143,6 +143,7 @@ class _CunumericSharedLib: CUNUMERIC_CONVERT_NAN_PROD: int CUNUMERIC_CONVERT_NAN_SUM: int CUNUMERIC_CONVOLVE: int + CUNUMERIC_CREATE_CU_KERNEL: int CUNUMERIC_DIAG: int CUNUMERIC_DOT: int CUNUMERIC_EYE: int @@ -346,6 +347,7 @@ class CuNumericOpCode(IntEnum): CONTRACT = _cunumeric.CUNUMERIC_CONTRACT CONVERT = _cunumeric.CUNUMERIC_CONVERT CONVOLVE = _cunumeric.CUNUMERIC_CONVOLVE + CREATE_CU_KERNEL = _cunumeric.CUNUMERIC_CREATE_CU_KERNEL DIAG = _cunumeric.CUNUMERIC_DIAG DOT = _cunumeric.CUNUMERIC_DOT EYE = _cunumeric.CUNUMERIC_EYE diff --git a/cunumeric/vectorize.py b/cunumeric/vectorize.py index 9129ab784d..a1d57ec454 100644 --- a/cunumeric/vectorize.py +++ b/cunumeric/vectorize.py @@ -329,9 +329,26 @@ def _compile_func_cpu(self) -> numba.core.ccallback.CFunc: return numba.cfunc(sig)(self._numba_func) def _execute(self, is_gpu:bool) -> None: + if is_gpu and not self._created: + #create CUDA kernel + kernel_task = self._context.create_auto_task(CuNumericOpCode.CREATE_CU_KERNEL) + ptx_hash = hash(self._gpu_func[0]) + kernel_task.add_scalar_arg(ptx_hash, ty.int64) + kernel_task.add_scalar_arg(self._gpu_func[0], ty.string) + #adding unused array for creating correct launch domain + #and set up dependency between kernel_task and task + if len(self._args)>0: + a0 = self._args[0]._thunk + a0 = runtime.to_deferred_array(a0) + kernel_task.add_input(a0.base) + kernel_task.add_output(a0.base) + kernel_task.execute() + + task = self._context.create_auto_task(CuNumericOpCode.EVAL_UDF) task.add_scalar_arg(self._num_outputs, ty.uint32) task.add_scalar_arg(len(self._scalar_args), ty.uint32) + for a in self._scalar_args: dtype = convert_to_cunumeric_dtype(type(a).__name__) task.add_scalar_arg(a,dtype) @@ -340,8 +357,6 @@ def _execute(self, is_gpu:bool) -> None: ptx_hash = hash(self._gpu_func[0]) task.add_scalar_arg(ptx_hash, 
ty.int64) task.add_scalar_arg(self._created, bool) - if not self._created: - task.add_scalar_arg(self._gpu_func[0], ty.string) else: task.add_scalar_arg(self._cpu_func.address, ty.uint64) # type : ignore a0 = self._args[0]._thunk diff --git a/cunumeric_cpp.cmake b/cunumeric_cpp.cmake index bdfa4c163a..39115eeb72 100644 --- a/cunumeric_cpp.cmake +++ b/cunumeric_cpp.cmake @@ -161,6 +161,7 @@ list(APPEND cunumeric_SOURCES src/cunumeric/cephes/chbevl.cc src/cunumeric/cephes/i0.cc src/cunumeric/vectorize/eval_udf.cc + src/cunumeric/vectorize/create_cu_kernel.cc ) if(Legion_USE_OpenMP) @@ -260,6 +261,7 @@ if(Legion_USE_CUDA) src/cunumeric/cudalibs.cu src/cunumeric/cunumeric.cu src/cunumeric/vectorize/eval_udf.cu + src/cunumeric/vectorize/create_cu_kernel.cu ) endif() diff --git a/src/cunumeric/cuda_help.h b/src/cunumeric/cuda_help.h index 63bd6d4e17..2e67366a20 100644 --- a/src/cunumeric/cuda_help.h +++ b/src/cunumeric/cuda_help.h @@ -390,4 +390,41 @@ __device__ __forceinline__ void store_streaming(double* ptr, double valu asm volatile("st.global.cs.f64 [%0], %1;" : : "l"(ptr), "d"(value) : "memory"); } +#include + +class JITKernelStorage +{ + +private: + JITKernelStorage(){} + std::map, CUfunction> jit_functions_; + +public: + JITKernelStorage( JITKernelStorage const&) = delete; + + void operator=(JITKernelStorage const&) = delete; + + static JITKernelStorage& get_instance(void){ + static JITKernelStorage instance; + return instance; + } + + bool registered_jit_funtion(std::pair &key){ + return jit_functions_.find(key)!=jit_functions_.end(); + }; + + CUfunction return_saved_jit_function(std::pair &key){ + if ( + jit_functions_.find(key)!=jit_functions_.end()) + return jit_functions_[key]; + else + assert(false);//should never come here + } + + void add_jit_function(std::pair &key, CUfunction func){ + jit_functions_.insert({key, func}); + } +};//class JITKernelStorage + + } // namespace cunumeric diff --git a/src/cunumeric/cunumeric_c.h b/src/cunumeric/cunumeric_c.h index 42c0bc955e..7b055f82f6 100644 --- a/src/cunumeric/cunumeric_c.h +++ b/src/cunumeric/cunumeric_c.h @@ -37,6 +37,7 @@ enum CuNumericOpCode { CUNUMERIC_CONTRACT, CUNUMERIC_CONVERT, CUNUMERIC_CONVOLVE, + CUNUMERIC_CREATE_CU_KERNEL, CUNUMERIC_SCAN_GLOBAL, CUNUMERIC_SCAN_LOCAL, CUNUMERIC_DIAG, diff --git a/src/cunumeric/vectorize/create_cu_kernel.cc b/src/cunumeric/vectorize/create_cu_kernel.cc new file mode 100644 index 0000000000..1d6b5d3950 --- /dev/null +++ b/src/cunumeric/vectorize/create_cu_kernel.cc @@ -0,0 +1,38 @@ +/* Copyright 20223 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + */ + +#include "cunumeric/vectorize/create_cu_kernel.h" + +namespace cunumeric { + +using namespace Legion; +using namespace legate; + + +/*static*/ void CreateCUKernelTask::cpu_variant(TaskContext& context) +{ +} +/*static*/ void CreateCUKernelTask::omp_variant(TaskContext& context) +{ +} + + +namespace // unnamed +{ +static void __attribute__((constructor)) register_tasks(void) { CreateCUKernelTask::register_variants(); } +} // namespace + +} // namespace cunumeric diff --git a/src/cunumeric/vectorize/create_cu_kernel.cu b/src/cunumeric/vectorize/create_cu_kernel.cu new file mode 100644 index 0000000000..b3ba432cc2 --- /dev/null +++ b/src/cunumeric/vectorize/create_cu_kernel.cu @@ -0,0 +1,97 @@ +/* Copyright 2023 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +#include "cunumeric/vectorize/create_cu_kernel.h" +#include "cunumeric/cuda_help.h" +#include +#include + +namespace cunumeric { + +using namespace Legion; +using namespace legate; + +/*static*/ void CreateCUKernelTask::gpu_variant(TaskContext& context) +{ + + int64_t ptx_hash = context.scalars()[0].value(); + std::string ptx = context.scalars()[1].value(); + DomainPoint point = context.get_task_index(); + JITKernelStorage& jit_storage =JITKernelStorage::get_instance(); + + CUfunction func; + std::pair key(ptx_hash, point); + if (!jit_storage.registered_jit_funtion(key)){ + const unsigned num_options = 4; + const size_t log_buffer_size = 16384; + std::vector log_info_buffer(log_buffer_size); + std::vector log_error_buffer(log_buffer_size); + CUjit_option jit_options[] = { + CU_JIT_INFO_LOG_BUFFER, + CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES, + CU_JIT_ERROR_LOG_BUFFER, + CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES, + }; + void* option_vals[] = { + static_cast(log_info_buffer.data()), + reinterpret_cast(log_buffer_size), + static_cast(log_error_buffer.data()), + reinterpret_cast(log_buffer_size), + }; + + CUmodule module; + CUresult result = + cuModuleLoadDataEx(&module, ptx.data(), num_options, jit_options, option_vals); + if (result != CUDA_SUCCESS) { + if (result == CUDA_ERROR_OPERATING_SYSTEM) { + fprintf(stderr, + "ERROR: Device side asserts are not supported by the " + "CUDA driver for MAC OSX, see NVBugs 1628896.\n"); + exit(-1); + } else if (result == CUDA_ERROR_NO_BINARY_FOR_GPU) { + fprintf(stderr, "ERROR: The binary was compiled for the wrong GPU architecture.\n"); + exit(-1); + } else { + fprintf(stderr, "Failed to load CUDA module! 
Error log: %s\n", log_error_buffer.data()); +#if CUDA_VERSION >= 6050 + const char *name, *str; + assert(cuGetErrorName(result, &name) == CUDA_SUCCESS); + assert(cuGetErrorString(result, &str) == CUDA_SUCCESS); + fprintf(stderr, "CU: cuModuleLoadDataEx = %d (%s): %s\n", result, name, str); +#else + fprintf(stderr, "CU: cuModuleLoadDataEx = %d\n", result); +#endif + exit(-1); + } + } + std::cmatch line_match; + bool match = + std::regex_search(ptx.data(), line_match, std::regex(".visible .entry [_a-zA-Z0-9$]+")); +#ifdef DEBUG_CUNUMERIC + assert(match); +#endif + const auto& matched_line = line_match.begin()->str(); + auto fun_name = matched_line.substr(matched_line.rfind(" ") + 1, matched_line.size()); + + result = cuModuleGetFunction(&func, module, fun_name.c_str()); +#ifdef DEBUG_CUNUMERIC + assert(result == CUDA_SUCCESS); +#endif + jit_storage.add_jit_function(key, func); + } +} + +} // namespace cunumeric diff --git a/src/cunumeric/vectorize/create_cu_kernel.h b/src/cunumeric/vectorize/create_cu_kernel.h new file mode 100644 index 0000000000..7b1e176756 --- /dev/null +++ b/src/cunumeric/vectorize/create_cu_kernel.h @@ -0,0 +1,38 @@ +/* Copyright 2023 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +#pragma once + +#include "cunumeric/cunumeric.h" +#include "core/data/scalar.h" + +namespace cunumeric { + +class CreateCUKernelTask : public CuNumericTask { + public: + static const int TASK_ID = CUNUMERIC_CREATE_CU_KERNEL; + + public: + static void cpu_variant(legate::TaskContext& context); +#ifdef LEGATE_USE_OPENMP + static void omp_variant(legate::TaskContext& context); +#endif +#ifdef LEGATE_USE_CUDA + static void gpu_variant(legate::TaskContext& context); +#endif +}; + +} // namespace cunumeric diff --git a/src/cunumeric/vectorize/eval_udf.cu b/src/cunumeric/vectorize/eval_udf.cu index 9c6c310d07..12777df1d2 100644 --- a/src/cunumeric/vectorize/eval_udf.cu +++ b/src/cunumeric/vectorize/eval_udf.cu @@ -25,41 +25,6 @@ namespace cunumeric { using namespace Legion; using namespace legate; -class JITKernelStorage -{ - -private: - JITKernelStorage(){} - std::map, CUfunction> jit_functions_; - -public: - JITKernelStorage( JITKernelStorage const&) = delete; - - void operator=(JITKernelStorage const&) = delete; - - static JITKernelStorage& get_instance(void){ - static JITKernelStorage instance; - return instance; - } - - bool registered_jit_funtion(std::pair &key){ - return jit_functions_.find(key)!=jit_functions_.end(); - }; - - CUfunction return_saved_jit_function(std::pair &key){ - if ( - jit_functions_.find(key)!=jit_functions_.end()) - return jit_functions_[key]; - else - assert(false);//should never come here - } - - void add_jit_function(std::pair &key, CUfunction func){ - jit_functions_.insert({key, func}); - } -};//class JITKernelStorage - - struct EvalUdfGPU { template void operator()(EvalUdfArgs& args) const @@ -78,68 +43,8 @@ struct EvalUdfGPU { func = jit_storage.return_saved_jit_function(key); } else{ - //std::cout <<"IRINA DEBUG PTX code size 
within cuda task = "<1);// in this case PTX string shouldn't be empty - // 1: we need to vreate a function from the ptx generated y numba - const unsigned num_options = 4; - const size_t log_buffer_size = 16384; - std::vector log_info_buffer(log_buffer_size); - std::vector log_error_buffer(log_buffer_size); - CUjit_option jit_options[] = { - CU_JIT_INFO_LOG_BUFFER, - CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES, - CU_JIT_ERROR_LOG_BUFFER, - CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES, - }; - void* option_vals[] = { - static_cast(log_info_buffer.data()), - reinterpret_cast(log_buffer_size), - static_cast(log_error_buffer.data()), - reinterpret_cast(log_buffer_size), - }; - - CUmodule module; - CUresult result = - cuModuleLoadDataEx(&module, args.ptx.data(), num_options, jit_options, option_vals); - if (result != CUDA_SUCCESS) { - if (result == CUDA_ERROR_OPERATING_SYSTEM) { - fprintf(stderr, - "ERROR: Device side asserts are not supported by the " - "CUDA driver for MAC OSX, see NVBugs 1628896.\n"); - exit(-1); - } else if (result == CUDA_ERROR_NO_BINARY_FOR_GPU) { - fprintf(stderr, "ERROR: The binary was compiled for the wrong GPU architecture.\n"); - exit(-1); - } else { - fprintf(stderr, "Failed to load CUDA module! Error log: %s\n", log_error_buffer.data()); -#if CUDA_VERSION >= 6050 - const char *name, *str; - assert(cuGetErrorName(result, &name) == CUDA_SUCCESS); - assert(cuGetErrorString(result, &str) == CUDA_SUCCESS); - fprintf(stderr, "CU: cuModuleLoadDataEx = %d (%s): %s\n", result, name, str); -#else - fprintf(stderr, "CU: cuModuleLoadDataEx = %d\n", result); -#endif - exit(-1); - } - } - - std::cmatch line_match; - bool match = - std::regex_search(args.ptx.data(), line_match, std::regex(".visible .entry [_a-zA-Z0-9$]+")); -#ifdef DEBUG_CUNUMERIC - assert(match); -#endif - const auto& matched_line = line_match.begin()->str(); - auto fun_name = matched_line.substr(matched_line.rfind(" ") + 1, matched_line.size()); - - result = cuModuleGetFunction(&func, module, fun_name.c_str()); -#ifdef DEBUG_CUNUMERIC - assert(result == CUDA_SUCCESS); -#endif - jit_storage.add_jit_function(key, func); + assert(false); //should never come here } - // 2: after fucntion is generated, we can execute it: // Filling up the buffer with arguments size_t buffer_size = (args.inputs.size()+args.scalars.size()) * sizeof(void*); @@ -245,7 +150,7 @@ struct EvalUdfGPU { scalars.push_back(context.scalars()[i]); int64_t ptx_hash = context.scalars()[2+num_scalars].value(); - bool is_created = context.scalars()[3+num_scalars].value(); + // bool is_created = context.scalars()[3+num_scalars].value(); EvalUdfArgs args{0, @@ -255,8 +160,8 @@ struct EvalUdfGPU { num_outputs, context.get_task_index(), ptx_hash}; - if (!is_created) - args.ptx = context.scalars()[4+num_scalars].value(); + //if (!is_created) + // args.ptx = context.scalars()[4+num_scalars].value(); size_t dim=1; if (args.inputs.size()>0){ dim = args.inputs[0].dim() == 0 ? 
1 : args.inputs[0].dim(); From 591fe8eaa7e19774b2fb0eed5463de5531452b31 Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Mon, 27 Feb 2023 13:23:18 -0800 Subject: [PATCH 27/78] fixing errors after merge --- src/cunumeric/vectorize/eval_udf.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cunumeric/vectorize/eval_udf.cu b/src/cunumeric/vectorize/eval_udf.cu index 12777df1d2..e307982be3 100644 --- a/src/cunumeric/vectorize/eval_udf.cu +++ b/src/cunumeric/vectorize/eval_udf.cu @@ -22,7 +22,7 @@ namespace cunumeric { -using namespace Legion; +//using namespace Legion; using namespace legate; struct EvalUdfGPU { From 7b56a5cfa0d096abef56c20602fa011eda7e80c3 Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Tue, 28 Feb 2023 08:34:10 -0800 Subject: [PATCH 28/78] fixing logic for generating numba function --- cunumeric/vectorize.py | 18 +----------------- 1 file changed, 1 insertion(+), 17 deletions(-) diff --git a/cunumeric/vectorize.py b/cunumeric/vectorize.py index a1d57ec454..e071c4c12d 100644 --- a/cunumeric/vectorize.py +++ b/cunumeric/vectorize.py @@ -185,24 +185,8 @@ def _get_func_body(self, func: Callable[[Any], Any]) -> list[str]: def _replace_name(self, name: str, _LOOP_VAR:str, is_gpu:bool=False) -> str: if name in self._argnames and not(name in self._scalar_names) : return "{}[{}]".format(name, _LOOP_VAR) - elif name == "if": - return "if " - elif name == "return": - return "return " - elif name == "or": - return "or " - elif name == "and": - return "and " - elif name == "not": - return "not " - elif name == "min": - return "min" - elif name == "max": - return "max" - elif is_gpu: - return "{}".format(name) else: - return "{}[0]".format(name) + return "{}".format(name) def _build_gpu_function(self) -> Any: From 57bc2b3d67032bc3a16917f1bf39daafad1490ed Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Tue, 28 Feb 2023 13:04:17 -0800 Subject: [PATCH 29/78] adding support for upper-case Letters in argument names in UDF --- cunumeric/vectorize.py | 6 ++++-- tests/integration/test_vectorize.py | 22 ++++++++++++++++++++++ 2 files changed, 26 insertions(+), 2 deletions(-) diff --git a/cunumeric/vectorize.py b/cunumeric/vectorize.py index e071c4c12d..bfa504e234 100644 --- a/cunumeric/vectorize.py +++ b/cunumeric/vectorize.py @@ -183,6 +183,8 @@ def _get_func_body(self, func: Callable[[Any], Any]) -> list[str]: return return_lines def _replace_name(self, name: str, _LOOP_VAR:str, is_gpu:bool=False) -> str: + print("IRINA DEBUG ARGNAMES =", self._argnames) + print("IRINA DEBUG SCALAR_NAMES =", self._scalar_names) if name in self._argnames and not(name in self._scalar_names) : return "{}[{}]".format(name, _LOOP_VAR) else: @@ -220,11 +222,11 @@ def _lift_to_array_access(m: Any) -> str: # kernel body lines_old = self._get_func_body(self._pyfunc) for line in lines_old: - l_new = re.sub(r"[_a-z]\w*", _lift_to_array_access, line) + l_new = re.sub(r"[_a-zA-Z]\w*", _lift_to_array_access, line) lines.append(l_new) #print("IRINA DEBUG GPU function",lines) - + # Evaluate the string to get the Python function body = "\n".join(lines) glbs: Dict[str, Any] = {} diff --git a/tests/integration/test_vectorize.py b/tests/integration/test_vectorize.py index 903a8a8081..0dc8e7b6a7 100644 --- a/tests/integration/test_vectorize.py +++ b/tests/integration/test_vectorize.py @@ -29,6 +29,14 @@ def my_func_np(a, b): return a +def my_func2(A0, B0): + A0 = A0 * 2 + B0 + +def my_func_np2(A0, B0): + A0 = A0 * 2 + B0 + return A0 + + def test_vectorize(): func = num.vectorize(my_func) a = 
num.arange(5) @@ -75,6 +83,20 @@ def test_vectorize(): a[:, 2, :] = func_np(a[:, 2, :],2) func_num(a_num[:, 2, :],2) assert np.array_equal(a, a_num) + + a=np.arange(100).reshape((25,4)) + a_num= num.array(a) + + b=a*10 + b_num=a_num*10 + + func_np = np.vectorize(my_func_np2) + func_num=num.vectorize(my_func2) + + a=func_np(a,b) + func_num(a_num, b_num) + assert np.array_equal(a, a_num) + if __name__ == "__main__": From cd6d0256752f5db00ebb10b43491e3d214581a36 Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Tue, 28 Feb 2023 15:25:42 -0800 Subject: [PATCH 30/78] fixing CUfunction caching logic --- cunumeric/vectorize.py | 30 +++++++++++---------- src/cunumeric/cuda_help.h | 8 +++--- src/cunumeric/vectorize/create_cu_kernel.cu | 4 +-- src/cunumeric/vectorize/eval_udf.cc | 2 +- src/cunumeric/vectorize/eval_udf.cu | 21 ++++++++++----- src/cunumeric/vectorize/eval_udf.h | 5 ++-- 6 files changed, 39 insertions(+), 31 deletions(-) diff --git a/cunumeric/vectorize.py b/cunumeric/vectorize.py index bfa504e234..d970095c25 100644 --- a/cunumeric/vectorize.py +++ b/cunumeric/vectorize.py @@ -20,6 +20,7 @@ from typing import Any, Callable, Dict, List, Optional, Union import legate.core.types as ty +from legate.core import Rect import numba import numba.core.ccallback import numpy as np @@ -183,8 +184,8 @@ def _get_func_body(self, func: Callable[[Any], Any]) -> list[str]: return return_lines def _replace_name(self, name: str, _LOOP_VAR:str, is_gpu:bool=False) -> str: - print("IRINA DEBUG ARGNAMES =", self._argnames) - print("IRINA DEBUG SCALAR_NAMES =", self._scalar_names) + #print("IRINA DEBUG ARGNAMES =", self._argnames) + #print("IRINA DEBUG SCALAR_NAMES =", self._scalar_names) if name in self._argnames and not(name in self._scalar_names) : return "{}[{}]".format(name, _LOOP_VAR) else: @@ -314,20 +315,19 @@ def _compile_func_cpu(self) -> numba.core.ccallback.CFunc: return numba.cfunc(sig)(self._numba_func) - def _execute(self, is_gpu:bool) -> None: + def _execute(self, is_gpu:bool, num_gpus:int=0) -> None: if is_gpu and not self._created: - #create CUDA kernel - kernel_task = self._context.create_auto_task(CuNumericOpCode.CREATE_CU_KERNEL) + # create future for dependency between CREATE_CU_KERNEL and + # EVAL_UDF tasks + future = convert_to_cunumeric_ndarray(num_gpus) + future_deferred = runtime.to_deferred_array(future._thunk) + # create CUDA kernel + launch_domain=Rect(lo=(0,), hi=(num_gpus,)) + kernel_task = self._context.create_task(CuNumericOpCode.CREATE_CU_KERNEL,manual=True, launch_domain=launch_domain) ptx_hash = hash(self._gpu_func[0]) kernel_task.add_scalar_arg(ptx_hash, ty.int64) kernel_task.add_scalar_arg(self._gpu_func[0], ty.string) - #adding unused array for creating correct launch domain - #and set up dependency between kernel_task and task - if len(self._args)>0: - a0 = self._args[0]._thunk - a0 = runtime.to_deferred_array(a0) - kernel_task.add_input(a0.base) - kernel_task.add_output(a0.base) + kernel_task.add_output(future_deferred.base) kernel_task.execute() @@ -342,7 +342,7 @@ def _execute(self, is_gpu:bool) -> None: if is_gpu: ptx_hash = hash(self._gpu_func[0]) task.add_scalar_arg(ptx_hash, ty.int64) - task.add_scalar_arg(self._created, bool) + task.add_scalar_arg((is_gpu and not self._created), bool) else: task.add_scalar_arg(self._cpu_func.address, ty.uint64) # type : ignore a0 = self._args[0]._thunk @@ -355,6 +355,8 @@ def _execute(self, is_gpu:bool) -> None: task.add_output(a_tmp) if count != 0: task.add_alignment(a0.base, a_tmp) + if is_gpu and not self._created: + 
task.add_input(future_deferred.base) task.execute() def __call__(self, *args: Any, **kwargs: Any) -> None: @@ -431,7 +433,7 @@ def __call__(self, *args: Any, **kwargs: Any) -> None: self._gpu_func = self._compile_func_gpu() #profiler = cProfile.Profile() #profiler.enable() - self._execute(True) + self._execute(True, runtime.num_gpus) if not self._created and self._cache: self._created = True #profiler.disable() diff --git a/src/cunumeric/cuda_help.h b/src/cunumeric/cuda_help.h index 497815bad3..ce4504d4c0 100644 --- a/src/cunumeric/cuda_help.h +++ b/src/cunumeric/cuda_help.h @@ -397,7 +397,7 @@ class JITKernelStorage private: JITKernelStorage(){} - std::map, CUfunction> jit_functions_; + std::map, CUfunction> jit_functions_; public: JITKernelStorage( JITKernelStorage const&) = delete; @@ -409,11 +409,11 @@ class JITKernelStorage return instance; } - bool registered_jit_funtion(std::pair &key){ + bool registered_jit_funtion(std::pair &key){ return jit_functions_.find(key)!=jit_functions_.end(); }; - CUfunction return_saved_jit_function(std::pair &key){ + CUfunction return_saved_jit_function(std::pair &key){ if ( jit_functions_.find(key)!=jit_functions_.end()) return jit_functions_[key]; @@ -421,7 +421,7 @@ class JITKernelStorage assert(false);//should never come here } - void add_jit_function(std::pair &key, CUfunction func){ + void add_jit_function(std::pair &key, CUfunction func){ jit_functions_.insert({key, func}); } };//class JITKernelStorage diff --git a/src/cunumeric/vectorize/create_cu_kernel.cu b/src/cunumeric/vectorize/create_cu_kernel.cu index b3ba432cc2..61bc0124cd 100644 --- a/src/cunumeric/vectorize/create_cu_kernel.cu +++ b/src/cunumeric/vectorize/create_cu_kernel.cu @@ -29,11 +29,11 @@ using namespace legate; int64_t ptx_hash = context.scalars()[0].value(); std::string ptx = context.scalars()[1].value(); - DomainPoint point = context.get_task_index(); + Processor point = context.get_current_processor(); JITKernelStorage& jit_storage =JITKernelStorage::get_instance(); CUfunction func; - std::pair key(ptx_hash, point); + std::pair key(ptx_hash, point); if (!jit_storage.registered_jit_funtion(key)){ const unsigned num_options = 4; const size_t log_buffer_size = 16384; diff --git a/src/cunumeric/vectorize/eval_udf.cc b/src/cunumeric/vectorize/eval_udf.cc index bef822f464..92f8cd739f 100644 --- a/src/cunumeric/vectorize/eval_udf.cc +++ b/src/cunumeric/vectorize/eval_udf.cc @@ -66,7 +66,7 @@ struct EvalUdfCPU { context.outputs(), scalars, num_outputs, - context.get_task_index()}; + context.get_current_processor()}; size_t dim=1; if (args.inputs.size()>0){ dim = args.inputs[0].dim() == 0 ? 
1 : args.inputs[0].dim(); diff --git a/src/cunumeric/vectorize/eval_udf.cu b/src/cunumeric/vectorize/eval_udf.cu index e307982be3..428d8c5ed3 100644 --- a/src/cunumeric/vectorize/eval_udf.cu +++ b/src/cunumeric/vectorize/eval_udf.cu @@ -36,7 +36,7 @@ struct EvalUdfGPU { //std::hash hasher; CUfunction func; - std::pair key(args.hash, args.point); + std::pair key(args.hash, args.point); //size_t ptx_hash = hasher(args.ptx); //std::cout <<"IRINA DEBUG within cuda task hash = "<0){ rect = args.inputs[0].shape(); size = rect.volume(); - for (size_t i = 0; i < args.inputs.size(); i++) { + for (size_t i = 0; i < input_size; i++) { if (i < args.num_outputs) { auto out = args.outputs[i].write_accessor(rect); *reinterpret_cast(p) = out.ptr(rect, strides); @@ -128,12 +133,13 @@ struct EvalUdfGPU { auto stream = get_cached_stream(); + //std::cout <<"function = "<(); - // bool is_created = context.scalars()[3+num_scalars].value(); + bool is_created = context.scalars()[3+num_scalars].value(); EvalUdfArgs args{0, @@ -158,8 +164,9 @@ struct EvalUdfGPU { context.outputs(), scalars, num_outputs, - context.get_task_index(), - ptx_hash}; + context.get_current_processor(), + ptx_hash, + is_created}; //if (!is_created) // args.ptx = context.scalars()[4+num_scalars].value(); size_t dim=1; diff --git a/src/cunumeric/vectorize/eval_udf.h b/src/cunumeric/vectorize/eval_udf.h index aac5aade92..4d867a003f 100644 --- a/src/cunumeric/vectorize/eval_udf.h +++ b/src/cunumeric/vectorize/eval_udf.h @@ -27,10 +27,9 @@ struct EvalUdfArgs { std::vector& outputs; std::vectorscalars; uint32_t num_outputs; - Legion::DomainPoint point; + Legion::Processor point; int64_t hash=0; - std::string ptx = ""; - + bool is_created = false; }; class EvalUdfTask : public CuNumericTask { From c7b9a080c1445e7b5eab702d595079cd089fc506 Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Tue, 28 Feb 2023 16:40:38 -0800 Subject: [PATCH 31/78] adding debug output --- src/cunumeric/vectorize/create_cu_kernel.cu | 3 +++ src/cunumeric/vectorize/eval_udf.cu | 3 +++ 2 files changed, 6 insertions(+) diff --git a/src/cunumeric/vectorize/create_cu_kernel.cu b/src/cunumeric/vectorize/create_cu_kernel.cu index 61bc0124cd..eaddf6d3ec 100644 --- a/src/cunumeric/vectorize/create_cu_kernel.cu +++ b/src/cunumeric/vectorize/create_cu_kernel.cu @@ -90,6 +90,9 @@ using namespace legate; #ifdef DEBUG_CUNUMERIC assert(result == CUDA_SUCCESS); #endif + + std::cout <<"IRINA DEBUG create_func proc = "< Any: - funcid = "vectorized_{}".format(self._pyfunc.__name__) # Preamble lines = ["from numba import cuda"] # Signature - args = self._argnames + [_SIZE_VAR]+[_DIM_VAR]+[_PITCHES_VAR]+[_LO_POINT_VAR] +[_STRIDES_VAR] + args = ( + self._argnames + + [_SIZE_VAR] + + [_DIM_VAR] + + [_PITCHES_VAR] + + [_LO_POINT_VAR] + + [_STRIDES_VAR] + ) lines.append("def {}({}):".format(funcid, ",".join(args))) @@ -211,10 +218,24 @@ def _build_gpu_function(self) -> Any: lines.append(" return") lines.append(" {}:int = 0".format(_LOOP_VAR)) lines.append(" for p in range({}-1):".format(_DIM_VAR)) - lines.append(" x={}[p]+int(local_i/{}[p])".format(_LO_POINT_VAR,_PITCHES_VAR)) - lines.append(" local_i = local_i-{}[p]*int(local_i/{}[p])".format(_PITCHES_VAR,_PITCHES_VAR)) - lines.append(" {}+=int(x*{}[p])".format(_LOOP_VAR, _STRIDES_VAR)) - lines.append(" {}+=int(local_i*{}[{}-1])".format(_LOOP_VAR, _STRIDES_VAR, _DIM_VAR)) + lines.append( + " x={}[p]+int(local_i/{}[p])".format( + _LO_POINT_VAR, _PITCHES_VAR + ) + ) + lines.append( + " local_i = local_i-{}[p]*int(local_i/{}[p])".format( + 
_PITCHES_VAR, _PITCHES_VAR + ) + ) + lines.append( + " {}+=int(x*{}[p])".format(_LOOP_VAR, _STRIDES_VAR) + ) + lines.append( + " {}+=int(local_i*{}[{}-1])".format( + _LOOP_VAR, _STRIDES_VAR, _DIM_VAR + ) + ) # Kernel body def _lift_to_array_access(m: Any) -> str: @@ -225,9 +246,9 @@ def _lift_to_array_access(m: Any) -> str: for line in lines_old: l_new = re.sub(r"[_a-zA-Z]\w*", _lift_to_array_access, line) lines.append(l_new) - - #print("IRINA DEBUG GPU function",lines) - + + # print("IRINA DEBUG GPU function",lines) + # Evaluate the string to get the Python function body = "\n".join(lines) glbs: Dict[str, Any] = {} @@ -235,7 +256,6 @@ def _lift_to_array_access(m: Any) -> str: return glbs[funcid] def _build_cpu_function(self) -> Callable[[Any], Any]: - funcid = "vectorized_{}".format(self._pyfunc.__name__) # Preamble @@ -262,7 +282,9 @@ def _emit_assignment( arg_idx += 1 for a in self._scalar_args: scalar_type = np.dtype(type(a).__name__) - _emit_assignment(self._argnames[arg_idx], arg_idx, _SIZE_VAR, scalar_type) + _emit_assignment( + self._argnames[arg_idx], arg_idx, _SIZE_VAR, scalar_type + ) arg_idx += 1 # Main loop @@ -302,7 +324,14 @@ def _get_numba_types(self, need_pointer: bool = True) -> list[Any]: def _compile_func_gpu(self) -> tuple[Any]: types = self._get_numba_types() - arg_types = types + [numba.core.types.uint64] + [numba.core.types.uint64]+[numba.core.types.CPointer(numba.core.types.uint64)]+ [numba.core.types.CPointer(numba.core.types.uint64)]+[numba.core.types.CPointer(numba.core.types.uint64)] + arg_types = ( + types + + [numba.core.types.uint64] + + [numba.core.types.uint64] + + [numba.core.types.CPointer(numba.core.types.uint64)] + + [numba.core.types.CPointer(numba.core.types.uint64)] + + [numba.core.types.CPointer(numba.core.types.uint64)] + ) sig = (*arg_types,) cuda_arch = numba.cuda.get_current_device().compute_capability @@ -315,41 +344,47 @@ def _compile_func_cpu(self) -> numba.core.ccallback.CFunc: return numba.cfunc(sig)(self._numba_func) - def _execute(self, is_gpu:bool, num_gpus:int=0) -> None: + def _execute(self, is_gpu: bool, num_gpus: int = 0) -> None: if is_gpu and not self._created: - # create future for dependency between CREATE_CU_KERNEL and + # create future for dependency between CREATE_CU_KERNEL and # EVAL_UDF tasks - future = convert_to_cunumeric_ndarray(num_gpus) + future = convert_to_cunumeric_ndarray(num_gpus) future_deferred = runtime.to_deferred_array(future._thunk) # create CUDA kernel - launch_domain=Rect(lo=(0,), hi=(num_gpus,)) - kernel_task = self._context.create_task(CuNumericOpCode.CREATE_CU_KERNEL,manual=True, launch_domain=launch_domain) + launch_domain = Rect(lo=(0,), hi=(num_gpus,)) + kernel_task = self._context.create_task( + CuNumericOpCode.CREATE_CU_KERNEL, + manual=True, + launch_domain=launch_domain, + ) ptx_hash = hash(self._gpu_func[0]) kernel_task.add_scalar_arg(ptx_hash, ty.int64) kernel_task.add_scalar_arg(self._gpu_func[0], ty.string) - kernel_task.add_output(future_deferred.base) + kernel_task.add_input(future_deferred.base) kernel_task.execute() - + get_legate_runtime().issue_execution_fence(block=True) task = self._context.create_auto_task(CuNumericOpCode.EVAL_UDF) task.add_scalar_arg(self._num_outputs, ty.uint32) task.add_scalar_arg(len(self._scalar_args), ty.uint32) - + for a in self._scalar_args: dtype = convert_to_cunumeric_dtype(type(a).__name__) - task.add_scalar_arg(a,dtype) + task.add_scalar_arg(a, dtype) if is_gpu: ptx_hash = hash(self._gpu_func[0]) task.add_scalar_arg(ptx_hash, ty.int64) 
task.add_scalar_arg((is_gpu and not self._created), bool) else: - task.add_scalar_arg(self._cpu_func.address, ty.uint64) # type : ignore + task.add_scalar_arg( + self._cpu_func.address, ty.uint64 + ) # type : ignore a0 = self._args[0]._thunk a0 = runtime.to_deferred_array(a0) for count, a in enumerate(self._args): a_tmp = runtime.to_deferred_array(a._thunk) - a_tmp=a_tmp.base + a_tmp = a_tmp.base task.add_input(a_tmp) if count < self._num_outputs: task.add_output(a_tmp) @@ -364,8 +399,8 @@ def __call__(self, *args: Any, **kwargs: Any) -> None: Return arrays with the results of `pyfunc` broadcast (vectorized) over `args` and `kwargs` not in `excluded`. """ - #profiler = cProfile.Profile() - #profiler.enable() + # profiler = cProfile.Profile() + # profiler.enable() if not self._created: self._scalar_args.clear() self._scalar_idxs.clear() @@ -373,24 +408,24 @@ def __call__(self, *args: Any, **kwargs: Any) -> None: self._argnames.clear() self._scalar_names.clear() - for i,arg in enumerate(args): + for i, arg in enumerate(args): if arg is None: raise ValueError( "None is not supported in user function " "passed to cunumeric.vectorize" ) - elif np.ndim(arg)==0: + elif np.ndim(arg) == 0: self._scalar_args.append(arg) self._scalar_idxs.append(i) else: self._args.append(convert_to_cunumeric_ndarray(arg)) - #first fill arrays to argnames, then scalars: - for i,k in enumerate(inspect.signature(self._pyfunc).parameters): - if not(i in self._scalar_idxs): + # first fill arrays to argnames, then scalars: + for i, k in enumerate(inspect.signature(self._pyfunc).parameters): + if not (i in self._scalar_idxs): self._argnames.append(k) - for i,k in enumerate(inspect.signature(self._pyfunc).parameters): + for i, k in enumerate(inspect.signature(self._pyfunc).parameters): if i in self._scalar_idxs: self._scalar_names.append(k) self._argnames.append(k) @@ -401,44 +436,50 @@ def __call__(self, *args: Any, **kwargs: Any) -> None: "kwargs are not supported in user functions" ) - #all output arrays should have the same type - if len(self._args)>0: + # all output arrays should have the same type + if len(self._args) > 0: ty = self._args[0].dtype shape = self._args[0].shape - for i in range (1, self._num_outputs): - if ty!=self._args[i].dtype: - raise TypeError("cuNumeric doesnt support " + for i in range(1, self._num_outputs): + if ty != self._args[i].dtype: + raise TypeError( + "cuNumeric doesnt support " "different types for output data in " - "user function passed to vectorize") + "user function passed to vectorize" + ) if shape != self._args[i].shape: - raise TypeError("cuNumeric doesnt support " + raise TypeError( + "cuNumeric doesnt support " "different shapes for output data in " - "user function passed to vectorize") - for i in range (self._num_outputs, len(self._args)): - if ty!=self._args[i].dtype: + "user function passed to vectorize" + ) + for i in range(self._num_outputs, len(self._args)): + if ty != self._args[i].dtype: runtime.warn( "converting input array to output types in user func ", category=RuntimeWarning, ) self._args[i] = self._args[i].astype(ty) - if shape !=self._args[i].shape and np.ndim(self._args[i])>0: - raise TypeError("cuNumeric doesnt support " + if shape != self._args[i].shape and np.ndim(self._args[i]) > 0: + raise TypeError( + "cuNumeric doesnt support " "different shapes for arrays in " - "user function passed to vectorize") + "user function passed to vectorize" + ) if runtime.num_gpus > 0: if not self._created: - #print("IRINA DEBUG ptx is not created yet") + # print("IRINA DEBUG 
ptx is not created yet") self._numba_func = self._build_gpu_function() self._gpu_func = self._compile_func_gpu() - #profiler = cProfile.Profile() - #profiler.enable() + # profiler = cProfile.Profile() + # profiler.enable() self._execute(True, runtime.num_gpus) if not self._created and self._cache: self._created = True - #profiler.disable() - #stats = pstats.Stats(profiler).sort_stats('cumtime') - #stats.print_stats() + # profiler.disable() + # stats = pstats.Stats(profiler).sort_stats('cumtime') + # stats.print_stats() else: if not self._created: self._numba_func = self._build_cpu_function() @@ -447,9 +488,6 @@ def __call__(self, *args: Any, **kwargs: Any) -> None: self._created = True self._execute(False) - - #profiler.disable() - #stats = pstats.Stats(profiler).sort_stats('cumtime') - #stats.print_stats() - - + # profiler.disable() + # stats = pstats.Stats(profiler).sort_stats('cumtime') + # stats.print_stats() diff --git a/cunumeric_cpp.cmake b/cunumeric_cpp.cmake index c8d89694c6..e0305c4834 100644 --- a/cunumeric_cpp.cmake +++ b/cunumeric_cpp.cmake @@ -208,6 +208,7 @@ if(Legion_USE_OpenMP) src/cunumeric/convolution/convolve_omp.cc src/cunumeric/transform/flip_omp.cc src/cunumeric/vectorize/eval_udf_omp.cc + src/cunumeric/vectorize/create_cu_kernel_omp.cc ) endif() diff --git a/src/cunumeric/vectorize/create_cu_kernel.cc b/src/cunumeric/vectorize/create_cu_kernel.cc index 1d6b5d3950..effcb32c95 100644 --- a/src/cunumeric/vectorize/create_cu_kernel.cc +++ b/src/cunumeric/vectorize/create_cu_kernel.cc @@ -18,21 +18,16 @@ namespace cunumeric { -using namespace Legion; using namespace legate; +/*static*/ void CreateCUKernelTask::cpu_variant(TaskContext& context){}; -/*static*/ void CreateCUKernelTask::cpu_variant(TaskContext& context) +namespace // unnamed { -} -/*static*/ void CreateCUKernelTask::omp_variant(TaskContext& context) +static void __attribute__((constructor)) register_tasks(void) { + CreateCUKernelTask::register_variants(); } - - -namespace // unnamed -{ -static void __attribute__((constructor)) register_tasks(void) { CreateCUKernelTask::register_variants(); } } // namespace } // namespace cunumeric diff --git a/src/cunumeric/vectorize/create_cu_kernel_omp.cc b/src/cunumeric/vectorize/create_cu_kernel_omp.cc new file mode 100644 index 0000000000..40cc28f6c7 --- /dev/null +++ b/src/cunumeric/vectorize/create_cu_kernel_omp.cc @@ -0,0 +1,25 @@ +/* Copyright 20223 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + */ + +#include "cunumeric/vectorize/create_cu_kernel.h" + +namespace cunumeric { + +using namespace legate; + +/*static*/ void CreateCUKernelTask::omp_variant(TaskContext& context) {} + +} // namespace cunumeric From 3f8a3073ace4237d38735b1e509e2cf7554d5552 Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Wed, 1 Mar 2023 11:06:31 -0800 Subject: [PATCH 33/78] changing the way we store CUfunctions --- cunumeric/vectorize.py | 26 +++++++++++++-------- src/cunumeric/vectorize/create_cu_kernel.cu | 17 +++++++------- 2 files changed, 25 insertions(+), 18 deletions(-) diff --git a/cunumeric/vectorize.py b/cunumeric/vectorize.py index dec4afde86..7b1a9ecea2 100644 --- a/cunumeric/vectorize.py +++ b/cunumeric/vectorize.py @@ -24,7 +24,7 @@ import numba.core.ccallback import numpy as np import six -from legate.core import Rect, get_legate_runtime +from legate.core import Rect, get_legate_runtime, ReductionOp from cunumeric.runtime import runtime @@ -128,6 +128,10 @@ def __init__( self._created: bool = False self._cache: bool = cache self._num_outputs = 1 # there is at least 1 output + self._proc_ids = runtime.create_empty_thunk( + (runtime.num_gpus,), ty.int64, inputs=[]) + self._cu_func_pointers = runtime.create_empty_thunk( + (runtime.num_gpus,), ty.int64, inputs=[]) if doc is None: self.__doc__ = pyfunc.__doc__ @@ -345,11 +349,8 @@ def _compile_func_cpu(self) -> numba.core.ccallback.CFunc: return numba.cfunc(sig)(self._numba_func) def _execute(self, is_gpu: bool, num_gpus: int = 0) -> None: + print("IRINA DEBUG in execute") if is_gpu and not self._created: - # create future for dependency between CREATE_CU_KERNEL and - # EVAL_UDF tasks - future = convert_to_cunumeric_ndarray(num_gpus) - future_deferred = runtime.to_deferred_array(future._thunk) # create CUDA kernel launch_domain = Rect(lo=(0,), hi=(num_gpus,)) kernel_task = self._context.create_task( @@ -358,11 +359,13 @@ def _execute(self, is_gpu: bool, num_gpus: int = 0) -> None: launch_domain=launch_domain, ) ptx_hash = hash(self._gpu_func[0]) + print("IRINA DEBUG creating CUkernel for hash = ", ptx_hash) kernel_task.add_scalar_arg(ptx_hash, ty.int64) kernel_task.add_scalar_arg(self._gpu_func[0], ty.string) - kernel_task.add_input(future_deferred.base) + kernel_task.add_output(self._proc_ids.base) + kernel_task.add_output(self._cu_func_pointers.base) kernel_task.execute() - get_legate_runtime().issue_execution_fence(block=True) + #get_legate_runtime().issue_execution_fence(block=True) task = self._context.create_auto_task(CuNumericOpCode.EVAL_UDF) task.add_scalar_arg(self._num_outputs, ty.uint32) @@ -374,8 +377,13 @@ def _execute(self, is_gpu: bool, num_gpus: int = 0) -> None: if is_gpu: ptx_hash = hash(self._gpu_func[0]) + print("IRINA DEBUG executing UDF for hash = ", ptx_hash) task.add_scalar_arg(ptx_hash, ty.int64) - task.add_scalar_arg((is_gpu and not self._created), bool) + kernel_task.add_intput(self._proc_ids.base) + kernel_task.add_intput(self._cu_func_pointers.base) + task.add_broadcast(self._proc_ids.base) + task.add_broadcast(self._cu_func_pointers.base) + else: task.add_scalar_arg( self._cpu_func.address, ty.uint64 @@ -390,8 +398,6 @@ def _execute(self, is_gpu: bool, num_gpus: int = 0) -> None: task.add_output(a_tmp) if count != 0: task.add_alignment(a0.base, a_tmp) - if is_gpu and not self._created: - task.add_input(future_deferred.base) task.execute() def __call__(self, *args: Any, **kwargs: Any) -> None: diff --git a/src/cunumeric/vectorize/create_cu_kernel.cu b/src/cunumeric/vectorize/create_cu_kernel.cu index 
eaddf6d3ec..7cb77e5352 100644 --- a/src/cunumeric/vectorize/create_cu_kernel.cu +++ b/src/cunumeric/vectorize/create_cu_kernel.cu @@ -30,11 +30,14 @@ using namespace legate; int64_t ptx_hash = context.scalars()[0].value(); std::string ptx = context.scalars()[1].value(); Processor point = context.get_current_processor(); - JITKernelStorage& jit_storage =JITKernelStorage::get_instance(); + //JITKernelStorage& jit_storage =JITKernelStorage::get_instance(); + auto rect = context.outputs()[0].shape<1>; + auto procs = context.outputs()[0].write_accessor().ptr(rect); + auto funcs = context.outputs()[1].write_accessor().ptr(rect); + procs[0]=point; + CUfunction func; - std::pair key(ptx_hash, point); - if (!jit_storage.registered_jit_funtion(key)){ const unsigned num_options = 4; const size_t log_buffer_size = 16384; std::vector log_info_buffer(log_buffer_size); @@ -90,11 +93,9 @@ using namespace legate; #ifdef DEBUG_CUNUMERIC assert(result == CUDA_SUCCESS); #endif - - std::cout <<"IRINA DEBUG create_func proc = "< str: def _get_numba_types(self, need_pointer: bool = True) -> list[Any]: types = [] for arg in self._args: - ty = arg.dtype - ty = str(ty) if ty != bool else "int8" - ty = getattr(numba.core.types, ty) - ty = numba.core.types.CPointer(ty) - types.append(ty) + type_a = arg.dtype + type_a = str(type_a) if type_a != bool else "int8" + type_a = getattr(numba.core.types, type_a) + type_a = numba.core.types.CPointer(type_a) + types.append(type_a) for arg in self._scalar_args: - ty = np.dtype(type(arg).__name__) - ty = str(ty) if ty != bool else "int8" - ty = getattr(numba.core.types, ty) - types.append(ty) + type_a = np.dtype(type(arg).__name__) + type_a = str(type_a) if type_a != bool else "int8" + type_a = getattr(numba.core.types, type_a) + types.append(type_a) return types def _compile_func_gpu(self) -> tuple[Any]: @@ -355,16 +358,19 @@ def _execute(self, is_gpu: bool, num_gpus: int = 0) -> None: launch_domain = Rect(lo=(0,), hi=(num_gpus,)) kernel_task = self._context.create_task( CuNumericOpCode.CREATE_CU_KERNEL, - manual=True, + #manual=True, launch_domain=launch_domain, ) ptx_hash = hash(self._gpu_func[0]) print("IRINA DEBUG creating CUkernel for hash = ", ptx_hash) kernel_task.add_scalar_arg(ptx_hash, ty.int64) kernel_task.add_scalar_arg(self._gpu_func[0], ty.string) - kernel_task.add_output(self._proc_ids.base) - kernel_task.add_output(self._cu_func_pointers.base) + kernel_task.add_input(self._proc_ids_deferred.base) + kernel_task.add_input(self._cu_func_pointers_deferred.base) + kernel_task.add_output(self._proc_ids_deferred.base) + kernel_task.add_output(self._cu_func_pointers_deferred.base) kernel_task.execute() + print("IRINA DEBUG proc_ids =", self._proc_ids); #get_legate_runtime().issue_execution_fence(block=True) task = self._context.create_auto_task(CuNumericOpCode.EVAL_UDF) @@ -375,19 +381,6 @@ def _execute(self, is_gpu: bool, num_gpus: int = 0) -> None: dtype = convert_to_cunumeric_dtype(type(a).__name__) task.add_scalar_arg(a, dtype) - if is_gpu: - ptx_hash = hash(self._gpu_func[0]) - print("IRINA DEBUG executing UDF for hash = ", ptx_hash) - task.add_scalar_arg(ptx_hash, ty.int64) - kernel_task.add_intput(self._proc_ids.base) - kernel_task.add_intput(self._cu_func_pointers.base) - task.add_broadcast(self._proc_ids.base) - task.add_broadcast(self._cu_func_pointers.base) - - else: - task.add_scalar_arg( - self._cpu_func.address, ty.uint64 - ) # type : ignore a0 = self._args[0]._thunk a0 = runtime.to_deferred_array(a0) for count, a in enumerate(self._args): @@ -398,6 
+391,20 @@ def _execute(self, is_gpu: bool, num_gpus: int = 0) -> None: task.add_output(a_tmp) if count != 0: task.add_alignment(a0.base, a_tmp) + + if is_gpu: + ptx_hash = hash(self._gpu_func[0]) + print("IRINA DEBUG executing UDF for hash = ", ptx_hash) + task.add_scalar_arg(ptx_hash, ty.int64) + task.add_input(self._proc_ids_deferred.base) + task.add_input(self._cu_func_pointers_deferred.base) + task.add_broadcast(self._proc_ids_deferred.base) + task.add_broadcast(self._cu_func_pointers_deferred.base) + + else: + task.add_scalar_arg( + self._cpu_func.address, ty.uint64 + ) # type : ignore task.execute() def __call__(self, *args: Any, **kwargs: Any) -> None: @@ -444,10 +451,10 @@ def __call__(self, *args: Any, **kwargs: Any) -> None: # all output arrays should have the same type if len(self._args) > 0: - ty = self._args[0].dtype + type_a = self._args[0].dtype shape = self._args[0].shape for i in range(1, self._num_outputs): - if ty != self._args[i].dtype: + if type_a != self._args[i].dtype: raise TypeError( "cuNumeric doesnt support " "different types for output data in " @@ -460,12 +467,12 @@ def __call__(self, *args: Any, **kwargs: Any) -> None: "user function passed to vectorize" ) for i in range(self._num_outputs, len(self._args)): - if ty != self._args[i].dtype: + if type_a != self._args[i].dtype: runtime.warn( "converting input array to output types in user func ", category=RuntimeWarning, ) - self._args[i] = self._args[i].astype(ty) + self._args[i] = self._args[i].astype(type_a) if shape != self._args[i].shape and np.ndim(self._args[i]) > 0: raise TypeError( "cuNumeric doesnt support " diff --git a/src/cunumeric/vectorize/create_cu_kernel.cu b/src/cunumeric/vectorize/create_cu_kernel.cu index 7cb77e5352..e0c56b9075 100644 --- a/src/cunumeric/vectorize/create_cu_kernel.cu +++ b/src/cunumeric/vectorize/create_cu_kernel.cu @@ -31,11 +31,15 @@ using namespace legate; std::string ptx = context.scalars()[1].value(); Processor point = context.get_current_processor(); //JITKernelStorage& jit_storage =JITKernelStorage::get_instance(); - auto rect = context.outputs()[0].shape<1>; - auto procs = context.outputs()[0].write_accessor().ptr(rect); - auto funcs = context.outputs()[1].write_accessor().ptr(rect); - procs[0]=point; - + std::vector &outputs =context.outputs(); + auto rect = outputs[0].shape<1>(); + auto procs = outputs[0].write_accessor(rect); + auto funcs = outputs[1].write_accessor(rect); + //FIXME check if dense) + auto procs_ptr = procs.ptr(rect); + auto funcs_ptr = funcs.ptr(rect); + procs_ptr[0]=point.id; + std::cout <<"INSIDE OF THE CREATE FUNCTION "<(func); // std::cout <<"IRINA DEBUG create_func proc = "<(); + + std::cout<<"IRINA DEBUG proc rect = "< class JITKernelStorage @@ -426,5 +429,5 @@ class JITKernelStorage } };//class JITKernelStorage - +#endif } // namespace cunumeric diff --git a/src/cunumeric/cudalibs.cu b/src/cunumeric/cudalibs.cu index 7d3ab8a098..45d33d752f 100644 --- a/src/cunumeric/cudalibs.cu +++ b/src/cunumeric/cudalibs.cu @@ -233,6 +233,19 @@ cufftContext CUDALibraries::get_cufft_plan(cufftType type, const DomainPoint& si return cufftContext(cache->get_cufft_plan(size)); } +void CUDALibraries::store_udf_func(size_t hash, CUfunction func){ + udf_caches_[hash]=func; +} + +CUfunction CUDALibraries::get_udf_func(size_t hash){ + auto finder = udf_caches_.find(hash); + if (udf_caches_.end() == finder) { + fprintf(stderr, "UDF function wasn't generated yet"); + LEGATE_ABORT; + } + return udf_caches_[hash]; +} + static CUDALibraries& 
get_cuda_libraries(legate::Processor proc) { if (proc.kind() != legate::Processor::TOC_PROC) { @@ -278,6 +291,18 @@ cufftContext get_cufft_plan(cufftType type, const DomainPoint& size) return lib.get_cufft_plan(type, size); } +void store_udf(size_t hash, CUfunction func){ + const auto proc = legate::Processor::get_executing_processor(); + auto& lib = get_cuda_libraries(proc); + lib.store_udf_func(hash, func); +} + +CUfunction get_udf(size_t hash){ + const auto proc = legate::Processor::get_executing_processor(); + auto& lib = get_cuda_libraries(proc); + return lib.get_udf_func(hash); +} + class LoadCUDALibsTask : public CuNumericTask { public: static const int TASK_ID = CUNUMERIC_LOAD_CUDALIBS; diff --git a/src/cunumeric/cudalibs.h b/src/cunumeric/cudalibs.h index f2f01fffe1..8f91f3aad3 100644 --- a/src/cunumeric/cudalibs.h +++ b/src/cunumeric/cudalibs.h @@ -38,6 +38,8 @@ struct CUDALibraries { cusolverDnHandle_t get_cusolver(); cutensorHandle_t* get_cutensor(); cufftContext get_cufft_plan(cufftType type, const legate::DomainPoint& size); + void store_udf_func(size_t hash, CUfunction func); + CUfunction get_udf_func(size_t hash); private: void finalize_cublas(); @@ -50,6 +52,7 @@ struct CUDALibraries { cusolverDnContext* cusolver_; cutensorHandle_t* cutensor_; std::map plan_caches_; + std::map udf_caches_; }; } // namespace cunumeric diff --git a/src/cunumeric/vectorize/create_cu_kernel.cu b/src/cunumeric/vectorize/create_cu_kernel.cu index e0c56b9075..6481727a00 100644 --- a/src/cunumeric/vectorize/create_cu_kernel.cu +++ b/src/cunumeric/vectorize/create_cu_kernel.cu @@ -30,16 +30,8 @@ using namespace legate; int64_t ptx_hash = context.scalars()[0].value(); std::string ptx = context.scalars()[1].value(); Processor point = context.get_current_processor(); - //JITKernelStorage& jit_storage =JITKernelStorage::get_instance(); - std::vector &outputs =context.outputs(); - auto rect = outputs[0].shape<1>(); - auto procs = outputs[0].write_accessor(rect); - auto funcs = outputs[1].write_accessor(rect); + auto procs = context.outputs()[0].write_accessor(); //FIXME check if dense) - auto procs_ptr = procs.ptr(rect); - auto funcs_ptr = funcs.ptr(rect); - procs_ptr[0]=point.id; - std::cout <<"INSIDE OF THE CREATE FUNCTION "<(func); + store_udf(ptx_hash, func); // std::cout <<"IRINA DEBUG create_func proc = "<(funcs[proc_point]); + CUfunction func = get_udf(args.hash); // Filling up the buffer with arguments size_t buffer_size = (input_size+args.scalars.size()) * sizeof(void*); From 42b82776954cb8e139e7496e5c3c26f7faffe40c Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Thu, 2 Mar 2023 21:35:15 -0800 Subject: [PATCH 36/78] fixing dependency between creation and use of UDF function --- cunumeric/vectorize.py | 28 ++++++++------------- src/cunumeric/vectorize/create_cu_kernel.cu | 15 ++++++++--- 2 files changed, 21 insertions(+), 22 deletions(-) diff --git a/cunumeric/vectorize.py b/cunumeric/vectorize.py index 55ad5a5e52..faea888d1c 100644 --- a/cunumeric/vectorize.py +++ b/cunumeric/vectorize.py @@ -129,12 +129,8 @@ def __init__( self._created: bool = False self._cache: bool = cache self._num_outputs = 1 # there is at least 1 output - self._proc_ids = zeros((runtime.num_gpus,), dtype=np.dtype(np.uint64)) - self._cu_func_pointers = zeros((runtime.num_gpus,), dtype=np.dtype(np.uint64)) - self._proc_ids_deferred = runtime.to_deferred_array(self._proc_ids._thunk) - self._cu_func_pointers_deferred = runtime.to_deferred_array(self._cu_func_pointers._thunk) - #runtime.create_empty_thunk( - 
#(runtime.num_gpus,), dtype = np.dtype(np.uint64), inputs=[]) + self._created_array = create_empty_thunk( + (1,), dtype = np.dtype(np.bool), inputs=[]) if doc is None: self.__doc__ = pyfunc.__doc__ @@ -352,7 +348,7 @@ def _compile_func_cpu(self) -> numba.core.ccallback.CFunc: return numba.cfunc(sig)(self._numba_func) def _execute(self, is_gpu: bool, num_gpus: int = 0) -> None: - print("IRINA DEBUG in execute") + #print("IRINA DEBUG in execute") if is_gpu and not self._created: # create CUDA kernel launch_domain = Rect(lo=(0,), hi=(num_gpus,)) @@ -365,12 +361,10 @@ def _execute(self, is_gpu: bool, num_gpus: int = 0) -> None: print("IRINA DEBUG creating CUkernel for hash = ", ptx_hash) kernel_task.add_scalar_arg(ptx_hash, ty.int64) kernel_task.add_scalar_arg(self._gpu_func[0], ty.string) - kernel_task.add_input(self._proc_ids_deferred.base) - kernel_task.add_input(self._cu_func_pointers_deferred.base) - kernel_task.add_output(self._proc_ids_deferred.base) - kernel_task.add_output(self._cu_func_pointers_deferred.base) + kernel_task.add_reduction(self._created_array.base,ReductionOp.MUL) kernel_task.execute() - print("IRINA DEBUG proc_ids =", self._proc_ids); + print("IRINA DEBUG created array= ",self._created_array); + self._created = bool(self._created_array[0]) #get_legate_runtime().issue_execution_fence(block=True) task = self._context.create_auto_task(CuNumericOpCode.EVAL_UDF) @@ -396,10 +390,10 @@ def _execute(self, is_gpu: bool, num_gpus: int = 0) -> None: ptx_hash = hash(self._gpu_func[0]) print("IRINA DEBUG executing UDF for hash = ", ptx_hash) task.add_scalar_arg(ptx_hash, ty.int64) - task.add_input(self._proc_ids_deferred.base) - task.add_input(self._cu_func_pointers_deferred.base) - task.add_broadcast(self._proc_ids_deferred.base) - task.add_broadcast(self._cu_func_pointers_deferred.base) + task.add_input(self._created_array.base) + #task.add_input(self._cu_func_pointers_deferred.base) + #task.add_broadcast(self._proc_ids_deferred.base) + #task.add_broadcast(self._cu_func_pointers_deferred.base) else: task.add_scalar_arg( @@ -488,8 +482,6 @@ def __call__(self, *args: Any, **kwargs: Any) -> None: # profiler = cProfile.Profile() # profiler.enable() self._execute(True, runtime.num_gpus) - if not self._created and self._cache: - self._created = True # profiler.disable() # stats = pstats.Stats(profiler).sort_stats('cumtime') # stats.print_stats() diff --git a/src/cunumeric/vectorize/create_cu_kernel.cu b/src/cunumeric/vectorize/create_cu_kernel.cu index 6481727a00..2b5477e17c 100644 --- a/src/cunumeric/vectorize/create_cu_kernel.cu +++ b/src/cunumeric/vectorize/create_cu_kernel.cu @@ -24,14 +24,19 @@ namespace cunumeric { using namespace Legion; using namespace legate; +__global__ static void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM) +fill_kernel(const AccessorRD out) +{ + reduce_output(out,true); +} + /*static*/ void CreateCUKernelTask::gpu_variant(TaskContext& context) { - + int64_t ptx_hash = context.scalars()[0].value(); std::string ptx = context.scalars()[1].value(); Processor point = context.get_current_processor(); auto procs = context.outputs()[0].write_accessor(); - //FIXME check if dense) CUfunction func; const unsigned num_options = 4; @@ -90,8 +95,10 @@ using namespace legate; assert(result == CUDA_SUCCESS); #endif store_udf(ptx_hash, func); - // std::cout <<"IRINA DEBUG create_func proc = "< numba.core.ccallback.CFunc: return numba.cfunc(sig)(self._numba_func) def _execute(self, is_gpu: bool, num_gpus: int = 0) -> None: - #print("IRINA DEBUG in execute") if 
is_gpu and not self._created: # create CUDA kernel launch_domain = Rect(lo=(0,), hi=(num_gpus,)) kernel_task = self._context.create_task( CuNumericOpCode.CREATE_CU_KERNEL, - #manual=True, launch_domain=launch_domain, ) ptx_hash = hash(self._gpu_func[0]) - #print("IRINA DEBUG creating CUkernel for hash = ", ptx_hash) kernel_task.add_scalar_arg(ptx_hash, ty.int64) kernel_task.add_scalar_arg(self._gpu_func[0], ty.string) - #kernel_task.add_reduction(self._created_array_deferred.base,ReductionOp.MUL) kernel_task.add_output(self._created_array_deferred.base) kernel_task.execute() - #print("IRINA DEBUG created array= ",self._created_array); + # inline map first element of the array to make sure the CREATE_CU_KERNEL + # task has finished by the time we set self._created to True if self._cache: self._created = bool(self._created_array[0]) - #get_legate_runtime().issue_execution_fence(block=True) task = self._context.create_auto_task(CuNumericOpCode.EVAL_UDF) - task.add_scalar_arg(self._num_outputs, ty.uint32) - task.add_scalar_arg(len(self._scalar_args), ty.uint32) - + task.add_scalar_arg(self._num_outputs, ty.uint32) # N of outputs + task.add_scalar_arg(len(self._scalar_args), ty.uint32) # N of scalar_args + # add all scalars for a in self._scalar_args: dtype = convert_to_cunumeric_dtype(type(a).__name__) task.add_scalar_arg(a, dtype) + # add array arguments a0 = self._args[0]._thunk a0 = runtime.to_deferred_array(a0) for count, a in enumerate(self._args): @@ -395,13 +396,11 @@ def _execute(self, is_gpu: bool, num_gpus: int = 0) -> None: if is_gpu: ptx_hash = hash(self._gpu_func[0]) - #print("IRINA DEBUG executing UDF for hash = ", ptx_hash) task.add_scalar_arg(ptx_hash, ty.int64) + # passing the _created * array to introduce dependency between + # CREATE_CU_KERNEL task and EVAL_UDF task task.add_input(self._created_array_deferred.base) task.add_broadcast(self._created_array_deferred.base) - #task.add_input(self._cu_func_pointers_deferred.base) - #task.add_broadcast(self._proc_ids_deferred.base) - #task.add_broadcast(self._cu_func_pointers_deferred.base) else: task.add_scalar_arg( @@ -487,12 +486,12 @@ def __call__(self, *args: Any, **kwargs: Any) -> None: # print("IRINA DEBUG ptx is not created yet") self._numba_func = self._build_gpu_function() self._gpu_func = self._compile_func_gpu() - # profiler = cProfile.Profile() - # profiler.enable() + #profiler = cProfile.Profile() + #profiler.enable() self._execute(True, runtime.num_gpus) - # profiler.disable() - # stats = pstats.Stats(profiler).sort_stats('cumtime') - # stats.print_stats() + #profiler.disable() + #stats = pstats.Stats(profiler).sort_stats('cumtime') + #stats.print_stats() else: if not self._created: self._numba_func = self._build_cpu_function() From 6e8975f7491652c6e36ccb4625c4146fbf46d47e Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Wed, 8 Mar 2023 14:48:13 -0800 Subject: [PATCH 40/78] fixed dependency bug --- cunumeric/vectorize.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cunumeric/vectorize.py b/cunumeric/vectorize.py index 15c8b276e6..6cd8c7e367 100644 --- a/cunumeric/vectorize.py +++ b/cunumeric/vectorize.py @@ -367,6 +367,8 @@ def _execute(self, is_gpu: bool, num_gpus: int = 0) -> None: ptx_hash = hash(self._gpu_func[0]) kernel_task.add_scalar_arg(ptx_hash, ty.int64) kernel_task.add_scalar_arg(self._gpu_func[0], ty.string) + #added to introduce dependency between this and EVAL_UDF task + kernel_task.add_input(self._created_array_deferred.base) kernel_task.add_output(self._created_array_deferred.base) 
kernel_task.execute() # inline map first element of the array to make sure the CREATE_CU_KERNEL From 1c3ce9e26561e27f4804548ef37092fb58651e9d Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Wed, 8 Mar 2023 14:50:17 -0800 Subject: [PATCH 41/78] more clean-up --- cunumeric/vectorize.py | 9 --------- 1 file changed, 9 deletions(-) diff --git a/cunumeric/vectorize.py b/cunumeric/vectorize.py index 6cd8c7e367..5c31f8a031 100644 --- a/cunumeric/vectorize.py +++ b/cunumeric/vectorize.py @@ -485,15 +485,9 @@ def __call__(self, *args: Any, **kwargs: Any) -> None: if runtime.num_gpus > 0: if not self._created: - # print("IRINA DEBUG ptx is not created yet") self._numba_func = self._build_gpu_function() self._gpu_func = self._compile_func_gpu() - #profiler = cProfile.Profile() - #profiler.enable() self._execute(True, runtime.num_gpus) - #profiler.disable() - #stats = pstats.Stats(profiler).sort_stats('cumtime') - #stats.print_stats() else: if not self._created: self._numba_func = self._build_cpu_function() @@ -502,6 +496,3 @@ def __call__(self, *args: Any, **kwargs: Any) -> None: self._created = True self._execute(False) - # profiler.disable() - # stats = pstats.Stats(profiler).sort_stats('cumtime') - # stats.print_stats() From 5a5b0617eb5a7b664e603e2a4d30516c8927269c Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Wed, 8 Mar 2023 19:21:57 -0800 Subject: [PATCH 42/78] removing created_array --- cunumeric/vectorize.py | 17 ++++++++++------- src/cunumeric/vectorize/eval_udf.cu | 3 ++- 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/cunumeric/vectorize.py b/cunumeric/vectorize.py index 5c31f8a031..e29d26c209 100644 --- a/cunumeric/vectorize.py +++ b/cunumeric/vectorize.py @@ -132,8 +132,8 @@ def __init__( size_tmp=runtime.num_gpus if size_tmp==1: size_tmp=10 - self._created_array = full((size_tmp,), True, dtype=bool) - self._created_array_deferred = runtime.to_deferred_array(self._created_array._thunk) + #self._created_array = full((size_tmp,), True, dtype=bool) + #self._created_array_deferred = runtime.to_deferred_array(self._created_array._thunk) #runtime.create_empty_thunk( # (1,), dtype = np.dtype(np.bool), inputs=[]) @@ -368,13 +368,16 @@ def _execute(self, is_gpu: bool, num_gpus: int = 0) -> None: kernel_task.add_scalar_arg(ptx_hash, ty.int64) kernel_task.add_scalar_arg(self._gpu_func[0], ty.string) #added to introduce dependency between this and EVAL_UDF task - kernel_task.add_input(self._created_array_deferred.base) - kernel_task.add_output(self._created_array_deferred.base) + #kernel_task.add_input(self._created_array_deferred.base) + #kernel_task.add_output(self._created_array_deferred.base) kernel_task.execute() + get_legate_runtime().issue_execution_fence(block=True) # inline map first element of the array to make sure the CREATE_CU_KERNEL + # task has finished by the time we set self._created to True if self._cache: - self._created = bool(self._created_array[0]) + #self._created = bool(self._created_array[0]) + self._created = True task = self._context.create_auto_task(CuNumericOpCode.EVAL_UDF) task.add_scalar_arg(self._num_outputs, ty.uint32) # N of outputs @@ -401,8 +404,8 @@ def _execute(self, is_gpu: bool, num_gpus: int = 0) -> None: task.add_scalar_arg(ptx_hash, ty.int64) # passing the _created * array to introduce dependency between # CREATE_CU_KERNEL task and EVAL_UDF task - task.add_input(self._created_array_deferred.base) - task.add_broadcast(self._created_array_deferred.base) + #task.add_input(self._created_array_deferred.base) + 
#task.add_broadcast(self._created_array_deferred.base) else: task.add_scalar_arg( diff --git a/src/cunumeric/vectorize/eval_udf.cu b/src/cunumeric/vectorize/eval_udf.cu index 485227abfb..6a47e85f05 100644 --- a/src/cunumeric/vectorize/eval_udf.cu +++ b/src/cunumeric/vectorize/eval_udf.cu @@ -32,7 +32,8 @@ struct EvalUdfGPU { using VAL = legate_type_of; Rect rect; - size_t input_size=args.inputs.size()-1; + // size_t input_size=args.inputs.size()-1; + size_t input_size=args.inputs.size(); // auto procs_rect = args.inputs[input_size].shape<1>(); //auto procs=args.inputs[input_size].read_accessor(); From a80c1281aa43850ac95cad7f931ac9d958041328 Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Thu, 9 Mar 2023 09:20:16 -0800 Subject: [PATCH 43/78] fixing barrier --- cunumeric/vectorize.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cunumeric/vectorize.py b/cunumeric/vectorize.py index e29d26c209..085c384b10 100644 --- a/cunumeric/vectorize.py +++ b/cunumeric/vectorize.py @@ -362,6 +362,7 @@ def _execute(self, is_gpu: bool, num_gpus: int = 0) -> None: launch_domain = Rect(lo=(0,), hi=(num_gpus,)) kernel_task = self._context.create_task( CuNumericOpCode.CREATE_CU_KERNEL, + manual=True, launch_domain=launch_domain, ) ptx_hash = hash(self._gpu_func[0]) @@ -371,7 +372,7 @@ def _execute(self, is_gpu: bool, num_gpus: int = 0) -> None: #kernel_task.add_input(self._created_array_deferred.base) #kernel_task.add_output(self._created_array_deferred.base) kernel_task.execute() - get_legate_runtime().issue_execution_fence(block=True) + self._context.issue_execution_fence(block=True) # inline map first element of the array to make sure the CREATE_CU_KERNEL # task has finished by the time we set self._created to True From e45d0373f1a6dadb7fde13c31b17ebc6fecb7c23 Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Thu, 9 Mar 2023 09:22:22 -0800 Subject: [PATCH 44/78] removing scipy --- cunumeric/vectorize.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/cunumeric/vectorize.py b/cunumeric/vectorize.py index 085c384b10..ed97209ccf 100644 --- a/cunumeric/vectorize.py +++ b/cunumeric/vectorize.py @@ -208,8 +208,6 @@ def _build_gpu_function(self) -> Any: lines = ["from numba import cuda"] lines.append("import math") lines.append("import numpy") - lines.append("import scipy.special") - lines.append("import numba_scipy") # Signature args = ( From dafbf1d9ff98b531c156588e84b36c936c4e9411 Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Fri, 10 Mar 2023 12:14:05 -0800 Subject: [PATCH 45/78] adding provenance --- cunumeric/vectorize.py | 5 +++-- src/cunumeric/mapper.cc | 18 ------------------ 2 files changed, 3 insertions(+), 20 deletions(-) diff --git a/cunumeric/vectorize.py b/cunumeric/vectorize.py index ed97209ccf..718097effd 100644 --- a/cunumeric/vectorize.py +++ b/cunumeric/vectorize.py @@ -24,7 +24,7 @@ import numba.core.ccallback import numpy as np import six -from legate.core import Rect, get_legate_runtime, ReductionOp +from legate.core import Rect, get_legate_runtime, ReductionOp, track_provenance from cunumeric.runtime import runtime @@ -353,7 +353,8 @@ def _compile_func_cpu(self) -> numba.core.ccallback.CFunc: ) # type: ignore return numba.cfunc(sig)(self._numba_func) - + + @track_provenance(runtime.legate_context) def _execute(self, is_gpu: bool, num_gpus: int = 0) -> None: if is_gpu and not self._created: # create CUDA kernel diff --git a/src/cunumeric/mapper.cc b/src/cunumeric/mapper.cc index 16b362bdb5..432f6713c0 100644 --- a/src/cunumeric/mapper.cc +++ 
b/src/cunumeric/mapper.cc @@ -223,24 +223,6 @@ std::vector CuNumericMapper::store_mappings( } return std::move(mappings); } -#if 0 - case CUNUMERIC_EVAL_UDF: { - std::vector mappings; - auto& inputs = task.inputs(); - auto& outputs = task.outputs(); - for (auto& input : inputs) { - mappings.push_back(StoreMapping::default_mapping(input, options.front())); - // mappings.back().policy.ordering.c_order(); - mappings.back().policy.exact = true; - } - for (auto& output : outputs) { - mappings.push_back(StoreMapping::default_mapping(output, options.front())); - // mappings.back().policy.ordering.c_order(); - mappings.back().policy.exact = true; - } - return std::move(mappings); - } -#endif default: { return {}; } From 9e09b8b88360ac2899ced840507a6bd8ab02425f Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Wed, 15 Mar 2023 12:13:47 -0700 Subject: [PATCH 46/78] adding black_scholes example that uses vectorize --- examples/black_scholes_greeks.py | 230 +++++++++++++++++++++++++++++++ 1 file changed, 230 insertions(+) create mode 100644 examples/black_scholes_greeks.py diff --git a/examples/black_scholes_greeks.py b/examples/black_scholes_greeks.py new file mode 100644 index 0000000000..ff8354bd7a --- /dev/null +++ b/examples/black_scholes_greeks.py @@ -0,0 +1,230 @@ +#!/usr/bin/env python + +# Copyright 2021-2022 NVIDIA Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +import argparse + +from benchmark import parse_args, run_benchmark, CuNumericTimer + +import math +import cunumeric as np + + +#big size +#n_vol_steps = 40 +vol_start = 0.1 +vol_step = 0.01 +#n_t_steps = 365*10 +t_start = 0.5 +t_step = 1.0/(365*10) +#n_money_steps = 60 +money_start = -0.4 +money_step = 0.01 + + +#small size +#n_vol_steps = 10 +#vol_start = 0.1 +#vol_step = 0.01 +#n_t_steps = 6 +#t_start = 0.5 +#t_step = 0.5 +#n_money_steps = 1 +#money_start = 0 +#money_step = 0.1 + +RISKFREE = 0.02 +S0 = 100.0 +N_GREEKS=7 +EPS = 0.00000001 + + +def initialize(n_vol_steps, n_t_steps, n_money_steps, D): + CALL = np.zeros((N_GREEKS, n_t_steps, n_vol_steps, n_money_steps,), dtype = D) + PUT = np.zeros((N_GREEKS, n_t_steps, n_vol_steps, n_money_steps,), dtype = D) + S=np.full((n_t_steps, n_vol_steps, n_money_steps,),S0, dtype = D) + K=np.full((n_t_steps, n_vol_steps, n_money_steps,), (1 + money_start), dtype = D) + temp_arr = np.arange((n_vol_steps*n_t_steps*n_money_steps), dtype=int) + k_temp=(temp_arr%n_money_steps)*money_step + k_temp = k_temp.reshape((n_t_steps, n_vol_steps, n_money_steps,)) + K+=k_temp + K=K*S0 + + T=np.full((n_t_steps, n_vol_steps, n_money_steps,),t_start, dtype = D) + t_temp = (temp_arr%(n_vol_steps*n_money_steps))*vol_step + t_temp = t_temp.reshape((n_t_steps, n_vol_steps, n_money_steps,)) + T+=t_temp + R= 0.02 + V=np.full((n_t_steps, n_vol_steps, n_money_steps), vol_start, dtype = D) + for i in range(n_vol_steps): + V[:,i,:]+=i*vol_step + + return CALL, PUT, S, K, T, R, V + + + +def normPDF(d): + RSQRT2PI = 0.39894228040143267793994605993438; + return RSQRT2PI * np.exp(- 0.5 * d * d); + +def black_scholes_vec_kernel(d1, d2, nd1, nd2, S, K, T, V, stdev, R,CP, EPS): + if (math.fabs(V)>EPS) and (math.fabs(T)>EPS) and (math.fabs(K)>EPS) and (math.fabs(S)>EPS): + d1 = (math.log(S/K)+(R+0.5*V*V)*T)/stdev + d2=d1-stdev + cpd1 = CP*d1 + cpd2 = CP*d2 + #manual inlining ndtr + NPY_SQRT1_2 = 0.707106781186547524400844362104849039 + x = cpd1 * NPY_SQRT1_2 + z = math.fabs(x) + + if z < NPY_SQRT1_2: + y = 0.5 + 0.5 * math.erf(x) + else: + y = 0.5 * math.erfc(z) + + if x > 0: + y = 1.0 - y + nd1=y + + #manual inlining ndtr + x = cpd2 * NPY_SQRT1_2 + z = math.fabs(x) + + if z < NPY_SQRT1_2: + y = 0.5 + 0.5 * math.erf(x) + else: + y = 0.5 * math.erfc(z) + + if x > 0: + y = 1.0 - y + nd2=y + else: + if (math.fabs(V)<=EPS) or (math.fabs(T)<=EPS) or (math.fabs(K)<=EPS): + d1 = math.inf + d2 = math.inf + nd1 = 1. + nd2 = 1. + else: + d1 = -math.inf + d2 = -math.inf + nd1 = 1. + nd2 = 1. + + +bs_vec = np.vectorize(black_scholes_vec_kernel,otypes=(float,float,float,float), cache=True) + +def black_scholes ( out , S, K, R, T, V, d1, d2, nd1, nd2, df,ind_v, ind_t, CP, greek): + + if greek == "PREM": + out[...] = CP*(S*nd1 - K*df*nd2); + elif greek == "DELTA": + out[...] = CP*nd1 + elif greek =="VEGA": + out[...] = S*np.sqrt(T)*normPDF(d1) + elif greek == "GAMMA": + out[...] = normPDF(d1)/(S*V*np.sqrt(T)) + out[ind_v] =0. + elif greek == "VANNA": + out[...] = -d2*normPDF(d1)/V + out[ind_v] =0. + elif greek == "VOLGA": + out[...] = S*np.sqrt(T)*d1*d2*normPDF(d1)/V; + out[ind_v] =0. + elif greek == "THETA": + out[...] = -(0.5*S*V/np.sqrt(T)*normPDF(d1)+CP*R*df*K*nd2) + else: + RuntimeError("Wrong greek name is passed") + + if (greek != "PREM"): + out[ind_t] = 0. 
+ + +greeks = ["PREM", "DELTA", "VEGA", "GAMMA", "VANNA", "VOLGA", "THETA",] +#greeks = ["PREM",] + +def run_black_scholes(n_vol_steps, n_t_steps, n_money_steps): + timer = CuNumericTimer() + print("Start black_scholes") + CALL, PUT, S, K, T, R, V = initialize(n_vol_steps, n_t_steps, n_money_steps, np.float32) + #pre-compute some data for black_scholes + stdev = V * np.sqrt(T) + df = np.exp(-R*T) + ind_v = np.nonzero(np.absolute(V) Date: Wed, 15 Mar 2023 14:01:00 -0700 Subject: [PATCH 47/78] removing special cases from black_scholes_greek --- examples/black_scholes_greeks.py | 100 ++++++++----------------------- 1 file changed, 26 insertions(+), 74 deletions(-) diff --git a/examples/black_scholes_greeks.py b/examples/black_scholes_greeks.py index ff8354bd7a..8b5465ad5c 100644 --- a/examples/black_scholes_greeks.py +++ b/examples/black_scholes_greeks.py @@ -49,7 +49,6 @@ RISKFREE = 0.02 S0 = 100.0 N_GREEKS=7 -EPS = 0.00000001 def initialize(n_vol_steps, n_t_steps, n_money_steps, D): @@ -74,60 +73,32 @@ def initialize(n_vol_steps, n_t_steps, n_money_steps, D): return CALL, PUT, S, K, T, R, V +def normCDF(d): + A1 = 0.31938153 + A2 = -0.356563782 + A3 = 1.781477937 + A4 = -1.821255978 + A5 = 1.330274429 + RSQRT2PI = 0.39894228040143267793994605993438 + K = 1.0 / (1.0 + 0.2316419 * np.absolute(d)) + + cnd = RSQRT2PI * np.exp(- 0.5 * d * d) * (K * (A1 + K * (A2 + K * (A3 + K * (A4 + K * A5))))); + + return np.where(d > 0, 1.0 - cnd, cnd) def normPDF(d): RSQRT2PI = 0.39894228040143267793994605993438; return RSQRT2PI * np.exp(- 0.5 * d * d); -def black_scholes_vec_kernel(d1, d2, nd1, nd2, S, K, T, V, stdev, R,CP, EPS): - if (math.fabs(V)>EPS) and (math.fabs(T)>EPS) and (math.fabs(K)>EPS) and (math.fabs(S)>EPS): - d1 = (math.log(S/K)+(R+0.5*V*V)*T)/stdev - d2=d1-stdev - cpd1 = CP*d1 - cpd2 = CP*d2 - #manual inlining ndtr - NPY_SQRT1_2 = 0.707106781186547524400844362104849039 - x = cpd1 * NPY_SQRT1_2 - z = math.fabs(x) - - if z < NPY_SQRT1_2: - y = 0.5 + 0.5 * math.erf(x) - else: - y = 0.5 * math.erfc(z) - - if x > 0: - y = 1.0 - y - nd1=y - - #manual inlining ndtr - x = cpd2 * NPY_SQRT1_2 - z = math.fabs(x) - - if z < NPY_SQRT1_2: - y = 0.5 + 0.5 * math.erf(x) - else: - y = 0.5 * math.erfc(z) - - if x > 0: - y = 1.0 - y - nd2=y - else: - if (math.fabs(V)<=EPS) or (math.fabs(T)<=EPS) or (math.fabs(K)<=EPS): - d1 = math.inf - d2 = math.inf - nd1 = 1. - nd2 = 1. - else: - d1 = -math.inf - d2 = -math.inf - nd1 = 1. - nd2 = 1. - - -bs_vec = np.vectorize(black_scholes_vec_kernel,otypes=(float,float,float,float), cache=True) - -def black_scholes ( out , S, K, R, T, V, d1, d2, nd1, nd2, df,ind_v, ind_t, CP, greek): +def black_scholes ( out , S, K, R, T, V, d1, d2, nd1, nd2, CP, greek): + EPS = 0.00000001 + stdev = V * np.sqrt(T) + df = np.exp(-R*T) + d1 = (np.log(S/K)+(R+0.5*V*V)*T)/stdev + d2= d1-stdev + nd1 = normCDF(CP*d1) + nd2 = normCDF(CP*d2) if greek == "PREM": out[...] = CP*(S*nd1 - K*df*nd2); @@ -137,53 +108,34 @@ def black_scholes ( out , S, K, R, T, V, d1, d2, nd1, nd2, df,ind_v, ind_t, CP, out[...] = S*np.sqrt(T)*normPDF(d1) elif greek == "GAMMA": out[...] = normPDF(d1)/(S*V*np.sqrt(T)) - out[ind_v] =0. elif greek == "VANNA": out[...] = -d2*normPDF(d1)/V - out[ind_v] =0. elif greek == "VOLGA": out[...] = S*np.sqrt(T)*d1*d2*normPDF(d1)/V; - out[ind_v] =0. elif greek == "THETA": out[...] = -(0.5*S*V/np.sqrt(T)*normPDF(d1)+CP*R*df*K*nd2) else: RuntimeError("Wrong greek name is passed") - if (greek != "PREM"): - out[ind_t] = 0. 
greeks = ["PREM", "DELTA", "VEGA", "GAMMA", "VANNA", "VOLGA", "THETA",] -#greeks = ["PREM",] def run_black_scholes(n_vol_steps, n_t_steps, n_money_steps): timer = CuNumericTimer() print("Start black_scholes") CALL, PUT, S, K, T, R, V = initialize(n_vol_steps, n_t_steps, n_money_steps, np.float32) - #pre-compute some data for black_scholes - stdev = V * np.sqrt(T) - df = np.exp(-R*T) - ind_v = np.nonzero(np.absolute(V) Date: Thu, 16 Mar 2023 10:06:41 -0700 Subject: [PATCH 48/78] cleaning up greeks test --- examples/black_scholes_greeks.py | 88 ++++++++++++++------------------ 1 file changed, 38 insertions(+), 50 deletions(-) diff --git a/examples/black_scholes_greeks.py b/examples/black_scholes_greeks.py index 8b5465ad5c..b4538533bb 100644 --- a/examples/black_scholes_greeks.py +++ b/examples/black_scholes_greeks.py @@ -18,54 +18,48 @@ import argparse from benchmark import parse_args, run_benchmark, CuNumericTimer - +from enum import IntEnum import math import cunumeric as np -#big size -#n_vol_steps = 40 +NUM_ITERS=10 +WARMUP_ITER=2 + vol_start = 0.1 vol_step = 0.01 -#n_t_steps = 365*10 t_start = 0.5 t_step = 1.0/(365*10) -#n_money_steps = 60 money_start = -0.4 money_step = 0.01 -#small size -#n_vol_steps = 10 -#vol_start = 0.1 -#vol_step = 0.01 -#n_t_steps = 6 -#t_start = 0.5 -#t_step = 0.5 -#n_money_steps = 1 -#money_start = 0 -#money_step = 0.1 - RISKFREE = 0.02 S0 = 100.0 N_GREEKS=7 +class Greeks(IntEnum): + PREM=0, + DELTA=1, + VEGA=2, + GAMMA=3, + VANNA=4, + VOLGA=5, + THETA=6 + def initialize(n_vol_steps, n_t_steps, n_money_steps, D): CALL = np.zeros((N_GREEKS, n_t_steps, n_vol_steps, n_money_steps,), dtype = D) PUT = np.zeros((N_GREEKS, n_t_steps, n_vol_steps, n_money_steps,), dtype = D) S=np.full((n_t_steps, n_vol_steps, n_money_steps,),S0, dtype = D) - K=np.full((n_t_steps, n_vol_steps, n_money_steps,), (1 + money_start), dtype = D) temp_arr = np.arange((n_vol_steps*n_t_steps*n_money_steps), dtype=int) k_temp=(temp_arr%n_money_steps)*money_step k_temp = k_temp.reshape((n_t_steps, n_vol_steps, n_money_steps,)) - K+=k_temp - K=K*S0 + K=(k_temp+(1 + money_start))*S0 - T=np.full((n_t_steps, n_vol_steps, n_money_steps,),t_start, dtype = D) t_temp = (temp_arr%(n_vol_steps*n_money_steps))*vol_step t_temp = t_temp.reshape((n_t_steps, n_vol_steps, n_money_steps,)) - T+=t_temp + T=t_temp+t_start R= 0.02 V=np.full((n_t_steps, n_vol_steps, n_money_steps), vol_start, dtype = D) for i in range(n_vol_steps): @@ -83,15 +77,15 @@ def normCDF(d): K = 1.0 / (1.0 + 0.2316419 * np.absolute(d)) - cnd = RSQRT2PI * np.exp(- 0.5 * d * d) * (K * (A1 + K * (A2 + K * (A3 + K * (A4 + K * A5))))); + cnd = RSQRT2PI * np.exp(- 0.5 * d * d) * (K * (A1 + K * (A2 + K * (A3 + K * (A4 + K * A5))))) return np.where(d > 0, 1.0 - cnd, cnd) def normPDF(d): - RSQRT2PI = 0.39894228040143267793994605993438; - return RSQRT2PI * np.exp(- 0.5 * d * d); + RSQRT2PI = 0.39894228040143267793994605993438 + return RSQRT2PI * np.exp(- 0.5 * d * d) -def black_scholes ( out , S, K, R, T, V, d1, d2, nd1, nd2, CP, greek): +def black_scholes ( out , S, K, R, T, V, CP, greek): EPS = 0.00000001 stdev = V * np.sqrt(T) df = np.exp(-R*T) @@ -100,45 +94,39 @@ def black_scholes ( out , S, K, R, T, V, d1, d2, nd1, nd2, CP, greek): nd1 = normCDF(CP*d1) nd2 = normCDF(CP*d2) - if greek == "PREM": - out[...] = CP*(S*nd1 - K*df*nd2); - elif greek == "DELTA": + if greek == Greeks.PREM: + out[...] = CP*(S*nd1 - K*df*nd2) + elif greek == Greeks.DELTA: out[...] = CP*nd1 - elif greek =="VEGA": + elif greek ==Greeks.VEGA: out[...] 
= S*np.sqrt(T)*normPDF(d1) - elif greek == "GAMMA": + elif greek == Greeks.GAMMA: out[...] = normPDF(d1)/(S*V*np.sqrt(T)) - elif greek == "VANNA": + elif greek == Greeks.VANNA: out[...] = -d2*normPDF(d1)/V - elif greek == "VOLGA": - out[...] = S*np.sqrt(T)*d1*d2*normPDF(d1)/V; - elif greek == "THETA": + elif greek == Greeks.VOLGA: + out[...] = S*np.sqrt(T)*d1*d2*normPDF(d1)/V + elif greek == Greeks.THETA: out[...] = -(0.5*S*V/np.sqrt(T)*normPDF(d1)+CP*R*df*K*nd2) else: - RuntimeError("Wrong greek name is passed") - + raise RuntimeError("Wrong greek name is passed") - -greeks = ["PREM", "DELTA", "VEGA", "GAMMA", "VANNA", "VOLGA", "THETA",] def run_black_scholes(n_vol_steps, n_t_steps, n_money_steps): timer = CuNumericTimer() print("Start black_scholes") CALL, PUT, S, K, T, R, V = initialize(n_vol_steps, n_t_steps, n_money_steps, np.float32) - d1 = np.zeros_like(S) - d2= np.zeros_like(S) - nd1 = np.zeros_like(S) - nd2= np.zeros_like(S) - print("After the initialization") - timer.start() - for count,g in enumerate(greeks): - black_scholes(CALL[count],S, K, R, T, V, d1, d2, nd1, nd2,1, g) - black_scholes(PUT[count],S, K, R, T, V, d1, d2, nd1, nd2, -1, g) - - total = timer.stop() - print("Elapsed Time: " + str(total) + " ms") + for i in range (NUM_ITERS): + if i==WARMUP_ITER: + timer.start() + for g in Greeks: + black_scholes(CALL[g.value],S, K, R, T, V, 1, g) + black_scholes(PUT[g.value],S, K, R, T, V, -1, g) + + total = (timer.stop())/(NUM_ITERS-WARMUP_ITER) + print("Elapsed Time: {} ms".format(total)) return total if __name__ == "__main__": From f6b8e67043e62598edaa9b12ead8643c8762f8b0 Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Fri, 17 Mar 2023 11:53:32 -0700 Subject: [PATCH 49/78] making CPU UDF work with sparse arrays --- cunumeric/vectorize.py | 75 +++++++++++++++++++---------- src/cunumeric/vectorize/eval_udf.cc | 24 ++++++--- src/cunumeric/vectorize/eval_udf.cu | 2 +- 3 files changed, 67 insertions(+), 34 deletions(-) diff --git a/cunumeric/vectorize.py b/cunumeric/vectorize.py index 718097effd..722e8ae52e 100644 --- a/cunumeric/vectorize.py +++ b/cunumeric/vectorize.py @@ -194,12 +194,13 @@ def _get_func_body(self, func: Callable[[Any], Any]) -> list[str]: def _replace_name( self, name: str, _LOOP_VAR: str, is_gpu: bool = False ) -> str: - # print("IRINA DEBUG ARGNAMES =", self._argnames) - # print("IRINA DEBUG SCALAR_NAMES =", self._scalar_names) if name in self._argnames and not (name in self._scalar_names): - return "{}[{}]".format(name, _LOOP_VAR) + return "{}[int({})]".format(name, _LOOP_VAR) else: - return "{}".format(name) + if is_gpu: + return "{}".format(name) + else: + return "{}[0]".format(name) def _build_gpu_function(self) -> Any: funcid = "vectorized_{}".format(self._pyfunc.__name__) @@ -228,16 +229,11 @@ def _build_gpu_function(self) -> Any: lines.append(" return") lines.append(" {}:int = 0".format(_LOOP_VAR)) lines.append(" for p in range({}-1):".format(_DIM_VAR)) + #fixme make sure we compute index correct for all data types lines.append( - " x={}[p]+int(local_i/{}[p])".format( - _LO_POINT_VAR, _PITCHES_VAR - ) - ) + " x=int(local_i/{}[p])".format(_PITCHES_VAR)) lines.append( - " local_i = local_i-{}[p]*int(local_i/{}[p])".format( - _PITCHES_VAR, _PITCHES_VAR - ) - ) + " local_i = int(local_i%{}[p])".format(_PITCHES_VAR)) lines.append( " {}+=int(x*{}[p])".format(_LOOP_VAR, _STRIDES_VAR) ) @@ -257,10 +253,10 @@ def _lift_to_array_access(m: Any) -> str: l_new = re.sub(r"[_a-zA-Z]\w*", _lift_to_array_access, line) lines.append(l_new) - # print("IRINA DEBUG GPU 
function",lines) # Evaluate the string to get the Python function body = "\n".join(lines) + print("IRINA DEBUG GPU body", body) glbs: Dict[str, Any] = {} six.exec_(body, glbs) return glbs[funcid] @@ -272,17 +268,24 @@ def _build_cpu_function(self) -> Callable[[Any], Any]: lines = ["from numba import carray, types"] # Signature - lines.append("def {}({}, {}):".format(funcid, _ARGS_VAR, _SIZE_VAR)) + lines.append("def {}({}, {}, {}, {}, {}, {}):".format(funcid, _ARGS_VAR, _SIZE_VAR, _DIM_VAR, _PITCHES_VAR, _LO_POINT_VAR, _STRIDES_VAR)) # Unpack kernel arguments def _emit_assignment( - var: Any, idx: int, sz: Any, ty: np.dtype[Any] + var: Any, idx: int, sz: Any, ty: np.dtype[Any], scalar=False ) -> None: - lines.append( - " {} = carray({}[{}], {}, types.{})".format( - var, _ARGS_VAR, idx, sz, ty + if scalar: + lines.append( + " {} = carray({}[{}], 1, types.{})".format( + var, _ARGS_VAR, idx, ty + ) + ) + else: + lines.append( + " {} = carray({}[{}], {}, types.{})".format( + var, _ARGS_VAR, idx, sz, ty + ) ) - ) # get names of arguments arg_idx = 0 @@ -293,12 +296,28 @@ def _emit_assignment( for a in self._scalar_args: scalar_type = np.dtype(type(a).__name__) _emit_assignment( - self._argnames[arg_idx], arg_idx, _SIZE_VAR, scalar_type + self._argnames[arg_idx], arg_idx, _SIZE_VAR, scalar_type, True ) arg_idx += 1 # Main loop - lines.append(" for {} in range({}):".format(_LOOP_VAR, _SIZE_VAR)) + lines.append(" for local_i in range({}):".format( _SIZE_VAR)) + lines.append(" {}:int = 0".format(_LOOP_VAR)) + lines.append(" j:int = local_i") + lines.append(" for p in range({}-1):".format(_DIM_VAR)) + lines.append(" x=int(j/{}[p])".format( + _PITCHES_VAR + ) + ) + lines.append(" j = int(j%{}[p])".format(_PITCHES_VAR )) + + lines.append(" {}+=int(x*{}[p])".format(_LOOP_VAR, _STRIDES_VAR) + ) + lines.append(" {}+=int(j*{}[{}-1])".format( + _LOOP_VAR, _STRIDES_VAR, _DIM_VAR + ) + ) + lines_old = self._get_func_body(self._pyfunc) @@ -308,11 +327,12 @@ def _lift_to_array_access(m: Any) -> str: # lines_new = [] for line in lines_old: - l_new = re.sub(r"[_a-z]\w*", _lift_to_array_access, line) - lines.append(" " + l_new) + l_new = re.sub(r"[_a-zA-Z]\w*", _lift_to_array_access, line) + lines.append(" " + l_new) # Evaluate the string to get the Python function body = "\n".join(lines) + print ("IRINA DEBUG body =", body) glbs: Dict[str, Any] = {} six.exec_(body, glbs) return glbs[funcid] @@ -349,7 +369,11 @@ def _compile_func_gpu(self) -> tuple[Any]: def _compile_func_cpu(self) -> numba.core.ccallback.CFunc: sig = numba.core.types.void( - numba.types.CPointer(numba.types.voidptr), numba.core.types.uint64 + numba.types.CPointer(numba.types.voidptr), numba.core.types.uint64, + numba.core.types.uint64, + numba.core.types.CPointer(numba.core.types.uint64), + numba.core.types.CPointer(numba.core.types.uint64), + numba.core.types.CPointer(numba.core.types.uint64) ) # type: ignore return numba.cfunc(sig)(self._numba_func) @@ -359,9 +383,8 @@ def _execute(self, is_gpu: bool, num_gpus: int = 0) -> None: if is_gpu and not self._created: # create CUDA kernel launch_domain = Rect(lo=(0,), hi=(num_gpus,)) - kernel_task = self._context.create_task( + kernel_task = self._context.create_manual_task( CuNumericOpCode.CREATE_CU_KERNEL, - manual=True, launch_domain=launch_domain, ) ptx_hash = hash(self._gpu_func[0]) diff --git a/src/cunumeric/vectorize/eval_udf.cc b/src/cunumeric/vectorize/eval_udf.cc index 92f8cd739f..c10a359694 100644 --- a/src/cunumeric/vectorize/eval_udf.cc +++ b/src/cunumeric/vectorize/eval_udf.cc @@ -15,10 
+15,10 @@ */ #include "cunumeric/vectorize/eval_udf.h" +#include "cunumeric/pitches.h" namespace cunumeric { -using namespace Legion; using namespace legate; struct EvalUdfCPU { @@ -27,29 +27,39 @@ struct EvalUdfCPU { { // In the case of CPU, we pack arguments in a vector and pass them to the // function (through the function pointer geenrated by numba) - using UDF = void(void**, size_t); + using UDF = void(void**, size_t, size_t, uint32_t*, uint32_t*, uint32_t*); auto udf = reinterpret_cast(args.cpu_func_ptr); std::vector udf_args; size_t volume = 1; + Pitches pitches; + Rect rect; + size_t strides[DIM]; if (args.inputs.size()>0){ using VAL = legate_type_of; - auto rect = args.inputs[0].shape(); + rect = args.inputs[0].shape(); + volume = pitches.flatten(rect); if (rect.empty()) return; for (size_t i = 0; i < args.inputs.size(); i++) { if (i < args.num_outputs) { auto out = args.outputs[i].write_accessor(rect); - udf_args.push_back(reinterpret_cast(out.ptr(rect))); + udf_args.push_back(reinterpret_cast(out.ptr(rect, strides))); } else { auto out = args.inputs[i].read_accessor(rect); - udf_args.push_back(reinterpret_cast(const_cast(out.ptr(rect)))); + udf_args.push_back(reinterpret_cast(const_cast(out.ptr(rect,strides)))); } } - volume = rect.volume(); }//if for (auto s: args.scalars) udf_args.push_back(const_cast(s.ptr())); - udf(udf_args.data(), volume); +// udf(udf_args.data(), volume, size_t(DIM),reinterpret_cast(&pitches.data()[0]), +// reinterpret_cast(&rect.lo[0]), reinterpret_cast(&strides[0])); + std::cout<<"IRINA DEBUG pitches = "<( const_cast(pitches.data())), reinterpret_cast(&rect.lo[0]), reinterpret_cast(&strides[0])); + } }; diff --git a/src/cunumeric/vectorize/eval_udf.cu b/src/cunumeric/vectorize/eval_udf.cu index 6a47e85f05..411c2aa423 100644 --- a/src/cunumeric/vectorize/eval_udf.cu +++ b/src/cunumeric/vectorize/eval_udf.cu @@ -156,7 +156,7 @@ struct EvalUdfGPU { context.get_current_processor(), ptx_hash}; size_t dim=1; - if (args.inputs.size()>1){ + if (args.inputs.size()>0){ dim = args.inputs[0].dim() == 0 ? 
1 : args.inputs[0].dim(); double_dispatch(dim, args.inputs[0].code(), EvalUdfGPU{}, args); } From f3a704d6da89477029fff523e2c154a7445e5014 Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Fri, 17 Mar 2023 21:07:15 -0700 Subject: [PATCH 50/78] removed unused argument --- cunumeric/vectorize.py | 8 +------- src/cunumeric/vectorize/eval_udf.cc | 10 ++-------- src/cunumeric/vectorize/eval_udf.cu | 10 ---------- 3 files changed, 3 insertions(+), 25 deletions(-) diff --git a/cunumeric/vectorize.py b/cunumeric/vectorize.py index 722e8ae52e..5629d9403e 100644 --- a/cunumeric/vectorize.py +++ b/cunumeric/vectorize.py @@ -47,7 +47,6 @@ _ARGS_VAR = "__args__" _DIM_VAR = "__dim__" _STRIDES_VAR = "__strides__" -_LO_POINT_VAR = "__lo_point__" _PITCHES_VAR = "__pitches__" @@ -216,7 +215,6 @@ def _build_gpu_function(self) -> Any: + [_SIZE_VAR] + [_DIM_VAR] + [_PITCHES_VAR] - + [_LO_POINT_VAR] + [_STRIDES_VAR] ) @@ -256,7 +254,6 @@ def _lift_to_array_access(m: Any) -> str: # Evaluate the string to get the Python function body = "\n".join(lines) - print("IRINA DEBUG GPU body", body) glbs: Dict[str, Any] = {} six.exec_(body, glbs) return glbs[funcid] @@ -268,7 +265,7 @@ def _build_cpu_function(self) -> Callable[[Any], Any]: lines = ["from numba import carray, types"] # Signature - lines.append("def {}({}, {}, {}, {}, {}, {}):".format(funcid, _ARGS_VAR, _SIZE_VAR, _DIM_VAR, _PITCHES_VAR, _LO_POINT_VAR, _STRIDES_VAR)) + lines.append("def {}({}, {}, {}, {}, {}):".format(funcid, _ARGS_VAR, _SIZE_VAR, _DIM_VAR, _PITCHES_VAR, _STRIDES_VAR)) # Unpack kernel arguments def _emit_assignment( @@ -332,7 +329,6 @@ def _lift_to_array_access(m: Any) -> str: # Evaluate the string to get the Python function body = "\n".join(lines) - print ("IRINA DEBUG body =", body) glbs: Dict[str, Any] = {} six.exec_(body, glbs) return glbs[funcid] @@ -360,7 +356,6 @@ def _compile_func_gpu(self) -> tuple[Any]: + [numba.core.types.uint64] + [numba.core.types.CPointer(numba.core.types.uint64)] + [numba.core.types.CPointer(numba.core.types.uint64)] - + [numba.core.types.CPointer(numba.core.types.uint64)] ) sig = (*arg_types,) @@ -372,7 +367,6 @@ def _compile_func_cpu(self) -> numba.core.ccallback.CFunc: numba.types.CPointer(numba.types.voidptr), numba.core.types.uint64, numba.core.types.uint64, numba.core.types.CPointer(numba.core.types.uint64), - numba.core.types.CPointer(numba.core.types.uint64), numba.core.types.CPointer(numba.core.types.uint64) ) # type: ignore diff --git a/src/cunumeric/vectorize/eval_udf.cc b/src/cunumeric/vectorize/eval_udf.cc index c10a359694..e42978e0e9 100644 --- a/src/cunumeric/vectorize/eval_udf.cc +++ b/src/cunumeric/vectorize/eval_udf.cc @@ -27,7 +27,7 @@ struct EvalUdfCPU { { // In the case of CPU, we pack arguments in a vector and pass them to the // function (through the function pointer geenrated by numba) - using UDF = void(void**, size_t, size_t, uint32_t*, uint32_t*, uint32_t*); + using UDF = void(void**, size_t, size_t, uint32_t*, uint32_t*); auto udf = reinterpret_cast(args.cpu_func_ptr); std::vector udf_args; size_t volume = 1; @@ -52,13 +52,7 @@ struct EvalUdfCPU { }//if for (auto s: args.scalars) udf_args.push_back(const_cast(s.ptr())); -// udf(udf_args.data(), volume, size_t(DIM),reinterpret_cast(&pitches.data()[0]), -// reinterpret_cast(&rect.lo[0]), reinterpret_cast(&strides[0])); - std::cout<<"IRINA DEBUG pitches = "<( const_cast(pitches.data())), reinterpret_cast(&rect.lo[0]), reinterpret_cast(&strides[0])); + udf(udf_args.data(), volume, size_t(DIM), reinterpret_cast( 
const_cast(pitches.data())), reinterpret_cast(&strides[0])); } }; diff --git a/src/cunumeric/vectorize/eval_udf.cu b/src/cunumeric/vectorize/eval_udf.cu index 411c2aa423..7d03af0341 100644 --- a/src/cunumeric/vectorize/eval_udf.cu +++ b/src/cunumeric/vectorize/eval_udf.cu @@ -45,7 +45,6 @@ struct EvalUdfGPU { buffer_size +=sizeof(size_t);//size buffer_size += sizeof(size_t);//dim buffer_size += sizeof(void*);//pitches - buffer_size += sizeof(void*);//lo_point buffer_size += sizeof(void*);//strides std::vector arg_buffer(buffer_size); @@ -84,22 +83,15 @@ struct EvalUdfGPU { //create buffers for pitches, lower point and strides since //we need to pass pointer to device memory auto device_pitches = create_buffer(Point<1>(DIM-1), Memory::Kind::Z_COPY_MEM); - auto device_lo = create_buffer(Point<1>(DIM), Memory::Kind::Z_COPY_MEM); auto device_strides = create_buffer(Point<1>(DIM), Memory::Kind::Z_COPY_MEM); - //std::cout<<"IRINA DEBUG"<(i)]=pitches.data()[i]; - //std::cout<<" pitches ="<(i)]=rect.lo[i]; device_strides[Point<1>(i)] = strides[i]; - //std::cout<<" device_lo = " <(m)`` for - vectorized matrix-vector multiplication. If provided, ``pyfunc`` will - be called with (and expected to return) arrays with shapes given by the - size of corresponding core dimensions. By default, ``pyfunc`` is - assumed to take scalars as input and output. - - Returns - ------- - vectorized : callable - Vectorized function. - - See Also - -------- - numpy.vectorize - - Availability - -------- - Multiple GPUs, Multiple CPUs - """ - def __init__( self, pyfunc: Callable[[Any], Any], @@ -112,12 +52,70 @@ def __init__( cache: bool = False, signature: Optional[str] = None, ) -> None: + """ + vectorize(pyfunc, otypes=None, doc=None, excluded=None, cache=False, + signature=None) + Generalized function class. + Define a vectorized function which takes a nested sequence of + objects or numpy arrays as inputs and returns a single numpy array + or a tuple of numpy arrays. + The vectorized function evaluates `pyfunc` over successive tuples + of the input arrays like the python map function, except it uses the + broadcasting rules of numpy. + The data type of the output of `vectorized` is determined by calling + the function with the first element of the input. This can be avoided + by specifying the `otypes` argument. + + Parameters + ---------- + pyfunc : callable + A python function or method. + otypes : str or list of dtypes, optional + The output data type. It must be specified as either a string of + typecode characters or a list of data type specifiers. There should + be one data type specifier for each output. + WARNING: cuNumeric currently requires all output types to be the + same + doc : str, optional + The docstring for the function. If None, the docstring will be the + ``pyfunc.__doc__``. + excluded : set, optional + Set of strings or integers representing the positional or keyword + arguments for which the function will not be vectorized. + These will be passed directly to `pyfunc` unmodified. + WARNING: cuNumeric doesn't suport this argument at the moment + cache : bool, optional + If `True`, then cache the first function call that generates C fun- + ction or CUDA kernel + signature : string, optional + Generalized universal function signature, e.g., ``(m,n),(n)->(m)`` + for vectorized matrix-vector multiplication. If provided, + ``pyfunc`` will be called with (and expected to return) + arrays with shapes given by the size of corresponding core + dimensions. 
By default, ``pyfunc`` is assumed to take scalars + as input and output. + WARNING: cuNumeric doesn't support this argument at the moment + + Returns + ------- + vectorized : callable + Vectorized function. + + See Also + -------- + numpy.vectorize + + Availability + -------- + Multiple GPUs, Multiple CPUs + """ + self._pyfunc = pyfunc + self._otypes: Optional[tuple[Any]] = None + self._cache: bool = cache self._numba_func: Callable[[Any], Any] self._cpu_func: numba.core.ccallback.CFunc self._gpu_func: tuple[Any] - self._otypes: Optional[tuple[Any]] = None - self._result = None self._args: List[Any] = [] self._scalar_args: List[Any] = [] self._scalar_idxs: List[int] = [] @@ -126,15 +124,7 @@ def __init__( self._kwargs: List[Any] = [] self._context = runtime.legate_context self._created: bool = False - self._cache: bool = cache self._num_outputs = 1 # there is at least 1 output if doc is None: self.__doc__ = pyfunc.__doc__ @@ -143,7 +133,7 @@ def __init__( if otypes is not None: self._num_outputs = len(otypes) - if len(otypes) == 0: + if self._num_outputs == 0: raise ValueError( "There should be at least 1 type specified in otypes" ) @@ -183,7 +173,7 @@ def _get_func_body(self, func: Callable[[Any], Any]) -> list[str]: if func.__doc__ is not None and len(func.__doc__.split("\n")) > 0: lines_to_skip = len(func.__doc__.split("\n")) - lines = inspect.getsourcelines(func)[0] + lines = inspect.getsourcelines(func)[0] # type: ignore return_lines = [] for i in range(lines_to_skip + 1, len(lines)): @@ -206,6 +196,7 @@ def _build_gpu_function(self) -> Any: # Preamble lines = ["from numba import cuda"] + # we add math and numpy so user-defined functions can use them lines.append("import math") lines.append("import numpy") @@ -225,13 +216,16 @@ def _build_gpu_function(self) -> Any: lines.append(" local_i = cuda.grid(1)") lines.append(" if local_i >= {}:".format(_SIZE_VAR)) lines.append(" return") + # we compute the index for sparse data access when using Legion's + # pointer. 
+ # aa[x][y][z]=a[x*strides[0] + y*strides[1] + z*strides[2]] lines.append(" {}:int = 0".format(_LOOP_VAR)) lines.append(" for p in range({}-1):".format(_DIM_VAR)) - #fixme make sure we compute index correct for all data types - lines.append( - " x=int(local_i/{}[p])".format(_PITCHES_VAR)) + # fixme make sure we compute index correct for all data types + lines.append(" x=int(local_i/{}[p])".format(_PITCHES_VAR)) lines.append( - " local_i = int(local_i%{}[p])".format(_PITCHES_VAR)) + " local_i = int(local_i%{}[p])".format(_PITCHES_VAR) + ) lines.append( " {}+=int(x*{}[p])".format(_LOOP_VAR, _STRIDES_VAR) ) @@ -241,7 +235,7 @@ def _build_gpu_function(self) -> Any: ) ) - # Kernel body + # this function is used to replace all array names with array[i] def _lift_to_array_access(m: Any) -> str: return self._replace_name(m.group(0), _LOOP_VAR, True) @@ -251,7 +245,6 @@ def _lift_to_array_access(m: Any) -> str: l_new = re.sub(r"[_a-zA-Z]\w*", _lift_to_array_access, line) lines.append(l_new) - # Evaluate the string to get the Python function body = "\n".join(lines) glbs: Dict[str, Any] = {} @@ -263,15 +256,32 @@ def _build_cpu_function(self) -> Callable[[Any], Any]: # Preamble lines = ["from numba import carray, types"] + # we add math and numpy so user-defined functions can use them + lines.append("import math") + lines.append("import numpy") # Signature - lines.append("def {}({}, {}, {}, {}, {}):".format(funcid, _ARGS_VAR, _SIZE_VAR, _DIM_VAR, _PITCHES_VAR, _STRIDES_VAR)) + lines.append( + "def {}({}, {}, {}, {}, {}):".format( + funcid, + _ARGS_VAR, + _SIZE_VAR, + _DIM_VAR, + _PITCHES_VAR, + _STRIDES_VAR, + ) + ) # Unpack kernel arguments def _emit_assignment( - var: Any, idx: int, sz: Any, ty: np.dtype[Any], scalar=False + var: Any, + idx: int, + sz: Any, + ty: np.dtype[Any], + scalar: bool = False, ) -> None: if scalar: + # we represent scalars as arrays of size 1 lines.append( " {} = carray({}[{}], 1, types.{})".format( var, _ARGS_VAR, idx, ty @@ -284,11 +294,13 @@ def _emit_assignment( ) ) - # get names of arguments + # define pyfunc arguments ar carrays arg_idx = 0 for a in self._args: - type_a= a.dtype - _emit_assignment(self._argnames[arg_idx], arg_idx, _SIZE_VAR, type_a) + type_a = a.dtype + _emit_assignment( + self._argnames[arg_idx], arg_idx, _SIZE_VAR, type_a + ) arg_idx += 1 for a in self._scalar_args: scalar_type = np.dtype(type(a).__name__) @@ -298,31 +310,31 @@ def _emit_assignment( arg_idx += 1 # Main loop - lines.append(" for local_i in range({}):".format( _SIZE_VAR)) + lines.append(" for local_i in range({}):".format(_SIZE_VAR)) + # we compute inndex for sparse data access when using Legion's + # pointer. 
+ # aa[x][y][z]=a[x*strides[0] + y*strides[1] + z*strides[2]] lines.append(" {}:int = 0".format(_LOOP_VAR)) lines.append(" j:int = local_i") lines.append(" for p in range({}-1):".format(_DIM_VAR)) - lines.append(" x=int(j/{}[p])".format( - _PITCHES_VAR - ) - ) - lines.append(" j = int(j%{}[p])".format(_PITCHES_VAR )) + lines.append(" x=int(j/{}[p])".format(_PITCHES_VAR)) + lines.append(" j = int(j%{}[p])".format(_PITCHES_VAR)) - lines.append(" {}+=int(x*{}[p])".format(_LOOP_VAR, _STRIDES_VAR) + lines.append( + " {}+=int(x*{}[p])".format(_LOOP_VAR, _STRIDES_VAR) ) - lines.append(" {}+=int(j*{}[{}-1])".format( + lines.append( + " {}+=int(j*{}[{}-1])".format( _LOOP_VAR, _STRIDES_VAR, _DIM_VAR ) ) - lines_old = self._get_func_body(self._pyfunc) # Kernel body def _lift_to_array_access(m: Any) -> str: return self._replace_name(m.group(0), _LOOP_VAR) - # lines_new = [] for line in lines_old: l_new = re.sub(r"[_a-zA-Z]\w*", _lift_to_array_access, line) lines.append(" " + l_new) @@ -364,14 +376,15 @@ def _compile_func_gpu(self) -> tuple[Any]: def _compile_func_cpu(self) -> numba.core.ccallback.CFunc: sig = numba.core.types.void( - numba.types.CPointer(numba.types.voidptr), numba.core.types.uint64, + numba.types.CPointer(numba.types.voidptr), + numba.core.types.uint64, numba.core.types.uint64, numba.core.types.CPointer(numba.core.types.uint64), - numba.core.types.CPointer(numba.core.types.uint64) - ) # type: ignore + numba.core.types.CPointer(numba.core.types.uint64), + ) return numba.cfunc(sig)(self._numba_func) - + @track_provenance(runtime.legate_context) def _execute(self, is_gpu: bool, num_gpus: int = 0) -> None: if is_gpu and not self._created: @@ -384,22 +397,21 @@ def _execute(self, is_gpu: bool, num_gpus: int = 0) -> None: ptx_hash = hash(self._gpu_func[0]) kernel_task.add_scalar_arg(ptx_hash, ty.int64) kernel_task.add_scalar_arg(self._gpu_func[0], ty.string) - #added to introduce dependency between this and EVAL_UDF task - #kernel_task.add_input(self._created_array_deferred.base) - #kernel_task.add_output(self._created_array_deferred.base) kernel_task.execute() + # we want to make sure EVAL_UDF function is not executed before + # CUDA kernel is created self._context.issue_execution_fence(block=True) - # inline map first element of the array to make sure the CREATE_CU_KERNEL - # task has finished by the time we set self._created to True + # task has finished by the time we set self._created to True if self._cache: - #self._created = bool(self._created_array[0]) self._created = True task = self._context.create_auto_task(CuNumericOpCode.EVAL_UDF) - task.add_scalar_arg(self._num_outputs, ty.uint32) # N of outputs - task.add_scalar_arg(len(self._scalar_args), ty.uint32) # N of scalar_args - # add all scalars + task.add_scalar_arg(self._num_outputs, ty.uint32) # N of outputs + task.add_scalar_arg( + len(self._scalar_args), ty.uint32 + ) # N of scalar_args + # add all scalars for a in self._scalar_args: dtype = convert_to_cunumeric_dtype(type(a).__name__) task.add_scalar_arg(a, dtype) @@ -409,21 +421,16 @@ def _execute(self, is_gpu: bool, num_gpus: int = 0) -> None: a0 = runtime.to_deferred_array(a0) for count, a in enumerate(self._args): a_tmp = runtime.to_deferred_array(a._thunk) - a_tmp = a_tmp.base - task.add_input(a_tmp) + a_tmp_base = a_tmp.base + task.add_input(a_tmp_base) if count < self._num_outputs: - task.add_output(a_tmp) + task.add_output(a_tmp_base) if count != 0: - task.add_alignment(a0.base, a_tmp) + task.add_alignment(a0.base, a_tmp_base) if is_gpu: ptx_hash = 
hash(self._gpu_func[0]) task.add_scalar_arg(ptx_hash, ty.int64) - # passing the _created * array to introduce dependency between - # CREATE_CU_KERNEL task and EVAL_UDF task - #task.add_input(self._created_array_deferred.base) - #task.add_broadcast(self._created_array_deferred.base) - else: task.add_scalar_arg( self._cpu_func.address, ty.uint64 @@ -435,8 +442,6 @@ def __call__(self, *args: Any, **kwargs: Any) -> None: Return arrays with the results of `pyfunc` broadcast (vectorized) over `args` and `kwargs` not in `excluded`. """ - # profiler = cProfile.Profile() - # profiler.enable() if not self._created: self._scalar_args.clear() self._scalar_idxs.clear() @@ -455,8 +460,8 @@ def __call__(self, *args: Any, **kwargs: Any) -> None: self._scalar_idxs.append(i) else: self._args.append(convert_to_cunumeric_ndarray(arg)) - - # first fill arrays to argnames, then scalars: + + # first fill arrays to argnames, then scalars: for i, k in enumerate(inspect.signature(self._pyfunc).parameters): if not (i in self._scalar_idxs): self._argnames.append(k) @@ -515,4 +520,3 @@ def __call__(self, *args: Any, **kwargs: Any) -> None: if self._cache: self._created = True self._execute(False) - diff --git a/docs/cunumeric/source/api/_vectorize.rst b/docs/cunumeric/source/api/_vectorize.rst new file mode 100644 index 0000000000..048e8ab51d --- /dev/null +++ b/docs/cunumeric/source/api/_vectorize.rst @@ -0,0 +1,13 @@ +cunumeric.vectorize +============================= + +.. currentmodule:: cunumeric + +.. autoclass:: vectorize + + .. automethod:: __init__ + + .. rubric:: Methods + + .. autosummary:: __call__ + diff --git a/docs/cunumeric/source/api/vectorize.rst b/docs/cunumeric/source/api/vectorize.rst new file mode 100644 index 0000000000..5e6cbea66b --- /dev/null +++ b/docs/cunumeric/source/api/vectorize.rst @@ -0,0 +1,15 @@ +.. module:: cunumeric.vectorize + +========================================= + +Vectorize +----------------- + +.. 
toctree:: + :maxdepth: 2 + :hidden: + + _vectorize + + + diff --git a/src/cunumeric/cuda_help.h b/src/cunumeric/cuda_help.h index 8ebe3c809c..eb90ab6da2 100644 --- a/src/cunumeric/cuda_help.h +++ b/src/cunumeric/cuda_help.h @@ -393,7 +393,7 @@ __device__ __forceinline__ void store_streaming(double* ptr, double valu asm volatile("st.global.cs.f64 [%0], %1;" : : "l"(ptr), "d"(value) : "memory"); } #if 0 -#include +#include class JITKernelStorage { @@ -429,5 +429,5 @@ class JITKernelStorage } };//class JITKernelStorage -#endif +#endif } // namespace cunumeric diff --git a/src/cunumeric/cudalibs.cu b/src/cunumeric/cudalibs.cu index 45d33d752f..f1e3a95be1 100644 --- a/src/cunumeric/cudalibs.cu +++ b/src/cunumeric/cudalibs.cu @@ -233,17 +233,16 @@ cufftContext CUDALibraries::get_cufft_plan(cufftType type, const DomainPoint& si return cufftContext(cache->get_cufft_plan(size)); } -void CUDALibraries::store_udf_func(size_t hash, CUfunction func){ - udf_caches_[hash]=func; -} +void CUDALibraries::store_udf_func(size_t hash, CUfunction func) { udf_caches_[hash] = func; } -CUfunction CUDALibraries::get_udf_func(size_t hash){ - auto finder = udf_caches_.find(hash); - if (udf_caches_.end() == finder) { - fprintf(stderr, "UDF function wasn't generated yet"); - LEGATE_ABORT; - } - return udf_caches_[hash]; +CUfunction CUDALibraries::get_udf_func(size_t hash) +{ + auto finder = udf_caches_.find(hash); + if (udf_caches_.end() == finder) { + fprintf(stderr, "UDF function wasn't generated yet"); + LEGATE_ABORT; + } + return udf_caches_[hash]; } static CUDALibraries& get_cuda_libraries(legate::Processor proc) @@ -291,13 +290,15 @@ cufftContext get_cufft_plan(cufftType type, const DomainPoint& size) return lib.get_cufft_plan(type, size); } -void store_udf(size_t hash, CUfunction func){ - const auto proc = legate::Processor::get_executing_processor(); +void store_udf(size_t hash, CUfunction func) +{ + const auto proc = legate::Processor::get_executing_processor(); auto& lib = get_cuda_libraries(proc); lib.store_udf_func(hash, func); } -CUfunction get_udf(size_t hash){ +CUfunction get_udf(size_t hash) +{ const auto proc = legate::Processor::get_executing_processor(); auto& lib = get_cuda_libraries(proc); return lib.get_udf_func(hash); diff --git a/src/cunumeric/vectorize/create_cu_kernel.cu b/src/cunumeric/vectorize/create_cu_kernel.cu index d3f00091cc..ce5b70a432 100644 --- a/src/cunumeric/vectorize/create_cu_kernel.cu +++ b/src/cunumeric/vectorize/create_cu_kernel.cu @@ -25,81 +25,79 @@ using namespace Legion; using namespace legate; __global__ static void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM) -fill_out_kernel(const AccessorRD,true,1> out) + fill_out_kernel(const AccessorRD, true, 1> out) { const int idx = (blockIdx.x * blockDim.x + threadIdx.x); - if (idx >0) return; + if (idx > 0) return; out.reduce(0, true); } /*static*/ void CreateCUKernelTask::gpu_variant(TaskContext& context) { - int64_t ptx_hash = context.scalars()[0].value(); - std::string ptx = context.scalars()[1].value(); - Processor point = context.get_current_processor(); + std::string ptx = context.scalars()[1].value(); + Processor point = context.get_current_processor(); CUfunction func; - const unsigned num_options = 4; - const size_t log_buffer_size = 16384; - std::vector log_info_buffer(log_buffer_size); - std::vector log_error_buffer(log_buffer_size); - CUjit_option jit_options[] = { - CU_JIT_INFO_LOG_BUFFER, - CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES, - CU_JIT_ERROR_LOG_BUFFER, - CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES, - }; - void* 
option_vals[] = { - static_cast(log_info_buffer.data()), - reinterpret_cast(log_buffer_size), - static_cast(log_error_buffer.data()), - reinterpret_cast(log_buffer_size), - }; + const unsigned num_options = 4; + const size_t log_buffer_size = 16384; + std::vector log_info_buffer(log_buffer_size); + std::vector log_error_buffer(log_buffer_size); + CUjit_option jit_options[] = { + CU_JIT_INFO_LOG_BUFFER, + CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES, + CU_JIT_ERROR_LOG_BUFFER, + CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES, + }; + void* option_vals[] = { + static_cast(log_info_buffer.data()), + reinterpret_cast(log_buffer_size), + static_cast(log_error_buffer.data()), + reinterpret_cast(log_buffer_size), + }; - CUmodule module; - CUresult result = - cuModuleLoadDataEx(&module, ptx.data(), num_options, jit_options, option_vals); - if (result != CUDA_SUCCESS) { - if (result == CUDA_ERROR_OPERATING_SYSTEM) { - fprintf(stderr, - "ERROR: Device side asserts are not supported by the " - "CUDA driver for MAC OSX, see NVBugs 1628896.\n"); - exit(-1); - } else if (result == CUDA_ERROR_NO_BINARY_FOR_GPU) { - fprintf(stderr, "ERROR: The binary was compiled for the wrong GPU architecture.\n"); - exit(-1); - } else { - fprintf(stderr, "Failed to load CUDA module! Error log: %s\n", log_error_buffer.data()); + CUmodule module; + CUresult result = cuModuleLoadDataEx(&module, ptx.data(), num_options, jit_options, option_vals); + if (result != CUDA_SUCCESS) { + if (result == CUDA_ERROR_OPERATING_SYSTEM) { + fprintf(stderr, + "ERROR: Device side asserts are not supported by the " + "CUDA driver for MAC OSX, see NVBugs 1628896.\n"); + exit(-1); + } else if (result == CUDA_ERROR_NO_BINARY_FOR_GPU) { + fprintf(stderr, "ERROR: The binary was compiled for the wrong GPU architecture.\n"); + exit(-1); + } else { + fprintf(stderr, "Failed to load CUDA module! 
Error log: %s\n", log_error_buffer.data()); #if CUDA_VERSION >= 6050 - const char *name, *str; - assert(cuGetErrorName(result, &name) == CUDA_SUCCESS); - assert(cuGetErrorString(result, &str) == CUDA_SUCCESS); - fprintf(stderr, "CU: cuModuleLoadDataEx = %d (%s): %s\n", result, name, str); + const char *name, *str; + assert(cuGetErrorName(result, &name) == CUDA_SUCCESS); + assert(cuGetErrorString(result, &str) == CUDA_SUCCESS); + fprintf(stderr, "CU: cuModuleLoadDataEx = %d (%s): %s\n", result, name, str); #else - fprintf(stderr, "CU: cuModuleLoadDataEx = %d\n", result); + fprintf(stderr, "CU: cuModuleLoadDataEx = %d\n", result); #endif - exit(-1); - } + exit(-1); } - std::cmatch line_match; - bool match = - std::regex_search(ptx.data(), line_match, std::regex(".visible .entry [_a-zA-Z0-9$]+")); + } + std::cmatch line_match; + bool match = + std::regex_search(ptx.data(), line_match, std::regex(".visible .entry [_a-zA-Z0-9$]+")); #ifdef DEBUG_CUNUMERIC - assert(match); + assert(match); #endif - const auto& matched_line = line_match.begin()->str(); - auto fun_name = matched_line.substr(matched_line.rfind(" ") + 1, matched_line.size()); + const auto& matched_line = line_match.begin()->str(); + auto fun_name = matched_line.substr(matched_line.rfind(" ") + 1, matched_line.size()); - result = cuModuleGetFunction(&func, module, fun_name.c_str()); + result = cuModuleGetFunction(&func, module, fun_name.c_str()); #ifdef DEBUG_CUNUMERIC - assert(result == CUDA_SUCCESS); + assert(result == CUDA_SUCCESS); #endif - store_udf(ptx_hash, func); - //auto stream = get_cached_stream(); - //auto out = context.reductions()[0].reduce_accessor, true, 1>(); - //fill_out_kernel<<<1,1,0,stream>>>(out); - //CHECK_CUDA_STREAM(stream); + store_udf(ptx_hash, func); + // auto stream = get_cached_stream(); + // auto out = context.reductions()[0].reduce_accessor, true, 1>(); + // fill_out_kernel<<<1,1,0,stream>>>(out); + // CHECK_CUDA_STREAM(stream); } } // namespace cunumeric diff --git a/src/cunumeric/vectorize/eval_udf.cc b/src/cunumeric/vectorize/eval_udf.cc index e42978e0e9..fe2c20bcc0 100644 --- a/src/cunumeric/vectorize/eval_udf.cc +++ b/src/cunumeric/vectorize/eval_udf.cc @@ -34,10 +34,10 @@ struct EvalUdfCPU { Pitches pitches; Rect rect; size_t strides[DIM]; - if (args.inputs.size()>0){ + if (args.inputs.size() > 0) { using VAL = legate_type_of; - rect = args.inputs[0].shape(); - volume = pitches.flatten(rect); + rect = args.inputs[0].shape(); + volume = pitches.flatten(rect); if (rect.empty()) return; for (size_t i = 0; i < args.inputs.size(); i++) { @@ -46,14 +46,16 @@ struct EvalUdfCPU { udf_args.push_back(reinterpret_cast(out.ptr(rect, strides))); } else { auto out = args.inputs[i].read_accessor(rect); - udf_args.push_back(reinterpret_cast(const_cast(out.ptr(rect,strides)))); + udf_args.push_back(reinterpret_cast(const_cast(out.ptr(rect, strides)))); } } - }//if - for (auto s: args.scalars) - udf_args.push_back(const_cast(s.ptr())); - udf(udf_args.data(), volume, size_t(DIM), reinterpret_cast( const_cast(pitches.data())), reinterpret_cast(&strides[0])); - + } // if + for (auto s : args.scalars) udf_args.push_back(const_cast(s.ptr())); + udf(udf_args.data(), + volume, + size_t(DIM), + reinterpret_cast(const_cast(pitches.data())), + reinterpret_cast(&strides[0])); } }; @@ -61,25 +63,23 @@ struct EvalUdfCPU { { uint32_t num_outputs = context.scalars()[0].value(); uint32_t num_scalars = context.scalars()[1].value(); - std::vectorscalars; - for (size_t i=2; i<(2+num_scalars); i++) - 
scalars.push_back(context.scalars()[i]); + std::vector scalars; + for (size_t i = 2; i < (2 + num_scalars); i++) scalars.push_back(context.scalars()[i]); - EvalUdfArgs args{context.scalars()[2+num_scalars].value(), + EvalUdfArgs args{context.scalars()[2 + num_scalars].value(), context.inputs(), context.outputs(), scalars, num_outputs, context.get_current_processor()}; - size_t dim=1; - if (args.inputs.size()>0){ + size_t dim = 1; + if (args.inputs.size() > 0) { dim = args.inputs[0].dim() == 0 ? 1 : args.inputs[0].dim(); double_dispatch(dim, args.inputs[0].code(), EvalUdfCPU{}, args); + } else { + // FIXME + double_dispatch(dim, args.inputs[0].code(), EvalUdfCPU{}, args); } - else{ - //FIXME - double_dispatch(dim, args.inputs[0].code() , EvalUdfCPU{}, args); - } } namespace // unnamed diff --git a/src/cunumeric/vectorize/eval_udf.cu b/src/cunumeric/vectorize/eval_udf.cu index 7d03af0341..f0a7844dc4 100644 --- a/src/cunumeric/vectorize/eval_udf.cu +++ b/src/cunumeric/vectorize/eval_udf.cu @@ -22,38 +22,38 @@ namespace cunumeric { -//using namespace Legion; +// using namespace Legion; using namespace legate; struct EvalUdfGPU { template void operator()(EvalUdfArgs& args) const { - using VAL = legate_type_of; - Rect rect; + using VAL = legate_type_of; + Rect rect; - // size_t input_size=args.inputs.size()-1; - size_t input_size=args.inputs.size(); - // auto procs_rect = args.inputs[input_size].shape<1>(); + // size_t input_size=args.inputs.size()-1; + size_t input_size = args.inputs.size(); + // auto procs_rect = args.inputs[input_size].shape<1>(); - //auto procs=args.inputs[input_size].read_accessor(); - //auto funcs=args.inputs[input_size+1].read_accessor(); - CUfunction func = get_udf(args.hash); + // auto procs=args.inputs[input_size].read_accessor(); + // auto funcs=args.inputs[input_size+1].read_accessor(); + CUfunction func = get_udf(args.hash); // Filling up the buffer with arguments - size_t buffer_size = (input_size+args.scalars.size()) * sizeof(void*); - buffer_size +=sizeof(size_t);//size - buffer_size += sizeof(size_t);//dim - buffer_size += sizeof(void*);//pitches - buffer_size += sizeof(void*);//strides + size_t buffer_size = (input_size + args.scalars.size()) * sizeof(void*); + buffer_size += sizeof(size_t); // size + buffer_size += sizeof(size_t); // dim + buffer_size += sizeof(void*); // pitches + buffer_size += sizeof(void*); // strides std::vector arg_buffer(buffer_size); char* raw_arg_buffer = arg_buffer.data(); auto p = raw_arg_buffer; size_t strides[DIM]; - size_t size =1; - if (input_size>0){ + size_t size = 1; + if (input_size > 0) { rect = args.inputs[0].shape(); size = rect.volume(); for (size_t i = 0; i < input_size; i++) { @@ -67,34 +67,31 @@ struct EvalUdfGPU { p += sizeof(void*); } } - for (auto scalar: args.scalars){ - memcpy(p, scalar.ptr(), scalar.size()); - p += scalar.size(); - // *reinterpret_cast(p) =s; - //p += sizeof(void*); - } + for (auto scalar : args.scalars) { + memcpy(p, scalar.ptr(), scalar.size()); + p += scalar.size(); + // *reinterpret_cast(p) =s; + // p += sizeof(void*); + } memcpy(p, &size, sizeof(size_t)); - size_t dim=DIM; + size_t dim = DIM; p += sizeof(size_t); memcpy(p, &dim, sizeof(size_t)); p += sizeof(size_t); Pitches pitches; size_t volume = pitches.flatten(rect); - //create buffers for pitches, lower point and strides since - //we need to pass pointer to device memory - auto device_pitches = create_buffer(Point<1>(DIM-1), Memory::Kind::Z_COPY_MEM); - auto device_strides = create_buffer(Point<1>(DIM), Memory::Kind::Z_COPY_MEM); - for 
(size_t i=0; i(i)]=pitches.data()[i]; - } + // create buffers for pitches, lower point and strides since + // we need to pass pointer to device memory + auto device_pitches = create_buffer(Point<1>(DIM - 1), Memory::Kind::Z_COPY_MEM); + auto device_strides = create_buffer(Point<1>(DIM), Memory::Kind::Z_COPY_MEM); + for (size_t i = 0; i < DIM; i++) { + if (i != DIM - 1) { device_pitches[Point<1>(i)] = pitches.data()[i]; } device_strides[Point<1>(i)] = strides[i]; } - *reinterpret_cast(p) =device_pitches.ptr(Point<1>(0)); + *reinterpret_cast(p) = device_pitches.ptr(Point<1>(0)); p += sizeof(void*); - *reinterpret_cast(p) =device_strides.ptr(Point<1>(0)); + *reinterpret_cast(p) = device_strides.ptr(Point<1>(0)); p += sizeof(void*); - void* config[] = { CU_LAUNCH_PARAM_BUFFER_POINTER, @@ -127,16 +124,13 @@ struct EvalUdfGPU { /*static*/ void EvalUdfTask::gpu_variant(TaskContext& context) { - uint32_t num_outputs = context.scalars()[0].value(); uint32_t num_scalars = context.scalars()[1].value(); - std::vectorscalars; - for (size_t i=2; i<(2+num_scalars); i++) - scalars.push_back(context.scalars()[i]); - - int64_t ptx_hash = context.scalars()[2+num_scalars].value(); - //bool is_created = context.scalars()[3+num_scalars].value(); + std::vector scalars; + for (size_t i = 2; i < (2 + num_scalars); i++) scalars.push_back(context.scalars()[i]); + int64_t ptx_hash = context.scalars()[2 + num_scalars].value(); + // bool is_created = context.scalars()[3+num_scalars].value(); EvalUdfArgs args{0, context.inputs(), @@ -145,15 +139,14 @@ struct EvalUdfGPU { num_outputs, context.get_current_processor(), ptx_hash}; - size_t dim=1; - if (args.inputs.size()>0){ + size_t dim = 1; + if (args.inputs.size() > 0) { dim = args.inputs[0].dim() == 0 ? 1 : args.inputs[0].dim(); double_dispatch(dim, args.inputs[0].code(), EvalUdfGPU{}, args); - } - else{ - //FIXME + } else { + // FIXME double_dispatch(dim, args.inputs[0].code(), EvalUdfGPU{}, args); - //double_dispatch(dim, 0 , EvalUdfGPU{}, args); + // double_dispatch(dim, 0 , EvalUdfGPU{}, args); } } } // namespace cunumeric diff --git a/src/cunumeric/vectorize/eval_udf.h b/src/cunumeric/vectorize/eval_udf.h index 2e14f0dacc..784e2334b1 100644 --- a/src/cunumeric/vectorize/eval_udf.h +++ b/src/cunumeric/vectorize/eval_udf.h @@ -25,10 +25,10 @@ struct EvalUdfArgs { uint64_t cpu_func_ptr; std::vector& inputs; std::vector& outputs; - std::vectorscalars; + std::vector scalars; uint32_t num_outputs; Legion::Processor point; - int64_t hash=0; + int64_t hash = 0; }; class EvalUdfTask : public CuNumericTask { diff --git a/src/cunumeric/vectorize/eval_udf_template.inl b/src/cunumeric/vectorize/eval_udf_template.inl index c597e44a14..c0bc72c428 100644 --- a/src/cunumeric/vectorize/eval_udf_template.inl +++ b/src/cunumeric/vectorize/eval_udf_template.inl @@ -38,7 +38,7 @@ struct EvalUdfImpl { std::vector udf_args; using VAL = legate_type_of; auto rect = args.args[0].shape(); - + size_t strides[DIM]; if (rect.empty()) return; @@ -46,8 +46,8 @@ struct EvalUdfImpl { for (size_t i = 0; i < args.args.size(); i++) { auto out = args.args[i].write_accessor(rect); udf_args.push_back(reinterpret_cast(out.ptr(rect, strides))); - for (size_t i=0; i 0: type_a = self._args[0].dtype diff --git a/src/cunumeric/vectorize/create_cu_kernel.cu b/src/cunumeric/vectorize/create_cu_kernel.cu index ce5b70a432..dde9616543 100644 --- a/src/cunumeric/vectorize/create_cu_kernel.cu +++ b/src/cunumeric/vectorize/create_cu_kernel.cu @@ -94,10 +94,6 @@ __global__ static void 
__launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM) assert(result == CUDA_SUCCESS); #endif store_udf(ptx_hash, func); - // auto stream = get_cached_stream(); - // auto out = context.reductions()[0].reduce_accessor, true, 1>(); - // fill_out_kernel<<<1,1,0,stream>>>(out); - // CHECK_CUDA_STREAM(stream); } } // namespace cunumeric diff --git a/src/cunumeric/vectorize/eval_udf.cc b/src/cunumeric/vectorize/eval_udf.cc index fe2c20bcc0..5a40260a1d 100644 --- a/src/cunumeric/vectorize/eval_udf.cc +++ b/src/cunumeric/vectorize/eval_udf.cc @@ -26,7 +26,7 @@ struct EvalUdfCPU { void operator()(EvalUdfArgs& args) const { // In the case of CPU, we pack arguments in a vector and pass them to the - // function (through the function pointer geenrated by numba) + // function (through the function pointer generated by numba) using UDF = void(void**, size_t, size_t, uint32_t*, uint32_t*); auto udf = reinterpret_cast(args.cpu_func_ptr); std::vector udf_args; @@ -77,8 +77,8 @@ struct EvalUdfCPU { dim = args.inputs[0].dim() == 0 ? 1 : args.inputs[0].dim(); double_dispatch(dim, args.inputs[0].code(), EvalUdfCPU{}, args); } else { - // FIXME - double_dispatch(dim, args.inputs[0].code(), EvalUdfCPU{}, args); + LegateTypeCode code = LegateTypeCode::BOOL_LT ; + double_dispatch(dim, code, EvalUdfCPU{}, args); } } diff --git a/src/cunumeric/vectorize/eval_udf.cu b/src/cunumeric/vectorize/eval_udf.cu index f0a7844dc4..ebe3e0730e 100644 --- a/src/cunumeric/vectorize/eval_udf.cu +++ b/src/cunumeric/vectorize/eval_udf.cu @@ -32,15 +32,10 @@ struct EvalUdfGPU { using VAL = legate_type_of; Rect rect; - // size_t input_size=args.inputs.size()-1; size_t input_size = args.inputs.size(); - // auto procs_rect = args.inputs[input_size].shape<1>(); - - // auto procs=args.inputs[input_size].read_accessor(); - // auto funcs=args.inputs[input_size+1].read_accessor(); CUfunction func = get_udf(args.hash); - // Filling up the buffer with arguments + // Filling up the buffer with arguments size_t buffer_size = (input_size + args.scalars.size()) * sizeof(void*); buffer_size += sizeof(size_t); // size buffer_size += sizeof(size_t); // dim @@ -70,8 +65,6 @@ struct EvalUdfGPU { for (auto scalar : args.scalars) { memcpy(p, scalar.ptr(), scalar.size()); p += scalar.size(); - // *reinterpret_cast(p) =s; - // p += sizeof(void*); } memcpy(p, &size, sizeof(size_t)); size_t dim = DIM; @@ -80,7 +73,7 @@ struct EvalUdfGPU { p += sizeof(size_t); Pitches pitches; size_t volume = pitches.flatten(rect); - // create buffers for pitches, lower point and strides since + // create buffers for pitches and strides since // we need to pass pointer to device memory auto device_pitches = create_buffer(Point<1>(DIM - 1), Memory::Kind::Z_COPY_MEM); auto device_strides = create_buffer(Point<1>(DIM), Memory::Kind::Z_COPY_MEM); @@ -130,7 +123,6 @@ struct EvalUdfGPU { for (size_t i = 2; i < (2 + num_scalars); i++) scalars.push_back(context.scalars()[i]); int64_t ptx_hash = context.scalars()[2 + num_scalars].value(); - // bool is_created = context.scalars()[3+num_scalars].value(); EvalUdfArgs args{0, context.inputs(), @@ -144,9 +136,8 @@ struct EvalUdfGPU { dim = args.inputs[0].dim() == 0 ? 
1 : args.inputs[0].dim(); double_dispatch(dim, args.inputs[0].code(), EvalUdfGPU{}, args); } else { - // FIXME - double_dispatch(dim, args.inputs[0].code(), EvalUdfGPU{}, args); - // double_dispatch(dim, 0 , EvalUdfGPU{}, args); + LegateTypeCode code = LegateTypeCode::BOOL_LT ; + double_dispatch(dim, code, EvalUdfGPU{}, args); } } } // namespace cunumeric From 725e223e25b067c3d4f5889803d76ee025293731 Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Tue, 21 Mar 2023 09:58:05 -0700 Subject: [PATCH 53/78] parsing python function for return arguments --- cunumeric/vectorize.py | 50 ++++++++++++++++------------- tests/integration/test_vectorize.py | 34 ++++++++++++++++---- 2 files changed, 55 insertions(+), 29 deletions(-) diff --git a/cunumeric/vectorize.py b/cunumeric/vectorize.py index 14b672dd8e..3f3f802e4b 100644 --- a/cunumeric/vectorize.py +++ b/cunumeric/vectorize.py @@ -15,6 +15,7 @@ import inspect import re +import typing # numba typing from typing import Any, Callable, Dict, List, Optional, Union @@ -124,25 +125,27 @@ def __init__( self._kwargs: List[Any] = [] self._context = runtime.legate_context self._created: bool = False - self._num_outputs = 1 # there is at least 1 output + self._func_body: List[str]=[] if doc is None: self.__doc__ = pyfunc.__doc__ else: self.__doc__ = doc + self._return_arguments = self._get_return_argumets() + self._num_outputs = len(self._return_arguments) + if otypes is not None: - self._num_outputs = len(otypes) - if self._num_outputs == 0: - raise ValueError( - "There should be at least 1 type specified in otypes" - ) - type0 = otypes[0] - for t in otypes: - if t != type0: - raise NotImplementedError( - "cuNumeric doesn't support variable types in otypes" - ) + if self._num_outputs !=len(otypes): + raise ValueError("number of types in otypes is not consistente" + " with the number of return values difened in pyfunc") + if len(otypes)>1: + for t in otypes: + if t != otypes[0]: + raise NotImplementedError( + "cuNumeric doesn't support variable types in otypes" + ) + # FIXME if excluded is not None: @@ -156,14 +159,6 @@ def __init__( "signature variable is not supported yet" ) - # FIXME check return of the user function - # return annotation (we supprt only void) - - # if inspect.signature(self._pyfunc).return_annotation() - # != inspect._empty: - # raise NotImplementedError( - # "user defined functions can't have a return" - # ) def _get_func_body(self, func: Callable[[Any], Any]) -> list[str]: """Using the magic method __doc__, we KNOW the size of the docstring. 
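The next hunk adds return-name parsing for the user function. A minimal standalone sketch of that idea, assuming the function source is reachable via inspect.getsource (the helper name and the plain string scan below are illustrative only, not the patched _get_return_argumets, which reuses the body already recovered by _get_func_body):

    import inspect

    def sketch_return_names(pyfunc):
        # Scan the source for the first `return` statement and split the
        # returned expression on commas to recover the result names.
        # Deliberately simplified: no AST handling, no nested functions.
        for line in inspect.getsource(pyfunc).splitlines():
            stripped = line.strip()
            if stripped == "return" or stripped.startswith("return "):
                rest = stripped[len("return"):].strip()
                return [n.strip() for n in rest.split(",")] if rest else []
        return []

    def my_func2(A0, B0):
        A0 = A0 * 2 + B0
        C0 = A0 * 2
        return A0, C0

    assert sketch_return_names(my_func2) == ["A0", "C0"]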
@@ -180,6 +175,17 @@ def _get_func_body(self, func: Callable[[Any], Any]) -> list[str]: return_lines.append(lines[i].rstrip()) return return_lines + def _get_return_argumets(self)->list[str]: + self._func_body = self._get_func_body(self._pyfunc) + return_names = [] + for l in self._func_body: + if "return" in l: + l = l.replace("return", '') + l=l.replace(" ",'') + return_names = l.split(",") + return return_names + + def _replace_name( self, name: str, _LOOP_VAR: str, is_gpu: bool = False ) -> str: @@ -240,7 +246,7 @@ def _lift_to_array_access(m: Any) -> str: return self._replace_name(m.group(0), _LOOP_VAR, True) # kernel body - lines_old = self._get_func_body(self._pyfunc) + lines_old = self._func_body for line in lines_old: l_new = re.sub(r"[_a-zA-Z]\w*", _lift_to_array_access, line) lines.append(l_new) @@ -329,7 +335,7 @@ def _emit_assignment( ) ) - lines_old = self._get_func_body(self._pyfunc) + lines_old = self._func_body # Kernel body def _lift_to_array_access(m: Any) -> str: diff --git a/tests/integration/test_vectorize.py b/tests/integration/test_vectorize.py index 520267ab76..212c4f4ac0 100644 --- a/tests/integration/test_vectorize.py +++ b/tests/integration/test_vectorize.py @@ -23,33 +23,54 @@ def my_func(a, b): a = a * 2 + b + return a def my_func_np(a, b): a = a * 2 + b return a - +#Capital letters and numbers in the signature def my_func2(A0, B0): A0 = A0 * 2 + B0 + return A0 def my_func_np2(A0, B0): A0 = A0 * 2 + B0 return A0 +def empty_func(): + print("within empty function") + def test_vectorize(): + #2 arrays func = num.vectorize(my_func) a = num.arange(5) b = num.ones((5,)) func(a, b) assert(np.array_equal(a, [1,3,5,7,9])) + #array and scalar func = num.vectorize(my_func) a= num.arange(5) b=2 func(a,b) assert(np.array_equal(a, [2,4,6,8,10])) + #2 scalars + #FIXME + #func = num.vectorize(my_func) + #a=3 + #b=2 + #func(a,b) + #assert(a ==8) + + #empty function + func = num.vectorize(empty_func) + func() + + #slices + func = num.vectorize(my_func) num.vectorize(my_func) a=num.array([[1,2,3],[4,5,6],[7,8,9]]) b=num.array([[10,11,12],[13,14,15],[16,17,18]]) @@ -57,44 +78,43 @@ def test_vectorize(): a=np.arange(100).reshape((25,4)) a_num= num.array(a) - b=a*10 b_num=a_num*10 func_np = np.vectorize(my_func_np) func_num=num.vectorize(my_func) - a=func_np(a,b) func_num(a_num, b_num) assert np.array_equal(a, a_num) + #reusing the same function for different inputs a[:,2]=func_np(a[:, 2], b[:,2]) func_num(a_num[:,2],b_num[:,2]) assert np.array_equal(a, a_num) + #reusing the same function for different inputs a[5:10,2]=func_np(a[5:10, 2], b[1:6,2]) func_num(a_num[5:10,2],b_num[1:6,2]) assert np.array_equal(a, a_num) + #reusing the same function for different inputs a[15:20]=func_np(a[15:20], b[15:20]) func_num(a_num[15:20],b_num[15:20]) assert np.array_equal(a, a_num) + # reusing the same function for different inputs a=np.arange(1000).reshape((25,10,4)) a_num= num.array(a) - a[:, 2, :] = func_np(a[:, 2, :],2) func_num(a_num[:, 2, :],2) assert np.array_equal(a, a_num) + #checking signature with capital letters and numbers a=np.arange(100).reshape((25,4)) a_num= num.array(a) - b=a*10 b_num=a_num*10 - func_np = np.vectorize(my_func_np2) func_num=num.vectorize(my_func2) - a=func_np(a,b) func_num(a_num, b_num) assert np.array_equal(a, a_num) From 905fa95c5e12ec1928a49e3f6dec97f6915ac6da Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Tue, 21 Mar 2023 11:11:30 -0700 Subject: [PATCH 54/78] removing dependency on six and some clean-up --- cunumeric/vectorize.py | 34 
++++++++++++++++++++-------------- 1 file changed, 20 insertions(+), 14 deletions(-) diff --git a/cunumeric/vectorize.py b/cunumeric/vectorize.py index 3f3f802e4b..996326c4e7 100644 --- a/cunumeric/vectorize.py +++ b/cunumeric/vectorize.py @@ -24,7 +24,7 @@ import numba import numba.core.ccallback import numpy as np -import six +#import six from legate.core import Rect, track_provenance from cunumeric.runtime import runtime @@ -60,12 +60,13 @@ def __init__( Define a vectorized function which takes a nested sequence of objects or numpy arrays as inputs and returns a single numpy array or a tuple of numpy arrays. - The vectorized function evaluates `pyfunc` over successive tuples - of the input arrays like the python map function, except it uses the - broadcasting rules of numpy. + User defined pyfunction will be executed in a single cuNumeric task + over a set of arguments. The data type of the output of `vectorized` is determined by calling the function with the first element of the input. This can be avoided by specifying the `otypes` argument. + WARNING: when running with OpenMP back-end, "vectorize" will fall-back + to the serial CPU implementation Parameters ---------- @@ -87,7 +88,12 @@ def __init__( WARNING: cuNumeric doesn't suport this argument at the moment cache : bool, optional If `True`, then cache the first function call that generates C fun- - ction or CUDA kernel + ction or CUDA kernel. We recomment enabling caching in cuNumeric + for better performance, when possible. + Warning: in the case when cache=True, cuNumeric will parse function + signature and create C function or CUDA kernel only once. This + means that types of arguments passed to the vectorized function + (arrays, scalars etc) should be the same each time we call it. signature : string, optional Generalized universal function signature, e.g., ``(m,n),(n)->(m)`` for vectorized matrix-vector multiplication. If provided, @@ -132,8 +138,8 @@ def __init__( else: self.__doc__ = doc - self._return_arguments = self._get_return_argumets() - self._num_outputs = len(self._return_arguments) + self._return_argnames = self._get_return_argumets() + self._num_outputs = len(self._return_argnames) if otypes is not None: if self._num_outputs !=len(otypes): @@ -192,7 +198,7 @@ def _replace_name( if name in self._argnames and not (name in self._scalar_names): return "{}[int({})]".format(name, _LOOP_VAR) else: - if is_gpu: + if is_gpu or ((not is_gpu) and not (name in self._scalar_names)) : return "{}".format(name) else: return "{}[0]".format(name) @@ -254,7 +260,7 @@ def _lift_to_array_access(m: Any) -> str: # Evaluate the string to get the Python function body = "\n".join(lines) glbs: Dict[str, Any] = {} - six.exec_(body, glbs) + exec(body, glbs) return glbs[funcid] def _build_cpu_function(self) -> Callable[[Any], Any]: @@ -348,7 +354,7 @@ def _lift_to_array_access(m: Any) -> str: # Evaluate the string to get the Python function body = "\n".join(lines) glbs: Dict[str, Any] = {} - six.exec_(body, glbs) + exec(body, glbs) return glbs[funcid] def _get_numba_types(self, need_pointer: bool = True) -> list[Any]: @@ -445,11 +451,11 @@ def _execute(self, is_gpu: bool, num_gpus: int = 0) -> None: task.execute() def __call__(self, *args: Any, **kwargs: Any) -> None: - """ - Return arrays with the results of `pyfunc` broadcast (vectorized) over - `args` and `kwargs` not in `excluded`. 
- """ if not self._created: + # the case when we execute `__call__` the first time or + # when cache=False: + # each time we call `vectorize` on a pyfunc we need to clear + # these lists to support different types of arguments passed self._scalar_args.clear() self._scalar_idxs.clear() self._args.clear() From 9bdefe9d09fe09e69b624472af0928c303cb7fc1 Mon Sep 17 00:00:00 2001 From: Bryan Van de Ven Date: Tue, 21 Mar 2023 16:00:20 -0700 Subject: [PATCH 55/78] updates for docs --- cunumeric/vectorize.py | 13 +++-- docs/cunumeric/source/api/_vectorize.rst | 7 +-- docs/cunumeric/source/api/functional.rst | 7 +++ docs/cunumeric/source/api/routines.rst | 1 + docs/cunumeric/source/api/vectorize.rst | 15 ------ tests/integration/test_vectorize.py | 69 ++++++++++++------------ 6 files changed, 53 insertions(+), 59 deletions(-) create mode 100644 docs/cunumeric/source/api/functional.rst delete mode 100644 docs/cunumeric/source/api/vectorize.rst diff --git a/cunumeric/vectorize.py b/cunumeric/vectorize.py index 14b672dd8e..1954c0c931 100644 --- a/cunumeric/vectorize.py +++ b/cunumeric/vectorize.py @@ -15,8 +15,6 @@ import inspect import re - -# numba typing from typing import Any, Callable, Dict, List, Optional, Union import legate.core.types as ty @@ -55,6 +53,7 @@ def __init__( """ vectorize(pyfunc, otypes=None, doc=None, excluded=None, cache=False, signature=None) + Generalized function class. Define a vectorized function which takes a nested sequence of objects or numpy arrays as inputs and returns a single numpy array @@ -417,7 +416,7 @@ def _execute(self, is_gpu: bool, num_gpus: int = 0) -> None: task.add_scalar_arg(a, dtype) # add array arguments - if len (self._args)>0: + if len(self._args) > 0: a0 = self._args[0]._thunk a0 = runtime.to_deferred_array(a0) for count, a in enumerate(self._args): @@ -478,10 +477,10 @@ def __call__(self, *args: Any, **kwargs: Any) -> None: "kwargs are not supported in user functions" ) - if self._num_outputs==0 or len(self._args)==0: - #execute function that doesn't modify anything: - self._pyfunc() - return + if self._num_outputs == 0 or len(self._args) == 0: + # execute function that doesn't modify anything: + self._pyfunc() + return # all output arrays should have the same type if len(self._args) > 0: diff --git a/docs/cunumeric/source/api/_vectorize.rst b/docs/cunumeric/source/api/_vectorize.rst index 048e8ab51d..c096e320de 100644 --- a/docs/cunumeric/source/api/_vectorize.rst +++ b/docs/cunumeric/source/api/_vectorize.rst @@ -1,7 +1,7 @@ cunumeric.vectorize -============================= +=================== -.. currentmodule:: cunumeric +.. currentmodule:: cunumeric.vectorize .. autoclass:: vectorize @@ -9,5 +9,6 @@ cunumeric.vectorize .. rubric:: Methods - .. autosummary:: __call__ + .. automethod:: __call__ + .. autosummary:: diff --git a/docs/cunumeric/source/api/functional.rst b/docs/cunumeric/source/api/functional.rst new file mode 100644 index 0000000000..4d35618ebf --- /dev/null +++ b/docs/cunumeric/source/api/functional.rst @@ -0,0 +1,7 @@ +Functional programming +====================== + +.. 
toctree:: + :maxdepth: 2 + + _vectorize diff --git a/docs/cunumeric/source/api/routines.rst b/docs/cunumeric/source/api/routines.rst index e85a5c65b0..5f0451584e 100644 --- a/docs/cunumeric/source/api/routines.rst +++ b/docs/cunumeric/source/api/routines.rst @@ -13,6 +13,7 @@ Routines logic math fft + functional random set sorting diff --git a/docs/cunumeric/source/api/vectorize.rst b/docs/cunumeric/source/api/vectorize.rst deleted file mode 100644 index 5e6cbea66b..0000000000 --- a/docs/cunumeric/source/api/vectorize.rst +++ /dev/null @@ -1,15 +0,0 @@ -.. module:: cunumeric.vectorize - -========================================= - -Vectorize ------------------ - -.. toctree:: - :maxdepth: 2 - :hidden: - - _vectorize - - - diff --git a/tests/integration/test_vectorize.py b/tests/integration/test_vectorize.py index 520267ab76..44877cc997 100644 --- a/tests/integration/test_vectorize.py +++ b/tests/integration/test_vectorize.py @@ -14,16 +14,16 @@ # -# import numpy as np +import numpy as np import pytest import cunumeric as num -import numpy as np def my_func(a, b): a = a * 2 + b + def my_func_np(a, b): a = a * 2 + b return a @@ -32,6 +32,7 @@ def my_func_np(a, b): def my_func2(A0, B0): A0 = A0 * 2 + B0 + def my_func_np2(A0, B0): A0 = A0 * 2 + B0 return A0 @@ -42,66 +43,66 @@ def test_vectorize(): a = num.arange(5) b = num.ones((5,)) func(a, b) - assert(np.array_equal(a, [1,3,5,7,9])) + assert np.array_equal(a, [1, 3, 5, 7, 9]) func = num.vectorize(my_func) - a= num.arange(5) - b=2 - func(a,b) - assert(np.array_equal(a, [2,4,6,8,10])) - + a = num.arange(5) + b = 2 + func(a, b) + assert np.array_equal(a, [2, 4, 6, 8, 10]) + num.vectorize(my_func) - a=num.array([[1,2,3],[4,5,6],[7,8,9]]) - b=num.array([[10,11,12],[13,14,15],[16,17,18]]) - func(a[:2],b[:2]) + a = num.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]) + b = num.array([[10, 11, 12], [13, 14, 15], [16, 17, 18]]) + func(a[:2], b[:2]) - a=np.arange(100).reshape((25,4)) - a_num= num.array(a) + a = np.arange(100).reshape((25, 4)) + a_num = num.array(a) - b=a*10 - b_num=a_num*10 + b = a * 10 + b_num = a_num * 10 func_np = np.vectorize(my_func_np) - func_num=num.vectorize(my_func) + func_num = num.vectorize(my_func) - a=func_np(a,b) + a = func_np(a, b) func_num(a_num, b_num) assert np.array_equal(a, a_num) - a[:,2]=func_np(a[:, 2], b[:,2]) - func_num(a_num[:,2],b_num[:,2]) + a[:, 2] = func_np(a[:, 2], b[:, 2]) + func_num(a_num[:, 2], b_num[:, 2]) assert np.array_equal(a, a_num) - a[5:10,2]=func_np(a[5:10, 2], b[1:6,2]) - func_num(a_num[5:10,2],b_num[1:6,2]) + a[5:10, 2] = func_np(a[5:10, 2], b[1:6, 2]) + func_num(a_num[5:10, 2], b_num[1:6, 2]) assert np.array_equal(a, a_num) - a[15:20]=func_np(a[15:20], b[15:20]) - func_num(a_num[15:20],b_num[15:20]) + a[15:20] = func_np(a[15:20], b[15:20]) + func_num(a_num[15:20], b_num[15:20]) assert np.array_equal(a, a_num) - a=np.arange(1000).reshape((25,10,4)) - a_num= num.array(a) + a = np.arange(1000).reshape((25, 10, 4)) + a_num = num.array(a) - a[:, 2, :] = func_np(a[:, 2, :],2) - func_num(a_num[:, 2, :],2) + a[:, 2, :] = func_np(a[:, 2, :], 2) + func_num(a_num[:, 2, :], 2) assert np.array_equal(a, a_num) - a=np.arange(100).reshape((25,4)) - a_num= num.array(a) + a = np.arange(100).reshape((25, 4)) + a_num = num.array(a) - b=a*10 - b_num=a_num*10 + b = a * 10 + b_num = a_num * 10 func_np = np.vectorize(my_func_np2) - func_num=num.vectorize(my_func2) + func_num = num.vectorize(my_func2) - a=func_np(a,b) + a = func_np(a, b) func_num(a_num, b_num) assert np.array_equal(a, a_num) - if __name__ == "__main__": 
import sys + np.random.seed(12345) sys.exit(pytest.main(sys.argv)) From c0278f7f5f177134614372ab16419a1677bc0f10 Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Wed, 22 Mar 2023 13:24:42 -0700 Subject: [PATCH 56/78] adding logic for parsing returns from UDF --- cunumeric/vectorize.py | 195 +++++++++++++++++++--------- src/cunumeric/vectorize/eval_udf.cc | 3 +- 2 files changed, 136 insertions(+), 62 deletions(-) diff --git a/cunumeric/vectorize.py b/cunumeric/vectorize.py index 996326c4e7..63f29af1de 100644 --- a/cunumeric/vectorize.py +++ b/cunumeric/vectorize.py @@ -24,7 +24,6 @@ import numba import numba.core.ccallback import numpy as np -#import six from legate.core import Rect, track_provenance from cunumeric.runtime import runtime @@ -140,17 +139,21 @@ def __init__( self._return_argnames = self._get_return_argumets() self._num_outputs = len(self._return_argnames) - + self._return_args=[] + self._output_shape :Optional[tuple[Any]]= None + self._output_dtype: Optional[np.dtype[Any]] = None + if otypes is not None: if self._num_outputs !=len(otypes): raise ValueError("number of types in otypes is not consistente" - " with the number of return values difened in pyfunc") + " with the number of return values defined in pyfunc") if len(otypes)>1: for t in otypes: if t != otypes[0]: raise NotImplementedError( "cuNumeric doesn't support variable types in otypes" ) + self._output_dtype = otypes[0] # FIXME @@ -195,7 +198,7 @@ def _get_return_argumets(self)->list[str]: def _replace_name( self, name: str, _LOOP_VAR: str, is_gpu: bool = False ) -> str: - if name in self._argnames and not (name in self._scalar_names): + if (name in self._argnames) or (name in self._return_argnames ): return "{}[int({})]".format(name, _LOOP_VAR) else: if is_gpu or ((not is_gpu) and not (name in self._scalar_names)) : @@ -214,7 +217,9 @@ def _build_gpu_function(self) -> Any: # Signature args = ( - self._argnames + self._return_argnames + + self._argnames + + self._scalar_names + [_SIZE_VAR] + [_DIM_VAR] + [_PITCHES_VAR] @@ -254,8 +259,9 @@ def _lift_to_array_access(m: Any) -> str: # kernel body lines_old = self._func_body for line in lines_old: - l_new = re.sub(r"[_a-zA-Z]\w*", _lift_to_array_access, line) - lines.append(l_new) + if not ( "return" in line): + l_new = re.sub(r"[_a-zA-Z]\w*", _lift_to_array_access, line) + lines.append(l_new) # Evaluate the string to get the Python function body = "\n".join(lines) @@ -289,35 +295,32 @@ def _emit_assignment( var: Any, idx: int, sz: Any, - ty: np.dtype[Any], - scalar: bool = False, + ty: np.dtype[Any] ) -> None: - if scalar: - # we represent scalars as arrays of size 1 - lines.append( - " {} = carray({}[{}], 1, types.{})".format( - var, _ARGS_VAR, idx, ty - ) - ) - else: - lines.append( - " {} = carray({}[{}], {}, types.{})".format( - var, _ARGS_VAR, idx, sz, ty - ) + lines.append( + " {} = carray({}[{}], {}, types.{})".format( + var, _ARGS_VAR, idx, sz, ty ) + ) # define pyfunc arguments ar carrays arg_idx = 0 - for a in self._args: + for count, a in enumerate(self._return_args): type_a = a.dtype _emit_assignment( - self._argnames[arg_idx], arg_idx, _SIZE_VAR, type_a + self._return_argnames[count], arg_idx, _SIZE_VAR, type_a ) arg_idx += 1 - for a in self._scalar_args: + for count,a in enumerate(self._args): + type_a = a.dtype + _emit_assignment( + self._argnames[count], arg_idx, _SIZE_VAR, type_a + ) + arg_idx += 1 + for count, a in enumerate(self._scalar_args): scalar_type = np.dtype(type(a).__name__) _emit_assignment( - self._argnames[arg_idx], arg_idx, _SIZE_VAR, 
scalar_type, True + self._scalar_names[count], arg_idx, 1, scalar_type ) arg_idx += 1 @@ -348,8 +351,9 @@ def _lift_to_array_access(m: Any) -> str: return self._replace_name(m.group(0), _LOOP_VAR) for line in lines_old: - l_new = re.sub(r"[_a-zA-Z]\w*", _lift_to_array_access, line) - lines.append(" " + l_new) + if not ( "return" in line): + l_new = re.sub(r"[_a-zA-Z]\w*", _lift_to_array_access, line) + lines.append(" " + l_new) # Evaluate the string to get the Python function body = "\n".join(lines) @@ -359,6 +363,12 @@ def _lift_to_array_access(m: Any) -> str: def _get_numba_types(self, need_pointer: bool = True) -> list[Any]: types = [] + for arg in self._return_args: + type_a = arg.dtype + type_a = str(type_a) if type_a != bool else "int8" + type_a = getattr(numba.core.types, type_a) + type_a = numba.core.types.CPointer(type_a) + types.append(type_a) for arg in self._args: type_a = arg.dtype type_a = str(type_a) if type_a != bool else "int8" @@ -428,18 +438,28 @@ def _execute(self, is_gpu: bool, num_gpus: int = 0) -> None: dtype = convert_to_cunumeric_dtype(type(a).__name__) task.add_scalar_arg(a, dtype) - # add array arguments - if len (self._args)>0: - a0 = self._args[0]._thunk + # add return arguments + a0=None + if len (self._return_args)>0: + a0 = self._return_args[0]._thunk a0 = runtime.to_deferred_array(a0) - for count, a in enumerate(self._args): + for count, a in enumerate(self._return_args): a_tmp = runtime.to_deferred_array(a._thunk) a_tmp_base = a_tmp.base task.add_input(a_tmp_base) - if count < self._num_outputs: - task.add_output(a_tmp_base) + task.add_output(a_tmp_base) if count != 0: task.add_alignment(a0.base, a_tmp_base) + # add array arguments + if len (self._args)>0: + if a0 is None: + a0 = self._args[0]._thunk + a0 = runtime.to_deferred_array(a0) + for count, a in enumerate(self._args): + a_tmp = runtime.to_deferred_array(a._thunk) + a_tmp_base = a_tmp.base + task.add_input(a_tmp_base) + task.add_alignment(a0.base, a_tmp_base) if is_gpu: ptx_hash = hash(self._gpu_func[0]) @@ -482,45 +502,98 @@ def __call__(self, *args: Any, **kwargs: Any) -> None: for i, k in enumerate(inspect.signature(self._pyfunc).parameters): if i in self._scalar_idxs: self._scalar_names.append(k) - self._argnames.append(k) self._kwargs = list(kwargs) - if len(self._kwargs) > 1: + if len(self._kwargs) > 0: raise NotImplementedError( "kwargs are not supported in user functions" ) - if self._num_outputs==0 or len(self._args)==0: - #execute function that doesn't modify anything: - self._pyfunc() - return - - # all output arrays should have the same type - if len(self._args) > 0: - type_a = self._args[0].dtype - shape = self._args[0].shape - for i in range(1, self._num_outputs): - if type_a != self._args[i].dtype: - raise TypeError( - "cuNumeric doesnt support " - "different types for output data in " - "user function passed to vectorize" - ) - if shape != self._args[i].shape: - raise TypeError( - "cuNumeric doesnt support " - "different shapes for output data in " - "user function passed to vectorize" + #we need to do ther rest each time `__call__` is executed + output_shape = self._output_shape + output_dtype = self._output_dtype + self._return_args.clear() + # if output type is not specified, we need to decide + # which one to use + # we also want to choose the shape for output array + + # check if output variable is in input arguments - > + # then use it's dtype and shape + for r in self._return_argnames: + if r in self._argnames: + idx = self._argnames.index(r) + if output_dtype is None: + 
output_dtype = self._args[idx].dtype + if output_shape is None: + output_shape = self._args[idx].shape + break + + #the case if we didn't find output argument in input argnames + if output_shape is None: + for r in self._return_argnames: + if r in self._scalar_argnames: + idx = self._scalar_argnames.index(r) + if output_dtype is None: + output_dtype = np.dtype(type(self._scalar_args[idx])) + output_shape = (1,) + break + #FIXME + #we could find common type of input arguments here and + #broadcasted shapes + if self._num_outputs>0 and output_dtype is None: + raise ValueError("Unable to choose output dtype") + if self._num_outputs>0 and output_shape is None: + raise ValueError("Unable to choose output shape") + + + # filing the list of return arguments + # check if there are return argnames in input argnames, + # if not, create a new array + for r in self._return_argnames: + if r in self._argnames: + idx = self._argnames.index(r) + if self._args[idx].shape !=output_shape: + raise ValueError( + "all output arrays should have the same shape") + if output_dtype != self._args[idx].dtype: + runtime.warn( + "converting input array to output types in user func ", + category=RuntimeWarning, ) - for i in range(self._num_outputs, len(self._args)): - if type_a != self._args[i].dtype: + self._args[idx]=self._args[idx].astype(output_dtype) + self._return_args.append(self._args[idx]) + self._args.remove(self._args[idx]) + self._argnames.remove(r) + elif r in self._scalar_names: + idx = self._scalar_names.index(r) + if output_shape != (1,): + raise ValueError( + "all output arrays should have the same shape") + self._return_args.append(full(output_shape,self._scalar_args[idx], output_dtype)) + self._scalar_args.remove(self._scalar_args[idx]) + self._scalar_names.remove(r) + else: + #create array and add it to the list of return_args + tmp_ret = full(output_shape,0, output_dtype) + self._return_args.append(tmp_ret) + #FIXME + #if self._num_outputs==0: + # #execute function that doesn't modify anything: + # self._pyfunc(args) + # return + + # bring all arrays to same type + if len(self._args) > 0: + for count, a in enumerate(self._args): + if output_dtype != a.dtype: runtime.warn( "converting input array to output types in user func ", category=RuntimeWarning, ) - self._args[i] = self._args[i].astype(type_a) - if shape != self._args[i].shape and np.ndim(self._args[i]) > 0: - raise TypeError( + self._args[count] = self._args[count].astype(output_dtype) + #FIXME broadcast shapes + if output_shape != self._args[count].shape : + raise ValueError( "cuNumeric doesnt support " "different shapes for arrays in " "user function passed to vectorize" diff --git a/src/cunumeric/vectorize/eval_udf.cc b/src/cunumeric/vectorize/eval_udf.cc index 5a40260a1d..04133ef671 100644 --- a/src/cunumeric/vectorize/eval_udf.cc +++ b/src/cunumeric/vectorize/eval_udf.cc @@ -72,9 +72,10 @@ struct EvalUdfCPU { scalars, num_outputs, context.get_current_processor()}; - size_t dim = 1; + int dim = 1; if (args.inputs.size() > 0) { dim = args.inputs[0].dim() == 0 ? 
1 : args.inputs[0].dim(); + assert(dim>0); double_dispatch(dim, args.inputs[0].code(), EvalUdfCPU{}, args); } else { LegateTypeCode code = LegateTypeCode::BOOL_LT ; From f6c515540107300e929a65ead0dfcaa440433c1f Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Wed, 22 Mar 2023 13:45:00 -0700 Subject: [PATCH 57/78] making vectorize to return arrays --- cunumeric/vectorize.py | 12 ++++++-- tests/integration/test_vectorize.py | 47 ++++++++++++----------------- 2 files changed, 30 insertions(+), 29 deletions(-) diff --git a/cunumeric/vectorize.py b/cunumeric/vectorize.py index 63f29af1de..b7990e2e31 100644 --- a/cunumeric/vectorize.py +++ b/cunumeric/vectorize.py @@ -31,6 +31,7 @@ from .array import convert_to_cunumeric_ndarray from .config import CuNumericOpCode from .utils import convert_to_cunumeric_dtype +from .module import full _EXTERNAL_REFERENCE_PREFIX = "__extern_ref__" _MASK_VAR = "__mask__" @@ -531,8 +532,8 @@ def __call__(self, *args: Any, **kwargs: Any) -> None: #the case if we didn't find output argument in input argnames if output_shape is None: for r in self._return_argnames: - if r in self._scalar_argnames: - idx = self._scalar_argnames.index(r) + if r in self._scalar_names: + idx = self._scalar_names.index(r) if output_dtype is None: output_dtype = np.dtype(type(self._scalar_args[idx])) output_shape = (1,) @@ -611,3 +612,10 @@ def __call__(self, *args: Any, **kwargs: Any) -> None: if self._cache: self._created = True self._execute(False) + + if len(self._return_args)==1: + return self._return_args[0] + if len(self._return_args)>1: + return tuple(self._return_args) + else: + return diff --git a/tests/integration/test_vectorize.py b/tests/integration/test_vectorize.py index 212c4f4ac0..ee758b0195 100644 --- a/tests/integration/test_vectorize.py +++ b/tests/integration/test_vectorize.py @@ -25,18 +25,11 @@ def my_func(a, b): a = a * 2 + b return a -def my_func_np(a, b): - a = a * 2 + b - return a - #Capital letters and numbers in the signature def my_func2(A0, B0): A0 = A0 * 2 + B0 - return A0 - -def my_func_np2(A0, B0): - A0 = A0 * 2 + B0 - return A0 + C0=A0*2 + return A0,C0 def empty_func(): print("within empty function") @@ -47,23 +40,22 @@ def test_vectorize(): func = num.vectorize(my_func) a = num.arange(5) b = num.ones((5,)) - func(a, b) + a = func(a, b) assert(np.array_equal(a, [1,3,5,7,9])) #array and scalar func = num.vectorize(my_func) a= num.arange(5) b=2 - func(a,b) + a = func(a,b) assert(np.array_equal(a, [2,4,6,8,10])) #2 scalars - #FIXME - #func = num.vectorize(my_func) - #a=3 - #b=2 - #func(a,b) - #assert(a ==8) + func = num.vectorize(my_func) + a=3 + b=2 + a = func(a,b) + assert(a ==8) #empty function func = num.vectorize(empty_func) @@ -74,38 +66,38 @@ def test_vectorize(): num.vectorize(my_func) a=num.array([[1,2,3],[4,5,6],[7,8,9]]) b=num.array([[10,11,12],[13,14,15],[16,17,18]]) - func(a[:2],b[:2]) + a[:2] = func(a[:2],b[:2]) a=np.arange(100).reshape((25,4)) a_num= num.array(a) b=a*10 b_num=a_num*10 - func_np = np.vectorize(my_func_np) + func_np = np.vectorize(my_func) func_num=num.vectorize(my_func) a=func_np(a,b) - func_num(a_num, b_num) + a_num=func_num(a_num, b_num) assert np.array_equal(a, a_num) #reusing the same function for different inputs a[:,2]=func_np(a[:, 2], b[:,2]) - func_num(a_num[:,2],b_num[:,2]) + a_num[:,2] =func_num(a_num[:,2],b_num[:,2]) assert np.array_equal(a, a_num) #reusing the same function for different inputs a[5:10,2]=func_np(a[5:10, 2], b[1:6,2]) - func_num(a_num[5:10,2],b_num[1:6,2]) + 
a_num[5:10,2]=func_num(a_num[5:10,2],b_num[1:6,2]) assert np.array_equal(a, a_num) #reusing the same function for different inputs a[15:20]=func_np(a[15:20], b[15:20]) - func_num(a_num[15:20],b_num[15:20]) + a_num[15:20]=func_num(a_num[15:20],b_num[15:20]) assert np.array_equal(a, a_num) # reusing the same function for different inputs a=np.arange(1000).reshape((25,10,4)) a_num= num.array(a) a[:, 2, :] = func_np(a[:, 2, :],2) - func_num(a_num[:, 2, :],2) + a_num[:, 2, :]=func_num(a_num[:, 2, :],2) assert np.array_equal(a, a_num) #checking signature with capital letters and numbers @@ -113,11 +105,12 @@ def test_vectorize(): a_num= num.array(a) b=a*10 b_num=a_num*10 - func_np = np.vectorize(my_func_np2) + func_np = np.vectorize(my_func2) func_num=num.vectorize(my_func2) - a=func_np(a,b) - func_num(a_num, b_num) + a,c=func_np(a,b) + a_num,c_num = func_num(a_num, b_num) assert np.array_equal(a, a_num) + assert np.array_equal(c, c_num) From 0924e4f84fa637024dc08158b8795359e9e4f2df Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Wed, 22 Mar 2023 17:31:11 -0700 Subject: [PATCH 58/78] adding more tests + code clean-up --- cunumeric/vectorize.py | 46 +++++++++------- tests/integration/test_vectorize.py | 84 ++++++++++++++++++++++++----- 2 files changed, 98 insertions(+), 32 deletions(-) diff --git a/cunumeric/vectorize.py b/cunumeric/vectorize.py index b7990e2e31..370d4dd040 100644 --- a/cunumeric/vectorize.py +++ b/cunumeric/vectorize.py @@ -90,7 +90,7 @@ def __init__( If `True`, then cache the first function call that generates C fun- ction or CUDA kernel. We recomment enabling caching in cuNumeric for better performance, when possible. - Warning: in the case when cache=True, cuNumeric will parse function + WARNING: in the case when cache=True, cuNumeric will parse function signature and create C function or CUDA kernel only once. This means that types of arguments passed to the vectorized function (arrays, scalars etc) should be the same each time we call it. 
@@ -127,7 +127,7 @@ def __init__( self._scalar_args: List[Any] = [] self._scalar_idxs: List[int] = [] self._scalar_names: List[str] = [] - self._argnames: List[str] = [] + self._arg_names: List[str] = [] self._kwargs: List[Any] = [] self._context = runtime.legate_context self._created: bool = False @@ -138,8 +138,8 @@ def __init__( else: self.__doc__ = doc - self._return_argnames = self._get_return_argumets() - self._num_outputs = len(self._return_argnames) + self._return_names = self._get_return_argumets() + self._num_outputs = len(self._return_names) self._return_args=[] self._output_shape :Optional[tuple[Any]]= None self._output_dtype: Optional[np.dtype[Any]] = None @@ -186,6 +186,9 @@ def _get_func_body(self, func: Callable[[Any], Any]) -> list[str]: return return_lines def _get_return_argumets(self)->list[str]: + """ + Returns the list of names for return arrays/values + """ self._func_body = self._get_func_body(self._pyfunc) return_names = [] for l in self._func_body: @@ -199,7 +202,10 @@ def _get_return_argumets(self)->list[str]: def _replace_name( self, name: str, _LOOP_VAR: str, is_gpu: bool = False ) -> str: - if (name in self._argnames) or (name in self._return_argnames ): + """ + add indices to the names of input/output arrays in the function body + """ + if (name in self._arg_names) or (name in self._return_names ): return "{}[int({})]".format(name, _LOOP_VAR) else: if is_gpu or ((not is_gpu) and not (name in self._scalar_names)) : @@ -218,8 +224,8 @@ def _build_gpu_function(self) -> Any: # Signature args = ( - self._return_argnames - + self._argnames + self._return_names + + self._arg_names + self._scalar_names + [_SIZE_VAR] + [_DIM_VAR] @@ -309,13 +315,13 @@ def _emit_assignment( for count, a in enumerate(self._return_args): type_a = a.dtype _emit_assignment( - self._return_argnames[count], arg_idx, _SIZE_VAR, type_a + self._return_names[count], arg_idx, _SIZE_VAR, type_a ) arg_idx += 1 for count,a in enumerate(self._args): type_a = a.dtype _emit_assignment( - self._argnames[count], arg_idx, _SIZE_VAR, type_a + self._arg_names[count], arg_idx, _SIZE_VAR, type_a ) arg_idx += 1 for count, a in enumerate(self._scalar_args): @@ -480,7 +486,7 @@ def __call__(self, *args: Any, **kwargs: Any) -> None: self._scalar_args.clear() self._scalar_idxs.clear() self._args.clear() - self._argnames.clear() + self._arg_names.clear() self._scalar_names.clear() for i, arg in enumerate(args): @@ -498,7 +504,7 @@ def __call__(self, *args: Any, **kwargs: Any) -> None: # first fill arrays to argnames, then scalars: for i, k in enumerate(inspect.signature(self._pyfunc).parameters): if not (i in self._scalar_idxs): - self._argnames.append(k) + self._arg_names.append(k) for i, k in enumerate(inspect.signature(self._pyfunc).parameters): if i in self._scalar_idxs: @@ -520,18 +526,20 @@ def __call__(self, *args: Any, **kwargs: Any) -> None: # check if output variable is in input arguments - > # then use it's dtype and shape - for r in self._return_argnames: - if r in self._argnames: - idx = self._argnames.index(r) + print ("IRINA DEBUG ", self._return_names, self._arg_names, output_dtype) + for r in self._return_names: + if r in self._arg_names: + idx = self._arg_names.index(r) if output_dtype is None: output_dtype = self._args[idx].dtype if output_shape is None: output_shape = self._args[idx].shape break + print ("IRINA DEBUG 2", output_dtype) #the case if we didn't find output argument in input argnames if output_shape is None: - for r in self._return_argnames: + for r in self._return_names: if r in 
self._scalar_names: idx = self._scalar_names.index(r) if output_dtype is None: @@ -550,9 +558,9 @@ def __call__(self, *args: Any, **kwargs: Any) -> None: # filing the list of return arguments # check if there are return argnames in input argnames, # if not, create a new array - for r in self._return_argnames: - if r in self._argnames: - idx = self._argnames.index(r) + for r in self._return_names: + if r in self._arg_names: + idx = self._arg_names.index(r) if self._args[idx].shape !=output_shape: raise ValueError( "all output arrays should have the same shape") @@ -564,7 +572,7 @@ def __call__(self, *args: Any, **kwargs: Any) -> None: self._args[idx]=self._args[idx].astype(output_dtype) self._return_args.append(self._args[idx]) self._args.remove(self._args[idx]) - self._argnames.remove(r) + self._arg_names.remove(r) elif r in self._scalar_names: idx = self._scalar_names.index(r) if output_shape != (1,): diff --git a/tests/integration/test_vectorize.py b/tests/integration/test_vectorize.py index ee758b0195..d5920ae235 100644 --- a/tests/integration/test_vectorize.py +++ b/tests/integration/test_vectorize.py @@ -31,10 +31,6 @@ def my_func2(A0, B0): C0=A0*2 return A0,C0 -def empty_func(): - print("within empty function") - - def test_vectorize(): #2 arrays func = num.vectorize(my_func) @@ -57,23 +53,42 @@ def test_vectorize(): a = func(a,b) assert(a ==8) +def empty_func(): + print("within empty function") + +def print_func(a,b): + print ("I am pringing input arguments", a, b) + +def test_empty_functions(): #empty function func = num.vectorize(empty_func) func() - #slices - func = num.vectorize(my_func) - num.vectorize(my_func) - a=num.array([[1,2,3],[4,5,6],[7,8,9]]) - b=num.array([[10,11,12],[13,14,15],[16,17,18]]) - a[:2] = func(a[:2],b[:2]) + func2 = num.vectorize(print_func) + print_func(1,2) + + print_func(np.array([1,2,3]), 2) + + +def test_vectorize_over_slices(): + #reuse the same vectorize object on + #different slices + func_num = num.vectorize(my_func) + func_np = np.vectorize(my_func) + + a=np.array([[1,2,3],[4,5,6],[7,8,9]]) + b=np.array([[10,11,12],[13,14,15],[16,17,18]]) + a_num=num.array(a) + b_num = num.array(b) + a[:2] = func_np(a[:2],b[:2]) + a_num[:2] = func_num(a_num[:2],b_num[:2]) + assert np.array_equal(a, a_num) + a=np.arange(100).reshape((25,4)) a_num= num.array(a) b=a*10 b_num=a_num*10 - func_np = np.vectorize(my_func) - func_num=num.vectorize(my_func) a=func_np(a,b) a_num=func_num(a_num, b_num) assert np.array_equal(a, a_num) @@ -100,7 +115,9 @@ def test_vectorize(): a_num[:, 2, :]=func_num(a_num[:, 2, :],2) assert np.array_equal(a, a_num) +def test_multiple_outputs(): #checking signature with capital letters and numbers + # + checking multiple outputs a=np.arange(100).reshape((25,4)) a_num= num.array(a) b=a*10 @@ -112,7 +129,48 @@ def test_vectorize(): assert np.array_equal(a, a_num) assert np.array_equal(c, c_num) - +def test_different_types(): + #checking the case when input and output types are different + a=np.arange(100, dtype = int).reshape((25,4)) + a_num= num.array(a) + b=a*10 + b_num=a_num*10 + func_np = np.vectorize(my_func, otypes=(float,)) + func_num=num.vectorize(my_func, otypes=(float,)) + a=func_np(a,b) + a_num=func_num(a_num, b_num) + assert np.array_equal(a, a_num) + + #another test for different types + a=np.arange(100, dtype = float).reshape((25,4)) + a_num= num.array(a) + b=a*10 + b_num=a_num*10 + func_np = np.vectorize(my_func2, otypes = (int, int,)) + func_num=num.vectorize(my_func2, otypes = (int, int, )) + a,c=func_np(a,b) + a_num,c_num = 
func_num(a_num, b_num) + assert np.array_equal(a, a_num) + assert np.array_equal(c, c_num) + + +def test_cache(): + a=np.arange(100).reshape((25,4)) + a_num= num.array(a) + b=a*10 + b_num=a_num*10 + func_np = np.vectorize(my_func2, cache = True) + func_num=num.vectorize(my_func2, cache = True) + for i in range (10): + a=a*2 + b=b*3 + a_num=a_num*2 + b_num=b_num*3 + a,c=func_np(a,b) + a_num,c_num = func_num(a_num, b_num) + assert np.array_equal(a, a_num) + assert np.array_equal(c, c_num) + if __name__ == "__main__": import sys From 00b39c463aded157d1d69c3bb797bfbf96f4d3c6 Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Wed, 22 Mar 2023 20:36:50 -0700 Subject: [PATCH 59/78] fixing logic for caching --- cunumeric/vectorize.py | 99 ++++++++++++++++++++++++------------------ 1 file changed, 56 insertions(+), 43 deletions(-) diff --git a/cunumeric/vectorize.py b/cunumeric/vectorize.py index 370d4dd040..e22f18ae73 100644 --- a/cunumeric/vectorize.py +++ b/cunumeric/vectorize.py @@ -141,9 +141,10 @@ def __init__( self._return_names = self._get_return_argumets() self._num_outputs = len(self._return_names) self._return_args=[] - self._output_shape :Optional[tuple[Any]]= None self._output_dtype: Optional[np.dtype[Any]] = None - + self._cached_dtype: Optional[np.dtype[Any]] = None + self._cached_scalar_types: List[Any]=[] + if otypes is not None: if self._num_outputs !=len(otypes): raise ValueError("number of types in otypes is not consistente" @@ -478,46 +479,52 @@ def _execute(self, is_gpu: bool, num_gpus: int = 0) -> None: task.execute() def __call__(self, *args: Any, **kwargs: Any) -> None: - if not self._created: - # the case when we execute `__call__` the first time or - # when cache=False: - # each time we call `vectorize` on a pyfunc we need to clear - # these lists to support different types of arguments passed - self._scalar_args.clear() - self._scalar_idxs.clear() - self._args.clear() - self._arg_names.clear() - self._scalar_names.clear() - - for i, arg in enumerate(args): - if arg is None: - raise ValueError( - "None is not supported in user function " - "passed to cunumeric.vectorize" - ) - elif np.ndim(arg) == 0: - self._scalar_args.append(arg) - self._scalar_idxs.append(i) - else: - self._args.append(convert_to_cunumeric_ndarray(arg)) - - # first fill arrays to argnames, then scalars: - for i, k in enumerate(inspect.signature(self._pyfunc).parameters): - if not (i in self._scalar_idxs): - self._arg_names.append(k) - - for i, k in enumerate(inspect.signature(self._pyfunc).parameters): - if i in self._scalar_idxs: - self._scalar_names.append(k) - - self._kwargs = list(kwargs) - if len(self._kwargs) > 0: - raise NotImplementedError( - "kwargs are not supported in user functions" + # each time we call `vectorize` on a pyfunc we need to clear + # these lists to support different types of arguments passed + self._scalar_args.clear() + self._scalar_idxs.clear() + self._args.clear() + self._arg_names.clear() + self._scalar_names.clear() + + scalar_idx=0 + for i, arg in enumerate(args): + if arg is None: + raise ValueError( + "None is not supported in user function " + "passed to cunumeric.vectorize" ) + elif np.ndim(arg) == 0: + if self._cache and not self._created: + self._cached_scalar_types.apend(type(arg)) + elif self._cache: + if self._cached_scalar_types[scalar_idx] != type(arg): + raise TypeError( + " Input arguments to vectorized function should" + " have consistent types for each invocation") + self._scalar_args.append(arg) + self._scalar_idxs.append(i) + scalar_idx+=1 + else: + 
self._args.append(convert_to_cunumeric_ndarray(arg)) + + # first fill arrays to argnames, then scalars: + for i, k in enumerate(inspect.signature(self._pyfunc).parameters): + if not (i in self._scalar_idxs): + self._arg_names.append(k) + + for i, k in enumerate(inspect.signature(self._pyfunc).parameters): + if i in self._scalar_idxs: + self._scalar_names.append(k) + + self._kwargs = list(kwargs) + if len(self._kwargs) > 0: + raise NotImplementedError( + "kwargs are not supported in user functions" + ) #we need to do ther rest each time `__call__` is executed - output_shape = self._output_shape + output_shape = None output_dtype = self._output_dtype self._return_args.clear() # if output type is not specified, we need to decide @@ -526,16 +533,14 @@ def __call__(self, *args: Any, **kwargs: Any) -> None: # check if output variable is in input arguments - > # then use it's dtype and shape - print ("IRINA DEBUG ", self._return_names, self._arg_names, output_dtype) for r in self._return_names: if r in self._arg_names: idx = self._arg_names.index(r) if output_dtype is None: - output_dtype = self._args[idx].dtype + output_dtype = self._args[idx].dtype if output_shape is None: output_shape = self._args[idx].shape break - print ("IRINA DEBUG 2", output_dtype) #the case if we didn't find output argument in input argnames if output_shape is None: @@ -546,7 +551,15 @@ def __call__(self, *args: Any, **kwargs: Any) -> None: output_dtype = np.dtype(type(self._scalar_args[idx])) output_shape = (1,) break - #FIXME + + if self._cache and not (self._cached_dtype is None): + if self._cached_dtype !=output_dtype: + raise TypeError("types of the arguments should stay the same" + " for each invocation of the vectorize object") + elif self._cache: + self._cached_dtype = output_dtype + + #FIXME #we could find common type of input arguments here and #broadcasted shapes if self._num_outputs>0 and output_dtype is None: From e7e4e7aca8d3971fb8e1987f5250889e614f32e1 Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Wed, 22 Mar 2023 20:48:07 -0700 Subject: [PATCH 60/78] adding more tests --- tests/integration/test_vectorize.py | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/tests/integration/test_vectorize.py b/tests/integration/test_vectorize.py index d5920ae235..b6f4bcaf3e 100644 --- a/tests/integration/test_vectorize.py +++ b/tests/integration/test_vectorize.py @@ -19,7 +19,8 @@ import cunumeric as num import numpy as np - +from legate.core import LEGATE_MAX_DIM +from utils.generators import mk_seq_array def my_func(a, b): a = a * 2 + b @@ -171,6 +172,21 @@ def test_cache(): assert np.array_equal(a, a_num) assert np.array_equal(c, c_num) +#checking caching on different shapes of arrays: +func_np2 = np.vectorize(my_func2, cache = True) +func_num2=num.vectorize(my_func2, cache = True) + +@pytest.mark.parametrize("ndim", range(1, LEGATE_MAX_DIM + 1)) +def test_nd_vectorize(ndim): + a_shape = tuple(np.random.randint(1, 9) for _ in range(ndim)) + a = mk_seq_array(np, a_shape) + a_num = mk_seq_array(num, a_shape) + b=a*2 + b_num=a_num*2 + a,c=func_np2(a,b) + a_num,c_num = func_num2(a_num, b_num) + assert np.array_equal(a, a_num) + assert np.array_equal(c, c_num) if __name__ == "__main__": import sys From fe5219ad9405a288492e30bb40069084bf9e55a2 Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Wed, 22 Mar 2023 21:25:21 -0700 Subject: [PATCH 61/78] formatting --- cunumeric/vectorize.py | 160 ++++++++++++++-------------- src/cunumeric/vectorize/eval_udf.cc | 4 +- 
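A minimal usage sketch of the caching behavior exercised by test_cache and test_nd_vectorize above, assuming only the cunumeric.vectorize API introduced in this series; the helper function and array values here are illustrative, not taken from the patch:

    import cunumeric as num

    def saxpy(x, y):
        # same form of user function as my_func in the tests
        x = x * 2 + y
        return x

    # with cache=True the C function / CUDA kernel is generated on the
    # first call and reused afterwards, so argument types must stay
    # consistent between invocations of the same vectorize object
    func = num.vectorize(saxpy, cache=True)
    a = num.arange(5)
    b = num.ones((5,))
    a = func(a, b)   # first call: parses saxpy and builds the kernel
    a = func(a, b)   # later calls reuse the cached kernel
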
src/cunumeric/vectorize/eval_udf.cu | 4 +- typings/numba/types/__init__.pyi | 2 +- 4 files changed, 85 insertions(+), 85 deletions(-) diff --git a/cunumeric/vectorize.py b/cunumeric/vectorize.py index e22f18ae73..8c513d5fff 100644 --- a/cunumeric/vectorize.py +++ b/cunumeric/vectorize.py @@ -15,10 +15,7 @@ import inspect import re -import typing - -# numba typing -from typing import Any, Callable, Dict, List, Optional, Union +from typing import Any, Callable, Dict, List, Optional, Tuple, Union import legate.core.types as ty import numba @@ -30,8 +27,8 @@ from .array import convert_to_cunumeric_ndarray from .config import CuNumericOpCode -from .utils import convert_to_cunumeric_dtype from .module import full +from .utils import convert_to_cunumeric_dtype _EXTERNAL_REFERENCE_PREFIX = "__extern_ref__" _MASK_VAR = "__mask__" @@ -61,7 +58,7 @@ def __init__( objects or numpy arrays as inputs and returns a single numpy array or a tuple of numpy arrays. User defined pyfunction will be executed in a single cuNumeric task - over a set of arguments. + over a set of arguments. The data type of the output of `vectorized` is determined by calling the function with the first element of the input. This can be avoided by specifying the `otypes` argument. @@ -88,7 +85,7 @@ def __init__( WARNING: cuNumeric doesn't suport this argument at the moment cache : bool, optional If `True`, then cache the first function call that generates C fun- - ction or CUDA kernel. We recomment enabling caching in cuNumeric + ction or CUDA kernel. We recomment enabling caching in cuNumeric for better performance, when possible. WARNING: in the case when cache=True, cuNumeric will parse function signature and create C function or CUDA kernel only once. This @@ -131,7 +128,7 @@ def __init__( self._kwargs: List[Any] = [] self._context = runtime.legate_context self._created: bool = False - self._func_body: List[str]=[] + self._func_body: List[str] = [] if doc is None: self.__doc__ = pyfunc.__doc__ @@ -139,24 +136,26 @@ def __init__( self.__doc__ = doc self._return_names = self._get_return_argumets() - self._num_outputs = len(self._return_names) - self._return_args=[] + self._num_outputs: int = len(self._return_names) + self._return_args: List[Any] = [] self._output_dtype: Optional[np.dtype[Any]] = None self._cached_dtype: Optional[np.dtype[Any]] = None - self._cached_scalar_types: List[Any]=[] + self._cached_scalar_types: List[Any] = [] if otypes is not None: - if self._num_outputs !=len(otypes): - raise ValueError("number of types in otypes is not consistente" - " with the number of return values defined in pyfunc") - if len(otypes)>1: + if self._num_outputs != len(otypes): + raise ValueError( + "number of types in otypes is not consistente" + " with the number of return values defined in pyfunc" + ) + if len(otypes) > 1: for t in otypes: if t != otypes[0]: raise NotImplementedError( - "cuNumeric doesn't support variable types in otypes" + "cuNumeric doesn't support variable types" + " in otypes" ) - self._output_dtype = otypes[0] - + self._output_dtype = np.dtype(otypes[0]) # FIXME if excluded is not None: @@ -170,7 +169,6 @@ def __init__( "signature variable is not supported yet" ) - def _get_func_body(self, func: Callable[[Any], Any]) -> list[str]: """Using the magic method __doc__, we KNOW the size of the docstring. 
We then, just substract this from the total length of the function @@ -186,30 +184,29 @@ def _get_func_body(self, func: Callable[[Any], Any]) -> list[str]: return_lines.append(lines[i].rstrip()) return return_lines - def _get_return_argumets(self)->list[str]: + def _get_return_argumets(self) -> list[str]: """ Returns the list of names for return arrays/values """ self._func_body = self._get_func_body(self._pyfunc) return_names = [] - for l in self._func_body: - if "return" in l: - l = l.replace("return", '') - l=l.replace(" ",'') - return_names = l.split(",") + for ln in self._func_body: + if "return" in ln: + ln = ln.replace("return", "") + ln = ln.replace(" ", "") + return_names = ln.split(",") return return_names - def _replace_name( self, name: str, _LOOP_VAR: str, is_gpu: bool = False ) -> str: """ add indices to the names of input/output arrays in the function body """ - if (name in self._arg_names) or (name in self._return_names ): + if (name in self._arg_names) or (name in self._return_names): return "{}[int({})]".format(name, _LOOP_VAR) else: - if is_gpu or ((not is_gpu) and not (name in self._scalar_names)) : + if is_gpu or ((not is_gpu) and not (name in self._scalar_names)): return "{}".format(name) else: return "{}[0]".format(name) @@ -267,7 +264,7 @@ def _lift_to_array_access(m: Any) -> str: # kernel body lines_old = self._func_body for line in lines_old: - if not ( "return" in line): + if not ("return" in line): l_new = re.sub(r"[_a-zA-Z]\w*", _lift_to_array_access, line) lines.append(l_new) @@ -300,10 +297,7 @@ def _build_cpu_function(self) -> Callable[[Any], Any]: # Unpack kernel arguments def _emit_assignment( - var: Any, - idx: int, - sz: Any, - ty: np.dtype[Any] + var: Any, idx: int, sz: Any, ty: np.dtype[Any] ) -> None: lines.append( " {} = carray({}[{}], {}, types.{})".format( @@ -319,7 +313,7 @@ def _emit_assignment( self._return_names[count], arg_idx, _SIZE_VAR, type_a ) arg_idx += 1 - for count,a in enumerate(self._args): + for count, a in enumerate(self._args): type_a = a.dtype _emit_assignment( self._arg_names[count], arg_idx, _SIZE_VAR, type_a @@ -359,7 +353,7 @@ def _lift_to_array_access(m: Any) -> str: return self._replace_name(m.group(0), _LOOP_VAR) for line in lines_old: - if not ( "return" in line): + if not ("return" in line): l_new = re.sub(r"[_a-zA-Z]\w*", _lift_to_array_access, line) lines.append(" " + l_new) @@ -405,13 +399,13 @@ def _compile_func_gpu(self) -> tuple[Any]: return numba.cuda.compile_ptx(self._numba_func, sig, cc=cuda_arch) def _compile_func_cpu(self) -> numba.core.ccallback.CFunc: - sig = numba.core.types.void( + sig = numba.core.types.void( # type : ignore numba.types.CPointer(numba.types.voidptr), numba.core.types.uint64, numba.core.types.uint64, numba.core.types.CPointer(numba.core.types.uint64), numba.core.types.CPointer(numba.core.types.uint64), - ) + ) # type : ignore return numba.cfunc(sig)(self._numba_func) @@ -447,8 +441,8 @@ def _execute(self, is_gpu: bool, num_gpus: int = 0) -> None: task.add_scalar_arg(a, dtype) # add return arguments - a0=None - if len (self._return_args)>0: + a0 = None + if len(self._return_args) > 0: a0 = self._return_args[0]._thunk a0 = runtime.to_deferred_array(a0) for count, a in enumerate(self._return_args): @@ -459,10 +453,10 @@ def _execute(self, is_gpu: bool, num_gpus: int = 0) -> None: if count != 0: task.add_alignment(a0.base, a_tmp_base) # add array arguments - if len (self._args)>0: + if len(self._args) > 0: if a0 is None: - a0 = self._args[0]._thunk - a0 = runtime.to_deferred_array(a0) + a0 = 
self._args[0]._thunk + a0 = runtime.to_deferred_array(a0) for count, a in enumerate(self._args): a_tmp = runtime.to_deferred_array(a._thunk) a_tmp_base = a_tmp.base @@ -478,7 +472,7 @@ def _execute(self, is_gpu: bool, num_gpus: int = 0) -> None: ) # type : ignore task.execute() - def __call__(self, *args: Any, **kwargs: Any) -> None: + def __call__(self, *args: Any, **kwargs: Any) -> Union[Any, Tuple[Any]]: # each time we call `vectorize` on a pyfunc we need to clear # these lists to support different types of arguments passed self._scalar_args.clear() @@ -487,7 +481,7 @@ def __call__(self, *args: Any, **kwargs: Any) -> None: self._arg_names.clear() self._scalar_names.clear() - scalar_idx=0 + scalar_idx = 0 for i, arg in enumerate(args): if arg is None: raise ValueError( @@ -496,15 +490,16 @@ def __call__(self, *args: Any, **kwargs: Any) -> None: ) elif np.ndim(arg) == 0: if self._cache and not self._created: - self._cached_scalar_types.apend(type(arg)) + self._cached_scalar_types.append(type(arg)) elif self._cache: if self._cached_scalar_types[scalar_idx] != type(arg): raise TypeError( " Input arguments to vectorized function should" - " have consistent types for each invocation") + " have consistent types for each invocation" + ) self._scalar_args.append(arg) self._scalar_idxs.append(i) - scalar_idx+=1 + scalar_idx += 1 else: self._args.append(convert_to_cunumeric_ndarray(arg)) @@ -523,8 +518,8 @@ def __call__(self, *args: Any, **kwargs: Any) -> None: "kwargs are not supported in user functions" ) - #we need to do ther rest each time `__call__` is executed - output_shape = None + # we need to do ther rest each time `__call__` is executed + output_shape: Tuple[int] = (-1,) output_dtype = self._output_dtype self._return_args.clear() # if output type is not specified, we need to decide @@ -537,13 +532,13 @@ def __call__(self, *args: Any, **kwargs: Any) -> None: if r in self._arg_names: idx = self._arg_names.index(r) if output_dtype is None: - output_dtype = self._args[idx].dtype - if output_shape is None: - output_shape = self._args[idx].shape + output_dtype = self._args[idx].dtype + if output_shape is (-1,): + output_shape = self._args[idx].shape break - - #the case if we didn't find output argument in input argnames - if output_shape is None: + + # the case if we didn't find output argument in input argnames + if output_shape is (-1,): for r in self._return_names: if r in self._scalar_names: idx = self._scalar_names.index(r) @@ -551,38 +546,40 @@ def __call__(self, *args: Any, **kwargs: Any) -> None: output_dtype = np.dtype(type(self._scalar_args[idx])) output_shape = (1,) break - + if self._cache and not (self._cached_dtype is None): - if self._cached_dtype !=output_dtype: - raise TypeError("types of the arguments should stay the same" - " for each invocation of the vectorize object") + if self._cached_dtype != output_dtype: + raise TypeError( + "types of the arguments should stay the same" + " for each invocation of the vectorize object" + ) elif self._cache: self._cached_dtype = output_dtype - #FIXME - #we could find common type of input arguments here and - #broadcasted shapes - if self._num_outputs>0 and output_dtype is None: + # FIXME + # we could find common type of input arguments here and + # broadcasted shapes + if self._num_outputs > 0 and output_dtype is None: raise ValueError("Unable to choose output dtype") - if self._num_outputs>0 and output_shape is None: + if self._num_outputs > 0 and output_shape is None: raise ValueError("Unable to choose output shape") - # filing the 
list of return arguments # check if there are return argnames in input argnames, # if not, create a new array for r in self._return_names: if r in self._arg_names: idx = self._arg_names.index(r) - if self._args[idx].shape !=output_shape: + if self._args[idx].shape != output_shape: raise ValueError( - "all output arrays should have the same shape") + "all output arrays should have the same shape" + ) if output_dtype != self._args[idx].dtype: runtime.warn( "converting input array to output types in user func ", category=RuntimeWarning, ) - self._args[idx]=self._args[idx].astype(output_dtype) + self._args[idx] = self._args[idx].astype(output_dtype) self._return_args.append(self._args[idx]) self._args.remove(self._args[idx]) self._arg_names.remove(r) @@ -590,31 +587,34 @@ def __call__(self, *args: Any, **kwargs: Any) -> None: idx = self._scalar_names.index(r) if output_shape != (1,): raise ValueError( - "all output arrays should have the same shape") - self._return_args.append(full(output_shape,self._scalar_args[idx], output_dtype)) + "all output arrays should have the same shape" + ) + self._return_args.append( + full(output_shape, self._scalar_args[idx], output_dtype) + ) self._scalar_args.remove(self._scalar_args[idx]) self._scalar_names.remove(r) else: - #create array and add it to the list of return_args - tmp_ret = full(output_shape,0, output_dtype) + # create array and add it to the list of return_args + tmp_ret = full(output_shape, 0, output_dtype) self._return_args.append(tmp_ret) - #FIXME - #if self._num_outputs==0: + # FIXME + # if self._num_outputs==0: # #execute function that doesn't modify anything: # self._pyfunc(args) # return # bring all arrays to same type if len(self._args) > 0: - for count, a in enumerate(self._args): + for count, a in enumerate(self._args): if output_dtype != a.dtype: runtime.warn( "converting input array to output types in user func ", category=RuntimeWarning, ) self._args[count] = self._args[count].astype(output_dtype) - #FIXME broadcast shapes - if output_shape != self._args[count].shape : + # FIXME broadcast shapes + if output_shape != self._args[count].shape: raise ValueError( "cuNumeric doesnt support " "different shapes for arrays in " @@ -634,9 +634,9 @@ def __call__(self, *args: Any, **kwargs: Any) -> None: self._created = True self._execute(False) - if len(self._return_args)==1: + if len(self._return_args) == 1: return self._return_args[0] - if len(self._return_args)>1: + if len(self._return_args) > 1: return tuple(self._return_args) else: - return + return -1 diff --git a/src/cunumeric/vectorize/eval_udf.cc b/src/cunumeric/vectorize/eval_udf.cc index 04133ef671..1860b588c1 100644 --- a/src/cunumeric/vectorize/eval_udf.cc +++ b/src/cunumeric/vectorize/eval_udf.cc @@ -75,10 +75,10 @@ struct EvalUdfCPU { int dim = 1; if (args.inputs.size() > 0) { dim = args.inputs[0].dim() == 0 ? 
1 : args.inputs[0].dim(); - assert(dim>0); + assert(dim > 0); double_dispatch(dim, args.inputs[0].code(), EvalUdfCPU{}, args); } else { - LegateTypeCode code = LegateTypeCode::BOOL_LT ; + LegateTypeCode code = LegateTypeCode::BOOL_LT; double_dispatch(dim, code, EvalUdfCPU{}, args); } } diff --git a/src/cunumeric/vectorize/eval_udf.cu b/src/cunumeric/vectorize/eval_udf.cu index ebe3e0730e..f7c6a452a9 100644 --- a/src/cunumeric/vectorize/eval_udf.cu +++ b/src/cunumeric/vectorize/eval_udf.cu @@ -33,7 +33,7 @@ struct EvalUdfGPU { Rect rect; size_t input_size = args.inputs.size(); - CUfunction func = get_udf(args.hash); + CUfunction func = get_udf(args.hash); // Filling up the buffer with arguments size_t buffer_size = (input_size + args.scalars.size()) * sizeof(void*); @@ -136,7 +136,7 @@ struct EvalUdfGPU { dim = args.inputs[0].dim() == 0 ? 1 : args.inputs[0].dim(); double_dispatch(dim, args.inputs[0].code(), EvalUdfGPU{}, args); } else { - LegateTypeCode code = LegateTypeCode::BOOL_LT ; + LegateTypeCode code = LegateTypeCode::BOOL_LT; double_dispatch(dim, code, EvalUdfGPU{}, args); } } diff --git a/typings/numba/types/__init__.pyi b/typings/numba/types/__init__.pyi index ffbfbd5a94..697a068cfd 100644 --- a/typings/numba/types/__init__.pyi +++ b/typings/numba/types/__init__.pyi @@ -2,7 +2,7 @@ class Type(): ... -class Number(): ... +class Number(Type): ... class Integer(Number): def __init__(self, name: str) ->None: ... From 5b752976ac7809092c3bf2118d21f8aea1ab0cd7 Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Thu, 23 Mar 2023 16:39:15 -0700 Subject: [PATCH 62/78] small bugfux --- cunumeric/vectorize.py | 4 ++-- tests/integration/test_vectorize.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/cunumeric/vectorize.py b/cunumeric/vectorize.py index 6c07d85da3..040bc433c9 100644 --- a/cunumeric/vectorize.py +++ b/cunumeric/vectorize.py @@ -534,12 +534,12 @@ def __call__(self, *args: Any, **kwargs: Any) -> Union[Any, Tuple[Any]]: idx = self._arg_names.index(r) if output_dtype is None: output_dtype = self._args[idx].dtype - if output_shape is (-1,): + if output_shape == (-1,): output_shape = self._args[idx].shape break # the case if we didn't find output argument in input argnames - if output_shape is (-1,): + if output_shape == (-1,): for r in self._return_names: if r in self._scalar_names: idx = self._scalar_names.index(r) diff --git a/tests/integration/test_vectorize.py b/tests/integration/test_vectorize.py index cbb49bcaf4..f75aeab085 100644 --- a/tests/integration/test_vectorize.py +++ b/tests/integration/test_vectorize.py @@ -180,9 +180,9 @@ def test_cache(): def test_nd_vectorize(ndim): a_shape = tuple(np.random.randint(1, 9) for _ in range(ndim)) a = mk_seq_array(np, a_shape) - a_num = mk_seq_array(num, a_shape) + a_num = num.array(a) b=a*2 - b_num=a_num*2 + b_num=num.array(b) a,c=func_np2(a,b) a_num,c_num = func_num2(a_num, b_num) assert np.array_equal(a, a_num) From 09b755fb306db1c3c305c62a59a949099e49eaf0 Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Thu, 23 Mar 2023 20:40:54 -0700 Subject: [PATCH 63/78] clean-up + formatting --- cunumeric/utils.py | 5 +- cunumeric/vectorize.py | 120 +++++----- examples/black_scholes_greeks.py | 154 ++++++++----- src/cunumeric/cuda_help.h | 38 ---- src/cunumeric/pitches.h | 27 +-- src/cunumeric/vectorize/create_cu_kernel.cu | 10 +- src/cunumeric/vectorize/eval_udf.cc | 11 +- src/cunumeric/vectorize/eval_udf_template.inl | 69 ------ tests/integration/test_vectorize.py | 212 +++++++++--------- 9 files changed, 287 
insertions(+), 359 deletions(-) delete mode 100644 src/cunumeric/vectorize/eval_udf_template.inl diff --git a/cunumeric/utils.py b/cunumeric/utils.py index f4cf6f962f..381470a0e9 100644 --- a/cunumeric/utils.py +++ b/cunumeric/utils.py @@ -51,7 +51,7 @@ "int16": ty.int16, "int32": ty.int32, "int": ty.int64, # np.int is int - "int64": ty.int64, + "int64": ty.int64, "uint8": ty.uint8, "uint16": ty.uint16, "uint32": ty.uint32, @@ -117,7 +117,8 @@ def is_supported_dtype(dtype: Any) -> bool: raise TypeError("expected a NumPy dtype") return dtype.type in SUPPORTED_DTYPES -def convert_to_cunumeric_dtype(dtype: str) ->Any: + +def convert_to_cunumeric_dtype(dtype: str) -> Any: if dtype in CUNUMERIC_TYPE_MAP: return CUNUMERIC_TYPE_MAP[dtype] else: diff --git a/cunumeric/vectorize.py b/cunumeric/vectorize.py index 040bc433c9..893b0189ed 100644 --- a/cunumeric/vectorize.py +++ b/cunumeric/vectorize.py @@ -1,4 +1,4 @@ - # Copyright 2023 NVIDIA Corporation +# Copyright 2023 NVIDIA Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -172,7 +172,7 @@ def __init__( def _get_func_body(self, func: Callable[[Any], Any]) -> list[str]: """Using the magic method __doc__, we KNOW the size of the docstring. - We then, just substract this from the total length of the function + We then, just subtract this from the total length of the function """ lines_to_skip = 0 if func.__doc__ is not None and len(func.__doc__.split("\n")) > 0: @@ -239,12 +239,12 @@ def _build_gpu_function(self) -> Any: lines.append(" local_i = cuda.grid(1)") lines.append(" if local_i >= {}:".format(_SIZE_VAR)) lines.append(" return") - # we compute inndex for sparse data access when using Legion's + # we compute index for sparse data access when using Legion's # pointer. - # aa[x][y][z]=a[x*strides[0] + y*strides[1] + z*strides[2]] + # a[x][y][z]=a[x*strides[0] + y*strides[1] + z*strides[2]] lines.append(" {}:int = 0".format(_LOOP_VAR)) lines.append(" for p in range({}-1):".format(_DIM_VAR)) - # fixme make sure we compute index correct for all data types + # FIXME make sure we compute index correct for all data types lines.append(" x=int(local_i/{}[p])".format(_PITCHES_VAR)) lines.append( " local_i = int(local_i%{}[p])".format(_PITCHES_VAR) @@ -306,7 +306,7 @@ def _emit_assignment( ) ) - # define pyfunc arguments ar carrays + # define pyfunc arguments as carrays arg_idx = 0 for count, a in enumerate(self._return_args): type_a = a.dtype @@ -329,9 +329,9 @@ def _emit_assignment( # Main loop lines.append(" for local_i in range({}):".format(_SIZE_VAR)) - # we compute inndex for sparse data access when using Legion's + # we compute index for sparse data access when using Legion's # pointer. 
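As a rough illustration of the index arithmetic described in the surrounding comments (a sketch, not part of the patch): the generated kernel decomposes a flat element index into per-dimension coordinates using the pitches array and then applies the strides, roughly along these lines; the helper name and its exact stride handling are assumptions for illustration only.

    # pure-Python sketch of the emitted index computation; assumes
    # pitches[p] holds the number of elements spanned by one step in
    # dimension p and strides[p] the physical stride of dimension p
    def flat_to_offset(local_i, pitches, strides, dim):
        offset = 0
        for p in range(dim - 1):
            x = local_i // pitches[p]      # coordinate along dimension p
            local_i = local_i % pitches[p]
            offset += x * strides[p]
        # the remaining local_i is the coordinate along the last dimension
        return offset + local_i * strides[dim - 1]
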
- # aa[x][y][z]=a[x*strides[0] + y*strides[1] + z*strides[2]] + # a[x][y][z]=a[x*strides[0] + y*strides[1] + z*strides[2]] lines.append(" {}:int = 0".format(_LOOP_VAR)) lines.append(" j:int = local_i") lines.append(" for p in range({}-1):".format(_DIM_VAR)) @@ -366,13 +366,7 @@ def _lift_to_array_access(m: Any) -> str: def _get_numba_types(self, need_pointer: bool = True) -> list[Any]: types = [] - for arg in self._return_args: - type_a = arg.dtype - type_a = str(type_a) if type_a != bool else "int8" - type_a = getattr(numba.core.types, type_a) - type_a = numba.core.types.CPointer(type_a) - types.append(type_a) - for arg in self._args: + for arg in self._return_args + self._args: type_a = arg.dtype type_a = str(type_a) if type_a != bool else "int8" type_a = getattr(numba.core.types, type_a) @@ -400,69 +394,80 @@ def _compile_func_gpu(self) -> tuple[Any]: return numba.cuda.compile_ptx(self._numba_func, sig, cc=cuda_arch) def _compile_func_cpu(self) -> numba.core.ccallback.CFunc: - sig = numba.core.types.void( # type : ignore + sig = numba.core.types.void( # type: ignore numba.types.CPointer(numba.types.voidptr), numba.core.types.uint64, numba.core.types.uint64, numba.core.types.CPointer(numba.core.types.uint64), numba.core.types.CPointer(numba.core.types.uint64), - ) # type : ignore + ) return numba.cfunc(sig)(self._numba_func) + def _create_cuda_kernel(self, num_gpus: int) -> None: + # create CUDA kernel + launch_domain = Rect(lo=(0,), hi=(num_gpus,)) + kernel_task = self._context.create_manual_task( + CuNumericOpCode.CREATE_CU_KERNEL, + launch_domain=launch_domain, + ) + ptx_hash = hash(self._gpu_func[0]) + kernel_task.add_scalar_arg(ptx_hash, ty.int64) + kernel_task.add_scalar_arg(self._gpu_func[0], ty.string) + kernel_task.execute() + # we want to make sure EVAL_UDF function is not executed before + # CUDA kernel is created + self._context.issue_execution_fence(block=True) + + # task has finished by the time we set self._created to True + if self._cache: + self._created = True + @track_provenance(runtime.legate_context) def _execute(self, is_gpu: bool, num_gpus: int = 0) -> None: if is_gpu and not self._created: - # create CUDA kernel - launch_domain = Rect(lo=(0,), hi=(num_gpus,)) - kernel_task = self._context.create_manual_task( - CuNumericOpCode.CREATE_CU_KERNEL, - launch_domain=launch_domain, - ) - ptx_hash = hash(self._gpu_func[0]) - kernel_task.add_scalar_arg(ptx_hash, ty.int64) - kernel_task.add_scalar_arg(self._gpu_func[0], ty.string) - kernel_task.execute() - # we want to make sure EVAL_UDF function is not executed before - # CUDA kernel is created - self._context.issue_execution_fence(block=True) - - # task has finished by the time we set self._created to True - if self._cache: - self._created = True + self._create_cuda_kernel(num_gpus) task = self._context.create_auto_task(CuNumericOpCode.EVAL_UDF) task.add_scalar_arg(self._num_outputs, ty.uint32) # N of outputs task.add_scalar_arg( len(self._scalar_args), ty.uint32 ) # N of scalar_args - # add all scalars + + # add all scalar arguments first for a in self._scalar_args: dtype = convert_to_cunumeric_dtype(type(a).__name__) task.add_scalar_arg(a, dtype) - # add return arguments - a0 = None - if len(self._return_args) > 0: - a0 = self._return_args[0]._thunk - a0 = runtime.to_deferred_array(a0) - for count, a in enumerate(self._return_args): - a_tmp = runtime.to_deferred_array(a._thunk) + num_args = len(self._args) + # add return arguments with RW permissions + first_array = None + if self._num_outputs > 0: + first_array = 
runtime.to_deferred_array( + self._return_args[0]._thunk + ) + task.add_input(first_array.base) + task.add_output(first_array.base) + + for i in range(1, self._num_outputs): + a_tmp = runtime.to_deferred_array(self._return_args[i]._thunk) a_tmp_base = a_tmp.base task.add_input(a_tmp_base) task.add_output(a_tmp_base) - if count != 0: - task.add_alignment(a0.base, a_tmp_base) - # add array arguments - if len(self._args) > 0: - if a0 is None: - a0 = self._args[0]._thunk - a0 = runtime.to_deferred_array(a0) - for count, a in enumerate(self._args): - a_tmp = runtime.to_deferred_array(a._thunk) + task.add_alignment(first_array.base, a_tmp_base) + + # add array arguments with read-only permissions + if num_args > 0: + start = 0 + if first_array is None: + first_array = runtime.to_deferred_array(self._args[0]._thunk) + task.add_input(first_array.base) + start = 1 + for i in range(start, num_args): + a_tmp = runtime.to_deferred_array(self._args[i]._thunk) a_tmp_base = a_tmp.base task.add_input(a_tmp_base) - task.add_alignment(a0.base, a_tmp_base) + task.add_alignment(first_array.base, a_tmp_base) if is_gpu: ptx_hash = hash(self._gpu_func[0]) @@ -519,10 +524,10 @@ def __call__(self, *args: Any, **kwargs: Any) -> Union[Any, Tuple[Any]]: "kwargs are not supported in user functions" ) - # we need to do ther rest each time `__call__` is executed output_shape: Tuple[int] = (-1,) output_dtype = self._output_dtype self._return_args.clear() + # if output type is not specified, we need to decide # which one to use # we also want to choose the shape for output array @@ -599,13 +604,8 @@ def __call__(self, *args: Any, **kwargs: Any) -> Union[Any, Tuple[Any]]: # create array and add it to the list of return_args tmp_ret = full(output_shape, 0, output_dtype) self._return_args.append(tmp_ret) - # FIXME - # if self._num_outputs==0: - # #execute function that doesn't modify anything: - # self._pyfunc(args) - # return - # bring all arrays to same type + # check types and shapes if len(self._args) > 0: for count, a in enumerate(self._args): if output_dtype != a.dtype: @@ -617,7 +617,7 @@ def __call__(self, *args: Any, **kwargs: Any) -> Union[Any, Tuple[Any]]: # FIXME broadcast shapes if output_shape != self._args[count].shape: raise ValueError( - "cuNumeric doesnt support " + "cuNumeric doesn't support " "different shapes for arrays in " "user function passed to vectorize" ) diff --git a/examples/black_scholes_greeks.py b/examples/black_scholes_greeks.py index b4538533bb..6d955c945b 100644 --- a/examples/black_scholes_greeks.py +++ b/examples/black_scholes_greeks.py @@ -16,57 +16,92 @@ # import argparse - -from benchmark import parse_args, run_benchmark, CuNumericTimer from enum import IntEnum -import math -import cunumeric as np +from benchmark import CuNumericTimer, parse_args, run_benchmark -NUM_ITERS=10 -WARMUP_ITER=2 +NUM_ITERS = 10 +WARMUP_ITER = 2 vol_start = 0.1 vol_step = 0.01 t_start = 0.5 -t_step = 1.0/(365*10) +t_step = 1.0 / (365 * 10) money_start = -0.4 money_step = 0.01 RISKFREE = 0.02 S0 = 100.0 -N_GREEKS=7 +N_GREEKS = 7 + class Greeks(IntEnum): - PREM=0, - DELTA=1, - VEGA=2, - GAMMA=3, - VANNA=4, - VOLGA=5, - THETA=6 + PREM = (0,) + DELTA = (1,) + VEGA = (2,) + GAMMA = (3,) + VANNA = (4,) + VOLGA = (5,) + THETA = 6 def initialize(n_vol_steps, n_t_steps, n_money_steps, D): - CALL = np.zeros((N_GREEKS, n_t_steps, n_vol_steps, n_money_steps,), dtype = D) - PUT = np.zeros((N_GREEKS, n_t_steps, n_vol_steps, n_money_steps,), dtype = D) - S=np.full((n_t_steps, n_vol_steps, n_money_steps,),S0, dtype = D) - 
temp_arr = np.arange((n_vol_steps*n_t_steps*n_money_steps), dtype=int) - k_temp=(temp_arr%n_money_steps)*money_step - k_temp = k_temp.reshape((n_t_steps, n_vol_steps, n_money_steps,)) - K=(k_temp+(1 + money_start))*S0 - - t_temp = (temp_arr%(n_vol_steps*n_money_steps))*vol_step - t_temp = t_temp.reshape((n_t_steps, n_vol_steps, n_money_steps,)) - T=t_temp+t_start - R= 0.02 - V=np.full((n_t_steps, n_vol_steps, n_money_steps), vol_start, dtype = D) + CALL = np.zeros( + ( + N_GREEKS, + n_t_steps, + n_vol_steps, + n_money_steps, + ), + dtype=D, + ) + PUT = np.zeros( + ( + N_GREEKS, + n_t_steps, + n_vol_steps, + n_money_steps, + ), + dtype=D, + ) + S = np.full( + ( + n_t_steps, + n_vol_steps, + n_money_steps, + ), + S0, + dtype=D, + ) + temp_arr = np.arange((n_vol_steps * n_t_steps * n_money_steps), dtype=int) + k_temp = (temp_arr % n_money_steps) * money_step + k_temp = k_temp.reshape( + ( + n_t_steps, + n_vol_steps, + n_money_steps, + ) + ) + K = (k_temp + (1 + money_start)) * S0 + + t_temp = (temp_arr % (n_vol_steps * n_money_steps)) * vol_step + t_temp = t_temp.reshape( + ( + n_t_steps, + n_vol_steps, + n_money_steps, + ) + ) + T = t_temp + t_start + R = 0.02 + V = np.full((n_t_steps, n_vol_steps, n_money_steps), vol_start, dtype=D) for i in range(n_vol_steps): - V[:,i,:]+=i*vol_step + V[:, i, :] += i * vol_step return CALL, PUT, S, K, T, R, V + def normCDF(d): A1 = 0.31938153 A2 = -0.356563782 @@ -77,37 +112,44 @@ def normCDF(d): K = 1.0 / (1.0 + 0.2316419 * np.absolute(d)) - cnd = RSQRT2PI * np.exp(- 0.5 * d * d) * (K * (A1 + K * (A2 + K * (A3 + K * (A4 + K * A5))))) + cnd = ( + RSQRT2PI + * np.exp(-0.5 * d * d) + * (K * (A1 + K * (A2 + K * (A3 + K * (A4 + K * A5))))) + ) return np.where(d > 0, 1.0 - cnd, cnd) + def normPDF(d): RSQRT2PI = 0.39894228040143267793994605993438 - return RSQRT2PI * np.exp(- 0.5 * d * d) + return RSQRT2PI * np.exp(-0.5 * d * d) + -def black_scholes ( out , S, K, R, T, V, CP, greek): - EPS = 0.00000001 +def black_scholes(out, S, K, R, T, V, CP, greek): stdev = V * np.sqrt(T) - df = np.exp(-R*T) - d1 = (np.log(S/K)+(R+0.5*V*V)*T)/stdev - d2= d1-stdev - nd1 = normCDF(CP*d1) - nd2 = normCDF(CP*d2) + df = np.exp(-R * T) + d1 = (np.log(S / K) + (R + 0.5 * V * V) * T) / stdev + d2 = d1 - stdev + nd1 = normCDF(CP * d1) + nd2 = normCDF(CP * d2) if greek == Greeks.PREM: - out[...] = CP*(S*nd1 - K*df*nd2) + out[...] = CP * (S * nd1 - K * df * nd2) elif greek == Greeks.DELTA: - out[...] = CP*nd1 - elif greek ==Greeks.VEGA: - out[...] = S*np.sqrt(T)*normPDF(d1) + out[...] = CP * nd1 + elif greek == Greeks.VEGA: + out[...] = S * np.sqrt(T) * normPDF(d1) elif greek == Greeks.GAMMA: - out[...] = normPDF(d1)/(S*V*np.sqrt(T)) + out[...] = normPDF(d1) / (S * V * np.sqrt(T)) elif greek == Greeks.VANNA: - out[...] = -d2*normPDF(d1)/V + out[...] = -d2 * normPDF(d1) / V elif greek == Greeks.VOLGA: - out[...] = S*np.sqrt(T)*d1*d2*normPDF(d1)/V + out[...] = S * np.sqrt(T) * d1 * d2 * normPDF(d1) / V elif greek == Greeks.THETA: - out[...] = -(0.5*S*V/np.sqrt(T)*normPDF(d1)+CP*R*df*K*nd2) + out[...] 
= -( + 0.5 * S * V / np.sqrt(T) * normPDF(d1) + CP * R * df * K * nd2 + ) else: raise RuntimeError("Wrong greek name is passed") @@ -115,20 +157,23 @@ def black_scholes ( out , S, K, R, T, V, CP, greek): def run_black_scholes(n_vol_steps, n_t_steps, n_money_steps): timer = CuNumericTimer() print("Start black_scholes") - CALL, PUT, S, K, T, R, V = initialize(n_vol_steps, n_t_steps, n_money_steps, np.float32) + CALL, PUT, S, K, T, R, V = initialize( + n_vol_steps, n_t_steps, n_money_steps, np.float32 + ) print("After the initialization") - for i in range (NUM_ITERS): - if i==WARMUP_ITER: + for i in range(NUM_ITERS): + if i == WARMUP_ITER: timer.start() for g in Greeks: - black_scholes(CALL[g.value],S, K, R, T, V, 1, g) - black_scholes(PUT[g.value],S, K, R, T, V, -1, g) + black_scholes(CALL[g.value], S, K, R, T, V, 1, g) + black_scholes(PUT[g.value], S, K, R, T, V, -1, g) - total = (timer.stop())/(NUM_ITERS-WARMUP_ITER) + total = (timer.stop()) / (NUM_ITERS - WARMUP_ITER) print("Elapsed Time: {} ms".format(total)) return total + if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument( @@ -157,14 +202,11 @@ def run_black_scholes(n_vol_steps, n_t_steps, n_money_steps): help="number of money steps", ) - args, np, timer = parse_args(parser) - + run_benchmark( run_black_scholes, args.benchmark, "Black Scholes", (args.n_vol_steps, args.n_time_steps, args.n_money_steps), ) - - diff --git a/src/cunumeric/cuda_help.h b/src/cunumeric/cuda_help.h index eb90ab6da2..de88b6707d 100644 --- a/src/cunumeric/cuda_help.h +++ b/src/cunumeric/cuda_help.h @@ -392,42 +392,4 @@ __device__ __forceinline__ void store_streaming(double* ptr, double valu { asm volatile("st.global.cs.f64 [%0], %1;" : : "l"(ptr), "d"(value) : "memory"); } -#if 0 -#include - -class JITKernelStorage -{ - -private: - JITKernelStorage(){} - std::map, CUfunction> jit_functions_; - -public: - JITKernelStorage( JITKernelStorage const&) = delete; - - void operator=(JITKernelStorage const&) = delete; - - static JITKernelStorage& get_instance(void){ - static JITKernelStorage instance; - return instance; - } - - bool registered_jit_funtion(std::pair &key){ - return jit_functions_.find(key)!=jit_functions_.end(); - }; - - CUfunction return_saved_jit_function(std::pair &key){ - if ( - jit_functions_.find(key)!=jit_functions_.end()) - return jit_functions_[key]; - else - assert(false);//should never come here - } - - void add_jit_function(std::pair &key, CUfunction func){ - jit_functions_.insert({key, func}); - } -};//class JITKernelStorage - -#endif } // namespace cunumeric diff --git a/src/cunumeric/pitches.h b/src/cunumeric/pitches.h index dd95068e38..27d179b0e5 100644 --- a/src/cunumeric/pitches.h +++ b/src/cunumeric/pitches.h @@ -53,12 +53,9 @@ class Pitches { point[DIM] += index; return point; } - + __CUDA_HD__ - inline const size_t* data(void) - { - return &pitches[0]; - } + inline const size_t* data(void) { return &pitches[0]; } private: size_t pitches[DIM]; @@ -97,11 +94,7 @@ class Pitches { } __CUDA_HD__ - inline const size_t* data(void) - { - return &pitches[0]; - } - + inline const size_t* data(void) { return &pitches[0]; } private: size_t pitches[DIM]; @@ -116,10 +109,10 @@ class Pitches<0, C_ORDER> { { if (rect.lo[0] > rect.hi[0]) return 0; - else{ - pitches[0]=rect.hi[0] - rect.lo[0] + 1; + else { + pitches[0] = rect.hi[0] - rect.lo[0] + 1; return (rect.hi[0] - rect.lo[0] + 1); - } + } } __CUDA_HD__ inline legate::Point<1> unflatten(size_t index, const legate::Point<1>& lo) const @@ -129,14 +122,10 @@ class 
Pitches<0, C_ORDER> { return point; } __CUDA_HD__ - inline const size_t* data(void) - { - return &pitches[0]; - } + inline const size_t* data(void) { return &pitches[0]; } - private: + private: size_t pitches[1]; - }; } // namespace cunumeric diff --git a/src/cunumeric/vectorize/create_cu_kernel.cu b/src/cunumeric/vectorize/create_cu_kernel.cu index dde9616543..5805d2ef1c 100644 --- a/src/cunumeric/vectorize/create_cu_kernel.cu +++ b/src/cunumeric/vectorize/create_cu_kernel.cu @@ -24,19 +24,11 @@ namespace cunumeric { using namespace Legion; using namespace legate; -__global__ static void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM) - fill_out_kernel(const AccessorRD, true, 1> out) -{ - const int idx = (blockIdx.x * blockDim.x + threadIdx.x); - if (idx > 0) return; - out.reduce(0, true); -} - /*static*/ void CreateCUKernelTask::gpu_variant(TaskContext& context) { int64_t ptx_hash = context.scalars()[0].value(); std::string ptx = context.scalars()[1].value(); - Processor point = context.get_current_processor(); + Processor point = legate::Processor::get_executing_processor(); CUfunction func; const unsigned num_options = 4; diff --git a/src/cunumeric/vectorize/eval_udf.cc b/src/cunumeric/vectorize/eval_udf.cc index 1860b588c1..3ea306ff50 100644 --- a/src/cunumeric/vectorize/eval_udf.cc +++ b/src/cunumeric/vectorize/eval_udf.cc @@ -66,12 +66,11 @@ struct EvalUdfCPU { std::vector scalars; for (size_t i = 2; i < (2 + num_scalars); i++) scalars.push_back(context.scalars()[i]); - EvalUdfArgs args{context.scalars()[2 + num_scalars].value(), - context.inputs(), - context.outputs(), - scalars, - num_outputs, - context.get_current_processor()}; + EvalUdfArgs args + { + context.scalars()[2 + num_scalars].value(), context.inputs(), context.outputs(), + scalars, num_outputs, legate::Processor::get_executing_processor() + }; int dim = 1; if (args.inputs.size() > 0) { dim = args.inputs[0].dim() == 0 ? 1 : args.inputs[0].dim(); diff --git a/src/cunumeric/vectorize/eval_udf_template.inl b/src/cunumeric/vectorize/eval_udf_template.inl deleted file mode 100644 index c0bc72c428..0000000000 --- a/src/cunumeric/vectorize/eval_udf_template.inl +++ /dev/null @@ -1,69 +0,0 @@ -/* Copyright 2023 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- * - */ - -#pragma once - -// Useful for IDEs -#include "cunumeric/vectorize/eval_udf.h" -#include "cunumeric/pitches.h" - -namespace cunumeric { - -using namespace Legion; -using namespace legate; - -template -struct EvalUdfImplBody; - -template -struct EvalUdfImpl { - template - void operator()(EvalUdfArgs& args) const - { - using UDF = void(void**, size_t); - auto udf = reinterpret_cast(args.func_ptr); - std::vector udf_args; - using VAL = legate_type_of; - auto rect = args.args[0].shape(); - - size_t strides[DIM]; - - if (rect.empty()) return; - EvalUdfImplBody(); - for (size_t i = 0; i < args.args.size(); i++) { - auto out = args.args[i].write_accessor(rect); - udf_args.push_back(reinterpret_cast(out.ptr(rect, strides))); - for (size_t i = 0; i < DIM; i++) - std::cout << "IRINA DEBUG strides = " << strides[i] << std::endl; - } - - udf(udf_args.data(), rect.volume()); - } -}; - -template -static void eval_udf_template(TaskContext& context) -{ - is_gpus = context.scalars()[0].value(); - if (is_gpus) - EvalUdfArgs args{0, context.scalars()[1].value(), context.outputs()}; - else - EvalUdfArgs args{context.scalars()[1].value(),'', context.outputs()}; - size_t dim = args.args[0].dim() == 0 ? 1 : args.args[0].dim(); - double_dispatch(dim, args.args[0].code(), EvalUdfImpl{}, args); -} - -} // namespace cunumeric diff --git a/tests/integration/test_vectorize.py b/tests/integration/test_vectorize.py index f75aeab085..429bd5474b 100644 --- a/tests/integration/test_vectorize.py +++ b/tests/integration/test_vectorize.py @@ -16,177 +16,189 @@ import numpy as np import pytest - -import cunumeric as num -import numpy as np from legate.core import LEGATE_MAX_DIM from utils.generators import mk_seq_array +import cunumeric as num + + def my_func(a, b): a = a * 2 + b return a -#Capital letters and numbers in the signature + +# Capital letters and numbers in the signature def my_func2(A0, B0): A0 = A0 * 2 + B0 - C0=A0*2 - return A0,C0 + C0 = A0 * 2 + return A0, C0 + def test_vectorize(): - #2 arrays + # 2 arrays func = num.vectorize(my_func) a = num.arange(5) b = num.ones((5,)) a = func(a, b) - assert(np.array_equal(a, [1,3,5,7,9])) + assert np.array_equal(a, [1, 3, 5, 7, 9]) - #array and scalar + # array and scalar func = num.vectorize(my_func) - a= num.arange(5) - b=2 - a = func(a,b) - assert(np.array_equal(a, [2,4,6,8,10])) - - #2 scalars + a = num.arange(5) + b = 2 + a = func(a, b) + assert np.array_equal(a, [2, 4, 6, 8, 10]) + + # 2 scalars func = num.vectorize(my_func) - a=3 - b=2 - a = func(a,b) - assert(a ==8) + a = 3 + b = 2 + a = func(a, b) + assert a == 8 + def empty_func(): print("within empty function") -def print_func(a,b): - print ("I am pringing input arguments", a, b) def test_empty_functions(): - #empty function + # empty function func = num.vectorize(empty_func) func() - func2 = num.vectorize(print_func) - print_func(1,2) - - print_func(np.array([1,2,3]), 2) - def test_vectorize_over_slices(): - #reuse the same vectorize object on - #different slices + # reuse the same vectorize object on + # different slices func_num = num.vectorize(my_func) func_np = np.vectorize(my_func) - a=np.array([[1,2,3],[4,5,6],[7,8,9]]) - b=np.array([[10,11,12],[13,14,15],[16,17,18]]) - a_num=num.array(a) + a = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]) + b = np.array([[10, 11, 12], [13, 14, 15], [16, 17, 18]]) + a_num = num.array(a) b_num = num.array(b) - a[:2] = func_np(a[:2],b[:2]) - a_num[:2] = func_num(a_num[:2],b_num[:2]) + a[:2] = func_np(a[:2], b[:2]) + a_num[:2] = func_num(a_num[:2], b_num[:2]) 
assert np.array_equal(a, a_num) - - a=np.arange(100).reshape((25,4)) - a_num= num.array(a) - b=a*10 - b_num=a_num*10 - a=func_np(a,b) - a_num=func_num(a_num, b_num) + a = np.arange(100).reshape((25, 4)) + a_num = num.array(a) + b = a * 10 + b_num = a_num * 10 + a = func_np(a, b) + a_num = func_num(a_num, b_num) assert np.array_equal(a, a_num) - #reusing the same function for different inputs - a[:,2]=func_np(a[:, 2], b[:,2]) - a_num[:,2] =func_num(a_num[:,2],b_num[:,2]) + # reusing the same function for different inputs + a[:, 2] = func_np(a[:, 2], b[:, 2]) + a_num[:, 2] = func_num(a_num[:, 2], b_num[:, 2]) assert np.array_equal(a, a_num) - #reusing the same function for different inputs - a[5:10,2]=func_np(a[5:10, 2], b[1:6,2]) - a_num[5:10,2]=func_num(a_num[5:10,2],b_num[1:6,2]) + # reusing the same function for different inputs + a[5:10, 2] = func_np(a[5:10, 2], b[1:6, 2]) + a_num[5:10, 2] = func_num(a_num[5:10, 2], b_num[1:6, 2]) assert np.array_equal(a, a_num) - #reusing the same function for different inputs - a[15:20]=func_np(a[15:20], b[15:20]) - a_num[15:20]=func_num(a_num[15:20],b_num[15:20]) + # reusing the same function for different inputs + a[15:20] = func_np(a[15:20], b[15:20]) + a_num[15:20] = func_num(a_num[15:20], b_num[15:20]) assert np.array_equal(a, a_num) # reusing the same function for different inputs - a=np.arange(1000).reshape((25,10,4)) - a_num= num.array(a) - a[:, 2, :] = func_np(a[:, 2, :],2) - a_num[:, 2, :]=func_num(a_num[:, 2, :],2) + a = np.arange(1000).reshape((25, 10, 4)) + a_num = num.array(a) + a[:, 2, :] = func_np(a[:, 2, :], 2) + a_num[:, 2, :] = func_num(a_num[:, 2, :], 2) assert np.array_equal(a, a_num) + def test_multiple_outputs(): - #checking signature with capital letters and numbers + # checking signature with capital letters and numbers # + checking multiple outputs - a=np.arange(100).reshape((25,4)) - a_num= num.array(a) - b=a*10 - b_num=a_num*10 + a = np.arange(100).reshape((25, 4)) + a_num = num.array(a) + b = a * 10 + b_num = a_num * 10 func_np = np.vectorize(my_func2) - func_num=num.vectorize(my_func2) - a,c=func_np(a,b) - a_num,c_num = func_num(a_num, b_num) + func_num = num.vectorize(my_func2) + a, c = func_np(a, b) + a_num, c_num = func_num(a_num, b_num) assert np.array_equal(a, a_num) assert np.array_equal(c, c_num) + def test_different_types(): - #checking the case when input and output types are different - a=np.arange(100, dtype = int).reshape((25,4)) - a_num= num.array(a) - b=a*10 - b_num=a_num*10 + # checking the case when input and output types are different + a = np.arange(100, dtype=int).reshape((25, 4)) + a_num = num.array(a) + b = a * 10 + b_num = a_num * 10 func_np = np.vectorize(my_func, otypes=(float,)) - func_num=num.vectorize(my_func, otypes=(float,)) - a=func_np(a,b) - a_num=func_num(a_num, b_num) + func_num = num.vectorize(my_func, otypes=(float,)) + a = func_np(a, b) + a_num = func_num(a_num, b_num) assert np.array_equal(a, a_num) - #another test for different types - a=np.arange(100, dtype = float).reshape((25,4)) - a_num= num.array(a) - b=a*10 - b_num=a_num*10 - func_np = np.vectorize(my_func2, otypes = (int, int,)) - func_num=num.vectorize(my_func2, otypes = (int, int, )) - a,c=func_np(a,b) - a_num,c_num = func_num(a_num, b_num) + # another test for different types + a = np.arange(100, dtype=float).reshape((25, 4)) + a_num = num.array(a) + b = a * 10 + b_num = a_num * 10 + func_np = np.vectorize( + my_func2, + otypes=( + int, + int, + ), + ) + func_num = num.vectorize( + my_func2, + otypes=( + int, + int, + ), + ) 
+ a, c = func_np(a, b) + a_num, c_num = func_num(a_num, b_num) assert np.array_equal(a, a_num) assert np.array_equal(c, c_num) def test_cache(): - a=np.arange(100).reshape((25,4)) - a_num= num.array(a) - b=a*10 - b_num=a_num*10 - func_np = np.vectorize(my_func2, cache = True) - func_num=num.vectorize(my_func2, cache = True) - for i in range (10): - a=a*2 - b=b*3 - a_num=a_num*2 - b_num=b_num*3 - a,c=func_np(a,b) - a_num,c_num = func_num(a_num, b_num) + a = np.arange(100).reshape((25, 4)) + a_num = num.array(a) + b = a * 10 + b_num = a_num * 10 + func_np = np.vectorize(my_func2, cache=True) + func_num = num.vectorize(my_func2, cache=True) + for i in range(10): + a = a * 2 + b = b * 3 + a_num = a_num * 2 + b_num = b_num * 3 + a, c = func_np(a, b) + a_num, c_num = func_num(a_num, b_num) assert np.array_equal(a, a_num) assert np.array_equal(c, c_num) -#checking caching on different shapes of arrays: -func_np2 = np.vectorize(my_func2, cache = True) -func_num2=num.vectorize(my_func2, cache = True) + +# checking caching on different shapes of arrays: +func_np2 = np.vectorize(my_func2, cache=True) +func_num2 = num.vectorize(my_func2, cache=True) + @pytest.mark.parametrize("ndim", range(1, LEGATE_MAX_DIM + 1)) def test_nd_vectorize(ndim): a_shape = tuple(np.random.randint(1, 9) for _ in range(ndim)) a = mk_seq_array(np, a_shape) a_num = num.array(a) - b=a*2 - b_num=num.array(b) - a,c=func_np2(a,b) - a_num,c_num = func_num2(a_num, b_num) + b = a * 2 + b_num = num.array(b) + a, c = func_np2(a, b) + a_num, c_num = func_num2(a_num, b_num) assert np.array_equal(a, a_num) - assert np.array_equal(c, c_num) + assert np.array_equal(c, c_num) + if __name__ == "__main__": import sys From 393aa1b449eb6af2b366c3978bb29ec695cd03f7 Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Thu, 23 Mar 2023 21:00:02 -0700 Subject: [PATCH 64/78] clean-up + formatting --- src/cunumeric/vectorize/eval_udf.cc | 11 ++++++----- src/cunumeric/vectorize/eval_udf.cu | 2 +- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/src/cunumeric/vectorize/eval_udf.cc b/src/cunumeric/vectorize/eval_udf.cc index 3ea306ff50..8952fb1262 100644 --- a/src/cunumeric/vectorize/eval_udf.cc +++ b/src/cunumeric/vectorize/eval_udf.cc @@ -66,11 +66,12 @@ struct EvalUdfCPU { std::vector scalars; for (size_t i = 2; i < (2 + num_scalars); i++) scalars.push_back(context.scalars()[i]); - EvalUdfArgs args - { - context.scalars()[2 + num_scalars].value(), context.inputs(), context.outputs(), - scalars, num_outputs, legate::Processor::get_executing_processor() - }; + EvalUdfArgs args{context.scalars()[2 + num_scalars].value(), + context.inputs(), + context.outputs(), + scalars, + num_outputs, + legate::Processor::get_executing_processor()}; int dim = 1; if (args.inputs.size() > 0) { dim = args.inputs[0].dim() == 0 ? 
1 : args.inputs[0].dim(); diff --git a/src/cunumeric/vectorize/eval_udf.cu b/src/cunumeric/vectorize/eval_udf.cu index f7c6a452a9..dd1d52c75b 100644 --- a/src/cunumeric/vectorize/eval_udf.cu +++ b/src/cunumeric/vectorize/eval_udf.cu @@ -129,7 +129,7 @@ struct EvalUdfGPU { context.outputs(), scalars, num_outputs, - context.get_current_processor(), + legate::Processor::get_executing_processor(), ptx_hash}; size_t dim = 1; if (args.inputs.size() > 0) { From c8dd7fa064b302b5513153573e5a5860c98bd406 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 24 Mar 2023 04:10:32 +0000 Subject: [PATCH 65/78] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- cunumeric/vectorize.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cunumeric/vectorize.py b/cunumeric/vectorize.py index 893b0189ed..ac150c1e36 100644 --- a/cunumeric/vectorize.py +++ b/cunumeric/vectorize.py @@ -400,7 +400,7 @@ def _compile_func_cpu(self) -> numba.core.ccallback.CFunc: numba.core.types.uint64, numba.core.types.CPointer(numba.core.types.uint64), numba.core.types.CPointer(numba.core.types.uint64), - ) + ) return numba.cfunc(sig)(self._numba_func) From 92170ff3639690f749b970b7a63ae1c827843fb6 Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Sun, 26 Mar 2023 20:37:50 -0700 Subject: [PATCH 66/78] Update cunumeric/vectorize.py Co-authored-by: Bryan Van de Ven --- cunumeric/vectorize.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cunumeric/vectorize.py b/cunumeric/vectorize.py index ac150c1e36..f7345dba48 100644 --- a/cunumeric/vectorize.py +++ b/cunumeric/vectorize.py @@ -500,7 +500,7 @@ def __call__(self, *args: Any, **kwargs: Any) -> Union[Any, Tuple[Any]]: elif self._cache: if self._cached_scalar_types[scalar_idx] != type(arg): raise TypeError( - " Input arguments to vectorized function should" + "Input arguments to vectorized function should" " have consistent types for each invocation" ) self._scalar_args.append(arg) From a3196b4737e67e17e86cc3132fa4108665dd726f Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Mon, 27 Mar 2023 11:28:52 -0700 Subject: [PATCH 67/78] addressing comments from Bryan --- cunumeric/utils.py | 3 +- cunumeric/vectorize.py | 105 +++++++------- examples/black_scholes_greeks.py | 212 ---------------------------- pyproject.toml | 3 +- tests/integration/test_vectorize.py | 90 +++++++----- typings/numba/types/__init__.pyi | 15 +- 6 files changed, 114 insertions(+), 314 deletions(-) delete mode 100644 examples/black_scholes_greeks.py diff --git a/cunumeric/utils.py b/cunumeric/utils.py index 381470a0e9..62917020d2 100644 --- a/cunumeric/utils.py +++ b/cunumeric/utils.py @@ -121,8 +121,7 @@ def is_supported_dtype(dtype: Any) -> bool: def convert_to_cunumeric_dtype(dtype: str) -> Any: if dtype in CUNUMERIC_TYPE_MAP: return CUNUMERIC_TYPE_MAP[dtype] - else: - raise TypeError("dtype is not supported") + raise TypeError("dtype is not supported") def calculate_volume(shape: NdShape) -> int: diff --git a/cunumeric/vectorize.py b/cunumeric/vectorize.py index f7345dba48..c4b73f5f9a 100644 --- a/cunumeric/vectorize.py +++ b/cunumeric/vectorize.py @@ -126,7 +126,6 @@ def __init__( self._scalar_idxs: List[int] = [] self._scalar_names: List[str] = [] self._arg_names: List[str] = [] - self._kwargs: List[Any] = [] self._context = runtime.legate_context self._created: bool = False self._func_body: List[str] = [] @@ -136,7 +135,7 @@ def __init__( else: self.__doc__ 
= doc - self._return_names = self._get_return_argumets() + self._return_names = self._get_return_arguments() self._num_outputs: int = len(self._return_names) self._return_args: List[Any] = [] self._output_dtype: Optional[np.dtype[Any]] = None @@ -170,7 +169,7 @@ def __init__( "signature variable is not supported yet" ) - def _get_func_body(self, func: Callable[[Any], Any]) -> list[str]: + def _get_func_body(self, func: Callable[[Any], Any]) -> List[str]: """Using the magic method __doc__, we KNOW the size of the docstring. We then, just subtract this from the total length of the function """ @@ -185,7 +184,7 @@ def _get_func_body(self, func: Callable[[Any], Any]) -> list[str]: return_lines.append(lines[i].rstrip()) return return_lines - def _get_return_argumets(self) -> list[str]: + def _get_return_arguments(self) -> List[str]: """ Returns the list of names for return arrays/values """ @@ -478,51 +477,9 @@ def _execute(self, is_gpu: bool, num_gpus: int = 0) -> None: ) # type : ignore task.execute() - def __call__(self, *args: Any, **kwargs: Any) -> Union[Any, Tuple[Any]]: - # each time we call `vectorize` on a pyfunc we need to clear - # these lists to support different types of arguments passed - self._scalar_args.clear() - self._scalar_idxs.clear() - self._args.clear() - self._arg_names.clear() - self._scalar_names.clear() - - scalar_idx = 0 - for i, arg in enumerate(args): - if arg is None: - raise ValueError( - "None is not supported in user function " - "passed to cunumeric.vectorize" - ) - elif np.ndim(arg) == 0: - if self._cache and not self._created: - self._cached_scalar_types.append(type(arg)) - elif self._cache: - if self._cached_scalar_types[scalar_idx] != type(arg): - raise TypeError( - "Input arguments to vectorized function should" - " have consistent types for each invocation" - ) - self._scalar_args.append(arg) - self._scalar_idxs.append(i) - scalar_idx += 1 - else: - self._args.append(convert_to_cunumeric_ndarray(arg)) - - # first fill arrays to argnames, then scalars: - for i, k in enumerate(inspect.signature(self._pyfunc).parameters): - if not (i in self._scalar_idxs): - self._arg_names.append(k) - - for i, k in enumerate(inspect.signature(self._pyfunc).parameters): - if i in self._scalar_idxs: - self._scalar_names.append(k) - - self._kwargs = list(kwargs) - if len(self._kwargs) > 0: - raise NotImplementedError( - "kwargs are not supported in user functions" - ) + def _filter_arguments_and_check(self) -> None: + # this method will filter return and input arguments + # it will also check shape and type of the arguments output_shape: Tuple[int] = (-1,) output_dtype = self._output_dtype @@ -622,6 +579,53 @@ def __call__(self, *args: Any, **kwargs: Any) -> Union[Any, Tuple[Any]]: "user function passed to vectorize" ) + def __call__(self, *args: Any, **kwargs: Any) -> Union[Any, Tuple[Any]]: + # each time we call `vectorize` on a pyfunc we need to clear + # these lists to support different types of arguments passed + self._scalar_args.clear() + self._scalar_idxs.clear() + self._args.clear() + self._arg_names.clear() + self._scalar_names.clear() + + scalar_idx = 0 + for i, arg in enumerate(args): + if arg is None: + raise ValueError( + "None is not supported in user function " + "passed to cunumeric.vectorize" + ) + elif np.ndim(arg) == 0: + if self._cache and not self._created: + self._cached_scalar_types.append(type(arg)) + elif self._cache: + if self._cached_scalar_types[scalar_idx] != type(arg): + raise TypeError( + "Input arguments to vectorized function should" + " have 
consistent types for each invocation" + ) + self._scalar_args.append(arg) + self._scalar_idxs.append(i) + scalar_idx += 1 + else: + self._args.append(convert_to_cunumeric_ndarray(arg)) + + # first fill arrays to argnames, then scalars: + for i, k in enumerate(inspect.signature(self._pyfunc).parameters): + if not (i in self._scalar_idxs): + self._arg_names.append(k) + + for i, k in enumerate(inspect.signature(self._pyfunc).parameters): + if i in self._scalar_idxs: + self._scalar_names.append(k) + + if len(kwargs) > 0: + raise NotImplementedError( + "kwargs are not supported in user functions" + ) + + self._filter_arguments_and_check() + if runtime.num_gpus > 0: if not self._created: self._numba_func = self._build_gpu_function() @@ -639,5 +643,4 @@ def __call__(self, *args: Any, **kwargs: Any) -> Union[Any, Tuple[Any]]: return self._return_args[0] if len(self._return_args) > 1: return tuple(self._return_args) - else: - return -1 + return -1 diff --git a/examples/black_scholes_greeks.py b/examples/black_scholes_greeks.py deleted file mode 100644 index 6d955c945b..0000000000 --- a/examples/black_scholes_greeks.py +++ /dev/null @@ -1,212 +0,0 @@ -#!/usr/bin/env python - -# Copyright 2021-2022 NVIDIA Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# - -import argparse -from enum import IntEnum - -from benchmark import CuNumericTimer, parse_args, run_benchmark - -NUM_ITERS = 10 -WARMUP_ITER = 2 - -vol_start = 0.1 -vol_step = 0.01 -t_start = 0.5 -t_step = 1.0 / (365 * 10) -money_start = -0.4 -money_step = 0.01 - - -RISKFREE = 0.02 -S0 = 100.0 -N_GREEKS = 7 - - -class Greeks(IntEnum): - PREM = (0,) - DELTA = (1,) - VEGA = (2,) - GAMMA = (3,) - VANNA = (4,) - VOLGA = (5,) - THETA = 6 - - -def initialize(n_vol_steps, n_t_steps, n_money_steps, D): - CALL = np.zeros( - ( - N_GREEKS, - n_t_steps, - n_vol_steps, - n_money_steps, - ), - dtype=D, - ) - PUT = np.zeros( - ( - N_GREEKS, - n_t_steps, - n_vol_steps, - n_money_steps, - ), - dtype=D, - ) - S = np.full( - ( - n_t_steps, - n_vol_steps, - n_money_steps, - ), - S0, - dtype=D, - ) - temp_arr = np.arange((n_vol_steps * n_t_steps * n_money_steps), dtype=int) - k_temp = (temp_arr % n_money_steps) * money_step - k_temp = k_temp.reshape( - ( - n_t_steps, - n_vol_steps, - n_money_steps, - ) - ) - K = (k_temp + (1 + money_start)) * S0 - - t_temp = (temp_arr % (n_vol_steps * n_money_steps)) * vol_step - t_temp = t_temp.reshape( - ( - n_t_steps, - n_vol_steps, - n_money_steps, - ) - ) - T = t_temp + t_start - R = 0.02 - V = np.full((n_t_steps, n_vol_steps, n_money_steps), vol_start, dtype=D) - for i in range(n_vol_steps): - V[:, i, :] += i * vol_step - - return CALL, PUT, S, K, T, R, V - - -def normCDF(d): - A1 = 0.31938153 - A2 = -0.356563782 - A3 = 1.781477937 - A4 = -1.821255978 - A5 = 1.330274429 - RSQRT2PI = 0.39894228040143267793994605993438 - - K = 1.0 / (1.0 + 0.2316419 * np.absolute(d)) - - cnd = ( - RSQRT2PI - * np.exp(-0.5 * d * d) - * (K * (A1 + K * (A2 + K * (A3 + K * (A4 + K * A5))))) - ) - - return np.where(d > 0, 1.0 - cnd, cnd) - - -def normPDF(d): - RSQRT2PI = 0.39894228040143267793994605993438 - return RSQRT2PI * np.exp(-0.5 * d * d) - - -def black_scholes(out, S, K, R, T, V, CP, greek): - stdev = V * np.sqrt(T) - df = np.exp(-R * T) - d1 = (np.log(S / K) + (R + 0.5 * V * V) * T) / stdev - d2 = d1 - stdev - nd1 = normCDF(CP * d1) - nd2 = normCDF(CP * d2) - - if greek == Greeks.PREM: - out[...] = CP * (S * nd1 - K * df * nd2) - elif greek == Greeks.DELTA: - out[...] = CP * nd1 - elif greek == Greeks.VEGA: - out[...] = S * np.sqrt(T) * normPDF(d1) - elif greek == Greeks.GAMMA: - out[...] = normPDF(d1) / (S * V * np.sqrt(T)) - elif greek == Greeks.VANNA: - out[...] = -d2 * normPDF(d1) / V - elif greek == Greeks.VOLGA: - out[...] = S * np.sqrt(T) * d1 * d2 * normPDF(d1) / V - elif greek == Greeks.THETA: - out[...] 
= -( - 0.5 * S * V / np.sqrt(T) * normPDF(d1) + CP * R * df * K * nd2 - ) - else: - raise RuntimeError("Wrong greek name is passed") - - -def run_black_scholes(n_vol_steps, n_t_steps, n_money_steps): - timer = CuNumericTimer() - print("Start black_scholes") - CALL, PUT, S, K, T, R, V = initialize( - n_vol_steps, n_t_steps, n_money_steps, np.float32 - ) - - print("After the initialization") - for i in range(NUM_ITERS): - if i == WARMUP_ITER: - timer.start() - for g in Greeks: - black_scholes(CALL[g.value], S, K, R, T, V, 1, g) - black_scholes(PUT[g.value], S, K, R, T, V, -1, g) - - total = (timer.stop()) / (NUM_ITERS - WARMUP_ITER) - print("Elapsed Time: {} ms".format(total)) - return total - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument( - "-v", - "--vol_tesps", - type=int, - default=40, - dest="n_vol_steps", - help="number of voltivity steps", - ) - - parser.add_argument( - "-t", - "--time_tesps", - type=int, - default=3650, - dest="n_time_steps", - help="number of time steps", - ) - parser.add_argument( - "-m", - "--money_tesps", - type=int, - default=60, - dest="n_money_steps", - help="number of money steps", - ) - - args, np, timer = parse_args(parser) - - run_benchmark( - run_black_scholes, - args.benchmark, - "Black Scholes", - (args.n_vol_steps, args.n_time_steps, args.n_money_steps), - ) diff --git a/pyproject.toml b/pyproject.toml index 5c8e1f83aa..f577d875ce 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -39,7 +39,8 @@ exclude = ''' _build | buck-out | build | - dist + dist | + typings )/ ''' diff --git a/tests/integration/test_vectorize.py b/tests/integration/test_vectorize.py index 429bd5474b..daba0c9979 100644 --- a/tests/integration/test_vectorize.py +++ b/tests/integration/test_vectorize.py @@ -67,48 +67,31 @@ def test_empty_functions(): func() -def test_vectorize_over_slices(): - # reuse the same vectorize object on - # different slices - func_num = num.vectorize(my_func) - func_np = np.vectorize(my_func) - - a = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]) - b = np.array([[10, 11, 12], [13, 14, 15], [16, 17, 18]]) - a_num = num.array(a) - b_num = num.array(b) - a[:2] = func_np(a[:2], b[:2]) - a_num[:2] = func_num(a_num[:2], b_num[:2]) - assert np.array_equal(a, a_num) - - a = np.arange(100).reshape((25, 4)) - a_num = num.array(a) - b = a * 10 - b_num = a_num * 10 - a = func_np(a, b) - a_num = func_num(a_num, b_num) - assert np.array_equal(a, a_num) - - # reusing the same function for different inputs - a[:, 2] = func_np(a[:, 2], b[:, 2]) - a_num[:, 2] = func_num(a_num[:, 2], b_num[:, 2]) - assert np.array_equal(a, a_num) - - # reusing the same function for different inputs - a[5:10, 2] = func_np(a[5:10, 2], b[1:6, 2]) - a_num[5:10, 2] = func_num(a_num[5:10, 2], b_num[1:6, 2]) - assert np.array_equal(a, a_num) +func_num = num.vectorize(my_func) +func_np = np.vectorize(my_func) - # reusing the same function for different inputs - a[15:20] = func_np(a[15:20], b[15:20]) - a_num[15:20] = func_num(a_num[15:20], b_num[15:20]) - assert np.array_equal(a, a_num) - # reusing the same function for different inputs +@pytest.mark.parametrize( + "slice", + ( + (Ellipsis), + ( + slice(5, 10), + 2, + ), + (slice(15, 20),), + ), +) # , (Ellipsis,2,))) +def test_vectorize_over_slices(slice): a = np.arange(1000).reshape((25, 10, 4)) a_num = num.array(a) - a[:, 2, :] = func_np(a[:, 2, :], 2) - a_num[:, 2, :] = func_num(a_num[:, 2, :], 2) + b = a * 10 + b_num = num.array(b) + a[slice] = func_np(a[slice], b[slice]) + a_num[slice] = 
func_num(a_num[slice], b_num[slice]) + print("IRINA DEBUG", slice) + print(a) + print(a_num) assert np.array_equal(a, a_num) @@ -164,7 +147,7 @@ def test_different_types(): assert np.array_equal(c, c_num) -def test_cache(): +def test_cache_multiple_outputs(): a = np.arange(100).reshape((25, 4)) a_num = num.array(a) b = a * 10 @@ -181,6 +164,35 @@ def test_cache(): assert np.array_equal(a, a_num) assert np.array_equal(c, c_num) + a_num = a_num.astype(float) + b_num = b_num.astype(float) + msg = r"types of the arguments should stay the same" + with pytest.raises(TypeError, match=msg): + a_num = func_num(a_num, b_num) + + +def test_cache_single_output(): + a = np.arange(100).reshape((2, 50)) + a_num = num.array(a) + b = a * 10 + b_num = a_num * 10 + func_np = np.vectorize(my_func, cache=True) + func_num = num.vectorize(my_func, cache=True) + for i in range(10): + a = a * 2 + b = b * 3 + a_num = a_num * 2 + b_num = b_num * 3 + a = func_np(a, b) + a_num = func_num(a_num, b_num) + assert np.array_equal(a, a_num) + + a_num = a_num.astype(float) + b_num = b_num.astype(float) + msg = r"types of the arguments should stay the same" + with pytest.raises(TypeError, match=msg): + a_num = func_num(a_num, b_num) + # checking caching on different shapes of arrays: func_np2 = np.vectorize(my_func2, cache=True) diff --git a/typings/numba/types/__init__.pyi b/typings/numba/types/__init__.pyi index 697a068cfd..df003acbe4 100644 --- a/typings/numba/types/__init__.pyi +++ b/typings/numba/types/__init__.pyi @@ -1,15 +1,12 @@ - configuration locations on your computer. - -class Type(): ... - +class Type: ... class Number(Type): ... class Integer(Number): - def __init__(self, name: str) ->None: ... + def __init__(self, name: str) -> None: ... -class CPointer (Type): - def __init__ (self, dtype : Type) -> None : ... +class CPointer(Type): + def __init__(self, dtype: Type) -> None: ... -uint32 = Integer('uint32') -uint64 = Integer('uint64') +uint32 = Integer("uint32") +uint64 = Integer("uint64") void = none From fb7853e0f87eba2a5a8517663e08266bed22815e Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Mon, 27 Mar 2023 20:44:23 -0700 Subject: [PATCH 68/78] making strings more readable --- cunumeric/vectorize.py | 58 ++++++++++++----------------- tests/integration/test_vectorize.py | 6 +-- 2 files changed, 26 insertions(+), 38 deletions(-) diff --git a/cunumeric/vectorize.py b/cunumeric/vectorize.py index c4b73f5f9a..1dee0c6894 100644 --- a/cunumeric/vectorize.py +++ b/cunumeric/vectorize.py @@ -232,30 +232,22 @@ def _build_gpu_function(self) -> Any: ) lines.append("def {}({}):".format(funcid, ",".join(args))) - # Initialize the index variable and return immediately # when it exceeds the data size - lines.append(" local_i = cuda.grid(1)") - lines.append(" if local_i >= {}:".format(_SIZE_VAR)) - lines.append(" return") # we compute index for sparse data access when using Legion's # pointer. 
# a[x][y][z]=a[x*strides[0] + y*strides[1] + z*strides[2]] - lines.append(" {}:int = 0".format(_LOOP_VAR)) - lines.append(" for p in range({}-1):".format(_DIM_VAR)) - # FIXME make sure we compute index correct for all data types - lines.append(" x=int(local_i/{}[p])".format(_PITCHES_VAR)) - lines.append( - " local_i = int(local_i%{}[p])".format(_PITCHES_VAR) - ) - lines.append( - " {}+=int(x*{}[p])".format(_LOOP_VAR, _STRIDES_VAR) - ) - lines.append( - " {}+=int(local_i*{}[{}-1])".format( - _LOOP_VAR, _STRIDES_VAR, _DIM_VAR - ) - ) + loop_lines = f""" local_i = cuda.grid(1) + if local_i >= {_SIZE_VAR}: + return + {_LOOP_VAR}:int = 0 + for p in range({_DIM_VAR}-1): + x=int(local_i/{_PITCHES_VAR}[p]) + local_i = int(local_i%{_PITCHES_VAR}[p]) + {_LOOP_VAR}+=int(x*{_STRIDES_VAR}[p]) + {_LOOP_VAR}+=int(local_i*{_STRIDES_VAR}[{_DIM_VAR}-1]) + """ + lines += loop_lines.split("\n") # this function is used to replace all array names with array[i] def _lift_to_array_access(m: Any) -> str: @@ -326,25 +318,21 @@ def _emit_assignment( ) arg_idx += 1 - # Main loop - lines.append(" for local_i in range({}):".format(_SIZE_VAR)) + # Initialize the index variable and return immediately + # when it exceeds the data size # we compute index for sparse data access when using Legion's # pointer. # a[x][y][z]=a[x*strides[0] + y*strides[1] + z*strides[2]] - lines.append(" {}:int = 0".format(_LOOP_VAR)) - lines.append(" j:int = local_i") - lines.append(" for p in range({}-1):".format(_DIM_VAR)) - lines.append(" x=int(j/{}[p])".format(_PITCHES_VAR)) - lines.append(" j = int(j%{}[p])".format(_PITCHES_VAR)) - - lines.append( - " {}+=int(x*{}[p])".format(_LOOP_VAR, _STRIDES_VAR) - ) - lines.append( - " {}+=int(j*{}[{}-1])".format( - _LOOP_VAR, _STRIDES_VAR, _DIM_VAR - ) - ) + loop_lines = f""" for local_i in range({_SIZE_VAR}): + {_LOOP_VAR}:int = 0 + j:int = local_i + for p in range({_DIM_VAR}-1): + x=int(j/{_PITCHES_VAR}[p]) + j = int(j%{_PITCHES_VAR}[p]) + {_LOOP_VAR}+=int(x*{_STRIDES_VAR}[p]) + {_LOOP_VAR}+=int(j*{_STRIDES_VAR}[{_DIM_VAR}-1]) + """ + lines += loop_lines.split("\n") lines_old = self._func_body diff --git a/tests/integration/test_vectorize.py b/tests/integration/test_vectorize.py index daba0c9979..17ad778be7 100644 --- a/tests/integration/test_vectorize.py +++ b/tests/integration/test_vectorize.py @@ -89,9 +89,9 @@ def test_vectorize_over_slices(slice): b_num = num.array(b) a[slice] = func_np(a[slice], b[slice]) a_num[slice] = func_num(a_num[slice], b_num[slice]) - print("IRINA DEBUG", slice) - print(a) - print(a_num) + #print("IRINA DEBUG", slice) + #print(a) + #print(a_num) assert np.array_equal(a, a_num) From f62ed37dd412617715e86c22d399d7c91aa303fd Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 28 Mar 2023 03:44:46 +0000 Subject: [PATCH 69/78] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- tests/integration/test_vectorize.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/integration/test_vectorize.py b/tests/integration/test_vectorize.py index 17ad778be7..772120257f 100644 --- a/tests/integration/test_vectorize.py +++ b/tests/integration/test_vectorize.py @@ -89,9 +89,9 @@ def test_vectorize_over_slices(slice): b_num = num.array(b) a[slice] = func_np(a[slice], b[slice]) a_num[slice] = func_num(a_num[slice], b_num[slice]) - #print("IRINA DEBUG", slice) - #print(a) - #print(a_num) + # print("IRINA DEBUG", slice) + # print(a) + # 
print(a_num) assert np.array_equal(a, a_num) From 27621233e84f612a54f0155e47fe22aad746af54 Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Mon, 27 Mar 2023 20:55:08 -0700 Subject: [PATCH 70/78] replacing strings with f-strings --- cunumeric/vectorize.py | 23 +++++++---------------- 1 file changed, 7 insertions(+), 16 deletions(-) diff --git a/cunumeric/vectorize.py b/cunumeric/vectorize.py index 1dee0c6894..55f985488d 100644 --- a/cunumeric/vectorize.py +++ b/cunumeric/vectorize.py @@ -204,15 +204,15 @@ def _replace_name( add indices to the names of input/output arrays in the function body """ if (name in self._arg_names) or (name in self._return_names): - return "{}[int({})]".format(name, _LOOP_VAR) + return f"{name}[int({_LOOP_VAR})]" else: if is_gpu or ((not is_gpu) and not (name in self._scalar_names)): - return "{}".format(name) + return f"{name}" else: - return "{}[0]".format(name) + return f"{name}[0]" def _build_gpu_function(self) -> Any: - funcid = "vectorized_{}".format(self._pyfunc.__name__) + funcid = f"vectorized_{self._pyfunc.__name__}" # Preamble lines = ["from numba import cuda"] @@ -267,7 +267,7 @@ def _lift_to_array_access(m: Any) -> str: return glbs[funcid] def _build_cpu_function(self) -> Callable[[Any], Any]: - funcid = "vectorized_{}".format(self._pyfunc.__name__) + funcid = f"vectorized_{self._pyfunc.__name__}" # Preamble lines = ["from numba import carray, types"] @@ -277,14 +277,7 @@ def _build_cpu_function(self) -> Callable[[Any], Any]: # Signature lines.append( - "def {}({}, {}, {}, {}, {}):".format( - funcid, - _ARGS_VAR, - _SIZE_VAR, - _DIM_VAR, - _PITCHES_VAR, - _STRIDES_VAR, - ) + f"def {funcid}({_ARGS_VAR},{_SIZE_VAR}, {_DIM_VAR}, {_PITCHES_VAR}, {_STRIDES_VAR}):" ) # Unpack kernel arguments @@ -292,9 +285,7 @@ def _emit_assignment( var: Any, idx: int, sz: Any, ty: np.dtype[Any] ) -> None: lines.append( - " {} = carray({}[{}], {}, types.{})".format( - var, _ARGS_VAR, idx, sz, ty - ) + f" {var} = carray({ _ARGS_VAR}[{idx}], {sz}, types.{ty})" ) # define pyfunc arguments as carrays From 0a0650c43c684dbb87ed7fbf77d5128148f7b97e Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Mon, 27 Mar 2023 22:17:47 -0700 Subject: [PATCH 71/78] arguments should be a copy of arrays to match the numpy logic --- cunumeric/vectorize.py | 13 ++++++++++--- tests/integration/test_vectorize.py | 13 +++++++------ 2 files changed, 17 insertions(+), 9 deletions(-) diff --git a/cunumeric/vectorize.py b/cunumeric/vectorize.py index 55f985488d..c3310063b8 100644 --- a/cunumeric/vectorize.py +++ b/cunumeric/vectorize.py @@ -194,7 +194,12 @@ def _get_return_arguments(self) -> List[str]: if "return" in ln: ln = ln.replace("return", "") ln = ln.replace(" ", "") - return_names = ln.split(",") + return_names += ln.split(",") + # FIXME + # for n in return_names: + # if re.match("^([-+]? 
?(\d+|\(\g<1>\))( ?[-+*\/] ?\g<1>)?)$", n): + # raise NotImplementedError (" User defined function can't have" + # " mathematical operation as a return") return return_names def _replace_name( @@ -277,7 +282,8 @@ def _build_cpu_function(self) -> Callable[[Any], Any]: # Signature lines.append( - f"def {funcid}({_ARGS_VAR},{_SIZE_VAR}, {_DIM_VAR}, {_PITCHES_VAR}, {_STRIDES_VAR}):" + f"def {funcid}({_ARGS_VAR},{_SIZE_VAR}, " + f"{_DIM_VAR}, {_PITCHES_VAR}, {_STRIDES_VAR}):" ) # Unpack kernel arguments @@ -587,7 +593,8 @@ def __call__(self, *args: Any, **kwargs: Any) -> Union[Any, Tuple[Any]]: self._scalar_idxs.append(i) scalar_idx += 1 else: - self._args.append(convert_to_cunumeric_ndarray(arg)) + # we need to make a copy of original array to match numpy + self._args.append(convert_to_cunumeric_ndarray(arg.copy())) # first fill arrays to argnames, then scalars: for i, k in enumerate(inspect.signature(self._pyfunc).parameters): diff --git a/tests/integration/test_vectorize.py b/tests/integration/test_vectorize.py index 772120257f..df03987fd7 100644 --- a/tests/integration/test_vectorize.py +++ b/tests/integration/test_vectorize.py @@ -79,19 +79,20 @@ def test_empty_functions(): slice(5, 10), 2, ), - (slice(15, 20),), + (slice(3, 7),), + ( + Ellipsis, + 2, + ), ), -) # , (Ellipsis,2,))) +) def test_vectorize_over_slices(slice): - a = np.arange(1000).reshape((25, 10, 4)) + a = np.arange(160).reshape((10, 4, 4)) a_num = num.array(a) b = a * 10 b_num = num.array(b) a[slice] = func_np(a[slice], b[slice]) a_num[slice] = func_num(a_num[slice], b_num[slice]) - # print("IRINA DEBUG", slice) - # print(a) - # print(a_num) assert np.array_equal(a, a_num) From 98f15d6f4caba22e7a06495cb93c66ba3c3db516 Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Tue, 28 Mar 2023 10:13:37 -0700 Subject: [PATCH 72/78] adding missing type stubs for numpy --- typings/numba/core/__init__.pyi | 0 typings/numba/core/ccallback/__init__.pyi | 8 ++++++++ typings/numba/core/types/__init__.pyi | 25 +++++++++++++++++++++++ 3 files changed, 33 insertions(+) create mode 100644 typings/numba/core/__init__.pyi create mode 100644 typings/numba/core/ccallback/__init__.pyi create mode 100644 typings/numba/core/types/__init__.pyi diff --git a/typings/numba/core/__init__.pyi b/typings/numba/core/__init__.pyi new file mode 100644 index 0000000000..e69de29bb2 diff --git a/typings/numba/core/ccallback/__init__.pyi b/typings/numba/core/ccallback/__init__.pyi new file mode 100644 index 0000000000..81b5030b9c --- /dev/null +++ b/typings/numba/core/ccallback/__init__.pyi @@ -0,0 +1,8 @@ +from typing import Any + +class CFunc(object): + def __init__( + self, pyfunc: Any, sig: Any, locals: Any, options: Any + ) -> None: ... + @property + def address(self) -> int: ... diff --git a/typings/numba/core/types/__init__.pyi b/typings/numba/core/types/__init__.pyi new file mode 100644 index 0000000000..8bb1e2b103 --- /dev/null +++ b/typings/numba/core/types/__init__.pyi @@ -0,0 +1,25 @@ +class Opaque: ... + +class NoneType(Opaque): + def __init__(self, name: str) -> None: ... + +class Type: + def __init__(self, name: str) -> None: ... + +class Number(Type): ... + +class Integer(Number): + def __init__(self, name: str) -> None: ... + +class RawPointer: + def __init__(self, name: str) -> None: ... + +class CPointer(Type): + def __init__(self, dtype: Type) -> None: ... 
+ +none = NoneType("none") + +uint32 = Integer("uint32") +uint64 = Integer("uint64") +void = none +voidptr = Type("void*") From 8d118caeb2c2c024bae82430504063649dfbf1cd Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Wed, 29 Mar 2023 23:16:02 -0700 Subject: [PATCH 73/78] checking return statememt of UDF for special characters --- cunumeric/vectorize.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/cunumeric/vectorize.py b/cunumeric/vectorize.py index c3310063b8..33387b287c 100644 --- a/cunumeric/vectorize.py +++ b/cunumeric/vectorize.py @@ -195,11 +195,17 @@ def _get_return_arguments(self) -> List[str]: ln = ln.replace("return", "") ln = ln.replace(" ", "") return_names += ln.split(",") - # FIXME - # for n in return_names: - # if re.match("^([-+]? ?(\d+|\(\g<1>\))( ?[-+*\/] ?\g<1>)?)$", n): - # raise NotImplementedError (" User defined function can't have" - # " mathematical operation as a return") + # we check if return statement has any special characters since + # we don't support cases like "return a+b" + for n in return_names: + regex = re.compile("[^A-Za-z0-9]") + res = regex.findall(n) + if len(res) > 0: + raise NotImplementedError( + " CuNumeric doesn't support special " + "characters in the return statement of the " + "user-defined function " + ) return return_names def _replace_name( From 889fb65c8bd08c712d31d9a3b7e5936d9d24ca9f Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Fri, 31 Mar 2023 13:34:18 -0700 Subject: [PATCH 74/78] updating the map between pyarow types and legate types --- cunumeric/array.py | 2 +- cunumeric/utils.py | 36 +++++++++++++++++++----------------- cunumeric/vectorize.py | 2 +- 3 files changed, 21 insertions(+), 19 deletions(-) diff --git a/cunumeric/array.py b/cunumeric/array.py index a44e5afd8d..05a1c876ce 100644 --- a/cunumeric/array.py +++ b/cunumeric/array.py @@ -33,7 +33,7 @@ import legate.core.types as ty import numpy as np -import pyarrow # type: ignore [import] +import pyarrow from legate.core import Array from numpy.core.multiarray import ( # type: ignore [attr-defined] normalize_axis_index, diff --git a/cunumeric/utils.py b/cunumeric/utils.py index 62917020d2..d63ac2e30d 100644 --- a/cunumeric/utils.py +++ b/cunumeric/utils.py @@ -22,6 +22,7 @@ import legate.core.types as ty import numpy as np +import pyarrow as pa from .types import NdShape @@ -43,25 +44,26 @@ np.float64: ty.float64, np.complex64: ty.complex64, np.complex128: ty.complex128, + complex: ty.complex128, } CUNUMERIC_TYPE_MAP = { - "bool": ty.bool_, - "int8": ty.int8, - "int16": ty.int16, - "int32": ty.int32, - "int": ty.int64, # np.int is int - "int64": ty.int64, - "uint8": ty.uint8, - "uint16": ty.uint16, - "uint32": ty.uint32, - "uint64": ty.uint64, # np.uint is np.uint64 - "float16": ty.float16, - "float32": ty.float32, - "float": ty.float64, - "float64": ty.float64, - "complex64": ty.complex64, - "complex128": ty.complex128, + bool: ty.bool_, + int: ty.int64, + float: ty.float64, + complex: ty.complex128, + pa.bool_: ty.bool_, + pa.int8: ty.int8, + pa.int16: ty.int16, + pa.int32: ty.int32, + pa.int64: ty.int64, # np.int is int + pa.uint8: ty.uint8, + pa.uint16: ty.uint16, + pa.uint32: ty.uint32, + pa.uint64: ty.uint64, # np.uint is np.uint64 + pa.float16: ty.float16, + pa.float32: ty.float32, + pa.float64: ty.float64, } @@ -118,7 +120,7 @@ def is_supported_dtype(dtype: Any) -> bool: return dtype.type in SUPPORTED_DTYPES -def convert_to_cunumeric_dtype(dtype: str) -> Any: +def convert_to_cunumeric_dtype(dtype: Any) -> Any: if dtype 
in CUNUMERIC_TYPE_MAP: return CUNUMERIC_TYPE_MAP[dtype] raise TypeError("dtype is not supported") diff --git a/cunumeric/vectorize.py b/cunumeric/vectorize.py index 33387b287c..e412dccf7f 100644 --- a/cunumeric/vectorize.py +++ b/cunumeric/vectorize.py @@ -426,7 +426,7 @@ def _execute(self, is_gpu: bool, num_gpus: int = 0) -> None: # add all scalar arguments first for a in self._scalar_args: - dtype = convert_to_cunumeric_dtype(type(a).__name__) + dtype = convert_to_cunumeric_dtype(type(a)) task.add_scalar_arg(a, dtype) num_args = len(self._args) From 7df88a44dbaefd52b75e9e049d6e5cee4f3de7c6 Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Fri, 31 Mar 2023 13:34:51 -0700 Subject: [PATCH 75/78] adding type stubs for pyarrow --- typings/pyarrow/__init__.pyi | 136 +++++++++++++++++++++++++++++++++++ typings/pyarrow/lib.pyi | 38 ++++++++++ 2 files changed, 174 insertions(+) create mode 100644 typings/pyarrow/__init__.pyi create mode 100644 typings/pyarrow/lib.pyi diff --git a/typings/pyarrow/__init__.pyi b/typings/pyarrow/__init__.pyi new file mode 100644 index 0000000000..cc2ac93aa9 --- /dev/null +++ b/typings/pyarrow/__init__.pyi @@ -0,0 +1,136 @@ +# Copyright 2021-2022 NVIDIA Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from typing import Any, Union + +from .lib import ( + DataType, + binary, + bool_, + float16, + float32, + float64, + int8, + int16, + int32, + int64, + string, + uint8, + uint16, + uint32, + uint64, +) + +class Field: + name: str + type: DataType + def __init__(self, *args: Any, **kwargs: Any) -> None: ... + def with_name(self, name: str) -> Field: ... + +def field( + name: Union[str, bytes], + type: DataType, + nullable: bool = True, + metadata: Any = None, +) -> Field: ... + +class Schema: + types: Any + def field(self, i: Union[str, int]) -> Field: ... + def get_all_field_indices(self, name: str) -> list[int]: ... + def get_field_index(self, name: str) -> int: ... + def __len__(self) -> int: ... + def __getitem__(self, idx: int) -> Field: ... + +def schema(fields: Any, metadata: Any = None) -> Schema: ... + +class ExtensionType: + def __init__(self, dtype: DataType, name: str) -> None: ... + +class DictionaryType: ... +class ListType: ... +class MapType: ... +class StructType: ... +class UnionType: ... +class TimestampType: ... +class Time32Type: ... +class Time64Type: ... +class FixedSizeBinaryType: ... +class Decimal128Type: ... +class time32: ... +class time64: ... +class timestamp: ... +class date32: ... +class date64: ... +class large_binary: ... +class large_string: ... +class large_utf8: ... +class decimal128: ... +class large_list: ... +class struct: ... +class dictionary: ... +class null: ... +class utf8: ... +class list_: ... +class map_: ... + +def from_numpy_dtype(dtype: Any) -> DataType: ... 
+ +__all__ = ( + "binary", + "bool_", + "int8", + "int16", + "int32", + "int64", + "uint8", + "uint16", + "uint32", + "uint64", + "float16", + "float32", + "float64", + "Field", + "Schema", + "DataType", + "DictionaryType", + "ListType", + "MapType", + "StructType", + "UnionType", + "TimestampType", + "Time32Type", + "Time64Type", + "FixedSizeBinaryType", + "Decimal128Type", + "time32", + "time64", + "timestamp", + "date32", + "date64", + "string", + "large_binary", + "large_string", + "large_utf8", + "decimal128", + "large_list", + "struct", + "dictionary", + "null", + "utf8", + "list_", + "map_", + "from_numpy_dtype", +) diff --git a/typings/pyarrow/lib.pyi b/typings/pyarrow/lib.pyi new file mode 100644 index 0000000000..398361089b --- /dev/null +++ b/typings/pyarrow/lib.pyi @@ -0,0 +1,38 @@ +# Copyright 2021-2022 NVIDIA Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from typing import Any + +class DataType: + id: int + num_fields: int + num_buffers: int + def equals(self, other: object) -> bool: ... + def to_pandas_dtype(self) -> Any: ... + +def binary(length: int) -> DataType: ... +def bool_() -> DataType: ... +def int8() -> DataType: ... +def int16() -> DataType: ... +def int32() -> DataType: ... +def int64() -> DataType: ... +def uint8() -> DataType: ... +def uint16() -> DataType: ... +def uint32() -> DataType: ... +def uint64() -> DataType: ... +def float16() -> DataType: ... +def float32() -> DataType: ... +def float64() -> DataType: ... +def string() -> DataType: ... From a65eea1d274a061cda1c0332bc66cc71e63e9842 Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Mon, 3 Apr 2023 14:56:47 -0700 Subject: [PATCH 76/78] Update cunumeric/vectorize.py Co-authored-by: Bryan Van de Ven --- cunumeric/vectorize.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cunumeric/vectorize.py b/cunumeric/vectorize.py index e412dccf7f..9724371e7b 100644 --- a/cunumeric/vectorize.py +++ b/cunumeric/vectorize.py @@ -248,7 +248,8 @@ def _build_gpu_function(self) -> Any: # we compute index for sparse data access when using Legion's # pointer. # a[x][y][z]=a[x*strides[0] + y*strides[1] + z*strides[2]] - loop_lines = f""" local_i = cuda.grid(1) + loop_lines = f"""\ + local_i = cuda.grid(1) if local_i >= {_SIZE_VAR}: return {_LOOP_VAR}:int = 0 From 4dc584b0d9cf5c5aa19368df79cc492489220cfd Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Mon, 3 Apr 2023 15:02:45 -0700 Subject: [PATCH 77/78] addressing comments from Bryan --- cunumeric/vectorize.py | 3 ++- typings/numba/__init__.pyi | 5 ----- typings/numba/types/__init__.pyi | 2 +- 3 files changed, 3 insertions(+), 7 deletions(-) diff --git a/cunumeric/vectorize.py b/cunumeric/vectorize.py index 9724371e7b..7af68f6aed 100644 --- a/cunumeric/vectorize.py +++ b/cunumeric/vectorize.py @@ -327,7 +327,8 @@ def _emit_assignment( # we compute index for sparse data access when using Legion's # pointer. 
# a[x][y][z]=a[x*strides[0] + y*strides[1] + z*strides[2]] - loop_lines = f""" for local_i in range({_SIZE_VAR}): + loop_lines = f"""\ + for local_i in range({_SIZE_VAR}): {_LOOP_VAR}:int = 0 j:int = local_i for p in range({_DIM_VAR}-1): diff --git a/typings/numba/__init__.pyi b/typings/numba/__init__.pyi index 571faf5a05..3aa25ebbd1 100644 --- a/typings/numba/__init__.pyi +++ b/typings/numba/__init__.pyi @@ -1,14 +1,9 @@ from typing import Any, Callable -# Re-export types itself import numba.core.types as types import numba.cuda # import compile_ptx - -# import types from numba.core import types from numba.core.ccallback import CFunc - -# Re-export all type names from numba.core.types import CPointer, uint64 def cfunc(sig: Any) -> Any: diff --git a/typings/numba/types/__init__.pyi b/typings/numba/types/__init__.pyi index df003acbe4..14c90eca2a 100644 --- a/typings/numba/types/__init__.pyi +++ b/typings/numba/types/__init__.pyi @@ -9,4 +9,4 @@ class CPointer(Type): uint32 = Integer("uint32") uint64 = Integer("uint64") -void = none +void = None From c123d63850020629ab2e18bac4b0ab3a9d84eef6 Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Thu, 18 May 2023 20:54:20 -0700 Subject: [PATCH 78/78] fixed some typos --- cunumeric/vectorize.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cunumeric/vectorize.py b/cunumeric/vectorize.py index 3ac914bc36..c3f691d164 100644 --- a/cunumeric/vectorize.py +++ b/cunumeric/vectorize.py @@ -83,10 +83,10 @@ def __init__( Set of strings or integers representing the positional or keyword arguments for which the function will not be vectorized. These will be passed directly to `pyfunc` unmodified. - WARNING: cuNumeric doesn't suport this argument at the moment + WARNING: cuNumeric doesn't support this argument at the moment cache : bool, optional If `True`, then cache the first function call that generates C fun- - ction or CUDA kernel. We recomment enabling caching in cuNumeric + ction or CUDA kernel. We recommend enabling caching in cuNumeric for better performance, when possible. WARNING: in the case when cache=True, cuNumeric will parse function signature and create C function or CUDA kernel only once. This @@ -99,7 +99,7 @@ def __init__( arrays with shapes given by the size of corresponding core dimensions. By default, ``pyfunc`` is assumed to take scalars as input and output. - WARNING: cuNumeric doesn't suport this argument at the moment + WARNING: cuNumeric doesn't support this argument at the moment Returns ------- @@ -145,7 +145,7 @@ def __init__( if otypes is not None: if self._num_outputs != len(otypes): raise ValueError( - "number of types in otypes is not consistente" + "number of types in otypes is not consistent" " with the number of return values defined in pyfunc" ) if len(otypes) > 1:
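A short usage sketch of the interface described by the final docstring patch: the user-defined function assigns its result to a plain variable name and returns that name (per PATCH 73, a return statement containing special characters such as "return x + y" raises NotImplementedError), and with cache=True the generated C function or CUDA kernel is built once and reused, so argument dtypes must stay consistent across calls. The function axpy and the concrete array values below are illustrative stand-ins, not the my_func used by the integration tests.

    import numpy as np
    import cunumeric as num

    def axpy(x, y):
        # the result must be returned through a plain variable name
        z = 2 * x + y
        return z

    vec_np = np.vectorize(axpy, cache=True)
    vec_num = num.vectorize(axpy, cache=True)

    a = np.arange(100, dtype=np.int64).reshape((25, 4))
    b = a * 10
    a_num = num.array(a)
    b_num = num.array(b)

    # elementwise application matches NumPy
    assert np.array_equal(vec_np(a, b), vec_num(a_num, b_num))

    # the same vectorized object can be reused on slices/views
    a[5:10, 2] = vec_np(a[5:10, 2], b[5:10, 2])
    a_num[5:10, 2] = vec_num(a_num[5:10, 2], b_num[5:10, 2])
    assert np.array_equal(a, a_num)

    # with cache=True the kernel is reused, so switching argument dtypes
    # between invocations is rejected
    try:
        vec_num(a_num.astype(float), b_num.astype(float))
    except TypeError:
        pass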
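The generated CPU and GPU kernel bodies in the patches above both turn a dense element counter into a strided memory offset, following the a[x][y][z] = a[x*strides[0] + y*strides[1] + z*strides[2]] comment: the counter is decomposed into per-dimension coordinates with the pitches and folded back together with the accessor strides. The plain-NumPy sketch below mirrors that arithmetic to show why it also addresses non-contiguous views correctly; flat_to_strided and the sample shapes are illustrative names chosen for this note, not code from the series.

    import numpy as np

    def flat_to_strided(local_i, pitches, strides, dim):
        # same loop shape as the emitted kernel body: peel off one coordinate
        # per leading dimension using the pitches, then accumulate the offset
        # using the (element) strides
        idx = 0
        j = local_i
        for p in range(dim - 1):
            x = j // pitches[p]
            j = j % pitches[p]
            idx += x * strides[p]
        idx += j * strides[dim - 1]
        return idx

    base = np.arange(320).reshape((10, 8, 4))
    view = base[:, ::2, :]                                      # non-contiguous, shape (10, 4, 4)
    pitches = (view.shape[1] * view.shape[2], view.shape[2])    # (16, 4)
    strides = tuple(s // base.itemsize for s in view.strides)   # element strides, e.g. (32, 8, 1)

    flat_base = base.ravel()
    for local_i in range(view.size):
        x, y, z = np.unravel_index(local_i, view.shape)
        assert flat_base[flat_to_strided(local_i, pitches, strides, 3)] == view[x, y, z]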