From e7f6efd59b1138ba65638016c1f5b5611d2d9c47 Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Wed, 25 Jan 2023 22:00:34 -0800 Subject: [PATCH 01/78] towards using vectorize --- cunumeric/__init__.py | 1 + cunumeric/vectorize.py | 314 ++++++++++++++++++++++++++++ tests/integration/test_vectorize.py | 39 ++++ 3 files changed, 354 insertions(+) create mode 100644 cunumeric/vectorize.py create mode 100644 tests/integration/test_vectorize.py diff --git a/cunumeric/__init__.py b/cunumeric/__init__.py index 7c9e122aaa..1d0cce7c20 100644 --- a/cunumeric/__init__.py +++ b/cunumeric/__init__.py @@ -34,6 +34,7 @@ from cunumeric.logic import * from cunumeric.window import bartlett, blackman, hamming, hanning, kaiser from cunumeric.coverage import clone_module +from cunumeric.vectorize import vectorize clone_module(_np, globals()) diff --git a/cunumeric/vectorize.py b/cunumeric/vectorize.py new file mode 100644 index 0000000000..e68c8d55f5 --- /dev/null +++ b/cunumeric/vectorize.py @@ -0,0 +1,314 @@ +# Copyright 2023 NVIDIA Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import inspect +import re +from typing import Any, Callable, Dict, List, Optional, Union + +# numba doesn't seem to include type hints +import numba.cuda # type: ignore +import numba.types # type: ignore +import numpy as np +import six + +from cunumeric.runtime import runtime + +from .array import convert_to_cunumeric_ndarray + +_EXTERNAL_REFERENCE_PREFIX = "__extern_ref__" +_MASK_VAR = "__mask__" +_SIZE_VAR = "__size__" +_LOOP_VAR = "__i__" +_ARGS_VAR = "__args__" + + +class vectorize: + """ + vectorize(pyfunc, otypes=None, doc=None, excluded=None, cache=False, + signature=None) + Generalized function class. + Define a vectorized function which takes a nested sequence of objects or + numpy arrays as inputs and returns a single numpy array or a tuple of numpy + arrays. The vectorized function evaluates `pyfunc` over successive tuples + of the input arrays like the python map function, except it uses the + broadcasting rules of numpy. + The data type of the output of `vectorized` is determined by calling + the function with the first element of the input. This can be avoided + by specifying the `otypes` argument. + + Parameters + ---------- + pyfunc : callable + A python function or method. + otypes : str or list of dtypes, optional + The output data type. It must be specified as either a string of + typecode characters or a list of data type specifiers. There should + be one data type specifier for each output. + doc : str, optional + The docstring for the function. If None, the docstring will be the + ``pyfunc.__doc__``. + excluded : set, optional + Set of strings or integers representing the positional or keyword + arguments for which the function will not be vectorized. These will be + passed directly to `pyfunc` unmodified. + cache : bool, optional + If `True`, then cache the first function call that determines + the number of outputs if `otypes` is not provided. 
+ signature : string, optional + Generalized universal function signature, e.g., ``(m,n),(n)->(m)`` for + vectorized matrix-vector multiplication. If provided, ``pyfunc`` will + be called with (and expected to return) arrays with shapes given by the + size of corresponding core dimensions. By default, ``pyfunc`` is + assumed to take scalars as input and output. + + Returns + ------- + vectorized : callable + Vectorized function. + + See Also + -------- + numpy.vectorize + + Availability + -------- + Multiple GPUs, Multiple CPUs + """ + + def __init__( + self, + pyfunc: Callable[[Any], Any], + otypes: Optional[Union[str, list[Any]]] = None, + doc: Optional[str] = None, + excluded: Optional[set[Any]] = None, + cache: Optional[bool] = False, + signature: Optional[str] = None, + ) -> None: + self._pyfunc = pyfunc + self._numba_func: Optional[Callable[[Any], Any]] = None + self._device_func: Optional[Callable[[Any], Any]] = None + self._otypes = None + self._result = None + self._args: List[Any] = [] + self._kwargs: List[Any] = [] + + if doc is None: + self.__doc__ = pyfunc.__doc__ + else: + self.__doc__ = doc + + if otypes is not None: + raise NotImplementedError("Otypes variables are not supported yet") + + if excluded is not None: + raise NotImplementedError( + "excluded variables are not supported yet" + ) + if cache: + raise NotImplementedError("cache variable is not supported yet") + + if signature is not None: + raise NotImplementedError( + "signature variable is not supported yet" + ) + + # FIXME check return of the user function + # return annotation (we supprt only void) + + # if inspect.signature(self._pyfunc).return_annotation() + # != inspect._empty: + # raise NotImplementedError( + # "user defined functions can't have a return" + # ) + + def _get_func_body(self, func: Callable[[Any], Any]) -> list[str]: + """Using the magic method __doc__, we KNOW the size of the docstring. 
+ We then, just substract this from the total length of the function + """ + lines_to_skip = 0 + if func.__doc__ is not None and len(func.__doc__.split("\n")) > 0: + lines_to_skip = len(func.__doc__.split("\n")) + + lines = inspect.getsourcelines(func)[0] + + return_lines = [] + for i in range(lines_to_skip + 1, len(lines)): + return_lines.append(lines[i].rstrip()) + return return_lines + + def _build_gpu_function(self) -> Callable[[Any], Any]: + + funcid = "vectorized_{}".format(self._pyfunc.__name__) + + # Preamble + lines = ["from numba import cuda"] + + # Signature + argnames = list(k for k in inspect.signature(self._pyfunc).parameters) + args = argnames + [_SIZE_VAR] + lines.append("def {}({}):".format(funcid, ",".join(args))) + + # Initialize the index variable and return immediately + # when it exceeds the data size + lines.append(" {} = cuda.grid(1)".format(_LOOP_VAR)) + lines.append(" if {} >= {}:".format(_LOOP_VAR, _SIZE_VAR)) + lines.append(" return") + + # Kernel body + def _lift_to_array_access(m: Any) -> str: + name = m.group(0) + if name in argnames: + return "{}[{}]".format(name, _LOOP_VAR) + else: + return "{}".format(name) + + # kernel body + lines_old = self._get_func_body(self._pyfunc) + for line in lines_old: + l_new = re.sub(r"[_a-z]\w*", _lift_to_array_access, line) + lines.append(l_new) + + # Evaluate the string to get the Python function + body = "\n".join(lines) + glbs: Dict[str, Any] = {} + six.exec_(body, glbs) + return glbs[funcid] + + def _build_cpu_function(self) -> Callable[[Any], Any]: + + funcid = "vectorized_{}".format(self._pyfunc.__name__) + + # Preamble + lines = ["from numba import carray, types"] + + # Signature + lines.append("def {}({}, {}):".format(funcid, _ARGS_VAR, _SIZE_VAR)) + + # Unpack kernel arguments + def _emit_assignment( + var: Any, idx: int, sz: Any, ty: np.dtype[Any] + ) -> None: + lines.append( + " {} = carray({}[{}], {}, types.{})".format( + var, _ARGS_VAR, idx, sz, ty + ) + ) + + # get names of arguments + argnames = list(k for k in inspect.signature(self._pyfunc).parameters) + arg_idx = 0 + for a in self._args: + ty = a.dtype + _emit_assignment(argnames[arg_idx], arg_idx, _SIZE_VAR, ty) + arg_idx += 1 + + # Main loop + lines.append(" for {} in range({}):".format(_LOOP_VAR, _SIZE_VAR)) + + lines_old = self._get_func_body(self._pyfunc) + + def _lift_to_array_access(m: Any) -> str: + name = m.group(0) + if name in argnames: + return "{}[{}]".format(name, _LOOP_VAR) + else: + return "{}[0]".format(name) + + # lines_new = [] + for line in lines_old: + l_new = re.sub(r"[_a-z]\w*", _lift_to_array_access, line) + lines.append(" " + l_new) + + # Evaluate the string to get the Python function + body = "\n".join(lines) + glbs: Dict[str, Any] = {} + six.exec_(body, glbs) + return glbs[funcid] + + def _get_numba_types(self, need_pointer: bool = True) -> list[Any]: + types = [] + for arg in self._args: + ty = arg.dtype + ty = str(ty) if ty != bool else "int8" + ty = getattr(numba.types, ty) + ty = numba.types.CPointer(ty) + types.append(ty) + return types + + def _compile_func_gpu(self) -> Callable[[Any], Any]: + types = self._get_numba_types() + arg_types = types + [numba.types.uint64] + sig = (*arg_types,) + + cuda_arch = numba.cuda.get_current_device().compute_capability + return numba.cuda.compile_ptx(self._numba_func, sig, cc=cuda_arch) + + def _compile_func_cpu(self) -> Any: + sig = numba.types.void( + numba.types.CPointer(numba.types.voidptr), numba.types.uint64 + ) + + return numba.cfunc(sig)(self._numba_func) + + # def 
_execute_gpu(self): + # task = self.context.create_auto_task(CuNumericOpCode.LOAD_PTX) + # task..add_future( + # self._runtime.create_future_from_string(self._device_func) + # ) + # kernel_fun = task.execute() + + # task = self.context.create_auto_task(CuNumericOpCode.EVAL_UDF) + # This will be ignored + # task.add_scalar_arg(0, ty.uint64) + # task.add_future_map(kernel_fun) + # task.execute() + + # def _execute_cpu(self): + + # task = self.context.create_auto_task(CuNumericOpCode.EVAL_UDF) + # task.add_scalar_arg(self._device_func.address, ty.uint64) + + def __call__(self, *args: Any, **kwargs: Any) -> None: + """ + Return arrays with the results of `pyfunc` broadcast (vectorized) over + `args` and `kwargs` not in `excluded`. + """ + self._args = list( + convert_to_cunumeric_ndarray(arg) if arg is not None else arg + for (idx, arg) in enumerate(args) + ) + for arg in self._args: + if arg is None: + raise ValueError( + "None is not supported in user function " + "passed to cunumeric.vectorize" + ) + + self._kwargs = list(kwargs) + if len(self._kwargs) > 1: + raise NotImplementedError( + "kwargs are not supported in user functions" + ) + + if runtime.num_gpus > 0: + self._numba_func = self._build_gpu_function() + self._device_func = self._compile_func_gpu() + # self._execute_gpu() + else: + self._numba_func = self._build_cpu_function() + self._device_func = self._compile_func_cpu() + # self._execute_cpu() + + return self._result diff --git a/tests/integration/test_vectorize.py b/tests/integration/test_vectorize.py new file mode 100644 index 0000000000..73a09b9d1b --- /dev/null +++ b/tests/integration/test_vectorize.py @@ -0,0 +1,39 @@ +# Copyright 2021-2022 NVIDIA Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + + +# import numpy as np +import pytest + +import cunumeric as num + + +def my_func(a, b): + a = a * 2 + b + a = a * 3 + + +def test_vectorize(): + func = num.vectorize(my_func) + a = 1 + b = 2 + func(a, b) + print(a) + + +if __name__ == "__main__": + import sys + + sys.exit(pytest.main(sys.argv)) From 02b5ffc266b36f7932b013c7d4928b597f0f04dd Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Wed, 1 Feb 2023 21:11:17 -0800 Subject: [PATCH 02/78] making simple test work on CPUs --- cunumeric/config.py | 2 + cunumeric/vectorize.py | 93 +++++++++++++------ cunumeric_cpp.cmake | 3 + src/cunumeric/cunumeric_c.h | 1 + src/cunumeric/vectorize/eval_udf.cc | 40 ++++++++ src/cunumeric/vectorize/eval_udf.cu | 34 +++++++ src/cunumeric/vectorize/eval_udf.h | 42 +++++++++ src/cunumeric/vectorize/eval_udf_omp.cc | 35 +++++++ src/cunumeric/vectorize/eval_udf_template.inl | 64 +++++++++++++ tests/integration/test_vectorize.py | 7 +- 10 files changed, 290 insertions(+), 31 deletions(-) create mode 100644 src/cunumeric/vectorize/eval_udf.cc create mode 100644 src/cunumeric/vectorize/eval_udf.cu create mode 100644 src/cunumeric/vectorize/eval_udf.h create mode 100644 src/cunumeric/vectorize/eval_udf_omp.cc create mode 100644 src/cunumeric/vectorize/eval_udf_template.inl diff --git a/cunumeric/config.py b/cunumeric/config.py index cad52e77f4..14cb1b6434 100644 --- a/cunumeric/config.py +++ b/cunumeric/config.py @@ -146,6 +146,7 @@ class _CunumericSharedLib: CUNUMERIC_DIAG: int CUNUMERIC_DOT: int CUNUMERIC_EYE: int + CUNUMERIC_EVAL_UDF: int CUNUMERIC_FFT: int CUNUMERIC_FFT_C2C: int CUNUMERIC_FFT_C2R: int @@ -348,6 +349,7 @@ class CuNumericOpCode(IntEnum): DIAG = _cunumeric.CUNUMERIC_DIAG DOT = _cunumeric.CUNUMERIC_DOT EYE = _cunumeric.CUNUMERIC_EYE + EVAL_UDF = _cunumeric.CUNUMERIC_EVAL_UDF FFT = _cunumeric.CUNUMERIC_FFT FILL = _cunumeric.CUNUMERIC_FILL FLIP = _cunumeric.CUNUMERIC_FLIP diff --git a/cunumeric/vectorize.py b/cunumeric/vectorize.py index e68c8d55f5..f95531b217 100644 --- a/cunumeric/vectorize.py +++ b/cunumeric/vectorize.py @@ -17,15 +17,18 @@ import re from typing import Any, Callable, Dict, List, Optional, Union -# numba doesn't seem to include type hints -import numba.cuda # type: ignore -import numba.types # type: ignore +import legate.core.types as ty +import numba.cuda +import numba.types + +# import numba import numpy as np import six from cunumeric.runtime import runtime from .array import convert_to_cunumeric_ndarray +from .config import CuNumericOpCode _EXTERNAL_REFERENCE_PREFIX = "__extern_ref__" _MASK_VAR = "__mask__" @@ -98,11 +101,15 @@ def __init__( ) -> None: self._pyfunc = pyfunc self._numba_func: Optional[Callable[[Any], Any]] = None - self._device_func: Optional[Callable[[Any], Any]] = None + self._cpu_func: numba.types.CPointer = numba.types.CPointer(int) + self._gpu_func: tuple[Any] = (0,) self._otypes = None self._result = None self._args: List[Any] = [] self._kwargs: List[Any] = [] + self._context = runtime.legate_context + + print("IRINA DEBUG initialization") if doc is None: self.__doc__ = pyfunc.__doc__ @@ -247,7 +254,7 @@ def _get_numba_types(self, need_pointer: bool = True) -> list[Any]: types.append(ty) return types - def _compile_func_gpu(self) -> Callable[[Any], Any]: + def _compile_func_gpu(self) -> tuple[Any]: types = self._get_numba_types() arg_types = types + [numba.types.uint64] sig = (*arg_types,) @@ -255,30 +262,41 @@ def _compile_func_gpu(self) -> Callable[[Any], Any]: cuda_arch = numba.cuda.get_current_device().compute_capability return 
numba.cuda.compile_ptx(self._numba_func, sig, cc=cuda_arch) - def _compile_func_cpu(self) -> Any: + def _compile_func_cpu(self) -> numba.types.CPointer: sig = numba.types.void( numba.types.CPointer(numba.types.voidptr), numba.types.uint64 ) return numba.cfunc(sig)(self._numba_func) - # def _execute_gpu(self): - # task = self.context.create_auto_task(CuNumericOpCode.LOAD_PTX) - # task..add_future( - # self._runtime.create_future_from_string(self._device_func) - # ) - # kernel_fun = task.execute() - - # task = self.context.create_auto_task(CuNumericOpCode.EVAL_UDF) - # This will be ignored - # task.add_scalar_arg(0, ty.uint64) - # task.add_future_map(kernel_fun) - # task.execute() - - # def _execute_cpu(self): - - # task = self.context.create_auto_task(CuNumericOpCode.EVAL_UDF) - # task.add_scalar_arg(self._device_func.address, ty.uint64) + def _execute_gpu(self) -> None: + print("IRINA DEBUG executing GPU function") + # task = self._context.create_auto_task(CuNumericOpCode.LOAD_PTX) + # task.add_future( + # self._runtime.create_future_from_string(self._device_func) + # ) + # task.execute() + + # task = self._context.create_auto_task(CuNumericOpCode.EVAL_UDF) + # This will be ignored + # task.add_scalar_arg(0, ty.uint64) + # task.add_future_map(kernel_fun) + # task.execute() + + def _execute_cpu(self) -> None: + task = self._context.create_auto_task(CuNumericOpCode.EVAL_UDF) + task.add_scalar_arg(self._cpu_func.address, ty.uint64) + idx = 0 + a0 = self._args[0]._thunk + a0 = runtime.to_deferred_array(a0) + for a in self._args: + a_tmp = runtime.to_deferred_array(a._thunk) + task.add_input(a_tmp.base) + task.add_output(a_tmp.base) + if idx != 0: + task.add_alignment(a0.base, a_tmp.base) + idx += 1 + task.execute() def __call__(self, *args: Any, **kwargs: Any) -> None: """ @@ -296,6 +314,25 @@ def __call__(self, *args: Any, **kwargs: Any) -> None: "passed to cunumeric.vectorize" ) + # #FIXME: comment out when brodcast PR is merged + # #bring all argumants to the same shape and type: + # if len(self._args)>0: + # ty = self._args[0].dtype + # #FIXME: should we bring them all to the same type? 
+ # for a in self._args: + # if a.dtype != ty: + # return TypeError("all arguments of " + # "user defined function " + # "should have the same type") + + # shapes = tuple(a.shape for a in self._args) + # shape = broadcast_shapes(shapes) + # new_args = tuple() + # for a in self._args: + # a_new = a.broadcast_to(shape) + # new_args +=(a_new,) + # self._args = new_args + self._kwargs = list(kwargs) if len(self._kwargs) > 1: raise NotImplementedError( @@ -304,11 +341,9 @@ def __call__(self, *args: Any, **kwargs: Any) -> None: if runtime.num_gpus > 0: self._numba_func = self._build_gpu_function() - self._device_func = self._compile_func_gpu() - # self._execute_gpu() + self._gpu_func = self._compile_func_gpu() + self._execute_gpu() else: self._numba_func = self._build_cpu_function() - self._device_func = self._compile_func_cpu() - # self._execute_cpu() - - return self._result + self._cpu_func = self._compile_func_cpu() + self._execute_cpu() diff --git a/cunumeric_cpp.cmake b/cunumeric_cpp.cmake index 7034bb600a..bdfa4c163a 100644 --- a/cunumeric_cpp.cmake +++ b/cunumeric_cpp.cmake @@ -160,6 +160,7 @@ list(APPEND cunumeric_SOURCES src/cunumeric/mapper.cc src/cunumeric/cephes/chbevl.cc src/cunumeric/cephes/i0.cc + src/cunumeric/vectorize/eval_udf.cc ) if(Legion_USE_OpenMP) @@ -206,6 +207,7 @@ if(Legion_USE_OpenMP) src/cunumeric/stat/bincount_omp.cc src/cunumeric/convolution/convolve_omp.cc src/cunumeric/transform/flip_omp.cc + src/cunumeric/vectorize/eval_udf_omp.cc ) endif() @@ -257,6 +259,7 @@ if(Legion_USE_CUDA) src/cunumeric/transform/flip.cu src/cunumeric/cudalibs.cu src/cunumeric/cunumeric.cu + src/cunumeric/vectorize/eval_udf.cu ) endif() diff --git a/src/cunumeric/cunumeric_c.h b/src/cunumeric/cunumeric_c.h index 724db00134..42c0bc955e 100644 --- a/src/cunumeric/cunumeric_c.h +++ b/src/cunumeric/cunumeric_c.h @@ -42,6 +42,7 @@ enum CuNumericOpCode { CUNUMERIC_DIAG, CUNUMERIC_DOT, CUNUMERIC_EYE, + CUNUMERIC_EVAL_UDF, CUNUMERIC_FFT, CUNUMERIC_FILL, CUNUMERIC_FLIP, diff --git a/src/cunumeric/vectorize/eval_udf.cc b/src/cunumeric/vectorize/eval_udf.cc new file mode 100644 index 0000000000..ed0c5dea05 --- /dev/null +++ b/src/cunumeric/vectorize/eval_udf.cc @@ -0,0 +1,40 @@ +/* Copyright 20223 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + */ + +#include "cunumeric/vectorize/eval_udf.h" +#include "cunumeric/vectorize/eval_udf_template.inl" + +namespace cunumeric { + +using namespace Legion; +using namespace legate; + +template +struct EvalUdfImplBody { + using VAL = legate_type_of; +}; + +/*static*/ void EvalUdfTask::cpu_variant(TaskContext& context) +{ + eval_udf_template(context); +} + +namespace // unnamed +{ +static void __attribute__((constructor)) register_tasks(void) { EvalUdfTask::register_variants(); } +} // namespace + +} // namespace cunumeric diff --git a/src/cunumeric/vectorize/eval_udf.cu b/src/cunumeric/vectorize/eval_udf.cu new file mode 100644 index 0000000000..74ac3ab31c --- /dev/null +++ b/src/cunumeric/vectorize/eval_udf.cu @@ -0,0 +1,34 @@ +/* Copyright 2023 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +#include "cunumeric/vectorize/eval_udf.h" +#include "cunumeric/vectorize/eval_udf_template.inl" +#include "cunumeric/cuda_help.h" + +namespace cunumeric { + +using namespace Legion; + +template +struct EvalUdfImplBody { + using VAL = legate_type_of; +}; + +/*static*/ void EvalUdfTask::gpu_variant(TaskContext& context) +{ + eval_udf_template(context); +} +} // namespace cunumeric diff --git a/src/cunumeric/vectorize/eval_udf.h b/src/cunumeric/vectorize/eval_udf.h new file mode 100644 index 0000000000..8981dc3b0d --- /dev/null +++ b/src/cunumeric/vectorize/eval_udf.h @@ -0,0 +1,42 @@ +/* Copyright 2023 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +#pragma once + +#include "cunumeric/cunumeric.h" + +namespace cunumeric { + +struct EvalUdfArgs { + uint64_t func_ptr; + std::vector& args; +}; + +class EvalUdfTask : public CuNumericTask { + public: + static const int TASK_ID = CUNUMERIC_EVAL_UDF; + + public: + static void cpu_variant(legate::TaskContext& context); +#ifdef LEGATE_USE_OPENMP + static void omp_variant(legate::TaskContext& context); +#endif +#ifdef LEGATE_USE_CUDA + static void gpu_variant(legate::TaskContext& context); +#endif +}; + +} // namespace cunumeric diff --git a/src/cunumeric/vectorize/eval_udf_omp.cc b/src/cunumeric/vectorize/eval_udf_omp.cc new file mode 100644 index 0000000000..eb946d8c76 --- /dev/null +++ b/src/cunumeric/vectorize/eval_udf_omp.cc @@ -0,0 +1,35 @@ +/* Copyright 2023 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +#include "cunumeric/vectorize/eval_udf.h" +#include "cunumeric/vectorize/eval_udf_template.inl" + +namespace cunumeric { + +using namespace Legion; +using namespace legate; + +template +struct EvalUdfImplBody { + using VAL = legate_type_of; +}; + +/*static*/ void EvalUdfTask::omp_variant(TaskContext& context) +{ + eval_udf_template(context); +} + +} // namespace cunumeric diff --git a/src/cunumeric/vectorize/eval_udf_template.inl b/src/cunumeric/vectorize/eval_udf_template.inl new file mode 100644 index 0000000000..a71903b02b --- /dev/null +++ b/src/cunumeric/vectorize/eval_udf_template.inl @@ -0,0 +1,64 @@ +/* Copyright 2023 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +#pragma once + +// Useful for IDEs +#include "cunumeric/vectorize/eval_udf.h" +#include "cunumeric/pitches.h" + +namespace cunumeric { + +using namespace Legion; +using namespace legate; + +template +struct EvalUdfImplBody; + +template +struct EvalUdfImpl { + template + void operator()(EvalUdfArgs& args) const + { + using UDF = void(void**, size_t); + auto udf = reinterpret_cast(args.func_ptr); + std::vector udf_args; + using VAL = legate_type_of; + auto rect = args.args[0].shape(); + + std::cout << "IRINA DEBUG size = " << args.args.size() << " , rect = " << rect << std::endl; + + if (rect.empty()) return; + + for (size_t i = 0; i < args.args.size(); i++) { + auto out = args.args[i].write_accessor(rect); + udf_args.push_back(reinterpret_cast(out.ptr(rect))); + } + + udf(udf_args.data(), rect.volume()); + } +}; + +template +static void eval_udf_template(TaskContext& context) +{ + std::cout << "IRINA DEBUG inside eval_udf_template" << std::endl; + EvalUdfArgs args{context.scalars()[0].value(), context.outputs()}; + size_t dim = args.args[0].dim() == 0 ? 
1 : args.args[0].dim(); + double_dispatch(dim, args.args[0].code(), EvalUdfImpl{}, args); +} + +} // namespace cunumeric diff --git a/tests/integration/test_vectorize.py b/tests/integration/test_vectorize.py index 73a09b9d1b..cbc1e692aa 100644 --- a/tests/integration/test_vectorize.py +++ b/tests/integration/test_vectorize.py @@ -27,9 +27,12 @@ def my_func(a, b): def test_vectorize(): func = num.vectorize(my_func) - a = 1 - b = 2 + a = num.arange(5) + b = num.zeros((5,)) + # b = 2 func(a, b) + # assert(a==12) + print("IRINA DEBUG:") print(a) From 8b8e6012e3d329a0708a80ffd2440a2c7edb26b8 Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Thu, 2 Feb 2023 20:11:09 -0800 Subject: [PATCH 03/78] making simple GPU function work --- cunumeric/vectorize.py | 33 ++--- src/cunumeric/vectorize/eval_udf.cc | 28 +++- src/cunumeric/vectorize/eval_udf.cu | 120 +++++++++++++++++- src/cunumeric/vectorize/eval_udf.h | 3 +- src/cunumeric/vectorize/eval_udf_omp.cc | 7 +- src/cunumeric/vectorize/eval_udf_template.inl | 12 +- 6 files changed, 165 insertions(+), 38 deletions(-) diff --git a/cunumeric/vectorize.py b/cunumeric/vectorize.py index f95531b217..549dc79d80 100644 --- a/cunumeric/vectorize.py +++ b/cunumeric/vectorize.py @@ -18,8 +18,8 @@ from typing import Any, Callable, Dict, List, Optional, Union import legate.core.types as ty -import numba.cuda -import numba.types +import numba.cuda # type: ignore +import numba.types # type: ignore # import numba import numpy as np @@ -109,7 +109,6 @@ def __init__( self._kwargs: List[Any] = [] self._context = runtime.legate_context - print("IRINA DEBUG initialization") if doc is None: self.__doc__ = pyfunc.__doc__ @@ -155,7 +154,7 @@ def _get_func_body(self, func: Callable[[Any], Any]) -> list[str]: return_lines.append(lines[i].rstrip()) return return_lines - def _build_gpu_function(self) -> Callable[[Any], Any]: + def _build_gpu_function(self) -> Any: funcid = "vectorized_{}".format(self._pyfunc.__name__) @@ -270,18 +269,20 @@ def _compile_func_cpu(self) -> numba.types.CPointer: return numba.cfunc(sig)(self._numba_func) def _execute_gpu(self) -> None: - print("IRINA DEBUG executing GPU function") - # task = self._context.create_auto_task(CuNumericOpCode.LOAD_PTX) - # task.add_future( - # self._runtime.create_future_from_string(self._device_func) - # ) - # task.execute() - - # task = self._context.create_auto_task(CuNumericOpCode.EVAL_UDF) - # This will be ignored - # task.add_scalar_arg(0, ty.uint64) - # task.add_future_map(kernel_fun) - # task.execute() + print("IRINA DEBUG executing GPU function", type(self._gpu_func[0])) + task = self._context.create_auto_task(CuNumericOpCode.EVAL_UDF) + task.add_scalar_arg(self._gpu_func[0], ty.string) + idx = 0 + a0 = self._args[0]._thunk + a0 = runtime.to_deferred_array(a0) + for a in self._args: + a_tmp = runtime.to_deferred_array(a._thunk) + task.add_input(a_tmp.base) + task.add_output(a_tmp.base) + if idx != 0: + task.add_alignment(a0.base, a_tmp.base) + idx += 1 + task.execute() def _execute_cpu(self) -> None: task = self._context.create_auto_task(CuNumericOpCode.EVAL_UDF) diff --git a/src/cunumeric/vectorize/eval_udf.cc b/src/cunumeric/vectorize/eval_udf.cc index ed0c5dea05..1975da6aa2 100644 --- a/src/cunumeric/vectorize/eval_udf.cc +++ b/src/cunumeric/vectorize/eval_udf.cc @@ -15,21 +15,39 @@ */ #include "cunumeric/vectorize/eval_udf.h" -#include "cunumeric/vectorize/eval_udf_template.inl" namespace cunumeric { using namespace Legion; using namespace legate; -template -struct EvalUdfImplBody { - using VAL = 
legate_type_of; +struct EvalUdfCPU { + template + void operator()(EvalUdfArgs& args) const + { + std::cout <<"IRINA DEBUG in CPU task 2"<(args.cpu_func_ptr); + std::vector udf_args; + using VAL = legate_type_of; + auto rect = args.args[0].shape(); + + if (rect.empty()) return; + for (size_t i = 0; i < args.args.size(); i++) { + auto out = args.args[i].write_accessor(rect); + udf_args.push_back(reinterpret_cast(out.ptr(rect))); + } + + udf(udf_args.data(), rect.volume()); + } }; /*static*/ void EvalUdfTask::cpu_variant(TaskContext& context) { - eval_udf_template(context); + std::cout <<"IRINA DEBUG in CPU task"<(), context.outputs()}; + size_t dim = args.args[0].dim() == 0 ? 1 : args.args[0].dim(); + double_dispatch(dim, args.args[0].code(), EvalUdfCPU{}, args); } namespace // unnamed diff --git a/src/cunumeric/vectorize/eval_udf.cu b/src/cunumeric/vectorize/eval_udf.cu index 74ac3ab31c..deb19f0643 100644 --- a/src/cunumeric/vectorize/eval_udf.cu +++ b/src/cunumeric/vectorize/eval_udf.cu @@ -15,20 +15,130 @@ */ #include "cunumeric/vectorize/eval_udf.h" -#include "cunumeric/vectorize/eval_udf_template.inl" #include "cunumeric/cuda_help.h" +#include +#include namespace cunumeric { using namespace Legion; +using namespace legate; -template -struct EvalUdfImplBody { - using VAL = legate_type_of; +struct EvalUdfGPU { + template + void operator()(EvalUdfArgs& args) const + { + using VAL = legate_type_of; + auto rect = args.args[0].shape(); + if (rect.empty()) return; + + const unsigned num_options = 4; + const size_t log_buffer_size = 16384; + std::vector log_info_buffer(log_buffer_size); + std::vector log_error_buffer(log_buffer_size); + CUjit_option jit_options[] = { + CU_JIT_INFO_LOG_BUFFER, + CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES, + CU_JIT_ERROR_LOG_BUFFER, + CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES, + }; + void *option_vals[] = { + static_cast(log_info_buffer.data()), + reinterpret_cast(log_buffer_size), + static_cast(log_error_buffer.data()), + reinterpret_cast(log_buffer_size), + }; + + CUmodule module; + CUresult result = cuModuleLoadDataEx(&module, args.ptx.data(), num_options, jit_options, option_vals); + if (result != CUDA_SUCCESS) { + if (result == CUDA_ERROR_OPERATING_SYSTEM) { + fprintf(stderr, + "ERROR: Device side asserts are not supported by the " + "CUDA driver for MAC OSX, see NVBugs 1628896.\n"); + exit(-1); + } else if (result == CUDA_ERROR_NO_BINARY_FOR_GPU) { + fprintf(stderr, "ERROR: The binary was compiled for the wrong GPU architecture.\n"); + exit(-1); + } else { + fprintf(stderr, "Failed to load CUDA module! 
Error log: %s\n", log_error_buffer.data()); +#if CUDA_VERSION >= 6050 + const char *name, *str; + assert(cuGetErrorName(result, &name) == CUDA_SUCCESS); + assert(cuGetErrorString(result, &str) == CUDA_SUCCESS); + fprintf(stderr, "CU: cuModuleLoadDataEx = %d (%s): %s\n", result, name, str); +#else + fprintf(stderr, "CU: cuModuleLoadDataEx = %d\n", result); +#endif + exit(-1); + } + } + + std::cmatch line_match; + bool match = std::regex_search(args.ptx.data(), line_match, std::regex(".visible .entry [_a-zA-Z0-9$]+")); +#ifdef DEBUG_PANDAS + assert(match); +#endif + const auto &matched_line = line_match.begin()->str(); + auto fun_name = matched_line.substr(matched_line.rfind(" ") + 1, matched_line.size()); + + CUfunction func; + result = cuModuleGetFunction(&func, module, fun_name.c_str()); + assert(result == CUDA_SUCCESS); + + //ececuting user function: + size_t buffer_size = (args.args.size() ) * sizeof(void *); + buffer_size += sizeof(size_t); + + std::vector arg_buffer(buffer_size); + char *raw_arg_buffer = arg_buffer.data(); + + auto p = raw_arg_buffer; + + for (auto &arg : args.args) { + auto out = arg.write_accessor(rect); + *reinterpret_cast(p) = out.ptr(rect); + p += sizeof(void *); + } + auto size = rect.volume(); + memcpy(p, &size, sizeof(size_t)); + + void *config[] = { + CU_LAUNCH_PARAM_BUFFER_POINTER, + static_cast(raw_arg_buffer), + CU_LAUNCH_PARAM_BUFFER_SIZE, + &buffer_size, + CU_LAUNCH_PARAM_END, + }; + + const uint32_t gridDimX = (size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; + const uint32_t gridDimY = 1; + const uint32_t gridDimZ = 1; + + const uint32_t blockDimX = THREADS_PER_BLOCK; + const uint32_t blockDimY = 1; + const uint32_t blockDimZ = 1; + + auto stream = get_cached_stream(); + + CUresult status = cuLaunchKernel( + func, gridDimX, gridDimY, gridDimZ, blockDimX, blockDimY, blockDimZ, 0, stream, NULL, config); + if (status != CUDA_SUCCESS) { + fprintf(stderr, "Failed to launch a CUDA kernel\n"); + exit(-1); + } + + CHECK_CUDA_STREAM(stream); + + } }; /*static*/ void EvalUdfTask::gpu_variant(TaskContext& context) { - eval_udf_template(context); + //std::cout <<"IRINA DEBUG size of the scalars = "<()<()}; + size_t dim = args.args[0].dim() == 0 ? 
1 : args.args[0].dim(); + double_dispatch(dim, args.args[0].code(), EvalUdfGPU{}, args); + } } // namespace cunumeric diff --git a/src/cunumeric/vectorize/eval_udf.h b/src/cunumeric/vectorize/eval_udf.h index 8981dc3b0d..6a53dd9f2f 100644 --- a/src/cunumeric/vectorize/eval_udf.h +++ b/src/cunumeric/vectorize/eval_udf.h @@ -21,8 +21,9 @@ namespace cunumeric { struct EvalUdfArgs { - uint64_t func_ptr; + uint64_t cpu_func_ptr; std::vector& args; + std::string ptx = ""; }; class EvalUdfTask : public CuNumericTask { diff --git a/src/cunumeric/vectorize/eval_udf_omp.cc b/src/cunumeric/vectorize/eval_udf_omp.cc index eb946d8c76..33e3c6e2a8 100644 --- a/src/cunumeric/vectorize/eval_udf_omp.cc +++ b/src/cunumeric/vectorize/eval_udf_omp.cc @@ -15,21 +15,16 @@ */ #include "cunumeric/vectorize/eval_udf.h" -#include "cunumeric/vectorize/eval_udf_template.inl" namespace cunumeric { using namespace Legion; using namespace legate; -template -struct EvalUdfImplBody { - using VAL = legate_type_of; -}; /*static*/ void EvalUdfTask::omp_variant(TaskContext& context) { - eval_udf_template(context); + EvalUdfTask::cpu_variant(context); } } // namespace cunumeric diff --git a/src/cunumeric/vectorize/eval_udf_template.inl b/src/cunumeric/vectorize/eval_udf_template.inl index a71903b02b..60e2582c54 100644 --- a/src/cunumeric/vectorize/eval_udf_template.inl +++ b/src/cunumeric/vectorize/eval_udf_template.inl @@ -39,10 +39,8 @@ struct EvalUdfImpl { using VAL = legate_type_of; auto rect = args.args[0].shape(); - std::cout << "IRINA DEBUG size = " << args.args.size() << " , rect = " << rect << std::endl; - if (rect.empty()) return; - + EvalUdfImplBody(); for (size_t i = 0; i < args.args.size(); i++) { auto out = args.args[i].write_accessor(rect); udf_args.push_back(reinterpret_cast(out.ptr(rect))); @@ -55,8 +53,12 @@ struct EvalUdfImpl { template static void eval_udf_template(TaskContext& context) { - std::cout << "IRINA DEBUG inside eval_udf_template" << std::endl; - EvalUdfArgs args{context.scalars()[0].value(), context.outputs()}; + is_gpus = context.scalars()[0].value(); + if (is_gpus) + std::cout <<"IRINA DEBUG size of the scalars = "<(), context.outputs()}; + else + EvalUdfArgs args{context.scalars()[1].value(),'', context.outputs()}; size_t dim = args.args[0].dim() == 0 ? 
1 : args.args[0].dim(); double_dispatch(dim, args.args[0].code(), EvalUdfImpl{}, args); } From d3300167d62cb990c7351c694b658b274b53d24b Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Fri, 3 Feb 2023 08:50:47 -0800 Subject: [PATCH 04/78] changing isort version in pre-commmit due to some errors in the older verions --- .pre-commit-config.yaml | 2 +- src/cunumeric/vectorize/eval_udf.cc | 4 +- src/cunumeric/vectorize/eval_udf.cu | 181 +++++++++--------- src/cunumeric/vectorize/eval_udf_omp.cc | 1 - src/cunumeric/vectorize/eval_udf_template.inl | 9 +- 5 files changed, 98 insertions(+), 99 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index bc47df8a70..1572d79d3f 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -7,7 +7,7 @@ repos: pass_filenames: false args: ['cunumeric'] - repo: https://github.com/PyCQA/isort - rev: 5.11.4 + rev: 5.12.0 hooks: - id: isort - repo: https://github.com/psf/black diff --git a/src/cunumeric/vectorize/eval_udf.cc b/src/cunumeric/vectorize/eval_udf.cc index 1975da6aa2..fb71330581 100644 --- a/src/cunumeric/vectorize/eval_udf.cc +++ b/src/cunumeric/vectorize/eval_udf.cc @@ -25,7 +25,7 @@ struct EvalUdfCPU { template void operator()(EvalUdfArgs& args) const { - std::cout <<"IRINA DEBUG in CPU task 2"<(args.cpu_func_ptr); std::vector udf_args; @@ -44,7 +44,7 @@ struct EvalUdfCPU { /*static*/ void EvalUdfTask::cpu_variant(TaskContext& context) { - std::cout <<"IRINA DEBUG in CPU task"<(), context.outputs()}; size_t dim = args.args[0].dim() == 0 ? 1 : args.args[0].dim(); double_dispatch(dim, args.args[0].code(), EvalUdfCPU{}, args); diff --git a/src/cunumeric/vectorize/eval_udf.cu b/src/cunumeric/vectorize/eval_udf.cu index deb19f0643..d5b2c1f43a 100644 --- a/src/cunumeric/vectorize/eval_udf.cu +++ b/src/cunumeric/vectorize/eval_udf.cu @@ -32,113 +32,114 @@ struct EvalUdfGPU { auto rect = args.args[0].shape(); if (rect.empty()) return; - const unsigned num_options = 4; - const size_t log_buffer_size = 16384; - std::vector log_info_buffer(log_buffer_size); - std::vector log_error_buffer(log_buffer_size); - CUjit_option jit_options[] = { - CU_JIT_INFO_LOG_BUFFER, - CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES, - CU_JIT_ERROR_LOG_BUFFER, - CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES, - }; - void *option_vals[] = { - static_cast(log_info_buffer.data()), - reinterpret_cast(log_buffer_size), - static_cast(log_error_buffer.data()), - reinterpret_cast(log_buffer_size), - }; - - CUmodule module; - CUresult result = cuModuleLoadDataEx(&module, args.ptx.data(), num_options, jit_options, option_vals); - if (result != CUDA_SUCCESS) { - if (result == CUDA_ERROR_OPERATING_SYSTEM) { - fprintf(stderr, - "ERROR: Device side asserts are not supported by the " - "CUDA driver for MAC OSX, see NVBugs 1628896.\n"); - exit(-1); - } else if (result == CUDA_ERROR_NO_BINARY_FOR_GPU) { - fprintf(stderr, "ERROR: The binary was compiled for the wrong GPU architecture.\n"); - exit(-1); - } else { - fprintf(stderr, "Failed to load CUDA module! 
Error log: %s\n", log_error_buffer.data()); + const unsigned num_options = 4; + const size_t log_buffer_size = 16384; + std::vector log_info_buffer(log_buffer_size); + std::vector log_error_buffer(log_buffer_size); + CUjit_option jit_options[] = { + CU_JIT_INFO_LOG_BUFFER, + CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES, + CU_JIT_ERROR_LOG_BUFFER, + CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES, + }; + void* option_vals[] = { + static_cast(log_info_buffer.data()), + reinterpret_cast(log_buffer_size), + static_cast(log_error_buffer.data()), + reinterpret_cast(log_buffer_size), + }; + + CUmodule module; + CUresult result = + cuModuleLoadDataEx(&module, args.ptx.data(), num_options, jit_options, option_vals); + if (result != CUDA_SUCCESS) { + if (result == CUDA_ERROR_OPERATING_SYSTEM) { + fprintf(stderr, + "ERROR: Device side asserts are not supported by the " + "CUDA driver for MAC OSX, see NVBugs 1628896.\n"); + exit(-1); + } else if (result == CUDA_ERROR_NO_BINARY_FOR_GPU) { + fprintf(stderr, "ERROR: The binary was compiled for the wrong GPU architecture.\n"); + exit(-1); + } else { + fprintf(stderr, "Failed to load CUDA module! Error log: %s\n", log_error_buffer.data()); #if CUDA_VERSION >= 6050 - const char *name, *str; - assert(cuGetErrorName(result, &name) == CUDA_SUCCESS); - assert(cuGetErrorString(result, &str) == CUDA_SUCCESS); - fprintf(stderr, "CU: cuModuleLoadDataEx = %d (%s): %s\n", result, name, str); + const char *name, *str; + assert(cuGetErrorName(result, &name) == CUDA_SUCCESS); + assert(cuGetErrorString(result, &str) == CUDA_SUCCESS); + fprintf(stderr, "CU: cuModuleLoadDataEx = %d (%s): %s\n", result, name, str); #else - fprintf(stderr, "CU: cuModuleLoadDataEx = %d\n", result); + fprintf(stderr, "CU: cuModuleLoadDataEx = %d\n", result); #endif - exit(-1); + exit(-1); + } } - } - std::cmatch line_match; - bool match = std::regex_search(args.ptx.data(), line_match, std::regex(".visible .entry [_a-zA-Z0-9$]+")); + std::cmatch line_match; + bool match = + std::regex_search(args.ptx.data(), line_match, std::regex(".visible .entry [_a-zA-Z0-9$]+")); #ifdef DEBUG_PANDAS - assert(match); + assert(match); #endif - const auto &matched_line = line_match.begin()->str(); - auto fun_name = matched_line.substr(matched_line.rfind(" ") + 1, matched_line.size()); + const auto& matched_line = line_match.begin()->str(); + auto fun_name = matched_line.substr(matched_line.rfind(" ") + 1, matched_line.size()); - CUfunction func; - result = cuModuleGetFunction(&func, module, fun_name.c_str()); - assert(result == CUDA_SUCCESS); + CUfunction func; + result = cuModuleGetFunction(&func, module, fun_name.c_str()); + assert(result == CUDA_SUCCESS); - //ececuting user function: - size_t buffer_size = (args.args.size() ) * sizeof(void *); - buffer_size += sizeof(size_t); + // ececuting user function: + size_t buffer_size = (args.args.size()) * sizeof(void*); + buffer_size += sizeof(size_t); - std::vector arg_buffer(buffer_size); - char *raw_arg_buffer = arg_buffer.data(); + std::vector arg_buffer(buffer_size); + char* raw_arg_buffer = arg_buffer.data(); - auto p = raw_arg_buffer; + auto p = raw_arg_buffer; - for (auto &arg : args.args) { - auto out = arg.write_accessor(rect); - *reinterpret_cast(p) = out.ptr(rect); - p += sizeof(void *); - } - auto size = rect.volume(); - memcpy(p, &size, sizeof(size_t)); - - void *config[] = { - CU_LAUNCH_PARAM_BUFFER_POINTER, - static_cast(raw_arg_buffer), - CU_LAUNCH_PARAM_BUFFER_SIZE, - &buffer_size, - CU_LAUNCH_PARAM_END, - }; - - const uint32_t gridDimX = (size + THREADS_PER_BLOCK - 1) / 
THREADS_PER_BLOCK; - const uint32_t gridDimY = 1; - const uint32_t gridDimZ = 1; - - const uint32_t blockDimX = THREADS_PER_BLOCK; - const uint32_t blockDimY = 1; - const uint32_t blockDimZ = 1; - - auto stream = get_cached_stream(); - - CUresult status = cuLaunchKernel( - func, gridDimX, gridDimY, gridDimZ, blockDimX, blockDimY, blockDimZ, 0, stream, NULL, config); - if (status != CUDA_SUCCESS) { - fprintf(stderr, "Failed to launch a CUDA kernel\n"); - exit(-1); - } - - CHECK_CUDA_STREAM(stream); + for (auto& arg : args.args) { + auto out = arg.write_accessor(rect); + *reinterpret_cast(p) = out.ptr(rect); + p += sizeof(void*); + } + auto size = rect.volume(); + memcpy(p, &size, sizeof(size_t)); + + void* config[] = { + CU_LAUNCH_PARAM_BUFFER_POINTER, + static_cast(raw_arg_buffer), + CU_LAUNCH_PARAM_BUFFER_SIZE, + &buffer_size, + CU_LAUNCH_PARAM_END, + }; + + const uint32_t gridDimX = (size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; + const uint32_t gridDimY = 1; + const uint32_t gridDimZ = 1; + + const uint32_t blockDimX = THREADS_PER_BLOCK; + const uint32_t blockDimY = 1; + const uint32_t blockDimZ = 1; + + auto stream = get_cached_stream(); + + CUresult status = cuLaunchKernel( + func, gridDimX, gridDimY, gridDimZ, blockDimX, blockDimY, blockDimZ, 0, stream, NULL, config); + if (status != CUDA_SUCCESS) { + fprintf(stderr, "Failed to launch a CUDA kernel\n"); + exit(-1); + } + CHECK_CUDA_STREAM(stream); } }; /*static*/ void EvalUdfTask::gpu_variant(TaskContext& context) { - //std::cout <<"IRINA DEBUG size of the scalars = "<()<()}; - size_t dim = args.args[0].dim() == 0 ? 1 : args.args[0].dim(); - double_dispatch(dim, args.args[0].code(), EvalUdfGPU{}, args); - + // std::cout <<"IRINA DEBUG size of the scalars = + // "<()<()}; + size_t dim = args.args[0].dim() == 0 ? 1 : args.args[0].dim(); + double_dispatch(dim, args.args[0].code(), EvalUdfGPU{}, args); } } // namespace cunumeric diff --git a/src/cunumeric/vectorize/eval_udf_omp.cc b/src/cunumeric/vectorize/eval_udf_omp.cc index 33e3c6e2a8..c6e2991733 100644 --- a/src/cunumeric/vectorize/eval_udf_omp.cc +++ b/src/cunumeric/vectorize/eval_udf_omp.cc @@ -21,7 +21,6 @@ namespace cunumeric { using namespace Legion; using namespace legate; - /*static*/ void EvalUdfTask::omp_variant(TaskContext& context) { EvalUdfTask::cpu_variant(context); diff --git a/src/cunumeric/vectorize/eval_udf_template.inl b/src/cunumeric/vectorize/eval_udf_template.inl index 60e2582c54..775651b12c 100644 --- a/src/cunumeric/vectorize/eval_udf_template.inl +++ b/src/cunumeric/vectorize/eval_udf_template.inl @@ -40,7 +40,7 @@ struct EvalUdfImpl { auto rect = args.args[0].shape(); if (rect.empty()) return; - EvalUdfImplBody(); + EvalUdfImplBody(); for (size_t i = 0; i < args.args.size(); i++) { auto out = args.args[i].write_accessor(rect); udf_args.push_back(reinterpret_cast(out.ptr(rect))); @@ -55,10 +55,9 @@ static void eval_udf_template(TaskContext& context) { is_gpus = context.scalars()[0].value(); if (is_gpus) - std::cout <<"IRINA DEBUG size of the scalars = "<(), context.outputs()}; - else - EvalUdfArgs args{context.scalars()[1].value(),'', context.outputs()}; + std::cout << "IRINA DEBUG size of the scalars = " << context.scalars().size() << std::endl; + EvalUdfArgs args{0, context.scalars()[1].value(), context.outputs()}; + else EvalUdfArgs args{context.scalars()[1].value(),'', context.outputs()}; size_t dim = args.args[0].dim() == 0 ? 
1 : args.args[0].dim(); double_dispatch(dim, args.args[0].code(), EvalUdfImpl{}, args); } From 6199a4160b86bca55578ac4418ad08a5b2fe063b Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Fri, 3 Feb 2023 21:27:05 -0800 Subject: [PATCH 05/78] adding type stubs for numba --- cunumeric/vectorize.py | 33 ++++++++++++++++---------------- pyproject.toml | 1 + typings/numba/__init__.pyi | 15 +++++++++++++++ typings/numba/cuda/__init__.pyi | 5 +++++ typings/numba/cuda/compiler.pyi | 12 ++++++++++++ typings/numba/types/CPointer.pyi | 5 +++++ typings/numba/types/__init__.pyi | 15 +++++++++++++++ 7 files changed, 69 insertions(+), 17 deletions(-) create mode 100644 typings/numba/__init__.pyi create mode 100644 typings/numba/cuda/__init__.pyi create mode 100644 typings/numba/cuda/compiler.pyi create mode 100644 typings/numba/types/CPointer.pyi create mode 100644 typings/numba/types/__init__.pyi diff --git a/cunumeric/vectorize.py b/cunumeric/vectorize.py index 549dc79d80..394986f16d 100644 --- a/cunumeric/vectorize.py +++ b/cunumeric/vectorize.py @@ -18,10 +18,7 @@ from typing import Any, Callable, Dict, List, Optional, Union import legate.core.types as ty -import numba.cuda # type: ignore -import numba.types # type: ignore - -# import numba +import numba import numpy as np import six @@ -30,6 +27,10 @@ from .array import convert_to_cunumeric_ndarray from .config import CuNumericOpCode +# import numba.cuda +# import numba.types + + _EXTERNAL_REFERENCE_PREFIX = "__extern_ref__" _MASK_VAR = "__mask__" _SIZE_VAR = "__size__" @@ -100,16 +101,15 @@ def __init__( signature: Optional[str] = None, ) -> None: self._pyfunc = pyfunc - self._numba_func: Optional[Callable[[Any], Any]] = None - self._cpu_func: numba.types.CPointer = numba.types.CPointer(int) - self._gpu_func: tuple[Any] = (0,) + self._numba_func: Callable[[Any], Any] + self._cpu_func: numba.core.ccallback.CFunc + self._gpu_func: tuple[Any] self._otypes = None self._result = None self._args: List[Any] = [] self._kwargs: List[Any] = [] self._context = runtime.legate_context - if doc is None: self.__doc__ = pyfunc.__doc__ else: @@ -248,28 +248,27 @@ def _get_numba_types(self, need_pointer: bool = True) -> list[Any]: for arg in self._args: ty = arg.dtype ty = str(ty) if ty != bool else "int8" - ty = getattr(numba.types, ty) - ty = numba.types.CPointer(ty) + ty = getattr(numba.core.types, ty) + ty = numba.core.types.CPointer(ty) types.append(ty) return types def _compile_func_gpu(self) -> tuple[Any]: types = self._get_numba_types() - arg_types = types + [numba.types.uint64] + arg_types = types + [numba.core.types.uint64] sig = (*arg_types,) cuda_arch = numba.cuda.get_current_device().compute_capability return numba.cuda.compile_ptx(self._numba_func, sig, cc=cuda_arch) - def _compile_func_cpu(self) -> numba.types.CPointer: - sig = numba.types.void( - numba.types.CPointer(numba.types.voidptr), numba.types.uint64 - ) + def _compile_func_cpu(self) -> numba.core.ccallback.CFunc: + sig = numba.core.types.void( + numba.types.CPointer(numba.types.voidptr), numba.core.types.uint64 + ) # type: ignore return numba.cfunc(sig)(self._numba_func) def _execute_gpu(self) -> None: - print("IRINA DEBUG executing GPU function", type(self._gpu_func[0])) task = self._context.create_auto_task(CuNumericOpCode.EVAL_UDF) task.add_scalar_arg(self._gpu_func[0], ty.string) idx = 0 @@ -286,7 +285,7 @@ def _execute_gpu(self) -> None: def _execute_cpu(self) -> None: task = self._context.create_auto_task(CuNumericOpCode.EVAL_UDF) - task.add_scalar_arg(self._cpu_func.address, ty.uint64) 
+ task.add_scalar_arg(self._cpu_func.address, ty.uint64) # type : ignore idx = 0 a0 = self._args[0]._thunk a0 = runtime.to_deferred_array(a0) diff --git a/pyproject.toml b/pyproject.toml index 73ebc13c82..5c8e1f83aa 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -45,6 +45,7 @@ exclude = ''' [tool.mypy] python_version = "3.10" +mypy_path = "typings/" pretty = true show_error_codes = true diff --git a/typings/numba/__init__.pyi b/typings/numba/__init__.pyi new file mode 100644 index 0000000000..571faf5a05 --- /dev/null +++ b/typings/numba/__init__.pyi @@ -0,0 +1,15 @@ +from typing import Any, Callable + +# Re-export types itself +import numba.core.types as types +import numba.cuda # import compile_ptx + +# import types +from numba.core import types +from numba.core.ccallback import CFunc + +# Re-export all type names +from numba.core.types import CPointer, uint64 + +def cfunc(sig: Any) -> Any: + def wrapper(func: Callable[[Any], Any]) -> tuple[Any]: ... diff --git a/typings/numba/cuda/__init__.pyi b/typings/numba/cuda/__init__.pyi new file mode 100644 index 0000000000..d66e40c5f4 --- /dev/null +++ b/typings/numba/cuda/__init__.pyi @@ -0,0 +1,5 @@ +from typing import Any + +from numba.cuda.compiler import compile_ptx as compile_ptx + +def get_current_device() -> Any: ... diff --git a/typings/numba/cuda/compiler.pyi b/typings/numba/cuda/compiler.pyi new file mode 100644 index 0000000000..56e02dd3e2 --- /dev/null +++ b/typings/numba/cuda/compiler.pyi @@ -0,0 +1,12 @@ +from typing import Any, Callable, Optional + +def compile_ptx( + pyfunc: Callable[[Any], Any], + args: Any, + debug: bool = False, + lineinfo: bool = False, + device: bool = False, + fastmath: bool = False, + cc: Optional[Any] = None, + opt: bool = True, +) -> tuple[Any]: ... diff --git a/typings/numba/types/CPointer.pyi b/typings/numba/types/CPointer.pyi new file mode 100644 index 0000000000..249a23f191 --- /dev/null +++ b/typings/numba/types/CPointer.pyi @@ -0,0 +1,5 @@ +# import numpy as np +from numba.core.types.abstract import Type + +class CPointer(Type): + def __init__(self, dtype: Type) -> None: ... diff --git a/typings/numba/types/__init__.pyi b/typings/numba/types/__init__.pyi new file mode 100644 index 0000000000..ffbfbd5a94 --- /dev/null +++ b/typings/numba/types/__init__.pyi @@ -0,0 +1,15 @@ + configuration locations on your computer. + +class Type(): ... + +class Number(): ... + +class Integer(Number): + def __init__(self, name: str) ->None: ... + +class CPointer (Type): + def __init__ (self, dtype : Type) -> None : ... 
+
+uint32 = Integer('uint32')
+uint64 = Integer('uint64')
+void = none

From 23b80c512775055dbf66f975c26d55c59e1b2af2 Mon Sep 17 00:00:00 2001
From: Irina Demeshko
Date: Tue, 7 Feb 2023 11:04:46 -0800
Subject: [PATCH 06/78] clean-up

---
 cunumeric/vectorize.py                        |  5 +++++
 src/cunumeric/vectorize/eval_udf.cc           |  5 ++---
 src/cunumeric/vectorize/eval_udf.cu           | 13 +++++++++----
 src/cunumeric/vectorize/eval_udf_template.inl |  1 -
 tests/integration/test_vectorize.py           | 19 +++++++++++--------
 5 files changed, 27 insertions(+), 16 deletions(-)

diff --git a/cunumeric/vectorize.py b/cunumeric/vectorize.py
index 394986f16d..855c2f62a2 100644
--- a/cunumeric/vectorize.py
+++ b/cunumeric/vectorize.py
@@ -19,6 +19,7 @@
 
 import legate.core.types as ty
 import numba
+import numba.core.ccallback
 import numpy as np
 import six
 
@@ -115,16 +116,20 @@ def __init__(
         else:
             self.__doc__ = doc
 
+        #FIXME
         if otypes is not None:
             raise NotImplementedError("Otypes variables are not supported yet")
 
+        #FIXME
         if excluded is not None:
             raise NotImplementedError(
                 "excluded variables are not supported yet"
             )
+        #FIXME
         if cache:
             raise NotImplementedError("cache variable is not supported yet")
 
+        #FIXME
         if signature is not None:
             raise NotImplementedError(
                 "signature variable is not supported yet"
             )
diff --git a/src/cunumeric/vectorize/eval_udf.cc b/src/cunumeric/vectorize/eval_udf.cc
index fb71330581..205c24e782 100644
--- a/src/cunumeric/vectorize/eval_udf.cc
+++ b/src/cunumeric/vectorize/eval_udf.cc
@@ -25,7 +25,8 @@ struct EvalUdfCPU {
   template 
   void operator()(EvalUdfArgs& args) const
   {
-    std::cout << "IRINA DEBUG in CPU task 2" << std::endl;
+    // In the case of CPU, we pack arguments in a vector and pass them to the
+    // function (through the function pointer generated by numba)
     using UDF = void(void**, size_t);
     auto udf = reinterpret_cast(args.cpu_func_ptr);
     std::vector udf_args;
@@ -37,14 +38,12 @@ struct EvalUdfCPU {
       auto out = args.args[i].write_accessor(rect);
       udf_args.push_back(reinterpret_cast(out.ptr(rect)));
     }
-
     udf(udf_args.data(), rect.volume());
   }
 };
 
 /*static*/ void EvalUdfTask::cpu_variant(TaskContext& context)
 {
-  std::cout << "IRINA DEBUG in CPU task" << std::endl;
   EvalUdfArgs args{context.scalars()[0].value(), context.outputs()};
   size_t dim = args.args[0].dim() == 0 ? 1 : args.args[0].dim();
   double_dispatch(dim, args.args[0].code(), EvalUdfCPU{}, args);
 }
diff --git a/src/cunumeric/vectorize/eval_udf.cu b/src/cunumeric/vectorize/eval_udf.cu
index d5b2c1f43a..61789bcdb5 100644
--- a/src/cunumeric/vectorize/eval_udf.cu
+++ b/src/cunumeric/vectorize/eval_udf.cu
@@ -32,6 +32,8 @@ struct EvalUdfGPU {
     auto rect = args.args[0].shape();
     if (rect.empty()) return;
 
+
+    // 1: we need to create a function from the ptx generated by numba
     const unsigned num_options = 4;
     const size_t log_buffer_size = 16384;
     std::vector log_info_buffer(log_buffer_size);
@@ -86,9 +88,13 @@ struct EvalUdfGPU {
 
     CUfunction func;
     result = cuModuleGetFunction(&func, module, fun_name.c_str());
+#ifdef DEBUG_CUNUMERIC
     assert(result == CUDA_SUCCESS);
+#endif
+
+    // 2: after the function is generated, we can execute it:
 
-    // ececuting user function:
+    // Filling up the buffer with arguments
     size_t buffer_size = (args.args.size()) * sizeof(void*);
     buffer_size += sizeof(size_t);
 
@@ -123,6 +129,7 @@ struct EvalUdfGPU {
 
     auto stream = get_cached_stream();
 
+    // executing the function
     CUresult status = cuLaunchKernel(
       func, gridDimX, gridDimY, gridDimZ, blockDimX, blockDimY, blockDimZ, 0, stream, NULL, config);
     if (status != CUDA_SUCCESS) {
diff --git a/src/cunumeric/vectorize/eval_udf_template.inl b/src/cunumeric/vectorize/eval_udf_template.inl
index 775651b12c..6825b90ae7 100644
--- a/src/cunumeric/vectorize/eval_udf_template.inl
+++ b/src/cunumeric/vectorize/eval_udf_template.inl
@@ -55,7 +55,6 @@ static void eval_udf_template(TaskContext& context)
 {
   is_gpus = context.scalars()[0].value();
   if (is_gpus)
-    std::cout << "IRINA DEBUG size of the scalars = " << context.scalars().size() << std::endl;
     EvalUdfArgs args{0, context.scalars()[1].value(), context.outputs()};
   else EvalUdfArgs args{context.scalars()[1].value(),'', context.outputs()};
   size_t dim = args.args[0].dim() == 0 ? 1 : args.args[0].dim();
diff --git a/tests/integration/test_vectorize.py b/tests/integration/test_vectorize.py
index cbc1e692aa..a6fd3d298d 100644
--- a/tests/integration/test_vectorize.py
+++ b/tests/integration/test_vectorize.py
@@ -1,4 +1,4 @@
-# Copyright 2021-2022 NVIDIA Corporation
+# Copyright 2023 NVIDIA Corporation
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -18,23 +18,26 @@ import pytest import cunumeric as num +import numpy as np def my_func(a, b): a = a * 2 + b - a = a * 3 def test_vectorize(): func = num.vectorize(my_func) a = num.arange(5) - b = num.zeros((5,)) - # b = 2 + b = num.ones((5,)) func(a, b) - # assert(a==12) - print("IRINA DEBUG:") - print(a) - + assert(np.array_equal(a, [1,3,5,7,9])) + +#FIXME uncomment once broadcast routines are merged +# a= num.arange(5) +# b=2 +# func(a,b) +# assert(np.array_equal(a, [2,4,6,8,10])) + if __name__ == "__main__": import sys From 3e07363a0209f7c0de1d1f8c31009d27a1884620 Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Wed, 8 Feb 2023 09:34:20 -0800 Subject: [PATCH 07/78] forcing partition by the first dimention for vectorize --- cunumeric/vectorize.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/cunumeric/vectorize.py b/cunumeric/vectorize.py index 855c2f62a2..ca9cae749c 100644 --- a/cunumeric/vectorize.py +++ b/cunumeric/vectorize.py @@ -286,6 +286,9 @@ def _execute_gpu(self) -> None: if idx != 0: task.add_alignment(a0.base, a_tmp.base) idx += 1 + task.add_broadcast( + a_tmp.base, axes=tuple(range(1, len(a_tmp.base.shape))) + ) task.execute() def _execute_cpu(self) -> None: @@ -301,6 +304,9 @@ def _execute_cpu(self) -> None: if idx != 0: task.add_alignment(a0.base, a_tmp.base) idx += 1 + task.add_broadcast( + a_tmp.base, axes=tuple(range(1, len(a_tmp.base.shape))) + ) task.execute() def __call__(self, *args: Any, **kwargs: Any) -> None: From 67b5675ee1d3078e315a8379461285c6befcefb8 Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Thu, 9 Feb 2023 13:34:45 -0800 Subject: [PATCH 08/78] adding caching to vectorize --- cunumeric/vectorize.py | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/cunumeric/vectorize.py b/cunumeric/vectorize.py index ca9cae749c..9ec52b7813 100644 --- a/cunumeric/vectorize.py +++ b/cunumeric/vectorize.py @@ -28,6 +28,7 @@ from .array import convert_to_cunumeric_ndarray from .config import CuNumericOpCode +from legate.timing import time # import numba.cuda # import numba.types @@ -110,6 +111,8 @@ def __init__( self._args: List[Any] = [] self._kwargs: List[Any] = [] self._context = runtime.legate_context + self._created: bool = False + self._cache: bool = cache if doc is None: self.__doc__ = pyfunc.__doc__ @@ -125,9 +128,6 @@ def __init__( raise NotImplementedError( "excluded variables are not supported yet" ) - #FIXME - if cache: - raise NotImplementedError("cache variable is not supported yet") #FIXME if signature is not None: @@ -351,10 +351,16 @@ def __call__(self, *args: Any, **kwargs: Any) -> None: ) if runtime.num_gpus > 0: - self._numba_func = self._build_gpu_function() - self._gpu_func = self._compile_func_gpu() + if not self._created: + self._numba_func = self._build_gpu_function() + self._gpu_func = self._compile_func_gpu() + if self._cache: + self._created = True self._execute_gpu() else: - self._numba_func = self._build_cpu_function() - self._cpu_func = self._compile_func_cpu() + if not self._created: + self._numba_func = self._build_cpu_function() + self._cpu_func = self._compile_func_cpu() + if self._cache: + self._created = True self._execute_cpu() From 5b634fb74991240c8a6b05af5a74883e8a00616d Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Fri, 10 Feb 2023 18:41:03 -0800 Subject: [PATCH 09/78] adding support for otypes --- cunumeric/vectorize.py | 42 +++++++++++++------ src/cunumeric/vectorize/eval_udf.cc | 28 +++++++++---- src/cunumeric/vectorize/eval_udf.cu | 32 ++++++++------ 
src/cunumeric/vectorize/eval_udf.h | 4 +- src/cunumeric/vectorize/eval_udf_template.inl | 5 ++- 5 files changed, 74 insertions(+), 37 deletions(-) diff --git a/cunumeric/vectorize.py b/cunumeric/vectorize.py index 9ec52b7813..4ecf496656 100644 --- a/cunumeric/vectorize.py +++ b/cunumeric/vectorize.py @@ -19,7 +19,7 @@ import legate.core.types as ty import numba -import numba.core.ccallback +import numba.core.ccallback import numpy as np import six @@ -28,7 +28,9 @@ from .array import convert_to_cunumeric_ndarray from .config import CuNumericOpCode -from legate.timing import time +# from legate.timing import time + + # import numba.cuda # import numba.types @@ -99,37 +101,47 @@ def __init__( otypes: Optional[Union[str, list[Any]]] = None, doc: Optional[str] = None, excluded: Optional[set[Any]] = None, - cache: Optional[bool] = False, + cache: bool = False, signature: Optional[str] = None, ) -> None: self._pyfunc = pyfunc self._numba_func: Callable[[Any], Any] self._cpu_func: numba.core.ccallback.CFunc self._gpu_func: tuple[Any] - self._otypes = None + self._otypes: Optional[tuple[Any]] = None self._result = None self._args: List[Any] = [] self._kwargs: List[Any] = [] self._context = runtime.legate_context self._created: bool = False self._cache: bool = cache + self._num_outputs = 1 # there is at least 1 output if doc is None: self.__doc__ = pyfunc.__doc__ else: self.__doc__ = doc - #FIXME if otypes is not None: - raise NotImplementedError("Otypes variables are not supported yet") - - #FIXME + self._num_outputs = len(otypes) + if len(otypes) == 0: + raise ValueError( + "There should be at least 1 type specified in otypes" + ) + ty = otypes[0] + for t in otypes: + if t != ty: + raise NotImplementedError( + "cuNumeric doesn't support variable types in otypes" + ) + + # FIXME if excluded is not None: raise NotImplementedError( "excluded variables are not supported yet" ) - #FIXME + # FIXME if signature is not None: raise NotImplementedError( "signature variable is not supported yet" @@ -276,13 +288,15 @@ def _compile_func_cpu(self) -> numba.core.ccallback.CFunc: def _execute_gpu(self) -> None: task = self._context.create_auto_task(CuNumericOpCode.EVAL_UDF) task.add_scalar_arg(self._gpu_func[0], ty.string) + task.add_scalar_arg(self._num_outputs, ty.uint32) idx = 0 a0 = self._args[0]._thunk a0 = runtime.to_deferred_array(a0) - for a in self._args: + for count, a in enumerate(self._args): a_tmp = runtime.to_deferred_array(a._thunk) task.add_input(a_tmp.base) - task.add_output(a_tmp.base) + if count < self._num_outputs: + task.add_output(a_tmp.base) if idx != 0: task.add_alignment(a0.base, a_tmp.base) idx += 1 @@ -294,13 +308,15 @@ def _execute_gpu(self) -> None: def _execute_cpu(self) -> None: task = self._context.create_auto_task(CuNumericOpCode.EVAL_UDF) task.add_scalar_arg(self._cpu_func.address, ty.uint64) # type : ignore + task.add_scalar_arg(self._num_outputs, ty.uint32) idx = 0 a0 = self._args[0]._thunk a0 = runtime.to_deferred_array(a0) - for a in self._args: + for count, a in enumerate(self._args): a_tmp = runtime.to_deferred_array(a._thunk) task.add_input(a_tmp.base) - task.add_output(a_tmp.base) + if count < self._num_outputs: + task.add_output(a_tmp.base) if idx != 0: task.add_alignment(a0.base, a_tmp.base) idx += 1 diff --git a/src/cunumeric/vectorize/eval_udf.cc b/src/cunumeric/vectorize/eval_udf.cc index 205c24e782..78f0a045b7 100644 --- a/src/cunumeric/vectorize/eval_udf.cc +++ b/src/cunumeric/vectorize/eval_udf.cc @@ -25,18 +25,23 @@ struct EvalUdfCPU { template void 
operator()(EvalUdfArgs& args) const { - //In the case of CPU, we pack arguments in a vector and pass them to the - //function (through the function pointer geenrated by numba) + // In the case of CPU, we pack arguments in a vector and pass them to the + // function (through the function pointer geenrated by numba) using UDF = void(void**, size_t); auto udf = reinterpret_cast(args.cpu_func_ptr); std::vector udf_args; using VAL = legate_type_of; - auto rect = args.args[0].shape(); + auto rect = args.inputs[0].shape(); if (rect.empty()) return; - for (size_t i = 0; i < args.args.size(); i++) { - auto out = args.args[i].write_accessor(rect); - udf_args.push_back(reinterpret_cast(out.ptr(rect))); + for (size_t i = 0; i < args.inputs.size(); i++) { + if (i < args.num_outputs) { + auto out = args.outputs[i].write_accessor(rect); + udf_args.push_back(reinterpret_cast(out.ptr(rect))); + } else { + auto out = args.inputs[i].read_accessor(rect); + udf_args.push_back(reinterpret_cast(const_cast(out.ptr(rect)))); + } } udf(udf_args.data(), rect.volume()); } @@ -44,9 +49,14 @@ struct EvalUdfCPU { /*static*/ void EvalUdfTask::cpu_variant(TaskContext& context) { - EvalUdfArgs args{context.scalars()[0].value(), context.outputs()}; - size_t dim = args.args[0].dim() == 0 ? 1 : args.args[0].dim(); - double_dispatch(dim, args.args[0].code(), EvalUdfCPU{}, args); + std::string tmp("tmp"); + EvalUdfArgs args{context.scalars()[0].value(), + context.inputs(), + context.outputs(), + tmp, + context.scalars()[1].value()}; + size_t dim = args.inputs[0].dim() == 0 ? 1 : args.inputs[0].dim(); + double_dispatch(dim, args.inputs[0].code(), EvalUdfCPU{}, args); } namespace // unnamed diff --git a/src/cunumeric/vectorize/eval_udf.cu b/src/cunumeric/vectorize/eval_udf.cu index 61789bcdb5..d0886387a7 100644 --- a/src/cunumeric/vectorize/eval_udf.cu +++ b/src/cunumeric/vectorize/eval_udf.cu @@ -29,10 +29,9 @@ struct EvalUdfGPU { void operator()(EvalUdfArgs& args) const { using VAL = legate_type_of; - auto rect = args.args[0].shape(); + auto rect = args.inputs[0].shape(); if (rect.empty()) return; - // 1: we need to vreate a function from the ptx generated y numba const unsigned num_options = 4; const size_t log_buffer_size = 16384; @@ -92,10 +91,10 @@ struct EvalUdfGPU { assert(result == CUDA_SUCCESS); #endif - //2: after fucntion is generated, we can execute it: + // 2: after fucntion is generated, we can execute it: - //Filling up the bugger with arguments - size_t buffer_size = (args.args.size()) * sizeof(void*); + // Filling up the bugger with arguments + size_t buffer_size = (args.inputs.size()) * sizeof(void*); buffer_size += sizeof(size_t); std::vector arg_buffer(buffer_size); @@ -103,9 +102,14 @@ struct EvalUdfGPU { auto p = raw_arg_buffer; - for (auto& arg : args.args) { - auto out = arg.write_accessor(rect); - *reinterpret_cast(p) = out.ptr(rect); + for (size_t i = 0; i < args.inputs.size(); i++) { + if (i < args.num_outputs) { + auto out = args.outputs[i].write_accessor(rect); + *reinterpret_cast(p) = out.ptr(rect); + } else { + auto in = args.inputs[i].read_accessor(rect); + *reinterpret_cast(p) = in.ptr(rect); + } p += sizeof(void*); } auto size = rect.volume(); @@ -129,7 +133,7 @@ struct EvalUdfGPU { auto stream = get_cached_stream(); - //executing the function + // executing the function CUresult status = cuLaunchKernel( func, gridDimX, gridDimY, gridDimZ, blockDimX, blockDimY, blockDimZ, 0, stream, NULL, config); if (status != CUDA_SUCCESS) { @@ -143,8 +147,12 @@ struct EvalUdfGPU { /*static*/ void 
EvalUdfTask::gpu_variant(TaskContext& context) { - EvalUdfArgs args{0, context.outputs(), context.scalars()[0].value()}; - size_t dim = args.args[0].dim() == 0 ? 1 : args.args[0].dim(); - double_dispatch(dim, args.args[0].code(), EvalUdfGPU{}, args); + EvalUdfArgs args{0, + context.inputs(), + context.outputs(), + context.scalars()[0].value(), + context.scalars()[1].value()}; + size_t dim = args.inputs[0].dim() == 0 ? 1 : args.inputs[0].dim(); + double_dispatch(dim, args.inputs[0].code(), EvalUdfGPU{}, args); } } // namespace cunumeric diff --git a/src/cunumeric/vectorize/eval_udf.h b/src/cunumeric/vectorize/eval_udf.h index 6a53dd9f2f..0e5b479686 100644 --- a/src/cunumeric/vectorize/eval_udf.h +++ b/src/cunumeric/vectorize/eval_udf.h @@ -22,8 +22,10 @@ namespace cunumeric { struct EvalUdfArgs { uint64_t cpu_func_ptr; - std::vector& args; + std::vector& inputs; + std::vector& outputs; std::string ptx = ""; + uint32_t num_outputs; }; class EvalUdfTask : public CuNumericTask { diff --git a/src/cunumeric/vectorize/eval_udf_template.inl b/src/cunumeric/vectorize/eval_udf_template.inl index 6825b90ae7..969999490d 100644 --- a/src/cunumeric/vectorize/eval_udf_template.inl +++ b/src/cunumeric/vectorize/eval_udf_template.inl @@ -55,8 +55,9 @@ static void eval_udf_template(TaskContext& context) { is_gpus = context.scalars()[0].value(); if (is_gpus) - EvalUdfArgs args{0, context.scalars()[1].value(), context.outputs()}; - else EvalUdfArgs args{context.scalars()[1].value(),'', context.outputs()}; + EvalUdfArgs args{0, context.scalars()[1].value(), context.outputs()}; + else + EvalUdfArgs args{context.scalars()[1].value(),'', context.outputs()}; size_t dim = args.args[0].dim() == 0 ? 1 : args.args[0].dim(); double_dispatch(dim, args.args[0].code(), EvalUdfImpl{}, args); } From 844a29f04d37f012a3f756c5c303b9c108ee0e7a Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Fri, 10 Feb 2023 19:22:44 -0800 Subject: [PATCH 10/78] requiring exact instance creation for EVAL_UDF task --- cunumeric/vectorize.py | 12 ++++++------ src/cunumeric/mapper.cc | 16 ++++++++++++++++ 2 files changed, 22 insertions(+), 6 deletions(-) diff --git a/cunumeric/vectorize.py b/cunumeric/vectorize.py index 4ecf496656..cca4191b8e 100644 --- a/cunumeric/vectorize.py +++ b/cunumeric/vectorize.py @@ -300,9 +300,9 @@ def _execute_gpu(self) -> None: if idx != 0: task.add_alignment(a0.base, a_tmp.base) idx += 1 - task.add_broadcast( - a_tmp.base, axes=tuple(range(1, len(a_tmp.base.shape))) - ) + # task.add_broadcast( + # a_tmp.base, axes=tuple(range(1, len(a_tmp.base.shape))) + # ) task.execute() def _execute_cpu(self) -> None: @@ -320,9 +320,9 @@ def _execute_cpu(self) -> None: if idx != 0: task.add_alignment(a0.base, a_tmp.base) idx += 1 - task.add_broadcast( - a_tmp.base, axes=tuple(range(1, len(a_tmp.base.shape))) - ) + # task.add_broadcast( + # a_tmp.base, axes=tuple(range(1, len(a_tmp.base.shape))) + # ) task.execute() def __call__(self, *args: Any, **kwargs: Any) -> None: diff --git a/src/cunumeric/mapper.cc b/src/cunumeric/mapper.cc index 51797acfe7..c879c4f476 100644 --- a/src/cunumeric/mapper.cc +++ b/src/cunumeric/mapper.cc @@ -222,6 +222,22 @@ std::vector CuNumericMapper::store_mappings( } return std::move(mappings); } + case CUNUMERIC_EVAL_UDF: { + std::vector mappings; + auto& inputs = task.inputs(); + auto& outputs = task.outputs(); + for (auto& input : inputs) { + mappings.push_back(StoreMapping::default_mapping(input, options.front())); + // mappings.back().policy.ordering.c_order(); + mappings.back().policy.exact = 
true; + } + for (auto& output : outputs) { + mappings.push_back(StoreMapping::default_mapping(output, options.front())); + // mappings.back().policy.ordering.c_order(); + mappings.back().policy.exact = true; + } + return std::move(mappings); + } default: { return {}; } From 728549825515bd693392426749e9d9b48801b8d4 Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Fri, 10 Feb 2023 20:07:26 -0800 Subject: [PATCH 11/78] fixing logic for generating CPU functions --- cunumeric/vectorize.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/cunumeric/vectorize.py b/cunumeric/vectorize.py index cca4191b8e..2494ea64d9 100644 --- a/cunumeric/vectorize.py +++ b/cunumeric/vectorize.py @@ -246,6 +246,20 @@ def _lift_to_array_access(m: Any) -> str: name = m.group(0) if name in argnames: return "{}[{}]".format(name, _LOOP_VAR) + elif name == "if": + return "if " + elif name == "return": + return "return " + elif name == "or": + return "or " + elif name == "and": + return "and " + elif name == "not": + return "not " + elif name == "min": + return "min" + elif name == "max": + return "max" else: return "{}[0]".format(name) @@ -254,6 +268,9 @@ def _lift_to_array_access(m: Any) -> str: l_new = re.sub(r"[_a-z]\w*", _lift_to_array_access, line) lines.append(" " + l_new) + print("IRINA DEBUG CPU function") + print(lines) + # Evaluate the string to get the Python function body = "\n".join(lines) glbs: Dict[str, Any] = {} From f1192e6ec5288d0e8a110c7e6c43d1c1616fe65c Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Tue, 14 Feb 2023 09:18:20 -0800 Subject: [PATCH 12/78] some clean-up --- cunumeric/vectorize.py | 122 ++++++++++++++++++++--------------------- 1 file changed, 60 insertions(+), 62 deletions(-) diff --git a/cunumeric/vectorize.py b/cunumeric/vectorize.py index 2494ea64d9..9d914e1b28 100644 --- a/cunumeric/vectorize.py +++ b/cunumeric/vectorize.py @@ -171,6 +171,27 @@ def _get_func_body(self, func: Callable[[Any], Any]) -> list[str]: return_lines.append(lines[i].rstrip()) return return_lines + def _replace_name(self, name: str,argnames:list[str], _LOOP_VAR:str) -> str: + if name in argnames: + return "{}[{}]".format(name, _LOOP_VAR) + elif name == "if": + return "if " + elif name == "return": + return "return " + elif name == "or": + return "or " + elif name == "and": + return "and " + elif name == "not": + return "not " + elif name == "min": + return "min" + elif name == "max": + return "max" + else: + return "{}".format(name) + + def _build_gpu_function(self) -> Any: funcid = "vectorized_{}".format(self._pyfunc.__name__) @@ -191,11 +212,7 @@ def _build_gpu_function(self) -> Any: # Kernel body def _lift_to_array_access(m: Any) -> str: - name = m.group(0) - if name in argnames: - return "{}[{}]".format(name, _LOOP_VAR) - else: - return "{}".format(name) + return self._replace_name(m.group(0), argnames, _LOOP_VAR) # kernel body lines_old = self._get_func_body(self._pyfunc) @@ -242,34 +259,17 @@ def _emit_assignment( lines_old = self._get_func_body(self._pyfunc) + # Kernel body def _lift_to_array_access(m: Any) -> str: - name = m.group(0) - if name in argnames: - return "{}[{}]".format(name, _LOOP_VAR) - elif name == "if": - return "if " - elif name == "return": - return "return " - elif name == "or": - return "or " - elif name == "and": - return "and " - elif name == "not": - return "not " - elif name == "min": - return "min" - elif name == "max": - return "max" - else: - return "{}[0]".format(name) + return self._replace_name(m.group(0), argnames, _LOOP_VAR) # lines_new = [] for 
line in lines_old: l_new = re.sub(r"[_a-z]\w*", _lift_to_array_access, line) lines.append(" " + l_new) - print("IRINA DEBUG CPU function") - print(lines) + #print("IRINA DEBUG CPU function") + #print(lines) # Evaluate the string to get the Python function body = "\n".join(lines) @@ -306,40 +306,32 @@ def _execute_gpu(self) -> None: task = self._context.create_auto_task(CuNumericOpCode.EVAL_UDF) task.add_scalar_arg(self._gpu_func[0], ty.string) task.add_scalar_arg(self._num_outputs, ty.uint32) - idx = 0 a0 = self._args[0]._thunk a0 = runtime.to_deferred_array(a0) for count, a in enumerate(self._args): a_tmp = runtime.to_deferred_array(a._thunk) - task.add_input(a_tmp.base) + a_tmp=a_tmp.base + task.add_input(a_tmp) if count < self._num_outputs: - task.add_output(a_tmp.base) - if idx != 0: - task.add_alignment(a0.base, a_tmp.base) - idx += 1 - # task.add_broadcast( - # a_tmp.base, axes=tuple(range(1, len(a_tmp.base.shape))) - # ) + task.add_output(a_tmp) + if count != 0: + task.add_alignment(a0.base, a_tmp) task.execute() def _execute_cpu(self) -> None: task = self._context.create_auto_task(CuNumericOpCode.EVAL_UDF) task.add_scalar_arg(self._cpu_func.address, ty.uint64) # type : ignore task.add_scalar_arg(self._num_outputs, ty.uint32) - idx = 0 a0 = self._args[0]._thunk a0 = runtime.to_deferred_array(a0) for count, a in enumerate(self._args): a_tmp = runtime.to_deferred_array(a._thunk) - task.add_input(a_tmp.base) + a_tmp=a_tmp.base + task.add_input(a_tmp) if count < self._num_outputs: - task.add_output(a_tmp.base) - if idx != 0: - task.add_alignment(a0.base, a_tmp.base) - idx += 1 - # task.add_broadcast( - # a_tmp.base, axes=tuple(range(1, len(a_tmp.base.shape))) - # ) + task.add_output(a_tmp) + if count != 0: + task.add_alignment(a0.base, a_tmp) task.execute() def __call__(self, *args: Any, **kwargs: Any) -> None: @@ -358,24 +350,30 @@ def __call__(self, *args: Any, **kwargs: Any) -> None: "passed to cunumeric.vectorize" ) - # #FIXME: comment out when brodcast PR is merged - # #bring all argumants to the same shape and type: - # if len(self._args)>0: - # ty = self._args[0].dtype - # #FIXME: should we bring them all to the same type? 
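The substitution step used by _build_cpu_function and _build_gpu_function is easiest to see in isolation: every identifier in the user function body is matched by a regex and, when it names an array argument, rewritten into an indexed access. A self-contained illustration of that rewrite, with the loop variable named as in the generated code:

    import re

    _LOOP_VAR = "__i__"
    argnames = ["a", "b"]

    def _lift_to_array_access(m):
        name = m.group(0)
        return f"{name}[{_LOOP_VAR}]" if name in argnames else name

    body_line = "a = a * 2 + b"
    print(re.sub(r"[_a-z]\w*", _lift_to_array_access, body_line))
    # prints: a[__i__] = a[__i__] * 2 + b[__i__]
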
- # for a in self._args: - # if a.dtype != ty: - # return TypeError("all arguments of " - # "user defined function " - # "should have the same type") - - # shapes = tuple(a.shape for a in self._args) - # shape = broadcast_shapes(shapes) - # new_args = tuple() - # for a in self._args: - # a_new = a.broadcast_to(shape) - # new_args +=(a_new,) - # self._args = new_args + #all output arrays should have the same type + if len(self._args)>0: + ty = self._args[0].dtype + shape = self._args[0].shape + for i in range (1, self._num_outputs): + if ty!=self._args[i].dtype: + raise TypeError("cuNumeric doesnt support " + "different types for output data in " + "user function passed to vectorize") + if shape != self._args[i].shape: + raise TypeError("cuNumeric doesnt support " + "different shapes for output data in " + "user function passed to vectorize") + for i in range (self._num_outputs, len(self._args)): + if ty!=self._args[i].dtype: + runtime.warn( + "converting input array to output types in user func ", + category=RuntimeWarning, + ) + self._args[i] = self._args[i].astype(ty) + if shape !=self._args[i].shape: + raise TypeError("cuNumeric doesnt support " + "different shapes for arrays in " + "user function passed to vectorize") self._kwargs = list(kwargs) if len(self._kwargs) > 1: From e8b544f2e2a71e409049ffc6101c3dcd296c7cdb Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Tue, 14 Feb 2023 17:56:26 -0800 Subject: [PATCH 13/78] adding logic for scalar arguments --- cunumeric/utils.py | 25 +++++++ cunumeric/vectorize.py | 102 +++++++++++++++++----------- src/cunumeric/vectorize/eval_udf.cc | 45 ++++++++---- src/cunumeric/vectorize/eval_udf.cu | 52 ++++++++++---- src/cunumeric/vectorize/eval_udf.h | 2 + tests/integration/test_vectorize.py | 9 ++- 6 files changed, 163 insertions(+), 72 deletions(-) diff --git a/cunumeric/utils.py b/cunumeric/utils.py index 25f0f19f15..cef1fbb539 100644 --- a/cunumeric/utils.py +++ b/cunumeric/utils.py @@ -45,6 +45,25 @@ np.complex128: ty.complex128, } +CUNUMERIC_TYPE_MAP = { + "bool": ty.bool_, + "int8": ty.int8, + "int16": ty.int16, + "int32": ty.int32, + "int": ty.int64, # np.int is int + "int64": ty.int64, + "uint8": ty.uint8, + "uint16": ty.uint16, + "uint32": ty.uint32, + "uint64": ty.uint64, # np.uint is np.uint64 + "float16": ty.float16, + "float32": ty.float32, + "float": ty.float64, + "float64": ty.float64, + "complex64": ty.complex64, + "complex128": ty.complex128, +} + def is_advanced_indexing(key: Any) -> bool: if key is Ellipsis or key is None: # np.newdim case @@ -98,6 +117,12 @@ def is_supported_dtype(dtype: Any) -> bool: raise TypeError("expected a NumPy dtype") return dtype.type in SUPPORTED_DTYPES +def convert_to_cunumeric_dtype(dtype: str) ->Any: + if dtype in CUNUMERIC_TYPE_MAP: + return CUNUMERIC_TYPE_MAP[dtype] + else: + raise TypeError("dtype is not supported") + def calculate_volume(shape: NdShape) -> int: if len(shape) == 0: diff --git a/cunumeric/vectorize.py b/cunumeric/vectorize.py index 9d914e1b28..5d74aac1e7 100644 --- a/cunumeric/vectorize.py +++ b/cunumeric/vectorize.py @@ -27,6 +27,7 @@ from .array import convert_to_cunumeric_ndarray from .config import CuNumericOpCode +from .utils import convert_to_cunumeric_dtype # from legate.timing import time @@ -111,6 +112,10 @@ def __init__( self._otypes: Optional[tuple[Any]] = None self._result = None self._args: List[Any] = [] + self._scalar_args: List[Any]=[] + self._scalar_idxs:List[int]=[] + self._scalar_names:List[str]=[] + self._argnames:List[str]=[] self._kwargs: List[Any] = [] 
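The convert_to_cunumeric_dtype helper added in utils.py keys its lookup on the name of the Python or NumPy type, which is how a scalar argument later gets a legate type when it is attached to the task. A rough sketch of that lookup, with an abridged table and plain strings standing in for the legate type objects:

    # Plain strings stand in for the legate.core type objects that the real
    # CUNUMERIC_TYPE_MAP stores; the table here is deliberately abridged.
    TYPE_MAP = {
        "bool": "bool_",
        "int": "int64",
        "int32": "int32",
        "float": "float64",
        "float32": "float32",
    }

    def convert_to_dtype_name(value):
        name = type(value).__name__
        if name not in TYPE_MAP:
            raise TypeError(f"dtype is not supported: {name}")
        return TYPE_MAP[name]

    print(convert_to_dtype_name(2))     # int64  (Python int)
    print(convert_to_dtype_name(2.0))   # float64
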
self._context = runtime.legate_context self._created: bool = False @@ -171,8 +176,8 @@ def _get_func_body(self, func: Callable[[Any], Any]) -> list[str]: return_lines.append(lines[i].rstrip()) return return_lines - def _replace_name(self, name: str,argnames:list[str], _LOOP_VAR:str) -> str: - if name in argnames: + def _replace_name(self, name: str, _LOOP_VAR:str, is_gpu:bool=False) -> str: + if name in self._argnames and not(name in self._scalar_names) : return "{}[{}]".format(name, _LOOP_VAR) elif name == "if": return "if " @@ -188,8 +193,10 @@ def _replace_name(self, name: str,argnames:list[str], _LOOP_VAR:str) -> str: return "min" elif name == "max": return "max" - else: + elif is_gpu: return "{}".format(name) + else: + return "{}[0]".format(name) def _build_gpu_function(self) -> Any: @@ -200,8 +207,8 @@ def _build_gpu_function(self) -> Any: lines = ["from numba import cuda"] # Signature - argnames = list(k for k in inspect.signature(self._pyfunc).parameters) - args = argnames + [_SIZE_VAR] + args = self._argnames + [_SIZE_VAR] + lines.append("def {}({}):".format(funcid, ",".join(args))) # Initialize the index variable and return immediately @@ -212,7 +219,7 @@ def _build_gpu_function(self) -> Any: # Kernel body def _lift_to_array_access(m: Any) -> str: - return self._replace_name(m.group(0), argnames, _LOOP_VAR) + return self._replace_name(m.group(0), _LOOP_VAR, True) # kernel body lines_old = self._get_func_body(self._pyfunc) @@ -247,12 +254,16 @@ def _emit_assignment( ) # get names of arguments - argnames = list(k for k in inspect.signature(self._pyfunc).parameters) arg_idx = 0 for a in self._args: ty = a.dtype - _emit_assignment(argnames[arg_idx], arg_idx, _SIZE_VAR, ty) + _emit_assignment(self._argnames[arg_idx], arg_idx, _SIZE_VAR, ty) arg_idx += 1 + for a in self._scalar_args: + scalar_type = np.dtype(type(a).__name__) + _emit_assignment(self._argnames[arg_idx], arg_idx, _SIZE_VAR, scalar_type) + arg_idx += 1 + # Main loop lines.append(" for {} in range({}):".format(_LOOP_VAR, _SIZE_VAR)) @@ -261,16 +272,13 @@ def _emit_assignment( # Kernel body def _lift_to_array_access(m: Any) -> str: - return self._replace_name(m.group(0), argnames, _LOOP_VAR) + return self._replace_name(m.group(0), _LOOP_VAR) # lines_new = [] for line in lines_old: l_new = re.sub(r"[_a-z]\w*", _lift_to_array_access, line) lines.append(" " + l_new) - #print("IRINA DEBUG CPU function") - #print(lines) - # Evaluate the string to get the Python function body = "\n".join(lines) glbs: Dict[str, Any] = {} @@ -285,6 +293,11 @@ def _get_numba_types(self, need_pointer: bool = True) -> list[Any]: ty = getattr(numba.core.types, ty) ty = numba.core.types.CPointer(ty) types.append(ty) + for arg in self._scalar_args: + ty = np.dtype(type(arg).__name__) + ty = str(ty) if ty != bool else "int8" + ty = getattr(numba.core.types, ty) + types.append(ty) return types def _compile_func_gpu(self) -> tuple[Any]: @@ -302,26 +315,18 @@ def _compile_func_cpu(self) -> numba.core.ccallback.CFunc: return numba.cfunc(sig)(self._numba_func) - def _execute_gpu(self) -> None: + def _execute(self, is_gpu:bool) -> None: task = self._context.create_auto_task(CuNumericOpCode.EVAL_UDF) - task.add_scalar_arg(self._gpu_func[0], ty.string) - task.add_scalar_arg(self._num_outputs, ty.uint32) - a0 = self._args[0]._thunk - a0 = runtime.to_deferred_array(a0) - for count, a in enumerate(self._args): - a_tmp = runtime.to_deferred_array(a._thunk) - a_tmp=a_tmp.base - task.add_input(a_tmp) - if count < self._num_outputs: - task.add_output(a_tmp) - if count 
!= 0: - task.add_alignment(a0.base, a_tmp) - task.execute() + if is_gpu: + task.add_scalar_arg(self._gpu_func[0], ty.string) + task.add_scalar_arg(self._num_outputs, ty.uint32) + else: + task.add_scalar_arg(self._cpu_func.address, ty.uint64) # type : ignore + task.add_scalar_arg(self._num_outputs, ty.uint32) + for a in self._scalar_args: + dtype = convert_to_cunumeric_dtype(type(a).__name__) + task.add_scalar_arg(a,dtype) - def _execute_cpu(self) -> None: - task = self._context.create_auto_task(CuNumericOpCode.EVAL_UDF) - task.add_scalar_arg(self._cpu_func.address, ty.uint64) # type : ignore - task.add_scalar_arg(self._num_outputs, ty.uint32) a0 = self._args[0]._thunk a0 = runtime.to_deferred_array(a0) for count, a in enumerate(self._args): @@ -339,16 +344,31 @@ def __call__(self, *args: Any, **kwargs: Any) -> None: Return arrays with the results of `pyfunc` broadcast (vectorized) over `args` and `kwargs` not in `excluded`. """ - self._args = list( - convert_to_cunumeric_ndarray(arg) if arg is not None else arg - for (idx, arg) in enumerate(args) - ) - for arg in self._args: + #self._args = list( + # convert_to_cunumeric_ndarray(arg) if (arg is not None and np.ndim(Arg)>0) + # for (idx, arg) in enumerate(args) + #) + for i,arg in enumerate(args): if arg is None: raise ValueError( "None is not supported in user function " "passed to cunumeric.vectorize" ) + elif np.ndim(arg)==0: + self._scalar_args.append(arg) + self._scalar_idxs.append(i) + else: + self._args.append(convert_to_cunumeric_ndarray(arg)) + + #first fill arrays to argnames, then scalars: + for i,k in enumerate(inspect.signature(self._pyfunc).parameters): + if not(i in self._scalar_idxs): + self._argnames.append(k) + + for i,k in enumerate(inspect.signature(self._pyfunc).parameters): + if i in self._scalar_idxs: + self._scalar_names.append(k) + self._argnames.append(k) #all output arrays should have the same type if len(self._args)>0: @@ -370,7 +390,7 @@ def __call__(self, *args: Any, **kwargs: Any) -> None: category=RuntimeWarning, ) self._args[i] = self._args[i].astype(ty) - if shape !=self._args[i].shape: + if shape !=self._args[i].shape and np.ndim(self._args[i])>0: raise TypeError("cuNumeric doesnt support " "different shapes for arrays in " "user function passed to vectorize") @@ -387,11 +407,17 @@ def __call__(self, *args: Any, **kwargs: Any) -> None: self._gpu_func = self._compile_func_gpu() if self._cache: self._created = True - self._execute_gpu() + self._execute(True) else: if not self._created: self._numba_func = self._build_cpu_function() self._cpu_func = self._compile_func_cpu() if self._cache: self._created = True - self._execute_cpu() + self._execute(False) + + self._args.clear() + self._scalar_args.clear() + self._scalar_idxs.clear() + self._argnames.clear() + self._scalar_names.clear() diff --git a/src/cunumeric/vectorize/eval_udf.cc b/src/cunumeric/vectorize/eval_udf.cc index 78f0a045b7..47f84f0d97 100644 --- a/src/cunumeric/vectorize/eval_udf.cc +++ b/src/cunumeric/vectorize/eval_udf.cc @@ -30,33 +30,50 @@ struct EvalUdfCPU { using UDF = void(void**, size_t); auto udf = reinterpret_cast(args.cpu_func_ptr); std::vector udf_args; - using VAL = legate_type_of; - auto rect = args.inputs[0].shape(); + size_t volume = 1; + if (args.inputs.size()>0){ + using VAL = legate_type_of; + auto rect = args.inputs[0].shape(); - if (rect.empty()) return; - for (size_t i = 0; i < args.inputs.size(); i++) { - if (i < args.num_outputs) { - auto out = args.outputs[i].write_accessor(rect); - 
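On the Python side of this patch, positional arguments are split by np.ndim into array arguments and scalar arguments, and the original positions are recorded so the parameter names can still be matched to the right kind of argument. A minimal sketch of that bookkeeping, written against plain NumPy for illustration:

    import inspect
    import numpy as np

    def split_args(pyfunc, *args):
        arrays, scalars, scalar_idxs = [], [], []
        for i, arg in enumerate(args):
            if np.ndim(arg) == 0:            # Python or NumPy scalar
                scalars.append(arg)
                scalar_idxs.append(i)
            else:
                arrays.append(np.asarray(arg))
        names = list(inspect.signature(pyfunc).parameters)
        # Array parameter names first, then scalar parameter names, matching
        # the order in which buffers and scalars are handed to the task.
        argnames = [n for i, n in enumerate(names) if i not in scalar_idxs]
        argnames += [n for i, n in enumerate(names) if i in scalar_idxs]
        return arrays, scalars, argnames

    def my_func(a, b):
        a = a * 2 + b

    arrays, scalars, argnames = split_args(my_func, np.arange(5), 2)
    print(len(arrays), scalars, argnames)   # 1 [2] ['a', 'b']
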
udf_args.push_back(reinterpret_cast(out.ptr(rect))); - } else { - auto out = args.inputs[i].read_accessor(rect); - udf_args.push_back(reinterpret_cast(const_cast(out.ptr(rect)))); + if (rect.empty()) return; + for (size_t i = 0; i < args.inputs.size(); i++) { + if (i < args.num_outputs) { + auto out = args.outputs[i].write_accessor(rect); + udf_args.push_back(reinterpret_cast(out.ptr(rect))); + } else { + auto out = args.inputs[i].read_accessor(rect); + udf_args.push_back(reinterpret_cast(const_cast(out.ptr(rect)))); + } } - } - udf(udf_args.data(), rect.volume()); + volume = rect.volume(); + }//if + for (auto s: args.scalars) + udf_args.push_back(const_cast(s.ptr())); + udf(udf_args.data(), volume); } }; /*static*/ void EvalUdfTask::cpu_variant(TaskContext& context) { std::string tmp("tmp"); + std::vectorscalars; + for (size_t i=2; i(), context.inputs(), context.outputs(), + scalars, tmp, context.scalars()[1].value()}; - size_t dim = args.inputs[0].dim() == 0 ? 1 : args.inputs[0].dim(); - double_dispatch(dim, args.inputs[0].code(), EvalUdfCPU{}, args); + size_t dim=1; + if (args.inputs.size()>0){ + dim = args.inputs[0].dim() == 0 ? 1 : args.inputs[0].dim(); + double_dispatch(dim, args.inputs[0].code(), EvalUdfCPU{}, args); + } + else{ + //FIXME + double_dispatch(dim, args.inputs[0].code() , EvalUdfCPU{}, args); + } } namespace // unnamed diff --git a/src/cunumeric/vectorize/eval_udf.cu b/src/cunumeric/vectorize/eval_udf.cu index d0886387a7..28d899b0e1 100644 --- a/src/cunumeric/vectorize/eval_udf.cu +++ b/src/cunumeric/vectorize/eval_udf.cu @@ -29,8 +29,8 @@ struct EvalUdfGPU { void operator()(EvalUdfArgs& args) const { using VAL = legate_type_of; - auto rect = args.inputs[0].shape(); - if (rect.empty()) return; + //auto rect = args.inputs[0].shape(); + //if (rect.empty()) return; // 1: we need to vreate a function from the ptx generated y numba const unsigned num_options = 4; @@ -93,8 +93,8 @@ struct EvalUdfGPU { // 2: after fucntion is generated, we can execute it: - // Filling up the bugger with arguments - size_t buffer_size = (args.inputs.size()) * sizeof(void*); + // Filling up the buffer with arguments + size_t buffer_size = (args.inputs.size()+args.scalars.size()) * sizeof(void*); buffer_size += sizeof(size_t); std::vector arg_buffer(buffer_size); @@ -102,17 +102,27 @@ struct EvalUdfGPU { auto p = raw_arg_buffer; - for (size_t i = 0; i < args.inputs.size(); i++) { - if (i < args.num_outputs) { - auto out = args.outputs[i].write_accessor(rect); - *reinterpret_cast(p) = out.ptr(rect); - } else { - auto in = args.inputs[i].read_accessor(rect); - *reinterpret_cast(p) = in.ptr(rect); + size_t size =1; + if (args.inputs.size()>0){ + auto rect = args.inputs[0].shape(); + size = rect.volume(); + for (size_t i = 0; i < args.inputs.size(); i++) { + if (i < args.num_outputs) { + auto out = args.outputs[i].write_accessor(rect); + *reinterpret_cast(p) = out.ptr(rect); + } else { + auto in = args.inputs[i].read_accessor(rect); + *reinterpret_cast(p) = in.ptr(rect); + } + p += sizeof(void*); } - p += sizeof(void*); } - auto size = rect.volume(); + for (auto scalar: args.scalars){ + memcpy(p, scalar.ptr(), scalar.size()); + p += scalar.size(); + // *reinterpret_cast(p) =s; + //p += sizeof(void*); + } memcpy(p, &size, sizeof(size_t)); void* config[] = { @@ -147,12 +157,24 @@ struct EvalUdfGPU { /*static*/ void EvalUdfTask::gpu_variant(TaskContext& context) { + std::vectorscalars; + for (size_t i=2; i(), context.scalars()[1].value()}; - size_t dim = args.inputs[0].dim() == 0 ? 
1 : args.inputs[0].dim(); - double_dispatch(dim, args.inputs[0].code(), EvalUdfGPU{}, args); + size_t dim=1; + if (args.inputs.size()>0){ + dim = args.inputs[0].dim() == 0 ? 1 : args.inputs[0].dim(); + double_dispatch(dim, args.inputs[0].code(), EvalUdfGPU{}, args); + } + else{ + double_dispatch(dim, args.inputs[0].code(), EvalUdfGPU{}, args); + //double_dispatch(dim, 0 , EvalUdfGPU{}, args); + } } } // namespace cunumeric diff --git a/src/cunumeric/vectorize/eval_udf.h b/src/cunumeric/vectorize/eval_udf.h index 0e5b479686..243c439230 100644 --- a/src/cunumeric/vectorize/eval_udf.h +++ b/src/cunumeric/vectorize/eval_udf.h @@ -17,6 +17,7 @@ #pragma once #include "cunumeric/cunumeric.h" +#include "core/data/scalar.h" namespace cunumeric { @@ -24,6 +25,7 @@ struct EvalUdfArgs { uint64_t cpu_func_ptr; std::vector& inputs; std::vector& outputs; + std::vectorscalars; std::string ptx = ""; uint32_t num_outputs; }; diff --git a/tests/integration/test_vectorize.py b/tests/integration/test_vectorize.py index a6fd3d298d..a453198f23 100644 --- a/tests/integration/test_vectorize.py +++ b/tests/integration/test_vectorize.py @@ -32,11 +32,10 @@ def test_vectorize(): func(a, b) assert(np.array_equal(a, [1,3,5,7,9])) -#FIXME uncomment once broadcast routines are merged -# a= num.arange(5) -# b=2 -# func(a,b) -# assert(np.array_equal(a, [2,4,6,8,10])) + a= num.arange(5) + b=2 + func(a,b) + assert(np.array_equal(a, [2,4,6,8,10])) if __name__ == "__main__": From 37a7281b2fbe4cf2409951e5cdf52462de35a230 Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Thu, 16 Feb 2023 12:51:19 -0800 Subject: [PATCH 14/78] refactoring cectorize kernel to compute point --- cunumeric/vectorize.py | 18 ++++++++++--- src/cunumeric/mapper.cc | 2 ++ src/cunumeric/pitches.h | 26 ++++++++++++++++++- src/cunumeric/vectorize/eval_udf.cu | 22 +++++++++++++--- src/cunumeric/vectorize/eval_udf_template.inl | 6 ++++- tests/integration/test_vectorize.py | 6 ++++- 6 files changed, 70 insertions(+), 10 deletions(-) diff --git a/cunumeric/vectorize.py b/cunumeric/vectorize.py index 5d74aac1e7..8fae97608b 100644 --- a/cunumeric/vectorize.py +++ b/cunumeric/vectorize.py @@ -41,6 +41,10 @@ _SIZE_VAR = "__size__" _LOOP_VAR = "__i__" _ARGS_VAR = "__args__" +_DIM_VAR = "__dim__" +_POINT_VAR = "__point__" +_LO_POINT_VAR = "__lo_point__" +_PITCHES_VAR = "__pitches__" class vectorize: @@ -207,7 +211,7 @@ def _build_gpu_function(self) -> Any: lines = ["from numba import cuda"] # Signature - args = self._argnames + [_SIZE_VAR] + args = self._argnames + [_SIZE_VAR]+[_DIM_VAR]+[_PITCHES_VAR]+[_LO_POINT_VAR] lines.append("def {}({}):".format(funcid, ",".join(args))) @@ -216,16 +220,24 @@ def _build_gpu_function(self) -> Any: lines.append(" {} = cuda.grid(1)".format(_LOOP_VAR)) lines.append(" if {} >= {}:".format(_LOOP_VAR, _SIZE_VAR)) lines.append(" return") + lines.append(" {}={}".format(_POINT_VAR, _LO_POINT_VAR)) + lines.append(" for p in range({}-1):".format(_DIM_VAR)) + + lines.append(" {}[p]+={}/int({}[p])".format(_POINT_VAR,_LOOP_VAR, _PITCHES_VAR)) + lines.append(" {}={}%int({})".format(_LOOP_VAR,_LOOP_VAR,_PITCHES_VAR)) + lines.append(" {}[{}-1]+={}".format(_POINT_VAR, _DIM_VAR, _LOOP_VAR)) # Kernel body def _lift_to_array_access(m: Any) -> str: - return self._replace_name(m.group(0), _LOOP_VAR, True) + return self._replace_name(m.group(0), _POINT_VAR, True) # kernel body lines_old = self._get_func_body(self._pyfunc) for line in lines_old: l_new = re.sub(r"[_a-z]\w*", _lift_to_array_access, line) lines.append(l_new) + + print("IRINA DEBUG GPU 
function",lines) # Evaluate the string to get the Python function body = "\n".join(lines) @@ -302,7 +314,7 @@ def _get_numba_types(self, need_pointer: bool = True) -> list[Any]: def _compile_func_gpu(self) -> tuple[Any]: types = self._get_numba_types() - arg_types = types + [numba.core.types.uint64] + arg_types = types + [numba.core.types.uint64] + [numba.core.types.uint64]+[numba.core.types.CPointer(numba.core.types.uint64)]+ [numba.core.types.CPointer(numba.core.types.uint64)] sig = (*arg_types,) cuda_arch = numba.cuda.get_current_device().compute_capability diff --git a/src/cunumeric/mapper.cc b/src/cunumeric/mapper.cc index c879c4f476..5959179ee5 100644 --- a/src/cunumeric/mapper.cc +++ b/src/cunumeric/mapper.cc @@ -222,6 +222,7 @@ std::vector CuNumericMapper::store_mappings( } return std::move(mappings); } +#if 0 case CUNUMERIC_EVAL_UDF: { std::vector mappings; auto& inputs = task.inputs(); @@ -238,6 +239,7 @@ std::vector CuNumericMapper::store_mappings( } return std::move(mappings); } +#endif default: { return {}; } diff --git a/src/cunumeric/pitches.h b/src/cunumeric/pitches.h index af12c09e43..ae88ee9138 100644 --- a/src/cunumeric/pitches.h +++ b/src/cunumeric/pitches.h @@ -53,6 +53,12 @@ class Pitches { point[DIM] += index; return point; } + + __CUDA_HD__ + inline const size_t* data(void) + { + return &pitches[0]; + } private: size_t pitches[DIM]; @@ -90,6 +96,13 @@ class Pitches { return point; } + __CUDA_HD__ + inline const size_t* data(void) + { + return &pitches[0]; + } + + private: size_t pitches[DIM]; }; @@ -103,8 +116,10 @@ class Pitches<0, C_ORDER> { { if (rect.lo[0] > rect.hi[0]) return 0; - else + else{ + pitches[0]=rect.hi[0] - rect.lo[0] + 1; return (rect.hi[0] - rect.lo[0] + 1); + } } __CUDA_HD__ inline Legion::Point<1> unflatten(size_t index, const Legion::Point<1>& lo) const @@ -113,6 +128,15 @@ class Pitches<0, C_ORDER> { point[0] += index; return point; } + __CUDA_HD__ + inline const size_t* data(void) + { + return &pitches[0]; + } + + private: + size_t pitches[1]; + }; } // namespace cunumeric diff --git a/src/cunumeric/vectorize/eval_udf.cu b/src/cunumeric/vectorize/eval_udf.cu index 28d899b0e1..3fc233384d 100644 --- a/src/cunumeric/vectorize/eval_udf.cu +++ b/src/cunumeric/vectorize/eval_udf.cu @@ -16,6 +16,7 @@ #include "cunumeric/vectorize/eval_udf.h" #include "cunumeric/cuda_help.h" +#include "cunumeric/pitches.h" #include #include @@ -29,8 +30,7 @@ struct EvalUdfGPU { void operator()(EvalUdfArgs& args) const { using VAL = legate_type_of; - //auto rect = args.inputs[0].shape(); - //if (rect.empty()) return; + Rect rect; // 1: we need to vreate a function from the ptx generated y numba const unsigned num_options = 4; @@ -95,7 +95,10 @@ struct EvalUdfGPU { // Filling up the buffer with arguments size_t buffer_size = (args.inputs.size()+args.scalars.size()) * sizeof(void*); - buffer_size += sizeof(size_t); + buffer_size +=sizeof(size_t);//size + buffer_size += sizeof(size_t);//dim + buffer_size += sizeof(void*);//pitches + buffer_size += sizeof(void*);//lo_point std::vector arg_buffer(buffer_size); char* raw_arg_buffer = arg_buffer.data(); @@ -104,7 +107,7 @@ struct EvalUdfGPU { size_t size =1; if (args.inputs.size()>0){ - auto rect = args.inputs[0].shape(); + rect = args.inputs[0].shape(); size = rect.volume(); for (size_t i = 0; i < args.inputs.size(); i++) { if (i < args.num_outputs) { @@ -124,6 +127,17 @@ struct EvalUdfGPU { //p += sizeof(void*); } memcpy(p, &size, sizeof(size_t)); + size_t dim=DIM; + p += sizeof(size_t); + memcpy(p, &dim, sizeof(size_t)); 
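The pitches, lo point, and strides packed into the argument buffer above exist so the generated kernel can address non-contiguous sub-arrays: a flat thread index is unflattened into per-dimension coordinates using the pitches of the launch rectangle, and those coordinates are folded into a memory offset using the strides reported by the accessor. A small Python model of that index math, with the lo point taken to be the origin for brevity:

    def flat_to_offset(flat, pitches, strides):
        """Map a flat index to an element offset through per-dim strides."""
        dim = len(strides)
        offset = 0
        for p in range(dim - 1):
            coord = flat // pitches[p]     # coordinate in dimension p
            flat -= coord * pitches[p]     # remainder for the inner dims
            offset += coord * strides[p]
        return offset + flat * strides[dim - 1]

    # A (3, 4) view whose rows sit 10 elements apart in memory, e.g. a column
    # block of a wider array: pitches = [4], strides = [10, 1].  Flat index 6
    # is row 1, column 2, which lives at offset 1 * 10 + 2 = 12.
    print(flat_to_offset(6, [4], [10, 1]))   # 12
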
+ p += sizeof(size_t); + Pitches pitches; + size_t volume = pitches.flatten(rect); + *reinterpret_cast(p) =pitches.data(); + p += sizeof(void*); + *reinterpret_cast(p) =&rect.lo[0]; +// p += sizeof(void*); + void* config[] = { CU_LAUNCH_PARAM_BUFFER_POINTER, diff --git a/src/cunumeric/vectorize/eval_udf_template.inl b/src/cunumeric/vectorize/eval_udf_template.inl index 969999490d..c597e44a14 100644 --- a/src/cunumeric/vectorize/eval_udf_template.inl +++ b/src/cunumeric/vectorize/eval_udf_template.inl @@ -38,12 +38,16 @@ struct EvalUdfImpl { std::vector udf_args; using VAL = legate_type_of; auto rect = args.args[0].shape(); + + size_t strides[DIM]; if (rect.empty()) return; EvalUdfImplBody(); for (size_t i = 0; i < args.args.size(); i++) { auto out = args.args[i].write_accessor(rect); - udf_args.push_back(reinterpret_cast(out.ptr(rect))); + udf_args.push_back(reinterpret_cast(out.ptr(rect, strides))); + for (size_t i=0; i Date: Thu, 16 Feb 2023 23:00:55 -0800 Subject: [PATCH 15/78] making GPU kernel work with sparse arrays --- cunumeric/vectorize.py | 29 ++++++++++++++++------------ src/cunumeric/vectorize/eval_udf.cu | 30 +++++++++++++++++++++++------ 2 files changed, 41 insertions(+), 18 deletions(-) diff --git a/cunumeric/vectorize.py b/cunumeric/vectorize.py index 8fae97608b..81ecc4776a 100644 --- a/cunumeric/vectorize.py +++ b/cunumeric/vectorize.py @@ -42,7 +42,7 @@ _LOOP_VAR = "__i__" _ARGS_VAR = "__args__" _DIM_VAR = "__dim__" -_POINT_VAR = "__point__" +_STRIDES_VAR = "__strides__" _LO_POINT_VAR = "__lo_point__" _PITCHES_VAR = "__pitches__" @@ -211,25 +211,30 @@ def _build_gpu_function(self) -> Any: lines = ["from numba import cuda"] # Signature - args = self._argnames + [_SIZE_VAR]+[_DIM_VAR]+[_PITCHES_VAR]+[_LO_POINT_VAR] + args = self._argnames + [_SIZE_VAR]+[_DIM_VAR]+[_PITCHES_VAR]+[_LO_POINT_VAR] +[_STRIDES_VAR] lines.append("def {}({}):".format(funcid, ",".join(args))) # Initialize the index variable and return immediately # when it exceeds the data size - lines.append(" {} = cuda.grid(1)".format(_LOOP_VAR)) - lines.append(" if {} >= {}:".format(_LOOP_VAR, _SIZE_VAR)) + lines.append(" local_i = cuda.grid(1)") + lines.append(" if local_i >= {}:".format(_SIZE_VAR)) lines.append(" return") - lines.append(" {}={}".format(_POINT_VAR, _LO_POINT_VAR)) + lines.append(" {}:int = 0".format(_LOOP_VAR)) lines.append(" for p in range({}-1):".format(_DIM_VAR)) - - lines.append(" {}[p]+={}/int({}[p])".format(_POINT_VAR,_LOOP_VAR, _PITCHES_VAR)) - lines.append(" {}={}%int({})".format(_LOOP_VAR,_LOOP_VAR,_PITCHES_VAR)) - lines.append(" {}[{}-1]+={}".format(_POINT_VAR, _DIM_VAR, _LOOP_VAR)) + lines.append(" x={}[p]+int(local_i/{}[p])".format(_LO_POINT_VAR,_PITCHES_VAR)) + lines.append(" local_i = local_i-{}[p]*int(local_i/{}[p])".format(_PITCHES_VAR,_PITCHES_VAR)) + lines.append(" {}+=int(x*{}[p])".format(_LOOP_VAR, _STRIDES_VAR)) + #lines.append(" print(x, {}, {}[p])".format(_LOOP_VAR, _STRIDES_VAR)) + + #lines.append(" print(local_i, local_i,local_i,local_i,local_i,local_i, {}[0])".format( _STRIDES_VAR)) + lines.append(" {}+=int(local_i*{}[{}-1])".format(_LOOP_VAR, _STRIDES_VAR, _DIM_VAR)) + #lines.append(" print( local_i, local_i, local_i, {})".format(_LOOP_VAR)) + #lines.append(" {} =local_i".format(_LOOP_VAR)) # Kernel body def _lift_to_array_access(m: Any) -> str: - return self._replace_name(m.group(0), _POINT_VAR, True) + return self._replace_name(m.group(0), _LOOP_VAR, True) # kernel body lines_old = self._get_func_body(self._pyfunc) @@ -237,7 +242,7 @@ def _lift_to_array_access(m: 
Any) -> str: l_new = re.sub(r"[_a-z]\w*", _lift_to_array_access, line) lines.append(l_new) - print("IRINA DEBUG GPU function",lines) + #print("IRINA DEBUG GPU function",lines) # Evaluate the string to get the Python function body = "\n".join(lines) @@ -314,7 +319,7 @@ def _get_numba_types(self, need_pointer: bool = True) -> list[Any]: def _compile_func_gpu(self) -> tuple[Any]: types = self._get_numba_types() - arg_types = types + [numba.core.types.uint64] + [numba.core.types.uint64]+[numba.core.types.CPointer(numba.core.types.uint64)]+ [numba.core.types.CPointer(numba.core.types.uint64)] + arg_types = types + [numba.core.types.uint64] + [numba.core.types.uint64]+[numba.core.types.CPointer(numba.core.types.uint64)]+ [numba.core.types.CPointer(numba.core.types.uint64)]+[numba.core.types.CPointer(numba.core.types.uint64)] sig = (*arg_types,) cuda_arch = numba.cuda.get_current_device().compute_capability diff --git a/src/cunumeric/vectorize/eval_udf.cu b/src/cunumeric/vectorize/eval_udf.cu index 3fc233384d..ea6d878254 100644 --- a/src/cunumeric/vectorize/eval_udf.cu +++ b/src/cunumeric/vectorize/eval_udf.cu @@ -99,12 +99,13 @@ struct EvalUdfGPU { buffer_size += sizeof(size_t);//dim buffer_size += sizeof(void*);//pitches buffer_size += sizeof(void*);//lo_point + buffer_size += sizeof(void*);//strides std::vector arg_buffer(buffer_size); char* raw_arg_buffer = arg_buffer.data(); auto p = raw_arg_buffer; - + size_t strides[DIM]; size_t size =1; if (args.inputs.size()>0){ rect = args.inputs[0].shape(); @@ -112,10 +113,10 @@ struct EvalUdfGPU { for (size_t i = 0; i < args.inputs.size(); i++) { if (i < args.num_outputs) { auto out = args.outputs[i].write_accessor(rect); - *reinterpret_cast(p) = out.ptr(rect); + *reinterpret_cast(p) = out.ptr(rect, strides); } else { auto in = args.inputs[i].read_accessor(rect); - *reinterpret_cast(p) = in.ptr(rect); + *reinterpret_cast(p) = in.ptr(rect, strides); } p += sizeof(void*); } @@ -133,10 +134,27 @@ struct EvalUdfGPU { p += sizeof(size_t); Pitches pitches; size_t volume = pitches.flatten(rect); - *reinterpret_cast(p) =pitches.data(); + //create buffers for pitches, lower point and strides since + //we need to pass pointer to device memory + auto device_pitches = create_buffer(Point<1>(DIM-1), Memory::Kind::Z_COPY_MEM); + auto device_lo = create_buffer(Point<1>(DIM), Memory::Kind::Z_COPY_MEM); + auto device_strides = create_buffer(Point<1>(DIM), Memory::Kind::Z_COPY_MEM); + //std::cout<<"IRINA DEBUG"<(i)]=pitches.data()[i]; + //std::cout<<" pitches ="<(i)]=rect.lo[i]; + device_strides[Point<1>(i)] = strides[i]; + //std::cout<<" device_lo = " < str: @@ -281,7 +276,6 @@ def _emit_assignment( _emit_assignment(self._argnames[arg_idx], arg_idx, _SIZE_VAR, scalar_type) arg_idx += 1 - # Main loop lines.append(" for {} in range({}):".format(_LOOP_VAR, _SIZE_VAR)) @@ -361,31 +355,40 @@ def __call__(self, *args: Any, **kwargs: Any) -> None: Return arrays with the results of `pyfunc` broadcast (vectorized) over `args` and `kwargs` not in `excluded`. 
""" - #self._args = list( - # convert_to_cunumeric_ndarray(arg) if (arg is not None and np.ndim(Arg)>0) - # for (idx, arg) in enumerate(args) - #) - for i,arg in enumerate(args): - if arg is None: - raise ValueError( - "None is not supported in user function " - "passed to cunumeric.vectorize" + if not self._created: + self._scalar_args.clear() + self._scalar_idxs.clear() + self._args.clear() + self._argnames.clear() + self._scalar_names.clear() + + for i,arg in enumerate(args): + if arg is None: + raise ValueError( + "None is not supported in user function " + "passed to cunumeric.vectorize" + ) + elif np.ndim(arg)==0: + self._scalar_args.append(arg) + self._scalar_idxs.append(i) + else: + self._args.append(convert_to_cunumeric_ndarray(arg)) + + #first fill arrays to argnames, then scalars: + for i,k in enumerate(inspect.signature(self._pyfunc).parameters): + if not(i in self._scalar_idxs): + self._argnames.append(k) + + for i,k in enumerate(inspect.signature(self._pyfunc).parameters): + if i in self._scalar_idxs: + self._scalar_names.append(k) + self._argnames.append(k) + + self._kwargs = list(kwargs) + if len(self._kwargs) > 1: + raise NotImplementedError( + "kwargs are not supported in user functions" ) - elif np.ndim(arg)==0: - self._scalar_args.append(arg) - self._scalar_idxs.append(i) - else: - self._args.append(convert_to_cunumeric_ndarray(arg)) - - #first fill arrays to argnames, then scalars: - for i,k in enumerate(inspect.signature(self._pyfunc).parameters): - if not(i in self._scalar_idxs): - self._argnames.append(k) - - for i,k in enumerate(inspect.signature(self._pyfunc).parameters): - if i in self._scalar_idxs: - self._scalar_names.append(k) - self._argnames.append(k) #all output arrays should have the same type if len(self._args)>0: @@ -412,12 +415,6 @@ def __call__(self, *args: Any, **kwargs: Any) -> None: "different shapes for arrays in " "user function passed to vectorize") - self._kwargs = list(kwargs) - if len(self._kwargs) > 1: - raise NotImplementedError( - "kwargs are not supported in user functions" - ) - if runtime.num_gpus > 0: if not self._created: self._numba_func = self._build_gpu_function() @@ -433,8 +430,3 @@ def __call__(self, *args: Any, **kwargs: Any) -> None: self._created = True self._execute(False) - self._args.clear() - self._scalar_args.clear() - self._scalar_idxs.clear() - self._argnames.clear() - self._scalar_names.clear() diff --git a/tests/integration/test_vectorize.py b/tests/integration/test_vectorize.py index fb19a85e69..903a8a8081 100644 --- a/tests/integration/test_vectorize.py +++ b/tests/integration/test_vectorize.py @@ -24,6 +24,10 @@ def my_func(a, b): a = a * 2 + b +def my_func_np(a, b): + a = a * 2 + b + return a + def test_vectorize(): func = num.vectorize(my_func) @@ -40,7 +44,38 @@ def test_vectorize(): a=num.array([[1,2,3],[4,5,6],[7,8,9]]) b=num.array([[10,11,12],[13,14,15],[16,17,18]]) func(a[:2],b[:2]) - print(a) + + a=np.arange(100).reshape((25,4)) + a_num= num.array(a) + + b=a*10 + b_num=a_num*10 + func_np = np.vectorize(my_func_np) + func_num=num.vectorize(my_func) + + a=func_np(a,b) + func_num(a_num, b_num) + assert np.array_equal(a, a_num) + + a[:,2]=func_np(a[:, 2], b[:,2]) + func_num(a_num[:,2],b_num[:,2]) + assert np.array_equal(a, a_num) + + a[5:10,2]=func_np(a[5:10, 2], b[1:6,2]) + func_num(a_num[5:10,2],b_num[1:6,2]) + assert np.array_equal(a, a_num) + + a[15:20]=func_np(a[15:20], b[15:20]) + func_num(a_num[15:20],b_num[15:20]) + assert np.array_equal(a, a_num) + + a=np.arange(1000).reshape((25,10,4)) + a_num= 
num.array(a) + + a[:, 2, :] = func_np(a[:, 2, :],2) + func_num(a_num[:, 2, :],2) + assert np.array_equal(a, a_num) + if __name__ == "__main__": import sys From c76b2fe8288d3a3f1618cbb02e42b419046ec486 Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Wed, 22 Feb 2023 09:07:39 -0800 Subject: [PATCH 17/78] hashing CUDA kernel for user function --- cunumeric/vectorize.py | 1 + src/cunumeric/vectorize/eval_udf.cu | 54 +++++++++++++++++++++++++++-- 2 files changed, 53 insertions(+), 2 deletions(-) diff --git a/cunumeric/vectorize.py b/cunumeric/vectorize.py index 59cc83dc91..7c3b26044d 100644 --- a/cunumeric/vectorize.py +++ b/cunumeric/vectorize.py @@ -417,6 +417,7 @@ def __call__(self, *args: Any, **kwargs: Any) -> None: if runtime.num_gpus > 0: if not self._created: + #print("IRINA DEBUG ptx is not created yet") self._numba_func = self._build_gpu_function() self._gpu_func = self._compile_func_gpu() if self._cache: diff --git a/src/cunumeric/vectorize/eval_udf.cu b/src/cunumeric/vectorize/eval_udf.cu index ea6d878254..c416852dcc 100644 --- a/src/cunumeric/vectorize/eval_udf.cu +++ b/src/cunumeric/vectorize/eval_udf.cu @@ -25,6 +25,45 @@ namespace cunumeric { using namespace Legion; using namespace legate; +class JITKernelStorage +{ + +private: + JITKernelStorage(){} + std::map jit_functions_; + +public: + JITKernelStorage( JITKernelStorage const&) = delete; + + void operator=(JITKernelStorage const&) = delete; + + static JITKernelStorage& get_instance(void){ + static JITKernelStorage instance; + return instance; + } + + bool registered_jit_funtion(size_t hash){ + return jit_functions_.find(hash)!=jit_functions_.end(); + }; + + CUfunction return_saved_jit_function(size_t hash){ + if ( + jit_functions_.find(hash)!=jit_functions_.end()) + return jit_functions_[hash]; + else + assert(false);//should never come here + } + + void add_jit_function(size_t hash, CUfunction func){ + if ( + jit_functions_.find(hash)!=jit_functions_.end()) + assert(false);// should never come here + else + jit_functions_.insert({hash, func}); + } +};//class JITKernelStorage + + struct EvalUdfGPU { template void operator()(EvalUdfArgs& args) const @@ -32,6 +71,17 @@ struct EvalUdfGPU { using VAL = legate_type_of; Rect rect; + JITKernelStorage& jit_storage =JITKernelStorage::get_instance(); + + std::hash hasher; + CUfunction func; + size_t ptx_hash = hasher(args.ptx); + //std::cout <<"IRINA DEBUG hash = "<str(); auto fun_name = matched_line.substr(matched_line.rfind(" ") + 1, matched_line.size()); - CUfunction func; result = cuModuleGetFunction(&func, module, fun_name.c_str()); #ifdef DEBUG_CUNUMERIC assert(result == CUDA_SUCCESS); #endif - + jit_storage.add_jit_function(ptx_hash, func); + } // 2: after fucntion is generated, we can execute it: // Filling up the buffer with arguments From c8e88472966fac2bcba72eae5c7a1e56e998f830 Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Wed, 22 Feb 2023 11:58:19 -0800 Subject: [PATCH 18/78] removing PTX from task arguments in the case the function was hashed --- cunumeric/vectorize.py | 27 +++++++++++++++++++++++++-- src/cunumeric/vectorize/eval_udf.cu | 17 +++++++++-------- src/cunumeric/vectorize/eval_udf.h | 1 + 3 files changed, 35 insertions(+), 10 deletions(-) diff --git a/cunumeric/vectorize.py b/cunumeric/vectorize.py index 7c3b26044d..e8bfa86400 100644 --- a/cunumeric/vectorize.py +++ b/cunumeric/vectorize.py @@ -13,6 +13,8 @@ # limitations under the License. 
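The JITKernelStorage singleton introduced in patch 17 memoizes the loaded CUfunction: the PTX text produced by numba is hashed once on the Python side and the task keeps a map from that hash to the already-built function, so repeated calls with cache=True skip module loading entirely. The behaviour amounts to a memo table; a minimal sketch, with load_module standing in for the cuModuleLoadDataEx plus cuModuleGetFunction sequence:

    _kernel_cache = {}

    def get_kernel(ptx, load_module):
        """Return the kernel for this PTX, building it at most once."""
        key = hash(ptx)                 # same idea as hash(self._gpu_func[0])
        if key not in _kernel_cache:
            _kernel_cache[key] = load_module(ptx)   # expensive module load
        return _kernel_cache[key]

    loads = []
    get_kernel("ptx-text", lambda p: loads.append(p) or "CUfunction")
    get_kernel("ptx-text", lambda p: loads.append(p) or "CUfunction")
    print(len(loads))   # 1 -- the second call reused the cached kernel
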
# +import cProfile, pstats + import inspect import re from typing import Any, Callable, Dict, List, Optional, Union @@ -329,8 +331,15 @@ def _compile_func_cpu(self) -> numba.core.ccallback.CFunc: def _execute(self, is_gpu:bool) -> None: task = self._context.create_auto_task(CuNumericOpCode.EVAL_UDF) if is_gpu: - task.add_scalar_arg(self._gpu_func[0], ty.string) + ptx_hash = hash(self._gpu_func[0]) + if self._created: + #use hashed ptx and CUfunction on the C++ side + str_tmp ="" + task.add_scalar_arg(str_tmp, ty.string) + else: + task.add_scalar_arg(self._gpu_func[0], ty.string) task.add_scalar_arg(self._num_outputs, ty.uint32) + task.add_scalar_arg(ptx_hash, ty.int64) else: task.add_scalar_arg(self._cpu_func.address, ty.uint64) # type : ignore task.add_scalar_arg(self._num_outputs, ty.uint32) @@ -355,6 +364,8 @@ def __call__(self, *args: Any, **kwargs: Any) -> None: Return arrays with the results of `pyfunc` broadcast (vectorized) over `args` and `kwargs` not in `excluded`. """ + #profiler = cProfile.Profile() + #profiler.enable() if not self._created: self._scalar_args.clear() self._scalar_idxs.clear() @@ -420,9 +431,15 @@ def __call__(self, *args: Any, **kwargs: Any) -> None: #print("IRINA DEBUG ptx is not created yet") self._numba_func = self._build_gpu_function() self._gpu_func = self._compile_func_gpu() + #profiler = cProfile.Profile() + #profiler.enable() + self._execute(True) + if not self._created: if self._cache: self._created = True - self._execute(True) + #profiler.disable() + #stats = pstats.Stats(profiler).sort_stats('cumtime') + #stats.print_stats() else: if not self._created: self._numba_func = self._build_cpu_function() @@ -431,3 +448,9 @@ def __call__(self, *args: Any, **kwargs: Any) -> None: self._created = True self._execute(False) + + #profiler.disable() + #stats = pstats.Stats(profiler).sort_stats('cumtime') + #stats.print_stats() + + diff --git a/src/cunumeric/vectorize/eval_udf.cu b/src/cunumeric/vectorize/eval_udf.cu index c416852dcc..63e6322bd9 100644 --- a/src/cunumeric/vectorize/eval_udf.cu +++ b/src/cunumeric/vectorize/eval_udf.cu @@ -73,15 +73,15 @@ struct EvalUdfGPU { JITKernelStorage& jit_storage =JITKernelStorage::get_instance(); - std::hash hasher; + //std::hash hasher; CUfunction func; - size_t ptx_hash = hasher(args.ptx); + //size_t ptx_hash = hasher(args.ptx); //std::cout <<"IRINA DEBUG hash = "<1);// in this case PTX string shouldn't be empty // 1: we need to vreate a function from the ptx generated y numba const unsigned num_options = 4; const size_t log_buffer_size = 16384; @@ -139,7 +139,7 @@ struct EvalUdfGPU { #ifdef DEBUG_CUNUMERIC assert(result == CUDA_SUCCESS); #endif - jit_storage.add_jit_function(ptx_hash, func); + jit_storage.add_jit_function(args.hash, func); } // 2: after fucntion is generated, we can execute it: @@ -240,7 +240,7 @@ struct EvalUdfGPU { /*static*/ void EvalUdfTask::gpu_variant(TaskContext& context) { std::vectorscalars; - for (size_t i=2; i(), - context.scalars()[1].value()}; + context.scalars()[1].value(), + context.scalars()[2].value()}; size_t dim=1; if (args.inputs.size()>0){ dim = args.inputs[0].dim() == 0 ? 
1 : args.inputs[0].dim(); diff --git a/src/cunumeric/vectorize/eval_udf.h b/src/cunumeric/vectorize/eval_udf.h index 243c439230..55a27bc200 100644 --- a/src/cunumeric/vectorize/eval_udf.h +++ b/src/cunumeric/vectorize/eval_udf.h @@ -28,6 +28,7 @@ struct EvalUdfArgs { std::vectorscalars; std::string ptx = ""; uint32_t num_outputs; + int64_t hash=0; }; class EvalUdfTask : public CuNumericTask { From 392b4ee8038d5363221f98180f302dd4d67c5539 Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Wed, 22 Feb 2023 13:09:54 -0800 Subject: [PATCH 19/78] fixing Torchswe test --- src/cunumeric/vectorize/eval_udf.cu | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/cunumeric/vectorize/eval_udf.cu b/src/cunumeric/vectorize/eval_udf.cu index 63e6322bd9..11c7ae8c45 100644 --- a/src/cunumeric/vectorize/eval_udf.cu +++ b/src/cunumeric/vectorize/eval_udf.cu @@ -30,7 +30,7 @@ class JITKernelStorage private: JITKernelStorage(){} - std::map jit_functions_; + std::map jit_functions_; public: JITKernelStorage( JITKernelStorage const&) = delete; @@ -42,11 +42,11 @@ public: return instance; } - bool registered_jit_funtion(size_t hash){ + bool registered_jit_funtion(int64_t hash){ return jit_functions_.find(hash)!=jit_functions_.end(); }; - CUfunction return_saved_jit_function(size_t hash){ + CUfunction return_saved_jit_function(int64_t hash){ if ( jit_functions_.find(hash)!=jit_functions_.end()) return jit_functions_[hash]; @@ -54,10 +54,10 @@ public: assert(false);//should never come here } - void add_jit_function(size_t hash, CUfunction func){ + void add_jit_function(int64_t hash, CUfunction func){ if ( jit_functions_.find(hash)!=jit_functions_.end()) - assert(false);// should never come here + //assert(false);// should never come here else jit_functions_.insert({hash, func}); } From 4278be6387da0268164f60c7a2b63417a17f4c68 Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Wed, 22 Feb 2023 13:35:58 -0800 Subject: [PATCH 20/78] fixing Torchswe test --- src/cunumeric/vectorize/eval_udf.cu | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/cunumeric/vectorize/eval_udf.cu b/src/cunumeric/vectorize/eval_udf.cu index 11c7ae8c45..c70ee27ffc 100644 --- a/src/cunumeric/vectorize/eval_udf.cu +++ b/src/cunumeric/vectorize/eval_udf.cu @@ -56,9 +56,7 @@ public: void add_jit_function(int64_t hash, CUfunction func){ if ( - jit_functions_.find(hash)!=jit_functions_.end()) - //assert(false);// should never come here - else + jit_functions_.find(hash)==jit_functions_.end()) jit_functions_.insert({hash, func}); } };//class JITKernelStorage From ff0782b62174de5259bd91c40ff07a5611310182 Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Wed, 22 Feb 2023 18:26:00 -0800 Subject: [PATCH 21/78] fixing Torchswe test --- src/cunumeric/vectorize/eval_udf.cu | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/cunumeric/vectorize/eval_udf.cu b/src/cunumeric/vectorize/eval_udf.cu index c70ee27ffc..4ebae74c40 100644 --- a/src/cunumeric/vectorize/eval_udf.cu +++ b/src/cunumeric/vectorize/eval_udf.cu @@ -55,8 +55,6 @@ public: } void add_jit_function(int64_t hash, CUfunction func){ - if ( - jit_functions_.find(hash)==jit_functions_.end()) jit_functions_.insert({hash, func}); } };//class JITKernelStorage From c772522242a7fcd55b88e1a1a4f59ec45fa2786d Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Wed, 22 Feb 2023 19:40:21 -0800 Subject: [PATCH 22/78] adding debug output --- cunumeric/vectorize.py | 6 +++--- src/cunumeric/vectorize/eval_udf.cu | 3 ++- 2 files changed, 5 insertions(+), 4 
deletions(-) diff --git a/cunumeric/vectorize.py b/cunumeric/vectorize.py index e8bfa86400..3192c540b1 100644 --- a/cunumeric/vectorize.py +++ b/cunumeric/vectorize.py @@ -332,6 +332,7 @@ def _execute(self, is_gpu:bool) -> None: task = self._context.create_auto_task(CuNumericOpCode.EVAL_UDF) if is_gpu: ptx_hash = hash(self._gpu_func[0]) + print("IRINA DEBUG hash =", ptx_hash) if self._created: #use hashed ptx and CUfunction on the C++ side str_tmp ="" @@ -434,9 +435,8 @@ def __call__(self, *args: Any, **kwargs: Any) -> None: #profiler = cProfile.Profile() #profiler.enable() self._execute(True) - if not self._created: - if self._cache: - self._created = True + if not self._created and self._cache: + self._created = True #profiler.disable() #stats = pstats.Stats(profiler).sort_stats('cumtime') #stats.print_stats() diff --git a/src/cunumeric/vectorize/eval_udf.cu b/src/cunumeric/vectorize/eval_udf.cu index 4ebae74c40..e006cac2c2 100644 --- a/src/cunumeric/vectorize/eval_udf.cu +++ b/src/cunumeric/vectorize/eval_udf.cu @@ -72,11 +72,12 @@ struct EvalUdfGPU { //std::hash hasher; CUfunction func; //size_t ptx_hash = hasher(args.ptx); - //std::cout <<"IRINA DEBUG hash = "< Date: Wed, 22 Feb 2023 22:34:25 -0800 Subject: [PATCH 24/78] removing debug output --- cunumeric/vectorize.py | 2 +- src/cunumeric/vectorize/eval_udf.cu | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/cunumeric/vectorize.py b/cunumeric/vectorize.py index 3192c540b1..b082a7a1cb 100644 --- a/cunumeric/vectorize.py +++ b/cunumeric/vectorize.py @@ -332,7 +332,7 @@ def _execute(self, is_gpu:bool) -> None: task = self._context.create_auto_task(CuNumericOpCode.EVAL_UDF) if is_gpu: ptx_hash = hash(self._gpu_func[0]) - print("IRINA DEBUG hash =", ptx_hash) + #print("IRINA DEBUG hash =", ptx_hash) if self._created: #use hashed ptx and CUfunction on the C++ side str_tmp ="" diff --git a/src/cunumeric/vectorize/eval_udf.cu b/src/cunumeric/vectorize/eval_udf.cu index de8ff0cbf0..a2859981d2 100644 --- a/src/cunumeric/vectorize/eval_udf.cu +++ b/src/cunumeric/vectorize/eval_udf.cu @@ -73,12 +73,12 @@ struct EvalUdfGPU { CUfunction func; std::pair key(args.hash, args.point); //size_t ptx_hash = hasher(args.ptx); - std::cout <<"IRINA DEBUG within cuda task hash = "<(); + uint32_t num_scalars = context.scalars()[1].value(); std::vectorscalars; - for (size_t i=2; i(), + + EvalUdfArgs args{context.scalars()[2+num_scalars].value(), context.inputs(), context.outputs(), scalars, - tmp, - context.scalars()[1].value(), + num_outputs, context.get_task_index()}; size_t dim=1; if (args.inputs.size()>0){ diff --git a/src/cunumeric/vectorize/eval_udf.cu b/src/cunumeric/vectorize/eval_udf.cu index a2859981d2..9c6c310d07 100644 --- a/src/cunumeric/vectorize/eval_udf.cu +++ b/src/cunumeric/vectorize/eval_udf.cu @@ -237,24 +237,33 @@ struct EvalUdfGPU { /*static*/ void EvalUdfTask::gpu_variant(TaskContext& context) { + + uint32_t num_outputs = context.scalars()[0].value(); + uint32_t num_scalars = context.scalars()[1].value(); std::vectorscalars; - for (size_t i=3; i(); + bool is_created = context.scalars()[3+num_scalars].value(); + EvalUdfArgs args{0, context.inputs(), context.outputs(), scalars, - context.scalars()[0].value(), - context.scalars()[1].value(), + num_outputs, context.get_task_index(), - context.scalars()[2].value()}; + ptx_hash}; + if (!is_created) + args.ptx = context.scalars()[4+num_scalars].value(); size_t dim=1; if (args.inputs.size()>0){ dim = args.inputs[0].dim() == 0 ? 
1 : args.inputs[0].dim(); double_dispatch(dim, args.inputs[0].code(), EvalUdfGPU{}, args); } else{ + //FIXME double_dispatch(dim, args.inputs[0].code(), EvalUdfGPU{}, args); //double_dispatch(dim, 0 , EvalUdfGPU{}, args); } diff --git a/src/cunumeric/vectorize/eval_udf.h b/src/cunumeric/vectorize/eval_udf.h index e8141ce754..aac5aade92 100644 --- a/src/cunumeric/vectorize/eval_udf.h +++ b/src/cunumeric/vectorize/eval_udf.h @@ -26,10 +26,11 @@ struct EvalUdfArgs { std::vector& inputs; std::vector& outputs; std::vectorscalars; - std::string ptx = ""; uint32_t num_outputs; Legion::DomainPoint point; int64_t hash=0; + std::string ptx = ""; + }; class EvalUdfTask : public CuNumericTask { From 798289ff795e9d8f59e952b3c514a356959c6e19 Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Fri, 24 Feb 2023 11:27:04 -0800 Subject: [PATCH 26/78] split Cuda kerenel generation with kernel execution --- cunumeric/config.py | 2 + cunumeric/vectorize.py | 19 +++- cunumeric_cpp.cmake | 2 + src/cunumeric/cuda_help.h | 37 +++++++ src/cunumeric/cunumeric_c.h | 1 + src/cunumeric/vectorize/create_cu_kernel.cc | 38 ++++++++ src/cunumeric/vectorize/create_cu_kernel.cu | 97 ++++++++++++++++++ src/cunumeric/vectorize/create_cu_kernel.h | 38 ++++++++ src/cunumeric/vectorize/eval_udf.cu | 103 +------------------- 9 files changed, 236 insertions(+), 101 deletions(-) create mode 100644 src/cunumeric/vectorize/create_cu_kernel.cc create mode 100644 src/cunumeric/vectorize/create_cu_kernel.cu create mode 100644 src/cunumeric/vectorize/create_cu_kernel.h diff --git a/cunumeric/config.py b/cunumeric/config.py index 14cb1b6434..158dd68016 100644 --- a/cunumeric/config.py +++ b/cunumeric/config.py @@ -143,6 +143,7 @@ class _CunumericSharedLib: CUNUMERIC_CONVERT_NAN_PROD: int CUNUMERIC_CONVERT_NAN_SUM: int CUNUMERIC_CONVOLVE: int + CUNUMERIC_CREATE_CU_KERNEL: int CUNUMERIC_DIAG: int CUNUMERIC_DOT: int CUNUMERIC_EYE: int @@ -346,6 +347,7 @@ class CuNumericOpCode(IntEnum): CONTRACT = _cunumeric.CUNUMERIC_CONTRACT CONVERT = _cunumeric.CUNUMERIC_CONVERT CONVOLVE = _cunumeric.CUNUMERIC_CONVOLVE + CREATE_CU_KERNEL = _cunumeric.CUNUMERIC_CREATE_CU_KERNEL DIAG = _cunumeric.CUNUMERIC_DIAG DOT = _cunumeric.CUNUMERIC_DOT EYE = _cunumeric.CUNUMERIC_EYE diff --git a/cunumeric/vectorize.py b/cunumeric/vectorize.py index 9129ab784d..a1d57ec454 100644 --- a/cunumeric/vectorize.py +++ b/cunumeric/vectorize.py @@ -329,9 +329,26 @@ def _compile_func_cpu(self) -> numba.core.ccallback.CFunc: return numba.cfunc(sig)(self._numba_func) def _execute(self, is_gpu:bool) -> None: + if is_gpu and not self._created: + #create CUDA kernel + kernel_task = self._context.create_auto_task(CuNumericOpCode.CREATE_CU_KERNEL) + ptx_hash = hash(self._gpu_func[0]) + kernel_task.add_scalar_arg(ptx_hash, ty.int64) + kernel_task.add_scalar_arg(self._gpu_func[0], ty.string) + #adding unused array for creating correct launch domain + #and set up dependency between kernel_task and task + if len(self._args)>0: + a0 = self._args[0]._thunk + a0 = runtime.to_deferred_array(a0) + kernel_task.add_input(a0.base) + kernel_task.add_output(a0.base) + kernel_task.execute() + + task = self._context.create_auto_task(CuNumericOpCode.EVAL_UDF) task.add_scalar_arg(self._num_outputs, ty.uint32) task.add_scalar_arg(len(self._scalar_args), ty.uint32) + for a in self._scalar_args: dtype = convert_to_cunumeric_dtype(type(a).__name__) task.add_scalar_arg(a,dtype) @@ -340,8 +357,6 @@ def _execute(self, is_gpu:bool) -> None: ptx_hash = hash(self._gpu_func[0]) task.add_scalar_arg(ptx_hash, 
ty.int64) task.add_scalar_arg(self._created, bool) - if not self._created: - task.add_scalar_arg(self._gpu_func[0], ty.string) else: task.add_scalar_arg(self._cpu_func.address, ty.uint64) # type : ignore a0 = self._args[0]._thunk diff --git a/cunumeric_cpp.cmake b/cunumeric_cpp.cmake index bdfa4c163a..39115eeb72 100644 --- a/cunumeric_cpp.cmake +++ b/cunumeric_cpp.cmake @@ -161,6 +161,7 @@ list(APPEND cunumeric_SOURCES src/cunumeric/cephes/chbevl.cc src/cunumeric/cephes/i0.cc src/cunumeric/vectorize/eval_udf.cc + src/cunumeric/vectorize/create_cu_kernel.cc ) if(Legion_USE_OpenMP) @@ -260,6 +261,7 @@ if(Legion_USE_CUDA) src/cunumeric/cudalibs.cu src/cunumeric/cunumeric.cu src/cunumeric/vectorize/eval_udf.cu + src/cunumeric/vectorize/create_cu_kernel.cu ) endif() diff --git a/src/cunumeric/cuda_help.h b/src/cunumeric/cuda_help.h index 63bd6d4e17..2e67366a20 100644 --- a/src/cunumeric/cuda_help.h +++ b/src/cunumeric/cuda_help.h @@ -390,4 +390,41 @@ __device__ __forceinline__ void store_streaming(double* ptr, double valu asm volatile("st.global.cs.f64 [%0], %1;" : : "l"(ptr), "d"(value) : "memory"); } +#include + +class JITKernelStorage +{ + +private: + JITKernelStorage(){} + std::map, CUfunction> jit_functions_; + +public: + JITKernelStorage( JITKernelStorage const&) = delete; + + void operator=(JITKernelStorage const&) = delete; + + static JITKernelStorage& get_instance(void){ + static JITKernelStorage instance; + return instance; + } + + bool registered_jit_funtion(std::pair &key){ + return jit_functions_.find(key)!=jit_functions_.end(); + }; + + CUfunction return_saved_jit_function(std::pair &key){ + if ( + jit_functions_.find(key)!=jit_functions_.end()) + return jit_functions_[key]; + else + assert(false);//should never come here + } + + void add_jit_function(std::pair &key, CUfunction func){ + jit_functions_.insert({key, func}); + } +};//class JITKernelStorage + + } // namespace cunumeric diff --git a/src/cunumeric/cunumeric_c.h b/src/cunumeric/cunumeric_c.h index 42c0bc955e..7b055f82f6 100644 --- a/src/cunumeric/cunumeric_c.h +++ b/src/cunumeric/cunumeric_c.h @@ -37,6 +37,7 @@ enum CuNumericOpCode { CUNUMERIC_CONTRACT, CUNUMERIC_CONVERT, CUNUMERIC_CONVOLVE, + CUNUMERIC_CREATE_CU_KERNEL, CUNUMERIC_SCAN_GLOBAL, CUNUMERIC_SCAN_LOCAL, CUNUMERIC_DIAG, diff --git a/src/cunumeric/vectorize/create_cu_kernel.cc b/src/cunumeric/vectorize/create_cu_kernel.cc new file mode 100644 index 0000000000..1d6b5d3950 --- /dev/null +++ b/src/cunumeric/vectorize/create_cu_kernel.cc @@ -0,0 +1,38 @@ +/* Copyright 20223 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + */ + +#include "cunumeric/vectorize/create_cu_kernel.h" + +namespace cunumeric { + +using namespace Legion; +using namespace legate; + + +/*static*/ void CreateCUKernelTask::cpu_variant(TaskContext& context) +{ +} +/*static*/ void CreateCUKernelTask::omp_variant(TaskContext& context) +{ +} + + +namespace // unnamed +{ +static void __attribute__((constructor)) register_tasks(void) { CreateCUKernelTask::register_variants(); } +} // namespace + +} // namespace cunumeric diff --git a/src/cunumeric/vectorize/create_cu_kernel.cu b/src/cunumeric/vectorize/create_cu_kernel.cu new file mode 100644 index 0000000000..b3ba432cc2 --- /dev/null +++ b/src/cunumeric/vectorize/create_cu_kernel.cu @@ -0,0 +1,97 @@ +/* Copyright 2023 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +#include "cunumeric/vectorize/create_cu_kernel.h" +#include "cunumeric/cuda_help.h" +#include +#include + +namespace cunumeric { + +using namespace Legion; +using namespace legate; + +/*static*/ void CreateCUKernelTask::gpu_variant(TaskContext& context) +{ + + int64_t ptx_hash = context.scalars()[0].value(); + std::string ptx = context.scalars()[1].value(); + DomainPoint point = context.get_task_index(); + JITKernelStorage& jit_storage =JITKernelStorage::get_instance(); + + CUfunction func; + std::pair key(ptx_hash, point); + if (!jit_storage.registered_jit_funtion(key)){ + const unsigned num_options = 4; + const size_t log_buffer_size = 16384; + std::vector log_info_buffer(log_buffer_size); + std::vector log_error_buffer(log_buffer_size); + CUjit_option jit_options[] = { + CU_JIT_INFO_LOG_BUFFER, + CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES, + CU_JIT_ERROR_LOG_BUFFER, + CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES, + }; + void* option_vals[] = { + static_cast(log_info_buffer.data()), + reinterpret_cast(log_buffer_size), + static_cast(log_error_buffer.data()), + reinterpret_cast(log_buffer_size), + }; + + CUmodule module; + CUresult result = + cuModuleLoadDataEx(&module, ptx.data(), num_options, jit_options, option_vals); + if (result != CUDA_SUCCESS) { + if (result == CUDA_ERROR_OPERATING_SYSTEM) { + fprintf(stderr, + "ERROR: Device side asserts are not supported by the " + "CUDA driver for MAC OSX, see NVBugs 1628896.\n"); + exit(-1); + } else if (result == CUDA_ERROR_NO_BINARY_FOR_GPU) { + fprintf(stderr, "ERROR: The binary was compiled for the wrong GPU architecture.\n"); + exit(-1); + } else { + fprintf(stderr, "Failed to load CUDA module! 
Error log: %s\n", log_error_buffer.data()); +#if CUDA_VERSION >= 6050 + const char *name, *str; + assert(cuGetErrorName(result, &name) == CUDA_SUCCESS); + assert(cuGetErrorString(result, &str) == CUDA_SUCCESS); + fprintf(stderr, "CU: cuModuleLoadDataEx = %d (%s): %s\n", result, name, str); +#else + fprintf(stderr, "CU: cuModuleLoadDataEx = %d\n", result); +#endif + exit(-1); + } + } + std::cmatch line_match; + bool match = + std::regex_search(ptx.data(), line_match, std::regex(".visible .entry [_a-zA-Z0-9$]+")); +#ifdef DEBUG_CUNUMERIC + assert(match); +#endif + const auto& matched_line = line_match.begin()->str(); + auto fun_name = matched_line.substr(matched_line.rfind(" ") + 1, matched_line.size()); + + result = cuModuleGetFunction(&func, module, fun_name.c_str()); +#ifdef DEBUG_CUNUMERIC + assert(result == CUDA_SUCCESS); +#endif + jit_storage.add_jit_function(key, func); + } +} + +} // namespace cunumeric diff --git a/src/cunumeric/vectorize/create_cu_kernel.h b/src/cunumeric/vectorize/create_cu_kernel.h new file mode 100644 index 0000000000..7b1e176756 --- /dev/null +++ b/src/cunumeric/vectorize/create_cu_kernel.h @@ -0,0 +1,38 @@ +/* Copyright 2023 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +#pragma once + +#include "cunumeric/cunumeric.h" +#include "core/data/scalar.h" + +namespace cunumeric { + +class CreateCUKernelTask : public CuNumericTask { + public: + static const int TASK_ID = CUNUMERIC_CREATE_CU_KERNEL; + + public: + static void cpu_variant(legate::TaskContext& context); +#ifdef LEGATE_USE_OPENMP + static void omp_variant(legate::TaskContext& context); +#endif +#ifdef LEGATE_USE_CUDA + static void gpu_variant(legate::TaskContext& context); +#endif +}; + +} // namespace cunumeric diff --git a/src/cunumeric/vectorize/eval_udf.cu b/src/cunumeric/vectorize/eval_udf.cu index 9c6c310d07..12777df1d2 100644 --- a/src/cunumeric/vectorize/eval_udf.cu +++ b/src/cunumeric/vectorize/eval_udf.cu @@ -25,41 +25,6 @@ namespace cunumeric { using namespace Legion; using namespace legate; -class JITKernelStorage -{ - -private: - JITKernelStorage(){} - std::map, CUfunction> jit_functions_; - -public: - JITKernelStorage( JITKernelStorage const&) = delete; - - void operator=(JITKernelStorage const&) = delete; - - static JITKernelStorage& get_instance(void){ - static JITKernelStorage instance; - return instance; - } - - bool registered_jit_funtion(std::pair &key){ - return jit_functions_.find(key)!=jit_functions_.end(); - }; - - CUfunction return_saved_jit_function(std::pair &key){ - if ( - jit_functions_.find(key)!=jit_functions_.end()) - return jit_functions_[key]; - else - assert(false);//should never come here - } - - void add_jit_function(std::pair &key, CUfunction func){ - jit_functions_.insert({key, func}); - } -};//class JITKernelStorage - - struct EvalUdfGPU { template void operator()(EvalUdfArgs& args) const @@ -78,68 +43,8 @@ struct EvalUdfGPU { func = jit_storage.return_saved_jit_function(key); } else{ - //std::cout <<"IRINA DEBUG PTX code size 
within cuda task = "<1);// in this case PTX string shouldn't be empty - // 1: we need to vreate a function from the ptx generated y numba - const unsigned num_options = 4; - const size_t log_buffer_size = 16384; - std::vector log_info_buffer(log_buffer_size); - std::vector log_error_buffer(log_buffer_size); - CUjit_option jit_options[] = { - CU_JIT_INFO_LOG_BUFFER, - CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES, - CU_JIT_ERROR_LOG_BUFFER, - CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES, - }; - void* option_vals[] = { - static_cast(log_info_buffer.data()), - reinterpret_cast(log_buffer_size), - static_cast(log_error_buffer.data()), - reinterpret_cast(log_buffer_size), - }; - - CUmodule module; - CUresult result = - cuModuleLoadDataEx(&module, args.ptx.data(), num_options, jit_options, option_vals); - if (result != CUDA_SUCCESS) { - if (result == CUDA_ERROR_OPERATING_SYSTEM) { - fprintf(stderr, - "ERROR: Device side asserts are not supported by the " - "CUDA driver for MAC OSX, see NVBugs 1628896.\n"); - exit(-1); - } else if (result == CUDA_ERROR_NO_BINARY_FOR_GPU) { - fprintf(stderr, "ERROR: The binary was compiled for the wrong GPU architecture.\n"); - exit(-1); - } else { - fprintf(stderr, "Failed to load CUDA module! Error log: %s\n", log_error_buffer.data()); -#if CUDA_VERSION >= 6050 - const char *name, *str; - assert(cuGetErrorName(result, &name) == CUDA_SUCCESS); - assert(cuGetErrorString(result, &str) == CUDA_SUCCESS); - fprintf(stderr, "CU: cuModuleLoadDataEx = %d (%s): %s\n", result, name, str); -#else - fprintf(stderr, "CU: cuModuleLoadDataEx = %d\n", result); -#endif - exit(-1); - } - } - - std::cmatch line_match; - bool match = - std::regex_search(args.ptx.data(), line_match, std::regex(".visible .entry [_a-zA-Z0-9$]+")); -#ifdef DEBUG_CUNUMERIC - assert(match); -#endif - const auto& matched_line = line_match.begin()->str(); - auto fun_name = matched_line.substr(matched_line.rfind(" ") + 1, matched_line.size()); - - result = cuModuleGetFunction(&func, module, fun_name.c_str()); -#ifdef DEBUG_CUNUMERIC - assert(result == CUDA_SUCCESS); -#endif - jit_storage.add_jit_function(key, func); + assert(false); //should never come here } - // 2: after fucntion is generated, we can execute it: // Filling up the buffer with arguments size_t buffer_size = (args.inputs.size()+args.scalars.size()) * sizeof(void*); @@ -245,7 +150,7 @@ struct EvalUdfGPU { scalars.push_back(context.scalars()[i]); int64_t ptx_hash = context.scalars()[2+num_scalars].value(); - bool is_created = context.scalars()[3+num_scalars].value(); + // bool is_created = context.scalars()[3+num_scalars].value(); EvalUdfArgs args{0, @@ -255,8 +160,8 @@ struct EvalUdfGPU { num_outputs, context.get_task_index(), ptx_hash}; - if (!is_created) - args.ptx = context.scalars()[4+num_scalars].value(); + //if (!is_created) + // args.ptx = context.scalars()[4+num_scalars].value(); size_t dim=1; if (args.inputs.size()>0){ dim = args.inputs[0].dim() == 0 ? 
1 : args.inputs[0].dim(); From 591fe8eaa7e19774b2fb0eed5463de5531452b31 Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Mon, 27 Feb 2023 13:23:18 -0800 Subject: [PATCH 27/78] fixing errors after merge --- src/cunumeric/vectorize/eval_udf.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cunumeric/vectorize/eval_udf.cu b/src/cunumeric/vectorize/eval_udf.cu index 12777df1d2..e307982be3 100644 --- a/src/cunumeric/vectorize/eval_udf.cu +++ b/src/cunumeric/vectorize/eval_udf.cu @@ -22,7 +22,7 @@ namespace cunumeric { -using namespace Legion; +//using namespace Legion; using namespace legate; struct EvalUdfGPU { From 7b56a5cfa0d096abef56c20602fa011eda7e80c3 Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Tue, 28 Feb 2023 08:34:10 -0800 Subject: [PATCH 28/78] fixing logic for generating numba function --- cunumeric/vectorize.py | 18 +----------------- 1 file changed, 1 insertion(+), 17 deletions(-) diff --git a/cunumeric/vectorize.py b/cunumeric/vectorize.py index a1d57ec454..e071c4c12d 100644 --- a/cunumeric/vectorize.py +++ b/cunumeric/vectorize.py @@ -185,24 +185,8 @@ def _get_func_body(self, func: Callable[[Any], Any]) -> list[str]: def _replace_name(self, name: str, _LOOP_VAR:str, is_gpu:bool=False) -> str: if name in self._argnames and not(name in self._scalar_names) : return "{}[{}]".format(name, _LOOP_VAR) - elif name == "if": - return "if " - elif name == "return": - return "return " - elif name == "or": - return "or " - elif name == "and": - return "and " - elif name == "not": - return "not " - elif name == "min": - return "min" - elif name == "max": - return "max" - elif is_gpu: - return "{}".format(name) else: - return "{}[0]".format(name) + return "{}".format(name) def _build_gpu_function(self) -> Any: From 57bc2b3d67032bc3a16917f1bf39daafad1490ed Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Tue, 28 Feb 2023 13:04:17 -0800 Subject: [PATCH 29/78] adding support for upper-case Letters in argument names in UDF --- cunumeric/vectorize.py | 6 ++++-- tests/integration/test_vectorize.py | 22 ++++++++++++++++++++++ 2 files changed, 26 insertions(+), 2 deletions(-) diff --git a/cunumeric/vectorize.py b/cunumeric/vectorize.py index e071c4c12d..bfa504e234 100644 --- a/cunumeric/vectorize.py +++ b/cunumeric/vectorize.py @@ -183,6 +183,8 @@ def _get_func_body(self, func: Callable[[Any], Any]) -> list[str]: return return_lines def _replace_name(self, name: str, _LOOP_VAR:str, is_gpu:bool=False) -> str: + print("IRINA DEBUG ARGNAMES =", self._argnames) + print("IRINA DEBUG SCALAR_NAMES =", self._scalar_names) if name in self._argnames and not(name in self._scalar_names) : return "{}[{}]".format(name, _LOOP_VAR) else: @@ -220,11 +222,11 @@ def _lift_to_array_access(m: Any) -> str: # kernel body lines_old = self._get_func_body(self._pyfunc) for line in lines_old: - l_new = re.sub(r"[_a-z]\w*", _lift_to_array_access, line) + l_new = re.sub(r"[_a-zA-Z]\w*", _lift_to_array_access, line) lines.append(l_new) #print("IRINA DEBUG GPU function",lines) - + # Evaluate the string to get the Python function body = "\n".join(lines) glbs: Dict[str, Any] = {} diff --git a/tests/integration/test_vectorize.py b/tests/integration/test_vectorize.py index 903a8a8081..0dc8e7b6a7 100644 --- a/tests/integration/test_vectorize.py +++ b/tests/integration/test_vectorize.py @@ -29,6 +29,14 @@ def my_func_np(a, b): return a +def my_func2(A0, B0): + A0 = A0 * 2 + B0 + +def my_func_np2(A0, B0): + A0 = A0 * 2 + B0 + return A0 + + def test_vectorize(): func = num.vectorize(my_func) a = 
num.arange(5) @@ -75,6 +83,20 @@ def test_vectorize(): a[:, 2, :] = func_np(a[:, 2, :],2) func_num(a_num[:, 2, :],2) assert np.array_equal(a, a_num) + + a=np.arange(100).reshape((25,4)) + a_num= num.array(a) + + b=a*10 + b_num=a_num*10 + + func_np = np.vectorize(my_func_np2) + func_num=num.vectorize(my_func2) + + a=func_np(a,b) + func_num(a_num, b_num) + assert np.array_equal(a, a_num) + if __name__ == "__main__": From cd6d0256752f5db00ebb10b43491e3d214581a36 Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Tue, 28 Feb 2023 15:25:42 -0800 Subject: [PATCH 30/78] fixing CUfunction caching logic --- cunumeric/vectorize.py | 30 +++++++++++---------- src/cunumeric/cuda_help.h | 8 +++--- src/cunumeric/vectorize/create_cu_kernel.cu | 4 +-- src/cunumeric/vectorize/eval_udf.cc | 2 +- src/cunumeric/vectorize/eval_udf.cu | 21 ++++++++++----- src/cunumeric/vectorize/eval_udf.h | 5 ++-- 6 files changed, 39 insertions(+), 31 deletions(-) diff --git a/cunumeric/vectorize.py b/cunumeric/vectorize.py index bfa504e234..d970095c25 100644 --- a/cunumeric/vectorize.py +++ b/cunumeric/vectorize.py @@ -20,6 +20,7 @@ from typing import Any, Callable, Dict, List, Optional, Union import legate.core.types as ty +from legate.core import Rect import numba import numba.core.ccallback import numpy as np @@ -183,8 +184,8 @@ def _get_func_body(self, func: Callable[[Any], Any]) -> list[str]: return return_lines def _replace_name(self, name: str, _LOOP_VAR:str, is_gpu:bool=False) -> str: - print("IRINA DEBUG ARGNAMES =", self._argnames) - print("IRINA DEBUG SCALAR_NAMES =", self._scalar_names) + #print("IRINA DEBUG ARGNAMES =", self._argnames) + #print("IRINA DEBUG SCALAR_NAMES =", self._scalar_names) if name in self._argnames and not(name in self._scalar_names) : return "{}[{}]".format(name, _LOOP_VAR) else: @@ -314,20 +315,19 @@ def _compile_func_cpu(self) -> numba.core.ccallback.CFunc: return numba.cfunc(sig)(self._numba_func) - def _execute(self, is_gpu:bool) -> None: + def _execute(self, is_gpu:bool, num_gpus:int=0) -> None: if is_gpu and not self._created: - #create CUDA kernel - kernel_task = self._context.create_auto_task(CuNumericOpCode.CREATE_CU_KERNEL) + # create future for dependency between CREATE_CU_KERNEL and + # EVAL_UDF tasks + future = convert_to_cunumeric_ndarray(num_gpus) + future_deferred = runtime.to_deferred_array(future._thunk) + # create CUDA kernel + launch_domain=Rect(lo=(0,), hi=(num_gpus,)) + kernel_task = self._context.create_task(CuNumericOpCode.CREATE_CU_KERNEL,manual=True, launch_domain=launch_domain) ptx_hash = hash(self._gpu_func[0]) kernel_task.add_scalar_arg(ptx_hash, ty.int64) kernel_task.add_scalar_arg(self._gpu_func[0], ty.string) - #adding unused array for creating correct launch domain - #and set up dependency between kernel_task and task - if len(self._args)>0: - a0 = self._args[0]._thunk - a0 = runtime.to_deferred_array(a0) - kernel_task.add_input(a0.base) - kernel_task.add_output(a0.base) + kernel_task.add_output(future_deferred.base) kernel_task.execute() @@ -342,7 +342,7 @@ def _execute(self, is_gpu:bool) -> None: if is_gpu: ptx_hash = hash(self._gpu_func[0]) task.add_scalar_arg(ptx_hash, ty.int64) - task.add_scalar_arg(self._created, bool) + task.add_scalar_arg((is_gpu and not self._created), bool) else: task.add_scalar_arg(self._cpu_func.address, ty.uint64) # type : ignore a0 = self._args[0]._thunk @@ -355,6 +355,8 @@ def _execute(self, is_gpu:bool) -> None: task.add_output(a_tmp) if count != 0: task.add_alignment(a0.base, a_tmp) + if is_gpu and not self._created: + 
task.add_input(future_deferred.base) task.execute() def __call__(self, *args: Any, **kwargs: Any) -> None: @@ -431,7 +433,7 @@ def __call__(self, *args: Any, **kwargs: Any) -> None: self._gpu_func = self._compile_func_gpu() #profiler = cProfile.Profile() #profiler.enable() - self._execute(True) + self._execute(True, runtime.num_gpus) if not self._created and self._cache: self._created = True #profiler.disable() diff --git a/src/cunumeric/cuda_help.h b/src/cunumeric/cuda_help.h index 497815bad3..ce4504d4c0 100644 --- a/src/cunumeric/cuda_help.h +++ b/src/cunumeric/cuda_help.h @@ -397,7 +397,7 @@ class JITKernelStorage private: JITKernelStorage(){} - std::map, CUfunction> jit_functions_; + std::map, CUfunction> jit_functions_; public: JITKernelStorage( JITKernelStorage const&) = delete; @@ -409,11 +409,11 @@ class JITKernelStorage return instance; } - bool registered_jit_funtion(std::pair &key){ + bool registered_jit_funtion(std::pair &key){ return jit_functions_.find(key)!=jit_functions_.end(); }; - CUfunction return_saved_jit_function(std::pair &key){ + CUfunction return_saved_jit_function(std::pair &key){ if ( jit_functions_.find(key)!=jit_functions_.end()) return jit_functions_[key]; @@ -421,7 +421,7 @@ class JITKernelStorage assert(false);//should never come here } - void add_jit_function(std::pair &key, CUfunction func){ + void add_jit_function(std::pair &key, CUfunction func){ jit_functions_.insert({key, func}); } };//class JITKernelStorage diff --git a/src/cunumeric/vectorize/create_cu_kernel.cu b/src/cunumeric/vectorize/create_cu_kernel.cu index b3ba432cc2..61bc0124cd 100644 --- a/src/cunumeric/vectorize/create_cu_kernel.cu +++ b/src/cunumeric/vectorize/create_cu_kernel.cu @@ -29,11 +29,11 @@ using namespace legate; int64_t ptx_hash = context.scalars()[0].value(); std::string ptx = context.scalars()[1].value(); - DomainPoint point = context.get_task_index(); + Processor point = context.get_current_processor(); JITKernelStorage& jit_storage =JITKernelStorage::get_instance(); CUfunction func; - std::pair key(ptx_hash, point); + std::pair key(ptx_hash, point); if (!jit_storage.registered_jit_funtion(key)){ const unsigned num_options = 4; const size_t log_buffer_size = 16384; diff --git a/src/cunumeric/vectorize/eval_udf.cc b/src/cunumeric/vectorize/eval_udf.cc index bef822f464..92f8cd739f 100644 --- a/src/cunumeric/vectorize/eval_udf.cc +++ b/src/cunumeric/vectorize/eval_udf.cc @@ -66,7 +66,7 @@ struct EvalUdfCPU { context.outputs(), scalars, num_outputs, - context.get_task_index()}; + context.get_current_processor()}; size_t dim=1; if (args.inputs.size()>0){ dim = args.inputs[0].dim() == 0 ? 
1 : args.inputs[0].dim(); diff --git a/src/cunumeric/vectorize/eval_udf.cu b/src/cunumeric/vectorize/eval_udf.cu index e307982be3..428d8c5ed3 100644 --- a/src/cunumeric/vectorize/eval_udf.cu +++ b/src/cunumeric/vectorize/eval_udf.cu @@ -36,7 +36,7 @@ struct EvalUdfGPU { //std::hash hasher; CUfunction func; - std::pair key(args.hash, args.point); + std::pair key(args.hash, args.point); //size_t ptx_hash = hasher(args.ptx); //std::cout <<"IRINA DEBUG within cuda task hash = "<0){ rect = args.inputs[0].shape(); size = rect.volume(); - for (size_t i = 0; i < args.inputs.size(); i++) { + for (size_t i = 0; i < input_size; i++) { if (i < args.num_outputs) { auto out = args.outputs[i].write_accessor(rect); *reinterpret_cast(p) = out.ptr(rect, strides); @@ -128,12 +133,13 @@ struct EvalUdfGPU { auto stream = get_cached_stream(); + //std::cout <<"function = "<(); - // bool is_created = context.scalars()[3+num_scalars].value(); + bool is_created = context.scalars()[3+num_scalars].value(); EvalUdfArgs args{0, @@ -158,8 +164,9 @@ struct EvalUdfGPU { context.outputs(), scalars, num_outputs, - context.get_task_index(), - ptx_hash}; + context.get_current_processor(), + ptx_hash, + is_created}; //if (!is_created) // args.ptx = context.scalars()[4+num_scalars].value(); size_t dim=1; diff --git a/src/cunumeric/vectorize/eval_udf.h b/src/cunumeric/vectorize/eval_udf.h index aac5aade92..4d867a003f 100644 --- a/src/cunumeric/vectorize/eval_udf.h +++ b/src/cunumeric/vectorize/eval_udf.h @@ -27,10 +27,9 @@ struct EvalUdfArgs { std::vector& outputs; std::vectorscalars; uint32_t num_outputs; - Legion::DomainPoint point; + Legion::Processor point; int64_t hash=0; - std::string ptx = ""; - + bool is_created = false; }; class EvalUdfTask : public CuNumericTask { From c7b9a080c1445e7b5eab702d595079cd089fc506 Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Tue, 28 Feb 2023 16:40:38 -0800 Subject: [PATCH 31/78] adding debug output --- src/cunumeric/vectorize/create_cu_kernel.cu | 3 +++ src/cunumeric/vectorize/eval_udf.cu | 3 +++ 2 files changed, 6 insertions(+) diff --git a/src/cunumeric/vectorize/create_cu_kernel.cu b/src/cunumeric/vectorize/create_cu_kernel.cu index 61bc0124cd..eaddf6d3ec 100644 --- a/src/cunumeric/vectorize/create_cu_kernel.cu +++ b/src/cunumeric/vectorize/create_cu_kernel.cu @@ -90,6 +90,9 @@ using namespace legate; #ifdef DEBUG_CUNUMERIC assert(result == CUDA_SUCCESS); #endif + + std::cout <<"IRINA DEBUG create_func proc = "< Any: - funcid = "vectorized_{}".format(self._pyfunc.__name__) # Preamble lines = ["from numba import cuda"] # Signature - args = self._argnames + [_SIZE_VAR]+[_DIM_VAR]+[_PITCHES_VAR]+[_LO_POINT_VAR] +[_STRIDES_VAR] + args = ( + self._argnames + + [_SIZE_VAR] + + [_DIM_VAR] + + [_PITCHES_VAR] + + [_LO_POINT_VAR] + + [_STRIDES_VAR] + ) lines.append("def {}({}):".format(funcid, ",".join(args))) @@ -211,10 +218,24 @@ def _build_gpu_function(self) -> Any: lines.append(" return") lines.append(" {}:int = 0".format(_LOOP_VAR)) lines.append(" for p in range({}-1):".format(_DIM_VAR)) - lines.append(" x={}[p]+int(local_i/{}[p])".format(_LO_POINT_VAR,_PITCHES_VAR)) - lines.append(" local_i = local_i-{}[p]*int(local_i/{}[p])".format(_PITCHES_VAR,_PITCHES_VAR)) - lines.append(" {}+=int(x*{}[p])".format(_LOOP_VAR, _STRIDES_VAR)) - lines.append(" {}+=int(local_i*{}[{}-1])".format(_LOOP_VAR, _STRIDES_VAR, _DIM_VAR)) + lines.append( + " x={}[p]+int(local_i/{}[p])".format( + _LO_POINT_VAR, _PITCHES_VAR + ) + ) + lines.append( + " local_i = local_i-{}[p]*int(local_i/{}[p])".format( + 
_PITCHES_VAR, _PITCHES_VAR + ) + ) + lines.append( + " {}+=int(x*{}[p])".format(_LOOP_VAR, _STRIDES_VAR) + ) + lines.append( + " {}+=int(local_i*{}[{}-1])".format( + _LOOP_VAR, _STRIDES_VAR, _DIM_VAR + ) + ) # Kernel body def _lift_to_array_access(m: Any) -> str: @@ -225,9 +246,9 @@ def _lift_to_array_access(m: Any) -> str: for line in lines_old: l_new = re.sub(r"[_a-zA-Z]\w*", _lift_to_array_access, line) lines.append(l_new) - - #print("IRINA DEBUG GPU function",lines) - + + # print("IRINA DEBUG GPU function",lines) + # Evaluate the string to get the Python function body = "\n".join(lines) glbs: Dict[str, Any] = {} @@ -235,7 +256,6 @@ def _lift_to_array_access(m: Any) -> str: return glbs[funcid] def _build_cpu_function(self) -> Callable[[Any], Any]: - funcid = "vectorized_{}".format(self._pyfunc.__name__) # Preamble @@ -262,7 +282,9 @@ def _emit_assignment( arg_idx += 1 for a in self._scalar_args: scalar_type = np.dtype(type(a).__name__) - _emit_assignment(self._argnames[arg_idx], arg_idx, _SIZE_VAR, scalar_type) + _emit_assignment( + self._argnames[arg_idx], arg_idx, _SIZE_VAR, scalar_type + ) arg_idx += 1 # Main loop @@ -302,7 +324,14 @@ def _get_numba_types(self, need_pointer: bool = True) -> list[Any]: def _compile_func_gpu(self) -> tuple[Any]: types = self._get_numba_types() - arg_types = types + [numba.core.types.uint64] + [numba.core.types.uint64]+[numba.core.types.CPointer(numba.core.types.uint64)]+ [numba.core.types.CPointer(numba.core.types.uint64)]+[numba.core.types.CPointer(numba.core.types.uint64)] + arg_types = ( + types + + [numba.core.types.uint64] + + [numba.core.types.uint64] + + [numba.core.types.CPointer(numba.core.types.uint64)] + + [numba.core.types.CPointer(numba.core.types.uint64)] + + [numba.core.types.CPointer(numba.core.types.uint64)] + ) sig = (*arg_types,) cuda_arch = numba.cuda.get_current_device().compute_capability @@ -315,41 +344,47 @@ def _compile_func_cpu(self) -> numba.core.ccallback.CFunc: return numba.cfunc(sig)(self._numba_func) - def _execute(self, is_gpu:bool, num_gpus:int=0) -> None: + def _execute(self, is_gpu: bool, num_gpus: int = 0) -> None: if is_gpu and not self._created: - # create future for dependency between CREATE_CU_KERNEL and + # create future for dependency between CREATE_CU_KERNEL and # EVAL_UDF tasks - future = convert_to_cunumeric_ndarray(num_gpus) + future = convert_to_cunumeric_ndarray(num_gpus) future_deferred = runtime.to_deferred_array(future._thunk) # create CUDA kernel - launch_domain=Rect(lo=(0,), hi=(num_gpus,)) - kernel_task = self._context.create_task(CuNumericOpCode.CREATE_CU_KERNEL,manual=True, launch_domain=launch_domain) + launch_domain = Rect(lo=(0,), hi=(num_gpus,)) + kernel_task = self._context.create_task( + CuNumericOpCode.CREATE_CU_KERNEL, + manual=True, + launch_domain=launch_domain, + ) ptx_hash = hash(self._gpu_func[0]) kernel_task.add_scalar_arg(ptx_hash, ty.int64) kernel_task.add_scalar_arg(self._gpu_func[0], ty.string) - kernel_task.add_output(future_deferred.base) + kernel_task.add_input(future_deferred.base) kernel_task.execute() - + get_legate_runtime().issue_execution_fence(block=True) task = self._context.create_auto_task(CuNumericOpCode.EVAL_UDF) task.add_scalar_arg(self._num_outputs, ty.uint32) task.add_scalar_arg(len(self._scalar_args), ty.uint32) - + for a in self._scalar_args: dtype = convert_to_cunumeric_dtype(type(a).__name__) - task.add_scalar_arg(a,dtype) + task.add_scalar_arg(a, dtype) if is_gpu: ptx_hash = hash(self._gpu_func[0]) task.add_scalar_arg(ptx_hash, ty.int64) 
task.add_scalar_arg((is_gpu and not self._created), bool) else: - task.add_scalar_arg(self._cpu_func.address, ty.uint64) # type : ignore + task.add_scalar_arg( + self._cpu_func.address, ty.uint64 + ) # type : ignore a0 = self._args[0]._thunk a0 = runtime.to_deferred_array(a0) for count, a in enumerate(self._args): a_tmp = runtime.to_deferred_array(a._thunk) - a_tmp=a_tmp.base + a_tmp = a_tmp.base task.add_input(a_tmp) if count < self._num_outputs: task.add_output(a_tmp) @@ -364,8 +399,8 @@ def __call__(self, *args: Any, **kwargs: Any) -> None: Return arrays with the results of `pyfunc` broadcast (vectorized) over `args` and `kwargs` not in `excluded`. """ - #profiler = cProfile.Profile() - #profiler.enable() + # profiler = cProfile.Profile() + # profiler.enable() if not self._created: self._scalar_args.clear() self._scalar_idxs.clear() @@ -373,24 +408,24 @@ def __call__(self, *args: Any, **kwargs: Any) -> None: self._argnames.clear() self._scalar_names.clear() - for i,arg in enumerate(args): + for i, arg in enumerate(args): if arg is None: raise ValueError( "None is not supported in user function " "passed to cunumeric.vectorize" ) - elif np.ndim(arg)==0: + elif np.ndim(arg) == 0: self._scalar_args.append(arg) self._scalar_idxs.append(i) else: self._args.append(convert_to_cunumeric_ndarray(arg)) - #first fill arrays to argnames, then scalars: - for i,k in enumerate(inspect.signature(self._pyfunc).parameters): - if not(i in self._scalar_idxs): + # first fill arrays to argnames, then scalars: + for i, k in enumerate(inspect.signature(self._pyfunc).parameters): + if not (i in self._scalar_idxs): self._argnames.append(k) - for i,k in enumerate(inspect.signature(self._pyfunc).parameters): + for i, k in enumerate(inspect.signature(self._pyfunc).parameters): if i in self._scalar_idxs: self._scalar_names.append(k) self._argnames.append(k) @@ -401,44 +436,50 @@ def __call__(self, *args: Any, **kwargs: Any) -> None: "kwargs are not supported in user functions" ) - #all output arrays should have the same type - if len(self._args)>0: + # all output arrays should have the same type + if len(self._args) > 0: ty = self._args[0].dtype shape = self._args[0].shape - for i in range (1, self._num_outputs): - if ty!=self._args[i].dtype: - raise TypeError("cuNumeric doesnt support " + for i in range(1, self._num_outputs): + if ty != self._args[i].dtype: + raise TypeError( + "cuNumeric doesnt support " "different types for output data in " - "user function passed to vectorize") + "user function passed to vectorize" + ) if shape != self._args[i].shape: - raise TypeError("cuNumeric doesnt support " + raise TypeError( + "cuNumeric doesnt support " "different shapes for output data in " - "user function passed to vectorize") - for i in range (self._num_outputs, len(self._args)): - if ty!=self._args[i].dtype: + "user function passed to vectorize" + ) + for i in range(self._num_outputs, len(self._args)): + if ty != self._args[i].dtype: runtime.warn( "converting input array to output types in user func ", category=RuntimeWarning, ) self._args[i] = self._args[i].astype(ty) - if shape !=self._args[i].shape and np.ndim(self._args[i])>0: - raise TypeError("cuNumeric doesnt support " + if shape != self._args[i].shape and np.ndim(self._args[i]) > 0: + raise TypeError( + "cuNumeric doesnt support " "different shapes for arrays in " - "user function passed to vectorize") + "user function passed to vectorize" + ) if runtime.num_gpus > 0: if not self._created: - #print("IRINA DEBUG ptx is not created yet") + # print("IRINA DEBUG 
ptx is not created yet") self._numba_func = self._build_gpu_function() self._gpu_func = self._compile_func_gpu() - #profiler = cProfile.Profile() - #profiler.enable() + # profiler = cProfile.Profile() + # profiler.enable() self._execute(True, runtime.num_gpus) if not self._created and self._cache: self._created = True - #profiler.disable() - #stats = pstats.Stats(profiler).sort_stats('cumtime') - #stats.print_stats() + # profiler.disable() + # stats = pstats.Stats(profiler).sort_stats('cumtime') + # stats.print_stats() else: if not self._created: self._numba_func = self._build_cpu_function() @@ -447,9 +488,6 @@ def __call__(self, *args: Any, **kwargs: Any) -> None: self._created = True self._execute(False) - - #profiler.disable() - #stats = pstats.Stats(profiler).sort_stats('cumtime') - #stats.print_stats() - - + # profiler.disable() + # stats = pstats.Stats(profiler).sort_stats('cumtime') + # stats.print_stats() diff --git a/cunumeric_cpp.cmake b/cunumeric_cpp.cmake index c8d89694c6..e0305c4834 100644 --- a/cunumeric_cpp.cmake +++ b/cunumeric_cpp.cmake @@ -208,6 +208,7 @@ if(Legion_USE_OpenMP) src/cunumeric/convolution/convolve_omp.cc src/cunumeric/transform/flip_omp.cc src/cunumeric/vectorize/eval_udf_omp.cc + src/cunumeric/vectorize/create_cu_kernel_omp.cc ) endif() diff --git a/src/cunumeric/vectorize/create_cu_kernel.cc b/src/cunumeric/vectorize/create_cu_kernel.cc index 1d6b5d3950..effcb32c95 100644 --- a/src/cunumeric/vectorize/create_cu_kernel.cc +++ b/src/cunumeric/vectorize/create_cu_kernel.cc @@ -18,21 +18,16 @@ namespace cunumeric { -using namespace Legion; using namespace legate; +/*static*/ void CreateCUKernelTask::cpu_variant(TaskContext& context){}; -/*static*/ void CreateCUKernelTask::cpu_variant(TaskContext& context) +namespace // unnamed { -} -/*static*/ void CreateCUKernelTask::omp_variant(TaskContext& context) +static void __attribute__((constructor)) register_tasks(void) { + CreateCUKernelTask::register_variants(); } - - -namespace // unnamed -{ -static void __attribute__((constructor)) register_tasks(void) { CreateCUKernelTask::register_variants(); } } // namespace } // namespace cunumeric diff --git a/src/cunumeric/vectorize/create_cu_kernel_omp.cc b/src/cunumeric/vectorize/create_cu_kernel_omp.cc new file mode 100644 index 0000000000..40cc28f6c7 --- /dev/null +++ b/src/cunumeric/vectorize/create_cu_kernel_omp.cc @@ -0,0 +1,25 @@ +/* Copyright 20223 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + */ + +#include "cunumeric/vectorize/create_cu_kernel.h" + +namespace cunumeric { + +using namespace legate; + +/*static*/ void CreateCUKernelTask::omp_variant(TaskContext& context) {} + +} // namespace cunumeric From 3f8a3073ace4237d38735b1e509e2cf7554d5552 Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Wed, 1 Mar 2023 11:06:31 -0800 Subject: [PATCH 33/78] changing the way we store CUfunctions --- cunumeric/vectorize.py | 26 +++++++++++++-------- src/cunumeric/vectorize/create_cu_kernel.cu | 17 +++++++------- 2 files changed, 25 insertions(+), 18 deletions(-) diff --git a/cunumeric/vectorize.py b/cunumeric/vectorize.py index dec4afde86..7b1a9ecea2 100644 --- a/cunumeric/vectorize.py +++ b/cunumeric/vectorize.py @@ -24,7 +24,7 @@ import numba.core.ccallback import numpy as np import six -from legate.core import Rect, get_legate_runtime +from legate.core import Rect, get_legate_runtime, ReductionOp from cunumeric.runtime import runtime @@ -128,6 +128,10 @@ def __init__( self._created: bool = False self._cache: bool = cache self._num_outputs = 1 # there is at least 1 output + self._proc_ids = runtime.create_empty_thunk( + (runtime.num_gpus,), ty.int64, inputs=[]) + self._cu_func_pointers = runtime.create_empty_thunk( + (runtime.num_gpus,), ty.int64, inputs=[]) if doc is None: self.__doc__ = pyfunc.__doc__ @@ -345,11 +349,8 @@ def _compile_func_cpu(self) -> numba.core.ccallback.CFunc: return numba.cfunc(sig)(self._numba_func) def _execute(self, is_gpu: bool, num_gpus: int = 0) -> None: + print("IRINA DEBUG in execute") if is_gpu and not self._created: - # create future for dependency between CREATE_CU_KERNEL and - # EVAL_UDF tasks - future = convert_to_cunumeric_ndarray(num_gpus) - future_deferred = runtime.to_deferred_array(future._thunk) # create CUDA kernel launch_domain = Rect(lo=(0,), hi=(num_gpus,)) kernel_task = self._context.create_task( @@ -358,11 +359,13 @@ def _execute(self, is_gpu: bool, num_gpus: int = 0) -> None: launch_domain=launch_domain, ) ptx_hash = hash(self._gpu_func[0]) + print("IRINA DEBUG creating CUkernel for hash = ", ptx_hash) kernel_task.add_scalar_arg(ptx_hash, ty.int64) kernel_task.add_scalar_arg(self._gpu_func[0], ty.string) - kernel_task.add_input(future_deferred.base) + kernel_task.add_output(self._proc_ids.base) + kernel_task.add_output(self._cu_func_pointers.base) kernel_task.execute() - get_legate_runtime().issue_execution_fence(block=True) + #get_legate_runtime().issue_execution_fence(block=True) task = self._context.create_auto_task(CuNumericOpCode.EVAL_UDF) task.add_scalar_arg(self._num_outputs, ty.uint32) @@ -374,8 +377,13 @@ def _execute(self, is_gpu: bool, num_gpus: int = 0) -> None: if is_gpu: ptx_hash = hash(self._gpu_func[0]) + print("IRINA DEBUG executing UDF for hash = ", ptx_hash) task.add_scalar_arg(ptx_hash, ty.int64) - task.add_scalar_arg((is_gpu and not self._created), bool) + kernel_task.add_intput(self._proc_ids.base) + kernel_task.add_intput(self._cu_func_pointers.base) + task.add_broadcast(self._proc_ids.base) + task.add_broadcast(self._cu_func_pointers.base) + else: task.add_scalar_arg( self._cpu_func.address, ty.uint64 @@ -390,8 +398,6 @@ def _execute(self, is_gpu: bool, num_gpus: int = 0) -> None: task.add_output(a_tmp) if count != 0: task.add_alignment(a0.base, a_tmp) - if is_gpu and not self._created: - task.add_input(future_deferred.base) task.execute() def __call__(self, *args: Any, **kwargs: Any) -> None: diff --git a/src/cunumeric/vectorize/create_cu_kernel.cu b/src/cunumeric/vectorize/create_cu_kernel.cu index 
eaddf6d3ec..7cb77e5352 100644 --- a/src/cunumeric/vectorize/create_cu_kernel.cu +++ b/src/cunumeric/vectorize/create_cu_kernel.cu @@ -30,11 +30,14 @@ using namespace legate; int64_t ptx_hash = context.scalars()[0].value(); std::string ptx = context.scalars()[1].value(); Processor point = context.get_current_processor(); - JITKernelStorage& jit_storage =JITKernelStorage::get_instance(); + //JITKernelStorage& jit_storage =JITKernelStorage::get_instance(); + auto rect = context.outputs()[0].shape<1>; + auto procs = context.outputs()[0].write_accessor().ptr(rect); + auto funcs = context.outputs()[1].write_accessor().ptr(rect); + procs[0]=point; + CUfunction func; - std::pair key(ptx_hash, point); - if (!jit_storage.registered_jit_funtion(key)){ const unsigned num_options = 4; const size_t log_buffer_size = 16384; std::vector log_info_buffer(log_buffer_size); @@ -90,11 +93,9 @@ using namespace legate; #ifdef DEBUG_CUNUMERIC assert(result == CUDA_SUCCESS); #endif - - std::cout <<"IRINA DEBUG create_func proc = "< str: def _get_numba_types(self, need_pointer: bool = True) -> list[Any]: types = [] for arg in self._args: - ty = arg.dtype - ty = str(ty) if ty != bool else "int8" - ty = getattr(numba.core.types, ty) - ty = numba.core.types.CPointer(ty) - types.append(ty) + type_a = arg.dtype + type_a = str(type_a) if type_a != bool else "int8" + type_a = getattr(numba.core.types, type_a) + type_a = numba.core.types.CPointer(type_a) + types.append(type_a) for arg in self._scalar_args: - ty = np.dtype(type(arg).__name__) - ty = str(ty) if ty != bool else "int8" - ty = getattr(numba.core.types, ty) - types.append(ty) + type_a = np.dtype(type(arg).__name__) + type_a = str(type_a) if type_a != bool else "int8" + type_a = getattr(numba.core.types, type_a) + types.append(type_a) return types def _compile_func_gpu(self) -> tuple[Any]: @@ -355,16 +358,19 @@ def _execute(self, is_gpu: bool, num_gpus: int = 0) -> None: launch_domain = Rect(lo=(0,), hi=(num_gpus,)) kernel_task = self._context.create_task( CuNumericOpCode.CREATE_CU_KERNEL, - manual=True, + #manual=True, launch_domain=launch_domain, ) ptx_hash = hash(self._gpu_func[0]) print("IRINA DEBUG creating CUkernel for hash = ", ptx_hash) kernel_task.add_scalar_arg(ptx_hash, ty.int64) kernel_task.add_scalar_arg(self._gpu_func[0], ty.string) - kernel_task.add_output(self._proc_ids.base) - kernel_task.add_output(self._cu_func_pointers.base) + kernel_task.add_input(self._proc_ids_deferred.base) + kernel_task.add_input(self._cu_func_pointers_deferred.base) + kernel_task.add_output(self._proc_ids_deferred.base) + kernel_task.add_output(self._cu_func_pointers_deferred.base) kernel_task.execute() + print("IRINA DEBUG proc_ids =", self._proc_ids); #get_legate_runtime().issue_execution_fence(block=True) task = self._context.create_auto_task(CuNumericOpCode.EVAL_UDF) @@ -375,19 +381,6 @@ def _execute(self, is_gpu: bool, num_gpus: int = 0) -> None: dtype = convert_to_cunumeric_dtype(type(a).__name__) task.add_scalar_arg(a, dtype) - if is_gpu: - ptx_hash = hash(self._gpu_func[0]) - print("IRINA DEBUG executing UDF for hash = ", ptx_hash) - task.add_scalar_arg(ptx_hash, ty.int64) - kernel_task.add_intput(self._proc_ids.base) - kernel_task.add_intput(self._cu_func_pointers.base) - task.add_broadcast(self._proc_ids.base) - task.add_broadcast(self._cu_func_pointers.base) - - else: - task.add_scalar_arg( - self._cpu_func.address, ty.uint64 - ) # type : ignore a0 = self._args[0]._thunk a0 = runtime.to_deferred_array(a0) for count, a in enumerate(self._args): @@ -398,6 
+391,20 @@ def _execute(self, is_gpu: bool, num_gpus: int = 0) -> None: task.add_output(a_tmp) if count != 0: task.add_alignment(a0.base, a_tmp) + + if is_gpu: + ptx_hash = hash(self._gpu_func[0]) + print("IRINA DEBUG executing UDF for hash = ", ptx_hash) + task.add_scalar_arg(ptx_hash, ty.int64) + task.add_input(self._proc_ids_deferred.base) + task.add_input(self._cu_func_pointers_deferred.base) + task.add_broadcast(self._proc_ids_deferred.base) + task.add_broadcast(self._cu_func_pointers_deferred.base) + + else: + task.add_scalar_arg( + self._cpu_func.address, ty.uint64 + ) # type : ignore task.execute() def __call__(self, *args: Any, **kwargs: Any) -> None: @@ -444,10 +451,10 @@ def __call__(self, *args: Any, **kwargs: Any) -> None: # all output arrays should have the same type if len(self._args) > 0: - ty = self._args[0].dtype + type_a = self._args[0].dtype shape = self._args[0].shape for i in range(1, self._num_outputs): - if ty != self._args[i].dtype: + if type_a != self._args[i].dtype: raise TypeError( "cuNumeric doesnt support " "different types for output data in " @@ -460,12 +467,12 @@ def __call__(self, *args: Any, **kwargs: Any) -> None: "user function passed to vectorize" ) for i in range(self._num_outputs, len(self._args)): - if ty != self._args[i].dtype: + if type_a != self._args[i].dtype: runtime.warn( "converting input array to output types in user func ", category=RuntimeWarning, ) - self._args[i] = self._args[i].astype(ty) + self._args[i] = self._args[i].astype(type_a) if shape != self._args[i].shape and np.ndim(self._args[i]) > 0: raise TypeError( "cuNumeric doesnt support " diff --git a/src/cunumeric/vectorize/create_cu_kernel.cu b/src/cunumeric/vectorize/create_cu_kernel.cu index 7cb77e5352..e0c56b9075 100644 --- a/src/cunumeric/vectorize/create_cu_kernel.cu +++ b/src/cunumeric/vectorize/create_cu_kernel.cu @@ -31,11 +31,15 @@ using namespace legate; std::string ptx = context.scalars()[1].value(); Processor point = context.get_current_processor(); //JITKernelStorage& jit_storage =JITKernelStorage::get_instance(); - auto rect = context.outputs()[0].shape<1>; - auto procs = context.outputs()[0].write_accessor().ptr(rect); - auto funcs = context.outputs()[1].write_accessor().ptr(rect); - procs[0]=point; - + std::vector &outputs =context.outputs(); + auto rect = outputs[0].shape<1>(); + auto procs = outputs[0].write_accessor(rect); + auto funcs = outputs[1].write_accessor(rect); + //FIXME check if dense) + auto procs_ptr = procs.ptr(rect); + auto funcs_ptr = funcs.ptr(rect); + procs_ptr[0]=point.id; + std::cout <<"INSIDE OF THE CREATE FUNCTION "<(func); // std::cout <<"IRINA DEBUG create_func proc = "<(); + + std::cout<<"IRINA DEBUG proc rect = "< class JITKernelStorage @@ -426,5 +429,5 @@ class JITKernelStorage } };//class JITKernelStorage - +#endif } // namespace cunumeric diff --git a/src/cunumeric/cudalibs.cu b/src/cunumeric/cudalibs.cu index 7d3ab8a098..45d33d752f 100644 --- a/src/cunumeric/cudalibs.cu +++ b/src/cunumeric/cudalibs.cu @@ -233,6 +233,19 @@ cufftContext CUDALibraries::get_cufft_plan(cufftType type, const DomainPoint& si return cufftContext(cache->get_cufft_plan(size)); } +void CUDALibraries::store_udf_func(size_t hash, CUfunction func){ + udf_caches_[hash]=func; +} + +CUfunction CUDALibraries::get_udf_func(size_t hash){ + auto finder = udf_caches_.find(hash); + if (udf_caches_.end() == finder) { + fprintf(stderr, "UDF function wasn't generated yet"); + LEGATE_ABORT; + } + return udf_caches_[hash]; +} + static CUDALibraries& 
get_cuda_libraries(legate::Processor proc) { if (proc.kind() != legate::Processor::TOC_PROC) { @@ -278,6 +291,18 @@ cufftContext get_cufft_plan(cufftType type, const DomainPoint& size) return lib.get_cufft_plan(type, size); } +void store_udf(size_t hash, CUfunction func){ + const auto proc = legate::Processor::get_executing_processor(); + auto& lib = get_cuda_libraries(proc); + lib.store_udf_func(hash, func); +} + +CUfunction get_udf(size_t hash){ + const auto proc = legate::Processor::get_executing_processor(); + auto& lib = get_cuda_libraries(proc); + return lib.get_udf_func(hash); +} + class LoadCUDALibsTask : public CuNumericTask { public: static const int TASK_ID = CUNUMERIC_LOAD_CUDALIBS; diff --git a/src/cunumeric/cudalibs.h b/src/cunumeric/cudalibs.h index f2f01fffe1..8f91f3aad3 100644 --- a/src/cunumeric/cudalibs.h +++ b/src/cunumeric/cudalibs.h @@ -38,6 +38,8 @@ struct CUDALibraries { cusolverDnHandle_t get_cusolver(); cutensorHandle_t* get_cutensor(); cufftContext get_cufft_plan(cufftType type, const legate::DomainPoint& size); + void store_udf_func(size_t hash, CUfunction func); + CUfunction get_udf_func(size_t hash); private: void finalize_cublas(); @@ -50,6 +52,7 @@ struct CUDALibraries { cusolverDnContext* cusolver_; cutensorHandle_t* cutensor_; std::map plan_caches_; + std::map udf_caches_; }; } // namespace cunumeric diff --git a/src/cunumeric/vectorize/create_cu_kernel.cu b/src/cunumeric/vectorize/create_cu_kernel.cu index e0c56b9075..6481727a00 100644 --- a/src/cunumeric/vectorize/create_cu_kernel.cu +++ b/src/cunumeric/vectorize/create_cu_kernel.cu @@ -30,16 +30,8 @@ using namespace legate; int64_t ptx_hash = context.scalars()[0].value(); std::string ptx = context.scalars()[1].value(); Processor point = context.get_current_processor(); - //JITKernelStorage& jit_storage =JITKernelStorage::get_instance(); - std::vector &outputs =context.outputs(); - auto rect = outputs[0].shape<1>(); - auto procs = outputs[0].write_accessor(rect); - auto funcs = outputs[1].write_accessor(rect); + auto procs = context.outputs()[0].write_accessor(); //FIXME check if dense) - auto procs_ptr = procs.ptr(rect); - auto funcs_ptr = funcs.ptr(rect); - procs_ptr[0]=point.id; - std::cout <<"INSIDE OF THE CREATE FUNCTION "<(func); + store_udf(ptx_hash, func); // std::cout <<"IRINA DEBUG create_func proc = "<(funcs[proc_point]); + CUfunction func = get_udf(args.hash); // Filling up the buffer with arguments size_t buffer_size = (input_size+args.scalars.size()) * sizeof(void*); From 42b82776954cb8e139e7496e5c3c26f7faffe40c Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Thu, 2 Mar 2023 21:35:15 -0800 Subject: [PATCH 36/78] fixing dependency between creation and use of UDF function --- cunumeric/vectorize.py | 28 ++++++++------------- src/cunumeric/vectorize/create_cu_kernel.cu | 15 ++++++++--- 2 files changed, 21 insertions(+), 22 deletions(-) diff --git a/cunumeric/vectorize.py b/cunumeric/vectorize.py index 55ad5a5e52..faea888d1c 100644 --- a/cunumeric/vectorize.py +++ b/cunumeric/vectorize.py @@ -129,12 +129,8 @@ def __init__( self._created: bool = False self._cache: bool = cache self._num_outputs = 1 # there is at least 1 output - self._proc_ids = zeros((runtime.num_gpus,), dtype=np.dtype(np.uint64)) - self._cu_func_pointers = zeros((runtime.num_gpus,), dtype=np.dtype(np.uint64)) - self._proc_ids_deferred = runtime.to_deferred_array(self._proc_ids._thunk) - self._cu_func_pointers_deferred = runtime.to_deferred_array(self._cu_func_pointers._thunk) - #runtime.create_empty_thunk( - 
#(runtime.num_gpus,), dtype = np.dtype(np.uint64), inputs=[]) + self._created_array = create_empty_thunk( + (1,), dtype = np.dtype(np.bool), inputs=[]) if doc is None: self.__doc__ = pyfunc.__doc__ @@ -352,7 +348,7 @@ def _compile_func_cpu(self) -> numba.core.ccallback.CFunc: return numba.cfunc(sig)(self._numba_func) def _execute(self, is_gpu: bool, num_gpus: int = 0) -> None: - print("IRINA DEBUG in execute") + #print("IRINA DEBUG in execute") if is_gpu and not self._created: # create CUDA kernel launch_domain = Rect(lo=(0,), hi=(num_gpus,)) @@ -365,12 +361,10 @@ def _execute(self, is_gpu: bool, num_gpus: int = 0) -> None: print("IRINA DEBUG creating CUkernel for hash = ", ptx_hash) kernel_task.add_scalar_arg(ptx_hash, ty.int64) kernel_task.add_scalar_arg(self._gpu_func[0], ty.string) - kernel_task.add_input(self._proc_ids_deferred.base) - kernel_task.add_input(self._cu_func_pointers_deferred.base) - kernel_task.add_output(self._proc_ids_deferred.base) - kernel_task.add_output(self._cu_func_pointers_deferred.base) + kernel_task.add_reduction(self._created_array.base,ReductionOp.MUL) kernel_task.execute() - print("IRINA DEBUG proc_ids =", self._proc_ids); + print("IRINA DEBUG created array= ",self._created_array); + self._created = bool(self._created_array[0]) #get_legate_runtime().issue_execution_fence(block=True) task = self._context.create_auto_task(CuNumericOpCode.EVAL_UDF) @@ -396,10 +390,10 @@ def _execute(self, is_gpu: bool, num_gpus: int = 0) -> None: ptx_hash = hash(self._gpu_func[0]) print("IRINA DEBUG executing UDF for hash = ", ptx_hash) task.add_scalar_arg(ptx_hash, ty.int64) - task.add_input(self._proc_ids_deferred.base) - task.add_input(self._cu_func_pointers_deferred.base) - task.add_broadcast(self._proc_ids_deferred.base) - task.add_broadcast(self._cu_func_pointers_deferred.base) + task.add_input(self._created_array.base) + #task.add_input(self._cu_func_pointers_deferred.base) + #task.add_broadcast(self._proc_ids_deferred.base) + #task.add_broadcast(self._cu_func_pointers_deferred.base) else: task.add_scalar_arg( @@ -488,8 +482,6 @@ def __call__(self, *args: Any, **kwargs: Any) -> None: # profiler = cProfile.Profile() # profiler.enable() self._execute(True, runtime.num_gpus) - if not self._created and self._cache: - self._created = True # profiler.disable() # stats = pstats.Stats(profiler).sort_stats('cumtime') # stats.print_stats() diff --git a/src/cunumeric/vectorize/create_cu_kernel.cu b/src/cunumeric/vectorize/create_cu_kernel.cu index 6481727a00..2b5477e17c 100644 --- a/src/cunumeric/vectorize/create_cu_kernel.cu +++ b/src/cunumeric/vectorize/create_cu_kernel.cu @@ -24,14 +24,19 @@ namespace cunumeric { using namespace Legion; using namespace legate; +__global__ static void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM) +fill_kernel(const AccessorRD out) +{ + reduce_output(out,true); +} + /*static*/ void CreateCUKernelTask::gpu_variant(TaskContext& context) { - + int64_t ptx_hash = context.scalars()[0].value(); std::string ptx = context.scalars()[1].value(); Processor point = context.get_current_processor(); auto procs = context.outputs()[0].write_accessor(); - //FIXME check if dense) CUfunction func; const unsigned num_options = 4; @@ -90,8 +95,10 @@ using namespace legate; assert(result == CUDA_SUCCESS); #endif store_udf(ptx_hash, func); - // std::cout <<"IRINA DEBUG create_func proc = "< numba.core.ccallback.CFunc: return numba.cfunc(sig)(self._numba_func) def _execute(self, is_gpu: bool, num_gpus: int = 0) -> None: - #print("IRINA DEBUG in execute") if 
is_gpu and not self._created: # create CUDA kernel launch_domain = Rect(lo=(0,), hi=(num_gpus,)) kernel_task = self._context.create_task( CuNumericOpCode.CREATE_CU_KERNEL, - #manual=True, launch_domain=launch_domain, ) ptx_hash = hash(self._gpu_func[0]) - #print("IRINA DEBUG creating CUkernel for hash = ", ptx_hash) kernel_task.add_scalar_arg(ptx_hash, ty.int64) kernel_task.add_scalar_arg(self._gpu_func[0], ty.string) - #kernel_task.add_reduction(self._created_array_deferred.base,ReductionOp.MUL) kernel_task.add_output(self._created_array_deferred.base) kernel_task.execute() - #print("IRINA DEBUG created array= ",self._created_array); + # inline map first element of the array to make sure the CREATE_CU_KERNEL + # task has finished by the time we set self._created to True if self._cache: self._created = bool(self._created_array[0]) - #get_legate_runtime().issue_execution_fence(block=True) task = self._context.create_auto_task(CuNumericOpCode.EVAL_UDF) - task.add_scalar_arg(self._num_outputs, ty.uint32) - task.add_scalar_arg(len(self._scalar_args), ty.uint32) - + task.add_scalar_arg(self._num_outputs, ty.uint32) # N of outputs + task.add_scalar_arg(len(self._scalar_args), ty.uint32) # N of scalar_args + # add all scalars for a in self._scalar_args: dtype = convert_to_cunumeric_dtype(type(a).__name__) task.add_scalar_arg(a, dtype) + # add array arguments a0 = self._args[0]._thunk a0 = runtime.to_deferred_array(a0) for count, a in enumerate(self._args): @@ -395,13 +396,11 @@ def _execute(self, is_gpu: bool, num_gpus: int = 0) -> None: if is_gpu: ptx_hash = hash(self._gpu_func[0]) - #print("IRINA DEBUG executing UDF for hash = ", ptx_hash) task.add_scalar_arg(ptx_hash, ty.int64) + # passing the _created * array to introduce dependency between + # CREATE_CU_KERNEL task and EVAL_UDF task task.add_input(self._created_array_deferred.base) task.add_broadcast(self._created_array_deferred.base) - #task.add_input(self._cu_func_pointers_deferred.base) - #task.add_broadcast(self._proc_ids_deferred.base) - #task.add_broadcast(self._cu_func_pointers_deferred.base) else: task.add_scalar_arg( @@ -487,12 +486,12 @@ def __call__(self, *args: Any, **kwargs: Any) -> None: # print("IRINA DEBUG ptx is not created yet") self._numba_func = self._build_gpu_function() self._gpu_func = self._compile_func_gpu() - # profiler = cProfile.Profile() - # profiler.enable() + #profiler = cProfile.Profile() + #profiler.enable() self._execute(True, runtime.num_gpus) - # profiler.disable() - # stats = pstats.Stats(profiler).sort_stats('cumtime') - # stats.print_stats() + #profiler.disable() + #stats = pstats.Stats(profiler).sort_stats('cumtime') + #stats.print_stats() else: if not self._created: self._numba_func = self._build_cpu_function() From 6e8975f7491652c6e36ccb4625c4146fbf46d47e Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Wed, 8 Mar 2023 14:48:13 -0800 Subject: [PATCH 40/78] fixed dependency bug --- cunumeric/vectorize.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cunumeric/vectorize.py b/cunumeric/vectorize.py index 15c8b276e6..6cd8c7e367 100644 --- a/cunumeric/vectorize.py +++ b/cunumeric/vectorize.py @@ -367,6 +367,8 @@ def _execute(self, is_gpu: bool, num_gpus: int = 0) -> None: ptx_hash = hash(self._gpu_func[0]) kernel_task.add_scalar_arg(ptx_hash, ty.int64) kernel_task.add_scalar_arg(self._gpu_func[0], ty.string) + #added to introduce dependency between this and EVAL_UDF task + kernel_task.add_input(self._created_array_deferred.base) kernel_task.add_output(self._created_array_deferred.base) 
kernel_task.execute() # inline map first element of the array to make sure the CREATE_CU_KERNEL From 1c3ce9e26561e27f4804548ef37092fb58651e9d Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Wed, 8 Mar 2023 14:50:17 -0800 Subject: [PATCH 41/78] more clean-up --- cunumeric/vectorize.py | 9 --------- 1 file changed, 9 deletions(-) diff --git a/cunumeric/vectorize.py b/cunumeric/vectorize.py index 6cd8c7e367..5c31f8a031 100644 --- a/cunumeric/vectorize.py +++ b/cunumeric/vectorize.py @@ -485,15 +485,9 @@ def __call__(self, *args: Any, **kwargs: Any) -> None: if runtime.num_gpus > 0: if not self._created: - # print("IRINA DEBUG ptx is not created yet") self._numba_func = self._build_gpu_function() self._gpu_func = self._compile_func_gpu() - #profiler = cProfile.Profile() - #profiler.enable() self._execute(True, runtime.num_gpus) - #profiler.disable() - #stats = pstats.Stats(profiler).sort_stats('cumtime') - #stats.print_stats() else: if not self._created: self._numba_func = self._build_cpu_function() @@ -502,6 +496,3 @@ def __call__(self, *args: Any, **kwargs: Any) -> None: self._created = True self._execute(False) - # profiler.disable() - # stats = pstats.Stats(profiler).sort_stats('cumtime') - # stats.print_stats() From 5a5b0617eb5a7b664e603e2a4d30516c8927269c Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Wed, 8 Mar 2023 19:21:57 -0800 Subject: [PATCH 42/78] removing created_array --- cunumeric/vectorize.py | 17 ++++++++++------- src/cunumeric/vectorize/eval_udf.cu | 3 ++- 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/cunumeric/vectorize.py b/cunumeric/vectorize.py index 5c31f8a031..e29d26c209 100644 --- a/cunumeric/vectorize.py +++ b/cunumeric/vectorize.py @@ -132,8 +132,8 @@ def __init__( size_tmp=runtime.num_gpus if size_tmp==1: size_tmp=10 - self._created_array = full((size_tmp,), True, dtype=bool) - self._created_array_deferred = runtime.to_deferred_array(self._created_array._thunk) + #self._created_array = full((size_tmp,), True, dtype=bool) + #self._created_array_deferred = runtime.to_deferred_array(self._created_array._thunk) #runtime.create_empty_thunk( # (1,), dtype = np.dtype(np.bool), inputs=[]) @@ -368,13 +368,16 @@ def _execute(self, is_gpu: bool, num_gpus: int = 0) -> None: kernel_task.add_scalar_arg(ptx_hash, ty.int64) kernel_task.add_scalar_arg(self._gpu_func[0], ty.string) #added to introduce dependency between this and EVAL_UDF task - kernel_task.add_input(self._created_array_deferred.base) - kernel_task.add_output(self._created_array_deferred.base) + #kernel_task.add_input(self._created_array_deferred.base) + #kernel_task.add_output(self._created_array_deferred.base) kernel_task.execute() + get_legate_runtime().issue_execution_fence(block=True) # inline map first element of the array to make sure the CREATE_CU_KERNEL + # task has finished by the time we set self._created to True if self._cache: - self._created = bool(self._created_array[0]) + #self._created = bool(self._created_array[0]) + self._created = True task = self._context.create_auto_task(CuNumericOpCode.EVAL_UDF) task.add_scalar_arg(self._num_outputs, ty.uint32) # N of outputs @@ -401,8 +404,8 @@ def _execute(self, is_gpu: bool, num_gpus: int = 0) -> None: task.add_scalar_arg(ptx_hash, ty.int64) # passing the _created * array to introduce dependency between # CREATE_CU_KERNEL task and EVAL_UDF task - task.add_input(self._created_array_deferred.base) - task.add_broadcast(self._created_array_deferred.base) + #task.add_input(self._created_array_deferred.base) + 
#task.add_broadcast(self._created_array_deferred.base) else: task.add_scalar_arg( diff --git a/src/cunumeric/vectorize/eval_udf.cu b/src/cunumeric/vectorize/eval_udf.cu index 485227abfb..6a47e85f05 100644 --- a/src/cunumeric/vectorize/eval_udf.cu +++ b/src/cunumeric/vectorize/eval_udf.cu @@ -32,7 +32,8 @@ struct EvalUdfGPU { using VAL = legate_type_of; Rect rect; - size_t input_size=args.inputs.size()-1; + // size_t input_size=args.inputs.size()-1; + size_t input_size=args.inputs.size(); // auto procs_rect = args.inputs[input_size].shape<1>(); //auto procs=args.inputs[input_size].read_accessor(); From a80c1281aa43850ac95cad7f931ac9d958041328 Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Thu, 9 Mar 2023 09:20:16 -0800 Subject: [PATCH 43/78] fixing barrier --- cunumeric/vectorize.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cunumeric/vectorize.py b/cunumeric/vectorize.py index e29d26c209..085c384b10 100644 --- a/cunumeric/vectorize.py +++ b/cunumeric/vectorize.py @@ -362,6 +362,7 @@ def _execute(self, is_gpu: bool, num_gpus: int = 0) -> None: launch_domain = Rect(lo=(0,), hi=(num_gpus,)) kernel_task = self._context.create_task( CuNumericOpCode.CREATE_CU_KERNEL, + manual=True, launch_domain=launch_domain, ) ptx_hash = hash(self._gpu_func[0]) @@ -371,7 +372,7 @@ def _execute(self, is_gpu: bool, num_gpus: int = 0) -> None: #kernel_task.add_input(self._created_array_deferred.base) #kernel_task.add_output(self._created_array_deferred.base) kernel_task.execute() - get_legate_runtime().issue_execution_fence(block=True) + self._context.issue_execution_fence(block=True) # inline map first element of the array to make sure the CREATE_CU_KERNEL # task has finished by the time we set self._created to True From e45d0373f1a6dadb7fde13c31b17ebc6fecb7c23 Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Thu, 9 Mar 2023 09:22:22 -0800 Subject: [PATCH 44/78] removing scipy --- cunumeric/vectorize.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/cunumeric/vectorize.py b/cunumeric/vectorize.py index 085c384b10..ed97209ccf 100644 --- a/cunumeric/vectorize.py +++ b/cunumeric/vectorize.py @@ -208,8 +208,6 @@ def _build_gpu_function(self) -> Any: lines = ["from numba import cuda"] lines.append("import math") lines.append("import numpy") - lines.append("import scipy.special") - lines.append("import numba_scipy") # Signature args = ( From dafbf1d9ff98b531c156588e84b36c936c4e9411 Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Fri, 10 Mar 2023 12:14:05 -0800 Subject: [PATCH 45/78] adding provenance --- cunumeric/vectorize.py | 5 +++-- src/cunumeric/mapper.cc | 18 ------------------ 2 files changed, 3 insertions(+), 20 deletions(-) diff --git a/cunumeric/vectorize.py b/cunumeric/vectorize.py index ed97209ccf..718097effd 100644 --- a/cunumeric/vectorize.py +++ b/cunumeric/vectorize.py @@ -24,7 +24,7 @@ import numba.core.ccallback import numpy as np import six -from legate.core import Rect, get_legate_runtime, ReductionOp +from legate.core import Rect, get_legate_runtime, ReductionOp, track_provenance from cunumeric.runtime import runtime @@ -353,7 +353,8 @@ def _compile_func_cpu(self) -> numba.core.ccallback.CFunc: ) # type: ignore return numba.cfunc(sig)(self._numba_func) - + + @track_provenance(runtime.legate_context) def _execute(self, is_gpu: bool, num_gpus: int = 0) -> None: if is_gpu and not self._created: # create CUDA kernel diff --git a/src/cunumeric/mapper.cc b/src/cunumeric/mapper.cc index 16b362bdb5..432f6713c0 100644 --- a/src/cunumeric/mapper.cc +++ 
b/src/cunumeric/mapper.cc @@ -223,24 +223,6 @@ std::vector CuNumericMapper::store_mappings( } return std::move(mappings); } -#if 0 - case CUNUMERIC_EVAL_UDF: { - std::vector mappings; - auto& inputs = task.inputs(); - auto& outputs = task.outputs(); - for (auto& input : inputs) { - mappings.push_back(StoreMapping::default_mapping(input, options.front())); - // mappings.back().policy.ordering.c_order(); - mappings.back().policy.exact = true; - } - for (auto& output : outputs) { - mappings.push_back(StoreMapping::default_mapping(output, options.front())); - // mappings.back().policy.ordering.c_order(); - mappings.back().policy.exact = true; - } - return std::move(mappings); - } -#endif default: { return {}; } From 9e09b8b88360ac2899ced840507a6bd8ab02425f Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Wed, 15 Mar 2023 12:13:47 -0700 Subject: [PATCH 46/78] adding black_scholes example that uses vectorize --- examples/black_scholes_greeks.py | 230 +++++++++++++++++++++++++++++++ 1 file changed, 230 insertions(+) create mode 100644 examples/black_scholes_greeks.py diff --git a/examples/black_scholes_greeks.py b/examples/black_scholes_greeks.py new file mode 100644 index 0000000000..ff8354bd7a --- /dev/null +++ b/examples/black_scholes_greeks.py @@ -0,0 +1,230 @@ +#!/usr/bin/env python + +# Copyright 2021-2022 NVIDIA Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +import argparse + +from benchmark import parse_args, run_benchmark, CuNumericTimer + +import math +import cunumeric as np + + +#big size +#n_vol_steps = 40 +vol_start = 0.1 +vol_step = 0.01 +#n_t_steps = 365*10 +t_start = 0.5 +t_step = 1.0/(365*10) +#n_money_steps = 60 +money_start = -0.4 +money_step = 0.01 + + +#small size +#n_vol_steps = 10 +#vol_start = 0.1 +#vol_step = 0.01 +#n_t_steps = 6 +#t_start = 0.5 +#t_step = 0.5 +#n_money_steps = 1 +#money_start = 0 +#money_step = 0.1 + +RISKFREE = 0.02 +S0 = 100.0 +N_GREEKS=7 +EPS = 0.00000001 + + +def initialize(n_vol_steps, n_t_steps, n_money_steps, D): + CALL = np.zeros((N_GREEKS, n_t_steps, n_vol_steps, n_money_steps,), dtype = D) + PUT = np.zeros((N_GREEKS, n_t_steps, n_vol_steps, n_money_steps,), dtype = D) + S=np.full((n_t_steps, n_vol_steps, n_money_steps,),S0, dtype = D) + K=np.full((n_t_steps, n_vol_steps, n_money_steps,), (1 + money_start), dtype = D) + temp_arr = np.arange((n_vol_steps*n_t_steps*n_money_steps), dtype=int) + k_temp=(temp_arr%n_money_steps)*money_step + k_temp = k_temp.reshape((n_t_steps, n_vol_steps, n_money_steps,)) + K+=k_temp + K=K*S0 + + T=np.full((n_t_steps, n_vol_steps, n_money_steps,),t_start, dtype = D) + t_temp = (temp_arr%(n_vol_steps*n_money_steps))*vol_step + t_temp = t_temp.reshape((n_t_steps, n_vol_steps, n_money_steps,)) + T+=t_temp + R= 0.02 + V=np.full((n_t_steps, n_vol_steps, n_money_steps), vol_start, dtype = D) + for i in range(n_vol_steps): + V[:,i,:]+=i*vol_step + + return CALL, PUT, S, K, T, R, V + + + +def normPDF(d): + RSQRT2PI = 0.39894228040143267793994605993438; + return RSQRT2PI * np.exp(- 0.5 * d * d); + +def black_scholes_vec_kernel(d1, d2, nd1, nd2, S, K, T, V, stdev, R,CP, EPS): + if (math.fabs(V)>EPS) and (math.fabs(T)>EPS) and (math.fabs(K)>EPS) and (math.fabs(S)>EPS): + d1 = (math.log(S/K)+(R+0.5*V*V)*T)/stdev + d2=d1-stdev + cpd1 = CP*d1 + cpd2 = CP*d2 + #manual inlining ndtr + NPY_SQRT1_2 = 0.707106781186547524400844362104849039 + x = cpd1 * NPY_SQRT1_2 + z = math.fabs(x) + + if z < NPY_SQRT1_2: + y = 0.5 + 0.5 * math.erf(x) + else: + y = 0.5 * math.erfc(z) + + if x > 0: + y = 1.0 - y + nd1=y + + #manual inlining ndtr + x = cpd2 * NPY_SQRT1_2 + z = math.fabs(x) + + if z < NPY_SQRT1_2: + y = 0.5 + 0.5 * math.erf(x) + else: + y = 0.5 * math.erfc(z) + + if x > 0: + y = 1.0 - y + nd2=y + else: + if (math.fabs(V)<=EPS) or (math.fabs(T)<=EPS) or (math.fabs(K)<=EPS): + d1 = math.inf + d2 = math.inf + nd1 = 1. + nd2 = 1. + else: + d1 = -math.inf + d2 = -math.inf + nd1 = 1. + nd2 = 1. + + +bs_vec = np.vectorize(black_scholes_vec_kernel,otypes=(float,float,float,float), cache=True) + +def black_scholes ( out , S, K, R, T, V, d1, d2, nd1, nd2, df,ind_v, ind_t, CP, greek): + + if greek == "PREM": + out[...] = CP*(S*nd1 - K*df*nd2); + elif greek == "DELTA": + out[...] = CP*nd1 + elif greek =="VEGA": + out[...] = S*np.sqrt(T)*normPDF(d1) + elif greek == "GAMMA": + out[...] = normPDF(d1)/(S*V*np.sqrt(T)) + out[ind_v] =0. + elif greek == "VANNA": + out[...] = -d2*normPDF(d1)/V + out[ind_v] =0. + elif greek == "VOLGA": + out[...] = S*np.sqrt(T)*d1*d2*normPDF(d1)/V; + out[ind_v] =0. + elif greek == "THETA": + out[...] = -(0.5*S*V/np.sqrt(T)*normPDF(d1)+CP*R*df*K*nd2) + else: + RuntimeError("Wrong greek name is passed") + + if (greek != "PREM"): + out[ind_t] = 0. 
+ + +greeks = ["PREM", "DELTA", "VEGA", "GAMMA", "VANNA", "VOLGA", "THETA",] +#greeks = ["PREM",] + +def run_black_scholes(n_vol_steps, n_t_steps, n_money_steps): + timer = CuNumericTimer() + print("Start black_scholes") + CALL, PUT, S, K, T, R, V = initialize(n_vol_steps, n_t_steps, n_money_steps, np.float32) + #pre-compute some data for black_scholes + stdev = V * np.sqrt(T) + df = np.exp(-R*T) + ind_v = np.nonzero(np.absolute(V) Date: Wed, 15 Mar 2023 14:01:00 -0700 Subject: [PATCH 47/78] removing special cases from black_scholes_greek --- examples/black_scholes_greeks.py | 100 ++++++++----------------------- 1 file changed, 26 insertions(+), 74 deletions(-) diff --git a/examples/black_scholes_greeks.py b/examples/black_scholes_greeks.py index ff8354bd7a..8b5465ad5c 100644 --- a/examples/black_scholes_greeks.py +++ b/examples/black_scholes_greeks.py @@ -49,7 +49,6 @@ RISKFREE = 0.02 S0 = 100.0 N_GREEKS=7 -EPS = 0.00000001 def initialize(n_vol_steps, n_t_steps, n_money_steps, D): @@ -74,60 +73,32 @@ def initialize(n_vol_steps, n_t_steps, n_money_steps, D): return CALL, PUT, S, K, T, R, V +def normCDF(d): + A1 = 0.31938153 + A2 = -0.356563782 + A3 = 1.781477937 + A4 = -1.821255978 + A5 = 1.330274429 + RSQRT2PI = 0.39894228040143267793994605993438 + K = 1.0 / (1.0 + 0.2316419 * np.absolute(d)) + + cnd = RSQRT2PI * np.exp(- 0.5 * d * d) * (K * (A1 + K * (A2 + K * (A3 + K * (A4 + K * A5))))); + + return np.where(d > 0, 1.0 - cnd, cnd) def normPDF(d): RSQRT2PI = 0.39894228040143267793994605993438; return RSQRT2PI * np.exp(- 0.5 * d * d); -def black_scholes_vec_kernel(d1, d2, nd1, nd2, S, K, T, V, stdev, R,CP, EPS): - if (math.fabs(V)>EPS) and (math.fabs(T)>EPS) and (math.fabs(K)>EPS) and (math.fabs(S)>EPS): - d1 = (math.log(S/K)+(R+0.5*V*V)*T)/stdev - d2=d1-stdev - cpd1 = CP*d1 - cpd2 = CP*d2 - #manual inlining ndtr - NPY_SQRT1_2 = 0.707106781186547524400844362104849039 - x = cpd1 * NPY_SQRT1_2 - z = math.fabs(x) - - if z < NPY_SQRT1_2: - y = 0.5 + 0.5 * math.erf(x) - else: - y = 0.5 * math.erfc(z) - - if x > 0: - y = 1.0 - y - nd1=y - - #manual inlining ndtr - x = cpd2 * NPY_SQRT1_2 - z = math.fabs(x) - - if z < NPY_SQRT1_2: - y = 0.5 + 0.5 * math.erf(x) - else: - y = 0.5 * math.erfc(z) - - if x > 0: - y = 1.0 - y - nd2=y - else: - if (math.fabs(V)<=EPS) or (math.fabs(T)<=EPS) or (math.fabs(K)<=EPS): - d1 = math.inf - d2 = math.inf - nd1 = 1. - nd2 = 1. - else: - d1 = -math.inf - d2 = -math.inf - nd1 = 1. - nd2 = 1. - - -bs_vec = np.vectorize(black_scholes_vec_kernel,otypes=(float,float,float,float), cache=True) - -def black_scholes ( out , S, K, R, T, V, d1, d2, nd1, nd2, df,ind_v, ind_t, CP, greek): +def black_scholes ( out , S, K, R, T, V, d1, d2, nd1, nd2, CP, greek): + EPS = 0.00000001 + stdev = V * np.sqrt(T) + df = np.exp(-R*T) + d1 = (np.log(S/K)+(R+0.5*V*V)*T)/stdev + d2= d1-stdev + nd1 = normCDF(CP*d1) + nd2 = normCDF(CP*d2) if greek == "PREM": out[...] = CP*(S*nd1 - K*df*nd2); @@ -137,53 +108,34 @@ def black_scholes ( out , S, K, R, T, V, d1, d2, nd1, nd2, df,ind_v, ind_t, CP, out[...] = S*np.sqrt(T)*normPDF(d1) elif greek == "GAMMA": out[...] = normPDF(d1)/(S*V*np.sqrt(T)) - out[ind_v] =0. elif greek == "VANNA": out[...] = -d2*normPDF(d1)/V - out[ind_v] =0. elif greek == "VOLGA": out[...] = S*np.sqrt(T)*d1*d2*normPDF(d1)/V; - out[ind_v] =0. elif greek == "THETA": out[...] = -(0.5*S*V/np.sqrt(T)*normPDF(d1)+CP*R*df*K*nd2) else: RuntimeError("Wrong greek name is passed") - if (greek != "PREM"): - out[ind_t] = 0. 
greeks = ["PREM", "DELTA", "VEGA", "GAMMA", "VANNA", "VOLGA", "THETA",] -#greeks = ["PREM",] def run_black_scholes(n_vol_steps, n_t_steps, n_money_steps): timer = CuNumericTimer() print("Start black_scholes") CALL, PUT, S, K, T, R, V = initialize(n_vol_steps, n_t_steps, n_money_steps, np.float32) - #pre-compute some data for black_scholes - stdev = V * np.sqrt(T) - df = np.exp(-R*T) - ind_v = np.nonzero(np.absolute(V) Date: Thu, 16 Mar 2023 10:06:41 -0700 Subject: [PATCH 48/78] cleaning up greeks test --- examples/black_scholes_greeks.py | 88 ++++++++++++++------------------ 1 file changed, 38 insertions(+), 50 deletions(-) diff --git a/examples/black_scholes_greeks.py b/examples/black_scholes_greeks.py index 8b5465ad5c..b4538533bb 100644 --- a/examples/black_scholes_greeks.py +++ b/examples/black_scholes_greeks.py @@ -18,54 +18,48 @@ import argparse from benchmark import parse_args, run_benchmark, CuNumericTimer - +from enum import IntEnum import math import cunumeric as np -#big size -#n_vol_steps = 40 +NUM_ITERS=10 +WARMUP_ITER=2 + vol_start = 0.1 vol_step = 0.01 -#n_t_steps = 365*10 t_start = 0.5 t_step = 1.0/(365*10) -#n_money_steps = 60 money_start = -0.4 money_step = 0.01 -#small size -#n_vol_steps = 10 -#vol_start = 0.1 -#vol_step = 0.01 -#n_t_steps = 6 -#t_start = 0.5 -#t_step = 0.5 -#n_money_steps = 1 -#money_start = 0 -#money_step = 0.1 - RISKFREE = 0.02 S0 = 100.0 N_GREEKS=7 +class Greeks(IntEnum): + PREM=0, + DELTA=1, + VEGA=2, + GAMMA=3, + VANNA=4, + VOLGA=5, + THETA=6 + def initialize(n_vol_steps, n_t_steps, n_money_steps, D): CALL = np.zeros((N_GREEKS, n_t_steps, n_vol_steps, n_money_steps,), dtype = D) PUT = np.zeros((N_GREEKS, n_t_steps, n_vol_steps, n_money_steps,), dtype = D) S=np.full((n_t_steps, n_vol_steps, n_money_steps,),S0, dtype = D) - K=np.full((n_t_steps, n_vol_steps, n_money_steps,), (1 + money_start), dtype = D) temp_arr = np.arange((n_vol_steps*n_t_steps*n_money_steps), dtype=int) k_temp=(temp_arr%n_money_steps)*money_step k_temp = k_temp.reshape((n_t_steps, n_vol_steps, n_money_steps,)) - K+=k_temp - K=K*S0 + K=(k_temp+(1 + money_start))*S0 - T=np.full((n_t_steps, n_vol_steps, n_money_steps,),t_start, dtype = D) t_temp = (temp_arr%(n_vol_steps*n_money_steps))*vol_step t_temp = t_temp.reshape((n_t_steps, n_vol_steps, n_money_steps,)) - T+=t_temp + T=t_temp+t_start R= 0.02 V=np.full((n_t_steps, n_vol_steps, n_money_steps), vol_start, dtype = D) for i in range(n_vol_steps): @@ -83,15 +77,15 @@ def normCDF(d): K = 1.0 / (1.0 + 0.2316419 * np.absolute(d)) - cnd = RSQRT2PI * np.exp(- 0.5 * d * d) * (K * (A1 + K * (A2 + K * (A3 + K * (A4 + K * A5))))); + cnd = RSQRT2PI * np.exp(- 0.5 * d * d) * (K * (A1 + K * (A2 + K * (A3 + K * (A4 + K * A5))))) return np.where(d > 0, 1.0 - cnd, cnd) def normPDF(d): - RSQRT2PI = 0.39894228040143267793994605993438; - return RSQRT2PI * np.exp(- 0.5 * d * d); + RSQRT2PI = 0.39894228040143267793994605993438 + return RSQRT2PI * np.exp(- 0.5 * d * d) -def black_scholes ( out , S, K, R, T, V, d1, d2, nd1, nd2, CP, greek): +def black_scholes ( out , S, K, R, T, V, CP, greek): EPS = 0.00000001 stdev = V * np.sqrt(T) df = np.exp(-R*T) @@ -100,45 +94,39 @@ def black_scholes ( out , S, K, R, T, V, d1, d2, nd1, nd2, CP, greek): nd1 = normCDF(CP*d1) nd2 = normCDF(CP*d2) - if greek == "PREM": - out[...] = CP*(S*nd1 - K*df*nd2); - elif greek == "DELTA": + if greek == Greeks.PREM: + out[...] = CP*(S*nd1 - K*df*nd2) + elif greek == Greeks.DELTA: out[...] = CP*nd1 - elif greek =="VEGA": + elif greek ==Greeks.VEGA: out[...] 
= S*np.sqrt(T)*normPDF(d1) - elif greek == "GAMMA": + elif greek == Greeks.GAMMA: out[...] = normPDF(d1)/(S*V*np.sqrt(T)) - elif greek == "VANNA": + elif greek == Greeks.VANNA: out[...] = -d2*normPDF(d1)/V - elif greek == "VOLGA": - out[...] = S*np.sqrt(T)*d1*d2*normPDF(d1)/V; - elif greek == "THETA": + elif greek == Greeks.VOLGA: + out[...] = S*np.sqrt(T)*d1*d2*normPDF(d1)/V + elif greek == Greeks.THETA: out[...] = -(0.5*S*V/np.sqrt(T)*normPDF(d1)+CP*R*df*K*nd2) else: - RuntimeError("Wrong greek name is passed") - + raise RuntimeError("Wrong greek name is passed") - -greeks = ["PREM", "DELTA", "VEGA", "GAMMA", "VANNA", "VOLGA", "THETA",] def run_black_scholes(n_vol_steps, n_t_steps, n_money_steps): timer = CuNumericTimer() print("Start black_scholes") CALL, PUT, S, K, T, R, V = initialize(n_vol_steps, n_t_steps, n_money_steps, np.float32) - d1 = np.zeros_like(S) - d2= np.zeros_like(S) - nd1 = np.zeros_like(S) - nd2= np.zeros_like(S) - print("After the initialization") - timer.start() - for count,g in enumerate(greeks): - black_scholes(CALL[count],S, K, R, T, V, d1, d2, nd1, nd2,1, g) - black_scholes(PUT[count],S, K, R, T, V, d1, d2, nd1, nd2, -1, g) - - total = timer.stop() - print("Elapsed Time: " + str(total) + " ms") + for i in range (NUM_ITERS): + if i==WARMUP_ITER: + timer.start() + for g in Greeks: + black_scholes(CALL[g.value],S, K, R, T, V, 1, g) + black_scholes(PUT[g.value],S, K, R, T, V, -1, g) + + total = (timer.stop())/(NUM_ITERS-WARMUP_ITER) + print("Elapsed Time: {} ms".format(total)) return total if __name__ == "__main__": From f6b8e67043e62598edaa9b12ead8643c8762f8b0 Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Fri, 17 Mar 2023 11:53:32 -0700 Subject: [PATCH 49/78] making CPU UDF work with sparse arrays --- cunumeric/vectorize.py | 75 +++++++++++++++++++---------- src/cunumeric/vectorize/eval_udf.cc | 24 ++++++--- src/cunumeric/vectorize/eval_udf.cu | 2 +- 3 files changed, 67 insertions(+), 34 deletions(-) diff --git a/cunumeric/vectorize.py b/cunumeric/vectorize.py index 718097effd..722e8ae52e 100644 --- a/cunumeric/vectorize.py +++ b/cunumeric/vectorize.py @@ -194,12 +194,13 @@ def _get_func_body(self, func: Callable[[Any], Any]) -> list[str]: def _replace_name( self, name: str, _LOOP_VAR: str, is_gpu: bool = False ) -> str: - # print("IRINA DEBUG ARGNAMES =", self._argnames) - # print("IRINA DEBUG SCALAR_NAMES =", self._scalar_names) if name in self._argnames and not (name in self._scalar_names): - return "{}[{}]".format(name, _LOOP_VAR) + return "{}[int({})]".format(name, _LOOP_VAR) else: - return "{}".format(name) + if is_gpu: + return "{}".format(name) + else: + return "{}[0]".format(name) def _build_gpu_function(self) -> Any: funcid = "vectorized_{}".format(self._pyfunc.__name__) @@ -228,16 +229,11 @@ def _build_gpu_function(self) -> Any: lines.append(" return") lines.append(" {}:int = 0".format(_LOOP_VAR)) lines.append(" for p in range({}-1):".format(_DIM_VAR)) + #fixme make sure we compute index correct for all data types lines.append( - " x={}[p]+int(local_i/{}[p])".format( - _LO_POINT_VAR, _PITCHES_VAR - ) - ) + " x=int(local_i/{}[p])".format(_PITCHES_VAR)) lines.append( - " local_i = local_i-{}[p]*int(local_i/{}[p])".format( - _PITCHES_VAR, _PITCHES_VAR - ) - ) + " local_i = int(local_i%{}[p])".format(_PITCHES_VAR)) lines.append( " {}+=int(x*{}[p])".format(_LOOP_VAR, _STRIDES_VAR) ) @@ -257,10 +253,10 @@ def _lift_to_array_access(m: Any) -> str: l_new = re.sub(r"[_a-zA-Z]\w*", _lift_to_array_access, line) lines.append(l_new) - # print("IRINA DEBUG GPU 
function",lines) # Evaluate the string to get the Python function body = "\n".join(lines) + print("IRINA DEBUG GPU body", body) glbs: Dict[str, Any] = {} six.exec_(body, glbs) return glbs[funcid] @@ -272,17 +268,24 @@ def _build_cpu_function(self) -> Callable[[Any], Any]: lines = ["from numba import carray, types"] # Signature - lines.append("def {}({}, {}):".format(funcid, _ARGS_VAR, _SIZE_VAR)) + lines.append("def {}({}, {}, {}, {}, {}, {}):".format(funcid, _ARGS_VAR, _SIZE_VAR, _DIM_VAR, _PITCHES_VAR, _LO_POINT_VAR, _STRIDES_VAR)) # Unpack kernel arguments def _emit_assignment( - var: Any, idx: int, sz: Any, ty: np.dtype[Any] + var: Any, idx: int, sz: Any, ty: np.dtype[Any], scalar=False ) -> None: - lines.append( - " {} = carray({}[{}], {}, types.{})".format( - var, _ARGS_VAR, idx, sz, ty + if scalar: + lines.append( + " {} = carray({}[{}], 1, types.{})".format( + var, _ARGS_VAR, idx, ty + ) + ) + else: + lines.append( + " {} = carray({}[{}], {}, types.{})".format( + var, _ARGS_VAR, idx, sz, ty + ) ) - ) # get names of arguments arg_idx = 0 @@ -293,12 +296,28 @@ def _emit_assignment( for a in self._scalar_args: scalar_type = np.dtype(type(a).__name__) _emit_assignment( - self._argnames[arg_idx], arg_idx, _SIZE_VAR, scalar_type + self._argnames[arg_idx], arg_idx, _SIZE_VAR, scalar_type, True ) arg_idx += 1 # Main loop - lines.append(" for {} in range({}):".format(_LOOP_VAR, _SIZE_VAR)) + lines.append(" for local_i in range({}):".format( _SIZE_VAR)) + lines.append(" {}:int = 0".format(_LOOP_VAR)) + lines.append(" j:int = local_i") + lines.append(" for p in range({}-1):".format(_DIM_VAR)) + lines.append(" x=int(j/{}[p])".format( + _PITCHES_VAR + ) + ) + lines.append(" j = int(j%{}[p])".format(_PITCHES_VAR )) + + lines.append(" {}+=int(x*{}[p])".format(_LOOP_VAR, _STRIDES_VAR) + ) + lines.append(" {}+=int(j*{}[{}-1])".format( + _LOOP_VAR, _STRIDES_VAR, _DIM_VAR + ) + ) + lines_old = self._get_func_body(self._pyfunc) @@ -308,11 +327,12 @@ def _lift_to_array_access(m: Any) -> str: # lines_new = [] for line in lines_old: - l_new = re.sub(r"[_a-z]\w*", _lift_to_array_access, line) - lines.append(" " + l_new) + l_new = re.sub(r"[_a-zA-Z]\w*", _lift_to_array_access, line) + lines.append(" " + l_new) # Evaluate the string to get the Python function body = "\n".join(lines) + print ("IRINA DEBUG body =", body) glbs: Dict[str, Any] = {} six.exec_(body, glbs) return glbs[funcid] @@ -349,7 +369,11 @@ def _compile_func_gpu(self) -> tuple[Any]: def _compile_func_cpu(self) -> numba.core.ccallback.CFunc: sig = numba.core.types.void( - numba.types.CPointer(numba.types.voidptr), numba.core.types.uint64 + numba.types.CPointer(numba.types.voidptr), numba.core.types.uint64, + numba.core.types.uint64, + numba.core.types.CPointer(numba.core.types.uint64), + numba.core.types.CPointer(numba.core.types.uint64), + numba.core.types.CPointer(numba.core.types.uint64) ) # type: ignore return numba.cfunc(sig)(self._numba_func) @@ -359,9 +383,8 @@ def _execute(self, is_gpu: bool, num_gpus: int = 0) -> None: if is_gpu and not self._created: # create CUDA kernel launch_domain = Rect(lo=(0,), hi=(num_gpus,)) - kernel_task = self._context.create_task( + kernel_task = self._context.create_manual_task( CuNumericOpCode.CREATE_CU_KERNEL, - manual=True, launch_domain=launch_domain, ) ptx_hash = hash(self._gpu_func[0]) diff --git a/src/cunumeric/vectorize/eval_udf.cc b/src/cunumeric/vectorize/eval_udf.cc index 92f8cd739f..c10a359694 100644 --- a/src/cunumeric/vectorize/eval_udf.cc +++ b/src/cunumeric/vectorize/eval_udf.cc @@ -15,10 
+15,10 @@ */ #include "cunumeric/vectorize/eval_udf.h" +#include "cunumeric/pitches.h" namespace cunumeric { -using namespace Legion; using namespace legate; struct EvalUdfCPU { @@ -27,29 +27,39 @@ struct EvalUdfCPU { { // In the case of CPU, we pack arguments in a vector and pass them to the // function (through the function pointer geenrated by numba) - using UDF = void(void**, size_t); + using UDF = void(void**, size_t, size_t, uint32_t*, uint32_t*, uint32_t*); auto udf = reinterpret_cast(args.cpu_func_ptr); std::vector udf_args; size_t volume = 1; + Pitches pitches; + Rect rect; + size_t strides[DIM]; if (args.inputs.size()>0){ using VAL = legate_type_of; - auto rect = args.inputs[0].shape(); + rect = args.inputs[0].shape(); + volume = pitches.flatten(rect); if (rect.empty()) return; for (size_t i = 0; i < args.inputs.size(); i++) { if (i < args.num_outputs) { auto out = args.outputs[i].write_accessor(rect); - udf_args.push_back(reinterpret_cast(out.ptr(rect))); + udf_args.push_back(reinterpret_cast(out.ptr(rect, strides))); } else { auto out = args.inputs[i].read_accessor(rect); - udf_args.push_back(reinterpret_cast(const_cast(out.ptr(rect)))); + udf_args.push_back(reinterpret_cast(const_cast(out.ptr(rect,strides)))); } } - volume = rect.volume(); }//if for (auto s: args.scalars) udf_args.push_back(const_cast(s.ptr())); - udf(udf_args.data(), volume); +// udf(udf_args.data(), volume, size_t(DIM),reinterpret_cast(&pitches.data()[0]), +// reinterpret_cast(&rect.lo[0]), reinterpret_cast(&strides[0])); + std::cout<<"IRINA DEBUG pitches = "<( const_cast(pitches.data())), reinterpret_cast(&rect.lo[0]), reinterpret_cast(&strides[0])); + } }; diff --git a/src/cunumeric/vectorize/eval_udf.cu b/src/cunumeric/vectorize/eval_udf.cu index 6a47e85f05..411c2aa423 100644 --- a/src/cunumeric/vectorize/eval_udf.cu +++ b/src/cunumeric/vectorize/eval_udf.cu @@ -156,7 +156,7 @@ struct EvalUdfGPU { context.get_current_processor(), ptx_hash}; size_t dim=1; - if (args.inputs.size()>1){ + if (args.inputs.size()>0){ dim = args.inputs[0].dim() == 0 ? 
1 : args.inputs[0].dim(); double_dispatch(dim, args.inputs[0].code(), EvalUdfGPU{}, args); } From f3a704d6da89477029fff523e2c154a7445e5014 Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Fri, 17 Mar 2023 21:07:15 -0700 Subject: [PATCH 50/78] removed unused argument --- cunumeric/vectorize.py | 8 +------- src/cunumeric/vectorize/eval_udf.cc | 10 ++-------- src/cunumeric/vectorize/eval_udf.cu | 10 ---------- 3 files changed, 3 insertions(+), 25 deletions(-) diff --git a/cunumeric/vectorize.py b/cunumeric/vectorize.py index 722e8ae52e..5629d9403e 100644 --- a/cunumeric/vectorize.py +++ b/cunumeric/vectorize.py @@ -47,7 +47,6 @@ _ARGS_VAR = "__args__" _DIM_VAR = "__dim__" _STRIDES_VAR = "__strides__" -_LO_POINT_VAR = "__lo_point__" _PITCHES_VAR = "__pitches__" @@ -216,7 +215,6 @@ def _build_gpu_function(self) -> Any: + [_SIZE_VAR] + [_DIM_VAR] + [_PITCHES_VAR] - + [_LO_POINT_VAR] + [_STRIDES_VAR] ) @@ -256,7 +254,6 @@ def _lift_to_array_access(m: Any) -> str: # Evaluate the string to get the Python function body = "\n".join(lines) - print("IRINA DEBUG GPU body", body) glbs: Dict[str, Any] = {} six.exec_(body, glbs) return glbs[funcid] @@ -268,7 +265,7 @@ def _build_cpu_function(self) -> Callable[[Any], Any]: lines = ["from numba import carray, types"] # Signature - lines.append("def {}({}, {}, {}, {}, {}, {}):".format(funcid, _ARGS_VAR, _SIZE_VAR, _DIM_VAR, _PITCHES_VAR, _LO_POINT_VAR, _STRIDES_VAR)) + lines.append("def {}({}, {}, {}, {}, {}):".format(funcid, _ARGS_VAR, _SIZE_VAR, _DIM_VAR, _PITCHES_VAR, _STRIDES_VAR)) # Unpack kernel arguments def _emit_assignment( @@ -332,7 +329,6 @@ def _lift_to_array_access(m: Any) -> str: # Evaluate the string to get the Python function body = "\n".join(lines) - print ("IRINA DEBUG body =", body) glbs: Dict[str, Any] = {} six.exec_(body, glbs) return glbs[funcid] @@ -360,7 +356,6 @@ def _compile_func_gpu(self) -> tuple[Any]: + [numba.core.types.uint64] + [numba.core.types.CPointer(numba.core.types.uint64)] + [numba.core.types.CPointer(numba.core.types.uint64)] - + [numba.core.types.CPointer(numba.core.types.uint64)] ) sig = (*arg_types,) @@ -372,7 +367,6 @@ def _compile_func_cpu(self) -> numba.core.ccallback.CFunc: numba.types.CPointer(numba.types.voidptr), numba.core.types.uint64, numba.core.types.uint64, numba.core.types.CPointer(numba.core.types.uint64), - numba.core.types.CPointer(numba.core.types.uint64), numba.core.types.CPointer(numba.core.types.uint64) ) # type: ignore diff --git a/src/cunumeric/vectorize/eval_udf.cc b/src/cunumeric/vectorize/eval_udf.cc index c10a359694..e42978e0e9 100644 --- a/src/cunumeric/vectorize/eval_udf.cc +++ b/src/cunumeric/vectorize/eval_udf.cc @@ -27,7 +27,7 @@ struct EvalUdfCPU { { // In the case of CPU, we pack arguments in a vector and pass them to the // function (through the function pointer geenrated by numba) - using UDF = void(void**, size_t, size_t, uint32_t*, uint32_t*, uint32_t*); + using UDF = void(void**, size_t, size_t, uint32_t*, uint32_t*); auto udf = reinterpret_cast(args.cpu_func_ptr); std::vector udf_args; size_t volume = 1; @@ -52,13 +52,7 @@ struct EvalUdfCPU { }//if for (auto s: args.scalars) udf_args.push_back(const_cast(s.ptr())); -// udf(udf_args.data(), volume, size_t(DIM),reinterpret_cast(&pitches.data()[0]), -// reinterpret_cast(&rect.lo[0]), reinterpret_cast(&strides[0])); - std::cout<<"IRINA DEBUG pitches = "<( const_cast(pitches.data())), reinterpret_cast(&rect.lo[0]), reinterpret_cast(&strides[0])); + udf(udf_args.data(), volume, size_t(DIM), reinterpret_cast( 
const_cast(pitches.data())), reinterpret_cast(&strides[0])); } }; diff --git a/src/cunumeric/vectorize/eval_udf.cu b/src/cunumeric/vectorize/eval_udf.cu index 411c2aa423..7d03af0341 100644 --- a/src/cunumeric/vectorize/eval_udf.cu +++ b/src/cunumeric/vectorize/eval_udf.cu @@ -45,7 +45,6 @@ struct EvalUdfGPU { buffer_size +=sizeof(size_t);//size buffer_size += sizeof(size_t);//dim buffer_size += sizeof(void*);//pitches - buffer_size += sizeof(void*);//lo_point buffer_size += sizeof(void*);//strides std::vector arg_buffer(buffer_size); @@ -84,22 +83,15 @@ struct EvalUdfGPU { //create buffers for pitches, lower point and strides since //we need to pass pointer to device memory auto device_pitches = create_buffer(Point<1>(DIM-1), Memory::Kind::Z_COPY_MEM); - auto device_lo = create_buffer(Point<1>(DIM), Memory::Kind::Z_COPY_MEM); auto device_strides = create_buffer(Point<1>(DIM), Memory::Kind::Z_COPY_MEM); - //std::cout<<"IRINA DEBUG"<(i)]=pitches.data()[i]; - //std::cout<<" pitches ="<(i)]=rect.lo[i]; device_strides[Point<1>(i)] = strides[i]; - //std::cout<<" device_lo = " <(m)`` for - vectorized matrix-vector multiplication. If provided, ``pyfunc`` will - be called with (and expected to return) arrays with shapes given by the - size of corresponding core dimensions. By default, ``pyfunc`` is - assumed to take scalars as input and output. - - Returns - ------- - vectorized : callable - Vectorized function. - - See Also - -------- - numpy.vectorize - - Availability - -------- - Multiple GPUs, Multiple CPUs - """ - def __init__( self, pyfunc: Callable[[Any], Any], @@ -112,12 +52,70 @@ def __init__( cache: bool = False, signature: Optional[str] = None, ) -> None: + """ + vectorize(pyfunc, otypes=None, doc=None, excluded=None, cache=False, + signature=None) + Generalized function class. + Define a vectorized function which takes a nested sequence of + objects or numpy arrays as inputs and returns a single numpy array + or a tuple of numpy arrays. + The vectorized function evaluates `pyfunc` over successive tuples + of the input arrays like the python map function, except it uses the + broadcasting rules of numpy. + The data type of the output of `vectorized` is determined by calling + the function with the first element of the input. This can be avoided + by specifying the `otypes` argument. + + Parameters + ---------- + pyfunc : callable + A python function or method. + otypes : str or list of dtypes, optional + The output data type. It must be specified as either a string of + typecode characters or a list of data type specifiers. There should + be one data type specifier for each output. + WARNING: cuNumeric currently requires all output types to be the + same + doc : str, optional + The docstring for the function. If None, the docstring will be the + ``pyfunc.__doc__``. + excluded : set, optional + Set of strings or integers representing the positional or keyword + arguments for which the function will not be vectorized. + These will be passed directly to `pyfunc` unmodified. + WARNING: cuNumeric doesn't suport this argument at the moment + cache : bool, optional + If `True`, then cache the first function call that generates C fun- + ction or CUDA kernel + signature : string, optional + Generalized universal function signature, e.g., ``(m,n),(n)->(m)`` + for vectorized matrix-vector multiplication. If provided, + ``pyfunc`` will be called with (and expected to return) + arrays with shapes given by the size of corresponding core + dimensions. 
By default, ``pyfunc`` is assumed to take scalars + as input and output. + WARNING: cuNumeric doesn't support this argument at the moment + + Returns + ------- + vectorized : callable + Vectorized function. + + See Also + -------- + numpy.vectorize + + Availability + -------- + Multiple GPUs, Multiple CPUs + """ + self._pyfunc = pyfunc + self._otypes: Optional[tuple[Any]] = None + self._cache: bool = cache self._numba_func: Callable[[Any], Any] self._cpu_func: numba.core.ccallback.CFunc self._gpu_func: tuple[Any] - self._otypes: Optional[tuple[Any]] = None - self._result = None self._args: List[Any] = [] self._scalar_args: List[Any] = [] self._scalar_idxs: List[int] = [] @@ -126,15 +124,7 @@ def __init__( self._kwargs: List[Any] = [] self._context = runtime.legate_context self._created: bool = False - self._cache: bool = cache self._num_outputs = 1 # there is at least 1 output if doc is None: self.__doc__ = pyfunc.__doc__ @@ -143,7 +133,7 @@ def __init__( if otypes is not None: self._num_outputs = len(otypes) - if len(otypes) == 0: + if self._num_outputs == 0: raise ValueError( "There should be at least 1 type specified in otypes" ) @@ -183,7 +173,7 @@ def _get_func_body(self, func: Callable[[Any], Any]) -> list[str]: if func.__doc__ is not None and len(func.__doc__.split("\n")) > 0: lines_to_skip = len(func.__doc__.split("\n")) - lines = inspect.getsourcelines(func)[0] + lines = inspect.getsourcelines(func)[0] # type: ignore return_lines = [] for i in range(lines_to_skip + 1, len(lines)): @@ -206,6 +196,7 @@ def _build_gpu_function(self) -> Any: # Preamble lines = ["from numba import cuda"] + # we add math and numpy so user-defined functions can use them lines.append("import math") lines.append("import numpy") @@ -225,13 +216,16 @@ def _build_gpu_function(self) -> Any: lines.append(" local_i = cuda.grid(1)") lines.append(" if local_i >= {}:".format(_SIZE_VAR)) lines.append(" return") + # we compute the index for sparse data access when using Legion's + # pointer. 
+ # aa[x][y][z]=a[x*strides[0] + y*strides[1] + z*strides[2]] lines.append(" {}:int = 0".format(_LOOP_VAR)) lines.append(" for p in range({}-1):".format(_DIM_VAR)) - #fixme make sure we compute index correct for all data types - lines.append( - " x=int(local_i/{}[p])".format(_PITCHES_VAR)) + # fixme make sure we compute index correct for all data types + lines.append(" x=int(local_i/{}[p])".format(_PITCHES_VAR)) lines.append( - " local_i = int(local_i%{}[p])".format(_PITCHES_VAR)) + " local_i = int(local_i%{}[p])".format(_PITCHES_VAR) + ) lines.append( " {}+=int(x*{}[p])".format(_LOOP_VAR, _STRIDES_VAR) ) @@ -241,7 +235,7 @@ def _build_gpu_function(self) -> Any: ) ) - # Kernel body + # this function is used to replace all array names with array[i] def _lift_to_array_access(m: Any) -> str: return self._replace_name(m.group(0), _LOOP_VAR, True) @@ -251,7 +245,6 @@ def _lift_to_array_access(m: Any) -> str: l_new = re.sub(r"[_a-zA-Z]\w*", _lift_to_array_access, line) lines.append(l_new) - # Evaluate the string to get the Python function body = "\n".join(lines) glbs: Dict[str, Any] = {} @@ -263,15 +256,32 @@ def _build_cpu_function(self) -> Callable[[Any], Any]: # Preamble lines = ["from numba import carray, types"] + # we add math and numpy so user-defined functions can use them + lines.append("import math") + lines.append("import numpy") # Signature - lines.append("def {}({}, {}, {}, {}, {}):".format(funcid, _ARGS_VAR, _SIZE_VAR, _DIM_VAR, _PITCHES_VAR, _STRIDES_VAR)) + lines.append( + "def {}({}, {}, {}, {}, {}):".format( + funcid, + _ARGS_VAR, + _SIZE_VAR, + _DIM_VAR, + _PITCHES_VAR, + _STRIDES_VAR, + ) + ) # Unpack kernel arguments def _emit_assignment( - var: Any, idx: int, sz: Any, ty: np.dtype[Any], scalar=False + var: Any, + idx: int, + sz: Any, + ty: np.dtype[Any], + scalar: bool = False, ) -> None: if scalar: + # we represent scalars as arrays of size 1 lines.append( " {} = carray({}[{}], 1, types.{})".format( var, _ARGS_VAR, idx, ty @@ -284,11 +294,13 @@ def _emit_assignment( ) ) - # get names of arguments + # define pyfunc arguments ar carrays arg_idx = 0 for a in self._args: - type_a= a.dtype - _emit_assignment(self._argnames[arg_idx], arg_idx, _SIZE_VAR, type_a) + type_a = a.dtype + _emit_assignment( + self._argnames[arg_idx], arg_idx, _SIZE_VAR, type_a + ) arg_idx += 1 for a in self._scalar_args: scalar_type = np.dtype(type(a).__name__) @@ -298,31 +310,31 @@ def _emit_assignment( arg_idx += 1 # Main loop - lines.append(" for local_i in range({}):".format( _SIZE_VAR)) + lines.append(" for local_i in range({}):".format(_SIZE_VAR)) + # we compute inndex for sparse data access when using Legion's + # pointer. 
+ # aa[x][y][z]=a[x*strides[0] + y*strides[1] + z*strides[2]] lines.append(" {}:int = 0".format(_LOOP_VAR)) lines.append(" j:int = local_i") lines.append(" for p in range({}-1):".format(_DIM_VAR)) - lines.append(" x=int(j/{}[p])".format( - _PITCHES_VAR - ) - ) - lines.append(" j = int(j%{}[p])".format(_PITCHES_VAR )) + lines.append(" x=int(j/{}[p])".format(_PITCHES_VAR)) + lines.append(" j = int(j%{}[p])".format(_PITCHES_VAR)) - lines.append(" {}+=int(x*{}[p])".format(_LOOP_VAR, _STRIDES_VAR) + lines.append( + " {}+=int(x*{}[p])".format(_LOOP_VAR, _STRIDES_VAR) ) - lines.append(" {}+=int(j*{}[{}-1])".format( + lines.append( + " {}+=int(j*{}[{}-1])".format( _LOOP_VAR, _STRIDES_VAR, _DIM_VAR ) ) - lines_old = self._get_func_body(self._pyfunc) # Kernel body def _lift_to_array_access(m: Any) -> str: return self._replace_name(m.group(0), _LOOP_VAR) - # lines_new = [] for line in lines_old: l_new = re.sub(r"[_a-zA-Z]\w*", _lift_to_array_access, line) lines.append(" " + l_new) @@ -364,14 +376,15 @@ def _compile_func_gpu(self) -> tuple[Any]: def _compile_func_cpu(self) -> numba.core.ccallback.CFunc: sig = numba.core.types.void( - numba.types.CPointer(numba.types.voidptr), numba.core.types.uint64, + numba.types.CPointer(numba.types.voidptr), + numba.core.types.uint64, numba.core.types.uint64, numba.core.types.CPointer(numba.core.types.uint64), - numba.core.types.CPointer(numba.core.types.uint64) - ) # type: ignore + numba.core.types.CPointer(numba.core.types.uint64), + ) return numba.cfunc(sig)(self._numba_func) - + @track_provenance(runtime.legate_context) def _execute(self, is_gpu: bool, num_gpus: int = 0) -> None: if is_gpu and not self._created: @@ -384,22 +397,21 @@ def _execute(self, is_gpu: bool, num_gpus: int = 0) -> None: ptx_hash = hash(self._gpu_func[0]) kernel_task.add_scalar_arg(ptx_hash, ty.int64) kernel_task.add_scalar_arg(self._gpu_func[0], ty.string) - #added to introduce dependency between this and EVAL_UDF task - #kernel_task.add_input(self._created_array_deferred.base) - #kernel_task.add_output(self._created_array_deferred.base) kernel_task.execute() + # we want to make sure EVAL_UDF function is not executed before + # CUDA kernel is created self._context.issue_execution_fence(block=True) - # inline map first element of the array to make sure the CREATE_CU_KERNEL - # task has finished by the time we set self._created to True + # task has finished by the time we set self._created to True if self._cache: - #self._created = bool(self._created_array[0]) self._created = True task = self._context.create_auto_task(CuNumericOpCode.EVAL_UDF) - task.add_scalar_arg(self._num_outputs, ty.uint32) # N of outputs - task.add_scalar_arg(len(self._scalar_args), ty.uint32) # N of scalar_args - # add all scalars + task.add_scalar_arg(self._num_outputs, ty.uint32) # N of outputs + task.add_scalar_arg( + len(self._scalar_args), ty.uint32 + ) # N of scalar_args + # add all scalars for a in self._scalar_args: dtype = convert_to_cunumeric_dtype(type(a).__name__) task.add_scalar_arg(a, dtype) @@ -409,21 +421,16 @@ def _execute(self, is_gpu: bool, num_gpus: int = 0) -> None: a0 = runtime.to_deferred_array(a0) for count, a in enumerate(self._args): a_tmp = runtime.to_deferred_array(a._thunk) - a_tmp = a_tmp.base - task.add_input(a_tmp) + a_tmp_base = a_tmp.base + task.add_input(a_tmp_base) if count < self._num_outputs: - task.add_output(a_tmp) + task.add_output(a_tmp_base) if count != 0: - task.add_alignment(a0.base, a_tmp) + task.add_alignment(a0.base, a_tmp_base) if is_gpu: ptx_hash = 
hash(self._gpu_func[0]) task.add_scalar_arg(ptx_hash, ty.int64) - # passing the _created * array to introduce dependency between - # CREATE_CU_KERNEL task and EVAL_UDF task - #task.add_input(self._created_array_deferred.base) - #task.add_broadcast(self._created_array_deferred.base) - else: task.add_scalar_arg( self._cpu_func.address, ty.uint64 @@ -435,8 +442,6 @@ def __call__(self, *args: Any, **kwargs: Any) -> None: Return arrays with the results of `pyfunc` broadcast (vectorized) over `args` and `kwargs` not in `excluded`. """ - # profiler = cProfile.Profile() - # profiler.enable() if not self._created: self._scalar_args.clear() self._scalar_idxs.clear() @@ -455,8 +460,8 @@ def __call__(self, *args: Any, **kwargs: Any) -> None: self._scalar_idxs.append(i) else: self._args.append(convert_to_cunumeric_ndarray(arg)) - - # first fill arrays to argnames, then scalars: + + # first fill arrays to argnames, then scalars: for i, k in enumerate(inspect.signature(self._pyfunc).parameters): if not (i in self._scalar_idxs): self._argnames.append(k) @@ -515,4 +520,3 @@ def __call__(self, *args: Any, **kwargs: Any) -> None: if self._cache: self._created = True self._execute(False) - diff --git a/docs/cunumeric/source/api/_vectorize.rst b/docs/cunumeric/source/api/_vectorize.rst new file mode 100644 index 0000000000..048e8ab51d --- /dev/null +++ b/docs/cunumeric/source/api/_vectorize.rst @@ -0,0 +1,13 @@ +cunumeric.vectorize +============================= + +.. currentmodule:: cunumeric + +.. autoclass:: vectorize + + .. automethod:: __init__ + + .. rubric:: Methods + + .. autosummary:: __call__ + diff --git a/docs/cunumeric/source/api/vectorize.rst b/docs/cunumeric/source/api/vectorize.rst new file mode 100644 index 0000000000..5e6cbea66b --- /dev/null +++ b/docs/cunumeric/source/api/vectorize.rst @@ -0,0 +1,15 @@ +.. module:: cunumeric.vectorize + +========================================= + +Vectorize +----------------- + +.. 
toctree:: + :maxdepth: 2 + :hidden: + + _vectorize + + + diff --git a/src/cunumeric/cuda_help.h b/src/cunumeric/cuda_help.h index 8ebe3c809c..eb90ab6da2 100644 --- a/src/cunumeric/cuda_help.h +++ b/src/cunumeric/cuda_help.h @@ -393,7 +393,7 @@ __device__ __forceinline__ void store_streaming(double* ptr, double valu asm volatile("st.global.cs.f64 [%0], %1;" : : "l"(ptr), "d"(value) : "memory"); } #if 0 -#include +#include class JITKernelStorage { @@ -429,5 +429,5 @@ class JITKernelStorage } };//class JITKernelStorage -#endif +#endif } // namespace cunumeric diff --git a/src/cunumeric/cudalibs.cu b/src/cunumeric/cudalibs.cu index 45d33d752f..f1e3a95be1 100644 --- a/src/cunumeric/cudalibs.cu +++ b/src/cunumeric/cudalibs.cu @@ -233,17 +233,16 @@ cufftContext CUDALibraries::get_cufft_plan(cufftType type, const DomainPoint& si return cufftContext(cache->get_cufft_plan(size)); } -void CUDALibraries::store_udf_func(size_t hash, CUfunction func){ - udf_caches_[hash]=func; -} +void CUDALibraries::store_udf_func(size_t hash, CUfunction func) { udf_caches_[hash] = func; } -CUfunction CUDALibraries::get_udf_func(size_t hash){ - auto finder = udf_caches_.find(hash); - if (udf_caches_.end() == finder) { - fprintf(stderr, "UDF function wasn't generated yet"); - LEGATE_ABORT; - } - return udf_caches_[hash]; +CUfunction CUDALibraries::get_udf_func(size_t hash) +{ + auto finder = udf_caches_.find(hash); + if (udf_caches_.end() == finder) { + fprintf(stderr, "UDF function wasn't generated yet"); + LEGATE_ABORT; + } + return udf_caches_[hash]; } static CUDALibraries& get_cuda_libraries(legate::Processor proc) @@ -291,13 +290,15 @@ cufftContext get_cufft_plan(cufftType type, const DomainPoint& size) return lib.get_cufft_plan(type, size); } -void store_udf(size_t hash, CUfunction func){ - const auto proc = legate::Processor::get_executing_processor(); +void store_udf(size_t hash, CUfunction func) +{ + const auto proc = legate::Processor::get_executing_processor(); auto& lib = get_cuda_libraries(proc); lib.store_udf_func(hash, func); } -CUfunction get_udf(size_t hash){ +CUfunction get_udf(size_t hash) +{ const auto proc = legate::Processor::get_executing_processor(); auto& lib = get_cuda_libraries(proc); return lib.get_udf_func(hash); diff --git a/src/cunumeric/vectorize/create_cu_kernel.cu b/src/cunumeric/vectorize/create_cu_kernel.cu index d3f00091cc..ce5b70a432 100644 --- a/src/cunumeric/vectorize/create_cu_kernel.cu +++ b/src/cunumeric/vectorize/create_cu_kernel.cu @@ -25,81 +25,79 @@ using namespace Legion; using namespace legate; __global__ static void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM) -fill_out_kernel(const AccessorRD,true,1> out) + fill_out_kernel(const AccessorRD, true, 1> out) { const int idx = (blockIdx.x * blockDim.x + threadIdx.x); - if (idx >0) return; + if (idx > 0) return; out.reduce(0, true); } /*static*/ void CreateCUKernelTask::gpu_variant(TaskContext& context) { - int64_t ptx_hash = context.scalars()[0].value(); - std::string ptx = context.scalars()[1].value(); - Processor point = context.get_current_processor(); + std::string ptx = context.scalars()[1].value(); + Processor point = context.get_current_processor(); CUfunction func; - const unsigned num_options = 4; - const size_t log_buffer_size = 16384; - std::vector log_info_buffer(log_buffer_size); - std::vector log_error_buffer(log_buffer_size); - CUjit_option jit_options[] = { - CU_JIT_INFO_LOG_BUFFER, - CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES, - CU_JIT_ERROR_LOG_BUFFER, - CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES, - }; - void* 
option_vals[] = { - static_cast(log_info_buffer.data()), - reinterpret_cast(log_buffer_size), - static_cast(log_error_buffer.data()), - reinterpret_cast(log_buffer_size), - }; + const unsigned num_options = 4; + const size_t log_buffer_size = 16384; + std::vector log_info_buffer(log_buffer_size); + std::vector log_error_buffer(log_buffer_size); + CUjit_option jit_options[] = { + CU_JIT_INFO_LOG_BUFFER, + CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES, + CU_JIT_ERROR_LOG_BUFFER, + CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES, + }; + void* option_vals[] = { + static_cast(log_info_buffer.data()), + reinterpret_cast(log_buffer_size), + static_cast(log_error_buffer.data()), + reinterpret_cast(log_buffer_size), + }; - CUmodule module; - CUresult result = - cuModuleLoadDataEx(&module, ptx.data(), num_options, jit_options, option_vals); - if (result != CUDA_SUCCESS) { - if (result == CUDA_ERROR_OPERATING_SYSTEM) { - fprintf(stderr, - "ERROR: Device side asserts are not supported by the " - "CUDA driver for MAC OSX, see NVBugs 1628896.\n"); - exit(-1); - } else if (result == CUDA_ERROR_NO_BINARY_FOR_GPU) { - fprintf(stderr, "ERROR: The binary was compiled for the wrong GPU architecture.\n"); - exit(-1); - } else { - fprintf(stderr, "Failed to load CUDA module! Error log: %s\n", log_error_buffer.data()); + CUmodule module; + CUresult result = cuModuleLoadDataEx(&module, ptx.data(), num_options, jit_options, option_vals); + if (result != CUDA_SUCCESS) { + if (result == CUDA_ERROR_OPERATING_SYSTEM) { + fprintf(stderr, + "ERROR: Device side asserts are not supported by the " + "CUDA driver for MAC OSX, see NVBugs 1628896.\n"); + exit(-1); + } else if (result == CUDA_ERROR_NO_BINARY_FOR_GPU) { + fprintf(stderr, "ERROR: The binary was compiled for the wrong GPU architecture.\n"); + exit(-1); + } else { + fprintf(stderr, "Failed to load CUDA module! 
Error log: %s\n", log_error_buffer.data()); #if CUDA_VERSION >= 6050 - const char *name, *str; - assert(cuGetErrorName(result, &name) == CUDA_SUCCESS); - assert(cuGetErrorString(result, &str) == CUDA_SUCCESS); - fprintf(stderr, "CU: cuModuleLoadDataEx = %d (%s): %s\n", result, name, str); + const char *name, *str; + assert(cuGetErrorName(result, &name) == CUDA_SUCCESS); + assert(cuGetErrorString(result, &str) == CUDA_SUCCESS); + fprintf(stderr, "CU: cuModuleLoadDataEx = %d (%s): %s\n", result, name, str); #else - fprintf(stderr, "CU: cuModuleLoadDataEx = %d\n", result); + fprintf(stderr, "CU: cuModuleLoadDataEx = %d\n", result); #endif - exit(-1); - } + exit(-1); } - std::cmatch line_match; - bool match = - std::regex_search(ptx.data(), line_match, std::regex(".visible .entry [_a-zA-Z0-9$]+")); + } + std::cmatch line_match; + bool match = + std::regex_search(ptx.data(), line_match, std::regex(".visible .entry [_a-zA-Z0-9$]+")); #ifdef DEBUG_CUNUMERIC - assert(match); + assert(match); #endif - const auto& matched_line = line_match.begin()->str(); - auto fun_name = matched_line.substr(matched_line.rfind(" ") + 1, matched_line.size()); + const auto& matched_line = line_match.begin()->str(); + auto fun_name = matched_line.substr(matched_line.rfind(" ") + 1, matched_line.size()); - result = cuModuleGetFunction(&func, module, fun_name.c_str()); + result = cuModuleGetFunction(&func, module, fun_name.c_str()); #ifdef DEBUG_CUNUMERIC - assert(result == CUDA_SUCCESS); + assert(result == CUDA_SUCCESS); #endif - store_udf(ptx_hash, func); - //auto stream = get_cached_stream(); - //auto out = context.reductions()[0].reduce_accessor, true, 1>(); - //fill_out_kernel<<<1,1,0,stream>>>(out); - //CHECK_CUDA_STREAM(stream); + store_udf(ptx_hash, func); + // auto stream = get_cached_stream(); + // auto out = context.reductions()[0].reduce_accessor, true, 1>(); + // fill_out_kernel<<<1,1,0,stream>>>(out); + // CHECK_CUDA_STREAM(stream); } } // namespace cunumeric diff --git a/src/cunumeric/vectorize/eval_udf.cc b/src/cunumeric/vectorize/eval_udf.cc index e42978e0e9..fe2c20bcc0 100644 --- a/src/cunumeric/vectorize/eval_udf.cc +++ b/src/cunumeric/vectorize/eval_udf.cc @@ -34,10 +34,10 @@ struct EvalUdfCPU { Pitches pitches; Rect rect; size_t strides[DIM]; - if (args.inputs.size()>0){ + if (args.inputs.size() > 0) { using VAL = legate_type_of; - rect = args.inputs[0].shape(); - volume = pitches.flatten(rect); + rect = args.inputs[0].shape(); + volume = pitches.flatten(rect); if (rect.empty()) return; for (size_t i = 0; i < args.inputs.size(); i++) { @@ -46,14 +46,16 @@ struct EvalUdfCPU { udf_args.push_back(reinterpret_cast(out.ptr(rect, strides))); } else { auto out = args.inputs[i].read_accessor(rect); - udf_args.push_back(reinterpret_cast(const_cast(out.ptr(rect,strides)))); + udf_args.push_back(reinterpret_cast(const_cast(out.ptr(rect, strides)))); } } - }//if - for (auto s: args.scalars) - udf_args.push_back(const_cast(s.ptr())); - udf(udf_args.data(), volume, size_t(DIM), reinterpret_cast( const_cast(pitches.data())), reinterpret_cast(&strides[0])); - + } // if + for (auto s : args.scalars) udf_args.push_back(const_cast(s.ptr())); + udf(udf_args.data(), + volume, + size_t(DIM), + reinterpret_cast(const_cast(pitches.data())), + reinterpret_cast(&strides[0])); } }; @@ -61,25 +63,23 @@ struct EvalUdfCPU { { uint32_t num_outputs = context.scalars()[0].value(); uint32_t num_scalars = context.scalars()[1].value(); - std::vectorscalars; - for (size_t i=2; i<(2+num_scalars); i++) - 
scalars.push_back(context.scalars()[i]); + std::vector scalars; + for (size_t i = 2; i < (2 + num_scalars); i++) scalars.push_back(context.scalars()[i]); - EvalUdfArgs args{context.scalars()[2+num_scalars].value(), + EvalUdfArgs args{context.scalars()[2 + num_scalars].value(), context.inputs(), context.outputs(), scalars, num_outputs, context.get_current_processor()}; - size_t dim=1; - if (args.inputs.size()>0){ + size_t dim = 1; + if (args.inputs.size() > 0) { dim = args.inputs[0].dim() == 0 ? 1 : args.inputs[0].dim(); double_dispatch(dim, args.inputs[0].code(), EvalUdfCPU{}, args); + } else { + // FIXME + double_dispatch(dim, args.inputs[0].code(), EvalUdfCPU{}, args); } - else{ - //FIXME - double_dispatch(dim, args.inputs[0].code() , EvalUdfCPU{}, args); - } } namespace // unnamed diff --git a/src/cunumeric/vectorize/eval_udf.cu b/src/cunumeric/vectorize/eval_udf.cu index 7d03af0341..f0a7844dc4 100644 --- a/src/cunumeric/vectorize/eval_udf.cu +++ b/src/cunumeric/vectorize/eval_udf.cu @@ -22,38 +22,38 @@ namespace cunumeric { -//using namespace Legion; +// using namespace Legion; using namespace legate; struct EvalUdfGPU { template void operator()(EvalUdfArgs& args) const { - using VAL = legate_type_of; - Rect rect; + using VAL = legate_type_of; + Rect rect; - // size_t input_size=args.inputs.size()-1; - size_t input_size=args.inputs.size(); - // auto procs_rect = args.inputs[input_size].shape<1>(); + // size_t input_size=args.inputs.size()-1; + size_t input_size = args.inputs.size(); + // auto procs_rect = args.inputs[input_size].shape<1>(); - //auto procs=args.inputs[input_size].read_accessor(); - //auto funcs=args.inputs[input_size+1].read_accessor(); - CUfunction func = get_udf(args.hash); + // auto procs=args.inputs[input_size].read_accessor(); + // auto funcs=args.inputs[input_size+1].read_accessor(); + CUfunction func = get_udf(args.hash); // Filling up the buffer with arguments - size_t buffer_size = (input_size+args.scalars.size()) * sizeof(void*); - buffer_size +=sizeof(size_t);//size - buffer_size += sizeof(size_t);//dim - buffer_size += sizeof(void*);//pitches - buffer_size += sizeof(void*);//strides + size_t buffer_size = (input_size + args.scalars.size()) * sizeof(void*); + buffer_size += sizeof(size_t); // size + buffer_size += sizeof(size_t); // dim + buffer_size += sizeof(void*); // pitches + buffer_size += sizeof(void*); // strides std::vector arg_buffer(buffer_size); char* raw_arg_buffer = arg_buffer.data(); auto p = raw_arg_buffer; size_t strides[DIM]; - size_t size =1; - if (input_size>0){ + size_t size = 1; + if (input_size > 0) { rect = args.inputs[0].shape(); size = rect.volume(); for (size_t i = 0; i < input_size; i++) { @@ -67,34 +67,31 @@ struct EvalUdfGPU { p += sizeof(void*); } } - for (auto scalar: args.scalars){ - memcpy(p, scalar.ptr(), scalar.size()); - p += scalar.size(); - // *reinterpret_cast(p) =s; - //p += sizeof(void*); - } + for (auto scalar : args.scalars) { + memcpy(p, scalar.ptr(), scalar.size()); + p += scalar.size(); + // *reinterpret_cast(p) =s; + // p += sizeof(void*); + } memcpy(p, &size, sizeof(size_t)); - size_t dim=DIM; + size_t dim = DIM; p += sizeof(size_t); memcpy(p, &dim, sizeof(size_t)); p += sizeof(size_t); Pitches pitches; size_t volume = pitches.flatten(rect); - //create buffers for pitches, lower point and strides since - //we need to pass pointer to device memory - auto device_pitches = create_buffer(Point<1>(DIM-1), Memory::Kind::Z_COPY_MEM); - auto device_strides = create_buffer(Point<1>(DIM), Memory::Kind::Z_COPY_MEM); - for 
(size_t i=0; i(i)]=pitches.data()[i]; - } + // create buffers for pitches, lower point and strides since + // we need to pass pointer to device memory + auto device_pitches = create_buffer(Point<1>(DIM - 1), Memory::Kind::Z_COPY_MEM); + auto device_strides = create_buffer(Point<1>(DIM), Memory::Kind::Z_COPY_MEM); + for (size_t i = 0; i < DIM; i++) { + if (i != DIM - 1) { device_pitches[Point<1>(i)] = pitches.data()[i]; } device_strides[Point<1>(i)] = strides[i]; } - *reinterpret_cast(p) =device_pitches.ptr(Point<1>(0)); + *reinterpret_cast(p) = device_pitches.ptr(Point<1>(0)); p += sizeof(void*); - *reinterpret_cast(p) =device_strides.ptr(Point<1>(0)); + *reinterpret_cast(p) = device_strides.ptr(Point<1>(0)); p += sizeof(void*); - void* config[] = { CU_LAUNCH_PARAM_BUFFER_POINTER, @@ -127,16 +124,13 @@ struct EvalUdfGPU { /*static*/ void EvalUdfTask::gpu_variant(TaskContext& context) { - uint32_t num_outputs = context.scalars()[0].value(); uint32_t num_scalars = context.scalars()[1].value(); - std::vectorscalars; - for (size_t i=2; i<(2+num_scalars); i++) - scalars.push_back(context.scalars()[i]); - - int64_t ptx_hash = context.scalars()[2+num_scalars].value(); - //bool is_created = context.scalars()[3+num_scalars].value(); + std::vector scalars; + for (size_t i = 2; i < (2 + num_scalars); i++) scalars.push_back(context.scalars()[i]); + int64_t ptx_hash = context.scalars()[2 + num_scalars].value(); + // bool is_created = context.scalars()[3+num_scalars].value(); EvalUdfArgs args{0, context.inputs(), @@ -145,15 +139,14 @@ struct EvalUdfGPU { num_outputs, context.get_current_processor(), ptx_hash}; - size_t dim=1; - if (args.inputs.size()>0){ + size_t dim = 1; + if (args.inputs.size() > 0) { dim = args.inputs[0].dim() == 0 ? 1 : args.inputs[0].dim(); double_dispatch(dim, args.inputs[0].code(), EvalUdfGPU{}, args); - } - else{ - //FIXME + } else { + // FIXME double_dispatch(dim, args.inputs[0].code(), EvalUdfGPU{}, args); - //double_dispatch(dim, 0 , EvalUdfGPU{}, args); + // double_dispatch(dim, 0 , EvalUdfGPU{}, args); } } } // namespace cunumeric diff --git a/src/cunumeric/vectorize/eval_udf.h b/src/cunumeric/vectorize/eval_udf.h index 2e14f0dacc..784e2334b1 100644 --- a/src/cunumeric/vectorize/eval_udf.h +++ b/src/cunumeric/vectorize/eval_udf.h @@ -25,10 +25,10 @@ struct EvalUdfArgs { uint64_t cpu_func_ptr; std::vector& inputs; std::vector& outputs; - std::vectorscalars; + std::vector scalars; uint32_t num_outputs; Legion::Processor point; - int64_t hash=0; + int64_t hash = 0; }; class EvalUdfTask : public CuNumericTask { diff --git a/src/cunumeric/vectorize/eval_udf_template.inl b/src/cunumeric/vectorize/eval_udf_template.inl index c597e44a14..c0bc72c428 100644 --- a/src/cunumeric/vectorize/eval_udf_template.inl +++ b/src/cunumeric/vectorize/eval_udf_template.inl @@ -38,7 +38,7 @@ struct EvalUdfImpl { std::vector udf_args; using VAL = legate_type_of; auto rect = args.args[0].shape(); - + size_t strides[DIM]; if (rect.empty()) return; @@ -46,8 +46,8 @@ struct EvalUdfImpl { for (size_t i = 0; i < args.args.size(); i++) { auto out = args.args[i].write_accessor(rect); udf_args.push_back(reinterpret_cast(out.ptr(rect, strides))); - for (size_t i=0; i 0: type_a = self._args[0].dtype diff --git a/src/cunumeric/vectorize/create_cu_kernel.cu b/src/cunumeric/vectorize/create_cu_kernel.cu index ce5b70a432..dde9616543 100644 --- a/src/cunumeric/vectorize/create_cu_kernel.cu +++ b/src/cunumeric/vectorize/create_cu_kernel.cu @@ -94,10 +94,6 @@ __global__ static void 
__launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM) assert(result == CUDA_SUCCESS); #endif store_udf(ptx_hash, func); - // auto stream = get_cached_stream(); - // auto out = context.reductions()[0].reduce_accessor, true, 1>(); - // fill_out_kernel<<<1,1,0,stream>>>(out); - // CHECK_CUDA_STREAM(stream); } } // namespace cunumeric diff --git a/src/cunumeric/vectorize/eval_udf.cc b/src/cunumeric/vectorize/eval_udf.cc index fe2c20bcc0..5a40260a1d 100644 --- a/src/cunumeric/vectorize/eval_udf.cc +++ b/src/cunumeric/vectorize/eval_udf.cc @@ -26,7 +26,7 @@ struct EvalUdfCPU { void operator()(EvalUdfArgs& args) const { // In the case of CPU, we pack arguments in a vector and pass them to the - // function (through the function pointer geenrated by numba) + // function (through the function pointer generated by numba) using UDF = void(void**, size_t, size_t, uint32_t*, uint32_t*); auto udf = reinterpret_cast(args.cpu_func_ptr); std::vector udf_args; @@ -77,8 +77,8 @@ struct EvalUdfCPU { dim = args.inputs[0].dim() == 0 ? 1 : args.inputs[0].dim(); double_dispatch(dim, args.inputs[0].code(), EvalUdfCPU{}, args); } else { - // FIXME - double_dispatch(dim, args.inputs[0].code(), EvalUdfCPU{}, args); + LegateTypeCode code = LegateTypeCode::BOOL_LT ; + double_dispatch(dim, code, EvalUdfCPU{}, args); } } diff --git a/src/cunumeric/vectorize/eval_udf.cu b/src/cunumeric/vectorize/eval_udf.cu index f0a7844dc4..ebe3e0730e 100644 --- a/src/cunumeric/vectorize/eval_udf.cu +++ b/src/cunumeric/vectorize/eval_udf.cu @@ -32,15 +32,10 @@ struct EvalUdfGPU { using VAL = legate_type_of; Rect rect; - // size_t input_size=args.inputs.size()-1; size_t input_size = args.inputs.size(); - // auto procs_rect = args.inputs[input_size].shape<1>(); - - // auto procs=args.inputs[input_size].read_accessor(); - // auto funcs=args.inputs[input_size+1].read_accessor(); CUfunction func = get_udf(args.hash); - // Filling up the buffer with arguments + // Filling up the buffer with arguments size_t buffer_size = (input_size + args.scalars.size()) * sizeof(void*); buffer_size += sizeof(size_t); // size buffer_size += sizeof(size_t); // dim @@ -70,8 +65,6 @@ struct EvalUdfGPU { for (auto scalar : args.scalars) { memcpy(p, scalar.ptr(), scalar.size()); p += scalar.size(); - // *reinterpret_cast(p) =s; - // p += sizeof(void*); } memcpy(p, &size, sizeof(size_t)); size_t dim = DIM; @@ -80,7 +73,7 @@ struct EvalUdfGPU { p += sizeof(size_t); Pitches pitches; size_t volume = pitches.flatten(rect); - // create buffers for pitches, lower point and strides since + // create buffers for pitches and strides since // we need to pass pointer to device memory auto device_pitches = create_buffer(Point<1>(DIM - 1), Memory::Kind::Z_COPY_MEM); auto device_strides = create_buffer(Point<1>(DIM), Memory::Kind::Z_COPY_MEM); @@ -130,7 +123,6 @@ struct EvalUdfGPU { for (size_t i = 2; i < (2 + num_scalars); i++) scalars.push_back(context.scalars()[i]); int64_t ptx_hash = context.scalars()[2 + num_scalars].value(); - // bool is_created = context.scalars()[3+num_scalars].value(); EvalUdfArgs args{0, context.inputs(), @@ -144,9 +136,8 @@ struct EvalUdfGPU { dim = args.inputs[0].dim() == 0 ? 
1 : args.inputs[0].dim(); double_dispatch(dim, args.inputs[0].code(), EvalUdfGPU{}, args); } else { - // FIXME - double_dispatch(dim, args.inputs[0].code(), EvalUdfGPU{}, args); - // double_dispatch(dim, 0 , EvalUdfGPU{}, args); + LegateTypeCode code = LegateTypeCode::BOOL_LT ; + double_dispatch(dim, code, EvalUdfGPU{}, args); } } } // namespace cunumeric From 725e223e25b067c3d4f5889803d76ee025293731 Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Tue, 21 Mar 2023 09:58:05 -0700 Subject: [PATCH 53/78] parsing python function for return arguments --- cunumeric/vectorize.py | 50 ++++++++++++++++------------- tests/integration/test_vectorize.py | 34 ++++++++++++++++---- 2 files changed, 55 insertions(+), 29 deletions(-) diff --git a/cunumeric/vectorize.py b/cunumeric/vectorize.py index 14b672dd8e..3f3f802e4b 100644 --- a/cunumeric/vectorize.py +++ b/cunumeric/vectorize.py @@ -15,6 +15,7 @@ import inspect import re +import typing # numba typing from typing import Any, Callable, Dict, List, Optional, Union @@ -124,25 +125,27 @@ def __init__( self._kwargs: List[Any] = [] self._context = runtime.legate_context self._created: bool = False - self._num_outputs = 1 # there is at least 1 output + self._func_body: List[str]=[] if doc is None: self.__doc__ = pyfunc.__doc__ else: self.__doc__ = doc + self._return_arguments = self._get_return_argumets() + self._num_outputs = len(self._return_arguments) + if otypes is not None: - self._num_outputs = len(otypes) - if self._num_outputs == 0: - raise ValueError( - "There should be at least 1 type specified in otypes" - ) - type0 = otypes[0] - for t in otypes: - if t != type0: - raise NotImplementedError( - "cuNumeric doesn't support variable types in otypes" - ) + if self._num_outputs !=len(otypes): + raise ValueError("number of types in otypes is not consistente" + " with the number of return values difened in pyfunc") + if len(otypes)>1: + for t in otypes: + if t != otypes[0]: + raise NotImplementedError( + "cuNumeric doesn't support variable types in otypes" + ) + # FIXME if excluded is not None: @@ -156,14 +159,6 @@ def __init__( "signature variable is not supported yet" ) - # FIXME check return of the user function - # return annotation (we supprt only void) - - # if inspect.signature(self._pyfunc).return_annotation() - # != inspect._empty: - # raise NotImplementedError( - # "user defined functions can't have a return" - # ) def _get_func_body(self, func: Callable[[Any], Any]) -> list[str]: """Using the magic method __doc__, we KNOW the size of the docstring. 
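The next hunk adds return-name parsing for the user function. A minimal standalone sketch of that idea, assuming the function source is reachable via inspect.getsource (the helper name and the plain string scan below are illustrative only, not the patched _get_return_argumets, which reuses the body already recovered by _get_func_body):

    import inspect

    def sketch_return_names(pyfunc):
        # Scan the source for the first `return` statement and split the
        # returned expression on commas to recover the result names.
        # Deliberately simplified: no AST handling, no nested functions.
        for line in inspect.getsource(pyfunc).splitlines():
            stripped = line.strip()
            if stripped == "return" or stripped.startswith("return "):
                rest = stripped[len("return"):].strip()
                return [n.strip() for n in rest.split(",")] if rest else []
        return []

    def my_func2(A0, B0):
        A0 = A0 * 2 + B0
        C0 = A0 * 2
        return A0, C0

    assert sketch_return_names(my_func2) == ["A0", "C0"]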
@@ -180,6 +175,17 @@ def _get_func_body(self, func: Callable[[Any], Any]) -> list[str]: return_lines.append(lines[i].rstrip()) return return_lines + def _get_return_argumets(self)->list[str]: + self._func_body = self._get_func_body(self._pyfunc) + return_names = [] + for l in self._func_body: + if "return" in l: + l = l.replace("return", '') + l=l.replace(" ",'') + return_names = l.split(",") + return return_names + + def _replace_name( self, name: str, _LOOP_VAR: str, is_gpu: bool = False ) -> str: @@ -240,7 +246,7 @@ def _lift_to_array_access(m: Any) -> str: return self._replace_name(m.group(0), _LOOP_VAR, True) # kernel body - lines_old = self._get_func_body(self._pyfunc) + lines_old = self._func_body for line in lines_old: l_new = re.sub(r"[_a-zA-Z]\w*", _lift_to_array_access, line) lines.append(l_new) @@ -329,7 +335,7 @@ def _emit_assignment( ) ) - lines_old = self._get_func_body(self._pyfunc) + lines_old = self._func_body # Kernel body def _lift_to_array_access(m: Any) -> str: diff --git a/tests/integration/test_vectorize.py b/tests/integration/test_vectorize.py index 520267ab76..212c4f4ac0 100644 --- a/tests/integration/test_vectorize.py +++ b/tests/integration/test_vectorize.py @@ -23,33 +23,54 @@ def my_func(a, b): a = a * 2 + b + return a def my_func_np(a, b): a = a * 2 + b return a - +#Capital letters and numbers in the signature def my_func2(A0, B0): A0 = A0 * 2 + B0 + return A0 def my_func_np2(A0, B0): A0 = A0 * 2 + B0 return A0 +def empty_func(): + print("within empty function") + def test_vectorize(): + #2 arrays func = num.vectorize(my_func) a = num.arange(5) b = num.ones((5,)) func(a, b) assert(np.array_equal(a, [1,3,5,7,9])) + #array and scalar func = num.vectorize(my_func) a= num.arange(5) b=2 func(a,b) assert(np.array_equal(a, [2,4,6,8,10])) + #2 scalars + #FIXME + #func = num.vectorize(my_func) + #a=3 + #b=2 + #func(a,b) + #assert(a ==8) + + #empty function + func = num.vectorize(empty_func) + func() + + #slices + func = num.vectorize(my_func) num.vectorize(my_func) a=num.array([[1,2,3],[4,5,6],[7,8,9]]) b=num.array([[10,11,12],[13,14,15],[16,17,18]]) @@ -57,44 +78,43 @@ def test_vectorize(): a=np.arange(100).reshape((25,4)) a_num= num.array(a) - b=a*10 b_num=a_num*10 func_np = np.vectorize(my_func_np) func_num=num.vectorize(my_func) - a=func_np(a,b) func_num(a_num, b_num) assert np.array_equal(a, a_num) + #reusing the same function for different inputs a[:,2]=func_np(a[:, 2], b[:,2]) func_num(a_num[:,2],b_num[:,2]) assert np.array_equal(a, a_num) + #reusing the same function for different inputs a[5:10,2]=func_np(a[5:10, 2], b[1:6,2]) func_num(a_num[5:10,2],b_num[1:6,2]) assert np.array_equal(a, a_num) + #reusing the same function for different inputs a[15:20]=func_np(a[15:20], b[15:20]) func_num(a_num[15:20],b_num[15:20]) assert np.array_equal(a, a_num) + # reusing the same function for different inputs a=np.arange(1000).reshape((25,10,4)) a_num= num.array(a) - a[:, 2, :] = func_np(a[:, 2, :],2) func_num(a_num[:, 2, :],2) assert np.array_equal(a, a_num) + #checking signature with capital letters and numbers a=np.arange(100).reshape((25,4)) a_num= num.array(a) - b=a*10 b_num=a_num*10 - func_np = np.vectorize(my_func_np2) func_num=num.vectorize(my_func2) - a=func_np(a,b) func_num(a_num, b_num) assert np.array_equal(a, a_num) From 905fa95c5e12ec1928a49e3f6dec97f6915ac6da Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Tue, 21 Mar 2023 11:11:30 -0700 Subject: [PATCH 54/78] removing dependency on six and some clean-up --- cunumeric/vectorize.py | 34 
++++++++++++++++++++-------------- 1 file changed, 20 insertions(+), 14 deletions(-) diff --git a/cunumeric/vectorize.py b/cunumeric/vectorize.py index 3f3f802e4b..996326c4e7 100644 --- a/cunumeric/vectorize.py +++ b/cunumeric/vectorize.py @@ -24,7 +24,7 @@ import numba import numba.core.ccallback import numpy as np -import six +#import six from legate.core import Rect, track_provenance from cunumeric.runtime import runtime @@ -60,12 +60,13 @@ def __init__( Define a vectorized function which takes a nested sequence of objects or numpy arrays as inputs and returns a single numpy array or a tuple of numpy arrays. - The vectorized function evaluates `pyfunc` over successive tuples - of the input arrays like the python map function, except it uses the - broadcasting rules of numpy. + User defined pyfunction will be executed in a single cuNumeric task + over a set of arguments. The data type of the output of `vectorized` is determined by calling the function with the first element of the input. This can be avoided by specifying the `otypes` argument. + WARNING: when running with OpenMP back-end, "vectorize" will fall-back + to the serial CPU implementation Parameters ---------- @@ -87,7 +88,12 @@ def __init__( WARNING: cuNumeric doesn't suport this argument at the moment cache : bool, optional If `True`, then cache the first function call that generates C fun- - ction or CUDA kernel + ction or CUDA kernel. We recomment enabling caching in cuNumeric + for better performance, when possible. + Warning: in the case when cache=True, cuNumeric will parse function + signature and create C function or CUDA kernel only once. This + means that types of arguments passed to the vectorized function + (arrays, scalars etc) should be the same each time we call it. signature : string, optional Generalized universal function signature, e.g., ``(m,n),(n)->(m)`` for vectorized matrix-vector multiplication. If provided, @@ -132,8 +138,8 @@ def __init__( else: self.__doc__ = doc - self._return_arguments = self._get_return_argumets() - self._num_outputs = len(self._return_arguments) + self._return_argnames = self._get_return_argumets() + self._num_outputs = len(self._return_argnames) if otypes is not None: if self._num_outputs !=len(otypes): @@ -192,7 +198,7 @@ def _replace_name( if name in self._argnames and not (name in self._scalar_names): return "{}[int({})]".format(name, _LOOP_VAR) else: - if is_gpu: + if is_gpu or ((not is_gpu) and not (name in self._scalar_names)) : return "{}".format(name) else: return "{}[0]".format(name) @@ -254,7 +260,7 @@ def _lift_to_array_access(m: Any) -> str: # Evaluate the string to get the Python function body = "\n".join(lines) glbs: Dict[str, Any] = {} - six.exec_(body, glbs) + exec(body, glbs) return glbs[funcid] def _build_cpu_function(self) -> Callable[[Any], Any]: @@ -348,7 +354,7 @@ def _lift_to_array_access(m: Any) -> str: # Evaluate the string to get the Python function body = "\n".join(lines) glbs: Dict[str, Any] = {} - six.exec_(body, glbs) + exec(body, glbs) return glbs[funcid] def _get_numba_types(self, need_pointer: bool = True) -> list[Any]: @@ -445,11 +451,11 @@ def _execute(self, is_gpu: bool, num_gpus: int = 0) -> None: task.execute() def __call__(self, *args: Any, **kwargs: Any) -> None: - """ - Return arrays with the results of `pyfunc` broadcast (vectorized) over - `args` and `kwargs` not in `excluded`. 
- """ if not self._created: + # the case when we execute `__call__` the first time or + # when cache=False: + # each time we call `vectorize` on a pyfunc we need to clear + # these lists to support different types of arguments passed self._scalar_args.clear() self._scalar_idxs.clear() self._args.clear() From 9bdefe9d09fe09e69b624472af0928c303cb7fc1 Mon Sep 17 00:00:00 2001 From: Bryan Van de Ven Date: Tue, 21 Mar 2023 16:00:20 -0700 Subject: [PATCH 55/78] updates for docs --- cunumeric/vectorize.py | 13 +++-- docs/cunumeric/source/api/_vectorize.rst | 7 +-- docs/cunumeric/source/api/functional.rst | 7 +++ docs/cunumeric/source/api/routines.rst | 1 + docs/cunumeric/source/api/vectorize.rst | 15 ------ tests/integration/test_vectorize.py | 69 ++++++++++++------------ 6 files changed, 53 insertions(+), 59 deletions(-) create mode 100644 docs/cunumeric/source/api/functional.rst delete mode 100644 docs/cunumeric/source/api/vectorize.rst diff --git a/cunumeric/vectorize.py b/cunumeric/vectorize.py index 14b672dd8e..1954c0c931 100644 --- a/cunumeric/vectorize.py +++ b/cunumeric/vectorize.py @@ -15,8 +15,6 @@ import inspect import re - -# numba typing from typing import Any, Callable, Dict, List, Optional, Union import legate.core.types as ty @@ -55,6 +53,7 @@ def __init__( """ vectorize(pyfunc, otypes=None, doc=None, excluded=None, cache=False, signature=None) + Generalized function class. Define a vectorized function which takes a nested sequence of objects or numpy arrays as inputs and returns a single numpy array @@ -417,7 +416,7 @@ def _execute(self, is_gpu: bool, num_gpus: int = 0) -> None: task.add_scalar_arg(a, dtype) # add array arguments - if len (self._args)>0: + if len(self._args) > 0: a0 = self._args[0]._thunk a0 = runtime.to_deferred_array(a0) for count, a in enumerate(self._args): @@ -478,10 +477,10 @@ def __call__(self, *args: Any, **kwargs: Any) -> None: "kwargs are not supported in user functions" ) - if self._num_outputs==0 or len(self._args)==0: - #execute function that doesn't modify anything: - self._pyfunc() - return + if self._num_outputs == 0 or len(self._args) == 0: + # execute function that doesn't modify anything: + self._pyfunc() + return # all output arrays should have the same type if len(self._args) > 0: diff --git a/docs/cunumeric/source/api/_vectorize.rst b/docs/cunumeric/source/api/_vectorize.rst index 048e8ab51d..c096e320de 100644 --- a/docs/cunumeric/source/api/_vectorize.rst +++ b/docs/cunumeric/source/api/_vectorize.rst @@ -1,7 +1,7 @@ cunumeric.vectorize -============================= +=================== -.. currentmodule:: cunumeric +.. currentmodule:: cunumeric.vectorize .. autoclass:: vectorize @@ -9,5 +9,6 @@ cunumeric.vectorize .. rubric:: Methods - .. autosummary:: __call__ + .. automethod:: __call__ + .. autosummary:: diff --git a/docs/cunumeric/source/api/functional.rst b/docs/cunumeric/source/api/functional.rst new file mode 100644 index 0000000000..4d35618ebf --- /dev/null +++ b/docs/cunumeric/source/api/functional.rst @@ -0,0 +1,7 @@ +Functional programming +====================== + +.. 
toctree:: + :maxdepth: 2 + + _vectorize diff --git a/docs/cunumeric/source/api/routines.rst b/docs/cunumeric/source/api/routines.rst index e85a5c65b0..5f0451584e 100644 --- a/docs/cunumeric/source/api/routines.rst +++ b/docs/cunumeric/source/api/routines.rst @@ -13,6 +13,7 @@ Routines logic math fft + functional random set sorting diff --git a/docs/cunumeric/source/api/vectorize.rst b/docs/cunumeric/source/api/vectorize.rst deleted file mode 100644 index 5e6cbea66b..0000000000 --- a/docs/cunumeric/source/api/vectorize.rst +++ /dev/null @@ -1,15 +0,0 @@ -.. module:: cunumeric.vectorize - -========================================= - -Vectorize ------------------ - -.. toctree:: - :maxdepth: 2 - :hidden: - - _vectorize - - - diff --git a/tests/integration/test_vectorize.py b/tests/integration/test_vectorize.py index 520267ab76..44877cc997 100644 --- a/tests/integration/test_vectorize.py +++ b/tests/integration/test_vectorize.py @@ -14,16 +14,16 @@ # -# import numpy as np +import numpy as np import pytest import cunumeric as num -import numpy as np def my_func(a, b): a = a * 2 + b + def my_func_np(a, b): a = a * 2 + b return a @@ -32,6 +32,7 @@ def my_func_np(a, b): def my_func2(A0, B0): A0 = A0 * 2 + B0 + def my_func_np2(A0, B0): A0 = A0 * 2 + B0 return A0 @@ -42,66 +43,66 @@ def test_vectorize(): a = num.arange(5) b = num.ones((5,)) func(a, b) - assert(np.array_equal(a, [1,3,5,7,9])) + assert np.array_equal(a, [1, 3, 5, 7, 9]) func = num.vectorize(my_func) - a= num.arange(5) - b=2 - func(a,b) - assert(np.array_equal(a, [2,4,6,8,10])) - + a = num.arange(5) + b = 2 + func(a, b) + assert np.array_equal(a, [2, 4, 6, 8, 10]) + num.vectorize(my_func) - a=num.array([[1,2,3],[4,5,6],[7,8,9]]) - b=num.array([[10,11,12],[13,14,15],[16,17,18]]) - func(a[:2],b[:2]) + a = num.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]) + b = num.array([[10, 11, 12], [13, 14, 15], [16, 17, 18]]) + func(a[:2], b[:2]) - a=np.arange(100).reshape((25,4)) - a_num= num.array(a) + a = np.arange(100).reshape((25, 4)) + a_num = num.array(a) - b=a*10 - b_num=a_num*10 + b = a * 10 + b_num = a_num * 10 func_np = np.vectorize(my_func_np) - func_num=num.vectorize(my_func) + func_num = num.vectorize(my_func) - a=func_np(a,b) + a = func_np(a, b) func_num(a_num, b_num) assert np.array_equal(a, a_num) - a[:,2]=func_np(a[:, 2], b[:,2]) - func_num(a_num[:,2],b_num[:,2]) + a[:, 2] = func_np(a[:, 2], b[:, 2]) + func_num(a_num[:, 2], b_num[:, 2]) assert np.array_equal(a, a_num) - a[5:10,2]=func_np(a[5:10, 2], b[1:6,2]) - func_num(a_num[5:10,2],b_num[1:6,2]) + a[5:10, 2] = func_np(a[5:10, 2], b[1:6, 2]) + func_num(a_num[5:10, 2], b_num[1:6, 2]) assert np.array_equal(a, a_num) - a[15:20]=func_np(a[15:20], b[15:20]) - func_num(a_num[15:20],b_num[15:20]) + a[15:20] = func_np(a[15:20], b[15:20]) + func_num(a_num[15:20], b_num[15:20]) assert np.array_equal(a, a_num) - a=np.arange(1000).reshape((25,10,4)) - a_num= num.array(a) + a = np.arange(1000).reshape((25, 10, 4)) + a_num = num.array(a) - a[:, 2, :] = func_np(a[:, 2, :],2) - func_num(a_num[:, 2, :],2) + a[:, 2, :] = func_np(a[:, 2, :], 2) + func_num(a_num[:, 2, :], 2) assert np.array_equal(a, a_num) - a=np.arange(100).reshape((25,4)) - a_num= num.array(a) + a = np.arange(100).reshape((25, 4)) + a_num = num.array(a) - b=a*10 - b_num=a_num*10 + b = a * 10 + b_num = a_num * 10 func_np = np.vectorize(my_func_np2) - func_num=num.vectorize(my_func2) + func_num = num.vectorize(my_func2) - a=func_np(a,b) + a = func_np(a, b) func_num(a_num, b_num) assert np.array_equal(a, a_num) - if __name__ == "__main__": 
import sys + np.random.seed(12345) sys.exit(pytest.main(sys.argv)) From c0278f7f5f177134614372ab16419a1677bc0f10 Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Wed, 22 Mar 2023 13:24:42 -0700 Subject: [PATCH 56/78] adding logic for parsing returns from UDF --- cunumeric/vectorize.py | 195 +++++++++++++++++++--------- src/cunumeric/vectorize/eval_udf.cc | 3 +- 2 files changed, 136 insertions(+), 62 deletions(-) diff --git a/cunumeric/vectorize.py b/cunumeric/vectorize.py index 996326c4e7..63f29af1de 100644 --- a/cunumeric/vectorize.py +++ b/cunumeric/vectorize.py @@ -24,7 +24,6 @@ import numba import numba.core.ccallback import numpy as np -#import six from legate.core import Rect, track_provenance from cunumeric.runtime import runtime @@ -140,17 +139,21 @@ def __init__( self._return_argnames = self._get_return_argumets() self._num_outputs = len(self._return_argnames) - + self._return_args=[] + self._output_shape :Optional[tuple[Any]]= None + self._output_dtype: Optional[np.dtype[Any]] = None + if otypes is not None: if self._num_outputs !=len(otypes): raise ValueError("number of types in otypes is not consistente" - " with the number of return values difened in pyfunc") + " with the number of return values defined in pyfunc") if len(otypes)>1: for t in otypes: if t != otypes[0]: raise NotImplementedError( "cuNumeric doesn't support variable types in otypes" ) + self._output_dtype = otypes[0] # FIXME @@ -195,7 +198,7 @@ def _get_return_argumets(self)->list[str]: def _replace_name( self, name: str, _LOOP_VAR: str, is_gpu: bool = False ) -> str: - if name in self._argnames and not (name in self._scalar_names): + if (name in self._argnames) or (name in self._return_argnames ): return "{}[int({})]".format(name, _LOOP_VAR) else: if is_gpu or ((not is_gpu) and not (name in self._scalar_names)) : @@ -214,7 +217,9 @@ def _build_gpu_function(self) -> Any: # Signature args = ( - self._argnames + self._return_argnames + + self._argnames + + self._scalar_names + [_SIZE_VAR] + [_DIM_VAR] + [_PITCHES_VAR] @@ -254,8 +259,9 @@ def _lift_to_array_access(m: Any) -> str: # kernel body lines_old = self._func_body for line in lines_old: - l_new = re.sub(r"[_a-zA-Z]\w*", _lift_to_array_access, line) - lines.append(l_new) + if not ( "return" in line): + l_new = re.sub(r"[_a-zA-Z]\w*", _lift_to_array_access, line) + lines.append(l_new) # Evaluate the string to get the Python function body = "\n".join(lines) @@ -289,35 +295,32 @@ def _emit_assignment( var: Any, idx: int, sz: Any, - ty: np.dtype[Any], - scalar: bool = False, + ty: np.dtype[Any] ) -> None: - if scalar: - # we represent scalars as arrays of size 1 - lines.append( - " {} = carray({}[{}], 1, types.{})".format( - var, _ARGS_VAR, idx, ty - ) - ) - else: - lines.append( - " {} = carray({}[{}], {}, types.{})".format( - var, _ARGS_VAR, idx, sz, ty - ) + lines.append( + " {} = carray({}[{}], {}, types.{})".format( + var, _ARGS_VAR, idx, sz, ty ) + ) # define pyfunc arguments ar carrays arg_idx = 0 - for a in self._args: + for count, a in enumerate(self._return_args): type_a = a.dtype _emit_assignment( - self._argnames[arg_idx], arg_idx, _SIZE_VAR, type_a + self._return_argnames[count], arg_idx, _SIZE_VAR, type_a ) arg_idx += 1 - for a in self._scalar_args: + for count,a in enumerate(self._args): + type_a = a.dtype + _emit_assignment( + self._argnames[count], arg_idx, _SIZE_VAR, type_a + ) + arg_idx += 1 + for count, a in enumerate(self._scalar_args): scalar_type = np.dtype(type(a).__name__) _emit_assignment( - self._argnames[arg_idx], arg_idx, _SIZE_VAR, 
scalar_type, True + self._scalar_names[count], arg_idx, 1, scalar_type ) arg_idx += 1 @@ -348,8 +351,9 @@ def _lift_to_array_access(m: Any) -> str: return self._replace_name(m.group(0), _LOOP_VAR) for line in lines_old: - l_new = re.sub(r"[_a-zA-Z]\w*", _lift_to_array_access, line) - lines.append(" " + l_new) + if not ( "return" in line): + l_new = re.sub(r"[_a-zA-Z]\w*", _lift_to_array_access, line) + lines.append(" " + l_new) # Evaluate the string to get the Python function body = "\n".join(lines) @@ -359,6 +363,12 @@ def _lift_to_array_access(m: Any) -> str: def _get_numba_types(self, need_pointer: bool = True) -> list[Any]: types = [] + for arg in self._return_args: + type_a = arg.dtype + type_a = str(type_a) if type_a != bool else "int8" + type_a = getattr(numba.core.types, type_a) + type_a = numba.core.types.CPointer(type_a) + types.append(type_a) for arg in self._args: type_a = arg.dtype type_a = str(type_a) if type_a != bool else "int8" @@ -428,18 +438,28 @@ def _execute(self, is_gpu: bool, num_gpus: int = 0) -> None: dtype = convert_to_cunumeric_dtype(type(a).__name__) task.add_scalar_arg(a, dtype) - # add array arguments - if len (self._args)>0: - a0 = self._args[0]._thunk + # add return arguments + a0=None + if len (self._return_args)>0: + a0 = self._return_args[0]._thunk a0 = runtime.to_deferred_array(a0) - for count, a in enumerate(self._args): + for count, a in enumerate(self._return_args): a_tmp = runtime.to_deferred_array(a._thunk) a_tmp_base = a_tmp.base task.add_input(a_tmp_base) - if count < self._num_outputs: - task.add_output(a_tmp_base) + task.add_output(a_tmp_base) if count != 0: task.add_alignment(a0.base, a_tmp_base) + # add array arguments + if len (self._args)>0: + if a0 is None: + a0 = self._args[0]._thunk + a0 = runtime.to_deferred_array(a0) + for count, a in enumerate(self._args): + a_tmp = runtime.to_deferred_array(a._thunk) + a_tmp_base = a_tmp.base + task.add_input(a_tmp_base) + task.add_alignment(a0.base, a_tmp_base) if is_gpu: ptx_hash = hash(self._gpu_func[0]) @@ -482,45 +502,98 @@ def __call__(self, *args: Any, **kwargs: Any) -> None: for i, k in enumerate(inspect.signature(self._pyfunc).parameters): if i in self._scalar_idxs: self._scalar_names.append(k) - self._argnames.append(k) self._kwargs = list(kwargs) - if len(self._kwargs) > 1: + if len(self._kwargs) > 0: raise NotImplementedError( "kwargs are not supported in user functions" ) - if self._num_outputs==0 or len(self._args)==0: - #execute function that doesn't modify anything: - self._pyfunc() - return - - # all output arrays should have the same type - if len(self._args) > 0: - type_a = self._args[0].dtype - shape = self._args[0].shape - for i in range(1, self._num_outputs): - if type_a != self._args[i].dtype: - raise TypeError( - "cuNumeric doesnt support " - "different types for output data in " - "user function passed to vectorize" - ) - if shape != self._args[i].shape: - raise TypeError( - "cuNumeric doesnt support " - "different shapes for output data in " - "user function passed to vectorize" + #we need to do ther rest each time `__call__` is executed + output_shape = self._output_shape + output_dtype = self._output_dtype + self._return_args.clear() + # if output type is not specified, we need to decide + # which one to use + # we also want to choose the shape for output array + + # check if output variable is in input arguments - > + # then use it's dtype and shape + for r in self._return_argnames: + if r in self._argnames: + idx = self._argnames.index(r) + if output_dtype is None: + 
output_dtype = self._args[idx].dtype + if output_shape is None: + output_shape = self._args[idx].shape + break + + #the case if we didn't find output argument in input argnames + if output_shape is None: + for r in self._return_argnames: + if r in self._scalar_argnames: + idx = self._scalar_argnames.index(r) + if output_dtype is None: + output_dtype = np.dtype(type(self._scalar_args[idx])) + output_shape = (1,) + break + #FIXME + #we could find common type of input arguments here and + #broadcasted shapes + if self._num_outputs>0 and output_dtype is None: + raise ValueError("Unable to choose output dtype") + if self._num_outputs>0 and output_shape is None: + raise ValueError("Unable to choose output shape") + + + # filing the list of return arguments + # check if there are return argnames in input argnames, + # if not, create a new array + for r in self._return_argnames: + if r in self._argnames: + idx = self._argnames.index(r) + if self._args[idx].shape !=output_shape: + raise ValueError( + "all output arrays should have the same shape") + if output_dtype != self._args[idx].dtype: + runtime.warn( + "converting input array to output types in user func ", + category=RuntimeWarning, ) - for i in range(self._num_outputs, len(self._args)): - if type_a != self._args[i].dtype: + self._args[idx]=self._args[idx].astype(output_dtype) + self._return_args.append(self._args[idx]) + self._args.remove(self._args[idx]) + self._argnames.remove(r) + elif r in self._scalar_names: + idx = self._scalar_names.index(r) + if output_shape != (1,): + raise ValueError( + "all output arrays should have the same shape") + self._return_args.append(full(output_shape,self._scalar_args[idx], output_dtype)) + self._scalar_args.remove(self._scalar_args[idx]) + self._scalar_names.remove(r) + else: + #create array and add it to the list of return_args + tmp_ret = full(output_shape,0, output_dtype) + self._return_args.append(tmp_ret) + #FIXME + #if self._num_outputs==0: + # #execute function that doesn't modify anything: + # self._pyfunc(args) + # return + + # bring all arrays to same type + if len(self._args) > 0: + for count, a in enumerate(self._args): + if output_dtype != a.dtype: runtime.warn( "converting input array to output types in user func ", category=RuntimeWarning, ) - self._args[i] = self._args[i].astype(type_a) - if shape != self._args[i].shape and np.ndim(self._args[i]) > 0: - raise TypeError( + self._args[count] = self._args[count].astype(output_dtype) + #FIXME broadcast shapes + if output_shape != self._args[count].shape : + raise ValueError( "cuNumeric doesnt support " "different shapes for arrays in " "user function passed to vectorize" diff --git a/src/cunumeric/vectorize/eval_udf.cc b/src/cunumeric/vectorize/eval_udf.cc index 5a40260a1d..04133ef671 100644 --- a/src/cunumeric/vectorize/eval_udf.cc +++ b/src/cunumeric/vectorize/eval_udf.cc @@ -72,9 +72,10 @@ struct EvalUdfCPU { scalars, num_outputs, context.get_current_processor()}; - size_t dim = 1; + int dim = 1; if (args.inputs.size() > 0) { dim = args.inputs[0].dim() == 0 ? 
1 : args.inputs[0].dim(); + assert(dim>0); double_dispatch(dim, args.inputs[0].code(), EvalUdfCPU{}, args); } else { LegateTypeCode code = LegateTypeCode::BOOL_LT ; From f6c515540107300e929a65ead0dfcaa440433c1f Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Wed, 22 Mar 2023 13:45:00 -0700 Subject: [PATCH 57/78] making vectorize to return arrays --- cunumeric/vectorize.py | 12 ++++++-- tests/integration/test_vectorize.py | 47 ++++++++++++----------------- 2 files changed, 30 insertions(+), 29 deletions(-) diff --git a/cunumeric/vectorize.py b/cunumeric/vectorize.py index 63f29af1de..b7990e2e31 100644 --- a/cunumeric/vectorize.py +++ b/cunumeric/vectorize.py @@ -31,6 +31,7 @@ from .array import convert_to_cunumeric_ndarray from .config import CuNumericOpCode from .utils import convert_to_cunumeric_dtype +from .module import full _EXTERNAL_REFERENCE_PREFIX = "__extern_ref__" _MASK_VAR = "__mask__" @@ -531,8 +532,8 @@ def __call__(self, *args: Any, **kwargs: Any) -> None: #the case if we didn't find output argument in input argnames if output_shape is None: for r in self._return_argnames: - if r in self._scalar_argnames: - idx = self._scalar_argnames.index(r) + if r in self._scalar_names: + idx = self._scalar_names.index(r) if output_dtype is None: output_dtype = np.dtype(type(self._scalar_args[idx])) output_shape = (1,) @@ -611,3 +612,10 @@ def __call__(self, *args: Any, **kwargs: Any) -> None: if self._cache: self._created = True self._execute(False) + + if len(self._return_args)==1: + return self._return_args[0] + if len(self._return_args)>1: + return tuple(self._return_args) + else: + return diff --git a/tests/integration/test_vectorize.py b/tests/integration/test_vectorize.py index 212c4f4ac0..ee758b0195 100644 --- a/tests/integration/test_vectorize.py +++ b/tests/integration/test_vectorize.py @@ -25,18 +25,11 @@ def my_func(a, b): a = a * 2 + b return a -def my_func_np(a, b): - a = a * 2 + b - return a - #Capital letters and numbers in the signature def my_func2(A0, B0): A0 = A0 * 2 + B0 - return A0 - -def my_func_np2(A0, B0): - A0 = A0 * 2 + B0 - return A0 + C0=A0*2 + return A0,C0 def empty_func(): print("within empty function") @@ -47,23 +40,22 @@ def test_vectorize(): func = num.vectorize(my_func) a = num.arange(5) b = num.ones((5,)) - func(a, b) + a = func(a, b) assert(np.array_equal(a, [1,3,5,7,9])) #array and scalar func = num.vectorize(my_func) a= num.arange(5) b=2 - func(a,b) + a = func(a,b) assert(np.array_equal(a, [2,4,6,8,10])) #2 scalars - #FIXME - #func = num.vectorize(my_func) - #a=3 - #b=2 - #func(a,b) - #assert(a ==8) + func = num.vectorize(my_func) + a=3 + b=2 + a = func(a,b) + assert(a ==8) #empty function func = num.vectorize(empty_func) @@ -74,38 +66,38 @@ def test_vectorize(): num.vectorize(my_func) a=num.array([[1,2,3],[4,5,6],[7,8,9]]) b=num.array([[10,11,12],[13,14,15],[16,17,18]]) - func(a[:2],b[:2]) + a[:2] = func(a[:2],b[:2]) a=np.arange(100).reshape((25,4)) a_num= num.array(a) b=a*10 b_num=a_num*10 - func_np = np.vectorize(my_func_np) + func_np = np.vectorize(my_func) func_num=num.vectorize(my_func) a=func_np(a,b) - func_num(a_num, b_num) + a_num=func_num(a_num, b_num) assert np.array_equal(a, a_num) #reusing the same function for different inputs a[:,2]=func_np(a[:, 2], b[:,2]) - func_num(a_num[:,2],b_num[:,2]) + a_num[:,2] =func_num(a_num[:,2],b_num[:,2]) assert np.array_equal(a, a_num) #reusing the same function for different inputs a[5:10,2]=func_np(a[5:10, 2], b[1:6,2]) - func_num(a_num[5:10,2],b_num[1:6,2]) + 
a_num[5:10,2]=func_num(a_num[5:10,2],b_num[1:6,2]) assert np.array_equal(a, a_num) #reusing the same function for different inputs a[15:20]=func_np(a[15:20], b[15:20]) - func_num(a_num[15:20],b_num[15:20]) + a_num[15:20]=func_num(a_num[15:20],b_num[15:20]) assert np.array_equal(a, a_num) # reusing the same function for different inputs a=np.arange(1000).reshape((25,10,4)) a_num= num.array(a) a[:, 2, :] = func_np(a[:, 2, :],2) - func_num(a_num[:, 2, :],2) + a_num[:, 2, :]=func_num(a_num[:, 2, :],2) assert np.array_equal(a, a_num) #checking signature with capital letters and numbers @@ -113,11 +105,12 @@ def test_vectorize(): a_num= num.array(a) b=a*10 b_num=a_num*10 - func_np = np.vectorize(my_func_np2) + func_np = np.vectorize(my_func2) func_num=num.vectorize(my_func2) - a=func_np(a,b) - func_num(a_num, b_num) + a,c=func_np(a,b) + a_num,c_num = func_num(a_num, b_num) assert np.array_equal(a, a_num) + assert np.array_equal(c, c_num) From 0924e4f84fa637024dc08158b8795359e9e4f2df Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Wed, 22 Mar 2023 17:31:11 -0700 Subject: [PATCH 58/78] adding more tests + code clean-up --- cunumeric/vectorize.py | 46 +++++++++------- tests/integration/test_vectorize.py | 84 ++++++++++++++++++++++++----- 2 files changed, 98 insertions(+), 32 deletions(-) diff --git a/cunumeric/vectorize.py b/cunumeric/vectorize.py index b7990e2e31..370d4dd040 100644 --- a/cunumeric/vectorize.py +++ b/cunumeric/vectorize.py @@ -90,7 +90,7 @@ def __init__( If `True`, then cache the first function call that generates C fun- ction or CUDA kernel. We recomment enabling caching in cuNumeric for better performance, when possible. - Warning: in the case when cache=True, cuNumeric will parse function + WARNING: in the case when cache=True, cuNumeric will parse function signature and create C function or CUDA kernel only once. This means that types of arguments passed to the vectorized function (arrays, scalars etc) should be the same each time we call it. 
@@ -127,7 +127,7 @@ def __init__( self._scalar_args: List[Any] = [] self._scalar_idxs: List[int] = [] self._scalar_names: List[str] = [] - self._argnames: List[str] = [] + self._arg_names: List[str] = [] self._kwargs: List[Any] = [] self._context = runtime.legate_context self._created: bool = False @@ -138,8 +138,8 @@ def __init__( else: self.__doc__ = doc - self._return_argnames = self._get_return_argumets() - self._num_outputs = len(self._return_argnames) + self._return_names = self._get_return_argumets() + self._num_outputs = len(self._return_names) self._return_args=[] self._output_shape :Optional[tuple[Any]]= None self._output_dtype: Optional[np.dtype[Any]] = None @@ -186,6 +186,9 @@ def _get_func_body(self, func: Callable[[Any], Any]) -> list[str]: return return_lines def _get_return_argumets(self)->list[str]: + """ + Returns the list of names for return arrays/values + """ self._func_body = self._get_func_body(self._pyfunc) return_names = [] for l in self._func_body: @@ -199,7 +202,10 @@ def _get_return_argumets(self)->list[str]: def _replace_name( self, name: str, _LOOP_VAR: str, is_gpu: bool = False ) -> str: - if (name in self._argnames) or (name in self._return_argnames ): + """ + add indices to the names of input/output arrays in the function body + """ + if (name in self._arg_names) or (name in self._return_names ): return "{}[int({})]".format(name, _LOOP_VAR) else: if is_gpu or ((not is_gpu) and not (name in self._scalar_names)) : @@ -218,8 +224,8 @@ def _build_gpu_function(self) -> Any: # Signature args = ( - self._return_argnames - + self._argnames + self._return_names + + self._arg_names + self._scalar_names + [_SIZE_VAR] + [_DIM_VAR] @@ -309,13 +315,13 @@ def _emit_assignment( for count, a in enumerate(self._return_args): type_a = a.dtype _emit_assignment( - self._return_argnames[count], arg_idx, _SIZE_VAR, type_a + self._return_names[count], arg_idx, _SIZE_VAR, type_a ) arg_idx += 1 for count,a in enumerate(self._args): type_a = a.dtype _emit_assignment( - self._argnames[count], arg_idx, _SIZE_VAR, type_a + self._arg_names[count], arg_idx, _SIZE_VAR, type_a ) arg_idx += 1 for count, a in enumerate(self._scalar_args): @@ -480,7 +486,7 @@ def __call__(self, *args: Any, **kwargs: Any) -> None: self._scalar_args.clear() self._scalar_idxs.clear() self._args.clear() - self._argnames.clear() + self._arg_names.clear() self._scalar_names.clear() for i, arg in enumerate(args): @@ -498,7 +504,7 @@ def __call__(self, *args: Any, **kwargs: Any) -> None: # first fill arrays to argnames, then scalars: for i, k in enumerate(inspect.signature(self._pyfunc).parameters): if not (i in self._scalar_idxs): - self._argnames.append(k) + self._arg_names.append(k) for i, k in enumerate(inspect.signature(self._pyfunc).parameters): if i in self._scalar_idxs: @@ -520,18 +526,20 @@ def __call__(self, *args: Any, **kwargs: Any) -> None: # check if output variable is in input arguments - > # then use it's dtype and shape - for r in self._return_argnames: - if r in self._argnames: - idx = self._argnames.index(r) + print ("IRINA DEBUG ", self._return_names, self._arg_names, output_dtype) + for r in self._return_names: + if r in self._arg_names: + idx = self._arg_names.index(r) if output_dtype is None: output_dtype = self._args[idx].dtype if output_shape is None: output_shape = self._args[idx].shape break + print ("IRINA DEBUG 2", output_dtype) #the case if we didn't find output argument in input argnames if output_shape is None: - for r in self._return_argnames: + for r in self._return_names: if r in 
self._scalar_names: idx = self._scalar_names.index(r) if output_dtype is None: @@ -550,9 +558,9 @@ def __call__(self, *args: Any, **kwargs: Any) -> None: # filing the list of return arguments # check if there are return argnames in input argnames, # if not, create a new array - for r in self._return_argnames: - if r in self._argnames: - idx = self._argnames.index(r) + for r in self._return_names: + if r in self._arg_names: + idx = self._arg_names.index(r) if self._args[idx].shape !=output_shape: raise ValueError( "all output arrays should have the same shape") @@ -564,7 +572,7 @@ def __call__(self, *args: Any, **kwargs: Any) -> None: self._args[idx]=self._args[idx].astype(output_dtype) self._return_args.append(self._args[idx]) self._args.remove(self._args[idx]) - self._argnames.remove(r) + self._arg_names.remove(r) elif r in self._scalar_names: idx = self._scalar_names.index(r) if output_shape != (1,): diff --git a/tests/integration/test_vectorize.py b/tests/integration/test_vectorize.py index ee758b0195..d5920ae235 100644 --- a/tests/integration/test_vectorize.py +++ b/tests/integration/test_vectorize.py @@ -31,10 +31,6 @@ def my_func2(A0, B0): C0=A0*2 return A0,C0 -def empty_func(): - print("within empty function") - - def test_vectorize(): #2 arrays func = num.vectorize(my_func) @@ -57,23 +53,42 @@ def test_vectorize(): a = func(a,b) assert(a ==8) +def empty_func(): + print("within empty function") + +def print_func(a,b): + print ("I am pringing input arguments", a, b) + +def test_empty_functions(): #empty function func = num.vectorize(empty_func) func() - #slices - func = num.vectorize(my_func) - num.vectorize(my_func) - a=num.array([[1,2,3],[4,5,6],[7,8,9]]) - b=num.array([[10,11,12],[13,14,15],[16,17,18]]) - a[:2] = func(a[:2],b[:2]) + func2 = num.vectorize(print_func) + print_func(1,2) + + print_func(np.array([1,2,3]), 2) + + +def test_vectorize_over_slices(): + #reuse the same vectorize object on + #different slices + func_num = num.vectorize(my_func) + func_np = np.vectorize(my_func) + + a=np.array([[1,2,3],[4,5,6],[7,8,9]]) + b=np.array([[10,11,12],[13,14,15],[16,17,18]]) + a_num=num.array(a) + b_num = num.array(b) + a[:2] = func_np(a[:2],b[:2]) + a_num[:2] = func_num(a_num[:2],b_num[:2]) + assert np.array_equal(a, a_num) + a=np.arange(100).reshape((25,4)) a_num= num.array(a) b=a*10 b_num=a_num*10 - func_np = np.vectorize(my_func) - func_num=num.vectorize(my_func) a=func_np(a,b) a_num=func_num(a_num, b_num) assert np.array_equal(a, a_num) @@ -100,7 +115,9 @@ def test_vectorize(): a_num[:, 2, :]=func_num(a_num[:, 2, :],2) assert np.array_equal(a, a_num) +def test_multiple_outputs(): #checking signature with capital letters and numbers + # + checking multiple outputs a=np.arange(100).reshape((25,4)) a_num= num.array(a) b=a*10 @@ -112,7 +129,48 @@ def test_vectorize(): assert np.array_equal(a, a_num) assert np.array_equal(c, c_num) - +def test_different_types(): + #checking the case when input and output types are different + a=np.arange(100, dtype = int).reshape((25,4)) + a_num= num.array(a) + b=a*10 + b_num=a_num*10 + func_np = np.vectorize(my_func, otypes=(float,)) + func_num=num.vectorize(my_func, otypes=(float,)) + a=func_np(a,b) + a_num=func_num(a_num, b_num) + assert np.array_equal(a, a_num) + + #another test for different types + a=np.arange(100, dtype = float).reshape((25,4)) + a_num= num.array(a) + b=a*10 + b_num=a_num*10 + func_np = np.vectorize(my_func2, otypes = (int, int,)) + func_num=num.vectorize(my_func2, otypes = (int, int, )) + a,c=func_np(a,b) + a_num,c_num = 
func_num(a_num, b_num) + assert np.array_equal(a, a_num) + assert np.array_equal(c, c_num) + + +def test_cache(): + a=np.arange(100).reshape((25,4)) + a_num= num.array(a) + b=a*10 + b_num=a_num*10 + func_np = np.vectorize(my_func2, cache = True) + func_num=num.vectorize(my_func2, cache = True) + for i in range (10): + a=a*2 + b=b*3 + a_num=a_num*2 + b_num=b_num*3 + a,c=func_np(a,b) + a_num,c_num = func_num(a_num, b_num) + assert np.array_equal(a, a_num) + assert np.array_equal(c, c_num) + if __name__ == "__main__": import sys From 00b39c463aded157d1d69c3bb797bfbf96f4d3c6 Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Wed, 22 Mar 2023 20:36:50 -0700 Subject: [PATCH 59/78] fixing logic for caching --- cunumeric/vectorize.py | 99 ++++++++++++++++++++++++------------------ 1 file changed, 56 insertions(+), 43 deletions(-) diff --git a/cunumeric/vectorize.py b/cunumeric/vectorize.py index 370d4dd040..e22f18ae73 100644 --- a/cunumeric/vectorize.py +++ b/cunumeric/vectorize.py @@ -141,9 +141,10 @@ def __init__( self._return_names = self._get_return_argumets() self._num_outputs = len(self._return_names) self._return_args=[] - self._output_shape :Optional[tuple[Any]]= None self._output_dtype: Optional[np.dtype[Any]] = None - + self._cached_dtype: Optional[np.dtype[Any]] = None + self._cached_scalar_types: List[Any]=[] + if otypes is not None: if self._num_outputs !=len(otypes): raise ValueError("number of types in otypes is not consistente" @@ -478,46 +479,52 @@ def _execute(self, is_gpu: bool, num_gpus: int = 0) -> None: task.execute() def __call__(self, *args: Any, **kwargs: Any) -> None: - if not self._created: - # the case when we execute `__call__` the first time or - # when cache=False: - # each time we call `vectorize` on a pyfunc we need to clear - # these lists to support different types of arguments passed - self._scalar_args.clear() - self._scalar_idxs.clear() - self._args.clear() - self._arg_names.clear() - self._scalar_names.clear() - - for i, arg in enumerate(args): - if arg is None: - raise ValueError( - "None is not supported in user function " - "passed to cunumeric.vectorize" - ) - elif np.ndim(arg) == 0: - self._scalar_args.append(arg) - self._scalar_idxs.append(i) - else: - self._args.append(convert_to_cunumeric_ndarray(arg)) - - # first fill arrays to argnames, then scalars: - for i, k in enumerate(inspect.signature(self._pyfunc).parameters): - if not (i in self._scalar_idxs): - self._arg_names.append(k) - - for i, k in enumerate(inspect.signature(self._pyfunc).parameters): - if i in self._scalar_idxs: - self._scalar_names.append(k) - - self._kwargs = list(kwargs) - if len(self._kwargs) > 0: - raise NotImplementedError( - "kwargs are not supported in user functions" + # each time we call `vectorize` on a pyfunc we need to clear + # these lists to support different types of arguments passed + self._scalar_args.clear() + self._scalar_idxs.clear() + self._args.clear() + self._arg_names.clear() + self._scalar_names.clear() + + scalar_idx=0 + for i, arg in enumerate(args): + if arg is None: + raise ValueError( + "None is not supported in user function " + "passed to cunumeric.vectorize" ) + elif np.ndim(arg) == 0: + if self._cache and not self._created: + self._cached_scalar_types.apend(type(arg)) + elif self._cache: + if self._cached_scalar_types[scalar_idx] != type(arg): + raise TypeError( + " Input arguments to vectorized function should" + " have consistent types for each invocation") + self._scalar_args.append(arg) + self._scalar_idxs.append(i) + scalar_idx+=1 + else: + 
self._args.append(convert_to_cunumeric_ndarray(arg)) + + # first fill arrays to argnames, then scalars: + for i, k in enumerate(inspect.signature(self._pyfunc).parameters): + if not (i in self._scalar_idxs): + self._arg_names.append(k) + + for i, k in enumerate(inspect.signature(self._pyfunc).parameters): + if i in self._scalar_idxs: + self._scalar_names.append(k) + + self._kwargs = list(kwargs) + if len(self._kwargs) > 0: + raise NotImplementedError( + "kwargs are not supported in user functions" + ) #we need to do ther rest each time `__call__` is executed - output_shape = self._output_shape + output_shape = None output_dtype = self._output_dtype self._return_args.clear() # if output type is not specified, we need to decide @@ -526,16 +533,14 @@ def __call__(self, *args: Any, **kwargs: Any) -> None: # check if output variable is in input arguments - > # then use it's dtype and shape - print ("IRINA DEBUG ", self._return_names, self._arg_names, output_dtype) for r in self._return_names: if r in self._arg_names: idx = self._arg_names.index(r) if output_dtype is None: - output_dtype = self._args[idx].dtype + output_dtype = self._args[idx].dtype if output_shape is None: output_shape = self._args[idx].shape break - print ("IRINA DEBUG 2", output_dtype) #the case if we didn't find output argument in input argnames if output_shape is None: @@ -546,7 +551,15 @@ def __call__(self, *args: Any, **kwargs: Any) -> None: output_dtype = np.dtype(type(self._scalar_args[idx])) output_shape = (1,) break - #FIXME + + if self._cache and not (self._cached_dtype is None): + if self._cached_dtype !=output_dtype: + raise TypeError("types of the arguments should stay the same" + " for each invocation of the vectorize object") + elif self._cache: + self._cached_dtype = output_dtype + + #FIXME #we could find common type of input arguments here and #broadcasted shapes if self._num_outputs>0 and output_dtype is None: From e7e4e7aca8d3971fb8e1987f5250889e614f32e1 Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Wed, 22 Mar 2023 20:48:07 -0700 Subject: [PATCH 60/78] adding more tests --- tests/integration/test_vectorize.py | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/tests/integration/test_vectorize.py b/tests/integration/test_vectorize.py index d5920ae235..b6f4bcaf3e 100644 --- a/tests/integration/test_vectorize.py +++ b/tests/integration/test_vectorize.py @@ -19,7 +19,8 @@ import cunumeric as num import numpy as np - +from legate.core import LEGATE_MAX_DIM +from utils.generators import mk_seq_array def my_func(a, b): a = a * 2 + b @@ -171,6 +172,21 @@ def test_cache(): assert np.array_equal(a, a_num) assert np.array_equal(c, c_num) +#checking caching on different shapes of arrays: +func_np2 = np.vectorize(my_func2, cache = True) +func_num2=num.vectorize(my_func2, cache = True) + +@pytest.mark.parametrize("ndim", range(1, LEGATE_MAX_DIM + 1)) +def test_nd_vectorize(ndim): + a_shape = tuple(np.random.randint(1, 9) for _ in range(ndim)) + a = mk_seq_array(np, a_shape) + a_num = mk_seq_array(num, a_shape) + b=a*2 + b_num=a_num*2 + a,c=func_np2(a,b) + a_num,c_num = func_num2(a_num, b_num) + assert np.array_equal(a, a_num) + assert np.array_equal(c, c_num) if __name__ == "__main__": import sys From fe5219ad9405a288492e30bb40069084bf9e55a2 Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Wed, 22 Mar 2023 21:25:21 -0700 Subject: [PATCH 61/78] formatting --- cunumeric/vectorize.py | 160 ++++++++++++++-------------- src/cunumeric/vectorize/eval_udf.cc | 4 +- 
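A minimal usage sketch of the caching behavior exercised by test_cache and test_nd_vectorize above, assuming only the cunumeric.vectorize API introduced in this series; the helper function and array values here are illustrative, not taken from the patch:

    import cunumeric as num

    def saxpy(x, y):
        # same form of user function as my_func in the tests
        x = x * 2 + y
        return x

    # with cache=True the C function / CUDA kernel is generated on the
    # first call and reused afterwards, so argument types must stay
    # consistent between invocations of the same vectorize object
    func = num.vectorize(saxpy, cache=True)
    a = num.arange(5)
    b = num.ones((5,))
    a = func(a, b)   # first call: parses saxpy and builds the kernel
    a = func(a, b)   # later calls reuse the cached kernel
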
src/cunumeric/vectorize/eval_udf.cu | 4 +- typings/numba/types/__init__.pyi | 2 +- 4 files changed, 85 insertions(+), 85 deletions(-) diff --git a/cunumeric/vectorize.py b/cunumeric/vectorize.py index e22f18ae73..8c513d5fff 100644 --- a/cunumeric/vectorize.py +++ b/cunumeric/vectorize.py @@ -15,10 +15,7 @@ import inspect import re -import typing - -# numba typing -from typing import Any, Callable, Dict, List, Optional, Union +from typing import Any, Callable, Dict, List, Optional, Tuple, Union import legate.core.types as ty import numba @@ -30,8 +27,8 @@ from .array import convert_to_cunumeric_ndarray from .config import CuNumericOpCode -from .utils import convert_to_cunumeric_dtype from .module import full +from .utils import convert_to_cunumeric_dtype _EXTERNAL_REFERENCE_PREFIX = "__extern_ref__" _MASK_VAR = "__mask__" @@ -61,7 +58,7 @@ def __init__( objects or numpy arrays as inputs and returns a single numpy array or a tuple of numpy arrays. User defined pyfunction will be executed in a single cuNumeric task - over a set of arguments. + over a set of arguments. The data type of the output of `vectorized` is determined by calling the function with the first element of the input. This can be avoided by specifying the `otypes` argument. @@ -88,7 +85,7 @@ def __init__( WARNING: cuNumeric doesn't suport this argument at the moment cache : bool, optional If `True`, then cache the first function call that generates C fun- - ction or CUDA kernel. We recomment enabling caching in cuNumeric + ction or CUDA kernel. We recomment enabling caching in cuNumeric for better performance, when possible. WARNING: in the case when cache=True, cuNumeric will parse function signature and create C function or CUDA kernel only once. This @@ -131,7 +128,7 @@ def __init__( self._kwargs: List[Any] = [] self._context = runtime.legate_context self._created: bool = False - self._func_body: List[str]=[] + self._func_body: List[str] = [] if doc is None: self.__doc__ = pyfunc.__doc__ @@ -139,24 +136,26 @@ def __init__( self.__doc__ = doc self._return_names = self._get_return_argumets() - self._num_outputs = len(self._return_names) - self._return_args=[] + self._num_outputs: int = len(self._return_names) + self._return_args: List[Any] = [] self._output_dtype: Optional[np.dtype[Any]] = None self._cached_dtype: Optional[np.dtype[Any]] = None - self._cached_scalar_types: List[Any]=[] + self._cached_scalar_types: List[Any] = [] if otypes is not None: - if self._num_outputs !=len(otypes): - raise ValueError("number of types in otypes is not consistente" - " with the number of return values defined in pyfunc") - if len(otypes)>1: + if self._num_outputs != len(otypes): + raise ValueError( + "number of types in otypes is not consistente" + " with the number of return values defined in pyfunc" + ) + if len(otypes) > 1: for t in otypes: if t != otypes[0]: raise NotImplementedError( - "cuNumeric doesn't support variable types in otypes" + "cuNumeric doesn't support variable types" + " in otypes" ) - self._output_dtype = otypes[0] - + self._output_dtype = np.dtype(otypes[0]) # FIXME if excluded is not None: @@ -170,7 +169,6 @@ def __init__( "signature variable is not supported yet" ) - def _get_func_body(self, func: Callable[[Any], Any]) -> list[str]: """Using the magic method __doc__, we KNOW the size of the docstring. 
We then, just substract this from the total length of the function @@ -186,30 +184,29 @@ def _get_func_body(self, func: Callable[[Any], Any]) -> list[str]: return_lines.append(lines[i].rstrip()) return return_lines - def _get_return_argumets(self)->list[str]: + def _get_return_argumets(self) -> list[str]: """ Returns the list of names for return arrays/values """ self._func_body = self._get_func_body(self._pyfunc) return_names = [] - for l in self._func_body: - if "return" in l: - l = l.replace("return", '') - l=l.replace(" ",'') - return_names = l.split(",") + for ln in self._func_body: + if "return" in ln: + ln = ln.replace("return", "") + ln = ln.replace(" ", "") + return_names = ln.split(",") return return_names - def _replace_name( self, name: str, _LOOP_VAR: str, is_gpu: bool = False ) -> str: """ add indices to the names of input/output arrays in the function body """ - if (name in self._arg_names) or (name in self._return_names ): + if (name in self._arg_names) or (name in self._return_names): return "{}[int({})]".format(name, _LOOP_VAR) else: - if is_gpu or ((not is_gpu) and not (name in self._scalar_names)) : + if is_gpu or ((not is_gpu) and not (name in self._scalar_names)): return "{}".format(name) else: return "{}[0]".format(name) @@ -267,7 +264,7 @@ def _lift_to_array_access(m: Any) -> str: # kernel body lines_old = self._func_body for line in lines_old: - if not ( "return" in line): + if not ("return" in line): l_new = re.sub(r"[_a-zA-Z]\w*", _lift_to_array_access, line) lines.append(l_new) @@ -300,10 +297,7 @@ def _build_cpu_function(self) -> Callable[[Any], Any]: # Unpack kernel arguments def _emit_assignment( - var: Any, - idx: int, - sz: Any, - ty: np.dtype[Any] + var: Any, idx: int, sz: Any, ty: np.dtype[Any] ) -> None: lines.append( " {} = carray({}[{}], {}, types.{})".format( @@ -319,7 +313,7 @@ def _emit_assignment( self._return_names[count], arg_idx, _SIZE_VAR, type_a ) arg_idx += 1 - for count,a in enumerate(self._args): + for count, a in enumerate(self._args): type_a = a.dtype _emit_assignment( self._arg_names[count], arg_idx, _SIZE_VAR, type_a @@ -359,7 +353,7 @@ def _lift_to_array_access(m: Any) -> str: return self._replace_name(m.group(0), _LOOP_VAR) for line in lines_old: - if not ( "return" in line): + if not ("return" in line): l_new = re.sub(r"[_a-zA-Z]\w*", _lift_to_array_access, line) lines.append(" " + l_new) @@ -405,13 +399,13 @@ def _compile_func_gpu(self) -> tuple[Any]: return numba.cuda.compile_ptx(self._numba_func, sig, cc=cuda_arch) def _compile_func_cpu(self) -> numba.core.ccallback.CFunc: - sig = numba.core.types.void( + sig = numba.core.types.void( # type : ignore numba.types.CPointer(numba.types.voidptr), numba.core.types.uint64, numba.core.types.uint64, numba.core.types.CPointer(numba.core.types.uint64), numba.core.types.CPointer(numba.core.types.uint64), - ) + ) # type : ignore return numba.cfunc(sig)(self._numba_func) @@ -447,8 +441,8 @@ def _execute(self, is_gpu: bool, num_gpus: int = 0) -> None: task.add_scalar_arg(a, dtype) # add return arguments - a0=None - if len (self._return_args)>0: + a0 = None + if len(self._return_args) > 0: a0 = self._return_args[0]._thunk a0 = runtime.to_deferred_array(a0) for count, a in enumerate(self._return_args): @@ -459,10 +453,10 @@ def _execute(self, is_gpu: bool, num_gpus: int = 0) -> None: if count != 0: task.add_alignment(a0.base, a_tmp_base) # add array arguments - if len (self._args)>0: + if len(self._args) > 0: if a0 is None: - a0 = self._args[0]._thunk - a0 = runtime.to_deferred_array(a0) + a0 = 
self._args[0]._thunk + a0 = runtime.to_deferred_array(a0) for count, a in enumerate(self._args): a_tmp = runtime.to_deferred_array(a._thunk) a_tmp_base = a_tmp.base @@ -478,7 +472,7 @@ def _execute(self, is_gpu: bool, num_gpus: int = 0) -> None: ) # type : ignore task.execute() - def __call__(self, *args: Any, **kwargs: Any) -> None: + def __call__(self, *args: Any, **kwargs: Any) -> Union[Any, Tuple[Any]]: # each time we call `vectorize` on a pyfunc we need to clear # these lists to support different types of arguments passed self._scalar_args.clear() @@ -487,7 +481,7 @@ def __call__(self, *args: Any, **kwargs: Any) -> None: self._arg_names.clear() self._scalar_names.clear() - scalar_idx=0 + scalar_idx = 0 for i, arg in enumerate(args): if arg is None: raise ValueError( @@ -496,15 +490,16 @@ def __call__(self, *args: Any, **kwargs: Any) -> None: ) elif np.ndim(arg) == 0: if self._cache and not self._created: - self._cached_scalar_types.apend(type(arg)) + self._cached_scalar_types.append(type(arg)) elif self._cache: if self._cached_scalar_types[scalar_idx] != type(arg): raise TypeError( " Input arguments to vectorized function should" - " have consistent types for each invocation") + " have consistent types for each invocation" + ) self._scalar_args.append(arg) self._scalar_idxs.append(i) - scalar_idx+=1 + scalar_idx += 1 else: self._args.append(convert_to_cunumeric_ndarray(arg)) @@ -523,8 +518,8 @@ def __call__(self, *args: Any, **kwargs: Any) -> None: "kwargs are not supported in user functions" ) - #we need to do ther rest each time `__call__` is executed - output_shape = None + # we need to do ther rest each time `__call__` is executed + output_shape: Tuple[int] = (-1,) output_dtype = self._output_dtype self._return_args.clear() # if output type is not specified, we need to decide @@ -537,13 +532,13 @@ def __call__(self, *args: Any, **kwargs: Any) -> None: if r in self._arg_names: idx = self._arg_names.index(r) if output_dtype is None: - output_dtype = self._args[idx].dtype - if output_shape is None: - output_shape = self._args[idx].shape + output_dtype = self._args[idx].dtype + if output_shape is (-1,): + output_shape = self._args[idx].shape break - - #the case if we didn't find output argument in input argnames - if output_shape is None: + + # the case if we didn't find output argument in input argnames + if output_shape is (-1,): for r in self._return_names: if r in self._scalar_names: idx = self._scalar_names.index(r) @@ -551,38 +546,40 @@ def __call__(self, *args: Any, **kwargs: Any) -> None: output_dtype = np.dtype(type(self._scalar_args[idx])) output_shape = (1,) break - + if self._cache and not (self._cached_dtype is None): - if self._cached_dtype !=output_dtype: - raise TypeError("types of the arguments should stay the same" - " for each invocation of the vectorize object") + if self._cached_dtype != output_dtype: + raise TypeError( + "types of the arguments should stay the same" + " for each invocation of the vectorize object" + ) elif self._cache: self._cached_dtype = output_dtype - #FIXME - #we could find common type of input arguments here and - #broadcasted shapes - if self._num_outputs>0 and output_dtype is None: + # FIXME + # we could find common type of input arguments here and + # broadcasted shapes + if self._num_outputs > 0 and output_dtype is None: raise ValueError("Unable to choose output dtype") - if self._num_outputs>0 and output_shape is None: + if self._num_outputs > 0 and output_shape is None: raise ValueError("Unable to choose output shape") - # filing the 
list of return arguments # check if there are return argnames in input argnames, # if not, create a new array for r in self._return_names: if r in self._arg_names: idx = self._arg_names.index(r) - if self._args[idx].shape !=output_shape: + if self._args[idx].shape != output_shape: raise ValueError( - "all output arrays should have the same shape") + "all output arrays should have the same shape" + ) if output_dtype != self._args[idx].dtype: runtime.warn( "converting input array to output types in user func ", category=RuntimeWarning, ) - self._args[idx]=self._args[idx].astype(output_dtype) + self._args[idx] = self._args[idx].astype(output_dtype) self._return_args.append(self._args[idx]) self._args.remove(self._args[idx]) self._arg_names.remove(r) @@ -590,31 +587,34 @@ def __call__(self, *args: Any, **kwargs: Any) -> None: idx = self._scalar_names.index(r) if output_shape != (1,): raise ValueError( - "all output arrays should have the same shape") - self._return_args.append(full(output_shape,self._scalar_args[idx], output_dtype)) + "all output arrays should have the same shape" + ) + self._return_args.append( + full(output_shape, self._scalar_args[idx], output_dtype) + ) self._scalar_args.remove(self._scalar_args[idx]) self._scalar_names.remove(r) else: - #create array and add it to the list of return_args - tmp_ret = full(output_shape,0, output_dtype) + # create array and add it to the list of return_args + tmp_ret = full(output_shape, 0, output_dtype) self._return_args.append(tmp_ret) - #FIXME - #if self._num_outputs==0: + # FIXME + # if self._num_outputs==0: # #execute function that doesn't modify anything: # self._pyfunc(args) # return # bring all arrays to same type if len(self._args) > 0: - for count, a in enumerate(self._args): + for count, a in enumerate(self._args): if output_dtype != a.dtype: runtime.warn( "converting input array to output types in user func ", category=RuntimeWarning, ) self._args[count] = self._args[count].astype(output_dtype) - #FIXME broadcast shapes - if output_shape != self._args[count].shape : + # FIXME broadcast shapes + if output_shape != self._args[count].shape: raise ValueError( "cuNumeric doesnt support " "different shapes for arrays in " @@ -634,9 +634,9 @@ def __call__(self, *args: Any, **kwargs: Any) -> None: self._created = True self._execute(False) - if len(self._return_args)==1: + if len(self._return_args) == 1: return self._return_args[0] - if len(self._return_args)>1: + if len(self._return_args) > 1: return tuple(self._return_args) else: - return + return -1 diff --git a/src/cunumeric/vectorize/eval_udf.cc b/src/cunumeric/vectorize/eval_udf.cc index 04133ef671..1860b588c1 100644 --- a/src/cunumeric/vectorize/eval_udf.cc +++ b/src/cunumeric/vectorize/eval_udf.cc @@ -75,10 +75,10 @@ struct EvalUdfCPU { int dim = 1; if (args.inputs.size() > 0) { dim = args.inputs[0].dim() == 0 ? 
1 : args.inputs[0].dim(); - assert(dim>0); + assert(dim > 0); double_dispatch(dim, args.inputs[0].code(), EvalUdfCPU{}, args); } else { - LegateTypeCode code = LegateTypeCode::BOOL_LT ; + LegateTypeCode code = LegateTypeCode::BOOL_LT; double_dispatch(dim, code, EvalUdfCPU{}, args); } } diff --git a/src/cunumeric/vectorize/eval_udf.cu b/src/cunumeric/vectorize/eval_udf.cu index ebe3e0730e..f7c6a452a9 100644 --- a/src/cunumeric/vectorize/eval_udf.cu +++ b/src/cunumeric/vectorize/eval_udf.cu @@ -33,7 +33,7 @@ struct EvalUdfGPU { Rect rect; size_t input_size = args.inputs.size(); - CUfunction func = get_udf(args.hash); + CUfunction func = get_udf(args.hash); // Filling up the buffer with arguments size_t buffer_size = (input_size + args.scalars.size()) * sizeof(void*); @@ -136,7 +136,7 @@ struct EvalUdfGPU { dim = args.inputs[0].dim() == 0 ? 1 : args.inputs[0].dim(); double_dispatch(dim, args.inputs[0].code(), EvalUdfGPU{}, args); } else { - LegateTypeCode code = LegateTypeCode::BOOL_LT ; + LegateTypeCode code = LegateTypeCode::BOOL_LT; double_dispatch(dim, code, EvalUdfGPU{}, args); } } diff --git a/typings/numba/types/__init__.pyi b/typings/numba/types/__init__.pyi index ffbfbd5a94..697a068cfd 100644 --- a/typings/numba/types/__init__.pyi +++ b/typings/numba/types/__init__.pyi @@ -2,7 +2,7 @@ class Type(): ... -class Number(): ... +class Number(Type): ... class Integer(Number): def __init__(self, name: str) ->None: ... From 5b752976ac7809092c3bf2118d21f8aea1ab0cd7 Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Thu, 23 Mar 2023 16:39:15 -0700 Subject: [PATCH 62/78] small bugfux --- cunumeric/vectorize.py | 4 ++-- tests/integration/test_vectorize.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/cunumeric/vectorize.py b/cunumeric/vectorize.py index 6c07d85da3..040bc433c9 100644 --- a/cunumeric/vectorize.py +++ b/cunumeric/vectorize.py @@ -534,12 +534,12 @@ def __call__(self, *args: Any, **kwargs: Any) -> Union[Any, Tuple[Any]]: idx = self._arg_names.index(r) if output_dtype is None: output_dtype = self._args[idx].dtype - if output_shape is (-1,): + if output_shape == (-1,): output_shape = self._args[idx].shape break # the case if we didn't find output argument in input argnames - if output_shape is (-1,): + if output_shape == (-1,): for r in self._return_names: if r in self._scalar_names: idx = self._scalar_names.index(r) diff --git a/tests/integration/test_vectorize.py b/tests/integration/test_vectorize.py index cbb49bcaf4..f75aeab085 100644 --- a/tests/integration/test_vectorize.py +++ b/tests/integration/test_vectorize.py @@ -180,9 +180,9 @@ def test_cache(): def test_nd_vectorize(ndim): a_shape = tuple(np.random.randint(1, 9) for _ in range(ndim)) a = mk_seq_array(np, a_shape) - a_num = mk_seq_array(num, a_shape) + a_num = num.array(a) b=a*2 - b_num=a_num*2 + b_num=num.array(b) a,c=func_np2(a,b) a_num,c_num = func_num2(a_num, b_num) assert np.array_equal(a, a_num) From 09b755fb306db1c3c305c62a59a949099e49eaf0 Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Thu, 23 Mar 2023 20:40:54 -0700 Subject: [PATCH 63/78] clean-up + formatting --- cunumeric/utils.py | 5 +- cunumeric/vectorize.py | 120 +++++----- examples/black_scholes_greeks.py | 154 ++++++++----- src/cunumeric/cuda_help.h | 38 ---- src/cunumeric/pitches.h | 27 +-- src/cunumeric/vectorize/create_cu_kernel.cu | 10 +- src/cunumeric/vectorize/eval_udf.cc | 11 +- src/cunumeric/vectorize/eval_udf_template.inl | 69 ------ tests/integration/test_vectorize.py | 212 +++++++++--------- 9 files changed, 287 
insertions(+), 359 deletions(-) delete mode 100644 src/cunumeric/vectorize/eval_udf_template.inl diff --git a/cunumeric/utils.py b/cunumeric/utils.py index f4cf6f962f..381470a0e9 100644 --- a/cunumeric/utils.py +++ b/cunumeric/utils.py @@ -51,7 +51,7 @@ "int16": ty.int16, "int32": ty.int32, "int": ty.int64, # np.int is int - "int64": ty.int64, + "int64": ty.int64, "uint8": ty.uint8, "uint16": ty.uint16, "uint32": ty.uint32, @@ -117,7 +117,8 @@ def is_supported_dtype(dtype: Any) -> bool: raise TypeError("expected a NumPy dtype") return dtype.type in SUPPORTED_DTYPES -def convert_to_cunumeric_dtype(dtype: str) ->Any: + +def convert_to_cunumeric_dtype(dtype: str) -> Any: if dtype in CUNUMERIC_TYPE_MAP: return CUNUMERIC_TYPE_MAP[dtype] else: diff --git a/cunumeric/vectorize.py b/cunumeric/vectorize.py index 040bc433c9..893b0189ed 100644 --- a/cunumeric/vectorize.py +++ b/cunumeric/vectorize.py @@ -1,4 +1,4 @@ - # Copyright 2023 NVIDIA Corporation +# Copyright 2023 NVIDIA Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -172,7 +172,7 @@ def __init__( def _get_func_body(self, func: Callable[[Any], Any]) -> list[str]: """Using the magic method __doc__, we KNOW the size of the docstring. - We then, just substract this from the total length of the function + We then, just subtract this from the total length of the function """ lines_to_skip = 0 if func.__doc__ is not None and len(func.__doc__.split("\n")) > 0: @@ -239,12 +239,12 @@ def _build_gpu_function(self) -> Any: lines.append(" local_i = cuda.grid(1)") lines.append(" if local_i >= {}:".format(_SIZE_VAR)) lines.append(" return") - # we compute inndex for sparse data access when using Legion's + # we compute index for sparse data access when using Legion's # pointer. - # aa[x][y][z]=a[x*strides[0] + y*strides[1] + z*strides[2]] + # a[x][y][z]=a[x*strides[0] + y*strides[1] + z*strides[2]] lines.append(" {}:int = 0".format(_LOOP_VAR)) lines.append(" for p in range({}-1):".format(_DIM_VAR)) - # fixme make sure we compute index correct for all data types + # FIXME make sure we compute index correct for all data types lines.append(" x=int(local_i/{}[p])".format(_PITCHES_VAR)) lines.append( " local_i = int(local_i%{}[p])".format(_PITCHES_VAR) @@ -306,7 +306,7 @@ def _emit_assignment( ) ) - # define pyfunc arguments ar carrays + # define pyfunc arguments as carrays arg_idx = 0 for count, a in enumerate(self._return_args): type_a = a.dtype @@ -329,9 +329,9 @@ def _emit_assignment( # Main loop lines.append(" for local_i in range({}):".format(_SIZE_VAR)) - # we compute inndex for sparse data access when using Legion's + # we compute index for sparse data access when using Legion's # pointer. 
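As a rough illustration of the index arithmetic described in the surrounding comments (a sketch, not part of the patch): the generated kernel decomposes a flat element index into per-dimension coordinates using the pitches array and then applies the strides, roughly along these lines; the helper name and its exact stride handling are assumptions for illustration only.

    # pure-Python sketch of the emitted index computation; assumes
    # pitches[p] holds the number of elements spanned by one step in
    # dimension p and strides[p] the physical stride of dimension p
    def flat_to_offset(local_i, pitches, strides, dim):
        offset = 0
        for p in range(dim - 1):
            x = local_i // pitches[p]      # coordinate along dimension p
            local_i = local_i % pitches[p]
            offset += x * strides[p]
        # the remaining local_i is the coordinate along the last dimension
        return offset + local_i * strides[dim - 1]
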
- # aa[x][y][z]=a[x*strides[0] + y*strides[1] + z*strides[2]] + # a[x][y][z]=a[x*strides[0] + y*strides[1] + z*strides[2]] lines.append(" {}:int = 0".format(_LOOP_VAR)) lines.append(" j:int = local_i") lines.append(" for p in range({}-1):".format(_DIM_VAR)) @@ -366,13 +366,7 @@ def _lift_to_array_access(m: Any) -> str: def _get_numba_types(self, need_pointer: bool = True) -> list[Any]: types = [] - for arg in self._return_args: - type_a = arg.dtype - type_a = str(type_a) if type_a != bool else "int8" - type_a = getattr(numba.core.types, type_a) - type_a = numba.core.types.CPointer(type_a) - types.append(type_a) - for arg in self._args: + for arg in self._return_args + self._args: type_a = arg.dtype type_a = str(type_a) if type_a != bool else "int8" type_a = getattr(numba.core.types, type_a) @@ -400,69 +394,80 @@ def _compile_func_gpu(self) -> tuple[Any]: return numba.cuda.compile_ptx(self._numba_func, sig, cc=cuda_arch) def _compile_func_cpu(self) -> numba.core.ccallback.CFunc: - sig = numba.core.types.void( # type : ignore + sig = numba.core.types.void( # type: ignore numba.types.CPointer(numba.types.voidptr), numba.core.types.uint64, numba.core.types.uint64, numba.core.types.CPointer(numba.core.types.uint64), numba.core.types.CPointer(numba.core.types.uint64), - ) # type : ignore + ) return numba.cfunc(sig)(self._numba_func) + def _create_cuda_kernel(self, num_gpus: int) -> None: + # create CUDA kernel + launch_domain = Rect(lo=(0,), hi=(num_gpus,)) + kernel_task = self._context.create_manual_task( + CuNumericOpCode.CREATE_CU_KERNEL, + launch_domain=launch_domain, + ) + ptx_hash = hash(self._gpu_func[0]) + kernel_task.add_scalar_arg(ptx_hash, ty.int64) + kernel_task.add_scalar_arg(self._gpu_func[0], ty.string) + kernel_task.execute() + # we want to make sure EVAL_UDF function is not executed before + # CUDA kernel is created + self._context.issue_execution_fence(block=True) + + # task has finished by the time we set self._created to True + if self._cache: + self._created = True + @track_provenance(runtime.legate_context) def _execute(self, is_gpu: bool, num_gpus: int = 0) -> None: if is_gpu and not self._created: - # create CUDA kernel - launch_domain = Rect(lo=(0,), hi=(num_gpus,)) - kernel_task = self._context.create_manual_task( - CuNumericOpCode.CREATE_CU_KERNEL, - launch_domain=launch_domain, - ) - ptx_hash = hash(self._gpu_func[0]) - kernel_task.add_scalar_arg(ptx_hash, ty.int64) - kernel_task.add_scalar_arg(self._gpu_func[0], ty.string) - kernel_task.execute() - # we want to make sure EVAL_UDF function is not executed before - # CUDA kernel is created - self._context.issue_execution_fence(block=True) - - # task has finished by the time we set self._created to True - if self._cache: - self._created = True + self._create_cuda_kernel(num_gpus) task = self._context.create_auto_task(CuNumericOpCode.EVAL_UDF) task.add_scalar_arg(self._num_outputs, ty.uint32) # N of outputs task.add_scalar_arg( len(self._scalar_args), ty.uint32 ) # N of scalar_args - # add all scalars + + # add all scalar arguments first for a in self._scalar_args: dtype = convert_to_cunumeric_dtype(type(a).__name__) task.add_scalar_arg(a, dtype) - # add return arguments - a0 = None - if len(self._return_args) > 0: - a0 = self._return_args[0]._thunk - a0 = runtime.to_deferred_array(a0) - for count, a in enumerate(self._return_args): - a_tmp = runtime.to_deferred_array(a._thunk) + num_args = len(self._args) + # add return arguments with RW permissions + first_array = None + if self._num_outputs > 0: + first_array = 
runtime.to_deferred_array( + self._return_args[0]._thunk + ) + task.add_input(first_array.base) + task.add_output(first_array.base) + + for i in range(1, self._num_outputs): + a_tmp = runtime.to_deferred_array(self._return_args[i]._thunk) a_tmp_base = a_tmp.base task.add_input(a_tmp_base) task.add_output(a_tmp_base) - if count != 0: - task.add_alignment(a0.base, a_tmp_base) - # add array arguments - if len(self._args) > 0: - if a0 is None: - a0 = self._args[0]._thunk - a0 = runtime.to_deferred_array(a0) - for count, a in enumerate(self._args): - a_tmp = runtime.to_deferred_array(a._thunk) + task.add_alignment(first_array.base, a_tmp_base) + + # add array arguments with read-only permissions + if num_args > 0: + start = 0 + if first_array is None: + first_array = runtime.to_deferred_array(self._args[0]._thunk) + task.add_input(first_array.base) + start = 1 + for i in range(start, num_args): + a_tmp = runtime.to_deferred_array(self._args[i]._thunk) a_tmp_base = a_tmp.base task.add_input(a_tmp_base) - task.add_alignment(a0.base, a_tmp_base) + task.add_alignment(first_array.base, a_tmp_base) if is_gpu: ptx_hash = hash(self._gpu_func[0]) @@ -519,10 +524,10 @@ def __call__(self, *args: Any, **kwargs: Any) -> Union[Any, Tuple[Any]]: "kwargs are not supported in user functions" ) - # we need to do ther rest each time `__call__` is executed output_shape: Tuple[int] = (-1,) output_dtype = self._output_dtype self._return_args.clear() + # if output type is not specified, we need to decide # which one to use # we also want to choose the shape for output array @@ -599,13 +604,8 @@ def __call__(self, *args: Any, **kwargs: Any) -> Union[Any, Tuple[Any]]: # create array and add it to the list of return_args tmp_ret = full(output_shape, 0, output_dtype) self._return_args.append(tmp_ret) - # FIXME - # if self._num_outputs==0: - # #execute function that doesn't modify anything: - # self._pyfunc(args) - # return - # bring all arrays to same type + # check types and shapes if len(self._args) > 0: for count, a in enumerate(self._args): if output_dtype != a.dtype: @@ -617,7 +617,7 @@ def __call__(self, *args: Any, **kwargs: Any) -> Union[Any, Tuple[Any]]: # FIXME broadcast shapes if output_shape != self._args[count].shape: raise ValueError( - "cuNumeric doesnt support " + "cuNumeric doesn't support " "different shapes for arrays in " "user function passed to vectorize" ) diff --git a/examples/black_scholes_greeks.py b/examples/black_scholes_greeks.py index b4538533bb..6d955c945b 100644 --- a/examples/black_scholes_greeks.py +++ b/examples/black_scholes_greeks.py @@ -16,57 +16,92 @@ # import argparse - -from benchmark import parse_args, run_benchmark, CuNumericTimer from enum import IntEnum -import math -import cunumeric as np +from benchmark import CuNumericTimer, parse_args, run_benchmark -NUM_ITERS=10 -WARMUP_ITER=2 +NUM_ITERS = 10 +WARMUP_ITER = 2 vol_start = 0.1 vol_step = 0.01 t_start = 0.5 -t_step = 1.0/(365*10) +t_step = 1.0 / (365 * 10) money_start = -0.4 money_step = 0.01 RISKFREE = 0.02 S0 = 100.0 -N_GREEKS=7 +N_GREEKS = 7 + class Greeks(IntEnum): - PREM=0, - DELTA=1, - VEGA=2, - GAMMA=3, - VANNA=4, - VOLGA=5, - THETA=6 + PREM = (0,) + DELTA = (1,) + VEGA = (2,) + GAMMA = (3,) + VANNA = (4,) + VOLGA = (5,) + THETA = 6 def initialize(n_vol_steps, n_t_steps, n_money_steps, D): - CALL = np.zeros((N_GREEKS, n_t_steps, n_vol_steps, n_money_steps,), dtype = D) - PUT = np.zeros((N_GREEKS, n_t_steps, n_vol_steps, n_money_steps,), dtype = D) - S=np.full((n_t_steps, n_vol_steps, n_money_steps,),S0, dtype = D) - 
temp_arr = np.arange((n_vol_steps*n_t_steps*n_money_steps), dtype=int) - k_temp=(temp_arr%n_money_steps)*money_step - k_temp = k_temp.reshape((n_t_steps, n_vol_steps, n_money_steps,)) - K=(k_temp+(1 + money_start))*S0 - - t_temp = (temp_arr%(n_vol_steps*n_money_steps))*vol_step - t_temp = t_temp.reshape((n_t_steps, n_vol_steps, n_money_steps,)) - T=t_temp+t_start - R= 0.02 - V=np.full((n_t_steps, n_vol_steps, n_money_steps), vol_start, dtype = D) + CALL = np.zeros( + ( + N_GREEKS, + n_t_steps, + n_vol_steps, + n_money_steps, + ), + dtype=D, + ) + PUT = np.zeros( + ( + N_GREEKS, + n_t_steps, + n_vol_steps, + n_money_steps, + ), + dtype=D, + ) + S = np.full( + ( + n_t_steps, + n_vol_steps, + n_money_steps, + ), + S0, + dtype=D, + ) + temp_arr = np.arange((n_vol_steps * n_t_steps * n_money_steps), dtype=int) + k_temp = (temp_arr % n_money_steps) * money_step + k_temp = k_temp.reshape( + ( + n_t_steps, + n_vol_steps, + n_money_steps, + ) + ) + K = (k_temp + (1 + money_start)) * S0 + + t_temp = (temp_arr % (n_vol_steps * n_money_steps)) * vol_step + t_temp = t_temp.reshape( + ( + n_t_steps, + n_vol_steps, + n_money_steps, + ) + ) + T = t_temp + t_start + R = 0.02 + V = np.full((n_t_steps, n_vol_steps, n_money_steps), vol_start, dtype=D) for i in range(n_vol_steps): - V[:,i,:]+=i*vol_step + V[:, i, :] += i * vol_step return CALL, PUT, S, K, T, R, V + def normCDF(d): A1 = 0.31938153 A2 = -0.356563782 @@ -77,37 +112,44 @@ def normCDF(d): K = 1.0 / (1.0 + 0.2316419 * np.absolute(d)) - cnd = RSQRT2PI * np.exp(- 0.5 * d * d) * (K * (A1 + K * (A2 + K * (A3 + K * (A4 + K * A5))))) + cnd = ( + RSQRT2PI + * np.exp(-0.5 * d * d) + * (K * (A1 + K * (A2 + K * (A3 + K * (A4 + K * A5))))) + ) return np.where(d > 0, 1.0 - cnd, cnd) + def normPDF(d): RSQRT2PI = 0.39894228040143267793994605993438 - return RSQRT2PI * np.exp(- 0.5 * d * d) + return RSQRT2PI * np.exp(-0.5 * d * d) + -def black_scholes ( out , S, K, R, T, V, CP, greek): - EPS = 0.00000001 +def black_scholes(out, S, K, R, T, V, CP, greek): stdev = V * np.sqrt(T) - df = np.exp(-R*T) - d1 = (np.log(S/K)+(R+0.5*V*V)*T)/stdev - d2= d1-stdev - nd1 = normCDF(CP*d1) - nd2 = normCDF(CP*d2) + df = np.exp(-R * T) + d1 = (np.log(S / K) + (R + 0.5 * V * V) * T) / stdev + d2 = d1 - stdev + nd1 = normCDF(CP * d1) + nd2 = normCDF(CP * d2) if greek == Greeks.PREM: - out[...] = CP*(S*nd1 - K*df*nd2) + out[...] = CP * (S * nd1 - K * df * nd2) elif greek == Greeks.DELTA: - out[...] = CP*nd1 - elif greek ==Greeks.VEGA: - out[...] = S*np.sqrt(T)*normPDF(d1) + out[...] = CP * nd1 + elif greek == Greeks.VEGA: + out[...] = S * np.sqrt(T) * normPDF(d1) elif greek == Greeks.GAMMA: - out[...] = normPDF(d1)/(S*V*np.sqrt(T)) + out[...] = normPDF(d1) / (S * V * np.sqrt(T)) elif greek == Greeks.VANNA: - out[...] = -d2*normPDF(d1)/V + out[...] = -d2 * normPDF(d1) / V elif greek == Greeks.VOLGA: - out[...] = S*np.sqrt(T)*d1*d2*normPDF(d1)/V + out[...] = S * np.sqrt(T) * d1 * d2 * normPDF(d1) / V elif greek == Greeks.THETA: - out[...] = -(0.5*S*V/np.sqrt(T)*normPDF(d1)+CP*R*df*K*nd2) + out[...] 
= -( + 0.5 * S * V / np.sqrt(T) * normPDF(d1) + CP * R * df * K * nd2 + ) else: raise RuntimeError("Wrong greek name is passed") @@ -115,20 +157,23 @@ def black_scholes ( out , S, K, R, T, V, CP, greek): def run_black_scholes(n_vol_steps, n_t_steps, n_money_steps): timer = CuNumericTimer() print("Start black_scholes") - CALL, PUT, S, K, T, R, V = initialize(n_vol_steps, n_t_steps, n_money_steps, np.float32) + CALL, PUT, S, K, T, R, V = initialize( + n_vol_steps, n_t_steps, n_money_steps, np.float32 + ) print("After the initialization") - for i in range (NUM_ITERS): - if i==WARMUP_ITER: + for i in range(NUM_ITERS): + if i == WARMUP_ITER: timer.start() for g in Greeks: - black_scholes(CALL[g.value],S, K, R, T, V, 1, g) - black_scholes(PUT[g.value],S, K, R, T, V, -1, g) + black_scholes(CALL[g.value], S, K, R, T, V, 1, g) + black_scholes(PUT[g.value], S, K, R, T, V, -1, g) - total = (timer.stop())/(NUM_ITERS-WARMUP_ITER) + total = (timer.stop()) / (NUM_ITERS - WARMUP_ITER) print("Elapsed Time: {} ms".format(total)) return total + if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument( @@ -157,14 +202,11 @@ def run_black_scholes(n_vol_steps, n_t_steps, n_money_steps): help="number of money steps", ) - args, np, timer = parse_args(parser) - + run_benchmark( run_black_scholes, args.benchmark, "Black Scholes", (args.n_vol_steps, args.n_time_steps, args.n_money_steps), ) - - diff --git a/src/cunumeric/cuda_help.h b/src/cunumeric/cuda_help.h index eb90ab6da2..de88b6707d 100644 --- a/src/cunumeric/cuda_help.h +++ b/src/cunumeric/cuda_help.h @@ -392,42 +392,4 @@ __device__ __forceinline__ void store_streaming(double* ptr, double valu { asm volatile("st.global.cs.f64 [%0], %1;" : : "l"(ptr), "d"(value) : "memory"); } -#if 0 -#include - -class JITKernelStorage -{ - -private: - JITKernelStorage(){} - std::map, CUfunction> jit_functions_; - -public: - JITKernelStorage( JITKernelStorage const&) = delete; - - void operator=(JITKernelStorage const&) = delete; - - static JITKernelStorage& get_instance(void){ - static JITKernelStorage instance; - return instance; - } - - bool registered_jit_funtion(std::pair &key){ - return jit_functions_.find(key)!=jit_functions_.end(); - }; - - CUfunction return_saved_jit_function(std::pair &key){ - if ( - jit_functions_.find(key)!=jit_functions_.end()) - return jit_functions_[key]; - else - assert(false);//should never come here - } - - void add_jit_function(std::pair &key, CUfunction func){ - jit_functions_.insert({key, func}); - } -};//class JITKernelStorage - -#endif } // namespace cunumeric diff --git a/src/cunumeric/pitches.h b/src/cunumeric/pitches.h index dd95068e38..27d179b0e5 100644 --- a/src/cunumeric/pitches.h +++ b/src/cunumeric/pitches.h @@ -53,12 +53,9 @@ class Pitches { point[DIM] += index; return point; } - + __CUDA_HD__ - inline const size_t* data(void) - { - return &pitches[0]; - } + inline const size_t* data(void) { return &pitches[0]; } private: size_t pitches[DIM]; @@ -97,11 +94,7 @@ class Pitches { } __CUDA_HD__ - inline const size_t* data(void) - { - return &pitches[0]; - } - + inline const size_t* data(void) { return &pitches[0]; } private: size_t pitches[DIM]; @@ -116,10 +109,10 @@ class Pitches<0, C_ORDER> { { if (rect.lo[0] > rect.hi[0]) return 0; - else{ - pitches[0]=rect.hi[0] - rect.lo[0] + 1; + else { + pitches[0] = rect.hi[0] - rect.lo[0] + 1; return (rect.hi[0] - rect.lo[0] + 1); - } + } } __CUDA_HD__ inline legate::Point<1> unflatten(size_t index, const legate::Point<1>& lo) const @@ -129,14 +122,10 @@ class 
Pitches<0, C_ORDER> { return point; } __CUDA_HD__ - inline const size_t* data(void) - { - return &pitches[0]; - } + inline const size_t* data(void) { return &pitches[0]; } - private: + private: size_t pitches[1]; - }; } // namespace cunumeric diff --git a/src/cunumeric/vectorize/create_cu_kernel.cu b/src/cunumeric/vectorize/create_cu_kernel.cu index dde9616543..5805d2ef1c 100644 --- a/src/cunumeric/vectorize/create_cu_kernel.cu +++ b/src/cunumeric/vectorize/create_cu_kernel.cu @@ -24,19 +24,11 @@ namespace cunumeric { using namespace Legion; using namespace legate; -__global__ static void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM) - fill_out_kernel(const AccessorRD, true, 1> out) -{ - const int idx = (blockIdx.x * blockDim.x + threadIdx.x); - if (idx > 0) return; - out.reduce(0, true); -} - /*static*/ void CreateCUKernelTask::gpu_variant(TaskContext& context) { int64_t ptx_hash = context.scalars()[0].value(); std::string ptx = context.scalars()[1].value(); - Processor point = context.get_current_processor(); + Processor point = legate::Processor::get_executing_processor(); CUfunction func; const unsigned num_options = 4; diff --git a/src/cunumeric/vectorize/eval_udf.cc b/src/cunumeric/vectorize/eval_udf.cc index 1860b588c1..3ea306ff50 100644 --- a/src/cunumeric/vectorize/eval_udf.cc +++ b/src/cunumeric/vectorize/eval_udf.cc @@ -66,12 +66,11 @@ struct EvalUdfCPU { std::vector scalars; for (size_t i = 2; i < (2 + num_scalars); i++) scalars.push_back(context.scalars()[i]); - EvalUdfArgs args{context.scalars()[2 + num_scalars].value(), - context.inputs(), - context.outputs(), - scalars, - num_outputs, - context.get_current_processor()}; + EvalUdfArgs args + { + context.scalars()[2 + num_scalars].value(), context.inputs(), context.outputs(), + scalars, num_outputs, legate::Processor::get_executing_processor() + }; int dim = 1; if (args.inputs.size() > 0) { dim = args.inputs[0].dim() == 0 ? 1 : args.inputs[0].dim(); diff --git a/src/cunumeric/vectorize/eval_udf_template.inl b/src/cunumeric/vectorize/eval_udf_template.inl deleted file mode 100644 index c0bc72c428..0000000000 --- a/src/cunumeric/vectorize/eval_udf_template.inl +++ /dev/null @@ -1,69 +0,0 @@ -/* Copyright 2023 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- * - */ - -#pragma once - -// Useful for IDEs -#include "cunumeric/vectorize/eval_udf.h" -#include "cunumeric/pitches.h" - -namespace cunumeric { - -using namespace Legion; -using namespace legate; - -template -struct EvalUdfImplBody; - -template -struct EvalUdfImpl { - template - void operator()(EvalUdfArgs& args) const - { - using UDF = void(void**, size_t); - auto udf = reinterpret_cast(args.func_ptr); - std::vector udf_args; - using VAL = legate_type_of; - auto rect = args.args[0].shape(); - - size_t strides[DIM]; - - if (rect.empty()) return; - EvalUdfImplBody(); - for (size_t i = 0; i < args.args.size(); i++) { - auto out = args.args[i].write_accessor(rect); - udf_args.push_back(reinterpret_cast(out.ptr(rect, strides))); - for (size_t i = 0; i < DIM; i++) - std::cout << "IRINA DEBUG strides = " << strides[i] << std::endl; - } - - udf(udf_args.data(), rect.volume()); - } -}; - -template -static void eval_udf_template(TaskContext& context) -{ - is_gpus = context.scalars()[0].value(); - if (is_gpus) - EvalUdfArgs args{0, context.scalars()[1].value(), context.outputs()}; - else - EvalUdfArgs args{context.scalars()[1].value(),'', context.outputs()}; - size_t dim = args.args[0].dim() == 0 ? 1 : args.args[0].dim(); - double_dispatch(dim, args.args[0].code(), EvalUdfImpl{}, args); -} - -} // namespace cunumeric diff --git a/tests/integration/test_vectorize.py b/tests/integration/test_vectorize.py index f75aeab085..429bd5474b 100644 --- a/tests/integration/test_vectorize.py +++ b/tests/integration/test_vectorize.py @@ -16,177 +16,189 @@ import numpy as np import pytest - -import cunumeric as num -import numpy as np from legate.core import LEGATE_MAX_DIM from utils.generators import mk_seq_array +import cunumeric as num + + def my_func(a, b): a = a * 2 + b return a -#Capital letters and numbers in the signature + +# Capital letters and numbers in the signature def my_func2(A0, B0): A0 = A0 * 2 + B0 - C0=A0*2 - return A0,C0 + C0 = A0 * 2 + return A0, C0 + def test_vectorize(): - #2 arrays + # 2 arrays func = num.vectorize(my_func) a = num.arange(5) b = num.ones((5,)) a = func(a, b) - assert(np.array_equal(a, [1,3,5,7,9])) + assert np.array_equal(a, [1, 3, 5, 7, 9]) - #array and scalar + # array and scalar func = num.vectorize(my_func) - a= num.arange(5) - b=2 - a = func(a,b) - assert(np.array_equal(a, [2,4,6,8,10])) - - #2 scalars + a = num.arange(5) + b = 2 + a = func(a, b) + assert np.array_equal(a, [2, 4, 6, 8, 10]) + + # 2 scalars func = num.vectorize(my_func) - a=3 - b=2 - a = func(a,b) - assert(a ==8) + a = 3 + b = 2 + a = func(a, b) + assert a == 8 + def empty_func(): print("within empty function") -def print_func(a,b): - print ("I am pringing input arguments", a, b) def test_empty_functions(): - #empty function + # empty function func = num.vectorize(empty_func) func() - func2 = num.vectorize(print_func) - print_func(1,2) - - print_func(np.array([1,2,3]), 2) - def test_vectorize_over_slices(): - #reuse the same vectorize object on - #different slices + # reuse the same vectorize object on + # different slices func_num = num.vectorize(my_func) func_np = np.vectorize(my_func) - a=np.array([[1,2,3],[4,5,6],[7,8,9]]) - b=np.array([[10,11,12],[13,14,15],[16,17,18]]) - a_num=num.array(a) + a = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]) + b = np.array([[10, 11, 12], [13, 14, 15], [16, 17, 18]]) + a_num = num.array(a) b_num = num.array(b) - a[:2] = func_np(a[:2],b[:2]) - a_num[:2] = func_num(a_num[:2],b_num[:2]) + a[:2] = func_np(a[:2], b[:2]) + a_num[:2] = func_num(a_num[:2], b_num[:2]) 
assert np.array_equal(a, a_num) - - a=np.arange(100).reshape((25,4)) - a_num= num.array(a) - b=a*10 - b_num=a_num*10 - a=func_np(a,b) - a_num=func_num(a_num, b_num) + a = np.arange(100).reshape((25, 4)) + a_num = num.array(a) + b = a * 10 + b_num = a_num * 10 + a = func_np(a, b) + a_num = func_num(a_num, b_num) assert np.array_equal(a, a_num) - #reusing the same function for different inputs - a[:,2]=func_np(a[:, 2], b[:,2]) - a_num[:,2] =func_num(a_num[:,2],b_num[:,2]) + # reusing the same function for different inputs + a[:, 2] = func_np(a[:, 2], b[:, 2]) + a_num[:, 2] = func_num(a_num[:, 2], b_num[:, 2]) assert np.array_equal(a, a_num) - #reusing the same function for different inputs - a[5:10,2]=func_np(a[5:10, 2], b[1:6,2]) - a_num[5:10,2]=func_num(a_num[5:10,2],b_num[1:6,2]) + # reusing the same function for different inputs + a[5:10, 2] = func_np(a[5:10, 2], b[1:6, 2]) + a_num[5:10, 2] = func_num(a_num[5:10, 2], b_num[1:6, 2]) assert np.array_equal(a, a_num) - #reusing the same function for different inputs - a[15:20]=func_np(a[15:20], b[15:20]) - a_num[15:20]=func_num(a_num[15:20],b_num[15:20]) + # reusing the same function for different inputs + a[15:20] = func_np(a[15:20], b[15:20]) + a_num[15:20] = func_num(a_num[15:20], b_num[15:20]) assert np.array_equal(a, a_num) # reusing the same function for different inputs - a=np.arange(1000).reshape((25,10,4)) - a_num= num.array(a) - a[:, 2, :] = func_np(a[:, 2, :],2) - a_num[:, 2, :]=func_num(a_num[:, 2, :],2) + a = np.arange(1000).reshape((25, 10, 4)) + a_num = num.array(a) + a[:, 2, :] = func_np(a[:, 2, :], 2) + a_num[:, 2, :] = func_num(a_num[:, 2, :], 2) assert np.array_equal(a, a_num) + def test_multiple_outputs(): - #checking signature with capital letters and numbers + # checking signature with capital letters and numbers # + checking multiple outputs - a=np.arange(100).reshape((25,4)) - a_num= num.array(a) - b=a*10 - b_num=a_num*10 + a = np.arange(100).reshape((25, 4)) + a_num = num.array(a) + b = a * 10 + b_num = a_num * 10 func_np = np.vectorize(my_func2) - func_num=num.vectorize(my_func2) - a,c=func_np(a,b) - a_num,c_num = func_num(a_num, b_num) + func_num = num.vectorize(my_func2) + a, c = func_np(a, b) + a_num, c_num = func_num(a_num, b_num) assert np.array_equal(a, a_num) assert np.array_equal(c, c_num) + def test_different_types(): - #checking the case when input and output types are different - a=np.arange(100, dtype = int).reshape((25,4)) - a_num= num.array(a) - b=a*10 - b_num=a_num*10 + # checking the case when input and output types are different + a = np.arange(100, dtype=int).reshape((25, 4)) + a_num = num.array(a) + b = a * 10 + b_num = a_num * 10 func_np = np.vectorize(my_func, otypes=(float,)) - func_num=num.vectorize(my_func, otypes=(float,)) - a=func_np(a,b) - a_num=func_num(a_num, b_num) + func_num = num.vectorize(my_func, otypes=(float,)) + a = func_np(a, b) + a_num = func_num(a_num, b_num) assert np.array_equal(a, a_num) - #another test for different types - a=np.arange(100, dtype = float).reshape((25,4)) - a_num= num.array(a) - b=a*10 - b_num=a_num*10 - func_np = np.vectorize(my_func2, otypes = (int, int,)) - func_num=num.vectorize(my_func2, otypes = (int, int, )) - a,c=func_np(a,b) - a_num,c_num = func_num(a_num, b_num) + # another test for different types + a = np.arange(100, dtype=float).reshape((25, 4)) + a_num = num.array(a) + b = a * 10 + b_num = a_num * 10 + func_np = np.vectorize( + my_func2, + otypes=( + int, + int, + ), + ) + func_num = num.vectorize( + my_func2, + otypes=( + int, + int, + ), + ) 
+ a, c = func_np(a, b) + a_num, c_num = func_num(a_num, b_num) assert np.array_equal(a, a_num) assert np.array_equal(c, c_num) def test_cache(): - a=np.arange(100).reshape((25,4)) - a_num= num.array(a) - b=a*10 - b_num=a_num*10 - func_np = np.vectorize(my_func2, cache = True) - func_num=num.vectorize(my_func2, cache = True) - for i in range (10): - a=a*2 - b=b*3 - a_num=a_num*2 - b_num=b_num*3 - a,c=func_np(a,b) - a_num,c_num = func_num(a_num, b_num) + a = np.arange(100).reshape((25, 4)) + a_num = num.array(a) + b = a * 10 + b_num = a_num * 10 + func_np = np.vectorize(my_func2, cache=True) + func_num = num.vectorize(my_func2, cache=True) + for i in range(10): + a = a * 2 + b = b * 3 + a_num = a_num * 2 + b_num = b_num * 3 + a, c = func_np(a, b) + a_num, c_num = func_num(a_num, b_num) assert np.array_equal(a, a_num) assert np.array_equal(c, c_num) -#checking caching on different shapes of arrays: -func_np2 = np.vectorize(my_func2, cache = True) -func_num2=num.vectorize(my_func2, cache = True) + +# checking caching on different shapes of arrays: +func_np2 = np.vectorize(my_func2, cache=True) +func_num2 = num.vectorize(my_func2, cache=True) + @pytest.mark.parametrize("ndim", range(1, LEGATE_MAX_DIM + 1)) def test_nd_vectorize(ndim): a_shape = tuple(np.random.randint(1, 9) for _ in range(ndim)) a = mk_seq_array(np, a_shape) a_num = num.array(a) - b=a*2 - b_num=num.array(b) - a,c=func_np2(a,b) - a_num,c_num = func_num2(a_num, b_num) + b = a * 2 + b_num = num.array(b) + a, c = func_np2(a, b) + a_num, c_num = func_num2(a_num, b_num) assert np.array_equal(a, a_num) - assert np.array_equal(c, c_num) + assert np.array_equal(c, c_num) + if __name__ == "__main__": import sys From 393aa1b449eb6af2b366c3978bb29ec695cd03f7 Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Thu, 23 Mar 2023 21:00:02 -0700 Subject: [PATCH 64/78] clean-up + formatting --- src/cunumeric/vectorize/eval_udf.cc | 11 ++++++----- src/cunumeric/vectorize/eval_udf.cu | 2 +- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/src/cunumeric/vectorize/eval_udf.cc b/src/cunumeric/vectorize/eval_udf.cc index 3ea306ff50..8952fb1262 100644 --- a/src/cunumeric/vectorize/eval_udf.cc +++ b/src/cunumeric/vectorize/eval_udf.cc @@ -66,11 +66,12 @@ struct EvalUdfCPU { std::vector scalars; for (size_t i = 2; i < (2 + num_scalars); i++) scalars.push_back(context.scalars()[i]); - EvalUdfArgs args - { - context.scalars()[2 + num_scalars].value(), context.inputs(), context.outputs(), - scalars, num_outputs, legate::Processor::get_executing_processor() - }; + EvalUdfArgs args{context.scalars()[2 + num_scalars].value(), + context.inputs(), + context.outputs(), + scalars, + num_outputs, + legate::Processor::get_executing_processor()}; int dim = 1; if (args.inputs.size() > 0) { dim = args.inputs[0].dim() == 0 ? 
1 : args.inputs[0].dim(); diff --git a/src/cunumeric/vectorize/eval_udf.cu b/src/cunumeric/vectorize/eval_udf.cu index f7c6a452a9..dd1d52c75b 100644 --- a/src/cunumeric/vectorize/eval_udf.cu +++ b/src/cunumeric/vectorize/eval_udf.cu @@ -129,7 +129,7 @@ struct EvalUdfGPU { context.outputs(), scalars, num_outputs, - context.get_current_processor(), + legate::Processor::get_executing_processor(), ptx_hash}; size_t dim = 1; if (args.inputs.size() > 0) { From c8dd7fa064b302b5513153573e5a5860c98bd406 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 24 Mar 2023 04:10:32 +0000 Subject: [PATCH 65/78] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- cunumeric/vectorize.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cunumeric/vectorize.py b/cunumeric/vectorize.py index 893b0189ed..ac150c1e36 100644 --- a/cunumeric/vectorize.py +++ b/cunumeric/vectorize.py @@ -400,7 +400,7 @@ def _compile_func_cpu(self) -> numba.core.ccallback.CFunc: numba.core.types.uint64, numba.core.types.CPointer(numba.core.types.uint64), numba.core.types.CPointer(numba.core.types.uint64), - ) + ) return numba.cfunc(sig)(self._numba_func) From 92170ff3639690f749b970b7a63ae1c827843fb6 Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Sun, 26 Mar 2023 20:37:50 -0700 Subject: [PATCH 66/78] Update cunumeric/vectorize.py Co-authored-by: Bryan Van de Ven --- cunumeric/vectorize.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cunumeric/vectorize.py b/cunumeric/vectorize.py index ac150c1e36..f7345dba48 100644 --- a/cunumeric/vectorize.py +++ b/cunumeric/vectorize.py @@ -500,7 +500,7 @@ def __call__(self, *args: Any, **kwargs: Any) -> Union[Any, Tuple[Any]]: elif self._cache: if self._cached_scalar_types[scalar_idx] != type(arg): raise TypeError( - " Input arguments to vectorized function should" + "Input arguments to vectorized function should" " have consistent types for each invocation" ) self._scalar_args.append(arg) From a3196b4737e67e17e86cc3132fa4108665dd726f Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Mon, 27 Mar 2023 11:28:52 -0700 Subject: [PATCH 67/78] addressing comments from Bryan --- cunumeric/utils.py | 3 +- cunumeric/vectorize.py | 105 +++++++------- examples/black_scholes_greeks.py | 212 ---------------------------- pyproject.toml | 3 +- tests/integration/test_vectorize.py | 90 +++++++----- typings/numba/types/__init__.pyi | 15 +- 6 files changed, 114 insertions(+), 314 deletions(-) delete mode 100644 examples/black_scholes_greeks.py diff --git a/cunumeric/utils.py b/cunumeric/utils.py index 381470a0e9..62917020d2 100644 --- a/cunumeric/utils.py +++ b/cunumeric/utils.py @@ -121,8 +121,7 @@ def is_supported_dtype(dtype: Any) -> bool: def convert_to_cunumeric_dtype(dtype: str) -> Any: if dtype in CUNUMERIC_TYPE_MAP: return CUNUMERIC_TYPE_MAP[dtype] - else: - raise TypeError("dtype is not supported") + raise TypeError("dtype is not supported") def calculate_volume(shape: NdShape) -> int: diff --git a/cunumeric/vectorize.py b/cunumeric/vectorize.py index f7345dba48..c4b73f5f9a 100644 --- a/cunumeric/vectorize.py +++ b/cunumeric/vectorize.py @@ -126,7 +126,6 @@ def __init__( self._scalar_idxs: List[int] = [] self._scalar_names: List[str] = [] self._arg_names: List[str] = [] - self._kwargs: List[Any] = [] self._context = runtime.legate_context self._created: bool = False self._func_body: List[str] = [] @@ -136,7 +135,7 @@ def __init__( else: self.__doc__ 
= doc - self._return_names = self._get_return_argumets() + self._return_names = self._get_return_arguments() self._num_outputs: int = len(self._return_names) self._return_args: List[Any] = [] self._output_dtype: Optional[np.dtype[Any]] = None @@ -170,7 +169,7 @@ def __init__( "signature variable is not supported yet" ) - def _get_func_body(self, func: Callable[[Any], Any]) -> list[str]: + def _get_func_body(self, func: Callable[[Any], Any]) -> List[str]: """Using the magic method __doc__, we KNOW the size of the docstring. We then, just subtract this from the total length of the function """ @@ -185,7 +184,7 @@ def _get_func_body(self, func: Callable[[Any], Any]) -> list[str]: return_lines.append(lines[i].rstrip()) return return_lines - def _get_return_argumets(self) -> list[str]: + def _get_return_arguments(self) -> List[str]: """ Returns the list of names for return arrays/values """ @@ -478,51 +477,9 @@ def _execute(self, is_gpu: bool, num_gpus: int = 0) -> None: ) # type : ignore task.execute() - def __call__(self, *args: Any, **kwargs: Any) -> Union[Any, Tuple[Any]]: - # each time we call `vectorize` on a pyfunc we need to clear - # these lists to support different types of arguments passed - self._scalar_args.clear() - self._scalar_idxs.clear() - self._args.clear() - self._arg_names.clear() - self._scalar_names.clear() - - scalar_idx = 0 - for i, arg in enumerate(args): - if arg is None: - raise ValueError( - "None is not supported in user function " - "passed to cunumeric.vectorize" - ) - elif np.ndim(arg) == 0: - if self._cache and not self._created: - self._cached_scalar_types.append(type(arg)) - elif self._cache: - if self._cached_scalar_types[scalar_idx] != type(arg): - raise TypeError( - "Input arguments to vectorized function should" - " have consistent types for each invocation" - ) - self._scalar_args.append(arg) - self._scalar_idxs.append(i) - scalar_idx += 1 - else: - self._args.append(convert_to_cunumeric_ndarray(arg)) - - # first fill arrays to argnames, then scalars: - for i, k in enumerate(inspect.signature(self._pyfunc).parameters): - if not (i in self._scalar_idxs): - self._arg_names.append(k) - - for i, k in enumerate(inspect.signature(self._pyfunc).parameters): - if i in self._scalar_idxs: - self._scalar_names.append(k) - - self._kwargs = list(kwargs) - if len(self._kwargs) > 0: - raise NotImplementedError( - "kwargs are not supported in user functions" - ) + def _filter_arguments_and_check(self) -> None: + # this method will filter return and input arguments + # it will also check shape and type of the arguments output_shape: Tuple[int] = (-1,) output_dtype = self._output_dtype @@ -622,6 +579,53 @@ def __call__(self, *args: Any, **kwargs: Any) -> Union[Any, Tuple[Any]]: "user function passed to vectorize" ) + def __call__(self, *args: Any, **kwargs: Any) -> Union[Any, Tuple[Any]]: + # each time we call `vectorize` on a pyfunc we need to clear + # these lists to support different types of arguments passed + self._scalar_args.clear() + self._scalar_idxs.clear() + self._args.clear() + self._arg_names.clear() + self._scalar_names.clear() + + scalar_idx = 0 + for i, arg in enumerate(args): + if arg is None: + raise ValueError( + "None is not supported in user function " + "passed to cunumeric.vectorize" + ) + elif np.ndim(arg) == 0: + if self._cache and not self._created: + self._cached_scalar_types.append(type(arg)) + elif self._cache: + if self._cached_scalar_types[scalar_idx] != type(arg): + raise TypeError( + "Input arguments to vectorized function should" + " have 
consistent types for each invocation" + ) + self._scalar_args.append(arg) + self._scalar_idxs.append(i) + scalar_idx += 1 + else: + self._args.append(convert_to_cunumeric_ndarray(arg)) + + # first fill arrays to argnames, then scalars: + for i, k in enumerate(inspect.signature(self._pyfunc).parameters): + if not (i in self._scalar_idxs): + self._arg_names.append(k) + + for i, k in enumerate(inspect.signature(self._pyfunc).parameters): + if i in self._scalar_idxs: + self._scalar_names.append(k) + + if len(kwargs) > 0: + raise NotImplementedError( + "kwargs are not supported in user functions" + ) + + self._filter_arguments_and_check() + if runtime.num_gpus > 0: if not self._created: self._numba_func = self._build_gpu_function() @@ -639,5 +643,4 @@ def __call__(self, *args: Any, **kwargs: Any) -> Union[Any, Tuple[Any]]: return self._return_args[0] if len(self._return_args) > 1: return tuple(self._return_args) - else: - return -1 + return -1 diff --git a/examples/black_scholes_greeks.py b/examples/black_scholes_greeks.py deleted file mode 100644 index 6d955c945b..0000000000 --- a/examples/black_scholes_greeks.py +++ /dev/null @@ -1,212 +0,0 @@ -#!/usr/bin/env python - -# Copyright 2021-2022 NVIDIA Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# - -import argparse -from enum import IntEnum - -from benchmark import CuNumericTimer, parse_args, run_benchmark - -NUM_ITERS = 10 -WARMUP_ITER = 2 - -vol_start = 0.1 -vol_step = 0.01 -t_start = 0.5 -t_step = 1.0 / (365 * 10) -money_start = -0.4 -money_step = 0.01 - - -RISKFREE = 0.02 -S0 = 100.0 -N_GREEKS = 7 - - -class Greeks(IntEnum): - PREM = (0,) - DELTA = (1,) - VEGA = (2,) - GAMMA = (3,) - VANNA = (4,) - VOLGA = (5,) - THETA = 6 - - -def initialize(n_vol_steps, n_t_steps, n_money_steps, D): - CALL = np.zeros( - ( - N_GREEKS, - n_t_steps, - n_vol_steps, - n_money_steps, - ), - dtype=D, - ) - PUT = np.zeros( - ( - N_GREEKS, - n_t_steps, - n_vol_steps, - n_money_steps, - ), - dtype=D, - ) - S = np.full( - ( - n_t_steps, - n_vol_steps, - n_money_steps, - ), - S0, - dtype=D, - ) - temp_arr = np.arange((n_vol_steps * n_t_steps * n_money_steps), dtype=int) - k_temp = (temp_arr % n_money_steps) * money_step - k_temp = k_temp.reshape( - ( - n_t_steps, - n_vol_steps, - n_money_steps, - ) - ) - K = (k_temp + (1 + money_start)) * S0 - - t_temp = (temp_arr % (n_vol_steps * n_money_steps)) * vol_step - t_temp = t_temp.reshape( - ( - n_t_steps, - n_vol_steps, - n_money_steps, - ) - ) - T = t_temp + t_start - R = 0.02 - V = np.full((n_t_steps, n_vol_steps, n_money_steps), vol_start, dtype=D) - for i in range(n_vol_steps): - V[:, i, :] += i * vol_step - - return CALL, PUT, S, K, T, R, V - - -def normCDF(d): - A1 = 0.31938153 - A2 = -0.356563782 - A3 = 1.781477937 - A4 = -1.821255978 - A5 = 1.330274429 - RSQRT2PI = 0.39894228040143267793994605993438 - - K = 1.0 / (1.0 + 0.2316419 * np.absolute(d)) - - cnd = ( - RSQRT2PI - * np.exp(-0.5 * d * d) - * (K * (A1 + K * (A2 + K * (A3 + K * (A4 + K * A5))))) - ) - - return np.where(d > 0, 1.0 - cnd, cnd) - - -def normPDF(d): - RSQRT2PI = 0.39894228040143267793994605993438 - return RSQRT2PI * np.exp(-0.5 * d * d) - - -def black_scholes(out, S, K, R, T, V, CP, greek): - stdev = V * np.sqrt(T) - df = np.exp(-R * T) - d1 = (np.log(S / K) + (R + 0.5 * V * V) * T) / stdev - d2 = d1 - stdev - nd1 = normCDF(CP * d1) - nd2 = normCDF(CP * d2) - - if greek == Greeks.PREM: - out[...] = CP * (S * nd1 - K * df * nd2) - elif greek == Greeks.DELTA: - out[...] = CP * nd1 - elif greek == Greeks.VEGA: - out[...] = S * np.sqrt(T) * normPDF(d1) - elif greek == Greeks.GAMMA: - out[...] = normPDF(d1) / (S * V * np.sqrt(T)) - elif greek == Greeks.VANNA: - out[...] = -d2 * normPDF(d1) / V - elif greek == Greeks.VOLGA: - out[...] = S * np.sqrt(T) * d1 * d2 * normPDF(d1) / V - elif greek == Greeks.THETA: - out[...] 
= -( - 0.5 * S * V / np.sqrt(T) * normPDF(d1) + CP * R * df * K * nd2 - ) - else: - raise RuntimeError("Wrong greek name is passed") - - -def run_black_scholes(n_vol_steps, n_t_steps, n_money_steps): - timer = CuNumericTimer() - print("Start black_scholes") - CALL, PUT, S, K, T, R, V = initialize( - n_vol_steps, n_t_steps, n_money_steps, np.float32 - ) - - print("After the initialization") - for i in range(NUM_ITERS): - if i == WARMUP_ITER: - timer.start() - for g in Greeks: - black_scholes(CALL[g.value], S, K, R, T, V, 1, g) - black_scholes(PUT[g.value], S, K, R, T, V, -1, g) - - total = (timer.stop()) / (NUM_ITERS - WARMUP_ITER) - print("Elapsed Time: {} ms".format(total)) - return total - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument( - "-v", - "--vol_tesps", - type=int, - default=40, - dest="n_vol_steps", - help="number of voltivity steps", - ) - - parser.add_argument( - "-t", - "--time_tesps", - type=int, - default=3650, - dest="n_time_steps", - help="number of time steps", - ) - parser.add_argument( - "-m", - "--money_tesps", - type=int, - default=60, - dest="n_money_steps", - help="number of money steps", - ) - - args, np, timer = parse_args(parser) - - run_benchmark( - run_black_scholes, - args.benchmark, - "Black Scholes", - (args.n_vol_steps, args.n_time_steps, args.n_money_steps), - ) diff --git a/pyproject.toml b/pyproject.toml index 5c8e1f83aa..f577d875ce 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -39,7 +39,8 @@ exclude = ''' _build | buck-out | build | - dist + dist | + typings )/ ''' diff --git a/tests/integration/test_vectorize.py b/tests/integration/test_vectorize.py index 429bd5474b..daba0c9979 100644 --- a/tests/integration/test_vectorize.py +++ b/tests/integration/test_vectorize.py @@ -67,48 +67,31 @@ def test_empty_functions(): func() -def test_vectorize_over_slices(): - # reuse the same vectorize object on - # different slices - func_num = num.vectorize(my_func) - func_np = np.vectorize(my_func) - - a = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]) - b = np.array([[10, 11, 12], [13, 14, 15], [16, 17, 18]]) - a_num = num.array(a) - b_num = num.array(b) - a[:2] = func_np(a[:2], b[:2]) - a_num[:2] = func_num(a_num[:2], b_num[:2]) - assert np.array_equal(a, a_num) - - a = np.arange(100).reshape((25, 4)) - a_num = num.array(a) - b = a * 10 - b_num = a_num * 10 - a = func_np(a, b) - a_num = func_num(a_num, b_num) - assert np.array_equal(a, a_num) - - # reusing the same function for different inputs - a[:, 2] = func_np(a[:, 2], b[:, 2]) - a_num[:, 2] = func_num(a_num[:, 2], b_num[:, 2]) - assert np.array_equal(a, a_num) - - # reusing the same function for different inputs - a[5:10, 2] = func_np(a[5:10, 2], b[1:6, 2]) - a_num[5:10, 2] = func_num(a_num[5:10, 2], b_num[1:6, 2]) - assert np.array_equal(a, a_num) +func_num = num.vectorize(my_func) +func_np = np.vectorize(my_func) - # reusing the same function for different inputs - a[15:20] = func_np(a[15:20], b[15:20]) - a_num[15:20] = func_num(a_num[15:20], b_num[15:20]) - assert np.array_equal(a, a_num) - # reusing the same function for different inputs +@pytest.mark.parametrize( + "slice", + ( + (Ellipsis), + ( + slice(5, 10), + 2, + ), + (slice(15, 20),), + ), +) # , (Ellipsis,2,))) +def test_vectorize_over_slices(slice): a = np.arange(1000).reshape((25, 10, 4)) a_num = num.array(a) - a[:, 2, :] = func_np(a[:, 2, :], 2) - a_num[:, 2, :] = func_num(a_num[:, 2, :], 2) + b = a * 10 + b_num = num.array(b) + a[slice] = func_np(a[slice], b[slice]) + a_num[slice] = 
func_num(a_num[slice], b_num[slice]) + print("IRINA DEBUG", slice) + print(a) + print(a_num) assert np.array_equal(a, a_num) @@ -164,7 +147,7 @@ def test_different_types(): assert np.array_equal(c, c_num) -def test_cache(): +def test_cache_multiple_outputs(): a = np.arange(100).reshape((25, 4)) a_num = num.array(a) b = a * 10 @@ -181,6 +164,35 @@ def test_cache(): assert np.array_equal(a, a_num) assert np.array_equal(c, c_num) + a_num = a_num.astype(float) + b_num = b_num.astype(float) + msg = r"types of the arguments should stay the same" + with pytest.raises(TypeError, match=msg): + a_num = func_num(a_num, b_num) + + +def test_cache_single_output(): + a = np.arange(100).reshape((2, 50)) + a_num = num.array(a) + b = a * 10 + b_num = a_num * 10 + func_np = np.vectorize(my_func, cache=True) + func_num = num.vectorize(my_func, cache=True) + for i in range(10): + a = a * 2 + b = b * 3 + a_num = a_num * 2 + b_num = b_num * 3 + a = func_np(a, b) + a_num = func_num(a_num, b_num) + assert np.array_equal(a, a_num) + + a_num = a_num.astype(float) + b_num = b_num.astype(float) + msg = r"types of the arguments should stay the same" + with pytest.raises(TypeError, match=msg): + a_num = func_num(a_num, b_num) + # checking caching on different shapes of arrays: func_np2 = np.vectorize(my_func2, cache=True) diff --git a/typings/numba/types/__init__.pyi b/typings/numba/types/__init__.pyi index 697a068cfd..df003acbe4 100644 --- a/typings/numba/types/__init__.pyi +++ b/typings/numba/types/__init__.pyi @@ -1,15 +1,12 @@ - configuration locations on your computer. - -class Type(): ... - +class Type: ... class Number(Type): ... class Integer(Number): - def __init__(self, name: str) ->None: ... + def __init__(self, name: str) -> None: ... -class CPointer (Type): - def __init__ (self, dtype : Type) -> None : ... +class CPointer(Type): + def __init__(self, dtype: Type) -> None: ... -uint32 = Integer('uint32') -uint64 = Integer('uint64') +uint32 = Integer("uint32") +uint64 = Integer("uint64") void = none From fb7853e0f87eba2a5a8517663e08266bed22815e Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Mon, 27 Mar 2023 20:44:23 -0700 Subject: [PATCH 68/78] making strings more readable --- cunumeric/vectorize.py | 58 ++++++++++++----------------- tests/integration/test_vectorize.py | 6 +-- 2 files changed, 26 insertions(+), 38 deletions(-) diff --git a/cunumeric/vectorize.py b/cunumeric/vectorize.py index c4b73f5f9a..1dee0c6894 100644 --- a/cunumeric/vectorize.py +++ b/cunumeric/vectorize.py @@ -232,30 +232,22 @@ def _build_gpu_function(self) -> Any: ) lines.append("def {}({}):".format(funcid, ",".join(args))) - # Initialize the index variable and return immediately # when it exceeds the data size - lines.append(" local_i = cuda.grid(1)") - lines.append(" if local_i >= {}:".format(_SIZE_VAR)) - lines.append(" return") # we compute index for sparse data access when using Legion's # pointer. 
# a[x][y][z]=a[x*strides[0] + y*strides[1] + z*strides[2]] - lines.append(" {}:int = 0".format(_LOOP_VAR)) - lines.append(" for p in range({}-1):".format(_DIM_VAR)) - # FIXME make sure we compute index correct for all data types - lines.append(" x=int(local_i/{}[p])".format(_PITCHES_VAR)) - lines.append( - " local_i = int(local_i%{}[p])".format(_PITCHES_VAR) - ) - lines.append( - " {}+=int(x*{}[p])".format(_LOOP_VAR, _STRIDES_VAR) - ) - lines.append( - " {}+=int(local_i*{}[{}-1])".format( - _LOOP_VAR, _STRIDES_VAR, _DIM_VAR - ) - ) + loop_lines = f""" local_i = cuda.grid(1) + if local_i >= {_SIZE_VAR}: + return + {_LOOP_VAR}:int = 0 + for p in range({_DIM_VAR}-1): + x=int(local_i/{_PITCHES_VAR}[p]) + local_i = int(local_i%{_PITCHES_VAR}[p]) + {_LOOP_VAR}+=int(x*{_STRIDES_VAR}[p]) + {_LOOP_VAR}+=int(local_i*{_STRIDES_VAR}[{_DIM_VAR}-1]) + """ + lines += loop_lines.split("\n") # this function is used to replace all array names with array[i] def _lift_to_array_access(m: Any) -> str: @@ -326,25 +318,21 @@ def _emit_assignment( ) arg_idx += 1 - # Main loop - lines.append(" for local_i in range({}):".format(_SIZE_VAR)) + # Initialize the index variable and return immediately + # when it exceeds the data size # we compute index for sparse data access when using Legion's # pointer. # a[x][y][z]=a[x*strides[0] + y*strides[1] + z*strides[2]] - lines.append(" {}:int = 0".format(_LOOP_VAR)) - lines.append(" j:int = local_i") - lines.append(" for p in range({}-1):".format(_DIM_VAR)) - lines.append(" x=int(j/{}[p])".format(_PITCHES_VAR)) - lines.append(" j = int(j%{}[p])".format(_PITCHES_VAR)) - - lines.append( - " {}+=int(x*{}[p])".format(_LOOP_VAR, _STRIDES_VAR) - ) - lines.append( - " {}+=int(j*{}[{}-1])".format( - _LOOP_VAR, _STRIDES_VAR, _DIM_VAR - ) - ) + loop_lines = f""" for local_i in range({_SIZE_VAR}): + {_LOOP_VAR}:int = 0 + j:int = local_i + for p in range({_DIM_VAR}-1): + x=int(j/{_PITCHES_VAR}[p]) + j = int(j%{_PITCHES_VAR}[p]) + {_LOOP_VAR}+=int(x*{_STRIDES_VAR}[p]) + {_LOOP_VAR}+=int(j*{_STRIDES_VAR}[{_DIM_VAR}-1]) + """ + lines += loop_lines.split("\n") lines_old = self._func_body diff --git a/tests/integration/test_vectorize.py b/tests/integration/test_vectorize.py index daba0c9979..17ad778be7 100644 --- a/tests/integration/test_vectorize.py +++ b/tests/integration/test_vectorize.py @@ -89,9 +89,9 @@ def test_vectorize_over_slices(slice): b_num = num.array(b) a[slice] = func_np(a[slice], b[slice]) a_num[slice] = func_num(a_num[slice], b_num[slice]) - print("IRINA DEBUG", slice) - print(a) - print(a_num) + #print("IRINA DEBUG", slice) + #print(a) + #print(a_num) assert np.array_equal(a, a_num) From f62ed37dd412617715e86c22d399d7c91aa303fd Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 28 Mar 2023 03:44:46 +0000 Subject: [PATCH 69/78] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- tests/integration/test_vectorize.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/integration/test_vectorize.py b/tests/integration/test_vectorize.py index 17ad778be7..772120257f 100644 --- a/tests/integration/test_vectorize.py +++ b/tests/integration/test_vectorize.py @@ -89,9 +89,9 @@ def test_vectorize_over_slices(slice): b_num = num.array(b) a[slice] = func_np(a[slice], b[slice]) a_num[slice] = func_num(a_num[slice], b_num[slice]) - #print("IRINA DEBUG", slice) - #print(a) - #print(a_num) + # print("IRINA DEBUG", slice) + # print(a) + # 
print(a_num) assert np.array_equal(a, a_num) From 27621233e84f612a54f0155e47fe22aad746af54 Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Mon, 27 Mar 2023 20:55:08 -0700 Subject: [PATCH 70/78] replacing strings with f-strings --- cunumeric/vectorize.py | 23 +++++++---------------- 1 file changed, 7 insertions(+), 16 deletions(-) diff --git a/cunumeric/vectorize.py b/cunumeric/vectorize.py index 1dee0c6894..55f985488d 100644 --- a/cunumeric/vectorize.py +++ b/cunumeric/vectorize.py @@ -204,15 +204,15 @@ def _replace_name( add indices to the names of input/output arrays in the function body """ if (name in self._arg_names) or (name in self._return_names): - return "{}[int({})]".format(name, _LOOP_VAR) + return f"{name}[int({_LOOP_VAR})]" else: if is_gpu or ((not is_gpu) and not (name in self._scalar_names)): - return "{}".format(name) + return f"{name}" else: - return "{}[0]".format(name) + return f"{name}[0]" def _build_gpu_function(self) -> Any: - funcid = "vectorized_{}".format(self._pyfunc.__name__) + funcid = f"vectorized_{self._pyfunc.__name__}" # Preamble lines = ["from numba import cuda"] @@ -267,7 +267,7 @@ def _lift_to_array_access(m: Any) -> str: return glbs[funcid] def _build_cpu_function(self) -> Callable[[Any], Any]: - funcid = "vectorized_{}".format(self._pyfunc.__name__) + funcid = f"vectorized_{self._pyfunc.__name__}" # Preamble lines = ["from numba import carray, types"] @@ -277,14 +277,7 @@ def _build_cpu_function(self) -> Callable[[Any], Any]: # Signature lines.append( - "def {}({}, {}, {}, {}, {}):".format( - funcid, - _ARGS_VAR, - _SIZE_VAR, - _DIM_VAR, - _PITCHES_VAR, - _STRIDES_VAR, - ) + f"def {funcid}({_ARGS_VAR},{_SIZE_VAR}, {_DIM_VAR}, {_PITCHES_VAR}, {_STRIDES_VAR}):" ) # Unpack kernel arguments @@ -292,9 +285,7 @@ def _emit_assignment( var: Any, idx: int, sz: Any, ty: np.dtype[Any] ) -> None: lines.append( - " {} = carray({}[{}], {}, types.{})".format( - var, _ARGS_VAR, idx, sz, ty - ) + f" {var} = carray({ _ARGS_VAR}[{idx}], {sz}, types.{ty})" ) # define pyfunc arguments as carrays From 0a0650c43c684dbb87ed7fbf77d5128148f7b97e Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Mon, 27 Mar 2023 22:17:47 -0700 Subject: [PATCH 71/78] arguments should be a copy of arrays to match the numpy logic --- cunumeric/vectorize.py | 13 ++++++++++--- tests/integration/test_vectorize.py | 13 +++++++------ 2 files changed, 17 insertions(+), 9 deletions(-) diff --git a/cunumeric/vectorize.py b/cunumeric/vectorize.py index 55f985488d..c3310063b8 100644 --- a/cunumeric/vectorize.py +++ b/cunumeric/vectorize.py @@ -194,7 +194,12 @@ def _get_return_arguments(self) -> List[str]: if "return" in ln: ln = ln.replace("return", "") ln = ln.replace(" ", "") - return_names = ln.split(",") + return_names += ln.split(",") + # FIXME + # for n in return_names: + # if re.match("^([-+]? 
?(\d+|\(\g<1>\))( ?[-+*\/] ?\g<1>)?)$", n): + # raise NotImplementedError (" User defined function can't have" + # " mathematical operation as a return") return return_names def _replace_name( @@ -277,7 +282,8 @@ def _build_cpu_function(self) -> Callable[[Any], Any]: # Signature lines.append( - f"def {funcid}({_ARGS_VAR},{_SIZE_VAR}, {_DIM_VAR}, {_PITCHES_VAR}, {_STRIDES_VAR}):" + f"def {funcid}({_ARGS_VAR},{_SIZE_VAR}, " + f"{_DIM_VAR}, {_PITCHES_VAR}, {_STRIDES_VAR}):" ) # Unpack kernel arguments @@ -587,7 +593,8 @@ def __call__(self, *args: Any, **kwargs: Any) -> Union[Any, Tuple[Any]]: self._scalar_idxs.append(i) scalar_idx += 1 else: - self._args.append(convert_to_cunumeric_ndarray(arg)) + # we need to make a copy of original array to match numpy + self._args.append(convert_to_cunumeric_ndarray(arg.copy())) # first fill arrays to argnames, then scalars: for i, k in enumerate(inspect.signature(self._pyfunc).parameters): diff --git a/tests/integration/test_vectorize.py b/tests/integration/test_vectorize.py index 772120257f..df03987fd7 100644 --- a/tests/integration/test_vectorize.py +++ b/tests/integration/test_vectorize.py @@ -79,19 +79,20 @@ def test_empty_functions(): slice(5, 10), 2, ), - (slice(15, 20),), + (slice(3, 7),), + ( + Ellipsis, + 2, + ), ), -) # , (Ellipsis,2,))) +) def test_vectorize_over_slices(slice): - a = np.arange(1000).reshape((25, 10, 4)) + a = np.arange(160).reshape((10, 4, 4)) a_num = num.array(a) b = a * 10 b_num = num.array(b) a[slice] = func_np(a[slice], b[slice]) a_num[slice] = func_num(a_num[slice], b_num[slice]) - # print("IRINA DEBUG", slice) - # print(a) - # print(a_num) assert np.array_equal(a, a_num) From 98f15d6f4caba22e7a06495cb93c66ba3c3db516 Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Tue, 28 Mar 2023 10:13:37 -0700 Subject: [PATCH 72/78] adding missing type stubs for numpy --- typings/numba/core/__init__.pyi | 0 typings/numba/core/ccallback/__init__.pyi | 8 ++++++++ typings/numba/core/types/__init__.pyi | 25 +++++++++++++++++++++++ 3 files changed, 33 insertions(+) create mode 100644 typings/numba/core/__init__.pyi create mode 100644 typings/numba/core/ccallback/__init__.pyi create mode 100644 typings/numba/core/types/__init__.pyi diff --git a/typings/numba/core/__init__.pyi b/typings/numba/core/__init__.pyi new file mode 100644 index 0000000000..e69de29bb2 diff --git a/typings/numba/core/ccallback/__init__.pyi b/typings/numba/core/ccallback/__init__.pyi new file mode 100644 index 0000000000..81b5030b9c --- /dev/null +++ b/typings/numba/core/ccallback/__init__.pyi @@ -0,0 +1,8 @@ +from typing import Any + +class CFunc(object): + def __init__( + self, pyfunc: Any, sig: Any, locals: Any, options: Any + ) -> None: ... + @property + def address(self) -> int: ... diff --git a/typings/numba/core/types/__init__.pyi b/typings/numba/core/types/__init__.pyi new file mode 100644 index 0000000000..8bb1e2b103 --- /dev/null +++ b/typings/numba/core/types/__init__.pyi @@ -0,0 +1,25 @@ +class Opaque: ... + +class NoneType(Opaque): + def __init__(self, name: str) -> None: ... + +class Type: + def __init__(self, name: str) -> None: ... + +class Number(Type): ... + +class Integer(Number): + def __init__(self, name: str) -> None: ... + +class RawPointer: + def __init__(self, name: str) -> None: ... + +class CPointer(Type): + def __init__(self, dtype: Type) -> None: ... 
+ +none = NoneType("none") + +uint32 = Integer("uint32") +uint64 = Integer("uint64") +void = none +voidptr = Type("void*") From 8d118caeb2c2c024bae82430504063649dfbf1cd Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Wed, 29 Mar 2023 23:16:02 -0700 Subject: [PATCH 73/78] checking return statememt of UDF for special characters --- cunumeric/vectorize.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/cunumeric/vectorize.py b/cunumeric/vectorize.py index c3310063b8..33387b287c 100644 --- a/cunumeric/vectorize.py +++ b/cunumeric/vectorize.py @@ -195,11 +195,17 @@ def _get_return_arguments(self) -> List[str]: ln = ln.replace("return", "") ln = ln.replace(" ", "") return_names += ln.split(",") - # FIXME - # for n in return_names: - # if re.match("^([-+]? ?(\d+|\(\g<1>\))( ?[-+*\/] ?\g<1>)?)$", n): - # raise NotImplementedError (" User defined function can't have" - # " mathematical operation as a return") + # we check if return statement has any special characters since + # we don't support cases like "return a+b" + for n in return_names: + regex = re.compile("[^A-Za-z0-9]") + res = regex.findall(n) + if len(res) > 0: + raise NotImplementedError( + " CuNumeric doesn't support special " + "characters in the return statement of the " + "user-defined function " + ) return return_names def _replace_name( From 889fb65c8bd08c712d31d9a3b7e5936d9d24ca9f Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Fri, 31 Mar 2023 13:34:18 -0700 Subject: [PATCH 74/78] updating the map between pyarow types and legate types --- cunumeric/array.py | 2 +- cunumeric/utils.py | 36 +++++++++++++++++++----------------- cunumeric/vectorize.py | 2 +- 3 files changed, 21 insertions(+), 19 deletions(-) diff --git a/cunumeric/array.py b/cunumeric/array.py index a44e5afd8d..05a1c876ce 100644 --- a/cunumeric/array.py +++ b/cunumeric/array.py @@ -33,7 +33,7 @@ import legate.core.types as ty import numpy as np -import pyarrow # type: ignore [import] +import pyarrow from legate.core import Array from numpy.core.multiarray import ( # type: ignore [attr-defined] normalize_axis_index, diff --git a/cunumeric/utils.py b/cunumeric/utils.py index 62917020d2..d63ac2e30d 100644 --- a/cunumeric/utils.py +++ b/cunumeric/utils.py @@ -22,6 +22,7 @@ import legate.core.types as ty import numpy as np +import pyarrow as pa from .types import NdShape @@ -43,25 +44,26 @@ np.float64: ty.float64, np.complex64: ty.complex64, np.complex128: ty.complex128, + complex: ty.complex128, } CUNUMERIC_TYPE_MAP = { - "bool": ty.bool_, - "int8": ty.int8, - "int16": ty.int16, - "int32": ty.int32, - "int": ty.int64, # np.int is int - "int64": ty.int64, - "uint8": ty.uint8, - "uint16": ty.uint16, - "uint32": ty.uint32, - "uint64": ty.uint64, # np.uint is np.uint64 - "float16": ty.float16, - "float32": ty.float32, - "float": ty.float64, - "float64": ty.float64, - "complex64": ty.complex64, - "complex128": ty.complex128, + bool: ty.bool_, + int: ty.int64, + float: ty.float64, + complex: ty.complex128, + pa.bool_: ty.bool_, + pa.int8: ty.int8, + pa.int16: ty.int16, + pa.int32: ty.int32, + pa.int64: ty.int64, # np.int is int + pa.uint8: ty.uint8, + pa.uint16: ty.uint16, + pa.uint32: ty.uint32, + pa.uint64: ty.uint64, # np.uint is np.uint64 + pa.float16: ty.float16, + pa.float32: ty.float32, + pa.float64: ty.float64, } @@ -118,7 +120,7 @@ def is_supported_dtype(dtype: Any) -> bool: return dtype.type in SUPPORTED_DTYPES -def convert_to_cunumeric_dtype(dtype: str) -> Any: +def convert_to_cunumeric_dtype(dtype: Any) -> Any: if dtype 
in CUNUMERIC_TYPE_MAP: return CUNUMERIC_TYPE_MAP[dtype] raise TypeError("dtype is not supported") diff --git a/cunumeric/vectorize.py b/cunumeric/vectorize.py index 33387b287c..e412dccf7f 100644 --- a/cunumeric/vectorize.py +++ b/cunumeric/vectorize.py @@ -426,7 +426,7 @@ def _execute(self, is_gpu: bool, num_gpus: int = 0) -> None: # add all scalar arguments first for a in self._scalar_args: - dtype = convert_to_cunumeric_dtype(type(a).__name__) + dtype = convert_to_cunumeric_dtype(type(a)) task.add_scalar_arg(a, dtype) num_args = len(self._args) From 7df88a44dbaefd52b75e9e049d6e5cee4f3de7c6 Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Fri, 31 Mar 2023 13:34:51 -0700 Subject: [PATCH 75/78] adding type stubs for pyarrow --- typings/pyarrow/__init__.pyi | 136 +++++++++++++++++++++++++++++++++++ typings/pyarrow/lib.pyi | 38 ++++++++++ 2 files changed, 174 insertions(+) create mode 100644 typings/pyarrow/__init__.pyi create mode 100644 typings/pyarrow/lib.pyi diff --git a/typings/pyarrow/__init__.pyi b/typings/pyarrow/__init__.pyi new file mode 100644 index 0000000000..cc2ac93aa9 --- /dev/null +++ b/typings/pyarrow/__init__.pyi @@ -0,0 +1,136 @@ +# Copyright 2021-2022 NVIDIA Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from typing import Any, Union + +from .lib import ( + DataType, + binary, + bool_, + float16, + float32, + float64, + int8, + int16, + int32, + int64, + string, + uint8, + uint16, + uint32, + uint64, +) + +class Field: + name: str + type: DataType + def __init__(self, *args: Any, **kwargs: Any) -> None: ... + def with_name(self, name: str) -> Field: ... + +def field( + name: Union[str, bytes], + type: DataType, + nullable: bool = True, + metadata: Any = None, +) -> Field: ... + +class Schema: + types: Any + def field(self, i: Union[str, int]) -> Field: ... + def get_all_field_indices(self, name: str) -> list[int]: ... + def get_field_index(self, name: str) -> int: ... + def __len__(self) -> int: ... + def __getitem__(self, idx: int) -> Field: ... + +def schema(fields: Any, metadata: Any = None) -> Schema: ... + +class ExtensionType: + def __init__(self, dtype: DataType, name: str) -> None: ... + +class DictionaryType: ... +class ListType: ... +class MapType: ... +class StructType: ... +class UnionType: ... +class TimestampType: ... +class Time32Type: ... +class Time64Type: ... +class FixedSizeBinaryType: ... +class Decimal128Type: ... +class time32: ... +class time64: ... +class timestamp: ... +class date32: ... +class date64: ... +class large_binary: ... +class large_string: ... +class large_utf8: ... +class decimal128: ... +class large_list: ... +class struct: ... +class dictionary: ... +class null: ... +class utf8: ... +class list_: ... +class map_: ... + +def from_numpy_dtype(dtype: Any) -> DataType: ... 
+ +__all__ = ( + "binary", + "bool_", + "int8", + "int16", + "int32", + "int64", + "uint8", + "uint16", + "uint32", + "uint64", + "float16", + "float32", + "float64", + "Field", + "Schema", + "DataType", + "DictionaryType", + "ListType", + "MapType", + "StructType", + "UnionType", + "TimestampType", + "Time32Type", + "Time64Type", + "FixedSizeBinaryType", + "Decimal128Type", + "time32", + "time64", + "timestamp", + "date32", + "date64", + "string", + "large_binary", + "large_string", + "large_utf8", + "decimal128", + "large_list", + "struct", + "dictionary", + "null", + "utf8", + "list_", + "map_", + "from_numpy_dtype", +) diff --git a/typings/pyarrow/lib.pyi b/typings/pyarrow/lib.pyi new file mode 100644 index 0000000000..398361089b --- /dev/null +++ b/typings/pyarrow/lib.pyi @@ -0,0 +1,38 @@ +# Copyright 2021-2022 NVIDIA Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from typing import Any + +class DataType: + id: int + num_fields: int + num_buffers: int + def equals(self, other: object) -> bool: ... + def to_pandas_dtype(self) -> Any: ... + +def binary(length: int) -> DataType: ... +def bool_() -> DataType: ... +def int8() -> DataType: ... +def int16() -> DataType: ... +def int32() -> DataType: ... +def int64() -> DataType: ... +def uint8() -> DataType: ... +def uint16() -> DataType: ... +def uint32() -> DataType: ... +def uint64() -> DataType: ... +def float16() -> DataType: ... +def float32() -> DataType: ... +def float64() -> DataType: ... +def string() -> DataType: ... From a65eea1d274a061cda1c0332bc66cc71e63e9842 Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Mon, 3 Apr 2023 14:56:47 -0700 Subject: [PATCH 76/78] Update cunumeric/vectorize.py Co-authored-by: Bryan Van de Ven --- cunumeric/vectorize.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cunumeric/vectorize.py b/cunumeric/vectorize.py index e412dccf7f..9724371e7b 100644 --- a/cunumeric/vectorize.py +++ b/cunumeric/vectorize.py @@ -248,7 +248,8 @@ def _build_gpu_function(self) -> Any: # we compute index for sparse data access when using Legion's # pointer. # a[x][y][z]=a[x*strides[0] + y*strides[1] + z*strides[2]] - loop_lines = f""" local_i = cuda.grid(1) + loop_lines = f"""\ + local_i = cuda.grid(1) if local_i >= {_SIZE_VAR}: return {_LOOP_VAR}:int = 0 From 4dc584b0d9cf5c5aa19368df79cc492489220cfd Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Mon, 3 Apr 2023 15:02:45 -0700 Subject: [PATCH 77/78] addressing comments from Bryan --- cunumeric/vectorize.py | 3 ++- typings/numba/__init__.pyi | 5 ----- typings/numba/types/__init__.pyi | 2 +- 3 files changed, 3 insertions(+), 7 deletions(-) diff --git a/cunumeric/vectorize.py b/cunumeric/vectorize.py index 9724371e7b..7af68f6aed 100644 --- a/cunumeric/vectorize.py +++ b/cunumeric/vectorize.py @@ -327,7 +327,8 @@ def _emit_assignment( # we compute index for sparse data access when using Legion's # pointer. 
# a[x][y][z]=a[x*strides[0] + y*strides[1] + z*strides[2]] - loop_lines = f""" for local_i in range({_SIZE_VAR}): + loop_lines = f"""\ + for local_i in range({_SIZE_VAR}): {_LOOP_VAR}:int = 0 j:int = local_i for p in range({_DIM_VAR}-1): diff --git a/typings/numba/__init__.pyi b/typings/numba/__init__.pyi index 571faf5a05..3aa25ebbd1 100644 --- a/typings/numba/__init__.pyi +++ b/typings/numba/__init__.pyi @@ -1,14 +1,9 @@ from typing import Any, Callable -# Re-export types itself import numba.core.types as types import numba.cuda # import compile_ptx - -# import types from numba.core import types from numba.core.ccallback import CFunc - -# Re-export all type names from numba.core.types import CPointer, uint64 def cfunc(sig: Any) -> Any: diff --git a/typings/numba/types/__init__.pyi b/typings/numba/types/__init__.pyi index df003acbe4..14c90eca2a 100644 --- a/typings/numba/types/__init__.pyi +++ b/typings/numba/types/__init__.pyi @@ -9,4 +9,4 @@ class CPointer(Type): uint32 = Integer("uint32") uint64 = Integer("uint64") -void = none +void = None From c123d63850020629ab2e18bac4b0ab3a9d84eef6 Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Thu, 18 May 2023 20:54:20 -0700 Subject: [PATCH 78/78] fixed some typos --- cunumeric/vectorize.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cunumeric/vectorize.py b/cunumeric/vectorize.py index 3ac914bc36..c3f691d164 100644 --- a/cunumeric/vectorize.py +++ b/cunumeric/vectorize.py @@ -83,10 +83,10 @@ def __init__( Set of strings or integers representing the positional or keyword arguments for which the function will not be vectorized. These will be passed directly to `pyfunc` unmodified. - WARNING: cuNumeric doesn't suport this argument at the moment + WARNING: cuNumeric doesn't support this argument at the moment cache : bool, optional If `True`, then cache the first function call that generates C fun- - ction or CUDA kernel. We recomment enabling caching in cuNumeric + ction or CUDA kernel. We recommend enabling caching in cuNumeric for better performance, when possible. WARNING: in the case when cache=True, cuNumeric will parse function signature and create C function or CUDA kernel only once. This @@ -99,7 +99,7 @@ def __init__( arrays with shapes given by the size of corresponding core dimensions. By default, ``pyfunc`` is assumed to take scalars as input and output. - WARNING: cuNumeric doesn't suport this argument at the moment + WARNING: cuNumeric doesn't support this argument at the moment Returns ------- @@ -145,7 +145,7 @@ def __init__( if otypes is not None: if self._num_outputs != len(otypes): raise ValueError( - "number of types in otypes is not consistente" + "number of types in otypes is not consistent" " with the number of return values defined in pyfunc" ) if len(otypes) > 1:
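A short usage sketch of the interface described by the final docstring patch: the user-defined function assigns its result to a plain variable name and returns that name (per PATCH 73, a return statement containing special characters such as "return x + y" raises NotImplementedError), and with cache=True the generated C function or CUDA kernel is built once and reused, so argument dtypes must stay consistent across calls. The function axpy and the concrete array values below are illustrative stand-ins, not the my_func used by the integration tests.

    import numpy as np
    import cunumeric as num

    def axpy(x, y):
        # the result must be returned through a plain variable name
        z = 2 * x + y
        return z

    vec_np = np.vectorize(axpy, cache=True)
    vec_num = num.vectorize(axpy, cache=True)

    a = np.arange(100, dtype=np.int64).reshape((25, 4))
    b = a * 10
    a_num = num.array(a)
    b_num = num.array(b)

    # elementwise application matches NumPy
    assert np.array_equal(vec_np(a, b), vec_num(a_num, b_num))

    # the same vectorized object can be reused on slices/views
    a[5:10, 2] = vec_np(a[5:10, 2], b[5:10, 2])
    a_num[5:10, 2] = vec_num(a_num[5:10, 2], b_num[5:10, 2])
    assert np.array_equal(a, a_num)

    # with cache=True the kernel is reused, so switching argument dtypes
    # between invocations is rejected
    try:
        vec_num(a_num.astype(float), b_num.astype(float))
    except TypeError:
        pass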
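The generated CPU and GPU kernel bodies in the patches above both turn a dense element counter into a strided memory offset, following the a[x][y][z] = a[x*strides[0] + y*strides[1] + z*strides[2]] comment: the counter is decomposed into per-dimension coordinates with the pitches and folded back together with the accessor strides. The plain-NumPy sketch below mirrors that arithmetic to show why it also addresses non-contiguous views correctly; flat_to_strided and the sample shapes are illustrative names chosen for this note, not code from the series.

    import numpy as np

    def flat_to_strided(local_i, pitches, strides, dim):
        # same loop shape as the emitted kernel body: peel off one coordinate
        # per leading dimension using the pitches, then accumulate the offset
        # using the (element) strides
        idx = 0
        j = local_i
        for p in range(dim - 1):
            x = j // pitches[p]
            j = j % pitches[p]
            idx += x * strides[p]
        idx += j * strides[dim - 1]
        return idx

    base = np.arange(320).reshape((10, 8, 4))
    view = base[:, ::2, :]                                      # non-contiguous, shape (10, 4, 4)
    pitches = (view.shape[1] * view.shape[2], view.shape[2])    # (16, 4)
    strides = tuple(s // base.itemsize for s in view.strides)   # element strides, e.g. (32, 8, 1)

    flat_base = base.ravel()
    for local_i in range(view.size):
        x, y, z = np.unravel_index(local_i, view.shape)
        assert flat_base[flat_to_strided(local_i, pitches, strides, 3)] == view[x, y, z]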