Refactored kernel dispatch API.
  - The concept of a kernel is decoupled from the dispatching of a
    kernel. The previous implementation in compiler.py intermixed
    the two, making it hard to separate compute-follows-data based
    kernel launch from the legacy `dpctl.device_context` based
    behavior.
  - Deprecates support for numpy arrays as kernel args.
  - Deprecates support for the square bracket notation using
    `__getitem__` to provide global and local ranges for a
    kernel launch (see the sketch after this list).
  - Changes the behavior of specializing a kernel using only a
    signature: the new way to specialize requires a device type
    and a backend.
  - Improvements to exception messages using custom exceptions.
  - The new API is now inside `numba_dpex.core.kernel_interface`.
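
  A minimal sketch of the new keyword-based launch, assuming a
  hypothetical @kernel-decorated function and a dpctl.tensor array
  (the deprecated square-bracket form is shown only for contrast):

      import dpctl.tensor as dpt
      import numba_dpex as dpex

      @dpex.kernel
      def twice(a):
          i = dpex.get_global_id(0)
          a[i] = a[i] * 2

      a = dpt.arange(1024, dtype="i4")

      # deprecated: twice[1024](a)
      twice(a, global_range=[1024])
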
Diptorup Deb committed Oct 18, 2022
1 parent b9098d1 commit 6b179af
Showing 4 changed files with 577 additions and 0 deletions.
6 changes: 6 additions & 0 deletions numba_dpex/core/kernel_interface/__init__.py
@@ -0,0 +1,6 @@
# SPDX-FileCopyrightText: 2020 - 2022 Intel Corporation
#
# SPDX-License-Identifier: Apache-2.0

"""Defines the Kernel classes that abstract a SYCL device kernel.
"""
369 changes: 369 additions & 0 deletions numba_dpex/core/kernel_interface/dispatcher.py
@@ -0,0 +1,369 @@
# SPDX-FileCopyrightText: 2022 Intel Corporation
#
# SPDX-License-Identifier: Apache-2.0

import copy
from warnings import warn

import dpctl

from numba_dpex.core.descriptor import dpex_target
from numba_dpex.core.exceptions import (
ComputeFollowsDataInferenceError,
ExecutionQueueInferenceError,
InvalidKernelLaunchArgsError,
UnknownGlobalRangeError,
UnsupportedBackendError,
)
from numba_dpex.core.types import Array as ArrayType
from numba_dpex.dpctl_iface import USMNdArrayType


class KernelLauncher(object):
"""Creates a Kernel object from a @kernel decorated function and enqueues
the Kernel object on a specified device.
"""

def __init__(
self,
pyfunc,
debug_flags=None,
compile_flags=None,
array_access_specifiers=None,
):
self.typingctx = dpex_target.typing_context
self.pyfunc = pyfunc
self.debug_flags = debug_flags
self.compile_flags = compile_flags
self.kernel_name = pyfunc.__name__
# The global and local ranges may be set later, either through the
# deprecated __getitem__ notation or through the keyword arguments of
# __call__.
self.global_range = None
self.local_range = None

if array_access_specifiers:
warn(
"Access specifiers apply only to NumPy ndarrays. "
+ "Support for NumPy ndarray objects as kernel arguments "
+ "and access specifiers flags is deprecated. "
+ "Use dpctl.tensor.usm_ndarray based arrays instead.",
DeprecationWarning,
stacklevel=2,
)
self.array_access_specifiers = array_access_specifiers

# def _ensure_valid_work_item_grid(self, val, sycl_queue):

# if not isinstance(val, (tuple, list, int)):
# error_message = (
# "Cannot create work item dimension from provided argument"
# )
# raise ValueError(error_message)

# if isinstance(val, int):
# val = [val]

# # TODO: we need some way to check the max dimensions
# """
# if len(val) > device_env.get_max_work_item_dims():
# error_message = ("Unsupported number of work item dimensions ")
# raise ValueError(error_message)
# """

# return list(
# val[::-1]
# ) # reversing due to sycl and opencl interop kernel range mismatch semantic

def _determine_compute_follows_data_queue(self, usm_array_list):
"""Determine the execution queue for the list of usm array args using
compute follows data rules.
Uses ``dpctl.utils.get_execution_queue()`` to check if the queues
used to allocate the usm_ndarrays are equivalent. If they are
equivalent, the common queue is returned; otherwise None is returned.
Args:
usm_array_list : A list of usm_ndarray objects.
Returns:
The common queue used to allocate the arrays if one exists,
otherwise None.
"""

queues = [usm_array.sycl_queue for usm_array in usm_array_list]
return dpctl.utils.get_execution_queue(queues)
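
# Illustrative sketch (not part of the dispatcher) of how
# ``dpctl.utils.get_execution_queue`` resolves a common queue; the array
# names and creation calls below are hypothetical:
#
#   import dpctl
#   import dpctl.tensor as dpt
#
#   q = dpctl.SyclQueue()
#   a = dpt.empty(10, dtype="f4", sycl_queue=q)
#   b = dpt.empty(10, dtype="f4", sycl_queue=q)
#   dpctl.utils.get_execution_queue([a.sycl_queue, b.sycl_queue])  # -> q
#
#   c = dpt.empty(10, dtype="f4", sycl_queue=dpctl.SyclQueue())
#   dpctl.utils.get_execution_queue([a.sycl_queue, c.sycl_queue])  # -> None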

def _determine_kernel_launch_queue(self, args):
"""Determines the queue where the kernel is to be launched.
The execution queue is derived using the following algorithm. In the
future, support for ``numpy.ndarray`` and ``dpctl.device_context`` will
be removed and queue derivation will follow the Python Array API's
"compute follows data" logic.
Check if there are array arguments.
True:
Check if all array arguments are of type numpy.ndarray
(numba.types.Array)
True:
Check if the kernel was invoked from within a
dpctl.device_context.
True:
Provide a deprecation warning for device_context use and
point to using dpctl.tensor.usm_ndarray or dpnp.ndarray
return dpctl.get_current_queue
False:
Raise ExecutionQueueInferenceError
False:
Check if all of the arrays are USMNdarray
True:
Check if execution queue could be inferred using
compute follows data rules
True:
return the compute follows data inferred queue
False:
Raise ComputeFollowsDataInferenceError
False:
Raise ComputeFollowsDataInferenceError
False:
Check if the kernel was invoked from within a dpctl.device_context.
True:
Provide a deprecation warning for device_context use and
point to using dpctl.tensor.usm_ndarray or dpnp.ndarray
return dpctl.get_current_queue
False:
Raise ExecutionQueueInferenceError
Args:
args : A list of arguments passed to the kernel stored in the
launcher.
Returns:
The queue on which the kernel is to be launched. If no such queue
can be inferred, an exception is raised.
Raises:
ComputeFollowsDataInferenceError: If the queue could not be inferred
using compute follows data rules.
ExecutionQueueInferenceError: If the queue could not be inferred
using the dpctl queue manager.
"""
array_argnums = [
i
for i, arg in enumerate(args)
if isinstance(self.typingctx.resolve_argument_type(arg), ArrayType)
]
usmarray_argnums = [
i
for i, arg in enumerate(args)
if isinstance(
self.typingctx.resolve_argument_type(arg), USMNdArrayType
)
]
# if usm and non-usm array arguments are getting mixed, then the
# execution queue cannot be inferred using compute follows data rules.
if array_argnums and usmarray_argnums:
raise ComputeFollowsDataInferenceError(
array_argnums, usmarray_argnums
)
elif array_argnums and not usmarray_argnums:
if dpctl.is_in_device_context():
warn(
"Support for dpctl.device_context to specify the "
+ "execution queue is deprecated. "
+ "Use dpctl.tensor.usm_ndarray based array "
+ "containers instead. ",
DeprecationWarning,
stacklevel=2,
)
warn(
"Support for NumPy ndarray objects as kernel arguments is "
+ "deprecated. Use dpctl.tensor.usm_ndarray based array "
+ "containers instead. ",
DeprecationWarning,
stacklevel=2,
)
return dpctl.get_current_queue()
else:
raise ExecutionQueueInferenceError(self.kernel_name)
elif usmarray_argnums and not array_argnums:
if dpctl.is_in_device_context():
warn(
"dpctl.device_context ignored as the kernel arguments "
+ "are dpctl.tensor.usm_ndarray based array containers."
)
usm_array_args = [
arg for i, arg in enumerate(args) if i in usmarray_argnums
]
queue = self._determine_compute_follows_data_queue(usm_array_args)
if not queue:
raise ComputeFollowsDataInferenceError(
self.kernel_name, usmarray_argnums
)
else:
return queue
else:
if dpctl.is_in_device_context():
warn(
"Support for dpctl.device_context to specify the "
+ "execution queue is deprecated. "
+ "Use dpctl.tensor.usm_ndarray based array "
+ "containers instead. ",
DeprecationWarning,
stacklevel=2,
)
return dpctl.get_current_queue()
else:
raise ExecutionQueueInferenceError(self.kernel_name)
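
# Illustrative sketch (not executed here) of queue inference for
# usm_ndarray kernel arguments; ``launcher``, ``a``, ``b``, and ``c``
# are hypothetical:
#
#   import dpctl.tensor as dpt
#
#   q = dpctl.SyclQueue()
#   a = dpt.ones(16, dtype="f4", sycl_queue=q)
#   b = dpt.zeros(16, dtype="f4", sycl_queue=q)
#   launcher._determine_kernel_launch_queue((a, b))  # -> q
#
#   c = dpt.zeros(16, dtype="f4", sycl_queue=dpctl.SyclQueue())
#   launcher._determine_kernel_launch_queue((a, c))
#   # raises ComputeFollowsDataInferenceError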

def __getitem__(self, args):
"""Mimic's ``numba.cuda`` square-bracket notation for configuring the
global_range and local_range settings when launching a kernel on a
SYCL queue.
When a Python function decorated with the @kernel decorator,
is invoked it creates a KernelLauncher object. Calling the
KernelLauncher objects ``__getitem__`` function inturn clones the object
and sets the ``global_range`` and optionally the ``local_range``
attributes with the arguments passed to ``__getitem__``.
Args:
args (tuple): A tuple of tuples that specify the global and
optionally the local range for the kernel execution. If the
argument is a two-tuple of tuple, then it is assumed that both
global and local range options are specified. The first entry is
considered to be the global range and the second the local range.
If only a single tuple value is provided, then the kernel is
launched with only a global range and the local range configuration
is decided by the SYCL runtime.
Returns:
KernelLauncher: A clone of the KernelLauncher object, but with the
global_range and local_range attributes initialized.
.. deprecated:: 0.19
Use :func:`KernelLauncher.execute` instead.
"""

warn(
"The [] (__getitem__) method to set global and local ranges for "
+ "launching a kernel is deprecated. "
+ "Use the execute function instead.",
DeprecationWarning,
stacklevel=2,
)

# A single scalar range, e.g. kernel[global_size], reaches __getitem__
# as a scalar rather than a tuple; normalize it so it can be handled
# uniformly below.
if not isinstance(args, tuple):
args = (args,)

nargs = len(args)

# Check if the kernel launch arguments are sane.
if nargs < 1:
raise UnknownGlobalRangeError(kernel_name=self.kernel_name)
elif nargs > 2:
raise InvalidKernelLaunchArgsError(
kernel_name=self.kernel_name, args=args
)

self.global_range = args[0]
if nargs == 2 and args[1] != []:
self.local_range = args[1]
else:
self.local_range = None

return copy.copy(self)
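
# Illustrative sketch (not executed here) contrasting the deprecated
# square-bracket configuration with the keyword-argument launch;
# ``kern`` and ``a`` are hypothetical:
#
#   configured = kern[(1024,), (64,)]  # deprecated; returns a clone with
#                                      # global_range and local_range set
#   configured(a)
#
#   kern(a, global_range=(1024,), local_range=(64,))  # preferred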

def __call__(self, *args, global_range=None, local_range=None):
"""_summary_
Args:
global_range (_type_): _description_
local_range (_type_, optional): _description_. Defaults to None.
"""

# TODO: Move to separate function
if global_range:
if self.global_range:
warn(
"Ignoring the previously set value of global_range and "
+ "using the value specified at the kernel call site."
)
else:
if self.global_range:
warn(
"Use of __getitem__ to set the global_range attribute is "
+ 'deprecated. Use the keyword argument "global_range" of '
+ "__call__ method to set the attribute."
)
global_range = self.global_range

else:
raise UnknownGlobalRangeError(self.kernel_name)

if local_range:
if self.local_range:
warn(
"Ignoring the previously set value of local_range and "
+ "using the value specified at the kernel call site.."
)
else:
if self.local_range:
warn(
"Use of __getitem__ to set the local_range attribute is "
+ 'deprecated. Use the keyword argument "local_range" of '
+ "__call__ method to set the attribute."
)
local_range = self.local_range
else:
local_range = None
warn(
"The kernel will be submitted without a local range, letting "
+ "the SYCL runtime select one. This can lead to suboptimal "
+ "performance in certain cases. Consider setting the local "
+ "range for the kernel execution.\n"
+ "The local_range keyword may become a required argument "
+ "in the future."
)
# TODO: Move out to separate function
exec_queue = self._determine_kernel_launch_queue(args)
backend = exec_queue.backend
supported_backends = ["opencl", "level_zero"]
if exec_queue.backend not in [
dpctl.backend_type.opencl,
dpctl.backend_type.level_zero,
]:
raise UnsupportedBackendError(
self.kernel_name, backend, supported_backends
)
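
# Illustrative sketch (not executed here): the backend of a SYCL queue
# can be inspected through dpctl; the filter selector string below is
# hypothetical:
#
#   q = dpctl.SyclQueue("level_zero:gpu")
#   q.backend == dpctl.backend_type.level_zero  # -> True
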
# TODO: Test global and local ranges to be valid for the device
# TODO: Create a kernel (or check if we already have a cached version)
# TODO: kernel.compile()

# TODO: submit the kernel
# kernelargs = []
# internal_device_arrs = []
# for ty, val, access_type in zip(
# self.argument_types, args, self.ordered_arg_access_types
# ):
# self._unpack_argument(
# ty,
# val,
# self.sycl_queue,
# kernelargs,
# internal_device_arrs,
# access_type,
# )

# self.sycl_queue.submit(
# self.kernel, kernelargs, self.global_size, self.local_size
# )
# self.sycl_queue.wait()

# for ty, val, i_dev_arr, access_type in zip(
# self.argument_types,
# args,
# internal_device_arrs,
# self.ordered_arg_access_types,
# ):
# self._pack_argument(
# ty, val, self.sycl_queue, i_dev_arr, access_type
# )